@@ -140,13 +140,10 @@ def __init__(self, env_name, model_name=""):
         self.env = gym.make(env_name)
         self.action_size = self.env.action_space.shape[0]
         self.state_size = self.env.observation_space.shape
-        self.high = self.env.observation_space.high
-        self.low = self.env.observation_space.low
         self.EPISODES = 200000 # total episodes to train through all environments
         self.episode = 0 # used to track the episodes total count of episodes played through all thread environments
         self.max_average = 0 # when average score is above 0 model will be saved
         self.lr = 0.00025
-        self.keep_running_thread = True
         self.epochs = 10 # training epochs
         self.shuffle = True
         self.Training_batch = 512
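
The dropped self.high / self.low members (which, as written, read the observation-space bounds rather than the action-space bounds) are redundant once actions are clipped to the fixed tanh range. A quick check of that assumption for BipedalWalker-v3, using only standard Gym calls:

import gym

env = gym.make('BipedalWalker-v3')
# The action bounds are already fixed at [-1, 1], matching the actor's tanh output,
# so per-environment high/low members are not needed for clipping.
print(env.action_space.low, env.action_space.high)   # [-1. -1. -1. -1.] [1. 1. 1. 1.]
print(env.observation_space.shape)                   # (24,)
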
@@ -167,18 +164,21 @@ def __init__(self, env_name, model_name=""):
         self.Critic_name = f"{self.env_name}_PPO_Critic.h5"
         #self.load() # uncomment to continue training from old weights
 
+        # do not change below
+        self.log_std = -0.5 * np.ones(self.action_size, dtype=np.float32)
+        self.std = np.exp(self.log_std)
+
 
     def act(self, state):
         # Use the network to predict the next action to take, using the model
         pred = self.Actor.predict(state)
 
-        log_std = -0.5 * np.ones(self.action_size, dtype=np.float32)
-        std = np.exp(log_std)
-        action = pred + np.random.normal(size=self.action_size) * std
-        action = np.clip(action, self.low, self.high)
-        logp_t = self.gaussian_likelihood(action, pred, log_std)
+        action = pred + np.random.normal(size=pred.shape) * self.std
+        action = np.clip(action, -1, 1) # -1 and 1 are boundaries of tanh
+
+        logp_t = self.gaussian_likelihood(action, pred, self.log_std)
 
-        return action[0], logp_t[0]
+        return action, logp_t
 
     def gaussian_likelihood(self, action, pred, log_std):
         pre_sum = -0.5 * (((action - pred)/(np.exp(log_std)+1e-8))**2 + 2*log_std + np.log(2*np.pi))
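
The pre_sum expression is the per-dimension log-density of a diagonal Gaussian, log N(a | mu, sigma) = -0.5*((a - mu)/sigma)**2 - log(sigma) - 0.5*log(2*pi), where the 2*log_std term supplies the -log(sigma) part; summing over the action dimensions gives the joint log-probability. A small standalone check with hypothetical values, NumPy only:

import numpy as np

action_size = 4
mu = np.zeros((1, action_size), dtype=np.float32)              # stand-in for the actor output (pred)
log_std = -0.5 * np.ones(action_size, dtype=np.float32)
action = np.array([[0.2, -0.4, 0.1, 0.7]], dtype=np.float32)   # a hypothetical sampled action

# Same expression as gaussian_likelihood() above
pre_sum = -0.5 * (((action - mu) / (np.exp(log_std) + 1e-8))**2 + 2 * log_std + np.log(2 * np.pi))
logp = np.sum(pre_sum, axis=1)

# Reference: sum of per-dimension Normal log-pdfs, written out directly
sigma = np.exp(log_std)
ref = np.sum(-0.5 * ((action - mu) / sigma)**2 - np.log(sigma) - 0.5 * np.log(2 * np.pi), axis=1)
print(logp, ref)   # agree up to the 1e-8 epsilon
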
@@ -305,17 +305,15 @@ def run_batch(self):
                 self.env.render()
                 # Actor picks an action
                 action, logp_t = self.act(state)
-                #action = np.clip(action, -1, 1)
-                #print(action)
                 # Retrieve new state, reward, and whether the state is terminal
-                next_state, reward, done, _ = self.env.step(action)
+                next_state, reward, done, _ = self.env.step(action[0])
                 # Memorize (state, next_states, action, reward, done, logp_ts) for training
                 states.append(state)
                 next_states.append(np.reshape(next_state, [1, self.state_size[0]]))
                 actions.append(action)
                 rewards.append(reward)
                 dones.append(done)
-                logp_ts.append(logp_t)
+                logp_ts.append(logp_t[0])
                 # Update current state shape
                 state = np.reshape(next_state, [1, self.state_size[0]])
                 score += reward
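
Since act() now keeps the batch dimension so the same method can also serve a batch of worker states (next hunk), the single-environment loop indexes out element 0 before stepping the env and before storing the log-prob. A shape-only sketch with a zero stub in place of the real Actor network:

import numpy as np

action_size, state_size = 4, 24                        # BipedalWalker-v3 sizes
state = np.zeros((1, state_size), dtype=np.float32)    # one batched observation
pred = np.zeros((1, action_size), dtype=np.float32)    # stub for self.Actor.predict(state)

log_std = -0.5 * np.ones(action_size, dtype=np.float32)
std = np.exp(log_std)

action = np.clip(pred + np.random.normal(size=pred.shape) * std, -1, 1)
pre_sum = -0.5 * (((action - pred) / (np.exp(log_std) + 1e-8))**2 + 2 * log_std + np.log(2 * np.pi))
logp_t = np.sum(pre_sum, axis=1)

print(action.shape, logp_t.shape)   # (1, 4) (1,) -> env.step(action[0]), logp_ts.append(logp_t[0])
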
@@ -359,15 +357,9 @@ def run_multiprocesses(self, num_worker = 4):
         for worker_id, parent_conn in enumerate(parent_conns):
             state[worker_id] = parent_conn.recv()
 
-        log_std = -0.5 * np.ones(self.action_size, dtype=np.float32)
-        std = np.exp(log_std)
         while self.episode < self.EPISODES:
-            predictions_list = self.Actor.predict(np.reshape(state, [num_worker, self.state_size[0]]))
-
-            action = predictions_list + np.random.normal(size=predictions_list.shape) * std
-            action = np.clip(action, self.low, self.high)
-
-            logp_pi = self.gaussian_likelihood(action, predictions_list, log_std)
+            # get a batch of actions and log_pi's
+            action, logp_pi = self.act(np.reshape(state, [num_worker, self.state_size[0]]))
 
             for worker_id, parent_conn in enumerate(parent_conns):
                 parent_conn.send(action[worker_id])
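
Replacing the inlined sampling code with one batched self.act() call means a single Actor forward pass yields one action row per worker, and each row is then pushed down that worker's pipe. A minimal sketch of the parent-side pattern, assuming a hypothetical two-worker setup with a dummy worker in place of the real environment process:

import numpy as np
from multiprocessing import Pipe, Process

def dummy_worker(child_conn):
    # Stand-in for the environment worker process: echo the received action's shape.
    action = child_conn.recv()
    child_conn.send(action.shape)
    child_conn.close()

if __name__ == '__main__':
    num_worker, action_size = 2, 4
    parent_conns, child_conns = zip(*[Pipe() for _ in range(num_worker)])
    workers = [Process(target=dummy_worker, args=(conn,)) for conn in child_conns]
    for w in workers:
        w.start()

    # Stand-in for the batched actions returned by self.act(...)
    action = np.zeros((num_worker, action_size), dtype=np.float32)
    for worker_id, parent_conn in enumerate(parent_conns):
        parent_conn.send(action[worker_id])   # one row per worker
        print(parent_conn.recv())             # (4,)

    for w in workers:
        w.join()
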
@@ -438,5 +430,5 @@ def test(self, test_episodes = 100):#evaluate
     env_name = 'BipedalWalker-v3'
     agent = PPOAgent(env_name)
     #agent.run_batch() # train as PPO
-    #agent.run_multiprocesses(num_worker = 8) # train PPO multiprocessed (fastest)
+    #agent.run_multiprocesses(num_worker = 2) # train PPO multiprocessed (fastest)
     agent.test()