
Commit dc1319c

Update BipedalWalker-v3_PPO.py
1 parent c61fe1a commit dc1319c

File tree

1 file changed: 14 additions, 22 deletions


BipedalWalker-v3_PPO/BipedalWalker-v3_PPO.py

Lines changed: 14 additions & 22 deletions
@@ -140,13 +140,10 @@ def __init__(self, env_name, model_name=""):
         self.env = gym.make(env_name)
         self.action_size = self.env.action_space.shape[0]
         self.state_size = self.env.observation_space.shape
-        self.high = self.env.observation_space.high
-        self.low = self.env.observation_space.low
         self.EPISODES = 200000 # total episodes to train through all environments
         self.episode = 0 # tracks the total count of episodes played across all thread environments
         self.max_average = 0 # the model is saved once the average score rises above 0
         self.lr = 0.00025
-        self.keep_running_thread = True
         self.epochs = 10 # training epochs
         self.shuffle = True
         self.Training_batch = 512
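A note on the deleted bounds: self.high and self.low were read from the observation space, yet act() used them to clip actions. BipedalWalker-v3's actions live in a Box(-1.0, 1.0, (4,)) space, which is why the next hunk clips to [-1, 1] instead. A quick sanity check, assuming a standard gym + box2d install:

import gym

env = gym.make('BipedalWalker-v3')
print(env.action_space)       # Box(-1.0, 1.0, (4,), float32): each joint torque is in [-1, 1]
print(env.observation_space)  # 24-dimensional Box: its bounds say nothing about valid actions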
@@ -167,18 +164,21 @@ def __init__(self, env_name, model_name=""):
         self.Critic_name = f"{self.env_name}_PPO_Critic.h5"
         #self.load() # uncomment to continue training from old weights

+        # do not change below
+        self.log_std = -0.5 * np.ones(self.action_size, dtype=np.float32)
+        self.std = np.exp(self.log_std)
+

     def act(self, state):
         # Use the network to predict the next action to take, using the model
         pred = self.Actor.predict(state)

-        log_std = -0.5 * np.ones(self.action_size, dtype=np.float32)
-        std = np.exp(log_std)
-        action = pred + np.random.normal(size=self.action_size) * std
-        action = np.clip(action, self.low, self.high)
-        logp_t = self.gaussian_likelihood(action, pred, log_std)
+        action = pred + np.random.normal(size=pred.shape) * self.std
+        action = np.clip(action, -1, 1) # -1 and 1 are the boundaries of tanh
+
+        logp_t = self.gaussian_likelihood(action, pred, self.log_std)

-        return action[0], logp_t[0]
+        return action, logp_t

     def gaussian_likelihood(self, action, pred, log_std):
         pre_sum = -0.5 * (((action - pred) / (np.exp(log_std) + 1e-8))**2 + 2 * log_std + np.log(2 * np.pi))
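The pre_sum context line above is the per-dimension log-density of a diagonal Gaussian, log N(a; mu, sigma) = -(a - mu)^2 / (2 sigma^2) - log sigma - (1/2) log 2 pi, with the 1e-8 guarding against a zero std; the full function presumably sums pre_sum over the action axis to get one log-probability per sample. A minimal sketch checking the formula against scipy.stats.norm.logpdf (scipy is an assumption here, not a dependency of this repo):

import numpy as np
from scipy.stats import norm

action = np.array([0.2, -0.4, 0.1, 0.7])
pred = np.zeros(4)                        # policy mean
log_std = -0.5 * np.ones(4)               # matches self.log_std in __init__

pre_sum = -0.5 * (((action - pred) / (np.exp(log_std) + 1e-8))**2 + 2 * log_std + np.log(2 * np.pi))
reference = norm.logpdf(action, loc=pred, scale=np.exp(log_std))

print(np.allclose(pre_sum, reference, atol=1e-6))  # True, up to the 1e-8 guard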
@@ -305,17 +305,15 @@ def run_batch(self):
             self.env.render()
             # Actor picks an action
             action, logp_t = self.act(state)
-            #action = np.clip(action, -1, 1)
-            #print(action)
             # Retrieve new state, reward, and whether the state is terminal
-            next_state, reward, done, _ = self.env.step(action)
+            next_state, reward, done, _ = self.env.step(action[0])
             # Memorize (state, next_states, action, reward, done, logp_ts) for training
             states.append(state)
             next_states.append(np.reshape(next_state, [1, self.state_size[0]]))
             actions.append(action)
             rewards.append(reward)
             dones.append(done)
-            logp_ts.append(logp_t)
+            logp_ts.append(logp_t[0])
             # Update current state shape
             state = np.reshape(next_state, [1, self.state_size[0]])
             score += reward
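With act() no longer indexing [0] internally, its return values keep the leading batch axis, so this single-environment loop unwraps them at the call sites: env.step(action[0]) and logp_ts.append(logp_t[0]). A shape sketch with a stubbed prediction (the (1, 24) state and (1, 4) action shapes are BipedalWalker-v3's; pred is a stand-in for self.Actor.predict(state)):

import numpy as np

state = np.zeros((1, 24))                          # one env, batch axis kept
pred = np.zeros((1, 4))                            # stub for self.Actor.predict(state)
std = np.exp(-0.5 * np.ones(4, dtype=np.float32))

action = np.clip(pred + np.random.normal(size=pred.shape) * std, -1, 1)
print(action.shape, action[0].shape)  # (1, 4) (4,): env.step wants the unbatched row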
@@ -359,15 +357,9 @@ def run_multiprocesses(self, num_worker = 4):
        for worker_id, parent_conn in enumerate(parent_conns):
            state[worker_id] = parent_conn.recv()

-        log_std = -0.5 * np.ones(self.action_size, dtype=np.float32)
-        std = np.exp(log_std)
        while self.episode < self.EPISODES:
-            predictions_list = self.Actor.predict(np.reshape(state, [num_worker, self.state_size[0]]))
-
-            action = predictions_list + np.random.normal(size=predictions_list.shape) * std
-            action = np.clip(action, self.low, self.high)
-
-            logp_pi = self.gaussian_likelihood(action, predictions_list, log_std)
+            # get a batch of actions and log_pi's
+            action, logp_pi = self.act(np.reshape(state, [num_worker, self.state_size[0]]))

            for worker_id, parent_conn in enumerate(parent_conns):
                parent_conn.send(action[worker_id])
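Reusing act() here works because nothing in it assumes a batch of one anymore; note the noise draw in particular. np.random.normal(size=self.action_size) yields a single (action_size,) vector that broadcasting would copy to every worker, while size=pred.shape (as act() now does, matching the deleted inline code) draws independent noise per row. A minimal numpy illustration with a stubbed prediction:

import numpy as np

num_worker, action_size = 2, 4
pred = np.zeros((num_worker, action_size))           # stub for self.Actor.predict(...)

shared = pred + np.random.normal(size=action_size)   # one (4,) draw, broadcast to all rows
per_row = pred + np.random.normal(size=pred.shape)   # independent draw per worker

print(np.allclose(shared[0], shared[1]))    # True: both workers would explore identically
print(np.allclose(per_row[0], per_row[1]))  # almost surely False: independent exploration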
@@ -438,5 +430,5 @@ def test(self, test_episodes = 100): #evaluate
     env_name = 'BipedalWalker-v3'
     agent = PPOAgent(env_name)
     #agent.run_batch() # train as PPO
-    #agent.run_multiprocesses(num_worker = 8) # train PPO multiprocessed (fastest)
+    #agent.run_multiprocesses(num_worker = 2) # train PPO multiprocessed (fastest)
     agent.test()
