Commit 296dbd5

Tested with TF 2.3.1

1 parent c81c0e6 commit 296dbd5

1 file changed: +346 -0 lines changed

11_Pong-v0_PPO/Pong-v0_PPO_TF2.py

Lines changed: 346 additions & 0 deletions
# Tutorial by www.pylessons.com
# Tutorial written for - Tensorflow 2.3.1

import os
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import random
import gym
import pylab
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K
import cv2

import threading
from threading import Thread, Lock
import time

gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    print(f'GPUs {gpus}')
    try: tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError: pass

def OurModel(input_shape, action_space, lr):
    X_input = Input(input_shape)

    #X = Conv2D(32, 8, strides=(4, 4), padding="valid", activation="elu", data_format="channels_first", input_shape=input_shape)(X_input)
    #X = Conv2D(64, 4, strides=(2, 2), padding="valid", activation="elu", data_format="channels_first")(X)
    #X = Conv2D(64, 3, strides=(1, 1), padding="valid", activation="elu", data_format="channels_first")(X)
    X = Flatten(input_shape=input_shape)(X_input)

    X = Dense(512, activation="elu", kernel_initializer='he_uniform')(X)
    #X = Dense(256, activation="elu", kernel_initializer='he_uniform')(X)
    #X = Dense(64, activation="elu", kernel_initializer='he_uniform')(X)

    action = Dense(action_space, activation="softmax", kernel_initializer='he_uniform')(X)
    value = Dense(1, activation='linear', kernel_initializer='he_uniform')(X)

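    # Note: ppo_loss below implements the clipped surrogate objective from the PPO
    # paper. y_true packs [advantage | old policy prediction | one-hot action] into one
    # tensor (built in replay()), because a Keras loss only receives y_true and y_pred.
    # r is the ratio of the new to the old probability of the taken action, clipped to
    # [1 - LOSS_CLIPPING, 1 + LOSS_CLIPPING]; an entropy bonus weighted by ENTROPY_LOSS
    # encourages exploration.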
    def ppo_loss(y_true, y_pred):
        # Defined in https://arxiv.org/abs/1707.06347
        advantages, prediction_picks, actions = y_true[:, :1], y_true[:, 1:1+action_space], y_true[:, 1+action_space:]
        LOSS_CLIPPING = 0.2
        ENTROPY_LOSS = 5e-3

        prob = y_pred * actions
        old_prob = actions * prediction_picks
        r = prob / (old_prob + 1e-10)
        p1 = r * advantages
        p2 = K.clip(r, min_value=1 - LOSS_CLIPPING, max_value=1 + LOSS_CLIPPING) * advantages
        loss = -K.mean(K.minimum(p1, p2) + ENTROPY_LOSS * -(prob * K.log(prob + 1e-10)))

        return loss

    Actor = Model(inputs=X_input, outputs=action)
    Actor.compile(loss=ppo_loss, optimizer=RMSprop(lr=lr))

    Critic = Model(inputs=X_input, outputs=value)
    Critic.compile(loss='mse', optimizer=RMSprop(lr=lr))

    return Actor, Critic

class PPOAgent:
    # PPO Main Optimization Algorithm
    def __init__(self, env_name):
        # Initialization
        # Environment and PPO parameters
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.action_size = self.env.action_space.n
        self.EPISODES, self.episode, self.max_average = 10000, 0, -21.0 # specific for Pong
        self.lock = Lock() # lock so that threads update the shared networks without interrupting each other
        self.lr = 0.0001

        self.ROWS = 80
        self.COLS = 80
        self.REM_STEP = 4
        self.EPOCHS = 10

        # Instantiate plot memory
        self.scores, self.episodes, self.average = [], [], []

        self.Save_Path = 'Models'
        self.state_size = (self.REM_STEP, self.ROWS, self.COLS)

        if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path)
        self.path = '{}_APPO_{}'.format(self.env_name, self.lr)
        self.Model_name = os.path.join(self.Save_Path, self.path)

        # Create Actor-Critic network models
        self.Actor, self.Critic = OurModel(input_shape=self.state_size, action_space=self.action_size, lr=self.lr)

    def act(self, state):
        # Sample the next action from the Actor's predicted policy distribution
        prediction = self.Actor.predict(state)[0]
        action = np.random.choice(self.action_size, p=prediction)
        return action, prediction

    def discount_rewards(self, reward):
        # Compute the gamma-discounted rewards over an episode
        gamma = 0.99 # discount rate
        running_add = 0
        discounted_r = np.zeros_like(reward)
        for i in reversed(range(0, len(reward))):
            if reward[i] != 0: # reset the sum, since this was a game boundary (Pong specific!)
                running_add = 0
            running_add = running_add * gamma + reward[i]
            discounted_r[i] = running_add

        discounted_r -= np.mean(discounted_r) # normalize to zero mean
        discounted_r /= np.std(discounted_r)  # and unit standard deviation
        return discounted_r

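    # Note: replay() below trains on one full episode at a time. The advantage is the
    # normalized discounted return minus the Critic's value estimate, and y_true is
    # stacked as [advantage | old policy prediction | one-hot action] so that ppo_loss
    # can slice it apart again. batch_size=len(rewards) means the whole episode is a
    # single batch, fitted for self.EPOCHS epochs.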
    def replay(self, states, actions, rewards, predictions):
        # reshape memory to appropriate shape for training
        states = np.vstack(states)
        actions = np.vstack(actions)
        predictions = np.vstack(predictions)

        # Compute discounted rewards
        discounted_r = np.vstack(self.discount_rewards(rewards))

        # Get Critic network predictions
        values = self.Critic.predict(states)
        # Compute advantages
        advantages = discounted_r - values

        '''
        pylab.plot(discounted_r, '-')
        pylab.plot(advantages, '.')
        ax = pylab.gca()
        ax.grid(True)
        pylab.show()
        '''
        # stack everything into one numpy array for the custom loss
        y_true = np.hstack([advantages, predictions, actions])

        # training Actor and Critic networks
        self.Actor.fit(states, y_true, epochs=self.EPOCHS, verbose=0, shuffle=True, batch_size=len(rewards))
        self.Critic.fit(states, discounted_r, epochs=self.EPOCHS, verbose=0, shuffle=True, batch_size=len(rewards))

    def load(self, Actor_name, Critic_name):
        self.Actor = load_model(Actor_name, compile=False)
        #self.Critic = load_model(Critic_name, compile=False)

    def save(self):
        self.Actor.save(self.Model_name + '_Actor.h5')
        #self.Critic.save(self.Model_name + '_Critic.h5')

    pylab.figure(figsize=(18, 9))
    def PlotModel(self, score, episode):
        self.scores.append(score)
        self.episodes.append(episode)
        self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
        if str(episode)[-2:] == "00": # much faster than episode % 100
            pylab.plot(self.episodes, self.scores, 'b')
            pylab.plot(self.episodes, self.average, 'r')
            pylab.ylabel('Score', fontsize=18)
            pylab.xlabel('Steps', fontsize=18)
            try:
                pylab.savefig(self.path + ".png")
            except OSError:
                pass

        return self.average[-1]

    def imshow(self, image, rem_step=0):
        cv2.imshow("cartpole" + str(rem_step), image[rem_step, ...])
        if cv2.waitKey(25) & 0xFF == ord("q"):
            cv2.destroyAllWindows()
            return

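    # Note: GetImage() below does the Pong frame preprocessing: crop the playing field
    # (rows 35-195), downsample by 2 to 80x80, convert to grayscale with standard
    # luminance weights, threshold to black and white, scale to 0-1, and push the result
    # onto a rolling stack of the last REM_STEP frames (channels-first), which is what
    # the network receives as its state.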
    def GetImage(self, frame, image_memory):
        if image_memory.shape == (1, *self.state_size):
            image_memory = np.squeeze(image_memory)

        # cropping frame to 80x80 size
        frame_cropped = frame[35:195:2, ::2, :]
        if frame_cropped.shape[0] != self.ROWS or frame_cropped.shape[1] != self.COLS:
            # OpenCV resize function as a fallback
            frame_cropped = cv2.resize(frame, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC)

        # converting to grayscale (numpy way)
        frame_rgb = 0.299*frame_cropped[:,:,0] + 0.587*frame_cropped[:,:,1] + 0.114*frame_cropped[:,:,2]

        # convert everything to black and white (agent will train faster)
        frame_rgb[frame_rgb < 100] = 0
        frame_rgb[frame_rgb >= 100] = 255
        # converting to grayscale (OpenCV way)
        #frame_rgb = cv2.cvtColor(frame_cropped, cv2.COLOR_RGB2GRAY)

        # dividing by 255 expresses the values in a 0-1 range
        new_frame = np.array(frame_rgb).astype(np.float32) / 255.0

        # push the stack back by 1 frame, similar to how a deque works
        image_memory = np.roll(image_memory, 1, axis=0)

        # insert the new frame into the freed slot
        image_memory[0, :, :] = new_frame

        # show image frames
        #self.imshow(image_memory, 0)
        #self.imshow(image_memory, 1)
        #self.imshow(image_memory, 2)
        #self.imshow(image_memory, 3)

        return np.expand_dims(image_memory, axis=0)

    def reset(self, env):
        image_memory = np.zeros(self.state_size)
        frame = env.reset()
        for i in range(self.REM_STEP):
            state = self.GetImage(frame, image_memory)
        return state

    def step(self, action, env, image_memory):
        next_state, reward, done, info = env.step(action)
        next_state = self.GetImage(next_state, image_memory)
        return next_state, reward, done, info

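    # Note: run() below is the single-threaded PPO loop: play one full episode with the
    # current policy while recording states, one-hot actions, rewards and the policy's
    # probabilities, then call replay() once on that episode before starting the next.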
    def run(self):
        for e in range(self.EPISODES):
            state = self.reset(self.env)
            done, score, SAVING = False, 0, ''
            # Instantiate or reset games memory
            states, actions, rewards, predictions = [], [], [], []
            while not done:
                #self.env.render()
                # Actor picks an action
                action, prediction = self.act(state)
                # Retrieve new state, reward, and whether the state is terminal
                next_state, reward, done, _ = self.step(action, self.env, state)
                # Memorize (state, action, reward) for training
                states.append(state)
                action_onehot = np.zeros([self.action_size])
                action_onehot[action] = 1
                actions.append(action_onehot)
                rewards.append(reward)
                predictions.append(prediction)
                # Update current state
                state = next_state
                score += reward
                if done:
                    average = self.PlotModel(score, e)
                    # saving best models
                    if average >= self.max_average:
                        self.max_average = average
                        self.save()
                        SAVING = "SAVING"
                    else:
                        SAVING = ""
                    print("episode: {}/{}, score: {}, average: {:.2f} {}".format(e, self.EPISODES, score, average, SAVING))

                    self.replay(states, actions, rewards, predictions)

        self.env.close()

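    # Note: train() below is the multi-worker ("APPO") variant: it spawns n_threads
    # daemon threads, each with its own gym environment, all sharing the same Actor and
    # Critic. Every worker collects its own episode and then runs replay() while holding
    # self.lock, so the shared Keras models are only updated by one thread at a time.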
    def train(self, n_threads):
        self.env.close()
        # Instantiate one environment per thread
        envs = [gym.make(self.env_name) for i in range(n_threads)]

        # Create threads
        threads = [threading.Thread(
            target=self.train_threading,
            daemon=True,
            args=(self, envs[i], i)) for i in range(n_threads)]

        for t in threads:
            time.sleep(2)
            t.start()

        for t in threads:
            time.sleep(10)
            t.join()

    def train_threading(self, agent, env, thread):
        while self.episode < self.EPISODES:
            # Reset episode
            score, done, SAVING = 0, False, ''
            state = self.reset(env)
            # Instantiate or reset games memory
            states, actions, rewards, predictions = [], [], [], []
            while not done:
                action, prediction = agent.act(state)
                next_state, reward, done, _ = self.step(action, env, state)

                states.append(state)
                action_onehot = np.zeros([self.action_size])
                action_onehot[action] = 1
                actions.append(action_onehot)
                rewards.append(reward)
                predictions.append(prediction)

                score += reward
                state = next_state

            self.lock.acquire()
            self.replay(states, actions, rewards, predictions)
            self.lock.release()

            # Update episode count
            with self.lock:
                average = self.PlotModel(score, self.episode)
                # saving best models
                if average >= self.max_average:
                    self.max_average = average
                    self.save()
                    SAVING = "SAVING"
                else:
                    SAVING = ""
                print("episode: {}/{}, thread: {}, score: {}, average: {:.2f} {}".format(self.episode, self.EPISODES, thread, score, average, SAVING))
                if self.episode < self.EPISODES:
                    self.episode += 1
        env.close()

    def test(self, Actor_name, Critic_name):
        self.load(Actor_name, Critic_name)
        for e in range(100):
            state = self.reset(self.env)
            done = False
            score = 0
            while not done:
                self.env.render()
                action = np.argmax(self.Actor.predict(state))
                state, reward, done, _ = self.step(action, self.env, state)
                score += reward
                if done:
                    print("episode: {}/{}, score: {}".format(e, self.EPISODES, score))
                    break
        self.env.close()

if __name__ == "__main__":
    env_name = 'PongDeterministic-v4'
    #env_name = 'Pong-v0'
    agent = PPOAgent(env_name)
    #agent.run() # use as PPO
    agent.train(n_threads=5) # use as APPO
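    # Note: test() loads a previously saved Actor .h5; the file name produced by save()
    # depends on env_name, so the hard-coded 'Pong-v0_...' paths below may need to be
    # adjusted if training was run with 'PongDeterministic-v4'.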
    #agent.test('Models/Pong-v0_APPO_0.0001_Actor.h5', '')
    agent.test('Models/Pong-v0_APPO_0.0001_Actor_CNN.h5', '')
