Commit c81c0e6

Tested with TF 2.3.1
1 parent 964594f commit c81c0e6

File tree

1 file changed: +313 -0 lines changed

10_Pong-v0_A3C/Pong-v0_A3C_TF2.py

@@ -0,0 +1,313 @@
# Tutorial by www.pylessons.com
# Tutorial written for - Tensorflow 2.3.1

import os
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import random
import gym
import pylab
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K
import cv2
import threading
from threading import Thread, Lock
import time

gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    print(f'GPUs {gpus}')
    try: tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError: pass

def OurModel(input_shape, action_space, lr):
    X_input = Input(input_shape)

    #X = Conv2D(32, 8, strides=(4, 4), padding="valid", activation="elu", data_format="channels_first", input_shape=input_shape)(X_input)
    #X = Conv2D(64, 4, strides=(2, 2), padding="valid", activation="elu", data_format="channels_first")(X)
    #X = Conv2D(64, 3, strides=(1, 1), padding="valid", activation="elu", data_format="channels_first")(X)
    X = Flatten(input_shape=input_shape)(X_input)

    X = Dense(512, activation="elu", kernel_initializer='he_uniform')(X)
    #X = Dense(256, activation="elu", kernel_initializer='he_uniform')(X)
    #X = Dense(64, activation="elu", kernel_initializer='he_uniform')(X)

    action = Dense(action_space, activation="softmax", kernel_initializer='he_uniform')(X)
    value = Dense(1, kernel_initializer='he_uniform')(X)

    Actor = Model(inputs=X_input, outputs=action)
    Actor.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=lr))

    Critic = Model(inputs=X_input, outputs=value)
    Critic.compile(loss='mse', optimizer=RMSprop(lr=lr))

    return Actor, Critic

class A3CAgent:
    # Actor-Critic Main Optimization Algorithm
    def __init__(self, env_name):
        # Initialization
        # Environment and A3C parameters
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.action_size = self.env.action_space.n
        self.EPISODES, self.episode, self.max_average = 20000, 0, -21.0 # specific for Pong
        self.lock = Lock()
        self.lr = 0.000025

        self.ROWS = 80
        self.COLS = 80
        self.REM_STEP = 4

        # Instantiate plot memory
        self.scores, self.episodes, self.average = [], [], []

        self.Save_Path = 'Models'
        self.state_size = (self.REM_STEP, self.ROWS, self.COLS)

        if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path)
        self.path = '{}_A3C_{}'.format(self.env_name, self.lr)
        self.Model_name = os.path.join(self.Save_Path, self.path)

        # Create Actor-Critic network model
        self.Actor, self.Critic = OurModel(input_shape=self.state_size, action_space=self.action_size, lr=self.lr)

    def act(self, state):
        # Use the Actor network to predict an action probability distribution, then sample an action from it
        prediction = self.Actor.predict(state)[0]
        action = np.random.choice(self.action_size, p=prediction)
        return action

    def discount_rewards(self, reward):
        # Compute the gamma-discounted rewards over an episode
        gamma = 0.99 # discount rate
        running_add = 0
        discounted_r = np.zeros_like(reward)
        for i in reversed(range(0, len(reward))):
            if reward[i] != 0: # reset the sum, since this was a game boundary (Pong specific!)
                running_add = 0
            running_add = running_add * gamma + reward[i]
            discounted_r[i] = running_add

        discounted_r -= np.mean(discounted_r) # normalize the result
        discounted_r /= np.std(discounted_r) # divide by the standard deviation
        return discounted_r

    def replay(self, states, actions, rewards):
        # reshape memory to appropriate shape for training
        states = np.vstack(states)
        actions = np.vstack(actions)

        # Compute discounted rewards
        discounted_r = self.discount_rewards(rewards)

        # Get Critic network predictions
        value = self.Critic.predict(states)[:, 0]
        # Compute advantages
        advantages = discounted_r - value
        # training Actor and Critic networks
        self.Actor.fit(states, actions, sample_weight=advantages, epochs=1, verbose=0)
        self.Critic.fit(states, discounted_r, epochs=1, verbose=0)

    def load(self, Actor_name, Critic_name):
        self.Actor = load_model(Actor_name, compile=False)
        #self.Critic = load_model(Critic_name, compile=False)

    def save(self):
        self.Actor.save(self.Model_name + '_Actor.h5')
        #self.Critic.save(self.Model_name + '_Critic.h5')

    pylab.figure(figsize=(18, 9))
    def PlotModel(self, score, episode):
        self.scores.append(score)
        self.episodes.append(episode)
        self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
        if str(episode)[-2:] == "00": # much faster than episode % 100
            pylab.plot(self.episodes, self.scores, 'b')
            pylab.plot(self.episodes, self.average, 'r')
            pylab.ylabel('Score', fontsize=18)
            pylab.xlabel('Steps', fontsize=18)
            try:
                pylab.savefig(self.path+".png")
            except OSError:
                pass

        return self.average[-1]

    def imshow(self, image, rem_step=0):
        cv2.imshow(self.Model_name+str(rem_step), image[rem_step,...])
        if cv2.waitKey(25) & 0xFF == ord("q"):
            cv2.destroyAllWindows()
            return

    def GetImage(self, frame, image_memory):
        if image_memory.shape == (1,*self.state_size):
            image_memory = np.squeeze(image_memory)

        # crop the frame to 80x80 size
        frame_cropped = frame[35:195:2, ::2,:]
        if frame_cropped.shape[0] != self.ROWS or frame_cropped.shape[1] != self.COLS:
            # OpenCV resize function
            frame_cropped = cv2.resize(frame, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC)

        # convert to grayscale (numpy way)
        frame_rgb = 0.299*frame_cropped[:,:,0] + 0.587*frame_cropped[:,:,1] + 0.114*frame_cropped[:,:,2]

        # threshold everything to black and white (agent will train faster)
        frame_rgb[frame_rgb < 100] = 0
        frame_rgb[frame_rgb >= 100] = 255
        # convert to grayscale (OpenCV way)
        #frame_rgb = cv2.cvtColor(frame_cropped, cv2.COLOR_RGB2GRAY)

        # dividing by 255 scales the values to the 0-1 range
        new_frame = np.array(frame_rgb).astype(np.float32) / 255.0

        # push our data back by 1 frame, similar to how a deque works
        image_memory = np.roll(image_memory, 1, axis=0)

        # insert the new frame into the freed slot
        image_memory[0,:,:] = new_frame

        # show image frame
        #self.imshow(image_memory,0)
        #self.imshow(image_memory,1)
        #self.imshow(image_memory,2)
        #self.imshow(image_memory,3)

        return np.expand_dims(image_memory, axis=0)

    def reset(self, env):
        image_memory = np.zeros(self.state_size)
        frame = env.reset()
        for i in range(self.REM_STEP):
            state = self.GetImage(frame, image_memory)
        return state

    def step(self, action, env, image_memory):
        next_state, reward, done, info = env.step(action)
        next_state = self.GetImage(next_state, image_memory)
        return next_state, reward, done, info

    def run(self):
        for e in range(self.EPISODES):
            state = self.reset(self.env)
            done, score, SAVING = False, 0, ''
            # Instantiate or reset games memory
            states, actions, rewards = [], [], []
            while not done:
                #self.env.render()
                # Actor picks an action
                action = self.act(state)
                # Retrieve new state, reward, and whether the state is terminal
                next_state, reward, done, _ = self.step(action, self.env, state)
                # Memorize (state, action, reward) for training
                states.append(state)
                action_onehot = np.zeros([self.action_size])
                action_onehot[action] = 1
                actions.append(action_onehot)
                rewards.append(reward)
                # Update current state
                state = next_state
                score += reward
                if done:
                    average = self.PlotModel(score, e)
                    # saving best models
                    if average >= self.max_average:
                        self.max_average = average
                        self.save()
                        SAVING = "SAVING"
                    else:
                        SAVING = ""
                    print("episode: {}/{}, score: {}, average: {:.2f} {}".format(e, self.EPISODES, score, average, SAVING))

                    self.replay(states, actions, rewards)
        # close the environment when training is finished
        self.env.close()

    def train(self, n_threads):
        self.env.close()
        # Instantiate one environment per thread
        envs = [gym.make(self.env_name) for i in range(n_threads)]

        # Create threads
        threads = [threading.Thread(
                target=self.train_threading,
                daemon=True,
                args=(self,
                      envs[i],
                      i)) for i in range(n_threads)]

        for t in threads:
            time.sleep(2)
            t.start()

        for t in threads:
            time.sleep(10)
            t.join()

    def train_threading(self, agent, env, thread):
        while self.episode < self.EPISODES:
            # Reset episode
            score, done, SAVING = 0, False, ''
            state = self.reset(env)
            # Instantiate or reset games memory
            states, actions, rewards = [], [], []
            while not done:
                action = agent.act(state)
                next_state, reward, done, _ = self.step(action, env, state)

                states.append(state)
                action_onehot = np.zeros([self.action_size])
                action_onehot[action] = 1
                actions.append(action_onehot)
                rewards.append(reward)

                score += reward
                state = next_state

            self.lock.acquire()
            self.replay(states, actions, rewards)
            self.lock.release()

            # Update episode count
            with self.lock:
                average = self.PlotModel(score, self.episode)
                # saving best models
                if average >= self.max_average:
                    self.max_average = average
                    self.save()
                    SAVING = "SAVING"
                else:
                    SAVING = ""
                print("episode: {}/{}, thread: {}, score: {}, average: {:.2f} {}".format(self.episode, self.EPISODES, thread, score, average, SAVING))
                if(self.episode < self.EPISODES):
                    self.episode += 1
        env.close()

    def test(self, Actor_name, Critic_name):
        self.load(Actor_name, Critic_name)
        for e in range(100):
            state = self.reset(self.env)
            done = False
            score = 0
            while not done:
                self.env.render()
                action = np.argmax(self.Actor.predict(state))
                state, reward, done, _ = self.step(action, self.env, state)
                score += reward
                if done:
                    print("episode: {}/{}, score: {}".format(e, self.EPISODES, score))
                    break
        self.env.close()

if __name__ == "__main__":
    env_name = 'PongDeterministic-v4'
    #env_name = 'Pong-v0'
    agent = A3CAgent(env_name)
    #agent.run() # use as A2C
    agent.train(n_threads=5) # use as A3C
    #agent.test('Models/Pong-v0_A3C_2.5e-05_Actor.h5', '')
