
Commit 1075a23

Tested with TF 2.3.1
1 parent 81cc14e commit 1075a23

1 file changed: +231 -0 lines changed
Lines changed: 231 additions & 0 deletions
@@ -0,0 +1,231 @@
# Tutorial by www.pylessons.com
# Tutorial written for - Tensorflow 2.3.1

import os
import random
import gym
import pylab
import numpy as np
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K
import cv2

def OurModel(input_shape, action_space, lr):
    X_input = Input(input_shape)

    #X = Conv2D(32, 8, strides=(4, 4),padding="valid", activation="elu", data_format="channels_first", input_shape=input_shape)(X_input)
    #X = Conv2D(64, 4, strides=(2, 2),padding="valid", activation="elu", data_format="channels_first")(X)
    #X = Conv2D(64, 3, strides=(1, 1),padding="valid", activation="elu", data_format="channels_first")(X)
    X = Flatten(input_shape=input_shape)(X_input)

    X = Dense(512, activation="elu", kernel_initializer='he_uniform')(X)
    #X = Dense(256, activation="elu", kernel_initializer='he_uniform')(X)
    #X = Dense(64, activation="elu", kernel_initializer='he_uniform')(X)

    action = Dense(action_space, activation="softmax", kernel_initializer='he_uniform')(X)

    Actor = Model(inputs = X_input, outputs = action)
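    # Note: categorical cross-entropy between the one-hot encoded action and the softmax
    # output is -log(pi(a|s)); weighting each sample by its discounted return in replay()
    # turns this into the REINFORCE policy-gradient objective.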
    Actor.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=lr))

    return Actor

class PGAgent:
    # Policy Gradient Main Optimization Algorithm
    def __init__(self, env_name):
        # Initialization
        # Environment and PG parameters
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.action_size = self.env.action_space.n
        self.EPISODES, self.max_average = 10000, -21.0 # specific for pong
        self.lr = 0.000025

        self.ROWS = 80
        self.COLS = 80
        self.REM_STEP = 4

        # Memory for the episode rollout and for plotting
        self.states, self.actions, self.rewards = [], [], []
        self.scores, self.episodes, self.average = [], [], []

        self.Save_Path = 'Models'
        self.state_size = (self.REM_STEP, self.ROWS, self.COLS)
        self.image_memory = np.zeros(self.state_size)

        if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path)
        self.path = '{}_PG_{}'.format(self.env_name, self.lr)
        self.Model_name = os.path.join(self.Save_Path, self.path)

        # Create Actor network model
        self.Actor = OurModel(input_shape=self.state_size, action_space = self.action_size, lr=self.lr)

    def remember(self, state, action, reward):
        # store episode actions to memory
        self.states.append(state)
        action_onehot = np.zeros([self.action_size])
        action_onehot[action] = 1
        self.actions.append(action_onehot)
        self.rewards.append(reward)

    def act(self, state):
        # Use the network to predict the next action to take, using the model
        prediction = self.Actor.predict(state)[0]
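        # sample the action from the predicted probability distribution instead of taking
        # the argmax, so the stochastic policy keeps exploring during training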
        action = np.random.choice(self.action_size, p=prediction)
        return action

    def discount_rewards(self, reward):
        # Compute the gamma-discounted rewards over an episode
        gamma = 0.99 # discount rate
        running_add = 0
        discounted_r = np.zeros_like(reward)
        for i in reversed(range(0, len(reward))):
            if reward[i] != 0: # reset the sum, since this was a game boundary (pong specific!)
                running_add = 0
            running_add = running_add * gamma + reward[i]
            discounted_r[i] = running_add

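        # standardize the returns (zero mean, unit variance) so the per-sample weights
        # used in replay() stay on a consistent scale, which reduces gradient variance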
        discounted_r -= np.mean(discounted_r) # normalizing the result
        discounted_r /= np.std(discounted_r) # divide by standard deviation
        return discounted_r

    def replay(self):
        # reshape memory to appropriate shape for training
        states = np.vstack(self.states)
        actions = np.vstack(self.actions)

        # Compute discounted rewards
        discounted_r = self.discount_rewards(self.rewards)

        # training PG network
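        # sample_weight scales each (state, action) pair's cross-entropy loss by its
        # discounted return, so actions followed by higher returns become more likely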
        self.Actor.fit(states, actions, sample_weight=discounted_r, epochs=1, verbose=0)
        # reset training memory
        self.states, self.actions, self.rewards = [], [], []

    def load(self, Actor_name):
        self.Actor = load_model(Actor_name, compile=False)

    def save(self):
        self.Actor.save(self.Model_name + '.h5')

    pylab.figure(figsize=(18, 9))
    def PlotModel(self, score, episode):
        self.scores.append(score)
        self.episodes.append(episode)
        self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
        if str(episode)[-2:] == "00": # much faster than episode % 100
            pylab.plot(self.episodes, self.scores, 'b')
            pylab.plot(self.episodes, self.average, 'r')
            pylab.ylabel('Score', fontsize=18)
            pylab.xlabel('Episode', fontsize=18)
            try:
                pylab.savefig(self.path+".png")
            except OSError:
                pass

        return self.average[-1]

    def imshow(self, image, rem_step=0):
        cv2.imshow(self.Model_name+str(rem_step), image[rem_step,...])
        if cv2.waitKey(25) & 0xFF == ord("q"):
            cv2.destroyAllWindows()
            return

    def GetImage(self, frame):
        # cropping the frame to 80x80 size
        frame_cropped = frame[35:195:2, ::2,:]
        if frame_cropped.shape[0] != self.ROWS or frame_cropped.shape[1] != self.COLS:
            # OpenCV resize function
            frame_cropped = cv2.resize(frame, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC)

        # converting to grayscale (numpy way)
        frame_rgb = 0.299*frame_cropped[:,:,0] + 0.587*frame_cropped[:,:,1] + 0.114*frame_cropped[:,:,2]

        # convert everything to black and white (the agent trains faster)
        frame_rgb[frame_rgb < 100] = 0
        frame_rgb[frame_rgb >= 100] = 255
        # converting to grayscale (OpenCV way)
        #frame_rgb = cv2.cvtColor(frame_cropped, cv2.COLOR_RGB2GRAY)

        # dividing by 255 scales the values to the 0-1 range
        new_frame = np.array(frame_rgb).astype(np.float32) / 255.0

        # push the stored frames back by one position, similar to how a deque works
        self.image_memory = np.roll(self.image_memory, 1, axis = 0)

        # inserting the new frame into the freed slot
        self.image_memory[0,:,:] = new_frame

        # show image frame
        #self.imshow(self.image_memory,0)
        #self.imshow(self.image_memory,1)
        #self.imshow(self.image_memory,2)
        #self.imshow(self.image_memory,3)
        return np.expand_dims(self.image_memory, axis=0)

    def reset(self):
        frame = self.env.reset()
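        # fill the frame stack by pushing the same initial frame REM_STEP times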
        for i in range(self.REM_STEP):
            state = self.GetImage(frame)
        return state

    def step(self, action):
        next_state, reward, done, info = self.env.step(action)
        next_state = self.GetImage(next_state)
        return next_state, reward, done, info

    def run(self):
        for e in range(self.EPISODES):
            state = self.reset()
            done, score, SAVING = False, 0, ''
            while not done:
                #self.env.render()
                # Actor picks an action
                action = self.act(state)
                # Retrieve new state, reward, and whether the state is terminal
                next_state, reward, done, _ = self.step(action)
                # Memorize (state, action, reward) for training
                self.remember(state, action, reward)
                # Update current state
                state = next_state
                score += reward
                if done:
                    average = self.PlotModel(score, e)
                    # saving best models
                    if average >= self.max_average:
                        self.max_average = average
                        self.save()
                        SAVING = "SAVING"
                    else:
                        SAVING = ""
                    print("episode: {}/{}, score: {}, average: {:.2f} {}".format(e, self.EPISODES, score, average, SAVING))

                    self.replay()

        # close the environment when training is finished
        self.env.close()

    def test(self, Model_name):
        self.load(Model_name)
        for e in range(100):
            state = self.reset()
            done = False
            score = 0
            while not done:
                self.env.render()
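                # act greedily at test time: take the most probable action instead of sampling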
                action = np.argmax(self.Actor.predict(state))
                state, reward, done, _ = self.step(action)
                score += reward
                if done:
                    print("episode: {}/{}, score: {}".format(e, self.EPISODES, score))
                    break
        self.env.close()

if __name__ == "__main__":
    #env_name = 'Pong-v0'
    env_name = 'PongDeterministic-v4'
    agent = PGAgent(env_name)
    agent.run()
    #agent.test('Models/PongDeterministic-v4_PG_2.5e-05.h5')
    #agent.test('Models/Pong-v0_PG_2.5e-05.h5')
