Commit 81cc14e

Tested with TF 2.3.1
1 parent f8bd4d5 commit 81cc14e

1 file changed: 339 additions, 0 deletions

@@ -0,0 +1,339 @@
# Tutorial by www.pylessons.com
# Tutorial written for - Tensorflow 2.3.1

import os
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import random
import gym
import pylab
import numpy as np
from collections import deque
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K
from PER import *
import cv2

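# Dueling Double DQN agent with Prioritized Experience Replay (PER) for Pong-v0.
# States are channels-first stacks of REM_STEP=4 grayscale 80x80 frames.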
def OurModel(input_shape, action_space, dueling):
    X_input = Input(input_shape)
    X = X_input

    #X = Conv2D(64, 5, strides=(3, 3),padding="valid", input_shape=input_shape, activation="relu", data_format="channels_first")(X)
    X = Conv2D(32, 8, strides=(4, 4), padding="valid", input_shape=input_shape, activation="relu", data_format="channels_first")(X)
    X = Conv2D(64, 4, strides=(2, 2), padding="valid", activation="relu", data_format="channels_first")(X)
    X = Conv2D(64, 3, strides=(1, 1), padding="valid", activation="relu", data_format="channels_first")(X)
    X = Flatten()(X)

    # 'Dense' is the basic form of a neural network layer
    X = Dense(512, activation="relu", kernel_initializer='he_uniform')(X)

    X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X)

    X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)

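    # Dueling head: a scalar state-value stream V(s) and an action-advantage
    # stream A(s, a) are combined as Q(s, a) = V(s) + (A(s, a) - mean(A(s, .))),
    # so the advantages are zero-centred before being added to the value.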
    if dueling:
        state_value = Dense(1, kernel_initializer='he_uniform')(X)
        state_value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), output_shape=(action_space,))(state_value)

        action_advantage = Dense(action_space, kernel_initializer='he_uniform')(X)
        action_advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), output_shape=(action_space,))(action_advantage)

        X = Add()([state_value, action_advantage])
    else:
        # Output layer with one linear node per action
        X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X)

    model = Model(inputs = X_input, outputs = X)
    #model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"])
    #model.compile(optimizer=Adam(lr=0.00025), loss='mean_squared_error')
    model.compile(optimizer=Adam(lr=0.00005), loss='mean_squared_error')

    model.summary()
    return model

class DQNAgent:
    def __init__(self, env_name):
        self.env_name = env_name
        self.env = gym.make(env_name)
        #self.env.seed(0)
        self.action_size = self.env.action_space.n
        self.EPISODES = 1000

        # Instantiate memory
        memory_size = 25000
        self.MEMORY = Memory(memory_size)
        self.memory = deque(maxlen=memory_size)

        self.gamma = 0.99  # discount rate

        # EXPLORATION HYPERPARAMETERS for epsilon and epsilon greedy strategy
        self.epsilon = 1.0  # exploration probability at start
        self.epsilon_min = 0.02  # minimum exploration probability
        self.epsilon_decay = 0.00002  # exponential decay rate for exploration prob

        self.batch_size = 32

        # defining model parameters
        self.ddqn = True  # use double deep Q-network
        self.dueling = True  # use dueling network
        self.epsilon_greedy = False  # use epsilon greedy strategy
        self.USE_PER = True  # use prioritized experience replay

        self.Save_Path = 'Models'
        if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path)
        self.scores, self.episodes, self.average = [], [], []

        self.Model_name = os.path.join(self.Save_Path, self.env_name+"_CNN.h5")

        self.ROWS = 80
        self.COLS = 80
        self.REM_STEP = 4
        self.update_model_steps = 1000

        self.state_size = (self.REM_STEP, self.ROWS, self.COLS)
        self.image_memory = np.zeros(self.state_size)

        # create main model and target model
        self.model = OurModel(input_shape=self.state_size, action_space = self.action_size, dueling = self.dueling)
        self.target_model = OurModel(input_shape=self.state_size, action_space = self.action_size, dueling = self.dueling)

    # after some time interval, update the target model to be the same as the main model
    def update_target_model(self, game_steps):
        if game_steps % self.update_model_steps == 0:
            self.target_model.set_weights(self.model.get_weights())
        return

    def remember(self, state, action, reward, next_state, done):
        experience = state, action, reward, next_state, done
        if self.USE_PER:
            self.MEMORY.store(experience)
        else:
            self.memory.append((experience))

    def act(self, state, decay_step):
        # EPSILON GREEDY STRATEGY
        if self.epsilon_greedy:
            # Here we'll use an improved version of our epsilon greedy strategy for Q-learning
            explore_probability = self.epsilon_min + (self.epsilon - self.epsilon_min) * np.exp(-self.epsilon_decay * decay_step)
        # OLD EPSILON STRATEGY
        else:
            if self.epsilon > self.epsilon_min:
                self.epsilon *= (1-self.epsilon_decay)
            explore_probability = self.epsilon

        if explore_probability > np.random.rand():
            # Make a random action (exploration)
            return random.randrange(self.action_size), explore_probability
        else:
            # Get action from Q-network (exploitation)
            # Estimate the Q values for this state
            # Take the biggest Q value (= the best action)
            return np.argmax(self.model.predict(state)), explore_probability

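    # replay() samples a minibatch (uniformly from the deque, or by priority from
    # the PER tree) and builds Bellman targets. With Double DQN the main model
    # selects a' = argmax_a Q(s', a) and the target model supplies Q_target(s', a');
    # after fitting, PER priorities are refreshed with the absolute TD errors.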
    def replay(self):
        if self.USE_PER:
            # Sample minibatch from the PER memory
            tree_idx, minibatch = self.MEMORY.sample(self.batch_size)
        else:
            if len(self.memory) > self.batch_size:
                # Randomly sample minibatch from the deque memory
                minibatch = random.sample(self.memory, self.batch_size)
            else:
                return

        state = np.zeros((self.batch_size, *self.state_size), dtype=np.float32)
        action = np.zeros(self.batch_size, dtype=np.int32)
        reward = np.zeros(self.batch_size, dtype=np.float32)
        next_state = np.zeros((self.batch_size, *self.state_size), dtype=np.float32)
        done = np.zeros(self.batch_size, dtype=np.uint8)

        # unpack the minibatch before prediction
        # for speed this could be done on the tensor level,
        # but it is easier to understand using a loop
        for i in range(len(minibatch)):
            state[i], action[i], reward[i], next_state[i], done[i] = minibatch[i]

        # do batch prediction to save time
        # predict Q-values for the starting states using the main network
        target = self.model.predict(state)
        target_old = np.array(target)
        # predict Q-values for the next states using the main network (to select the best action)
        target_next = self.model.predict(next_state)
        # predict Q-values for the next states using the target network
        target_val = self.target_model.predict(next_state)

        for i in range(len(minibatch)):
            # correction on the Q value for the action used
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                # the key point of Double DQN:
                # action selection comes from the main model,
                # the value update comes from the target model
                if self.ddqn: # Double - DQN
                    # current Q Network selects the action
                    # a'_max = argmax_a' Q(s', a')
                    a = np.argmax(target_next[i])
                    # target Q Network evaluates the action
                    # Q_max = Q_target(s', a'_max)
                    target[i][action[i]] = reward[i] + self.gamma * target_val[i][a]
                else: # Standard - DQN
                    # DQN chooses the max Q value among next actions
                    # selection and evaluation of the action are both on the target Q Network
                    # Q_max = max_a' Q_target(s', a')
                    # using the target model even with the simple DQN rule gives better performance
                    target[i][action[i]] = reward[i] + self.gamma * np.amax(target_val[i])

        if self.USE_PER:
            indices = np.arange(self.batch_size, dtype=np.int32)
            absolute_errors = np.abs(target_old[indices, action]-target[indices, action])

            # Update priorities with the absolute TD errors
            self.MEMORY.batch_update(tree_idx, absolute_errors)

        # Train the Neural Network with batches
        self.model.fit(state, target, batch_size=self.batch_size, verbose=0)

    def load(self, name):
        self.model = load_model(name)

    def save(self, name):
        self.model.save(name)

    pylab.figure(figsize=(18, 9))
    def PlotModel(self, score, episode):
        self.scores.append(score)
        self.episodes.append(episode)
        self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
        pylab.plot(self.episodes, self.average, 'r')
        pylab.plot(self.episodes, self.scores, 'b')
        pylab.ylabel('Score', fontsize=18)
        pylab.xlabel('Games', fontsize=18)
        dqn = '_DQN'
        dueling = ''
        greedy = ''
        PER = ''
        if self.ddqn: dqn = '_DDQN'
        if self.dueling: dueling = '_Dueling'
        if self.epsilon_greedy: greedy = '_Greedy'
        if self.USE_PER: PER = '_PER'
        try:
            pylab.savefig(self.env_name+dqn+dueling+greedy+PER+"_CNN.png")
        except OSError:
            pass
        # name the model after its configuration, so there is no need to worry about it when doing a lot of experiments
        self.Model_name = os.path.join(self.Save_Path, self.env_name+dqn+dueling+greedy+PER+"_CNN.h5")

        return self.average[-1]

    def imshow(self, image, rem_step=0):
        cv2.imshow("cartpole"+str(rem_step), image[rem_step,...])
        if cv2.waitKey(25) & 0xFF == ord("q"):
            cv2.destroyAllWindows()
        return

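    # GetImage() preprocesses one rendered frame: crop/downsample to 80x80,
    # convert to grayscale with luminosity weights, scale to [0, 1], and roll it
    # into the 4-frame image_memory stack that forms the network input.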
    def GetImage(self, frame):
        self.env.render()

        # cropping frame to 80x80 size
        frame_cropped = frame[35:195:2, ::2,:]
        if frame_cropped.shape[0] != self.COLS or frame_cropped.shape[1] != self.ROWS:
            # OpenCV resize function
            frame_cropped = cv2.resize(frame, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC)

        # converting to grayscale (numpy way, luminosity weights)
        frame_rgb = 0.299*frame_cropped[:,:,0] + 0.587*frame_cropped[:,:,1] + 0.114*frame_cropped[:,:,2]
        # converting to grayscale (OpenCV way)
        #frame_rgb = cv2.cvtColor(frame_cropped, cv2.COLOR_RGB2GRAY)

        # dividing by 255 scales the values to the 0-1 range
        new_frame = np.array(frame_rgb).astype(np.float32) / 255.0

        # shift the stored frames by 1, similar to how a deque works
        self.image_memory = np.roll(self.image_memory, 1, axis = 0)

        # insert the new frame into the freed slot
        self.image_memory[0,:,:] = new_frame

        # show image frame
        #self.imshow(self.image_memory,0)
        #self.imshow(self.image_memory,1)
        #self.imshow(self.image_memory,2)
        #self.imshow(self.image_memory,3)

        return np.expand_dims(self.image_memory, axis=0)

    def reset(self):
        frame = self.env.reset()
        for i in range(self.REM_STEP):
            state = self.GetImage(frame)
        return state

    def step(self, action):
        next_state, reward, done, info = self.env.step(action)
        next_state = self.GetImage(next_state)
        return next_state, reward, done, info

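    # run() is the training loop: each step it acts epsilon-greedily, stores the
    # transition, trains via replay(), and syncs the target network every
    # update_model_steps steps; each episode it plots scores and saves the best model.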
    def run(self):
        decay_step = 0
        max_average = -21.0
        for e in range(self.EPISODES):
            state = self.reset()
            done = False
            score = 0
            SAVING = ''
            while not done:
                decay_step += 1
                action, explore_probability = self.act(state, decay_step)
                next_state, reward, done, _ = self.step(action)
                self.remember(state, action, reward, next_state, done)
                state = next_state
                score += reward

                if done:
                    # every episode, plot the result
                    average = self.PlotModel(score, e)

                    # saving best models
                    if average >= max_average:
                        max_average = average
                        self.save(self.Model_name)
                        SAVING = "SAVING"
                    else:
                        SAVING = ""
                    print("episode: {}/{}, score: {}, e: {:.2f}, average: {:.2f} {}".format(e, self.EPISODES, score, explore_probability, average, SAVING))

                # update target model
                self.update_target_model(decay_step)

                # train model
                self.replay()

        # close environment when training is finished
        self.env.close()

    def test(self, Model_name):
        self.load(Model_name)
        for e in range(self.EPISODES):
            state = self.reset()
            done = False
            score = 0
            while not done:
                self.env.render()
                action = np.argmax(self.model.predict(state))
                state, reward, done, _ = self.step(action)
                score += reward
                if done:
                    print("episode: {}/{}, score: {}".format(e, self.EPISODES, score))
                    break
        self.env.close()

if __name__ == "__main__":
    env_name = 'Pong-v0'
    agent = DQNAgent(env_name)
    agent.run()
    #agent.test('Models/Pong-v0_DDQN_CNN.h5')
