Commit 1e531bf

Tested with TF 2.3.1

1 parent 3576709 commit 1e531bf

1 file changed: 247 additions & 0 deletions

@@ -0,0 +1,247 @@
# Tutorial by www.pylessons.com
# Tutorial written for - Tensorflow 2.3.1

import os
import random
import gym
import pylab
import numpy as np
from collections import deque
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Lambda, Add
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K

def OurModel(input_shape, action_space, dueling):
    X_input = Input(input_shape)
    X = X_input

    # 'Dense' is the basic form of a neural network layer
    # Input Layer of state size(4) and Hidden Layer with 512 nodes
    X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X)

    # Hidden layer with 256 nodes
    X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X)

    # Hidden layer with 64 nodes
    X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)

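    # Dueling head: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)).
    # Subtracting the mean advantage keeps V and A identifiable: adding a
    # constant to every advantage and removing it from V would otherwise
    # leave Q unchanged.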
    if dueling:
        state_value = Dense(1, kernel_initializer='he_uniform')(X)
        state_value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), output_shape=(action_space,))(state_value)

        action_advantage = Dense(action_space, kernel_initializer='he_uniform')(X)
        # subtract the per-state mean advantage (mean over the action axis)
        action_advantage = Lambda(lambda a: a - K.mean(a, axis=1, keepdims=True), output_shape=(action_space,))(action_advantage)

        X = Add()([state_value, action_advantage])
    else:
        # Output Layer with # of actions: 2 nodes (left, right)
        X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X)

    model = Model(inputs=X_input, outputs=X)
    model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"])

    model.summary()
    return model
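
# For CartPole-v1 the state has 4 dimensions and there are 2 discrete actions,
# so the agent below calls OurModel(input_shape=(4,), action_space=2, dueling=True),
# giving a compiled model that maps (batch, 4) states to (batch, 2) Q-values.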

class DQNAgent:
    def __init__(self, env_name):
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.env.seed(0)
        # by default, CartPole-v1 has max episode steps = 500
        self.env._max_episode_steps = 4000
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n

        self.EPISODES = 1000
        self.memory = deque(maxlen=2000)

        self.gamma = 0.95          # discount rate
        self.epsilon = 1.0         # exploration rate
        self.epsilon_min = 0.01    # minimum exploration probability
        self.epsilon_decay = 0.999 # exponential decay rate for exploration prob
        self.batch_size = 32
        self.train_start = 1000

        # defining model parameters
        self.ddqn = True         # use double deep Q-network
        self.Soft_Update = False # use soft parameter update
        self.dueling = True      # use dueling network

        self.TAU = 0.1 # target network soft update hyperparameter

        self.Save_Path = 'Models'
        if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path)
        self.scores, self.episodes, self.average = [], [], []

        if self.ddqn:
            print("----------Double DQN--------")
            self.Model_name = os.path.join(self.Save_Path, "Dueling DDQN_"+self.env_name+".h5")
        else:
            print("-------------DQN------------")
            self.Model_name = os.path.join(self.Save_Path, "Dueling DQN_"+self.env_name+".h5")

        # create main model and target model
        self.model = OurModel(input_shape=(self.state_size,), action_space=self.action_size, dueling=self.dueling)
        self.target_model = OurModel(input_shape=(self.state_size,), action_space=self.action_size, dueling=self.dueling)

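    # The target network provides the bootstrap values used in replay(); keeping
    # it frozen between updates stabilizes training. With Soft_Update=False the
    # online weights are copied wholesale; with Soft_Update=True they are blended:
    #   theta_target <- TAU * theta_online + (1 - TAU) * theta_target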
    # after some time interval, update the target model to match the main model
    def update_target_model(self):
        if not self.Soft_Update and self.ddqn:
            self.target_model.set_weights(self.model.get_weights())
            return
        if self.Soft_Update and self.ddqn:
            q_model_theta = self.model.get_weights()
            target_model_theta = self.target_model.get_weights()
            counter = 0
            for q_weight, target_weight in zip(q_model_theta, target_model_theta):
                target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU
                target_model_theta[counter] = target_weight
                counter += 1
            self.target_model.set_weights(target_model_theta)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if len(self.memory) > self.train_start:
            if self.epsilon > self.epsilon_min:
                self.epsilon *= self.epsilon_decay

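    # Exploration schedule: once the buffer holds train_start transitions, epsilon
    # shrinks by a factor of 0.999 per stored step, so it reaches epsilon_min after
    # roughly ln(0.01) / ln(0.999) ~= 4600 decay steps.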
    def act(self, state):
        if np.random.random() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            return np.argmax(self.model.predict(state))

    def replay(self):
        if len(self.memory) < self.train_start:
            return
        # Randomly sample a minibatch from the memory
        minibatch = random.sample(self.memory, self.batch_size)

        state = np.zeros((self.batch_size, self.state_size))
        next_state = np.zeros((self.batch_size, self.state_size))
        action, reward, done = [], [], []

        # do this before prediction
        # for speedup, this could be done on the tensor level
        # but it is easier to understand using a loop
        for i in range(self.batch_size):
            state[i] = minibatch[i][0]
            action.append(minibatch[i][1])
            reward.append(minibatch[i][2])
            next_state[i] = minibatch[i][3]
            done.append(minibatch[i][4])

        # do batch prediction to save speed
        # predict Q-values for the starting states using the main network
        target = self.model.predict(state)
        # predict the best action in the ending states using the main network
        target_next = self.model.predict(next_state)
        # predict Q-values for the ending states using the target network
        target_val = self.target_model.predict(next_state)

        for i in range(len(minibatch)):
            # correction on the Q value for the action used
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                if self.ddqn: # Double - DQN
                    # current Q Network selects the action
                    # a'_max = argmax_a' Q(s', a')
                    a = np.argmax(target_next[i])
                    # target Q Network evaluates the action
                    # Q_max = Q_target(s', a'_max)
                    target[i][action[i]] = reward[i] + self.gamma * (target_val[i][a])
                else: # Standard - DQN
                    # DQN chooses the max Q value among next actions
                    # selection and evaluation of the action are both on the target Q Network
                    # Q_max = max_a' Q_target(s', a')
                    target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_val[i]))

        # Train the Neural Network with batches
        self.model.fit(state, target, batch_size=self.batch_size, verbose=0)
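
    # NOTE: hypothetical alternative, not part of the original tutorial. The
    # per-sample loop above can be expressed with NumPy indexing; a minimal
    # sketch of the same target computation, under the same assumptions, is:
    def replay_vectorized(self):
        if len(self.memory) < self.train_start:
            return
        minibatch = random.sample(self.memory, self.batch_size)
        state = np.vstack([m[0] for m in minibatch])
        action = np.array([m[1] for m in minibatch])
        reward = np.array([m[2] for m in minibatch], dtype=np.float32)
        next_state = np.vstack([m[3] for m in minibatch])
        not_done = 1.0 - np.array([m[4] for m in minibatch], dtype=np.float32)

        target = self.model.predict(state)
        target_next = self.model.predict(next_state)
        target_val = self.target_model.predict(next_state)

        rows = np.arange(self.batch_size)
        if self.ddqn:
            # online network selects a'_max, target network evaluates it
            q_next = target_val[rows, np.argmax(target_next, axis=1)]
        else:
            # standard DQN: selection and evaluation both on the target network
            q_next = np.amax(target_val, axis=1)
        # terminal transitions keep only the immediate reward
        target[rows, action] = reward + self.gamma * q_next * not_done

        self.model.fit(state, target, batch_size=self.batch_size, verbose=0)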

    def load(self, name):
        self.model = load_model(name)

    def save(self, name):
        self.model.save(name)
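    # save()/load() round-trip the full compiled model (architecture, weights and
    # optimizer state) through a single .h5 file. One caveat: the Lambda layers in
    # the dueling head are serialized as Python bytecode, so the saved file is best
    # reloaded with the same Python/TensorFlow version that produced it.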

    pylab.figure(figsize=(18, 9))
    def PlotModel(self, score, episode):
        self.scores.append(score)
        self.episodes.append(episode)
        self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
        pylab.plot(self.episodes, self.average, 'r')
        pylab.plot(self.episodes, self.scores, 'b')
        pylab.ylabel('Score', fontsize=18)
        pylab.xlabel('Episode', fontsize=18)
        dqn = 'DQN_'
        softupdate = ''
        dueling = ''
        if self.ddqn: dqn = 'DDQN_'
        if self.Soft_Update: softupdate = '_soft'
        if self.dueling: dueling = '_Dueling'
        try:
            pylab.savefig(dqn+self.env_name+softupdate+dueling+".png")
        except OSError:
            pass

        return str(self.average[-1])[:5]

    def run(self):
        for e in range(self.EPISODES):
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            done = False
            i = 0
            while not done:
                #self.env.render()
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, [1, self.state_size])
                if not done or i == self.env._max_episode_steps-1:
                    reward = reward
                else:
                    # penalize failing before the step limit is reached
                    reward = -100
                self.remember(state, action, reward, next_state, done)
                state = next_state
                i += 1
                if done:
                    # at the end of every episode, update the target model
                    self.update_target_model()

                    # every episode, plot the result
                    average = self.PlotModel(i, e)

                    print("episode: {}/{}, score: {}, e: {:.2}, average: {}".format(e, self.EPISODES, i, self.epsilon, average))
                    if i == self.env._max_episode_steps:
                        print("Saving trained model as", self.Model_name)
                        #self.save(self.Model_name)
                        break
                self.replay()

    def test(self):
        self.load(self.Model_name)
        for e in range(self.EPISODES):
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            done = False
            i = 0
            while not done:
                self.env.render()
                action = np.argmax(self.model.predict(state))
                next_state, reward, done, _ = self.env.step(action)
                state = np.reshape(next_state, [1, self.state_size])
                i += 1
                if done:
                    print("episode: {}/{}, score: {}".format(e, self.EPISODES, i))
                    break

if __name__ == "__main__":
    env_name = 'CartPole-v1'
    agent = DQNAgent(env_name)
    agent.run()
    #agent.test()
