Commit 4016a9c

Tested with TF 2.3.1

1 parent e8a68f1 commit 4016a9c

1 file changed: 276 additions, 0 deletions
# Tutorial by www.pylessons.com
# Tutorial written for - Tensorflow 2.3.1

import os
import random
import gym
import pylab
import numpy as np
from collections import deque
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Lambda, Add
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K
from PER import *
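# NOTE: the star import above is expected to provide the `Memory` class used by
# DQNAgent below (the prioritized-experience-replay buffer from the companion
# PER.py file of this tutorial series); this file does not define it itself.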

def OurModel(input_shape, action_space, dueling):
    X_input = Input(input_shape)
    X = X_input

    # 'Dense' is the basic form of a neural network layer
    # Input Layer of state size(4) and Hidden Layer with 512 nodes
    X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X)

    # Hidden layer with 256 nodes
    X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X)

    # Hidden layer with 64 nodes
    X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)

    if dueling:
        state_value = Dense(1, kernel_initializer='he_uniform')(X)
        state_value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), output_shape=(action_space,))(state_value)

        action_advantage = Dense(action_space, kernel_initializer='he_uniform')(X)
        action_advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], keepdims=True), output_shape=(action_space,))(action_advantage)

        X = Add()([state_value, action_advantage])
    else:
        # Output Layer with # of actions: 2 nodes (left, right)
        X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X)

    model = Model(inputs=X_input, outputs=X)
    model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"])

    model.summary()
    return model

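# NOTE on the dueling head above: the network estimates a scalar state value V(s)
# and per-action advantages A(s, a), then combines them as
#     Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)),
# the standard dueling-DQN aggregation (Wang et al., 2016); subtracting the mean
# keeps V and A identifiable. The Add() call appears to rely on broadcasting the
# (batch, 1)-shaped value across the action dimension; the declared
# output_shape=(action_space,) on the value Lambda is only a shape hint.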
class DQNAgent:
    def __init__(self, env_name):
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.env.seed(0)
        # by default, CartPole-v1 has max episode steps = 500
        self.env._max_episode_steps = 4000
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n

        self.EPISODES = 1000
        memory_size = 10000
        self.MEMORY = Memory(memory_size)
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95  # discount rate

        # EXPLORATION HYPERPARAMETERS for the epsilon greedy strategy
        self.epsilon = 1.0  # exploration probability at start
        self.epsilon_min = 0.01  # minimum exploration probability
        self.epsilon_decay = 0.0005  # exponential decay rate for exploration prob

        self.batch_size = 32

        # defining model parameters
        self.ddqn = True  # use double deep Q-network
        self.Soft_Update = False  # use soft parameter update
        self.dueling = True  # use dueling network
        self.epsilon_greedy = False  # use epsilon greedy strategy
        self.USE_PER = True  # use prioritized experience replay

        self.TAU = 0.1  # target network soft update hyperparameter

        self.Save_Path = 'Models'
        if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path)
        self.scores, self.episodes, self.average = [], [], []

        self.Model_name = os.path.join(self.Save_Path, self.env_name+"_e_greedy.h5")

        # create main model and target model
        self.model = OurModel(input_shape=(self.state_size,), action_space=self.action_size, dueling=self.dueling)
        self.target_model = OurModel(input_shape=(self.state_size,), action_space=self.action_size, dueling=self.dueling)

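    # NOTE: two replay buffers are created above; MEMORY (prioritized) is used
    # when USE_PER is True, the plain deque (uniform sampling) otherwise. Only
    # one of them is filled per run, depending on that flag.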
    # after some time interval, update the target model to match the main model
    def update_target_model(self):
        if not self.Soft_Update and self.ddqn:
            self.target_model.set_weights(self.model.get_weights())
            return
        if self.Soft_Update and self.ddqn:
            q_model_theta = self.model.get_weights()
            target_model_theta = self.target_model.get_weights()
            counter = 0
            for q_weight, target_weight in zip(q_model_theta, target_model_theta):
                target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU
                target_model_theta[counter] = target_weight
                counter += 1
            self.target_model.set_weights(target_model_theta)

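    # NOTE on the soft update above: with Soft_Update enabled the target weights
    # follow the Polyak average
    #     theta_target <- TAU * theta_online + (1 - TAU) * theta_target,
    # with TAU = 0.1; otherwise (the default, with ddqn enabled) the target
    # network is hard-copied from the main network at the end of every episode.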
    def remember(self, state, action, reward, next_state, done):
        experience = state, action, reward, next_state, done
        if self.USE_PER:
            self.MEMORY.store(experience)
        else:
            self.memory.append(experience)

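    # NOTE: with USE_PER the transition is handed to Memory.store(); in the
    # SumTree-based PER implementation that usually accompanies this tutorial,
    # new samples are stored with maximal priority so they are replayed at least
    # once. Without PER, the deque simply drops the oldest transitions once it
    # holds 2000 of them.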
    def act(self, state, decay_step):
        # EPSILON GREEDY STRATEGY
        if self.epsilon_greedy:
            # Here we'll use an improved version of our epsilon greedy strategy for Q-learning
            explore_probability = self.epsilon_min + (self.epsilon - self.epsilon_min) * np.exp(-self.epsilon_decay * decay_step)
        # OLD EPSILON STRATEGY
        else:
            if self.epsilon > self.epsilon_min:
                self.epsilon *= (1-self.epsilon_decay)
            explore_probability = self.epsilon

        if explore_probability > np.random.rand():
            # Make a random action (exploration)
            return random.randrange(self.action_size), explore_probability
        else:
            # Get action from Q-network (exploitation)
            # Estimate the Q values for the current state
            # Take the biggest Q value (= the best action)
            return np.argmax(self.model.predict(state)), explore_probability

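    # NOTE on act() above: with epsilon_greedy enabled, the exploration rate
    # decays exponentially with the global step,
    #     epsilon(t) = epsilon_min + (epsilon - epsilon_min) * exp(-epsilon_decay * t),
    # otherwise epsilon is shrunk multiplicatively, epsilon <- epsilon * (1 - epsilon_decay),
    # until it reaches epsilon_min. Exploitation picks argmax_a Q(s, a) from the main network.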
    def replay(self):
        if self.USE_PER:
            tree_idx, minibatch = self.MEMORY.sample(self.batch_size)
        else:
            minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size))

        state = np.zeros((self.batch_size, self.state_size))
        next_state = np.zeros((self.batch_size, self.state_size))
        action, reward, done = [], [], []

        # do this before prediction
        # for speedup, this could be done on the tensor level
        # but easier to understand using a loop
        for i in range(self.batch_size):
            state[i] = minibatch[i][0]
            action.append(minibatch[i][1])
            reward.append(minibatch[i][2])
            next_state[i] = minibatch[i][3]
            done.append(minibatch[i][4])

        # do batch prediction to save time
        # predict Q-values for the starting state using the main network
        target = self.model.predict(state)
        target_old = np.array(target)
        # predict the best action in the ending state using the main network
        target_next = self.model.predict(next_state)
        # predict Q-values for the ending state using the target network
        target_val = self.target_model.predict(next_state)

        for i in range(len(minibatch)):
            # correction on the Q value for the action used
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                if self.ddqn: # Double - DQN
                    # current Q Network selects the action
                    # a'_max = argmax_a' Q(s', a')
                    a = np.argmax(target_next[i])
                    # target Q Network evaluates the action
                    # Q_max = Q_target(s', a'_max)
                    target[i][action[i]] = reward[i] + self.gamma * (target_val[i][a])
                else: # Standard - DQN
                    # DQN chooses the max Q value among the next actions
                    # selection and evaluation of the action are both on the target Q Network
                    # Q_max = max_a' Q_target(s', a')
                    target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_val[i]))

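        # NOTE on the loop above: with ddqn the update uses the double-DQN target
        #     y = r + gamma * Q_target(s', argmax_a Q_online(s', a)),
        # i.e. the online network selects the next action and the target network
        # evaluates it, which reduces the overestimation bias of plain DQN, where
        #     y = r + gamma * max_a Q_target(s', a).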
        if self.USE_PER:
            indices = np.arange(self.batch_size, dtype=np.int32)
            absolute_errors = np.abs(target_old[indices, np.array(action)]-target[indices, np.array(action)])
            # Update priority
            self.MEMORY.batch_update(tree_idx, absolute_errors)

        # Train the Neural Network with batches
        self.model.fit(state, target, batch_size=self.batch_size, verbose=0)

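    # NOTE: the PER priorities are refreshed with the absolute TD error
    # |Q_old(s, a) - Q_new(s, a)| of each sampled transition; Memory.batch_update()
    # is assumed to write these values back into the priority tree so that
    # transitions with larger errors are sampled more often.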
    def load(self, name):
        self.model = load_model(name)

    def save(self, name):
        self.model.save(name)

    pylab.figure(figsize=(18, 9))
    def PlotModel(self, score, episode):
        self.scores.append(score)
        self.episodes.append(episode)
        self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
        pylab.plot(self.episodes, self.average, 'r')
        pylab.plot(self.episodes, self.scores, 'b')
        pylab.ylabel('Score', fontsize=18)
        pylab.xlabel('Episode', fontsize=18)
        dqn = 'DQN_'
        softupdate = ''
        dueling = ''
        greedy = ''
        PER = ''
        if self.ddqn: dqn = 'DDQN_'
        if self.Soft_Update: softupdate = '_soft'
        if self.dueling: dueling = '_Dueling'
        if self.epsilon_greedy: greedy = '_Greedy'
        if self.USE_PER: PER = '_PER'
        try:
            pylab.savefig(dqn+self.env_name+softupdate+dueling+greedy+PER+".png")
        except OSError:
            pass

        return str(self.average[-1])[:5]

    def run(self):
        decay_step = 0
        for e in range(self.EPISODES):
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            done = False
            i = 0
            while not done:
                #self.env.render()
                decay_step += 1
                action, explore_probability = self.act(state, decay_step)
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, [1, self.state_size])
                if done and i != self.env._max_episode_steps-1:
                    reward = -100
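                # NOTE on the shaping above: an episode that terminates before the
                # raised step limit (pole fell / cart left the track) is penalized
                # with -100 instead of CartPole-v1's usual +1 step reward; this is
                # a tutorial convention meant to speed up learning, not part of the
                # environment itself.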
                self.remember(state, action, reward, next_state, done)
                state = next_state
                i += 1
                if done:
                    # the episode ended, so update the target model
                    self.update_target_model()

                    # every episode, plot the result
                    average = self.PlotModel(i, e)

                    print("episode: {}/{}, score: {}, e: {:.2}, average: {}".format(e, self.EPISODES, i, explore_probability, average))
                    if i == self.env._max_episode_steps:
                        print("Saving trained model to", self.Model_name)
                        #self.save(self.Model_name)
                        break
                self.replay()
        self.env.close()

    def test(self):
        self.load(self.Model_name)
        for e in range(self.EPISODES):
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            done = False
            i = 0
            while not done:
                self.env.render()
                action = np.argmax(self.model.predict(state))
                next_state, reward, done, _ = self.env.step(action)
                state = np.reshape(next_state, [1, self.state_size])
                i += 1
                if done:
                    print("episode: {}/{}, score: {}".format(e, self.EPISODES, i))
                    break

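# NOTE: test() loads the model from self.Model_name; since the self.save() call in
# run() is commented out above, re-enable it (or save the model manually) before
# calling agent.test(), otherwise load_model() will not find the .h5 file.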
if __name__ == "__main__":
    env_name = 'CartPole-v1'
    agent = DQNAgent(env_name)
    agent.run()
    #agent.test()
