
Commit e8a68f1

Tested with TF 2.3.1
1 parent 1e531bf commit e8a68f1

1 file changed: 260 additions, 0 deletions

# Tutorial by www.pylessons.com
# Tutorial written for - Tensorflow 2.3.1

import os
import random
import gym
import pylab
import numpy as np
from collections import deque
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Lambda, Add
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K

def OurModel(input_shape, action_space, dueling):
    X_input = Input(input_shape)
    X = X_input

    # 'Dense' is the basic form of a neural network layer
    # Input Layer of state size(4) and Hidden Layer with 512 nodes
    X = Dense(512, input_shape=input_shape, activation="relu", kernel_initializer='he_uniform')(X)

    # Hidden layer with 256 nodes
    X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X)

    # Hidden layer with 64 nodes
    X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)

    if dueling:
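        # Dueling architecture: combine a scalar state-value head V(s) with an
        # advantage head A(s, a) as Q(s, a) = V(s) + (A(s, a) - mean(A(s, .)))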
        state_value = Dense(1, kernel_initializer='he_uniform')(X)
        state_value = Lambda(lambda s: K.expand_dims(s[:, 0], -1), output_shape=(action_space,))(state_value)

        action_advantage = Dense(action_space, kernel_initializer='he_uniform')(X)
        action_advantage = Lambda(lambda a: a[:, :] - K.mean(a[:, :], axis=1, keepdims=True), output_shape=(action_space,))(action_advantage)

        X = Add()([state_value, action_advantage])
    else:
        # Output Layer with # of actions: 2 nodes (left, right)
        X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X)

    model = Model(inputs=X_input, outputs=X)
    model.compile(loss="mean_squared_error", optimizer=RMSprop(lr=0.00025, rho=0.95, epsilon=0.01), metrics=["accuracy"])

    model.summary()
    return model

class DQNAgent:
    def __init__(self, env_name):
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.env.seed(0)
        # by default, CartPole-v1 has max episode steps = 500
        self.env._max_episode_steps = 4000
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n

        self.EPISODES = 1000
        self.memory = deque(maxlen=2000)
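        # (the deque above is the replay buffer: once 2000 transitions are stored, the oldest are discarded)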
        self.gamma = 0.95    # discount rate

        # EXPLORATION HYPERPARAMETERS for epsilon and epsilon greedy strategy
        self.epsilon = 1.0           # exploration probability at start
        self.epsilon_min = 0.01      # minimum exploration probability
        self.epsilon_decay = 0.0005  # exponential decay rate for exploration prob

        self.batch_size = 32

        # defining model parameters
        self.ddqn = True              # use double deep q network
        self.Soft_Update = False      # use soft parameter update
        self.dueling = True           # use dueling network
        self.epsilon_greedy = True    # use epsilon greedy strategy

        self.TAU = 0.1  # target network soft update hyperparameter

        self.Save_Path = 'Models'
        if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path)
        self.scores, self.episodes, self.average = [], [], []

        self.Model_name = os.path.join(self.Save_Path, self.env_name + "_e_greedy.h5")

        # create main model and target model
        self.model = OurModel(input_shape=(self.state_size,), action_space=self.action_size, dueling=self.dueling)
        self.target_model = OurModel(input_shape=(self.state_size,), action_space=self.action_size, dueling=self.dueling)

    # after some time interval update the target model to be same with model
    def update_target_model(self):
        if not self.Soft_Update and self.ddqn:
            self.target_model.set_weights(self.model.get_weights())
            return
        if self.Soft_Update and self.ddqn:
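            # soft update: target_weights = TAU * model_weights + (1 - TAU) * target_weights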
            q_model_theta = self.model.get_weights()
            target_model_theta = self.target_model.get_weights()
            counter = 0
            for q_weight, target_weight in zip(q_model_theta, target_model_theta):
                target_weight = target_weight * (1 - self.TAU) + q_weight * self.TAU
                target_model_theta[counter] = target_weight
                counter += 1
            self.target_model.set_weights(target_model_theta)

    def remember(self, state, action, reward, next_state, done):
        experience = state, action, reward, next_state, done
        self.memory.append(experience)

    def act(self, state, decay_step):
        # EPSILON GREEDY STRATEGY
        if self.epsilon_greedy:
            # Here we'll use an improved version of our epsilon greedy strategy for Q-learning
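            # explore_probability decays exponentially from epsilon down to epsilon_min as decay_step grows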
            explore_probability = self.epsilon_min + (self.epsilon - self.epsilon_min) * np.exp(-self.epsilon_decay * decay_step)
        # OLD EPSILON STRATEGY
        else:
            if self.epsilon > self.epsilon_min:
                self.epsilon *= (1 - self.epsilon_decay)
            explore_probability = self.epsilon

        if explore_probability > np.random.rand():
            # Make a random action (exploration)
            return random.randrange(self.action_size), explore_probability
        else:
            # Get action from Q-network (exploitation)
            # Estimate the Q values for this state
            # Take the biggest Q value (= the best action)
            return np.argmax(self.model.predict(state)), explore_probability

    def replay(self):
        if len(self.memory) < self.batch_size:
            return
        # Randomly sample minibatch from the memory
        minibatch = random.sample(self.memory, self.batch_size)

        state = np.zeros((self.batch_size, self.state_size))
        next_state = np.zeros((self.batch_size, self.state_size))
        action, reward, done = [], [], []

        # do this before prediction
        # for speedup, this could be done on the tensor level
        # but it is easier to understand using a loop
        for i in range(self.batch_size):
            state[i] = minibatch[i][0]
            action.append(minibatch[i][1])
            reward.append(minibatch[i][2])
            next_state[i] = minibatch[i][3]
            done.append(minibatch[i][4])

        # do batch prediction to save speed
        # predict Q-values for the starting states using the main network
        target = self.model.predict(state)
        # predict Q-values for the next states using the main network (used to select the best action)
        target_next = self.model.predict(next_state)
        # predict Q-values for the next states using the target network (used to evaluate that action)
        target_val = self.target_model.predict(next_state)
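
        # 'target' starts as the current Q estimates; only the entry for the action
        # actually taken in each sampled transition is overwritten with its TD target below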
        for i in range(len(minibatch)):
            # correction on the Q value for the action used
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                if self.ddqn:  # Double - DQN
                    # current Q Network selects the action
                    # a'_max = argmax_a' Q(s', a')
                    a = np.argmax(target_next[i])
                    # target Q Network evaluates the action
                    # Q_max = Q_target(s', a'_max)
                    target[i][action[i]] = reward[i] + self.gamma * (target_val[i][a])
                else:  # Standard - DQN
                    # DQN chooses the max Q value among next actions
                    # selection and evaluation of action is on the target Q Network
                    # Q_max = max_a' Q_target(s', a')
                    target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_val[i]))

        # Train the Neural Network with batches
        self.model.fit(state, target, batch_size=self.batch_size, verbose=0)

    def load(self, name):
        self.model = load_model(name)

    def save(self, name):
        self.model.save(name)

    pylab.figure(figsize=(18, 9))
    def PlotModel(self, score, episode):
        self.scores.append(score)
        self.episodes.append(episode)
        self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
        pylab.plot(self.episodes, self.average, 'r')
        pylab.plot(self.episodes, self.scores, 'b')
        pylab.ylabel('Score', fontsize=18)
        pylab.xlabel('Episode', fontsize=18)
        dqn = 'DQN_'
        softupdate = ''
        dueling = ''
        greedy = ''
        if self.ddqn: dqn = 'DDQN_'
        if self.Soft_Update: softupdate = '_soft'
        if self.dueling: dueling = '_Dueling'
        if self.epsilon_greedy: greedy = '_Greedy'
        try:
            pylab.savefig(dqn + self.env_name + softupdate + dueling + greedy + ".png")
        except OSError:
            pass

        return str(self.average[-1])[:5]

    def run(self):
        decay_step = 0
        for e in range(self.EPISODES):
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            done = False
            i = 0
            while not done:
                #self.env.render()
                decay_step += 1
                action, explore_probability = self.act(state, decay_step)
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, [1, self.state_size])
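                # keep the environment reward unless the pole fell before the step limit,
                # in which case the transition is penalized to speed up learning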
                if not done or i == self.env._max_episode_steps - 1:
                    reward = reward
                else:
                    reward = -100
                self.remember(state, action, reward, next_state, done)
                state = next_state
                i += 1
                if done:
                    # at the end of every episode, update the target model
                    self.update_target_model()

                    # every episode, plot the result
                    average = self.PlotModel(i, e)

                    print("episode: {}/{}, score: {}, e: {:.2}, average: {}".format(e, self.EPISODES, i, explore_probability, average))
                    if i == self.env._max_episode_steps:
                        print("Saving trained model to", self.Model_name)
                        self.save(self.Model_name)
                        break

                self.replay()

    def test(self):
        self.load(self.Model_name)
        for e in range(self.EPISODES):
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            done = False
            i = 0
            while not done:
                self.env.render()
                action = np.argmax(self.model.predict(state))
                next_state, reward, done, _ = self.env.step(action)
                state = np.reshape(next_state, [1, self.state_size])
                i += 1
                if done:
                    print("episode: {}/{}, score: {}".format(e, self.EPISODES, i))
                    break

if __name__ == "__main__":
    env_name = 'CartPole-v1'
    agent = DQNAgent(env_name)
    agent.run()
    #agent.test()
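    # To watch a saved model play instead of training, comment out agent.run()
    # above and uncomment agent.test(); it loads the model saved at Model_name.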
