
Commit 7e7e0af

Merge branch 'master' into nlp
2 parents da0327b + ac65fcf

File tree

6 files changed: +22 −15 lines changed


examples/reinforcement_learning/README.md

Lines changed: 10 additions & 6 deletions
@@ -41,11 +41,11 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t
 | Algorithms | Observation Space | Action Space | Tutorial Env |
 | --------------- | ----------------- | ------------ | -------------- |
 | Q-learning | Discrete | Discrete | FrozenLake |
-| C51 | Discrete | Discrete | Pong, CartPole |
+| C51 | Continuous | Discrete | Pong, CartPole |
 | DQN | Discrete | Discrete | FrozenLake |
-| Variants of DQN | Discrete | Discrete | Pong, CartPole |
-| Retrace | Discrete | Discrete | Pong, CartPole |
-| PER | Discrete | Discrete | Pong, CartPole |
+| Variants of DQN | Continuous | Discrete | Pong, CartPole |
+| Retrace | Continuous | Discrete | Pong, CartPole |
+| PER | Continuous | Discrete | Pong, CartPole |
 | Actor-Critic | Continuous | Discrete | CartPole |
 | A3C | Continuous | Continuous | BipedalWalker |
 | DDPG | Continuous | Continuous | Pendulum |
@@ -106,18 +106,22 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t
 
 <u>Paper</u>: [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461)
 
+[Dueling Network Architectures for Deep Reinforcement Learning](https://arxiv.org/abs/1511.06581)
+
+[Noisy Networks for Exploration](https://arxiv.org/abs/1706.10295)
+
 <u>Description</u>:
 
 ```
 We implement Double DQN, Dueling DQN and Noisy DQN here.
 
 -The max operator in standard DQN uses the same values both to select and to evaluate an action by:
 
-Q(s_t, a_t) = R\_{t+1\} + gamma \* max\_{a}Q\_\{target\}(s_{t+1}, a).
+Q(s_t, a_t) = R_{t+1} + gamma * max_{a}Q_{target}(s_{t+1}, a).
 
 -Double DQN proposes to use following evaluation to address overestimation problem of max operator:
 
-Q(s_t, a_t) = R\_{t+1\} + gamma \* Q\_{target}(s\_\{t+1\}, max{a}Q(s_{t+1}, a)).
+Q(s_t, a_t) = R_{t+1} + gamma * Q_{target}(s_{t+1}, max_{a}Q(s_{t+1}, a)).
 
 -Dueling DQN uses dueling architecture where the value of state and the advantage of each action is estimated separately.
 
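The last hunk above cleans up the README's two target formulas for standard DQN and Double DQN. As a quick illustration of what they compute, here is a minimal NumPy sketch; the `q_online_next` / `q_target_next` arrays and function names are hypothetical, not code from this repository, and terminal-state masking is omitted.

```python
import numpy as np

def dqn_target(reward, q_target_next, gamma=0.99):
    # standard DQN: the target network both selects and evaluates the next action
    return reward + gamma * np.max(q_target_next, axis=1)

def double_dqn_target(reward, q_online_next, q_target_next, gamma=0.99):
    # Double DQN: select the action with the online network, evaluate it with the target network
    best_a = np.argmax(q_online_next, axis=1)
    return reward + gamma * q_target_next[np.arange(len(best_a)), best_a]

# toy batch: 2 transitions, 3 actions
reward = np.array([1.0, 0.0])
q_online_next = np.array([[0.1, 0.5, 0.2], [0.3, 0.2, 0.4]])
q_target_next = np.array([[0.2, 0.3, 0.9], [0.1, 0.6, 0.2]])
print(dqn_target(reward, q_target_next))                        # [1.891 0.594]
print(double_dqn_target(reward, q_online_next, q_target_next))  # [1.297 0.198]
```

Because the evaluating network did not pick the action itself, Double DQN's target is typically lower, which is exactly the overestimation correction the README describes.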

examples/reinforcement_learning/tutorial_DDPG.py

Lines changed: 1 addition & 1 deletion
@@ -176,7 +176,7 @@ def learn(self):
         with tf.GradientTape() as tape:
             a = self.actor(bs)
             q = self.critic([bs, a])
-            a_loss = -tf.reduce_mean(q) # maximize the q
+            a_loss = - tf.reduce_mean(q) # maximize the q
         a_grads = tape.gradient(a_loss, self.actor.trainable_weights)
         self.actor_opt.apply_gradients(zip(a_grads, self.actor.trainable_weights))
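The only change in this hunk is spacing around the unary minus; the surrounding lines are the DDPG actor update, which maximizes the critic's Q estimate by minimizing its negation. A self-contained sketch of that pattern, using hypothetical one-layer stand-ins for the tutorial's actor and critic:

```python
import tensorflow as tf

# hypothetical tiny networks standing in for the tutorial's actor/critic
actor = tf.keras.Sequential([tf.keras.layers.Dense(1, activation='tanh')])
critic = tf.keras.Sequential([tf.keras.layers.Dense(1)])
actor_opt = tf.optimizers.Adam(1e-3)

bs = tf.random.normal([32, 3])               # batch of states
with tf.GradientTape() as tape:
    a = actor(bs)                            # actions proposed by the actor
    q = critic(tf.concat([bs, a], axis=1))   # critic's value of those actions
    a_loss = -tf.reduce_mean(q)              # minimizing -Q is gradient ascent on Q
a_grads = tape.gradient(a_loss, actor.trainable_weights)
actor_opt.apply_gradients(zip(a_grads, actor.trainable_weights))
```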

examples/reinforcement_learning/tutorial_DPPO.py

Lines changed: 4 additions & 2 deletions
@@ -92,6 +92,8 @@ def __init__(self):
         # actor
         self.actor = self._build_anet('pi', trainable=True)
         self.actor_old = self._build_anet('oldpi', trainable=False)
+        self.actor_opt = tf.optimizers.Adam(A_LR)
+        self.critic_opt = tf.optimizers.Adam(C_LR)
 
     def a_train(self, tfs, tfa, tfadv):
         '''
@@ -126,7 +128,7 @@ def a_train(self, tfs, tfa, tfadv):
             )
         a_gard = tape.gradient(aloss, self.actor.trainable_weights)
 
-        tf.optimizers.Adam(A_LR).apply_gradients(zip(a_gard, self.actor.trainable_weights))
+        self.actor_opt.apply_gradients(zip(a_gard, self.actor.trainable_weights))
 
         if METHOD['name'] == 'kl_pen':
             return kl_mean
@@ -151,7 +153,7 @@ def c_train(self, tfdc_r, s):
             advantage = tfdc_r - self.critic(s)
             closs = tf.reduce_mean(tf.square(advantage))
         grad = tape.gradient(closs, self.critic.trainable_weights)
-        tf.optimizers.Adam(C_LR).apply_gradients(zip(grad, self.critic.trainable_weights))
+        self.critic_opt.apply_gradients(zip(grad, self.critic.trainable_weights))
 
     def cal_adv(self, tfs, tfdc_r):
         '''
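The substantive change in this file (and in tutorial_PPO.py below) is that the Adam optimizers are now created once in `__init__` and reused in `a_train`/`c_train`, instead of constructing `tf.optimizers.Adam(...)` inside every training step, which would discard Adam's running moment estimates on each call. A minimal sketch of the pattern, with a hypothetical stand-in model and loss:

```python
import tensorflow as tf

A_LR = 1e-4  # actor learning rate, as in the tutorials

class Agent:
    def __init__(self):
        # hypothetical stand-in model; the tutorials build actor/critic via _build_anet
        self.model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
        # create the optimizer once so its moment estimates persist across updates
        self.opt = tf.optimizers.Adam(A_LR)

    def train_step(self, x, y):
        with tf.GradientTape() as tape:
            loss = tf.reduce_mean(tf.square(self.model(x) - y))
        grads = tape.gradient(loss, self.model.trainable_weights)
        # reuse the stored optimizer rather than creating tf.optimizers.Adam(A_LR) here
        self.opt.apply_gradients(zip(grads, self.model.trainable_weights))
        return loss

agent = Agent()
x, y = tf.random.normal([16, 4]), tf.random.normal([16, 1])
print(float(agent.train_step(x, y)))
```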

examples/reinforcement_learning/tutorial_PG.py

Lines changed: 0 additions & 1 deletion
@@ -26,7 +26,6 @@
 python tutorial_PG.py --train/test
 
 """
-
 import argparse
 import os
 import time

examples/reinforcement_learning/tutorial_PPO.py

Lines changed: 5 additions & 4 deletions
@@ -27,7 +27,6 @@
 python tutorial_PPO.py --train/test
 
 """
-
 import argparse
 import os
 import time
@@ -85,6 +84,8 @@ def __init__(self):
         # actor
         self.actor = self._build_anet('pi', trainable=True)
         self.actor_old = self._build_anet('oldpi', trainable=False)
+        self.actor_opt = tf.optimizers.Adam(A_LR)
+        self.critic_opt = tf.optimizers.Adam(C_LR)
 
     def a_train(self, tfs, tfa, tfadv):
         '''
@@ -119,7 +120,7 @@ def a_train(self, tfs, tfa, tfadv):
             )
         a_gard = tape.gradient(aloss, self.actor.trainable_weights)
 
-        tf.optimizers.Adam(A_LR).apply_gradients(zip(a_gard, self.actor.trainable_weights))
+        self.actor_opt.apply_gradients(zip(a_gard, self.actor.trainable_weights))
 
         if METHOD['name'] == 'kl_pen':
             return kl_mean
@@ -146,7 +147,7 @@ def c_train(self, tfdc_r, s):
             closs = tf.reduce_mean(tf.square(advantage))
             # print('tfdc_r value', tfdc_r)
         grad = tape.gradient(closs, self.critic.trainable_weights)
-        tf.optimizers.Adam(C_LR).apply_gradients(zip(grad, self.critic.trainable_weights))
+        self.critic_opt.apply_gradients(zip(grad, self.critic.trainable_weights))
 
     def cal_adv(self, tfs, tfdc_r):
         '''
@@ -171,7 +172,7 @@ def update(self, s, a, r):
 
         self.update_old_pi()
         adv = self.cal_adv(s, r)
-        # adv = (adv - adv.mean())/(adv.std()+1e-6) # sometimes helpful
+        # adv = (adv - adv.mean())/(adv.std()+1e-6)  # sometimes helpful
 
         # update actor
         if METHOD['name'] == 'kl_pen':
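The last hunk only reformats a commented-out line, but that comment describes a common trick: normalizing advantages to zero mean and unit variance before the policy update, with the `1e-6` guarding against a zero standard deviation. A short NumPy sketch of what the line would do if enabled (hypothetical function name):

```python
import numpy as np

def normalize_advantages(adv, eps=1e-6):
    # zero-mean, unit-variance advantages; eps avoids division by zero
    return (adv - adv.mean()) / (adv.std() + eps)

adv = np.array([2.0, -1.0, 0.5, 3.0])
print(normalize_advantages(adv))  # roughly zero mean and unit standard deviation
```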

examples/reinforcement_learning/tutorial_TRPO.py

Lines changed: 2 additions & 1 deletion
@@ -447,10 +447,11 @@ def get(self):
         adv_mean, adv_std = np.mean(self.adv_buf), np.std(self.adv_buf)
         self.adv_buf = (self.adv_buf - adv_mean) / adv_std
         return [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf, self.logp_buf
-        ] + values_as_sorted_list(self.info_bufs)
+                ] + values_as_sorted_list(self.info_bufs)
 
 
 ##################### TRPO ####################
+
 """
 
 Trust Region Policy Optimization
