
Commit 7e7e0af

Merge branch 'master' into nlp
2 parents da0327b + ac65fcf

File tree

6 files changed: +22 −15 lines changed


examples/reinforcement_learning/README.md

Lines changed: 10 additions & 6 deletions
@@ -41,11 +41,11 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t
 | Algorithms | Observation Space | Action Space | Tutorial Env |
 | --------------- | ----------------- | ------------ | -------------- |
 | Q-learning | Discrete | Discrete | FrozenLake |
-| C51 | Discrete | Discrete | Pong, CartPole |
+| C51 | Continuous | Discrete | Pong, CartPole |
 | DQN | Discrete | Discrete | FrozenLake |
-| Variants of DQN | Discrete | Discrete | Pong, CartPole |
-| Retrace | Discrete | Discrete | Pong, CartPole |
-| PER | Discrete | Discrete | Pong, CartPole |
+| Variants of DQN | Continuous | Discrete | Pong, CartPole |
+| Retrace | Continuous | Discrete | Pong, CartPole |
+| PER | Continuous | Discrete | Pong, CartPole |
 | Actor-Critic | Continuous | Discrete | CartPole |
 | A3C | Continuous | Continuous | BipedalWalker |
 | DDPG | Continuous | Continuous | Pendulum |
@@ -106,18 +106,22 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t
 
 <u>Paper</u>: [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461)
 
+[Dueling Network Architectures for Deep Reinforcement Learning](https://arxiv.org/abs/1511.06581)
+
+[Noisy Networks for Exploration](https://arxiv.org/abs/1706.10295)
+
 <u>Description</u>:
 
 ```
 We implement Double DQN, Dueling DQN and Noisy DQN here.
 
 -The max operator in standard DQN uses the same values both to select and to evaluate an action by:
 
-Q(s_t, a_t) = R\_{t+1\} + gamma \* max\_{a}Q\_\{target\}(s_{t+1}, a).
+Q(s_t, a_t) = R_{t+1} + gamma * max_{a}Q_{target}(s_{t+1}, a).
 
 -Double DQN proposes to use following evaluation to address overestimation problem of max operator:
 
-Q(s_t, a_t) = R\_{t+1\} + gamma \* Q\_{target}(s\_\{t+1\}, max{a}Q(s_{t+1}, a)).
+Q(s_t, a_t) = R_{t+1} + gamma * Q_{target}(s_{t+1}, max_{a}Q(s_{t+1}, a)).
 
 -Dueling DQN uses dueling architecture where the value of state and the advantage of each action is estimated separately.
 
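The last hunk above cleans up the README's two target formulas for standard DQN and Double DQN. As a quick illustration of what they compute, here is a minimal NumPy sketch; the `q_online_next` / `q_target_next` arrays and function names are hypothetical, not code from this repository, and terminal-state masking is omitted.

```python
import numpy as np

def dqn_target(reward, q_target_next, gamma=0.99):
    # standard DQN: the target network both selects and evaluates the next action
    return reward + gamma * np.max(q_target_next, axis=1)

def double_dqn_target(reward, q_online_next, q_target_next, gamma=0.99):
    # Double DQN: select the action with the online network, evaluate it with the target network
    best_a = np.argmax(q_online_next, axis=1)
    return reward + gamma * q_target_next[np.arange(len(best_a)), best_a]

# toy batch: 2 transitions, 3 actions
reward = np.array([1.0, 0.0])
q_online_next = np.array([[0.1, 0.5, 0.2], [0.3, 0.2, 0.4]])
q_target_next = np.array([[0.2, 0.3, 0.9], [0.1, 0.6, 0.2]])
print(dqn_target(reward, q_target_next))                        # [1.891 0.594]
print(double_dqn_target(reward, q_online_next, q_target_next))  # [1.297 0.198]
```

Because the evaluating network did not pick the action itself, Double DQN's target is typically lower, which is exactly the overestimation correction the README describes.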

examples/reinforcement_learning/tutorial_DDPG.py

Lines changed: 1 addition & 1 deletion
@@ -176,7 +176,7 @@ def learn(self):
         with tf.GradientTape() as tape:
             a = self.actor(bs)
             q = self.critic([bs, a])
-            a_loss = -tf.reduce_mean(q) # maximize the q
+            a_loss = - tf.reduce_mean(q) # maximize the q
         a_grads = tape.gradient(a_loss, self.actor.trainable_weights)
         self.actor_opt.apply_gradients(zip(a_grads, self.actor.trainable_weights))
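The only change in this hunk is spacing around the unary minus; the surrounding lines are the DDPG actor update, which maximizes the critic's Q estimate by minimizing its negation. A self-contained sketch of that pattern, using hypothetical one-layer stand-ins for the tutorial's actor and critic:

```python
import tensorflow as tf

# hypothetical tiny networks standing in for the tutorial's actor/critic
actor = tf.keras.Sequential([tf.keras.layers.Dense(1, activation='tanh')])
critic = tf.keras.Sequential([tf.keras.layers.Dense(1)])
actor_opt = tf.optimizers.Adam(1e-3)

bs = tf.random.normal([32, 3])               # batch of states
with tf.GradientTape() as tape:
    a = actor(bs)                            # actions proposed by the actor
    q = critic(tf.concat([bs, a], axis=1))   # critic's value of those actions
    a_loss = -tf.reduce_mean(q)              # minimizing -Q is gradient ascent on Q
a_grads = tape.gradient(a_loss, actor.trainable_weights)
actor_opt.apply_gradients(zip(a_grads, actor.trainable_weights))
```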

examples/reinforcement_learning/tutorial_DPPO.py

Lines changed: 4 additions & 2 deletions
@@ -92,6 +92,8 @@ def __init__(self):
         # actor
         self.actor = self._build_anet('pi', trainable=True)
         self.actor_old = self._build_anet('oldpi', trainable=False)
+        self.actor_opt = tf.optimizers.Adam(A_LR)
+        self.critic_opt = tf.optimizers.Adam(C_LR)
 
     def a_train(self, tfs, tfa, tfadv):
         '''
@@ -126,7 +128,7 @@ def a_train(self, tfs, tfa, tfadv):
             )
         a_gard = tape.gradient(aloss, self.actor.trainable_weights)
 
-        tf.optimizers.Adam(A_LR).apply_gradients(zip(a_gard, self.actor.trainable_weights))
+        self.actor_opt.apply_gradients(zip(a_gard, self.actor.trainable_weights))
 
         if METHOD['name'] == 'kl_pen':
             return kl_mean
@@ -151,7 +153,7 @@ def c_train(self, tfdc_r, s):
             advantage = tfdc_r - self.critic(s)
             closs = tf.reduce_mean(tf.square(advantage))
         grad = tape.gradient(closs, self.critic.trainable_weights)
-        tf.optimizers.Adam(C_LR).apply_gradients(zip(grad, self.critic.trainable_weights))
+        self.critic_opt.apply_gradients(zip(grad, self.critic.trainable_weights))
 
     def cal_adv(self, tfs, tfdc_r):
         '''
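The substantive change in this file (and in tutorial_PPO.py below) is that the Adam optimizers are now created once in `__init__` and reused in `a_train`/`c_train`, instead of constructing `tf.optimizers.Adam(...)` inside every training step, which would discard Adam's running moment estimates on each call. A minimal sketch of the pattern, with a hypothetical stand-in model and loss:

```python
import tensorflow as tf

A_LR = 1e-4  # actor learning rate, as in the tutorials

class Agent:
    def __init__(self):
        # hypothetical stand-in model; the tutorials build actor/critic via _build_anet
        self.model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
        # create the optimizer once so its moment estimates persist across updates
        self.opt = tf.optimizers.Adam(A_LR)

    def train_step(self, x, y):
        with tf.GradientTape() as tape:
            loss = tf.reduce_mean(tf.square(self.model(x) - y))
        grads = tape.gradient(loss, self.model.trainable_weights)
        # reuse the stored optimizer rather than creating tf.optimizers.Adam(A_LR) here
        self.opt.apply_gradients(zip(grads, self.model.trainable_weights))
        return loss

agent = Agent()
x, y = tf.random.normal([16, 4]), tf.random.normal([16, 1])
print(float(agent.train_step(x, y)))
```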

examples/reinforcement_learning/tutorial_PG.py

Lines changed: 0 additions & 1 deletion
@@ -26,7 +26,6 @@
 python tutorial_PG.py --train/test
 
 """
-
 import argparse
 import os
 import time

examples/reinforcement_learning/tutorial_PPO.py

Lines changed: 5 additions & 4 deletions
@@ -27,7 +27,6 @@
 python tutorial_PPO.py --train/test
 
 """
-
 import argparse
 import os
 import time
@@ -85,6 +84,8 @@ def __init__(self):
         # actor
         self.actor = self._build_anet('pi', trainable=True)
         self.actor_old = self._build_anet('oldpi', trainable=False)
+        self.actor_opt = tf.optimizers.Adam(A_LR)
+        self.critic_opt = tf.optimizers.Adam(C_LR)
 
     def a_train(self, tfs, tfa, tfadv):
         '''
@@ -119,7 +120,7 @@ def a_train(self, tfs, tfa, tfadv):
             )
         a_gard = tape.gradient(aloss, self.actor.trainable_weights)
 
-        tf.optimizers.Adam(A_LR).apply_gradients(zip(a_gard, self.actor.trainable_weights))
+        self.actor_opt.apply_gradients(zip(a_gard, self.actor.trainable_weights))
 
         if METHOD['name'] == 'kl_pen':
             return kl_mean
@@ -146,7 +147,7 @@ def c_train(self, tfdc_r, s):
             closs = tf.reduce_mean(tf.square(advantage))
             # print('tfdc_r value', tfdc_r)
         grad = tape.gradient(closs, self.critic.trainable_weights)
-        tf.optimizers.Adam(C_LR).apply_gradients(zip(grad, self.critic.trainable_weights))
+        self.critic_opt.apply_gradients(zip(grad, self.critic.trainable_weights))
 
     def cal_adv(self, tfs, tfdc_r):
         '''
@@ -171,7 +172,7 @@ def update(self, s, a, r):
 
         self.update_old_pi()
         adv = self.cal_adv(s, r)
-        # adv = (adv - adv.mean())/(adv.std()+1e-6) # sometimes helpful
+        # adv = (adv - adv.mean())/(adv.std()+1e-6)  # sometimes helpful
 
         # update actor
         if METHOD['name'] == 'kl_pen':
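The last hunk only reformats a commented-out line, but that comment describes a common trick: normalizing advantages to zero mean and unit variance before the policy update, with the `1e-6` guarding against a zero standard deviation. A short NumPy sketch of what the line would do if enabled (hypothetical function name):

```python
import numpy as np

def normalize_advantages(adv, eps=1e-6):
    # zero-mean, unit-variance advantages; eps avoids division by zero
    return (adv - adv.mean()) / (adv.std() + eps)

adv = np.array([2.0, -1.0, 0.5, 3.0])
print(normalize_advantages(adv))  # roughly zero mean and unit standard deviation
```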

examples/reinforcement_learning/tutorial_TRPO.py

Lines changed: 2 additions & 1 deletion
@@ -447,10 +447,11 @@ def get(self):
         adv_mean, adv_std = np.mean(self.adv_buf), np.std(self.adv_buf)
         self.adv_buf = (self.adv_buf - adv_mean) / adv_std
         return [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf, self.logp_buf
-        ] + values_as_sorted_list(self.info_bufs)
+                ] + values_as_sorted_list(self.info_bufs)
 
 
 ##################### TRPO ####################
+
 """
 
 Trust Region Policy Optimization
