
Commit eb88aa1

update DQN example, release log_weight
1 parent e8971b3 commit eb88aa1

File tree

3 files changed: 25 additions, 4 deletions


docs/modules/rein.rst

Lines changed: 8 additions & 0 deletions
@@ -9,6 +9,7 @@ Reinforcement Learning.
 
   discount_episode_rewards
   cross_entropy_reward_loss
+  log_weight
   choice_action_by_probs
 
 
@@ -18,8 +19,15 @@ Reward functions
 
 Cost functions
 ---------------------
+
+Weighted Cross Entropy
+^^^^^^^^^^^^^^^^^^^^^^^^
 .. autofunction:: cross_entropy_reward_loss
 
+Log weight
+^^^^^^^^^^^^^^
+.. autofunction:: log_weight
+
 Sampling functions
 ---------------------
 .. autofunction:: choice_action_by_probs
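
For orientation, here is a minimal sketch of how the cost and sampling functions documented above fit together in a policy-gradient setup. The placeholder shapes, the two-action network, and the optimizer settings are illustrative assumptions, not part of this commit:

import numpy as np
import tensorflow as tf
import tensorlayer as tl

# Hypothetical placeholders: 4-dimensional states, 2 discrete actions.
states = tf.placeholder(tf.float32, [None, 4], name='states')
actions = tf.placeholder(tf.int32, [None], name='actions')
rewards = tf.placeholder(tf.float32, [None], name='rewards')  # e.g. discounted returns

# A small policy network producing action logits (illustrative architecture).
net = tl.layers.InputLayer(states, name='pg_input')
net = tl.layers.DenseLayer(net, n_units=2, act=tf.identity, name='pg_logits')
logits = net.outputs
probs = tf.nn.softmax(logits)

# Weighted Cross Entropy: cross entropy of the taken actions, scaled by the rewards.
loss = tl.rein.cross_entropy_reward_loss(logits, actions, rewards)
train_op = tf.train.RMSPropOptimizer(learning_rate=0.01, decay=0.9).minimize(loss)

# Sampling: draw an action from the predicted distribution during a rollout
# (prob_values would normally come from sess.run(probs, ...)).
prob_values = np.asarray([0.7, 0.3])
action = tl.rein.choice_action_by_probs(prob_values, action_list=[0, 1])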

example/tutorial_frozenlake_dqn.py

Lines changed: 1 addition & 2 deletions
@@ -61,7 +61,6 @@ def to_one_hot(i, n_classes=None):
 lambd = .99 # decay factor
 e = 0.1 # e-Greedy Exploration, the larger the more random
 num_episodes = 10000
-rList = [] # rewards for each episode
 with tf.Session() as sess:
     tl.layers.initialize_global_variables(sess)
     for i in range(num_episodes):
@@ -95,6 +94,6 @@ def to_one_hot(i, n_classes=None):
                 break
 
         ## Note that, the rewards here with random action
-        running_reward = r if running_reward is None else running_reward * 0.99 + r * 0.01
+        running_reward = rAll if running_reward is None else running_reward * 0.99 + rAll * 0.01
         print("Episode [%d/%d] sum reward:%f running reward:%f took:%.5fs %s" %
             (i, num_episodes, rAll, running_reward, time.time()-episode_time, '' if rAll == 0 else ' !!!!!!!!'))
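
The change above replaces the per-step reward r with the episode return rAll in the exponential moving average, so running_reward now tracks whole episodes rather than the last step. A self-contained sketch of that update rule, with made-up episode returns for illustration:

# Exponential moving average of episode returns, mirroring the fixed line above.
running_reward = None
episode_returns = [0.0, 1.0, 0.0, 1.0]  # made-up rAll values

for rAll in episode_returns:
    running_reward = rAll if running_reward is None else running_reward * 0.99 + rAll * 0.01
    print("sum reward: %.1f  running reward: %.4f" % (rAll, running_reward))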

tensorlayer/rein.py

Lines changed: 16 additions & 2 deletions
@@ -72,18 +72,32 @@ def cross_entropy_reward_loss(logits, actions, rewards, name=None):
     >>> train_op = tf.train.RMSPropOptimizer(learning_rate, decay_rate).minimize(loss)
     """
 
-    try: # TF 1.0
+    try: # TF 1.0+
         cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=actions, logits=logits, name=name)
     except:
         cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, targets=actions)
     # cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, actions)
 
-    try: ## TF1.0
+    try: ## TF1.0+
         loss = tf.reduce_sum(tf.multiply(cross_entropy, rewards))
     except: ## TF0.12
         loss = tf.reduce_sum(tf.mul(cross_entropy, rewards)) # element-wise mul
     return loss
 
+def log_weight(probs, weights, name='log_weight'):
+    """Log weight.
+
+    Parameters
+    -----------
+    probs : tensor
+        If it is a network output, usually we should scale it to [0, 1] via softmax.
+    weights : tensor
+    """
+    with tf.variable_scope(name):
+        exp_v = tf.reduce_mean(tf.log(probs) * weights)
+        return exp_v
+
+
 
 def choice_action_by_probs(probs=[0.5, 0.5], action_list=None):
     """Choose and return an action according to the given action probability distribution.
