Commit 5b9fcd9

release choice_action_by_probs for rein
1 parent 0f4cf91 commit 5b9fcd9

3 files changed: +38 −6 lines changed

docs/modules/rein.rst

Lines changed: 5 additions & 2 deletions
@@ -9,14 +9,17 @@ Reinforcement Learning.
 
   discount_episode_rewards
   cross_entropy_reward_loss
+  choice_action_by_probs
 
 
 Reward functions
 ---------------------
-
 .. autofunction:: discount_episode_rewards
 
 Cost functions
 ---------------------
-
 .. autofunction:: cross_entropy_reward_loss
+
+Sampling functions
+---------------------
+.. autofunction:: choice_action_by_probs

example/tutorial_atari_pong.py

Lines changed: 4 additions & 4 deletions
@@ -2,7 +2,7 @@
 # -*- coding: utf8 -*-
 
 
-""" Policy Network π(a|s)
+""" Monte-Carlo Policy Network π(a|s) (REINFORCE)
 
 To understand Reinforcement Learning, we let computer to learn how to play
 Pong game from the original screen inputs. Before we start, we highly recommend
@@ -24,9 +24,8 @@
 
 import tensorflow as tf
 import tensorlayer as tl
-import gym
+import gym, time
 import numpy as np
-import time
 
 # hyperparameters
 image_size = 80
@@ -101,7 +100,8 @@ def prepro(I):
             feed_dict={states_batch_pl: x}
         )
         # action. 1: STOP 2: UP 3: DOWN
-        action = np.random.choice([1,2,3], p=prob.flatten())
+        # action = np.random.choice([1,2,3], p=prob.flatten())
+        action = tl.rein.choice_action_by_probs(prob.flatten(), [1,2,3])
 
         observation, reward, done, _ = env.step(action)
         reward_sum += reward
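
The new helper is a drop-in replacement for the NumPy call it comments out, but note the reversed argument order: the probabilities come first, the action list second. A minimal sketch of the equivalence, assuming prob is a (1, 3) softmax output as in the tutorial:

import numpy as np
import tensorlayer as tl

# Hypothetical policy output for the three Pong actions (1: STOP, 2: UP, 3: DOWN);
# in the tutorial this comes from sess.run(sampling_prob, ...).
prob = np.array([[0.2, 0.5, 0.3]])

# Old call: NumPy takes the actions first and the probabilities as a keyword.
a_old = np.random.choice([1, 2, 3], p=prob.flatten())

# New call: the TensorLayer helper takes the probabilities first, then the actions.
a_new = tl.rein.choice_action_by_probs(prob.flatten(), [1, 2, 3])

print(a_old, a_new)  # each is one of 1, 2, 3, drawn from the same distribution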

tensorlayer/rein.py

Lines changed: 29 additions & 0 deletions
@@ -83,3 +83,32 @@ def cross_entropy_reward_loss(logits, actions, rewards, name=None):
     except: ## TF0.12
         loss = tf.reduce_sum(tf.mul(cross_entropy, rewards)) # element-wise mul
     return loss
+
+
+def choice_action_by_probs(probs=[0.5, 0.5], action_list=None):
+    """Choose and return an action, given the action probability distribution.
+
+    Parameters
+    ------------
+    probs : a list of float.
+        The probability distribution of all actions.
+    action_list : None or a list of int actions.
+        If None, returns an integer between 0 and len(probs)-1.
+
+    Examples
+    ----------
+    >>> for _ in range(5):
+    >>>     a = choice_action_by_probs(probs=[0.2, 0.4, 0.4])
+    >>>     print(a)
+    ... 0
+    ... 1
+    ... 1
+    ... 2
+    ... 1
+    """
+    if action_list is None:
+        n_action = len(probs)
+        action_list = np.arange(n_action)
+    else:
+        assert len(action_list) == len(probs), "Number of actions should equal the number of probabilities."
+    return np.random.choice(action_list, p=probs)
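
A short usage sketch of the new function, assuming it is imported from the module as added above; the seed is only there to make the sampled output repeatable:

import numpy as np
from tensorlayer.rein import choice_action_by_probs

np.random.seed(0)  # only to make the sampled values repeatable in this sketch

# Without action_list: the result is an index between 0 and len(probs)-1.
a = choice_action_by_probs(probs=[0.2, 0.4, 0.4])
print(a)  # e.g. 1

# With an explicit action_list: the result is one of the given actions,
# sampled according to the matching probability.
a = choice_action_by_probs([0.1, 0.7, 0.2], action_list=[1, 2, 3])
print(a)  # e.g. 2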
