Commit 5b9fcd9

release choice_action_by_probs for rein
1 parent 0f4cf91 commit 5b9fcd9

3 files changed: +38 −6 lines changed

docs/modules/rein.rst

Lines changed: 5 additions & 2 deletions
@@ -9,14 +9,17 @@ Reinforcement Learning.
 
   discount_episode_rewards
   cross_entropy_reward_loss
+  choice_action_by_probs
 
 
 Reward functions
 ---------------------
-
 .. autofunction:: discount_episode_rewards
 
 Cost functions
 ---------------------
-
 .. autofunction:: cross_entropy_reward_loss
+
+Sampling functions
+---------------------
+.. autofunction:: choice_action_by_probs

example/tutorial_atari_pong.py

Lines changed: 4 additions & 4 deletions
@@ -2,7 +2,7 @@
 # -*- coding: utf8 -*-
 
 
-""" Policy Network π(a|s)
+""" Monte-Carlo Policy Network π(a|s) (REINFORCE)
 
 To understand Reinforcement Learning, we let computer to learn how to play
 Pong game from the original screen inputs. Before we start, we highly recommend
@@ -24,9 +24,8 @@
 
 import tensorflow as tf
 import tensorlayer as tl
-import gym
+import gym, time
 import numpy as np
-import time
 
 # hyperparameters
 image_size = 80
@@ -101,7 +100,8 @@ def prepro(I):
             feed_dict={states_batch_pl: x}
         )
         # action. 1: STOP 2: UP 3: DOWN
-        action = np.random.choice([1,2,3], p=prob.flatten())
+        # action = np.random.choice([1,2,3], p=prob.flatten())
+        action = tl.rein.choice_action_by_probs(prob.flatten(), [1,2,3])
 
         observation, reward, done, _ = env.step(action)
         reward_sum += reward
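
The new helper is a drop-in replacement for the NumPy call it comments out, but note the reversed argument order: the probabilities come first, the action list second. A minimal sketch of the equivalence, assuming prob is a (1, 3) softmax output as in the tutorial:

import numpy as np
import tensorlayer as tl

# Hypothetical policy output for the three Pong actions (1: STOP, 2: UP, 3: DOWN);
# in the tutorial this comes from sess.run(sampling_prob, ...).
prob = np.array([[0.2, 0.5, 0.3]])

# Old call: NumPy takes the actions first and the probabilities as a keyword.
a_old = np.random.choice([1, 2, 3], p=prob.flatten())

# New call: the TensorLayer helper takes the probabilities first, then the actions.
a_new = tl.rein.choice_action_by_probs(prob.flatten(), [1, 2, 3])

print(a_old, a_new)  # each is one of 1, 2, 3, drawn from the same distribution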

tensorlayer/rein.py

Lines changed: 29 additions & 0 deletions
@@ -83,3 +83,32 @@ def cross_entropy_reward_loss(logits, actions, rewards, name=None):
     except: ## TF0.12
         loss = tf.reduce_sum(tf.mul(cross_entropy, rewards)) # element-wise mul
     return loss
+
+
+def choice_action_by_probs(probs=[0.5, 0.5], action_list=None):
+    """Choose and return an action, given the action probability distribution.
+
+    Parameters
+    ------------
+    probs : a list of float.
+        The probability distribution of all actions.
+    action_list : None or a list of int actions.
+        If None, returns an integer between 0 and len(probs)-1.
+
+    Examples
+    ----------
+    >>> for _ in range(5):
+    >>>     a = choice_action_by_probs(probs=[0.2, 0.4, 0.4])
+    >>>     print(a)
+    ... 0
+    ... 1
+    ... 1
+    ... 2
+    ... 1
+    """
+    if action_list is None:
+        n_action = len(probs)
+        action_list = np.arange(n_action)
+    else:
+        assert len(action_list) == len(probs), "Number of actions should equal the number of probabilities."
+    return np.random.choice(action_list, p=probs)
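
A short usage sketch of the new function, assuming it is imported from the module as added above; the seed is only there to make the sampled output repeatable:

import numpy as np
from tensorlayer.rein import choice_action_by_probs

np.random.seed(0)  # only to make the sampled values repeatable in this sketch

# Without action_list: the result is an index between 0 and len(probs)-1.
a = choice_action_by_probs(probs=[0.2, 0.4, 0.4])
print(a)  # e.g. 1

# With an explicit action_list: the result is one of the given actions,
# sampled according to the matching probability.
a = choice_action_by_probs([0.1, 0.7, 0.2], action_list=[1, 2, 3])
print(a)  # e.g. 2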
