
Commit eb88aa1

update DQN example, release log_weight
1 parent e8971b3 commit eb88aa1

File tree

3 files changed: 25 additions, 4 deletions


docs/modules/rein.rst

Lines changed: 8 additions & 0 deletions
@@ -9,6 +9,7 @@ Reinforcement Learning.
 
   discount_episode_rewards
   cross_entropy_reward_loss
+  log_weight
   choice_action_by_probs
 
 
@@ -18,8 +19,15 @@ Reward functions
 
 Cost functions
 ---------------------
+
+Weighted Cross Entropy
+^^^^^^^^^^^^^^^^^^^^^^^^
 .. autofunction:: cross_entropy_reward_loss
 
+Log weight
+^^^^^^^^^^^^^^
+.. autofunction:: log_weight
+
 Sampling functions
 ---------------------
 .. autofunction:: choice_action_by_probs
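
For orientation, here is a minimal sketch of how the cost and sampling functions documented above fit together in a policy-gradient setup. The placeholder shapes, the two-action network, and the optimizer settings are illustrative assumptions, not part of this commit:

import numpy as np
import tensorflow as tf
import tensorlayer as tl

# Hypothetical placeholders: 4-dimensional states, 2 discrete actions.
states = tf.placeholder(tf.float32, [None, 4], name='states')
actions = tf.placeholder(tf.int32, [None], name='actions')
rewards = tf.placeholder(tf.float32, [None], name='rewards')  # e.g. discounted returns

# A small policy network producing action logits (illustrative architecture).
net = tl.layers.InputLayer(states, name='pg_input')
net = tl.layers.DenseLayer(net, n_units=2, act=tf.identity, name='pg_logits')
logits = net.outputs
probs = tf.nn.softmax(logits)

# Weighted Cross Entropy: cross entropy of the taken actions, scaled by the rewards.
loss = tl.rein.cross_entropy_reward_loss(logits, actions, rewards)
train_op = tf.train.RMSPropOptimizer(learning_rate=0.01, decay=0.9).minimize(loss)

# Sampling: draw an action from the predicted distribution during a rollout
# (prob_values would normally come from sess.run(probs, ...)).
prob_values = np.asarray([0.7, 0.3])
action = tl.rein.choice_action_by_probs(prob_values, action_list=[0, 1])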

example/tutorial_frozenlake_dqn.py

Lines changed: 1 addition & 2 deletions
@@ -61,7 +61,6 @@ def to_one_hot(i, n_classes=None):
 lambd = .99 # decay factor
 e = 0.1 # e-Greedy Exploration, the larger the more random
 num_episodes = 10000
-rList = [] # rewards for each episode
 with tf.Session() as sess:
     tl.layers.initialize_global_variables(sess)
     for i in range(num_episodes):
@@ -95,6 +94,6 @@ def to_one_hot(i, n_classes=None):
                 break
 
         ## Note that, the rewards here with random action
-        running_reward = r if running_reward is None else running_reward * 0.99 + r * 0.01
+        running_reward = rAll if running_reward is None else running_reward * 0.99 + rAll * 0.01
         print("Episode [%d/%d] sum reward:%f running reward:%f took:%.5fs %s" %
             (i, num_episodes, rAll, running_reward, time.time()-episode_time, '' if rAll == 0 else ' !!!!!!!!'))
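
The change above replaces the per-step reward r with the episode return rAll in the exponential moving average, so running_reward now tracks whole episodes rather than the last step. A self-contained sketch of that update rule, with made-up episode returns for illustration:

# Exponential moving average of episode returns, mirroring the fixed line above.
running_reward = None
episode_returns = [0.0, 1.0, 0.0, 1.0]  # made-up rAll values

for rAll in episode_returns:
    running_reward = rAll if running_reward is None else running_reward * 0.99 + rAll * 0.01
    print("sum reward: %.1f  running reward: %.4f" % (rAll, running_reward))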

tensorlayer/rein.py

Lines changed: 16 additions & 2 deletions
@@ -72,18 +72,32 @@ def cross_entropy_reward_loss(logits, actions, rewards, name=None):
     >>> train_op = tf.train.RMSPropOptimizer(learning_rate, decay_rate).minimize(loss)
     """
 
-    try: # TF 1.0
+    try: # TF 1.0+
         cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=actions, logits=logits, name=name)
     except:
         cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, targets=actions)
     # cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, actions)
 
-    try: ## TF1.0
+    try: ## TF1.0+
         loss = tf.reduce_sum(tf.multiply(cross_entropy, rewards))
     except: ## TF0.12
         loss = tf.reduce_sum(tf.mul(cross_entropy, rewards)) # element-wise mul
     return loss
 
+def log_weight(probs, weights, name='log_weight'):
+    """Log weight.
+
+    Parameters
+    -----------
+    probs : tensor
+        If it is a network output, usually we should scale it to [0, 1] via softmax.
+    weights : tensor
+    """
+    with tf.variable_scope(name):
+        exp_v = tf.reduce_mean(tf.log(probs) * weights)
+        return exp_v
+
+
 
 def choice_action_by_probs(probs=[0.5, 0.5], action_list=None):
     """Choose and return an action according to the given action probability distribution.
