
Commit 34300b9

Author: Ervin T
Update reward signals in parallel with policy (#2362)
1 parent b617de4 commit 34300b9

Showing 13 changed files with 334 additions and 346 deletions.


docs/Reward-Signals.md

Lines changed: 1 addition & 30 deletions
@@ -71,7 +71,7 @@ Typical Range: `0.8` - `0.995`
 
 ### Curiosity Reward Signal
 
-The `curiosity` reward signal enables the Intrinsic Curiosity Module. This is an implementation
+The `curiosity` Reward Signal enables the Intrinsic Curiosity Module. This is an implementation
 of the approach described in "Curiosity-driven Exploration by Self-supervised Prediction"
 by Pathak, et al. It trains two networks:
 * an inverse model, which takes the current and next obersvation of the agent, encodes them, and
@@ -121,15 +121,6 @@ Default Value: `3e-4`
 
 Typical Range: `1e-5` - `1e-3`
 
-#### (Optional) Num Epochs
-
-`num_epoch` The number of passes to make through the experience buffer when performing gradient
-descent optimization for the ICM. This typically should be set to the same as used for PPO.
-
-Default Value: `3`
-
-Typical Range: `3` - `10`
-
 ### GAIL Reward Signal
 
 GAIL, or [Generative Adversarial Imitation Learning](https://arxiv.org/abs/1606.03476), is an
@@ -214,23 +205,3 @@ However, it does increase training time. Enable this if you notice your imitation learning is
 unstable, or unable to learn the task at hand.
 
 Default Value: `false`
-
-#### (Optional) Samples Per Update
-
-`samples_per_update` is the maximum number of samples to use during each discriminator update. You may
-want to lower this if your buffer size is very large to avoid overfitting the discriminator on current data.
-If set to 0, we will use the minimum of buffer size and the number of demonstration samples.
-
-Default Value: `0`
-
-Typical Range: Approximately equal to [`buffer_size`](Training-PPO.md)
-
-#### (Optional) Num Epochs
-
-`num_epoch` The number of passes to make through the experience buffer when performing gradient
-descent optimization for the discriminator. To avoid overfitting, this typically should be set to
-the same as or less than used for PPO.
-
-Default Value: `3`
-
-Typical Range: `1` - `10`
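
The per-signal hyperparameters removed above (`num_epoch` for the ICM, and `num_epoch` plus `samples_per_update` for the GAIL discriminator) are dropped because reward-signal models are now updated alongside the policy on its own epoch/minibatch schedule, which is the point of this commit. For context, a minimal sketch of a `reward_signals` block after this change, written here as the Python dict the trainer ultimately parses from the YAML config; the strength values and the demo path are illustrative, not taken from this commit:

```python
# Sketch of a post-change reward_signals block, expressed as a Python dict.
# There is no per-signal num_epoch or samples_per_update any more; the signals
# reuse the policy's update schedule. Values and demo_path are illustrative.
reward_signals_config = {
    "extrinsic": {"strength": 1.0, "gamma": 0.99},
    "curiosity": {
        "strength": 0.02,
        "gamma": 0.99,
        "encoding_size": 128,   # hidden encoding size of the ICM
        "learning_rate": 3e-4,  # ICM learning rate
    },
    "gail": {
        "strength": 0.01,
        "gamma": 0.99,
        "demo_path": "demos/Expert.demo",  # hypothetical demonstration file
    },
}
```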

ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py

Lines changed: 34 additions & 57 deletions
@@ -2,21 +2,24 @@
 import numpy as np
 from mlagents.envs.brain import BrainInfo
 
+import tensorflow as tf
+
 from mlagents.trainers.buffer import Buffer
 from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
 from mlagents.trainers.components.reward_signals.curiosity.model import CuriosityModel
 from mlagents.trainers.tf_policy import TFPolicy
+from mlagents.trainers.models import LearningModel
 
 
 class CuriosityRewardSignal(RewardSignal):
     def __init__(
         self,
         policy: TFPolicy,
+        policy_model: LearningModel,
         strength: float,
         gamma: float,
         encoding_size: int = 128,
         learning_rate: float = 3e-4,
-        num_epoch: int = 3,
     ):
         """
         Creates the Curiosity reward generator
@@ -26,18 +29,20 @@ def __init__(
         :param gamma: The time discounting factor used for this reward.
         :param encoding_size: The size of the hidden encoding layer for the ICM
         :param learning_rate: The learning rate for the ICM.
-        :param num_epoch: The number of epochs to train over the training buffer for the ICM.
         """
-        super().__init__(policy, strength, gamma)
+        super().__init__(policy, policy_model, strength, gamma)
         self.model = CuriosityModel(
-            policy.model, encoding_size=encoding_size, learning_rate=learning_rate
+            policy_model, encoding_size=encoding_size, learning_rate=learning_rate
        )
-        self.num_epoch = num_epoch
         self.use_terminal_states = False
         self.update_dict = {
-            "forward_loss": self.model.forward_loss,
-            "inverse_loss": self.model.inverse_loss,
-            "update": self.model.update_batch,
+            "curiosity_forward_loss": self.model.forward_loss,
+            "curiosity_inverse_loss": self.model.inverse_loss,
+            "curiosity_update": self.model.update_batch,
+        }
+        self.stats_name_to_update_name = {
+            "Losses/Curiosity Forward Loss": "curiosity_forward_loss",
+            "Losses/Curiosity Inverse Loss": "curiosity_inverse_loss",
         }
         self.has_updated = False
 
@@ -89,67 +94,39 @@ def check_config(
         param_keys = ["strength", "gamma", "encoding_size"]
         super().check_config(config_dict, param_keys)
 
-    def update(self, update_buffer: Buffer, num_sequences: int) -> Dict[str, float]:
-        """
-        Updates Curiosity model using training buffer. Divides training buffer into mini batches and performs
-        gradient descent.
-        :param update_buffer: Update buffer from which to pull data from.
-        :param num_sequences: Number of sequences in the update buffer.
-        :return: Dict of stats that should be reported to Tensorboard.
-        """
-        forward_total: List[float] = []
-        inverse_total: List[float] = []
-        for _ in range(self.num_epoch):
-            update_buffer.shuffle(sequence_length=self.policy.sequence_length)
-            buffer = update_buffer
-            for l in range(len(update_buffer["actions"]) // num_sequences):
-                start = l * num_sequences
-                end = (l + 1) * num_sequences
-                run_out_curio = self._update_batch(
-                    buffer.make_mini_batch(start, end), num_sequences
-                )
-                inverse_total.append(run_out_curio["inverse_loss"])
-                forward_total.append(run_out_curio["forward_loss"])
-
-        update_stats = {
-            "Losses/Curiosity Forward Loss": np.mean(forward_total),
-            "Losses/Curiosity Inverse Loss": np.mean(inverse_total),
-        }
-        return update_stats
-
-    def _update_batch(
-        self, mini_batch: Dict[str, np.ndarray], num_sequences: int
-    ) -> Dict[str, float]:
+    def prepare_update(
+        self,
+        policy_model: LearningModel,
+        mini_batch: Dict[str, np.ndarray],
+        num_sequences: int,
+    ) -> Dict[tf.Tensor, Any]:
         """
-        Updates model using buffer.
+        Prepare for update and get feed_dict.
         :param num_sequences: Number of trajectories in batch.
         :param mini_batch: Experience batch.
-        :return: Output from update process.
+        :return: Feed_dict needed for update.
         """
         feed_dict = {
-            self.policy.model.batch_size: num_sequences,
-            self.policy.model.sequence_length: self.policy.sequence_length,
-            self.policy.model.mask_input: mini_batch["masks"],
-            self.policy.model.advantage: mini_batch["advantages"],
-            self.policy.model.all_old_log_probs: mini_batch["action_probs"],
+            policy_model.batch_size: num_sequences,
+            policy_model.sequence_length: self.policy.sequence_length,
+            policy_model.mask_input: mini_batch["masks"],
+            policy_model.advantage: mini_batch["advantages"],
+            policy_model.all_old_log_probs: mini_batch["action_probs"],
         }
         if self.policy.use_continuous_act:
-            feed_dict[self.policy.model.output_pre] = mini_batch["actions_pre"]
+            feed_dict[policy_model.output_pre] = mini_batch["actions_pre"]
         else:
-            feed_dict[self.policy.model.action_holder] = mini_batch["actions"]
+            feed_dict[policy_model.action_holder] = mini_batch["actions"]
         if self.policy.use_vec_obs:
-            feed_dict[self.policy.model.vector_in] = mini_batch["vector_obs"]
+            feed_dict[policy_model.vector_in] = mini_batch["vector_obs"]
             feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"]
-        if self.policy.model.vis_obs_size > 0:
-            for i, _ in enumerate(self.policy.model.visual_in):
-                feed_dict[self.policy.model.visual_in[i]] = mini_batch[
-                    "visual_obs%d" % i
-                ]
-            for i, _ in enumerate(self.policy.model.visual_in):
+        if policy_model.vis_obs_size > 0:
+            for i, _ in enumerate(policy_model.visual_in):
+                feed_dict[policy_model.visual_in[i]] = mini_batch["visual_obs%d" % i]
+            for i, _ in enumerate(policy_model.visual_in):
                 feed_dict[self.model.next_visual_in[i]] = mini_batch[
                     "next_visual_obs%d" % i
                 ]
 
         self.has_updated = True
-        run_out = self.policy._execute_model(feed_dict, self.update_dict)
-        return run_out
+        return feed_dict
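
With `update()` removed, the signal no longer runs its own epoch loop; it only contributes placeholders (via `prepare_update`) and ops (via `update_dict`), which the trainer folds into the policy's update step. Below is a minimal sketch of how a trainer might combine these pieces in one session call; the policy-side names (`construct_feed_dict`, `policy.update_dict`, `policy.sess`) and the function name are assumptions for illustration, not the commit's actual trainer code.

```python
from typing import Any, Dict


def update_policy_and_signals(
    policy, policy_model, reward_signals, mini_batch, num_sequences
) -> Dict[str, Any]:
    # Start from the policy's own placeholders and fetches (names assumed).
    feed_dict = policy.construct_feed_dict(policy_model, mini_batch, num_sequences)
    fetches = dict(policy.update_dict)

    stats_needed = {}
    for signal in reward_signals.values():
        # Each signal adds its placeholders and ops to the *same* sess.run call,
        # instead of running a separate epoch loop after the policy update.
        feed_dict.update(signal.prepare_update(policy_model, mini_batch, num_sequences))
        fetches.update(signal.update_dict)
        stats_needed.update(signal.stats_name_to_update_name)

    run_out = policy.sess.run(fetches, feed_dict=feed_dict)

    # Map Tensorboard stat names (e.g. "Losses/Curiosity Forward Loss") to the
    # values fetched for the corresponding update_dict keys.
    return {stat: run_out[key] for stat, key in stats_needed.items()}
```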

ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py

Lines changed: 9 additions & 8 deletions
@@ -5,18 +5,25 @@
 from mlagents.trainers.buffer import Buffer
 from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
 from mlagents.trainers.tf_policy import TFPolicy
+from mlagents.trainers.models import LearningModel
 
 
 class ExtrinsicRewardSignal(RewardSignal):
-    def __init__(self, policy: TFPolicy, strength: float, gamma: float):
+    def __init__(
+        self,
+        policy: TFPolicy,
+        policy_model: LearningModel,
+        strength: float,
+        gamma: float,
+    ):
         """
         The extrinsic reward generator. Returns the reward received by the environment
         :param policy: The Policy object (e.g. PPOPolicy) that this Reward Signal will apply to.
         :param strength: The strength of the reward. The reward's raw value will be multiplied by this value.
         :param gamma: The time discounting factor used for this reward.
         :return: An ExtrinsicRewardSignal object.
         """
-        super().__init__(policy, strength, gamma)
+        super().__init__(policy, policy_model, strength, gamma)
 
     @classmethod
     def check_config(
@@ -46,9 +53,3 @@ def evaluate(
         unscaled_reward = np.array(next_info.rewards)
         scaled_reward = self.strength * unscaled_reward
         return RewardSignalResult(scaled_reward, unscaled_reward)
-
-    def update(self, update_buffer: Buffer, num_sequences: int) -> Dict[str, float]:
-        """
-        This method does nothing, as there is nothing to update.
-        """
-        return {}
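
After this change the class carries only the constructor and the strength scaling in `evaluate`; the no-op `update` override is no longer needed. A self-contained toy illustration of that scaling contract (the reward values are invented for illustration):

```python
import numpy as np

# Toy illustration of ExtrinsicRewardSignal.evaluate()'s contract: the
# environment reward passes through unchanged except for the strength
# multiplier. Reward values here are invented.
strength = 0.5
unscaled_reward = np.array([1.0, 0.0, -0.2])  # rewards reported by the env
scaled_reward = strength * unscaled_reward    # what the trainer optimizes

assert np.allclose(scaled_reward, [0.5, 0.0, -0.1])
```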

ml-agents/mlagents/trainers/components/reward_signals/gail/model.py

Lines changed: 16 additions & 13 deletions
@@ -37,28 +37,22 @@ def __init__(
         self.gradient_penalty_weight = gradient_penalty_weight
         self.use_vail = use_vail
         self.use_actions = use_actions  # True # Not using actions
-        self.make_beta()
         self.make_inputs()
         self.create_network()
         self.create_loss(learning_rate)
+        if self.use_vail:
+            self.make_beta_update()
 
-    def make_beta(self) -> None:
+    def make_beta_update(self) -> None:
         """
         Creates the beta parameter and its updater for GAIL
         """
-        self.beta = tf.get_variable(
-            "gail_beta",
-            [],
-            trainable=False,
-            dtype=tf.float32,
-            initializer=tf.ones_initializer(),
-        )
-        self.kl_div_input = tf.placeholder(shape=[], dtype=tf.float32)
+
         new_beta = tf.maximum(
-            self.beta + self.alpha * (self.kl_div_input - self.mutual_information),
-            EPSILON,
+            self.beta + self.alpha * (self.kl_loss - self.mutual_information), EPSILON
         )
-        self.update_beta = tf.assign(self.beta, new_beta)
+        with tf.control_dependencies(self.update_batch):
+            self.update_beta = tf.assign(self.beta, new_beta)
 
     def make_inputs(self) -> None:
         """
@@ -271,6 +265,15 @@ def create_loss(self, learning_rate: float) -> None:
         self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate)
         self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate)
 
+        if self.use_vail:
+            self.beta = tf.get_variable(
+                "gail_beta",
+                [],
+                trainable=False,
+                dtype=tf.float32,
+                initializer=tf.ones_initializer(),
+            )
+
         self.discriminator_loss = -tf.reduce_mean(
             tf.log(self.expert_estimate + EPSILON)
             + tf.log(1.0 - self.policy_estimate + EPSILON)
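
The rewritten `make_beta_update` ties the beta assignment to `self.update_batch` with a control dependency, so beta is adjusted in the same `sess.run` that performs the discriminator update instead of through the old `kl_div_input` placeholder in a separate call. A minimal, self-contained sketch of that control-dependency pattern, assuming TensorFlow 1.x (as ml-agents used at the time); all names here are illustrative stand-ins, not the GAIL model's:

```python
import tensorflow as tf

# `increment` stands in for the discriminator's update op; `update_beta`
# stands in for the beta adjustment that must run only after it.
counter = tf.get_variable("counter", [], dtype=tf.float32,
                          initializer=tf.zeros_initializer(), trainable=False)
beta = tf.get_variable("beta", [], dtype=tf.float32,
                       initializer=tf.ones_initializer(), trainable=False)

increment = tf.assign_add(counter, 1.0)
with tf.control_dependencies([increment]):
    # Ops created inside this context run only after `increment` has executed.
    update_beta = tf.assign(beta, tf.maximum(beta - 0.1, 1e-7))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(update_beta)     # the dependency forces `increment` to run as well
    print(sess.run(counter))  # 1.0: the depended-on op ran exactly once
    print(sess.run(beta))     # 0.9
```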
