
Commit 34300b9

Author: Ervin T
Update reward signals in parallel with policy (#2362)
1 parent b617de4 commit 34300b9

Showing 13 changed files with 334 additions and 346 deletions.


docs/Reward-Signals.md

Lines changed: 1 addition & 30 deletions
@@ -71,7 +71,7 @@ Typical Range: `0.8` - `0.995`
 
 ### Curiosity Reward Signal
 
-The `curiosity` reward signal enables the Intrinsic Curiosity Module. This is an implementation
+The `curiosity` Reward Signal enables the Intrinsic Curiosity Module. This is an implementation
 of the approach described in "Curiosity-driven Exploration by Self-supervised Prediction"
 by Pathak, et al. It trains two networks:
 * an inverse model, which takes the current and next obersvation of the agent, encodes them, and
@@ -121,15 +121,6 @@ Default Value: `3e-4`
 
 Typical Range: `1e-5` - `1e-3`
 
-#### (Optional) Num Epochs
-
-`num_epoch` The number of passes to make through the experience buffer when performing gradient
-descent optimization for the ICM. This typically should be set to the same as used for PPO.
-
-Default Value: `3`
-
-Typical Range: `3` - `10`
-
 ### GAIL Reward Signal
 
 GAIL, or [Generative Adversarial Imitation Learning](https://arxiv.org/abs/1606.03476), is an
@@ -214,23 +205,3 @@ However, it does increase training time. Enable this if you notice your imitation learning is
 unstable, or unable to learn the task at hand.
 
 Default Value: `false`
-
-#### (Optional) Samples Per Update
-
-`samples_per_update` is the maximum number of samples to use during each discriminator update. You may
-want to lower this if your buffer size is very large to avoid overfitting the discriminator on current data.
-If set to 0, we will use the minimum of buffer size and the number of demonstration samples.
-
-Default Value: `0`
-
-Typical Range: Approximately equal to [`buffer_size`](Training-PPO.md)
-
-#### (Optional) Num Epochs
-
-`num_epoch` The number of passes to make through the experience buffer when performing gradient
-descent optimization for the discriminator. To avoid overfitting, this typically should be set to
-the same as or less than used for PPO.
-
-Default Value: `3`
-
-Typical Range: `1` - `10`
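
The per-signal hyperparameters removed above (`num_epoch` for the ICM, and `num_epoch` plus `samples_per_update` for the GAIL discriminator) are dropped because reward-signal models are now updated alongside the policy on its own epoch/minibatch schedule, which is the point of this commit. For context, a minimal sketch of a `reward_signals` block after this change, written here as the Python dict the trainer ultimately parses from the YAML config; the strength values and the demo path are illustrative, not taken from this commit:

```python
# Sketch of a post-change reward_signals block, expressed as a Python dict.
# There is no per-signal num_epoch or samples_per_update any more; the signals
# reuse the policy's update schedule. Values and demo_path are illustrative.
reward_signals_config = {
    "extrinsic": {"strength": 1.0, "gamma": 0.99},
    "curiosity": {
        "strength": 0.02,
        "gamma": 0.99,
        "encoding_size": 128,   # hidden encoding size of the ICM
        "learning_rate": 3e-4,  # ICM learning rate
    },
    "gail": {
        "strength": 0.01,
        "gamma": 0.99,
        "demo_path": "demos/Expert.demo",  # hypothetical demonstration file
    },
}
```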

ml-agents/mlagents/trainers/components/reward_signals/curiosity/signal.py

Lines changed: 34 additions & 57 deletions
@@ -2,21 +2,24 @@
 import numpy as np
 from mlagents.envs.brain import BrainInfo
 
+import tensorflow as tf
+
 from mlagents.trainers.buffer import Buffer
 from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
 from mlagents.trainers.components.reward_signals.curiosity.model import CuriosityModel
 from mlagents.trainers.tf_policy import TFPolicy
+from mlagents.trainers.models import LearningModel
 
 
 class CuriosityRewardSignal(RewardSignal):
     def __init__(
         self,
         policy: TFPolicy,
+        policy_model: LearningModel,
         strength: float,
         gamma: float,
         encoding_size: int = 128,
         learning_rate: float = 3e-4,
-        num_epoch: int = 3,
     ):
         """
         Creates the Curiosity reward generator
@@ -26,18 +29,20 @@ def __init__(
         :param gamma: The time discounting factor used for this reward.
         :param encoding_size: The size of the hidden encoding layer for the ICM
         :param learning_rate: The learning rate for the ICM.
-        :param num_epoch: The number of epochs to train over the training buffer for the ICM.
         """
-        super().__init__(policy, strength, gamma)
+        super().__init__(policy, policy_model, strength, gamma)
         self.model = CuriosityModel(
-            policy.model, encoding_size=encoding_size, learning_rate=learning_rate
+            policy_model, encoding_size=encoding_size, learning_rate=learning_rate
        )
-        self.num_epoch = num_epoch
         self.use_terminal_states = False
         self.update_dict = {
-            "forward_loss": self.model.forward_loss,
-            "inverse_loss": self.model.inverse_loss,
-            "update": self.model.update_batch,
+            "curiosity_forward_loss": self.model.forward_loss,
+            "curiosity_inverse_loss": self.model.inverse_loss,
+            "curiosity_update": self.model.update_batch,
+        }
+        self.stats_name_to_update_name = {
+            "Losses/Curiosity Forward Loss": "curiosity_forward_loss",
+            "Losses/Curiosity Inverse Loss": "curiosity_inverse_loss",
         }
         self.has_updated = False
 
@@ -89,67 +94,39 @@ def check_config(
         param_keys = ["strength", "gamma", "encoding_size"]
         super().check_config(config_dict, param_keys)
 
-    def update(self, update_buffer: Buffer, num_sequences: int) -> Dict[str, float]:
-        """
-        Updates Curiosity model using training buffer. Divides training buffer into mini batches and performs
-        gradient descent.
-        :param update_buffer: Update buffer from which to pull data from.
-        :param num_sequences: Number of sequences in the update buffer.
-        :return: Dict of stats that should be reported to Tensorboard.
-        """
-        forward_total: List[float] = []
-        inverse_total: List[float] = []
-        for _ in range(self.num_epoch):
-            update_buffer.shuffle(sequence_length=self.policy.sequence_length)
-            buffer = update_buffer
-            for l in range(len(update_buffer["actions"]) // num_sequences):
-                start = l * num_sequences
-                end = (l + 1) * num_sequences
-                run_out_curio = self._update_batch(
-                    buffer.make_mini_batch(start, end), num_sequences
-                )
-                inverse_total.append(run_out_curio["inverse_loss"])
-                forward_total.append(run_out_curio["forward_loss"])
-
-        update_stats = {
-            "Losses/Curiosity Forward Loss": np.mean(forward_total),
-            "Losses/Curiosity Inverse Loss": np.mean(inverse_total),
-        }
-        return update_stats
-
-    def _update_batch(
-        self, mini_batch: Dict[str, np.ndarray], num_sequences: int
-    ) -> Dict[str, float]:
+    def prepare_update(
+        self,
+        policy_model: LearningModel,
+        mini_batch: Dict[str, np.ndarray],
+        num_sequences: int,
+    ) -> Dict[tf.Tensor, Any]:
         """
-        Updates model using buffer.
+        Prepare for update and get feed_dict.
         :param num_sequences: Number of trajectories in batch.
         :param mini_batch: Experience batch.
-        :return: Output from update process.
+        :return: Feed_dict needed for update.
         """
         feed_dict = {
-            self.policy.model.batch_size: num_sequences,
-            self.policy.model.sequence_length: self.policy.sequence_length,
-            self.policy.model.mask_input: mini_batch["masks"],
-            self.policy.model.advantage: mini_batch["advantages"],
-            self.policy.model.all_old_log_probs: mini_batch["action_probs"],
+            policy_model.batch_size: num_sequences,
+            policy_model.sequence_length: self.policy.sequence_length,
+            policy_model.mask_input: mini_batch["masks"],
+            policy_model.advantage: mini_batch["advantages"],
+            policy_model.all_old_log_probs: mini_batch["action_probs"],
         }
         if self.policy.use_continuous_act:
-            feed_dict[self.policy.model.output_pre] = mini_batch["actions_pre"]
+            feed_dict[policy_model.output_pre] = mini_batch["actions_pre"]
         else:
-            feed_dict[self.policy.model.action_holder] = mini_batch["actions"]
+            feed_dict[policy_model.action_holder] = mini_batch["actions"]
         if self.policy.use_vec_obs:
-            feed_dict[self.policy.model.vector_in] = mini_batch["vector_obs"]
+            feed_dict[policy_model.vector_in] = mini_batch["vector_obs"]
             feed_dict[self.model.next_vector_in] = mini_batch["next_vector_in"]
-        if self.policy.model.vis_obs_size > 0:
-            for i, _ in enumerate(self.policy.model.visual_in):
-                feed_dict[self.policy.model.visual_in[i]] = mini_batch[
-                    "visual_obs%d" % i
-                ]
-            for i, _ in enumerate(self.policy.model.visual_in):
+        if policy_model.vis_obs_size > 0:
+            for i, _ in enumerate(policy_model.visual_in):
+                feed_dict[policy_model.visual_in[i]] = mini_batch["visual_obs%d" % i]
+            for i, _ in enumerate(policy_model.visual_in):
                 feed_dict[self.model.next_visual_in[i]] = mini_batch[
                     "next_visual_obs%d" % i
                 ]
 
         self.has_updated = True
-        run_out = self.policy._execute_model(feed_dict, self.update_dict)
-        return run_out
+        return feed_dict
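
With `update()` removed, the signal no longer runs its own epoch loop; it only contributes placeholders (via `prepare_update`) and ops (via `update_dict`), which the trainer folds into the policy's update step. Below is a minimal sketch of how a trainer might combine these pieces in one session call; the policy-side names (`construct_feed_dict`, `policy.update_dict`, `policy.sess`) and the function name are assumptions for illustration, not the commit's actual trainer code.

```python
from typing import Any, Dict


def update_policy_and_signals(
    policy, policy_model, reward_signals, mini_batch, num_sequences
) -> Dict[str, Any]:
    # Start from the policy's own placeholders and fetches (names assumed).
    feed_dict = policy.construct_feed_dict(policy_model, mini_batch, num_sequences)
    fetches = dict(policy.update_dict)

    stats_needed = {}
    for signal in reward_signals.values():
        # Each signal adds its placeholders and ops to the *same* sess.run call,
        # instead of running a separate epoch loop after the policy update.
        feed_dict.update(signal.prepare_update(policy_model, mini_batch, num_sequences))
        fetches.update(signal.update_dict)
        stats_needed.update(signal.stats_name_to_update_name)

    run_out = policy.sess.run(fetches, feed_dict=feed_dict)

    # Map Tensorboard stat names (e.g. "Losses/Curiosity Forward Loss") to the
    # values fetched for the corresponding update_dict keys.
    return {stat: run_out[key] for stat, key in stats_needed.items()}
```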

ml-agents/mlagents/trainers/components/reward_signals/extrinsic/signal.py

Lines changed: 9 additions & 8 deletions
@@ -5,18 +5,25 @@
 from mlagents.trainers.buffer import Buffer
 from mlagents.trainers.components.reward_signals import RewardSignal, RewardSignalResult
 from mlagents.trainers.tf_policy import TFPolicy
+from mlagents.trainers.models import LearningModel
 
 
 class ExtrinsicRewardSignal(RewardSignal):
-    def __init__(self, policy: TFPolicy, strength: float, gamma: float):
+    def __init__(
+        self,
+        policy: TFPolicy,
+        policy_model: LearningModel,
+        strength: float,
+        gamma: float,
+    ):
         """
         The extrinsic reward generator. Returns the reward received by the environment
         :param policy: The Policy object (e.g. PPOPolicy) that this Reward Signal will apply to.
         :param strength: The strength of the reward. The reward's raw value will be multiplied by this value.
         :param gamma: The time discounting factor used for this reward.
         :return: An ExtrinsicRewardSignal object.
         """
-        super().__init__(policy, strength, gamma)
+        super().__init__(policy, policy_model, strength, gamma)
 
     @classmethod
     def check_config(
@@ -46,9 +53,3 @@ def evaluate(
         unscaled_reward = np.array(next_info.rewards)
         scaled_reward = self.strength * unscaled_reward
         return RewardSignalResult(scaled_reward, unscaled_reward)
-
-    def update(self, update_buffer: Buffer, num_sequences: int) -> Dict[str, float]:
-        """
-        This method does nothing, as there is nothing to update.
-        """
-        return {}
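
After this change the class carries only the constructor and the strength scaling in `evaluate`; the no-op `update` override is no longer needed. A self-contained toy illustration of that scaling contract (the reward values are invented for illustration):

```python
import numpy as np

# Toy illustration of ExtrinsicRewardSignal.evaluate()'s contract: the
# environment reward passes through unchanged except for the strength
# multiplier. Reward values here are invented.
strength = 0.5
unscaled_reward = np.array([1.0, 0.0, -0.2])  # rewards reported by the env
scaled_reward = strength * unscaled_reward    # what the trainer optimizes

assert np.allclose(scaled_reward, [0.5, 0.0, -0.1])
```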

ml-agents/mlagents/trainers/components/reward_signals/gail/model.py

Lines changed: 16 additions & 13 deletions
@@ -37,28 +37,22 @@ def __init__(
         self.gradient_penalty_weight = gradient_penalty_weight
         self.use_vail = use_vail
         self.use_actions = use_actions  # True # Not using actions
-        self.make_beta()
         self.make_inputs()
         self.create_network()
         self.create_loss(learning_rate)
+        if self.use_vail:
+            self.make_beta_update()
 
-    def make_beta(self) -> None:
+    def make_beta_update(self) -> None:
         """
         Creates the beta parameter and its updater for GAIL
         """
-        self.beta = tf.get_variable(
-            "gail_beta",
-            [],
-            trainable=False,
-            dtype=tf.float32,
-            initializer=tf.ones_initializer(),
-        )
-        self.kl_div_input = tf.placeholder(shape=[], dtype=tf.float32)
+
         new_beta = tf.maximum(
-            self.beta + self.alpha * (self.kl_div_input - self.mutual_information),
-            EPSILON,
+            self.beta + self.alpha * (self.kl_loss - self.mutual_information), EPSILON
         )
-        self.update_beta = tf.assign(self.beta, new_beta)
+        with tf.control_dependencies(self.update_batch):
+            self.update_beta = tf.assign(self.beta, new_beta)
 
     def make_inputs(self) -> None:
         """
@@ -271,6 +265,15 @@ def create_loss(self, learning_rate: float) -> None:
         self.mean_expert_estimate = tf.reduce_mean(self.expert_estimate)
         self.mean_policy_estimate = tf.reduce_mean(self.policy_estimate)
 
+        if self.use_vail:
+            self.beta = tf.get_variable(
+                "gail_beta",
+                [],
+                trainable=False,
+                dtype=tf.float32,
+                initializer=tf.ones_initializer(),
+            )
+
         self.discriminator_loss = -tf.reduce_mean(
             tf.log(self.expert_estimate + EPSILON)
             + tf.log(1.0 - self.policy_estimate + EPSILON)
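
The rewritten `make_beta_update` ties the beta assignment to `self.update_batch` with a control dependency, so beta is adjusted in the same `sess.run` that performs the discriminator update instead of through the old `kl_div_input` placeholder in a separate call. A minimal, self-contained sketch of that control-dependency pattern, assuming TensorFlow 1.x (as ml-agents used at the time); all names here are illustrative stand-ins, not the GAIL model's:

```python
import tensorflow as tf

# `increment` stands in for the discriminator's update op; `update_beta`
# stands in for the beta adjustment that must run only after it.
counter = tf.get_variable("counter", [], dtype=tf.float32,
                          initializer=tf.zeros_initializer(), trainable=False)
beta = tf.get_variable("beta", [], dtype=tf.float32,
                       initializer=tf.ones_initializer(), trainable=False)

increment = tf.assign_add(counter, 1.0)
with tf.control_dependencies([increment]):
    # Ops created inside this context run only after `increment` has executed.
    update_beta = tf.assign(beta, tf.maximum(beta - 0.1, 1e-7))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(update_beta)     # the dependency forces `increment` to run as well
    print(sess.run(counter))  # 1.0: the depended-on op ran exactly once
    print(sess.run(beta))     # 0.9
```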
