
Commit c5226f6

Ervin T authored and xiaomaogy committed
Clear cumulative_returns_since_policy_update (#2120)
Previously, the mean rewards reported in the CSV file would lag far behind those reported elsewhere in the code, since this buffer was never cleared.
1 parent 6d8c494 commit c5226f6

File tree

1 file changed: +1 addition, 0 deletions


ml-agents/mlagents/trainers/ppo/trainer.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -422,6 +422,7 @@ def update_policy(self):
             number_experiences=len(self.training_buffer.update_buffer["actions"]),
             mean_return=float(np.mean(self.cumulative_returns_since_policy_update)),
         )
+        self.cumulative_returns_since_policy_update = []
         n_sequences = max(
             int(self.trainer_parameters["batch_size"] / self.policy.sequence_length), 1
         )
```
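
To make the effect of the one-line fix concrete, here is a minimal sketch of the reporting pattern involved. It is not the actual ml-agents `PPOTrainer`: only the `cumulative_returns_since_policy_update` attribute name comes from the real code, while `MiniTrainer` and `end_episode` are hypothetical names used for illustration.

```python
import numpy as np


class MiniTrainer:
    """Toy stand-in for a trainer that reports mean return per policy update."""

    def __init__(self):
        # Returns of episodes that finished since the last policy update.
        self.cumulative_returns_since_policy_update = []

    def end_episode(self, episode_return):
        # Called whenever an episode finishes during experience collection.
        self.cumulative_returns_since_policy_update.append(episode_return)

    def update_policy(self):
        # Report the mean return over episodes collected since the last update.
        mean_return = float(np.mean(self.cumulative_returns_since_policy_update))
        print(f"mean_return since last update: {mean_return:.2f}")
        # Without this reset, old episode returns stay in the buffer forever,
        # so the reported mean lags behind the policy's current performance.
        self.cumulative_returns_since_policy_update = []


trainer = MiniTrainer()
for ret in (1.0, 2.0, 3.0):
    trainer.end_episode(ret)
trainer.update_policy()   # mean over [1, 2, 3] -> 2.00

for ret in (10.0, 12.0):
    trainer.end_episode(ret)
trainer.update_policy()   # mean over [10, 12] -> 11.00, not over all five episodes
```

With the reset in place, each reported `mean_return` reflects only the episodes gathered since the previous update, which is what the commit restores in `trainer.py`.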
