Commit 60ae629

Author: Ervin T
[bug-fix] Add clipping to PyTorch policy, fix initialization (#4649) (#4662)
1 parent 6895bf1 commit 60ae629

9 files changed, +36 -15 lines changed


ml-agents/mlagents/trainers/policy/torch_policy.py

Lines changed: 7 additions & 1 deletion
@@ -77,6 +77,7 @@ def __init__(
             conditional_sigma=self.condition_sigma_on_obs,
             tanh_squash=tanh_squash,
         )
+        self._clip_action = not tanh_squash
         # Save the m_size needed for export
         self._export_m_size = self.m_size
         # m_size needed for training is determined by network, not trainer settings
@@ -203,8 +204,13 @@ def evaluate(
         action, log_probs, entropy, memories = self.sample_actions(
             vec_obs, vis_obs, masks=masks, memories=memories
         )
-        run_out["action"] = ModelUtils.to_numpy(action)
+
+        if self._clip_action and self.use_continuous_act:
+            clipped_action = torch.clamp(action, -3, 3) / 3
+        else:
+            clipped_action = action
         run_out["pre_action"] = ModelUtils.to_numpy(action)
+        run_out["action"] = ModelUtils.to_numpy(clipped_action)
         # Todo - make pre_action difference
         run_out["log_probs"] = ModelUtils.to_numpy(log_probs)
         run_out["entropy"] = ModelUtils.to_numpy(entropy)

ml-agents/mlagents/trainers/ppo/optimizer_torch.py

Lines changed: 1 addition & 1 deletion
@@ -136,7 +136,7 @@ def update(self, batch: AgentBuffer, num_sequences: int) -> Dict[str, float]:
         vec_obs = [ModelUtils.list_to_tensor(batch["vector_obs"])]
         act_masks = ModelUtils.list_to_tensor(batch["action_mask"])
         if self.policy.use_continuous_act:
-            actions = ModelUtils.list_to_tensor(batch["actions"]).unsqueeze(-1)
+            actions = ModelUtils.list_to_tensor(batch["actions_pre"]).unsqueeze(-1)
         else:
             actions = ModelUtils.list_to_tensor(batch["actions"], dtype=torch.long)
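
Note: the switch to "actions_pre" matters because the clipped action handed to the environment is no longer the value the distribution actually sampled, so the PPO ratio has to use log-probabilities of the pre-clip sample. A rough sketch of the distinction, using a stand-in unit Gaussian rather than the real policy head:

import torch
from torch.distributions import Normal

dist = Normal(loc=torch.zeros(1), scale=torch.ones(1))  # stand-in for the policy's Gaussian

pre_action = torch.tensor([2.5])                 # what the distribution sampled ("actions_pre")
env_action = torch.clamp(pre_action, -3, 3) / 3  # what the environment received ("actions")

print(dist.log_prob(pre_action))  # ~ -4.04, the quantity the PPO update needs
print(dist.log_prob(env_action))  # ~ -1.27, would bias the update if used instead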

ml-agents/mlagents/trainers/tests/torch/test_reward_providers/test_curiosity.py

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ def test_reward_decreases(behavior_spec: BehaviorSpec, seed: int) -> None:
     buffer = create_agent_buffer(behavior_spec, 5)
     curiosity_rp.update(buffer)
     reward_old = curiosity_rp.evaluate(buffer)[0]
-    for _ in range(10):
+    for _ in range(20):
         curiosity_rp.update(buffer)
     reward_new = curiosity_rp.evaluate(buffer)[0]
     assert reward_new < reward_old

ml-agents/mlagents/trainers/tests/torch/test_simple_rl.py

Lines changed: 1 addition & 1 deletion
@@ -121,7 +121,7 @@ def test_recurrent_ppo(use_discrete):
         PPO_TORCH_CONFIG,
         hyperparameters=new_hyperparams,
         network_settings=new_network_settings,
-        max_steps=5000,
+        max_steps=6000,
     )
     check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)

ml-agents/mlagents/trainers/tests/torch/test_utils.py

Lines changed: 1 addition & 1 deletion
@@ -160,7 +160,7 @@ def test_get_probs_and_entropy():
         action_list, dist_list
     )
     assert log_probs.shape == (1, 2, 2)
-    assert entropies.shape == (1, 2, 2)
+    assert entropies.shape == (1, 1, 2)
     assert all_probs is None

     for log_prob in log_probs.flatten():

ml-agents/mlagents/trainers/torch/distributions.py

Lines changed: 7 additions & 3 deletions
@@ -66,7 +66,11 @@ def pdf(self, value):
         return torch.exp(log_prob)

     def entropy(self):
-        return 0.5 * torch.log(2 * math.pi * math.e * self.std + EPSILON)
+        return torch.mean(
+            0.5 * torch.log(2 * math.pi * math.e * self.std + EPSILON),
+            dim=1,
+            keepdim=True,
+        )  # Use equivalent behavior to TF


 class TanhGaussianDistInstance(GaussianDistInstance):
@@ -131,7 +135,7 @@ def __init__(
             hidden_size,
             num_outputs,
             kernel_init=Initialization.KaimingHeNormal,
-            kernel_gain=0.1,
+            kernel_gain=0.2,
             bias_init=Initialization.Zero,
         )
         self.tanh_squash = tanh_squash
@@ -140,7 +144,7 @@ def __init__(
                 hidden_size,
                 num_outputs,
                 kernel_init=Initialization.KaimingHeNormal,
-                kernel_gain=0.1,
+                kernel_gain=0.2,
                 bias_init=Initialization.Zero,
             )
         else:
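
Note: the entropy change reduces the per-dimension terms to a single value per batch entry (mean over dim=1 with keepdim=True), which is also why test_get_probs_and_entropy above now expects shape (1, 1, 2) instead of (1, 2, 2). A small sketch of the before/after shapes, with a made-up std tensor and an assumed EPSILON value:

import math
import torch

EPSILON = 1e-7  # assumed to match the module's EPSILON constant

std = torch.tensor([[0.5, 1.5]])  # batch of 1, two continuous action dimensions

# Old behavior: one entropy term per action dimension -> shape (1, 2)
per_dim = 0.5 * torch.log(2 * math.pi * math.e * std + EPSILON)

# New behavior: mean across the action dimension, kept as a size-1 dim -> shape (1, 1)
entropy = torch.mean(per_dim, dim=1, keepdim=True)
print(per_dim.shape, entropy.shape)  # torch.Size([1, 2]) torch.Size([1, 1])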

ml-agents/mlagents/trainers/torch/encoders.py

Lines changed: 4 additions & 4 deletions
@@ -133,7 +133,7 @@ def __init__(
                 self.final_flat,
                 self.h_size,
                 kernel_init=Initialization.KaimingHeNormal,
-                kernel_gain=1.0,
+                kernel_gain=1.41,  # Use ReLU gain
             ),
             nn.LeakyReLU(),
         )
@@ -165,7 +165,7 @@ def __init__(
                 self.final_flat,
                 self.h_size,
                 kernel_init=Initialization.KaimingHeNormal,
-                kernel_gain=1.0,
+                kernel_gain=1.41,  # Use ReLU gain
             ),
             nn.LeakyReLU(),
         )
@@ -200,7 +200,7 @@ def __init__(
                 self.final_flat,
                 self.h_size,
                 kernel_init=Initialization.KaimingHeNormal,
-                kernel_gain=1.0,
+                kernel_gain=1.41,  # Use ReLU gain
             ),
             nn.LeakyReLU(),
         )
@@ -251,7 +251,7 @@ def __init__(
             n_channels[-1] * height * width,
             output_size,
             kernel_init=Initialization.KaimingHeNormal,
-            kernel_gain=1.0,
+            kernel_gain=1.41,  # Use ReLU gain
         )
         self.sequential = nn.Sequential(*layers)
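
Note: 1.41 is approximately sqrt(2), the standard He/Kaiming gain for ReLU-family activations, presumably re-applied explicitly here because the initializer change in layers.py below switches the Kaiming init to nonlinearity="linear". A quick check of the constant:

import torch

# sqrt(2) is the He/Kaiming gain for ReLU-family activations,
# which is what the explicit kernel_gain=1.41 approximates.
print(torch.nn.init.calculate_gain("relu"))        # 1.4142135623730951
print(torch.nn.init.calculate_gain("leaky_relu"))  # ~1.4141 for the default negative slope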

ml-agents/mlagents/trainers/torch/layers.py

Lines changed: 9 additions & 3 deletions
@@ -39,12 +39,18 @@ def linear_layer(
     :param output_size: The size of the output tensor
     :param kernel_init: The Initialization to use for the weights of the layer
     :param kernel_gain: The multiplier for the weights of the kernel. Note that in
-    TensorFlow, calling variance_scaling with scale 0.01 is equivalent to calling
-    KaimingHeNormal with kernel_gain of 0.1
+    TensorFlow, the gain is square-rooted. Therefore calling with scale 0.01 is equivalent to calling
+    KaimingHeNormal with kernel_gain of 0.1
     :param bias_init: The Initialization to use for the weights of the bias layer
     """
     layer = torch.nn.Linear(input_size, output_size)
-    _init_methods[kernel_init](layer.weight.data)
+    if (
+        kernel_init == Initialization.KaimingHeNormal
+        or kernel_init == Initialization.KaimingHeUniform
+    ):
+        _init_methods[kernel_init](layer.weight.data, nonlinearity="linear")
+    else:
+        _init_methods[kernel_init](layer.weight.data)
     layer.weight.data *= kernel_gain
     _init_methods[bias_init](layer.bias.data)
     return layer
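
Note: PyTorch's kaiming_normal_/kaiming_uniform_ default to nonlinearity="leaky_relu", which bakes a gain of sqrt(2) into the drawn weights; passing nonlinearity="linear" removes that factor, leaving the explicit kernel_gain multiplier as the only scaling. A small sketch (layer sizes are arbitrary):

import torch

layer = torch.nn.Linear(128, 64)  # fan_in = 128

torch.nn.init.kaiming_normal_(layer.weight.data, nonlinearity="linear")
print(layer.weight.data.std())  # roughly 1 / sqrt(128) = 0.088

torch.nn.init.kaiming_normal_(layer.weight.data)  # default nonlinearity="leaky_relu"
print(layer.weight.data.std())  # roughly sqrt(2 / 128) = 0.125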

ml-agents/mlagents/trainers/torch/networks.py

Lines changed: 5 additions & 0 deletions
@@ -292,6 +292,9 @@ def __init__(
             self.distribution = MultiCategoricalDistribution(
                 self.encoding_size, self.action_spec.discrete_branches
             )
+        # During training, clipping is done in TorchPolicy, but we need to clip before ONNX
+        # export as well.
+        self._clip_action_on_export = not tanh_squash

     @property
     def memory_size(self) -> int:
@@ -339,6 +342,8 @@ def forward(
         if self.action_spec.is_continuous():
             action_list = self.sample_action(dists)
             action_out = torch.stack(action_list, dim=-1)
+            if self._clip_action_on_export:
+                action_out = torch.clamp(action_out, -3, 3) / 3
         else:
             action_out = torch.cat([dist.all_log_prob() for dist in dists], dim=1)
         return (

0 commit comments