Skip to content

Commit 5e02e21

Browse files
authored
fix: fix pi grpo yaml inconsistency (RLinf#450)
Signed-off-by: chenkang <455130517@qq.com>
1 parent c040edc commit 5e02e21

8 files changed

+23
-28
lines changed

examples/embodiment/config/libero_10_grpo_openpi.yaml

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,6 @@ runner:
3838
resume_dir: null
3939

4040
algorithm:
41-
auto_reset: False
42-
ignore_terminations: False
43-
use_fixed_reset_state_ids: True
4441
normalize_advantages: True
4542
kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty
4643
group_size: 8
@@ -52,7 +49,7 @@ algorithm:
5249
logprob_type: chunk_level
5350
entropy_type: token_level
5451

55-
update_epoch: 4
52+
update_epoch: 2
5653
adv_type: grpo
5754
loss_type: actor
5855
loss_agg_func: "token-mean"
@@ -67,7 +64,7 @@ algorithm:
6764
gamma: 0.99
6865
gae_lambda: 0.95
6966

70-
filter_rewards: False
67+
filter_rewards: True
7168
rewards_lower_bound: 0.1
7269
rewards_upper_bound: 0.9
7370
# params for generation
@@ -97,18 +94,17 @@ env:
9794

9895
train:
9996
total_num_envs: 64
100-
max_episode_steps: 480 # max episode steps for truncation
101-
max_steps_per_rollout_epoch: 480
10297
reward_coef: ${algorithm.reward_coef}
10398
group_size: ${algorithm.group_size}
99+
max_episode_steps: 480 # max episode steps for truncation
100+
max_steps_per_rollout_epoch: 480
104101
eval:
105102
total_num_envs: 500
106103
auto_reset: True
107104
ignore_terminations: True
108105
max_episode_steps: 480
109106
max_steps_per_rollout_epoch: 480
110107
group_size: 1
111-
use_fixed_reset_state_ids: True
112108
is_eval: True
113109
video_cfg:
114110
save_video: True

examples/embodiment/config/libero_10_grpo_openpi_pi05.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ algorithm:
6363
gamma: 0.99
6464
gae_lambda: 0.95
6565

66-
filter_rewards: False
66+
filter_rewards: True
6767
rewards_lower_bound: 0.1
6868
rewards_upper_bound: 0.9
6969
# params for generation
@@ -142,6 +142,7 @@ actor:
142142
# Override the default values in model/pi0_5
143143
model:
144144
model_path: "/path/to/model/RLinf-Pi05-SFT"
145+
num_steps: 4
145146

146147
optim:
147148
lr: 5.0e-6

examples/embodiment/config/libero_goal_grpo_openpi.yaml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ algorithm:
4949
logprob_type: chunk_level
5050
entropy_type: token_level
5151

52-
update_epoch: 4
52+
update_epoch: 2
5353
adv_type: grpo
5454
loss_type: actor
5555
loss_agg_func: "token-mean"
@@ -64,7 +64,7 @@ algorithm:
6464
gamma: 0.99
6565
gae_lambda: 0.95
6666

67-
filter_rewards: False
67+
filter_rewards: True
6868
rewards_lower_bound: 0.1
6969
rewards_upper_bound: 0.9
7070
# params for generation
@@ -105,7 +105,6 @@ env:
105105
max_episode_steps: 320
106106
max_steps_per_rollout_epoch: 320
107107
group_size: 1
108-
use_fixed_reset_state_ids: True
109108
is_eval: True
110109
video_cfg:
111110
save_video: True
@@ -126,7 +125,7 @@ rollout:
126125
model:
127126
model_path: "/path/to/model/RLinf-Pi0-SFT-Spatial-Object-Goal"
128127
precision: ${actor.model.precision}
129-
128+
130129
actor:
131130
group_name: "ActorGroup"
132131
channel:

examples/embodiment/config/libero_goal_grpo_openpi_pi05.yaml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ algorithm:
4848
reward_type: chunk_level
4949
logprob_type: chunk_level
5050
entropy_type: token_level
51-
update_epoch: 4
51+
update_epoch: 1
5252

5353
adv_type: grpo
5454
loss_type: actor
@@ -64,7 +64,7 @@ algorithm:
6464
gamma: 0.99
6565
gae_lambda: 0.95
6666

67-
filter_rewards: False
67+
filter_rewards: True
6868
rewards_lower_bound: 0.1
6969
rewards_upper_bound: 0.9
7070
# params for generation
@@ -142,8 +142,11 @@ actor:
142142
# Override the default values in model/pi0_5
143143
model:
144144
model_path: "/path/to/model/RLinf-Pi05-SFT"
145-
model_type: "openpi"
146145
num_action_chunks: 5
146+
model_type: "openpi"
147+
# openpi specific parameters
148+
openpi:
149+
noise_level: 0.3
147150

148151
optim:
149152
lr: 5.0e-6

examples/embodiment/config/libero_object_grpo_openpi.yaml

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ algorithm:
4949
logprob_type: chunk_level
5050
entropy_type: token_level
5151

52-
update_epoch: 4
52+
update_epoch: 2
5353
adv_type: grpo
5454
loss_type: actor
5555
loss_agg_func: "token-mean"
@@ -64,7 +64,7 @@ algorithm:
6464
gamma: 0.99
6565
gae_lambda: 0.95
6666

67-
filter_rewards: False
67+
filter_rewards: True
6868
rewards_lower_bound: 0.1
6969
rewards_upper_bound: 0.9
7070
# params for generation
@@ -98,16 +98,13 @@ env:
9898
group_size: ${algorithm.group_size}
9999
max_episode_steps: 240
100100
max_steps_per_rollout_epoch: 240
101-
use_fixed_reset_state_ids: True
102-
use_ordered_reset_state_ids: False
103101
eval:
104102
total_num_envs: 500
105103
auto_reset: True
106104
ignore_terminations: True
107105
max_episode_steps: 240
108106
max_steps_per_rollout_epoch: 240
109107
group_size: 1
110-
use_fixed_reset_state_ids: True
111108
is_eval: True
112109
video_cfg:
113110
save_video: True

examples/embodiment/config/libero_object_grpo_openpi_pi05.yaml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ algorithm:
6464
gamma: 0.99
6565
gae_lambda: 0.95
6666

67-
filter_rewards: False
67+
filter_rewards: True
6868
rewards_lower_bound: 0.1
6969
rewards_upper_bound: 0.9
7070
# params for generation
@@ -98,8 +98,6 @@ env:
9898
group_size: ${algorithm.group_size}
9999
max_episode_steps: 240
100100
max_steps_per_rollout_epoch: 240
101-
use_fixed_reset_state_ids: True
102-
use_ordered_reset_state_ids: False
103101
eval:
104102
total_num_envs: 500
105103
auto_reset: True
@@ -147,7 +145,8 @@ actor:
147145
model_path: "/path/to/model/RLinf-Pi05-SFT"
148146
model_type: "openpi"
149147
num_action_chunks: 5 # interface for the env
150-
num_steps: 3
148+
openpi:
149+
noise_level: 0.3
151150

152151
optim:
153152
lr: 5.0e-6

examples/embodiment/config/libero_spatial_grpo_openpi.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ algorithm:
5050
logprob_type: chunk_level
5151
entropy_type: token_level
5252

53-
update_epoch: 4
53+
update_epoch: 2
5454
adv_type: grpo
5555
loss_type: actor
5656
loss_agg_func: "token-mean"
@@ -65,7 +65,7 @@ algorithm:
6565
gamma: 0.99
6666
gae_lambda: 0.95
6767

68-
filter_rewards: False
68+
filter_rewards: True
6969
rewards_lower_bound: 0.1
7070
rewards_upper_bound: 0.9
7171
# params for generation

examples/embodiment/config/libero_spatial_grpo_openpi_pi05.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ algorithm:
6464
gamma: 0.99
6565
gae_lambda: 0.95
6666

67-
filter_rewards: False
67+
filter_rewards: True
6868
rewards_lower_bound: 0.1
6969
rewards_upper_bound: 0.9
7070
# params for generation

0 commit comments

Comments (0)