# maniskill_ppo_openvla_quickstart.yaml

defaults:
  - env/train: PutCarrotOnPlateInScene
  - env/eval: PutCarrotOnPlateInScene
  - override hydra/job_logging: stdout

hydra:
  run:
    dir: .
  output_subdir: null
  searchpath:
    - file://${oc.env:EMBODIED_PATH}/config/
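
# The env/train and env/eval groups above are resolved through this
# searchpath, so EMBODIED_PATH must be exported before launch; OmegaConf's
# oc.env resolver reads the environment variable at config-load time
# (pointing it at the embodied examples directory is assumed here).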
cluster:
  num_nodes: 1
  component_placement:
    actor,env,rollout: 0-1
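
# Placement note: one node whose GPUs 0-1 are shared by the actor, env,
# and rollout workers; "0-1" is read here as a GPU index range (an
# assumption consistent with the colocated rollout mode below).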
runner:
  task_type: embodied
  logger:
    log_path: "../results"
    project_name: rlinf
    experiment_name: "test_openvla"
    logger_backends: ["tensorboard"] # other options: wandb, swanlab
  max_epochs: 1000
  max_steps: -1
  only_eval: False
  val_check_interval: -1
  save_interval: 40
  seq_length: 4096
  max_prompt_length: 30
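
# Runner note: max_steps and val_check_interval use -1 as a sentinel,
# assumed here to disable the step cap and periodic validation; a
# checkpoint is written every save_interval (40) training steps.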
algorithm:
  auto_reset: True
  ignore_terminations: True
  use_fixed_reset_state_ids: False
  require_values: True
  normalize_advantages: True
  kl_penalty: kl # how to estimate KL divergence: kl or kl_penalty
  group_size: 1
  n_chunk_steps: 80
  n_eval_chunk_steps: 80
  # micro-batch size for training rollouts
  rollout_micro_batch_size: 64
  num_group_envs: 32
  rollout_epoch: 1
  reward_type: action_level
  logprob_type: action_level
  entropy_type: action_level
  # micro-batch size for log-prob inference; can be set lower than
  # rollout_micro_batch_size to reduce memory usage
  logprob_forward_micro_batch_size: 16 # ${.rollout_micro_batch_size}
  adv_type: embodied_gae
  loss_type: embodied_ppo
  loss_agg_func: "token-mean"
  kl_beta: 0.0
  ratio_clip_eps: 0.2
  entropy_bonus: 0
  clip_ratio_high: 0.2
  clip_ratio_low: 0.2
  clip_ratio_c: 3.0
  value_clip: 0.2
  huber_delta: 10.0
  gamma: 0.99
  gae_lambda: 0.95
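
  # PPO note: adv_type embodied_gae computes advantages with GAE
  # (gamma 0.99, lambda 0.95). clip_ratio_high/low give the usual
  # symmetric 0.2 PPO ratio clip, and clip_ratio_c: 3.0 reads like the
  # lower-bound constant of dual-clip PPO (an inference from the names,
  # not stated in this file); value_clip and huber_delta shape the
  # value-head loss.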
  # sampling parameters for rollout
  sampling_params:
    use_greedy: False
    temperature_train: 1.0
    temperature_eval: 0.6
    top_k: 50
    top_p: 1.0
    repetition_penalty: 1.0

  # length arguments for autoregressive sampling;
  # max_new_token is the maximum number of tokens to generate
  length_params:
    max_new_token: 7
    max_length: 1024
    min_length: 1
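
  # Sampling note: training samples at temperature 1.0 while eval decodes
  # cooler at 0.6; max_new_token: 7 plausibly means one discretized token
  # per action dimension (action_dim is 7 under actor.model below), though
  # that mapping is an inference, not stated here.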
env:
  group_name: "EnvGroup"
  channel:
    name: "env_buffer_list"
    queue_name: "obs_buffer"
    queue_size: 0
  enable_offload: True
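
# Channel plumbing: env, rollout, and actor all attach to the same channel
# (env_buffer_list) through distinct queues -- obs_buffer for observations,
# action_buffer for sampled actions, replay_buffer for training batches.
# queue_size: 0 is read here as "unbounded" (an assumption).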
rollout:
  group_name: "RolloutGroup"
  channel:
    name: ${env.channel.name}
    queue_name: "action_buffer"
    queue_size: 0
  mode: "colocate"
  backend: "huggingface"
  model_dir: "/path/to/model/openvla-7b-rlvla-warmup/"
  gpu_memory_utilization: 0.5
  enforce_eager: True
  enable_offload: True
  pipeline_stage_num: 2
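
# Rollout note: "colocate" shares GPUs with training, which is presumably
# why gpu_memory_utilization is capped at 0.5 and offloading is enabled;
# generation uses the HuggingFace backend with eager execution enforced,
# and pipeline_stage_num: 2 suggests a two-stage generation pipeline
# (assumed from the name).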
actor:
  group_name: "ActorGroup"
  channel:
    name: ${env.channel.name}
    queue_name: "replay_buffer"
    queue_size: 0
  training_backend: "fsdp"
  checkpoint_load_path: "/path/to/model/openvla-7b-rlvla-warmup/"
  checkpoint_save_path: "../results"
  micro_batch_size: 20
  global_batch_size: 160
  seed: 1234
  enable_offload: True
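
  # Batch math: with global_batch_size 160 and micro_batch_size 20, one
  # optimizer step accumulates gradients over 160 / 20 = 8 micro-batches
  # (assuming the conventional global/micro relationship; data parallelism
  # would divide this further across ranks).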
  tokenizer:
    tokenizer_type: "HuggingFaceTokenizer"
    tokenizer_model: "/path/to/model/openvla-7b-rlvla-warmup/"
    extra_vocab_size: 421
    use_fast: False
    trust_remote_code: True
    padding_side: "right"

  model:
    model_name: "openvla"
    action_dim: 7
    num_action_chunks: 1
    use_proprio: False
    unnorm_key: bridge_orig
    micro_batch_size: 1
    val_micro_batch_size: 8
    center_crop: True
    do_sample: False
    precision: "bf16"
    add_bias_linear: False
    add_qkv_bias: True
    vocab_size: 32000
    hidden_size: 4096
    policy_setup: "widowx_bridge"
    vh_mode: "a0"
    image_size: [224, 224]
    is_lora: True
    lora_rank: 32
    ckpt_path: null
    attn_implementation: "flash_attention_2"
    low_cpu_mem_usage: True
    trust_remote_code: True
    gradient_checkpointing: False
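
  # Model note: the policy is a LoRA fine-tune (rank 32) of OpenVLA in bf16
  # with FlashAttention-2; unnorm_key: bridge_orig selects the action
  # de-normalization statistics for the BridgeData setup, which matches
  # policy_setup: widowx_bridge (the pairing is an inference from the names).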
  optim:
    lr: 1.0e-4
    value_lr: 3.0e-3
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-05
    clip_grad: 1.0
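
  # Optimizer note: the value head gets a larger learning rate (3.0e-3)
  # than the LoRA policy parameters (1.0e-4), a common choice when the
  # value head is freshly initialized; gradients are clipped to norm 1.0.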
  fsdp:
    forward_prefetch: False
    limit_all_gathers: False
    backward_prefetch: False
    use_orig_params: False
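
  # FSDP note: these correspond to torch.distributed.fsdp constructor flags
  # of the same names (assuming the backend passes them straight through);
  # prefetch overlap and all-gather rate limiting are disabled, and
  # use_orig_params: False keeps FSDP's flattened-parameter view.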
reward:
  use_reward_model: False

critic:
  use_critic_model: False
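
# With use_reward_model and use_critic_model both off, rewards come straight
# from the ManiSkill environment and values from the actor's own value head
# (vh_mode: "a0" above suggests an actor-attached head, consistent with
# require_values: True -- an inference, not confirmed by this file).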