Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
263da1b
Init R3L
shiweijiezero Oct 24, 2025
d648b6d
Merge branch 'modelscope:main' into main
shiweijiezero Oct 24, 2025
177879f
Init R3L
shiweijiezero Oct 24, 2025
55dc215
重构R3L以及各类Baseline
Oct 26, 2025
08c8138
重构R3L以及各类Baseline
Oct 27, 2025
eb7936e
Merge branch 'modelscope:main' into main
shiweijiezero Oct 27, 2025
3466de2
调整配置
Oct 29, 2025
62beb35
Merge branch 'modelscope:main' into featureA
shiweijiezero Oct 30, 2025
1619b46
Merge branch 'modelscope:main' into featureA
shiweijiezero Nov 4, 2025
188e962
调整配置
Nov 4, 2025
ef2f22a
更新Alfworld范式
shiweijiezero Nov 12, 2025
9f0b519
Merge remote-tracking branch 'origin/featureA' into featureA
shiweijiezero Nov 12, 2025
1d20754
DAPO config
shiweijiezero Nov 12, 2025
5fe327b
Add Latex Code
shiweijiezero Nov 14, 2025
96cfa53
添加参考论文
shiweijiezero Nov 14, 2025
0f63591
Add R3L environments from featureA branch and fix Countdown syntax
claude Nov 14, 2025
b14c7b9
Remove R3L_Fix_Summary.md - will update CLAUDE.md instead
claude Nov 14, 2025
0409653
Unify all R3L environments to Alfworld reflection schema
claude Nov 14, 2025
02d31b4
Unify self_correction prompts and reflect_report_to_guidance_prompt f…
claude Nov 14, 2025
441a111
Fix critical R3L workflow issues and add missing functionality
claude Nov 14, 2025
21400da
Add format reminders to webshop and scienceworld user prompts
claude Nov 14, 2025
09bd678
Fix critical bugs and standardize R3L implementations across all envi…
claude Nov 14, 2025
1d260ed
Restore default webshop path while keeping environment variable option
claude Nov 14, 2025
01aff02
Fix WebShop hardcoded paths and add DAPO format reminders
claude Nov 14, 2025
b04e091
Align RAFT error handling and fix WebShop workflow inconsistencies
claude Nov 14, 2025
254a346
Add missing DAPO workflows and revert WebShop RAFT registration name
claude Nov 14, 2025
dee5c05
添加参考论文
shiweijiezero Nov 14, 2025
893db72
merge
shiweijiezero Nov 14, 2025
f9ab6de
Fix RAFT function registration naming consistency
claude Nov 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
9 changes: 9 additions & 0 deletions .claude/settings.local.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"permissions": {
"allow": [
"WebFetch(domain:arxiv.org)"
],
"deny": [],
"ask": []
}
}
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ repos:
rev: 23.7.0
hooks:
- id: black
language_version: python3.10
# language_version: python3.10
args: [--line-length=100]

- repo: https://github.com/pycqa/isort
Expand Down
4 changes: 2 additions & 2 deletions benchmark/config/gsm8k-template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@ explorer:
engine_num: 2
tensor_parallel_size: 1
enforce_eager: false
enable_prefix_caching: false
enable_chunked_prefill: false
enable_prefix_caching: true
enable_chunked_prefill: true
gpu_memory_utilization: 0.9
dtype: bfloat16
seed: 42
Expand Down
72 changes: 72 additions & 0 deletions examples/R3L/alfworld/RAFT_1.5B.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# RAFT baseline run on ALFWorld; the default model is Qwen2.5-1.5B-Instruct.
# NOTE(review): every key sits at column 0 in this capture, so the section
# nesting is not visible here - restore the indentation before loading this.
project: "ALFWORLD"
name: "ALFWORLD_RFT_Qwen_1.5B_RAFT_Baseline"
# Read from TRINITY_CHECKPOINT_ROOT_DIR; falls back to ./checkpoints when unset.
checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,./checkpoints}
algorithm:
algorithm_type: raft
repeat_times: 1
optimizer:
lr: 1e-6
model:
# Read from TRINITY_MODEL_PATH; falls back to the Qwen model id when unset.
model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen2.5-1.5B-Instruct}
max_response_tokens: 512
max_model_len: 20480
cluster:
node_num: 1
gpu_per_node: 4
buffer:
total_epochs: 20
batch_size: 96
explorer_input:
# Training tasks: 'train' split of the local ALFWorld data directory.
taskset:
name: alfworld
storage_type: file
path: 'examples/R3L/alfworld/alfworld_data'
split: 'train'
format:
prompt_key: 'task_id'
rollout_args:
temperature: 1.0
logprobs: 0
# Evaluation tasks: 'test' split of the same directory, sampled at a lower
# temperature (0.4) than training (1.0).
eval_tasksets:
- name: alfworld-eval
storage_type: file
path: 'examples/R3L/alfworld/alfworld_data'
split: 'test'
format:
prompt_key: 'task_id'
rollout_args:
temperature: 0.4
default_workflow_type: 'RAFT_baseline_alfworld_workflow'
trainer_input:
# Queue-type experience buffer with replay enabled and a SQLite path.
experience_buffer:
name: alfworld_raft_baseline_1.5B_buffer
storage_type: queue
replay_buffer:
enable: true
priority_fn: decay_limit_randomization
path: 'sqlite:///alfworld_raft_baseline_1.5B.db'
explorer:
runner_per_model: 32
# Same value as trainer save_interval below (20).
eval_interval: 20
rollout_model:
engine_num: 2
tensor_parallel_size: 1
enable_prefix_caching: false
enforce_eager: false
dtype: bfloat16
seed: 0
gpu_memory_utilization: 0.7
enable_chunked_prefill: true
synchronizer:
sync_style: dynamic_by_explorer
sync_method: 'nccl'
sync_interval: 1
sync_timeout: 12000
trainer:
save_interval: 20
grad_clip: 1.0
use_dynamic_bsz: true
max_token_len_per_gpu: 10240
ulysses_sequence_parallel_size: 1
monitor:
monitor_type: wandb
72 changes: 72 additions & 0 deletions examples/R3L/alfworld/RAFT_7B.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# RAFT baseline run on ALFWorld; the default model is Qwen2.5-7B-Instruct.
# NOTE(review): every key sits at column 0 in this capture, so the section
# nesting is not visible here - restore the indentation before loading this.
project: "ALFWORLD"
name: "ALFWORLD_RFT_Qwen_7B_RAFT_Baseline"
# Read from TRINITY_CHECKPOINT_ROOT_DIR; falls back to ./checkpoints when unset.
checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,./checkpoints}
algorithm:
algorithm_type: raft
# NOTE(review): 8 here vs 1 in the 1.5B RAFT config of this change - confirm.
repeat_times: 8
optimizer:
lr: 1e-6
model:
# Read from TRINITY_MODEL_PATH; falls back to the Qwen model id when unset.
model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen2.5-7B-Instruct}
max_response_tokens: 512
max_model_len: 20480
cluster:
node_num: 1
gpu_per_node: 4
buffer:
total_epochs: 20
# NOTE(review): 1 here vs 96 in the 1.5B RAFT config of this change - confirm
# this is intentional and not a leftover debug value.
batch_size: 1
explorer_input:
# Training tasks: 'train' split of the local ALFWorld data directory.
taskset:
name: alfworld
storage_type: file
path: 'examples/R3L/alfworld/alfworld_data'
split: 'train'
format:
prompt_key: 'task_id'
rollout_args:
temperature: 1.0
logprobs: 0
# Evaluation tasks: 'test' split of the same directory, sampled at a lower
# temperature (0.4) than training (1.0).
eval_tasksets:
- name: alfworld-eval
storage_type: file
path: 'examples/R3L/alfworld/alfworld_data'
split: 'test'
format:
prompt_key: 'task_id'
rollout_args:
temperature: 0.4
default_workflow_type: 'RAFT_baseline_alfworld_workflow'
trainer_input:
# Queue-type experience buffer with replay enabled and a SQLite path.
experience_buffer:
name: alfworld_raft_baseline_7B_buffer
storage_type: queue
replay_buffer:
enable: true
priority_fn: decay_limit_randomization
path: 'sqlite:///alfworld_raft_baseline_7B.db'
explorer:
runner_per_model: 32
# Same value as trainer save_interval below (20).
eval_interval: 20
rollout_model:
engine_num: 2
tensor_parallel_size: 1
enable_prefix_caching: false
enforce_eager: false
dtype: bfloat16
seed: 0
gpu_memory_utilization: 0.7
enable_chunked_prefill: true
synchronizer:
sync_style: dynamic_by_explorer
sync_method: 'nccl'
sync_interval: 1
sync_timeout: 12000
trainer:
save_interval: 20
grad_clip: 1.0
use_dynamic_bsz: true
max_token_len_per_gpu: 10240
ulysses_sequence_parallel_size: 1
monitor:
monitor_type: wandb
88 changes: 88 additions & 0 deletions examples/R3L/alfworld/dapo_1.5B.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# DAPO-named run on ALFWorld; the default model is Qwen2.5-1.5B-Instruct.
# NOTE(review): every key sits at column 0 in this capture, so the section
# nesting is not visible here - restore the indentation before loading this.
project: "ALFWORLD"
name: "ALFWORLD_RFT_Qwen_1.5B_DAPO"
# Read from TRINITY_CHECKPOINT_ROOT_DIR; falls back to ./checkpoints when unset.
checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,./checkpoints}
algorithm:
# NOTE(review): the run is named DAPO but algorithm_type is grpo; presumably
# the DAPO behaviour comes from the clip range, workflow_args and
# dapo_alfworld_workflow below - confirm.
algorithm_type: grpo
kl_loss_fn: k3
kl_loss_fn_args:
kl_coef: 0.01
repeat_times: 8
optimizer:
lr: 1e-6
lr_warmup_steps: 20
# Asymmetric clip range: the upper bound (0.28) is larger than the lower (0.2).
policy_loss_fn_args:
clip_range_low: 0.2
clip_range_high: 0.28
model:
# Read from TRINITY_MODEL_PATH; falls back to the Qwen model id when unset.
model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen2.5-1.5B-Instruct}
max_response_tokens: 512
max_model_len: 20480
cluster:
node_num: 1
gpu_per_node: 4
buffer:
total_epochs: 20
batch_size: 48 # a value of 96 runs out of memory (OOM)
explorer_input:
# Training tasks: 'train' split of the local ALFWorld data directory.
taskset:
name: alfworld
storage_type: file
path: 'examples/R3L/alfworld/alfworld_data'
split: 'train'
format:
prompt_key: 'task_id'
rollout_args:
temperature: 1.0
logprobs: 0
# Overlong-response penalty settings passed to the workflow;
# max_response_length matches model max_response_tokens above (512).
workflow_args:
enable_overlong_penalty: true
penalty_factor: 1.0
max_response_length: 512
cache_length: 400
# Evaluation tasks: 'test' split of the same directory, sampled at a lower
# temperature (0.4) than training (1.0).
eval_tasksets:
- name: alfworld-eval
storage_type: file
path: 'examples/R3L/alfworld/alfworld_data'
split: 'test'
format:
prompt_key: 'task_id'
rollout_args:
temperature: 0.4
default_workflow_type: 'dapo_alfworld_workflow'
trainer_input:
# Queue-type experience buffer with replay enabled and a SQLite path.
experience_buffer:
name: alfworld_dapo_1.5B_buffer
storage_type: queue
replay_buffer:
enable: true
priority_fn: decay_limit_randomization
path: 'sqlite:///alfworld_dapo_1.5B.db'
explorer:
runner_per_model: 32
# Same value as trainer save_interval below (20).
eval_interval: 20
rollout_model:
engine_num: 2
tensor_parallel_size: 1
enable_prefix_caching: false
enforce_eager: false
dtype: bfloat16
seed: 42
gpu_memory_utilization: 0.7
enable_chunked_prefill: true
# Experience pipeline with a single operator, OPMD_filter.
data_processor:
experience_pipeline:
operators:
- name: "OPMD_filter"
synchronizer:
sync_style: dynamic_by_explorer
sync_method: 'nccl'
sync_interval: 1
sync_timeout: 12000
trainer:
save_interval: 20
grad_clip: 1.0
use_dynamic_bsz: true
max_token_len_per_gpu: 10240
ulysses_sequence_parallel_size: 1
monitor:
monitor_type: wandb
88 changes: 88 additions & 0 deletions examples/R3L/alfworld/dapo_7B.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# DAPO-named run on ALFWorld; the default model is Qwen2.5-7B-Instruct.
# NOTE(review): every key sits at column 0 in this capture, so the section
# nesting is not visible here - restore the indentation before loading this.
project: "ALFWORLD"
name: "ALFWORLD_RFT_Qwen_7B_DAPO"
# Read from TRINITY_CHECKPOINT_ROOT_DIR; falls back to ./checkpoints when unset.
checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,./checkpoints}
algorithm:
# NOTE(review): the run is named DAPO but algorithm_type is grpo; presumably
# the DAPO behaviour comes from the clip range, workflow_args and
# dapo_alfworld_workflow below - confirm.
algorithm_type: grpo
kl_loss_fn: k3
kl_loss_fn_args:
kl_coef: 0.01
repeat_times: 8
optimizer:
lr: 1e-6
lr_warmup_steps: 20
# Asymmetric clip range: the upper bound (0.28) is larger than the lower (0.2).
policy_loss_fn_args:
clip_range_low: 0.2
clip_range_high: 0.28
model:
# Read from TRINITY_MODEL_PATH; falls back to the Qwen model id when unset.
model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen2.5-7B-Instruct}
max_response_tokens: 512
max_model_len: 20480
cluster:
node_num: 1
gpu_per_node: 4
buffer:
total_epochs: 20
# NOTE(review): the 1.5B DAPO config in this change uses 48 and notes that 96
# runs out of memory on the same cluster shape (1 node, 4 GPUs); 96 with the
# larger 7B model looks risky - confirm.
batch_size: 96
explorer_input:
# Training tasks: 'train' split of the local ALFWorld data directory.
taskset:
name: alfworld
storage_type: file
path: 'examples/R3L/alfworld/alfworld_data'
split: 'train'
format:
prompt_key: 'task_id'
rollout_args:
temperature: 1.0
logprobs: 0
# Overlong-response penalty settings passed to the workflow;
# max_response_length matches model max_response_tokens above (512).
workflow_args:
enable_overlong_penalty: true
penalty_factor: 1.0
max_response_length: 512
cache_length: 400
# Evaluation tasks: 'test' split of the same directory, sampled at a lower
# temperature (0.4) than training (1.0).
eval_tasksets:
- name: alfworld-eval
storage_type: file
path: 'examples/R3L/alfworld/alfworld_data'
split: 'test'
format:
prompt_key: 'task_id'
rollout_args:
temperature: 0.4
default_workflow_type: 'dapo_alfworld_workflow'
trainer_input:
# Queue-type experience buffer with replay enabled and a SQLite path.
experience_buffer:
name: alfworld_dapo_7B_buffer
storage_type: queue
replay_buffer:
enable: true
priority_fn: decay_limit_randomization
path: 'sqlite:///alfworld_dapo_7B.db'
explorer:
runner_per_model: 32
# Same value as trainer save_interval below (20).
eval_interval: 20
rollout_model:
engine_num: 2
tensor_parallel_size: 1
enable_prefix_caching: false
enforce_eager: false
dtype: bfloat16
seed: 42
gpu_memory_utilization: 0.7
enable_chunked_prefill: true
# Experience pipeline with a single operator, OPMD_filter.
data_processor:
experience_pipeline:
operators:
- name: "OPMD_filter"
synchronizer:
sync_style: dynamic_by_explorer
sync_method: 'nccl'
sync_interval: 1
sync_timeout: 12000
trainer:
save_interval: 20
grad_clip: 1.0
use_dynamic_bsz: true
max_token_len_per_gpu: 10240
ulysses_sequence_parallel_size: 1
monitor:
monitor_type: wandb
Loading
Loading