
Commit feb4c6d

shamanez authored and committed
Added OpenReward support.
1 parent: dab543a · commit: feb4c6d

File tree

3 files changed: +229, -1 lines changed


examples/agentic_demo/openreward_endless_terminals_IPA_qwen35_2b.yaml

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ skip_mock_system_prompt: true
 
 track_with: wandb
 tracker_kwargs:
-  api_key: "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"
+  api_key: ${oc.env:WANDB_API_KEY}
   project: roll-agentic
   name: ${exp_name}
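
The hunk above swaps a hardcoded W&B key (leaked in the old revision, so rotating it is advisable) for OmegaConf's oc.env resolver: ${oc.env:WANDB_API_KEY} reads the variable from the process environment when the value is resolved and raises an error if it is unset. A minimal launch sketch, assuming the IPA config is started through the same entry point the new REINFORCE config documents:

# Minimal sketch: export the key before launching so ${oc.env:WANDB_API_KEY}
# resolves. Entry point and flags are taken from this commit's YAML comments;
# the placeholder key value is yours to supply and is never committed.
export WANDB_API_KEY="..."
python examples/start_agentic_pipeline.py \
    --config_path agentic_demo \
    --config_name openreward_endless_terminals_IPA_qwen35_2b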

examples/agentic_demo/openreward_endless_terminals_reinforce_qwen35_2b.yaml

Lines changed: 203 additions & 0 deletions

@@ -0,0 +1,203 @@
# OpenReward + EndlessTerminals + STEP_REINFORCE (vanilla PG) config
# Simpler baseline without IPA chunk-level loss.
# The model IS the agent — no iflow, no sandbox, no anti_call_llm.
#
# Usage:
#   bash examples/agentic_demo/run_openreward_endless_terminals.sh reinforce
#   # or directly:
#   python examples/start_agentic_pipeline.py \
#     --config_path agentic_demo \
#     --config_name openreward_endless_terminals_reinforce_qwen35_2b

defaults:
  - ../config/deepspeed_zero@_here_
  - ../config/deepspeed_zero2@_here_
  - ../config/deepspeed_zero3@_here_
  - ../config/deepspeed_zero3_cpuoffload@_here_

hydra:
  run:
    dir: .
  output_subdir: null

exp_name: "openreward_endless_terminals_reinforce_qwen35_2b"
seed: 42

logging_dir: ./output/logs
output_dir: ./output
model_name: ${exp_name}-${now:%Y%m%d_%H%M%S}
rollout_dump_dir: /home/ubuntu/ALE-latest/ROLL-personal/output/rollout_dump
system_envs:
  USE_MODELSCOPE: '1'

checkpoint_config:
  type: file_system
  output_dir: /data

num_gpus_per_node: 8
rpc_timeout: 72000

max_steps: 10
save_steps: 50
logging_steps: 1
eval_steps: 0
resume_from_checkpoint: false

async_generation_ratio: 1
parse_tool_call_parameter_to_dict: true
skip_mock_system_prompt: true

track_with: wandb
tracker_kwargs:
  api_key: ${oc.env:WANDB_API_KEY}
  project: roll-agentic
  name: ${exp_name}

rollout_batch_size: 16
val_batch_size: 1
sequence_length: 32768

max_tokens_per_step: 4096

# --- Vanilla STEP_REINFORCE config ---
advantage_clip: 0.2
ppo_epochs: 1
adv_estimator: "step_reinforce"
batch_adjust_mode: "random_sample"
step_reward_gamma: 1.0

init_kl_coef: 0.0
whiten_advantages: true
entropy_loss_coef: 0
max_grad_norm: 1.0

# --- Model configs ---
pretrain: /home/ubuntu/ALE-latest/model-checkpoints/Qwen3.5-2B
reward_pretrain: /home/ubuntu/ALE-latest/model-checkpoints/Qwen3.5-2B
actor_train:
  model_args:
    flash_attn: sdpa
    attn_implementation: sdpa
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
    freeze_module_prefix: vision_model
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 4
    warmup_steps: 0
  data_args:
    template: qwen3_coder
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 2
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      context_parallel_size: 2
      sequence_parallel: true
      use_distributed_optimizer: true
      recompute_granularity: full
  device_mapping: list(range(0,4))
  infer_batch_size: 1
actor_infer:
  model_args:
    flash_attn: sdpa
    attn_implementation: sdpa
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${max_tokens_per_step}
    top_p: 1.0
    top_k: 50
    num_beams: 1
    temperature: 1.0
    num_return_sequences: 1
    stop_strings: ["</tool_call>"]
    include_stop_str_in_output: true
  data_args:
    template: qwen3_coder
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.6
      block_size: 16
      load_format: auto
      tensor_parallel_size: 1
      max_model_len: 32768
  device_mapping: list(range(0,8))

reference:
  model_args:
    attn_implementation: sdpa
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: ~
    freeze_module_prefix: vision_model
  data_args:
    template: qwen3_coder
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 2
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      context_parallel_size: 2
  device_mapping: list(range(0,4))
  infer_batch_size: 1

reward_normalization:
  grouping: traj_group_id
  method: identity

# --- Environment config (OpenReward) ---
max_actions_per_traj: 16
env_manager_cls: roll.pipeline.agentic.env_manager.agent_native_env_manager.AgentNativeStepEnvManager

train_env_manager:
  max_env_num_per_worker: 1
  num_env_groups: 1
  group_size: 1
  tags: [OpenRewardEndlessTerminalsTrain]
  num_groups_partition: [1]

val_env_manager:
  max_env_num_per_worker: 1
  num_env_groups: 1
  group_size: 1
  tags: [OpenRewardEndlessTerminalsVal]
  num_groups_partition: [1]

custom_envs:
  OpenRewardEndlessTerminalsTrain:
    env_type: "openreward_env"
    max_steps: ${max_actions_per_traj}
    max_tokens_per_step: ${max_tokens_per_step}
    env_manager_cls: ${env_manager_cls}
    agent_system_template: "unused — system prompt built dynamically from OpenReward tool specs"
    agent_template: "unused — observation is full message list from OpenRewardEnv"
    env_config:
      environment_name: "kanishk/EndlessTerminals"
      split: "train"
      mode: "train"
      max_steps: ${max_actions_per_traj}
      reward_reduction: "sum"
      nonterminal_reward: 0.0
      retry_max_attempts: 3
      retry_backoff_seconds: 5.0
  OpenRewardEndlessTerminalsVal:
    env_type: "openreward_env"
    max_steps: ${max_actions_per_traj}
    max_tokens_per_step: ${max_tokens_per_step}
    env_manager_cls: ${env_manager_cls}
    agent_system_template: "unused"
    agent_template: "unused"
    env_config:
      environment_name: "kanishk/EndlessTerminals"
      split: "train"
      mode: "val"
      max_steps: ${max_actions_per_traj}
      reward_reduction: "sum"
      nonterminal_reward: 0.0
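
Nothing in the commit installs or checks the dependencies this config relies on, so a misconfigured run may only fail minutes in. A hypothetical preflight sketch, not part of the commit; every variable and path below is taken from the files in this change:

#!/bin/bash
# Hypothetical preflight check assembled from values in this config.
set -euo pipefail

: "${OPENREWARD_API_KEY:?Set OPENREWARD_API_KEY}"   # required by openreward_env
: "${WANDB_API_KEY:?Set WANDB_API_KEY}"             # consumed via ${oc.env:...}

# The runner script's comments require: pip install openreward
python -c "import openreward" \
    || { echo "missing dependency: pip install openreward" >&2; exit 1; }

# pretrain and reward_pretrain both point at this local checkpoint.
test -d /home/ubuntu/ALE-latest/model-checkpoints/Qwen3.5-2B \
    || { echo "missing Qwen3.5-2B checkpoint" >&2; exit 1; }
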
examples/agentic_demo/run_openreward_endless_terminals.sh

Lines changed: 25 additions & 0 deletions

@@ -0,0 +1,25 @@
#!/bin/bash
# Run OpenReward EndlessTerminals REINFORCE training with Qwen3.5-2B.
#
# Prerequisites:
#   pip install openreward   # inside the docker container
#
# Usage (inside roll_openreward_runner container):
#   export OPENREWARD_API_KEY="..."
#   export WANDB_API_KEY="..."
#   cd /home/ubuntu/ALE-latest/ROLL-personal
#   bash examples/agentic_demo/run_openreward_endless_terminals.sh

set -euo pipefail

: "${OPENREWARD_API_KEY:?Set OPENREWARD_API_KEY}"
: "${WANDB_API_KEY:?Set WANDB_API_KEY}"

export NCCL_NET_PLUGIN=''
export NCCL_TUNER_PLUGIN=''
export NCCL_NET=Socket
export PYTHONPATH="${PWD}:${PYTHONPATH:-}"

python examples/start_agentic_pipeline.py \
  --config_path agentic_demo \
  --config_name openreward_endless_terminals_reinforce_qwen35_2b
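
One small mismatch worth noting: the usage comment in the new YAML calls this script with a "reinforce" argument, but the committed script ignores positional arguments and hardcodes the REINFORCE config name. A hypothetical argument-aware variant (both config names come from this commit; that the IPA config launches through the same entry point is an assumption):

# Hypothetical variant, not in the commit: choose the config via $1,
# defaulting to the REINFORCE baseline.
variant="${1:-reinforce}"
case "$variant" in
    reinforce) config="openreward_endless_terminals_reinforce_qwen35_2b" ;;
    IPA)       config="openreward_endless_terminals_IPA_qwen35_2b" ;;
    *)         echo "unknown variant: $variant" >&2; exit 1 ;;
esac
python examples/start_agentic_pipeline.py \
    --config_path agentic_demo \
    --config_name "$config"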
