Commit dab543a

shamanez authored and committed

added OpenRewards

1 parent 345edea · commit dab543a

File tree

6 files changed (+801, -2 lines)

Lines changed: 223 additions & 0 deletions
@@ -0,0 +1,223 @@

```yaml
# OpenReward + EndlessTerminals + IPA chunk-level loss config
# Based on agent_val_rock_swe_IPA_qwen35_2b.yaml, but replaces ROCK with OpenReward.
# The model IS the agent — no iflow, no sandbox, no anti_call_llm.
#
# Usage:
#   python examples/start_agentic_pipeline.py \
#     --config_path agentic_demo \
#     --config_name openreward_endless_terminals_IPA_qwen35_2b
#
# For REINFORCE instead of IPA, change:
#   pg_variant: "vanilla"
#   adv_estimator: "reinforce"

defaults:
  - ../config/deepspeed_zero@_here_
  - ../config/deepspeed_zero2@_here_
  - ../config/deepspeed_zero3@_here_
  - ../config/deepspeed_zero3_cpuoffload@_here_

hydra:
  run:
    dir: .
  output_subdir: null

exp_name: "openreward_endless_terminals_IPA_qwen35_2b"
seed: 42

logging_dir: ./output/logs
output_dir: ./output
model_name: ${exp_name}-${now:%Y%m%d_%H%M%S}
rollout_dump_dir: /home/ubuntu/ALE-latest/ROLL-personal/output/rollout_dump
system_envs:
  USE_MODELSCOPE: '1'

checkpoint_config:
  type: file_system
  output_dir: /data

num_gpus_per_node: 8
rpc_timeout: 72000

max_steps: 10
save_steps: 50
logging_steps: 1
eval_steps: 0
resume_from_checkpoint: false

async_generation_ratio: 1
parse_tool_call_parameter_to_dict: true
skip_mock_system_prompt: true

track_with: wandb
tracker_kwargs:
  api_key: "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"
  project: roll-agentic
  name: ${exp_name}

rollout_batch_size: 16
val_batch_size: 1
sequence_length: 32768

max_tokens_per_step: 4096

# --- IPA chunk-level loss config (identical to ROCK config) ---
ppo_epochs: 1
adv_estimator: "step_reinforce"
batch_adjust_mode: "random_sample"
step_reward_gamma: 0.95   # chunk-level temporal credit: G_k = 0.95^{K-k} * R_final
ipa_failure_reward: ~     # positive rewards only — no penalty for failures
ratio_type: "segment"     # chunk-level geometric mean IS ratio

init_kl_coef: 0.0
whiten_advantages: false  # IPA uses absolute returns, no relative ranking
entropy_loss_coef: 0
max_grad_norm: 1.0
```
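The comment on `step_reward_gamma` encodes the chunk-level credit assignment: with K chunks (agent turns) in a trajectory, chunk k receives the discounted final reward G_k = 0.95^{K-k} * R_final. A minimal sketch of that computation, purely illustrative (the function name and signature are not part of ROLL's API):

```python
from typing import List

def chunk_returns(final_reward: float, num_chunks: int, gamma: float = 0.95) -> List[float]:
    """Illustrative sketch of G_k = gamma^(K-k) * R_final for chunks k = 1..K.

    The last chunk receives the undiscounted final reward; each earlier chunk
    is discounted once per remaining chunk, so later turns get more credit.
    """
    K = num_chunks
    return [gamma ** (K - k) * final_reward for k in range(1, K + 1)]

# Example: a 4-turn trajectory that ends with reward 1.0
print(chunk_returns(1.0, 4))  # [0.857375, 0.9025, 0.95, 1.0]
```

With `ipa_failure_reward: ~` (null), failed trajectories simply contribute no reward rather than a penalty. The config file continues: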
```yaml
# Non-trivial IS ratio: use vLLM rollout logprobs as old_log_probs (not recomputed Megatron)
force_disable_old_logprobs_recompute: true

# Train-infer mismatch mask (paper Section 8): chunk-level segment filter
train_infer_correction:
  is_weight:
    enabled: false        # IPA handles IS internally
  filters:
    - enabled: true
      agg_type: segment   # chunk-level geometric mean mismatch
      ratio_enabled: true
      ratio_low: 0.0
      ratio_high: 5.0     # threshold H; chunks above are masked out
```
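The `agg_type: segment` filter masks out chunks whose train-inference probability ratio drifts outside `[ratio_low, ratio_high]`. A sketch of the idea, assuming per-token log-probs from the training engine and from the vLLM rollout are available (names and signatures here are illustrative, not ROLL's internals):

```python
import math
from typing import List

def segment_ratio(logp_train: List[float], logp_rollout: List[float]) -> float:
    """Chunk-level ratio as the geometric mean of per-token probability ratios.

    exp(mean(logp_train - logp_rollout)) == (prod_i p_train_i / p_rollout_i)^(1/n)
    """
    diffs = [t - r for t, r in zip(logp_train, logp_rollout)]
    return math.exp(sum(diffs) / len(diffs))

def keep_chunk(logp_train: List[float], logp_rollout: List[float],
               ratio_low: float = 0.0, ratio_high: float = 5.0) -> bool:
    # Chunks whose geometric-mean ratio exceeds the threshold H (ratio_high)
    # are masked out of the loss; here, keep_chunk returns False for them.
    r = segment_ratio(logp_train, logp_rollout)
    return ratio_low <= r <= ratio_high
```

This filter is only meaningful because `force_disable_old_logprobs_recompute: true` keeps the vLLM rollout log-probs as `old_log_probs`, so the ratio actually measures the train-infer mismatch instead of being identically 1. The model configs follow: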
```yaml
# --- Model configs (identical to ROCK config) ---
pretrain: /home/ubuntu/ALE-latest/model-checkpoints/Qwen3.5-2B
reward_pretrain: /home/ubuntu/ALE-latest/model-checkpoints/Qwen3.5-2B
actor_train:
  worker_cls: roll.pipeline.agentic.agentic_actor_pg_worker.ActorWorker
  pg_variant: "ipa_chunk"  # chunk-level IPA loss (paper Eq. 9)
  model_args:
    flash_attn: sdpa
    attn_implementation: sdpa
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
    freeze_module_prefix: vision_model
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 4
    warmup_steps: 0
  data_args:
    template: qwen3_coder
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 2
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      context_parallel_size: 2
      sequence_parallel: true
      use_distributed_optimizer: true
      recompute_granularity: full
  device_mapping: list(range(0,4))
  infer_batch_size: 1
actor_infer:
  model_args:
    flash_attn: sdpa
    attn_implementation: sdpa
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${max_tokens_per_step}
    top_p: 1.0
    top_k: 50
    num_beams: 1
    temperature: 1.0
    num_return_sequences: 1
    stop_strings: ["</tool_call>"]
    include_stop_str_in_output: true
  data_args:
    template: qwen3_coder
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.6
      block_size: 16
      load_format: auto
      tensor_parallel_size: 1
      max_model_len: 32768
  device_mapping: list(range(0,8))
```
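Generation stops on `"</tool_call>"` and keeps the stop string in the output, so each turn ends with one complete tool-call block, which the pipeline then parses into a dict (`parse_tool_call_parameter_to_dict: true`). A sketch of such parsing, assuming the qwen3_coder-style `<tool_call>{...}</tool_call>` JSON format; the helper below is illustrative, not ROLL's actual parser:

```python
import json
import re
from typing import Optional

# A completion cut at the "</tool_call>" stop string should contain exactly one
# tool-call block; extract its JSON payload as a dict.
TOOL_CALL_RE = re.compile(r"<tool_call>\s*(\{.*\})\s*</tool_call>", re.DOTALL)

def parse_tool_call(completion: str) -> Optional[dict]:
    match = TOOL_CALL_RE.search(completion)
    if match is None:
        return None  # the model produced no tool call this turn
    return json.loads(match.group(1))  # e.g. {"name": ..., "arguments": {...}}

completion = '<tool_call>{"name": "bash", "arguments": {"command": "ls"}}</tool_call>'
print(parse_tool_call(completion))  # {'name': 'bash', 'arguments': {'command': 'ls'}}
```

The file continues with the reference model, reward normalization, and environment settings: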
```yaml
reference:
  model_args:
    attn_implementation: sdpa
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: ~
    freeze_module_prefix: vision_model
  data_args:
    template: qwen3_coder
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 2
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      context_parallel_size: 2
  device_mapping: list(range(0,4))
  infer_batch_size: 1

# IPA uses absolute returns — no reward normalization
reward_normalization:
  grouping: traj_group_id
  method: identity

# --- Environment config (OpenReward — replaces ROCK) ---
max_actions_per_traj: 16  # 16 turns max (matches ROCK config)
env_manager_cls: roll.pipeline.agentic.env_manager.agent_native_env_manager.AgentNativeStepEnvManager

train_env_manager:
  max_env_num_per_worker: 1
  num_env_groups: 1
  group_size: 1  # 16 trajectories per batch
  tags: [OpenRewardEndlessTerminalsTrain]
  num_groups_partition: [1]

val_env_manager:
  max_env_num_per_worker: 1
  num_env_groups: 1
  group_size: 1
  tags: [OpenRewardEndlessTerminalsVal]
  num_groups_partition: [1]

custom_envs:
  OpenRewardEndlessTerminalsTrain:
    env_type: "openreward_env"
    max_steps: ${max_actions_per_traj}
    max_tokens_per_step: ${max_tokens_per_step}
    env_manager_cls: ${env_manager_cls}
    agent_system_template: "unused — system prompt built dynamically from OpenReward tool specs"
    agent_template: "unused — observation is full message list from OpenRewardEnv"
    env_config:
      environment_name: "kanishk/EndlessTerminals"
      split: "train"
      mode: "train"
      max_steps: ${max_actions_per_traj}
      reward_reduction: "sum"
      nonterminal_reward: 0.0
      retry_max_attempts: 3
      retry_backoff_seconds: 5.0
  OpenRewardEndlessTerminalsVal:
    env_type: "openreward_env"
    max_steps: ${max_actions_per_traj}
    max_tokens_per_step: ${max_tokens_per_step}
    env_manager_cls: ${env_manager_cls}
    agent_system_template: "unused"
    agent_template: "unused"
    env_config:
      environment_name: "kanishk/EndlessTerminals"
      split: "train"
      mode: "val"
      max_steps: ${max_actions_per_traj}
      reward_reduction: "sum"
      nonterminal_reward: 0.0
```
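In `env_config`, `reward_reduction: "sum"` and `nonterminal_reward: 0.0` together determine the scalar a trajectory reports back: intermediate steps contribute a constant 0.0 and the per-step rewards are summed, so the trajectory reward reduces to the terminal reward. A minimal sketch of that plausible semantics (the function and its signature are assumptions, not OpenReward's actual API):

```python
from typing import List

def trajectory_reward(terminal_reward: float, num_steps: int,
                      reduction: str = "sum", nonterminal_reward: float = 0.0) -> float:
    # Assumed semantics of the env_config above: every non-terminal step yields
    # nonterminal_reward, the final step yields the environment's terminal
    # reward, and reduction collapses the list to one scalar.
    step_rewards: List[float] = [nonterminal_reward] * (num_steps - 1) + [terminal_reward]
    if reduction == "sum":
        return sum(step_rewards)
    raise ValueError(f"unsupported reduction: {reduction}")

print(trajectory_reward(1.0, 16))  # 1.0: with nonterminal_reward 0.0, sum equals the terminal reward
```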

roll/pipeline/agentic/env/__init__.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -17,7 +17,10 @@
 gem.register("deepeyes", entry_point="roll.pipeline.agentic.env.deepeyes:DeepEyesEnv")
 gem.register("rock_tb_native_env", entry_point="roll.pipeline.agentic.env.sandbox.rock_tb_native_env:RockTBNativeEnv")
 
-
+try:
+    gem.register("openreward_env", entry_point="roll.pipeline.agentic.env.openreward:OpenRewardEnv")
+except Exception as e:
+    logger.info(f"Failed to register openreward_env: {e}")
 
 try:
     # add webshop-minimal to PYTHONPATH
```
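With the registration guarded by try/except, a missing optional dependency degrades to a log line instead of breaking the whole env registry. Once registered, the env can be constructed by id like the others above; a hedged sketch, assuming GEM's gym-style `make`/`reset` API and that keyword arguments are forwarded to `OpenRewardEnv` the way the `env_config` block suggests:

```python
import gem

# Construct the newly registered env by id. The kwargs mirror the env_config
# block in the YAML config; whether gem.make forwards them exactly like this
# is an assumption, not verified against OpenRewardEnv's signature.
env = gem.make(
    "openreward_env",
    environment_name="kanishk/EndlessTerminals",
    split="train",
    mode="train",
    max_steps=16,
)
obs, info = env.reset()  # gym-style reset, assuming GEM follows that convention
```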
Lines changed: 3 additions & 0 deletions (path not shown on this page; presumably roll/pipeline/agentic/env/openreward/__init__.py, matching the entry_point registered above)

@@ -0,0 +1,3 @@

```python
from .openreward_env import OpenRewardEnv

__all__ = ["OpenRewardEnv"]
```
