@@ -0,0 +1,203 @@
# OpenReward + EndlessTerminals + STEP_REINFORCE (vanilla PG) config.
# Simpler baseline without the IPA chunk-level loss.
# The model IS the agent: no iflow, no sandbox, no anti_call_llm.
#
# Usage:
#   bash examples/agentic_demo/run_openreward_endless_terminals.sh
#   # or directly:
#   python examples/start_agentic_pipeline.py \
#     --config_path agentic_demo \
#     --config_name openreward_endless_terminals_reinforce_qwen35_2b

defaults:
  - ../config/deepspeed_zero@_here_
  - ../config/deepspeed_zero2@_here_
  - ../config/deepspeed_zero3@_here_
  - ../config/deepspeed_zero3_cpuoffload@_here_

hydra:
  run:
    dir: .
  output_subdir: null

exp_name: "openreward_endless_terminals_reinforce_qwen35_2b"
seed: 42

logging_dir: ./output/logs
output_dir: ./output
model_name: ${exp_name}-${now:%Y%m%d_%H%M%S}
rollout_dump_dir: /home/ubuntu/ALE-latest/ROLL-personal/output/rollout_dump
system_envs:
  USE_MODELSCOPE: '1'

checkpoint_config:
  type: file_system
  output_dir: /data

num_gpus_per_node: 8
rpc_timeout: 72000

max_steps: 10
save_steps: 50
logging_steps: 1
eval_steps: 0
resume_from_checkpoint: false
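# NB: max_steps (10) is below save_steps (50); assuming save_steps is a periodic
# "save every N steps" interval, this short run never writes a mid-run checkpoint.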

async_generation_ratio: 1
parse_tool_call_parameter_to_dict: true
skip_mock_system_prompt: true

track_with: wandb
tracker_kwargs:
  api_key: ${oc.env:WANDB_API_KEY}
  project: roll-agentic
  name: ${exp_name}

rollout_batch_size: 16
val_batch_size: 1
sequence_length: 32768

max_tokens_per_step: 4096
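# Budget note: each agent turn can emit up to 4096 new tokens, so the 16-action cap
# below allows up to 64k generated tokens against a 32k sequence_length; this assumes
# most trajectories terminate (or get truncated) well before the caps compound.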

# --- Vanilla STEP_REINFORCE config ---
advantage_clip: 0.2
ppo_epochs: 1
adv_estimator: "step_reinforce"
batch_adjust_mode: "random_sample"
step_reward_gamma: 1.0

init_kl_coef: 0.0
whiten_advantages: true
entropy_loss_coef: 0
max_grad_norm: 1.0
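# Rough sketch of the estimator these knobs select (inferred from the names, not
# a verbatim copy of the ROLL implementation): with gamma = step_reward_gamma,
#   G_t  = sum_{k>=t} gamma^(k-t) * r_k    # per-step return; gamma=1.0 => plain sum
#   A_t  = whiten(G_t)                     # whiten_advantages: true
#   loss = -logprob(a_t) * clip(A_t, -advantage_clip, +advantage_clip)
# applied in a single pass (ppo_epochs: 1), with no KL penalty (init_kl_coef: 0.0)
# and no entropy bonus (entropy_loss_coef: 0).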

# --- Model configs ---
pretrain: /home/ubuntu/ALE-latest/model-checkpoints/Qwen3.5-2B
reward_pretrain: /home/ubuntu/ALE-latest/model-checkpoints/Qwen3.5-2B
actor_train:
  model_args:
    flash_attn: sdpa
    attn_implementation: sdpa
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
    freeze_module_prefix: vision_model
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 4
    warmup_steps: 0
  data_args:
    template: qwen3_coder
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 2
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      context_parallel_size: 2
      sequence_parallel: true
      use_distributed_optimizer: true
      recompute_granularity: full
  device_mapping: list(range(0,4))
  infer_batch_size: 1
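# Layout sketch (inferred from device_mapping): the Megatron trainer owns GPUs 0-3;
# TP=2 x CP=2 x PP=1 consumes all 4 GPUs, leaving a single data-parallel rank, so one
# optimizer step sees roughly 1 x 4 (gradient accumulation) = 4 sequences.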
actor_infer:
  model_args:
    flash_attn: sdpa
    attn_implementation: sdpa
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${max_tokens_per_step}
    top_p: 1.0
    top_k: 50
    num_beams: 1
    temperature: 1.0
    num_return_sequences: 1
    stop_strings: ["</tool_call>"]
    include_stop_str_in_output: true
  data_args:
    template: qwen3_coder
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.6
      block_size: 16
      load_format: auto
      tensor_parallel_size: 1
      max_model_len: 32768
  device_mapping: list(range(0,8))
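# vLLM runs one TP=1 engine per GPU across all 8 GPUs and is colocated with the
# trainer on GPUs 0-3; gpu_memory_utilization 0.6 presumably leaves headroom there
# for the Megatron ranks.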

reference:
  model_args:
    attn_implementation: sdpa
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: ~
    freeze_module_prefix: vision_model
  data_args:
    template: qwen3_coder
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 2
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      context_parallel_size: 2
  device_mapping: list(range(0,4))
  infer_batch_size: 1

reward_normalization:
  grouping: traj_group_id
  method: identity
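# "identity" passes trajectory rewards through unchanged; the traj_group_id grouping
# only takes effect if method is switched to a group-relative normalization.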

# --- Environment config (OpenReward) ---
max_actions_per_traj: 16
env_manager_cls: roll.pipeline.agentic.env_manager.agent_native_env_manager.AgentNativeStepEnvManager

train_env_manager:
  max_env_num_per_worker: 1
  num_env_groups: 1
  group_size: 1
  tags: [OpenRewardEndlessTerminalsTrain]
  num_groups_partition: [1]

val_env_manager:
  max_env_num_per_worker: 1
  num_env_groups: 1
  group_size: 1
  tags: [OpenRewardEndlessTerminalsVal]
  num_groups_partition: [1]
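# Both managers run a single environment (1 group x group_size 1), so rollout_batch_size
# 16 is presumably filled by stepping that one env repeatedly; scale num_env_groups and
# group_size to parallelize collection.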

custom_envs:
  OpenRewardEndlessTerminalsTrain:
    env_type: "openreward_env"
    max_steps: ${max_actions_per_traj}
    max_tokens_per_step: ${max_tokens_per_step}
    env_manager_cls: ${env_manager_cls}
    agent_system_template: "unused — system prompt built dynamically from OpenReward tool specs"
    agent_template: "unused — observation is full message list from OpenRewardEnv"
    env_config:
      environment_name: "kanishk/EndlessTerminals"
      split: "train"
      mode: "train"
      max_steps: ${max_actions_per_traj}
      reward_reduction: "sum"
      nonterminal_reward: 0.0
      retry_max_attempts: 3
      retry_backoff_seconds: 5.0
  OpenRewardEndlessTerminalsVal:
    env_type: "openreward_env"
    max_steps: ${max_actions_per_traj}
    max_tokens_per_step: ${max_tokens_per_step}
    env_manager_cls: ${env_manager_cls}
    agent_system_template: "unused"
    agent_template: "unused"
    env_config:
      environment_name: "kanishk/EndlessTerminals"
      split: "train"
      mode: "val"
      max_steps: ${max_actions_per_traj}
      reward_reduction: "sum"
      nonterminal_reward: 0.0
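# The val entry reuses split "train" with mode "val" and omits the retry settings,
# which presumably fall back to the OpenRewardEnv defaults.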
25 changes: 25 additions & 0 deletions examples/agentic_demo/run_openreward_endless_terminals.sh
@@ -0,0 +1,25 @@
#!/bin/bash
# Run OpenReward EndlessTerminals REINFORCE training with Qwen3.5-2B.
#
# Prerequisites:
#   pip install openreward   # inside the docker container
#
# Usage (inside the roll_openreward_runner container):
#   export OPENREWARD_API_KEY="..."
#   export WANDB_API_KEY="..."
#   cd /home/ubuntu/ALE-latest/ROLL-personal
#   bash examples/agentic_demo/run_openreward_endless_terminals.sh

set -euo pipefail

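# Fail fast when required secrets are missing: ":" is a no-op command, and the
# ${VAR:?msg} expansion aborts the script if VAR is unset or empty.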
: "${OPENREWARD_API_KEY:?Set OPENREWARD_API_KEY}"
: "${WANDB_API_KEY:?Set WANDB_API_KEY}"

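# Pin NCCL to plain TCP sockets and disable vendor net/tuner plugins; a conservative
# single-node setting that sidesteps misconfigured fabric transports.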
export NCCL_NET_PLUGIN=''
export NCCL_TUNER_PLUGIN=''
export NCCL_NET=Socket
export PYTHONPATH="${PWD}:${PYTHONPATH:-}"

python examples/start_agentic_pipeline.py \
--config_path agentic_demo \
--config_name openreward_endless_terminals_reinforce_qwen35_2b
5 changes: 4 additions & 1 deletion roll/pipeline/agentic/env/__init__.py
@@ -17,7 +17,10 @@
gem.register("deepeyes", entry_point="roll.pipeline.agentic.env.deepeyes:DeepEyesEnv")
gem.register("rock_tb_native_env", entry_point="roll.pipeline.agentic.env.sandbox.rock_tb_native_env:RockTBNativeEnv")


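# openreward is an optional dependency: register the env opportunistically and log
# instead of crashing if registration fails (e.g. the package is not installed).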
try:
    gem.register("openreward_env", entry_point="roll.pipeline.agentic.env.openreward:OpenRewardEnv")
except Exception as e:
    logger.info(f"Failed to register openreward_env: {e}")

try:
    # add webshop-minimal to PYTHONPATH
3 changes: 3 additions & 0 deletions roll/pipeline/agentic/env/openreward/__init__.py
@@ -0,0 +1,3 @@
from .openreward_env import OpenRewardEnv

__all__ = ["OpenRewardEnv"]