Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
f18c460
Sandboxes work rebased on top of SkyRL main 20eb6f1
CharlieFRuan Nov 21, 2025
891916a
[TBenchGen] Add more logs to print out results when failing (#1)
CharlieFRuan Nov 21, 2025
ca85618
[Trivial] Update gitignore, and remove terminal_bench.yaml content
CharlieFRuan Nov 21, 2025
dae1a02
[Fix] Fix litellm asyncio logging error when running TBench by using …
CharlieFRuan Nov 21, 2025
39e42b3
[TbenchGen] Add sticky routing with session_id (#3)
CharlieFRuan Nov 21, 2025
7fdef1a
On Policy Distillation for TerminalBench (#4)
atutej Nov 23, 2025
d68c31d
[Logs][wandb] Add systems metrics like GPU Util for multi node (#5)
CharlieFRuan Nov 23, 2025
e44bc9d
[skyrl-train][Fix] Fix epoch counter after resuming from checkpoint (#7)
CharlieFRuan Nov 24, 2025
1fe6c5a
[Fix] Fix chat templating in Mini-SWE-Agent and Terminal-Bench exampl…
CharlieFRuan Nov 26, 2025
f22fbc4
[train][TBench][MiniSwe] Fix custom generator loss masking (#710)
CharlieFRuan Nov 26, 2025
a086077
[Harbor] Bump to use Harbor (#8)
CharlieFRuan Nov 26, 2025
996b6a9
[fix] abort all requests before sleep (#458)
vutrung96 Oct 14, 2025
9948063
[Harbor] minor change on error handling for chat history
CharlieFRuan Dec 2, 2025
c493316
[engine] Fix abort request handling for v0.9.2 vllm, add logger
CharlieFRuan Dec 2, 2025
e162dd4
[Hack] Add custom chat template for /chat/completions, hardcode sampl…
CharlieFRuan Dec 3, 2025
01c5a91
[Harbor] Add tbench config enable_summarize (#11)
CharlieFRuan Dec 3, 2025
fb0728b
[Hack][Fix] Fix custom template by applying that in GeneratorOutput p…
CharlieFRuan Dec 3, 2025
e0eb1d5
temp working for async
Dec 17, 2025
27d61e8
working on async rl
Dec 19, 2025
d7472ac
should be good post merge
Dec 19, 2025
090f325
asyncrl seems to be working e2e
Dec 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,13 @@ docs/_spelling/
/skyrl-gym/dist

*.log
trials/

# SQLite database files
*.db

# uv lock files
uv.lock
6 changes: 3 additions & 3 deletions skyrl-train/examples/async/main_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import hydra
from omegaconf import DictConfig
from skyrl_train.entrypoints.main_base import BasePPOExp, config_dir, validate_cfg
from .async_trainer import AsyncRayPPOTrainer
from skyrl_train.fully_async_trainer import FullyAsyncRayPPOTrainer
import asyncio
from skyrl_train.utils import initialize_ray
import ray
Expand All @@ -23,7 +23,7 @@ def get_trainer(
generator,
colocate_pg,
):
return AsyncRayPPOTrainer(
return FullyAsyncRayPPOTrainer(
cfg=cfg,
tracker=tracker,
tokenizer=tokenizer,
Expand All @@ -33,7 +33,7 @@ def get_trainer(
generator=generator,
colocate_pg=colocate_pg,
)

def run(self):
trainer = self._setup_trainer()
# Start the async training loop
Expand Down
4 changes: 4 additions & 0 deletions skyrl-train/examples/on_policy_distillation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,12 @@ In `main_on_policy_distill.py` we provide a simple example for modifying SkyRL t
To get started, first set up the dataset from the DAPO example:

```bash
# Run from the `skyrl-train` directory
uv run examples/algorithms/dapo/prepare_dapo_data.sh
Comment on lines +18 to +23
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

This file contains unresolved merge conflict markers (<<<<<<<, =======, >>>>>>>). Please resolve them.

Suggested change
<<<<<<< HEAD
uv run examples/algorithms/dapo/prepare_dapo_data.sh
=======
# Run from the `skyrl-train` directory
bash examples/algorithms/dapo/prepare_dapo_data.sh
>>>>>>> main
uv run examples/algorithms/dapo/prepare_dapo_data.sh

```

Then, just make sure to set the path to your desired teacher model, and you're ready to kick off training!
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ set -x

# Running on policy distillation for Math on the DAPO math dataset, with eval on AIME 2024.
# Uses Qwen-3-1.7B-Base as the student model and an RL trained Qwen-3-4B as the teacher model
# uv run examples/algorithms/dapo/prepare_dapo_data.sh
Comment on lines +5 to +9
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

This file contains unresolved merge conflict markers (<<<<<<<, =======, >>>>>>>). Please resolve them.

Suggested change
<<<<<<< HEAD
# uv run examples/algorithms/dapo/prepare_dapo_data.sh
=======
# bash examples/algorithms/dapo/prepare_dapo_data.sh
>>>>>>> main
# uv run examples/algorithms/dapo/prepare_dapo_data.sh

# bash examples/on_policy_distillation/run_on_policy_distill_math_qwen3_1.7b.sh

DATA_DIR="$HOME/data/dapo"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ set -x

# Running on policy distillation for Math on the DAPO math dataset, with eval on AIME 2024.
# Uses Qwen-3-4B-Base as the student model and an RL trained Qwen-3-4B as the teacher model
# uv run examples/algorithms/dapo/prepare_dapo_data.sh
Comment on lines +5 to +9
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

This file contains unresolved merge conflict markers (<<<<<<<, =======, >>>>>>>). Please resolve them.

Suggested change
<<<<<<< HEAD
# uv run examples/algorithms/dapo/prepare_dapo_data.sh
=======
# bash examples/algorithms/dapo/prepare_dapo_data.sh
>>>>>>> main
# uv run examples/algorithms/dapo/prepare_dapo_data.sh

# bash examples/on_policy_distillation/run_on_policy_distill_math_qwen3_4b.sh

DATA_DIR="$HOME/data/dapo"
Expand Down
24 changes: 23 additions & 1 deletion skyrl-train/examples/terminal_bench/entrypoints/main_tbench.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from skyrl_train.utils.utils import initialize_ray
from examples.terminal_bench.terminal_bench_generator import TerminalBenchGenerator
from examples.terminal_bench.dataset import TerminalBenchTaskDataset

from skyrl_train.fully_async_trainer import FullyAsyncRayPPOTrainer

class TerminalBenchExp(BasePPOExp):
def get_generator(self, cfg, tokenizer, inference_engine_client):
Expand Down Expand Up @@ -52,6 +52,28 @@ def get_eval_dataset(self):
return prompts_dataset
return None

def get_trainer(
self,
cfg,
tracker,
tokenizer,
train_dataset,
eval_dataset,
inference_engine_client,
generator,
colocate_pg,
):
return FullyAsyncRayPPOTrainer(
cfg=cfg,
tracker=tracker,
tokenizer=tokenizer,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
inference_engine_client=inference_engine_client,
generator=generator,
colocate_pg=colocate_pg,
)


@ray.remote(num_cpus=1)
def skyrl_entrypoint(cfg: DictConfig):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
config_dir,
)
from skyrl_train.generators.base import GeneratorInput
from examples.terminal_bench.generator.terminal_bench_generator import TerminalBenchGenerator
from examples.terminal_bench.terminal_bench_generator import TerminalBenchGenerator
from examples.terminal_bench.dataset import TerminalBenchTaskDataset


Expand Down
36 changes: 36 additions & 0 deletions skyrl-train/examples/terminal_bench/entrypoints/main_tbench_opd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
Main entrypoint for training on terminal bench tasks.
"""
import ray
import hydra
from omegaconf import DictConfig
from skyrl_train.entrypoints.main_base import BasePPOExp, config_dir
from skyrl_train.utils import validate_cfg
from skyrl_train.utils.utils import initialize_ray
from examples.terminal_bench.terminal_bench_generator import TerminalBenchGenerator
from examples.terminal_bench.dataset import TerminalBenchTaskDataset
from examples.terminal_bench.entrypoints.main_tbench import TerminalBenchExp
from examples.on_policy_distillation.main_on_policy_distill import OnPolicyDistillationTrainer

class OnPolicyDistillationTerminalBenchExp(TerminalBenchExp):
    """Terminal-bench experiment variant trained via on-policy distillation.

    Inherits all generator/dataset wiring from ``TerminalBenchExp`` and only
    swaps out the trainer class used for the run.
    """

    def get_trainer(self, *args, **kwargs):
        # Forward every argument untouched to the distillation trainer.
        trainer_cls = OnPolicyDistillationTrainer
        return trainer_cls(*args, **kwargs)


@ray.remote(num_cpus=1)
def skyrl_entrypoint(cfg: DictConfig):
    """Ray task that builds the experiment and runs its training loop."""
    # Running inside a remote task keeps the training loop off the head node.
    OnPolicyDistillationTerminalBenchExp(cfg).run()

@hydra.main(config_path=config_dir, config_name="ppo_base_config", version_base=None)
def main(cfg: DictConfig) -> None:
    """Validate the Hydra config, start Ray, and block until training finishes."""
    validate_cfg(cfg)  # fail fast on bad arguments before spinning up Ray
    initialize_ray(cfg)
    # Launch the entrypoint as a remote task and wait for it to complete.
    ray.get(skyrl_entrypoint.remote(cfg))


if __name__ == "__main__":
    main()

This file was deleted.

Loading