Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
f18c460
Sandboxes work rebased on top of SkyRL main 20eb6f1
CharlieFRuan Nov 21, 2025
891916a
[TBenchGen] Add more logs to print out results when failing (#1)
CharlieFRuan Nov 21, 2025
ca85618
[Trivial] Update gitignore, and remove terminal_bench.yaml content
CharlieFRuan Nov 21, 2025
dae1a02
[Fix] Fix litellm asyncio logging error when running TBench by using …
CharlieFRuan Nov 21, 2025
39e42b3
[TbenchGen] Add sticky routing with session_id (#3)
CharlieFRuan Nov 21, 2025
7fdef1a
On Policy Distillation for TerminalBench (#4)
atutej Nov 23, 2025
d68c31d
[Logs][wandb] Add systems metrics like GPU Util for multi node (#5)
CharlieFRuan Nov 23, 2025
e44bc9d
[skyrl-train][Fix] Fix epoch counter after resuming from checkpoint (#7)
CharlieFRuan Nov 24, 2025
1fe6c5a
[Fix] Fix chat templating in Mini-SWE-Agent and Terminal-Bench exampl…
CharlieFRuan Nov 26, 2025
f22fbc4
[train][TBench][MiniSwe] Fix custom generator loss masking (#710)
CharlieFRuan Nov 26, 2025
a086077
[Harbor] Bump to use Harbor (#8)
CharlieFRuan Nov 26, 2025
996b6a9
[fix] abort all requests before sleep (#458)
vutrung96 Oct 14, 2025
9948063
[Harbor] minor change on error handling for chat history
CharlieFRuan Dec 2, 2025
c493316
[engine] Fix abort request handling for v0.9.2 vllm, add logger
CharlieFRuan Dec 2, 2025
e162dd4
[Hack] Add custom chat template for /chat/completions, hardcode sampl…
CharlieFRuan Dec 3, 2025
01c5a91
[Harbor] Add tbench config enable_summarize (#11)
CharlieFRuan Dec 3, 2025
fb0728b
[Hack][Fix] Fix custom template by applying that in GeneratorOutput p…
CharlieFRuan Dec 3, 2025
e0eb1d5
temp working for async
Dec 17, 2025
27d61e8
working on async rl
Dec 19, 2025
d7472ac
should be good post merge
Dec 19, 2025
090f325
asyncrl seems to be working e2e
Dec 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,13 @@ docs/_spelling/
/skyrl-gym/dist

*.log
trials/

# SQLite database files
*.db

# uv lock files
uv.lock
6 changes: 3 additions & 3 deletions skyrl-train/examples/async/main_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import hydra
from omegaconf import DictConfig
from skyrl_train.entrypoints.main_base import BasePPOExp, config_dir, validate_cfg
from .async_trainer import AsyncRayPPOTrainer
from skyrl_train.fully_async_trainer import FullyAsyncRayPPOTrainer
import asyncio
from skyrl_train.utils import initialize_ray
import ray
Expand All @@ -23,7 +23,7 @@ def get_trainer(
generator,
colocate_pg,
):
return AsyncRayPPOTrainer(
return FullyAsyncRayPPOTrainer(
cfg=cfg,
tracker=tracker,
tokenizer=tokenizer,
Expand All @@ -33,7 +33,7 @@ def get_trainer(
generator=generator,
colocate_pg=colocate_pg,
)

def run(self):
trainer = self._setup_trainer()
# Start the async training loop
Expand Down
4 changes: 4 additions & 0 deletions skyrl-train/examples/on_policy_distillation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,12 @@ In `main_on_policy_distill.py` we provide a simple example for modifying SkyRL t
To get started, first set up the dataset from the DAPO example:

```bash
# Run from the `skyrl-train` directory
uv run examples/algorithms/dapo/prepare_dapo_data.sh
Comment on lines +18 to +23
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

This file contains unresolved merge conflict markers (<<<<<<<, =======, >>>>>>>). Please resolve them.

Suggested change
<<<<<<< HEAD
uv run examples/algorithms/dapo/prepare_dapo_data.sh
=======
# Run from the `skyrl-train` directory
bash examples/algorithms/dapo/prepare_dapo_data.sh
>>>>>>> main
uv run examples/algorithms/dapo/prepare_dapo_data.sh

```

Then, just make sure to set the path to your desired teacher model, and you're ready to kick off training!
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ set -x

# Running on policy distillation for Math on the DAPO math dataset, with eval on AIME 2024.
# Uses Qwen-3-1.7B-Base as the student model and an RL trained Qwen-3-4B as the teacher model
# uv run examples/algorithms/dapo/prepare_dapo_data.sh
Comment on lines +5 to +9
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

This file contains unresolved merge conflict markers (<<<<<<<, =======, >>>>>>>). Please resolve them.

Suggested change
<<<<<<< HEAD
# uv run examples/algorithms/dapo/prepare_dapo_data.sh
=======
# bash examples/algorithms/dapo/prepare_dapo_data.sh
>>>>>>> main
# uv run examples/algorithms/dapo/prepare_dapo_data.sh

# bash examples/on_policy_distillation/run_on_policy_distill_math_qwen3_1.7b.sh

DATA_DIR="$HOME/data/dapo"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ set -x

# Running on policy distillation for Math on the DAPO math dataset, with eval on AIME 2024.
# Uses Qwen-3-4B-Base as the student model and an RL trained Qwen-3-4B as the teacher model
# uv run examples/algorithms/dapo/prepare_dapo_data.sh
Comment on lines +5 to +9
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

This file contains unresolved merge conflict markers (<<<<<<<, =======, >>>>>>>). Please resolve them.

Suggested change
<<<<<<< HEAD
# uv run examples/algorithms/dapo/prepare_dapo_data.sh
=======
# bash examples/algorithms/dapo/prepare_dapo_data.sh
>>>>>>> main
# uv run examples/algorithms/dapo/prepare_dapo_data.sh

# bash examples/on_policy_distillation/run_on_policy_distill_math_qwen3_4b.sh

DATA_DIR="$HOME/data/dapo"
Expand Down
24 changes: 23 additions & 1 deletion skyrl-train/examples/terminal_bench/entrypoints/main_tbench.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from skyrl_train.utils.utils import initialize_ray
from examples.terminal_bench.terminal_bench_generator import TerminalBenchGenerator
from examples.terminal_bench.dataset import TerminalBenchTaskDataset

from skyrl_train.fully_async_trainer import FullyAsyncRayPPOTrainer

class TerminalBenchExp(BasePPOExp):
def get_generator(self, cfg, tokenizer, inference_engine_client):
Expand Down Expand Up @@ -52,6 +52,28 @@ def get_eval_dataset(self):
return prompts_dataset
return None

def get_trainer(
self,
cfg,
tracker,
tokenizer,
train_dataset,
eval_dataset,
inference_engine_client,
generator,
colocate_pg,
):
return FullyAsyncRayPPOTrainer(
cfg=cfg,
tracker=tracker,
tokenizer=tokenizer,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
inference_engine_client=inference_engine_client,
generator=generator,
colocate_pg=colocate_pg,
)


@ray.remote(num_cpus=1)
def skyrl_entrypoint(cfg: DictConfig):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
config_dir,
)
from skyrl_train.generators.base import GeneratorInput
from examples.terminal_bench.generator.terminal_bench_generator import TerminalBenchGenerator
from examples.terminal_bench.terminal_bench_generator import TerminalBenchGenerator
from examples.terminal_bench.dataset import TerminalBenchTaskDataset


Expand Down
36 changes: 36 additions & 0 deletions skyrl-train/examples/terminal_bench/entrypoints/main_tbench_opd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
Main entrypoint for training on terminal bench tasks.
"""
import ray
import hydra
from omegaconf import DictConfig
from skyrl_train.entrypoints.main_base import BasePPOExp, config_dir
from skyrl_train.utils import validate_cfg
from skyrl_train.utils.utils import initialize_ray
from examples.terminal_bench.terminal_bench_generator import TerminalBenchGenerator
from examples.terminal_bench.dataset import TerminalBenchTaskDataset
from examples.terminal_bench.entrypoints.main_tbench import TerminalBenchExp
from examples.on_policy_distillation.main_on_policy_distill import OnPolicyDistillationTrainer

class OnPolicyDistillationTerminalBenchExp(TerminalBenchExp):
    """Terminal-bench experiment variant trained via on-policy distillation.

    Inherits all generator/dataset wiring from ``TerminalBenchExp`` and only
    swaps out the trainer class used for the run.
    """

    def get_trainer(self, *args, **kwargs):
        # Forward every argument untouched to the distillation trainer.
        trainer_cls = OnPolicyDistillationTrainer
        return trainer_cls(*args, **kwargs)


@ray.remote(num_cpus=1)
def skyrl_entrypoint(cfg: DictConfig):
    """Ray task that builds the experiment and runs its training loop."""
    # Running inside a remote task keeps the training loop off the head node.
    OnPolicyDistillationTerminalBenchExp(cfg).run()

@hydra.main(config_path=config_dir, config_name="ppo_base_config", version_base=None)
def main(cfg: DictConfig) -> None:
    """Validate the Hydra config, start Ray, and block until training finishes."""
    validate_cfg(cfg)  # fail fast on bad arguments before spinning up Ray
    initialize_ray(cfg)
    # Launch the entrypoint as a remote task and wait for it to complete.
    ray.get(skyrl_entrypoint.remote(cfg))


if __name__ == "__main__":
    main()

This file was deleted.

Loading