Commit 3e23b47

Merge remote-tracking branch 'upstream/main' into fix-pos-id
2 parents 338bb8f + ea614ba

37 files changed: +112 -3247 lines changed

torchtitan/components/validate.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -316,7 +316,7 @@ def validate(
                 loss, parallel_dims.get_optional_mesh("loss")
             )
         else:
-            global_avg_loss = loss.item()
+            global_avg_loss = float(loss.item())
 
         self.metrics_processor.log_validation(loss=global_avg_loss, step=step)
```

torchtitan/distributed/utils.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -52,10 +52,10 @@ def _dist_reduce(
         x = funcol.all_reduce(x, reduceOp=reduceOp, group=extra_pg)
 
     if mesh is None:
-        return x.item()
+        return float(x.item())
 
     assert x.numel() == 1  # required by `.item()`
-    return funcol.all_reduce(x, reduceOp=reduceOp, group=mesh).item()
+    return float(funcol.all_reduce(x, reduceOp=reduceOp, group=mesh).item())
 
 
 # TODO: rename this to maybe_dist_max
```
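Both `float(...)` changes in this commit serve the same purpose: `torch.Tensor.item()` returns a Python `int` for integer tensors and a `float` for floating-point tensors, so wrapping it in `float()` guarantees these reduction helpers return a `float` for every dtype. A minimal sketch of the pattern, with a plain Python value standing in for the `.item()` result (no torch dependency):

```python
def reduce_to_float(item_result) -> float:
    # item_result stands in for torch.Tensor.item(), which yields an int
    # for integer tensors and a float for floating-point tensors.
    # Wrapping in float() pins the return type regardless of dtype.
    return float(item_result)

print(reduce_to_float(7))     # int stand-in -> 7.0
print(reduce_to_float(0.25))  # float stand-in -> 0.25
```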

torchtitan/experiments/__init__.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -14,6 +14,6 @@
         "autoparallel.deepseek_v3",
         "autoparallel.local_map_deepseek_v3",
         "ft.llama3",
-        "rl.unified",
+        "rl",
     ]
 )
```
Lines changed: 66 additions & 9 deletions
```diff
@@ -1,12 +1,69 @@
-# Deterministic RL Training with vLLM
+# RL Training with TorchTitan and vLLM
 
-This package provides two approaches for integrating TorchTitan models with vLLM:
+This directory contains code for RL training that uses TorchTitan model definitions with the vLLM inference engine for fast rollout generation.
 
-1. vllm_compat/ - vLLM-Compatible approach
-   - Separate model definition matching vLLM's weight format
-   - Support batch-invariant and bit-wise identity between train and inference
-   - Custom backward passes for attention gradient computation
+## Overview
+The integration consists of the following components:
 
-2. unified/ - Unified approach
-   - Uses canonical TorchTitan model definition for inference directly
-   - Replaces attention with vLLM Compatible attention for inference
+1. **vLLM Model Wrapper** (`models/vllm_wrapper.py`): adapts TorchTitan models for vLLM's inference engine
+2. **RL Training Loop** (`simple_grpo_sum_digits.py`): GRPO-based RL training with Monarch actors
+3. **Inference Script** (`inference_example.py`): standalone inference using the vLLM engine
+
+
+## Quick Start
+### Prerequisites
+
+0. Create and activate an environment with uv:
+```bash
+uv venv --python 3.12 titan-rl
+source titan-rl/bin/activate
+```
+
+1. Install Monarch:
+```bash
+uv pip install torchmonarch
+```
+
+
+2. Install PyTorch nightly for torchtitan, plus pre-built vLLM wheels matching the PyTorch nightly version:
+```bash
+# Install vllm with nightly torch
+uv pip install torch vllm xformers --pre \
+  --extra-index-url https://download.pytorch.org/whl/nightly/cu128 \
+  --index-strategy unsafe-best-match
+```
+
+**NOTE:** The pre-built vLLM wheels target CUDA 12.8, though they should work with most older CUDA versions. Alternatively, you can install the corresponding vLLM pre-built wheels directly from https://download.pytorch.org/whl/nightly/cu128, for example: `uv pip install vllm-1.0.0.dev20260219+cu130-<suffix>.whl`. Ensure the build version number (e.g., `dev20260219`) matches your PyTorch nightly installation.
+
+
+3. Install TorchTitan in editable mode:
+```bash
+uv pip install -e .
+```
+
+4. Download the `Qwen/Qwen3-0.6B` (or `Qwen/Qwen3-1.7B`) checkpoint from HuggingFace to the `torchtitan/experiments/rl/example_checkpoint` folder:
+```bash
+python scripts/download_hf_assets.py --repo_id Qwen/Qwen3-0.6B --local_dir torchtitan/experiments/rl/example_checkpoint --all --hf_token=...
+
+python scripts/download_hf_assets.py --repo_id Qwen/Qwen3-1.7B --local_dir torchtitan/experiments/rl/example_checkpoint --all --hf_token=...
+```
+
+5. Run inference with the torchtitan model definition:
+```bash
+torchrun --nproc_per_node=2 torchtitan/experiments/rl/inference_example.py
+```
+
+**NOTE:** Set `--nproc_per_node` to the world size, which should match the `tensor_parallel_degree` in the `VLLMGenerator` config.
+
+6. Run the simple GRPO RL loop to learn the sum-digits task:
+```bash
+python torchtitan/experiments/rl/simple_grpo_sum_digits.py --module rl --config rl_grpo_qwen3_0_6b
+```
+
+**NOTE:** If you downloaded your HF model to a different path than the one in step 4, specify it in your command with `--hf_assets_path=<path_to_model_checkpoint>`.
+
+We use a unified model definition from torchtitan for the trainer and the generator, ensuring bitwise-identical models and thereby addressing a class of subtle correctness bugs in RL for LLMs.
+
+
+
+**Current status:** Batch invariance is only supported for single-GPU configurations (TP=1) for both the trainer and generator. When tensor parallelism is enabled (TP > 1), batch-invariant mode is not yet supported.
```
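The README describes the training loop as GRPO-based. As background, the core of GRPO is a group-relative advantage: each sampled completion's reward is normalized against the other completions for the same prompt. Below is a minimal sketch of that normalization only; the actual loss in `simple_grpo_sum_digits.py` may differ in details such as the epsilon, clipping, or KL terms.

```python
import statistics

def group_relative_advantages(rewards: list[float], eps: float = 1e-6) -> list[float]:
    # GRPO advantage for rollout i in a group sampled from one prompt:
    # (reward_i - group mean) / (group std + eps)
    mean = statistics.fmean(rewards)
    std = statistics.pstdev(rewards)
    return [(r - mean) / (std + eps) for r in rewards]

# Four rollouts for one sum-digits prompt, graded 1.0 if correct else 0.0:
advs = group_relative_advantages([1.0, 0.0, 1.0, 0.0])
print([round(a, 3) for a in advs])  # correct answers receive positive advantage
```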

torchtitan/experiments/rl/unified/__init__.py renamed to torchtitan/experiments/rl/__init__.py

Lines changed: 3 additions & 7 deletions
```diff
@@ -8,18 +8,14 @@
 Unified approach for running TorchTitan models with vLLM inference.
 
 To register TorchTitan models with vLLM:
-    from torchtitan.experiments.rl.unified.plugin import register
+    from torchtitan.experiments.rl.plugin import register
     register(model_spec)
 """
 
-from torchtitan.experiments.rl.unified.models.vllm_wrapper import (
-    TorchTitanVLLMModelWrapper,
-)
+from torchtitan.experiments.rl.models.vllm_wrapper import TorchTitanVLLMModelWrapper
 
 # Export plugin register function for manual use (no auto-registration)
-from torchtitan.experiments.rl.unified.plugin import (
-    register_model_to_vllm_model_registry,
-)
+from torchtitan.experiments.rl.plugin import register_model_to_vllm_model_registry
 
 
 __all__ = [
```

torchtitan/experiments/rl/unified/actors/generator.py renamed to torchtitan/experiments/rl/actors/generator.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -14,11 +14,11 @@
 from monarch.actor import Actor, endpoint
 from torchtitan.config import Configurable
 from torchtitan.config.configs import ParallelismConfig
-from torchtitan.experiments.rl.unified.plugin import (
+from torchtitan.experiments.rl.plugin import (
     register_model_to_vllm_model_registry,
     VLLM_MODEL_NAME,
 )
-from torchtitan.experiments.rl.unified.types import Episode
+from torchtitan.experiments.rl.types import Episode
 from torchtitan.protocols.model_spec import ModelSpec
 from vllm import EngineArgs, LLMEngine, SamplingParams
 from vllm.config import AttentionConfig, CompilationConfig
```

torchtitan/experiments/rl/unified/actors/grader.py renamed to torchtitan/experiments/rl/actors/grader.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -9,7 +9,7 @@
 
 import torch
 from monarch.actor import Actor, endpoint
-from torchtitan.experiments.rl.unified.types import Episode
+from torchtitan.experiments.rl.types import Episode
 
 logger = logging.getLogger(__name__)
 
```

torchtitan/experiments/rl/unified/actors/trainer.py renamed to torchtitan/experiments/rl/actors/trainer.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -23,15 +23,15 @@
 from torchtitan.config import CommConfig, Configurable, TORCH_DTYPE_MAP
 from torchtitan.config.configs import ParallelismConfig, TrainingConfig
 from torchtitan.distributed import ParallelDims, utils as dist_utils
-from torchtitan.experiments.rl.unified.actors.utils import (
+from torchtitan.experiments.rl.actors.utils import (
     compute_policy_gradient_loss,
     compute_token_log_probs,
     verify_logprob_identity,
 )
-from torchtitan.experiments.rl.unified.models.attention import (
+from torchtitan.experiments.rl.models.attention import (
     replace_with_vllm_compatible_flash_attention,
 )
-from torchtitan.experiments.rl.unified.types import Episode
+from torchtitan.experiments.rl.types import Episode
 from torchtitan.protocols.model_spec import ModelSpec
 from torchtitan.tools import utils
```

File renamed without changes.

torchtitan/experiments/rl/unified/config_registry.py renamed to torchtitan/experiments/rl/config_registry.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -8,19 +8,19 @@
 Config entry points for the RL/unified experiment.
 
 Each function returns a complete ``RLTrainer.Config`` and is discoverable by
-``ConfigManager`` via ``--module rl.unified --config <function_name>``.
+``ConfigManager`` via ``--module rl --config <function_name>``.
 """
 
 from torchtitan.components.lr_scheduler import LRSchedulersContainer
 from torchtitan.components.optimizer import OptimizersContainer
 from torchtitan.config.configs import ParallelismConfig, TrainingConfig
-from torchtitan.experiments.rl.unified.actors.generator import (
+from torchtitan.experiments.rl.actors.generator import (
     GeneratorCompileConfig,
     SamplingConfig,
     VLLMGenerator,
 )
-from torchtitan.experiments.rl.unified.actors.trainer import PolicyTrainer
-from torchtitan.experiments.rl.unified.simple_grpo_sum_digits import RLTrainer
+from torchtitan.experiments.rl.actors.trainer import PolicyTrainer
+from torchtitan.experiments.rl.simple_grpo_sum_digits import RLTrainer
 from torchtitan.models.qwen3 import model_registry
```

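The docstring change above updates the discovery path: config entry points are now found via `--module rl --config <function_name>`. The general mechanism (look up a factory function by name in a module and call it) can be sketched generically. Everything below is illustrative, not TorchTitan's actual `ConfigManager` API; `load_config` and `demo_configs` are hypothetical names.

```python
import importlib
import sys
import types

def load_config(module_name: str, config_name: str):
    # Hypothetical sketch of name-based config discovery, in the spirit of
    # ``--module rl --config <function_name>``: import the module, look up
    # the entry-point function by name, and call it to build the config.
    module = importlib.import_module(module_name)
    factory = getattr(module, config_name)
    return factory()

# Build a throwaway in-memory module to demonstrate the lookup against:
demo = types.ModuleType("demo_configs")
demo.rl_grpo_qwen3_0_6b = lambda: {"model": "qwen3-0.6b", "algo": "grpo"}
sys.modules["demo_configs"] = demo

print(load_config("demo_configs", "rl_grpo_qwen3_0_6b"))
```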