
Commit 4e18928

Flatten rl/ directory: remove vllm_compat/, move unified/ contents to rl/

- Move all files from rl/unified/ directly under rl/ (actors, models, scripts, etc.)
- Remove rl/vllm_compat/ entirely (unused by unified code)
- Rename types.py -> rl_types.py to avoid shadowing Python stdlib types module
- Fix vllm.model_executor.layers.attention.Attention import for newer vLLM
- Update experiment registry: rl.unified -> rl
- Update all internal imports and README paths
- Add rl_grpo_qwen3_0_6b_tp1 config for TP=1 testing

1 parent 3d4425e

32 files changed: +140 −3297
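The `types.py -> rl_types.py` rename guards against Python's module-resolution order: any `types.py` that lands on `sys.path` (for example, when a script is launched from inside the package directory) shadows the stdlib `types` module for every later import in the process. A minimal, self-contained sketch of the failure mode (hypothetical code, not part of this commit):

```python
# Minimal reproduction of stdlib-module shadowing (hypothetical, not
# TorchTitan code): a file named types.py at the front of sys.path hides
# the stdlib "types" module from subsequent imports.
import importlib
import os
import shutil
import sys
import tempfile

shadow_dir = tempfile.mkdtemp()
with open(os.path.join(shadow_dir, "types.py"), "w") as f:
    f.write("EPISODE = 'shadowed'\n")

sys.path.insert(0, shadow_dir)
sys.modules.pop("types", None)  # drop the cached stdlib module

shadowed = importlib.import_module("types")
has_marker = hasattr(shadowed, "EPISODE")            # the local file won
has_simplens = hasattr(shadowed, "SimpleNamespace")  # stdlib API is gone

# Restore normal imports and clean up the temp directory.
sys.path.remove(shadow_dir)
sys.modules.pop("types", None)
shutil.rmtree(shadow_dir)
```

Renaming the project module sidesteps the collision entirely, which is cheaper than policing `sys.path` in every entry-point script.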

torchtitan/experiments/__init__.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -14,6 +14,6 @@
         "autoparallel.deepseek_v3",
         "autoparallel.local_map_deepseek_v3",
         "ft.llama3",
-        "rl.unified",
+        "rl",
     ]
 )
```
Lines changed: 67 additions & 9 deletions

@@ -1,12 +1,70 @@

Removed:

```diff
-# Deterministic RL Training with vLLM
-This package provides two approaches for integrating TorchTitan models with vLLM:
-1. vllm_compat/ - vLLM-Compatible approach
-   - Separate model definition matching vLLM's weight format
-   - Support batch-invariant and bit-wise identity between train and inference
-   - Custom backward passes for attention gradient computation
-2. unified/ - Unified approach
-   - Uses canonical TorchTitan model definition for inference directly
-   - Replaces attention with vLLM-compatible attention for inference
```

Added:

# Run vLLM Inference with the TorchTitan Qwen3 Model

This directory contains code to run a single canonical model definition (the TorchTitan model definition) with the vLLM inference engine (not batch-invariant yet; work in progress). This work is under active development and currently supports inference only.

This work is inspired by https://github.com/vllm-project/vllm/pull/28685.

## Overview

The integration consists of two main components:

1. **Model Adapter** (`model/qwen3.py`): a custom model class that extends vLLM's `Qwen3ForCausalLM` to handle TorchTitan checkpoint naming conventions
2. **Inference Script** (`inference_example.py`): a simple script to register the model and run inference

## Quick Start

### Prerequisites

0. Create and activate an environment with uv:
```bash
uv venv --python 3.12 titan-rl
source titan-rl/bin/activate
```

1. Install Monarch:
```bash
uv pip install torchmonarch
```

2. Install PyTorch nightly (required by torchtitan) and pre-built vLLM wheels matching that nightly version:
```bash
# Install vllm with nightly torch
uv pip install torch vllm xformers --pre \
  --extra-index-url https://download.pytorch.org/whl/nightly/cu128 \
  --index-strategy unsafe-best-match
```

**NOTE:** The pre-built vLLM wheels target CUDA 12.8, though they should work with most older CUDA versions. Alternatively, you can install the corresponding vLLM pre-built wheel directly from https://download.pytorch.org/whl/nightly/cu128, for example: `uv pip install vllm-1.0.0.dev20260219+cu130-<suffix>.whl`. Ensure the build version number (e.g., `dev20260219`) matches your PyTorch nightly installation.

3. Install TorchTitan in editable mode:
```bash
uv pip install -e .
```

4. Download the `Qwen/Qwen3-0.6B` (or `Qwen/Qwen3-1.7B`) checkpoint from HuggingFace into the `torchtitan/experiments/rl/example_checkpoint` folder:
```bash
python scripts/download_hf_assets.py --repo_id Qwen/Qwen3-0.6B --local_dir torchtitan/experiments/rl/example_checkpoint --all --hf_token=...

python scripts/download_hf_assets.py --repo_id Qwen/Qwen3-1.7B --local_dir torchtitan/experiments/rl/example_checkpoint --all --hf_token=...
```

5. Run inference with the torchtitan model definition:
```bash
torchrun --nproc_per_node=2 torchtitan/experiments/rl/inference_example.py
```

**NOTE:** Set `--nproc_per_node` to the world size, which should match the `tensor_parallel_degree` in the `VLLMGenerator` config.

6. Run a simple GRPO RL loop that learns a digit-sum task:
```bash
python torchtitan/experiments/rl/simple_grpo_sum_digits.py --module rl --config rl_grpo_qwen3_0_6b
```

**NOTE:** If you downloaded the HF model to a different path than the one in step 4, specify it with `--hf_assets_path=<path_to_model_checkpoint>`.

We use a unified model definition from torchtitan for both the trainer and the generator, ensuring bitwise-identical models to address a class of subtle correctness bugs in RL for LLMs.

**Current status:** Batch invariance is only supported for single-GPU configurations (TP=1) for both the trainer and the generator. When tensor parallelism is enabled (TP > 1), batch-invariant mode is not yet supported.
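The Model Adapter's job, translating TorchTitan checkpoint parameter names into the names vLLM's `Qwen3ForCausalLM` expects, can be pictured with a toy remapper. The mapping entries below are illustrative assumptions, not the adapter's real table:

```python
# Toy sketch of a checkpoint-name adapter in the spirit of the Model Adapter
# described above: rename TorchTitan-style parameter keys to the names a vLLM
# model expects. The mapping entries are illustrative, not the real table.
def remap_torchtitan_keys(state_dict: dict) -> dict:
    renames = {
        "tok_embeddings.weight": "model.embed_tokens.weight",
        "output.weight": "lm_head.weight",
    }
    # Keys without an explicit rename pass through unchanged.
    return {renames.get(key, key): value for key, value in state_dict.items()}

ckpt = {"tok_embeddings.weight": "W_emb", "layers.0.attention.wq.weight": "W_q"}
remapped = remap_torchtitan_keys(ckpt)
```

Keeping this translation in one adapter class is what lets the rest of the stack load TorchTitan checkpoints without touching vLLM internals.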

torchtitan/experiments/rl/unified/__init__.py renamed to torchtitan/experiments/rl/__init__.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -8,16 +8,16 @@
 Unified approach for running TorchTitan models with vLLM inference.

 To register TorchTitan models with vLLM:
-    from torchtitan.experiments.rl.unified.plugin import register
+    from torchtitan.experiments.rl.plugin import register
     register(model_spec)
 """

-from torchtitan.experiments.rl.unified.models.vllm_wrapper import (
+from torchtitan.experiments.rl.models.vllm_wrapper import (
     TorchTitanVLLMModelWrapper,
 )

 # Export plugin register function for manual use (no auto-registration)
-from torchtitan.experiments.rl.unified.plugin import (
+from torchtitan.experiments.rl.plugin import (
     register_model_to_vllm_model_registry,
 )
```
torchtitan/experiments/rl/unified/actors/generator.py renamed to torchtitan/experiments/rl/actors/generator.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -14,11 +14,11 @@
 from torch.distributed.tensor import distribute_tensor, DTensor
 from torchtitan.config import Configurable
 from torchtitan.config.configs import ParallelismConfig
-from torchtitan.experiments.rl.unified.plugin import (
+from torchtitan.experiments.rl.plugin import (
     register_model_to_vllm_model_registry,
     VLLM_MODEL_NAME,
 )
-from torchtitan.experiments.rl.unified.types import Episode
+from torchtitan.experiments.rl.rl_types import Episode
 from torchtitan.protocols.model_spec import ModelSpec
 from vllm import EngineArgs, LLMEngine, SamplingParams
 from vllm.config import AttentionConfig, CompilationConfig
```

torchtitan/experiments/rl/unified/actors/grader.py renamed to torchtitan/experiments/rl/actors/grader.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -9,7 +9,7 @@
 import torch
 from monarch.actor import Actor, endpoint
-from torchtitan.experiments.rl.unified.types import Episode
+from torchtitan.experiments.rl.rl_types import Episode

 logger = logging.getLogger(__name__)
```

torchtitan/experiments/rl/unified/actors/trainer.py renamed to torchtitan/experiments/rl/actors/trainer.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -23,15 +23,15 @@
 from torchtitan.config import CommConfig, Configurable, TORCH_DTYPE_MAP
 from torchtitan.config.configs import ParallelismConfig, TrainingConfig
 from torchtitan.distributed import ParallelDims, utils as dist_utils
-from torchtitan.experiments.rl.unified.actors.utils import (
+from torchtitan.experiments.rl.actors.utils import (
     compute_policy_gradient_loss,
     compute_token_log_probs,
     verify_logprob_identity,
 )
-from torchtitan.experiments.rl.unified.models.attention import (
+from torchtitan.experiments.rl.models.attention import (
     replace_with_vllm_compatible_flash_attention,
 )
-from torchtitan.experiments.rl.unified.types import Episode
+from torchtitan.experiments.rl.rl_types import Episode
 from torchtitan.protocols.model_spec import ModelSpec
 from torchtitan.tools import utils
```

File renamed without changes.

torchtitan/experiments/rl/unified/config_registry.py renamed to torchtitan/experiments/rl/config_registry.py

Lines changed: 44 additions & 4 deletions

```diff
@@ -8,19 +8,19 @@
 Config entry points for the RL/unified experiment.

 Each function returns a complete ``RLTrainer.Config`` and is discoverable by
-``ConfigManager`` via ``--module rl.unified --config <function_name>``.
+``ConfigManager`` via ``--module rl --config <function_name>``.
 """

 from torchtitan.components.lr_scheduler import LRSchedulersContainer
 from torchtitan.components.optimizer import OptimizersContainer
 from torchtitan.config.configs import ParallelismConfig, TrainingConfig
-from torchtitan.experiments.rl.unified.actors.generator import (
+from torchtitan.experiments.rl.actors.generator import (
     GeneratorCompileConfig,
     SamplingConfig,
     VLLMGenerator,
 )
-from torchtitan.experiments.rl.unified.actors.trainer import PolicyTrainer
-from torchtitan.experiments.rl.unified.simple_grpo_sum_digits import RLTrainer
+from torchtitan.experiments.rl.actors.trainer import PolicyTrainer
+from torchtitan.experiments.rl.simple_grpo_sum_digits import RLTrainer
 from torchtitan.models.qwen3 import model_registry

@@ -102,6 +102,46 @@ def rl_grpo_qwen3_1_7b() -> RLTrainer.Config:
     )

+def rl_grpo_qwen3_0_6b_tp1() -> RLTrainer.Config:
+    """GRPO training config for Qwen3-0.6B with TP=1 (2 GPUs: 1 gen + 1 train)."""
+    return RLTrainer.Config(
+        model_spec=model_registry("0.6B"),
+        hf_assets_path="torchtitan/experiments/rl/example_checkpoint/Qwen3-0.6B",
+        num_steps=10,
+        batch_invariant_mode=True,
+        trainer=PolicyTrainer.Config(
+            optimizer=OptimizersContainer.Config(lr=2e-6),
+            lr_scheduler=LRSchedulersContainer.Config(
+                warmup_steps=2,
+                decay_type="linear",
+            ),
+            training=TrainingConfig(),
+            parallelism=ParallelismConfig(
+                tensor_parallel_degree=1,
+                data_parallel_replicate_degree=1,
+            ),
+        ),
+        generator=VLLMGenerator.Config(
+            model_dtype="bfloat16",
+            compile=GeneratorCompileConfig(
+                backend="eager",
+                cudagraph_mode="piecewise",
+            ),
+            parallelism=ParallelismConfig(
+                tensor_parallel_degree=1,
+                data_parallel_replicate_degree=1,
+            ),
+            num_samples_per_prompt=8,
+            sampling=SamplingConfig(
+                temperature=0.8,
+                top_p=0.95,
+                max_tokens=100,
+            ),
+            attention_backend="FLASH_ATTN",
+        ),
+    )
+
 def rl_grpo_qwen3_debug() -> RLTrainer.Config:
     """Debug config for quick iteration -- small model, few steps (2 GPUs: 1 gen + 1 train)."""
     return RLTrainer.Config(
```
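The docstring change in this file keeps the discovery contract: each zero-argument function in the registry module is a named config reachable via `--module rl --config <function_name>`. A minimal sketch of that lookup pattern (the real `ConfigManager` differs; the demo module and function names are illustrative):

```python
# Sketch of config discovery by module path + function name, as the registry
# docstring describes. Not ConfigManager's actual implementation.
import importlib
import sys
import types

def load_config(module_name: str, config_name: str):
    """Import the registry module and call the factory named config_name."""
    module = importlib.import_module(module_name)
    factory = getattr(module, config_name)
    return factory()

# Stand-in registry module (illustrative; the real one lives in torchtitan).
demo = types.ModuleType("demo_config_registry")
demo.rl_grpo_demo = lambda: {"tensor_parallel_degree": 1, "num_steps": 10}
sys.modules["demo_config_registry"] = demo

cfg = load_config("demo_config_registry", "rl_grpo_demo")
```

Because configs are plain functions rather than TOML files, adding `rl_grpo_qwen3_0_6b_tp1` above makes it immediately addressable from the CLI with no extra registration step.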

torchtitan/experiments/rl/unified/inference_example.py renamed to torchtitan/experiments/rl/inference_example.py

Lines changed: 4 additions & 4 deletions

```diff
@@ -12,7 +12,7 @@
 the vLLM engine and sampling parameters.

 Run: torchrun --nproc_per_node=2 \
-    torchtitan/experiments/rl/unified/infer.py
+    torchtitan/experiments/rl/inference_example.py
 """
 import os

@@ -21,7 +21,7 @@
 # See also https://docs.vllm.ai/en/v0.8.3/design/multiprocessing.html#python-multiprocessing
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

-from torchtitan.experiments.rl.unified.config_registry import rl_grpo_qwen3_0_6b
+from torchtitan.experiments.rl.config_registry import rl_grpo_qwen3_0_6b

 from vllm import EngineArgs, LLMEngine, SamplingParams
 from vllm.logger import init_logger

@@ -38,12 +38,12 @@ def generate():

 # Patch model_spec to use the RL-specific parallelize function.
 # TODO: Switch to canonical Qwen3 parallel plan
-from torchtitan.experiments.rl.unified.models.parallelize import parallelize_qwen3
+from torchtitan.experiments.rl.models.parallelize import parallelize_qwen3

 config.model_spec.parallelize_fn = parallelize_qwen3

 # Register TorchTitan model with vLLM before engine creation
-from torchtitan.experiments.rl.unified.plugin import (
+from torchtitan.experiments.rl.plugin import (
     register_model_to_vllm_model_registry,
     VLLM_MODEL_NAME,
 )
```

torchtitan/experiments/rl/unified/models/attention.py renamed to torchtitan/experiments/rl/models/attention.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -8,11 +8,11 @@
 import torch
 from torch.distributed.tensor import DTensor
-from torchtitan.experiments.rl.vllm_compat.models.attention import (
+from torchtitan.experiments.rl.models.vllm_compat_attention import (
     VLLMCompatibleFlashAttention,
 )
 from torchtitan.protocols.module import Module
-from vllm.model_executor.layers.attention import Attention
+from vllm.attention.layer import Attention

 logger = logging.getLogger(__name__)
```
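The hunk above pins `Attention` to its new vLLM location. If both old and new vLLM releases had to be supported, one option is a fallback-import helper; this is a sketch under the assumption that only the module path moved, and the commented-out vLLM usage mirrors the hunk but is untested:

```python
# Generic "try the new import path, fall back to the old one" helper.
import importlib

def import_first(*candidates: str):
    """Return the first attribute importable from "module:attr" candidates."""
    for path in candidates:
        module_name, _, attr = path.partition(":")
        try:
            return getattr(importlib.import_module(module_name), attr)
        except (ImportError, AttributeError):
            continue
    raise ImportError(f"none of {candidates} could be imported")

# Against vLLM this would mirror the hunk above (assumption, untested):
# Attention = import_first(
#     "vllm.attention.layer:Attention",
#     "vllm.model_executor.layers.attention:Attention",
# )
```

The commit instead pins the new path unconditionally, which is simpler when the supported vLLM version is controlled by the install instructions.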
