Update README and remove rl_grpo_qwen3_0_6b_tp1 config

wwwjn · wwwjn · commit bf9ef7b027b7 · 2026-03-18T02:04:20.000-07:00
diff --git a/torchtitan/experiments/rl/README.md b/torchtitan/experiments/rl/README.md
@@ -1,14 +1,13 @@
-# Run vLLM inference with TorchTitan Qwen3 Model
+# RL Training with TorchTitan and vLLM
 
-This directory contains code to run a single canonical model definition (TorchTitan model definition) with vLLM inference engine (not batch-invariant yet, working in progress). This work is actively developing and only supports inference for now.
-
-This work is inspired by https://github.com/vllm-project/vllm/pull/28685.
+This directory contains code for RL training using TorchTitan model definitions with the vLLM inference engine for fast rollout generation.
 
 ## Overview
-The integration consists of two main components:
+The integration consists of the following components:
 
-1. **Model Adapter** (`model/qwen3.py`): A custom model class that extends vLLM's `Qwen3ForCausalLM` to handle TorchTitan checkpoint naming conventions
-2. **Inference Script** (`inference_example.py`): A simple script to register the model and run inference
+1. **vLLM Model Wrapper** (`models/vllm_wrapper.py`): Adapts TorchTitan models for vLLM's inference engine
+2. **RL Training Loop** (`simple_grpo_sum_digits.py`): GRPO-based RL training with Monarch actors
+3. **Inference Script** (`inference_example.py`): Standalone inference using the vLLM engine
 
 
 ## Quick Start
diff --git a/torchtitan/experiments/rl/__init__.py b/torchtitan/experiments/rl/__init__.py
@@ -12,14 +12,10 @@
     register(model_spec)
 """
 
-from torchtitan.experiments.rl.models.vllm_wrapper import (
-    TorchTitanVLLMModelWrapper,
-)
+from torchtitan.experiments.rl.models.vllm_wrapper import TorchTitanVLLMModelWrapper
 
 # Export plugin register function for manual use (no auto-registration)
-from torchtitan.experiments.rl.plugin import (
-    register_model_to_vllm_model_registry,
-)
+from torchtitan.experiments.rl.plugin import register_model_to_vllm_model_registry
 
 
 __all__ = [
diff --git a/torchtitan/experiments/rl/config_registry.py b/torchtitan/experiments/rl/config_registry.py
@@ -102,46 +102,6 @@ def rl_grpo_qwen3_1_7b() -> RLTrainer.Config:
     )
 
 
-def rl_grpo_qwen3_0_6b_tp1() -> RLTrainer.Config:
-    """GRPO training config for Qwen3-0.6B with TP=1 (2 GPUs: 1 gen + 1 train)."""
-    return RLTrainer.Config(
-        model_spec=model_registry("0.6B"),
-        hf_assets_path="torchtitan/experiments/rl/example_checkpoint/Qwen3-0.6B",
-        num_steps=10,
-        batch_invariant_mode=True,
-        trainer=PolicyTrainer.Config(
-            optimizer=OptimizersContainer.Config(lr=2e-6),
-            lr_scheduler=LRSchedulersContainer.Config(
-                warmup_steps=2,
-                decay_type="linear",
-            ),
-            training=TrainingConfig(),
-            parallelism=ParallelismConfig(
-                tensor_parallel_degree=1,
-                data_parallel_replicate_degree=1,
-            ),
-        ),
-        generator=VLLMGenerator.Config(
-            model_dtype="bfloat16",
-            compile=GeneratorCompileConfig(
-                backend="eager",
-                cudagraph_mode="piecewise",
-            ),
-            parallelism=ParallelismConfig(
-                tensor_parallel_degree=1,
-                data_parallel_replicate_degree=1,
-            ),
-            num_samples_per_prompt=8,
-            sampling=SamplingConfig(
-                temperature=0.8,
-                top_p=0.95,
-                max_tokens=100,
-            ),
-            attention_backend="FLASH_ATTN",
-        ),
-    )
-
-
 def rl_grpo_qwen3_debug() -> RLTrainer.Config:
     """Debug config for quick iteration -- small model, few steps (2 GPUs: 1 gen + 1 train)."""
     return RLTrainer.Config(
diff --git a/torchtitan/experiments/rl/inference_example.py b/torchtitan/experiments/rl/inference_example.py
@@ -21,11 +21,11 @@
 # See also https://docs.vllm.ai/en/v0.8.3/design/multiprocessing.html#python-multiprocessing
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
-from torchtitan.experiments.rl.config_registry import rl_grpo_qwen3_0_6b
-
 from vllm import EngineArgs, LLMEngine, SamplingParams
 from vllm.logger import init_logger
 
+from torchtitan.experiments.rl.config_registry import rl_grpo_qwen3_0_6b
+
 
 logger = init_logger(__name__)
 
diff --git a/torchtitan/experiments/rl/models/vllm_wrapper.py b/torchtitan/experiments/rl/models/vllm_wrapper.py
@@ -24,9 +24,7 @@
 
 from torchtitan.config import ParallelismConfig
 from torchtitan.distributed.parallel_dims import ParallelDims
-from torchtitan.experiments.rl.models.attention import (
-    replace_with_vllm_attention,
-)
+from torchtitan.experiments.rl.models.attention import replace_with_vllm_attention
 from torchtitan.protocols.model_spec import ModelSpec
 from torchtitan.protocols.module import Module
 from vllm.compilation.decorators import support_torch_compile
diff --git a/torchtitan/experiments/rl/plugin.py b/torchtitan/experiments/rl/plugin.py
@@ -30,12 +30,11 @@ def register_model_to_vllm_model_registry(
     Args:
         model_spec: TorchTitan ModelSpec containing model config and components
     """
-    from torchtitan.experiments.rl.models.vllm_wrapper import (
-        TorchTitanVLLMModelWrapper,
-    )
     from vllm.logger import init_logger
     from vllm.model_executor.models.registry import ModelRegistry
 
+    from torchtitan.experiments.rl.models.vllm_wrapper import TorchTitanVLLMModelWrapper
+
     logger = init_logger(__name__)
 
     # Create dynamic model class capturing ModelSpec in the closure
diff --git a/torchtitan/experiments/rl/simple_grpo_sum_digits.py b/torchtitan/experiments/rl/simple_grpo_sum_digits.py
@@ -34,13 +34,14 @@
 import torch
 from monarch.actor import this_host
 from monarch.spmd import setup_torch_elastic_env_async
+
 from torchtitan.config import Configurable
 from torchtitan.config.manager import ConfigManager
 from torchtitan.experiments.rl.actors.generator import VLLMGenerator
 from torchtitan.experiments.rl.actors.grader import Grader
 from torchtitan.experiments.rl.actors.trainer import PolicyTrainer
-from torchtitan.experiments.rl.sum_digits import extract_answer, SumDigitsTask
 from torchtitan.experiments.rl.rl_types import Episode
+from torchtitan.experiments.rl.sum_digits import extract_answer, SumDigitsTask
 from torchtitan.protocols.model_spec import ModelSpec
 
 logger = logging.getLogger(__name__)
@@ -140,9 +141,7 @@ def __init__(self, config: Config):
 
         # Patch model_spec to use the RL-specific parallelize function.
         # TODO: Switch to canonical Qwen3 parallel plan
-        from torchtitan.experiments.rl.models.parallelize import (
-            parallelize_qwen3,
-        )
+        from torchtitan.experiments.rl.models.parallelize import parallelize_qwen3
 
         config.model_spec.parallelize_fn = parallelize_qwen3
 
diff --git a/torchtitan/experiments/rl/types.py b/torchtitan/experiments/rl/types.py