
Commit b4bbb2c (1 parent: e51a77d)

Small fixes to inference_example.py and README (#2467)

2 files changed: +14, -6 lines

torchtitan/experiments/rl/unified/README.md

Lines changed: 4 additions & 3 deletions
@@ -8,7 +8,7 @@ This work is inspired by https://github.com/vllm-project/vllm/pull/28685.
 The integration consists of two main components:
 
 1. **Model Adapter** (`model/qwen3.py`): A custom model class that extends vLLM's `Qwen3ForCausalLM` to handle TorchTitan checkpoint naming conventions
-2. **Inference Script** (`infer.py`): A simple script to register the model and run inference
+2. **Inference Script** (`inference_example.py`): A simple script to register the model and run inference
 
 
 ## Quick Start
@@ -49,10 +49,11 @@ python scripts/download_hf_assets.py --repo_id Qwen/Qwen3-0.6B --local_dir torch
 
 5. Run inference with unified model definition:
 ```bash
-torchrun --nproc_per_node=<world_size> \
-    torchtitan/experiments/rl/unified/inference_example.py
+torchrun --nproc_per_node=2 torchtitan/experiments/rl/unified/inference_example.py
 ```
 
+**NOTE:** Set `--nproc_per_node` to the world size, which should match the `tensor_parallel_degree` in the `VLLMGenerator` config.
+
 6. Run simple GRPO RL loop
 ```bash
 python torchtitan/experiments/rl/unified/simple_grpo.py --module rl.unified --config rl_grpo_qwen3_0_6b --hf_assets_path=<path_to_model_checkpoint>
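The NOTE in the hunk above can also be enforced at startup. A minimal sketch (a hypothetical helper, not part of this commit), assuming torchrun's `WORLD_SIZE` environment variable:

```python
import os


def check_world_size(tensor_parallel_degree: int) -> None:
    """Fail fast if torchrun's world size disagrees with the TP degree.

    torchrun sets WORLD_SIZE from --nproc_per_node; the VLLMGenerator
    config's tensor_parallel_degree must match it.
    """
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    if world_size != tensor_parallel_degree:
        raise ValueError(
            f"WORLD_SIZE={world_size} does not match "
            f"tensor_parallel_degree={tensor_parallel_degree}"
        )
```

Calling this once before engine construction turns a silent sharding mismatch into an immediate, descriptive error.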

torchtitan/experiments/rl/unified/inference_example.py

Lines changed: 10 additions & 3 deletions
@@ -36,6 +36,12 @@ def generate():
     gen_config = config.generator
     model_path = config.trainer.hf_assets_path
 
+    # Patch model_spec to use the RL-specific parallelize function.
+    # TODO: Switch to canonical Qwen3 parallel plan
+    from torchtitan.experiments.rl.unified.models.parallelize import parallelize_qwen3
+
+    config.model_spec.parallelize_fn = parallelize_qwen3
+
     # Register TorchTitan model with vLLM before engine creation
     from torchtitan.experiments.rl.unified.plugin import (
         register_model_to_vllm_model_registry,
@@ -52,7 +58,7 @@ def generate():
     )
 
     # Create EngineArgs from config
-    engine_args = EngineArgs(
+    engine_kwargs = dict(
         # Model configuration
         model=model_path,
         trust_remote_code=True,
@@ -65,11 +71,12 @@ def generate():
         # Memory and performance
         gpu_memory_utilization=gen_config.gpu_memory_limit,
         enforce_eager=gen_config.enforce_eager,
-        # Seed
-        seed=gen_config.seed,
         # HuggingFace overrides
         hf_overrides={"architectures": [VLLM_MODEL_NAME]},
     )
+    if gen_config.seed is not None:
+        engine_kwargs["seed"] = gen_config.seed
+    engine_args = EngineArgs(**engine_kwargs)
 
     logger.debug("Initializing LLMEngine from EngineArgs...")
     engine = LLMEngine.from_engine_args(engine_args)
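The seed change above follows a general pattern: collect the arguments in a plain dict and add optional keys only when they are actually set, so the library's own default applies otherwise. A minimal standalone sketch (the helper name and parameters are illustrative, not the repo's API):

```python
def build_engine_kwargs(model_path, gpu_memory_limit, enforce_eager, seed=None):
    """Assemble EngineArgs-style kwargs, omitting unset optional keys."""
    kwargs = dict(
        model=model_path,
        trust_remote_code=True,
        gpu_memory_utilization=gpu_memory_limit,
        enforce_eager=enforce_eager,
    )
    # Only forward seed when the config sets one, so the engine keeps
    # its own default behavior (no fixed seed) otherwise.
    if seed is not None:
        kwargs["seed"] = seed
    return kwargs
```

Passing `seed=None` explicitly to some constructors can override an internal default, which is why the key is omitted rather than forwarded as-is.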
