Commit 72978cf

Update on "[rl] Add CI for numerics test against vllm native inference"
Test cases:

1. Integration tests (run on A10G, the default CI GPU type):
   - single GPU, no compile + cudagraph
   - multiple GPU (with TP), no compile + cudagraph
   - multiple GPU, with compile + cudagraph
2. Numerics parity test: vLLM native model vs. vLLM + TorchTitan wrapper (runs on H100, using the FA3 attention kernel):
   - test_weights_match: max_diff <= 1e-5 (exact weight loading)
   - test_attention_module: atol=1e-5 (TP=1)
   - test_end_to_end_logits: atol=1e-3 (TP=1)
   - The numerics tests run only for TP=1, on the assumption that both TorchTitan and vLLM keep their multi-GPU implementations on par with single GPU. More numerics tests under parallelism can be added if needed.

[ghstack-poisoned]
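The tolerance checks above can be sketched as a small helper; this is an illustrative stand-in (the names `max_abs_diff`, `check_parity`, and the constants are assumptions), not the actual test code in the repository:

```python
# Hypothetical sketch of the parity thresholds described above. The real
# tests compare vLLM-native outputs against the TorchTitan-wrapped model;
# here plain float lists stand in for the tensors.

def max_abs_diff(a: list[float], b: list[float]) -> float:
    """Elementwise max |a - b| over two equal-length sequences."""
    return max(abs(x - y) for x, y in zip(a, b))

# Tolerances from the commit message (constant names are invented here).
WEIGHT_TOL = 1e-5      # test_weights_match: exact weight loading
ATTENTION_ATOL = 1e-5  # test_attention_module, TP=1
LOGITS_ATOL = 1e-3     # test_end_to_end_logits, TP=1

def check_parity(native: list[float], wrapped: list[float], atol: float) -> bool:
    """True when every element of `wrapped` is within `atol` of `native`."""
    return max_abs_diff(native, wrapped) <= atol
```

Note the graded tolerances: weights must match almost exactly, while end-to-end logits get a looser `1e-3` bound because small per-layer differences accumulate through the forward pass.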
2 parents 9d63e20 + e97d04c commit 72978cf

File tree

2 files changed: +12 additions, -2 deletions


torchtitan/experiments/rl/actors/generator.py

Lines changed: 8 additions & 0 deletions

@@ -145,6 +145,12 @@ class Config(Configurable.Config):
     num_samples_per_prompt: int = 8
     """Number of completions to generate per prompt."""
 
+    max_model_len: int | None = None
+    """Maximum context length for vLLM's KV cache allocation. vLLM
+    pre-allocates paged KV cache blocks up to this length; None lets
+    vLLM use the model's max_position_embeddings (e.g. 40960 for
+    Qwen3-0.6B)"""
+
     seed: int | None = None
     """Random seed for vLLM engine and sampling. None for non-deterministic."""
 
@@ -211,6 +217,8 @@ def __init__(
         vllm_compilation_config = config.compile.get_vllm_compilation_config()
         if vllm_compilation_config is not None:
             engine_kwargs["compilation_config"] = vllm_compilation_config
+        if config.max_model_len is not None:
+            engine_kwargs["max_model_len"] = config.max_model_len
         if config.seed is not None:
             engine_kwargs["seed"] = config.seed
         engine_args = EngineArgs(**engine_kwargs)

torchtitan/experiments/rl/tests/integration_tests.py

Lines changed: 4 additions & 2 deletions

@@ -36,11 +36,12 @@ def build_rl_test_list() -> list[OverrideDefinitions]:
             "--config rl_grpo_qwen3_0_6b",
             "--trainer.parallelism.tensor_parallel_degree 2",
             "--generator.parallelism.tensor_parallel_degree 2",
+            "--generator.max_model_len 2048",
             "--generator.compile.backend none",
             "--generator.compile.cudagraph_mode none",
         ],
     ],
-    "RL GRPO TP=2 no compile (debug model)",
+    "RL GRPO TP=2 no compile",
     "rl_grpo_tp2_no_compile",
     ngpu=4,
 ),
@@ -51,9 +52,10 @@
             "--config rl_grpo_qwen3_0_6b",
             "--trainer.parallelism.tensor_parallel_degree 2",
             "--generator.parallelism.tensor_parallel_degree 2",
+            "--generator.max_model_len 2048",
         ],
     ],
-    "RL GRPO TP=2 compile (debug model)",
+    "RL GRPO TP=2 compile",
     "rl_grpo_tp2_compile",
     ngpu=4,
 ),
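The `--generator.max_model_len 2048` override matters on A10G because vLLM pre-allocates the paged KV cache up to `max_model_len`. A back-of-envelope sizing sketch shows why capping the context helps; the model dimensions below are assumed values for Qwen3-0.6B and may not match the actual config:

```python
# Rough per-sequence KV-cache sizing (ignores paging granularity and
# overheads). Dimensions are assumptions for Qwen3-0.6B, not verified:
# 28 layers, 8 KV heads, head_dim 128, bf16 (2 bytes).

def kv_cache_bytes(seq_len: int, num_layers: int = 28, num_kv_heads: int = 8,
                   head_dim: int = 128, dtype_bytes: int = 2) -> int:
    # 2x for keys and values, per token, per layer.
    return 2 * num_layers * num_kv_heads * head_dim * dtype_bytes * seq_len

full = kv_cache_bytes(40960)    # model's max_position_embeddings
capped = kv_cache_bytes(2048)   # CI override: --generator.max_model_len 2048
```

The cache scales linearly with `seq_len`, so capping at 2048 cuts the per-sequence budget by 40960 / 2048 = 20x, which is what makes the integration tests fit on the default CI GPUs.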
