Incorporate feedback on reusing config

Lucaskabela · Lucaskabela · commit 520d314dbf16 · 2026-03-13T16:49:37.000-07:00
diff --git a/torchtitan/experiments/rl/unified/actors/trainer.py b/torchtitan/experiments/rl/unified/actors/trainer.py
@@ -21,7 +21,7 @@
 from torchtitan.components.lr_scheduler import LRSchedulersContainer
 from torchtitan.components.optimizer import OptimizersContainer
 from torchtitan.config import CommConfig, Configurable, TORCH_DTYPE_MAP
-from torchtitan.config.configs import ParallelismConfig, TrainingConfig
+from torchtitan.config.configs import CompileConfig, ParallelismConfig, TrainingConfig
 from torchtitan.distributed import ParallelDims, utils as dist_utils
 from torchtitan.experiments.rl.unified.actors.utils import (
     compute_policy_gradient_loss,
@@ -38,16 +38,6 @@
 logger = logging.getLogger(__name__)
 
 
-@dataclass(kw_only=True, slots=True)
-class TrainerCompileConfig:
-    """Compilation settings for the PolicyTrainer."""
-
-    enable: bool = False
-    """Enable per-layer torch.compile on the training model."""
-    backend: str = "eager"
-    """torch.compile backend (e.g. 'eager', 'aot_eager', 'inductor')."""
-
-
 class PolicyTrainer(Actor, Configurable):
     """
     Updates policy based on collected Episode using TorchTitan components.
@@ -74,7 +64,7 @@ class Config(Configurable.Config):
         parallelism: ParallelismConfig = field(default_factory=ParallelismConfig)
         comm: CommConfig = field(default_factory=CommConfig)
         """Communication configuration for distributed initialization."""
-        compile: TrainerCompileConfig = field(default_factory=TrainerCompileConfig)
+        compile: CompileConfig = field(default_factory=CompileConfig)
 
     def __init__(
         self,
@@ -120,8 +110,6 @@ def __init__(
             model_spec, config, device_type, batch_invariant_mode, hf_assets_path
         )
         model.train()
-        if config.compile.enable:
-            model = self._compile_model(model, config.compile.backend)
         self.model = model
         self.model_parts = [model]
 
@@ -225,6 +213,7 @@ def _build_model(
             model,
             parallel_dims=self.parallel_dims,
             parallelism=config.parallelism,
+            compile_config=config.compile,
         )
 
         model.to_empty(device=device_type)
@@ -236,20 +225,6 @@ def _build_model(
 
         return model
 
-    def _compile_model(self, model: torch.nn.Module, backend: str) -> torch.nn.Module:
-        """Compile each transformer layer with torch.compile.
-
-        Args:
-            model: The model whose layers will be compiled.
-            backend: torch.compile backend (e.g. 'eager', 'aot_eager', 'inductor').
-        """
-        for layer_id in model.layers:
-            model.layers[layer_id].compile(backend=backend, fullgraph=True)
-        logger.info(
-            f"Compiled {len(model.layers)} transformer layers with {backend} backend"
-        )
-        return model
-
     @endpoint
     async def get_weights(self) -> dict:
         """Get model weights for generator.
diff --git a/torchtitan/experiments/rl/unified/config_registry.py b/torchtitan/experiments/rl/unified/config_registry.py
@@ -13,16 +13,13 @@
 
 from torchtitan.components.lr_scheduler import LRSchedulersContainer
 from torchtitan.components.optimizer import OptimizersContainer
-from torchtitan.config.configs import ParallelismConfig, TrainingConfig
+from torchtitan.config.configs import CompileConfig, ParallelismConfig, TrainingConfig
 from torchtitan.experiments.rl.unified.actors.generator import (
     GeneratorCompileConfig,
     SamplingConfig,
     VLLMGenerator,
 )
-from torchtitan.experiments.rl.unified.actors.trainer import (
-    PolicyTrainer,
-    TrainerCompileConfig,
-)
+from torchtitan.experiments.rl.unified.actors.trainer import PolicyTrainer
 from torchtitan.experiments.rl.unified.simple_grpo_sum_digits import RLTrainer
 from torchtitan.models.qwen3 import model_registry
 
@@ -44,7 +41,7 @@ def rl_grpo_qwen3_0_6b() -> RLTrainer.Config:
             parallelism=ParallelismConfig(
                 tensor_parallel_degree=2,
             ),
-            compile=TrainerCompileConfig(enable=True, backend="aot_eager"),
+            compile=CompileConfig(enable=True, backend="aot_eager"),
         ),
         generator=VLLMGenerator.Config(
             model_dtype="bfloat16",
@@ -84,7 +81,7 @@ def rl_grpo_qwen3_1_7b() -> RLTrainer.Config:
             parallelism=ParallelismConfig(
                 tensor_parallel_degree=2,
             ),
-            compile=TrainerCompileConfig(enable=True, backend="aot_eager"),
+            compile=CompileConfig(enable=True, backend="aot_eager"),
         ),
         generator=VLLMGenerator.Config(
             model_dtype="bfloat16",
@@ -124,7 +121,7 @@ def rl_grpo_qwen3_debug() -> RLTrainer.Config:
                 tensor_parallel_degree=1,
                 data_parallel_replicate_degree=1,
             ),
-            compile=TrainerCompileConfig(enable=True, backend="aot_eager"),
+            compile=CompileConfig(enable=True, backend="aot_eager"),
         ),
         generator=VLLMGenerator.Config(
             compile=GeneratorCompileConfig(
diff --git a/torchtitan/experiments/rl/unified/models/parallelize.py b/torchtitan/experiments/rl/unified/models/parallelize.py
@@ -11,6 +11,7 @@
 
 import logging
 
+import torch
 import torch.nn as nn
 
 from torch.distributed.device_mesh import DeviceMesh
@@ -24,6 +25,7 @@
 )
 
 from torchtitan.config import ParallelismConfig
+from torchtitan.config.configs import CompileConfig
 from torchtitan.distributed import ParallelDims
 
 logger = logging.getLogger(__name__)
@@ -34,6 +36,7 @@ def parallelize_qwen3(
     *,
     parallel_dims: ParallelDims,
     parallelism: ParallelismConfig,
+    compile_config: CompileConfig | None = None,
     has_position_id: bool = False,
 ):
     """
@@ -44,6 +47,8 @@ def parallelize_qwen3(
     TODO: Change to core torchtitan's Qwen3 parallel plan when full DTensor is ready
 
     Args:
+        compile_config: If provided and enabled, applies per-layer torch.compile
+            after TP (matching the pattern in torchtitan/models/llama3/parallelize.py).
         has_position_id: Whether position IDs are passed as an explicit argument
             to the attention module. True for vLLM inference (generator),
             False for training (trainer).
@@ -60,9 +65,30 @@ def parallelize_qwen3(
             has_position_id=has_position_id,
         )
 
+    if (
+        compile_config is not None
+        and compile_config.enable
+        and "model" in compile_config.components
+    ):
+        apply_compile(model, compile_config)
+
     return model
 
 
+def apply_compile(model: nn.Module, compile_config: CompileConfig) -> None:
+    """Compile every child of ``model.layers`` with ``compile_config.backend``.
+
+    Mutates ``model`` in place, as in torchtitan/models/llama3/parallelize.py.
+    """
+    for layer_id, transformer_block in model.layers.named_children():
+        transformer_block = torch.compile(
+            transformer_block, backend=compile_config.backend, fullgraph=True
+        )
+        model.layers.register_module(layer_id, transformer_block)
+
+    logger.info("Compiling each TransformerBlock with torch.compile")
+
+
+
 def apply_non_moe_tp(
     model: nn.Module,
     tp_mesh: DeviceMesh,