Skip to content

Commit b0e8401

Browse files
committed
Incorporate feedback on reusing config
1 parent 9143615 commit b0e8401

File tree

6 files changed

+61
-327
lines changed

6 files changed

+61
-327
lines changed

torchtitan/experiments/rl/actors/generator.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -318,14 +318,12 @@ async def pull_model_state_dict(self, version: int) -> None:
318318
Args:
319319
version: New policy version number.
320320
"""
321-
from monarch.rdma import is_rdma_available
322-
323321
model_sd = self._get_model().model.state_dict()
324322
await ts.get_state_dict(
325323
"model_state_dict",
326324
user_state_dict=model_sd,
327325
strict=False,
328-
direct_rdma=is_rdma_available(),
326+
direct_rdma=False,
329327
)
330328
self.policy_version = version
331329
logger.debug(

torchtitan/experiments/rl/actors/trainer.py

Lines changed: 9 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from torchtitan.components.lr_scheduler import LRSchedulersContainer
2222
from torchtitan.components.optimizer import OptimizersContainer
2323
from torchtitan.config import CommConfig, Configurable, TORCH_DTYPE_MAP
24-
from torchtitan.config.configs import ParallelismConfig, TrainingConfig
24+
from torchtitan.config.configs import CompileConfig, ParallelismConfig, TrainingConfig
2525
from torchtitan.distributed import ParallelDims, utils as dist_utils
2626
from torchtitan.experiments.rl.actors.utils import (
2727
compute_policy_gradient_loss,
@@ -38,16 +38,6 @@
3838
logger = logging.getLogger(__name__)
3939

4040

41-
@dataclass(kw_only=True, slots=True)
42-
class TrainerCompileConfig:
43-
"""Compilation settings for the PolicyTrainer."""
44-
45-
enable: bool = False
46-
"""Enable per-layer torch.compile on the training model."""
47-
backend: str = "eager"
48-
"""torch.compile backend (e.g. 'eager', 'aot_eager', 'inductor')."""
49-
50-
5141
class PolicyTrainer(Actor, Configurable):
5242
"""
5343
Updates policy based on collected Episode using TorchTitan components.
@@ -74,7 +64,7 @@ class Config(Configurable.Config):
7464
parallelism: ParallelismConfig = field(default_factory=ParallelismConfig)
7565
comm: CommConfig = field(default_factory=CommConfig)
7666
"""Communication configuration for distributed initialization."""
77-
compile: TrainerCompileConfig = field(default_factory=TrainerCompileConfig)
67+
compile: CompileConfig = field(default_factory=CompileConfig)
7868

7969
def __init__(
8070
self,
@@ -85,6 +75,11 @@ def __init__(
8575
hf_assets_path: str = "",
8676
transfer_dtype: str = "",
8777
):
78+
# Silence noisy torchstore per-tensor transport logs in actor subprocess
79+
logging.getLogger("torchstore.transport.shared_memory").setLevel(
80+
logging.WARNING
81+
)
82+
8883
self.config = config
8984
self.model_spec = model_spec
9085
# Only cast if transfer dtype differs from training dtype, otherwise
@@ -126,8 +121,6 @@ def __init__(
126121
model_spec, config, device_type, batch_invariant_mode, hf_assets_path
127122
)
128123
model.train()
129-
if config.compile.enable:
130-
model = self._compile_model(model, config.compile.backend)
131124
self.model = model
132125
self.model_parts = [model]
133126

@@ -231,6 +224,7 @@ def _build_model(
231224
model,
232225
parallel_dims=self.parallel_dims,
233226
parallelism=config.parallelism,
227+
compile_config=config.compile,
234228
)
235229

236230
model.to_empty(device=device_type)
@@ -242,20 +236,6 @@ def _build_model(
242236

243237
return model
244238

245-
def _compile_model(self, model: torch.nn.Module, backend: str) -> torch.nn.Module:
246-
"""Compile each transformer layer with torch.compile.
247-
248-
Args:
249-
model: The model whose layers will be compiled.
250-
backend: torch.compile backend (e.g. 'eager', 'aot_eager', 'inductor').
251-
"""
252-
for layer_id in model.layers:
253-
model.layers[layer_id].compile(backend=backend, fullgraph=True)
254-
logger.info(
255-
f"Compiled {len(model.layers)} transformer layers with {backend} backend"
256-
)
257-
return model
258-
259239
@endpoint
260240
async def push_model_state_dict(self) -> None:
261241
"""Publish model weights for generator consumption via TorchStore.
@@ -271,12 +251,10 @@ async def push_model_state_dict(self) -> None:
271251
means "skip StorageVolumes and let the destination read directly
272252
from the source's GPU memory".
273253
"""
274-
from monarch.rdma import is_rdma_available
275-
276254
await ts.put_state_dict(
277255
self.model.state_dict(),
278256
"model_state_dict",
279-
direct_rdma=is_rdma_available(),
257+
direct_rdma=False,
280258
transfer_dtype=self._transfer_dtype,
281259
)
282260

torchtitan/experiments/rl/config_registry.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,13 @@
1313

1414
from torchtitan.components.lr_scheduler import LRSchedulersContainer
1515
from torchtitan.components.optimizer import OptimizersContainer
16-
from torchtitan.config.configs import ParallelismConfig, TrainingConfig
16+
from torchtitan.config.configs import CompileConfig, ParallelismConfig, TrainingConfig
1717
from torchtitan.experiments.rl.actors.generator import (
1818
GeneratorCompileConfig,
1919
SamplingConfig,
2020
VLLMGenerator,
2121
)
22-
from torchtitan.experiments.rl.actors.trainer import (
23-
PolicyTrainer,
24-
TrainerCompileConfig,
25-
)
22+
from torchtitan.experiments.rl.actors.trainer import PolicyTrainer
2623
from torchtitan.experiments.rl.simple_grpo_sum_digits import RLTrainer
2724
from torchtitan.models.qwen3 import model_registry
2825

@@ -44,7 +41,7 @@ def rl_grpo_qwen3_0_6b() -> RLTrainer.Config:
4441
parallelism=ParallelismConfig(
4542
tensor_parallel_degree=2,
4643
),
47-
compile=TrainerCompileConfig(enable=True, backend="aot_eager"),
44+
compile=CompileConfig(enable=True, backend="aot_eager"),
4845
),
4946
generator=VLLMGenerator.Config(
5047
model_dtype="bfloat16",
@@ -84,7 +81,7 @@ def rl_grpo_qwen3_1_7b() -> RLTrainer.Config:
8481
parallelism=ParallelismConfig(
8582
tensor_parallel_degree=2,
8683
),
87-
compile=TrainerCompileConfig(enable=True, backend="aot_eager"),
84+
compile=CompileConfig(enable=True, backend="aot_eager"),
8885
),
8986
generator=VLLMGenerator.Config(
9087
model_dtype="bfloat16",
@@ -124,7 +121,7 @@ def rl_grpo_qwen3_debug() -> RLTrainer.Config:
124121
tensor_parallel_degree=1,
125122
data_parallel_replicate_degree=1,
126123
),
127-
compile=TrainerCompileConfig(enable=True, backend="aot_eager"),
124+
compile=CompileConfig(enable=True, backend="aot_eager"),
128125
),
129126
generator=VLLMGenerator.Config(
130127
compile=GeneratorCompileConfig(

torchtitan/experiments/rl/models/parallelize.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
)
2525

2626
from torchtitan.config import ParallelismConfig
27+
from torchtitan.config.configs import CompileConfig
2728
from torchtitan.distributed import ParallelDims
2829

2930
logger = logging.getLogger(__name__)
@@ -34,6 +35,7 @@ def parallelize_qwen3(
3435
*,
3536
parallel_dims: ParallelDims,
3637
parallelism: ParallelismConfig,
38+
compile_config: CompileConfig | None = None,
3739
has_position_id: bool = False,
3840
):
3941
"""
@@ -44,6 +46,8 @@ def parallelize_qwen3(
4446
TODO: Change to core torchtitan's Qwen3 parallel plan when full DTensor is ready
4547
4648
Args:
49+
compile_config: If provided and enabled, applies per-layer torch.compile
50+
after TP (matching the pattern in torchtitan/models/llama3/parallelize.py).
4751
has_position_id: Whether position IDs are passed as an explicit argument
4852
to the attention module. True for vLLM inference (generator),
4953
False for training (trainer).
@@ -60,9 +64,31 @@ def parallelize_qwen3(
6064
has_position_id=has_position_id,
6165
)
6266

67+
if (
68+
compile_config is not None
69+
and compile_config.enable
70+
and "model" in compile_config.components
71+
):
72+
apply_compile(model, compile_config)
73+
6374
return model
6475

6576

77+
def apply_compile(model: nn.Module, compile_config: CompileConfig):
78+
"""Apply torch.compile to each TransformerBlock.
79+
80+
Follows the same pattern as torchtitan/models/llama3/parallelize.py.
81+
"""
82+
# NOTE: we MUST use `.compile()` instead of `torch.compile()` here, because
83+
# of compatibility with weight naming between this model and the vLLM definition.
84+
# `.compile()` modifies the module in-place and returns None, so we must
85+
# NOT reassign or re-register the module.
86+
for transformer_block in model.layers.values():
87+
transformer_block.compile(backend=compile_config.backend, fullgraph=True)
88+
89+
logger.info("Compiling each TransformerBlock with torch.compile")
90+
91+
6692
def apply_non_moe_tp(
6793
model: nn.Module,
6894
tp_mesh: DeviceMesh,

torchtitan/experiments/rl/models/vllm_compat_attention.py

Lines changed: 20 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -12,17 +12,6 @@
1212
from torch.distributed._tensor import DTensor
1313

1414
from torchtitan.protocols.module import Module
15-
from vllm.v1.attention.backends.fa_utils import flash_attn_varlen_func
16-
17-
18-
# ---------------------------------------------------------------------------
19-
# Custom op wrapping vLLM's flash-attention varlen forward.
20-
#
21-
# Registering as a ``torch.library.custom_op`` with a fake implementation
22-
# and explicit autograd lets AOT Autograd trace through the op with
23-
# FakeTensors (required by the compiler_toolkit's joint-graph export path)
24-
# and correctly capture the backward graph.
25-
# ---------------------------------------------------------------------------
2615

2716

2817
@torch.library.custom_op("rl::flash_attn_varlen_fwd", mutates_args=())
@@ -34,7 +23,6 @@ def _flash_attn_varlen_fwd(
3423
seq_len: int,
3524
scale: float,
3625
num_splits: int,
37-
enable_gqa: bool,
3826
) -> torch.Tensor:
3927
from vllm.v1.attention.backends.fa_utils import (
4028
flash_attn_varlen_func as _flash_fn,
@@ -69,38 +57,37 @@ def _flash_attn_varlen_fwd_fake(
6957
seq_len: int,
7058
scale: float,
7159
num_splits: int,
72-
enable_gqa: bool,
7360
) -> torch.Tensor:
74-
# Output shape matches Q: (total_tokens, num_heads, head_dim)
7561
return torch.empty(
7662
(q.shape[0], q.shape[1], q.shape[2]), dtype=q.dtype, device=q.device
7763
)
7864

7965

80-
class FlashAttnVarlenFunction(torch.autograd.Function):
81-
"""autograd.Function wrapping the vLLM flash-attention custom op.
82-
83-
The forward calls the ``rl::flash_attn_varlen_fwd`` custom op (which
84-
has a registered fake implementation for torch.compile tracing).
85-
The backward is a manual PyTorch attention recompute.
86-
"""
87-
88-
@staticmethod
89-
def forward(q, k, v, cu_seqlens, seq_len, scale, num_splits, enable_gqa):
90-
return _flash_attn_varlen_fwd(
91-
q, k, v, cu_seqlens, seq_len, scale, num_splits, enable_gqa
92-
)
93-
66+
class FlashAttnWithBackward(torch.autograd.Function):
9467
@staticmethod
95-
def setup_context(ctx, inputs, output):
96-
q, k, v, cu_seqlens, seq_len, scale, num_splits, enable_gqa = inputs
68+
def forward(
69+
ctx: torch.autograd.function.FunctionCtx,
70+
q: torch.Tensor,
71+
k: torch.Tensor,
72+
v: torch.Tensor,
73+
cu_seqlens: torch.Tensor,
74+
seq_len: int,
75+
scale: float,
76+
num_splits: int,
77+
enable_gqa: bool,
78+
) -> torch.Tensor:
79+
output = _flash_attn_varlen_fwd(q, k, v, cu_seqlens, seq_len, scale, num_splits)
80+
# Save for backward
9781
ctx.save_for_backward(q, k, v, output)
9882
ctx.scale = scale
9983
ctx.seq_len = seq_len
10084
ctx.enable_gqa = enable_gqa
85+
return output
10186

10287
@staticmethod
103-
def backward(ctx, grad_output):
88+
def backward(
89+
ctx: torch.autograd.function.FunctionCtx, grad_output: torch.Tensor
90+
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, None, None, None, None, None,]:
10491
q, k, v, output = ctx.saved_tensors
10592
scale = ctx.scale
10693
seq_len = ctx.seq_len
@@ -207,12 +194,9 @@ class VLLMCompatibleFlashAttention(Module):
207194

208195
def __init__(self) -> None:
209196
super().__init__()
210-
self.flash_attn_varlen_func = flash_attn_varlen_func
211197
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
212-
from vllm.v1.attention.backends.fa_utils import get_flash_attn_version
213198

214199
self.vllm_is_batch_invariant = vllm_is_batch_invariant
215-
self.fa_version = get_flash_attn_version()
216200

217201
def forward(
218202
self,
@@ -267,8 +251,8 @@ def forward(
267251
if scale is None:
268252
scale = 1.0 / math.sqrt(q.size(-1))
269253

270-
# Call flash attention via autograd.Function (which wraps the custom op)
271-
output_varlen = FlashAttnVarlenFunction.apply(
254+
# Call Flash Attention varlen with custom backward
255+
output_varlen = FlashAttnWithBackward.apply(
272256
q_varlen,
273257
k_varlen,
274258
v_varlen,

0 commit comments

Comments (0)