Commit d2433d5

fix: correct misleading TODO comments about positions guard
The comments blamed DTensor+FSDP for the positions guard, but the actual issue is an out-of-bounds RoPE cache index: per-document position IDs from packed datasets can exceed max_seq_len (e.g. 6545 vs cache size 2048). The guard is also semantically correct — causal attention treats the packed sequence as one document, so sequential positions via the None path are what we want.
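The out-of-bounds failure mode described above can be illustrated in a few lines. This is a hypothetical sketch, not torchtitan code: the numbers (6545, 2048) come from the commit message, and the variable names are made up for illustration. A long document split across packed sequences keeps its original per-document position IDs, so a chunk can carry IDs past the end of a RoPE cache sized to max_seq_len, while sequential positions (the positions=None path) always stay in range.

```python
max_seq_len = 2048   # RoPE cache covers positions 0..2047
seq_len = 2048       # length of one packed training sequence

# Hypothetical: a 6545-token document split across packed sequences keeps
# its original per-document positions, so one chunk may carry IDs 4096..6143.
doc_positions = list(range(4096, 4096 + seq_len))

# Sequential positions (positions=None path) restart at 0 per sequence.
seq_positions = list(range(seq_len))

assert max(doc_positions) >= max_seq_len   # would index past the RoPE cache
assert max(seq_positions) < max_seq_len    # always in range
```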
1 parent: 1d27557

File tree

2 files changed: +12 −8 lines


torchtitan/components/validate.py

Lines changed: 6 additions & 4 deletions

@@ -187,10 +187,12 @@ def post_dataloading_process(
         # TODO: deduplicate with Trainer.post_dataloading_process which has
         # the same logic; extract a shared function to prevent further drift.
-        # TODO: remove this guard once RoPE handles DTensor+positions.
-        # The positions!=None path in RoPE uses torch.gather which fails
-        # with DTensor+FSDP. For now, only pass positions through when
-        # using flex/varlen + block_causal (where it's needed and works).
+        # Per-document position IDs are only needed for block_causal
+        # attention, where each packed document gets its own RoPE reset.
+        # For causal attention the whole sequence is one document, so
+        # sequential positions (the positions=None path) are correct.
+        # Passing them through would also OOB the RoPE cache, since
+        # individual document lengths can exceed max_seq_len.
         model_config = getattr(model_parts[0], "config", None)
         layer = getattr(model_config, "layer", None)
         attn_config = getattr(layer, "attention", None) if layer else None

torchtitan/trainer.py

Lines changed: 6 additions & 4 deletions

@@ -591,10 +591,12 @@ def post_dataloading_process(
         # extra_kwargs are.
         extra_kwargs: dict[str, Any] = {}

-        # TODO: remove this guard once RoPE handles DTensor+positions.
-        # The positions!=None path in RoPE uses torch.gather which fails
-        # with DTensor+FSDP. For now, only pass positions through when
-        # using flex/varlen + block_causal (where it's needed and works).
+        # Per-document position IDs are only needed for block_causal
+        # attention, where each packed document gets its own RoPE reset.
+        # For causal attention the whole sequence is one document, so
+        # sequential positions (the positions=None path) are correct.
+        # Passing them through would also OOB the RoPE cache, since
+        # individual document lengths can exceed max_seq_len.
         layer = getattr(self.model_config, "layer", None)
         attn_config = getattr(layer, "attention", None) if layer else None
         attn_mask_type = getattr(attn_config, "attn_mask_type", "causal")
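The guard that both hunks document can be sketched as a standalone function. This is a minimal illustration, not the actual torchtitan implementation: the `attn_mask_type` name and `"block_causal"` value come from the diff context above, while `maybe_positions` itself is a hypothetical helper invented for this sketch.

```python
def maybe_positions(attn_mask_type: str, positions):
    # Per-document position IDs matter only for block_causal attention,
    # where RoPE must reset at each packed-document boundary.
    if attn_mask_type == "block_causal":
        return positions
    # Causal attention treats the packed sequence as one document, so
    # returning None selects sequential RoPE positions (always in-cache).
    return None
```

For example, `maybe_positions("causal", [0, 1, 2])` returns `None`, while `maybe_positions("block_causal", [0, 1, 2])` passes the positions through unchanged.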
