fix: wrap positions as DTensor in RoPE and warn on missing position_buffer

joecummings · joecummings · commit 13c1d030d8fc · 2026-03-18T08:40:29.000-07:00
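When TP uses use_local_output=False (DeepSeek V3, Qwen3, GPT-OSS),
freqs_cis becomes a DTensor(Replicate) but positions remains a plain
tensor. Because torch.gather requires both operands to be the same type,
this mismatch causes a runtime error. Fix by wrapping positions via
DTensor.from_local(), reusing the device mesh and placements of
freqs_cis, at the public API boundary of the apply_rotary_emb_*
functions (apply_rotary_emb_complex, apply_rotary_emb_single_complex,
apply_rotary_emb_cos_sin).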

Also add a logger.warning when loading a checkpoint that is missing the
position_buffer key in the dataset state dict, to help users debug
incorrect RoPE positions when resuming from older checkpoints.
diff --git a/torchtitan/hf_datasets/text_datasets.py b/torchtitan/hf_datasets/text_datasets.py
@@ -158,6 +158,14 @@ def __iter__(self):
 
     def load_state_dict(self, state_dict):
         self._token_buffer = state_dict["token_buffer"]
+        if "position_buffer" not in state_dict:
+            logger.warning(
+                "Checkpoint missing 'position_buffer' key in dataset state. "
+                "Falling back to empty position buffer. This is expected when "
+                "resuming from a checkpoint saved before position tracking was "
+                "added, but may cause incorrect RoPE positions with "
+                "block_causal attention (document packing)."
+            )
         self._position_buffer = state_dict.get("position_buffer", [])
 
         if isinstance(self._data, Dataset):
diff --git a/torchtitan/models/common/rope.py b/torchtitan/models/common/rope.py
@@ -9,6 +9,7 @@
 from typing import Literal
 
 import torch
+from torch.distributed.tensor import DTensor, Replicate
 
 from torchtitan.protocols.module import Module
 
@@ -289,6 +290,35 @@ def _rotate_half(x: torch.Tensor) -> torch.Tensor:
     return torch.cat((-x2, x1), dim=-1)
 
 
+def _maybe_wrap_positions(
+    positions: torch.Tensor | None,
+    freqs_cis: torch.Tensor,
+) -> torch.Tensor | None:
+    """Return positions, wrapped as a DTensor when freqs_cis is one.
+
+    When TP uses use_local_output=False (DeepSeek V3, Qwen3, GPT-OSS),
+    freqs_cis is a DTensor (Replicate) but positions is a plain tensor, and
+    the downstream torch.gather needs both operands to be the same type.
+    positions is returned as-is if it is None or already a DTensor, or if
+    freqs_cis is a plain tensor. No grad_placements: int64 indices have no grad.
+    """
+    if (
+        positions is not None
+        and isinstance(freqs_cis, DTensor)
+        and not isinstance(positions, DTensor)
+    ):
+        assert all(
+            isinstance(p, Replicate) for p in freqs_cis.placements
+        ), f"Expected Replicate placements on freqs_cis, got {freqs_cis.placements}"
+        positions = DTensor.from_local(
+            positions,
+            freqs_cis.device_mesh,
+            freqs_cis.placements,
+            run_check=False,  # skip the cross-rank metadata sanity check
+        )
+    return positions
+
+
 # TODO: consolidate apply_rotary_emb_complex and apply_rotary_emb_single_complex
 def apply_rotary_emb_complex(
     xq: torch.Tensor,
@@ -304,6 +334,7 @@ def apply_rotary_emb_complex(
         freqs_cis: (max_seqlen, head_dim // 2) complex
         positions: optional position indices
     """
+    positions = _maybe_wrap_positions(positions, freqs_cis)
     xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
     xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
     freqs_cis = _reshape_for_broadcast_complex(freqs_cis, xq_, positions)
@@ -324,6 +355,7 @@ def apply_rotary_emb_single_complex(
         freqs_cis: (max_seqlen, head_dim // 2) complex
         positions: optional position indices
     """
+    positions = _maybe_wrap_positions(positions, freqs_cis)
     dtype = x.dtype
     x = torch.view_as_complex(x.float().view(*x.shape[:-1], -1, 2))
     freqs_cis = _reshape_for_broadcast_complex(freqs_cis, x, positions)
@@ -345,6 +377,7 @@ def apply_rotary_emb_cos_sin(
         rope_cache: (max_seqlen, head_dim * 2) with cos and sin concatenated
         positions: optional position indices
     """
+    positions = _maybe_wrap_positions(positions, rope_cache)
     head_dim = xq.shape[-1]
     rope_cache = _reshape_for_broadcast_cos_sin(rope_cache, xq, positions)
     cos = rope_cache[..., :head_dim].to(device=xq.device)