Commit 7bc29c9
fix: wrap per-document RoPE positions at seq_len to prevent OOB gather
Documents longer than seq_len produce position IDs that exceed the RoPE cache size, causing an index-out-of-bounds error in torch.gather during apply_rotary_emb. Wrap positions with modulo seq_len in the dataloader, which effectively chunks long documents for RoPE purposes while preserving all tokens for training.

Also update comments to clarify: per-document positions are dropped for causal attention (the whole sequence is one document) and kept for block_causal to match inference frameworks (e.g. vLLM) that reset positions to 0 per request.
1 parent 782a2a6 commit 7bc29c9
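The failure mode described above can be sketched without torch: a precomputed RoPE cache holds one entry per position in [0, seq_len), so any unwrapped per-document position at or beyond seq_len indexes past the end. A minimal Python sketch; the list-based cache stand-in is illustrative, not the actual torchtitan code, where the cache is a tensor indexed via torch.gather.

```python
# Stand-in for a precomputed RoPE cache: one entry per position in
# [0, seq_len). In the real code this is a tensor gathered by
# position ID inside apply_rotary_emb.
seq_len = 8
rope_cache = [f"freqs[{p}]" for p in range(seq_len)]

doc_len = 10  # a single document longer than seq_len

unwrapped = list(range(doc_len))                 # 0..9: positions 8 and 9 are OOB
wrapped = [i % seq_len for i in range(doc_len)]  # 0..7, 0, 1: always in range

try:
    _ = [rope_cache[p] for p in unwrapped]
    overflowed = False
except IndexError:
    overflowed = True

print(overflowed)                         # True: unwrapped positions overflow
print(all(p < seq_len for p in wrapped))  # True: wrapped positions fit
```

The modulo wrap trades positional fidelity past seq_len for safety: tokens beyond the cache boundary restart at position 0, which is the same chunking behavior an inference server applies when a request is split.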

File tree

3 files changed: +13 -15 lines changed

torchtitan/components/validate.py
Lines changed: 2 additions & 8 deletions

@@ -185,14 +185,8 @@ def post_dataloading_process(
         # extra_kwargs are.
         extra_kwargs: dict[str, Any] = {}

-        # TODO: deduplicate with Trainer.post_dataloading_process which has
-        # the same logic; extract a shared function to prevent further drift.
-        # Per-document position IDs are only needed for block_causal
-        # attention, where each packed document gets its own RoPE reset.
-        # For causal attention the whole sequence is one document, so
-        # sequential positions (the positions=None path) are correct.
-        # Passing them through would also OOB the RoPE cache, since
-        # individual document lengths can exceed max_seq_len.
+        # For causal attention the whole packed sequence is one document,
+        # so sequential RoPE positions (positions=None) are correct.
         model_config = getattr(model_parts[0], "config", None)
         layer = getattr(model_config, "layer", None)
         attn_config = getattr(layer, "attention", None) if layer else None

torchtitan/hf_datasets/text_datasets.py
Lines changed: 9 additions & 1 deletion

@@ -120,7 +120,15 @@ def __iter__(self):
                 sample_text, add_bos=True, add_eos=True
             )
             self._token_buffer.extend(sample_tokens)
-            self._position_buffer.extend(range(len(sample_tokens)))
+            # Per-document positions reset at document boundaries,
+            # matching inference frameworks (e.g. vLLM) that start
+            # positions at 0 per request. Positions wrap at seq_len
+            # to stay within the RoPE cache, effectively chunking
+            # long documents into seq_len-sized segments.
+            # TODO: make overflow policy configurable (chunk / truncate / drop).
+            self._position_buffer.extend(
+                i % self.seq_len for i in range(len(sample_tokens))
+            )
             self._sample_idx += 1

             while len(self._token_buffer) >= max_buffer_token_len:
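The buffer logic above can be summarized as a standalone helper. This is a hypothetical sketch (the function name and list-of-lengths interface are not from the repo); it shows the two properties the diff establishes: positions reset to 0 at each document boundary, and wrap at seq_len so they never index past the RoPE cache.

```python
def positions_for_documents(doc_lengths: list[int], seq_len: int) -> list[int]:
    """Yield one position ID per token across packed documents.

    Each document restarts at position 0; the modulo chunks any
    document longer than seq_len into seq_len-sized segments so all
    positions stay inside the RoPE cache.
    """
    positions: list[int] = []
    for n in doc_lengths:
        positions.extend(i % seq_len for i in range(n))
    return positions

# A 5-token document followed by a 7-token document, with seq_len=4:
print(positions_for_documents([5, 7], seq_len=4))
# -> [0, 1, 2, 3, 0, 0, 1, 2, 3, 0, 1, 2]
```

Note the second 0 in the output marks both the wrap inside the first document (token 5) and the start of the second document; the downstream attention mask, not the positions alone, distinguishes document boundaries.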

torchtitan/trainer.py
Lines changed: 2 additions & 6 deletions

@@ -596,12 +596,8 @@ def post_dataloading_process(
         # extra_kwargs are.
         extra_kwargs: dict[str, Any] = {}

-        # Per-document position IDs are only needed for block_causal
-        # attention, where each packed document gets its own RoPE reset.
-        # For causal attention the whole sequence is one document, so
-        # sequential positions (the positions=None path) are correct.
-        # Passing them through would also OOB the RoPE cache, since
-        # individual document lengths can exceed max_seq_len.
+        # For causal attention the whole packed sequence is one document,
+        # so sequential RoPE positions (positions=None) are correct.
         layer = getattr(self.model_config, "layer", None)
         attn_config = getattr(layer, "attention", None) if layer else None
         attn_mask_type = getattr(attn_config, "attn_mask_type", "causal")
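The getattr chain in this hunk decides whether per-document positions are forwarded at all. A hypothetical sketch of that gating logic; the config container classes and the helper name are illustrative (not torchtitan's actual classes), but the defensive attribute walk mirrors the diff: any missing config level falls back to "causal", which takes the positions=None path.

```python
class AttnConfig:
    def __init__(self, attn_mask_type: str):
        self.attn_mask_type = attn_mask_type

class Layer:
    def __init__(self, attention: AttnConfig):
        self.attention = attention

class ModelConfig:
    def __init__(self, layer=None):
        self.layer = layer

def should_pass_positions(model_config) -> bool:
    # Mirror the diff: walk config defensively so a missing layer or
    # attention config defaults to plain causal attention.
    layer = getattr(model_config, "layer", None)
    attn_config = getattr(layer, "attention", None) if layer else None
    attn_mask_type = getattr(attn_config, "attn_mask_type", "causal")
    # Only block_causal needs per-document RoPE resets.
    return attn_mask_type == "block_causal"

print(should_pass_positions(ModelConfig(Layer(AttnConfig("block_causal")))))  # True
print(should_pass_positions(ModelConfig()))                                   # False
```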
