Skip to content

Commit 4e6888b

Browse files
committed
fix: wrap per-document RoPE positions at seq_len to prevent OOB gather
Documents longer than seq_len produce position IDs that exceed the RoPE cache size, causing an index-out-of-bounds error in torch.gather during apply_rotary_emb. Wrap positions with modulo seq_len in the dataloader, which effectively chunks long documents for RoPE purposes while preserving all tokens for training. Also update comments to clarify: per-document positions are dropped for causal attention (whole sequence is one document), and kept for block_causal to match inference frameworks (e.g. vLLM) that reset positions to 0 per request.
1 parent d2433d5 commit 4e6888b

File tree

3 files changed, +13 −13 lines changed

torchtitan/components/validate.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -187,12 +187,8 @@ def post_dataloading_process(
187187

188188
# TODO: deduplicate with Trainer.post_dataloading_process which has
189189
# the same logic; extract a shared function to prevent further drift.
190-
# Per-document position IDs are only needed for block_causal
191-
# attention, where each packed document gets its own RoPE reset.
192-
# For causal attention the whole sequence is one document, so
193-
# sequential positions (the positions=None path) are correct.
194-
# Passing them through would also OOB the RoPE cache, since
195-
# individual document lengths can exceed max_seq_len.
190+
# For causal attention the whole packed sequence is one document,
191+
# so sequential RoPE positions (positions=None) are correct.
196192
model_config = getattr(model_parts[0], "config", None)
197193
layer = getattr(model_config, "layer", None)
198194
attn_config = getattr(layer, "attention", None) if layer else None

torchtitan/hf_datasets/text_datasets.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,15 @@ def __iter__(self):
120120
sample_text, add_bos=True, add_eos=True
121121
)
122122
self._token_buffer.extend(sample_tokens)
123-
self._position_buffer.extend(range(len(sample_tokens)))
123+
# Per-document positions reset at document boundaries,
124+
# matching inference frameworks (e.g. vLLM) that start
125+
# positions at 0 per request. Positions wrap at seq_len
126+
# to stay within the RoPE cache, effectively chunking
127+
# long documents into seq_len-sized segments.
128+
# TODO: make overflow policy configurable (chunk / truncate / drop).
129+
self._position_buffer.extend(
130+
i % self.seq_len for i in range(len(sample_tokens))
131+
)
124132
self._sample_idx += 1
125133

126134
while len(self._token_buffer) >= max_buffer_token_len:

torchtitan/trainer.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -591,12 +591,8 @@ def post_dataloading_process(
591591
# extra_kwargs are.
592592
extra_kwargs: dict[str, Any] = {}
593593

594-
# Per-document position IDs are only needed for block_causal
595-
# attention, where each packed document gets its own RoPE reset.
596-
# For causal attention the whole sequence is one document, so
597-
# sequential positions (the positions=None path) are correct.
598-
# Passing them through would also OOB the RoPE cache, since
599-
# individual document lengths can exceed max_seq_len.
594+
# For causal attention the whole packed sequence is one document,
595+
# so sequential RoPE positions (positions=None) are correct.
600596
layer = getattr(self.model_config, "layer", None)
601597
attn_config = getattr(layer, "attention", None) if layer else None
602598
attn_mask_type = getattr(attn_config, "attn_mask_type", "causal")

0 commit comments

Comments (0)