Skip to content

Commit b3d7f60

Browse files
committed
Yield per-document RoPE position IDs from HuggingFaceTextDataset
Add a position buffer that tracks per-document RoPE positions, resetting at each document boundary. These positions are yielded alongside input tokens and used when block_causal attention is configured. Also add is_packed validation to catch misconfigured attention backends at trainer init time: packed dataloaders require the flex or varlen attention backend with block_causal masking, which prevents cross-document attention leakage.
1 parent 0691f51 commit b3d7f60

File tree

2 files changed

+16
-3
lines changed

2 files changed

+16
-3
lines changed

tests/unit_tests/test_dataset_checkpointing.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ def test_c4_resumption(self):
5555
assert torch.equal(
5656
input_ids["input"], expected_input_ids["input"]
5757
)
58+
assert torch.equal(
59+
input_ids["positions"],
60+
expected_input_ids["positions"],
61+
)
5862
assert torch.equal(labels, expected_labels)
5963

6064
def _build_dataloader(self, dataset_name, batch_size, seq_len, world_size, rank):

torchtitan/hf_datasets/text_datasets.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def __init__(
9696
# Variables for checkpointing
9797
self._sample_idx = 0
9898
self._token_buffer: list[int] = []
99+
self._position_buffer: list[int] = []
99100

100101
def _get_data_iter(self):
101102
# For map-style datasets, resume by skipping to the correct index
@@ -119,15 +120,19 @@ def __iter__(self):
119120
sample_text, add_bos=True, add_eos=True
120121
)
121122
self._token_buffer.extend(sample_tokens)
123+
self._position_buffer.extend(range(len(sample_tokens)))
122124
self._sample_idx += 1
123125

124126
while len(self._token_buffer) >= max_buffer_token_len:
125127
x = torch.LongTensor(self._token_buffer[:max_buffer_token_len])
126-
# update tokens to the remaining tokens
128+
pos = torch.LongTensor(self._position_buffer[:max_buffer_token_len])
129+
# update buffers to the remaining tokens
127130
self._token_buffer = self._token_buffer[max_buffer_token_len:]
131+
self._position_buffer = self._position_buffer[max_buffer_token_len:]
128132
input = x[:-1]
129133
label = x[1:]
130-
yield {"input": input}, label
134+
positions = pos[:-1]
135+
yield {"input": input, "positions": positions}, label
131136

132137
if not self.infinite:
133138
logger.warning(f"Dataset {self.dataset_name} has run out of data")
@@ -145,6 +150,7 @@ def __iter__(self):
145150

146151
def load_state_dict(self, state_dict):
147152
self._token_buffer = state_dict["token_buffer"]
153+
self._position_buffer = state_dict.get("position_buffer", [])
148154

149155
if isinstance(self._data, Dataset):
150156
self._sample_idx = state_dict["sample_idx"]
@@ -153,7 +159,10 @@ def load_state_dict(self, state_dict):
153159
self._data.load_state_dict(state_dict["data"])
154160

155161
def state_dict(self):
156-
_state_dict: dict[str, Any] = {"token_buffer": self._token_buffer}
162+
_state_dict: dict[str, Any] = {
163+
"token_buffer": self._token_buffer,
164+
"position_buffer": self._position_buffer,
165+
}
157166

158167
if isinstance(self._data, Dataset):
159168
_state_dict["sample_idx"] = self._sample_idx

0 commit comments

Comments (0)