Skip to content

Commit 0b0f7d7

Browse files
committed
Yield per-document RoPE position IDs from HuggingFaceTextDataset
Add a position buffer that tracks per-document RoPE positions, resetting at each document boundary. These positions are yielded alongside input tokens and used when block_causal attention is configured. Also add is_packed validation to catch misconfigured attention backends at trainer init time: packed dataloaders require flex or varlen with block_causal to prevent cross-document attention leakage.
1 parent 0691f51 commit 0b0f7d7

File tree

4 files changed: +47 additions, −7 deletions

tests/unit_tests/test_dataset_checkpointing.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ def test_c4_resumption(self):
5555
assert torch.equal(
5656
input_ids["input"], expected_input_ids["input"]
5757
)
58+
assert torch.equal(
59+
input_ids["positions"],
60+
expected_input_ids["positions"],
61+
)
5862
assert torch.equal(labels, expected_labels)
5963

6064
def _build_dataloader(self, dataset_name, batch_size, seq_len, world_size, rank):

torchtitan/components/dataloader.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ class Config(Configurable.Config):
4747
dataset: str = ""
4848
dataset_path: str | None = None
4949

50+
@property
51+
def is_packed(self) -> bool:
52+
"""Whether the underlying dataset packs multiple documents per sequence."""
53+
return getattr(self.dataset, "is_packed", False)
54+
5055
@abstractmethod
5156
def __iter__(self) -> Iterator[tuple[dict[str, torch.Tensor], torch.Tensor]]:
5257
...

torchtitan/hf_datasets/text_datasets.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ def _validate_dataset(
6868

6969

7070
class HuggingFaceTextDataset(IterableDataset, Stateful):
71+
is_packed: bool = True
72+
7173
def __init__(
7274
self,
7375
dataset_name: str,
@@ -96,6 +98,7 @@ def __init__(
9698
# Variables for checkpointing
9799
self._sample_idx = 0
98100
self._token_buffer: list[int] = []
101+
self._position_buffer: list[int] = []
99102

100103
def _get_data_iter(self):
101104
# For map-style datasets, resume by skipping to the correct index
@@ -119,15 +122,19 @@ def __iter__(self):
119122
sample_text, add_bos=True, add_eos=True
120123
)
121124
self._token_buffer.extend(sample_tokens)
125+
self._position_buffer.extend(range(len(sample_tokens)))
122126
self._sample_idx += 1
123127

124128
while len(self._token_buffer) >= max_buffer_token_len:
125129
x = torch.LongTensor(self._token_buffer[:max_buffer_token_len])
126-
# update tokens to the remaining tokens
130+
pos = torch.LongTensor(self._position_buffer[:max_buffer_token_len])
131+
# update buffers to the remaining tokens
127132
self._token_buffer = self._token_buffer[max_buffer_token_len:]
133+
self._position_buffer = self._position_buffer[max_buffer_token_len:]
128134
input = x[:-1]
129135
label = x[1:]
130-
yield {"input": input}, label
136+
positions = pos[:-1]
137+
yield {"input": input, "positions": positions}, label
131138

132139
if not self.infinite:
133140
logger.warning(f"Dataset {self.dataset_name} has run out of data")
@@ -145,6 +152,7 @@ def __iter__(self):
145152

146153
def load_state_dict(self, state_dict):
147154
self._token_buffer = state_dict["token_buffer"]
155+
self._position_buffer = state_dict.get("position_buffer", [])
148156

149157
if isinstance(self._data, Dataset):
150158
self._sample_idx = state_dict["sample_idx"]
@@ -153,7 +161,10 @@ def load_state_dict(self, state_dict):
153161
self._data.load_state_dict(state_dict["data"])
154162

155163
def state_dict(self):
156-
_state_dict: dict[str, Any] = {"token_buffer": self._token_buffer}
164+
_state_dict: dict[str, Any] = {
165+
"token_buffer": self._token_buffer,
166+
"position_buffer": self._position_buffer,
167+
}
157168

158169
if isinstance(self._data, Dataset):
159170
_state_dict["sample_idx"] = self._sample_idx
@@ -168,8 +179,10 @@ def state_dict(self):
168179
class HuggingFaceTextDataLoader(ParallelAwareDataloader):
169180
"""Configurable text dataloader that wraps HuggingFaceTextDataset.
170181
171-
This dataloader can be used for both training and validation by
172-
configuring the appropriate dataset, seq_len, batch_size, etc.
182+
This dataloader packs multiple documents into each sequence by
183+
concatenating tokenized documents into a continuous stream and
184+
slicing fixed-size chunks. Use with block_causal attention to
185+
prevent cross-document attention leakage.
173186
"""
174187

175188
@dataclass(kw_only=True, slots=True)

torchtitan/trainer.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,20 @@ def __init__(self, config: Config):
255255
)
256256
self.model_config = model_config
257257

258+
# Validate that packed dataloaders use block_causal attention
259+
if self.dataloader.is_packed:
260+
attn_config = model_config.layer.attention
261+
if (
262+
attn_config.attn_backend == "sdpa"
263+
or attn_config.attn_mask_type != "block_causal"
264+
):
265+
raise ValueError(
266+
"Packed dataloader requires attn_backend='flex' or 'varlen' "
267+
"with attn_mask_type='block_causal' for document isolation. "
268+
f"Got attn_backend='{attn_config.attn_backend}', "
269+
f"attn_mask_type='{attn_config.attn_mask_type}'."
270+
)
271+
258272
logger.info(
259273
f"Building {model_spec.name} {model_spec.flavor} "
260274
f"with {json.dumps(model_config.to_dict(), indent=2, ensure_ascii=False)}"
@@ -597,8 +611,7 @@ def post_dataloading_process(
597611
extra_kwargs: dict[str, Any] = {}
598612

599613
# TODO: improve the logic on obtaining attention masks
600-
layer = getattr(self.model_config, "layer", None)
601-
attn_config = getattr(layer, "attention", None) if layer else None
614+
attn_config = self._get_attn_config()
602615
attn_backend = getattr(attn_config, "attn_backend", "sdpa")
603616
if attn_backend in ["flex", "varlen"]:
604617
assert (
@@ -851,6 +864,11 @@ def train(self):
851864

852865
logger.info("Training completed")
853866

867+
def _get_attn_config(self):
868+
"""Extract attention config from model config, or None if not available."""
869+
layer = getattr(self.model_config, "layer", None)
870+
return getattr(layer, "attention", None) if layer else None
871+
854872
def should_continue_training(self) -> bool:
855873
return self.step < self.config.training.steps
856874

Comments (0)

There are no comments on this commit.