@@ -33,6 +33,7 @@ def tokenize_dataset(
3333 max_seq_length : int ,
3434 seed : int ,
3535 dataset_kwargs : dict | None = None ,
36+ pad_seq_to_mult : int | None = 1 ,
3637):
3738 """
3839 Tokenizes a dataset from the provided path using the specified tokenizer
@@ -45,6 +46,8 @@ def tokenize_dataset(
4546 seed (int): Random seed for shuffling the dataset.
4647 dataset_kwargs (dict | None): Additional keyword arguments to pass to create_sft_dataset.
4748 Can include 'chat', 'use_hf_tokenizer_chat_template', 'tool_schemas', etc.
49+ pad_seq_to_mult (int | None): Optional multiple to pad each sequence to during packing
50+ preparation (e.g., set to 2 * context_parallel_size for THD CP).
4851
4952 Returns:
5053 np.ndarray: A NumPy array containing the tokenized data.
@@ -66,15 +69,56 @@ def tokenize_dataset(
6669 if hasattr (tokenizer , "_tokenizer" ):
6770 tokenizer ._tokenizer .chat_template = chat_template
6871
72+ if pad_seq_to_mult is not None and pad_seq_to_mult <= 0 :
73+ raise ValueError ("pad_seq_to_mult must be a positive integer when provided." )
74+
 75+ # Treat None as 1 (i.e. no extra padding); values > 1 enable padding to that multiple.
76+ pad_seq_length_to_mult = 1 if pad_seq_to_mult is None else max (1 , pad_seq_to_mult )
77+
6978 dataset = create_sft_dataset (
7079 path = path ,
7180 tokenizer = tokenizer ,
7281 seq_length = max_seq_length ,
7382 seed = seed ,
7483 is_test = True ,
84+ pad_seq_length_to_mult = pad_seq_length_to_mult ,
7585 ** dataset_kwargs ,
7686 )
77- return np .array ([dataset [i ] for i in range (len (dataset ))])
87+
88+ pad_id = dataset .tokenizer .eod
89+ pad_seq_length_to_mult = dataset .pad_seq_length_to_mult
90+ max_seq_length = dataset .max_seq_length
91+ dataset = np .array ([dataset [i ] for i in range (len (dataset ))])
92+
 93+ if pad_seq_length_to_mult > 1 :
94+
95+ def pre_pad_dataset (data , max_seq_length , max_length_to_pad , pad_id ):
96+ """
97+ Pad each individual data point to the length of max_length_to_pad.
98+ This keeps packed samples divisible by the requested multiple (used for CP/THD).
99+ """
100+ assert max_seq_length >= max_length_to_pad
101+ for key , val in data .items ():
102+ if key in {"input_ids" , "context_ids" }:
103+ if len (val ) <= max_length_to_pad :
104+ # input_ids are truncated by 1 for labels; add 1 extra pad token
105+ val = val + [pad_id ] * (max_length_to_pad - len (val ) + 1 )
106+ elif len (val ) > max_seq_length :
107+ logging .info (
108+ "Sequence length %d is larger than max_seq_length %d; truncating for packing." ,
109+ len (val ),
110+ max_seq_length ,
111+ )
112+ val = val [:max_seq_length ]
113+ data [key ] = val
114+ return
115+
116+ ceil_to_nearest = lambda n , m : (n + m - 1 ) // m * m
117+ for data in dataset :
118+ max_length_to_pad = min (max_seq_length , ceil_to_nearest (len (data ["input_ids" ]), pad_seq_length_to_mult ))
119+ pre_pad_dataset (data , max_seq_length , max_length_to_pad , pad_id )
120+
121+ return dataset
78122
79123
80124def prepare_packed_sequence_data (
@@ -87,6 +131,7 @@ def prepare_packed_sequence_data(
87131 seed : int | None = 0 ,
88132 packing_algorithm : str = "first_fit_shuffle" ,
89133 dataset_kwargs : dict | None = None ,
134+ pad_seq_to_mult : int | None = 1 ,
90135):
91136 """
92137 Prepares a packed sequence dataset from a given input file and saves it to an output file.
@@ -103,12 +148,21 @@ def prepare_packed_sequence_data(
103148 currently supports "first_fit_shuffle" and "first_fit_decreasing".
104149 dataset_kwargs (dict | None): Additional keyword arguments to pass to create_sft_dataset.
105150 Enables packing with chat templates, tool schemas, etc.
151+ pad_seq_to_mult (int | None): Optional multiple to pad each sequence to during packing
152+ preparation (e.g., set to 2 * context_parallel_size for THD CP).
106153
107154 Returns:
108155 None: Saves the packed sequence data to the specified output path.
109156 """
110157 logger .info (f"Preparing packed sequence from { input_path } " )
111- dataset = tokenize_dataset (input_path , tokenizer , max_seq_length , seed , dataset_kwargs )
158+ dataset = tokenize_dataset (
159+ input_path ,
160+ tokenizer ,
161+ max_seq_length ,
162+ seed ,
163+ dataset_kwargs ,
164+ pad_seq_to_mult = pad_seq_to_mult ,
165+ )
112166 sequences , histogram = create_hist (dataset , max_seq_length )
113167
114168 assignments , packing_metadata = create_packing_strategy (histogram , packed_sequence_size , packing_algorithm )
@@ -185,6 +239,11 @@ class PackedSequenceSpecs:
185239 """
186240 If True, pad cu_seqlens to a constant size, which is required for use with cudagraphs.
187241 """
242+ pad_seq_to_mult : int | None = 1
243+ """
244+ Optional multiple to pad each sample to when generating packed datasets.
245+ For THD/context parallel, set to (context_parallel_size * 2) to keep samples divisible.
246+ """
188247
189248 def __post_init__ (self ):
190249 if self .packed_train_data_path is not None :
@@ -212,3 +271,6 @@ def __post_init__(self):
212271 assert self .packed_val_data_path .exists (), (
213272 f"packed validation data file does not exist: { self .packed_val_data_path } "
214273 )
274+
275+ if self .pad_seq_to_mult is not None and self .pad_seq_to_mult <= 0 :
276+ raise ValueError ("pad_seq_to_mult must be a positive integer when provided." )
0 commit comments