
Commit 4dc4bbb

resolve comments
1 parent 1424f90 commit 4dc4bbb

File tree

6 files changed: 211 additions & 113 deletions


tests/data/test_dynamic_batching_dataset.py

Lines changed: 72 additions & 33 deletions
@@ -1,21 +1,35 @@
 """Tests for DynamicBatchingSizeDataset functionality.
 
-This module tests the DynamicBatchingSizeDataset class using DummyIterableDataset
-and DummyMappingDataset. It validates that DynamicBatchingSizeDataset can properly:
+This module tests the ``DynamicBatchingSizeDataset`` class using ``DummyIterableDataset``.
+It validates that ``DynamicBatchingSizeDataset`` can properly:
 
-1. Batch samples based on token count (micro_batch_seq_length)
-2. Handle buffer management with ready_for_micro_batch_threshold
-3. Work with both shuffled and non-shuffled iterable datasets
-4. Support state_dict save/load for checkpointing in distributed environments
+1. Batch samples based on token count (``micro_batch_seq_length``).
+2. Handle buffer management with ``ready_for_micro_batch_threshold``.
+3. Work with both shuffled and non-shuffled iterable datasets.
+4. Drain remaining buffer contents after the upstream dataset is exhausted.
+5. Reject invalid construction arguments (``save_by_idx`` without ``get_item``).
+6. Save and restore buffer state for exact checkpoint / resume in distributed
+   environments, both by storing full samples and by storing only indices.
 
 The test suite includes:
-- Unit tests that can run without distributed setup:
-  - test_dynamic_batching_basic
-- End-to-end tests that require multi-GPU distributed environments:
-  - test_dynamic_batching_dataset_shuffled
-  - test_dynamic_batching_dataset_no_shuffle
+
+Unit tests (run without distributed setup, CPU-compatible):
+- ``test_dynamic_batching_basic`` – core batching logic and expected batch
+  contents for shuffled and non-shuffled data.
+- ``test_force_long_sequence`` – overlong samples are emitted rather than
+  dropped when ``force_generate_long_sequence=True``.
+- ``test_last_batch_on_dataset_end`` – remaining buffer items are yielded
+  after upstream exhaustion.
+- ``test_dynamic_batching_without_get_item`` – ``ValueError`` is raised when
+  ``save_by_idx=True`` but the dataset lacks ``get_item``.
+
+End-to-end distributed tests (require ``torchrun`` with 2 processes):
+- ``test_dynamic_batching_dataset_distributed`` – parametrised over
+  ``shuffle × save_by_idx`` (4 combinations), verifying that resumed
+  batches are byte-for-byte identical to the original run.
 """
 
+import argparse
 import os
 import subprocess
 import sys
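
For readers skimming the diff, a minimal sketch of the buffering behaviour the new docstring lists. The names mirror the docstring, but the code below is illustrative only, not the actual DynamicBatchingSizeDataset implementation: samples accumulate in a buffer until the threshold is reached, front items are then packed up to the token budget, and the buffer is drained once the upstream dataset ends.

def _pack_one(buffer, max_tokens):
    # Pop samples from the front of the buffer until the token budget is reached.
    batch, used = [], 0
    while buffer and used + len(buffer[0]["input_ids"]) <= max_tokens:
        item = buffer.pop(0)
        batch.append(item)
        used += len(item["input_ids"])
    if not batch and buffer:
        # A single overlong sample: emit it on its own instead of stalling.
        batch = [buffer.pop(0)]
    return batch

def iter_dynamic_batches(samples, micro_batch_seq_length=16, ready_for_micro_batch_threshold=4):
    buffer = []
    for sample in samples:
        buffer.append(sample)
        if len(buffer) >= ready_for_micro_batch_threshold:
            yield _pack_one(buffer, micro_batch_seq_length)
    while buffer:
        # Drain whatever is left once the upstream dataset is exhausted (point 4 above).
        yield _pack_one(buffer, micro_batch_seq_length)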
@@ -279,7 +293,7 @@ def test_last_batch_on_dataset_end(setup_dynamic_batching_dataset):
 
 
 def test_dynamic_batching_without_get_item():
-    """Test DynamicBatchingSizeDataset initialization without get_item povided.
+    """Test DynamicBatchingSizeDataset initialization without get_item provided.
 
     Tests that DynamicBatchingSizeDataset cannot be initialized with save_by_idx=True
     when the dataset doesn't have get_item method.
@@ -316,7 +330,7 @@ def __iter__(self):
 def test_dynamic_batching_dataset_distributed(shuffle, save_by_idx):
     """Test DynamicBatchingSizeDataset in distributed setting.
 
-    Runs main_distributed_test() by torchrun with or without data shuffling
+    Runs _main_distributed_test() by torchrun with or without data shuffling
     and with or without save_by_idx for checkpoint buffer saving.
 
     Args:
@@ -358,16 +372,18 @@ def build_command(shuffle=True, save_by_idx=True):
         "--data.train_size=2000",
         "--data.max_seq_len=16",
         "--train.micro_batch_size=2",
-        f"--data.shuffle={str(shuffle).lower()}",
+        # NOTE: Do not rely on veomni_patch adding `data.shuffle` into DataArguments.
+        # Keep this as a test-only flag (parsed via argparse in _run_distributed_test).
+        f"--shuffle={str(shuffle).lower()}",
         "--train.global_batch_size=16",
         "--train.data_parallel_mode=ddp",
         "--train.ckpt_manager=dcp",
         "--train.output_dir=.tests/cache",
         "--train.rmpad=false",
         "--train.rmpad_with_pos_ids=true",
         "--train.dyn_bsz=true",
-        "--train.dyn_bsz_in_worker_loop=false",
-        f"--train.dyn_bsz_dataset_save_by_idx={str(save_by_idx).lower()}",
+        "--dyn_bsz_in_dataloader=false",
+        f"--save_by_idx={str(save_by_idx).lower()}",
         "--train.seed=42",
     ]
     return command
@@ -389,20 +405,47 @@ class Arguments:
     train: "TrainingArguments" = field(default_factory=TrainingArguments)
 
 
-def main_distributed_test():
-    """
-    Tests:
-    - Dynamic batching with shuffled iterable dataset
-    - Checkpoint save/load with buffer state
-    - Multi-process distributed training
+def _main_distributed_test():
+    """Entry point for the distributed test launched by ``torchrun``.
+
+    It wraps ``_run_distributed_test()`` and is expected to be triggered by
+    ``test_dynamic_batching_dataset_distributed()``.
     """
     # Patch empty_cache to avoid AttributeError on CPU
     with patch("veomni.utils.device.empty_cache", _mock_empty_cache):
         _run_distributed_test()
 
 
 def _run_distributed_test():
-    """Internal function that runs the actual distributed test."""
+    """Run a full checkpoint-resume cycle and assert batch reproducibility.
+
+    Procedure
+    ---------
+    1. **Parse CLI flags**
+    2. **Initialise torch distributed state**
+    3. **Build a StatefulDataLoader** wrapping ``DummyIterableDataset`` →
+       ``DynamicBatchingSizeDataset`` with ``num_workers=2``.
+    4. **First pass (2 epochs)** – iterate the dataloader for both epochs. Batches
+       before the designated save point (``epoch=1, step=2``) are discarded; batches
+       *after* that point are stored in ``batches_after_save_step`` as ground truth.
+       At the save point a checkpoint is written via ``Checkpointer.save()``,
+       capturing model weights, ``dataloader.state_dict()``, and
+       ``environ_meter.state_dict()``.
+    5. **Load checkpoint** – ``Checkpointer.load()`` restores all state; the
+       dataloader, dataset and environ-meter are restored through ``load_state_dict()``.
+    6. **Second pass (resume)** – iterate from the saved epoch / step through the
+       end of both epochs, collecting resumed batches in ``batch_after_resume``.
+    7. **Assert equality** – verify that ``batches_after_save_step`` and
+       ``batch_after_resume`` have the same length and that every tensor in every
+       micro-batch is identical element-wise.
+    """
+    _parser = argparse.ArgumentParser()
+    _parser.add_argument("--shuffle", type=lambda x: x.lower() == "true", default=True)
+    _parser.add_argument("--save_by_idx", type=lambda x: x.lower() == "true", default=True)
+    _parser.add_argument("--dyn_bsz_in_dataloader", type=lambda x: x.lower() == "true", default=True)
+    test_args, remaining_argv = _parser.parse_known_args()
+    sys.argv = [sys.argv[0]] + remaining_argv
+
     args = parse_args(Arguments)
     world_size = int(os.environ["WORLD_SIZE"])
     rank = int(os.environ["RANK"])
@@ -433,8 +476,7 @@ def _run_distributed_test():
 
     # Create DummyMappingDataset and DummyIterableDataset
     mapping_dataset = DummyMappingDataset(size=DATASET_SIZE)
-    shuffle = getattr(args.data, "shuffle", True)
-    iterable_dataset = DummyIterableDataset(mapping_dataset, shuffle=shuffle, seed=args.train.seed)
+    iterable_dataset = DummyIterableDataset(mapping_dataset, shuffle=test_args.shuffle, seed=args.train.seed)
 
     # Compute train_steps based on dataset size
     dataset_length = len(mapping_dataset)
@@ -452,11 +494,11 @@ def _run_distributed_test():
         train_steps=train_steps,
         rmpad=args.train.rmpad,
         dyn_bsz=args.train.dyn_bsz,
-        dyn_bsz_in_worker_loop=args.train.dyn_bsz_in_worker_loop,
+        dyn_bsz_in_dataloader=test_args.dyn_bsz_in_dataloader,
         bsz_warmup_ratio=args.train.bsz_warmup_ratio,
         rmpad_with_pos_ids=args.train.rmpad_with_pos_ids,
         dyn_bsz_buffer_size=READY_FOR_MICRO_BATCH_THRESHOLD,
-        dyn_bsz_dataset_save_by_idx=args.train.dyn_bsz_dataset_save_by_idx,
+        dyn_bsz_dataset_save_by_idx=test_args.save_by_idx,
         num_workers=2,
         drop_last=False,
         pin_memory=args.data.pin_memory,
@@ -472,7 +514,6 @@ def _run_distributed_test():
         empty_cache_steps=args.train.empty_cache_steps,
     )
 
-    batches_before_save_step = []
     batches_after_save_step = []
     epoch_num = 2  # Run 2 epochs
     start_epoch, start_step, global_step = 0, 0, 0
@@ -504,18 +545,16 @@ def _run_distributed_test():
 
         # Print batch info for debugging
         """
-        logger.info(f"[rank{rank}] epoch:{epoch} step:{local_step} global_step:{global_step} num_micro_batches:{len(micro_batches)}")
+        logger.error(f"[rank{rank}] epoch:{epoch} step:{local_step} global_step:{global_step} num_micro_batches:{len(micro_batches)} dataset_iter: {dataloader.dataset._data_iter}")
         for micro_idx, micro_batch in enumerate(micro_batches):
             # Extract sample indices from input_ids (each sample has all same values)
             input_ids = micro_batch["input_ids"].squeeze(0)  # Remove batch dim
             input_ids = set(input_ids.tolist())
-            logger.info(f"[rank{rank}] epoch:{epoch} step:{local_step} global_step:{global_step} micro_batch[{micro_idx}]: {input_ids}")
+            logger.error(f"[rank{rank}] epoch:{epoch} step:{local_step} global_step:{global_step} micro_batch[{micro_idx}]: {input_ids}")
         """
 
         if epoch > save_epoch or (epoch == save_epoch and local_step > save_step):
             batches_after_save_step.append(micro_batches)
-        else:
-            batches_before_save_step.append(micro_batches)
 
         for _, micro_batch in enumerate(micro_batches):
             environ_meter.add(micro_batch)
@@ -623,4 +662,4 @@ def _run_distributed_test():
 
 
 if __name__ == "__main__":
-    main_distributed_test()
+    _main_distributed_test()
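
The step-7 equality assertion described in the new _run_distributed_test docstring lies outside the hunks shown above. Conceptually it reduces to something like the following sketch (illustrative only, not the exact code in this file; variable names are taken from the docstring):

import torch

def assert_resume_matches(batches_after_save_step, batch_after_resume):
    # Both arguments hold one list of micro-batch dicts per step after the save point.
    assert len(batches_after_save_step) == len(batch_after_resume)
    for original_step, resumed_step in zip(batches_after_save_step, batch_after_resume):
        assert len(original_step) == len(resumed_step)
        for original_mb, resumed_mb in zip(original_step, resumed_step):
            for key in original_mb:
                assert torch.equal(original_mb[key], resumed_mb[key])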

tests/data/utils.py

Lines changed: 58 additions & 7 deletions
@@ -17,20 +17,39 @@
 
 
 class DummyMappingDataset(Dataset):
-    """Mapping-style dataset that generates dummy data based on index."""
+    """Mapping-style dataset that generates deterministic dummy samples by index.
+
+    * Sample at 0-based index ``i`` contains **i + 1** tokens, each with value
+      ``i + 1``. For example index 0 → ``[1]``, index 4 → ``[5, 5, 5, 5, 5]``.
+    """
 
     def __init__(self, size: int = 100):
         """
         Args:
-            size: Total number of samples in the dataset
+            size: Total number of samples in the dataset.
         """
         self.size = size
 
     def __len__(self):
         return self.size
 
     def __getitem__(self, idx):
-        """Generate data following the same pattern as DummyDataset.generate_data"""
+        """Return the dummy sample at position *idx*.
+
+        Args:
+            idx: 0-based integer index into the dataset.
+
+        Returns:
+            dict with keys:
+
+            * ``"input_ids"`` – 1-D ``LongTensor`` of length ``idx + 1``, filled
+              with the scalar value ``idx + 1``.
+            * ``"attention_mask"`` – all-ones tensor of the same shape.
+            * ``"labels"`` – clone of ``input_ids``.
+
+        Raises:
+            IndexError: If ``idx`` is outside ``[0, size)``.
+        """
         if idx < 0 or idx >= self.size:
             raise IndexError(f"Index {idx} out of range [0, {self.size})")
 
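The body that actually builds the sample sits below the bounds check and is not part of this diff; from the new docstring alone it would be equivalent to roughly the following (a sketch consistent with the description, not necessarily the exact implementation):

import torch

# Sketch matching the documented pattern: idx 0 -> [1], idx 4 -> [5, 5, 5, 5, 5].
def _dummy_sample(idx: int) -> dict:
    input_ids = torch.full((idx + 1,), idx + 1, dtype=torch.long)
    return {
        "input_ids": input_ids,
        "attention_mask": torch.ones_like(input_ids),
        "labels": input_ids.clone(),
    }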
@@ -41,14 +60,34 @@ def __getitem__(self, idx):
 
 
 class DummyIterableDataset(IterableDataset):
-    """Iterable dataset that reads from DummyMappingDataset sequentially or with shuffle."""
+    """Iterable wrapper around ``DummyMappingDataset`` with built-in sharding and optional shuffle.
+
+    Designed to be tested with ``DynamicBatchingSizeDataset`` and ``StatefulDataLoader`` checkpointing:
+
+    * **Sharding** – samples are distributed across distributed ranks *and* DataLoader
+      workers using a round-robin interleave strategy (rank-major, then worker-minor),
+      so each dataloader worker on each rank sees a disjoint, deterministic subset of the data.
+    * **Shuffle** – when ``shuffle=True``, a fixed ``torch.randperm`` generated from
+      ``seed`` at construction time is used so that the shuffled order is reproducible
+      and consistent across checkpoint / resume cycles.
+    * **Index output** – when ``output_refetch_idx`` is set to ``True`` (by
+      ``DynamicBatchingSizeDataset`` when ``save_by_idx=True``), each ``__iter__``
+      yield is a ``(sample_dict, original_index)`` tuple instead of a bare dict,
+      allowing the consumer to store the indices instead of the full samples when
+      saving checkpoints, and to reconstruct the buffer from indices on resume.
+    * **State dict** – ``state_dict()`` / ``load_state_dict()`` persist
+      ``_current_idx`` so that ``StatefulDataLoader`` can snapshot and restore the
+      exact position of the iterator.
+    """
 
     def __init__(self, mapping_dataset: DummyMappingDataset, shuffle: bool = False, seed: int = 42):
         """
         Args:
-            mapping_dataset: The underlying DummyMappingDataset to read from
-            shuffle: Whether to shuffle the reading order
-            seed: Random seed for shuffling
+            mapping_dataset: The upstream ``DummyMappingDataset`` to read from.
+            shuffle: Whether to shuffle the reading order. Shuffling is performed
+                once at construction time using ``seed`` so that it is stable across
+                distributed workers.
+            seed: Random seed used to generate the permutation when ``shuffle=True``.
         """
         self.mapping_dataset = mapping_dataset
         self.shuffle = shuffle
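
The __iter__ body implementing the sharding described above is outside this hunk. Under the docstring's description it behaves roughly like the sketch below; the exact interleave arithmetic is an assumption, not copied from the source:

import torch
from torch.utils.data import get_worker_info

# Rough shape of the documented rank-major, worker-minor round-robin sharding.
def _shard_indices(dataset_len, rank, world_size, shuffle, seed):
    if shuffle:
        generator = torch.Generator().manual_seed(seed)
        order = torch.randperm(dataset_len, generator=generator).tolist()
    else:
        order = list(range(dataset_len))
    info = get_worker_info()
    num_workers = info.num_workers if info else 1
    worker_id = info.id if info else 0
    stride = world_size * num_workers
    offset = rank * num_workers + worker_id  # rank-major, then worker-minor
    for pos in range(offset, dataset_len, stride):
        yield order[pos]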
@@ -111,6 +150,18 @@ def __iter__(self):
             yield self.mapping_dataset[idx]
 
     def get_item(self, idx):
+        """Fetch a single sample by its original dataset index.
+
+        Used by ``DynamicBatchingSizeDataset.load_state_dict()`` to reconstruct
+        buffer contents when ``save_by_idx=True``: the saved indices are passed
+        back here one-by-one to rebuild the exact pre-checkpoint buffer.
+
+        Args:
+            idx: 0-based integer index into the underlying ``DummyMappingDataset``.
+
+        Returns:
+            Sample as returned by ``DummyMappingDataset.__getitem__``.
+        """
         return self.mapping_dataset[idx]
 
     def state_dict(self):
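
The state_dict body itself is not included in this hunk. Given the class docstring above, it presumably reduces to persisting the iterator cursor, roughly as follows (an assumption, not the actual code):

    # Presumed shape of the two methods (assumption; actual bodies are not in this diff).
    def state_dict(self):
        return {"_current_idx": self._current_idx}

    def load_state_dict(self, state):
        self._current_idx = state["_current_idx"]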

veomni/arguments/arguments_types.py

Lines changed: 2 additions & 20 deletions
@@ -420,22 +420,10 @@ class TrainingArguments:
         default="worker",
         metadata={"help": "Use main process or worker process to run dynamic batch size."},
     )
-    dyn_bsz_in_worker_loop: bool = field(
-        default=True,
-        metadata={
-            "help": "Whether the dynamic batch construction is in DataLoader's worker loop or in Dataset's iterator."
-        },
-    )
     dyn_bsz_buffer_size: int = field(
         default=200,
         metadata={"help": "Buffer size for dynamic batch size."},
     )
-    dyn_bsz_dataset_save_by_idx: bool = field(
-        default=True,
-        metadata={
-            "help": "When dyn_bsz_in_worker_loop is False, it is to decide whether to save buffer by index for checkpointing in DynamicBatchingSizeDataset."
-        },
-    )
     bsz_warmup_ratio: float = field(
         default=0,
         metadata={"help": "Ratio of batch size warmup steps."},
@@ -740,15 +728,9 @@ def __post_init__(self):
 
         # calculate dataloader batch size
         # for:
-        #   - DynamicBatchingSizeDataset and StatefulDataLoader
         #   - StreamingDataset and StreamingDataLoader
-        if (self.rmpad or self.rmpad_with_pos_ids) and self.dyn_bsz:
-            if self.dyn_bsz_in_worker_loop:
-                self.dataloader_batch_size = 1
-            else:
-                self.dataloader_batch_size = self.global_batch_size // (
-                    self.micro_batch_size * self.data_parallel_size
-                )
+        if (self.rmpad or self.rmpad_with_pos_ids) and self.dyn_bsz_runtime == "worker" and self.dyn_bsz:
+            self.dataloader_batch_size = 1
         else:
             self.dataloader_batch_size = self.global_batch_size // self.data_parallel_size  # = micro bsz * grad accu
 
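
For context, a worked example of the two branches using the values from the test's build_command; the data_parallel_size of 2 is assumed from the 2-process DDP launch, so treat the numbers as illustrative:

# Illustrative arithmetic only; mirrors the branch logic above.
global_batch_size, micro_batch_size, data_parallel_size = 16, 2, 2

# dyn_bsz branch (rmpad_with_pos_ids=True, dyn_bsz=True, dyn_bsz_runtime == "worker"):
dataloader_batch_size = 1  # the dataset already emits packed micro-batches

# otherwise:
dataloader_batch_size = global_batch_size // data_parallel_size  # 16 // 2 = 8 = micro bsz (2) * grad accu (4)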

veomni/data/batching_strategy.py

Lines changed: 18 additions & 18 deletions
@@ -92,24 +92,6 @@ def merge(self, buffer_to_merge: "DynBszBuffer"):
             self.append(item)
 
 
-class IdentityPacker:
-    def __init__(self, token_micro_bsz, bsz_warmup_steps, bsz_warmup_init_mbtoken):
-        self.token_micro_bsz = token_micro_bsz
-        self.bsz_warmup_steps = bsz_warmup_steps
-        self.bsz_warmup_init_mbtoken = bsz_warmup_init_mbtoken
-
-    def __call__(self, samples):
-        return samples
-
-    def get_token_num_to_request(self, cur_step, warmup):
-        return (
-            (self.token_micro_bsz - self.bsz_warmup_init_mbtoken) * cur_step // self.bsz_warmup_steps
-            + self.bsz_warmup_init_mbtoken
-            if warmup
-            else self.token_micro_bsz
-        )
-
-
 class BaseBatchingStrategy:
     """
     Base class for batching strategy.
@@ -128,6 +110,24 @@ def empty(self) -> bool:
         raise NotImplementedError("should implement `empty`")
 
 
+class IdentityPacker:
+    def __init__(self, token_micro_bsz, bsz_warmup_steps, bsz_warmup_init_mbtoken):
+        self.token_micro_bsz = token_micro_bsz
+        self.bsz_warmup_steps = bsz_warmup_steps
+        self.bsz_warmup_init_mbtoken = bsz_warmup_init_mbtoken
+
+    def __call__(self, samples):
+        return samples
+
+    def get_token_num_to_request(self, cur_step, warmup):
+        return (
+            (self.token_micro_bsz - self.bsz_warmup_init_mbtoken) * cur_step // self.bsz_warmup_steps
+            + self.bsz_warmup_init_mbtoken
+            if warmup
+            else self.token_micro_bsz
+        )
+
+
 class TextBatchingStrategy(BaseBatchingStrategy):
     """ "
     Batching strategy for text data.
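
The move itself does not change IdentityPacker; its get_token_num_to_request linearly ramps the per-micro-batch token budget during warmup. A quick walk-through with illustrative values (not taken from the source):

token_micro_bsz = 8192          # full token budget per micro-batch
bsz_warmup_init_mbtoken = 1024  # budget at step 0
bsz_warmup_steps = 100

def token_budget(cur_step, warmup):
    if not warmup:
        return token_micro_bsz
    return (token_micro_bsz - bsz_warmup_init_mbtoken) * cur_step // bsz_warmup_steps + bsz_warmup_init_mbtoken

assert token_budget(0, warmup=True) == 1024    # start of warmup
assert token_budget(50, warmup=True) == 4608   # halfway: 7168 * 50 // 100 + 1024
assert token_budget(100, warmup=True) == 8192  # end of the ramp
assert token_budget(5, warmup=False) == 8192   # warmup disabled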
