
Commit 2a71af8

Merge remote-tracking branch 'upstream/main'
2 parents bcd5459 + db16f31 commit 2a71af8

File tree

6 files changed (+96, -10 lines)

- README.md
- docs/advanced-data-preprocessing.md
- tests/data/test_data_preprocessing.py
- tuning/config/acceleration_configs/attention_and_distributed_packing.py
- tuning/data/setup_dataprocessor.py
- tuning/sft_trainer.py


README.md

Lines changed: 9 additions & 8 deletions
```diff
@@ -885,17 +885,18 @@ Notes:
 * When using `fused_ops_and_kernels` together with `quantized_lora_config`,
   make sure to appropriately set `--fused_lora auto_gptq True` or `bitsandbytes True`; the `True` sets `fast_lora==True`.
 * `fused_ops_and_kernels` works for full-finetuning, LoRA, QLoRA and GPTQ-LORA,
-    - pass `--fast_kernels True True True` for full finetuning/LoRA
-    - pass `--fast_kernels True True True --auto_gptq triton_v2 --fused_lora auto_gptq True` for GPTQ-LoRA
-    - pass `--fast_kernels True True True --bitsandbytes nf4 --fused_lora bitsandbytes True` for QLoRA
+    - Pass `--fast_kernels True True True` for full finetuning/LoRA
+    - Pass `--fast_kernels True True True --auto_gptq triton_v2 --fused_lora auto_gptq True` for GPTQ-LoRA
+    - Pass `--fast_kernels True True True --bitsandbytes nf4 --fused_lora bitsandbytes True` for QLoRA
     - Note the list of supported models [here](https://github.com/foundation-model-stack/fms-acceleration/blob/main/plugins/fused-ops-and-kernels/README.md#supported-models).
 * Notes on Padding Free
-    - works for both *single* and *multi-gpu*.
-    - works on both *pretokenized* and *untokenized* datasets
-    - verified against the version found in HF main, merged in via PR https://github.com/huggingface/transformers/pull/31629.
+    - Works for both *single* and *multi-gpu*.
+    - Works on both *pretokenized* and *untokenized* datasets
+    - Verified against the version found in HF main, merged in via PR https://github.com/huggingface/transformers/pull/31629.
 * Notes on Multipack
-    - works only for *multi-gpu*.
-    - currently only includes the version of *multipack* optimized for linear attention implementations like *flash-attn*.
+    - Works only for *multi-gpu*.
+    - Currently only includes the version of *multipack* optimized for linear attention implementations like *flash-attn*.
+    - Streaming datasets or use of `IterableDatasets` is not compatible with the fms-acceleration multipack plugin because the multipack sampler has to run through the full dataset every epoch. Using multipack and streaming together will raise an error.
 * Notes on Fast MoE
     - `--fast_moe` takes either an integer or boolean value.
     - When an integer `n` is passed, it enables expert parallel sharding with the expert parallel degree as `n` along with Scatter MoE kernels enabled.
```
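The new Multipack note above boils down to a simple precondition: the multipack sampler needs a full pass over the dataset every epoch, which a streaming/iterable dataset cannot provide. The sketch below is illustrative only (plain Python, not the library's API; the function name is hypothetical) and just restates that rule:

```python
def check_multipack_compatibility(streaming: bool, multipack_enabled: bool) -> None:
    """Illustrative guard: multipack must iterate the full dataset each epoch,
    so it cannot be combined with a streaming/iterable dataset."""
    if streaming and multipack_enabled:
        raise ValueError(
            "Multipack is not compatible with streaming; disable streaming "
            "or the multipack sampler."
        )


check_multipack_compatibility(streaming=False, multipack_enabled=True)  # OK
# check_multipack_compatibility(streaming=True, multipack_enabled=True)  # raises ValueError
```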

docs/advanced-data-preprocessing.md

Lines changed: 3 additions & 1 deletion
```diff
@@ -255,7 +255,7 @@ Needless to say the sampling ratio of a datasets is a float and all the sampling
 We also allow users to pass a [`seed`](https://huggingface.co/docs/datasets/v3.2.0/en/package_reference/main_classes#datasets.interleave_datasets.seed) to randomize the interleaving of datasets and a [`stopping_strategy`](https://huggingface.co/docs/datasets/v3.2.0/en/package_reference/main_classes#datasets.interleave_datasets.stopping_strategy) to describe when to stop sampling. Both values should remain the same for experiment reproducibility. Both these values are common for all datasets and should be supplied at top level in the `datapreprocessor` as shown [above](#how-the-user-can-write-data-configs). For a list of the supported values of these arguments see the corresponding HF API.
 
 
-`Note: If a user specifies data sampling they can expect the datasets to be mixed and individual samples in the dataset to not be broken unless the max_seq_len argument is smaller than the length of individual samples in the dataset`
+Note: If a user specifies data sampling they can expect the datasets to be mixed and individual samples in the dataset to not be broken unless the max_seq_len argument is smaller than the length of individual samples in the dataset
 
 ### Data Streaming
 Dataset streaming allows users to utilize the functionality of iterable datasets to pass in data piece by piece, avoiding memory constraints with large datasets for use-cases like extended pre-training.
@@ -271,6 +271,8 @@ dataprocessor:
 
 When using streaming, `split_batches` in the `TrainingArguments` will automatically be set to `True`, by doing so, the main process will fetch a full batch and slice it into `num_processes` batches for each process. This means that `num_processes` must be divisible by `batch_size`. This will replace the global batch size.
 
+Note: Streaming datasets or use of `IterableDatasets` is not compatible with the fms-acceleration multipack plugin because the multipack sampler has to run through the full dataset every epoch. Using multipack and streaming together will raise an error.
+
 **When using streaming, the user must set `max_steps` in the `TrainingArguments` instead of `num_train_epochs`.** Since iterable datasets are loaded chunk-by-chunk, data cannot run through epochs in a typical fashion as the **Trainer** can not know the length of the dataset as it is being passed through. If both `max_steps` and `num_train_epochs` are given in a training config, `max_steps` will overwrite `num_train_epochs` since `max_steps` directly specifies the total number of optimization steps, which is needed when dataset length cannot be known.
 
 If the dataset size is known to the user, `max_steps` can be calculated as the total number of samples divided by the batch size.
```
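To make the last point concrete, here is a minimal sketch of the `max_steps` arithmetic. It assumes the effective batch size is the per-device batch size times the number of processes times the gradient-accumulation steps; all numbers and variable names are hypothetical and only for illustration:

```python
import math

# Hypothetical numbers for illustration only.
num_samples = 50_000          # known total size of the dataset being streamed
per_device_batch_size = 4
num_processes = 2             # data-parallel workers
grad_accum_steps = 1
num_epochs = 3

effective_batch_size = per_device_batch_size * num_processes * grad_accum_steps
steps_per_epoch = math.ceil(num_samples / effective_batch_size)  # 6250
max_steps = steps_per_epoch * num_epochs                         # 18750, the value to pass as max_steps
print(max_steps)
```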

tests/data/test_data_preprocessing.py

Lines changed: 62 additions & 0 deletions
```diff
@@ -61,6 +61,7 @@
 
 # Local
 from tuning.config import configs
+from tuning.config.acceleration_configs import AttentionAndDistributedPackingConfig
 from tuning.data.data_config import DataPreProcessorConfig, DataSetConfig
 from tuning.data.data_preprocessing_utils import get_data_collator
 from tuning.data.data_processors import DataPreProcessor, get_datapreprocessor
@@ -832,6 +833,67 @@ def test_process_dataconfig_file_with_streaming_no_max_steps_errors(
     (train_set, _, _) = _process_dataconfig_file(data_args, TRAIN_ARGS, tokenizer)
 
 
+@pytest.mark.parametrize(
+    "data_config_path, data_path",
+    [
+        (
+            DATA_CONFIG_YAML_STREAMING_INPUT_OUTPUT,
+            TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSON,
+        ),
+    ],
+)
+def test_process_dataconfig_file_with_streaming_and_multipack_throws_error(
+    data_config_path, data_path
+):
+    """Ensure that if multipack is passed with streaming, an error is raised."""
+    with open(data_config_path, "r") as f:
+        yaml_content = yaml.safe_load(f)
+    yaml_content["datasets"][0]["data_paths"][0] = data_path
+    datasets_name = yaml_content["datasets"][0]["name"]
+
+    # Modify input_field_name and output_field_name according to dataset
+    if datasets_name == "text_dataset_input_output_masking":
+        yaml_content["datasets"][0]["data_handlers"][0]["arguments"]["fn_kwargs"] = {
+            "input_field_name": "input",
+            "output_field_name": "output",
+        }
+
+    # Modify dataset_text_field and template according to dataset
+    formatted_dataset_field = "formatted_data_field"
+    if datasets_name == "apply_custom_data_template":
+        template = "### Input: {{Tweet text}} \n\n ### Response: {{text_label}}"
+        yaml_content["datasets"][0]["data_handlers"][0]["arguments"]["fn_kwargs"] = {
+            "dataset_text_field": formatted_dataset_field,
+            "template": template,
+        }
+
+    with tempfile.NamedTemporaryFile(
+        "w", delete=False, suffix=".yaml"
+    ) as temp_yaml_file:
+        yaml.dump(yaml_content, temp_yaml_file)
+        temp_yaml_file_path = temp_yaml_file.name
+        data_args = configs.DataArguments(data_config_path=temp_yaml_file_path)
+
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+
+    TRAIN_ARGS = configs.TrainingArguments(
+        output_dir="tmp",  # Not needed but positional
+        max_steps=1,
+    )
+
+    attention_and_distributed_packing_config = AttentionAndDistributedPackingConfig(
+        None, None
+    )
+    attention_and_distributed_packing_config.multipack = 16
+
+    is_multipack = attention_and_distributed_packing_config.is_multipack
+
+    with pytest.raises(ValueError):
+        (train_set, _, _) = _process_dataconfig_file(
+            data_args, TRAIN_ARGS, tokenizer, is_multipack=is_multipack
+        )
+
+
 @pytest.mark.parametrize(
     "data_config_path, data_path",
     [
```

tuning/config/acceleration_configs/attention_and_distributed_packing.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -51,3 +51,7 @@ def __post_init__(self):
     @property
     def is_padding_free(self):
         return self.padding_free is not None
+
+    @property
+    def is_multipack(self):
+        return self.multipack is not None
```

tuning/data/setup_dataprocessor.py

Lines changed: 15 additions & 1 deletion
```diff
@@ -69,6 +69,7 @@ def _process_dataconfig_file(
     train_args: TrainingArguments,
     tokenizer: AutoTokenizer,
     additional_data_handlers: Dict[str, DataHandler] = None,
+    is_multipack: bool = False,
 ):
     data_config = load_and_validate_data_config(data_args.data_config_path)
     processor = get_datapreprocessor(
@@ -95,6 +96,16 @@
             raise ValueError(
                 "`--max_steps` must be set when streaming is set in data preprocessor config"
             )
+        if is_multipack:
+            logging.error(
+                "Multipack is not compatible with streaming=true, please set streaming=false "
+                "or disable the multipack sampler"
+            )
+
+            raise ValueError(
+                "Multipack is not compatible with streaming=true, please set streaming=false "
+                "or disable the multipack sampler"
+            )
     train_dataset = processor.process_dataset_configs(data_config.datasets)
 
     return (train_dataset, None, data_args.dataset_text_field)
@@ -333,6 +344,7 @@ def process_dataargs(
     train_args: TrainingArguments,
     additional_data_handlers: Dict[str, DataHandler] = None,
     is_padding_free: bool = False,
+    is_multipack: bool = False,
 ):
     """
     Args:
@@ -345,6 +357,8 @@
             which need to be registered with the data preprocessor
         is_padding_free: A bool representing if the Padding free plugin is enabled.
             Defaults to False.
+        is_multipack: A bool representing if the Multipack plugin is enabled.
+            Defaults to False.
     Returns:
         Tuple(Dataset, Dataset, str, DataCollator, int, Dict)
             tuple containing
@@ -371,7 +385,7 @@
 
     if data_args.data_config_path:
         train_dataset, eval_dataset, dataset_text_field = _process_dataconfig_file(
-            data_args, train_args, tokenizer, additional_data_handlers
+            data_args, train_args, tokenizer, additional_data_handlers, is_multipack
         )
     else:
         train_dataset, eval_dataset, dataset_text_field = _process_raw_data_args(
```

tuning/sft_trainer.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -290,8 +290,10 @@ def train(
     logger.info("Packing is set to %s ", train_args.packing)
 
     is_padding_free = False
+    is_multipack = False
     if attention_and_distributed_packing_config is not None:
         is_padding_free = attention_and_distributed_packing_config.is_padding_free
+        is_multipack = attention_and_distributed_packing_config.is_multipack
 
     data_preprocessing_time = time.time()
     (
@@ -307,6 +309,7 @@
         train_args,
         additional_data_handlers,
         is_padding_free=is_padding_free,
+        is_multipack=is_multipack,
     )
     additional_metrics["data_preprocessing_time"] = (
         time.time() - data_preprocessing_time
```
