
Commit 29bf737

WIP: working
Signed-off-by: Kyle Sayers <[email protected]>
1 parent 44dbf91 commit 29bf737

File tree

8 files changed: +30 additions, -19 deletions


examples/multimodal_vision/gemma3_example.py

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ def data_collator(batch):
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     trust_remote_code_model=True,
-    data_collator=data_collator,
+    # data_collator=data_collator,
 )

 # Confirm generations of the quantized model look sane.

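For context, the collator being disabled above follows the pattern used across this repo's multimodal examples; the sketch below is a paraphrase of that pattern, not code copied from this commit:

import torch

# Multimodal calibration typically runs one sample per batch; the collator
# just tensorizes each field of that single processed example.
def data_collator(batch):
    assert len(batch) == 1
    return {key: torch.tensor(value) for key, value in batch[0].items()}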
examples/quantization_w4a16/llama3_example.py

Lines changed: 3 additions & 3 deletions
@@ -6,7 +6,7 @@
 from llmcompressor.utils import dispatch_for_generation

 # Select model and load it.
-model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
+model_id = "meta-llama/Llama-3.2-3B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_id)

@@ -16,7 +16,7 @@

 # Select number of samples. 512 samples is a good place to start.
 # Increasing the number of samples can improve accuracy.
-NUM_CALIBRATION_SAMPLES = 12
+NUM_CALIBRATION_SAMPLES = 512
 MAX_SEQUENCE_LENGTH = 2048

 # Load dataset and preprocess.
@@ -57,10 +57,10 @@ def tokenize(sample):
 oneshot(
     model=model,
     dataset=ds,
-    batch_size=12,
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    batch_size=4,
 )

 # Confirm generations of the quantized model look sane.

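One consequence of the restored defaults worth noting: with NUM_CALIBRATION_SAMPLES = 512 and batch_size=4, calibration runs 512 / 4 = 128 batched forward passes rather than 512 single-sample passes.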
src/llmcompressor/args/dataset_arguments.py

Lines changed: 3 additions & 5 deletions
@@ -8,9 +8,7 @@
 """

 from dataclasses import dataclass, field
-from typing import Any, Callable
-
-from transformers import DefaultDataCollator
+from typing import Callable, Optional


 @dataclass
@@ -69,8 +67,8 @@ class CustomDatasetArguments(DVCDatasetArguments):
         },
     )

-    data_collator: Callable[[Any], Any] = field(
-        default_factory=lambda: DefaultDataCollator(),
+    data_collator: Optional[Callable] = field(
+        default=None,
         metadata={"help": "The function to used to form a batch from the dataset"},
     )


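Note the behavioral shift: the field previously always carried a DefaultDataCollator instance, while default=None lets downstream code detect that no collator was supplied and substitute a tokenizer-aware fallback. A minimal sketch of the pattern (names are illustrative, not the library's):

from dataclasses import dataclass, field
from typing import Callable, Optional

@dataclass
class Args:
    data_collator: Optional[Callable] = field(default=None)

def default_collator(features):
    # stand-in for DataCollatorWithPadding(tokenizer)
    return features

args = Args()
collate_fn = args.data_collator or default_collator  # None -> fallback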
src/llmcompressor/datasets/utils.py

Lines changed: 10 additions & 3 deletions
@@ -15,7 +15,7 @@
 from datasets import Dataset
 from loguru import logger
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
-from transformers.data import default_data_collator
+from transformers.data import DataCollatorWithPadding

 from llmcompressor.args import DatasetArguments
 from llmcompressor.transformers.finetune.data import TextGenerationDataset
@@ -115,22 +115,29 @@ def get_calibration_dataloader(
     )

     calibration_dataset = datasets.get("calibration")
+    tokenizer = getattr(processor, "tokenizer", processor)
+    collate_fn = dataset_args.data_collator or DataCollatorWithPadding(tokenizer)
+    if dataset_args.batch_size > 1 and (
+        tokenizer.pad_token is None or tokenizer.pad_token_id < 0
+    ):
+        logger.warning("Could not find padding token. Setting PAD token to EOS token")
+        tokenizer.pad_token = tokenizer.eos_token

     return format_calibration_data(
         tokenized_dataset=calibration_dataset,
+        collate_fn=collate_fn,
         num_calibration_samples=dataset_args.num_calibration_samples,
         batch_size=dataset_args.batch_size,
         do_shuffle=dataset_args.shuffle_calibration_samples,
-        collate_fn=dataset_args.data_collator,
     )


 def format_calibration_data(
     tokenized_dataset: Dataset,
+    collate_fn: Callable,
     num_calibration_samples: int | None = None,
     batch_size: int = 1,
     do_shuffle: bool = True,
-    collate_fn: Callable = default_data_collator,
 ) -> list[torch.Tensor]:
     """
     Creates a dataloader out of the calibration dataset split, trimming it to

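To see what the new fallback collator actually does, here is a standalone demo (the model id is arbitrary; any tokenizer works): DataCollatorWithPadding pads every sequence in a batch to the batch's longest length, which is why a pad token must exist once batch_size > 1.

from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # same fallback as the diff above

collator = DataCollatorWithPadding(tokenizer)
batch = collator(
    [tokenizer("short prompt"), tokenizer("a somewhat longer calibration prompt")]
)
print(batch["input_ids"].shape)    # (2, length of the longest sequence)
print(batch["attention_mask"][0])  # zeros mark the shorter prompt's padding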
src/llmcompressor/entrypoints/oneshot.py

Lines changed: 3 additions & 1 deletion
@@ -12,7 +12,7 @@
 import os
 from datetime import datetime
 from pathlib import Path
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Callable, Optional

 from loguru import logger
 from torch.utils.data import DataLoader
@@ -249,6 +249,8 @@ def oneshot(
     dataset_config_name: str | None = None,
     dataset_path: str | None = None,
     splits: str | list[str] | dict[str, str] | None = None,
+    batch_size: int = 1,
+    data_collator: Optional[Callable] = None,
     num_calibration_samples: int = 512,
     shuffle_calibration_samples: bool = True,
     max_seq_length: int = 384,

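With the signature extended, both new knobs are available from the public entrypoint. A hedged usage sketch (model, ds, and recipe are placeholders, not values from this commit):

from llmcompressor import oneshot

oneshot(
    model=model,                 # a loaded PreTrainedModel
    dataset=ds,                  # a preprocessed calibration dataset
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
    batch_size=4,                # new: batches calibration samples
    data_collator=None,          # new: None falls back to DataCollatorWithPadding
)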
src/llmcompressor/pipelines/basic/pipeline.py

Lines changed: 6 additions & 1 deletion
@@ -10,7 +10,12 @@
 from llmcompressor.modifiers.utils.pytorch_helpers import apply_pad_mask_to_batch
 from llmcompressor.pipelines.registry import CalibrationPipeline
 from llmcompressor.pytorch.utils.helpers import tensors_to_device
-from llmcompressor.utils import calibration_forward_context, dispatch_for_generation, targets_lm_head, disable_lm_head
+from llmcompressor.utils import (
+    calibration_forward_context,
+    disable_lm_head,
+    dispatch_for_generation,
+    targets_lm_head,
+)

 if TYPE_CHECKING:
     from llmcompressor.args.dataset_arguments import DatasetArguments

src/llmcompressor/pipelines/sequential/pipeline.py

Lines changed: 1 addition & 1 deletion
@@ -19,8 +19,8 @@
     DISABLE_QAC_MODIFIERS,
     DisableQuantization,
     calibration_forward_context,
-    targets_lm_head,
     disable_lm_head,
+    targets_lm_head,
 )

 if TYPE_CHECKING:

src/llmcompressor/utils/helpers.py

Lines changed: 3 additions & 4 deletions
@@ -18,7 +18,7 @@
 from collections import OrderedDict
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Callable, Dict, Iterable, List, Tuple, Union, TYPE_CHECKING
+from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Tuple, Union
 from urllib.parse import urlparse

 import numpy
@@ -1094,16 +1094,15 @@ def disable_lm_head(model: torch.nn.Module):


 def targets_lm_head(model: PreTrainedModel, modifiers: list["Modifier"]) -> bool:
-    """ Returns True if the given modifiers target the lm_head """
+    """Returns True if the given modifiers target the lm_head"""
     from llmcompressor.transformers.compression.compressed_tensors_utils import (
-        targets_embeddings
+        targets_embeddings,
     )

     targets = sum(
         (list(modifier.get_targets(model)) for modifier in modifiers), start=[]
     )
     return targets_embeddings(model, targets, check_input=True, check_output=False)
-


 @contextlib.contextmanager

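One idiom above worth calling out: sum(..., start=[]) flattens a list of lists into a single list, for example:

parts = [["a", "b"], ["c"], []]
flat = sum(parts, start=[])  # -> ["a", "b", "c"]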