
Commit 21de943

[Entrypoints] initialize processor error handling (#1796)
SUMMARY:

Resolves #1795.

Currently we initialize a processor in the entrypoint `pre_process` even if one isn't provided, even though it isn't needed for data-free recipes like `FP8_DYNAMIC` or `W4A16`; this causes downstream user issues like #1795. This change updates pre-processing to:

- wrap processor initialization in a try/except
- error out if initialization fails and a processor is required (i.e. if a dataset is needed for training/calibration)
- otherwise, log a warning if an `output_dir` is provided, because the processor will not be saved with the trained/compressed model

TEST PLAN:

The example script in #1795 succeeds on this branch. Confirmed that a warning is logged if `output_dir` is set and an error is raised if `dataset` is set.

Signed-off-by: Brian Dellabetta <[email protected]>
1 parent 4c9ac83 commit 21de943
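
For context, a minimal sketch of the data-free flow this commit unblocks (illustrative only, assuming the public `oneshot` API with `QuantizationModifier`; this is not the reproducer script from #1795, and the model id is hypothetical): no dataset and no processor are passed, so a failed processor auto-initialization should no longer abort the run.

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# Data-free recipe: FP8 dynamic quantization needs no calibration dataset,
# so no tokenizer/processor is required either.
recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])

# No dataset, no processor: with this change, a processor that cannot be
# auto-initialized only triggers a warning (and only when output_dir is set),
# instead of failing the whole run.
oneshot(
    model="meta-llama/Llama-3.1-8B-Instruct",  # hypothetical model id
    recipe=recipe,
)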

File tree: 4 files changed, +37 / -9 lines


src/llmcompressor/args/dataset_arguments.py

Lines changed: 3 additions & 0 deletions
@@ -217,3 +217,6 @@ class DatasetArguments(CustomDatasetArguments):
             "Default is set to True."
         },
     )
+
+    def is_dataset_provided(self) -> bool:
+        return self.dataset is not None or self.dataset_path is not None

src/llmcompressor/entrypoints/oneshot.py

Lines changed: 1 addition & 1 deletion
@@ -125,7 +125,7 @@ def __init__(
         self.output_dir = output_dir

         # initialize the model and processor
-        pre_process(model_args)
+        pre_process(model_args, dataset_args, output_dir)

         # Set instance attributes
         self.model = self.model_args.model

src/llmcompressor/entrypoints/train.py

Lines changed: 2 additions & 2 deletions
@@ -59,11 +59,11 @@ def train(**kwargs) -> PreTrainedModel:
     ```

     """
-    model_args, dataset_args, recipe_args, training_args, _ = parse_args(
+    model_args, dataset_args, recipe_args, training_args, output_dir = parse_args(
         include_training_args=True, **kwargs
     )

-    pre_process(model_args)
+    pre_process(model_args, dataset_args, output_dir)
     dispatch_for_generation(model_args.model)  # train is dispatched same as generation

     processed_dataset = get_processed_dataset(

src/llmcompressor/entrypoints/utils.py

Lines changed: 31 additions & 6 deletions
@@ -15,7 +15,12 @@
 )
 from transformers.utils.quantization_config import CompressedTensorsConfig

-from llmcompressor.args import ModelArguments, RecipeArguments, TrainingArguments
+from llmcompressor.args import (
+    DatasetArguments,
+    ModelArguments,
+    RecipeArguments,
+    TrainingArguments,
+)
 from llmcompressor.core import reset_session
 from llmcompressor.pytorch.model_load.helpers import parse_dtype
 from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
@@ -30,7 +35,11 @@
 from llmcompressor.utils.fsdp.helpers import is_fsdp_model


-def pre_process(model_args: "ModelArguments"):
+def pre_process(
+    model_args: ModelArguments,
+    dataset_args: DatasetArguments,
+    output_dir: Optional[str],
+):
     """
     Prepares the model and tokenizer/processor for calibration.
     - Initializes the model if it's specified as a path or string.
@@ -54,11 +63,27 @@ def pre_process(model_args: "ModelArguments"):
     model_args.model = model
     model_args.distill_teacher = distill_teacher

-    # Initialize processor
+    # Initialize processor if dataset provided
     if isinstance(model_args.processor, (str, type(None))):
-        model_args.processor = initialize_processor_from_path(
-            model_args, model_args.model
-        )
+        try:
+            model_args.processor = initialize_processor_from_path(
+                model_args, model_args.model
+            )
+        except Exception as e:
+            if dataset_args.is_dataset_provided():
+                raise RuntimeError(
+                    "An error occurred when attempting to initialize "
+                    "model processor, which is required when a dataset "
+                    "is provided. To resolve, create and pass in a "
+                    "processor directly to `oneshot`/`train`."
+                ) from e
+            elif output_dir:
+                logger.warning(
+                    "Model processor could not be auto-initialized and "
+                    "will not be saved along with the model. To resolve, "
+                    "create and pass in a processor directly to "
+                    f"`oneshot`/`train`.\nInitialization Error: {e}"
+                )

     # untie tie_word_embeddings weights
     if not model_args.tie_word_embeddings:
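
The new error and warning messages both point users at the same workaround: create the processor yourself and pass it in. A sketch of that resolution path, under the assumption that `oneshot` accepts `processor` and `output_dir` keyword arguments (the model id is hypothetical):

from transformers import AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"  # hypothetical model id

# Build the processor (here a tokenizer) explicitly instead of relying on
# pre_process auto-initializing it from the model path.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

oneshot(
    model=MODEL_ID,
    processor=tokenizer,  # explicit processor avoids the failing auto-init
    recipe=QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]),
    output_dir="./model-FP8-dynamic",  # processor is saved alongside the compressed model
)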
