Skip to content

Commit 75fab6f

Browse files
authored
Merge branch 'main' into add-non-uniform-e2e-tests
2 parents 4baf690 + bd111bc commit 75fab6f

File tree

6 files changed: +92 additions, −6 deletions

experimental/attention/llama3_attention.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1+
from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme
12
from datasets import load_dataset
23
from transformers import AutoModelForCausalLM, AutoTokenizer
34

45
from llmcompressor import oneshot
56
from llmcompressor.modifiers.quantization import QuantizationModifier
67
from llmcompressor.utils import dispatch_for_generation
7-
from compressed_tensors.quantization import QuantizationScheme, QuantizationArgs
88

99
# Select model and load it.
1010
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

experimental/attention/llama3_attention_r3_nvfp4.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1+
from compressed_tensors.quantization import QuantizationScheme
2+
from compressed_tensors.quantization.quant_scheme import NVFP4
13
from datasets import load_dataset
24
from transformers import AutoModelForCausalLM, AutoTokenizer
35

46
from llmcompressor import oneshot
57
from llmcompressor.modifiers.quantization import QuantizationModifier
68
from llmcompressor.modifiers.transform import SpinQuantModifier
79
from llmcompressor.utils import dispatch_for_generation
8-
from compressed_tensors.quantization import QuantizationScheme
9-
from compressed_tensors.quantization.quant_scheme import NVFP4
1010

1111
# Select model and load it.
1212
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

src/llmcompressor/entrypoints/oneshot.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@
3131
from datasets import Dataset, DatasetDict
3232

3333

34+
TOKENIZERS_PARALLELISM_ENV = "TOKENIZERS_PARALLELISM"
35+
36+
3437
class Oneshot:
3538
"""
3639
Class responsible for carrying out one-shot calibration on a pretrained model.
@@ -121,6 +124,19 @@ def __init__(
121124
:param log_dir: Path to save logs during oneshot run.
122125
Nothing is logged to file if None.
123126
"""
127+
# Disable tokenizer parallelism to prevent warning when using
128+
# multiprocessing for dataset preprocessing. The warning occurs because
129+
# FastTokenizer's internal threading conflicts with dataset.map's num_proc.
130+
# See: https://github.com/vllm-project/llm-compressor/issues/2007
131+
if TOKENIZERS_PARALLELISM_ENV not in os.environ:
132+
os.environ[TOKENIZERS_PARALLELISM_ENV] = "false"
133+
logger.warning(
134+
"Disabling tokenizer parallelism due to threading conflict between "
135+
"FastTokenizer and Datasets. Set "
136+
f"{TOKENIZERS_PARALLELISM_ENV}=false to "
137+
"suppress this warning."
138+
)
139+
124140
# Set up file logging (no default files):
125141
# 1) If LLM_COMPRESSOR_LOG_FILE is set, log to that file.
126142
# 2) Else, if an explicit log_dir is provided, create a timestamped file there.
@@ -213,6 +229,7 @@ def apply_recipe_modifiers(
213229
recipe_stage=recipe_stage,
214230
recipe_args=self.recipe_args.recipe_args,
215231
calib_data=calibration_dataloader,
232+
sequential_targets=self.dataset_args.sequential_targets,
216233
)
217234
user_pipeline = self.dataset_args.pipeline
218235
pipeline = CalibrationPipeline.from_modifiers(

src/llmcompressor/modifiers/pruning/sparsegpt/sgpt_base.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,11 @@ def on_initialize(self, state: "State", **kwargs) -> bool:
113113
dataloader: torch.utils.data.DataLoader = state.data.calib
114114

115115
# infer module and sequential targets
116-
self.sequential_targets = self._infer_sequential_targets(model)
116+
# Note: only pass sequential_targets from kwargs, not the full kwargs dict
117+
# which may contain 'model' and cause duplicate argument errors
118+
self.sequential_targets = self._infer_sequential_targets(
119+
model, sequential_targets=kwargs.get("sequential_targets")
120+
)
117121
layers = get_layers(self.sequential_targets, model)
118122
self._target_layers = get_layers(
119123
self.targets, model
@@ -192,9 +196,27 @@ def on_end(self, state: State, event: Event, **kwargs):
192196
self.ended_ = True
193197
self.remove_hooks()
194198

195-
def _infer_sequential_targets(self, model: torch.nn.Module) -> str | list[str]:
199+
def _infer_sequential_targets(
200+
self, model: torch.nn.Module, **kwargs
201+
) -> str | list[str]:
202+
targets_from_kwargs = kwargs.get("sequential_targets")
203+
204+
# Validate that sequential_targets is not provided from both sources
205+
if self.sequential_targets is not None and targets_from_kwargs is not None:
206+
raise ValueError(
207+
"sequential_targets was provided both in the modifier config and in "
208+
"oneshot() dataset_args. Please provide sequential_targets in only "
209+
"one location to avoid conflicts."
210+
)
211+
196212
match self.sequential_targets:
197213
case None:
214+
# Check if sequential_targets was passed via kwargs (from dataset_args)
215+
if targets_from_kwargs is not None:
216+
if isinstance(targets_from_kwargs, str):
217+
return [targets_from_kwargs]
218+
return targets_from_kwargs
219+
# Fall back to auto-inference
198220
return get_no_split_params(model)
199221
case str():
200222
return [self.sequential_targets]
(new file — filename missing from this capture; appears to be a test module for Oneshot tokenizer-parallelism handling, likely under tests/ — confirm against the repository)

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import os
2+
3+
import pytest
4+
5+
from llmcompressor.entrypoints.oneshot import (
6+
TOKENIZERS_PARALLELISM_ENV as _TOKENIZERS_PARALLELISM_ENV,
7+
)
8+
9+
10+
class TestTokenizerParallelism:
11+
"""Tests for tokenizer parallelism warning suppression (issue #2007)."""
12+
13+
def test_oneshot_sets_tokenizers_parallelism_when_not_set(self, monkeypatch):
14+
"""
15+
Test that Oneshot sets TOKENIZERS_PARALLELISM=false when not already set.
16+
17+
This prevents the warning:
18+
"huggingface/tokenizers: The current process just got forked, after
19+
parallelism has already been used. Disabling parallelism to avoid deadlocks..."
20+
21+
See: https://github.com/vllm-project/llm-compressor/issues/2007
22+
"""
23+
monkeypatch.delenv(_TOKENIZERS_PARALLELISM_ENV, raising=False)
24+
25+
from llmcompressor.entrypoints.oneshot import Oneshot
26+
27+
# Create a minimal Oneshot instance to trigger __init__
28+
# We expect it to fail due to missing model, but the env var should be set
29+
with pytest.raises(Exception):
30+
Oneshot(model="nonexistent-model")
31+
32+
assert os.environ[_TOKENIZERS_PARALLELISM_ENV] == "false"
33+
34+
def test_oneshot_respects_existing_tokenizers_parallelism(self, monkeypatch):
35+
"""
36+
Test that Oneshot respects user's existing TOKENIZERS_PARALLELISM setting.
37+
38+
If a user has explicitly set TOKENIZERS_PARALLELISM, we should not override it.
39+
"""
40+
monkeypatch.setenv(_TOKENIZERS_PARALLELISM_ENV, "true")
41+
42+
from llmcompressor.entrypoints.oneshot import Oneshot
43+
44+
with pytest.raises(Exception):
45+
Oneshot(model="nonexistent-model")
46+
47+
assert os.environ[_TOKENIZERS_PARALLELISM_ENV] == "true"

tools/collect_env.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
creating bug reports. See `.github/ISSUE_TEMPLATE/bug_report.md`
44
"""
55

6+
import importlib
67
import platform
78
import sys
8-
import importlib
99

1010

1111
def get_version(pkg_name):

0 commit comments