Skip to content

Commit bd111bc

Browse files
majiayu000, kylesayrs, dsikka, HDCharles, and claude
authored
fix: suppress tokenizer parallelism warning in oneshot (vllm-project#2183)
SUMMARY: Suppress the tokenizer parallelism warning that appears during oneshot calibration by setting `TOKENIZERS_PARALLELISM=false` in `Oneshot.__init__`. The warning occurs when FastTokenizer's internal threading conflicts with `dataset.map`'s multiprocessing (`num_proc` parameter). This fix sets the environment variable early in the oneshot lifecycle to prevent the conflict, while respecting any existing user-set value. Closes vllm-project#2007 TEST PLAN: - Added unit tests in `tests/llmcompressor/transformers/oneshot/test_tokenizer_parallelism.py` - Tests verify: 1. `TOKENIZERS_PARALLELISM` is set to `false` when not already set 2. Existing user-set `TOKENIZERS_PARALLELISM` values are respected - All tests pass locally with `pytest tests/llmcompressor/transformers/oneshot/test_tokenizer_parallelism.py -v` --------- Signed-off-by: majiayu000 <1835304752@qq.com> Signed-off-by: lif <1835304752@qq.com> Co-authored-by: Kyle Sayers <kylesayrs@gmail.com> Co-authored-by: Dipika Sikka <dipikasikka1@gmail.com> Co-authored-by: HDCharles <39544797+HDCharles@users.noreply.github.com> Co-authored-by: Claude <noreply@anthropic.com> Co-authored-by: Brian Dellabetta <brian-dellabetta@users.noreply.github.com>
1 parent 6cf3d37 commit bd111bc

File tree

5 files changed

+67
-4
lines changed

5 files changed

+67
-4
lines changed

experimental/attention/llama3_attention.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1+
from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme
12
from datasets import load_dataset
23
from transformers import AutoModelForCausalLM, AutoTokenizer
34

45
from llmcompressor import oneshot
56
from llmcompressor.modifiers.quantization import QuantizationModifier
67
from llmcompressor.utils import dispatch_for_generation
7-
from compressed_tensors.quantization import QuantizationScheme, QuantizationArgs
88

99
# Select model and load it.
1010
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

experimental/attention/llama3_attention_r3_nvfp4.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1+
from compressed_tensors.quantization import QuantizationScheme
2+
from compressed_tensors.quantization.quant_scheme import NVFP4
13
from datasets import load_dataset
24
from transformers import AutoModelForCausalLM, AutoTokenizer
35

46
from llmcompressor import oneshot
57
from llmcompressor.modifiers.quantization import QuantizationModifier
68
from llmcompressor.modifiers.transform import SpinQuantModifier
79
from llmcompressor.utils import dispatch_for_generation
8-
from compressed_tensors.quantization import QuantizationScheme
9-
from compressed_tensors.quantization.quant_scheme import NVFP4
1010

1111
# Select model and load it.
1212
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

src/llmcompressor/entrypoints/oneshot.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@
3131
from datasets import Dataset, DatasetDict
3232

3333

34+
TOKENIZERS_PARALLELISM_ENV = "TOKENIZERS_PARALLELISM"
35+
36+
3437
class Oneshot:
3538
"""
3639
Class responsible for carrying out one-shot calibration on a pretrained model.
@@ -121,6 +124,19 @@ def __init__(
121124
:param log_dir: Path to save logs during oneshot run.
122125
Nothing is logged to file if None.
123126
"""
127+
# Disable tokenizer parallelism to prevent warning when using
128+
# multiprocessing for dataset preprocessing. The warning occurs because
129+
# FastTokenizer's internal threading conflicts with dataset.map's num_proc.
130+
# See: https://github.com/vllm-project/llm-compressor/issues/2007
131+
if TOKENIZERS_PARALLELISM_ENV not in os.environ:
132+
os.environ[TOKENIZERS_PARALLELISM_ENV] = "false"
133+
logger.warning(
134+
"Disabling tokenizer parallelism due to threading conflict between "
135+
"FastTokenizer and Datasets. Set "
136+
f"{TOKENIZERS_PARALLELISM_ENV}=false to "
137+
"suppress this warning."
138+
)
139+
124140
# Set up file logging (no default files):
125141
# 1) If LLM_COMPRESSOR_LOG_FILE is set, log to that file.
126142
# 2) Else, if an explicit log_dir is provided, create a timestamped file there.
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import os
2+
3+
import pytest
4+
5+
from llmcompressor.entrypoints.oneshot import (
6+
TOKENIZERS_PARALLELISM_ENV as _TOKENIZERS_PARALLELISM_ENV,
7+
)
8+
9+
10+
class TestTokenizerParallelism:
11+
"""Tests for tokenizer parallelism warning suppression (issue #2007)."""
12+
13+
def test_oneshot_sets_tokenizers_parallelism_when_not_set(self, monkeypatch):
14+
"""
15+
Test that Oneshot sets TOKENIZERS_PARALLELISM=false when not already set.
16+
17+
This prevents the warning:
18+
"huggingface/tokenizers: The current process just got forked, after
19+
parallelism has already been used. Disabling parallelism to avoid deadlocks..."
20+
21+
See: https://github.com/vllm-project/llm-compressor/issues/2007
22+
"""
23+
monkeypatch.delenv(_TOKENIZERS_PARALLELISM_ENV, raising=False)
24+
25+
from llmcompressor.entrypoints.oneshot import Oneshot
26+
27+
# Create a minimal Oneshot instance to trigger __init__
28+
# We expect it to fail due to missing model, but the env var should be set
29+
with pytest.raises(Exception):
30+
Oneshot(model="nonexistent-model")
31+
32+
assert os.environ[_TOKENIZERS_PARALLELISM_ENV] == "false"
33+
34+
def test_oneshot_respects_existing_tokenizers_parallelism(self, monkeypatch):
35+
"""
36+
Test that Oneshot respects user's existing TOKENIZERS_PARALLELISM setting.
37+
38+
If a user has explicitly set TOKENIZERS_PARALLELISM, we should not override it.
39+
"""
40+
monkeypatch.setenv(_TOKENIZERS_PARALLELISM_ENV, "true")
41+
42+
from llmcompressor.entrypoints.oneshot import Oneshot
43+
44+
with pytest.raises(Exception):
45+
Oneshot(model="nonexistent-model")
46+
47+
assert os.environ[_TOKENIZERS_PARALLELISM_ENV] == "true"

tools/collect_env.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
creating bug reports. See `.github/ISSUE_TEMPLATE/bug_report.md`
44
"""
55

6+
import importlib
67
import platform
78
import sys
8-
import importlib
99

1010

1111
def get_version(pkg_name):

0 commit comments

Comments (0)