Skip to content
Merged
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
26bcbb0
oneshot refac
Jan 28, 2025
8c0a255
comments
Feb 4, 2025
468a714
Merge branch 'main' into oneshot-refac-main
Feb 4, 2025
428a2c7
merge main
Feb 11, 2025
58b3d6a
stashed changes
Feb 11, 2025
380d164
run examples pass
Feb 11, 2025
0ae9ade
pass tests
Feb 11, 2025
9be9174
Merge branch 'main' into oneshot-refac-main
Feb 11, 2025
95bfaa1
add entrypoints
Feb 11, 2025
c4dd9cc
Merge branch 'oneshot-refac-main' of github.com:vllm-project/llm-comp…
Feb 11, 2025
d2ffc4a
update readme on /finetune
Feb 11, 2025
fc4c42f
pass tests
Feb 11, 2025
09942b7
pass tests
Feb 11, 2025
00dd629
add readme and remove breakpoint
Feb 12, 2025
723b6b5
update read me
Feb 12, 2025
d140f11
comments
Feb 12, 2025
e208a69
GPUS to GPUs
Feb 12, 2025
d1855cf
add deprecation warning
Feb 12, 2025
af8515d
update readme
Feb 13, 2025
3f7c3ac
update sparse24 in readme
Feb 13, 2025
a8517b0
-s
Feb 13, 2025
0dd5987
update bf16
Feb 13, 2025
1d97841
comments
Feb 13, 2025
b5112cd
fix bug on processor
Feb 13, 2025
1929938
Merge branch 'main' into oneshot-refac-main
Feb 13, 2025
7402143
fix preprocess logic
Feb 14, 2025
0f86cf3
Merge branch 'oneshot-refac-main' of github.com:vllm-project/llm-comp…
Feb 14, 2025
58a7a5a
fix self attr population
Feb 14, 2025
ca7da03
fix test, get torch model not stub
Feb 14, 2025
103fd71
use non-gated model
Feb 14, 2025
7cf5f1a
Merge branch 'main' into oneshot-refac-main
Feb 14, 2025
333dc42
lint
Feb 17, 2025
e7e838f
Merge branch 'main' into oneshot-refac-main
Feb 17, 2025
5a1dccf
update stages
Feb 19, 2025
fb2af8d
Merge branch 'main' into oneshot-refac-main
Feb 19, 2025
ca9f295
revert output_dir name
Feb 19, 2025
a9e8597
Merge branch 'oneshot-refac-main' of github.com:vllm-project/llm-comp…
Feb 19, 2025
d2e6274
Merge branch 'main' into oneshot-refac-main
Feb 19, 2025
5f1d383
Merge branch 'main' into oneshot-refac-main
Feb 20, 2025
eb3094c
comments
Feb 20, 2025
b3a09a4
remove if condition
Feb 24, 2025
fe6b797
Merge branch 'main' into oneshot-refac-main
Feb 24, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ Quantization is applied by selecting an algorithm and calling the `oneshot` API.
```python
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.transformers import oneshot
from llmcompressor import oneshot
from transformers import AutoModelForCausalLM

# Select quantization algorithm. In this case, we:
Expand Down
2 changes: 1 addition & 1 deletion examples/big_models_with_accelerate/cpu_offloading_fp8.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
Expand Down
2 changes: 1 addition & 1 deletion examples/big_models_with_accelerate/multi_gpu_int8.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic"
Expand Down
2 changes: 1 addition & 1 deletion examples/multimodal_audio/whisper_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
from datasets import load_dataset
from transformers import WhisperProcessor

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import TraceableWhisperForConditionalGeneration

# Select model and load it.
Expand Down
2 changes: 1 addition & 1 deletion examples/multimodal_vision/idefics3_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from PIL import Image
from transformers import AutoProcessor

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import TraceableIdefics3ForConditionalGeneration

# Load model.
Expand Down
2 changes: 1 addition & 1 deletion examples/multimodal_vision/llava_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from PIL import Image
from transformers import AutoProcessor

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import TraceableLlavaForConditionalGeneration

# Load model.
Expand Down
2 changes: 1 addition & 1 deletion examples/multimodal_vision/mllama_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from PIL import Image
from transformers import AutoProcessor

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import TraceableMllamaForConditionalGeneration

# Load model.
Expand Down
2 changes: 1 addition & 1 deletion examples/multimodal_vision/phi3_vision_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoProcessor

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot

# Load model.
model_id = "microsoft/Phi-3-vision-128k-instruct"
Expand Down
2 changes: 1 addition & 1 deletion examples/multimodal_vision/pixtral_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from PIL import Image
from transformers import AutoProcessor

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import TraceableLlavaForConditionalGeneration

# Load model.
Expand Down
2 changes: 1 addition & 1 deletion examples/multimodal_vision/qwen2_vl_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import TraceableQwen2VLForConditionalGeneration

# Load model.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
dataset = "ultrachat-200k"

# save location of quantized model
output_dir = "output_llama7b_2of4_w4a16_channel"
output_dir = "output_llama7b_2of4_w4a16_channel-refac"

# set dataset config parameters
splits = {"calibration": "train_gen[:5%]", "train": "train_gen"}
Expand All @@ -33,6 +33,7 @@
bf16 = False # using full precision for training
lr_scheduler_type = "cosine"
warmup_ratio = 0.1
preprocessing_num_workers = 8

# this will run the recipe stage by stage:
# oneshot sparsification -> finetuning -> oneshot quantization
Expand All @@ -52,6 +53,7 @@
learning_rate=learning_rate,
lr_scheduler_type=lr_scheduler_type,
warmup_ratio=warmup_ratio,
preprocessing_num_workers=preprocessing_num_workers,
)
logger.info(
"Note: llmcompressor does not currently support running ",
Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_kv_cache/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ Configure and apply the FP8 quantization for weights, activations, and KV cache.
Notice the new `kv_cache_scheme` section:

```python
from llmcompressor.transformers import oneshot
from llmcompressor import oneshot

recipe = """
quant_stage:
Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_kv_cache/gemma2_fp8_kv_example.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.transformers import oneshot
from llmcompressor import oneshot

# Select model and load it.
MODEL_ID = "google/gemma-2-9b-it"
Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_kv_cache/llama3_fp8_kv_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from loguru import logger
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.transformers import oneshot
from llmcompressor import oneshot

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_kv_cache/phi3.5_fp8_kv_example.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.transformers import oneshot
from llmcompressor import oneshot

# Select model and load it.
# Phi-3.5 is a special case for KV cache quantization because it has
Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w4a16/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ In our case, we will apply the default GPTQ recipe for `int4` (which uses static
> See the `Recipes` documentation for more information on making complex recipes

```python
from llmcompressor.transformers import oneshot
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

# Configure the quantization algorithm to run.
Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w8a8_fp8/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ We recommend targeting all `Linear` layers using the `FP8_DYNAMIC` scheme, which
Since simple PTQ does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow.

```python
from llmcompressor.transformers import oneshot
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# Configure the simple PTQ quantization
Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w8a8_fp8/gemma2_example.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "google/gemma-2-27b-it"

Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w8a8_fp8/llama3.2_vision_example.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from transformers import AutoProcessor, MllamaForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"

Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w8a8_fp8/llama3_example.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w8a8_fp8/llava1.5_example.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from transformers import AutoProcessor, LlavaForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "llava-hf/llava-1.5-7b-hf"

Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w8a8_fp8/qwen2vl_example.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"

Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w8a8_fp8/whisper_example.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from datasets import load_dataset
from transformers import AutoProcessor, WhisperForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "openai/whisper-large-v2"

Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w8a8_int8/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ We first select the quantization algorithm. For W8A8, we want to:
> See the `Recipes` documentation for more information on recipes

```python
from llmcompressor.transformers import oneshot
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w8a8_int8/gemma2_example.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot

# 1) Select model and load it.
MODEL_ID = "google/gemma-2-2b-it"
Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w8a8_int8/llama3_example.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.transformers import oneshot

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
Expand Down
4 changes: 2 additions & 2 deletions examples/quantizing_moe/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ NOTE: `.*block_sparse_moe.gate` layers do not quantize well, hence they are igno
The `oneshot` method applies the selected recipe to your model and dataset without requiring any fine-tuning. The model will be sparsified and saved to `Mixtral-8x7B-Instruct-v0.1-FP8`.

```python
from llmcompressor.transformers import oneshot
from llmcompressor import oneshot

output_dir = "Mixtral-8x7B-Instruct-v0.1-FP8"

Expand All @@ -61,7 +61,7 @@ oneshot(
recipe=recipe,
save_compressed=True,
output_dir=output_dir,
overwrite_output_dir=True,

max_seq_length=2048,
num_calibration_samples=512,
)
Expand Down
2 changes: 1 addition & 1 deletion examples/quantizing_moe/deepseek_moe_w4a16.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.transformers import oneshot
from llmcompressor import oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

# NOTE: transformers 4.48.0 has an import error with DeepSeek.
Expand Down
2 changes: 1 addition & 1 deletion examples/quantizing_moe/deepseek_moe_w8a8_fp8.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

# NOTE: transformers 4.48.0 has an import error with DeepSeek.
# Please consider either downgrading your transformers version to a
Expand Down
2 changes: 1 addition & 1 deletion examples/quantizing_moe/deepseek_moe_w8a8_int8.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

# NOTE: transformers 4.48.0 has an import error with DeepSeek.
Expand Down
3 changes: 1 addition & 2 deletions examples/quantizing_moe/mixtral_moe_w8a8_fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"
Expand Down Expand Up @@ -45,7 +45,6 @@
max_seq_length=MAX_SEQ_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
save_compressed=SAVE_COMPRESSED,
overwrite_output_dir=True,
output_dir=SAVE_DIR,
)

Expand Down
2 changes: 1 addition & 1 deletion examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.obcq import SparseGPTModifier
from llmcompressor.modifiers.pruning import ConstantPruningModifier
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

# Configuration
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
Expand Down
11 changes: 0 additions & 11 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,17 +87,6 @@
"pre-commit",
]
},
entry_points={
"console_scripts": [
"llmcompressor.transformers.text_generation.apply=llmcompressor.transformers.finetune.text_generation:apply", # noqa 501
"llmcompressor.transformers.text_generation.compress=llmcompressor.transformers.finetune.text_generation:apply", # noqa 501
"llmcompressor.transformers.text_generation.train=llmcompressor.transformers.finetune.text_generation:train", # noqa 501
"llmcompressor.transformers.text_generation.finetune=llmcompressor.transformers.finetune.text_generation:train", # noqa 501
"llmcompressor.transformers.text_generation.eval=llmcompressor.transformers.finetune.text_generation:eval", # noqa 501
"llmcompressor.transformers.text_generation.oneshot=llmcompressor.transformers.finetune.text_generation:oneshot", # noqa 501
"llmcompressor.trace=llmcompressor.transformers.tracing.debug:main",
]
},
python_requires=">=3.8",
classifiers=[
"Development Status :: 5 - Production/Stable",
Expand Down
2 changes: 1 addition & 1 deletion src/llmcompressor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,11 @@

from llmcompressor.core.session_functions import (
active_session,
apply,
callbacks,
create_session,
finalize,
initialize,
pre_initialize_structure,
reset_session,
)
from llmcompressor.entrypoints import Oneshot, oneshot
4 changes: 2 additions & 2 deletions src/llmcompressor/args/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Parsers in `llm-compressor` define the input arguments required for various entr
Each entry point (e.g., oneshot) carries out its logic based on the provided input arguments, `model`, `recipe`, and `dataset`.

```python
from llmcompressor.transformers import oneshot
from llmcompressor import oneshot

model = ...
recipe = ...
Expand All @@ -24,7 +24,7 @@ oneshot(

These input arguments can be overloaded into the function signature and will be parsed using Hugging Face's [argument parser](https://github.com/huggingface/transformers/blob/main/src/transformers/hf_argparser.py). The parsers define the acceptable inputs; therefore any arguments to be passed in must be defined.

`llm-compressor` uses four parsers, located in `llm_compressor/arg_parser`:
`llm-compressor` uses four parsers, located in `llm_compressor/args`:
* ModelArguments
* DatasetArguments
* RecipeArguments
Expand Down
Loading
Loading