docs/guides/saving_a_model.md (1 addition, 1 deletion)

@@ -41,7 +41,7 @@ The simplest approach is to use `oneshot`, which handles both compression and wr
 ```python
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # Load model
 model = AutoModelForCausalLM.from_pretrained("your-model")
@@ -2,7 +2,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 from llmcompressor.modifiers.transform.smoothquant import SmoothQuantModifier
 
 # Select model and load it.
examples/multimodal_audio/whisper_example.py (1 addition, 1 deletion)

@@ -8,7 +8,7 @@
 )
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # Select model and load it.
 MODEL_ID = "openai/whisper-large-v3"

examples/multimodal_vision/gemma3_example.py (1 addition, 1 deletion)

@@ -4,7 +4,7 @@
 from transformers import AutoProcessor, Gemma3ForConditionalGeneration
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # Load model.
 model_id = "google/gemma-3-4b-it"

examples/multimodal_vision/idefics3_example.py (1 addition, 1 deletion)

@@ -6,7 +6,7 @@
 from transformers import AutoProcessor, Idefics3ForConditionalGeneration
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # Load model.
 model_id = "HuggingFaceM4/Idefics3-8B-Llama3"  # or "HuggingFaceTB/SmolVLM-Instruct"

examples/multimodal_vision/internvl3_example.py (1 addition, 1 deletion)

@@ -3,7 +3,7 @@
 from transformers import AutoModelForImageTextToText, AutoProcessor
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # Load model.
 model_id = "OpenGVLab/InternVL3-8B-hf"

examples/multimodal_vision/llama4_example.py (1 addition, 1 deletion)

@@ -3,7 +3,7 @@
 from transformers import Llama4ForConditionalGeneration, Llama4Processor
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # Select model and load it.
 model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

examples/multimodal_vision/llava_example.py (1 addition, 1 deletion)

@@ -4,7 +4,7 @@
 from transformers import AutoProcessor, LlavaForConditionalGeneration
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # Load model.
 model_id = "llava-hf/llava-1.5-7b-hf"

examples/multimodal_vision/medgemma_example.py (1 addition, 1 deletion)

@@ -4,7 +4,7 @@
 from transformers import AutoProcessor, Gemma3ForConditionalGeneration
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # Load model.
 model_id = "google/medgemma-27b-it"

examples/multimodal_vision/mistral3_example.py (1 addition, 1 deletion)

@@ -12,7 +12,7 @@
 )
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # Load model.
 model_id = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"

examples/multimodal_vision/mllama_example.py (1 addition, 1 deletion)

@@ -4,7 +4,7 @@
 from transformers import AutoProcessor, MllamaForConditionalGeneration
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # Load model.
 model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

examples/multimodal_vision/phi3_vision_example.py (1 addition, 1 deletion)

@@ -7,7 +7,7 @@
 from transformers import AutoModelForCausalLM, AutoProcessor
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # Load model.
 model_id = "microsoft/Phi-3-vision-128k-instruct"

examples/multimodal_vision/pixtral_example.py (1 addition, 1 deletion)

@@ -9,7 +9,7 @@
 )
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # Load model.
 model_id = "mgoin/pixtral-12b"

examples/multimodal_vision/qwen2_vl_example.py (1 addition, 1 deletion)

@@ -11,7 +11,7 @@
 from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # Load model.
 model_id = "Qwen/Qwen2-VL-2B-Instruct"

examples/multimodal_vision/qwen3_omni_example.py (1 addition, 1 deletion)

@@ -10,7 +10,7 @@
 
 from llmcompressor import oneshot
 from llmcompressor.modeling.patch.qwen3_omni_patch import fast_pos_embed_interpolate
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 from llmcompressor.transformers.compression.compressed_tensors_utils import (
     modify_save_pretrained,
 )

examples/multimodal_vision/qwen_2_5_vl_example.py (1 addition, 1 deletion)

@@ -11,7 +11,7 @@
 from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # Load model.
 model_id = "Qwen/Qwen2.5-VL-7B-Instruct"
@@ -6,7 +6,7 @@
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.awq import AWQMapping, AWQModifier
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 
 def parse_args():
examples/quantization_w4a16/README.md (1 addition, 1 deletion)

@@ -85,7 +85,7 @@ In our case, we will apply the default GPTQ recipe for `int4` (which uses static
 ```python
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # Configure the quantization algorithm to run.
 recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
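For context on the hunk above, the README goes on to apply this recipe with `oneshot`. Here is a minimal end-to-end sketch under the new import path; the calibration dataset, sample counts, and save directory are illustrative assumptions, not part of this diff:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.gptq import GPTQModifier  # new import path

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Default GPTQ int4 recipe; lm_head stays in higher precision.
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])

# Calibrate and quantize in one shot. The dataset name and sample count
# here are assumptions for illustration only.
oneshot(
    model=model,
    dataset="open_platypus",
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
)

# Save the compressed checkpoint next to its tokenizer.
save_dir = model_id.split("/")[-1] + "-W4A16"
model.save_pretrained(save_dir, save_compressed=True)
tokenizer.save_pretrained(save_dir)
```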
examples/quantization_w4a16/llama3_ddp_example.py (1 addition, 1 deletion)

@@ -14,7 +14,7 @@
 
 from llmcompressor import oneshot
 from llmcompressor.datasets.utils import get_rank_partition
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
 

examples/quantization_w4a16/llama3_example.py (1 addition, 1 deletion)

@@ -3,7 +3,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # Select model and load it.
 model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

examples/quantization_w4a16_fp4/mxfp4/llama3_example.py (1 addition, 1 deletion)

@@ -3,7 +3,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # Select model and load it.
 model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

examples/quantization_w4a4_fp4/llama3_gptq_example.py (1 addition, 1 deletion)

@@ -10,7 +10,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # Select model and load it.
 model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

examples/quantization_w4a8_fp8/llama3_example.py (1 addition, 1 deletion)

@@ -3,7 +3,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # Select model and load it.
 model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
examples/quantization_w8a8_int8/README.md (1 addition, 1 deletion)

@@ -85,7 +85,7 @@ We first select the quantization algorithm. For W8A8, we want to:
 
 ```python
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 from llmcompressor.modifiers.transform.smoothquant import SmoothQuantModifier
 
 # Configure the quantization algorithms to run.
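The recipe this README builds is cut off in the hunk above. A plausible reconstruction with the updated imports follows; the smoothing strength and scheme string are assumptions based on the surrounding W8A8 text, not taken from this diff:

```python
from llmcompressor.modifiers.gptq import GPTQModifier
from llmcompressor.modifiers.transform.smoothquant import SmoothQuantModifier

# SmoothQuant migrates activation outliers into the weights, then GPTQ
# quantizes the weights to int8; activations are quantized dynamically.
recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
]
```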
examples/quantization_w8a8_int8/gemma2_example.py (1 addition, 1 deletion)

@@ -3,7 +3,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # 1) Select model and load it.
 MODEL_ID = "google/gemma-2-2b-it"

examples/quantization_w8a8_int8/llama3_example.py (1 addition, 1 deletion)

@@ -3,7 +3,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 from llmcompressor.modifiers.transform.smoothquant import SmoothQuantModifier
 
 # Select model and load it.

examples/quantizing_moe/deepseek_r1_example.py (1 addition, 1 deletion)

@@ -2,7 +2,7 @@
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # Select model and load it.
 

examples/quantizing_moe/qwen_example.py (1 addition, 1 deletion)

@@ -4,7 +4,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 # select a Mixture of Experts model for quantization
 MODEL_ID = "Qwen/Qwen1.5-MoE-A2.7B-Chat"
src/llmcompressor/modifiers/README.md (1 addition, 1 deletion)

@@ -37,7 +37,7 @@ bin using a scale and (optional) zero point. This basic quantization algorithm is
 suitable for FP8 quantization. A variety of quantization schemes are supported via the
 [compressed-tensors](https://github.com/neuralmagic/compressed-tensors) library.
 
-### [GPTQ](./quantization/gptq/base.py)
+### [GPTQ](./gptq/base.py)
 One-shot algorithm that uses calibration data to select the ideal bin for weight quantization.
 This algorithm is applied on top of the basic quantization algorithm, and affects weights only.
 The implementation is based on [GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers](https://arxiv.org/pdf/2210.17323). The algorithm is very similar to SparseGPT: A small amount of calibration data is used
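To make the README's distinction concrete, here is a short sketch contrasting the basic quantization modifier with GPTQ under the new module layout; the scheme strings are illustrative choices, not mandated by this diff:

```python
from llmcompressor.modifiers.gptq import GPTQModifier
from llmcompressor.modifiers.quantization import QuantizationModifier

# Basic quantization: round-to-nearest into bins defined by a scale and
# (optional) zero point; suitable for FP8.
basic = QuantizationModifier(
    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
)

# GPTQ: layered on the same scheme machinery, but uses calibration data
# (Hessian accumulation) to choose weight bins; activations are untouched.
gptq = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
```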
src/llmcompressor/modifiers/factory.py (3 additions)

@@ -48,6 +48,9 @@ def load_from_package(package_path: str) -> dict[str, type[Modifier]]:
     deprecated_packages = [
         "llmcompressor.modifiers.obcq",
         "llmcompressor.modifiers.obcq.sgpt_base",
+        "llmcompressor.modifiers.quantization.gptq",
+        "llmcompressor.modifiers.quantization.gptq.base",
+        "llmcompressor.modifiers.quantization.gptq.gptq_quantize",
     ]
     for _importer, modname, _is_pkg in pkgutil.walk_packages(
         main_package.__path__, package_path + "."
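For intuition, here is a sketch of how a deprecated-packages skip list is typically consumed when walking a package tree; it illustrates the pattern rather than reproducing the exact body of `load_from_package`:

```python
import importlib
import pkgutil
from types import ModuleType


def discover_modules(main_package: ModuleType, package_path: str,
                     deprecated_packages: list[str]) -> dict[str, ModuleType]:
    """Walk a package tree and import every submodule, skipping deprecated
    aliases so moved classes are not registered twice (old and new path)."""
    modules: dict[str, ModuleType] = {}
    for _importer, modname, _is_pkg in pkgutil.walk_packages(
        main_package.__path__, package_path + "."
    ):
        if modname in deprecated_packages:
            continue  # the old path merely re-exports from the new one
        modules[modname] = importlib.import_module(modname)
    return modules
```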
src/llmcompressor/modifiers/gptq/__init__.py (new file, 4 additions)

@@ -0,0 +1,4 @@
+# ruff: noqa
+
+from .base import *
+from .gptq_quantize import *
@@ -22,12 +22,12 @@
 
 from llmcompressor.core import Event, EventType, State
 from llmcompressor.modifiers import Modifier
-from llmcompressor.modifiers.quantization.calibration import update_weight_global_scale
-from llmcompressor.modifiers.quantization.gptq.gptq_quantize import (
+from llmcompressor.modifiers.gptq.gptq_quantize import (
     accumulate_hessian,
     make_empty_hessian,
     quantize_weight,
 )
+from llmcompressor.modifiers.quantization.calibration import update_weight_global_scale
 from llmcompressor.modifiers.quantization.quantization import QuantizationMixin
 from llmcompressor.modifiers.utils import update_fused_layer_weight_global_scales
 from llmcompressor.sentinel import Sentinel
src/llmcompressor/modifiers/quantization/__init__.py (1 addition, 1 deletion)

@@ -1,4 +1,4 @@
 # ruff: noqa
 
-from .gptq import *
+from llmcompressor.modifiers.gptq import *
 from .quantization import *
src/llmcompressor/modifiers/quantization/gptq/__init__.py (9 additions, 1 deletion)

@@ -1,3 +1,11 @@
 # ruff: noqa
+import warnings
 
-from .base import *
+warnings.warn(
+    "Importing from llmcompressor.modifiers.quantization.gptq is deprecated. "
+    "Please import from llmcompressor.modifiers.gptq instead.",
+    DeprecationWarning,
+    stacklevel=2,
+)
+
+from llmcompressor.modifiers.gptq import *
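A quick sketch of what this shim means for downstream code: the old path still resolves, but it emits a `DeprecationWarning` the first time the module is imported (subsequent imports hit the module cache and stay silent):

```python
import warnings

# Preferred, new import path:
from llmcompressor.modifiers.gptq import GPTQModifier

# Old path still works via the shim above, but warns on first import.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    from llmcompressor.modifiers.quantization.gptq import (
        GPTQModifier as OldGPTQModifier,
    )

assert OldGPTQModifier is GPTQModifier  # same class, re-exported
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```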
src/llmcompressor/pipelines/sequential/README.md (1 addition, 1 deletion)

@@ -1,4 +1,4 @@
 # Sequential Pipeline #
 The sequential pipeline is a data pipeline, primarily used for compressing models with the
-[GPTQModifier](/src/llmcompressor/modifiers/quantization/gptq/base.py) or the
+[GPTQModifier](/src/llmcompressor/modifiers/gptq/base.py) or the
 [SparseGPTModifier](/src/llmcompressor/modifiers/pruning/sparsegpt/base.py).
tests/e2e/e2e_utils.py (2 additions, 1 deletion)

@@ -7,7 +7,8 @@
 from transformers import AutoProcessor, DefaultDataCollator
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
+from llmcompressor.modifiers.quantization import QuantizationModifier
 from tests.test_timer.timer_utils import log_time
 from tests.testing_utils import process_dataset
 
tests/llmcompressor/modifiers/quantization/test_base.py (1 addition, 1 deletion)

@@ -3,7 +3,7 @@
 import pytest
 from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme
 
-from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 
 @pytest.fixture
@@ -1,8 +1,8 @@
 import pytest
 import torch
 
+from llmcompressor.modifiers.gptq import GPTQModifier
 from llmcompressor.modifiers.pruning.sparsegpt import SparseGPTModifier
-from llmcompressor.modifiers.quantization.gptq import GPTQModifier
 from tests.llmcompressor.modifiers.conf import LifecyleTestingHarness
 from tests.llmcompressor.pytorch.helpers import LinearNet
@@ -5,7 +5,7 @@
 
 from llmcompressor import oneshot
 from llmcompressor.core.session_functions import reset_session
-from llmcompressor.modifiers.quantization.gptq import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 from llmcompressor.modifiers.transform.smoothquant import SmoothQuantModifier
 from llmcompressor.modifiers.transform.smoothquant.utils import (
     DEFAULT_SMOOTHQUANT_MAPPINGS,
tests/llmcompressor/transformers/gptq/test_gptq_oneshot.py (1 addition, 1 deletion)

@@ -4,7 +4,7 @@
 from transformers import AutoModelForCausalLM
 
 from llmcompressor import oneshot
-from llmcompressor.modifiers.quantization.gptq import GPTQModifier
+from llmcompressor.modifiers.gptq import GPTQModifier
 
 recipe_str = """
 quant_stage: