Skip to content
Merged
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
26bcbb0
oneshot refac
Jan 28, 2025
8c0a255
comments
Feb 4, 2025
468a714
Merge branch 'main' into oneshot-refac-main
Feb 4, 2025
428a2c7
merge main
Feb 11, 2025
58b3d6a
stashed changes
Feb 11, 2025
380d164
run examples pass
Feb 11, 2025
0ae9ade
pass tests
Feb 11, 2025
9be9174
Merge branch 'main' into oneshot-refac-main
Feb 11, 2025
95bfaa1
add entrypoints
Feb 11, 2025
c4dd9cc
Merge branch 'oneshot-refac-main' of github.com:vllm-project/llm-comp…
Feb 11, 2025
d2ffc4a
update readme on /finetune
Feb 11, 2025
fc4c42f
pass tests
Feb 11, 2025
09942b7
pass tests
Feb 11, 2025
00dd629
add readme and remove breakpoint
Feb 12, 2025
723b6b5
update read me
Feb 12, 2025
d140f11
comments
Feb 12, 2025
e208a69
GPUS to GPUs
Feb 12, 2025
d1855cf
add deprecation warning
Feb 12, 2025
af8515d
update readme
Feb 13, 2025
3f7c3ac
update sparse24 in readme
Feb 13, 2025
a8517b0
-s
Feb 13, 2025
0dd5987
update bf16
Feb 13, 2025
1d97841
comments
Feb 13, 2025
b5112cd
fix bug on processor
Feb 13, 2025
1929938
Merge branch 'main' into oneshot-refac-main
Feb 13, 2025
7402143
fix preprocess logic
Feb 14, 2025
0f86cf3
Merge branch 'oneshot-refac-main' of github.com:vllm-project/llm-comp…
Feb 14, 2025
58a7a5a
fix self attr population
Feb 14, 2025
ca7da03
fix test, get torch model not stub
Feb 14, 2025
103fd71
use non-gated model
Feb 14, 2025
7cf5f1a
Merge branch 'main' into oneshot-refac-main
Feb 14, 2025
333dc42
lint
Feb 17, 2025
e7e838f
Merge branch 'main' into oneshot-refac-main
Feb 17, 2025
5a1dccf
update stages
Feb 19, 2025
fb2af8d
Merge branch 'main' into oneshot-refac-main
Feb 19, 2025
ca9f295
revert output_dir name
Feb 19, 2025
a9e8597
Merge branch 'oneshot-refac-main' of github.com:vllm-project/llm-comp…
Feb 19, 2025
d2e6274
Merge branch 'main' into oneshot-refac-main
Feb 19, 2025
5f1d383
Merge branch 'main' into oneshot-refac-main
Feb 20, 2025
eb3094c
comments
Feb 20, 2025
b3a09a4
remove if condition
Feb 24, 2025
fe6b797
Merge branch 'main' into oneshot-refac-main
Feb 24, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ Quantization is applied by selecting an algorithm and calling the `oneshot` API.
```python
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.transformers import oneshot
from llmcompressor import oneshot
from transformers import AutoModelForCausalLM

# Select quantization algorithm. In this case, we:
Expand Down
2 changes: 1 addition & 1 deletion examples/big_models_with_accelerate/cpu_offloading_fp8.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
Expand Down
2 changes: 1 addition & 1 deletion examples/big_models_with_accelerate/multi_gpu_int8.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic"
Expand Down
2 changes: 1 addition & 1 deletion examples/multimodal_audio/whisper_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
from datasets import load_dataset
from transformers import WhisperProcessor

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import TraceableWhisperForConditionalGeneration

# Select model and load it.
Expand Down
2 changes: 1 addition & 1 deletion examples/multimodal_vision/idefics3_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from PIL import Image
from transformers import AutoProcessor

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import TraceableIdefics3ForConditionalGeneration

# Load model.
Expand Down
2 changes: 1 addition & 1 deletion examples/multimodal_vision/llava_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from PIL import Image
from transformers import AutoProcessor

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import TraceableLlavaForConditionalGeneration

# Load model.
Expand Down
2 changes: 1 addition & 1 deletion examples/multimodal_vision/mllama_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from PIL import Image
from transformers import AutoProcessor

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import TraceableMllamaForConditionalGeneration

# Load model.
Expand Down
2 changes: 1 addition & 1 deletion examples/multimodal_vision/phi3_vision_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoProcessor

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot

# Load model.
model_id = "microsoft/Phi-3-vision-128k-instruct"
Expand Down
2 changes: 1 addition & 1 deletion examples/multimodal_vision/pixtral_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from PIL import Image
from transformers import AutoProcessor

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import TraceableLlavaForConditionalGeneration

# Load model.
Expand Down
2 changes: 1 addition & 1 deletion examples/multimodal_vision/qwen2_vl_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import TraceableQwen2VLForConditionalGeneration

# Load model.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
dataset = "ultrachat-200k"

# save location of quantized model
output_dir = "output_llama7b_2of4_w4a16_channel"
output_dir = "output_llama7b_2of4_w4a16_channel-refac"

# set dataset config parameters
splits = {"calibration": "train_gen[:5%]", "train": "train_gen"}
Expand All @@ -33,6 +33,7 @@
bf16 = False # using full precision for training
lr_scheduler_type = "cosine"
warmup_ratio = 0.1
preprocessing_num_workers = 8

# this will run the recipe stage by stage:
# oneshot sparsification -> finetuning -> oneshot quantization
Expand All @@ -52,6 +53,7 @@
learning_rate=learning_rate,
lr_scheduler_type=lr_scheduler_type,
warmup_ratio=warmup_ratio,
preprocessing_num_workers=preprocessing_num_workers,
)
logger.info(
"Note: llmcompressor does not currently support running ",
Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_kv_cache/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ Configure and apply the FP8 quantization for weights, activations, and KV cache.
Notice the new `kv_cache_scheme` section:

```python
from llmcompressor.transformers import oneshot
from llmcompressor import oneshot

recipe = """
quant_stage:
Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_kv_cache/gemma2_fp8_kv_example.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.transformers import oneshot
from llmcompressor import oneshot

# Select model and load it.
MODEL_ID = "google/gemma-2-9b-it"
Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_kv_cache/llama3_fp8_kv_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from loguru import logger
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.transformers import oneshot
from llmcompressor import oneshot

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_kv_cache/phi3.5_fp8_kv_example.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.transformers import oneshot
from llmcompressor import oneshot

# Select model and load it.
# Phi-3.5 is a special case for KV cache quantization because it has
Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w4a16/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ In our case, we will apply the default GPTQ recipe for `int4` (which uses static
> See the `Recipes` documentation for more information on making complex recipes

```python
from llmcompressor.transformers import oneshot
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

# Configure the quantization algorithm to run.
Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w8a8_fp8/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ We recommend targeting all `Linear` layers using the `FP8_DYNAMIC` scheme, which
Since simple PTQ does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow.

```python
from llmcompressor.transformers import oneshot
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

# Configure the simple PTQ quantization
Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w8a8_fp8/gemma2_example.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "google/gemma-2-27b-it"

Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w8a8_fp8/llama3.2_vision_example.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from transformers import AutoProcessor, MllamaForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"

Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w8a8_fp8/llama3_example.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w8a8_fp8/llava1.5_example.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from transformers import AutoProcessor, LlavaForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "llava-hf/llava-1.5-7b-hf"

Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w8a8_fp8/qwen2vl_example.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"

Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w8a8_fp8/whisper_example.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from datasets import load_dataset
from transformers import AutoProcessor, WhisperForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "openai/whisper-large-v2"

Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w8a8_int8/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ We first select the quantization algorithm. For W8A8, we want to:
> See the `Recipes` documentation for more information on recipes

```python
from llmcompressor.transformers import oneshot
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier

Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w8a8_int8/gemma2_example.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot

# 1) Select model and load it.
MODEL_ID = "google/gemma-2-2b-it"
Expand Down
2 changes: 1 addition & 1 deletion examples/quantization_w8a8_int8/llama3_example.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.transformers import oneshot

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
Expand Down
4 changes: 2 additions & 2 deletions examples/quantizing_moe/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ NOTE: `.*block_sparse_moe.gate` layers do not quantize well, hence they are igno
The `oneshot` method applies the selected recipe to your model and dataset without requiring any fine-tuning. The model will be sparsified and saved to `Mixtral-8x7B-Instruct-v0.1-FP8`.

```python
from llmcompressor.transformers import oneshot
from llmcompressor import oneshot

output_dir = "Mixtral-8x7B-Instruct-v0.1-FP8"

Expand All @@ -61,7 +61,7 @@ oneshot(
recipe=recipe,
save_compressed=True,
output_dir=output_dir,
overwrite_output_dir=True,

max_seq_length=2048,
num_calibration_samples=512,
)
Expand Down
2 changes: 1 addition & 1 deletion examples/quantizing_moe/deepseek_moe_w4a16.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.transformers import oneshot
from llmcompressor import oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

# NOTE: transformers 4.48.0 has an import error with DeepSeek.
Expand Down
2 changes: 1 addition & 1 deletion examples/quantizing_moe/deepseek_moe_w8a8_fp8.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

# NOTE: transformers 4.48.0 has an import error with DeepSeek.
# Please consider either downgrading your transformers version to a
Expand Down
2 changes: 1 addition & 1 deletion examples/quantizing_moe/deepseek_moe_w8a8_int8.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

# NOTE: transformers 4.48.0 has an import error with DeepSeek.
Expand Down
3 changes: 1 addition & 2 deletions examples/quantizing_moe/mixtral_moe_w8a8_fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"
Expand Down Expand Up @@ -45,7 +45,6 @@
max_seq_length=MAX_SEQ_LENGTH,
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
save_compressed=SAVE_COMPRESSED,
overwrite_output_dir=True,
output_dir=SAVE_DIR,
)

Expand Down
2 changes: 1 addition & 1 deletion examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.obcq import SparseGPTModifier
from llmcompressor.modifiers.pruning import ConstantPruningModifier
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot

# Configuration
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
Expand Down
11 changes: 0 additions & 11 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,17 +87,6 @@
"pre-commit",
]
},
entry_points={
"console_scripts": [
"llmcompressor.transformers.text_generation.apply=llmcompressor.transformers.finetune.text_generation:apply", # noqa 501
"llmcompressor.transformers.text_generation.compress=llmcompressor.transformers.finetune.text_generation:apply", # noqa 501
"llmcompressor.transformers.text_generation.train=llmcompressor.transformers.finetune.text_generation:train", # noqa 501
"llmcompressor.transformers.text_generation.finetune=llmcompressor.transformers.finetune.text_generation:train", # noqa 501
"llmcompressor.transformers.text_generation.eval=llmcompressor.transformers.finetune.text_generation:eval", # noqa 501
"llmcompressor.transformers.text_generation.oneshot=llmcompressor.transformers.finetune.text_generation:oneshot", # noqa 501
"llmcompressor.trace=llmcompressor.transformers.tracing.debug:main",
]
},
python_requires=">=3.8",
classifiers=[
"Development Status :: 5 - Production/Stable",
Expand Down
2 changes: 1 addition & 1 deletion src/llmcompressor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,11 @@

from llmcompressor.core.session_functions import (
active_session,
apply,
callbacks,
create_session,
finalize,
initialize,
pre_initialize_structure,
reset_session,
)
from llmcompressor.entrypoints import Oneshot, oneshot
4 changes: 2 additions & 2 deletions src/llmcompressor/args/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Parsers in `llm-compressor` define the input arguments required for various entr
Each entry point (e.g., oneshot) carries out its logic based on the provided input arguments, `model`, `recipe`, and `dataset`.

```python
from llmcompressor.transformers import oneshot
from llmcompressor import oneshot

model = ...
recipe = ...
Expand All @@ -24,7 +24,7 @@ oneshot(

These input arguments can be overloaded into the function signature and will be parsed using Hugging Face's [argument parser](https://github.com/huggingface/transformers/blob/main/src/transformers/hf_argparser.py). The parsers define the acceptable inputs; therefore any arguments to be passed in must be defined.

`llm-compressor` uses four parsers, located in `llm_compressor/arg_parser`:
`llm-compressor` uses four parsers, located in `llm_compressor/args`:
* ModelArguments
* DatasetArguments
* RecipeArguments
Expand Down
Loading
Loading