
Commit 2d87993

Merge remote-tracking branch 'origin' into kylesayrs/sequential-onloading
2 parents 96476fe + 8b2c612


48 files changed: +273, -86 lines

Makefile

Lines changed: 2 additions & 2 deletions
@@ -26,14 +26,14 @@ quality:
 	@echo "Running python quality checks";
 	ruff check $(CHECKDIRS);
 	isort --check-only $(CHECKDIRS);
-	flake8 $(CHECKDIRS) --max-line-length 88 --extend-ignore E203;
+	flake8 $(CHECKDIRS) --max-line-length 88 --extend-ignore E203,W605;

 # style the code according to accepted standards for the repo
 style:
 	@echo "Running python styling";
 	ruff format $(CHECKDIRS);
 	isort $(CHECKDIRS);
-	flake8 $(CHECKDIRS) --max-line-length 88 --extend-ignore E203;
+	flake8 $(CHECKDIRS) --max-line-length 88 --extend-ignore E203,W605;

 # run tests for the repo
 test:
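For context: flake8's W605 flags invalid escape sequences in ordinary (non-raw) string literals. The `re:`-prefixed ignore patterns introduced elsewhere in this commit (see the gemma3_example.py diff below) write `\.` inside plain strings, which is exactly what W605 would flag. A minimal sketch of the two spellings (illustrative, not part of the commit):

import re

pattern_plain = "model\.vision_tower.*"  # flake8 W605: "\." is not a valid escape
pattern_raw = r"model\.vision_tower.*"   # raw string: same regex, no warning

# Python leaves unrecognized escapes like "\." in place, so both strings
# compile to the same pattern; only the linter treats them differently.
assert re.match(pattern_plain, "model.vision_tower.encoder")
assert re.match(pattern_raw, "model.vision_tower.encoder")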

examples/awq/llama_example.py

Lines changed: 1 addition & 1 deletion
@@ -70,6 +70,6 @@ def tokenize(sample):
 print("==========================================\n\n")

 # Save to disk compressed.
-SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-asym"
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-asym"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)
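The `rstrip("/")` guard matters when MODEL_ID is a local path with a trailing slash: `split("/")[-1]` on such a string returns an empty basename, so the model would be saved to a directory named only after the suffix. A quick illustration with a hypothetical local path:

MODEL_ID = "/models/Meta-Llama-3-8B-Instruct/"  # hypothetical local checkout

print(MODEL_ID.split("/")[-1] + "-awq-asym")              # "-awq-asym"
print(MODEL_ID.rstrip("/").split("/")[-1] + "-awq-asym")  # "Meta-Llama-3-8B-Instruct-awq-asym"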

examples/awq/qwen3_moe_example.py

Lines changed: 1 addition & 1 deletion
@@ -77,6 +77,6 @@ def tokenize(sample):
 print("==========================================\n\n")

 # Save to disk compressed.
-SAVE_DIR = MODEL_ID.split("/")[-1] + "-awq-sym"
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-awq-sym"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 tokenizer.save_pretrained(SAVE_DIR)
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.quantization import QuantizationModifier
+
+MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
+OUTPUT_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-Dynamic"
+
+# Load model
+# Note: device_map="auto" will offload to CPU if not enough space on GPU.
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True
+)
+
+# Configure the quantization scheme and algorithm (PTQ + FP8_DYNAMIC).
+recipe = QuantizationModifier(
+    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]
+)
+
+# Apply quantization and save in `compressed-tensors` format.
+oneshot(
+    model=model,
+    recipe=recipe,
+    tokenizer=AutoTokenizer.from_pretrained(MODEL_ID),
+    output_dir=OUTPUT_DIR,
+)
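The resulting directory is a standard `compressed-tensors` checkpoint, so it can be served directly; a minimal sketch with vLLM, assuming vLLM is installed and the script above has been run:

from vllm import LLM, SamplingParams

# OUTPUT_DIR produced by the script above.
llm = LLM(model="Meta-Llama-3-70B-Instruct-FP8-Dynamic")
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)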
Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
+import torch
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
+from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
+
+MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
+
+# adjust based on the number of desired GPUs
+# reserve_for_hessians=True reserves memory which is required by
+# GPTQModifier and SparseGPTModifier
+device_map = calculate_offload_device_map(
+    MODEL_ID, num_gpus=1, reserve_for_hessians=True, torch_dtype=torch.bfloat16
+)
+
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+# Select calibration dataset.
+DATASET_ID = "HuggingFaceH4/ultrachat_200k"
+DATASET_SPLIT = "train_sft"
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 2048
+
+
+# Load dataset and preprocess.
+ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
+ds = ds.shuffle(seed=42)
+
+
+def preprocess(example):
+    return {
+        "text": tokenizer.apply_chat_template(
+            example["messages"],
+            tokenize=False,
+        )
+    }
+
+
+ds = ds.map(preprocess)
+
+
+# Tokenize inputs.
+def tokenize(sample):
+    return tokenizer(
+        sample["text"],
+        padding=False,
+        max_length=MAX_SEQUENCE_LENGTH,
+        truncation=True,
+        add_special_tokens=False,
+    )
+
+
+ds = ds.map(tokenize, remove_columns=ds.column_names)
+
+# define a llmcompressor recipe for W8A8 quantization
+recipe = [
+    SmoothQuantModifier(smoothing_strength=0.8),
+    GPTQModifier(
+        targets="Linear",
+        scheme="W8A8",
+        ignore=["lm_head"],
+    ),
+]
+
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-INT8"
+
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    save_compressed=True,
+    output_dir=SAVE_DIR,
+)
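When experimenting with `calculate_offload_device_map`, it can help to confirm where modules actually landed before starting calibration. A small sketch, relying on the `hf_device_map` attribute that accelerate attaches to dispatched models:

from collections import Counter

# Tally modules per device: GPU indices, "cpu", or "disk" for offloaded shards.
placement = Counter(model.hf_device_map.values())
for device, count in sorted(placement.items(), key=str):
    print(f"{device}: {count} modules")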
Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.quantization import GPTQModifier
+
+MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8-Dynamic"
+
+# 1) Load model (device_map="auto" will shard the model over multiple GPUs!).
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
+    trust_remote_code=True,
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+# 2) Prepare calibration dataset (in this case, we use ultrachat).
+DATASET_ID = "HuggingFaceH4/ultrachat_200k"
+DATASET_SPLIT = "train_sft"
+
+# Select number of samples. 512 samples is a good place to start.
+# Increasing the number of samples can improve accuracy.
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 1024
+
+# Load dataset and preprocess.
+ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
+ds = ds.shuffle(seed=42)
+
+
+def preprocess(example):
+    return {
+        "text": tokenizer.apply_chat_template(
+            example["messages"],
+            tokenize=False,
+        )
+    }
+
+
+ds = ds.map(preprocess)
+
+
+# Tokenize inputs.
+def tokenize(sample):
+    return tokenizer(
+        sample["text"],
+        padding=False,
+        max_length=MAX_SEQUENCE_LENGTH,
+        truncation=True,
+        add_special_tokens=False,
+    )
+
+
+ds = ds.map(tokenize, remove_columns=ds.column_names)
+
+# 3) Configure algorithms. In this case, we:
+#   * quantize the weights to int8 with GPTQ (static per channel)
+#   * quantize the activations to int8 (dynamic per token)
+recipe = [
+    GPTQModifier(
+        targets="Linear", scheme="W8A8", ignore=["lm_head"], dampening_frac=0.1
+    ),
+]
+
+# 4) Apply algorithms and save in `compressed-tensors` format.
+# if you encounter GPU out-of-memory issues, consider using an explicit
+# device map (see multi_gpus_int8_device_map.py)
+oneshot(
+    model=model,
+    tokenizer=tokenizer,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+    output_dir=SAVE_DIR,
+)
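As in the repo's other examples, a short generation pass is a cheap sanity check before relying on the quantized checkpoint. A minimal sketch using the model and tokenizer already in scope (prompt is illustrative):

# Confirm the quantized model still produces coherent text.
sample = tokenizer("Hello my name is", return_tensors="pt").to(model.device)
output = model.generate(**sample, max_new_tokens=50)
print(tokenizer.decode(output[0]))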

examples/multimodal_audio/whisper_example.py

Lines changed: 1 addition & 1 deletion
@@ -103,6 +103,6 @@ def data_collator(batch):
 # and it was a great thing for what it was at the time but it's not a passive house

 # Save to disk compressed.
-SAVE_DIR = MODEL_ID.split("/")[1] + "-W4A16-G128"
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W4A16-G128"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/gemma3_example.py

Lines changed: 6 additions & 2 deletions
@@ -30,7 +30,11 @@ def data_collator(batch):
     GPTQModifier(
         targets="Linear",
         scheme="W4A16",
-        ignore=["re:*.lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
+        ignore=[
+            "lm_head",
+            "re:model\.vision_tower.*",
+            "re:model\.multi_modal_projector.*",
+        ],
     ),
 ]

@@ -70,6 +74,6 @@ def data_collator(batch):
 print("==========================================")

 # Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 processor.save_pretrained(SAVE_DIR)
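The reworked ignore list anchors the regexes at the real module paths (`model.vision_tower...`, `model.multi_modal_projector...`) and replaces the malformed `re:*.lm_head` pattern with a plain name match. A small sketch of how such patterns resolve against hypothetical Gemma 3 module names (names are illustrative; matching semantics assume `re.match`-style anchoring at the start):

import re

patterns = [r"model\.vision_tower.*", r"model\.multi_modal_projector.*"]
names = [
    "lm_head",
    "model.language_model.layers.0.self_attn.q_proj",
    "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1",
    "model.multi_modal_projector.mm_input_projection",
]

for name in names:
    ignored = name == "lm_head" or any(re.match(p, name) for p in patterns)
    print(f"{name}: {'ignored' if ignored else 'quantized'}")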

examples/multimodal_vision/idefics3_example.py

Lines changed: 1 addition & 1 deletion
@@ -115,6 +115,6 @@ def tokenize(sample):
 print("==========================================")

 # Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 processor.save_pretrained(SAVE_DIR)

examples/multimodal_vision/llava_example.py

Lines changed: 1 addition & 1 deletion
@@ -70,6 +70,6 @@ def data_collator(batch):
 print("==========================================")

 # Save to disk compressed.
-SAVE_DIR = model_id.split("/")[1] + "-W4A16-G128"
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
 model.save_pretrained(SAVE_DIR, save_compressed=True)
 processor.save_pretrained(SAVE_DIR)
