Skip to content

Commit d4f5ee7

Browse files
authored
AutoFP8 to llmcompressor migration for FP8 quantization (#2701)
1 parent 6e0cca2 commit d4f5ee7

File tree

3 files changed

+53
-60
lines changed

3 files changed

+53
-60
lines changed

serving/docker/lmi-container-requirements-common.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,11 @@ onnx
2323
sentence_transformers
2424
onnxruntime-gpu==1.20.0
2525
autoawq==0.2.5
26+
llmcompressor==0.3.1
2627
tokenizers==0.20.3
2728
pydantic==2.9.2
2829
optimum==1.23.2
2930
torch==2.5.1
3031
torchvision==0.20.1
3132
# sequence scheduler wheel for hf accelerate rolling batch
32-
https://publish.djl.ai/seq_scheduler/seq_scheduler-0.1.0-py3-none-any.whl
33+
https://publish.djl.ai/seq_scheduler/seq_scheduler-0.1.0-py3-none-any.whl

serving/docker/partition/partition.py

Lines changed: 43 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222

2323
from properties_manager import PropertiesManager
2424
from huggingface_hub import snapshot_download
25-
from datasets import load_dataset
2625

2726
from utils import (get_partition_cmd, extract_python_jar,
2827
get_python_executable, get_download_dir,
@@ -217,8 +216,8 @@ def run_quantization(self):
217216
self.properties_manager.generate_properties_file()
218217
self.upload_checkpoints_to_s3()
219218
elif quant_method == 'fp8':
220-
logging.info("Running AutoFP8 quantization")
221-
self.autofp8_quantize()
219+
logging.info("Running FP8 quantization")
220+
self.fp8_quantize()
222221
self.properties_manager.generate_properties_file()
223222
self.upload_checkpoints_to_s3()
224223
else:
@@ -266,67 +265,52 @@ def autoawq_quantize(self):
266265
raise ImportError(
267266
"AutoAWQ is not installed. Failing during quantization.")
268267

269-
def autofp8_quantize(self):
268+
def fp8_quantize(self):
270269
"""
271-
Quantizes model using AutoFP8.
270+
Quantizes model using llm-compressor.
271+
Recipe: Simple PTQ + FP8 weight & activation quantization.
272272
"""
273-
# initialize configs
274-
hf_configs, tokenizer = load_hf_config_and_tokenizer(self.properties)
275-
if not tokenizer.pad_token:
276-
tokenizer.pad_token = tokenizer.eos_token
273+
from llmcompressor.modifiers.quantization import QuantizationModifier
274+
from llmcompressor.transformers import oneshot
275+
from transformers import AutoModelForCausalLM
277276

278-
quant_config = {
279-
"activation_scheme":
280-
self.properties.get("option.fp8_activation_scheme", "static"),
277+
# initialize configs and model
278+
hf_configs, tokenizer = load_hf_config_and_tokenizer(self.properties)
279+
output_path = self.properties['option.save_mp_checkpoint_path']
280+
model = AutoModelForCausalLM.from_pretrained(
281+
hf_configs.model_id_or_path, **hf_configs.kwargs)
282+
283+
# parse options and define quantization recipe
284+
quant_config = {"targets": "Linear"}
285+
quant_config["scheme"] = self.properties.get("option.fp8_scheme",
286+
"FP8")
287+
quant_config["ignore"] = [
288+
s.strip() for s in self.properties.get("option.fp8_ignore",
289+
"lm_head").split(',')
290+
]
291+
recipe = QuantizationModifier(**quant_config)
292+
293+
# calibration dataset options
294+
oneshot_kwargs = {
295+
"model": model,
296+
"recipe": recipe,
281297
}
282-
if self.properties.get("option.fp8_kv_cache_quant_targets"):
283-
quant_config["kv_cache_quant_targets"] = tuple([
284-
s.strip() for s in self.properties.get(
285-
"option.fp8_kv_cache_quant_targets").split(',')
286-
])
287-
if self.properties.get("option.fp8_ignore_patterns"):
288-
quant_config["ignore_patterns"] = [
289-
s.strip() for s in self.properties.get(
290-
"option.fp8_ignore_patterns").split(',')
291-
]
292-
293-
# create samples for calibrating scaling factors
294-
if quant_config["activation_scheme"] == "dynamic":
295-
# If using dynamic activation scales, a calibration dataset is not required
296-
examples = []
298+
if "dynamic" in recipe.scheme:
299+
pass
297300
else:
298-
calib_size = int(self.properties.get("option.calib_size", 512))
299-
# Tokenize dataset for calibrating static activation scales
300-
ds = load_dataset("abisee/cnn_dailymail",
301-
"3.0.0",
302-
split="validation").shuffle(seed=42).select(
303-
range(calib_size))
304-
examples = [batch["article"] for batch in ds]
305-
examples = tokenizer(examples,
306-
padding=True,
307-
truncation=True,
308-
return_tensors="pt").to("cuda")
309-
310-
# quantization
311-
try:
312-
from auto_fp8 import AutoFP8ForCausalLM, BaseQuantizeConfig
313-
quantize_config = BaseQuantizeConfig(**quant_config)
314-
logging.info(
315-
f"Using the following configurations for fp8 quantization: {vars(quantize_config)}"
316-
)
317-
model = AutoFP8ForCausalLM.from_pretrained(
318-
hf_configs.model_id_or_path, quantize_config,
319-
**hf_configs.kwargs)
320-
model.quantize(examples)
321-
output_path = self.properties['option.save_mp_checkpoint_path']
322-
logging.info(
323-
f"Quantization complete. Saving model to: {output_path}")
324-
model.save_quantized(output_path)
325-
except ImportError:
326-
logging.error(
327-
"AutoFP8 is not installed. Failing during quantization.")
328-
raise ImportError(
329-
"AutoFP8 is not installed. Failing during quantization.")
301+
oneshot_kwargs["dataset"] = "cnn_dailymail"
302+
oneshot_kwargs["num_calibration_samples"] = int(
303+
self.properties.get("option.calib_size", 512))
304+
oneshot_kwargs["max_seq_length"] = int(
305+
self.properties.get("option.max_model_len", 2048))
306+
307+
logging.info(
308+
        f"Using the following configurations for fp8 quantization: {oneshot_kwargs}"
309+
)
310+
oneshot(**oneshot_kwargs)
311+
logging.info(f"Quantization complete. Saving model to: {output_path}")
312+
model.save_pretrained(output_path)
313+
tokenizer.save_pretrained(output_path)
330314

331315

332316
def main():

serving/docker/partition/sm_neo_quantize.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,11 +99,16 @@ def write_properties(self):
9999
"""
100100
Updates outputted serving.properties.
101101
102+
## tensor parallel degree & device_map
102103
We set option.tensor_parallel_degree & option.device_map for quantization.
103104
This function passes through these values to the outputted serving.properties if received from the customer.
104105
Otherwise, nothing is outputted for these values.
106+
107+
## quantization
108+
For FP8 quantization with llm-compressor, vllm requires quantization_method to be set to 'compressed-tensors'
105109
"""
106110
passthrough_properties = {}
111+
# checking if customer set property through envvar or serving.properties.
107112
passthrough_properties[
108113
"option.tensor_parallel_degree"] = os.environ.get(
109114
"OPTION_TENSOR_PARALLEL_DEGREE") if os.environ.get(
@@ -127,6 +132,9 @@ def write_properties(self):
127132
f"User did not pass {k}. Outputted serving.properties "
128133
"will not include this field.")
129134

135+
if output_properties.get("option.quantize") == "fp8":
136+
output_properties["option.quantize"] = "compressed-tensors"
137+
130138
self.properties_manager.properties = output_properties
131139
self.properties_manager.generate_properties_file()
132140

0 commit comments

Comments (0)