|
22 | 22 |
|
23 | 23 | from properties_manager import PropertiesManager |
24 | 24 | from huggingface_hub import snapshot_download |
25 | | -from datasets import load_dataset |
26 | 25 |
|
27 | 26 | from utils import (get_partition_cmd, extract_python_jar, |
28 | 27 | get_python_executable, get_download_dir, |
@@ -217,8 +216,8 @@ def run_quantization(self): |
217 | 216 | self.properties_manager.generate_properties_file() |
218 | 217 | self.upload_checkpoints_to_s3() |
219 | 218 | elif quant_method == 'fp8': |
220 | | - logging.info("Running AutoFP8 quantization") |
221 | | - self.autofp8_quantize() |
| 219 | + logging.info("Running FP8 quantization") |
| 220 | + self.fp8_quantize() |
222 | 221 | self.properties_manager.generate_properties_file() |
223 | 222 | self.upload_checkpoints_to_s3() |
224 | 223 | else: |
@@ -266,67 +265,52 @@ def autoawq_quantize(self): |
266 | 265 | raise ImportError( |
267 | 266 | "AutoAWQ is not installed. Failing during quantization.") |
268 | 267 |
|
def fp8_quantize(self):
    """
    Quantizes the model to FP8 using llm-compressor.

    Recipe: simple PTQ + FP8 weight & activation quantization, then saves
    the quantized model and tokenizer to the checkpoint path.

    Options read from ``self.properties``:
        option.save_mp_checkpoint_path: output directory (required).
        option.fp8_scheme: quantization scheme passed to
            QuantizationModifier (default ``"FP8"``).
        option.fp8_ignore: comma-separated module names to leave
            unquantized (default ``"lm_head"``).
        option.calib_size: number of calibration samples
            (default 512; only used for non-dynamic schemes).
        option.max_model_len: max sequence length for calibration
            (default 2048; only used for non-dynamic schemes).

    Raises:
        KeyError: if ``option.save_mp_checkpoint_path`` is missing.
        ImportError: if llm-compressor / transformers are not installed.
    """
    # Imported lazily so the module loads even when llm-compressor is
    # only installed for fp8 quantization jobs.
    from llmcompressor.modifiers.quantization import QuantizationModifier
    from llmcompressor.transformers import oneshot
    from transformers import AutoModelForCausalLM

    # initialize configs and model
    hf_configs, tokenizer = load_hf_config_and_tokenizer(self.properties)
    output_path = self.properties['option.save_mp_checkpoint_path']
    model = AutoModelForCausalLM.from_pretrained(
        hf_configs.model_id_or_path, **hf_configs.kwargs)

    # parse options and define quantization recipe
    quant_config = {
        "targets": "Linear",
        "scheme": self.properties.get("option.fp8_scheme", "FP8"),
        "ignore": [
            s.strip() for s in self.properties.get(
                "option.fp8_ignore", "lm_head").split(',')
        ],
    }
    recipe = QuantizationModifier(**quant_config)

    oneshot_kwargs = {
        "model": model,
        "recipe": recipe,
    }
    # Dynamic activation schemes compute scales at runtime and need no
    # calibration data; static schemes calibrate on cnn_dailymail.
    if "dynamic" not in recipe.scheme:
        oneshot_kwargs["dataset"] = "cnn_dailymail"
        oneshot_kwargs["num_calibration_samples"] = int(
            self.properties.get("option.calib_size", 512))
        oneshot_kwargs["max_seq_length"] = int(
            self.properties.get("option.max_model_len", 2048))

    logging.info(
        f"Using the following configurations for fp8 quantization: {oneshot_kwargs}"
    )
    oneshot(**oneshot_kwargs)
    logging.info(f"Quantization complete. Saving model to: {output_path}")
    model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)
330 | 314 |
|
331 | 315 |
|
332 | 316 | def main(): |
|
0 commit comments