
Commit 29b2ac9

mvafine, eaidova, nikita-savelyevv and echarlaix authored
Support AWQ models (#1049)
* Support AWQ models
* Add tests
* Add dependencies
* Fix tests
* enable awq export only if ov support it
* fix style (#2)
* disable awq and gptq install for old torch (#3)
* fix style
* disable autogptq and autoawq install for old transformers testing
* separate common quant models patching and gptq (#4)
* disable windows install (#5)
* separate common quant models patching and gptq
* disable awq windows
* skip logits check for quantized models (#6)
* fix test after rebase
* fix testing condition for 2024.6 and unpatch in case if failed
* Fix qwen2-vl tests (#1084)
* Skip private model loading test for external contributors (#1082)
* Fix reshaping unet if timestep is 0d tensor (#1083)
* Disable kv cache compression for fp vlm (#1080)
* add necessary packages in test_openvino_full
* fix code style after rebase (#7)

---------

Co-authored-by: eaidova <[email protected]>
Co-authored-by: Nikita Savelyev <[email protected]>
Co-authored-by: Ella Charlaix <[email protected]>
1 parent ea6fa42 commit 29b2ac9
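With this change, AWQ checkpoints (in addition to GPTQ ones) can be loaded and converted to OpenVINO IR, provided the installed OpenVINO supports it (2024.6 or newer) and autoawq is available. A minimal usage sketch, assuming a hypothetical AWQ-quantized checkpoint ID:

```python
# Sketch only: "example/llama-3-awq" is a placeholder model ID, not part of this PR.
from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM

model_id = "example/llama-3-awq"  # hypothetical AWQ-quantized checkpoint

# export=True converts the PyTorch checkpoint to OpenVINO IR on the fly;
# with this PR the AWQ weights are handled during export (OpenVINO >= 2024.6).
model = OVModelForCausalLM.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```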

File tree

7 files changed: +265 -155 lines changed


.github/workflows/test_openvino.yml

Lines changed: 5 additions & 0 deletions
```diff
@@ -50,6 +50,11 @@ jobs:
         name: Install specific dependencies and versions required for older transformers
         run: |
           pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.30.* transformers_stream_generator
+
+      - if: ${{ matrix.transformers-version == 'latest' && matrix.test-pattern == '*modeling*'}}
+        name: Install auto-gptq, autoawq
+        run: |
+          pip install auto-gptq autoawq --extra-index-url https://download.pytorch.org/whl/cpu
 
       - if: ${{ matrix.test-pattern == '*modeling*' }}
         name: Uninstall NNCF
```

.github/workflows/test_openvino_full.yml

Lines changed: 5 additions & 0 deletions
```diff
@@ -78,6 +78,11 @@ jobs:
         if: ${{ matrix.transformers-version != 'latest' }}
         run: pip install transformers==${{ matrix.transformers-version }}
 
+      - if: ${{ matrix.transformers-version == 'latest' && matrix.os != 'windows-2019' }}
+        name: Install auto-gptq, autoawq
+        run: |
+          pip install auto-gptq autoawq --extra-index-url https://download.pytorch.org/whl/cpu
+
       - name: Pip freeze
         run: pip freeze
```

.github/workflows/test_openvino_slow.yml

Lines changed: 5 additions & 0 deletions
```diff
@@ -49,6 +49,11 @@ jobs:
         name: Install specific dependencies and versions required for older transformers
         run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.30.* transformers_stream_generator
 
+      - if: ${{ matrix.transformers-version == 'latest' && matrix.os != 'windows-2019' }}
+        name: Install auto-gptq, autoawq
+        run: |
+          pip install auto-gptq autoawq --extra-index-url https://download.pytorch.org/whl/cpu
+
       - name: Pip freeze
         run: pip freeze
```
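The steps above install auto-gptq and autoawq only for selected matrix entries (latest transformers, non-Windows runners), so tests that depend on them are expected to be skipped elsewhere. A hedged sketch of how such a guard might look in a test module (the test name is illustrative, not from this PR):

```python
# Illustrative sketch: skip AWQ-dependent tests when autoawq (import name "awq")
# is not installed, e.g. on Windows runners or older-transformers CI legs.
import importlib.util

import pytest

requires_autoawq = pytest.mark.skipif(
    importlib.util.find_spec("awq") is None, reason="autoawq is not installed"
)


@requires_autoawq
def test_awq_checkpoint_exports():  # hypothetical test name
    ...
```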

optimum/exporters/openvino/__main__.py

Lines changed: 150 additions & 139 deletions
```diff
@@ -232,6 +232,7 @@ def main_export(
         )
 
     do_gptq_patching = False
+    do_quant_patching = False
     custom_architecture = False
     patch_16bit = False
     loading_kwargs = model_loading_kwargs or {}
@@ -247,7 +248,11 @@ def main_export(
             trust_remote_code=trust_remote_code,
         )
         quantization_config = getattr(config, "quantization_config", None)
-        do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq"
+        supported_quant_methods = ["gptq"]
+        if is_openvino_version(">=", "2024.6.0"):
+            supported_quant_methods.append("awq")
+        do_quant_patching = quantization_config and quantization_config["quant_method"] in supported_quant_methods
+        do_gptq_patching = do_quant_patching and quantization_config["quant_method"] == "gptq"
         model_type = config.model_type.replace("_", "-")
         if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
             custom_architecture = True
@@ -296,7 +301,6 @@ def main_export(
         if (
             dtype is None
             and framework == "pt"
-            and not do_gptq_patching
             and (
                 task.startswith("text-generation")
                 or getattr(config, "model_type", None) in MULTI_MODAL_TEXT_GENERATION_MODELS
@@ -315,28 +319,28 @@ def main_export(
                 patch_16bit = True
             loading_kwargs["torch_dtype"] = dtype
         # Patch the modules to export of GPTQ models w/o GPU
-        if do_gptq_patching:
-            torch.set_default_dtype(torch.float32)
+        if do_quant_patching:
             orig_cuda_check = torch.cuda.is_available
             torch.cuda.is_available = lambda: True
 
-            from optimum.gptq import GPTQQuantizer
+            if do_gptq_patching:
+                from optimum.gptq import GPTQQuantizer
 
-            orig_post_init_model = GPTQQuantizer.post_init_model
+                orig_post_init_model = GPTQQuantizer.post_init_model
 
-            def post_init_model(self, model):
-                from auto_gptq import exllama_set_max_input_length
+                def post_init_model(self, model):
+                    from auto_gptq import exllama_set_max_input_length
 
-                class StoreAttr(object):
-                    pass
+                    class StoreAttr(object):
+                        pass
 
-                model.quantize_config = StoreAttr()
-                model.quantize_config.desc_act = self.desc_act
-                if self.desc_act and not self.disable_exllama and self.max_input_length is not None:
-                    model = exllama_set_max_input_length(model, self.max_input_length)
-                return model
+                    model.quantize_config = StoreAttr()
+                    model.quantize_config.desc_act = self.desc_act
+                    if self.desc_act and not self.disable_exllama and self.max_input_length is not None:
+                        model = exllama_set_max_input_length(model, self.max_input_length)
+                    return model
 
-            GPTQQuantizer.post_init_model = post_init_model
+                GPTQQuantizer.post_init_model = post_init_model
     elif library_name == "diffusers" and is_openvino_version(">=", "2024.6"):
         dtype = deduce_diffusers_dtype(
             model_name_or_path,
@@ -351,143 +355,150 @@ class StoreAttr(object):
             loading_kwargs["torch_dtype"] = dtype
             patch_16bit = True
 
-    if library_name == "open_clip":
-        model = _OpenClipForZeroShotImageClassification.from_pretrained(model_name_or_path, cache_dir=cache_dir)
-    else:
-        model = TasksManager.get_model_from_task(
-            task,
-            model_name_or_path,
-            subfolder=subfolder,
-            revision=revision,
-            cache_dir=cache_dir,
-            token=token,
-            local_files_only=local_files_only,
-            force_download=force_download,
-            trust_remote_code=trust_remote_code,
-            framework=framework,
-            device=device,
-            library_name=library_name,
-            **loading_kwargs,
-        )
+    try:
+        if library_name == "open_clip":
+            model = _OpenClipForZeroShotImageClassification.from_pretrained(model_name_or_path, cache_dir=cache_dir)
+        else:
+            model = TasksManager.get_model_from_task(
+                task,
+                model_name_or_path,
+                subfolder=subfolder,
+                revision=revision,
+                cache_dir=cache_dir,
+                token=token,
+                local_files_only=local_files_only,
+                force_download=force_download,
+                trust_remote_code=trust_remote_code,
+                framework=framework,
+                device=device,
+                library_name=library_name,
+                **loading_kwargs,
+            )
 
-    needs_pad_token_id = task == "text-classification" and getattr(model.config, "pad_token_id", None) is None
+        needs_pad_token_id = task == "text-classification" and getattr(model.config, "pad_token_id", None) is None
 
-    if needs_pad_token_id:
-        if pad_token_id is not None:
-            model.config.pad_token_id = pad_token_id
-        else:
-            tok = AutoTokenizer.from_pretrained(model_name_or_path)
-            pad_token_id = getattr(tok, "pad_token_id", None)
-            if pad_token_id is None:
-                raise ValueError(
-                    "Could not infer the pad token id, which is needed in this case, please provide it with the --pad_token_id argument"
-                )
-            model.config.pad_token_id = pad_token_id
+        if needs_pad_token_id:
+            if pad_token_id is not None:
+                model.config.pad_token_id = pad_token_id
+            else:
+                tok = AutoTokenizer.from_pretrained(model_name_or_path)
+                pad_token_id = getattr(tok, "pad_token_id", None)
+                if pad_token_id is None:
+                    raise ValueError(
+                        "Could not infer the pad token id, which is needed in this case, please provide it with the --pad_token_id argument"
+                    )
+                model.config.pad_token_id = pad_token_id
 
-    if hasattr(model.config, "export_model_type"):
-        model_type = model.config.export_model_type.replace("_", "-")
-    else:
-        model_type = model.config.model_type.replace("_", "-")
-
-    if (
-        not custom_architecture
-        and library_name != "diffusers"
-        and task + "-with-past"
-        in TasksManager.get_supported_tasks_for_model_type(model_type, exporter="openvino", library_name=library_name)
-    ):
-        # Make -with-past the default if --task was not explicitely specified
-        if original_task == "auto":
-            task = task + "-with-past"
-        else:
-            logger.info(
-                f"The task `{task}` was manually specified, and past key values will not be reused in the decoding."
-                f" if needed, please pass `--task {task}-with-past` to export using the past key values."
-            )
+        if hasattr(model.config, "export_model_type"):
+            model_type = model.config.export_model_type.replace("_", "-")
+        else:
+            model_type = model.config.model_type.replace("_", "-")
+
+        if (
+            not custom_architecture
+            and library_name != "diffusers"
+            and task + "-with-past"
+            in TasksManager.get_supported_tasks_for_model_type(
+                model_type, exporter="openvino", library_name=library_name
+            )
+        ):
+            # Make -with-past the default if --task was not explicitely specified
+            if original_task == "auto":
+                task = task + "-with-past"
+            else:
+                logger.info(
+                    f"The task `{task}` was manually specified, and past key values will not be reused in the decoding."
+                    f" if needed, please pass `--task {task}-with-past` to export using the past key values."
                )
 
-    if original_task == "auto":
-        synonyms_for_task = sorted(TasksManager.synonyms_for_task(task))
-        if synonyms_for_task:
-            synonyms_for_task = ", ".join(synonyms_for_task)
-            possible_synonyms = f" (possible synonyms are: {synonyms_for_task})"
-        else:
-            possible_synonyms = ""
-        logger.info(f"Automatic task detection to {task}{possible_synonyms}.")
+        if original_task == "auto":
+            synonyms_for_task = sorted(TasksManager.synonyms_for_task(task))
+            if synonyms_for_task:
+                synonyms_for_task = ", ".join(synonyms_for_task)
+                possible_synonyms = f" (possible synonyms are: {synonyms_for_task})"
+            else:
+                possible_synonyms = ""
+            logger.info(f"Automatic task detection to {task}{possible_synonyms}.")
 
-    preprocessors = maybe_load_preprocessors(
-        model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code
-    )
+        preprocessors = maybe_load_preprocessors(
+            model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code
+        )
 
-    submodel_paths = export_from_model(
-        model=model,
-        output=output,
-        task=task,
-        ov_config=ov_config,
-        stateful=stateful,
-        model_kwargs=model_kwargs,
-        custom_export_configs=custom_export_configs,
-        fn_get_submodels=fn_get_submodels,
-        preprocessors=preprocessors,
-        device=device,
-        trust_remote_code=trust_remote_code,
-        patch_16bit_model=patch_16bit,
-        **kwargs_shapes,
-    )
+        submodel_paths = export_from_model(
+            model=model,
+            output=output,
+            task=task,
+            ov_config=ov_config,
+            stateful=stateful,
+            model_kwargs=model_kwargs,
+            custom_export_configs=custom_export_configs,
+            fn_get_submodels=fn_get_submodels,
+            preprocessors=preprocessors,
+            device=device,
+            trust_remote_code=trust_remote_code,
+            patch_16bit_model=patch_16bit,
+            **kwargs_shapes,
+        )
 
-    if convert_tokenizer:
-        maybe_convert_tokenizers(library_name, output, model, preprocessors, task=task)
-
-    clear_class_registry()
-    del model
-    gc.collect()
-
-    for submodel_path in submodel_paths:
-        submodel_path = Path(output) / submodel_path
-        submodel = core.read_model(submodel_path)
-
-        quantization_config = None
-        if ov_config is None:
-            num_parameters = 0
-            for op in submodel.get_ops():
-                if op.get_type_name() == "Constant" and op.get_element_type() in [Type.f16, Type.f32, Type.bf16]:
-                    num_parameters += reduce(operator.mul, op.shape, 1)
-                del op
-            if num_parameters >= _MAX_UNCOMPRESSED_SIZE:
-                if is_nncf_available():
-                    quantization_config = {"bits": 8, "sym": False}
-                    logger.info("The model weights will be quantized to int8_asym.")
-                else:
-                    logger.warning(
-                        "The model will be converted with no weights quantization. Quantization of the weights to int8 "
-                        "requires nncf. Please install it with `pip install nncf`"
-                    )
-                    break
-        else:
-            quantization_config = ov_config.quantization_config
-        if quantization_config is None:
-            del submodel
-            gc.collect()
-            continue
+        if convert_tokenizer:
+            maybe_convert_tokenizers(library_name, output, model, preprocessors, task=task)
 
-        if not is_nncf_available():
-            raise ImportError("Quantization of the weights requires nncf, please install it with `pip install nncf`")
+        clear_class_registry()
+        del model
+        gc.collect()
 
-        from optimum.intel.openvino.quantization import _weight_only_quantization
+        for submodel_path in submodel_paths:
+            submodel_path = Path(output) / submodel_path
+            submodel = core.read_model(submodel_path)
+
+            quantization_config = None
+            if ov_config is None:
+                num_parameters = 0
+                for op in submodel.get_ops():
+                    if op.get_type_name() == "Constant" and op.get_element_type() in [Type.f16, Type.f32, Type.bf16]:
+                        num_parameters += reduce(operator.mul, op.shape, 1)
+                    del op
+                if num_parameters >= _MAX_UNCOMPRESSED_SIZE:
+                    if is_nncf_available():
+                        quantization_config = {"bits": 8, "sym": False}
+                        logger.info("The model weights will be quantized to int8_asym.")
+                    else:
+                        logger.warning(
+                            "The model will be converted with no weights quantization. Quantization of the weights to int8 "
+                            "requires nncf. Please install it with `pip install nncf`"
+                        )
+                        break
+            else:
+                quantization_config = ov_config.quantization_config
+            if quantization_config is None:
+                del submodel
+                gc.collect()
+                continue
+
+            if not is_nncf_available():
+                raise ImportError(
+                    "Quantization of the weights requires nncf, please install it with `pip install nncf`"
+                )
 
-        _weight_only_quantization(submodel, quantization_config)
-        compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
-        save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
-        del submodel
-        gc.collect()
+            from optimum.intel.openvino.quantization import _weight_only_quantization
 
-        submodel_path.unlink()
-        submodel_path.with_suffix(".bin").unlink()
-        compressed_submodel_path.rename(submodel_path)
-        compressed_submodel_path.with_suffix(".bin").rename(submodel_path.with_suffix(".bin"))
+            _weight_only_quantization(submodel, quantization_config)
+            compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
+            save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
+            del submodel
+            gc.collect()
 
-    # Unpatch modules after GPTQ export
-    if do_gptq_patching:
-        torch.cuda.is_available = orig_cuda_check
-        GPTQQuantizer.post_init_model = orig_post_init_model
+            submodel_path.unlink()
+            submodel_path.with_suffix(".bin").unlink()
+            compressed_submodel_path.rename(submodel_path)
+            compressed_submodel_path.with_suffix(".bin").rename(submodel_path.with_suffix(".bin"))
+
+    finally:
+        # Unpatch modules after quantized model export
+        if do_quant_patching:
+            torch.cuda.is_available = orig_cuda_check
+            if do_gptq_patching:
+                GPTQQuantizer.post_init_model = orig_post_init_model
 
 
 def maybe_convert_tokenizers(library_name: str, output: Path, model=None, preprocessors=None, task=None):
```
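Condensed, the export flow above boils down to: decide which quantization methods can be patched for the installed OpenVINO, fake CUDA availability while the quantized checkpoint is loaded and exported, and always undo the patches in a finally block. A simplified sketch, assuming `is_openvino_version` is importable as in optimum-intel; `export_model` is a placeholder callable, not a function from this PR:

```python
# Simplified sketch of the patch/unpatch flow above; export_model is a placeholder.
import torch
from optimum.intel.utils.import_utils import is_openvino_version


def export_quantized_checkpoint(config, export_model):
    quantization_config = getattr(config, "quantization_config", None)
    supported = ["gptq"] + (["awq"] if is_openvino_version(">=", "2024.6.0") else [])
    do_quant_patching = bool(quantization_config) and quantization_config["quant_method"] in supported

    if do_quant_patching:
        # auto-gptq / autoawq refuse to load checkpoints on CPU-only hosts,
        # so pretend a CUDA device is present for the duration of the export.
        orig_cuda_check = torch.cuda.is_available
        torch.cuda.is_available = lambda: True
    try:
        return export_model()
    finally:
        if do_quant_patching:
            torch.cuda.is_available = orig_cuda_check
```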

optimum/exporters/openvino/convert.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -456,7 +456,11 @@ def ts_patched_forward(*args, **kwargs):
             from openvino.frontend.pytorch.patch_model import unpatch_model
 
             unpatch_model(model, "_openvino_module_extension_patch_orig_forward")
-            model.to(torch.float32)
+            for m in model.modules():
+                if any(p.dtype in [torch.float16, torch.bfloat16] for p in m.parameters(False)) or any(
+                    b.dtype in [torch.float16, torch.bfloat16] for b in m.buffers(False)
+                ):
+                    m.float()
 
         return export_pytorch_via_onnx(
             model,
```
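The convert.py change replaces the blanket `model.to(torch.float32)` with a per-module upcast, so that packed integer weights of GPTQ/AWQ layers are not touched. The same idea as a standalone sketch (the helper name is illustrative):

```python
import torch


def upcast_half_precision_modules(model: torch.nn.Module) -> None:
    # Cast only modules that actually hold fp16/bf16 parameters or buffers;
    # quantized layers whose packed integer weights must stay intact are skipped.
    half_dtypes = (torch.float16, torch.bfloat16)
    for module in model.modules():
        has_half_param = any(p.dtype in half_dtypes for p in module.parameters(recurse=False))
        has_half_buffer = any(b.dtype in half_dtypes for b in module.buffers(recurse=False))
        if has_half_param or has_half_buffer:
            module.float()
```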
