@@ -232,6 +232,7 @@ def main_export(
         )

     do_gptq_patching = False
+    do_quant_patching = False
     custom_architecture = False
     patch_16bit = False
     loading_kwargs = model_loading_kwargs or {}
@@ -247,7 +248,11 @@ def main_export(
             trust_remote_code=trust_remote_code,
         )
         quantization_config = getattr(config, "quantization_config", None)
-        do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq"
+        supported_quant_methods = ["gptq"]
+        if is_openvino_version(">=", "2024.6.0"):
+            supported_quant_methods.append("awq")
+        do_quant_patching = quantization_config and quantization_config["quant_method"] in supported_quant_methods
+        do_gptq_patching = do_quant_patching and quantization_config["quant_method"] == "gptq"
         model_type = config.model_type.replace("_", "-")
         if model_type not in TasksManager._SUPPORTED_MODEL_TYPE:
            custom_architecture = True
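
For context, the gating added in this hunk keys off the `quantization_config` entry that GPTQ- and AWQ-quantized checkpoints carry in their `config.json`. Below is a minimal, self-contained sketch of the same logic, with a plain dict standing in for the Hugging Face config object and a hard-coded version tuple standing in for `is_openvino_version(">=", "2024.6.0")` (both are illustrative assumptions, not the library's API):

    # Stand-in for the installed OpenVINO version (assumption for illustration).
    openvino_version = (2024, 6, 0)
    # Example of what an AWQ checkpoint exposes via config.quantization_config.
    quantization_config = {"quant_method": "awq", "bits": 4}

    supported_quant_methods = ["gptq"]
    if openvino_version >= (2024, 6, 0):
        supported_quant_methods.append("awq")

    do_quant_patching = bool(quantization_config) and quantization_config["quant_method"] in supported_quant_methods
    do_gptq_patching = do_quant_patching and quantization_config["quant_method"] == "gptq"
    print(do_quant_patching, do_gptq_patching)  # -> True False for this AWQ example
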
@@ -296,7 +301,6 @@ def main_export(
         if (
             dtype is None
             and framework == "pt"
-            and not do_gptq_patching
             and (
                 task.startswith("text-generation")
                 or getattr(config, "model_type", None) in MULTI_MODAL_TEXT_GENERATION_MODELS
@@ -315,28 +319,28 @@ def main_export(
                 patch_16bit = True
                 loading_kwargs["torch_dtype"] = dtype
         # Patch the modules to export of GPTQ models w/o GPU
-        if do_gptq_patching:
-            torch.set_default_dtype(torch.float32)
+        if do_quant_patching:
             orig_cuda_check = torch.cuda.is_available
             torch.cuda.is_available = lambda: True

-            from optimum.gptq import GPTQQuantizer
+            if do_gptq_patching:
+                from optimum.gptq import GPTQQuantizer

-            orig_post_init_model = GPTQQuantizer.post_init_model
+                orig_post_init_model = GPTQQuantizer.post_init_model

-            def post_init_model(self, model):
-                from auto_gptq import exllama_set_max_input_length
+                def post_init_model(self, model):
+                    from auto_gptq import exllama_set_max_input_length

-                class StoreAttr(object):
-                    pass
+                    class StoreAttr(object):
+                        pass

-                model.quantize_config = StoreAttr()
-                model.quantize_config.desc_act = self.desc_act
-                if self.desc_act and not self.disable_exllama and self.max_input_length is not None:
-                    model = exllama_set_max_input_length(model, self.max_input_length)
-                return model
+                    model.quantize_config = StoreAttr()
+                    model.quantize_config.desc_act = self.desc_act
+                    if self.desc_act and not self.disable_exllama and self.max_input_length is not None:
+                        model = exllama_set_max_input_length(model, self.max_input_length)
+                    return model

-            GPTQQuantizer.post_init_model = post_init_model
+                GPTQQuantizer.post_init_model = post_init_model
         elif library_name == "diffusers" and is_openvino_version(">=", "2024.6"):
             dtype = deduce_diffusers_dtype(
                 model_name_or_path,
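
This hunk relies on a monkeypatch: `torch.cuda.is_available` is swapped for a lambda returning True so that GPTQ/AWQ loading paths which insist on a GPU can run on a CPU-only host, and the original function is restored later (see the `finally` clause in the next hunk). A minimal sketch of that patch-and-restore pattern, independent of the exporter:

    import torch

    # Pretend CUDA is present while loading/exporting a quantized checkpoint
    # on a CPU-only machine, then always restore the real check.
    orig_cuda_check = torch.cuda.is_available
    torch.cuda.is_available = lambda: True
    try:
        assert torch.cuda.is_available()  # the patched check now returns True
        # ... load and export the quantized model here ...
    finally:
        torch.cuda.is_available = orig_cuda_check
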
@@ -351,143 +355,150 @@ class StoreAttr(object):
                 loading_kwargs["torch_dtype"] = dtype
                 patch_16bit = True

-    if library_name == "open_clip":
-        model = _OpenClipForZeroShotImageClassification.from_pretrained(model_name_or_path, cache_dir=cache_dir)
-    else:
-        model = TasksManager.get_model_from_task(
-            task,
-            model_name_or_path,
-            subfolder=subfolder,
-            revision=revision,
-            cache_dir=cache_dir,
-            token=token,
-            local_files_only=local_files_only,
-            force_download=force_download,
-            trust_remote_code=trust_remote_code,
-            framework=framework,
-            device=device,
-            library_name=library_name,
-            **loading_kwargs,
-        )
+    try:
+        if library_name == "open_clip":
+            model = _OpenClipForZeroShotImageClassification.from_pretrained(model_name_or_path, cache_dir=cache_dir)
+        else:
+            model = TasksManager.get_model_from_task(
+                task,
+                model_name_or_path,
+                subfolder=subfolder,
+                revision=revision,
+                cache_dir=cache_dir,
+                token=token,
+                local_files_only=local_files_only,
+                force_download=force_download,
+                trust_remote_code=trust_remote_code,
+                framework=framework,
+                device=device,
+                library_name=library_name,
+                **loading_kwargs,
+            )

-    needs_pad_token_id = task == "text-classification" and getattr(model.config, "pad_token_id", None) is None
+        needs_pad_token_id = task == "text-classification" and getattr(model.config, "pad_token_id", None) is None

-    if needs_pad_token_id:
-        if pad_token_id is not None:
-            model.config.pad_token_id = pad_token_id
-        else:
-            tok = AutoTokenizer.from_pretrained(model_name_or_path)
-            pad_token_id = getattr(tok, "pad_token_id", None)
-            if pad_token_id is None:
-                raise ValueError(
-                    "Could not infer the pad token id, which is needed in this case, please provide it with the --pad_token_id argument"
-                )
-            model.config.pad_token_id = pad_token_id
+        if needs_pad_token_id:
+            if pad_token_id is not None:
+                model.config.pad_token_id = pad_token_id
+            else:
+                tok = AutoTokenizer.from_pretrained(model_name_or_path)
+                pad_token_id = getattr(tok, "pad_token_id", None)
+                if pad_token_id is None:
+                    raise ValueError(
+                        "Could not infer the pad token id, which is needed in this case, please provide it with the --pad_token_id argument"
+                    )
+                model.config.pad_token_id = pad_token_id

-    if hasattr(model.config, "export_model_type"):
-        model_type = model.config.export_model_type.replace("_", "-")
-    else:
-        model_type = model.config.model_type.replace("_", "-")
-
-    if (
-        not custom_architecture
-        and library_name != "diffusers"
-        and task + "-with-past"
-        in TasksManager.get_supported_tasks_for_model_type(model_type, exporter="openvino", library_name=library_name)
-    ):
-        # Make -with-past the default if --task was not explicitely specified
-        if original_task == "auto":
-            task = task + "-with-past"
+        if hasattr(model.config, "export_model_type"):
+            model_type = model.config.export_model_type.replace("_", "-")
         else:
-            logger.info(
-                f"The task `{task}` was manually specified, and past key values will not be reused in the decoding."
-                f" if needed, please pass `--task {task}-with-past` to export using the past key values."
+            model_type = model.config.model_type.replace("_", "-")
+
+        if (
+            not custom_architecture
+            and library_name != "diffusers"
+            and task + "-with-past"
+            in TasksManager.get_supported_tasks_for_model_type(
+                model_type, exporter="openvino", library_name=library_name
             )
+        ):
+            # Make -with-past the default if --task was not explicitely specified
+            if original_task == "auto":
+                task = task + "-with-past"
+            else:
+                logger.info(
+                    f"The task `{task}` was manually specified, and past key values will not be reused in the decoding."
+                    f" if needed, please pass `--task {task}-with-past` to export using the past key values."
+                )

-    if original_task == "auto":
-        synonyms_for_task = sorted(TasksManager.synonyms_for_task(task))
-        if synonyms_for_task:
-            synonyms_for_task = ", ".join(synonyms_for_task)
-            possible_synonyms = f" (possible synonyms are: {synonyms_for_task})"
-        else:
-            possible_synonyms = ""
-        logger.info(f"Automatic task detection to {task}{possible_synonyms}.")
+        if original_task == "auto":
+            synonyms_for_task = sorted(TasksManager.synonyms_for_task(task))
+            if synonyms_for_task:
+                synonyms_for_task = ", ".join(synonyms_for_task)
+                possible_synonyms = f" (possible synonyms are: {synonyms_for_task})"
+            else:
+                possible_synonyms = ""
+            logger.info(f"Automatic task detection to {task}{possible_synonyms}.")

-    preprocessors = maybe_load_preprocessors(
-        model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code
-    )
+        preprocessors = maybe_load_preprocessors(
+            model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code
+        )

-    submodel_paths = export_from_model(
-        model=model,
-        output=output,
-        task=task,
-        ov_config=ov_config,
-        stateful=stateful,
-        model_kwargs=model_kwargs,
-        custom_export_configs=custom_export_configs,
-        fn_get_submodels=fn_get_submodels,
-        preprocessors=preprocessors,
-        device=device,
-        trust_remote_code=trust_remote_code,
-        patch_16bit_model=patch_16bit,
-        **kwargs_shapes,
-    )
+        submodel_paths = export_from_model(
+            model=model,
+            output=output,
+            task=task,
+            ov_config=ov_config,
+            stateful=stateful,
+            model_kwargs=model_kwargs,
+            custom_export_configs=custom_export_configs,
+            fn_get_submodels=fn_get_submodels,
+            preprocessors=preprocessors,
+            device=device,
+            trust_remote_code=trust_remote_code,
+            patch_16bit_model=patch_16bit,
+            **kwargs_shapes,
+        )

-    if convert_tokenizer:
-        maybe_convert_tokenizers(library_name, output, model, preprocessors, task=task)
-
-    clear_class_registry()
-    del model
-    gc.collect()
-
-    for submodel_path in submodel_paths:
-        submodel_path = Path(output) / submodel_path
-        submodel = core.read_model(submodel_path)
-
-        quantization_config = None
-        if ov_config is None:
-            num_parameters = 0
-            for op in submodel.get_ops():
-                if op.get_type_name() == "Constant" and op.get_element_type() in [Type.f16, Type.f32, Type.bf16]:
-                    num_parameters += reduce(operator.mul, op.shape, 1)
-                del op
-                if num_parameters >= _MAX_UNCOMPRESSED_SIZE:
-                    if is_nncf_available():
-                        quantization_config = {"bits": 8, "sym": False}
-                        logger.info("The model weights will be quantized to int8_asym.")
-                    else:
-                        logger.warning(
-                            "The model will be converted with no weights quantization. Quantization of the weights to int8 "
-                            "requires nncf. Please install it with `pip install nncf`"
-                        )
-                    break
-        else:
-            quantization_config = ov_config.quantization_config
-        if quantization_config is None:
-            del submodel
-            gc.collect()
-            continue
+        if convert_tokenizer:
+            maybe_convert_tokenizers(library_name, output, model, preprocessors, task=task)

-        if not is_nncf_available():
-            raise ImportError("Quantization of the weights requires nncf, please install it with `pip install nncf`")
+        clear_class_registry()
+        del model
+        gc.collect()

-        from optimum.intel.openvino.quantization import _weight_only_quantization
+        for submodel_path in submodel_paths:
+            submodel_path = Path(output) / submodel_path
+            submodel = core.read_model(submodel_path)
+
+            quantization_config = None
+            if ov_config is None:
+                num_parameters = 0
+                for op in submodel.get_ops():
+                    if op.get_type_name() == "Constant" and op.get_element_type() in [Type.f16, Type.f32, Type.bf16]:
+                        num_parameters += reduce(operator.mul, op.shape, 1)
+                    del op
+                    if num_parameters >= _MAX_UNCOMPRESSED_SIZE:
+                        if is_nncf_available():
+                            quantization_config = {"bits": 8, "sym": False}
+                            logger.info("The model weights will be quantized to int8_asym.")
+                        else:
+                            logger.warning(
+                                "The model will be converted with no weights quantization. Quantization of the weights to int8 "
+                                "requires nncf. Please install it with `pip install nncf`"
+                            )
+                        break
+            else:
+                quantization_config = ov_config.quantization_config
+            if quantization_config is None:
+                del submodel
+                gc.collect()
+                continue
+
+            if not is_nncf_available():
+                raise ImportError(
+                    "Quantization of the weights requires nncf, please install it with `pip install nncf`"
+                )

-        _weight_only_quantization(submodel, quantization_config)
-        compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
-        save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
-        del submodel
-        gc.collect()
+            from optimum.intel.openvino.quantization import _weight_only_quantization

-        submodel_path.unlink()
-        submodel_path.with_suffix(".bin").unlink()
-        compressed_submodel_path.rename(submodel_path)
-        compressed_submodel_path.with_suffix(".bin").rename(submodel_path.with_suffix(".bin"))
+            _weight_only_quantization(submodel, quantization_config)
+            compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
+            save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
+            del submodel
+            gc.collect()

-    # Unpatch modules after GPTQ export
-    if do_gptq_patching:
-        torch.cuda.is_available = orig_cuda_check
-        GPTQQuantizer.post_init_model = orig_post_init_model
+            submodel_path.unlink()
+            submodel_path.with_suffix(".bin").unlink()
+            compressed_submodel_path.rename(submodel_path)
+            compressed_submodel_path.with_suffix(".bin").rename(submodel_path.with_suffix(".bin"))
+
+    finally:
+        # Unpatch modules after quantized model export
+        if do_quant_patching:
+            torch.cuda.is_available = orig_cuda_check
+            if do_gptq_patching:
+                GPTQQuantizer.post_init_model = orig_post_init_model


 def maybe_convert_tokenizers(library_name: str, output: Path, model=None, preprocessors=None, task=None):
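
As a usage illustration of the function modified above, a minimal call that exercises the new code path follows. This is a sketch, assuming `main_export` is importable from `optimum.exporters.openvino` as in optimum-intel; the model id and output directory are placeholders, and the `-with-past` task assumes a decoder-style checkpoint:

    from pathlib import Path

    from optimum.exporters.openvino import main_export

    # Placeholder model id and output directory. A GPTQ or AWQ checkpoint here
    # triggers the CUDA-availability patching, and the try/finally above
    # guarantees the patch is undone even if the export fails.
    main_export(
        model_name_or_path="TheBloke/Llama-2-7B-GPTQ",
        output=Path("llama2_gptq_ov"),
        task="text-generation-with-past",
    )
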