@@ -1,11 +1,11 @@
 # Copyright The FMS Model Optimizer Authors
-
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,6 +21,7 @@
 # Standard
 from pathlib import Path
 import logging
+import os
 
 # Third Party
 from datasets import load_from_disk
@@ -33,8 +34,8 @@
     default_data_collator,
 )
 import torch
+import sys
 
-import os
 # Local
 from fms_mo import qconfig_init, qmodel_prep
 from fms_mo.custom_ext_kernels.utils import (
@@ -48,14 +49,14 @@
     get_act_scales_1gpu,
 )
 from fms_mo.utils.aiu_utils import save_for_aiu
-from fms_mo.utils.dq_utils import config_quantize_smooth_layers
-from fms_mo.utils.eval_utils import Evaluator, eval_llm_1GPU
-from fms_mo.utils.utils import patch_torch_bmm, prepare_input
 from fms_mo.utils.dq_inf import (
-    save_vllm_fp8,
-    convert_fp8_vllm_to_fms_mo,
     check_quantization_setting,
+    convert_fp8_vllm_to_fms_mo,
+    save_vllm_fp8,
 )
+from fms_mo.utils.dq_utils import config_quantize_smooth_layers
+from fms_mo.utils.eval_utils import Evaluator, eval_llm_1GPU
+from fms_mo.utils.utils import patch_torch_bmm, prepare_input
 
 logger = logging.getLogger(__name__)
 
@@ -133,16 +134,15 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         low_cpu_mem_usage=bool(model_args.device_map),
     )
 
-    inference = model.config.to_dict().get("quantization_config",None)
+    inference = model.config.to_dict().get("quantization_config", None)
 
     if inference:
         quant_setting = check_quantization_setting(inference)
         if quant_setting:
             logger.info("Quantization config settings validated ")
-            model = convert_fp8_vllm_to_fms_mo(model = model)
+            model = convert_fp8_vllm_to_fms_mo(model=model)
         else:
-            exit("__This quantization config is wrong/not supported__")
-
+            sys.exit("Error: This quantization config is wrong/not supported")
 
     embedding_size = model.get_input_embeddings().weight.shape[0]
     if len(tokenizer) > embedding_size:
@@ -157,17 +157,22 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         qcfg = qconfig_init(recipe="dq", args=fms_mo_args)
     else:
         logger.info("inference mode activated")
-        if os.path.isfile(model_args.model_name_or_path + "/qcfg.json"):
+        if os.path.isfile(model_args.model_name_or_path + "/qcfg.json"):
             if fms_mo_args.override_fms_args:
-                logger.info("qcfg file found and some parameters are being over-written ")
-                qcfg = qconfig_init(recipe = model_args.model_name_or_path + "/qcfg", args = fms_mo_args)
+                logger.info(
+                    "qcfg file found and some parameters are being over-written "
+                )
+                qcfg = qconfig_init(
+                    recipe=model_args.model_name_or_path + "/qcfg", args=fms_mo_args
+                )
             else:
                 logger.info("qcfg file found, loading the qcfg file ")
-                qcfg = qconfig_init(recipe = model_args.model_name_or_path + "/qcfg")
+                qcfg = qconfig_init(recipe=model_args.model_name_or_path + "/qcfg")
         else:
-            logger.info("qcfg file not found in {model_args.model_name_or_path},\
+            logger.info(
+                f"qcfg file not found in {model_args.model_name_or_path},\
             loading fms_mo_args and recipe"
-                )
+            )
             qcfg = qconfig_init(recipe="dq", args=fms_mo_args)
 
     model_size = model_size_Wb(model, unit="GB")
@@ -225,7 +230,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     )
 
     # For loading or creating smoothquant scale. Sometimes we may include scales in ckpt as well.
-    if not inference and qcfg["smoothq"] :
+    if not inference and qcfg["smoothq"]:
         scale_file = Path(f"./act_scales/{qcfg['model'].replace('/', '-')}.pt")
         if qcfg.get("act_scale_path", None):
             # user provided a scale file (or a dir)
@@ -295,11 +300,11 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         logger.info(
             f"Saving model processed for vLLM and tokenizer to {opt_args.output_dir}"
         )
-        save_vllm_fp8(model,qcfg,tokenizer,opt_args.output_dir)
+        save_vllm_fp8(model, qcfg, tokenizer, opt_args.output_dir)
     elif opt_args.save_ckpt:
         logger.info(
             f"Saving quantized model and tokenizer to {opt_args.output_dir}"
-            )
+        )
         model.save_pretrained(opt_args.output_dir, use_safetensors=True)
         tokenizer.save_pretrained(opt_args.output_dir)
 