 import torch

 # Local
-from fms_mo import qconfig_init, qmodel_prep
+from fms_mo import qconfig_init, qmodel_prep, qconfig_load
 from fms_mo.fx.utils import model_size_Wb
 from fms_mo.quant.ptq import (
     calibration_llm_1GPU,
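The import hunk pulls in `qconfig_load` alongside `qconfig_init` and `qmodel_prep`, although the inference branch further down still parses the saved config with raw `json.load`. A minimal sketch of the intended round trip, assuming `qconfig_load` takes a JSON path and returns a config dict (hypothetical usage; check `fms_mo.utils` for the actual signature):

```python
# Hedged sketch only: the recipe name and qconfig_load signature are assumptions.
from fms_mo import qconfig_init, qconfig_load

qcfg = qconfig_init(recipe="dq", args=None)  # fresh config for a quantization run
# ... after a run has saved its settings, e.g. to qcfg_llama.json ...
qcfg = qconfig_load("qcfg_llama.json")       # restore it instead of json.load + update
```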
@@ -145,7 +145,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     ]
     qcfg["large_model"] = any(
         name in model_args.model_name_or_path for name in known_large_models
-    ) or (gpu_mem_util_per > 0.7)
+    ) or (gpu_mem_util_per > 0.1)
     dev = "cpu" if qcfg["large_model"] else "cuda"
     if model_args.device_map is None:
         model.to(dev)
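Dropping the threshold from 0.7 to 0.1 routes far more models through the large-model path, which keeps weights on CPU and later uses the `*_1GPU` block-by-block routines. `gpu_mem_util_per` is computed earlier in the script (outside this hunk); a rough reconstruction of what such a ratio measures, for orientation only (the helper name is invented):

```python
import torch

def gpu_mem_utilization(model: torch.nn.Module) -> float:
    """Fraction of one GPU's memory the raw weights would occupy (illustrative)."""
    if not torch.cuda.is_available():
        return 1.0  # no GPU available: treat every model as "large"
    weight_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
    return weight_bytes / torch.cuda.get_device_properties(0).total_memory
```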
@@ -154,6 +154,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     qcfg["model_type"] = model.config.model_type

     qcfg["model"] = model_args.model_name_or_path
+    qcfg["qskip_large_mag_layers"] = True
     # config layers to skip, smooth scale
     config_quantize_smooth_layers(qcfg)

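`qskip_large_mag_layers` asks `config_quantize_smooth_layers` to leave layers with outlier weight magnitudes unquantized, since they tend to dominate quantization error. The real selection logic lives in that helper; the sketch below only illustrates the idea, with an invented threshold:

```python
import torch

def find_large_mag_layers(model: torch.nn.Module, threshold: float = 100.0):
    """Illustrative only: flag Linear layers whose max |weight| is an outlier."""
    return [
        name
        for name, mod in model.named_modules()
        if isinstance(mod, torch.nn.Linear) and mod.weight.abs().max() > threshold
    ]
```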
@@ -174,6 +175,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     qcfg["model"] = model_args.model_name_or_path
     qcfg["smoothq"] = True
     qcfg["plotsvg"] = False

     calibration_dataset = load_from_disk(data_args.training_data_path)
     calibration_dataset = calibration_dataset.with_format("torch")
@@ -184,62 +186,80 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         collate_fn=default_data_collator,
         batch_size=1,
     )
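Since the calibration set comes back through `load_from_disk` and is fed to the model one example at a time (`batch_size=1`), it has to be a pre-tokenized Hugging Face dataset. A minimal sketch of producing one (the tokenizer is just a placeholder):

```python
from datasets import Dataset
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")  # placeholder; use the target model's tokenizer
enc = tok(["A short calibration sample."], truncation=True, max_length=1024)
Dataset.from_dict(dict(enc)).save_to_disk("calib_data")  # -> training_data_path
```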
-
     # For loading or creating smoothquant scale. Sometimes we may include scales in ckpt as well.
-    scale_file = Path(f"./act_scales/{qcfg['model'].replace('/', '-')}.pt")
-    if qcfg.get("act_scale_path", None):
-        # user provided a scale file (or a dir)
-        scale_file_or_dir = Path(qcfg["act_scale_path"])
-        if scale_file_or_dir.is_dir():
-            scale_file = scale_file_or_dir / f"{qcfg['model'].replace('/', '-')}.pt"
-        elif scale_file_or_dir.is_file():
-            scale_file = scale_file_or_dir
+    if not fms_mo_args.inference:
+        scale_file = Path(f"./act_scales/{qcfg['model'].replace('/', '-')}.pt")
+        if qcfg.get("act_scale_path", None):
+            # user provided a scale file (or a dir)
+            scale_file_or_dir = Path(qcfg["act_scale_path"])
+            if scale_file_or_dir.is_dir():
+                scale_file = scale_file_or_dir / f"{qcfg['model'].replace('/', '-')}.pt"
+            elif scale_file_or_dir.is_file():
+                scale_file = scale_file_or_dir

-    if not scale_file.parent.exists():
-        scale_file.parent.mkdir(exist_ok=False)
+        if not scale_file.parent.exists():
+            scale_file.parent.mkdir(exist_ok=False)

-    if scale_file.exists():
-        act_scales = torch.load(scale_file, map_location=getattr(model, "device", dev))
-    else:
-        logger.info("Generate activation scales")
-        if qcfg["large_model"]:
-            act_scales = get_act_scales_1gpu(model, dq_dataloader, qcfg)
+        if scale_file.exists():
+            act_scales = torch.load(scale_file, map_location=getattr(model, "device", dev))
         else:
-            act_scales = get_act_scales(model, dq_dataloader, qcfg)
-        torch.save(act_scales, scale_file)
+            logger.info("Generate activation scales")
+            if qcfg["large_model"]:
+                act_scales = get_act_scales_1gpu(model, dq_dataloader, qcfg)
+            else:
+                act_scales = get_act_scales(model, dq_dataloader, qcfg)
+            torch.save(act_scales, scale_file)
+    else:
+        # Inference-only run: restore the quantization config saved by a
+        # previous quantization pass.
+        import json
+
+        with open("qcfg_llama.json", "r", encoding="utf-8") as q_file:
+            qcfg.update(json.load(q_file))

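For orientation, `get_act_scales` records the per-channel activation statistics SmoothQuant needs: conceptually it hooks every `Linear` input and keeps a running absolute max over a few calibration batches. A simplified sketch (the real implementation also handles device placement and model sharding):

```python
import torch

def collect_act_scales(model, dataloader, num_batches=8):
    """Running per-input-channel abs-max of each Linear's input (illustrative)."""
    scales, hooks = {}, []

    def make_hook(name):
        def hook(_module, inputs, _output):
            x = inputs[0].detach()
            cur = x.abs().amax(dim=tuple(range(x.dim() - 1)))  # reduce all but channels
            scales[name] = torch.maximum(scales[name], cur) if name in scales else cur
        return hook

    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            hooks.append(module.register_forward_hook(make_hook(name)))
    with torch.no_grad():
        for batch, _ in zip(dataloader, range(num_batches)):
            model(**batch)
    for h in hooks:
        h.remove()
    return scales
```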
     qmodel_prep(
         model,
         dq_dataloader,
         qcfg,
         use_layer_name_pattern_matching=use_layer_name_pattern_matching,
         use_dynamo=use_dynamo,
         dev=dev,
+        mode=fms_mo_args.inference,
         save_fname="dq",
     )
     logger.info(f"Quantized model {model}")
-    logger.info("Starting to apply smooth scale")
-    dq_llm(model, act_scales, qcfg)
-    logger.info("Finished applying smooth scale")
-    logger.info("==" * 20)
-    if qcfg["qmodel_calibration_new"] > 0:
-        logger.info("Starting to calibrate activation clip_val")
-        if qcfg["large_model"]:
-            calibration_llm_1GPU(qcfg, model, dq_dataloader)
-        else:
-            model.to("cuda:0")
-            pbar = tqdm(
-                dq_dataloader,
-                desc=" calibration after applying smoothq scale and before inference",
-                total=qcfg["qmodel_calibration_new"],
-            )
-            for data_mb, _ in zip(pbar, range(qcfg["qmodel_calibration_new"])):
-                data_mb = prepare_input(model.device, data_mb)
-                with patch_torch_bmm(qcfg):
-                    model(**data_mb)

-    logger.info(f"Saving quantized model and tokenizer to {opt_args.output_dir}")
-    model.save_pretrained(opt_args.output_dir, use_safetensors=True)
-    tokenizer.save_pretrained(opt_args.output_dir)
+    if not fms_mo_args.inference:
+        logger.info("Starting to apply smooth scale")
+        dq_llm(model, act_scales, qcfg)
+        logger.info("Finished applying smooth scale")
+        logger.info("==" * 20)
+        if qcfg["qmodel_calibration_new"] > 0:
+            logger.info("Starting to calibrate activation clip_val")
+            if qcfg["large_model"]:
+                calibration_llm_1GPU(qcfg, model, dq_dataloader)
+            else:
+                model.to("cuda:0")
+                pbar = tqdm(
+                    dq_dataloader,
+                    desc=" calibration after applying smoothq scale and before inference",
+                    total=qcfg["qmodel_calibration_new"],
+                )
+                for data_mb, _ in zip(pbar, range(qcfg["qmodel_calibration_new"])):
+                    data_mb = prepare_input(model.device, data_mb)
+                    with patch_torch_bmm(qcfg):
+                        model(**data_mb)
+        logger.info(f"Saving quantized model and tokenizer to {opt_args.output_dir}")
+        model.save_pretrained(opt_args.output_dir, use_safetensors=True)
+        tokenizer.save_pretrained(opt_args.output_dir)
+    else:
+        # Inference-only run: reload the quantized checkpoint saved earlier
+        from accelerate import load_checkpoint_and_dispatch
+
+        model = load_checkpoint_and_dispatch(
+            model,
+            checkpoint=opt_args.output_dir,
+            device_map=None,
+            no_split_module_classes=["Block"],
+        )

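For reference, `dq_llm` applies the SmoothQuant rebalancing using the `act_scales` gathered earlier: per input channel, activations are divided by a scale that is multiplied into the following weights, leaving the layer's output unchanged while making activations easier to quantize. A minimal sketch of the per-layer math, assuming α = 0.5 and leaving out how 1/s gets folded into the preceding LayerNorm (which the real implementation handles):

```python
import torch

def smooth_linear(fc: torch.nn.Linear, act_max: torch.Tensor, alpha: float = 0.5):
    """SmoothQuant-style rebalancing sketch: s_j = act_max_j**a / w_max_j**(1-a)."""
    w_max = fc.weight.abs().amax(dim=0).clamp(min=1e-5)  # per input channel
    s = (act_max.clamp(min=1e-5) ** alpha) / (w_max ** (1 - alpha))
    fc.weight.data *= s.unsqueeze(0)  # caller must also divide activations by s
    return s
```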
     if fms_mo_args.eval_ppl:
         path_test = Path(data_args.test_data_path)
@@ -253,7 +273,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):

     logger.info(f"Model for evaluation: {model}")
     if qcfg["large_model"]:
-        eval_llm_1GPU(qcfg, model, test_dataset)
+        eval_llm_1GPU(qcfg, model.to("cpu"), test_dataset)
     else:
         model.to(torch.device("cuda:0"))
         n_samples = int(test_dataset.input_ids.shape[1] / block_size)
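The perplexity branch slices `test_dataset.input_ids` into `block_size` chunks and averages negative log-likelihood over them; this is what both `eval_llm_1GPU` and the single-GPU path compute. A sketch of the standard procedure, assuming a Hugging Face causal-LM interface:

```python
import torch

@torch.no_grad()
def block_ppl(model, input_ids: torch.Tensor, block_size: int = 2048) -> float:
    """Block-wise perplexity (illustrative; ignores the one-token label shift)."""
    n_samples = input_ids.shape[1] // block_size
    nlls = []
    for i in range(n_samples):
        block = input_ids[:, i * block_size : (i + 1) * block_size].to(model.device)
        loss = model(block, labels=block).loss  # mean NLL over the block
        nlls.append(loss * block_size)
    return torch.exp(torch.stack(nlls).sum() / (n_samples * block_size)).item()
```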