50 | 50 | from fms_mo.utils.dq_utils import config_quantize_smooth_layers |
51 | 51 | from fms_mo.utils.eval_utils import Evaluator, eval_llm_1GPU |
52 | 52 | from fms_mo.utils.utils import patch_torch_bmm, prepare_input |
| 53 | +from fms_mo.utils.dq_inf import load_fp8_vllm, save_vllm_fp8 |
| 54 | +from accelerate import load_checkpoint_and_dispatch |
53 | 55 |
54 | 56 | logger = logging.getLogger(__name__) |
55 | 57 |
@@ -134,7 +136,11 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args): |
134 | 136 | logger.info(f"Initialized model is: \n {model}") |
135 | 137 | logger.info(f"Model is at {model.device} after initialization")
136 | 138 | logger.info(f"Tokenizer is {tokenizer}, block size is {block_size}") |
137 | | - qcfg = qconfig_init(recipe="dq", args=fms_mo_args) |
| 139 | + |
| 140 | + if not fms_mo_args.inference or fms_mo_args.vllm_fp8_load: |
| 141 | + qcfg = qconfig_init(recipe="dq", args=fms_mo_args) |
| 142 | + else: |
| 143 | +        qcfg = qconfig_init(recipe=opt_args.output_dir + "/qcfg")
138 | 144 |
139 | 145 | model_size = model_size_Wb(model, unit="GB") |
140 | 146 | gpu_mem_util_per = model_size / total_gpu_memory |
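
With this change `qcfg` has two sources depending on the run mode: a fresh quantization run (or an FP8 vLLM load, which still rebuilds the quantized graph) starts from the built-in `"dq"` recipe, while a plain inference run re-reads the config that an earlier run left under `output_dir`. A minimal sketch of the two paths, assuming `qconfig_init` accepts a saved-config path as its `recipe` argument, as the new call implies:

```python
# Sketch only; mirrors the branch added above and the assumption behind it:
# a prior non-inference run must have written "<output_dir>/qcfg" next to the checkpoint.
if not fms_mo_args.inference or fms_mo_args.vllm_fp8_load:
    # quantize from scratch: built-in "dq" recipe plus CLI overrides
    qcfg = qconfig_init(recipe="dq", args=fms_mo_args)
else:
    # pure inference on an existing fms_mo checkpoint: reload its saved qcfg
    qcfg = qconfig_init(recipe=opt_args.output_dir + "/qcfg")
```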
@@ -190,7 +196,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args): |
190 | 196 | ) |
191 | 197 |
192 | 198 | # For loading or creating smoothquant scale. Sometimes we may include scales in ckpt as well. |
193 | | - if qcfg["smoothq"]: |
| 199 | +    if not fms_mo_args.inference and qcfg["smoothq"]:
194 | 200 | scale_file = Path(f"./act_scales/{qcfg['model'].replace('/', '-')}.pt") |
195 | 201 | if qcfg.get("act_scale_path", None): |
196 | 202 | # user provided a scale file (or a dir) |
@@ -224,53 +230,76 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args): |
224 | 230 | use_layer_name_pattern_matching=use_layer_name_pattern_matching, |
225 | 231 | use_dynamo=use_dynamo, |
226 | 232 | dev=dev, |
| 233 | + mode=fms_mo_args.inference, |
227 | 234 | save_fname="dq", |
| 235 | + folder=opt_args.output_dir, |
228 | 236 | ) |
229 | 237 | logger.info(f"Quantized model {model}") |
230 | 238 | logger.info("==" * 20) |
231 | 239 |
232 | | - if qcfg["smoothq"]: |
233 | | - logger.info("Starting to apply smooth scale") |
234 | | - dq_llm(model, act_scales, qcfg) |
235 | | - logger.info("Finished applying smooth scale") |
| 240 | + if not fms_mo_args.inference: |
| 241 | + if qcfg["smoothq"]: |
| 242 | + logger.info("Starting to apply smooth scale") |
| 243 | + dq_llm(model, act_scales, qcfg) |
| 244 | + logger.info("Finished applying smooth scale") |
| 245 | + |
| 246 | + if qcfg["qmodel_calibration_new"] > 0: |
| 247 | + logger.info("Starting to calibrate activation clip_val") |
| 248 | + if qcfg["large_model"]: |
| 249 | + calibration_llm_1GPU_v2(qcfg, model, dq_dataloader) |
| 250 | + else: |
| 251 | + model.to("cuda") |
| 252 | + pbar = tqdm( |
| 253 | + dq_dataloader, |
| 254 | + desc=" calibration after applying smoothq scale and before inference", |
| 255 | + total=qcfg["qmodel_calibration_new"], |
| 256 | + ) |
| 257 | + for data_mb, _ in zip(pbar, range(qcfg["qmodel_calibration_new"])): |
| 258 | + data_mb = prepare_input(model.device, data_mb) |
| 259 | + with patch_torch_bmm(qcfg): |
| 260 | + model(**data_mb) |
| 261 | + |
| 262 | + if opt_args.save_ckpt_for_aiu: |
| 263 | + logger.info( |
| 264 | + f"Saving model processed for AIU and tokenizer to {opt_args.output_dir}" |
| 265 | + ) |
| 266 | + save_for_aiu(model, qcfg, output_dir=opt_args.output_dir, verbose=True) |
| 267 | + elif opt_args.save_ckpt_for_vllm: |
| 268 | + logger.info( |
| 269 | + f"Saving model processed for vLLM and tokenizer to {opt_args.output_dir}" |
| 270 | + ) |
| 271 | +            save_vllm_fp8(model, qcfg, tokenizer, opt_args.output_dir)
| 272 | + elif opt_args.save_ckpt: |
| 273 | + logger.info( |
| 274 | + f"Saving quantized model and tokenizer to {opt_args.output_dir}" |
| 275 | + ) |
| 276 | + model.save_pretrained(opt_args.output_dir, use_safetensors=True) |
| 277 | + tokenizer.save_pretrained(opt_args.output_dir) |
| 278 | + |
| 279 | + if fms_mo_args.aiu_sim_triton: |
| 280 | +            # NOTE: please apply correct HW settings here, defaults are not real HW params
| 281 | + lower_qmodel_triton( |
| 282 | + model, |
| 283 | + use_dyn_max_act=-1 if qcfg["qa_mode"] == "pertokenmax" else False, |
| 284 | + max_acc_bits=qcfg.get("max_acc_bits", 32), |
| 285 | + num_lsb_to_truncate=qcfg.get("lsb_trun_bits", 0), |
| 286 | + chunk_size=qcfg.get("chunk_size", 32), # 1024 |
| 287 | + clamp_acc_to_dl16=fms_mo_args.aiu_sim_triton == "fp8", |
| 288 | + # layer_to_exclude=["lm_head",] |
| 289 | + ) |
| 290 | + else: |
| 291 | + if fms_mo_args.vllm_fp8_load: |
| 292 | +            logger.info("Loading llmcompressor FP8 model from saved checkpoint")
| 293 | +            model = load_fp8_vllm(model=model, checkpoint=opt_args.output_dir)
236 | 294 |
237 | | - if qcfg["qmodel_calibration_new"] > 0: |
238 | | - logger.info("Starting to calibrate activation clip_val") |
239 | | - if qcfg["large_model"]: |
240 | | - calibration_llm_1GPU_v2(qcfg, model, dq_dataloader) |
241 | 295 | else: |
242 | | - model.to("cuda") |
243 | | - pbar = tqdm( |
244 | | - dq_dataloader, |
245 | | - desc=" calibration after applying smoothq scale and before inference", |
246 | | - total=qcfg["qmodel_calibration_new"], |
| 296 | +            logger.info("Loading dq fms_mo FP8 model from saved checkpoint")
| 297 | + model = load_checkpoint_and_dispatch( |
| 298 | + model, |
| 299 | + checkpoint=opt_args.output_dir, |
| 300 | + device_map=None, |
| 301 | +                no_split_module_classes=["Block"],
247 | 302 | ) |
248 | | - for data_mb, _ in zip(pbar, range(qcfg["qmodel_calibration_new"])): |
249 | | - data_mb = prepare_input(model.device, data_mb) |
250 | | - with patch_torch_bmm(qcfg): |
251 | | - model(**data_mb) |
252 | | - |
253 | | - if opt_args.save_ckpt_for_aiu: |
254 | | - logger.info( |
255 | | - f"Saving model processed for AIU and tokenizer to {opt_args.output_dir}" |
256 | | - ) |
257 | | - save_for_aiu(model, qcfg, output_dir=opt_args.output_dir, verbose=True) |
258 | | - elif opt_args.save_ckpt: |
259 | | - logger.info(f"Saving quantized model and tokenizer to {opt_args.output_dir}") |
260 | | - model.save_pretrained(opt_args.output_dir, use_safetensors=True) |
261 | | - tokenizer.save_pretrained(opt_args.output_dir) |
262 | | - |
263 | | - if fms_mo_args.aiu_sim_triton: |
264 | | - # NOTE plz apply correct HW settings here, defaults are not real HW params |
265 | | - lower_qmodel_triton( |
266 | | - model, |
267 | | - use_dyn_max_act=-1 if qcfg["qa_mode"] == "pertokenmax" else False, |
268 | | - max_acc_bits=qcfg.get("max_acc_bits", 32), |
269 | | - num_lsb_to_truncate=qcfg.get("lsb_trun_bits", 0), |
270 | | - chunk_size=qcfg.get("chunk_size", 32), # 1024 |
271 | | - clamp_acc_to_dl16=fms_mo_args.aiu_sim_triton == "fp8", |
272 | | - # layer_to_exclude=["lm_head",] |
273 | | - ) |
274 | 303 |
275 | 304 | if fms_mo_args.eval_ppl: |
276 | 305 | path_test = Path(data_args.test_data_path) |
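
Taken together, the new paths split DQ into a save side and a load side. Below is a minimal sketch of how they are expected to compose; it reuses only the call signatures visible in this diff (any behavior of `save_vllm_fp8`/`load_fp8_vllm` beyond those call sites is an assumption), and `model`, `tokenizer`, `qcfg`, and `opt_args` stand for the objects already built earlier in `run_dq`:

```python
# Sketch of the intended save/load round trip; not part of the diff itself.
from accelerate import load_checkpoint_and_dispatch

from fms_mo.utils.dq_inf import load_fp8_vllm, save_vllm_fp8

# quantization run (no --inference): calibrate, then persist an FP8 checkpoint
# in the llmcompressor layout that vLLM can consume
save_vllm_fp8(model, qcfg, tokenizer, opt_args.output_dir)

# later run with --inference --vllm_fp8_load: rebuild the quantized model
# skeleton via qmodel_prep, then pull the FP8 weights back in
model = load_fp8_vllm(model=model, checkpoint=opt_args.output_dir)

# later run with --inference only (plain fms_mo checkpoint): restore the
# safetensors checkpoint with accelerate, keeping each transformer Block whole
model = load_checkpoint_and_dispatch(
    model,
    checkpoint=opt_args.output_dir,
    device_map=None,
    no_split_module_classes=["Block"],
)
```

With `device_map=None`, the dispatch step is effectively a no-op, so the last call simply loads the saved weights into the already-instantiated quantized model.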