@@ -1,11 +1,11 @@
 # Copyright The FMS Model Optimizer Authors
-#
+
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-#
+
 #     http://www.apache.org/licenses/LICENSE-2.0
-#
+
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -34,6 +34,7 @@
 )
 import torch
 
+import os
 # Local
 from fms_mo import qconfig_init, qmodel_prep
 from fms_mo.custom_ext_kernels.utils import (
@@ -50,8 +51,11 @@
 from fms_mo.utils.dq_utils import config_quantize_smooth_layers
 from fms_mo.utils.eval_utils import Evaluator, eval_llm_1GPU
 from fms_mo.utils.utils import patch_torch_bmm, prepare_input
-from fms_mo.utils.dq_inf import load_fp8_vllm, save_vllm_fp8
-from accelerate import load_checkpoint_and_dispatch
+from fms_mo.utils.dq_inf import (
+    save_vllm_fp8,
+    convert_fp8_vllm_to_fms_mo,
+    check_quantization_setting,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -129,18 +133,42 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         low_cpu_mem_usage=bool(model_args.device_map),
     )
 
+    inference = model.config.to_dict().get("quantization_config", None)
+
+    if inference:
+        quant_setting = check_quantization_setting(inference)
+        if quant_setting:
+            logger.info("Quantization config settings validated")
+            model = convert_fp8_vllm_to_fms_mo(model=model)
+        else:
+            raise ValueError("This quantization config is not supported")
+
+
     embedding_size = model.get_input_embeddings().weight.shape[0]
     if len(tokenizer) > embedding_size:
         model.resize_token_embeddings(len(tokenizer))
 
     logger.info(f"Initialized model is: \n{model}")
     logger.info(f"Model is at {model.device} after initialization")
     logger.info(f"Tokenizer is {tokenizer}, block size is {block_size}")
-
-    if not fms_mo_args.inference or fms_mo_args.vllm_fp8_load:
+
+    if not inference:
+        logger.info("Quantization mode activated, initializing the qcfg file")
         qcfg = qconfig_init(recipe="dq", args=fms_mo_args)
     else:
-        qcfg = qconfig_init(recipe=opt_args.output_dir + "/qcfg")
+        logger.info("Inference mode activated")
+        if os.path.isfile(model_args.model_name_or_path + "/qcfg.json"):
+            if fms_mo_args.override_fms_args:
+                logger.info("qcfg file found, some parameters are being overwritten")
+                qcfg = qconfig_init(recipe=model_args.model_name_or_path + "/qcfg", args=fms_mo_args)
+            else:
+                logger.info("qcfg file found, loading the qcfg file")
+                qcfg = qconfig_init(recipe=model_args.model_name_or_path + "/qcfg")
+        else:
+            logger.info(
+                f"qcfg file not found in {model_args.model_name_or_path}, loading fms_mo_args and recipe"
+            )
+            qcfg = qconfig_init(recipe="dq", args=fms_mo_args)
 
     model_size = model_size_Wb(model, unit="GB")
     gpu_mem_util_per = model_size / total_gpu_memory
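Note on the hunk above: the new inference path keys entirely on the quantization_config block that pre-quantized checkpoints (for example, FP8 models produced by llm-compressor for vLLM) carry in their Hugging Face config.json. A minimal sketch of the detection, assuming a hypothetical checkpoint path:

    # Sketch only (not part of the commit): how the `inference` value gets set.
    from transformers import AutoConfig

    cfg = AutoConfig.from_pretrained("my-org/llama-fp8")  # hypothetical checkpoint
    quant_cfg = cfg.to_dict().get("quantization_config", None)
    if quant_cfg:
        # Pre-quantized checkpoints record their scheme here, e.g.
        # {"quant_method": "compressed-tensors", ...}; run_dq() validates it with
        # check_quantization_setting() before converting any modules.
        print(quant_cfg.get("quant_method"))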
@@ -184,6 +212,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     qcfg["model"] = model_args.model_name_or_path
     qcfg["smoothq"] = qcfg.get("smoothq_alpha", -1) >= 0 and "mx_specs" not in qcfg
     qcfg["plotsvg"] = False
+    qcfg["output_folder"] = opt_args.output_dir
 
     calibration_dataset = load_from_disk(data_args.training_data_path)
     calibration_dataset = calibration_dataset.with_format("torch")
@@ -196,7 +225,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     )
 
     # For loading or creating smoothquant scale. Sometimes we may include scales in ckpt as well.
-    if not fms_mo_args.inference and qcfg["smoothq"]:
+    if not inference and qcfg["smoothq"]:
         scale_file = Path(f"./act_scales/{qcfg['model'].replace('/', '-')}.pt")
         if qcfg.get("act_scale_path", None):
             # user provided a scale file (or a dir)
@@ -230,14 +259,12 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         use_layer_name_pattern_matching=use_layer_name_pattern_matching,
         use_dynamo=use_dynamo,
         dev=dev,
-        mode=fms_mo_args.inference,
+        mode=inference,
         save_fname="dq",
-        folder=opt_args.output_dir,
     )
     logger.info(f"Quantized model {model}")
     logger.info("==" * 20)
-
-    if not fms_mo_args.inference:
+    if not inference:
         if qcfg["smoothq"]:
             logger.info("Starting to apply smooth scale")
             dq_llm(model, act_scales, qcfg)
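Two related changes in the hunk above: the output folder no longer travels as a qmodel_prep(folder=...) argument but as qcfg["output_folder"] (set in an earlier hunk), and mode now receives the quantization_config dict (or None) rather than the old boolean fms_mo_args.inference. That substitution is safe as long as mode is only tested for truthiness, which this quick sketch illustrates:

    # Sketch: the dict-or-None `inference` value is truthy exactly when the old
    # boolean flag would have been True (assuming mode is only truthiness-tested).
    quant_cfg = {"quant_method": "compressed-tensors"}  # example contents
    assert bool(quant_cfg) is True   # quantization_config present -> inference mode
    assert bool(None) is False       # absent -> quantization (calibration) mode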
@@ -264,7 +291,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
             f"Saving model processed for AIU and tokenizer to {opt_args.output_dir}"
         )
         save_for_aiu(model, qcfg, output_dir=opt_args.output_dir, verbose=True)
-    elif opt_args.save_ckpt_for_vllm:
+    elif not opt_args.save_ckpt:
         logger.info(
             f"Saving model processed for vLLM and tokenizer to {opt_args.output_dir}"
         )
@@ -287,19 +314,6 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
             clamp_acc_to_dl16=fms_mo_args.aiu_sim_triton == "fp8",
             # layer_to_exclude=["lm_head",]
         )
-    else:
-        if fms_mo_args.vllm_fp8_load:
-            logger.info("loading llmcompressor fp8 model saved_checkpoint")
-            model = load_fp8_vllm(model=model, checkpoint=opt_args.output_dir)
-
-        else:
-            logger.info("loading dq fms_mo fp8 model saved_checkpoint")
-            model = load_checkpoint_and_dispatch(
-                model,
-                checkpoint=opt_args.output_dir,
-                device_map=None,
-                no_split_module_classes=['Block']
-            )
 
     if fms_mo_args.eval_ppl:
         path_test = Path(data_args.test_data_path)
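The deleted else-branch above was the old inference entry point: it re-loaded weights from opt_args.output_dir via load_fp8_vllm() or accelerate's load_checkpoint_and_dispatch(). Both jobs are now handled up front by convert_fp8_vllm_to_fms_mo() right after from_pretrained(), so no re-dispatch is needed at this point. A minimal sketch of loading a pre-quantized checkpoint under the new flow, with a hypothetical checkpoint directory:

    # Sketch: loading a previously quantized FP8 checkpoint with the new helpers.
    from transformers import AutoModelForCausalLM

    from fms_mo.utils.dq_inf import check_quantization_setting, convert_fp8_vllm_to_fms_mo

    model = AutoModelForCausalLM.from_pretrained("fp8-ckpt/")  # hypothetical directory
    quant_cfg = model.config.to_dict().get("quantization_config", None)
    if quant_cfg and check_quantization_setting(quant_cfg):
        model = convert_fp8_vllm_to_fms_mo(model=model)  # swap in fms_mo quantized modules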