@@ -1,11 +1,11 @@
 # Copyright The FMS Model Optimizer Authors
-
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,6 +21,7 @@
 # Standard
 from pathlib import Path
 import logging
+import os
 
 # Third Party
 from datasets import load_from_disk
@@ -33,8 +34,8 @@
     default_data_collator,
 )
 import torch
+import sys
 
-import os
 # Local
 from fms_mo import qconfig_init, qmodel_prep
 from fms_mo.custom_ext_kernels.utils import (
@@ -48,14 +49,14 @@
     get_act_scales_1gpu,
 )
 from fms_mo.utils.aiu_utils import save_for_aiu
-from fms_mo.utils.dq_utils import config_quantize_smooth_layers
-from fms_mo.utils.eval_utils import Evaluator, eval_llm_1GPU
-from fms_mo.utils.utils import patch_torch_bmm, prepare_input
 from fms_mo.utils.dq_inf import (
-    save_vllm_fp8,
-    convert_fp8_vllm_to_fms_mo,
     check_quantization_setting,
+    convert_fp8_vllm_to_fms_mo,
+    save_vllm_fp8,
 )
+from fms_mo.utils.dq_utils import config_quantize_smooth_layers
+from fms_mo.utils.eval_utils import Evaluator, eval_llm_1GPU
+from fms_mo.utils.utils import patch_torch_bmm, prepare_input
 
 logger = logging.getLogger(__name__)
 
@@ -133,16 +134,15 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         low_cpu_mem_usage=bool(model_args.device_map),
     )
 
-    inference = model.config.to_dict().get("quantization_config",None)
+    inference = model.config.to_dict().get("quantization_config", None)
 
     if inference:
         quant_setting = check_quantization_setting(inference)
         if quant_setting:
             logger.info("Quantization config settings validated ")
-            model = convert_fp8_vllm_to_fms_mo(model = model)
+            model = convert_fp8_vllm_to_fms_mo(model=model)
         else:
-            exit("__This quantization config is wrong/not supported__")
-
+            sys.exit("Error: This quantization config is wrong/not supported")
 
     embedding_size = model.get_input_embeddings().weight.shape[0]
     if len(tokenizer) > embedding_size:
@@ -157,17 +157,22 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         qcfg = qconfig_init(recipe="dq", args=fms_mo_args)
     else:
         logger.info("inference mode activated")
-        if os.path.isfile(model_args.model_name_or_path + "/qcfg.json"):
+        if os.path.isfile(model_args.model_name_or_path + "/qcfg.json"):
             if fms_mo_args.override_fms_args:
-                logger.info("qcfg file found and some parameters are being over-written ")
-                qcfg = qconfig_init(recipe = model_args.model_name_or_path + "/qcfg", args = fms_mo_args)
+                logger.info(
+                    "qcfg file found and some parameters are being over-written "
+                )
+                qcfg = qconfig_init(
+                    recipe=model_args.model_name_or_path + "/qcfg", args=fms_mo_args
+                )
             else:
                 logger.info("qcfg file found, loading the qcfg file ")
-                qcfg = qconfig_init(recipe = model_args.model_name_or_path + "/qcfg")
+                qcfg = qconfig_init(recipe=model_args.model_name_or_path + "/qcfg")
         else:
-            logger.info("qcfg file not found in {model_args.model_name_or_path},\
+            logger.info(
+                f"qcfg file not found in {model_args.model_name_or_path},\
             loading fms_mo_args and recipe"
-                )
+            )
             qcfg = qconfig_init(recipe="dq", args=fms_mo_args)
 
     model_size = model_size_Wb(model, unit="GB")
@@ -225,7 +230,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     )
 
     # For loading or creating smoothquant scale. Sometimes we may include scales in ckpt as well.
-    if not inference and qcfg["smoothq"] :
+    if not inference and qcfg["smoothq"]:
         scale_file = Path(f"./act_scales/{qcfg['model'].replace('/', '-')}.pt")
         if qcfg.get("act_scale_path", None):
             # user provided a scale file (or a dir)
@@ -295,11 +300,11 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         logger.info(
             f"Saving model processed for vLLM and tokenizer to {opt_args.output_dir}"
         )
-        save_vllm_fp8(model,qcfg,tokenizer,opt_args.output_dir)
+        save_vllm_fp8(model, qcfg, tokenizer, opt_args.output_dir)
     elif opt_args.save_ckpt:
         logger.info(
             f"Saving quantized model and tokenizer to {opt_args.output_dir}"
-            )
+        )
         model.save_pretrained(opt_args.output_dir, use_safetensors=True)
         tokenizer.save_pretrained(opt_args.output_dir)
 