
Commit 31dd8c7

committed
fix: updated the inference file
Signed-off-by: Omobayode Fagbohungbe <[email protected]>
1 parent adb7f38 commit 31dd8c7

5 files changed: +128 / -57 lines


fms_mo/dq.py

Lines changed: 12 additions & 36 deletions

@@ -21,8 +21,6 @@
 # Standard
 from pathlib import Path
 import logging
-import os
-import sys

 # Third Party
 from datasets import load_from_disk
@@ -52,6 +50,7 @@
 from fms_mo.utils.dq_inf import (
     check_quantization_setting,
     convert_fp8_vllm_to_fms_mo,
+    load_inference_qconfig_file,
     save_vllm_fp8,
 )
 from fms_mo.utils.dq_utils import config_quantize_smooth_layers
@@ -134,18 +133,6 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         low_cpu_mem_usage=bool(model_args.device_map),
     )

-    inference_qconfig = None
-    if hasattr(model, "config"):
-        inference_qconfig = model.config.to_dict().get("quantization_config", None)
-
-    if inference_qconfig:
-        quant_setting = check_quantization_setting(inference_qconfig)
-        if quant_setting:
-            logger.info("Quantization config settings validated ")
-            model = convert_fp8_vllm_to_fms_mo(model=model)
-        else:
-            sys.exit("Error: This quantization config is wrong/not supported")
-
     embedding_size = model.get_input_embeddings().weight.shape[0]
     if len(tokenizer) > embedding_size:
         model.resize_token_embeddings(len(tokenizer))
@@ -154,29 +141,17 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     logger.info(f"Model is at {model.device} after intialization")
     logger.info(f"Tokenizer is {tokenizer}, block size is {block_size}")

-    if not inference_qconfig:
+    quant_mode = check_quantization_setting(model)
+
+    if not quant_mode:
         logger.info("quantization mode activated, initalizing the qcfg file ")
         qcfg = qconfig_init(recipe="dq", args=fms_mo_args)
     else:
         logger.info("inference mode activated")
-        if os.path.isfile(model_args.model_name_or_path + "/qcfg.json"):
-            if fms_mo_args.override_fms_args:
-                logger.info(
-                    "qcfg file found and some parameters are being over-written "
-                )
-                qcfg = qconfig_init(
-                    recipe=model_args.model_name_or_path + "/qcfg", args=fms_mo_args
-                )
-            else:
-                logger.info("qcfg file found, loading the qcfg file ")
-                qcfg = qconfig_init(recipe=model_args.model_name_or_path + "/qcfg")
-        else:
-            logger.info(
-                "qcfg file not found in {model_args.model_name_or_path},\
-                loading fms_mo_args and recipe"
-            )
-            qcfg = qconfig_init(recipe="dq", args=fms_mo_args)
-        qcfg["fp8_inference"] = True
+        qcfg = load_inference_qconfig_file(model_args, fms_mo_args)
+
+    if quant_mode:
+        model = convert_fp8_vllm_to_fms_mo(model=model)

     model_size = model_size_Wb(model, unit="GB")
     gpu_mem_util_per = model_size / total_gpu_memory
@@ -201,7 +176,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):

     qcfg["model"] = model_args.model_name_or_path
     # config layers to skip, smooth scale
-    if not inference_qconfig:
+    if not quant_mode:
         config_quantize_smooth_layers(qcfg)

     use_dynamo = True
@@ -234,7 +209,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     )

     # For loading or creating smoothquant scale. Sometimes we may include scales in ckpt as well.
-    if not inference_qconfig and qcfg["smoothq"]:
+    if not quant_mode and qcfg["smoothq"]:
         scale_file = Path(f"./act_scales/{qcfg['model'].replace('/', '-')}.pt")
         if qcfg.get("act_scale_path", None):
             # user provided a scale file (or a dir)
@@ -272,7 +247,8 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     )
     logger.info(f"Quantized model {model}")
     logger.info("==" * 20)
-    if not inference_qconfig:
+
+    if not quant_mode:
         if qcfg["smoothq"]:
             logger.info("Starting to apply smooth scale")
             dq_llm(model, act_scales, qcfg)
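
For orientation, a minimal sketch of the rewritten branch in run_dq after this commit, condensed from the hunks above (sketch only, using just the names that appear in the diff; not a drop-in excerpt of the file):

# Sketch of the inference/quantization split in run_dq after this change.
quant_mode = check_quantization_setting(model)  # True for a supported FP8 checkpoint

if not quant_mode:
    # quantization mode: build qcfg from the "dq" recipe and the CLI args
    qcfg = qconfig_init(recipe="dq", args=fms_mo_args)
else:
    # inference mode: qcfg handling now lives in fms_mo/utils/dq_inf.py
    qcfg = load_inference_qconfig_file(model_args, fms_mo_args)

if quant_mode:
    model = convert_fp8_vllm_to_fms_mo(model=model)

# Later guards that previously tested `inference_qconfig` now test `quant_mode`,
# e.g. config_quantize_smooth_layers and the smoothquant-scale steps.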

fms_mo/prep.py

Lines changed: 10 additions & 10 deletions

@@ -395,16 +395,16 @@ def make_quant_module(module, curr_full_name, qcfg, verbose=False):
         # Third Party
         import compressed_tensors

-        if isinstance(
-            module, compressed_tensors.linear.compressed_linear.CompressedLinear
-        ):
-            pass
-        else:
-            logger.warning(
-                f"{curr_full_name} {type(module)} seems to be a wrapper of Linear."
-                "Please make sure it doesn't wrap BN and activ func. Otherwise"
-                "please create an equivalen Linear wrapper and change qcfg['mapping']."
-            )
+        if isinstance(
+            module, compressed_tensors.linear.compressed_linear.CompressedLinear
+        ):
+            pass
+        else:
+            logger.warning(
+                f"{curr_full_name} {type(module)} seems to be a wrapper of Linear."
+                "Please make sure it doesn't wrap BN and activ func. Otherwise"
+                "please create an equivalent Linear wrapper and change qcfg['mapping']."
+            )
         QLin = mapping.get(nn.Linear, None)
         if QLin is None:
             if verbose:

fms_mo/quant/quantizers.py

Lines changed: 1 addition & 2 deletions

@@ -237,7 +237,6 @@ def get_weight_quantizer(
     recompute=False,
     perGp=None,
     use_subnormal=False,
-    emulate=True,
 ):
     """Return a quantizer for weight quantization
     Regular quantizers:
@@ -347,7 +346,7 @@ def get_weight_quantizer(
         weight_quantizer = to_fp8(
             nbits,
             q_mode=qw_mode,
-            emulate=emulate,
+            emulate=True,
             perCh=Nch,
         )
     else:

fms_mo/utils/dq_inf.py

Lines changed: 104 additions & 8 deletions

@@ -29,23 +29,119 @@
 import torch

 # Local
+from fms_mo import qconfig_init
 from fms_mo.quant.quantizers import to_fp8_scaled_perCh
 from fms_mo.utils.qconfig_utils import get_recipe

 logger = logging.getLogger(__name__)


-def check_quantization_setting(inference: dict = None):
+def check_quantization_setting(model: nn.Module = None):
     """
     function checks if the checkpoint is from fp8 quantization
     """
-    return (
-        inference["config_groups"]["group_0"]["input_activations"]["num_bits"] == 8
-        and inference["config_groups"]["group_0"]["weights"]["num_bits"] == 8
-        and inference["config_groups"]["group_0"]["weights"]["type"] == "float"
-        and inference["config_groups"]["group_0"]["input_activations"]["type"]
-        == "float"
-    )
+    quant_config = None
+    if hasattr(model, "config"):
+        quant_config = model.config.to_dict().get("quantization_config", None)
+    if quant_config is None:
+        return False
+
+    logger.info("Validating config settings")
+    if quant_config["quant_method"] == "compressed-tensors":
+        if quant_config["format"] != "float-quantized":
+            raise Exception(
+                "The input activation and weight quantization dtypes are not supported"
+            )
+
+        if (
+            quant_config["config_groups"]["group_0"]["input_activations"]["num_bits"]
+            != 8
+        ):
+            raise Exception("Only 8-bit FP input activation quantization is supported")
+
+        if quant_config["config_groups"]["group_0"]["weights"]["num_bits"] != 8:
+            raise Exception("Only 8-bit FP weight quantization is supported")
+
+        if quant_config["kv_cache_scheme"] is None:
+            pass
+        else:
+            if quant_config["kv_cache_scheme"]["type"] != "float":
+                raise Exception("The KV-Cache quantization dtype is not supported")
+
+            if quant_config["kv_cache_scheme"]["num_bits"] != 8:
+                raise Exception("Only 8-bit KV-Cache quantization dtype is supported")
+
+        return True
+
+    raise Exception("This quantization method is not supported for inferencing")
+
+
+def load_inference_qconfig_file(model_args, fms_mo_args):
+    """
+    Function to load the inference quantization config for fms_mo
+    """
+    if os.path.isfile(model_args.model_name_or_path + "/qcfg.json"):
+        if fms_mo_args.override_qcfg_args:
+            logger.info("qcfg file found and some parameters are being over-written")
+            qcfg = qconfig_init(
+                recipe=model_args.model_name_or_path + "/qcfg", args=fms_mo_args
+            )
+        else:
+            logger.info("qcfg file found, loading the qcfg file ")
+            qcfg = qconfig_init(recipe=model_args.model_name_or_path + "/qcfg")
+    else:
+        logger.info(
+            f"qcfg file not found in {model_args.model_name_or_path},\
+            loading fms_mo_args and recipe"
+        )
+        qcfg = qconfig_init(recipe="dq", args=fms_mo_args)
+        qcfg = update_qcfg_from_model_config(model_args, qcfg)
+    qcfg["fp8_inference"] = True
+
+    return qcfg
+
+
+def update_qcfg_from_model_config(model_args, qcfg):
+    """
+    function to update the default qcfg setting with settings in the model config file.
+    Important for the case where qcfg file does not exist.
+    """
+    config = get_recipe(model_args.model_name_or_path + "/config")
+    if (
+        config["quantization_config"]["config_groups"]["group_0"]["input_activations"][
+            "strategy"
+        ]
+        == "token"
+    ):
+        qcfg["qa_mode"] = "fp8_e4m3_scale_perToken"
+    else:
+        raise Exception("Only perToken FP8 activation quantizer is supported")
+
+    if (
+        config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"]
+        == "channel"
+    ):
+        qcfg["qw_mode"] = "fp8_e4m3_scale_perCh"
+    elif (
+        config["quantization_config"]["config_groups"]["group_0"]["weights"]["strategy"]
+        == "tensor"
+    ):
+        qcfg["qw_mode"] = "fp8_e4m3_scale"
+    else:
+        raise Exception(
+            "Only per-channel or per-tensor FP8 weight quantizers are currently supported"
+        )
+
+    qcfg["smoothq"] = False
+    qcfg["nbits_a"] = config["quantization_config"]["config_groups"]["group_0"][
+        "input_activations"
+    ]["num_bits"]
+    qcfg["nbits_w"] = config["quantization_config"]["config_groups"]["group_0"][
+        "weights"
+    ]["num_bits"]
+    qcfg["torch_dtype"] = "float16"
+
+    return qcfg


 # def rename_fms_dict_to_vllm_dict (model_dict : dict= None, qcfg : dict = None):
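
As a rough illustration of what the new helpers expect, a hedged sketch of a quantization_config that check_quantization_setting would accept and that update_qcfg_from_model_config would map onto qcfg (field names are taken from the code above; the concrete dict is illustrative, not copied from a real checkpoint):

# Illustrative compressed-tensors FP8 config fragment (example values only).
quantization_config = {
    "quant_method": "compressed-tensors",
    "format": "float-quantized",
    "kv_cache_scheme": None,  # or an 8-bit "float" scheme
    "config_groups": {
        "group_0": {
            "input_activations": {"num_bits": 8, "type": "float", "strategy": "token"},
            "weights": {"num_bits": 8, "type": "float", "strategy": "channel"},
        }
    },
}

# With a model whose config carries a quantization_config like the above,
# check_quantization_setting(model) returns True, and (absent a qcfg.json)
# update_qcfg_from_model_config would roughly set:
#   qcfg["qa_mode"] = "fp8_e4m3_scale_perToken"   # activations: strategy == "token"
#   qcfg["qw_mode"] = "fp8_e4m3_scale_perCh"      # weights: strategy == "channel"
#   qcfg["nbits_a"] = qcfg["nbits_w"] = 8
#   qcfg["smoothq"] = False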

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -35,7 +35,7 @@ dependencies = [

 [project.optional-dependencies]
 examples = ["ninja>=1.11.1.1,<2.0", "evaluate", "huggingface_hub"]
-fp8 = ["llmcompressor", "torchao==0.11", "compressed_tensors"]
+fp8 = ["llmcompressor", "torchao==0.11"]
 gptq = ["Cython", "gptqmodel>=1.7.3"]
 mx = ["microxcaling>=1.1"]
 opt = ["fms-model-optimizer[fp8, gptq, mx]"]
