
Commit 553c7a6

1. Temporarily enable Qmax.dequant=False (bmgroth will officially enable it later); 2. add a util function to lower a Qmodel to the triton kernel; 3. additional fixes for dq, e.g. torch.load
Signed-off-by: cliu-us <[email protected]>
1 parent 362d521 commit 553c7a6

5 files changed: +103 −25 lines


fms_mo/custom_ext_kernels/utils.py

Lines changed: 58 additions & 0 deletions

@@ -859,6 +859,64 @@ def lower_qmodel_cutlass(
     return mod
 
 
+def lower_qmodel_triton(
+    model: torch.nn.Module,
+    use_dyn_max_act=False,
+    max_acc_bits=32,
+    num_lsb_to_truncate=0,
+    chunk_size=32,
+):
+    """
+    Exemplar GPU lowering function using triton. Only swaps QLinears in transformers, nothing else.
+    The triton kernel can be used to:
+    1. test INT8 or FP8 HW performance (kernel is not optimized)
+    2. simulate MSB/LSB truncation effects
+
+    Args:
+        model: nn.Module. Should be an fms_mo Qmodel; layers are swapped in place, no deepcopy.
+        use_dyn_max_act: bool or int, can be False, -1 for per-token, or -2 for per-channel. Will use a
+            dynamic max quantizer for activations if not False.
+        max_acc_bits: max bits for the accumulator, typically FP32 for all FP matmuls and INT32 for all
+            INT matmuls. But some HW could use fewer bits to trade off power
+            efficiency at the expense of a higher chance of accumulation "overflow".
+            For example, an INT24 accumulator can only hold values ranging from -2^23 to
+            2^23 - 1, as opposed to the typical range -2^31 to 2^31 - 1.
+        num_lsb_to_truncate: number of bits to truncate from the LSB side. For example, given FP32 is
+            s1e8m23, if we choose to truncate 13 mantissa bits from the right-most side,
+            i.e. the LSBs, the resulting number will be s1e8m10, which is TF32.
+        chunk_size: given a matmul of (m, k) @ (k, n), the inner product will be "accumulated" along the
+            k-dim. Since the entire matrix will be partitioned into smaller tiles when being
+            computed, the accumulator will only add a certain number of elements in one shot. This
+            "chunk size" in the k-dim will affect the overflow/underflow of the accumulator.
+    """
+    # Third Party
+    from torch.ao.quantization.utils import _parent_name
+
+    # Local
+    from fms_mo.modules.linear import QLinear, QLinearINT8Deploy
+
+    for name, m in model.named_modules():
+        if not isinstance(m, QLinear):
+            continue
+        parent_name, module_name = _parent_name(name)
+        parent_mod = model.get_submodule(parent_name)
+        qmod = getattr(parent_mod, module_name)
+        setattr(
+            parent_mod,
+            module_name,
+            QLinearINT8Deploy.from_fms_mo(
+                qmod,
+                use_int_kernel="triton",
+                use_dynamic_max_act_Qfunc=use_dyn_max_act,
+                max_acc_bits=max_acc_bits,
+                truncate_lsb=num_lsb_to_truncate,
+                chunk_size=chunk_size,
+            ),
+        )
+
+    logger.info(f"\nModel lowering with triton kernel is done.\n{model}")
+
+
 ### -------------------------------------------------------------
 # GPTQ tensor packing functions for Exllama kernel
 ### -------------------------------------------------------------
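As a rough usage sketch (not part of the commit): the helper swaps every QLinear in place, so a typical call on an already-quantized model might look like the following. The qmodel variable is a placeholder, and the keyword values simply mirror the docstring above.

# Hedged sketch: lower an existing fms_mo Qmodel to the triton INT8 kernel and emulate
# a narrower accumulator plus TF32-style LSB truncation. `qmodel` is a placeholder for a
# model that already contains QLinear layers (e.g. one produced by run_dq).
from fms_mo.custom_ext_kernels.utils import lower_qmodel_triton

qmodel = ...  # placeholder: an fms_mo Qmodel with QLinear modules

lower_qmodel_triton(
    qmodel,                  # swapped in place, no deepcopy
    use_dyn_max_act=-1,      # -1 = per-token dynamic max quantizer for activations
    max_acc_bits=24,         # emulate an INT24 accumulator instead of INT32
    num_lsb_to_truncate=13,  # FP32 (s1e8m23) -> TF32-like (s1e8m10)
    chunk_size=32,           # accumulate 32 k-dim elements per chunk
)
# qmodel can then be run/evaluated as usual; the swapped QLinearINT8Deploy layers use triton.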

fms_mo/dq.py

Lines changed: 8 additions & 6 deletions

@@ -172,7 +172,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
 
     qcfg["seq_len"] = block_size
     qcfg["model"] = model_args.model_name_or_path
-    qcfg["smoothq"] = True
+    qcfg["smoothq"] = fms_mo_args.smoothq_alpha != -1
     qcfg["plotsvg"] = False
 
     calibration_dataset = load_from_disk(data_args.training_data_path)
@@ -217,9 +217,10 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         save_fname="dq",
     )
     logger.info(f"Quantized model {model}")
-    logger.info("Starting to apply smooth scale")
-    dq_llm(model, act_scales, qcfg)
-    logger.info("Finished applying smooth scale")
+    if qcfg["smoothq"]:
+        logger.info("Starting to apply smooth scale")
+        dq_llm(model, act_scales, qcfg)
+        logger.info("Finished applying smooth scale")
     logger.info("==" * 20)
     if qcfg["qmodel_calibration_new"] > 0:
         logger.info("Starting to calibrate activation clip_val")
@@ -249,7 +250,7 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         test_dataset = load_from_disk(data_args.test_data_path)
         test_dataset = test_dataset.with_format("torch")
     elif len(pt_files) > 0:
-        test_dataset = torch.load(pt_files[0])
+        test_dataset = torch.load(pt_files[0], weights_only=False)
 
     logger.info(f"Model for evaluation: {model}")
     if qcfg["large_model"]:
@@ -258,7 +259,8 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         model.to(torch.device("cuda:0"))
     n_samples = int(test_dataset.input_ids.shape[1] / block_size)
     evaluator = Evaluator(test_dataset, "cuda", n_samples=n_samples)
-    ppl = evaluator.evaluate(model, block_size=block_size)
+    with patch_torch_bmm(qcfg):
+        ppl = evaluator.evaluate(model, block_size=block_size)
     logger.info(f"Model perplexity: {ppl}")
     logger.info("-" * 50)
     logger.info("Finished evaluation")

fms_mo/modules/linear.py

Lines changed: 30 additions & 16 deletions

@@ -29,6 +29,7 @@
 # Local
 from fms_mo.custom_ext_kernels.utils import pack_vectorized
 from fms_mo.quant.quantizers import (
+    SAWB,
     HardPrune,
     Qbypass,
     Qdynamic,
@@ -751,7 +752,7 @@ def from_fms_mo(cls, fms_mo_qlinear, **kwargs):
             fms_mo_qlinear.in_features,
             fms_mo_qlinear.out_features,
             bias=fms_mo_qlinear.bias is not None,
-            device=tar_dev,
+            device="meta",  # init on tar_dev is unnecessary
         )
         # Make sure to register an Op for integer matmul, could be real INT matmul or emulation
         qcfg = getattr(fms_mo_qlinear, "qcfg", {})
@@ -777,39 +778,34 @@ def from_fms_mo(cls, fms_mo_qlinear, **kwargs):
         with torch.no_grad():
             Qa = fms_mo_qlinear.quantize_feature
             Qw = fms_mo_qlinear.quantize_weight
-            w_cv = Qw.clip_val.item()
+            w_cv = Qw.clip_val
             if qlin_int.useDynMaxQfunc in [-1, -2]:  # [-1, -2] indicates reduce_dim
                 # dynamic Qmax has no clipvals, reg fake ones, won't be used in real calc
                 Qa.register_buffer("clip_val", torch.tensor(8.0, device=tar_dev))
                 Qa.register_buffer("clip_valn", torch.tensor(-8.0, device=tar_dev))
-            a_cv, a_cvn = Qa.clip_val.item(), Qa.clip_valn.item()
-            # Store original cv_a and cv_w (in python floats, not tensors), and sq scales
-            # for later use (probably not necessary)
-            qlin_int.cvs = [a_cv, a_cvn, w_cv]
-            # NOTE: Keep w transposed to prevent confusion
-            Qw.dequantize = False
-            # trigger Qw.clipval re-calc for SAWB (if needed)
-            w_int8 = Qw(fms_mo_qlinear.weight.float())
-            qlin_int.weight = nn.Parameter(
-                w_int8.to(torch.int8), requires_grad=False
-            )  # NOTE: may need INT W stored as FP in some cases
+            a_cv = Qa.clip_val
+            a_cvn = Qa.clip_valn
+            # Storing original cv_a and cv_w as python floats (instead of tensors) would be more
+            # accurate, but is not compatible with per-ch and per-token.
+            qlin_int.cvs = [a_cv, a_cvn, w_cv]  # TODO: remove the need for this.
 
+            # may need to trigger Qw.clip_val re-calc for SAWB here (if needed?)
             if qlin_int.useDynMaxQfunc in [-1, -2]:
                 input_scale = torch.tensor(1.0, device=tar_dev)
                 input_zero_point = torch.tensor(128, dtype=torch.int, device=tar_dev)
-                w_scale = torch.tensor([w_cv * 2 / w_levels], device=tar_dev)
+                w_scale = w_cv * 2 / w_levels
             elif qlin_int.usePTnativeQfunc:
                 input_scale = torch.tensor([(a_cv - a_cvn) / a_levels], device=tar_dev)
                 input_zero_point = torch.round(-a_cvn / input_scale).to(torch.int)
-                w_scale = torch.tensor([w_cv * 2 / w_levels], device=tar_dev)
+                w_scale = w_cv * 2 / w_levels
             else:
                 # fms_mo formula is a bit different from conventional PT formula
                 quant_scale = a_levels / torch.tensor([a_cv - a_cvn], device=tar_dev)
                 quant_stepsize = 1.0 / quant_scale
                 quant_zero_point = torch.round(a_cvn * quant_scale)
                 input_scale = quant_stepsize
                 input_zero_point = -quant_zero_point
-                quant_w_scale = w_levels / torch.tensor([w_cv * 2], device=tar_dev)
+                quant_w_scale = w_levels / (w_cv * 2)
                 w_scale = 1.0 / quant_w_scale
                 qlin_int.register_buffer("quant_scale", quant_scale)
                 qlin_int.register_buffer("quant_stepsize", quant_stepsize)
@@ -821,6 +817,21 @@ def from_fms_mo(cls, fms_mo_qlinear, **kwargs):
             qlin_int.register_buffer("w_scale", w_scale)
             qlin_int.register_buffer("w_zp", w_zp)
 
+            # NOTE:
+            # 1. Keep W transposed to prevent confusion, hence (W.t() / scale).t()
+            # 2. only a few quantizers have .dequantize working correctly
+            if isinstance(Qw, SAWB):
+                Qw.dequantize = False
+                w_int8 = Qw(fms_mo_qlinear.weight.float())
+            else:
+                w_int8 = (
+                    torch.round(fms_mo_qlinear.weight.t() / w_scale)
+                    .clamp(-w_levels / 2, w_levels / 2)
+                    .t()
+                )
+
+            qlin_int.weight = nn.Parameter(w_int8.to(torch.int8), requires_grad=False)
+
             corr_term = (
                 (input_zero_point - 128 + qlin_int.useSymAct)
                 * (w_int8.sum(dim=1))
@@ -836,8 +847,11 @@ def from_fms_mo(cls, fms_mo_qlinear, **kwargs):
                     (fms_mo_qlinear.bias - corr_term).to(fms_mo_w_dtype),
                     requires_grad=False,
                 )
+
                 qlin_int.org_model_has_bias = True
             else:
+                delattr(qlin_int, "bias")
+                # even if bias is None, reg_buffer() is still unhappy about it
                 qlin_int.register_buffer("bias", -corr_term.to(fms_mo_w_dtype))
                 qlin_int.org_model_has_bias = False
 
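For intuition, here is a small worked sketch of the scale/zero-point arithmetic in the usePTnativeQfunc branch above. The clip values and the level counts (a_levels = 255, w_levels = 254) are illustrative assumptions, not values read from the repo.

import torch

a_cv, a_cvn = 6.0, -6.0        # example activation clip values
w_cv = 0.5                     # example per-tensor weight clip value
a_levels, w_levels = 255, 254  # assumed 8-bit level counts (not taken from the repo)

input_scale = torch.tensor([(a_cv - a_cvn) / a_levels])             # ~0.047
input_zero_point = torch.round(-a_cvn / input_scale).to(torch.int)  # ~128 for a symmetric range
w_scale = torch.tensor(w_cv * 2 / w_levels)                         # ~0.0039

# Quantize a toy weight row the same way the non-SAWB path does, then reconstruct it
# to see the round-trip error that INT8 storage introduces.
w = torch.randn(8) * 0.1
w_int8 = torch.round(w / w_scale).clamp(-w_levels / 2, w_levels / 2).to(torch.int8)
w_hat = w_int8.float() * w_scale
print(input_scale.item(), input_zero_point.item(), (w - w_hat).abs().max().item())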

fms_mo/quant/quantizers.py

Lines changed: 4 additions & 3 deletions

@@ -3183,9 +3183,7 @@ class QmaxPerChSTE(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(
-        ctx, input_tensor, num_bits, _dequantize, inplace, cv, _cvn, align_zero
-    ):
+    def forward(ctx, input_tensor, num_bits, dequantize, inplace, cv, _cvn, align_zero):
         if inplace:
             ctx.mark_dirty(input_tensor)
         scale = (2**num_bits - 2) if align_zero else (2**num_bits - 1)
@@ -3206,6 +3204,9 @@ def forward(
             quant_min=int_l,
             quant_max=int_u,
         ).to(input_tensor.dtype)
+
+        if not dequantize:
+            return (output.t() / scale).t()
         return output
 
     @staticmethod
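For context, the new early return hands the caller the integer representation instead of the dequantized ("fake-quantized") tensor; scale at that point is presumed to be the per-output-channel step size used by fake_quantize_per_channel_affine earlier in forward (that part lies outside this hunk). A self-contained sketch of the same idea, under those assumptions:

import torch

# Per-channel max quantization of a [out_ch, in_ch] weight, mimicking a Qmax-style flow:
# fake-quantize (round + clamp + rescale), then divide by the per-channel scale to recover
# the integer levels, which is what the dequantize=False path returns.
w = torch.randn(3, 5)
num_bits, align_zero = 8, True
levels = (2**num_bits - 2) if align_zero else (2**num_bits - 1)

cv = w.abs().amax(dim=1)                 # per-channel clip value (max magnitude)
scale = cv * 2 / levels                  # assumed per-channel step size
zp = torch.zeros(w.shape[0], dtype=torch.int)

# args: input, scale, zero_point, axis, quant_min, quant_max
deq = torch.fake_quantize_per_channel_affine(w, scale, zp, 0, -(levels // 2), levels // 2)
int_repr = (deq.t() / scale).t()         # the dequantize=False return value
print(torch.allclose(int_repr, int_repr.round(), atol=1e-4))  # True: integer-valued levels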

fms_mo/training_args.py

Lines changed: 3 additions & 0 deletions

@@ -173,6 +173,9 @@ class FMSMOArguments(TypeChecker):
         default=2048, metadata={"help": "input sequence length after tokenization"}
     )
     eval_ppl: bool = field(default=False)
+    aiu_sim_triton: bool = field(
+        default=False, metadata={"help": ("AIU simulation with triton kernel")}
+    )
 
 
 @dataclass
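Since FMSMOArguments is a plain dataclass, the new flag can be toggled programmatically as well as through the usual argument parsing; a minimal sketch, assuming the remaining fields all have defaults:

from fms_mo.training_args import FMSMOArguments

# Request AIU simulation via the triton kernel; other fields fall back to their defaults.
fms_mo_args = FMSMOArguments(aiu_sim_triton=True)
print(fms_mo_args.aiu_sim_triton)  # True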
