
Commit b685ea8

fix triton DL16 aiu sim with subnorm flushing
Signed-off-by: cliu-us <[email protected]>
1 parent 9925706 commit b685ea8

6 files changed: +229 -64 lines changed

fms_mo/custom_ext_kernels/triton_kernels.py

Lines changed: 64 additions & 44 deletions
@@ -160,13 +160,8 @@ def matmul_kernel(
     # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
     # of fp32 values for higher accuracy.
     accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    ## ------ prepare LSB rounding/truncation masks -------
-    # NOTE mask will be applied on accumulator, which is alway FP32, so we may truncate up to 23b
-    # e.g., 20b -> trun_mask = 0xFFF00000, round_bit = 0x00080000
-    #        8b -> trun_mask = 0xFFFFFF00, round_bit = 0x00000080
-    trun_mask = ~tl.cast((1 << chunk_trun_bits) - 1, tl.uint32)
-    round_bit = 1 << (chunk_trun_bits - 1) if chunk_trun_bits > 0 else 0
-    ## ---------------------------------------------------------
+    ## ------ prepare LSB rounding/truncation masks outside the for loop -------
+    round_bit, trun_mask = round_and_trun_mask(chunk_trun_bits, clamp_acc_to_dl16)

     for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
         # Load the next block of A and B, generate a mask by checking the K dimension.
@@ -181,10 +176,10 @@ def matmul_kernel(
         # tl.dot() default is using TF32 approximation, not good enough for LSB truncation exp

         ## ------ add chunky LSB rounding/masking --------
-        if chunk_trun_bits > 0:
-            accumulator_inner = round_and_trun(accumulator_inner, round_bit, trun_mask)
-        if clamp_acc_to_dl16:
-            accumulator_inner = fp32_clamp_to_dl16(accumulator_inner)
+        if clamp_acc_to_dl16 or chunk_trun_bits > 0:
+            accumulator_inner = round_and_trun(
+                accumulator_inner, round_bit, trun_mask, clamp_acc_to_dl16
+            )
         ## ---------------------------------------------------------
         if truncate_then_accumulate:
             accumulator += accumulator_inner
@@ -382,13 +377,8 @@ def matmul_kernel_DABC(
     # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
     # of fp32 values for higher accuracy, i.e. C should have been cast to fp32 already
    accumulator = tl.load(c_ptrs, mask=c_mask, other=0.0)
-    ## ------ prepare LSB rounding/truncation masks -------
-    # NOTE mask will be applied on accumulator, which is alway FP32, so we may truncate up to 23b
-    # e.g., 20b -> trun_mask = 0xFFF00000, round_bit = 0x00080000
-    #        8b -> trun_mask = 0xFFFFFF00, round_bit = 0x00000080
-    trun_mask = ~tl.cast((1 << chunk_trun_bits) - 1, tl.uint32)
-    round_bit = 1 << (chunk_trun_bits - 1) if chunk_trun_bits > 0 else 0
-    ## ---------------------------------------------------------
+    ## ------ prepare LSB rounding/truncation masks outside the for loop -------
+    round_bit, trun_mask = round_and_trun_mask(chunk_trun_bits, clamp_acc_to_dl16)

     for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
         # Load the next block of A, B, and C, generate a mask by checking the K dimension.
@@ -408,10 +398,10 @@ def matmul_kernel_DABC(
         # precision as well, hence, could lose some precision!

         ## ------ add chunky LSB rounding/masking --------
-        if chunk_trun_bits > 0:
-            accumulator_inner = round_and_trun(accumulator_inner, round_bit, trun_mask)
-        if clamp_acc_to_dl16:
-            accumulator_inner = fp32_clamp_to_dl16(accumulator_inner)
+        if clamp_acc_to_dl16 or chunk_trun_bits > 0:
+            accumulator_inner = round_and_trun(
+                accumulator_inner, round_bit, trun_mask, clamp_acc_to_dl16
+            )
         ## ---------------------------------------------------------
         if truncate_then_accumulate:
             accumulator += accumulator_inner
@@ -440,34 +430,64 @@ def leaky_relu(x):


 @triton.jit
-def round_and_trun(x, round_bit, trun_mask):
-    """Round and truncate (usually for accumulator)."""
-    return libdevice.uint_as_float((libdevice.float_as_uint(x) + round_bit) & trun_mask)
+def round_and_trun_mask(chunk_trun_bits, clamp_acc_to_dl16):
+    """
+    Rounding and LSB truncation masks only need to be generated once.
+    These masks will be applied on the "inner" accumulator, which is always FP32 (e8m23). We may
+    truncate up to 23b of mantissa. If DL16/DL8, pay attention to exponent bias.
+    Examples: 20b -> trun_mask = 0xFFF00000, round_bit = 0x00080000
+               8b -> trun_mask = 0xFFFFFF00, round_bit = 0x00000080
+    """
+    if clamp_acc_to_dl16:
+        # DL16 is e6m9, hence, truncate 23 - 9 = 14 bits
+        chunk_trun_bits = 14
+    round_bit = 1 << (chunk_trun_bits - 1) if chunk_trun_bits > 0 else 0
+    trun_mask = ~tl.cast((1 << chunk_trun_bits) - 1, tl.uint32)
+    return round_bit, trun_mask


 @triton.jit
-def fp32_clamp_to_dl16(x):
-    """clamp FP32 (1-8-23) TENSOR x to DL16 (1-6-9) range."""
-    # 1. rounding: add round bit, zero out last 13 bits, back to float
-    x = libdevice.float_as_uint(x)
-    round_bit = 1 << (23 - 9 - 1)
-    mask_13x0 = ~tl.cast((1 << 13) - 1, tl.uint32)
-    x = libdevice.uint_as_float((x + round_bit) & mask_13x0)
-
-    # 2. clamp to min/max:
-    # max = 2^32 * 1.(1111 1111 0)_base2 => 2^32*1.(1111 1111 1) will become inf
-    #       (32 + 127) << 23 | (0xFF8 << (23 - 12)) in FP32 is 8581545984.0
-    # min = 2^-31 * 1.(0000 0000 1)_base2 => set to 0 for those smaller than this
-    #       (-31 + 127) << 23 | (1 << (23 - 9)) in FP32 is 4.665707820095122e-10
-    dl16_max = 8581545984.0
-    dl16_min = 4.665707820095122e-10
-    x = tl.where(x >= dl16_max, float("inf"), x)
-    x = tl.where(x <= -dl16_max, float("-inf"), x)
-    x = tl.where(tl.abs(x) < dl16_min, 0, x)
-
+def round_and_trun(x, round_bit, trun_mask, clamp_acc_to_dl16):
+    """Round and truncate (usually for accumulator)."""
+    x = libdevice.uint_as_float((libdevice.float_as_uint(x) + round_bit) & trun_mask)
+
+    if clamp_acc_to_dl16:
+        # clamp to DL16 min/max:
+        # max = 2^32 * 1.(1111 1111 0)_base2 = 2^32 * (2 - 2^-9) = 8581545984.0
+        #       greater than this will become +inf (or -inf)
+        # min = 2^-31 * 1.(0000 0000 1)_base2 = 2^-31 * (1 + 2^-9) = 4.665707820095122e-10
+        #       smaller than this will become 0
+        dl16_max = 8581545984.0
+        dl16_min = 4.665707820095122e-10
+        x = tl.where(x >= dl16_max, float("inf"), x)
+        x = tl.where(x <= -dl16_max, float("-inf"), x)
+        x = tl.where(tl.abs(x) < dl16_min, 0, x)
     return x


+# @triton.jit
+# def fp32_clamp_to_dl16(x):
+#     """clamp FP32 (1-8-23) TENSOR x to DL16 (1-6-9) range."""
+#     # 1. rounding: add round bit, zero out last 13 bits, back to float
+#     x = libdevice.float_as_uint(x)
+#     round_bit = 1 << (23 - 9 - 1)
+#     mask_13x0 = ~tl.cast((1 << 13) - 1, tl.uint32)
+#     x = libdevice.uint_as_float((x + round_bit) & mask_13x0)
+
+#     # 2. clamp to min/max:
+#     # max = 2^32 * 1.(1111 1111 0)_base2 => 2^32*1.(1111 1111 1) will become inf
+#     #       (32 + 127) << 23 | (0xFF8 << (23 - 12)) in FP32 is 8581545984.0
+#     # min = 2^-31 * 1.(0000 0000 1)_base2 => set to 0 for those smaller than this
+#     #       (-31 + 127) << 23 | (1 << (23 - 9)) in FP32 is 4.665707820095122e-10
+#     dl16_max = 8581545984.0
+#     dl16_min = 4.665707820095122e-10
+#     x = tl.where(x >= dl16_max, float("inf"), x)
+#     x = tl.where(x <= -dl16_max, float("-inf"), x)
+#     x = tl.where(tl.abs(x) < dl16_min, 0, x)
+
+#     return x
+
+
 def tl_matmul_chunk_truncate(
     a,
     b,
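
Note: the bit-level scheme above can be sanity-checked outside Triton. The following sketch is not part of the commit; it uses plain NumPy and an illustrative helper name (fp32_round_and_trun) to mirror what round_and_trun_mask plus round_and_trun do to an FP32 tensor, and to re-derive the hard-coded DL16 bounds.

```python
import numpy as np

def fp32_round_and_trun(x, chunk_trun_bits, clamp_acc_to_dl16=False):
    """NumPy emulation of the kernel's LSB rounding/truncation (+ optional DL16 clamp)."""
    if clamp_acc_to_dl16:
        chunk_trun_bits = 14  # FP32 carries 23 mantissa bits, DL16 (e6m9) keeps only 9
    round_bit = 1 << (chunk_trun_bits - 1) if chunk_trun_bits > 0 else 0
    trun_mask = np.uint32(~np.uint32((1 << chunk_trun_bits) - 1))

    # reinterpret FP32 as uint32, add the round bit, zero out the truncated LSBs
    bits = np.asarray(x, dtype=np.float32).view(np.uint32)
    y = ((bits + np.uint32(round_bit)) & trun_mask).view(np.float32)

    if clamp_acc_to_dl16:
        dl16_max = np.float32(8581545984.0)           # 2**32 * (2 - 2**-9)
        dl16_min = np.float32(4.665707820095122e-10)  # 2**-31 * (1 + 2**-9)
        y = np.where(y >= dl16_max, np.float32(np.inf), y)
        y = np.where(y <= -dl16_max, np.float32(-np.inf), y)
        y = np.where(np.abs(y) < dl16_min, np.float32(0.0), y)
    return y

# the docstring examples and the DL16 constants check out:
assert hex(~((1 << 20) - 1) & 0xFFFFFFFF) == "0xfff00000"  # 20b -> trun_mask
assert 2.0**32 * (2 - 2.0**-9) == 8581545984.0             # dl16_max
print(2.0**-31 * (1 + 2.0**-9))                            # ~4.6657e-10, i.e. dl16_min
print(fp32_round_and_trun(np.array([1.0000001, 3.14159], dtype=np.float32), 8))
```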

fms_mo/dq.py

Lines changed: 29 additions & 2 deletions
@@ -161,18 +161,18 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
     # config layers to skip, smooth scale
     config_quantize_smooth_layers(qcfg)

+    use_dynamo = True
+    # use dynamo as default unless really needed, False -> fallback to TorchScript tracing
     if any(x != 32 for x in attn_bits):
         logger.info("Quantize attention bmms or kvcache, will use dynamo for prep")
         use_layer_name_pattern_matching = False
         qcfg["qlayer_name_pattern"] = []
         assert (
             qcfg["qlayer_name_pattern"] == []
         ), "ensure nothing in qlayer_name_pattern when use dynamo"
-        use_dynamo = True
     else:
         logger.info("Attention bmms will not be quantized.")
         use_layer_name_pattern_matching = True
-        use_dynamo = False

     qcfg["seq_len"] = block_size
     qcfg["model"] = model_args.model_name_or_path
@@ -271,6 +271,33 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         clamp_acc_to_dl16=False,  # fms_mo_args.aiu_sim_triton == "fp8"
         # layer_to_exclude=["lm_head",]
     )
+    # [CL] -------- record W, A, qW, qA with hooks ----------------
+    # from fms_mo.modules.linear import QLinear, QLinearINT8Deploy
+    # from fms_mo.quant.ptq import HookRecPostQuantInOut
+    # cache_dict = {}
+    # hook_handles = []
+    # for n, m in model.named_modules():
+    #     if not isinstance(m, (QLinear, QLinearINT8Deploy, torch.nn.Linear)):
+    #         continue
+
+    #     m.mod_name = n
+    #     hook_handles.append(
+    #         m.register_forward_hook(HookRecPostQuantInOut(cache_dict, n))
+    #     )
+
+    # data_mb = next(iter(eval_dataloader))
+    # with torch.no_grad():
+    #     model(**data_mb)
+
+    # for h in hook_handles:
+    #     h.remove()
+
+    # torch.save(
+    #     cache_dict,
+    #     f"roberta_sqv2_data_dump_{qcfg['qa_mode']}_{qcfg['qw_mode']}_chunk64_lsb{args.aiu_int_lsb_trun}_dq.pt"
+    # )
+    # return
+
     if fms_mo_args.eval_ppl:
         path_test = Path(data_args.test_data_path)
         arrow_files = list(path_test.glob("*.arrow"))

fms_mo/modules/linear.py

Lines changed: 11 additions & 0 deletions
@@ -1926,6 +1926,7 @@ def forward(
         ctx.chunk_size = chunk_size
         ctx.fp8_dyn = fp8_dyn
         ctx.clamp_acc_to_dl16 = clamp_acc_to_dl16
+        ctx.dl8_min = 0.0087890625

         if fp8_dyn:
             # use Q/dQ simulation for now, meaning still compute in fp16/bf16
@@ -1943,6 +1944,11 @@ def forward(

             x = (x / x_scale).to(torch.float8_e4m3fn).to(org_dtype) * x_scale
             weight = (weight / w_scale).to(torch.float8_e4m3fn).to(org_dtype) * w_scale
+            if clamp_acc_to_dl16:
+                # NOTE For DL8@DL8 acc in DL16, as DL8 doesn't support subnorm numbers like PyTorch
+                # (whose real min for e4m3fn is 2^-9), need to flush subnorm numbers to 0
+                x.masked_fill_(x < ctx.dl8_min, 0)
+                weight.masked_fill_(weight < ctx.dl8_min, 0)

         # triton kernel assumes 2D inputs and cast the return to input.dtype
         output = tl_matmul(
@@ -1983,6 +1989,11 @@ def backward(ctx, grad_output):
             grad_output_2D = (grad_output_2D / grad_out_scale).to(torch.float8_e5m2).to(
                 grad_output.dtype
             ) * grad_out_scale
+            if ctx.clamp_acc_to_dl16:
+                # flush subnorm numbers to 0 as DL8 doesn't support it
+                x.masked_fill_(x < ctx.dl8_min, 0)
+                weight.masked_fill_(weight < ctx.dl8_min, 0)
+                grad_output_2D.masked_fill_(grad_output_2D < ctx.dl8_min, 0)

         # Compute grad_weight, shape = [out, in]
         # NOTE: this triton kernel requires A matrix to be contiguous
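
Note: the forward/backward additions above flush values below ctx.dl8_min to zero after the fake cast to torch.float8_e4m3fn, because PyTorch's e4m3fn keeps subnormals (down to 2^-9) while the simulated DL8 format does not. Below is a minimal standalone sketch of the same idea, not taken from the commit: the helper name is illustrative, the threshold reuses the commit's dl8_min constant, and the flush here is applied by magnitude.

```python
import torch

DL8_MIN = 0.0087890625  # smallest DL8 magnitude kept, per the commit's ctx.dl8_min

def flush_subnormals(t: torch.Tensor, min_normal: float = DL8_MIN) -> torch.Tensor:
    """Zero out entries whose magnitude falls below the format's minimum normal value."""
    return t.masked_fill(t.abs() < min_normal, 0.0)

# fake-quantize to float8_e4m3fn (PyTorch keeps subnormals down to 2**-9),
# then drop anything a subnormal-free DL8 could not represent
x = torch.tensor([0.5, 0.004, -0.002, 0.02], dtype=torch.bfloat16)
x_fp8 = x.to(torch.float8_e4m3fn).to(torch.bfloat16)
print(flush_subnormals(x_fp8))  # the small-magnitude entries become 0
```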

fms_mo/quant/ptq.py

Lines changed: 112 additions & 1 deletion
@@ -42,6 +42,7 @@
 # Local
 from fms_mo.modules import QBmm, QLinear
 from fms_mo.modules.conv import QConv2dPTQv2
+from fms_mo.modules.linear import LinearFPxAcc, QLinearINT8Deploy
 from fms_mo.quant.quantizers import (
     AdaRoundQuantizer,
     Qdynamic,
@@ -481,8 +482,118 @@ def __call__(self, mod, inputs, *args, **_kwargs):
         assert not self.stop_after_rec


-# this hook is meant for ptq_loss_func == 'fisher_diag' and to temp hold the "Q_out" of the module
+class HookRecPostQuantInOut(torch.nn.Module):
+    """Another simplified hook to check post-quantized input/output, e.g. within +-127 for INT8."""
+
+    def __init__(self, cache_dict={}, mod_name=None):
+        super().__init__()
+        self.cache_dict = cache_dict
+        self.mod_name = mod_name
+        name_split = mod_name.split(".")
+        self.lay_idx = int(name_split[3])
+        self.lay_key = name_split[6]
+
+        self.cache_dev = "cpu"
+        # prepare empty dict for later use
+        self.cache_dict[mod_name] = {}
+        self.fwd_mapping = {
+            LinearFPxAcc: self.call_func_for_fpxacc,
+            QLinear: self.call_func_for_qlinear,
+            QLinearINT8Deploy: self.call_func_for_qlinear_int,
+            torch.nn.Linear: self.call_func_for_nnlinear,
+        }
+
+    def call_func_for_fpxacc(self, mod, inputs, outputs, **_kwargs):
+        raise NotImplementedError
+
+    def call_func_for_qlinear(self, mod, inputs, outputs, **_kwargs):
+        lay_idx = self.lay_idx
+        lay_key = self.lay_key
+        mod_name = self.mod_name
+        cache_dict = self.cache_dict
+
+        act_max = inputs[0].abs().amax(dim=[d for d in range(len(inputs[0].shape) - 1)])
+        # mod.smoothq_act_scale
+        w_max = mod.weight.abs().max(dim=0, keepdim=True)[0].clamp(min=1e-5)
+        is_smq_layer = not torch.all(act_max == 0).item()
+        # smoothQ scale = smoothq_act_scale**alpha / weight_scale**(1.0 - alpha)
+        # smoothq_scale = mod.get_smoothq_scale(inputs[0])
+        smoothq_scale = getattr(mod, "smq_scale", 1.0)
+        # "smq_scale" only available in QLin_INT8
+
+        with torch.no_grad():
+            smoothed_inp = inputs[0] / smoothq_scale
+            smoothed_w = mod.weight * smoothq_scale
+
+            # this is assuming pertokenmax quantizer, NOTE calc quant scale after smoothing
+            absmax = smoothed_inp.abs().max(dim=-1, keepdim=True)[0]
+            qa_scale = absmax.clamp(min=1e-5) / 127
+            qinput = torch.round(smoothed_inp / qa_scale).clamp(-127, 127)
+            # should clamp to -128?
+            if mod.qa_mode == "pertokenmax":
+                # doesn't implement dequant=False yet, do it manually
+                cva = mod.quantize_feature.clip_val
+                qa_scale = cva.clamp(min=1e-5).div(127)
+                qinput = smoothed_inp.div(qa_scale).round()
+            else:
+                mod.quantize_feature.dequantize = False
+                qinput = mod.quantize_feature(smoothed_inp)
+                mod.quantize_feature.dequantize = True
+
+            # also record quantized, smoothed W in INT8, calc both maxperCh and SAWBperCh
+            cvw = mod.quantize_weight.clip_val
+            scale_w = cvw / 127
+            mod.quantize_weight.dequantize = False
+            qw = mod.quantize_weight(smoothed_w)
+            mod.quantize_weight.dequantize = True
+
+        # inputs is a tuple, QLinear only has 1 valid input
+        cache_dict[mod_name]["input"] = inputs[0].to(self.cache_dev)
+        cache_dict[mod_name]["cva"] = cva.to(self.cache_dev)
+        cache_dict[mod_name]["cvw"] = cvw.to(self.cache_dev)
+        cache_dict[mod_name]["smoothed_input"] = smoothed_inp.to(self.cache_dev)
+        cache_dict[mod_name]["smoothed_weight"] = smoothed_w.to(self.cache_dev)
+        cache_dict[mod_name]["qinput"] = qinput.to(self.cache_dev)
+        # NOTE in INT8, *scales if need dQ
+        cache_dict[mod_name]["qweight"] = qw.to(self.cache_dev)
+        # torch.round(smoothed_w.T/scale_w).clamp(-127, 127).to(self.cache_dev)
+        # cache_dict[mod_name]["qoutput"] = outputs.to(self.cache_dev)
+
+    def call_func_for_qlinear_int(self, mod, inputs, outputs, **_kwargs):
+        smoothq_scale = getattr(mod, "smq_scale", 1.0)
+        mod_name = self.mod_name
+        cache_dict = self.cache_dict
+        with torch.no_grad():
+            if mod.useDynMaxQfunc in [-1, -2]:
+                qinput = mod.qa_dynamic_max_qfunc(inputs[0])
+            elif mod.use_fake_zero_shift:
+                qinput = mod.qa_dyn_max_fake_zero_shift(inputs[0])
+            elif mod.usePTnativeQfunc:
+                qinput = mod.qa_raw_qfunc(inputs[0])
+            else:
+                qinput = mod.qa_fmo_mo_qfunc(inputs[0])
+
+        # inputs is a tuple, QLinear only has 1 valid input
+        cache_dict[mod_name]["input"] = inputs[0].to(self.cache_dev)
+        cache_dict[mod_name]["cva"] = mod.cvs[0].to(self.cache_dev)
+        cache_dict[mod_name]["cvw"] = mod.cvs[2].to(self.cache_dev)
+        cache_dict[mod_name]["qinput"] = qinput.to(self.cache_dev)
+        cache_dict[mod_name]["qweight"] = mod.weight.to(self.cache_dev)
+
+    def call_func_for_nnlinear(self, mod, inputs, outputs, **_kwargs):
+        mod_name = self.mod_name
+        cache_dict = self.cache_dict
+        cache_dict[mod_name]["input"] = inputs[0].to(self.cache_dev)
+        cache_dict[mod_name]["weight"] = mod.weight.to(self.cache_dev)
+
+    def __call__(self, mod, inputs, outputs, **_kwargs):
+        self.fwd_mapping[type(mod)](mod, inputs, outputs, **_kwargs)
+
+
 class PTQHookRecQOut(nn.Module):
+    """This hook is for ptq_loss_func == 'fisher_diag' and will temporarily hold the "Q_out" of the
+    module"""
+
     def __init__(self, qcfg):
         super().__init__()
         self.qcfg = qcfg
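
Note: the commented-out block added to fms_mo/dq.py above shows how this hook is intended to be wired up. A condensed usage sketch follows; the function name, dataloader, and save path are placeholders, and the hook parses module names positionally (name_split[3], name_split[6]), so it expects transformer-style layer paths like those in dq.py.

```python
import torch
from fms_mo.modules.linear import QLinear, QLinearINT8Deploy
from fms_mo.quant.ptq import HookRecPostQuantInOut

def record_post_quant_io(model, dataloader, save_path="post_quant_dump.pt"):
    """Attach HookRecPostQuantInOut to each (Q)Linear, run one batch, and save the captures."""
    cache_dict, handles = {}, []
    for name, module in model.named_modules():
        if not isinstance(module, (QLinear, QLinearINT8Deploy, torch.nn.Linear)):
            continue
        module.mod_name = name
        handles.append(module.register_forward_hook(HookRecPostQuantInOut(cache_dict, name)))

    batch = next(iter(dataloader))
    with torch.no_grad():
        model(**batch)  # hooks fill cache_dict during this forward pass

    for h in handles:
        h.remove()
    torch.save(cache_dict, save_path)
    return cache_dict
```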

fms_mo/training_args.py

Lines changed: 1 addition & 1 deletion
@@ -192,7 +192,7 @@ class FMSMOArguments(TypeChecker):
         default=2048, metadata={"help": "input sequence length after tokenization"}
     )
     eval_ppl: bool = field(default=False)
-    aiu_sim_triton: str = field(
+    aiu_sim_triton: Optional[str] = field(
         default=None,
         metadata={
             "help": (

0 commit comments
