
Commit 005d54b

add DL16 option for LinearFPx (FP8 aiu sim)
Signed-off-by: cliu-us <[email protected]>
1 parent bcee5f3 · commit 005d54b

File tree: 3 files changed, +91 −16 lines


fms_mo/custom_ext_kernels/triton_kernels.py

Lines changed: 34 additions & 2 deletions
@@ -114,6 +114,7 @@ def matmul_kernel(
     stride_cn,
     chunk_trun_bits,
     max_acc_bits,  # pylint: disable=unused-argument
+    clamp_acc_to_dl16,
     truncate_then_accumulate,
     # Meta-parameters
     BLOCK_SIZE_M: tl.constexpr,
@@ -182,6 +183,8 @@ def matmul_kernel(
         ## ------ add chunky LSB rounding/masking --------
         if chunk_trun_bits > 0:
             accumulator_inner = round_and_trun(accumulator_inner, round_bit, trun_mask)
+        if clamp_acc_to_dl16:
+            accumulator = fp32_clamp_to_dl16(accumulator)
         ## ---------------------------------------------------------
         if truncate_then_accumulate:
             accumulator += accumulator_inner
@@ -226,6 +229,7 @@ def imatmul_kernel(
     stride_cn,
     chunk_trun_bits,
     max_acc_bits,
+    clamp_acc_to_dl16,  # pylint: disable=unused-argument
     truncate_then_accumulate,
     # Meta-parameters
     BLOCK_SIZE_M: tl.constexpr,
@@ -324,6 +328,7 @@ def matmul_kernel_DABC(
     stride_cn,
     chunk_trun_bits,
     max_acc_bits,  # pylint: disable=unused-argument
+    clamp_acc_to_dl16,
     truncate_then_accumulate,
     # Meta-parameters
     BLOCK_SIZE_M: tl.constexpr,
@@ -405,6 +410,8 @@ def matmul_kernel_DABC(
         ## ------ add chunky LSB rounding/masking --------
         if chunk_trun_bits > 0:
             accumulator_inner = round_and_trun(accumulator_inner, round_bit, trun_mask)
+        if clamp_acc_to_dl16:
+            accumulator = fp32_clamp_to_dl16(accumulator)
         ## ---------------------------------------------------------
         if truncate_then_accumulate:
             accumulator += accumulator_inner
@@ -438,6 +445,28 @@ def round_and_trun(x, round_bit, trun_mask):
     return libdevice.uint_as_float((libdevice.float_as_uint(x) + round_bit) & trun_mask)


+@triton.jit
+def fp32_clamp_to_dl16(x):
+    """clamp FP32 (1-8-23) TENSOR x to DL16 (1-6-9) range."""
+    # 1. rounding, add round bit to full uint representation
+    x = libdevice.float_as_uint(x)
+    round_bit = 1 << (23 - 9 - 1)
+    x = libdevice.uint_as_float(x + round_bit)
+
+    # 2. clamp to min/max:
+    #    max = 2^32 * 1.(1111 1111 0)_base2 => 2^32*1.(1111 1111 1) will become inf
+    #          (32 + 127) << 23 | (0xFF8 << (23 - 12)) in FP32 is 8581545984.0
+    #    min = 2^-31 * 1.(0000 0000 1)_base2 => set to 0 for those smaller than this
+    #          (-31 + 127) << 23 | (1 << (23 - 9)) in FP32 is 4.665707820095122e-10
+    dl16_max = 8581545984.0
+    dl16_min = 4.665707820095122e-10
+    x = tl.where(x >= dl16_max, float("inf"), x)
+    x = tl.where(x <= -dl16_max, float("-inf"), x)
+    x = tl.where(tl.abs(x) < dl16_min, 0, x)
+
+    return x
+
+
 def tl_matmul_chunk_truncate(
     a,
     b,
@@ -448,6 +477,7 @@ def tl_matmul_chunk_truncate(
     max_acc_bits=32,
     truncate_then_accumulate=True,
     cast_output_to_input_dtype=None,
+    clamp_acc_to_dl16=False,
 ):
     """Triton matmul for HW behavior simulation. Supports float and int8.
     i. variable chunk size (i.e., BLOCK_SIZE_K)
@@ -461,7 +491,8 @@ def tl_matmul_chunk_truncate(
         chunk_size (int, optional): BLOCK_SIZE_K, some HW has specific chunk size. must >= 16.
         max_acc_bits (int, optional): num of bits for the accumulator, e.g. if INT24 is used, will
                                       clamp each chunk of a*b to [-2**23-1, 2**23].
-                                      (assuming no inf when overflow)
+                                      (only used by INT)
+        clamp_acc_to_dl16(bool): Only used by FP8, whether to clamp local accumulator (FP32) to DL16
         truncate_then_accumulate (bool, optional): if True, c = truncate(a*b) + c, otherwise
                                       c = truncate(a*b+c)
         cast_output_to_input_dtype (bool, optional): accumulator has higher prec than input, usually
@@ -473,7 +504,7 @@ def tl_matmul_chunk_truncate(

     NOTE:
     use empirical way to determine BLOCK sizes, may not be optimal. But need to avoid autotune for
-    real model inference. otherwise auto-tune will be triggered in every forward call.
+    real model inference. otherwise auto-tune may be triggered in every forward call.
     """

     # Check constraints.
@@ -584,6 +615,7 @@ def grid(META):
         c.stride(1),
         chunk_trun_bits=chunk_trun_bits,
         max_acc_bits=max_acc_bits,
+        clamp_acc_to_dl16=clamp_acc_to_dl16,
         truncate_then_accumulate=truncate_then_accumulate,
         ACTIVATION=activation,
         **kernel_config,  # if using auto-tune, comment this line out.
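
The two magic numbers in the new fp32_clamp_to_dl16 kernel follow directly from the bit patterns spelled out in its comments (DL16 being a 1-6-9 sign/exponent/mantissa layout). The host-side sketch below is not part of the commit; it merely assembles those FP32 bit patterns with the standard library and checks that they reproduce the literals used in the kernel. The commented-out tl_matmul_chunk_truncate call at the end is likewise an illustrative assumption based only on the signature shown in this diff.

# Host-side sanity check for the DL16 constants used in fp32_clamp_to_dl16 (illustration only).
import struct


def bits_to_fp32(bits: int) -> float:
    """Reinterpret a 32-bit integer pattern as an IEEE-754 float32."""
    return struct.unpack("<f", struct.pack("<I", bits & 0xFFFFFFFF))[0]


# Threshold at/above which the kernel maps values to inf:
# unbiased exponent 32 (stored as 32 + 127) with the top 9 mantissa bits set.
dl16_max = bits_to_fp32((32 + 127) << 23 | (0xFF8 << (23 - 12)))
# Threshold below which magnitudes are flushed to zero:
# unbiased exponent -31 (stored as -31 + 127) with only mantissa bit 9 set.
dl16_min = bits_to_fp32((-31 + 127) << 23 | (1 << (23 - 9)))

assert dl16_max == 8581545984.0
assert dl16_min == 4.665707820095122e-10

# Illustrative call of the updated matmul wrapper (assumes CUDA tensors a and b):
# c = tl_matmul_chunk_truncate(a, b, chunk_size=32, clamp_acc_to_dl16=True)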

fms_mo/custom_ext_kernels/utils.py

Lines changed: 36 additions & 11 deletions
@@ -870,14 +870,15 @@ def lower_qmodel_triton(
     model: torch.nn.Module,
     use_dyn_max_act=False,
     max_acc_bits=32,
+    clamp_acc_to_dl16=False,
     num_lsb_to_truncate=0,
     chunk_size=32,
 ):
     """
-    Examplar GPU lowering function using triton. Only swap Qlinears in transformers, nothing else.
+    Examplar GPU lowering function using triton. Only swap Linear/Qlinear in transformers.
     Triton kernel can be used to:
     1. test INT8 or FP8 HW performance (kernel is not optimized)
-    2. simulate MSB/LSB truncation effect
+    2. simulate MSB/LSB truncation effect or special dtype (DL16) accumulation

     Args:
         model: nn.Module. should be a fms_mo Qmodel, will do inplace layer swapping, no deepcopy
@@ -888,6 +889,8 @@ def lower_qmodel_triton(
                       efficiency at the expense of higher chance of accumulation "overflow".
                       For example, an INT24 accumulator can only hold values ranged from -2^23 to
                       2^23 -1, as opposed to typical range -2^31 to -2^31 -1.
+        clamp_acc_to_dl16: clamp local accumulator to DL16 (1-6-9) range. To simulate this special
+                      dtype effect on accumulation.
         num_lsb_to_truncate: number of bits to truncate from LSB side. For example, given fp32 is
                       s1e8m23, if we choose to truncate 13 mantissa bits from right most side,
                       i.e. LSB, the resulting number will be s1e8m10, which is TF32.
@@ -900,25 +903,47 @@ def lower_qmodel_triton(
     from torch.ao.quantization.utils import _parent_name

     # Local
-    from fms_mo.modules.linear import QLinear, QLinearINT8Deploy
+    from fms_mo.modules.linear import LinearFPxAcc, QLinear, QLinearINT8Deploy
+
+    # Currently QLinearINT8 has more options in dynamic quantization than LinearFP. Here we resolve
+    # the differences as a patch solution (will unify the codes in future release)
+    linFP_dyn_code = (
+        "per_token"
+        if use_dyn_max_act in [-1, -2]
+        else "per_tensor"
+        if use_dyn_max_act
+        else False
+    )

     for name, m in model.named_modules():
-        if not isinstance(m, QLinear):
+        if not isinstance(m, (QLinear, torch.nn.Linear)):
             continue
         parent_name, module_name = _parent_name(name)
         parent_mod = model.get_submodule(parent_name)
-        qmod = getattr(parent_mod, module_name)
-        setattr(
-            parent_mod,
-            module_name,
-            QLinearINT8Deploy.from_fms_mo(
-                qmod,
+
+        # Only support simulations of 1) QLinear -> INT8, 2) nnLinear->FP8 for now
+        if isinstance(m, QLinear):
+            new_lin = QLinearINT8Deploy.from_fms_mo(
+                m,
                 use_int_kernel="triton",
                 use_dynamic_max_act_Qfunc=use_dyn_max_act,
                 max_acc_bits=max_acc_bits,
                 truncate_lsb=num_lsb_to_truncate,
                 chunk_size=chunk_size,
-            ),
+            )
+        else:
+            new_lin = LinearFPxAcc.from_nn(
+                m,
+                trun_bits=max_acc_bits,
+                chunk_size=chunk_size,
+                dynamic_fp8=linFP_dyn_code,
+                clamp_acc_to_dl16=clamp_acc_to_dl16,
+            )
+
+        setattr(
+            parent_mod,
+            module_name,
+            new_lin,
         )

     logger.info(f"\nModel lowering with triton kernel is done.\n{model}")
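
A hypothetical end-to-end usage sketch of the updated lower_qmodel_triton follows; the checkpoint name, device handling, and flag values are illustrative assumptions and are not taken from the repository's documentation.

# Illustrative only: swap Linear/QLinear layers for Triton-backed simulation layers,
# enabling the new DL16 accumulator clamping on the FP8 path.
import torch
from transformers import AutoModelForCausalLM

from fms_mo.custom_ext_kernels.utils import lower_qmodel_triton

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m", torch_dtype=torch.float16
).to("cuda")

lower_qmodel_triton(
    model,
    use_dyn_max_act=-1,      # -1/-2 are mapped to "per_token" dynamic FP8 activation quantization
    clamp_acc_to_dl16=True,  # new option added in this commit
    num_lsb_to_truncate=0,
    chunk_size=32,
)
# model can now be run as usual; matmuls go through the Triton simulation kernel

Per the new isinstance check, plain torch.nn.Linear modules are lowered to the FP8/DL16 simulation path (LinearFPxAcc), while fms_mo QLinear modules still take the INT8 QLinearINT8Deploy path.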

fms_mo/modules/linear.py

Lines changed: 21 additions & 3 deletions
@@ -1899,7 +1899,16 @@ class LinearFuncFPxFwdBwd(torch.autograd.Function):
     """

     @staticmethod
-    def forward(ctx, x, weight, bias=None, trun_bits=0, chunk_size=16, fp8_dyn=False):
+    def forward(
+        ctx,
+        x,
+        weight,
+        bias=None,
+        trun_bits=0,
+        chunk_size=16,
+        fp8_dyn=False,
+        clamp_acc_to_dl16=False,
+    ):
         assert x.dtype in [torch.float, torch.bfloat16, torch.float16]
         # input can be 2D or 3D, need to reshape before tl_matmul
         org_dtype = x.dtype
@@ -1916,6 +1925,7 @@ def forward(ctx, x, weight, bias=None, trun_bits=0, chunk_size=16, fp8_dyn=False
         ctx.trun_bits = trun_bits
         ctx.chunk_size = chunk_size
         ctx.fp8_dyn = fp8_dyn
+        ctx.clamp_acc_to_dl16 = clamp_acc_to_dl16

         if fp8_dyn:
             # use Q/dQ simulation for now, meaning still compute in fp16/bf16
@@ -1936,6 +1946,7 @@ def forward(ctx, x, weight, bias=None, trun_bits=0, chunk_size=16, fp8_dyn=False
             weight.t().to(org_dtype),
             chunk_trun_bits=trun_bits,
             chunk_size=chunk_size,
+            clamp_acc_to_dl16=clamp_acc_to_dl16,
         ).reshape(target_shape_output)

         if bias is not None:
@@ -1976,6 +1987,7 @@ def backward(ctx, grad_output):
                 x,
                 chunk_trun_bits=trun_bits,
                 chunk_size=chunk_size,
+                clamp_acc_to_dl16=ctx.clamp_acc_to_dl16,
             ).to(weight.dtype)
             # Compute grad_input in 2D then reshape to target shape, could be 3D or 2D
             grad_input = (
@@ -1984,6 +1996,7 @@ def backward(ctx, grad_output):
                     weight.to(dtype_input),
                     chunk_trun_bits=trun_bits,
                     chunk_size=chunk_size,
+                    clamp_acc_to_dl16=ctx.clamp_acc_to_dl16,
                 )
                 .reshape(target_shape_grad_input)
                 .to(dtype_input)
@@ -1994,7 +2007,7 @@ def backward(ctx, grad_output):
         else:
             grad_bias = grad_output_2D.sum(0).to(ctx.bias_dtype)

-        return grad_input, grad_weight, grad_bias, None, None, None
+        return grad_input, grad_weight, grad_bias, None, None, None, None


 class LinearFPxAcc(torch.nn.Linear):
@@ -2016,6 +2029,10 @@ def from_nn(cls, nnlin, trun_bits=0, **kwargs):
             cls (class): The class to be created.
             nnlin (torch.nn.Linear): The original torch.nn.Linear module.
             trun_bits (int): truncate [0 to 22] LSBs from FP32 accumulation.
+            dynamic_fp8: whether to use dynamic quantization for fp8 activations, available options
+                         are ["per_tensor", "per_token", False]
+            clamp_acc_to_dl16: clamp local accumulator into DL16 range, to simulate the effect of
+                         this special dtype
             **kwargs: Additional keyword arguments.

         Returns:
@@ -2037,7 +2054,7 @@ def from_nn(cls, nnlin, trun_bits=0, **kwargs):
         lin24acc.trun_bits = trun_bits
         lin24acc.chunk_size = kwargs.get("chunk_size", False)
         lin24acc.fp8_dyn = kwargs.get("dynamic_fp8", False)
-        # available options are ["per_tensor", "per_token"]
+        lin24acc.clamp_acc_to_dl16 = kwargs.get("clamp_acc_to_dl16", False)

         if nnlin.bias is not None:
             lin24acc.bias = nnlin.bias
@@ -2052,6 +2069,7 @@ def forward(self, inputs):
             self.trun_bits,
             self.chunk_size,
             self.fp8_dyn,
+            self.clamp_acc_to_dl16,
         )

     def extra_repr(self) -> str:
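
For completeness, a minimal sketch of exercising the new flag through LinearFPxAcc directly, assuming a CUDA device with a working FP8/Triton setup; shapes and flag values are illustrative and simply mirror the keyword arguments that from_nn now documents.

import torch

from fms_mo.modules.linear import LinearFPxAcc

lin = torch.nn.Linear(1024, 1024, bias=True, device="cuda", dtype=torch.float16)
lin_dl16 = LinearFPxAcc.from_nn(
    lin,
    trun_bits=0,              # no extra LSB truncation of the FP32 accumulator
    chunk_size=32,            # BLOCK_SIZE_K used by the chunked Triton matmul
    dynamic_fp8="per_token",  # dynamic FP8 activation quantization
    clamp_acc_to_dl16=True,   # clamp the local FP32 accumulator to DL16 range
)

x = torch.randn(8, 1024, device="cuda", dtype=torch.float16)
y = lin_dl16(x)  # forward (and backward) route through LinearFuncFPxFwdBwd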
