
Commit 275d47d

new triton version doesn't like 0xFFFFFFFF as a const
Signed-off-by: cliu-us <[email protected]>
1 parent 8d6bb65 commit 275d47d

5 files changed, +42 -21 lines changed


fms_mo/custom_ext_kernels/triton_kernels.py

Lines changed: 5 additions & 4 deletions
@@ -164,7 +164,7 @@ def matmul_kernel(
     # NOTE mask will be applied on accumulator, which is alway FP32, so we may truncate up to 23b
     # e.g., 20b -> trun_mask = 0xFFF00000, round_bit = 0x00080000
     #       8b -> trun_mask = 0xFFFFFF00, round_bit = 0x00000080
-    trun_mask = tl.cast((0xFFFFFFFF >> chunk_trun_bits) << chunk_trun_bits, tl.uint32)
+    trun_mask = ~tl.cast((1 << chunk_trun_bits) - 1, tl.uint32)
     round_bit = 1 << (chunk_trun_bits - 1) if chunk_trun_bits > 0 else 0
     ## ---------------------------------------------------------

@@ -386,7 +386,7 @@ def matmul_kernel_DABC(
     # NOTE mask will be applied on accumulator, which is alway FP32, so we may truncate up to 23b
     # e.g., 20b -> trun_mask = 0xFFF00000, round_bit = 0x00080000
     #       8b -> trun_mask = 0xFFFFFF00, round_bit = 0x00000080
-    trun_mask = tl.cast((0xFFFFFFFF >> chunk_trun_bits) << chunk_trun_bits, tl.uint32)
+    trun_mask = ~tl.cast((1 << chunk_trun_bits) - 1, tl.uint32)
     round_bit = 1 << (chunk_trun_bits - 1) if chunk_trun_bits > 0 else 0
     ## ---------------------------------------------------------
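Both hunks build the same truncation mask without the 0xFFFFFFFF literal (which newer Triton versions reject, presumably because it no longer fits the default int32 constant handling). A standalone NumPy sketch, illustration only and not Triton code, checking that the old and new expressions produce identical masks:

import numpy as np

# Standalone check (not part of the commit): for n truncated low bits, the old
# literal-based mask and the new (1 << n) - 1 based mask are bit-identical.
for n in range(0, 24):  # accumulator is FP32, so up to 23 mantissa bits may be truncated
    old_mask = np.uint32((0xFFFFFFFF >> n) << n)
    new_mask = ~np.uint32((1 << n) - 1)
    assert old_mask == new_mask, (n, hex(int(old_mask)), hex(int(new_mask)))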

@@ -448,10 +448,11 @@ def round_and_trun(x, round_bit, trun_mask):
 @triton.jit
 def fp32_clamp_to_dl16(x):
     """clamp FP32 (1-8-23) TENSOR x to DL16 (1-6-9) range."""
-    # 1. rounding: add round bit to full uint representation, zero out last 13 bits, back to float
+    # 1. rounding: add round bit, zero out last 13 bits, back to float
     x = libdevice.float_as_uint(x)
     round_bit = 1 << (23 - 9 - 1)
-    x = libdevice.uint_as_float(((x + round_bit) >> 13) << 13)
+    mask_13x0 = ~tl.cast((1 << 13) - 1, tl.uint32)
+    x = libdevice.uint_as_float((x + round_bit) & mask_13x0)

     # 2. clamp to min/max:
     # max = 2^32 * 1.(1111 1111 0)_base2 => 2^32*1.(1111 1111 1) will become inf
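The same mask idiom replaces the shift-right/shift-left pair in the DL16 rounding step. A plain-Python sketch of that step using struct for the bit reinterpretation (illustrative only; the exponent clamping that follows in the kernel is omitted):

import struct

def round_mantissa_to_dl16(x: float) -> float:
    """Round an FP32 value to a 9-bit mantissa: add the round bit, then zero the
    13 low mantissa bits, mirroring step 1 of fp32_clamp_to_dl16."""
    u = struct.unpack("<I", struct.pack("<f", x))[0]  # float bits as uint32
    round_bit = 1 << (23 - 9 - 1)                     # half ULP of the kept 9-bit mantissa
    mask_13x0 = ~((1 << 13) - 1) & 0xFFFFFFFF         # keep sign, exponent, top 9 mantissa bits
    u = ((u + round_bit) & mask_13x0) & 0xFFFFFFFF
    return struct.unpack("<f", struct.pack("<I", u))[0]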

fms_mo/custom_ext_kernels/utils.py

Lines changed: 6 additions & 0 deletions
@@ -918,6 +918,12 @@ def lower_qmodel_triton(

     if layer_to_exclude is None:
         layer_to_exclude = []
+    elif isinstance(layer_to_exclude, str):
+        layer_to_exclude = [
+            layer_to_exclude,
+        ]
+    elif not isinstance(layer_to_exclude, (list, tuple)):
+        raise RuntimeError("layer_to_exclude has to be either str, list, or tuple.")

     for name, m in model.named_modules():
         if not isinstance(m, (QLinear, torch.nn.Linear)) or name in layer_to_exclude:
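With this normalization, a single layer name and a list of names are treated the same way; a usage sketch (model is any already-prepared torch model, other arguments left at their defaults for brevity):

from fms_mo.custom_ext_kernels.utils import lower_qmodel_triton

# Both forms are accepted after this change; anything that is not None, str,
# list, or tuple raises RuntimeError.
lower_qmodel_triton(model, layer_to_exclude="lm_head")
lower_qmodel_triton(model, layer_to_exclude=["lm_head"])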

fms_mo/dq.py

Lines changed: 16 additions & 13 deletions
@@ -216,17 +216,18 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         act_scales = get_act_scales(model, dq_dataloader, qcfg)
         torch.save(act_scales, scale_file)

-    qmodel_prep(
-        model,
-        dq_dataloader,
-        qcfg,
-        use_layer_name_pattern_matching=use_layer_name_pattern_matching,
-        use_dynamo=use_dynamo,
-        dev=dev,
-        save_fname="dq",
-    )
-    logger.info(f"Quantized model {model}")
-    logger.info("==" * 20)
+    if fms_mo_args.aiu_sim_triton != "fp8":
+        qmodel_prep(
+            model,
+            dq_dataloader,
+            qcfg,
+            use_layer_name_pattern_matching=use_layer_name_pattern_matching,
+            use_dynamo=use_dynamo,
+            dev=dev,
+            save_fname="dq",
+        )
+        logger.info(f"Quantized model {model}")
+        logger.info("==" * 20)

     if qcfg["smoothq"]:
         logger.info("Starting to apply smooth scale")
@@ -260,14 +261,16 @@ def run_dq(model_args, data_args, opt_args, fms_mo_args):
         tokenizer.save_pretrained(opt_args.output_dir)

     if fms_mo_args.aiu_sim_triton:
+        # NOTE plz apply correct HW settings here, defaults are not real HW params
         lower_qmodel_triton(
             model,
             use_dyn_max_act=-1 if qcfg["qa_mode"] == "pertokenmax" else False,
             max_acc_bits=qcfg.get("max_acc_bits", 32),
             num_lsb_to_truncate=qcfg.get("lsb_trun_bits", 0),
-            chunk_size=qcfg.get("chunk_size", 1024),
+            chunk_size=qcfg.get("chunk_size", 32),  # 1024
+            clamp_acc_to_dl16=False,  # fms_mo_args.aiu_sim_triton == "fp8"
+            # layer_to_exclude=["lm_head",]
         )
-
     if fms_mo_args.eval_ppl:
         path_test = Path(data_args.test_data_path)
         arrow_files = list(path_test.glob("*.arrow"))
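The net effect in run_dq is that the simulation mode now decides which steps run; a small self-contained sketch of that decision (hypothetical helper, not part of the commit):

def aiu_sim_steps(aiu_sim_triton):
    """Hypothetical summary of the branches added above."""
    run_qmodel_prep = aiu_sim_triton != "fp8"   # "int8" and None still go through qmodel_prep()
    run_triton_lowering = bool(aiu_sim_triton)  # any non-None mode calls lower_qmodel_triton()
    return run_qmodel_prep, run_triton_lowering

assert aiu_sim_steps("int8") == (True, True)
assert aiu_sim_steps("fp8") == (False, True)
assert aiu_sim_steps(None) == (True, False)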

fms_mo/modules/linear.py

Lines changed: 6 additions & 2 deletions
@@ -1934,8 +1934,12 @@ def forward(
         ctx.fp8_e4m3_max = torch.finfo(torch.float8_e4m3fn).max
         ctx.fp8_e5m2_max = torch.finfo(torch.float8_e5m2).max
         reduce_dim = None if fp8_dyn == "per_tensor" else 1
-        x_scale = x.abs().amax(dim=reduce_dim) / ctx.fp8_e4m3_max
-        w_scale = weight.abs().amax(dim=reduce_dim) / ctx.fp8_e4m3_max
+        x_scale = (
+            x.abs().amax(dim=reduce_dim, keepdim=True) / ctx.fp8_e4m3_max
+        ).clamp(min=1e-5)
+        w_scale = (
+            weight.abs().amax(dim=reduce_dim, keepdim=True) / ctx.fp8_e4m3_max
+        ).clamp(min=1e-5)

         x = (x / x_scale).to(torch.float8_e4m3fn).to(org_dtype) * x_scale
         weight = (weight / w_scale).to(torch.float8_e4m3fn).to(org_dtype) * w_scale
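keepdim=True keeps the per-token/per-channel scale broadcastable against the tensor it scales, and the clamp avoids dividing by zero when an entire row is zero. A minimal torch sketch of the same fake-FP8 round trip (assumes a PyTorch build with float8 dtypes; shapes are illustrative):

import torch

def fake_fp8(t: torch.Tensor, per_tensor: bool = False) -> torch.Tensor:
    """Quantize-dequantize through float8_e4m3fn, mirroring the updated scale math."""
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    reduce_dim = None if per_tensor else 1
    scale = (t.abs().amax(dim=reduce_dim, keepdim=True) / fp8_max).clamp(min=1e-5)
    return (t / scale).to(torch.float8_e4m3fn).to(t.dtype) * scale

x = torch.randn(4, 8)
x[0] = 0.0                 # an all-zero token now yields zeros instead of NaNs
y = fake_fp8(x)            # y keeps the shape and dtype of x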

fms_mo/training_args.py

Lines changed: 9 additions & 2 deletions
@@ -181,8 +181,15 @@ class FMSMOArguments(TypeChecker):
         default=2048, metadata={"help": "input sequence length after tokenization"}
     )
     eval_ppl: bool = field(default=False)
-    aiu_sim_triton: bool = field(
-        default=False, metadata={"help": ("AIU simulation with triton kernel")}
+    aiu_sim_triton: str = field(
+        default=None,
+        metadata={
+            "help": (
+                "AIU simulation with triton kernel. ['int8', 'fp8', None]\n"
+                "'int8' mode will trigger qmodel_prep() and swap QLinears"
+                "'fp8' mode will directly replace existing nn.Linears"
+            )
+        },
     )
     recompute_narrow_weights: bool = field(
         default=False,
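Because aiu_sim_triton is now a string field, configs and CLI arguments choose the mode explicitly; a usage sketch (assumes the remaining dataclass fields keep their defaults):

from fms_mo.training_args import FMSMOArguments

# "int8": run qmodel_prep() and swap in QLinear modules before triton lowering
# "fp8":  skip qmodel_prep() and let the lowering replace nn.Linear directly
# None:   no AIU simulation (default)
fms_mo_args = FMSMOArguments(aiu_sim_triton="fp8")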
