
Commit 8ffad1b

minor adjustment
Signed-off-by: cliu-us <[email protected]>
1 parent 3655518 commit 8ffad1b

File tree: 2 files changed (+12, -6 lines)

fms_mo/custom_ext_kernels/triton_kernels.py

Lines changed: 2 additions & 2 deletions
@@ -264,7 +264,8 @@ def imatmul_kernel(
         else:
             accumulator_inner = tl.dot(a, b, accumulator, input_precision="ieee")
 
-        ## ------ MSB truncation by clamp, chunky LSB truncation by rounding/masking --------
+        ## ------ INT MSB truncation is simulated by clamping,
+        # "special" INT LSB truncation by right and left shift --------
         if max_acc_bits < 32:
             accumulator_inner = tl.maximum(
                 tl.minimum(accumulator_inner, acc_max), acc_min
@@ -530,7 +531,6 @@ def isPowerofTwo(x):
     c_org_dtype = c.dtype
     c = c.to(acc_dtype)
     assert c.shape[0] == M and c.shape[1] == N, "C shape is inconsistent with A B."
-    # assert acc_dtype == torch.float32, "INT truncation is not yet supported."
 
     # 1D launch kernel where each block gets its own program.
     def grid(META):
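To make the revised comment concrete, here is a minimal plain-PyTorch sketch of the two truncation ideas it names: clamping an integer accumulator into a signed max_acc_bits range (MSB truncation) and zeroing its low chunk_trun_bits bits with a right-then-left shift (LSB truncation). This is an illustration only, not the Triton kernel; the helper name simulate_int_truncation and the exact clamp bounds are assumptions made for the sketch.

import torch

def simulate_int_truncation(acc, max_acc_bits=18, chunk_trun_bits=8):
    # Illustrative sketch only (plain PyTorch, not the Triton kernel).
    # MSB truncation: clamp to the range of a signed max_acc_bits integer.
    acc_max = 2 ** (max_acc_bits - 1) - 1
    acc_min = -(2 ** (max_acc_bits - 1))
    acc = torch.clamp(acc, acc_min, acc_max)
    # LSB truncation: a right shift followed by a left shift zeroes the
    # lowest chunk_trun_bits bits of the accumulator.
    acc = (acc >> chunk_trun_bits) << chunk_trun_bits
    return acc

# toy usage: one 32-element int8 dot product accumulated exactly,
# then passed through the simulated truncation
a = torch.randint(-128, 128, (32,), dtype=torch.int32)
b = torch.randint(-128, 128, (32,), dtype=torch.int32)
print(simulate_int_truncation((a * b).sum()))

The right-then-left shift keeps the accumulator on its original scale while discarding low-order information, which is why it can stand in for LSB truncation without a separate rounding step.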

tests/triton_kernels/test_triton_mm.py

Lines changed: 10 additions & 4 deletions
@@ -94,23 +94,29 @@ def test_triton_matmul_int8(mkn):
     torch_output = torch.matmul(a.to(torch.float), b.to(torch.float))
     # cast tl_matmul results to float because torch.norm only supports float
     tl_output_no_trun = tl_matmul(a, b).to(torch.float)
-    # check LSB truncation effect
+    # check LSB truncation effect (underflow)
     tl_output_trun_8b = tl_matmul(a, b, chunk_trun_bits=8).to(torch.float)
-    # check MSB truncation effect
-    # max(1 int8 * 1 int8) ~ 2^17 -> each chunk acc 32 elem, possible max ~ 2^22
-    # -> truncate to 18b -> should see large err than LSB-only case
+    # check MSB truncation effect (overflow)
+    # max(1 int8 * 1 int8) ~ 2^14 -> each chunk acc 32 elem only, achievable max ~ 2^19
+    # -> truncate to 18b -> should see slightly large err than LSB-only case
     tl_output_trun_18b8b = tl_matmul(a, b, max_acc_bits=18, chunk_trun_bits=8).to(
         torch.float
     )
+    # use larger chunk size to accumulate more elem, MSB truncation (overflow) issue should worsen
+    tl_output_trun_18b8b_128 = tl_matmul(
+        a, b, max_acc_bits=18, chunk_trun_bits=8, chunk_size=min(128, k)
+    ).to(torch.float)
 
     ref = torch.norm(torch_output)
     rel_err_no_trun = torch.norm(torch_output - tl_output_no_trun) / ref
     rel_err_trun_8b = torch.norm(torch_output - tl_output_trun_8b) / ref
     rel_err_trun_18b8b = torch.norm(torch_output - tl_output_trun_18b8b) / ref
+    rel_err_trun_18b8b_128 = torch.norm(torch_output - tl_output_trun_18b8b_128) / ref
 
     assert rel_err_no_trun < 1e-5
     assert rel_err_trun_8b < 1e-2
     assert rel_err_trun_18b8b < 1e-2
+    assert rel_err_trun_18b8b_128 >= rel_err_trun_18b8b
 
 
 @pytest.mark.parametrize("feat_in_out", [(64, 128), (256, 1024), (1024, 4096)])
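The revised comment's arithmetic: |int8 * int8| is at most 128 * 128 = 2^14, so a chunk of 32 products can reach roughly 32 * 2^14 = 2^19, which already exceeds an 18-bit signed accumulator; a 128-element chunk can reach about 2^21, so overflow should get worse. Below is a rough plain-PyTorch sketch (not tl_matmul; the helper name chunked_int_matmul_sim is an assumption, and float64 stands in for exact integer accumulation) that mimics per-chunk clamping to show why the test expects the larger chunk_size to produce a relative error at least as large.

import torch

def chunked_int_matmul_sim(a, b, chunk_size=32, max_acc_bits=18):
    # Illustrative sketch only: accumulate an int8 GEMM chunk-by-chunk along K,
    # clamping each chunk's partial sum to a signed max_acc_bits range to mimic
    # MSB truncation (overflow). float64 is exact at these magnitudes (<< 2^53).
    acc_max = 2 ** (max_acc_bits - 1) - 1
    acc_min = -(2 ** (max_acc_bits - 1))
    m, k = a.shape
    out = torch.zeros(m, b.shape[1], dtype=torch.float64)
    for k0 in range(0, k, chunk_size):
        partial = a[:, k0:k0 + chunk_size].to(torch.float64) @ b[k0:k0 + chunk_size, :].to(torch.float64)
        out += torch.clamp(partial, acc_min, acc_max)
    return out

m, k, n = 64, 256, 64
a = torch.randint(-128, 128, (m, k), dtype=torch.int8)
b = torch.randint(-128, 128, (k, n), dtype=torch.int8)
ref = a.to(torch.float64) @ b.to(torch.float64)
for cs in (32, 128):
    sim = chunked_int_matmul_sim(a, b, chunk_size=cs)
    rel_err = torch.norm(ref - sim) / torch.norm(ref)
    print(cs, rel_err.item())  # larger chunk size is expected to give a larger error

Like the new assertion rel_err_trun_18b8b_128 >= rel_err_trun_18b8b, this only checks the direction of the effect rather than a fixed error threshold.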

0 commit comments
