@@ -471,20 +471,21 @@ def isPowerofTwo(x):
     min_chunk_size = 32 if a.dtype in DTYPE_8BIT else 16
 
     # because min k (chunk size in this case) for fp16/bf16 is 16, if smaller is needed, we could
-    # insert 0s in between elements, i.e. pad [m,k] -> [m,2k], [k,n]->[2k,n], out=[m,n] unchanged.
-    # Do not support I8 or F8 for now. (as F8/FP24 simulation is treated as BF16 currently)
-    if chunk_size == 8 and a.dtype in [torch.float16, torch.bfloat16]:
+    # insert 0s in between elements, e.g. pad [m,k] -> [m,2k], [k,n]->[2k,n], out=[m,n] unchanged.
+    # Do not support INT8 for now.
+    if chunk_size == 8 and a.dtype in [torch.float8_e4m3fn, torch.float16, torch.bfloat16]:
+        exp_ratio = min_chunk_size // chunk_size
         a_padded = torch.zeros(
-            a.shape[0], a.shape[1] * 2, dtype=a.dtype, device=a.device
+            a.shape[0], a.shape[1] * exp_ratio, dtype=a.dtype, device=a.device
         )
-        a_padded[:, ::2] = a
+        a_padded[:, ::exp_ratio] = a
         a = a_padded
         b_padded = torch.zeros(
-            b.shape[0] * 2, b.shape[1], dtype=b.dtype, device=b.device
+            b.shape[0] * exp_ratio, b.shape[1], dtype=b.dtype, device=b.device
         )
-        b_padded[::2, :] = b
+        b_padded[::exp_ratio, :] = b
         b = b_padded
-        chunk_size = 16
+        chunk_size = min_chunk_size
     else:
         chunk_size = (
             max(chunk_size, min_chunk_size)
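For reference, here is a minimal standalone sketch (hypothetical, not from the patch; it assumes only that PyTorch is installed, and the names m, k, n, min_chunk_size, and exp_ratio mirror the diff above) of why the zero-interleave padding is safe: every zero column inserted into a lines up with a zero row inserted into b, so each extra product term is zero and the [m, n] output is unchanged.

    # Hypothetical check of the padding identity used in the hunk above.
    # float32 is used here only to keep the demo simple; the identity itself
    # is dtype-independent.
    import torch

    m, k, n = 4, 8, 5
    min_chunk_size = 16              # fp16/bf16 minimum from the patch
    exp_ratio = min_chunk_size // k  # 2: stretch k=8 up to k=16

    a = torch.randn(m, k)
    b = torch.randn(k, n)

    # Interleave zeros: the original columns of a and rows of b land on
    # every exp_ratio-th index; everything else stays zero.
    a_padded = torch.zeros(m, k * exp_ratio)
    a_padded[:, ::exp_ratio] = a
    b_padded = torch.zeros(k * exp_ratio, n)
    b_padded[::exp_ratio, :] = b

    # allclose rather than equal: the zero terms contribute nothing, but a
    # BLAS kernel may reduce k=8 and k=16 in different summation orders.
    assert torch.allclose(a @ b, a_padded @ b_padded)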