Commit 0f7201e

enable chunk_size=8
Signed-off-by: cliu-us <[email protected]>
1 parent 2f0f780 commit 0f7201e

File tree: 2 files changed, +47 -8 lines

  fms_mo/custom_ext_kernels/triton_kernels.py
  fms_mo/modules/linear.py

fms_mo/custom_ext_kernels/triton_kernels.py

Lines changed: 14 additions & 4 deletions
@@ -323,10 +323,20 @@ def isPowerofTwo(x):
         return (x & (x - 1)) == 0
 
     min_chunk_size = 32 if a.dtype in DTYPE_8BIT else 16
-    if isPowerofTwo(chunk_size):
-        chunk_size = max(chunk_size, min_chunk_size)
+
+    # because min k (chunk size in this case) for fp16/bf16 is 16, if smaller is needed, we could
+    # insert 0s in between elements, i.e. pad [m,k] -> [m,2k], [k,n] -> [2k,n], out=[m,n] unchanged.
+    # Do not support I8 or F8 for now. (as F8/FP24 simulation is treated as BF16 currently)
+    if chunk_size == 8 and a.dtype in [torch.float16, torch.bfloat16]:
+        a_padded = torch.zeros(a.shape[0], a.shape[1]*2, dtype=a.dtype, device=a.device)
+        a_padded[:, ::2] = a
+        a = a_padded
+        b_padded = torch.zeros(b.shape[0]*2, b.shape[1], dtype=b.dtype, device=b.device)
+        b_padded[::2, :] = b
+        b = b_padded
+        chunk_size = 16
     else:
-        chunk_size = min_chunk_size
+        chunk_size = max(chunk_size, min_chunk_size) if isPowerofTwo(chunk_size) else min_chunk_size
 
     if a.dtype in DTYPE_I8:
         acc_dtype = torch.int32
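
The zero-interleaving above is safe because every extra term in each dot product is a zero times a zero: padding A from [m, k] to [m, 2k] and B from [k, n] to [2k, n] with zeros in the odd positions leaves A @ B unchanged, while the effective K seen by the kernel doubles to 16. A minimal standalone check of that identity (not part of the commit; plain float32 on CPU, since the algebra does not depend on dtype, whereas the kernel path above only takes it for fp16/bf16):

import torch

m, k, n = 4, 8, 5
a = torch.randn(m, k)
b = torch.randn(k, n)

# interleave zeros exactly as the wrapper above does: even slots hold the data
a_padded = torch.zeros(m, 2 * k)
a_padded[:, ::2] = a
b_padded = torch.zeros(2 * k, n)
b_padded[::2, :] = b

# every added product term is 0 * 0, so the result is unchanged
torch.testing.assert_close(a @ b, a_padded @ b_padded)
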
@@ -345,7 +355,7 @@ def grid(META):
 
     if M < 1024 or N < 1024:
         kernel_config = {
-            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_M": 64,
             "BLOCK_SIZE_K": chunk_size,
             "BLOCK_SIZE_N": 32,
             "GROUP_SIZE_M": 8,

fms_mo/modules/linear.py

Lines changed: 33 additions & 4 deletions
@@ -1797,7 +1797,7 @@ class LinearFuncFPxFwdBwd(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, x, weight, bias=None, trun_bits=0, chunk_size=16):
+    def forward(ctx, x, weight, bias=None, trun_bits=0, chunk_size=16, fp8_dyn=False):
         assert x.dtype in [torch.float, torch.bfloat16, torch.float16]
         # input can be 2D or 3D, need to reshape before tl_matmul
         org_dtype = x.dtype
org_dtype = x.dtype
@@ -1813,6 +1813,20 @@ def forward(ctx, x, weight, bias=None, trun_bits=0, chunk_size=16):
         ctx.save_for_backward(x, weight)  # x, W are saved in their original dtype
         ctx.trun_bits = trun_bits
         ctx.chunk_size = chunk_size
+        ctx.fp8_dyn = fp8_dyn
+
+        if fp8_dyn:
+            # use Q/dQ simulation for now, meaning still compute in fp16/bf16
+            # if choose per_token for input, use per_channel for W
+            # (W saved as [out, in], reduce inCh-dim, => reduce_dim=1)
+            ctx.fp8_e4m3_max = torch.finfo(torch.float8_e4m3fn).max
+            ctx.fp8_e5m2_max = torch.finfo(torch.float8_e5m2).max
+            reduce_dim = None if fp8_dyn == "per_tensor" else 1
+            x_scale = x.abs().amax(dim=reduce_dim) / ctx.fp8_e4m3_max
+            w_scale = weight.abs().amax(dim=reduce_dim) / ctx.fp8_e4m3_max
+
+            x = (x/x_scale).to(torch.float8_e4m3fn).to(org_dtype)*x_scale
+            weight = (weight/w_scale).to(torch.float8_e4m3fn).to(org_dtype)*w_scale
 
         # triton kernel assumes 2D inputs and cast the return to input.dtype
         output = tl_matmul(
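
The fp8_dyn branch is a quantize/dequantize (Q/dQ) simulation: each tensor is scaled into the float8_e4m3 range, cast to float8 and immediately back, so the matmul itself still runs in fp16/bf16. Below is a rough standalone sketch of the same idea; fake_quant_fp8 is a hypothetical helper (not in the repo), and the keepdim/clamp handling for the per-row case is an addition in this sketch, not something the committed code does:

import torch

def fake_quant_fp8(t, fmt=torch.float8_e4m3fn, reduce_dim=None):
    """Hypothetical helper: dynamic Q/dQ simulation; compute stays in t.dtype."""
    fp8_max = torch.finfo(fmt).max
    if reduce_dim is None:
        scale = t.abs().amax() / fp8_max                               # one per-tensor scale
    else:
        scale = t.abs().amax(dim=reduce_dim, keepdim=True) / fp8_max   # one scale per row
    scale = scale.clamp(min=torch.finfo(torch.float32).tiny)           # guard all-zero rows
    return (t / scale).to(fmt).to(t.dtype) * scale

x = torch.randn(4, 16, dtype=torch.bfloat16)   # activations [batch, in]
w = torch.randn(8, 16, dtype=torch.bfloat16)   # weight [out, in]

x_q = fake_quant_fp8(x)                 # per-tensor e4m3, as in the forward pass above
w_q = fake_quant_fp8(w, reduce_dim=1)   # per-channel over the in-features dim of [out, in]
y = x_q @ w_q.t()                       # the matmul itself is still bf16
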
@@ -1840,6 +1854,18 @@ def backward(ctx, grad_output):
         target_shape_grad_input = grad_output.shape[:-1] + (in_dim,)
         grad_output_2D = grad_output.reshape(-1, out_dim).to(dtype_input)
 
+        if ctx.fp8_dyn:
+            reduce_dim = None if ctx.fp8_dyn == "per_tensor" else 1
+            x_scale = x.abs().amax(dim=reduce_dim) / ctx.fp8_e5m2_max
+            w_scale = weight.abs().amax(dim=reduce_dim) / ctx.fp8_e5m2_max
+            grad_out_scale = grad_output_2D.abs().amax(dim=None) / ctx.fp8_e5m2_max  # always perT
+
+            x = (x/x_scale).to(torch.float8_e5m2).to(dtype_input)*x_scale
+            weight = (weight/w_scale).to(torch.float8_e5m2).to(weight.dtype)*w_scale
+            grad_output_2D = (grad_output_2D/grad_out_scale).to(torch.float8_e5m2
+                ).to(grad_output.dtype
+                )*grad_out_scale
+
         # Compute grad_weight, shape = [out, in]
         # NOTE: this triton kernel requires A matrix to be contiguous
         grad_weight = tl_matmul(
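
In the backward branch the same Q/dQ trick switches to torch.float8_e5m2, which trades mantissa bits for exponent bits and is the usual choice for gradients given their wider dynamic range; the incoming gradient always gets a single per-tensor scale. A self-contained sketch of just that gradient step, with an illustrative stand-in tensor (the real code then feeds the result into the tl_matmul calls that follow):

import torch

grad_output_2D = torch.randn(4, 8, dtype=torch.bfloat16)    # stand-in upstream gradient
e5m2_max = torch.finfo(torch.float8_e5m2).max               # 57344.0

# one scale for the whole tensor ("always perT" in the diff above)
grad_out_scale = grad_output_2D.abs().amax() / e5m2_max
grad_q = (grad_output_2D / grad_out_scale).to(torch.float8_e5m2)   # quantize
grad_q = grad_q.to(grad_output_2D.dtype) * grad_out_scale          # dequantize back to bf16
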
@@ -1865,7 +1891,7 @@ def backward(ctx, grad_output):
         else:
             grad_bias = grad_output_2D.sum(0).to(ctx.bias_dtype)
 
-        return grad_input, grad_weight, grad_bias, None
+        return grad_input, grad_weight, grad_bias, None, None, None
 
 
 class LinearFPxAcc(torch.nn.Linear):
@@ -1906,20 +1932,23 @@ def from_nn(cls, nnlin, trun_bits=0, **kwargs):
 
         lin24acc.weight = nnlin.weight
         lin24acc.trun_bits = trun_bits
+        lin24acc.chunk_size = kwargs.get("chunk_size", False)
+        lin24acc.fp8_dyn = kwargs.get("dynamic_fp8", False)  # ["per_tensor", "per_token"]
 
         if nnlin.bias is not None:
             lin24acc.bias = nnlin.bias
         return lin24acc.to(target_device)
 
     def forward(self, inputs):
         # This Linear Class will cast to BF16 before matmul and return FP32
-        return LinearFuncFPxFwdBwd.apply(inputs, self.weight, self.bias, self.trun_bits)
+        return LinearFuncFPxFwdBwd.apply(inputs, self.weight, self.bias, self.trun_bits,
+                                         self.chunk_size, self.fp8_dyn)
 
     def extra_repr(self) -> str:
         """
         Returns an alternative string representation of the object.
         """
         return (
             f"in={self.in_features}, out={self.out_features}, bias={self.bias is not None}, "
-            f"trun_bits={self.trun_bits}"
+            f"trun_bits={self.trun_bits},fp8_dyn={self.fp8_dyn},chunk_size={self.chunk_size}"
         )
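
Taken together, from_nn now threads the two new kwargs, chunk_size and dynamic_fp8, through forward and into the autograd function (hence the extra None gradients returned by backward). A hedged usage sketch follows; the import path is an assumption, and actually running it needs a CUDA device with Triton because forward goes through tl_matmul:

import torch
from fms_mo.modules.linear import LinearFPxAcc   # assumed import path for the file above

lin = torch.nn.Linear(64, 32, dtype=torch.bfloat16, device="cuda")

# chunk_size=8 exercises the new zero-interleaving path in tl_matmul;
# dynamic_fp8 turns on the Q/dQ simulation ("per_tensor" or "per_token")
lin_fpx = LinearFPxAcc.from_nn(lin, trun_bits=0, chunk_size=8, dynamic_fp8="per_tensor")

x = torch.randn(4, 64, dtype=torch.bfloat16, device="cuda", requires_grad=True)
y = lin_fpx(x)
y.sum().backward()   # backward also runs the fp8_dyn branch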
