@@ -266,7 +266,6 @@ def imatmul_kernel(
         else:
             accumulator = accumulator_inner
 
-
         a_ptrs += BLOCK_SIZE_K * stride_ak
         b_ptrs += BLOCK_SIZE_K * stride_bk
     if ACTIVATION == "leaky_relu":
@@ -281,7 +280,6 @@ def imatmul_kernel(
     tl.store(c_ptrs, c, mask=c_mask)
 
 
-
 @triton.jit
 def matmul_kernel_DABC(
     # Pointers to matrices
@@ -311,11 +309,11 @@ def matmul_kernel_DABC(
     ACTIVATION: tl.constexpr,
 ):
     """Kernel for computing the matmul D = A x B + C that include LSB truncation.
-    A has shape (M, K), B has shape (K, N) and C/D has shape (M, N).
+    A has shape (M, K), B has shape (K, N) and C/D has shape (M, N).
     NOTE:
     C should be consistent with accumulator dtype, e.g. fp8xfp8 -> fp32.
     *D ptr is supposed to be the same as C ptr, no need to provide D as arg
-    **we can be used C to verify unintended truncation by CUDA as well.
+    **we can be used C to verify unintended truncation by CUDA as well.
     Args:
         chunk_trun_bits (int): number of LSB to truncate/round. [0 to 23]
     """
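For orientation, here is a minimal host-side sketch of the contract this docstring describes, using hypothetical shapes: C is pre-cast to the fp32 accumulator dtype and its buffer is overwritten in place with D.

```python
import torch

# Hypothetical shapes for illustration only.
M, K, N = 64, 128, 32
a = torch.randn(M, K, dtype=torch.float16)
b = torch.randn(K, N, dtype=torch.float16)
c = torch.randn(M, N, dtype=torch.float32)  # already in the accumulator dtype

# With no LSB truncation, this is what the kernel is expected to leave in the C buffer:
d_ref = a.float() @ b.float() + c
```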
@@ -353,9 +351,8 @@ def matmul_kernel_DABC(
     # -----------------------------------------------------------
     # Iterate to compute a block of the C matrix.
     # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
-    # of fp32 values for higher accuracy.
-    # accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    accumulator = tl.load(c_ptrs, mask=c_mask, other=0.0)  # should have been cast to fp32 already
+    # of fp32 values for higher accuracy, i.e. C should have been cast to fp32 already
+    accumulator = tl.load(c_ptrs, mask=c_mask, other=0.0)
     ## ------ prepare LSB rounding/truncation masks -------
     # NOTE mask will be applied on accumulator, which is alway FP32, so we may truncate up to 23b
     # e.g., 20b -> trun_mask = 0xFFF00000, round_bit = 0x00080000
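The mask arithmetic in the comment above can be illustrated with a small host-side sketch. This is only one plausible reading (add the round bit, then clear the truncated mantissa LSBs through an int32 view); the kernel applies its masks to the fp32 accumulator inside the K-loop, and its exact rounding behaviour may differ.

```python
import torch

def truncate_fp32_lsbs(x: torch.Tensor, chunk_trun_bits: int) -> torch.Tensor:
    """Hypothetical illustration of trun_mask / round_bit applied to fp32 bit patterns."""
    assert x.dtype == torch.float32 and 0 <= chunk_trun_bits <= 23
    if chunk_trun_bits == 0:
        return x.clone()
    trun_mask = -(1 << chunk_trun_bits)     # e.g. 20 -> 0xFFF00000 as a signed int32
    round_bit = 1 << (chunk_trun_bits - 1)  # e.g. 20 -> 0x00080000
    bits = x.view(torch.int32)              # reinterpret the fp32 bits
    # add the rounding bit, then zero out the low mantissa bits
    return ((bits + round_bit) & trun_mask).view(torch.float32)
```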
@@ -477,15 +474,23 @@ def isPowerofTwo(x):
     # insert 0s in between elements, i.e. pad [m,k] -> [m,2k], [k,n]->[2k,n], out=[m,n] unchanged.
     # Do not support I8 or F8 for now. (as F8/FP24 simulation is treated as BF16 currently)
     if chunk_size == 8 and a.dtype in [torch.float16, torch.bfloat16]:
-        a_padded = torch.zeros(a.shape[0], a.shape[1]*2, dtype=a.dtype, device=a.device)
+        a_padded = torch.zeros(
+            a.shape[0], a.shape[1] * 2, dtype=a.dtype, device=a.device
+        )
         a_padded[:, ::2] = a
         a = a_padded
-        b_padded = torch.zeros(b.shape[0]*2, b.shape[1], dtype=b.dtype, device=b.device)
+        b_padded = torch.zeros(
+            b.shape[0] * 2, b.shape[1], dtype=b.dtype, device=b.device
+        )
         b_padded[::2, :] = b
         b = b_padded
         chunk_size = 16
     else:
-        chunk_size = max(chunk_size, min_chunk_size) if isPowerofTwo(chunk_size) else min_chunk_size
+        chunk_size = (
+            max(chunk_size, min_chunk_size)
+            if isPowerofTwo(chunk_size)
+            else min_chunk_size
+        )
 
     M, K = a.shape
     K, N = b.shape
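A quick sanity check of the zero-interleaving trick in this hunk: the inserted zero columns of A line up with the inserted zero rows of B, so the product is unchanged while K is doubled. The shapes below are hypothetical.

```python
import torch

m, k, n = 4, 8, 5
a = torch.randn(m, k, dtype=torch.bfloat16)
b = torch.randn(k, n, dtype=torch.bfloat16)

a_padded = torch.zeros(m, 2 * k, dtype=a.dtype)
a_padded[:, ::2] = a
b_padded = torch.zeros(2 * k, n, dtype=b.dtype)
b_padded[::2, :] = b

# The zero elements contribute nothing to any dot product, so the results match.
assert torch.allclose(a.float() @ b.float(), a_padded.float() @ b_padded.float())
```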
@@ -504,8 +509,8 @@ def isPowerofTwo(x):
     # if C is in fp16, accumulate in fp32 no matter what, decide whether to cast back later
     c_org_dtype = c.dtype
     c = c.to(acc_dtype)
-    assert c.shape[0]==M and c.shape[1]==N, "C shape is inconsistent with A B."
-    assert acc_dtype == torch.float32, "INT truncation experiment is not yet supported."
+    assert c.shape[0] == M and c.shape[1] == N, "C shape is inconsistent with A B."
+    assert acc_dtype == torch.float32, "INT truncation is not yet supported."
 
     # 1D launch kernel where each block gets its own program.
     def grid(META):
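The body of grid() is cut off by this hunk; in Triton's matmul tutorial the equivalent callable flattens the 2D tile grid into 1D, roughly as sketched below. M and N stand in for the shapes computed earlier, and this sketch is an assumption, not the exact code from the commit.

```python
import triton

M, N = 512, 512  # placeholders for the shapes taken from a and b above

def grid(META):
    # one program per [BLOCK_SIZE_M, BLOCK_SIZE_N] output tile, flattened to 1D
    return (triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),)
```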