@@ -25,7 +25,9 @@
 
 
 @functools.lru_cache()
-def get_config(gmm_type: str, M: int, K: int, N: int, G: int) -> dict[str, int]:
+def get_config(
+    gmm_type: str, M: int, K: int, N: int, G: int, accumulate: bool = False
+) -> dict[str, int]:
     assert gmm_type in {
         "gmm",
         "ptgmm",
@@ -49,7 +51,8 @@ def get_config(gmm_type: str, M: int, K: int, N: int, G: int) -> dict[str, int]:
     assert (
         "default" in get_config._config_dict[gmm_type]
     ), "Default configuration is absent."
-    return get_config._config_dict[gmm_type]["default"]
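+    # Assumes the per-kernel config dict also carries an "accumulate" entry
+    # tuned for the beta=1 path; only the presence of "default" is asserted above.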
+    key = "accumulate" if accumulate else "default"
+    return get_config._config_dict[gmm_type][key]
 
 
 # Common code shared by GMM and TGMM kernels.
@@ -90,6 +93,7 @@ def gmm_kernel(
     rhs_ptr,
     group_sizes_ptr,
     out_ptr,
+    bias_ptr,
     # Tensor shapes:
     M: int,
     K: int,
@@ -103,6 +107,7 @@ def gmm_kernel(
     K_DIVISIBLE_BY_BLOCK_SIZE_K: tl.constexpr,
     GROUP_SIZE: tl.constexpr,
     GRID_DIM: tl.constexpr,
+    USE_BIAS: tl.constexpr,
 ):
     tl.assume(M > 0)
     tl.assume(K > 0)
@@ -204,6 +209,19 @@ def gmm_kernel(
             else:
                 rhs_ptrs += BLOCK_SIZE_K * N
 
+        # Add bias if enabled
+        if USE_BIAS:
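+            # Bias is assumed to be laid out per expert as [G, N]: offset by
+            # group index g, then by this tile's column offsets within N.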
+            offs_bias_n = tile_n.to(tl.int64) * BLOCK_SIZE_N + tl.arange(
+                0, BLOCK_SIZE_N
+            )
+            bias_ptrs = bias_ptr + g.to(tl.int64) * N + offs_bias_n
+            bias = tl.load(bias_ptrs, mask=offs_bias_n < N, other=0.0)
+            # Convert bias to float32 to match accumulator precision
+            bias = bias.to(tl.float32)
+            # Broadcast bias across M dimension and add in float32
+            acc += bias[None, :]
+
+        # Convert to output dtype after all computations
         acc = acc.to(out_ptr.type.element_ty)
 
         offs_out_m = tile_m.to(tl.int64) * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
@@ -246,6 +264,7 @@ def tgmm_persistent_kernel(
     rhs_ptr,
     group_sizes_ptr,
     out_ptr,
+    bias_grad_ptr,
     # Tensor shapes:
     M: int,
     K: int,
@@ -258,6 +277,8 @@ def tgmm_persistent_kernel(
     BLOCK_SIZE_N: tl.constexpr,
     GROUP_SIZE: tl.constexpr,
     GRID_DIM: tl.constexpr,
+    COMPUTE_BIAS_GRAD: tl.constexpr,
+    ACCUMULATE: tl.constexpr,
 ):
     tl.assume(M > 0)
     tl.assume(K > 0)
@@ -334,12 +355,21 @@ def tgmm_persistent_kernel(
 
         acc = tl.zeros((BLOCK_SIZE_K, BLOCK_SIZE_N), dtype=tl.float32)
 
+        # Initialize bias accumulator
+        bias_acc = tl.zeros((BLOCK_SIZE_K,), dtype=tl.float32)
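+        # bias_acc[k] will accumulate sum_m lhs[k, m] over this program's M
+        # chunks and is flushed to bias_grad via atomic_add after the store.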
+
         for _ in range(0, loop_m):
             lhs = tl.load(lhs_ptrs)
             rhs = tl.load(rhs_ptrs)
 
             acc += tl.dot(lhs, rhs, input_precision="ieee")
 
+            # Accumulate for bias gradient: sum lhs across M dimension
+            if COMPUTE_BIAS_GRAD and tile_n == 0:
+                bias_acc += tl.sum(
+                    lhs, axis=1
+                )  # Sum across M dimension [K, M] -> [K]
+
             if TRANS_LHS:
                 lhs_ptrs += BLOCK_SIZE_M * K
             else:
@@ -359,6 +389,10 @@ def tgmm_persistent_kernel(
         rhs = tl.load(rhs_ptrs, mask=offs_m[:, None] < m, other=0)
         acc += tl.dot(lhs, rhs, input_precision="ieee")
 
+        # Accumulate last chunk for bias gradient
+        if COMPUTE_BIAS_GRAD and tile_n == 0:
+            bias_acc += tl.sum(lhs, axis=1)
+
         acc = acc.to(out_ptr.type.element_ty)
 
         offs_out_k = tile_k.to(tl.int64) * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
@@ -371,11 +405,23 @@ def tgmm_persistent_kernel(
             + offs_out_n[None, :]
         )
 
-        tl.store(
-            out_ptrs,
-            acc,
-            mask=(offs_out_k[:, None] < K) & (offs_out_n[None, :] < N),
-        )
+        mask = (offs_out_k[:, None] < K) & (offs_out_n[None, :] < N)
+        if ACCUMULATE:
+            # Load existing values and add to them (like beta=1 in BLAS)
+            old_vals = tl.load(out_ptrs, mask=mask, other=0.0)
+            tl.store(out_ptrs, acc + old_vals, mask=mask)
+        else:
+            # Overwrite output (like beta=0 in BLAS)
+            tl.store(out_ptrs, acc, mask=mask)
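+        # Note: the ACCUMULATE read-modify-write above is not atomic; it relies
+        # on each output tile having exactly one writer in this schedule.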
+
+        # Store bias gradient (only for first N tile, sum across all M)
+        if COMPUTE_BIAS_GRAD and tile_n == 0:
+            # Keep as float32 for atomic_add (bf16 not supported for atomics)
+            bias_grad_ptrs = bias_grad_ptr + g.to(tl.int64) * K + offs_out_k
+            # Use atomic add since multiple K-tiles may write to same expert's bias
+            tl.atomic_add(
+                bias_grad_ptrs, bias_acc, mask=offs_out_k < K, sem="relaxed"
+            )
 
         # Go to the next tile by advancing number of programs.
         tile += GRID_DIM
@@ -405,6 +451,7 @@ def tgmm_non_persistent_kernel(
     rhs_ptr,
     group_sizes_ptr,
     out_ptr,
+    bias_grad_ptr,
     # Tensor shapes:
     M: int,
     K: int,
@@ -417,6 +464,8 @@ def tgmm_non_persistent_kernel(
     BLOCK_SIZE_K: tl.constexpr,
     BLOCK_SIZE_N: tl.constexpr,
     GROUP_SIZE: tl.constexpr,
+    COMPUTE_BIAS_GRAD: tl.constexpr,
+    ACCUMULATE: tl.constexpr,
 ):
     tl.assume(M > 0)
     tl.assume(K > 0)
@@ -477,13 +526,19 @@ def tgmm_non_persistent_kernel(
         loop_m -= 1
 
     acc = tl.zeros((BLOCK_SIZE_K, BLOCK_SIZE_N), dtype=tl.float32)
+    # Initialize bias accumulator
+    bias_acc = tl.zeros((BLOCK_SIZE_K,), dtype=tl.float32)
 
     for _ in range(0, loop_m):
         lhs = tl.load(lhs_ptrs)
         rhs = tl.load(rhs_ptrs)
 
         acc += tl.dot(lhs, rhs, input_precision="ieee")
 
+        # Accumulate for bias gradient: sum lhs across M dimension
+        if COMPUTE_BIAS_GRAD and tile_n == 0:
+            bias_acc += tl.sum(lhs, axis=1)  # [K, M] -> [K]
+
         if TRANS_LHS:
             lhs_ptrs += BLOCK_SIZE_M * K
         else:
@@ -502,6 +557,9 @@ def tgmm_non_persistent_kernel(
     lhs = tl.load(lhs_ptrs, mask=offs_m[None, :] < m, other=0)
     rhs = tl.load(rhs_ptrs, mask=offs_m[:, None] < m, other=0)
     acc += tl.dot(lhs, rhs, input_precision="ieee")
+    # Accumulate last chunk for bias gradient
+    if COMPUTE_BIAS_GRAD and tile_n == 0:
+        bias_acc += tl.sum(lhs, axis=1)
 
     acc = acc.to(out_ptr.type.element_ty)
 
@@ -512,8 +570,18 @@ def tgmm_non_persistent_kernel(
        out_ptr + g.to(tl.int64) * K * N + offs_out_k[:, None] * N + offs_out_n[None, :]
    )
 
-    tl.store(
-        out_ptrs,
-        acc,
-        mask=(offs_out_k[:, None] < K) & (offs_out_n[None, :] < N),
-    )
+    mask = (offs_out_k[:, None] < K) & (offs_out_n[None, :] < N)
+    if ACCUMULATE:
+        # Load existing values and add to them (like beta=1 in BLAS)
+        old_vals = tl.load(out_ptrs, mask=mask, other=0.0)
+        tl.store(out_ptrs, acc + old_vals, mask=mask)
+    else:
+        # Overwrite output (like beta=0 in BLAS)
+        tl.store(out_ptrs, acc, mask=mask)
+
+    # Store bias gradient (only for first N tile, sum across all M)
+    if COMPUTE_BIAS_GRAD and tile_n == 0:
+        # Keep as float32 for atomic_add (bf16/fp16 not supported for atomics)
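+        # Assumes the caller zero-initializes bias_grad, since atomic_add
+        # accumulates into whatever the buffer already holds.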
+        bias_grad_ptrs = bias_grad_ptr + g.to(tl.int64) * K + offs_out_k
+        # Use atomic add since multiple K-tiles may write to same expert's bias
+        tl.atomic_add(bias_grad_ptrs, bias_acc, mask=offs_out_k < K, sem="relaxed")
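
For intuition, this is the reduction the bias-gradient path performs, written as
a dense PyTorch reference (a minimal sketch; the function name and the [M, K]
row-major, group-contiguous layout are assumptions, not part of this commit):

import torch

def bias_grad_reference(lhs: torch.Tensor, group_sizes: torch.Tensor) -> torch.Tensor:
    """db[g, k] = sum of lhs rows belonging to group g, accumulated in float32."""
    G, K = group_sizes.numel(), lhs.shape[1]
    db = torch.zeros(G, K, dtype=torch.float32)
    start = 0
    for g, size in enumerate(group_sizes.tolist()):
        # Mirrors bias_acc in the kernels: sum across the M dimension in fp32.
        db[g] = lhs[start:start + size].float().sum(dim=0)
        start += size
    return db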