Commit 7dfe5d8

Merge OpenAI Triton commit 22188cc (#5136)
This PR changes the Triton base from acd72ac to 22188cc (Sep 11). Pass rate: 98.8%
2 parents 0c2c9b3 + 50a63bc commit 7dfe5d8

14 files changed: +408 −303 lines changed

lib/Analysis/AxisInfo.cpp

Lines changed: 4 additions & 1 deletion
@@ -1085,7 +1085,10 @@ void AxisInfoAnalysis::visitForOpInductionVar(
   AxisInfo::DimVectorT knownContiguity(1, 1);
   AxisInfo::DimVectorT knownDivisibility(1, 1);
   AxisInfo::DimVectorT knownConstancy(1, 1);
-  knownDivisibility[0] = gcd(lb.getDivisibility(0), step.getDivisibility(0));
+  auto lbDivisibility = lb.getDivisibility();
+  auto stepDivisibility = step.getDivisibility();
+  if (!lbDivisibility.empty() && !stepDivisibility.empty())
+    knownDivisibility[0] = gcd(lbDivisibility[0], stepDivisibility[0]);
   auto inductionVar =
       AxisInfo(knownContiguity, knownDivisibility, knownConstancy);
   (void)argLattices[0]->join(inductionVar);
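
The guard added here skips the divisibility update when either operand reports an empty divisibility vector; otherwise the induction variable inherits the gcd of the lower bound's and the step's divisibilities. A quick standalone check of that invariant, as a plain Python sketch with made-up divisibility values (not Triton code):

# If lb is divisible by d_lb and step by d_step, every value lb + i*step
# is divisible by gcd(d_lb, d_step).
import math

d_lb, d_step = 16, 24
g = math.gcd(d_lb, d_step)  # 8
for lb in range(0, 128, d_lb):
    for step in range(d_step, 96, d_step):
        assert all((lb + i * step) % g == 0 for i in range(10))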

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 20 additions & 9 deletions
@@ -1151,6 +1151,12 @@ LinearLayout SliceEncodingAttr::toLinearLayout(ArrayRef<int64_t> shape) const {

 LinearLayout tensorMemoryToLinearLayout(ArrayRef<int64_t> shape,
                                         TensorMemoryEncodingAttr encoding) {
+  // [Zeros in TMEM LinearLayouts]
+  // If there is a zero in bases rows=32,64 this means that there is
+  // broadcasting, i.e. the same tensor element is duplicated in different
+  // addressable blocks If the zero is in any other row/col (i.e. within a given
+  // warp-addressable tmem space) it means it is not defined
+
   // We model packed layouts as having the rows/cols dimensions of bitwidth=16
   // This means that a layout with unpacked=True is the same as one with
   // unpacked=False
@@ -1186,25 +1192,26 @@ LinearLayout tensorMemoryToLinearLayout(ArrayRef<int64_t> shape,
   auto blockM = encoding.getBlockM();
   auto blockN = std::min<int32_t>(encoding.getBlockN(), shape[1]);
   assert(blockM == 64 || blockM == 128);
-  LinearLayout tile;
+  LinearLayout tile =
+      LinearLayout::zeros1D(encoding.getColStride(), kCol, dims[1]);
   if (blockM == 64) {
-    tile = LinearLayout::identity1D(16, kRow, dims[0]) *
-           LinearLayout::identity1D(blockN, kCol, dims[1]);
+    tile *= LinearLayout::identity1D(16, kRow, dims[0]) *
+            LinearLayout::identity1D(blockN, kCol, dims[1]);
     auto bases = tile.getBases();
     if (shape[0] > blockM) {
       bases[kRow].push_back({64, 0});
     } else if (shape[1] > blockN) {
       bases[kRow].push_back({0, blockN});
     } else {
-      // Empty. This is modelled as broadcasting, same as for TMA(fp4)
+      // Empty, meaning the element is not defined
       bases[kRow].push_back({0, 0});
     }
     bases[kRow].push_back({16, 0});
     bases[kRow].push_back({32, 0});
     tile = LinearLayout(bases, dims);
   } else {
-    tile = LinearLayout::identity1D(blockM, kRow, dims[0]) *
-           LinearLayout::identity1D(blockN, kCol, dims[1]);
+    tile *= LinearLayout::identity1D(blockM, kRow, dims[0]) *
+            LinearLayout::identity1D(blockN, kCol, dims[1]);
   }
   auto repsM = shape[0] / tile.getOutDimSize(dims[0]);
   auto repsN = shape[1] / tile.getOutDimSize(dims[1]);
@@ -1223,14 +1230,18 @@ tensorMemoryScalesToLinearLayout(ArrayRef<int64_t> shape,
   auto kRow = S("row");
   auto kCol = S("col");
   auto dims = standardOutDimNames(ctx, 2);
-  // nb. this can be done with
-  // ensureLayoutNotSmallerThan/ensureLayoutNotLargerThan but it's a bit less
-  // clear IMO
+  // See [Zeros in TMEM LinearLayouts]
   // https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-scale-factor-a-layout-1x
   // We choose repOrder = [0, 1]
   auto tile =
       LinearLayout::identity1D(std::min<int>(32, shape[0]), kRow, dims[0]) *
+      // If shape[0] < 32, we have some rows undefined
+      LinearLayout::zeros1D(32 / std::min<int>(32, shape[0]), kRow, dims[0]) *
+      // Broadcasting
+      LinearLayout::zeros1D(4, kRow, dims[0]) *
       LinearLayout::identity1D(std::min<int>(4, shape[1]), kCol, dims[1]) *
+      // If shape[1] < 4, we have some cols undefined
+      LinearLayout::zeros1D(4 / std::min<int>(4, shape[1]), kCol, dims[1]) *
       // reps
       LinearLayout::identity1D(std::max<int>(1, shape[0] / 32), kCol, dims[0]) *
       LinearLayout::identity1D(std::max<int>(1, shape[1] / 4), kCol, dims[1]);
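
The new comment hinges on what a zero basis means in a linear layout: two different input indices map to the same output element. A toy XOR-of-bases model in Python (an illustrative sketch, not the actual LinearLayout class) makes the collision visible:

# Toy model: an index maps to the XOR of the bases selected by its set bits.
def apply_layout(bases, idx):
    out = 0
    for bit, basis in enumerate(bases):
        if (idx >> bit) & 1:
            out ^= basis
    return out

bases = [1, 2, 0, 4]  # the zero basis at bit 2 collapses two indices
assert apply_layout(bases, 0b0011) == apply_layout(bases, 0b0111)  # broadcast: same element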

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/AutomaticWarpSpecialization.cpp

Lines changed: 2 additions & 1 deletion
@@ -35,7 +35,8 @@ struct AutomaticWarpSpecialization
 void AutomaticWarpSpecialization::runOnOperation() {
   OpPassManager pm;
   pm.addPass(createTritonGPUPartitionScheduling());
-  pm.addPass(createNVWSInsertAref());
+  // TODO: re-enable once the regression is fixed.
+  // pm.addPass(createNVWSInsertAref());
   pm.addPass(createTritonGPULoadMMASpecialization({numStages}));
   pm.addPass(createTritonGPURewritePartitionDependencies());
   // `int-range-optimizations` and SCCP are good at cleaning up loop arithmetic.

python/triton_kernels/tests/test_matmul.py

Lines changed: 29 additions & 2 deletions
@@ -159,6 +159,9 @@ class Case:
     split_k: int = 1
     hbm_swizzling: bool = False
    epilogue_subtile: Union[int, None] = None
+    x_transpose: bool = False
+    w_transpose: bool = False
+    y_transpose: bool = False


 @pytest.mark.parametrize(
@@ -252,6 +255,13 @@ class Case:
         Case(1000, 400, 400, "ragged", "float8_e4m3fn", "float8_e4m3fn", 3, 1),
         Case(600, 400, 400, "ragged", "float8_e4m3fn", "float8_e4m3fn", 4, 2),
         Case(600, 400, 400, "ragged", "float8_e4m3fn", "float8_e4m3fn", 4, 2, n_expt_shards=2),
+    ] + [
+        Case(320, 400, 400, mode, dtype, dtype, x_transpose=x_transpose, w_transpose=w_transpose, y_transpose=y_transpose)
+        for mode in ("batched", "ragged")
+        for dtype in ("float16", "float8_e5m2")
+        for x_transpose in (False, True)
+        for w_transpose in (False, True)
+        for y_transpose in (False, True)
     ]
 ],
 )
@@ -268,6 +278,7 @@ class Case:
 @pytest.mark.parametrize("is_persistent", [False, True])
 def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas, is_persistent, n_expts_tot,
             n_expts_act, n_expt_shards, mode, act_dtype_str, weight_dtype_str, block_m, hbm_swizzling, epilogue_subtile,
+            x_transpose, w_transpose, y_transpose,
             device, opt_flags_scope, fresh_knobs):
     # TODO: remove when Triton FP8 supports proper RTNE
     if is_cuda():
@@ -373,6 +384,17 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas
                             has_y_gammas, requires_grad=test_bwd, device=device)
     x_ref, w_ref, bias_ref, gs0_ref, gs1_ref = apply_precision(x_tri, w_tri, bias_tri, gs0_tri, gs1_tri, precision_opt)

+    if x_transpose:
+        x_tri = x_tri.detach().transpose(-1, -2).contiguous().transpose(-1, -2).requires_grad_(test_bwd)
+    if w_transpose:
+        w_tri = w_tri.detach().transpose(-1, -2).contiguous().transpose(-1, -2).requires_grad_(test_bwd)
+    if y_transpose:
+        n_rows = m if gindx is None else gindx.dst_indx.shape[0]
+        yT_shape = (n_expts_tot, n, n_rows) if mode == "batched" else (n, n_rows)
+        y_tri_in = torch.empty(yT_shape, dtype=act_dtype, device=device).transpose(-1, -2)
+    else:
+        y_tri_in = None
+
     if w_tri.shape[0] == 1 and mode != "batched":
         # Test the case when weight has dim 2, i.e., shape (K, N).
         w_tri = w_tri.squeeze(0).detach().requires_grad_(test_bwd)
@@ -423,9 +445,14 @@ def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas

     # triton
     try:
-        tri_y = matmul_ogs(x_tri, w_tri, bias_tri, rdata, gindx, sindx, precision_opt, gammas=gs1_ref, epilogue=epilogue)
+        tri_y = matmul_ogs(x_tri, w_tri, bias_tri, rdata, gindx, sindx, precision_opt,
+                           gammas=gs1_ref, epilogue=epilogue, y=y_tri_in)
     except (opt_flags.InapplicableConstraint, NotImplementedError):
         pytest.xfail("inapplicable opt_flags constraint")
+    if y_tri_in is not None:
+        assert tri_y.data_ptr() == y_tri_in.data_ptr()
+        assert tri_y.shape == y_tri_in.shape
+        assert tri_y.stride() == y_tri_in.stride()
     # If split_k > 1, then the intermediate tensor is fp32.
     sep_gather = mode == "ragged" and do_gather and n_expts_act > 1 and split_k == 1
     sep_scatter = mode == "ragged" and do_scatter and n_expts_act > 1 and split_k == 1
@@ -537,7 +564,7 @@ def test_set_idle_sms():
     num_idle_sms = 24
     matmul_ogs_set_idle_sms(num_idle_sms)
     flags = make_opt_flags(torch.float32, torch.float32, torch.float32, PrecisionConfig(), \
-                           1, 1024, 1024, 1024, None, True, False, 1)
+                           1, 1024, 1024, 1024, None, True, False, 1, False)
     assert flags.idle_sms == num_idle_sms
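
The detach().transpose(-1, -2).contiguous().transpose(-1, -2) pattern used in the new test cases yields a tensor with the original logical shape but column-major storage. A minimal standalone sketch of the effect (plain PyTorch, hypothetical sizes):

import torch

x = torch.randn(4, 8)
x_t = x.transpose(-1, -2).contiguous().transpose(-1, -2)
assert x_t.shape == x.shape      # same logical shape
assert x_t.stride(-1) != 1       # but the last dim is no longer unit-stride
assert torch.equal(x_t, x)       # and the values are unchanged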

python/triton_kernels/triton_kernels/matmul_ogs.py

Lines changed: 14 additions & 5 deletions
@@ -177,6 +177,8 @@ def apply_allocation(allocation: MatmulAllocation, output):
     if output is None:
         output = torch.empty(allocation.output[0], device=allocation.device, dtype=allocation.output[1])
     else:
+        if output.ndim == 2:
+            output = output[None, :, :]
         assert output.shape == allocation.output[0]
     ret["output"] = output[None, :, :]
     ret["scratchpad"] = {
@@ -350,6 +352,7 @@ def matmul_ogs(x, w, bias,
         x_scale = Tensor(x_scale)
     if not isinstance(x, Tensor):
         x = Tensor(x, dtype=x.dtype)
+    x_transpose = x.stride(-1) != 1
     # determine shapes
     has_gather = gather_indx is not None
     has_scatter = scatter_indx is not None
@@ -362,14 +365,20 @@ def matmul_ogs(x, w, bias,
         assert x.shape[0] == w.shape[0]
     # compute optimization flags
     out_dtype = precision_config.out_dtype or x.dtype
-    can_use_tma = x.numel() > 0 and x.storage.is_tma_compliant() and \
-                  w.numel() > 0 and w.storage.is_tma_compliant() and \
-                  (w_scale is None or w_scale.storage.is_tma_compliant())
+    can_use_tma = (
+        x.numel() > 0 and x.storage.is_tma_compliant() and
+        w.numel() > 0 and w.storage.is_tma_compliant() and
+        (w_scale is None or w_scale.storage.is_tma_compliant()) and
+        (not is_ragged or x.stride(-1) == 1) and
+        # Currently we don't support tma if y is column major; may revisit later if this becomes an issue.
+        (y is None or y.stride(-1) == 1)
+    )
     # hopper w/ mxfp4 doesn't support TMA
     can_use_tma = can_use_tma and (torch.cuda.get_device_capability()[0] > 9 or bitwidth(w.dtype) != 4)
     can_use_fused_scatter = has_scatter and (fused_activation.specs.fn is None) and (epilogue.specs.fn is None) and (routing_data.n_expts_act == 1)
     opt_flags = make_opt_flags(out_dtype, x.dtype, w.dtype, precision_config,
-                               batch_size, M, N, K, routing_data, can_use_tma, can_use_fused_scatter, epilogue.effective_itemsize,
+                               batch_size, M, N, K, routing_data, can_use_tma, can_use_fused_scatter,
+                               epilogue.effective_itemsize, x_transpose,
     )
     if not can_use_fused_scatter and opt_flags.fused_scatter:
         raise InapplicableConstraint("Fused scatter is not supported")
@@ -469,7 +478,7 @@ def matmul_ogs(x, w, bias,
         y_tensor_or_tma, y_storage.data, *out_matmul.stride(),
         *((None, out_matmul_scale, None) if out_matmul_has_mx else out_matmul_flex),
         *out_matmul_scale_strides[-4:],
-        x_tensor_or_tma, x_storage.data, *x_strides,
+        x_tensor_or_tma, x_storage.data, *x_strides, x_transpose,
         flex.lhs_data.scale,
         None if x_scale is None else x_scale.data.view(torch.uint8), *x_scale_strides,
         w_tensor_or_tma, w_storage.data, *w_storage.data.stride(), w_transpose,
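
The apply_allocation change lets callers pass a 2-D output buffer, which is viewed as a batch of one before the shape check. A minimal sketch of that normalization with hypothetical shapes:

import torch

expected_shape = (1, 16, 32)          # hypothetical allocation.output[0]
output = torch.empty(16, 32)          # caller passed a 2-D buffer
if output.ndim == 2:
    output = output[None, :, :]       # view as (1, M, N); no copy
assert output.shape == expected_shape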

python/triton_kernels/triton_kernels/matmul_ogs_details/_matmul_ogs.py

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ def _matmul_ogs(
              Y, YPtr, stride_y_k, stride_y_z, stride_y_m, stride_y_n,
              YExpectedScale, YActualScale, YChecksumScale,
              stride_y_mx_k, stride_y_mx_z, stride_y_mx_m, stride_y_mx_n,
-             X, XPtr, stride_x_z, stride_x_m, stride_x_k,
+             X, XPtr, stride_x_z, stride_x_m, stride_x_k, X_TRANSPOSE: tl.constexpr,
              XScale,
              XMxScale, stride_x_mx_z, stride_x_mx_m, stride_x_mx_k,
              W, WPtr, stride_w_e, stride_w_k, stride_w_n, W_TRANSPOSE: tl.constexpr,

python/triton_kernels/triton_kernels/matmul_ogs_details/_p_matmul_ogs.py

Lines changed: 8 additions & 4 deletions
@@ -82,7 +82,7 @@ def _p_matmul_ogs(
              Y, YPtr, stride_y_k, stride_y_z, stride_y_m, stride_y_n,
              YExpectedScale, YActualScale, YChecksumScale,
              stride_y_mx_k, stride_y_mx_z, stride_y_mx_m, stride_y_mx_n,
-             X, XPtr, stride_x_z, stride_x_m, stride_x_k,
+             X, XPtr, stride_x_z, stride_x_m, stride_x_k, X_TRANSPOSE: tl.constexpr,
              XScale,
              XMxScale, stride_x_mx_z, stride_x_mx_m, stride_x_mx_k,
              W, WPtr, stride_w_e, stride_w_k, stride_w_n, W_TRANSPOSE: tl.constexpr,
@@ -282,13 +282,17 @@ def _p_matmul_ogs(
         if EVEN_K:
             mask_k_scale = tl.full([MX_SCALE_BLOCK_K], True, dtype=tl.int1)
         else:
-            mask_k_scale = offs_k_scale < tl.cdiv(K, MX_PACK_DIVISOR)
+            mask_k_scale = off_k_mx + tl.arange(0, MX_SCALE_BLOCK_K) < tl.cdiv(K, MX_PACK_DIVISOR)

         if USE_GATHER_TMA:
             x = X.gather(offs_x_m, off_k)
         elif X_TMA_MODE == "dense":
-            x = X.load([start_z, start_m + off_m, off_k])
-            x = x.reshape(BLOCK_M, BLOCK_K)
+            if X_TRANSPOSE:
+                x = X.load([start_z, off_k, start_m + off_m])
+                x = x.reshape(BLOCK_K, BLOCK_M).T
+            else:
+                x = X.load([start_z, start_m + off_m, off_k])
+                x = x.reshape(BLOCK_M, BLOCK_K)
         elif X_TMA_MODE == "ragged":
             x = load_ragged(X, start_m, eM, [start_z, off_m, off_k], ragged_dim=1)
             x = x.reshape(BLOCK_M, BLOCK_K)
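
For a column-major X, the dense-TMA path loads a (BLOCK_K, BLOCK_M) tile at swapped offsets and transposes it back in-kernel. A rough NumPy stand-in for that index bookkeeping (the real code issues a TMA descriptor load, which is not modeled here):

import numpy as np

BLOCK_M, BLOCK_K = 4, 8
x = np.arange(BLOCK_M * BLOCK_K).reshape(BLOCK_M, BLOCK_K)  # logical (M, K) tile
x_col_major = np.asfortranarray(x)                          # same tile in column-major storage
k_major_view = x_col_major.T                                # contiguous (BLOCK_K, BLOCK_M): what the swapped-offset load sees
assert np.array_equal(k_major_view.T, x)                    # transposing back recovers the (M, K) tile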

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags.py

Lines changed: 6 additions & 1 deletion
@@ -44,6 +44,7 @@ def make_default_opt_flags_intel(
     can_use_fused_scatter,
     enforce_bitwise_invariance,
     epilogue_effective_itemsize,
+    x_transpose,
     constraints,
 ):
     constraints_supported = ["block_m", "block_k", "split_k", "is_persistent", "fused_scatter", "epilogue_subtile", "num_stages"]
@@ -123,6 +124,7 @@ def make_default_opt_flags_amd(
     can_use_fused_scatter,
     enforce_bitwise_invariance,
     epilogue_effective_itemsize,
+    x_transpose,
     constraints,
 ):
     constraints_supported = ["block_m", "block_n", "block_k", "split_k", "fused_scatter", "is_persistent", "epilogue_subtile"]
@@ -222,6 +224,7 @@ def make_default_opt_flags_nvidia(
     can_use_fused_scatter,
     enforce_bitwise_invariance,
     epilogue_effective_itemsize,
+    x_transpose,
     constraints,
 ):
     constraints_supported = ["block_m", "block_k", "split_k", "is_persistent", "fused_scatter", "epilogue_subtile", "num_stages", "idle_sms"]
@@ -286,6 +289,7 @@ def make_default_opt_flags_nvidia(
             out_dtype,
             lhs_dtype,
             rhs_dtype,
+            x_transpose,
         )

     if constraints.get("epilogue_subtile", None) is not None:
@@ -365,6 +369,7 @@ def make_opt_flags(
     can_use_persistent_tma,
     can_use_fused_scatter,
     epilogue_effective_itemsize,
+    x_transpose,
 ):
     if _opt_flags_constraints.get("is_persistent", False) and not can_use_persistent_tma:
         raise InapplicableConstraint("cannot enforce `is_persistent=True` constraint")
@@ -376,7 +381,7 @@ def make_opt_flags(
         return _opt_flags
     args = [out_dtype, lhs_dtype, rhs_dtype, precision_config, batch_size, m, n, k,
             routing_data, can_use_persistent_tma, can_use_fused_scatter,
-            enforce_bitwise_invariance, epilogue_effective_itemsize,
+            enforce_bitwise_invariance, epilogue_effective_itemsize, x_transpose,
             _opt_flags_constraints]
     backend = triton.runtime.driver.active.get_current_target().backend
     if backend == "xpu":

python/triton_kernels/triton_kernels/matmul_ogs_details/opt_flags_details/opt_flags_nvidia.py

Lines changed: 3 additions & 0 deletions
@@ -72,6 +72,7 @@ def compute_num_stages(
     out_dtype,
     lhs_dtype,
     rhs_dtype,
+    x_transpose,
     epilogue_subtile,
     epilogue_effective_itemsize,
 ):
@@ -103,6 +104,8 @@ def compute_num_stages(
         # pipelined layout conversion before store of the accumulator
         # note: layout conversion has some padding
         smem_capacity -= int((block_m + 4) * acc_block_n * acc_size)
+    if x_transpose:
+        smem_capacity -= block_m * block_k * lhs_dtype.itemsize
     if precision_config.weight_scale is not None:
         # mx scales
         stage_size += block_n * (block_k // int(MXFP_BLOCK_SIZE))
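
The new branch carves a one-time chunk out of the shared-memory budget, rather than growing the per-stage size, when X is column-major, presumably to stage the transposed X tile. A back-of-the-envelope sketch with hypothetical numbers:

block_m, block_k = 128, 64
lhs_itemsize = 2                                   # e.g. fp16
extra = block_m * block_k * lhs_itemsize           # 16384 bytes reserved once, not per stage
smem_capacity = 232448                             # hypothetical budget
smem_capacity -= extra
print(smem_capacity)                               # 216064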
