
Commit e3b6aaa

Merge commit 'fff5a2dd02081b4b4e6fbeae8b55ff46a6d89462'
2 parents: d847b10 + fff5a2d


25 files changed: +618 / -397 lines


Makefile

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ test-unit: all
 	--ignore=language/test_subprocess.py --ignore=test_debug.py
 	$(PYTEST) -s -n $(NUM_PROCS) python/test/unit/language/test_subprocess.py
 	$(PYTEST) -s -n $(NUM_PROCS) python/test/unit/test_debug.py --forked
-	$(PYTEST) -s -n 8 python/triton_kernels/tests/
+	$(PYTEST) -s -n 6 python/triton_kernels/tests/
 	TRITON_DISABLE_LINE_INFO=0 $(PYTEST) -s python/test/unit/language/test_line_info.py
 	# Run attention separately to avoid out of gpu memory
 	$(PYTEST) -vs python/tutorials/06-fused-attention.py

cmake/llvm-hash.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-bc773632355b3cebde350b0341624e88be40b744
+064f02dac0c81c19350a74415b3245f42fed09dc

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 2 additions & 0 deletions
@@ -645,6 +645,8 @@ class ScaledBlockedToMMAv5
     auto CTALayout = getCTALayout(oldRetType.getEncoding());
     if ((computeCapability) / 10 != 10)
       return failure();
+    if (numWarps != 4 && numWarps != 8)
+      return failure();
     if (retShapePerCTA[0] < 128 || retShapePerCTA[1] < 8)
       return failure();
     Location loc = dotOp.getLoc();
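For orientation, the added guard makes ScaledBlockedToMMAv5 bail out unless the kernel is compiled with 4 or 8 warps, on top of the existing compute-capability and per-CTA shape checks. A minimal Python sketch of the combined gating logic, using a hypothetical helper name (mmav5_scaled_dot_eligible) purely for illustration, not the compiler's actual API:

def mmav5_scaled_dot_eligible(compute_capability: int, num_warps: int,
                              ret_shape_per_cta: tuple[int, int]) -> bool:
    # Mirrors the early-exit checks in ScaledBlockedToMMAv5: Blackwell only
    # (compute capability 10.x), 4 or 8 warps, and a per-CTA result tile of
    # at least 128 x 8; anything else makes the pattern return failure().
    if compute_capability // 10 != 10:
        return False
    if num_warps not in (4, 8):
        return False
    m, n = ret_shape_per_cta
    return m >= 128 and n >= 8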

python/test/gluon/test_lowerings.py

Lines changed: 152 additions & 13 deletions
@@ -4,24 +4,49 @@
 import triton
 from triton.experimental import gluon
 from triton.experimental.gluon import language as ttgl
+from triton._internal_testing import is_cuda, is_hip, is_hopper_or_newer
+
+
+def _is_layout_applicable(layout) -> bool:
+    if isinstance(layout, ttgl.SliceLayout):
+        return _is_layout_applicable(layout.parent)
+    elif is_cuda():
+        mma_layout = layout.parent if isinstance(layout, ttgl.DotOperandLayout) else layout
+        if not isinstance(mma_layout, ttgl.NVMMADistributedLayout):
+            return False
+        if mma_layout.version[0] >= 3 and not is_hopper_or_newer():
+            return False
+        return True
+    elif is_hip():
+        # TODO: Add other amd layouts
+        return isinstance(layout, ttgl.amd.AMDMFMALayout)
+    else:
+        return True
+
+
+def _filter_layouts(layouts):
+    return [l for l in layouts if _is_layout_applicable(l)]
+
 
 THREADS_PER_WARP = triton.runtime.driver.active.get_current_target().warp_size
 
 
 @pytest.mark.parametrize("M, N", [(32, 16), (32, 32), (32, 64), (64, 32)])
-@pytest.mark.parametrize("src_layout", [
-    ttgl.BlockedLayout([1, 4], [4, THREADS_PER_WARP // 4], [4, 1], [0, 1]),
-    ttgl.BlockedLayout([1, 4], [8, THREADS_PER_WARP // 8], [4, 1], [0, 1]),
-    ttgl.BlockedLayout([4, 1], [4, THREADS_PER_WARP // 4], [1, 4], [0, 1]),
-    ttgl.BlockedLayout([2, 2], [4, THREADS_PER_WARP // 4], [2, 2], [0, 1]),
-    ttgl.BlockedLayout([2, 2], [8, THREADS_PER_WARP // 8], [2, 2], [0, 1]),
-    ttgl.BlockedLayout([1, 4], [4, THREADS_PER_WARP // 4], [4, 1], [1, 0]),
-    ttgl.BlockedLayout([1, 4], [8, THREADS_PER_WARP // 8], [4, 1], [1, 0]),
-    ttgl.BlockedLayout([4, 1], [4, THREADS_PER_WARP // 4], [1, 4], [1, 0]),
-    ttgl.BlockedLayout([2, 2], [4, THREADS_PER_WARP // 4], [2, 2], [1, 0]),
-    ttgl.BlockedLayout([2, 2], [8, THREADS_PER_WARP // 8], [2, 2], [1, 0]),
-    ttgl.BlockedLayout([1, 2], [1, THREADS_PER_WARP], [1, 4], [1, 0]),
-])
+@pytest.mark.parametrize(
+    "src_layout",
+    _filter_layouts([
+        ttgl.BlockedLayout([1, 4], [4, THREADS_PER_WARP // 4], [4, 1], [0, 1]),
+        ttgl.BlockedLayout([1, 4], [8, THREADS_PER_WARP // 8], [4, 1], [0, 1]),
+        ttgl.BlockedLayout([4, 1], [4, THREADS_PER_WARP // 4], [1, 4], [0, 1]),
+        ttgl.BlockedLayout([2, 2], [4, THREADS_PER_WARP // 4], [2, 2], [0, 1]),
+        ttgl.BlockedLayout([2, 2], [8, THREADS_PER_WARP // 8], [2, 2], [0, 1]),
+        ttgl.BlockedLayout([1, 4], [4, THREADS_PER_WARP // 4], [4, 1], [1, 0]),
+        ttgl.BlockedLayout([1, 4], [8, THREADS_PER_WARP // 8], [4, 1], [1, 0]),
+        ttgl.BlockedLayout([4, 1], [4, THREADS_PER_WARP // 4], [1, 4], [1, 0]),
+        ttgl.BlockedLayout([2, 2], [4, THREADS_PER_WARP // 4], [2, 2], [1, 0]),
+        ttgl.BlockedLayout([2, 2], [8, THREADS_PER_WARP // 8], [2, 2], [1, 0]),
+        ttgl.BlockedLayout([1, 2], [1, THREADS_PER_WARP], [1, 4], [1, 0]),
+    ]))
 @pytest.mark.parametrize("axis", [0, 1])
 @pytest.mark.parametrize("sanitize_overflow", [False, True])
 def test_scan_layouts(M, N, src_layout, axis, sanitize_overflow, device):
@@ -49,3 +74,117 @@ def kernel(x_ptr, z_ptr, M: ttgl.constexpr, N: ttgl.constexpr, layout: ttgl.cons
 
     z_ref = torch.cumsum(x, dim=axis, dtype=torch.int32)
     torch.testing.assert_close(z_tri, z_ref)
+
+
+@pytest.mark.parametrize("M, N", [[128, 16], [32, 128], [32, 32], [16, 16]])
+@pytest.mark.parametrize(
+    "src_layout",
+    _filter_layouts([
+        # FIXME: Do not enable these tests until the SLPVectorizor problem with nvptx target has been resolved
+        # SliceLayout(dim=1, parent=BlockedLayout([1, 4, 1], [1, 8, THREADS_PER_WARP // 8], [1, 1, 4], [2, 0, 1], [1, 1, 1], [1, 1, 1], [0, 1, 2])),
+        # SliceLayout(dim=0, parent=BlockedLayout([1, 4, 1], [1, 8, THREADS_PER_WARP // 8], [1, 4, 1], [2, 1, 0], [1, 1, 1], [1, 1, 1], [0, 1, 2])),
+        ttgl.BlockedLayout([1, 4], [8, THREADS_PER_WARP // 8], [4, 1], [1, 0], [1, 1], [1, 1], [0, 1]),
+        ttgl.BlockedLayout([1, 4], [8, THREADS_PER_WARP // 8], [4, 1], [0, 1], [1, 1], [1, 1], [0, 1]),
+        ttgl.NVMMADistributedLayout(version=[2, 0], warps_per_cta=[2, 4], ctas_per_cga=[1, 1], cta_split_num=[1, 1],
+                                    cta_order=[0, 1], instr_shape=[16, 8]),
+        ttgl.NVMMADistributedLayout(version=[3, 0], warps_per_cta=[4, 1], ctas_per_cga=[1, 1], cta_split_num=[1, 1],
+                                    cta_order=[1, 0], instr_shape=[16, 16, 16]),
+        ttgl.amd.AMDMFMALayout(version=2, warps_per_cta=[4, 1], tiles_per_warp=[1, 1], instr_shape=[32, 32],
+                               transposed=False),
+        ttgl.amd.AMDMFMALayout(version=2, warps_per_cta=[1, 4], tiles_per_warp=[1, 1], instr_shape=[32, 32],
+                               transposed=False),
+        ttgl.amd.AMDMFMALayout(version=2, warps_per_cta=[4, 1], tiles_per_warp=[1, 1], instr_shape=[32, 32],
+                               transposed=True),
+        ttgl.amd.AMDMFMALayout(version=2, warps_per_cta=[1, 4], tiles_per_warp=[1, 1], instr_shape=[32, 32],
+                               transposed=True),
+        # TODO: AMDWMMA layouts
+        # WmmaLayout(version=1, warps_per_cta=[4, 1]),
+        # WmmaLayout(version=1, warps_per_cta=[1, 4]),
+        ttgl.DotOperandLayout(
+            parent=ttgl.NVMMADistributedLayout(version=[2, 0], warps_per_cta=[2, 4], ctas_per_cga=[1, 1],
+                                               cta_split_num=[1, 1], cta_order=[0, 1], instr_shape=[16, 8]),
+            operand_index=1, k_width=8),
+        ttgl.DotOperandLayout(
+            parent=ttgl.NVMMADistributedLayout(version=[3, 0], warps_per_cta=[8, 1], ctas_per_cga=[1, 1],
+                                               cta_split_num=[1, 1], cta_order=[1, 0], instr_shape=[16, 32, 16]),
+            operand_index=0, k_width=2),
+        ttgl.SliceLayout(
+            dim=0,
+            parent=ttgl.NVMMADistributedLayout(version=[2, 0], warps_per_cta=[4, 1, 1], ctas_per_cga=[1, 1, 1],
+                                               cta_split_num=[1, 1, 1], cta_order=[2, 1, 0], instr_shape=[1, 16, 8])),
+        ttgl.SliceLayout(
+            dim=1, parent=ttgl.DotOperandLayout(
+                parent=ttgl.NVMMADistributedLayout(version=[2, 0], warps_per_cta=[4, 1, 1], ctas_per_cga=[1, 1, 1],
+                                                   cta_split_num=[1, 1, 1], cta_order=[2, 1, 0],
+                                                   instr_shape=[1, 16, 8]),
+                operand_index=1, k_width=2)),
+        "linear_layout",
+    ]))
+@pytest.mark.parametrize("axis", [0, 1])
+@pytest.mark.parametrize("epilogue_kind", ['reduce1d', 'reduce2d', 'expand_reduce2d'])
+@pytest.mark.parametrize("dtype_str, sanitize_overflow", [("int32", False), ("int32", True), ("float32", False),
+                                                          ("float16", False)])
+@pytest.mark.parametrize("reduce_op", ["sum", "max"])
+def test_reduce_layouts(M, N, src_layout, axis, epilogue_kind, dtype_str, sanitize_overflow, reduce_op, device):
+    if src_layout == "linear_layout":
+        src_layout = ttgl.DistributedLinearLayout(reg_bases=[[0, 16], [1, 0], [2, 0], [4, 0], [8, 0], [16, 0]],
+                                                  lane_bases=[[0, 0], [0, 1], [0, 2], [0, 4], [0, 8]],
+                                                  warp_bases=[[32, 0], [0, 32]], block_bases=[], shape=[M, N])
+        if THREADS_PER_WARP != (1 << len(src_layout.lane_bases)):
+            pytest.skip(f"Skipping. This LinearLayout assumes {1 << len(src_layout.lane_bases)} threads per warp")
+        elif M < 64 or N < 64:
+            pytest.skip(f"Skipping. This LinearLayout assumes M >= 64 and N >= 64, got M={M}, N={N}")
+    if isinstance(src_layout,
+                  (ttgl.amd.AMDMFMALayout, ttgl.NVMMADistributedLayout)) and (M < src_layout.instr_shape[0]
+                                                                              or N < src_layout.instr_shape[1]):
+        pytest.skip("Skipping because tensor shape is smaller than M(f)maLayout instr_shape")
+
+    @gluon.jit
+    def _add(a, b):
+        return a + b
+
+    @gluon.jit
+    def _max(a, b):
+        return ttgl.maximum(a, b)
+
+    combine_fn = _add if reduce_op == "sum" else _max
+
+    @gluon.jit
+    def kernel(x_ptr, z_ptr, M: ttgl.constexpr, N: ttgl.constexpr, layout: ttgl.constexpr, axis: ttgl.constexpr,
+               epilogue_kind: ttgl.constexpr):
+        x_offs_m = ttgl.arange(0, M, layout=ttgl.SliceLayout(1, layout))[:, None]
+        x_offs_n = ttgl.arange(0, N, layout=ttgl.SliceLayout(0, layout))[None, :]
+        x = ttgl.load(x_ptr + x_offs_m * N + x_offs_n)
+        y = ttgl.reduce(x, axis=axis, combine_fn=combine_fn)
+        if epilogue_kind == "reduce1d":
+            if axis == 0:
+                z_offs = ttgl.arange(0, N, layout=ttgl.SliceLayout(0, layout))
+            else:
+                z_offs = ttgl.arange(0, M, layout=ttgl.SliceLayout(1, layout))
+            ttgl.store(z_ptr + z_offs, y)
+        elif epilogue_kind == "reduce2d":
+            y = ttgl.reduce(y, axis=0, combine_fn=combine_fn)
+            ttgl.store(z_ptr, y)
+        elif epilogue_kind == "expand_reduce2d":
+            y = ttgl.expand_dims(y, axis=axis)
+            y = ttgl.reduce(y, axis=1 - axis, combine_fn=combine_fn)
+            z_offs = ttgl.arange(0, 1, layout=ttgl.SliceLayout(1 - axis, layout))
+            ttgl.store(z_ptr + z_offs, y)
+
+    torch.manual_seed(0)
+
+    torch_dtype = getattr(torch, dtype_str)
+    x = torch.randint(-10, 10, (M, N), dtype=torch.int32, device=device).to(torch_dtype)
+    out_shape = (1, 1) if "reduce2d" in epilogue_kind else (1, N) if axis == 0 else (M, 1)
+    z = torch.empty(out_shape, dtype=torch_dtype, device=device)
+
+    num_warps = int(torch.prod(torch.tensor(ttgl._layouts.warps_per_cta(src_layout, (M, N)))))
+    kernel[(1, 1, 1)](x, z, M, N, src_layout, axis, num_warps=num_warps, epilogue_kind=epilogue_kind,
+                      sanitize_overflow=sanitize_overflow, debug=sanitize_overflow)
+
+    reduce_fn = torch.sum if reduce_op == "sum" else torch.amax
+    z_ref = reduce_fn(x, dim=axis, keepdim=True)
+    if epilogue_kind in ("expand_reduce2d", "reduce2d"):
+        z_ref = reduce_fn(z_ref, dim=1 - axis, keepdim=True)
+    torch.testing.assert_close(z, z_ref.to(torch_dtype))
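As a usage note, the new reduce-layout coverage can be exercised on its own; a sketch of a local invocation, assuming a working Triton build with a supported GPU and the repository root as the working directory (the file path and -k filter are taken from this diff):

# Sketch: run only the new test_reduce_layouts cases added in this commit.
# Assumes Triton is installed/built and a supported GPU is available.
import pytest

exit_code = pytest.main([
    "-s",
    "python/test/gluon/test_lowerings.py",
    "-k", "test_reduce_layouts",
])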
