Merge commit 'a8adf9bbc170ab43478e6a32424966f5cf78ef9a'

whitneywhtsang · whitneywhtsang · commit a21c24d04155 · 2024-10-11T17:24:15.000Z
diff --git a/python/src/ir.cc b/python/src/ir.cc
@@ -567,29 +567,9 @@ void init_triton_ir(py::module &&m) {
       //  .def("has_attr", &::FuncOp::hasAttr)
       .def("finalize",
            [](FuncOp &self) -> void {
-             // Remove dead code
-             // 1. Unreachable code after return
-             self.walk([&](Block *block) {
-               Operation *retOp = nullptr;
-               // It's better to not use walk here because we only want to
-               // check operations in the current block
-               for (auto &op : block->getOperations()) {
-                 if (isa<ReturnOp>(op))
-                   if (retOp == nullptr) {
-                     retOp = &op;
-                     break;
-                   }
-               }
-               if (retOp && retOp != &block->back()) {
-                 auto pos = retOp->getIterator();
-                 pos++;
-                 auto *newBlock = block->splitBlock(pos);
-                 newBlock->erase();
-               }
-             });
-             // 2. Check if the result of tl.advance is used
-             self.walk([&](Operation *op) {
-               if (isa<AdvanceOp>(op) && op->getResult(0).use_empty())
+             // Check if the result of tl.advance is used
+             self.walk([&](AdvanceOp op) {
+               if (op->getResult(0).use_empty())
                  outputWarning(op->getLoc(), "The result of tl.advance is not "
                                              "being used. Note that tl.advance "
                                              "does not have any side effects. "
diff --git a/python/test/unit/conftest.py b/python/test/unit/conftest.py
@@ -1,10 +1,10 @@
-import pytest
 import os
+import pytest
 import tempfile
 
 
 def pytest_addoption(parser):
-    parser.addoption("--device", action="store", default='cuda')
+    parser.addoption("--device", action="store", default="cuda")
 
 
 @pytest.fixture
diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py
@@ -4903,6 +4903,33 @@ def nested_while(data, countPtr):
     assert data[0] == 40
 
 
+def test_constexpr_if_return(device):
+    # Reproducer for #4883: a return statement in an if with a constexpr
+    # condition causes errors when combined with non-trivial control flow graphs.
+
+    @triton.jit
+    def kernel(Semaphore, Out, total: tl.constexpr):
+        if total == 1:
+            tl.store(Out, tl.program_id(0))
+            return
+
+        prev = tl.atomic_add(Semaphore, 1)
+        if prev + 1 != total:
+            return
+
+        tl.store(Out, tl.program_id(0) + prev)
+
+    sem = torch.zeros((), device=device, dtype=torch.int32)
+    out = torch.empty((), device=device, dtype=torch.int32)
+    kernel[(1, )](sem, out, 1)
+    assert out.item() == 0
+
+    sem = torch.zeros((), device=device, dtype=torch.int32)
+    out = torch.full((), fill_value=-1, device=device, dtype=torch.int32)
+    kernel[(4, )](sem, out, 4)
+    assert out.item() >= 0
+
+
 # -----------------------
 # test extra
 # -----------------------
diff --git a/python/test/unit/test_debug_dump.py b/python/test/unit/test_debug_dump.py
@@ -1,39 +1,49 @@
-import triton
-import triton.language as tl
 import os
+from contextlib import contextmanager
+
 import torch
+import triton
+import triton.language as tl
+
 
+@contextmanager
+def enable_dump_context(pass_name="1"):
+    try:
+        os.environ["MLIR_ENABLE_DUMP"] = pass_name
+        yield
+    finally:
+        os.environ["MLIR_ENABLE_DUMP"] = "0"
 
-def test_fn_dump(capfd, device):
+
+def test_fn_dump(capfd, device, fresh_triton_cache):
     N = 1024
     src = torch.zeros(N, device=device)
 
-    grid = lambda META: (triton.cdiv(N, META['BLOCK_SIZE']), )
+    grid = lambda META: (triton.cdiv(N, META["BLOCK_SIZE"]), )
 
     @triton.jit
     def _kernel(src, N, BLOCK_SIZE: tl.constexpr):
         offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
         x = tl.load(src + offsets, mask=offsets < N) + 1
         tl.store(src + offsets, x, mask=offsets < N)
 
-    os.environ['MLIR_ENABLE_DUMP'] = '1'
-    BLOCK_SIZE = 16
-    _kernel[grid](src, N, BLOCK_SIZE)
+    with enable_dump_context():
+        BLOCK_SIZE = 16
+        _kernel[grid](src, N, BLOCK_SIZE)
     captured = capfd.readouterr()
+    print(captured.err)
     assert "IR Dump Before" in captured.err
     assert "tt.func public @_kernel" in captured.err
 
-    os.environ['MLIR_ENABLE_DUMP'] = '_kernel'
-    BLOCK_SIZE = 32
-    _kernel[grid](src, N, BLOCK_SIZE)
+    with enable_dump_context("_kernel"):
+        BLOCK_SIZE = 32
+        _kernel[grid](src, N, BLOCK_SIZE)
     captured = capfd.readouterr()
     assert "IR Dump Before" in captured.err
     assert "tt.func public @_kernel" in captured.err
 
-    os.environ['MLIR_ENABLE_DUMP'] = '_kernel2'
-    BLOCK_SIZE = 64
-    _kernel[grid](src, N, BLOCK_SIZE)
+    with enable_dump_context("_kernel2"):
+        BLOCK_SIZE = 64
+        _kernel[grid](src, N, BLOCK_SIZE)
     captured = capfd.readouterr()
     assert "IR Dump Before" not in captured.err
-
-    os.environ['MLIR_ENABLE_DUMP'] = '0'
diff --git a/python/test/unit/test_perf_warning.py b/python/test/unit/test_perf_warning.py
@@ -1,55 +1,106 @@
-import triton
-import triton.language as tl
 import os
+from contextlib import contextmanager
+
 import pytest
 import torch
+import triton
+import triton.language as tl
+
+
+@contextmanager
+def enable_remark_context():
+    try:
+        os.environ["MLIR_ENABLE_REMARK"] = "1"
+        yield
+    finally:
+        os.environ["MLIR_ENABLE_REMARK"] = "0"
 
 
 def is_perf_warning_enabled():
-    return os.environ.get('MLIR_ENABLE_REMARK', '0') == '1'
+    return os.environ.get("MLIR_ENABLE_REMARK", "0") == "1"
 
 
 def is_cuda():
     return triton.runtime.driver.active.get_current_target().backend == "cuda"
 
 
-def test_mma_remark(capfd):
+def test_mma_remark(capfd, fresh_triton_cache):
     if is_cuda():
         capability = torch.cuda.get_device_capability()
         if capability[0] < 9:
             pytest.skip("Requires sm >= 90 to run")
 
-    os.environ['MLIR_ENABLE_REMARK'] = '1'
-
     @triton.jit
-    def matmul_kernel(a_ptr, b_ptr, c_ptr, M, N, K, stride_am, stride_ak, stride_bk, stride_bn, stride_cm, stride_cn):
-        a_block_ptr = tl.make_block_ptr(base=a_ptr, shape=(M, K), strides=(stride_am, stride_ak), offsets=(0, 0),
-                                        block_shape=(32, 128), order=(1, 0))
-        b_block_ptr = tl.make_block_ptr(base=b_ptr, shape=(K, N), strides=(stride_bk, stride_bn), offsets=(0, 0),
-                                        block_shape=(128, 32), order=(0, 1))
-        c_block_ptr = tl.make_block_ptr(base=c_ptr, shape=(M, N), strides=(stride_cm, stride_cn), offsets=(0, 0),
-                                        block_shape=(32, 32), order=(1, 0))
+    def matmul_kernel(
+        a_ptr,
+        b_ptr,
+        c_ptr,
+        M,
+        N,
+        K,
+        stride_am,
+        stride_ak,
+        stride_bk,
+        stride_bn,
+        stride_cm,
+        stride_cn,
+    ):
+        a_block_ptr = tl.make_block_ptr(
+            base=a_ptr,
+            shape=(M, K),
+            strides=(stride_am, stride_ak),
+            offsets=(0, 0),
+            block_shape=(32, 128),
+            order=(1, 0),
+        )
+        b_block_ptr = tl.make_block_ptr(
+            base=b_ptr,
+            shape=(K, N),
+            strides=(stride_bk, stride_bn),
+            offsets=(0, 0),
+            block_shape=(128, 32),
+            order=(0, 1),
+        )
+        c_block_ptr = tl.make_block_ptr(
+            base=c_ptr,
+            shape=(M, N),
+            strides=(stride_cm, stride_cn),
+            offsets=(0, 0),
+            block_shape=(32, 32),
+            order=(1, 0),
+        )
         a = tl.load(a_block_ptr)
         b = tl.load(b_block_ptr)
         c = tl.dot(a, b)
         tl.store(c_block_ptr, c)
 
-    triton.compile(
-        triton.compiler.ASTSource(
-            fn=matmul_kernel, signature={
-                'a_ptr': '*fp32', 'b_ptr': '*fp32', 'c_ptr': '*fp32', 'M': 'i32', 'N': 'i32', 'K': 'i32', 'stride_am':
-                'i32', 'stride_ak': 'i32', 'stride_bk': 'i32', 'stride_bn': 'i32', 'stride_cm': 'i32', 'stride_cn':
-                'i32'
-            }, constants={}))
+    with enable_remark_context():
+        triton.compile(
+            triton.compiler.ASTSource(
+                fn=matmul_kernel,
+                signature={
+                    "a_ptr": "*fp32",
+                    "b_ptr": "*fp32",
+                    "c_ptr": "*fp32",
+                    "M": "i32",
+                    "N": "i32",
+                    "K": "i32",
+                    "stride_am": "i32",
+                    "stride_ak": "i32",
+                    "stride_bk": "i32",
+                    "stride_bn": "i32",
+                    "stride_cm": "i32",
+                    "stride_cn": "i32",
+                },
+                constants={},
+            ))
     captured = capfd.readouterr()
 
-    assert "remark: Warning: can't use MMA V3 for the dot op" in captured.err, "expect MMA V3 remark"
+    assert ("remark: Warning: can't use MMA V3 for the dot op" in captured.err), "expect MMA V3 remark"
     assert "note: see current operation:" in captured.err
-    os.environ['MLIR_ENABLE_REMARK'] = '0'
 
 
-def test_remark_vectorization(capfd):
-    os.environ["MLIR_ENABLE_REMARK"] = "1"
+def test_remark_vectorization(capfd, fresh_triton_cache):
 
     @triton.jit
     def ldst_vec(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, XBLOCK: tl.constexpr):
@@ -75,12 +126,52 @@ def ldst_vec(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, XBLOCK: tl.constexpr)
         tl.store(out_ptr0 + (x4), tmp22, None)
 
     XBLOCK = 1024
-    triton.compile(
-        triton.compiler.ASTSource(
-            fn=ldst_vec, signature={
-                'in_ptr0': '*i64', 'in_ptr1': '*i64', 'in_ptr2': '*fp16', 'in_ptr3': '*fp32', 'out_ptr0': '*fp16'
-            }, constants={"XBLOCK": XBLOCK}), options={"num_warps": 1})
+    with enable_remark_context():
+        triton.compile(
+            triton.compiler.ASTSource(
+                fn=ldst_vec,
+                signature={
+                    "in_ptr0": "*i64",
+                    "in_ptr1": "*i64",
+                    "in_ptr2": "*fp16",
+                    "in_ptr3": "*fp32",
+                    "out_ptr0": "*fp16",
+                },
+                constants={"XBLOCK": XBLOCK},
+            ),
+            options={"num_warps": 1},
+        )
 
     _, err = capfd.readouterr()
     assert ("remark: Warning: vectorization fails" in err), "expect vectorization failure remark"
-    os.environ["MLIR_ENABLE_REMARK"] = "0"
+
+
+def test_remark_swp_op_before_operands(capfd, fresh_triton_cache):
+
+    @triton.jit
+    def kernel_pipe_error(in_ptr, out_ptr):
+        SIZE: tl.constexpr = 64
+        in_ptrs = in_ptr + tl.arange(0, SIZE)
+        val = tl.zeros((SIZE, ), dtype=tl.float32)
+        k = 0
+        for i in tl.range(0, 64, num_stages=3):
+            in_ptrs = in_ptr + tl.arange(0, SIZE) + SIZE * k
+            val = tl.load(in_ptrs)
+            out_ptrs = out_ptr + (tl.arange(0, SIZE) + i * SIZE)
+            tl.store(out_ptrs, val)
+            if tl.max(val) > 0:
+                k += 1
+
+    with enable_remark_context():
+        triton.compile(
+            triton.compiler.ASTSource(
+                fn=kernel_pipe_error,
+                signature={"in_ptr": "*fp32", "out_ptr": "*fp32"},
+                constants={},
+            ),
+            options={"cluster_dims": (1, 1, 1)},
+        )
+
+    _, err = capfd.readouterr()
+
+    assert "operation scheduled before its operands" in err, "expect swp op remark"
diff --git a/python/triton/compiler/code_generator.py b/python/triton/compiler/code_generator.py
diff --git a/python/triton/language/extra/__init__.py b/python/triton/language/extra/__init__.py