Commit 8203b1d

Merge commit 'a85fab01dd6e35429f530c3b3886b33e8615366a'
2 parents: 69ceddb + a85fab0

12 files changed: +367, -139 lines


README.md

Lines changed: 4 additions & 0 deletions

@@ -138,6 +138,10 @@ arbitrary LLVM version.
   during the build. By default, this is the user's home directory. It
   can be changed anytime.
 
+- If you're running out of memory when building Triton, specify the `MAX_JOBS`
+  environment variable (to the `pip install -e python` command) to limit the
+  number of jobs.
+
 - Pass `--no-build-isolation` to `pip install` to make nop builds faster.
   Without this, every invocation of `pip install` uses a different symlink to
   cmake, and this forces ninja to rebuild most of the `.a` files.
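A minimal sketch of how these two tips combine, assuming the Triton repository root is the working directory; the MAX_JOBS value of 4 is an arbitrary example, not a recommendation from the README:

# Hedged sketch: run the documented editable install with a capped job count.
import os
import subprocess

env = dict(os.environ, MAX_JOBS="4")  # limit parallel compile jobs to reduce peak memory
subprocess.run(
    ["pip", "install", "-e", "python", "--no-build-isolation"],  # command from the README
    env=env,
    check=True,
)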

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 1 addition & 0 deletions

@@ -33,6 +33,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "TRITON_ENABLE_LLVM_DEBUG",
     "TRITON_HIP_GLOBAL_PREFETCH",
     "TRITON_HIP_LOCAL_PREFETCH",
+    "TRITON_HIP_USE_ASYNC_COPY",
     "TRITON_HIP_USE_BLOCK_PINGPONG",
     "TRITON_HIP_USE_IN_THREAD_TRANSPOSE",
     "TRITON_LLVM_DEBUG_ONLY",

Lines changed: 21 additions & 34 deletions

@@ -1,42 +1,29 @@
-import os
-import shutil
-
-import pytest
-
-import torch
 import triton
 import re
 
 
-@triton.jit
-def triton_():
-    return
+def test_triton_reproducer_path(monkeypatch, tmp_path):
+    # If we get a cache hit there will be no reproducer generated.
+    monkeypatch.setenv("TRITON_ALWAYS_COMPILE", "1")
+
+    @triton.jit
+    def triton_():
+        return
 
+    # We need a temporary empty file for MLIR to write the reproducer to, and
+    # the TRITON_REPRODUCER_PATH env var then enables crash-reproducer
+    # generation in MLIR.
+    repro_path = tmp_path / "repro.mlir"
+    repro_path.touch()
+    monkeypatch.setenv("TRITON_REPRODUCER_PATH", str(repro_path))
 
-@pytest.mark.skipif(not torch.xpu.is_available() and not torch.cuda.is_available(), reason="requires cuda")
-def test_reproducer():
-    tmpdir = ".tmp"
-    reproducer = 'triton-reproducer.mlir'
-    if os.path.exists(tmpdir):
-        shutil.rmtree(tmpdir, ignore_errors=True)
-    if os.path.exists(reproducer):
-        os.remove(reproducer)
-    os.environ["TRITON_CACHE_DIR"] = tmpdir
-    os.environ["TRITON_REPRODUCER_PATH"] = reproducer
+    # Run the kernel so MLIR will generate a crash reproducer. It doesn't really
+    # matter what the kernel does, just that the PassManager runs its passes.
     triton_[(1, )]()
-    foundPipeline = ""
-    with open(reproducer, 'r') as f:
-        line = f.read()
-        if 'pipeline:' in line:
-            foundPipeline = line
-    if 0 == len(foundPipeline):
-        raise Exception("Failed to find pipeline info in reproducer file.")
 
-    ttgir_to_llvm_pass = re.compile("convert-triton-{{.*}}gpu-to-llvm")
-    if ttgir_to_llvm_pass.search(foundPipeline):
-        raise Exception("Failed to find triton passes in pipeline")
-    # cleanup
-    if os.path.exists(tmpdir):
-        shutil.rmtree(tmpdir, ignore_errors=True)
-    if os.path.exists(reproducer):
-        os.remove(reproducer)
+    repro = repro_path.read_text()
+    assert "mlir_reproducer" in repro, f"Expected MLIR reproducer in {repro_path}. Got:\n{repro}"
+    m = re.search(r"pipeline: \"(.*)\"", repro)
+    assert m, "Expected to match pass pipeline after \"pipeline:\" in MLIR reproducer"
+    pipeline_str = m.group(1)
+    assert pipeline_str, "Expected non-empty pass pipeline in MLIR reproducer"
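For context, a minimal sketch of how the same two environment variables could be used to capture a reproducer outside the test suite; the no-op kernel, the temporary output path, and the GPU requirement are illustrative assumptions, not part of this commit:

# Hedged sketch (assumes a working GPU backend): capture an MLIR crash
# reproducer for an arbitrary kernel using the env vars exercised above.
import os
import tempfile

import triton

os.environ["TRITON_ALWAYS_COMPILE"] = "1"              # bypass the compilation cache
repro = os.path.join(tempfile.mkdtemp(), "repro.mlir")
open(repro, "w").close()                               # MLIR expects the file to exist
os.environ["TRITON_REPRODUCER_PATH"] = repro


@triton.jit
def noop_kernel():                                     # placeholder; any kernel works
    return


noop_kernel[(1, )]()                                   # launching runs the PassManager
print("pipeline:" in open(repro).read())               # True once the reproducer is written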

python/tutorials/09-persistent-matmul.py

Lines changed: 1 addition & 1 deletion

@@ -725,9 +725,9 @@ def bench(K, dtype, reps=1000, warmup_reps=10000):
     if dtype == torch.float16:
         bench_fn(reps, warmup_reps, torch_matmul, a, b)
     bench_fn(reps, warmup_reps, matmul, a, b.T)
-    bench_fn(reps, warmup_reps, matmul_tma_ws, a, b)
     bench_fn(reps, warmup_reps, matmul_persistent, a, b.T)
     if supports_tma():
+        bench_fn(reps, warmup_reps, matmul_tma_ws, a, b)
         bench_fn(reps, warmup_reps, matmul_tma_persistent, a, b)
         bench_fn(reps, warmup_reps, matmul_descriptor_persistent, a, b)

test/Conversion/amd/tritongpu_wmma_dot_to_llvm.mlir

Lines changed: 35 additions & 7 deletions

@@ -27,13 +27,13 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 tt.return
 }
 
-// CHECK-LABEL: wmma1_dot
-tt.func @wmma1_dot(%arg0: tensor<16x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>>, %arg1: tensor<16x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>>, %arg2: tensor<16x16xf16, #mma1>) {
+// CHECK-LABEL: wmma1_dot_f16
+tt.func @wmma1_dot_f16(%arg0: tensor<16x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>>, %arg1: tensor<16x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>>, %arg2: tensor<16x16xf16, #mma1>) {
 // CHECK-COUNT-32: llvm.extractvalue %{{.*}} : !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>
 // CHECK-COUNT-8: llvm.extractvalue %{{.*}} : !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16)>
 // CHECK: llvm.mlir.undef : vector<16xf16>
 // CHECK-COUNT-8: llvm.insertelement {{.*}} : vector<16xf16>
-// CHECK: rocdl.wmma.f16.16x16x16.f16 {{.*}} : (vector<16xf16>, vector<16xf16>, vector<16xf16>, i1) -> vector<16xf16>
+// CHECK: wmma.f16.16x16x16.f16{{.*}} : (vector<16xf16>, vector<16xf16>, vector<16xf16>, i1) -> vector<16xf16>
 %0 = tt.dot %arg0, %arg1, %arg2, inputPrecision = ieee : tensor<16x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>> * tensor<16x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>> -> tensor<16x16xf16, #mma1>
 // CHECK-COUNT-8: llvm.extractelement {{.*}} : vector<16xf16>
 // CHECK: llvm.mlir.undef : !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16)>

@@ -50,11 +50,39 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 // CHECK-COUNT-8: llvm.extractvalue %{{.*}} : !llvm.struct<(bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16)>
 // CHECK: llvm.mlir.undef : vector<16xbf16>
 // CHECK-COUNT-8: llvm.insertelement {{.*}} : vector<16xbf16>
-// CHECK: rocdl.wmma.bf16.16x16x16.bf16 {{.*}} : (vector<16xi16>, vector<16xi16>, vector<16xbf16>, i1) -> vector<16xbf16>
+// CHECK: wmma.bf16.16x16x16.bf16{{.*}} : (vector<16xi16>, vector<16xi16>, vector<16xbf16>, i1) -> vector<16xbf16>
 %0 = tt.dot %arg0, %arg1, %arg2, inputPrecision = ieee : tensor<16x16xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>> * tensor<16x16xbf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>> -> tensor<16x16xbf16, #mma1>
 tt.return
 }
 
+// CHECK-LABEL: wmma1_dot_f16_tied
+tt.func @wmma1_dot_f16_tied(%arg0: tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>>, %arg1: tensor<16x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>>, %arg2: tensor<64x16xf16, #mma1>) {
+// CHECK-COUNT-32: llvm.extractvalue %{{.*}} : !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>
+// CHECK-COUNT-8: llvm.extractvalue %{{.*}} : !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>
+// CHECK: llvm.mlir.undef : vector<16xf16>
+// CHECK-COUNT-16: llvm.insertelement {{.*}} : vector<16xf16>
+// CHECK-COUNT-2: wmma.f16.16x16x16.f16.tied{{.*}} : (vector<16xf16>, vector<16xf16>, vector<16xf16>, i1) -> vector<16xf16>
+%0 = tt.dot %arg0, %arg1, %arg2, inputPrecision = ieee : tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>> * tensor<16x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>> -> tensor<64x16xf16, #mma1>
+// CHECK-COUNT-8: llvm.extractelement {{.*}} : vector<16xf16>
+// CHECK: llvm.mlir.undef : !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>
+// CHECK-COUNT-8: llvm.insertvalue {{.*}} : !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>
+tt.return
+}
+
+// CHECK-LABEL: wmma1_dot_bf16_tied
+tt.func @wmma1_dot_bf16_tied(%arg0: tensor<64x16xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>>, %arg1: tensor<16x16xbf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>>, %arg2: tensor<64x16xbf16, #mma1>) {
+// CHECK-COUNT-32: llvm.extractvalue %{{.*}} : !llvm.struct<(bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16)>
+// CHECK-COUNT-8: llvm.extractvalue %{{.*}} : !llvm.struct<(bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16)>
+// CHECK: llvm.mlir.undef : vector<16xbf16>
+// CHECK-COUNT-16: llvm.insertelement {{.*}} : vector<16xbf16>
+// CHECK-COUNT-2: wmma.bf16.16x16x16.bf16.tied{{.*}} : (vector<16xi16>, vector<16xi16>, vector<16xbf16>, i1) -> vector<16xbf16>
+%0 = tt.dot %arg0, %arg1, %arg2, inputPrecision = ieee : tensor<64x16xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>> * tensor<16x16xbf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>> -> tensor<64x16xbf16, #mma1>
+// CHECK-COUNT-8: llvm.extractelement {{.*}} : vector<16xbf16>
+// CHECK: llvm.mlir.undef : !llvm.struct<(bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16)>
+// CHECK-COUNT-8: llvm.insertvalue {{.*}} : !llvm.struct<(bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16, bf16)>
+tt.return
+}
+
 // CHECK-LABEL: wmma1_dot_int8_32
 tt.func @wmma1_dot_int8_32(%arg0: tensor<16x16xi8, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>>, %arg1: tensor<16x16xi8, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>>, %arg2: tensor<16x16xi32, #mma1>) {
 // CHECK-COUNT-16: llvm.extractvalue %{{.*}} : !llvm.struct<(i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8)>

@@ -64,7 +92,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 // CHECK-COUNT-16: llvm.insertelement {{.*}} : vector<16xi8>
 // CHECK: llvm.bitcast %{{.*}} : vector<16xi8> to vector<4xi32>
 // CHECK-COUNT-8: llvm.extractvalue %{{.*}} : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
-// CHECK: rocdl.wmma.i32.16x16x16.iu8 {{.*}} : (i1, vector<4xi32>, i1, vector<4xi32>, vector<8xi32>, i1) -> vector<8xi32>
+// CHECK: wmma.i32.16x16x16.iu8{{.*}} : (i1, vector<4xi32>, i1, vector<4xi32>, vector<8xi32>, i1) -> vector<8xi32>
 %0 = tt.dot %arg0, %arg1, %arg2 {inputPrecision = 2 : i32, maxNumImpreciseAcc = 0 : i32} : tensor<16x16xi8, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>> * tensor<16x16xi8, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>> -> tensor<16x16xi32, #mma1>
 // CHECK-COUNT-8: llvm.insertvalue {{.*}} : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
 tt.return

@@ -79,7 +107,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 // CHECK-COUNT-16: llvm.insertelement {{.*}} : vector<16xi4>
 // CHECK: llvm.bitcast %{{.*}} : vector<16xi4> to vector<2xi32>
 // CHECK-COUNT-8: llvm.extractvalue %{{.*}} : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
-// CHECK: rocdl.wmma.i32.16x16x16.iu4 {{.*}} : (i1, vector<2xi32>, i1, vector<2xi32>, vector<8xi32>, i1) -> vector<8xi32>
+// CHECK: wmma.i32.16x16x16.iu4{{.*}} : (i1, vector<2xi32>, i1, vector<2xi32>, vector<8xi32>, i1) -> vector<8xi32>
 %0 = tt.dot %arg0, %arg1, %arg2 {inputPrecision = 2 : i32, maxNumImpreciseAcc = 0 : i32} : tensor<16x16xi4, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>> * tensor<16x16xi4, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>> -> tensor<16x16xi32, #mma1>
 // CHECK-COUNT-8: llvm.insertvalue {{.*}} : !llvm.struct<(i32, i32, i32, i32, i32, i32, i32, i32)>
 tt.return

@@ -196,7 +224,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.thr
 // CHECK-COUNT-32: llvm.insertelement
 // CHECK-COUNT-8: llvm.extractvalue %arg2
 // CHECK-COUNT-8: llvm.insertelement
-// CHECK-COUNT-2: rocdl.wmma.f16.16x16x16.f16 {{.*}} : (vector<16xf16>, vector<16xf16>, vector<16xf16>, i1) -> vector<16xf16>
+// CHECK-COUNT-2: wmma.f16.16x16x16.f16{{.*}} : (vector<16xf16>, vector<16xf16>, vector<16xf16>, i1) -> vector<16xf16>
 %0 = tt.dot %arg0, %arg1, %arg2, inputPrecision = ieee : tensor<2x16x32xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 16}>> * tensor<2x32x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 16}>> -> tensor<2x16x16xf16, #mma1>
 // CHECK-COUNT-8: llvm.extractelement
 // CHECK-COUNT-8: llvm.insertvalue
