Skip to content

Commit 2c10050

Browse files
Merge commit '9743ec0dca5bbd9dbce20adc3ee273af6b095f94'
2 parents 541ff05 + 9743ec0 commit 2c10050

File tree

26 files changed

+1013
-290
lines changed

26 files changed

+1013
-290
lines changed

include/triton/Analysis/Utility.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,11 @@ bool isBlockedToDotShortcut(RankedTensorType srcTy, RankedTensorType dstTy);
231231
bool matchMmaV3AndDotOperandLayout(RankedTensorType srcTy,
232232
RankedTensorType dstTy);
233233

234+
// Check if MFMA layout can be converted to the dot operand
235+
// layout using warp shuffle.
236+
bool matchMFMAAndDotOperandShuffleCase(RankedTensorType srcTy,
237+
RankedTensorType dstTy);
238+
234239
// TODO: Move utility functions that belong to ConvertLayoutOp to class
235240
// ConvertLayoutOpHelper in the future
236241
bool shouldUseDistSmem(Attribute srcLayout, Attribute dstLayout);

lib/Analysis/Utility.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "mlir/IR/Dialect.h"
1212
#include "mlir/IR/Matchers.h"
1313
#include "mlir/Support/LLVM.h"
14+
#include "triton/Conversion/MLIRTypes.h"
1415
#include "triton/Dialect/Triton/IR/Dialect.h"
1516
#include "triton/Dialect/Triton/IR/Utility.h"
1617
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
@@ -650,6 +651,25 @@ bool matchMmaV3AndDotOperandLayout(RankedTensorType srcTy,
650651
return ans;
651652
}
652653

654+
bool matchMFMAAndDotOperandShuffleCase(RankedTensorType srcTy,
655+
RankedTensorType dstTy) {
656+
auto mfmaLayout = dyn_cast<AMDMfmaEncodingAttr>(srcTy.getEncoding());
657+
auto dotOperandLayout = dyn_cast<DotOperandEncodingAttr>(dstTy.getEncoding());
658+
if (!mfmaLayout || !dotOperandLayout)
659+
return false;
660+
661+
// Currently supporting 32x32 and 16x16 FP8 MFMA -> dot operand case
662+
return dotOperandLayout.getParent() == mfmaLayout &&
663+
dotOperandLayout.getOpIdx() == 0 && mfmaLayout.getIsTransposed() &&
664+
dotOperandLayout.getKWidth() == 8 &&
665+
getContigPerThread(mfmaLayout)[1] == 4 &&
666+
((mfmaLayout.getMDim() == 16 && mfmaLayout.getNDim() == 16) ||
667+
(mfmaLayout.getMDim() == 32 && mfmaLayout.getNDim() == 32)) &&
668+
triton::type::isFloat8(srcTy.getElementType()) &&
669+
triton::type::isFloat8(dstTy.getElementType()) &&
670+
mfmaLayout.getWarpsPerCTA()[1] == 1;
671+
}
672+
653673
// We get the smallest submap of srcTy^{-1} * dstTy that is not the identity
654674
// under kBlock, kWarp or kLane (in that order). The idea here is that if we
655675
// have a transformation that's the identity on kBlock, we don't need to use
@@ -749,7 +769,10 @@ bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy) {
749769
return !cvtReordersRegisters(srcTy, dstTy) &&
750770
!triton::gpu::intel::isDpasToDotShortcut(srcTy, dstTy) &&
751771
!isBlockedToDotShortcut(srcTy, dstTy) &&
752-
!matchMmaV3AndDotOperandLayout(srcTy, dstTy);
772+
!matchMmaV3AndDotOperandLayout(srcTy, dstTy) &&
773+
// to be removed when generalized warp shuffle conversions
774+
// are ready:
775+
!matchMFMAAndDotOperandShuffleCase(srcTy, dstTy);
753776
}
754777

755778
bool atomicNeedsSharedMemory(Value value) {

lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -402,6 +402,12 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
402402
return failure();
403403
}
404404

405+
// The following check can be removed when generalized warp shuffle
406+
// conversions are ready:
407+
if (matchMFMAAndDotOperandShuffleCase(srcTy, dstTy)) {
408+
return failure();
409+
}
410+
405411
assert(cvtNeedsSharedMemory(srcTy, dstTy));
406412

407413
SmallVector<Value> inVals =

python/src/ir.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -605,6 +605,7 @@ void init_triton_ir(py::module &&m) {
605605
"Function argument index out of range");
606606
return self.getArgument(idx);
607607
})
608+
.def("get_num_args", &FuncOp::getNumArguments)
608609
.def(
609610
"add_entry_block",
610611
[](FuncOp &self) -> Block * { return self.addEntryBlock(); },

python/test/unit/language/test_compile_errors.py

Lines changed: 25 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def kernel():
1717
a += 1 # noqa
1818

1919
with pytest.raises(CompilationError) as e:
20-
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constants={}))
20+
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constexprs={}))
2121

2222
try:
2323
assert "is not defined" in str(e.value), "error should mention the undefined variable"
@@ -32,7 +32,7 @@ def kernel():
3232
0 + "a"
3333

3434
with pytest.raises(CompilationError) as e:
35-
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constants={}))
35+
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constexprs={}))
3636

3737
try:
3838
assert "at 2:4:" in str(e.value), "error should point to the 0"
@@ -47,7 +47,7 @@ def kernel():
4747
tl.static_assert(isinstance(0, tl.tensor))
4848

4949
with pytest.raises(CompilationError) as e:
50-
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constants={}))
50+
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constexprs={}))
5151

5252
try:
5353
assert isinstance(e.value, CompileTimeAssertionFailure)
@@ -66,7 +66,7 @@ def kernel():
6666
not (0, 0)
6767

6868
with pytest.raises(CompilationError) as e:
69-
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constants={}))
69+
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constexprs={}))
7070

7171
try:
7272
assert e.value.__cause__ is None
@@ -83,7 +83,7 @@ def kernel():
8383
1.0 << 1
8484

8585
with pytest.raises(CompilationError) as e:
86-
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constants={}))
86+
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constexprs={}))
8787

8888
try:
8989
assert "at 2:4:" in str(e.value), "error should point to the 1.0"
@@ -107,7 +107,7 @@ def kernel():
107107
nested_call()
108108

109109
with pytest.raises(CompilationError) as e:
110-
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constants={}))
110+
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constexprs={}))
111111

112112
try:
113113
inner = e.value.__cause__
@@ -130,7 +130,7 @@ def kernel():
130130
tl.expand_dims(None, -1)
131131

132132
with pytest.raises(CompilationError) as e:
133-
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constants={}))
133+
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constexprs={}))
134134

135135
try:
136136
inner = e.value.__cause__
@@ -157,7 +157,7 @@ def kernel():
157157
a = two_returns()
158158
a + tl.arange(0, 4) # only works if we took the first return
159159

160-
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constants={}))
160+
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constexprs={}))
161161

162162

163163
def test_not_const_annotate_no_err():
@@ -166,7 +166,7 @@ def test_not_const_annotate_no_err():
166166
def kernel(N: int = 1):
167167
pass
168168

169-
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={'N': 'i32'}, constants={}))
169+
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={'N': 'i32'}, constexprs={}))
170170

171171

172172
@triton.jit
@@ -186,14 +186,14 @@ def kernel1(N: tl.constexpr):
186186
a = returns_branched_on_constexpr(N)
187187
a + tl.arange(0, 4)
188188

189-
triton.compile(triton.compiler.ASTSource(fn=kernel1, signature={}, constants={"N": 0}))
189+
triton.compile(triton.compiler.ASTSource(fn=kernel1, signature={"N": "constexpr"}, constexprs={"N": 0}))
190190

191191
@triton.jit
192192
def kernel2(N: tl.constexpr):
193193
a = returns_branched_on_constexpr(N)
194194
a + tl.arange(0, 8)
195195

196-
triton.compile(triton.compiler.ASTSource(fn=kernel2, signature={}, constants={"N": 1}))
196+
triton.compile(triton.compiler.ASTSource(fn=kernel2, signature={"N": "constexpr"}, constexprs={"N": 1}))
197197

198198

199199
@triton.jit
@@ -211,7 +211,7 @@ def kernel(N: int):
211211
returns_branched_on_non_constexpr(N)
212212

213213
with pytest.raises(CompilationError) as e:
214-
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={'N': 'i32'}, constants={}))
214+
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={'N': 'i32'}, constexprs={}))
215215

216216
try:
217217
assert "at 2:4:" in str(e.value), "error should point to the function call"
@@ -227,7 +227,7 @@ def kernel():
227227
tl.arange(2, 7)
228228

229229
with pytest.raises(CompilationError) as e:
230-
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constants={}))
230+
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constexprs={}))
231231
assert str(e.value.__cause__) == "arange's range must be a power of 2"
232232

233233

@@ -238,7 +238,7 @@ def kernel():
238238
tl.full((33, ), 0, dtype=tl.int64)
239239

240240
with pytest.raises(CompilationError) as e:
241-
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constants={}))
241+
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constexprs={}))
242242
assert str(e.value.__cause__) == "Shape element 0 must be a power of 2"
243243

244244

@@ -251,7 +251,7 @@ def kernel():
251251
a = CAPTURED # noqa
252252

253253
with pytest.raises(CompilationError) as e:
254-
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constants={}))
254+
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constexprs={}))
255255
assert "CAPTURED is not defined" in str(e.value)
256256

257257

@@ -265,7 +265,7 @@ def kernel():
265265
a = GLOBAL # noqa
266266

267267
with pytest.raises(CompilationError) as e:
268-
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constants={}))
268+
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constexprs={}))
269269
assert "global variable" in str(e.value)
270270

271271

@@ -279,7 +279,7 @@ def kernel():
279279
a = CONSTEXPR_ANNOTATED_GLOBAL # noqa
280280

281281
# No error.
282-
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constants={}))
282+
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constexprs={}))
283283

284284

285285
CONSTEXPR_GLOBAL = tl.constexpr(42)
@@ -292,7 +292,7 @@ def kernel():
292292
a = CONSTEXPR_GLOBAL # noqa
293293

294294
# No error.
295-
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constants={}))
295+
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constexprs={}))
296296

297297

298298
TYPE_ALIAS = tl.pointer_type(tl.int32)
@@ -305,7 +305,7 @@ def kernel():
305305
a = TYPE_ALIAS # noqa
306306

307307
# No error.
308-
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constants={}))
308+
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constexprs={}))
309309

310310

311311
def test_global_access_in_fn_default_arg():
@@ -315,7 +315,7 @@ def kernel(a=GLOBAL):
315315
pass
316316

317317
# No error.
318-
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={'a': "i32"}, constants={}))
318+
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={'a': "i32"}, constexprs={}))
319319

320320

321321
def test_defaults_assign_no_err():
@@ -324,7 +324,7 @@ def test_defaults_assign_no_err():
324324
def kernel(a=1, B: tl.constexpr = ""):
325325
pass
326326

327-
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={'a': 'i32'}, constants={'B': ""}))
327+
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={'a': 'i32', 'B': 'constexpr'}, constexprs={'B': ""}))
328328

329329

330330
def test_where_warning(fresh_triton_cache):
@@ -337,7 +337,7 @@ def kernel():
337337
tl.where(a, b, c)
338338

339339
with pytest.warns(UserWarning):
340-
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constants={}))
340+
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constexprs={}))
341341

342342

343343
@pytest.mark.parametrize("dtype", [tl.float8e5, tl.float8e5b16, tl.float8e4nv, tl.float8e4b8, tl.float8e4b15])
@@ -371,7 +371,8 @@ def dtype_kernel(dtype: tl.constexpr):
371371
ctx = pytest.raises(CompilationError, match="")
372372

373373
with ctx as e:
374-
triton.compile(triton.compiler.ASTSource(fn=dtype_kernel, signature={}, constants={"dtype": dtype}))
374+
triton.compile(
375+
triton.compiler.ASTSource(fn=dtype_kernel, signature={"dtype": "constexpr"}, constexprs={"dtype": dtype}))
375376

376377
if dtype not in supported_dtypes:
377378
try:
@@ -390,7 +391,7 @@ def dot_kernel():
390391
tl.dot(a, b, max_num_imprecise_acc=128)
391392

392393
with pytest.raises(CompilationError) as e:
393-
triton.compile(triton.compiler.ASTSource(fn=dot_kernel, signature={}, constants={}))
394+
triton.compile(triton.compiler.ASTSource(fn=dot_kernel, signature={}, constexprs={}))
394395
try:
395396
assert (str(e.value.__cause__) == "max_num_imprecise_acc (128) must be <= K (64)")
396397
except AssertionError as assertion_err:

python/test/unit/language/test_core.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4407,15 +4407,17 @@ def kernel(x):
44074407
def test_value_specialization(value: int, value_type: str, device) -> None:
44084408

44094409
def repr(specialization):
4410-
spec_type = specialization.signature["VALUE"]
4411-
return f"kernel_{spec_type}"
4410+
ty = specialization.signature["value1"]
4411+
cst = '_'.join([k for k, v in specialization.constants.items() if v == 1])
4412+
return f"kernel_{ty}_{cst}"
44124413

44134414
@triton.jit(repr=repr)
4414-
def kernel(VALUE, X):
4415+
def kernel(value1, is_one, X):
44154416
pass
44164417

44174418
x = torch.tensor([3.14159], device=device)
4418-
h = kernel[(1, )](value, x)
4419+
h = kernel[(1, )](value, 1, x)
4420+
assert "is_one" in h.name
44194421
assert value_type in h.name
44204422

44214423

@@ -6186,6 +6188,19 @@ def sanitize_sum_2d_kernel(Z, X, BLOCK_0: tl.constexpr, BLOCK_1: tl.constexpr, r
61866188
torch.testing.assert_close(Z, X.sum(reduce_dim).to(torch.int32))
61876189

61886190

6191+
def test_dtype(device):
6192+
6193+
@triton.jit
6194+
def kernel(X):
6195+
dtype_x: tl.constexpr = X.dtype.element_ty
6196+
tl.static_assert(dtype_x == tl.int32)
6197+
tl.static_assert(dtype_x == tl.constexpr(tl.int32))
6198+
tl.static_assert(dtype_x == tl.int8 or (dtype_x == tl.int16 or dtype_x == tl.int32))
6199+
6200+
X = torch.zeros(1, dtype=torch.int32, device=device)
6201+
kernel[(1, )](X)
6202+
6203+
61896204
def test_side_effectful_scan(device):
61906205
if device != "cuda":
61916206
pytest.xfail()

python/test/unit/language/test_decorator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def kernel():
2323
pass
2424

2525
try:
26-
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constants={}))
26+
triton.compile(triton.compiler.ASTSource(fn=kernel, signature={}, constexprs={}))
2727
except Exception as e:
2828
pytest.fail(f"triton compile failed with error: {e}")
2929

0 commit comments

Comments (0)