
Commit 88f4435

Merge commit '60a1996d12840039381f012f6d1c13c32cbabe20'
2 parents: 8609010 + 60a1996

21 files changed: +284, -90 lines
include/triton/Dialect/Gluon/IR/CMakeLists.txt

Lines changed: 7 additions & 6 deletions

@@ -1,16 +1,17 @@
 set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})
 
+set(LLVM_TARGET_DEFINITIONS GluonOps.td)
+mlir_tablegen(Ops.h.inc -gen-op-decls)
+mlir_tablegen(Ops.cpp.inc -gen-op-defs)
+add_mlir_doc(GluonOps GluonOps dialects/ -gen-op-doc)
+
 set(LLVM_TARGET_DEFINITIONS GluonDialect.td)
 mlir_tablegen(Dialect.h.inc -gen-dialect-decls -dialect=gluon)
 mlir_tablegen(Dialect.cpp.inc -gen-dialect-defs -dialect=gluon)
-mlir_tablegen(Ops.h.inc -gen-op-decls)
-mlir_tablegen(Ops.cpp.inc -gen-op-defs)
 add_mlir_doc(GluonDialect GluonDialect dialects/ -gen-dialect-doc)
-add_public_tablegen_target(GluonTableGen)
 
 set(LLVM_TARGET_DEFINITIONS GluonAttrDefs.td)
 mlir_tablegen(GluonAttrDefs.h.inc -gen-attrdef-decls)
 mlir_tablegen(GluonAttrDefs.cpp.inc -gen-attrdef-defs)
-mlir_tablegen(OpsEnums.h.inc -gen-enum-decls)
-mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs)
-add_public_tablegen_target(GluonAttrDefsIncGen)
+
+add_public_tablegen_target(GluonTableGen)

include/triton/Dialect/Gluon/IR/Dialect.h

Lines changed: 3 additions & 0 deletions

@@ -6,3 +6,6 @@
 
 #define GET_ATTRDEF_CLASSES
 #include "triton/Dialect/Gluon/IR/GluonAttrDefs.h.inc"
+
+#define GET_OP_CLASSES
+#include "triton/Dialect/Gluon/IR/Ops.h.inc"
include/triton/Dialect/Gluon/IR/GluonOps.td

Lines changed: 32 additions & 0 deletions

@@ -0,0 +1,32 @@
+#ifndef GLUON_OPS
+#define GLUON_OPS
+
+include "triton/Dialect/Gluon/IR/GluonDialect.td"
+include "triton/Dialect/Gluon/IR/GluonAttrDefs.td"
+include "triton/Dialect/Triton/IR/TritonInterfaces.td"
+include "triton/Dialect/Triton/IR/TritonTypes.td"
+
+class Gluon_Op<string mnemonic, list<Trait> traits = []> :
+    Op<Gluon_Dialect, mnemonic,
+       !listconcat(traits, [VerifyTensorLayoutsTrait])> {
+}
+
+def Gluon_SetAutoLayoutOp : Gluon_Op<"set_auto_layout",
+                                     [SameOperandsAndResultShape,
+                                      SameOperandsAndResultElementType]> {
+  let summary = "set auto encoding to a concrete encoding type";
+
+  let arguments = (ins TT_Tensor:$src);
+
+  let results = (outs TT_Tensor:$result);
+
+  let builders = [
+    OpBuilder<(ins "Attribute":$encoding, "Value":$value)>
+  ];
+
+  let hasVerifier = 1;
+
+  let assemblyFormat = "$src attr-dict `:` type($src) `->` type($result)";
+}
+
+#endif // GLUON_OPS
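For orientation, this is how the new op is driven from the Gluon frontend. A minimal sketch, assuming the import paths and the `ttgl.set_auto_layout` wrapper used in the test_frontend.py changes further down; the kernel itself is hypothetical:

from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl


@gluon.jit
def pin_layout_kernel():
    # Without an explicit layout the tensor carries #gluon.auto_encoding.
    i = ttgl.arange(0, 32)
    # set_auto_layout pins it to a concrete layout; per the assemblyFormat
    # above this prints as:
    #   gluon.set_auto_layout %i : tensor<32xi32, #gluon.auto_encoding>
    #                              -> tensor<32xi32, #blocked>
    i = ttgl.set_auto_layout(i, ttgl.BlockedLayout([1], [32], [4], [0]))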

lib/Dialect/Gluon/IR/CMakeLists.txt

Lines changed: 0 additions & 1 deletion

@@ -3,7 +3,6 @@ add_triton_library(GluonIR
 
   DEPENDS
   GluonTableGen
-  GluonAttrDefsIncGen
 
   LINK_LIBS PUBLIC
   TritonIR

lib/Dialect/Gluon/IR/Dialect.cpp

Lines changed: 21 additions & 0 deletions

@@ -12,6 +12,9 @@ namespace gluon = mlir::triton::gluon;
 #include "triton/Dialect/Gluon/IR/Dialect.cpp.inc"
 #include "triton/Dialect/Gluon/IR/GluonAttrDefs.cpp.inc"
 
+#define GET_OP_CLASSES
+#include "triton/Dialect/Gluon/IR/Ops.cpp.inc"
+
 namespace {
 
 // Layout inference for AutoEncodingAttr -> always propagate AutoEncodingAttr to
@@ -111,4 +114,22 @@ void GluonDialect::initialize() {
   addInterfaces<GluonInferLayoutInterface>();
 }
 
+void SetAutoLayoutOp::build(OpBuilder &builder, OperationState &state,
+                            Attribute enc, Value value) {
+  auto resTy = cast<RankedTensorType>(value.getType()).cloneWithEncoding(enc);
+  return build(builder, state, resTy, value);
+}
+
+LogicalResult SetAutoLayoutOp::verify() {
+  if (!isa<gluon::AutoEncodingAttr>(getSrc().getType().getEncoding())) {
+    return emitOpError("input tensor must have an auto layout type");
+  }
+  auto dstEncoding = getType().getEncoding();
+  if (!dstEncoding)
+    return emitOpError("result tensor must have an encoding");
+  if (isa<gluon::AutoEncodingAttr>(dstEncoding))
+    return emitOpError("result type must not be auto layout");
+  return success();
+}
+
 } // namespace mlir::triton::gluon
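The builder clones the source tensor type with the target encoding, and the verifier enforces three rules: the source must be auto-encoded, the result must carry an encoding, and the result must not itself be auto-encoded. A hedged frontend-level illustration of the first rule; the kernel is hypothetical, but the quoted message comes from the verifier above:

from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl


@gluon.jit
def rejected_kernel():
    # The source already has a concrete layout, so verification fails with
    # "input tensor must have an auto layout type".
    x = ttgl.arange(0, 32, layout=ttgl.BlockedLayout([1], [32], [4], [0]))
    ttgl.set_auto_layout(x, ttgl.BlockedLayout([2], [32], [4], [0]))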

lib/Dialect/Gluon/Transforms/ResolveAutoEncodings.cpp

Lines changed: 6 additions & 11 deletions

@@ -67,17 +67,11 @@ LogicalResult inferAutoLayouts(FuncOp func) {
   };
 
   // 1. Set seed values from layout conversions
-  auto res = func.walk([&](ttg::ConvertLayoutOp cvtOp) -> WalkResult {
-    auto src = cvtOp.getSrc();
-    auto res = cvtOp.getResult();
-    auto srcEnc = src.getType().getEncoding();
-    auto resEnc = res.getType().getEncoding();
-    auto isAutoSrc = isa<gluon::AutoEncodingAttr>(srcEnc);
-    auto isAutoRes = isa<gluon::AutoEncodingAttr>(resEnc);
-    if (isAutoSrc && !isAutoRes) {
-      return updateEncoding({src}, resEnc);
-    }
-    return WalkResult::advance();
+  auto res = func.walk([&](gluon::SetAutoLayoutOp op) -> WalkResult {
+    auto res = updateEncoding({op.getSrc()}, op.getType().getEncoding());
+    op.getResult().replaceAllUsesWith(op.getSrc());
+    op->erase();
+    return res;
   });
 
   if (res.wasInterrupted())
@@ -158,6 +152,7 @@ LogicalResult inferAutoLayouts(FuncOp func) {
       }
     }
   }
+
   return success();
 }
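Seeding now comes from explicit `gluon.set_auto_layout` ops instead of pattern-matching auto-to-concrete `ttg.convert_layout` ops: each op seeds its source value with the concrete result encoding, then folds away (uses rewired to the source, op erased) before propagation. A toy Python sketch of that seed-and-propagate shape, under the assumption of a simple value graph; this is the algorithm's outline, not the pass itself:

# Seeds come from gluon.set_auto_layout ops; connected auto-encoded
# values must all resolve to the same concrete encoding.
def resolve_auto_encodings(seeds, neighbors):
    encoding = dict(seeds)  # value -> concrete encoding
    worklist = list(seeds)
    while worklist:
        v = worklist.pop()
        for u in neighbors.get(v, ()):
            if u not in encoding:
                encoding[u] = encoding[v]  # propagate to unresolved values
                worklist.append(u)
            elif encoding[u] != encoding[v]:
                raise ValueError(f"conflicting encodings for {u!r}")
    return encoding


# Two values tied together by an elementwise op; one is seeded.
print(resolve_auto_encodings({"a": "#blocked"}, {"a": ["b"], "b": ["a"]}))
# -> {'a': '#blocked', 'b': '#blocked'}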

lib/Dialect/TritonGPU/Transforms/Pipeliner/AssignLatencies.cpp

Lines changed: 1 addition & 1 deletion

@@ -319,7 +319,7 @@ loadOpsToIndirectionLevel(scf::ForOp forOp, bool pipelineWithoutDot,
 
   // If the loop has numStages attribute, also consider pipelining other loads
   // that are not directly used by dot ops.
-  if (pipelineWithoutDot && !seenDot) {
+  if (pipelineWithoutDot) {
     for (Operation &op : forOp.getBody()->without_terminator()) {
       if (!isa<tt::LoadOp, tt::DescriptorLoadOp, tt::DescriptorGatherOp>(op))
         dfs(&op, &op, 0);
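With `!seenDot` dropped, a loop carrying a `num_stages` attribute also gets its loads that feed no dot considered for pipelining, even when the loop contains a dot. A hedged sketch of the affected pattern; the kernel and its pointer layout are hypothetical, and `tl.range(..., num_stages=...)` is what attaches the attribute:

import triton
import triton.language as tl


@triton.jit
def mixed_loop_kernel(a_ptr, b_ptr, c_ptr, out_ptr, aux_ptr,
                      K: tl.constexpr, BLOCK: tl.constexpr):
    offs = tl.arange(0, BLOCK)
    acc = tl.zeros((BLOCK, BLOCK), dtype=tl.float32)
    extra = tl.zeros((BLOCK,), dtype=tl.float32)
    for k in tl.range(0, K, BLOCK, num_stages=3):
        a = tl.load(a_ptr + offs[:, None] * K + k + offs[None, :])
        b = tl.load(b_ptr + (k + offs[:, None]) * BLOCK + offs[None, :])
        acc += tl.dot(a, b)  # the loop contains a dot, so seenDot was true
        # This load feeds no dot; previously it was skipped whenever the loop
        # had a dot, now it remains a pipelining candidate.
        extra += tl.load(c_ptr + k + offs)
    tl.store(out_ptr + offs[:, None] * BLOCK + offs[None, :], acc)
    tl.store(aux_ptr + offs, extra)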

python/src/gluon_ir.cc

Lines changed: 6 additions & 3 deletions

@@ -114,10 +114,9 @@ static bool isConvertLayoutTrivial(RankedTensorType dstTy, Value value) {
   auto srcTy = cast<RankedTensorType>(value.getType());
   if (srcTy.getEncoding() == dstTy.getEncoding())
     return true;
-  // Handle unresolved layouts. auto -> T is trivial but T -> auto is not
-  // necessarily.
+  // Fail safe on unresolved layouts.
   if (isa<gluon::AutoEncodingAttr>(srcTy.getEncoding()))
-    return true;
+    return false;
   if (isa<gluon::AutoEncodingAttr>(dstTy.getEncoding()))
     return false;
 
@@ -404,6 +403,10 @@ void init_gluon_ir(py::module &&m) {
            [](GluonOpBuilder &self, Type resultType, Value src) -> Value {
              return self.create<ttg::MemDescReinterpretOp>(resultType, src);
            })
+      .def("create_set_auto_layout",
+           [](GluonOpBuilder &self, Attribute layout, Value value) -> Value {
+             return self.create<gluon::SetAutoLayoutOp>(layout, value);
+           })
       .def("create_split",
            [](GluonOpBuilder &self, Value &a) -> py::tuple {
              auto argTy = cast<RankedTensorType>(a.getType());
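`isConvertLayoutTrivial` previously treated auto-to-concrete conversion as trivial; it now fails safe, and the new `create_set_auto_layout` binding is the supported way to pin an unresolved layout. A short sketch of the user-visible consequence, with a hypothetical kernel; the error text matches the updated tests below:

from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl


@gluon.jit
def pin_kernel(layout: ttgl.constexpr):
    x = ttgl.arange(0, 128, layout=ttgl.AutoLayout())
    # Now rejected: "layout conversion from AutoLayout() ... is not trivial".
    # ttgl.convert_layout(x, layout, assert_trivial=True)
    # Supported path: pin the auto layout explicitly.
    ttgl.set_auto_layout(x, layout)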

python/test/gluon/test_frontend.py

Lines changed: 26 additions & 24 deletions

@@ -73,41 +73,39 @@ def test_convert_layout_assert_trivial():
     # CHECK: ttg.convert_layout
     ttgl.convert_layout(value, equiv_layout, assert_trivial=True)
 
-    value = ttgl.arange(0, 128, layout=ttgl.AutoLayout())
-    # CHECK: ttg.convert_layout
-    ttgl.convert_layout(value, equiv_layout, assert_trivial=True)
-
 
 def test_convert_layout_not_trivial():
 
     @gluon.jit
-    def kernel():
-        src_layout: ttgl.constexpr = ttgl.BlockedLayout([2], [32], [4], [0])
-        dst_layout: ttgl.constexpr = ttgl.BlockedLayout([1], [32], [4], [0])
-
+    def kernel(src_layout: ttgl.constexpr, dst_layout: ttgl.constexpr):
         value = ttgl.arange(0, 128, layout=src_layout)
         ttgl.convert_layout(value, dst_layout, assert_trivial=True)
 
     with pytest.raises(CompilationError) as e:
-        run_parser(kernel)
+        src_layout = ttgl.BlockedLayout([2], [32], [4], [0])
+        dst_layout = ttgl.BlockedLayout([1], [32], [4], [0])
+        kernel.warmup(src_layout, dst_layout, grid=(1, ))
 
-    assert "layout conversion from BlockedLayout(size_per_thread=(2" in str(e.value.__cause__)
-    assert "to BlockedLayout(size_per_thread=(1" in str(e.value.__cause__)
+    assert "layout conversion from BlockedLayout(size_per_thread=[2]" in str(e.value.__cause__)
+    assert "to BlockedLayout(size_per_thread=[1]" in str(e.value.__cause__)
     assert "is not trivial" in str(e.value.__cause__)
 
-    @gluon.jit
-    def kernel():
-        src_layout: ttgl.constexpr = ttgl.BlockedLayout([2], [32], [4], [0])
-        dst_layout: ttgl.constexpr = ttgl.AutoLayout()
+    with pytest.raises(CompilationError) as e:
+        src_layout = ttgl.BlockedLayout([2], [32], [4], [0])
+        dst_layout = ttgl.AutoLayout()
+        kernel.warmup(src_layout, dst_layout, grid=(1, ))
 
-        value = ttgl.arange(0, 128, layout=src_layout)
-        ttgl.convert_layout(value, dst_layout, assert_trivial=True)
+    assert "layout conversion from BlockedLayout(size_per_thread=[2]" in str(e.value.__cause__)
+    assert "to AutoLayout() is not trivial" in str(e.value.__cause__)
 
     with pytest.raises(CompilationError) as e:
-        run_parser(kernel)
+        src_layout: ttgl.constexpr = ttgl.AutoLayout()
+        dst_layout: ttgl.constexpr = ttgl.BlockedLayout([2], [32], [4], [0])
+        kernel.warmup(src_layout, dst_layout, grid=(1, ))
 
-    assert "layout conversion from BlockedLayout(size_per_thread=(2" in str(e.value.__cause__)
-    assert "to AutoLayout() is not trivial" in str(e.value.__cause__)
+    assert "layout conversion from AutoLayout()" in str(e.value.__cause__)
+    assert "to BlockedLayout(size_per_thread=[2]" in str(e.value.__cause__)
+    assert "is not trivial" in str(e.value.__cause__)
 
 
 @gluon.jit
@@ -1223,6 +1221,7 @@ def kernel():
 @filecheck_test
 @gluon.jit
 def test_auto_layout():
+    # CHECK-DAG: [[BLOCKED:#.*]] = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
     # CHECK: [[X_1D:%.*]] = arith.constant dense<7> : tensor<16xi32, #gluon.auto_encoding>
     # CHECK: [[Y_1D:%.*]] = arith.constant dense<2> : tensor<8xi32, #gluon.auto_encoding>
     x = ttgl.full([16], 7, ttgl.int32, layout=ttgl.AutoLayout())[:, None]
@@ -1232,8 +1231,11 @@ def test_auto_layout():
     # CHECK: (tensor<16x8xi32, #gluon.auto_encoding>) -> tensor<16xi32, #gluon.auto_encoding
     ttgl.sum(z, axis=1)
 
-    # CHECK: tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #gluon.auto_encoding>
-    ttgl.arange(0, 32)
+    # CHECK: [[I:%.*]] = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #gluon.auto_encoding>
+    i = ttgl.arange(0, 32)
+
+    # CHECK: gluon.set_auto_layout [[I]] : tensor<32xi32, #gluon.auto_encoding> -> tensor<32xi32, [[BLOCKED]]
+    ttgl.set_auto_layout(i, ttgl.BlockedLayout([1], [32], [4], [0]))
 
 
 @filecheck_test
@@ -1245,13 +1247,13 @@ def test_auto_layout_broadcast():
     x = ttgl.full([16, 1], 1, ttgl.int32, layout=ttgl.AutoLayout())
     y = ttgl.full([1, 16], 2, ttgl.int32, layout=ttgl.BlockedLayout([1, 1], [1, 32], [4, 1], [1, 0]))
 
-    # CHECK: [[XCVT:%.*]] = ttg.convert_layout [[X]] : tensor<16x1xi32, #gluon.auto_encoding> -> tensor<16x1xi32, [[BLOCKED]]>
+    # CHECK: [[XCVT:%.*]] = gluon.set_auto_layout [[X]] : tensor<16x1xi32, #gluon.auto_encoding> -> tensor<16x1xi32, [[BLOCKED]]>
     # CHECK: [[XBCAST:%.*]] = tt.broadcast [[XCVT]]
     # CHECK: [[YBCAST:%.*]] = tt.broadcast [[Y]]
    # CHECK: arith.addi [[XBCAST]], [[YBCAST]] : tensor<16x16xi32, [[BLOCKED]]>
     _ = x + y
 
-    # CHECK: [[XCVT2:%.*]] = ttg.convert_layout [[X]] : tensor<16x1xi32, #gluon.auto_encoding> -> tensor<16x1xi32, [[BLOCKED]]>
+    # CHECK: [[XCVT2:%.*]] = gluon.set_auto_layout [[X]] : tensor<16x1xi32, #gluon.auto_encoding> -> tensor<16x1xi32, [[BLOCKED]]>
     # CHECK: [[YBCAST2:%.*]] = tt.broadcast [[Y]]
     # CHECK: [[XBCAST2:%.*]] = tt.broadcast [[XCVT2]]
     # CHECK: arith.muli [[YBCAST2]], [[XBCAST2]] : tensor<16x16xi32, [[BLOCKED]]>

python/test/unit/language/test_core.py

Lines changed: 30 additions & 0 deletions

@@ -6096,6 +6096,36 @@ def kernel(Semaphore, Out, total: tl.constexpr):
     assert out.item() >= 0
 
 
+def test_constexpr_flattens():
+    assert tl.constexpr(tl.constexpr(5)) == tl.constexpr(5)
+    assert tl.constexpr(tl.constexpr(tl.constexpr(5))) == tl.constexpr(5)
+
+
+@pytest.mark.parametrize("literal, tensor_ty", [(10, tl.int32), (32.1, tl.float32),
+                                                ((5, 6, 7), None),  # tuples can't be lifted to tensors
+                                                ])
+def test_constexpr_assignment(literal, tensor_ty):
+    from triton.language.core import constexpr_type
+
+    @triton.jit
+    def kernel(input_literal: tl.constexpr, tensor_type: tl.constexpr):
+        patched_literal: tl.constexpr = PATCHED
+        # Sanity checks
+        tl.static_assert(patched_literal.type == constexpr_type(PATCHED))
+        tl.static_assert(input_literal.type == constexpr_type(PATCHED))
+
+        assigned_literal: tl.constexpr = input_literal
+        tl.static_assert(assigned_literal.type == constexpr_type(PATCHED))
+        tl.static_assert(assigned_literal == patched_literal)
+
+        if tensor_type is not None:
+            assigned_variable = input_literal
+            tl.static_assert(assigned_variable.type == tensor_type)
+
+    kernel_patched = patch_kernel(kernel, {'PATCHED': f"{literal}"})
+    kernel_patched[(1, )](literal, tensor_ty)
+
+
 @triton.jit
 def return_poison(x):
     a = False
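These tests pin down the assignment semantics inside `@triton.jit` kernels: an annotated assignment keeps the value a compile-time `constexpr`, while a bare assignment lifts a scalar literal to a runtime tensor value. A minimal sketch of the distinction, with a hypothetical kernel and a scalar argument assumed:

import triton
import triton.language as tl


@triton.jit
def assign_kernel(n: tl.constexpr):
    a: tl.constexpr = n    # annotated: stays a compile-time constexpr
    tl.static_assert(a == n)
    b = n                  # bare: lifted to a runtime tensor value (e.g. int32)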
