intel
diff --git a/‎include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td‎
Lines changed: 20 additions & 2 deletions b/‎include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td‎
Lines changed: 20 additions & 2 deletions
diff --git a/‎include/triton/Dialect/TritonGPU/Transforms/Passes.td‎
Lines changed: 1 addition & 0 deletions b/‎include/triton/Dialect/TritonGPU/Transforms/Passes.td‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp‎
Lines changed: 22 additions & 0 deletions b/‎lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎lib/Dialect/TritonGPU/IR/Ops.cpp‎
Lines changed: 24 additions & 22 deletions b/‎lib/Dialect/TritonGPU/IR/Ops.cpp‎
Lines changed: 24 additions & 22 deletions
diff --git a/‎lib/Dialect/TritonGPU/Transforms/Canonicalize.cpp‎
Lines changed: 5 additions & 0 deletions b/‎lib/Dialect/TritonGPU/Transforms/Canonicalize.cpp‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎lib/Dialect/TritonNvidiaGPU/IR/Dialect.cpp‎
Lines changed: 2 additions & 0 deletions b/‎lib/Dialect/TritonNvidiaGPU/IR/Dialect.cpp‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎lib/Dialect/TritonNvidiaGPU/Transforms/TensorMemoryAllocation.cpp‎
Lines changed: 8 additions & 2 deletions b/‎lib/Dialect/TritonNvidiaGPU/Transforms/TensorMemoryAllocation.cpp‎
Lines changed: 8 additions & 2 deletions
@@ -252,8 +252,6 @@ def TTG_MemDescTransOp : TTG_Op<"memdesc_trans", [Pure,
     representing a transposed view of the buffer.
   }];
 
-  let arguments = (ins TTG_MemDescType:$src, Variadic<I32>:$order);
-
   let arguments = (
     ins TTG_MemDescType:$src,
     DenseI32ArrayAttr:$order
@@ -284,6 +282,26 @@ def TTG_MemDescReshapeOp : TTG_Op<"memdesc_reshape", [Pure,
   let hasVerifier = 1;
 }
 
+def TTG_MemDescReinterpretOp : TTG_Op<"memdesc_reinterpret", [Pure, MemDescViewTrait]> {
+  let summary = "reinterpret a memory descriptor as a different type and shape";
+
+  let description = [{
+    The `ttg.memdesc_reinterpret` operation reinterprets a memory descriptor
+    as one with a different shape and element type. Because memory descriptors
+    lack strides, this operation is only valid if the original memory descriptor
+    is contiguous.
+  }];
+  // One memdesc operand in, one memdesc result out; no attributes are declared.
+  let arguments = (ins TTG_MemDescType:$src);
+  let results = (outs TTG_MemDescType:$result);
+  // Printed/parsed as: $src attr-dict : <src type> -> <result type>.
+  let assemblyFormat = [{
+    $src attr-dict `:` qualified(type($src)) `->` qualified(type($result))
+  }];
+  // Requests a hand-written C++ verify() hook for this op.
+  let hasVerifier = 1;
+}
+
 def TTG_LocalLoadOp : TTG_Op<"local_load"> {
   let summary = "Load a buffer from local memory into a distributed tensor";
 
 
@@ -369,6 +369,7 @@ def TritonGPUCanonicalize: Pass<"tritongpu-canonicalize"> {
   }];
   let dependentDialects = [
     "mlir::arith::ArithDialect",
+    "mlir::cf::ControlFlowDialect",
     "mlir::scf::SCFDialect",
   ];
 }
 
@@ -480,6 +480,27 @@ struct MemDescSubviewOpConversion
     return success();
   }
 };
+
+struct MemDescReinterpretOpConversion
+    : public ConvertOpToLLVMPattern<MemDescReinterpretOp> {
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+  // Lowering: rebuild the memory object around the source's base; always returns success().
+  LogicalResult matchAndRewrite(MemDescReinterpretOp op, OpAdaptor adaptor,
+                                ConversionPatternRewriter &b) const override {
+    Location loc = op.getLoc();
+    MemDescType srcTy = op.getSrc().getType();
+    MemDescType dstTy = op.getType();
+    Type srcElemTy = getTypeConverter()->convertType(srcTy.getElementType());
+    Type dstElemTy = getTypeConverter()->convertType(dstTy.getElementType());
+    // Keep the source base, but describe it with the destination element type and rank.
+    auto smemObj =
+        getSharedMemoryObjectFromStruct(loc, adaptor.getSrc(), srcElemTy, b);
+    SharedMemoryObject newObj(smemObj.getBase(), dstElemTy, dstTy.getRank(),
+                              loc, b); // NOTE(review): only getBase() is reused from smemObj — confirm intended.
+    b.replaceOp(op, getStructFromSharedMemoryObject(loc, newObj, b));
+    return success();
+  }
+};
 } // namespace
 
 void mlir::triton::populateViewOpToLLVMPatterns(
@@ -497,4 +518,5 @@ void mlir::triton::populateViewOpToLLVMPatterns(
   patterns.add<TransOpConversion>(typeConverter, benefit);
   patterns.add<BroadcastOpConversion>(typeConverter, benefit);
   patterns.add<MemDescSubviewOpConversion>(typeConverter, benefit);
+  patterns.add<MemDescReinterpretOpConversion>(typeConverter, benefit);
 }
@@ -439,14 +439,20 @@ MemDescTransOp::inferReturnTypes(MLIRContext *context,
       return failure();
     }
   }
+
+  // Permute the last `rank` dims of the source alloc shape.
+  SmallVector<int64_t> allocShape =
+      applyPermutation(argTy.getAllocShape().take_back(order.size()), order);
+  allocShape.insert(allocShape.begin(), argTy.getAllocShape().begin(),
+                    argTy.getAllocShape().end() - order.size());
+
   inferredReturnTypes.push_back(
       MemDescType::get(retShape, retEltTy, retEncoding, argTy.getMemorySpace(),
-                       argTy.getMutableMemory()));
+                       argTy.getMutableMemory(), allocShape));
   return success();
 }
 
 // MemDescReshapeOp
-
 LogicalResult MemDescReshapeOp::verify() {
   MemDescType dstType = getResult().getType();
   MemDescType srcType = getSrc().getType();
@@ -472,6 +478,13 @@ LogicalResult MemDescReshapeOp::verify() {
   return success();
 }
 
+// MemDescReinterpretOp: verification only requires matching memory spaces.
+LogicalResult MemDescReinterpretOp::verify() {
+  if (getSrc().getType().getMemorySpace() != getType().getMemorySpace())
+    return emitError("source and destination memory space must match");
+  return success(); // NOTE(review): shape, element type and contiguity are not checked here.
+}
+
 // LocalAllocOp
 void LocalAllocOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
@@ -623,20 +636,15 @@ LogicalResult MemDescSubviewOp::verify() {
           "only nD -> (n-1)D rank-reducing subviews are supported");
     }
     for (auto offset : getOffsets().take_back(dstTy.getRank())) {
-      if (auto constOp = offset.getDefiningOp<arith::ConstantOp>()) {
-        if (auto offsetInt = dyn_cast<IntegerAttr>(constOp.getValue())) {
-          if (offsetInt.getInt() != 0) {
-            return emitError("only first offset can be non-zero for a "
-                             "rank-reducing subview");
-          }
-        } else {
-          return emitError(
-              "only integer constant values are allowed for the split");
-        }
-      } else {
+      APInt value;
+      if (!matchPattern(offset, m_ConstantInt(&value))) {
         return emitError("only constant values are allowed outside the front "
                          "dimension in a rank-reducing subview");
       }
+      if (!value.isZero()) {
+        return emitError(
+            "only first offset can be non-zero for a rank-reducing subview");
+      }
     }
     return success();
   }
@@ -658,16 +666,10 @@ LogicalResult MemDescSubviewOp::verify() {
   }
   SmallVector<int64_t> offsets;
   for (auto offset : getOffsets()) {
-    if (auto constOp = offset.getDefiningOp<arith::ConstantOp>()) {
-      if (auto offsetInt = dyn_cast<IntegerAttr>(constOp.getValue())) {
-        offsets.push_back(offsetInt.getInt());
-      } else {
-        return emitError(
-            "only integer constant values are allowed for the split");
-      }
-    } else {
+    APInt value;
+    if (!matchPattern(offset, m_ConstantInt(&value)))
       return emitError("only constant values are allowed for the split");
-    }
+    offsets.push_back(value.getSExtValue());
   }
   // Identity subview
   if (dim == -1) {
 
@@ -1,4 +1,5 @@
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -32,6 +33,8 @@ void Canonicalize::runOnOperation() {
       patterns);
   ctx->getLoadedDialect<scf::SCFDialect>()->getCanonicalizationPatterns(
       patterns);
+  ctx->getLoadedDialect<cf::ControlFlowDialect>()->getCanonicalizationPatterns(
+      patterns);
   populateForOpDeadArgumentElimination(patterns);
 
   // Populate select Triton canonicalization patterns. The important patterns to
@@ -43,4 +46,6 @@ void Canonicalize::runOnOperation() {
   ExpandDimsOp::getCanonicalizationPatterns(patterns, ctx);
   ttg::WarpSpecializeOp::getCanonicalizationPatterns(patterns, ctx);
   ttng::TensorDescToTMAPtrOp::getCanonicalizationPatterns(patterns, ctx);
+
+  (void)applyPatternsGreedily(getOperation(), std::move(patterns));
 }
@@ -30,6 +30,7 @@
 #include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/OpImplementation.h"
 #include "triton/Analysis/Utility.h"
+#include "triton/Dialect/Triton/IR/Interfaces.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"
 #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
@@ -264,6 +265,7 @@ void TritonNvidiaGPUDialect::initialize() {
 #include "triton/Dialect/TritonNvidiaGPU/IR/Ops.cpp.inc"
       >();
   addInterfaces<TritonGPUOpAsmInterface>();
+  addInterfaces<TritonInlinerInterface>();
 }
 
 // verify TritonNvidiaGPU ops
 
@@ -13,6 +13,8 @@ namespace mlir {
 namespace triton {
 namespace nvidia_gpu {
 
+namespace ttg = triton::gpu;
+
 #define GEN_PASS_DEF_TRITONTENSORMEMORYALLOCATIONPASS
 #include "triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h.inc"
 
@@ -118,7 +120,7 @@ static Interval<int> getLiveIntervals(Value value, Liveness &liveness,
   SmallVector<Operation *> users(value.getUsers());
   while (!users.empty()) {
     Operation *user = users.pop_back_val();
-    if (!isa<triton::gpu::MemDescSubviewOp>(user))
+    if (!isa<ttg::MemDescSubviewOp, ttg::MemDescReinterpretOp>(user))
       continue;
     auto usersLivness = liveness.resolveLiveness(user->getResult(0));
     liveOperations.insert(liveOperations.end(), usersLivness.begin(),
@@ -177,10 +179,14 @@ static Operation *getAlloc(Value value) {
   while (true) {
     if (auto allocOp = value.getDefiningOp<TMEMAllocOp>())
       return allocOp;
-    if (auto subviewOp = value.getDefiningOp<triton::gpu::MemDescSubviewOp>()) {
+    if (auto subviewOp = value.getDefiningOp<ttg::MemDescSubviewOp>()) {
       value = subviewOp.getSrc();
       continue;
     }
+    if (auto reinterpOp = value.getDefiningOp<ttg::MemDescReinterpretOp>()) {
+      value = reinterpOp.getSrc();
+      continue;
+    }
     auto arg = dyn_cast<BlockArgument>(value);
     if (!arg || !isa<triton::gpu::WarpSpecializePartitionsOp>(
                     arg.getOwner()->getParentOp()))
Original file line number	Diff line number	Diff line change
`@@ -369,6 +369,7 @@ def TritonGPUCanonicalize: Pass<"tritongpu-canonicalize"> {`
`369`	`369`	`}];`
`370`	`370`	`let dependentDialects = [`
`371`	`371`	`"mlir::arith::ArithDialect",`
	`372`	`+ "mlir::cf::ControlFlowDialect",`
`372`	`373`	`"mlir::scf::SCFDialect",`
`373`	`374`	`];`
`374`	`375`	`}`