
Commit 9275820

Merge OpenAI Triton commit 91d58f5 (#4753)
This PR changes the Triton base from 4b36a8a to 91d58f5 (Jul 11). Pass rate: 98.46%
2 parents ada4e6e + f628bc9 commit 9275820

File tree

30 files changed: +180 -179 lines changed

include/triton/Analysis/Utility.h

Lines changed: 5 additions & 1 deletion
@@ -27,6 +27,7 @@ class ReduceOpHelper {
   explicit ReduceOpHelper(triton::ReduceOp op)
       : op(op.getOperation()), axis(op.getAxis()) {
     auto firstTy = cast<RankedTensorType>(op.getOperands()[0].getType());
+    srcTy = firstTy;
     srcShape = firstTy.getShape();
     srcEncoding = firstTy.getEncoding();
     srcElementTypes = op.getElementTypes();
@@ -68,6 +69,7 @@ class ReduceOpHelper {

 private:
   triton::ReduceOp op;
+  RankedTensorType srcTy;
   ArrayRef<int64_t> srcShape;
   Attribute srcEncoding;
   SmallVector<Type> srcElementTypes;
@@ -80,7 +82,7 @@ class ScanLoweringHelper {
     auto firstTy = cast<RankedTensorType>(op.getOperands()[0].getType());
     srcShape = firstTy.getShape();
     legacyEncoding = firstTy.getEncoding();
-    srcEncoding = triton::gpu::toLinearEncoding(legacyEncoding, srcShape);
+    srcEncoding = triton::gpu::toLinearEncoding(firstTy);
     srcElementTypes = op.getElementTypes();
     // The codegen does not support different element/thread/warp order so
     // we choose one a priori. We choose that of the blocked encoding.
@@ -166,6 +168,8 @@ class GatherLoweringHelper {

 private:
   triton::GatherOp gatherOp;
+  RankedTensorType srcTy;
+  RankedTensorType dstTy;
 };

 // This struct represents a decomposed layout conversion within a warp into

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 5 additions & 5 deletions
@@ -93,7 +93,8 @@ struct SharedMemory : public SideEffects::Resource::Base<SharedMemory> {

 // Convert a distributed layout to a linear encoding
 LinearEncodingAttr toLinearEncoding(RankedTensorType type);
-LinearEncodingAttr toLinearEncoding(Attribute layout, ArrayRef<int64_t> shape);
+LinearEncodingAttr toLinearEncoding(DistributedEncodingTrait layout,
+                                    ArrayRef<int64_t> shape);

 unsigned getTotalElemsPerThread(Type type);

@@ -274,14 +275,13 @@ llvm::SmallVector<unsigned>
 expandMatrixOrderWithBatch(llvm::ArrayRef<unsigned> o);

 // Return true if the two layouts represent the exact same mapping.
-bool areLayoutsEquivalent(ArrayRef<int64_t> shape, Attribute lhs,
-                          Attribute rhs);
+bool areLayoutsEquivalent(ArrayRef<int64_t> shape, DistributedEncodingTrait lhs,
+                          DistributedEncodingTrait rhs);

 // Return true if the innermost numElems are contiguous.
 bool isInnermostContiguous(MemDescType type, unsigned numElems);

-LinearLayout inferReshapeLinearLayout(ArrayRef<int64_t> srcShape,
-                                      Attribute srcEnc,
+LinearLayout inferReshapeLinearLayout(TensorOrMemDesc srcTy,
                                       ArrayRef<int64_t> dstShape);

 // Verify the types of operations that operate on memory.
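
Not part of the diff: a minimal sketch of how a caller might use the type-based toLinearEncoding overload declared above, assuming the TritonGPU dialect header is available. The helper name getLayoutOrder is hypothetical; the point is that a RankedTensorType already carries both shape and encoding, so call sites no longer unpack them.

#include "triton/Dialect/TritonGPU/IR/Dialect.h"

namespace {
// Returns the dimension order of a tensor's layout via the type-based overload.
auto getLayoutOrder(mlir::RankedTensorType tensorTy) {
  // Previously: toLinearEncoding(tensorTy.getEncoding(), tensorTy.getShape())
  auto linAttr = mlir::triton::gpu::toLinearEncoding(tensorTy);
  return linAttr.getOrder();
}
} // namespace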

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 5 additions & 2 deletions
@@ -17,6 +17,8 @@ class SwizzledSharedEncodingAttr;
 class NVMMASharedEncodingAttr;
 class AMDRotatingSharedEncodingAttr;
 class AMDMfmaEncodingAttr;
+class TensorOrMemDesc;
+class MemDescType;

 // - BlockedEncodingAttrs have the following input dimensions.
 //
@@ -45,9 +47,10 @@ class AMDMfmaEncodingAttr;
 // elemBitWidth is the bit width of one element in the layout. This is required
 // to compute the linear layout for MMAv3 (i.e. Hopper) shared layouts (i.e.
 // shared layouts with nvmma_shared layout) but is otherwise unused.
-//
-// Returns std::nullopt if the given layout can't be converted to an LL.
 LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout);
+LinearLayout toLinearLayout(RankedTensorType type);
+LinearLayout toLinearLayout(MemDescType type);
+LinearLayout toLinearLayout(TensorOrMemDesc type);

 // Convert the shared encoding of a tensor with `nvmma_shared` layout to a
 // LinearLayout that maps from a linear shared memory offset to tensor index.
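
Not part of the diff: a minimal sketch of the call-site migration the rest of this commit applies, assuming the headers above; the function name regToSharedConversion is hypothetical. The type-based toLinearLayout overloads replace the shape-plus-encoding form, and the resulting layouts compose exactly as before.

#include "triton/Dialect/TritonGPU/IR/Dialect.h"
#include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"

namespace {
using namespace mlir;
using namespace mlir::triton;

// Composes a register layout with a shared-memory layout, mirroring the
// updated call sites in MemoryOpToLLVM.cpp further down in this commit.
LinearLayout regToSharedConversion(RankedTensorType regTy,
                                   gpu::MemDescType memDescTy) {
  // Old form: toLinearLayout(regTy.getShape(), regTy.getEncoding())
  LinearLayout regLayout = gpu::toLinearLayout(regTy);
  LinearLayout sharedLayout = gpu::toLinearLayout(memDescTy);
  return regLayout.invertAndCompose(sharedLayout);
}
} // namespace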

lib/Analysis/Allocation.cpp

Lines changed: 4 additions & 4 deletions
@@ -42,8 +42,8 @@ static unsigned getBitwidth(RankedTensorType ty) {
 static unsigned getNumScratchElemsSwizzledCvt(RankedTensorType srcTy,
                                               RankedTensorType dstTy) {
   auto *ctx = srcTy.getContext();
-  auto srcLayout = gpu::toLinearLayout(srcTy.getShape(), srcTy.getEncoding());
-  auto dstLayout = gpu::toLinearLayout(dstTy.getShape(), dstTy.getEncoding());
+  auto srcLayout = gpu::toLinearLayout(srcTy);
+  auto dstLayout = gpu::toLinearLayout(dstTy);
   srcLayout = actionRemoveBroadcastedRegs(srcLayout).apply(srcLayout);
   dstLayout = actionRemoveBroadcastedRegs(dstLayout).apply(dstLayout);
   auto bitwidth = getBitwidth(srcTy);
@@ -109,8 +109,8 @@ getScratchCvtInOutVecLengths(RankedTensorType srcTy, RankedTensorType dstTy) {
   Attribute srcLayout = srcTy.getEncoding();
   Attribute dstLayout = dstTy.getEncoding();

-  auto srcLinAttr = gpu::toLinearEncoding(srcLayout, srcTy.getShape());
-  auto dstLinAttr = gpu::toLinearEncoding(dstLayout, dstTy.getShape());
+  auto srcLinAttr = gpu::toLinearEncoding(srcTy);
+  auto dstLinAttr = gpu::toLinearEncoding(dstTy);
   auto inOrd = srcLinAttr.getOrder();
   auto outOrd = dstLinAttr.getOrder();


lib/Analysis/AxisInfo.cpp

Lines changed: 3 additions & 6 deletions
@@ -1232,8 +1232,7 @@ unsigned ModuleAxisInfoAnalysis::getContiguity(Value offsetsValue,
   // the analysis to one dimension. We should determine contiguity on the
   // flattenOuts() layout
   auto tensorTy = cast<RankedTensorType>(offsetsValue.getType());
-  auto linAttr =
-      gpu::toLinearEncoding(tensorTy.getEncoding(), tensorTy.getShape());
+  auto linAttr = gpu::toLinearEncoding(tensorTy);
   auto order = linAttr.getOrder();
   unsigned align = getAlignment(offsetsValue, elementBitWidth);

@@ -1266,8 +1265,7 @@ unsigned ModuleAxisInfoAnalysis::getAlignment(Value offsetsValue,
   auto *axisInfo = getAxisInfo(offsetsValue);
   if (!axisInfo)
     return 1;
-  auto linAttr =
-      gpu::toLinearEncoding(tensorTy.getEncoding(), tensorTy.getShape());
+  auto linAttr = gpu::toLinearEncoding(tensorTy);
   auto order = linAttr.getOrder();
   auto maxMultipleBytes = axisInfo->getDivisibility(order[0]);
   auto maxContig = axisInfo->getContiguity(order[0]);
@@ -1295,8 +1293,7 @@ unsigned ModuleAxisInfoAnalysis::getMaskAlignment(Value mask) {
   auto *axisInfo = getAxisInfo(mask);
   if (!axisInfo)
     return 1;
-  auto linAttr =
-      gpu::toLinearEncoding(tensorTy.getEncoding(), tensorTy.getShape());
+  auto linAttr = gpu::toLinearEncoding(tensorTy);
   auto maskOrder = linAttr.getOrder();
   auto alignment = std::max<unsigned>(axisInfo->getConstancy(maskOrder[0]), 1);
   LDBG("getMaskAlignment maskOrder[0] " << maskOrder[0] << " alignment "

lib/Analysis/Utility.cpp

Lines changed: 6 additions & 10 deletions
@@ -24,7 +24,7 @@ using namespace triton;
 using namespace triton::gpu;

 SmallVector<unsigned> ReduceOpHelper::getOrderWithAxisAtBeginning() {
-  auto order = toLinearEncoding(srcEncoding, srcShape).getOrder();
+  auto order = toLinearEncoding(srcTy).getOrder();
   auto it = std::find(order.begin(), order.end(), axis);
   // delete the axis from order
   order.erase(it);
@@ -37,7 +37,7 @@ SmallVector<unsigned> ReduceOpHelper::getOrderWithAxisAtBeginning() {
 // reduction axis within the warp.
 unsigned ReduceOpHelper::getThreadOffsetOnReductionAxis() {
   auto *ctx = srcEncoding.getContext();
-  auto linearLayout = toLinearLayout(srcShape, srcEncoding);
+  auto linearLayout = toLinearLayout(srcTy);
   auto kLane = mlir::StringAttr::get(ctx, "lane");
   const auto &bases = linearLayout.getBases();
   const auto &lanes = bases.find(kLane)->second;
@@ -576,10 +576,8 @@ bool GatherLoweringHelper::isWarpLocal() {
   // source and index tensors, all the elements are owned by the same warp.
   RankedTensorType srcType = gatherOp.getSrc().getType();
   RankedTensorType idxType = gatherOp.getIndices().getType();
-  LinearLayout srcLayout =
-      toLinearLayout(srcType.getShape(), srcType.getEncoding());
-  LinearLayout idxLayout =
-      toLinearLayout(idxType.getShape(), idxType.getEncoding());
+  LinearLayout srcLayout = toLinearLayout(srcType);
+  LinearLayout idxLayout = toLinearLayout(idxType);

   Builder b(gatherOp.getContext());
   StringAttr kBlock = b.getStringAttr("block");
@@ -766,10 +764,8 @@ bool matchMFMAAndDotOperandShuffleCase(RankedTensorType srcTy,
 LinearLayout minimalCvtLayout(Type srcTy_, Type dstTy_) {
   auto srcTy = cast<triton::gpu::TensorOrMemDesc>(srcTy_);
   auto dstTy = cast<triton::gpu::TensorOrMemDesc>(dstTy_);
-  LinearLayout srcLayout =
-      toLinearLayout(srcTy.getShape(), srcTy.getEncoding());
-  LinearLayout dstLayout =
-      toLinearLayout(dstTy.getShape(), dstTy.getEncoding());
+  LinearLayout srcLayout = toLinearLayout(srcTy);
+  LinearLayout dstLayout = toLinearLayout(dstTy);
   auto sDims = to_vector(srcLayout.getInDimNames());
   auto dDims = to_vector(dstLayout.getInDimNames());
   SmallVector<StringAttr> dims;

lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 4 additions & 6 deletions
@@ -43,10 +43,8 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
     auto dstTy = op.getType();

     LinearLayout conversion = minimalCvtLayout(srcTy, dstTy);
-    LinearLayout srcLayout =
-        toLinearLayout(srcTy.getShape(), srcTy.getEncoding());
-    LinearLayout dstLayout =
-        toLinearLayout(dstTy.getShape(), dstTy.getEncoding());
+    LinearLayout srcLayout = toLinearLayout(srcTy);
+    LinearLayout dstLayout = toLinearLayout(dstTy);

     StringAttr kBlock = str_attr("block");
     StringAttr kWarp = str_attr("warp");
@@ -246,8 +244,8 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion

     // Remove the kBlock dimension from the layout as it's the identity in the
     // cvt
-    auto srcLayout = toLinearLayout(srcTy.getShape(), srcTy.getEncoding());
-    auto dstLayout = toLinearLayout(dstTy.getShape(), dstTy.getEncoding());
+    auto srcLayout = toLinearLayout(srcTy);
+    auto dstLayout = toLinearLayout(dstTy);
     auto kReg = str_attr("register");
     auto kLane = str_attr("lane");
     auto kWarp = str_attr("warp");

lib/Conversion/TritonGPUToLLVM/GatherOpToLLVM.cpp

Lines changed: 2 additions & 4 deletions
@@ -209,10 +209,8 @@ void GatherOpConversion::emitWarpLocalGather(
   }

   // Compute the src and idx layouts.
-  LinearLayout srcLayout =
-      toLinearLayout(srcType.getShape(), srcType.getEncoding());
-  LinearLayout idxLayout =
-      toLinearLayout(idxType.getShape(), idxType.getEncoding());
+  LinearLayout srcLayout = toLinearLayout(srcType);
+  LinearLayout idxLayout = toLinearLayout(idxType);

   // Let `ll_src` be the source layout and `ll_idx` be the index layout.
   // Let `src_col` be a tuple of dimensions except the gather dimension,

lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp

Lines changed: 4 additions & 6 deletions
@@ -39,9 +39,8 @@ LogicalResult lowerLocalStore(Location loc, MLIRContext *ctx, Value regVal,
   auto regTy = cast<RankedTensorType>(regVal.getType());
   auto llvmElemTy = typeConverter->convertType(memDescTy.getElementType());

-  auto regLayout = toLinearLayout(regTy.getShape(), regTy.getEncoding());
-  auto sharedLayout =
-      toLinearLayout(memDescTy.getShape(), memDescTy.getEncoding());
+  auto regLayout = toLinearLayout(regTy);
+  auto sharedLayout = toLinearLayout(memDescTy);
   auto cvt = regLayout.invertAndCompose(sharedLayout);

   auto kBlock = str_attr("block");
@@ -193,9 +192,8 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
      return success();
    }

-    auto regLayout = toLinearLayout(regTy.getShape(), regTy.getEncoding());
-    auto sharedLayout =
-        toLinearLayout(memDescTy.getShape(), memDescTy.getEncoding());
+    auto regLayout = toLinearLayout(regTy);
+    auto sharedLayout = toLinearLayout(memDescTy);
    auto cvt = regLayout.invertAndCompose(sharedLayout);
    auto kBlock = str_attr("block");
    // NYI. We would need to emit a map.shared::cluster instruction.

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 4 additions & 6 deletions
@@ -832,7 +832,7 @@ bool emitTransferBetweenRegistersAndShared(
     regToSharedLayout =
         regLayout.reshapeOuts({{kOffset, regLayout.getTotalOutDimSize()}});
   } else {
-    auto sharedLL = triton::gpu::toLinearLayout(shape, sharedTy.getEncoding());
+    auto sharedLL = triton::gpu::toLinearLayout(sharedTy);
     regToSharedLayout = regLayout.invertAndCompose(sharedLL);
   }

@@ -908,8 +908,7 @@ bool emitTransferBetweenRegistersAndShared(
     const SharedMemoryObject &smemObj, Location loc, RewriterBase &rewriter,
     const TargetInfoBase &target,
     std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback) {
-  auto regLayout = triton::gpu::toLinearLayout(registerTy.getShape(),
-                                               registerTy.getEncoding());
+  auto regLayout = triton::gpu::toLinearLayout(registerTy);
   auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc);
   return emitTransferBetweenRegistersAndShared(
       regLayout, sharedTy, elemLlvmTy, maxVecElems, smemObj, loc, rewriter,
@@ -1131,8 +1130,7 @@ llvm::MapVector<StringAttr, int32_t> getFreeVariableMasks(Type type) {
   if (!tensorTy) {
     return getAllFreeVarMasks(ctx);
   }
-  auto ll =
-      triton::gpu::toLinearLayout(tensorTy.getShape(), tensorTy.getEncoding());
+  auto ll = triton::gpu::toLinearLayout(tensorTy);
   return ll.getFreeVariableMasks();
 }

@@ -1142,7 +1140,7 @@ SmallVector<SmallVector<unsigned>> emitOffsetForLayout(Attribute layout,
   auto shape = type.getShape();
   unsigned rank = shape.size();

-  auto ll = triton::gpu::toLinearLayout(shape, layout);
+  auto ll = triton::gpu::toLinearLayout(type);

   StringAttr kRegister = str_attr("register");
   StringAttr kLane = str_attr("lane");
