
Commit 6744eea

Reland upstream commits #6007 and #6004 (#3575)
Reland "[LAYOUTS] [NFC] Make order accept a RankedTensorType (#6007)" and "[LAYOUTS] [NFC] Just accept DistributedEncodings in SliceLayout (#6004)"
1 parent 9da95a7 commit 6744eea
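
The practical effect of the relanded pair is that order queries flow through typed values instead of bare encoding attributes, so the shape being tiled is always in scope. A minimal before/after sketch, assuming a RankedTensorType `ty` carrying a distributed encoding (the function and variable names are illustrative, not from this diff):

    #include "triton/Dialect/TritonGPU/IR/Dialect.h"

    using namespace mlir;

    SmallVector<unsigned> queryOrder(RankedTensorType ty) {
      // Before this commit, callers stripped the type down to its attribute:
      //   auto order = triton::gpu::getOrder(ty.getEncoding());
      // Now the type itself is the argument, so the overload can consult both
      // the encoding and ty.getShape():
      return triton::gpu::getOrder(ty);
    }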


43 files changed: +2182 −1043 lines

include/triton/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVMBase.h

Lines changed: 1 addition & 1 deletion

@@ -137,7 +137,7 @@ class ElementwiseOpConversionBase : public ConvertOpToLLVMPattern<SourceOp> {
     if (rank > 1) {
       // reorder the shape and constancy vectors by the axis order:
       // from the fastest-changing to the smallest-changing axis
-      SmallVector<unsigned> order = getOrder(encoding);
+      SmallVector<unsigned> order = getOrder(rtType);
       if (rank != order.size())
         return resultVals;
       elemsPerThread = applyPermutation(elemsPerThread, order);

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 1 addition & 1 deletion

@@ -387,7 +387,7 @@ class SharedMemoryObject {
     auto allocShape = memDesc.getAllocShape();
     auto allocShapePerCTA = triton::gpu::getAllocationShapePerCTA(
         memDesc.getEncoding(), allocShape);
-    auto layoutOrder = triton::gpu::getOrder(memDesc.getEncoding());
+    auto layoutOrder = triton::gpu::getOrder(memDesc);
     auto allocStrides = SharedMemoryObject::getStridesForShape(
         allocShapePerCTA, layoutOrder, loc, rewriter);
     return SmallVector<Value>(allocStrides.end() - offsets.size(),
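
The `MemDescType` overload used here matters because a shared layout can end up attached to a memdesc whose shape differs from the one the layout was created for (a subview, for instance). A hedged sketch of the distinction, with `memDesc` as an illustrative value:

    // Creation-time order, read off the encoding alone:
    auto sharedEnc =
        cast<triton::gpu::SharedEncodingTrait>(memDesc.getEncoding());
    SmallVector<unsigned> declaredOrder = triton::gpu::getDefaultOrder(sharedEnc);
    // Order for the shape this memdesc actually has; per the new doc comment
    // in Dialect.h, the two may disagree when the shapes differ:
    SmallVector<unsigned> actualOrder = triton::gpu::getOrder(memDesc);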

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 19 additions & 8 deletions

@@ -134,7 +134,19 @@ getWarpsPerCTAWithUniqueData(Attribute layout, ArrayRef<int64_t> tensorShape);
 // the order of the elements within a thread.
 // For shared Layout, the order refers to which dimension of the original tensor
 // is contiguous in shared memory.
-SmallVector<unsigned> getOrder(Attribute layout);
+SmallVector<unsigned> getOrder(DistributedEncodingTrait layout,
+                               ArrayRef<int64_t> shape);
+SmallVector<unsigned> getOrder(RankedTensorType type);
+
+SmallVector<unsigned> getOrder(SharedEncodingTrait layout,
+                               ArrayRef<int64_t> shape);
+SmallVector<unsigned> getOrder(MemDescType type);
+SmallVector<unsigned> getOrder(TensorOrMemDesc type);
+
+// Order of the elements in the shared memory as defined at layout creation
+// If this layout is associated with a MemDesc with a different shape
+// it may return a different order than the actual order of the elements
+SmallVector<unsigned> getDefaultOrder(SharedEncodingTrait layout);
 
 // Returns the dimensions along which warpId's are distributed.
 // warpsPerCTA only tells the warp layout in the CTA, e.g. warpsPerCTA = [2, 4]
@@ -143,17 +155,16 @@ SmallVector<unsigned> getOrder(Attribute layout);
 // E.g. warpOrder = [0, 1] means the warp IDs are distributed as follows
 // [warp0 warp2 warp4 warp6]
 // [warp1 warp3 warp5 warp7]
-// Note that in most cases, getWarpOrder and getOrder return the same results.
-// But this is not guaranteed.
-SmallVector<unsigned> getWarpOrder(Attribute layout);
+SmallVector<unsigned> getWarpOrder(DistributedEncodingTrait layout,
+                                   ArrayRef<int64_t> shape);
+SmallVector<unsigned> getWarpOrder(RankedTensorType type);
 
 // Returns the dimensions along which threadId's are distributed.
 // Similar to warpOrder, threadOrder is necessary to tell the specific thread
 // distribution in the warp.
-// Note that, in most cases, getThreadOrder and getOrder return the same
-// results. But this is not guaranteed. One exception is mfma.transposed layout,
-// in which getOrder returns [1, 0] but getThreadOrder returns [0, 1].
-SmallVector<unsigned> getThreadOrder(Attribute layout);
+SmallVector<unsigned> getThreadOrder(DistributedEncodingTrait layout,
+                                     ArrayRef<int64_t> shape);
+SmallVector<unsigned> getThreadOrder(RankedTensorType type);
 
 CTALayoutAttr getCTALayout(Attribute layout);
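
A sketch of how the new overload set pairs up. `distEnc`, `sharedEnc`, `tensorTy`, and `memDescTy` are illustrative placeholders; in each pair the type-taking form presumably forwards to the (encoding, shape) form:

    using namespace mlir::triton::gpu;
    // Distributed layouts: the order may depend on the shape being tiled.
    SmallVector<unsigned> a = getOrder(distEnc, tensorTy.getShape());
    SmallVector<unsigned> b = getOrder(tensorTy);  // same query via the type
    // Shared layouts: the analogous pair, plus the memdesc convenience form.
    SmallVector<unsigned> c = getOrder(sharedEnc, memDescTy.getShape());
    SmallVector<unsigned> d = getOrder(memDescTy);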

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 27 additions & 12 deletions

@@ -464,6 +464,9 @@ def NVMMASharedEncodingAttr :
     SmallVector<unsigned> getCTAsPerCGA() const;
     SmallVector<unsigned> getCTAOrder() const;
     SmallVector<unsigned> getCTASplitNum() const;
+    SmallVector<unsigned> getOrder() const {
+      return getTransposed() ? SmallVector<unsigned>({0, 1}) : SmallVector<unsigned>({1, 0});
+    }
   }];
   let hasCustomAssemblyFormat = 1;
 }
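
The inline body added here fixes the order at layout-definition time rather than computing it. A short hedged reading, with `enc` as an illustrative NVMMASharedEncodingAttr value:

    SmallVector<unsigned> order = enc.getOrder();
    // Not transposed: dim 1 is the fastest-changing axis -> order == {1, 0}.
    // Transposed:     dim 0 is the fastest-changing axis -> order == {0, 1}.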
@@ -517,25 +520,33 @@ We call each individual tile "rep".
     "SmallVector<unsigned>",
     "getWarpsPerCTA">,
 
-    InterfaceMethod<"Get the order of the warps per CTA. The fastest-changing axis first",
-    "SmallVector<unsigned>",
-    "getWarpOrder">,
 
     InterfaceMethod<"Get the shape of the threads per warp",
     "SmallVector<unsigned>",
     "getThreadsPerWarp">,
 
-    InterfaceMethod<"Get the order of the threads per warp. The fastest-changing axis first",
-    "SmallVector<unsigned>",
-    "getThreadOrder">,
-
     InterfaceMethod<"Get the shape of the values per thread.",
     "SmallVector<unsigned>",
     "getSizePerThread">,
     InterfaceMethod<"Convert to LinearLayout.",
     "LinearLayout",
     "toLinearLayout",
-    (ins "ArrayRef<int64_t>":$shape)>
+    (ins "ArrayRef<int64_t>":$shape)>,
+
+    // Legacy methods: They do not take into account the shape of the tensor
+    // that is, the fact that we use them to tile the tensor.
+    InterfaceMethod<"Get the default order of the registers per warp. The fastest-changing axis first",
+    "SmallVector<unsigned>",
+    "getDefaultOrder">,
+
+    InterfaceMethod<"Get the default order of the threads per warp. The fastest-changing axis first",
+    "SmallVector<unsigned>",
+    "getDefaultThreadOrder">,
+
+    InterfaceMethod<"Get the default order of the warps per CTA. The fastest-changing axis first",
+    "SmallVector<unsigned>",
+    "getDefaultWarpOrder">
+
   ];
 }
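
The renamed interface methods make shape-blindness explicit: `getDefault{Order,ThreadOrder,WarpOrder}` answer from the layout alone, while the shape-aware free functions declared in Dialect.h above become the preferred entry points. A hedged sketch of the split (placeholder names):

    // Legacy, shape-blind answer straight from the encoding:
    SmallVector<unsigned> w0 = distEnc.getDefaultWarpOrder();
    // Shape-aware answer, the form lowering code is being migrated to:
    SmallVector<unsigned> w1 =
        triton::gpu::getWarpOrder(distEnc, tensorTy.getShape());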

@@ -583,13 +594,16 @@ L(T) = [ {0,8} , {1,9} , {2,10}, {3,11}, {0,8} , {1, 9} , {2, 10}, {3, 11},
     SmallVector<unsigned> getCTAOrder() const;
     SmallVector<unsigned> getCTASplitNum() const;
     SmallVector<unsigned> getWarpsPerCTA() const;
-    SmallVector<unsigned> getWarpOrder() const;
     SmallVector<unsigned> getThreadsPerWarp() const;
-    SmallVector<unsigned> getThreadOrder() const;
 
     SmallVector<unsigned> getSizePerThread() const;
 
     LinearLayout toLinearLayout(ArrayRef<int64_t> shape) const;
+
+    // Legacy methods: They do not take into account the shape of the tensor
+    SmallVector<unsigned> getDefaultWarpOrder() const;
+    SmallVector<unsigned> getDefaultThreadOrder() const;
+    SmallVector<unsigned> getDefaultOrder() const;
   }];
 }

@@ -620,6 +634,8 @@ def LinearEncodingAttr : DistributedEncoding<"LinearEncoding", "linear_encoding"
     SmallVector<unsigned> getContigPerThread() const;
     SmallVector<unsigned> getContigPerWarp() const;
     SmallVector<unsigned> getOrder() const;
+    SmallVector<unsigned> getWarpOrder() const;
+    SmallVector<unsigned> getThreadOrder() const;
 
     // Generalizes get{Warp,Thread,CTA}Order to linear layouts.
     // Returns the order of the dimensions `dimName` of the layout.
@@ -1228,8 +1244,7 @@ def SliceEncodingAttr : DistributedEncoding<"SliceEncoding", "slice_encoding"> {
   let parameters = (
     ins
     "unsigned":$dim,
-    // TODO: constraint here to only take distributed encodings
-    "Attribute":$parent
+    "DistributedEncodingTrait":$parent
   );
 
   let extraClassDeclaration = extraDistributedDeclaration # [{
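
Tightening `$parent` from `Attribute` to `DistributedEncodingTrait` promotes the old TODO into a guarantee enforced by the generated builder and verifier: a slice can only peel a dimension off a layout that actually distributes data across threads. A hedged construction sketch, with `blockedEnc` as an illustrative parent:

    // BlockedEncodingAttr implements DistributedEncodingTrait, so it is a
    // valid parent; a shared encoding would no longer be accepted here.
    auto slice =
        triton::gpu::SliceEncodingAttr::get(ctx, /*dim=*/0, blockedEnc);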

lib/Conversion/TritonGPUToLLVM/DecomposeUnsupportedConversions.cpp

Lines changed: 1 addition & 1 deletion

@@ -36,7 +36,7 @@ void decomposeTensorCoreToDotLayoutConversion(ModuleOp module,
 
   int numWarps = lookupNumWarps(cvtOp);
   auto enc = BlockedEncodingAttr::get(
-      ctx, srcType.getShape(), getSizePerThread(srcMma), getOrder(srcMma),
+      ctx, srcType.getShape(), getSizePerThread(srcMma), getOrder(srcType),
       numWarps, threadsPerWarp, numCTAs);
   auto tmpType = RankedTensorType::get(dstType.getShape(),
                                        dstType.getElementType(), enc);

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 3 additions & 2 deletions

@@ -32,13 +32,14 @@ static int __builtin_ctz(unsigned x) {
 namespace {
 
 LinearLayout getRegToSharedLayout(MLIRContext *ctx, ArrayRef<int64_t> shape,
-                                  LinearLayout regLayout, Attribute dstEnc,
+                                  LinearLayout regLayout,
+                                  triton::gpu::SharedEncodingTrait dstEnc,
                                   int elemBitWidth) {
   StringAttr kBlock = StringAttr::get(ctx, ("block"));
   int rank = shape.size();
 
   LinearLayout sharedLayout = triton::gpu::toLinearLayout(shape, dstEnc);
-  auto sharedOrder = triton::gpu::getOrder(dstEnc);
+  auto sharedOrder = triton::gpu::getOrder(dstEnc, shape);
 
   // sharedLayout's in-dims are currently (offset, block). Reshape to
   // (offsetX1, offsetX2, ..., block) so that we can apply the N-dimensional
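
Threading `shape` into the shared-order query keeps `getRegToSharedLayout` consistent with the rest of this change: the order is answered for the shape actually being lowered rather than the encoding's creation-time shape. A hedged illustration of the call-site difference:

    // Old: order derived from dstEnc alone, ignoring `shape`.
    //   auto sharedOrder = triton::gpu::getOrder(dstEnc);
    // New: the same query, but shape-aware.
    auto sharedOrder = triton::gpu::getOrder(dstEnc, shape);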

lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp

Lines changed: 1 addition & 1 deletion

@@ -376,7 +376,7 @@ struct MemDescSubviewOpConversion
     auto b = TritonLLVMOpBuilder(loc, rewriter);
     auto srcTy = op.getSrc().getType();
     auto llvmElemTy = getTypeConverter()->convertType(srcTy.getElementType());
-    auto layoutOrder = getOrder(srcTy.getEncoding());
+    auto layoutOrder = getOrder(srcTy);
 
     // newBase = base + offset
     auto smemObj = getSharedMemoryObjectFromStruct(loc, adaptor.getSrc(),
