Skip to content

Commit 3796f3f

Browse files
Revert "[LAYOUTS] [NFC] Make order accept a RankedTensorType (#6007)"
This reverts commit dce695e.
1 parent 2e41c90 commit 3796f3f

File tree

27 files changed

+221
-314
lines changed

27 files changed

+221
-314
lines changed

include/triton/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVMBase.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ class ElementwiseOpConversionBase : public ConvertOpToLLVMPattern<SourceOp> {
135135
if (rank > 1) {
136136
// reorder the shape and constancy vectors by the axis order:
137137
// from the fastest-changing to the smallest-changing axis
138-
SmallVector<unsigned> order = getOrder(rtType);
138+
SmallVector<unsigned> order = getOrder(encoding);
139139
if (rank != order.size())
140140
return resultVals;
141141
elemsPerThread = applyPermutation(elemsPerThread, order);

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -363,7 +363,7 @@ class SharedMemoryObject {
363363
auto allocShape = memDesc.getAllocShape();
364364
auto allocShapePerCTA = triton::gpu::getAllocationShapePerCTA(
365365
memDesc.getEncoding(), allocShape);
366-
auto layoutOrder = triton::gpu::getOrder(memDesc);
366+
auto layoutOrder = triton::gpu::getOrder(memDesc.getEncoding());
367367
auto allocStrides = SharedMemoryObject::getStridesForShape(
368368
allocShapePerCTA, layoutOrder, loc, rewriter);
369369
return SmallVector<Value>(allocStrides.end() - offsets.size(),

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 8 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -134,19 +134,7 @@ getWarpsPerCTAWithUniqueData(Attribute layout, ArrayRef<int64_t> tensorShape);
134134
// the order of the elements within a thread.
135135
// For shared Layout, the order refers to which dimension of the original tensor
136136
// is contiguous in shared memory.
137-
SmallVector<unsigned> getOrder(DistributedEncodingTrait layout,
138-
ArrayRef<int64_t> shape);
139-
SmallVector<unsigned> getOrder(RankedTensorType type);
140-
141-
SmallVector<unsigned> getOrder(SharedEncodingTrait layout,
142-
ArrayRef<int64_t> shape);
143-
SmallVector<unsigned> getOrder(MemDescType type);
144-
SmallVector<unsigned> getOrder(TensorOrMemDesc type);
145-
146-
// Order of the elements in the shared memory as defined at layout creation
147-
// If this layout is associated with a MemDesc with a different shape
148-
// it may return a different order than the actual order of the elements
149-
SmallVector<unsigned> getDefaultOrder(SharedEncodingTrait layout);
137+
SmallVector<unsigned> getOrder(Attribute layout);
150138

151139
// Returns the dimensions along which warpId's are distributed.
152140
// warpsPerCTA only tells the warp layout in the CTA, e.g. warpsPerCTA = [2, 4]
@@ -155,16 +143,17 @@ SmallVector<unsigned> getDefaultOrder(SharedEncodingTrait layout);
155143
// E.g. warpOrder = [0, 1] means the warp IDs are distributed as follows
156144
// [warp0 warp2 warp4 warp6]
157145
// [warp1 warp3 warp5 warp7]
158-
SmallVector<unsigned> getWarpOrder(DistributedEncodingTrait layout,
159-
ArrayRef<int64_t> shape);
160-
SmallVector<unsigned> getWarpOrder(RankedTensorType type);
146+
// Note that in most cases, getWarpOrder and getOrder return the same results.
147+
// But this is not guaranteed.
148+
SmallVector<unsigned> getWarpOrder(Attribute layout);
161149

162150
// Returns the dimensions along which threadId's are distributed.
163151
// Similar to warpOrder, threadOrder is necessary to tell the specific thread
164152
// distribution in the warp.
165-
SmallVector<unsigned> getThreadOrder(DistributedEncodingTrait layout,
166-
ArrayRef<int64_t> shape);
167-
SmallVector<unsigned> getThreadOrder(RankedTensorType type);
153+
// Note that, in most cases, getThreadOrder and getOrder return the same
154+
// results. But this is not guaranteed. One exception is mfma.transposed layout,
155+
// in which getOrder returns [1, 0] but getThreadOrder returns [0, 1].
156+
SmallVector<unsigned> getThreadOrder(Attribute layout);
168157

169158
CTALayoutAttr getCTALayout(Attribute layout);
170159

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 10 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -464,9 +464,6 @@ def NVMMASharedEncodingAttr :
464464
SmallVector<unsigned> getCTAsPerCGA() const;
465465
SmallVector<unsigned> getCTAOrder() const;
466466
SmallVector<unsigned> getCTASplitNum() const;
467-
SmallVector<unsigned> getOrder() const {
468-
return getTransposed() ? SmallVector<unsigned>({0, 1}) : SmallVector<unsigned>({1, 0});
469-
}
470467
}];
471468
let hasCustomAssemblyFormat = 1;
472469
}
@@ -520,33 +517,25 @@ We call each individual tile "rep".
520517
"SmallVector<unsigned>",
521518
"getWarpsPerCTA">,
522519

520+
InterfaceMethod<"Get the order of the warps per CTA. The fastest-changing axis first",
521+
"SmallVector<unsigned>",
522+
"getWarpOrder">,
523523

524524
InterfaceMethod<"Get the shape of the threads per warp",
525525
"SmallVector<unsigned>",
526526
"getThreadsPerWarp">,
527527

528+
InterfaceMethod<"Get the order of the threads per warp. The fastest-changing axis first",
529+
"SmallVector<unsigned>",
530+
"getThreadOrder">,
531+
528532
InterfaceMethod<"Get the shape of the values per thread.",
529533
"SmallVector<unsigned>",
530534
"getSizePerThread">,
531535
InterfaceMethod<"Convert to LinearLayout.",
532536
"LinearLayout",
533537
"toLinearLayout",
534-
(ins "ArrayRef<int64_t>":$shape)>,
535-
536-
// Legacy methods: They do not take into account the shape of the tensor
537-
// that is, the fact that we use them to tile the tensor.
538-
InterfaceMethod<"Get the default order of the registers per warp. The fastest-changing axis first",
539-
"SmallVector<unsigned>",
540-
"getDefaultOrder">,
541-
542-
InterfaceMethod<"Get the default order of the threads per warp. The fastest-changing axis first",
543-
"SmallVector<unsigned>",
544-
"getDefaultThreadOrder">,
545-
546-
InterfaceMethod<"Get the default order of the warps per CTA. The fastest-changing axis first",
547-
"SmallVector<unsigned>",
548-
"getDefaultWarpOrder">
549-
538+
(ins "ArrayRef<int64_t>":$shape)>
550539
];
551540
}
552541

@@ -594,16 +583,13 @@ L(T) = [ {0,8} , {1,9} , {2,10}, {3,11}, {0,8} , {1, 9} , {2, 10}, {3, 11},
594583
SmallVector<unsigned> getCTAOrder() const;
595584
SmallVector<unsigned> getCTASplitNum() const;
596585
SmallVector<unsigned> getWarpsPerCTA() const;
586+
SmallVector<unsigned> getWarpOrder() const;
597587
SmallVector<unsigned> getThreadsPerWarp() const;
588+
SmallVector<unsigned> getThreadOrder() const;
598589

599590
SmallVector<unsigned> getSizePerThread() const;
600591

601592
LinearLayout toLinearLayout(ArrayRef<int64_t> shape) const;
602-
603-
// Legacy methods: They do not take into account the shape of the tensor
604-
SmallVector<unsigned> getDefaultWarpOrder() const;
605-
SmallVector<unsigned> getDefaultThreadOrder() const;
606-
SmallVector<unsigned> getDefaultOrder() const;
607593
}];
608594
}
609595

@@ -634,8 +620,6 @@ def LinearEncodingAttr : DistributedEncoding<"LinearEncoding", "linear_encoding"
634620
SmallVector<unsigned> getContigPerThread() const;
635621
SmallVector<unsigned> getContigPerWarp() const;
636622
SmallVector<unsigned> getOrder() const;
637-
SmallVector<unsigned> getWarpOrder() const;
638-
SmallVector<unsigned> getThreadOrder() const;
639623

640624
// Generalizes get{Warp,Thread,CTA}Order to linear layouts.
641625
// Returns the order of the dimensions `dimName` of the layout.

lib/Conversion/TritonGPUToLLVM/DecomposeUnsupportedConversions.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ void decomposeTensorCoreToDotLayoutConversion(ModuleOp module,
3636

3737
int numWarps = lookupNumWarps(cvtOp);
3838
auto enc = BlockedEncodingAttr::get(
39-
ctx, srcType.getShape(), getSizePerThread(srcMma), getOrder(srcType),
39+
ctx, srcType.getShape(), getSizePerThread(srcMma), getOrder(srcMma),
4040
numWarps, threadsPerWarp, numCTAs);
4141
auto tmpType = RankedTensorType::get(dstType.getShape(),
4242
dstType.getElementType(), enc);

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,13 @@ static int __builtin_ctz(unsigned x) {
3232
namespace {
3333

3434
LinearLayout getRegToSharedLayout(MLIRContext *ctx, ArrayRef<int64_t> shape,
35-
LinearLayout regLayout,
36-
triton::gpu::SharedEncodingTrait dstEnc,
35+
LinearLayout regLayout, Attribute dstEnc,
3736
int elemBitWidth) {
3837
StringAttr kBlock = StringAttr::get(ctx, ("block"));
3938
int rank = shape.size();
4039

4140
LinearLayout sharedLayout = triton::gpu::toLinearLayout(shape, dstEnc);
42-
auto sharedOrder = triton::gpu::getOrder(dstEnc, shape);
41+
auto sharedOrder = triton::gpu::getOrder(dstEnc);
4342

4443
// sharedLayout's in-dims are currently (offset, block). Reshape to
4544
// (offsetX1, offsetX2, ..., block) so that we can apply the N-dimensional

lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,7 @@ struct MemDescSubviewOpConversion
369369
auto b = TritonLLVMOpBuilder(loc, rewriter);
370370
auto srcTy = op.getSrc().getType();
371371
auto llvmElemTy = getTypeConverter()->convertType(srcTy.getElementType());
372-
auto layoutOrder = getOrder(srcTy);
372+
auto layoutOrder = getOrder(srcTy.getEncoding());
373373

374374
// newBase = base + offset
375375
auto smemObj = getSharedMemoryObjectFromStruct(loc, adaptor.getSrc(),

0 commit comments

Comments (0)