Commit a8d53ad

lezcano authored and loislo committed
[LAYOUTS] Move all get.*ContigPerThread functions to a common API (triton-lang#6002)
There were a couple of things left to clean up after triton-lang#5840. We now provide a common API in terms of RankedTensorType: the per-layout getContigPerThread implementations and the shape-aware getUniqueContigPerThread helper are folded into a single getContigPerThread(RankedTensorType).
1 parent 3bea6fc commit a8d53ad
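
For context, here is a minimal sketch of what the consolidation means for call sites. It is illustrative only: the wrapper name contigOf below is hypothetical, while getContigPerThread and getUniqueContigPerThread are the functions this diff touches.

// Sketch, assuming a value whose type is a RankedTensorType carrying a
// TritonGPU layout encoding.
#include "triton/Dialect/TritonGPU/IR/Dialect.h"

using namespace mlir;

SmallVector<unsigned> contigOf(RankedTensorType tensorTy) {
  // Before this commit, callers chose between two attribute-based helpers:
  //   triton::gpu::getContigPerThread(tensorTy.getEncoding());      // shape-agnostic
  //   triton::gpu::getUniqueContigPerThread(tensorTy.getEncoding(),
  //                                         tensorTy.getShape());   // shape-aware
  // After it, there is a single shape-aware entry point taking the tensor type:
  return triton::gpu::getContigPerThread(tensorTy);
}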

File tree

7 files changed: +19, -93 lines changed


include/triton/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVMBase.h

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@ class ElementwiseOpConversionBase : public ConvertOpToLLVMPattern<SourceOp> {
     if (!axisInfo)
       // axis info (e.g., constancy) not available
       return resultVals;
-    SmallVector<unsigned> contigPerThread = getContigPerThread(encoding);
+    SmallVector<unsigned> contigPerThread = getContigPerThread(rtType);
     if (rank != contigPerThread.size())
       return resultVals;

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 3 additions & 10 deletions
@@ -104,20 +104,13 @@ SmallVector<unsigned> getWarpsPerCTA(Attribute layout);
 
 SmallVector<unsigned> getSizePerThread(Attribute layout);
 
-// Returns the number of contiguous elements that each thread
-// has access to, on each dimension of the tensor. E.g.
-// for a blocked layout with sizePerThread = [1, 4], returns [1, 4],
-// regardless of the shape of the tensor.
-SmallVector<unsigned> getContigPerThread(Attribute layout);
-
-// Returns the number of non-replicated contiguous elements that each thread
-// has access to, on each dimension of the tensor. For a blocked layout
+// Returns the number of contiguous elements of the logical tensor that each
+// thread has access to, on each dimension of the tensor. For a blocked layout
 // with sizePerThread = [1, 4] and tensor shape = [128, 1], the elements
 // for thread 0 would be [A_{0, 0}, A_{0, 0}, A_{0, 0}, A_{0, 0}], returns [1,
 // 1]. Whereas for a tensor shape [128, 128], the elements for thread 0 would be
 // [A_{0, 0}, A_{0, 1}, A_{0, 2}, A_{0, 3}], returns [1, 4].
-SmallVector<unsigned> getUniqueContigPerThread(Attribute layout,
-                                               ArrayRef<int64_t> tensorShape);
+SmallVector<unsigned> getContigPerThread(RankedTensorType tensorType);
 
 // Returns the number of threads per warp that have access to non-replicated
 // elements of the tensor. E.g. for a blocked layout with sizePerThread = [1,
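
The comment above pins down the contract with a [128, 1] vs [128, 128] example. A small sketch of how one might exercise it follows; it is not part of the diff, the blocked-encoding and CTA-layout parameters are assumptions chosen to match the comment's sizePerThread = [1, 4] example, and the MLIRContext is assumed to have the TritonGPU dialect loaded.

// Illustrative only; getContigPerThread(RankedTensorType) is the API from this
// patch, everything else here is assumed scaffolding.
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/MLIRContext.h"
#include "triton/Dialect/TritonGPU/IR/Dialect.h"

using namespace mlir;
using namespace mlir::triton::gpu;

SmallVector<unsigned> contigFor(MLIRContext &ctx, ArrayRef<int64_t> shape) {
  // Assumed layout parameters: 32 threads per warp, 4 warps, sizePerThread = [1, 4].
  auto cta = CTALayoutAttr::get(&ctx, /*CTAsPerCGA=*/{1, 1},
                                /*CTASplitNum=*/{1, 1}, /*CTAOrder=*/{1, 0});
  auto blocked = BlockedEncodingAttr::get(
      &ctx, /*sizePerThread=*/{1, 4}, /*threadsPerWarp=*/{8, 4},
      /*warpsPerCTA=*/{4, 1}, /*order=*/{1, 0}, cta);
  auto tensorTy = RankedTensorType::get(shape, Float16Type::get(&ctx), blocked);
  // Per the comment: shape [128, 1] should give [1, 1] (the four elements per
  // thread all replicate A_{0, 0}), while [128, 128] should give [1, 4].
  return getContigPerThread(tensorTy);
}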

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 2 additions & 61 deletions
@@ -532,10 +532,6 @@ We call each individual tile "rep".
     InterfaceMethod<"Get the shape of the values per thread.",
                     "SmallVector<unsigned>",
                     "getSizePerThread">,
-
-    InterfaceMethod<"Gets the number of contiguous elements per thread.",
-                    "SmallVector<unsigned>",
-                    "getContigPerThread">,
     InterfaceMethod<"Convert to LinearLayout.",
                     "LinearLayout",
                     "toLinearLayout",
@@ -819,12 +815,7 @@ for
   }]>
   ];
 
-  let extraClassDeclaration = extraDistributedDeclaration # [{
-    SmallVector<unsigned> getContigPerThread() {
-      // Block encoding is dense stride layout. The elements per thread are contiguous.
-      return getSizePerThread();
-    };
-  }];
+  let extraClassDeclaration = extraDistributedDeclaration;
 
   let hasCustomAssemblyFormat = 1;
 }
@@ -972,17 +963,6 @@ V [ 0,4,8...60 1,5...61 2,6...62 3,7...63 ] [ 128,132...188 129,
     SmallVector<int64_t> getRepForOperand(ArrayRef<int64_t> operandShape, int kWidth, int opIdx) const;
     SmallVector<unsigned> getRepOrderForOperand(int opIdx) const;
     SmallVector<unsigned> getThreadsPerWarpForOperand(int opIdx) const;
-
-    SmallVector<unsigned> getContigPerThread() {
-      auto rank = getWarpsPerCTA().size();
-      SmallVector<unsigned> contigPerThread(rank, 1);
-      if (getIsTransposed())
-        contigPerThread[rank - 1] = 4;
-      else
-        contigPerThread[rank - 2] = 4;
-      return contigPerThread;
-    };
-
   }];
 
   let genVerifyDecl = 1;
@@ -1100,16 +1080,6 @@ Row |
     SmallVector<unsigned> getRepOrderForOperand(int opIdx) const;
     SmallVector<unsigned> getThreadsPerWarpForOperand(int opIdx) const;
     static SmallVector<unsigned> getMNKDimPerInstr();
-
-    SmallVector<unsigned> getContigPerThread() {
-      auto rank = getWarpsPerCTA().size();
-      assert(rank == 2 || rank == 3);
-      SmallVector<unsigned> contigPerThread(rank, 1);
-      if (getVersion() == 2) {
-        contigPerThread[rank - 2] = 8;
-      }
-      return contigPerThread;
-    };
   }];
 }
 
@@ -1219,15 +1189,6 @@ For example, the matrix L corresponding to blockTileSize=[32,16] is:
     SmallVector<unsigned> getRepOrderForOperand(int opIdx) const;
     SmallVector<unsigned> getThreadsPerWarpForOperand(int opIdx) const;
     SmallVector<unsigned> getSizePerThreadForOperand(int kWidth, int opIdx) const;
-
-    SmallVector<unsigned> getContigPerThread() {
-      assert(isAmpere() || isHopper());
-      auto rank = getWarpsPerCTA().size();
-      SmallVector<unsigned> contigPerThread(rank, 1);
-      contigPerThread[rank - 1] = 2;
-      return contigPerThread;
-    };
-
   }];
 
   let hasCustomAssemblyFormat = 1;
@@ -1273,13 +1234,6 @@ def SliceEncodingAttr : DistributedEncoding<"SliceEncoding", "slice_encoding"> {
   let extraClassDeclaration = extraDistributedDeclaration # [{
     template<class T>
     SmallVector<T> paddedShape(ArrayRef<T> shape) const;
-
-    SmallVector<unsigned> getContigPerThread() {
-      auto parentLayout = mlir::cast<DistributedEncodingTrait>(getParent());
-      auto parentContigPerThread = parentLayout.getContigPerThread();
-      parentContigPerThread.erase(parentContigPerThread.begin() + getDim());
-      return parentContigPerThread;
-    };
   }];
 
   let hasCustomAssemblyFormat = 1;
@@ -1347,20 +1301,7 @@ vecIdx (index of the element in the quad; this is always along the k-dim)
 
   let assemblyFormat = "`<` `{` struct(params) `}` `>`";
   let genVerifyDecl = 1;
-  let extraClassDeclaration = extraDistributedDeclaration # [{
-    SmallVector<unsigned> getContigPerThread() {
-      auto rank = getWarpsPerCTA().size();
-      assert(rank == 2 || rank == 3);
-      SmallVector<unsigned> contigPerThread(rank, 1);
-      auto kWidth = getKWidth();
-      assert(kWidth != 0 && "Do not support kWidth=0");
-      if (getOpIdx() == 0)
-        contigPerThread[rank - 1] = kWidth;
-      else
-        contigPerThread[rank - 2] = kWidth;
-      return contigPerThread;
-    };
-  }];
+  let extraClassDeclaration = extraDistributedDeclaration;
 }
 
 def TTG_SharedMemorySpace : AttrDef<TritonGPU_Dialect, "SharedMemorySpace"> {

lib/Analysis/Utility.cpp

Lines changed: 0 additions & 1 deletion
@@ -729,7 +729,6 @@ bool matchMFMAAndDotOperandShuffleCase(RankedTensorType srcTy,
   return dotOperandLayout.getParent() == mfmaLayout &&
          dotOperandLayout.getOpIdx() == 0 && mfmaLayout.getIsTransposed() &&
          dotOperandLayout.getKWidth() == 8 &&
-         getContigPerThread(mfmaLayout)[1] == 4 &&
          ((mfmaLayout.getMDim() == 16 && mfmaLayout.getNDim() == 16) ||
          (mfmaLayout.getMDim() == 32 && mfmaLayout.getNDim() == 32)) &&
          triton::type::isFloat8(srcTy.getElementType()) &&

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 4 additions & 12 deletions
@@ -113,19 +113,11 @@ SmallVector<unsigned> getSizePerThread(Attribute layout) {
   }
 }
 
-SmallVector<unsigned> getContigPerThread(Attribute layout) {
-  if (auto distributedLayout = dyn_cast<DistributedEncodingTrait>(layout)) {
-    return distributedLayout.getContigPerThread();
-  } else {
-    llvm::report_fatal_error("getContigPerThread not implemented");
-    return {};
-  }
-}
-
-SmallVector<unsigned> getUniqueContigPerThread(Attribute layout,
-                                               ArrayRef<int64_t> shape) {
+SmallVector<unsigned> getContigPerThread(RankedTensorType tensorType) {
+  auto layout = tensorType.getEncoding();
+  auto shape = tensorType.getShape();
   auto linearLayout = toLinearLayout(shape, layout);
-  auto llAttr = LinearEncodingAttr::get(layout.getContext(), linearLayout);
+  auto llAttr = LinearEncodingAttr::get(tensorType.getContext(), linearLayout);
   return llAttr.getContigPerThread();
 }

third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp

Lines changed: 9 additions & 6 deletions
@@ -5,6 +5,7 @@
 #include "mlir/IR/PatternMatch.h"
 #include "triton/Conversion/TritonGPUToLLVM/Utility.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
+#include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"
 
 using mlir::triton::ModuleAxisInfoAnalysis;
 using mlir::triton::AMD::DppCtrl;
@@ -536,12 +537,14 @@ unsigned getContiguity(Value ptr, Value offset,
   Type type = getPointerTypeWithShape(ptr, offset);
   RankedTensorType tensorTy = cast<RankedTensorType>(type);
   auto layout = tensorTy.getEncoding();
-  auto order = triton::gpu::getOrder(layout);
-  auto uniqueContigPerThread =
-      triton::gpu::getUniqueContigPerThread(layout, tensorTy.getShape());
-  assert(order[0] < uniqueContigPerThread.size() &&
-         "Unexpected uniqueContigPerThread size");
-  unsigned contiguity = uniqueContigPerThread[order[0]];
+  auto linearLayout = triton::gpu::toLinearLayout(tensorTy.getShape(), layout);
+  auto llAttr =
+      triton::gpu::LinearEncodingAttr::get(tensorTy.getContext(), linearLayout);
+  auto order = llAttr.getOrder();
+  auto contigPerThread = llAttr.getContigPerThread();
+  assert(order[0] < contigPerThread.size() &&
+         "Unexpected contigPerThread size");
+  unsigned contiguity = contigPerThread[order[0]];
 
   // Get alignment from the pointer. Since this is a scalar pointer
   // we should not take the pointer contiguity to consider alignment

unittest/Dialect/TritonGPU/DialectTest.cpp

Lines changed: 0 additions & 2 deletions
@@ -723,8 +723,6 @@ TEST_F(LinearEncodingTest, DistributedEncodingToLinearEncoding) {
     if (!is_dot_op_with_block_parent(distributedEncoding)) {
      ASSERT_EQ(distributedEncoding.getRepOrder(),
                linearEncoding.getRepOrder());
-      ASSERT_EQ(distributedEncoding.getContigPerThread(),
-                linearEncoding.getContigPerThread());
     }
     // DotOperandEncodingAttr::getWarpOrder() is not defined
     if (!isa<triton::gpu::DotOperandEncodingAttr>(distributedEncoding)) {
