Commit 6a6ed52

[BACKEND] Generalize getShapePerCTA (#7580)

1 parent 167ed28

File tree: 3 files changed, +49 −23 lines

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 8 additions & 2 deletions
@@ -221,13 +221,19 @@ SmallVector<unsigned> getCTAOrder(Attribute layout);
 // [FIXME LL] Kill this function
 SmallVector<unsigned> getShapePerCTATile(RankedTensorType layout);

-// Returns the "logical" shape per CTA
+// Returns the "logical" shape per CTA.
+// When shape and CTASplitNum have a different number of dimensions, we assume
+// that only the common trailing dimensions are split.
+// Example1: shape = [2, 4, 8], CTASplitNum = [2, 2], ret = [2, 2, 4].
+// This can arise from pipelining.
+// Example2: shape = [2, 4], CTASplitNum = [2, 2, 2], ret = [1, 2].
+// This can arise from memory slicing.
 SmallVector<int64_t> getShapePerCTA(ArrayRef<unsigned> CTASplitNum,
                                     ArrayRef<int64_t> shape);
 SmallVector<int64_t> getShapePerCTA(Attribute layout, ArrayRef<int64_t> shape);
 SmallVector<int64_t> getShapePerCTA(Type type);

-// Returns the shape per CTA, which is "physically" allocated
+// Returns the shape per CTA, which is "physically" allocated.
 // Such shapes may be bigger than the logical one due to, for example, padding
 // in shared memory.
 SmallVector<int64_t> getAllocationShapePerCTA(Attribute layout,
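The two examples in the new comment can be traced through a minimal standalone sketch of the alignment rule. This is a reconstruction for illustration, not Triton's code: it uses std::vector in place of llvm::SmallVector, and the function name shapePerCTASketch is hypothetical.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Align splitNum to the rank of shape (pad leading 1s, or drop leading
// entries), then divide elementwise, clamping each factor to the dim size.
std::vector<int64_t> shapePerCTASketch(std::vector<unsigned> splitNum,
                                       const std::vector<int64_t> &shape) {
  size_t rank = shape.size();
  if (splitNum.size() <= rank) // pipelining: leading dims stay unsplit
    splitNum.insert(splitNum.begin(), rank - splitNum.size(), 1u);
  else // memory slicing: only the trailing `rank` factors apply
    splitNum.erase(splitNum.begin(),
                   splitNum.begin() + (splitNum.size() - rank));
  std::vector<int64_t> result(rank);
  for (size_t i = 0; i < rank; ++i)
    result[i] = shape[i] / std::min<int64_t>(shape[i], splitNum[i]);
  return result;
}

int main() {
  // Example1 from the comment: pipelining pads splitNum to [1, 2, 2].
  assert((shapePerCTASketch({2, 2}, {2, 4, 8}) ==
          std::vector<int64_t>{2, 2, 4}));
  // Example2 from the comment: slicing drops the leading factor -> [2, 2].
  assert((shapePerCTASketch({2, 2, 2}, {2, 4}) ==
          std::vector<int64_t>{1, 2}));
}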

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 9 additions & 21 deletions
@@ -292,34 +292,22 @@ SmallVector<unsigned> getCTAOrder(Attribute layout) {
 SmallVector<int64_t> getShapePerCTA(ArrayRef<unsigned> CTASplitNum,
                                     ArrayRef<int64_t> shape) {
   unsigned rank = shape.size();
+  auto splitNum = llvm::to_vector(CTASplitNum);
+  if (splitNum.size() <= rank) { // pipelining
+    splitNum.insert(splitNum.begin(), rank - splitNum.size(), 1);
+  } else { // memory slicing
+    splitNum =
+        llvm::to_vector(llvm::drop_begin(splitNum, splitNum.size() - rank));
+  }
   SmallVector<int64_t> shapePerCTA(rank);
   for (unsigned i = 0; i < rank; ++i) {
-    unsigned splitNum = std::min<unsigned>(shape[i], CTASplitNum[i]);
-    shapePerCTA[i] = shape[i] / splitNum;
+    shapePerCTA[i] = shape[i] / std::min<unsigned>(shape[i], splitNum[i]);
   }
   return shapePerCTA;
 }

 SmallVector<int64_t> getShapePerCTA(Attribute layout, ArrayRef<int64_t> shape) {
-  if (mlir::isa<SharedEncodingTrait>(layout)) {
-    // Special logic for pipeline pass, where shape is 3D and CTALayout is 2D.
-    // The first dim of shape is numStages. This is a work around, otherwise
-    // too many places would have to be modified in pipeline pass. Maybe we
-    // need to refactor this logic in the future.
-    auto CTASplitNum = cast<LayoutEncodingTrait>(layout).getCTASplitNum();
-    if (shape.size() == CTASplitNum.size() + 1) {
-      auto res = getShapePerCTA(CTASplitNum, shape.drop_front());
-      res.insert(res.begin(), shape.front());
-      return res;
-    }
-  }
-  SmallVector<unsigned> splitNum = getCTASplitNum(layout);
-  if (auto tmem = dyn_cast<nvidia_gpu::TensorMemoryEncodingAttr>(layout)) {
-    if (shape.size() > splitNum.size()) {
-      splitNum.insert(splitNum.begin(), shape.size() - splitNum.size(), 1);
-    }
-  }
-  return getShapePerCTA(splitNum, shape);
+  return getShapePerCTA(getCTASplitNum(layout), shape);
 }

 SmallVector<int64_t> getAllocationShapePerCTA(Attribute layout,
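The simplified layout-based overload now relies entirely on the generic alignment rule, which subsumes both deleted branches: the shared-memory pipelining case, where a 3D shape carries numStages in front of a 2D CTASplitNum, and the TensorMemoryEncodingAttr padding. Extending main() in the sketch above with a hypothetical pipelining-shaped input shows the equivalence:

  // numStages = 3: the leading dim is left unsplit, matching the removed
  // SharedEncodingTrait branch that re-inserted shape.front() at the front.
  assert((shapePerCTASketch({2, 4}, {3, 64, 128}) ==
          std::vector<int64_t>{3, 32, 32}));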

unittest/Dialect/TritonGPU/DialectTest.cpp

Lines changed: 32 additions & 0 deletions
@@ -367,6 +367,38 @@ TEST_F(Fp4ToFpOpTest, Fp4ToFpOpLayoutPropagation) {
   }
 }

+class ShapePerCTATest : public ::testing::Test {
+public:
+  ShapePerCTATest() { ctx.getOrLoadDialect<TritonGPUDialect>(); }
+
+protected:
+  MLIRContext ctx;
+};
+
+TEST_F(ShapePerCTATest, ShapePerCTA) {
+  // Equal length
+  SmallVector<unsigned> CTASplitNum = {2, 4};
+  SmallVector<int64_t> shape = {64, 128};
+  auto shapePerCTA = getShapePerCTA(CTASplitNum, shape);
+  auto expectedShapePerCTA = SmallVector<int64_t>{32, 32};
+  EXPECT_EQ(shapePerCTA.size(), shape.size());
+  EXPECT_EQ(shapePerCTA, expectedShapePerCTA);
+
+  // rank(shape) < rank(CTASplitNum)
+  CTASplitNum = {2, 4, 8};
+  shapePerCTA = getShapePerCTA(CTASplitNum, shape);
+  expectedShapePerCTA = SmallVector<int64_t>{16, 16};
+  EXPECT_EQ(shapePerCTA.size(), shape.size());
+  EXPECT_EQ(shapePerCTA, expectedShapePerCTA);
+
+  // rank(shape) > rank(CTASplitNum)
+  CTASplitNum = {2};
+  shapePerCTA = getShapePerCTA(CTASplitNum, shape);
+  expectedShapePerCTA = SmallVector<int64_t>{64, 64};
+  EXPECT_EQ(shapePerCTA.size(), shape.size());
+  EXPECT_EQ(shapePerCTA, expectedShapePerCTA);
+}
+
 class JoinOpTest : public ::testing::Test {
 public:
   JoinOpTest() { ctx.getOrLoadDialect<TritonGPUDialect>(); }
