Skip to content

Commit 016ee00

Browse files
Revert "[LAYOUTS] Unify the implementation of getShapePerCTA (#5183)"
This reverts commit d36e579.
1 parent 6ccad72 commit 016ee00

File tree

12 files changed

+153
-29
lines changed

12 files changed

+153
-29
lines changed

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,9 @@ SmallVector<unsigned> getCTAOrder(Attribute layout);
  * (3) In the implementation of emitIndices, ShapePerCTATile will
  * be replicated or wrapped to fit ShapePerCTA.
  */
-SmallVector<unsigned> getShapePerCTATile(Attribute layout);
+SmallVector<unsigned>
+getShapePerCTATile(Attribute layout,
+                   ArrayRef<int64_t> tensorShape = ArrayRef<int64_t>());
 
 SmallVector<int64_t> getShapePerCTA(ArrayRef<unsigned> CTASplitNum,
                                     ArrayRef<int64_t> shape);

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,11 @@ We call each individual tile "rep".
       "SmallVector<unsigned>",
       "getCTASplitNum">,
 
+    InterfaceMethod<"Gets the shape of the encoding's tile, e.g. sizePerThread * threadsPerWarp * warpsPerCTA",
+      "SmallVector<unsigned>",
+      "getShapePerCTATile",
+      (ins "ArrayRef<int64_t>":$tensorShape)>,
+
     InterfaceMethod<"Gets the number of contiguous elements per thread.",
       "SmallVector<unsigned>",
       "getContigPerThread">,
@@ -560,6 +565,7 @@ L(T) = [ {0,8} , {1,9} , {2,10}, {3,11}, {0,8} , {1, 9} , {2, 10}, {3, 11},
     SmallVector<unsigned> getThreadOrder() const;
 
     SmallVector<unsigned> getSizePerThread() const;
+    SmallVector<unsigned> getShapePerCTATile(ArrayRef<int64_t> tensorShape = ArrayRef<int64_t>()) const;
 
     std::optional<LinearLayout> toLinearLayout(ArrayRef<int64_t> shape) const;
   }];
@@ -759,6 +765,13 @@ def MmaEncodingTrait : AttrInterface<"MmaEncodingTrait"> {
       "bool",
       "supportReduction">,
 
+    InterfaceMethod<"Return shape per CTA.",
+      "SmallVector<unsigned>",
+      "getShapePerCTATileForOperand",
+      (ins "ArrayRef<int64_t>":$tensorShape,
+           "int":$kWidth,
+           "int":$opIdx)>,
+
     InterfaceMethod<"Return size per thread for dot operands.",
       "SmallVector<unsigned>",
       "getSizePerThreadForOperand",
@@ -892,6 +905,7 @@ V [ 0,4,8...60 1,5...61 2,6...62 3,7...63 ] [ 128,132...188 129,
       return true;
     }
     SmallVector<unsigned> getSizePerThreadForOperand(int kWidth, int opIdx) const;
+    SmallVector<unsigned> getShapePerCTATileForOperand(ArrayRef<int64_t> shape, int kWidth, int opIdx) const;
     unsigned getTotalElemsPerThreadForOperand(ArrayRef<int64_t> shape, Type eltTy, int kWidth, int opIdx) const;
     SmallVector<int64_t> getInstrShapeForOperand(int kWidth, int opIdx) const;
     SmallVector<int64_t> getRepForOperand(ArrayRef<int64_t> operandShape, int kWidth, int opIdx) const;
@@ -1003,6 +1017,7 @@ Row | warp 0 warp 2
       return true;
     }
     SmallVector<unsigned> getSizePerThreadForOperand(int kWidth, int opIdx) const;
+    SmallVector<unsigned> getShapePerCTATileForOperand(ArrayRef<int64_t> shape, int kWidth, int opIdx) const;
     unsigned getTotalElemsPerThreadForOperand(ArrayRef<int64_t> shape, Type eltTy, int kWidth, int opIdx) const;
     SmallVector<int64_t> getElemsPerInstrForOperands() const;
     SmallVector<int64_t> getRepForOperand(ArrayRef<int64_t> operandShape,
@@ -1138,6 +1153,7 @@ For example, the matrix L corresponding to blockTileSize=[32,16] is:
       return false;
     };
     SmallVector<unsigned> getSizePerThreadForOperand(int kWidth, int opIdx) const;
+    SmallVector<unsigned> getShapePerCTATileForOperand(ArrayRef<int64_t> shape, int kWidth, int opIdx) const;
 
     SmallVector<unsigned> getContigPerThread() {
       assert(isAmpere() || isHopper());

lib/Analysis/Allocation.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,10 @@ static SmallVector<unsigned> getRepShapeForCvt(RankedTensorType srcTy,
 
   auto srcShapePerCTA = gpu::getShapePerCTA(srcTy);
   auto dstShapePerCTA = gpu::getShapePerCTA(dstTy);
-  auto srcShapePerCTATile = gpu::getShapePerCTATile(srcLayout);
-  auto dstShapePerCTATile = gpu::getShapePerCTATile(dstLayout);
+  auto srcShapePerCTATile =
+      gpu::getShapePerCTATile(srcLayout, srcTy.getShape());
+  auto dstShapePerCTATile =
+      gpu::getShapePerCTATile(dstLayout, dstTy.getShape());
 
   assert(srcTy.getRank() == dstTy.getRank() &&
          "src and dst must have the same rank");

lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -174,8 +174,8 @@ struct ConvertLayoutOpConversion
     SmallVector<unsigned> outNumCTAsEachRep(rank);
     SmallVector<unsigned> inNumCTAs(rank);
     SmallVector<unsigned> outNumCTAs(rank);
-    auto srcShapePerCTATile = getShapePerCTATile(srcLayout);
-    auto dstShapePerCTATile = getShapePerCTATile(dstLayout);
+    auto srcShapePerCTATile = getShapePerCTATile(srcLayout, srcTy.getShape());
+    auto dstShapePerCTATile = getShapePerCTATile(dstLayout, shape);
     auto shapePerCTA = getShapePerCTA(srcLayout, shape);
 
     for (unsigned d = 0; d < rank; ++d) {

lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -421,7 +421,7 @@ struct ReduceOpConversion
     auto resultIndices = emitIndices(loc, rewriter, targetInfo,
                                      resultLayout, resultTy, true);
     auto resultShape = resultTy.getShape();
-    auto resultCTATile = getShapePerCTATile(resultLayout);
+    auto resultCTATile = getShapePerCTATile(resultLayout, resultShape);
     assert(resultIndices.size() == resultElems);
 
     SmallVector<Value> resultVals(resultElems);

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 119 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -204,25 +204,12 @@ SmallVector<unsigned> getUniqueContigPerThread(Attribute layout,
   }
   return ret;
 }
-SmallVector<unsigned> getShapePerCTATile(Attribute layout) {
+
+SmallVector<unsigned> getShapePerCTATile(Attribute layout,
+                                         ArrayRef<int64_t> tensorShape) {
   if (auto distributedLayout =
           mlir::dyn_cast<DistributedEncodingTrait>(layout)) {
-    auto sizePerThread = distributedLayout.getSizePerThread();
-    auto threadsPerWarp = distributedLayout.getThreadsPerWarp();
-    // ThreadsPerWarp does not align with this function for slice layout
-    if (auto sliceLayout = mlir::dyn_cast<SliceEncodingAttr>(layout)) {
-      threadsPerWarp = getThreadsPerWarp(sliceLayout.getParent());
-      threadsPerWarp.erase(threadsPerWarp.begin() + sliceLayout.getDim());
-    }
-    auto warpsPerCTA = distributedLayout.getWarpsPerCTA();
-    assert(sizePerThread.size() == threadsPerWarp.size() &&
-           sizePerThread.size() == warpsPerCTA.size());
-    SmallVector<unsigned> shape;
-    for (auto [size, thread, warp] :
-         llvm::zip(sizePerThread, threadsPerWarp, warpsPerCTA)) {
-      shape.push_back(size * thread * warp);
-    }
-    return shape;
+    return distributedLayout.getShapePerCTATile(tensorShape);
   } else {
     llvm::report_fatal_error("getShapePerCTATile not implemented");
     return SmallVector<unsigned>();
@@ -704,6 +691,14 @@ SmallVector<unsigned> BlockedEncodingAttr::getThreadOrder() const {
 SmallVector<unsigned> BlockedEncodingAttr::getSizePerThread() const {
   return SmallVector<unsigned>(getSizePerThread__());
 }
+SmallVector<unsigned>
+BlockedEncodingAttr::getShapePerCTATile(ArrayRef<int64_t> tensorShape) const {
+  SmallVector<unsigned> shape;
+  for (unsigned d = 0, n = getOrder().size(); d < n; ++d)
+    shape.push_back(getSizePerThread()[d] * getThreadsPerWarp()[d] *
+                    getWarpsPerCTA()[d]);
+  return shape;
+}
 
 template <class T>
 SmallVector<T> SliceEncodingAttr::paddedShape(ArrayRef<T> shape) const {
@@ -805,6 +800,12 @@ SmallVector<unsigned> SliceEncodingAttr::getSizePerThread() const {
   sizePerThread.erase(sizePerThread.begin() + getDim());
   return sizePerThread;
 }
+SmallVector<unsigned>
+SliceEncodingAttr::getShapePerCTATile(ArrayRef<int64_t> tensorShape) const {
+  SmallVector<unsigned> shape = ::getShapePerCTATile(getParent(), tensorShape);
+  shape.erase(shape.begin() + getDim());
+  return shape;
+}
 
 //
 
@@ -998,9 +999,9 @@ unsigned DotOperandEncodingAttr::getTotalElemsPerThread(ArrayRef<int64_t> shape,
   }
   if (auto blockedLayout = mlir::dyn_cast<BlockedEncodingAttr>(getParent())) {
     auto shapePerCTA = getShapePerCTA(*this, shape);
-    auto shapePerCTATile = getShapePerCTATile(blockedLayout);
+    auto shapePerCTATile = ::getShapePerCTATile(blockedLayout);
     auto order = blockedLayout.getOrder();
-    auto sizePerThread = blockedLayout.getSizePerThread();
+    auto sizePerThread = ::getSizePerThread(blockedLayout);
 
     int K = getOpIdx() == 0 ? shapePerCTA[1] : shapePerCTA[0];
     int otherDim = getOpIdx() == 1 ? shapePerCTA[1] : shapePerCTA[0];
@@ -1071,6 +1072,19 @@ SmallVector<unsigned> DotOperandEncodingAttr::getThreadOrder() const {
                           /*kMajor*/ true);
   }
 }
+SmallVector<unsigned> DotOperandEncodingAttr::getShapePerCTATile(
+    ArrayRef<int64_t> tensorShape) const {
+  auto parentLayout = getParent();
+  assert(parentLayout && "DotOperandEncodingAttr must have a parent");
+  if (auto parentMmaLayout = mlir::dyn_cast<MmaEncodingTrait>(parentLayout)) {
+    return parentMmaLayout.getShapePerCTATileForOperand(
+        tensorShape, getKWidth(), getOpIdx());
+  } else {
+    llvm::report_fatal_error(
+        "DotOperandEncodingAttr non-NvidiaMmaEncodingAttr parent not "
+        "supported yet");
+  }
+}
 
 LogicalResult DotOperandEncodingAttr::verify(
     ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError,
@@ -1592,6 +1606,16 @@ void SharedEncodingAttr::print(AsmPrinter &printer) const {
 //===----------------------------------------------------------------------===//
 // TODO: there is a lot of common code with MmaEncoding here
 
+SmallVector<unsigned>
+AMDMfmaEncodingAttr::getShapePerCTATile(ArrayRef<int64_t> tensorShape) const {
+  auto warpsPerCTA = getWarpsPerCTA();
+  auto rank = warpsPerCTA.size();
+  SmallVector<unsigned> shapePerCTATile(warpsPerCTA.begin(), warpsPerCTA.end());
+  shapePerCTATile[rank - 1] *= getMDim();
+  shapePerCTATile[rank - 2] *= getNDim();
+  return shapePerCTATile;
+}
+
 SmallVector<unsigned> AMDMfmaEncodingAttr::getCTAsPerCGA() const {
   return SmallVector<unsigned>(getCTALayout().getCTAsPerCGA());
 }
@@ -1735,10 +1759,43 @@ AMDMfmaEncodingAttr::getSizePerThreadForOperand(int kWidth, int opIdx) const {
   return sizePerThread;
 }
 
+SmallVector<unsigned>
+AMDMfmaEncodingAttr::getShapePerCTATileForOperand(ArrayRef<int64_t> shape,
+                                                  int kWidth, int opIdx) const {
+  assert(getMDim() == 32 || getMDim() == 16);
+  auto parentShapePerCTATile = getShapePerCTATile(shape);
+  auto rank = parentShapePerCTATile.size();
+  if (opIdx == 0) {
+    if (rank == 2)
+      return {parentShapePerCTATile[rank - 2], 32};
+    else
+      return {parentShapePerCTATile[0], parentShapePerCTATile[rank - 2], 32};
+  } else if (opIdx == 1) {
+    if (rank == 2)
+      return {32, parentShapePerCTATile[rank - 1]};
+    else
+      return {parentShapePerCTATile[0], 32, parentShapePerCTATile[rank - 1]};
+  } else {
+    llvm::report_fatal_error("DotOperandEncodingAttr opIdx must be 0 or 1");
+  }
+  llvm_unreachable("DotOperandEncodingAttr opIdx must be 0 or 1");
+}
+
 //===----------------------------------------------------------------------===//
 // Wmma encoding
 //===----------------------------------------------------------------------===//
 
+SmallVector<unsigned>
+AMDWmmaEncodingAttr::getShapePerCTATile(ArrayRef<int64_t> tensorShape) const {
+  auto warpsPerCTA = getWarpsPerCTA();
+  auto rank = warpsPerCTA.size();
+  SmallVector<unsigned> shapePerCTATile(warpsPerCTA.begin(), warpsPerCTA.end());
+
+  auto mnkDim = getMNKDimPerInstr();
+  shapePerCTATile[rank - 2] *= mnkDim[0];
+  shapePerCTATile[rank - 1] *= mnkDim[1];
+  return shapePerCTATile;
+}
 SmallVector<unsigned> AMDWmmaEncodingAttr::getRepOrder() const {
   auto rank = getWarpsPerCTA().size();
   return getMatrixOrder(rank, /*rowMajor*/ true);
@@ -1803,6 +1860,21 @@ AMDWmmaEncodingAttr::getSizePerThreadForOperand(int kWidth, int opIdx) const {
   return sizePerThread;
 }
 
+SmallVector<unsigned>
+AMDWmmaEncodingAttr::getShapePerCTATileForOperand(ArrayRef<int64_t> shape,
+                                                  int kWidth, int opIdx) const {
+  auto parentShapePerCTA = getShapePerCTATile(shape);
+  auto rank = shape.size();
+  assert(rank == 2);
+  if (opIdx == 0) {
+    return {parentShapePerCTA[0], static_cast<unsigned>(shape[1])};
+  } else if (opIdx == 1) {
+    return {static_cast<unsigned>(shape[0]), parentShapePerCTA[1]};
+  } else {
+    llvm::report_fatal_error("DotOperandEncodingAttr opIdx must be 0 or 1");
+  }
+}
+
 unsigned AMDWmmaEncodingAttr::getTotalElemsPerThreadForOperand(
     ArrayRef<int64_t> shape, Type eltTy, int kWidth, int opIdx) const {
   auto rep = getRepForOperand(shape, eltTy, kWidth, opIdx);
@@ -1921,6 +1993,24 @@ SmallVector<unsigned> NvidiaMmaEncodingAttr::getSizePerThread() const {
   llvm_unreachable("Unexpected mma version");
 }
 
+SmallVector<unsigned>
+NvidiaMmaEncodingAttr::getShapePerCTATile(ArrayRef<int64_t> tensorShape) const {
+  if (isAmpere()) {
+    auto warpsPerCTA = getWarpsPerCTA();
+    auto rank = warpsPerCTA.size();
+    SmallVector<unsigned> shapePerCTATile(warpsPerCTA.begin(),
+                                          warpsPerCTA.end());
+    shapePerCTATile[rank - 1] *= 8;
+    shapePerCTATile[rank - 2] *= 16;
+    return shapePerCTATile;
+  }
+  if (isHopper()) {
+    auto instrShape = getInstrShape();
+    return {16 * getWarpsPerCTA()[0], instrShape[1] * getWarpsPerCTA()[1]};
+  }
+  llvm::report_fatal_error("Unexpected MMA layout version found");
+}
+
 SmallVector<unsigned>
 NvidiaMmaEncodingAttr::getRepOrderForOperand(int opIdx) const {
   auto rank = getWarpsPerCTA().size();
@@ -1961,6 +2051,16 @@ NvidiaMmaEncodingAttr::getRepForOperand(ArrayRef<int64_t> shape, int bitwidth,
   }
 }
 
+SmallVector<unsigned> NvidiaMmaEncodingAttr::getShapePerCTATileForOperand(
+    ArrayRef<int64_t> shape, int kWidth, int opIdx) const {
+  assert(isAmpere() && "mmaLayout Hopper is not implemented yet");
+  auto shapePerCTATile = getShapePerCTATile(shape);
+  auto rank = shapePerCTATile.size();
+  auto kDim = opIdx == 0 ? rank - 1 : rank - 2;
+  // 4 threads * 2 subtiles
+  shapePerCTATile[kDim] = kWidth * 2 * 4;
+  return shapePerCTATile;
+}
 SmallVector<unsigned>
 NvidiaMmaEncodingAttr::getSizePerThreadForOperand(int kWidth, int opIdx) const {
   auto rank = getWarpsPerCTA().size();

third_party/amd/lib/Dialect/TritonAMDGPU/IR/Dialect.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,8 @@ LogicalResult ExtractSliceOp::verify() {
   }
 
   auto srcShape = srcTy.getShape();
-  auto shapePerCTATile = mlir::triton::gpu::getShapePerCTATile(srcLayout);
+  auto shapePerCTATile =
+      mlir::triton::gpu::getShapePerCTATile(srcLayout, srcShape);
   shapePerCTATile[0] =
       std::min(static_cast<unsigned>(srcShape[0]), shapePerCTATile[0]);
   shapePerCTATile[1] =

third_party/amd/lib/TritonAMDGPUDialectToLLVM/ExtractSliceOpToLLVM.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ struct ExtractSliceOpConversion
     auto order = triton::gpu::getOrder(srcLayout);
 
     // Calculate valid total number of workers in each dimension
-    auto shapePerCTATile = triton::gpu::getShapePerCTATile(srcLayout);
+    auto shapePerCTATile = triton::gpu::getShapePerCTATile(srcLayout, srcShape);
     shapePerCTATile[0] =
         std::min(static_cast<unsigned>(srcShape[0]), shapePerCTATile[0]);
     shapePerCTATile[1] =

third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ Value redundantDataMask(Type valueTy, ConversionPatternRewriter &rewriter,
   } else {
     warpOrder = triton::gpu::getWarpOrder(layout);
   }
-  auto shapePerCTATile = triton::gpu::getShapePerCTATile(layout);
+  auto shapePerCTATile = triton::gpu::getShapePerCTATile(layout, shape);
   Value warpSize = i32_val(triton::gpu::getWarpSize(layout));
   Value laneId = urem(tid, warpSize);
   Value warpId = udiv(tid, warpSize);

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ using ::mlir::LLVM::linearize;
 using ::mlir::triton::gpu::DotOperandEncodingAttr;
 using ::mlir::triton::gpu::getOrder;
 using ::mlir::triton::gpu::getShapePerCTA;
+using ::mlir::triton::gpu::getShapePerCTATile;
 using ::mlir::triton::gpu::getSizePerThread;
 using ::mlir::triton::gpu::getTotalElemsPerThread;
 using ::mlir::triton::gpu::SharedEncodingAttr;

0 commit comments

Comments
 (0)