Commit 77422cb
Fix 3d ConvertLayoutToLLVM
1 parent 4775fed

Generalizes the Intel DPAS encoding's CTA attributes (getCTASplitNum, getCTAOrder, getCTAsPerCGA) from hard-coded rank 2 to the encoding's actual rank, makes the batch repetition contribute to the shared-memory offset in DpasMatmulLoader::loadMatrix, and indexes the DPAS accumulator table by {b, m, n} instead of {m, n}. Most of the remaining additions are temporary std::cout traces used to debug the 3D path.

8 files changed: +87 −15 lines changed

lib/Analysis/Allocation.cpp

Lines changed: 25 additions & 4 deletions
@@ -14,6 +14,7 @@
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/Triton/IR/Utility.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 
@@ -64,6 +65,7 @@ static SmallVector<unsigned> getRepShapeForCvt(RankedTensorType srcTy,
                                                RankedTensorType dstTy) {
   Attribute srcLayout = srcTy.getEncoding();
   Attribute dstLayout = dstTy.getEncoding();
+  std::cout << "- in getRepShapeForCvt\n";
 
   if (!cvtNeedsSharedMemory(srcTy, dstTy)) {
     return {};
@@ -80,6 +82,10 @@ static SmallVector<unsigned> getRepShapeForCvt(RankedTensorType srcTy,
   auto dstShapePerCTA = getShapePerCTA(dstTy);
   auto srcShapePerCTATile = getShapePerCTATile(srcLayout, srcTy.getShape());
   auto dstShapePerCTATile = getShapePerCTATile(dstLayout, dstTy.getShape());
+  std::cout << "!!!shapePerCTA: " << srcShapePerCTA.size() << " "
+            << dstShapePerCTA.size() << "\n";
+  std::cout << "!!!shapePerCTATile: " << srcShapePerCTATile.size() << " "
+            << dstShapePerCTATile.size() << "\n";
 
   unsigned rank = dstTy.getRank();
   SmallVector<unsigned> repShape(rank);
@@ -106,7 +112,9 @@ static SmallVector<unsigned> getRepShapeForAtomic(Value result) {
 ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
                                      RankedTensorType dstTy) {
   // Initialize vector sizes and stride
+  std::cout << "getRepShapeForCvt start\n";
   auto repShape = getRepShapeForCvt(srcTy, dstTy);
+  std::cout << "repShape rank: " << repShape.size() << "\n";
   if (repShape.empty())
     return ScratchConfig({}, {});
   ScratchConfig scratchConfig(repShape, repShape);
@@ -118,13 +126,24 @@ ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
 
   auto [inOrd, outOrd] = getCvtOrder(srcLayout, dstLayout);
   scratchConfig.order = outOrd;
+  std::cout << "inOrd: ";
+  for (auto i : inOrd) {
+    std::cout << i << " ";
+  }
+  std::cout << "rank: " << inOrd.size() << "\n";
+  std::cout << "outOrd: ";
+  for (auto i : outOrd) {
+    std::cout << i << " ";
+  }
+  std::cout << "rank: " << outOrd.size() << "\n";
 
   unsigned srcContigPerThread =
       getUniqueContigPerThread(srcLayout, srcTy.getShape())[inOrd[0]];
   unsigned dstContigPerThread =
       getUniqueContigPerThread(dstLayout, dstTy.getShape())[outOrd[0]];
   // TODO: Fix the legacy issue that ourOrd[0] == 0 always means
   // that we cannot do vectorization.
+  std::cout << "no index issue in getUniqueContigPerThread\n";
   unsigned innerDim = rank - 1;
   scratchConfig.inVec = outOrd[0] != innerDim  ? 1
                         : inOrd[0] != innerDim ? 1
@@ -174,13 +193,9 @@ class AllocationAnalysis {
   using GraphT = DenseMap<BufferT *, DenseSet<BufferT *>>;
 
   void run() {
-    std::cout << "!!!! getValueAndSizes start\n";
     getValuesAndSizes();
-    std::cout << "!!!! resolveLiveness start\n";
     resolveLiveness();
-    std::cout << "!!!! computeOffsets start\n";
     computeOffsets();
-    std::cout << "!!!! AllocationAnalysis end\n";
   }
 
   /// Initializes explicitly defined shared memory values for a given operation.
@@ -237,27 +252,33 @@ class AllocationAnalysis {
       maybeAddScratchBuffer<BufferT::BufferKind::Scratch>(op, bytes,
                                                           scratchAlignment);
     } else if (auto cvtLayout = dyn_cast<triton::gpu::ConvertLayoutOp>(op)) {
+      std::cout << "getScratchValueSize from ConvertLayoutOp\n";
       auto srcTy = cvtLayout.getSrc().getType();
       auto dstTy = cvtLayout.getType();
       auto srcEncoding = srcTy.getEncoding();
       auto dstEncoding = dstTy.getEncoding();
       if (mlir::isa<SharedEncodingAttr>(srcEncoding) ||
           mlir::isa<SharedEncodingAttr>(dstEncoding)) {
         // Conversions from/to shared memory do not need scratch memory.
+        std::cout << "-- ConvertLayoutOp from/to shared memory\n";
         return;
       }
       // ConvertLayoutOp with both input/output non-shared_layout
       // TODO: Besides of implementing ConvertLayoutOp via shared memory, it's
       // also possible to realize it with other approaches in restricted
       // conditions, such as warp-shuffle
+      std::cout << "-- getScratchConfigForCvt\n";
      auto scratchConfig = getScratchConfigForCvt(srcTy, dstTy);
+      std::cout << "-- getNumScratchElements\n";
       auto elems = getNumScratchElements(scratchConfig.paddedRepShape);
       auto bytes =
           isa<triton::PointerType>(srcTy.getElementType())
               ? elems * kPtrBitWidth / 8
               : elems * std::max<int>(8, srcTy.getElementTypeBitWidth()) / 8;
       maybeAddScratchBuffer<BufferT::BufferKind::Scratch>(op, bytes,
                                                           scratchAlignment);
+      std::cout << "-- ConvertLayoutOp from/to non-shared memory: " << bytes
+                << " bytes\n";
     } else if (isa<triton::AtomicRMWOp, triton::AtomicCASOp>(op)) {
       auto value = op->getOperand(0);
       // only scalar requires scratch memory
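
For reference, the scratch-size arithmetic in the ConvertLayoutOp branch above, restated as a standalone scalar computation (a sketch, not part of this commit; kPtrBitWidth is a constant defined in the allocation analysis, 64 on typical targets):

    #include <algorithm>
    #include <cstdio>

    int main() {
      // elems = product of scratchConfig.paddedRepShape, e.g. a 32x32 tile.
      unsigned elems = 32 * 32;
      unsigned elemBits = 16; // e.g. an f16 source tensor
      // Sub-byte element types round up to one byte, mirroring
      // std::max<int>(8, srcTy.getElementTypeBitWidth()) above.
      unsigned bytes = elems * std::max(8u, elemBits) / 8;
      std::printf("%u bytes of scratch\n", bytes); // prints 2048
    }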

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 12 additions & 0 deletions
@@ -1,6 +1,7 @@
 #include "triton/Dialect/Triton/IR/Dialect.h"
 
 #include <cstdint>
+#include <iostream>
 #include <numeric>
 
 #include "mlir/IR/DialectImplementation.h"
@@ -383,6 +384,17 @@ SmallVector<unsigned> getCTAOrder(Attribute layout) {
 SmallVector<int64_t> getShapePerCTA(ArrayRef<unsigned> CTASplitNum,
                                     ArrayRef<int64_t> shape) {
   unsigned rank = shape.size();
+  std::cout << "!!!GPU dialect - getShapePerCTA\n";
+  std::cout << "CTASplitNum: ";
+  for (auto i : CTASplitNum) {
+    std::cout << i << " ";
+  }
+  std::cout << "\nshape: ";
+  for (auto i : shape) {
+    std::cout << i << " ";
+  }
+  std::cout << "\n";
+
   SmallVector<int64_t> shapePerCTA(rank);
   for (unsigned i = 0; i < rank; ++i) {
     // This wrapping rule must be consistent with emitCTAOffsetForLayout
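
The prints above check that CTASplitNum and shape agree in rank, which is exactly what breaks when a rank-3 tensor meets an encoding that hard-codes rank-2 CTA attributes (the DPAS fix below). A scalar sketch of the per-CTA shape rule, assuming the wrapping division that the surrounding comment refers to:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Hedged model of getShapePerCTA: each dimension is split across CTAs,
    // wrapping when the split factor exceeds the dimension size.
    std::vector<int64_t> shapePerCTA(const std::vector<unsigned> &ctaSplitNum,
                                     const std::vector<int64_t> &shape) {
      std::vector<int64_t> out(shape.size());
      for (size_t i = 0; i < shape.size(); ++i)
        // If ctaSplitNum.size() < shape.size(), this indexes out of bounds:
        // the rank mismatch the debug prints above are hunting for.
        out[i] = shape[i] / std::min<int64_t>(ctaSplitNum[i], shape[i]);
      return out;
    }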

third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp

Lines changed: 13 additions & 6 deletions
@@ -186,17 +186,24 @@ unsigned DpasEncodingAttr::getTotalElemsPerThread(ArrayRef<int64_t> shape,
 }
 
 SmallVector<unsigned> DpasEncodingAttr::getCTASplitNum() const {
-  SmallVector<unsigned> res{1, 1};
+  size_t rank = getWarpsPerCTA().size();
+  SmallVector<unsigned> res(rank, 1);
   return res;
 }
 
 SmallVector<unsigned> DpasEncodingAttr::getCTAOrder() const {
-  SmallVector<unsigned> res{1, 0};
-  return res;
+  size_t rank = getWarpsPerCTA().size();
+  // auto res = llvm::to_vector(llvm::reverse(llvm::seq<unsigned>(rank)));
+  // return res;
+  if (rank == 3)
+    return {2, 1, 0};
+  else
+    return {1, 0};
 }
 
 SmallVector<unsigned> DpasEncodingAttr::getCTAsPerCGA() const {
-  SmallVector<unsigned> res{1, 1};
+  size_t rank = getWarpsPerCTA().size();
+  SmallVector<unsigned> res(rank, 1);
   return res;
 }
 
@@ -370,8 +377,8 @@ SmallVector<unsigned> DpasEncodingAttr::getElemsPerThreadForOperands(
   SmallVector<unsigned> elemsPerThread(rank);
   if (rank == 3)
     elemsPerThread[0] = repetitions[0];
-  elemsPerThread[rank - 2] = sizePerThread[rank - 2] * repetitions[1];
-  elemsPerThread[rank - 1] = sizePerThread[rank - 1] * repetitions[2];
+  elemsPerThread[rank - 2] = sizePerThread[0] * repetitions[1];
+  elemsPerThread[rank - 1] = sizePerThread[1] * repetitions[2];
 
   return elemsPerThread;
 };
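
The commented-out lines in getCTAOrder hint at a rank-generic formulation instead of branching on rank == 3. A minimal sketch of that alternative (assuming llvm/ADT/Sequence.h and STLExtras.h are available), which yields {1, 0} for rank 2 and {2, 1, 0} for rank 3, same as the branches above:

    #include "llvm/ADT/STLExtras.h"  // llvm::to_vector, llvm::reverse
    #include "llvm/ADT/Sequence.h"   // llvm::seq
    #include "llvm/ADT/SmallVector.h"

    // Reversed iota {rank-1, ..., 1, 0}: row-major CTA order for any rank.
    llvm::SmallVector<unsigned> getCTAOrderGeneric(size_t rank) {
      return llvm::to_vector<4>(
          llvm::reverse(llvm::seq<unsigned>(0u, static_cast<unsigned>(rank))));
    }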

third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 10 additions & 0 deletions
@@ -1,6 +1,7 @@
 #include "PatternTritonGPUOpToLLVM.h"
 #include "TargetInfo.h"
 #include "Utility.h"
+#include <iostream>
 
 #include "intel/include/Analysis/Utility.h"
 #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
@@ -39,6 +40,7 @@ struct ConvertLayoutOpConversion
   LogicalResult
   matchAndRewrite(triton::gpu::ConvertLayoutOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    std::cout << "ConvertLayoutOpConversion" << std::endl;
     RankedTensorType srcTy = op.getSrc().getType();
     RankedTensorType dstTy = op.getType();
     Attribute srcLayout = srcTy.getEncoding();
@@ -64,6 +66,7 @@ struct ConvertLayoutOpConversion
                     RankedTensorType type,
                     ArrayRef<unsigned> multiDimCTAInRepId,
                     ArrayRef<unsigned> shapePerCTATile) const {
+    std::cout << "getMultiDimOffset" << std::endl;
     auto shape = type.getShape();
     unsigned rank = shape.size();
     if (auto blockedLayout = dyn_cast<BlockedEncodingAttr>(layout)) {
@@ -140,6 +143,7 @@ struct ConvertLayoutOpConversion
                       ArrayRef<unsigned> origRepShape,
                       ArrayRef<unsigned> outOrd, SmallVector<Value> &vals,
                       Value smemBase) const {
+    std::cout << "processReplica" << std::endl;
     auto accumNumCTAsEachRep = product<unsigned>(numCTAsEachRep);
     auto layout = type.getEncoding();
     auto rank = type.getRank();
@@ -225,6 +229,7 @@ struct ConvertLayoutOpConversion
   lowerDistributedToDistributed(triton::gpu::ConvertLayoutOp op,
                                 OpAdaptor adaptor,
                                 ConversionPatternRewriter &rewriter) const {
+    std::cout << "lowerDistributedToDistributed" << std::endl;
     auto loc = op.getLoc();
     auto typeConverter = getTypeConverter();
     RankedTensorType srcTy = op.getSrc().getType();
@@ -324,6 +329,7 @@ struct ConvertLayoutOpConversion
                       ConversionPatternRewriter &rewriter,
                       Value vals,
                       RankedTensorType srcType) const {
+    std::cout << "getValuesFromDpasLayoutStruct" << std::endl;
     SmallVector<Value> elems = unpackLLElements(loc, vals, rewriter);
     auto dpasLayout = dyn_cast<DpasEncodingAttr>(srcType.getEncoding());
 
@@ -368,6 +374,7 @@ struct ConvertLayoutOpConversion
   Value composeValuesToDotOperandLayoutStruct(
       Location loc, ConversionPatternRewriter &rewriter, const ValueTable &vals,
       RankedTensorType dstType) const {
+    std::cout << "composeValuesToDotOperandLayoutStruct" << std::endl;
     auto dotLayout = dyn_cast<DotOperandEncodingAttr>(dstType.getEncoding());
     auto dpasLayout = dyn_cast<DpasEncodingAttr>(dotLayout.getParent());
     unsigned opIdx = dotLayout.getOpIdx();
@@ -424,6 +431,7 @@ struct ConvertLayoutOpConversion
   LogicalResult
   lowerDpasToDotOperand(triton::gpu::ConvertLayoutOp op, OpAdaptor adaptor,
                         ConversionPatternRewriter &rewriter) const {
+    std::cout << "lowerDpasToDotOperand" << std::endl;
     Location loc = op.getLoc();
     RankedTensorType srcTy = op.getSrc().getType();
     RankedTensorType dstTy = op.getType();
@@ -456,6 +464,7 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
   LogicalResult
   matchAndRewrite(ConvertLayoutOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    std::cout << "ConvertLayoutOpUsingLinearLayoutsConversion" << std::endl;
     MLIRContext *ctx = op.getContext();
 
     const auto &shape = op.getType().getShape();
@@ -504,6 +513,7 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
   transferWithinThread(ConvertLayoutOp op, const LinearLayout &srcLayout,
                        const LinearLayout &dstLayout, OpAdaptor adaptor,
                        ConversionPatternRewriter &rewriter) const {
+    std::cout << "transferWithinThread" << std::endl;
     MLIRContext *ctx = op.getContext();
     auto loc = op.getLoc();
     StringAttr kRegister = str_attr("register");
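
Every hunk in this file is an entry trace on one of the lowering paths (the legacy ConvertLayoutOpConversion vs. the linear-layout pattern). If such traces were meant to outlive the debugging session, an RAII guard that also reports exits would be tidier than one-sided couts; a sketch, not part of this commit:

    #include <iostream>
    #include <string>
    #include <utility>

    // Hypothetical helper: prints matching enter/leave lines around a
    // lowering routine, covering early returns as well.
    struct TraceScope {
      std::string name;
      explicit TraceScope(std::string n) : name(std::move(n)) {
        std::cout << "enter " << name << std::endl;
      }
      ~TraceScope() { std::cout << "leave " << name << std::endl; }
    };

    // Usage at the top of e.g. lowerDistributedToDistributed:
    //   TraceScope ts("lowerDistributedToDistributed");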

third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandDPAS.cpp

Lines changed: 19 additions & 1 deletion
@@ -186,6 +186,9 @@ template <unsigned opIdx>
 Value DpasMatmulLoader<opIdx>::loadMatrix(
     int repBatch, int repOuter, int repInner, const ArrayRef<Value> ptrs,
     LLVM::LLVMStructType structTy, Type smemTy, Value cSwizzleOffset) const {
+  std::cout << "-- loadMatrix: repBatch: " << repBatch
+            << ", repOuter: " << repOuter << ", repInner: " << repInner
+            << std::endl;
   Type elemTy = structTy.getBody()[0];
   assert(
       llvm::any_of(structTy.getBody(), [&](Type ty) { return ty == elemTy; }) &&
@@ -195,7 +198,11 @@ Value DpasMatmulLoader<opIdx>::loadMatrix(
   Value offsetOuter = mul(i32_val(repOuter), repNonKDimStride);
   Value offsetInner = mul(i32_val(repInner), repKDimStride);
   Value offset = add(offsetOuter, offsetInner);
-  // offset = add(offset, offsetBatch);
+  // FIXME: repBatchSize and
+  if (repBatch > 0) {
+    Value offsetBatch = mul(i32_val(repBatch), repBatchDimStride);
+    offset = add(offset, offsetBatch);
+  }
 
   Value llvmStruct = rewriter.create<LLVM::UndefOp>(loc, structTy);
   size_t elemNum = structTy.getBody().size();
@@ -206,6 +213,7 @@ Value DpasMatmulLoader<opIdx>::loadMatrix(
     llvmStruct = insert_val(structTy, llvmStruct, val, i);
   }
 
+  std::cout << "-- loadMatrix end --" << std::endl;
   return llvmStruct;
 }
 
@@ -234,6 +242,7 @@ Value composeValuesToDotOperandLayoutStruct(
   Type structTy = LLVM::LLVMStructType::getLiteral(
       ctx, SmallVector<Type>(elems.size(), elemTy));
 
+  std::cout << "packLLElements: elems size: " << elems.size() << std::endl;
   return packLLElements(loc, typeConverter, elems, rewriter, structTy);
 }
 
@@ -269,6 +278,12 @@ getLoadMatrixFn(MemDescType descTy, const SharedMemoryObject &smemObj,
   auto sharedLayout = cast<SharedEncodingAttr>(descTy.getEncoding());
   ArrayRef<unsigned> order = sharedLayout.getOrder();
 
+  std::cout << "getLoadMatrixFn: sharedLayout order: ";
+  for (auto i : order) {
+    std::cout << i << " ";
+  }
+  std::cout << std::endl;
+
   // (a, b) is the coordinate.
   auto load = [=, &rewriter, &smemObj, &instrShape, &vals](int batch, int outer,
                                                            int inner) {
@@ -353,6 +368,9 @@ Value loadOperand(ConversionPatternRewriter &rewriter, Location loc,
   int64_t numRepOuter = numReps[opIdx ? 2 : 1];
   int64_t numRepK = numReps[opIdx ? 1 : 2];
 
+  std::cout << "!!! numRepBatch: " << numRepBatch
+            << ", numRepOuter: " << numRepOuter << ", numRepK: " << numRepK
+            << "\n";
   for (int b = 0; b < numRepBatch; ++b)
     for (int m = 0; m < numRepOuter; ++m)
       for (int k = 0; k < numRepK; ++k)
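
The substantive change in this file is that repBatch now contributes to the shared-memory offset in loadMatrix (the term was previously commented out). A scalar model of the addressing, with plain integers standing in for the i32 Values built with mul/add (names hypothetical):

    #include <cstdint>
    #include <iostream>

    // offset = outer*nonKStride + inner*kStride (+ batch*batchStride for 3D).
    int64_t tileOffset(int repBatch, int repOuter, int repInner,
                       int64_t batchStride, int64_t nonKStride,
                       int64_t kStride) {
      int64_t offset = repOuter * nonKStride + repInner * kStride;
      // The guard mirrors the diff: for repBatch == 0 the term is zero anyway,
      // but in the real code it avoids emitting dead IR.
      if (repBatch > 0)
        offset += repBatch * batchStride;
      return offset;
    }

    int main() {
      // The second batch repetition of the (0, 0) tile starts one full
      // batch stride further into shared memory.
      std::cout << tileOffset(1, 0, 0, /*batchStride=*/4096,
                              /*nonKStride=*/512, /*kStride=*/32)
                << "\n"; // prints 4096
    }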

third_party/intel/lib/TritonIntelGPUToLLVM/DotOpToLLVM/DPAS.cpp

Lines changed: 5 additions & 1 deletion
@@ -1,6 +1,7 @@
 #include "../TritonGPUToLLVMBase.h"
 #include "../Utility.h"
 #include "mlir/IR/BuiltinTypes.h"
+#include <iostream>
 
 #include "intel/include/Analysis/DPAS.h"
 #include "intel/include/Dialect/TritonGEN/IR/TritonGENDialect.h"
@@ -176,6 +177,9 @@ class DotOpDPASConversionHelper {
   });
 
   auto generateDPASOp = [&](unsigned b, unsigned m, unsigned n, unsigned k) {
+    std::cout << "valA: " << b << " " << m << " " << k << "\n";
+    std::cout << "valB: " << b << " " << n << " " << k << "\n";
+    std::cout << "valC: " << b << " " << m << " " << n << "\n";
     Value valA = ha.at({b, m, k});
     Value valB = hb.at({b, n, k});
     Value valc = fc.at({b, m, n});
@@ -186,7 +190,7 @@ class DotOpDPASConversionHelper {
         TritonGEN::PrecisionTypeAttr::get(B.getContext(), BPrecision);
     auto RC = IntegerAttr::get(rewriter.getIntegerType(32),
                                dpasEncoding.getRepeatCount());
-    fc.at({m, n}) = rewriter.create<TritonGEN::MatrixDPASOp>(
+    fc.at({b, m, n}) = rewriter.create<TritonGEN::MatrixDPASOp>(
         loc, dTy, valc, valA, valB, pA, pB, RC);
   };
third_party/intel/lib/TritonIntelGPUToLLVM/MemoryOpToLLVM.cpp

Lines changed: 1 addition & 3 deletions
@@ -186,9 +186,7 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
   auto sharedLayout =
       cast<SharedEncodingAttr>(op.getSrc().getType().getEncoding());
 
-  sharedLayout.dump();
-  std::cout << "!!! sharedLayout order: "
-            << "\n";
+  std::cout << "!!! sharedLayout order: ";
   for (auto o : sharedLayout.getOrder()) {
     std::cout << o << " ";
   }

third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h

Lines changed: 2 additions & 0 deletions
@@ -16,6 +16,7 @@
 #include "triton/Conversion/TritonGPUToLLVM/Utility.h"
 #include "triton/Dialect/Triton/IR/Utility.h"
 #include "llvm/Support/ErrorHandling.h"
+#include <iostream>
 
 #define DEBUG_TYPE "ttgpu_to_llvm"
 
@@ -573,6 +574,7 @@ emitOffsetForLayout(Attribute layout, RankedTensorType type) {
 inline SmallVector<SmallVector<Value>>
 emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
             Attribute layout, RankedTensorType type, bool withCTAOffset) {
+  std::cout << "emitIndices" << std::endl;
   MLIRContext *ctx = rewriter.getContext();
   auto shape = type.getShape();
   std::optional<LinearLayout> ll = triton::gpu::toLinearLayout(shape, layout);
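
This header already defines DEBUG_TYPE, so a less invasive alternative to the raw std::cout tracing used throughout this commit is LLVM's debug macro, which compiles out of release builds and can be filtered at runtime with -debug-only=ttgpu_to_llvm; a sketch:

    #include "llvm/Support/Debug.h"

    #define DEBUG_TYPE "ttgpu_to_llvm"

    // Inside emitIndices, the equivalent of the std::cout line above, but
    // only active in assertion-enabled builds under -debug-only=ttgpu_to_llvm:
    //   LLVM_DEBUG(llvm::dbgs() << "emitIndices\n");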
