Commit 4775fed

Fix AllocationShareMemory

1 parent 2dd1d03

7 files changed: +74, -43 lines
lib/Analysis/Allocation.cpp
Lines changed: 5 additions & 0 deletions

@@ -1,6 +1,7 @@
 #include "triton/Analysis/Allocation.h"
 
 #include <algorithm>
+#include <iostream>
 #include <limits>
 #include <numeric>
 
@@ -173,9 +174,13 @@ class AllocationAnalysis {
   using GraphT = DenseMap<BufferT *, DenseSet<BufferT *>>;
 
   void run() {
+    std::cout << "!!!! getValueAndSizes start\n";
     getValuesAndSizes();
+    std::cout << "!!!! resolveLiveness start\n";
     resolveLiveness();
+    std::cout << "!!!! computeOffsets start\n";
     computeOffsets();
+    std::cout << "!!!! AllocationAnalysis end\n";
   }
 
   /// Initializes explicitly defined shared memory values for a given operation.
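Note that these trace points print unconditionally in every build. A minimal alternative sketch using LLVM's debug facility instead of std::cout (LLVM_DEBUG and -debug-only are real LLVM infrastructure; the DEBUG_TYPE tag here is invented for illustration, and this is not what the commit does):

    #include "llvm/Support/Debug.h"

    #define DEBUG_TYPE "allocation-analysis" // hypothetical tag, not in the commit

    void run() {
      LLVM_DEBUG(llvm::dbgs() << "getValuesAndSizes start\n");
      getValuesAndSizes();
      LLVM_DEBUG(llvm::dbgs() << "resolveLiveness start\n");
      resolveLiveness();
      LLVM_DEBUG(llvm::dbgs() << "computeOffsets start\n");
      computeOffsets();
      LLVM_DEBUG(llvm::dbgs() << "AllocationAnalysis end\n");
    }

Built with assertions enabled, the output then appears only when the tool runs with -debug-only=allocation-analysis, so tracing like this can stay in the tree without spamming release runs.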

third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp
Lines changed: 42 additions & 37 deletions

@@ -56,8 +56,8 @@ LinearLayout identityND(StringAttr inDimName, ArrayRef<unsigned> shape,
   LinearLayout ret = LinearLayout::empty();
   for (int i = 0; i < shape.size(); i++) {
     // Start with the most-minor dimension, which is order[0].
-    std::cout << "i: " << i << " shape[i]: " << shape[i]
-              << " order[i]: " << order[i] << std::endl;
+    // std::cout << "i: " << i << " shape[i]: " << shape[i]
+    //           << " order[i]: " << order[i] << std::endl;
     int dim = order[i];
     ret *= LinearLayout::identity1D(shape[dim], inDimName, outDimNames[dim]);
   }
@@ -291,15 +291,16 @@ LinearLayout ensureLayoutNotSmallerThan(
     assert(actualSize > desiredSize ||
            desiredSize % actualSize == 0 && "bad shape");
     ret *= LinearLayout::identity1D(desiredSize / actualSize, kDim, outDimName);
-    std::cout << "actualSize: " << actualSize << " desiredSize: " << desiredSize
-              << std::endl;
-    std::cout << "outDimName: " << outDimName.str() << std::endl;
-    std::cout << "identity1D: "
-              << LinearLayout::identity1D(desiredSize / actualSize, kDim,
-                                          outDimName)
-                     .toString()
-              << std::endl;
-    std::cout << "ret: " << ret.toString() << std::endl;
+    // std::cout << "actualSize: " << actualSize << " desiredSize: " <<
+    // desiredSize
+    //           << std::endl;
+    // std::cout << "outDimName: " << outDimName.str() << std::endl;
+    // std::cout << "identity1D: "
+    //           << LinearLayout::identity1D(desiredSize / actualSize, kDim,
+    //                                       outDimName)
+    //                  .toString()
+    //           << std::endl;
+    // std::cout << "ret: " << ret.toString() << std::endl;
     assert(ret.getOutDimSize(outDimName) >= desiredSize && "bad grow");
   }
   return ret;
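The grow step in this hunk works by multiplying in an identity layout sized to the missing factor. A worked example with invented numbers (mirroring the line above, not part of the commit): if ret currently maps kDim onto outDimName with an out-dim size of 4 and the desired size is 16, then

    // Hypothetical sizes: the product of two layouts multiplies their
    // sizes along a shared out-dimension, so this grows outDimName
    // from 4 to 4 * (16 / 4) = 16 and the "bad grow" assert holds.
    ret *= LinearLayout::identity1D(16 / 4, kDim, outDimName);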
@@ -327,8 +328,8 @@ LinearLayout combineCtaCgaWithShape(LinearLayout ctaLayout,
   for (auto s : shape) {
     std::cout << s << ", ";
   }
-
   std::cout << std::endl;
+
   llvm::SmallDenseMap<StringAttr, int64_t> labeledShape;
   for (auto [dim, size] : llvm::zip(outDimNames, shape)) {
     labeledShape[dim] = size;

@@ -337,7 +338,7 @@ LinearLayout combineCtaCgaWithShape(LinearLayout ctaLayout,
   LinearLayout cgaLayout =
       ensureLayoutNotLargerThan(makeCgaLayout(cgaLayoutAttr), labeledShape)
           .transposeOuts(llvm::to_vector(ctaLayout.getOutDimNames()));
-  std::cout << "\ncgaLayout: " << cgaLayout.toString() << std::endl;
+  // std::cout << "\ncgaLayout: " << cgaLayout.toString() << std::endl;
 
   // Calculate the shape of the ctaLayout, which is `shape` divided by the
   // cgaLayout's size.

@@ -346,29 +347,32 @@ LinearLayout combineCtaCgaWithShape(LinearLayout ctaLayout,
              llvm::to_vector(cgaLayout.getOutDimNames()) &&
          "bad layout");
 
-  std::cout << "ctaShape: ";
+  // std::cout << "ctaShape: ";
   for (auto dim : ctaLayout.getOutDimNames()) {
     ctaShape[dim] =
         std::max(int64_t{1}, labeledShape[dim] / cgaLayout.getOutDimSize(dim));
-    std::cout << ctaShape[dim] << ", ";
+    // std::cout << ctaShape[dim] << ", ";
   }
-  std::cout << std::endl;
+  // std::cout << std::endl;
 
+  std::cout << "ensureLayoutNotSmallerThan start" << std::endl;
   ctaLayout = ensureLayoutNotSmallerThan(ctaLayout, ctaShape);
-  std::cout << "\nctaLayout not smaller than: " << ctaLayout.toString()
-            << std::endl;
+  // std::cout << "\nctaLayout not smaller than: " << ctaLayout.toString()
+  //           << std::endl;
+  std::cout << "ensureLayoutNotLargerThan start" << std::endl;
   ctaLayout = ensureLayoutNotLargerThan(ctaLayout, ctaShape);
-  std::cout << "\nctaLayout not larger than: " << ctaLayout.toString()
-            << std::endl;
+  // std::cout << "\nctaLayout not larger than: " << ctaLayout.toString()
+  //           << std::endl;
 
-  std::cout << "\ncta * cga: " << (ctaLayout * cgaLayout).toString()
-            << std::endl;
+  // std::cout << "\ncta * cga: " << (ctaLayout * cgaLayout).toString()
+  //           << std::endl;
   LinearLayout ret =
       (std::move(ctaLayout) * std::move(cgaLayout)).transposeOuts(outDimNames);
   for (auto dim : ret.getOutDimNames()) {
     assert(ret.getOutDimSize(dim) == labeledShape[dim] && "bad shape");
   }
-  std::cout << "\ncombineCtaCgaWithShape: " << ret.toString() << std::endl;
+  // std::cout << "\ncombineCtaCgaWithShape: " << ret.toString() << std::endl;
+  std::cout << "combineCtaCgaWithShape end" << std::endl;
   return ret;
 }

@@ -593,26 +597,26 @@ LinearLayout DPAStoLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
         DPASLaneBasesC(repeatCount, executionSize, threadsPerWarp);
     tileLayout = LinearLayout({{kRegister, regBasesC}, {kLane, laneBasesC}},
                               ArrayRef(outDimNames).take_back(2));
-    std::cout << tileLayout.toString() << std::endl;
+    // std::cout << tileLayout.toString() << std::endl;
     // The per-inst layout is repeated at each repCluster.
     // Hence, multiply with the identity layouts starting from the
     // least significant dimension.
     dimNonK = rank - 2;
     dimK = rank - 1;
     tileLayout *= LinearLayout::identity1D(repCluster[dimK], kRegister,
                                            outDimNames[dimK]);
-    std::cout << (LinearLayout::identity1D(repCluster[dimK], kRegister,
-                                           outDimNames[dimK])
-                      .toString())
-              << std::endl;
-    std::cout << (tileLayout.toString()) << std::endl;
+    // std::cout << (LinearLayout::identity1D(repCluster[dimK], kRegister,
+    //                                        outDimNames[dimK])
+    //                   .toString())
+    //           << std::endl;
+    // std::cout << (tileLayout.toString()) << std::endl;
     tileLayout *= LinearLayout::identity1D(repCluster[dimNonK], kRegister,
                                            outDimNames[dimNonK]);
-    std::cout << (LinearLayout::identity1D(repCluster[dimNonK], kRegister,
-                                           outDimNames[dimNonK])
-                      .toString())
-              << std::endl;
-    std::cout << (tileLayout.toString()) << std::endl;
+    // std::cout << (LinearLayout::identity1D(repCluster[dimNonK], kRegister,
+    //                                        outDimNames[dimNonK])
+    //                   .toString())
+    //           << std::endl;
+    // std::cout << (tileLayout.toString()) << std::endl;
 
     // // The identical layout is repeated among warps
     tileLayout *=

@@ -622,7 +626,7 @@ LinearLayout DPAStoLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
     if (rank == 3)
       tileLayout *=
          LinearLayout::identity1D(warpsPerCTA[0], kWarp, outDimNames[0]);
-    std::cout << (tileLayout.toString()) << std::endl;
+    // std::cout << (tileLayout.toString()) << std::endl;
   }
 
   // Lastly, the layout repeats to match the shape.

@@ -647,8 +651,9 @@ LinearLayout DPAStoLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
   if (rank == 3)
     tileLayout *=
         LinearLayout::identity1D(numReps[0], kRegister, outDimNames[0]);
-  std::cout << "\ntileLayout with DPASRepetition: " << (tileLayout.toString())
-            << std::endl;
+  // std::cout << "\ntileLayout with DPASRepetition: " <<
+  // (tileLayout.toString())
+  //           << std::endl;
 
   return combineCtaCgaWithShape(std::move(tileLayout),
                                 CTALayoutAttr::getDefault(ctx, rank), shape);

third_party/intel/lib/TritonIntelGPUToLLVM/AllocateSharedMemory.cpp
Lines changed: 7 additions & 1 deletion

@@ -1,8 +1,8 @@
-
 #include "intel/include/Dialect/TritonGEN/IR/TritonGENDialect.h"
 #include "intel/include/TritonIntelGPUToLLVM/Passes.h"
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"
 #include "triton/Analysis/Allocation.h"
+#include <iostream>
 
 using namespace mlir;
 
@@ -20,17 +20,21 @@ struct AllocateSharedMemory
                   AllocateSharedMemory>::IntelAllocateSharedMemoryBase;
 
   void runOnOperation() override {
+    std::cout << "AllocateSharedMemory Start\n";
     ModuleOp mod = getOperation();
     MLIRContext *ctx = &getContext();
+    std::cout << "Before create Module Allocation\n";
     ModuleAllocation allocation(mod);
 
+    std::cout << "Before mod walk\n";
     mod.walk([&](FunctionOpInterface funcOp) {
       if (allocation.isRoot(funcOp) && allocation.getSharedMemorySize()) {
        LLVM::LLVMPointerType ptrTy = LLVM::LLVMPointerType::get(
            ctx, triton::TritonGEN::TritonGENMemorySpace::kWorkgroup);
        funcOp.insertArgument(funcOp.getNumArguments(), ptrTy, {},
                              funcOp.getLoc());
       }
+      std::cout << "Before funcOp walk\n";
       funcOp.walk([&](Operation *op) {
         auto *funcAllocation = allocation.getFuncData(funcOp);
         auto oBufferId = funcAllocation->getBufferId(op);

@@ -49,6 +53,7 @@ struct AllocateSharedMemory
                     IntegerAttr::get(IntegerType::get(ctx, 32), offset));
       });
     });
+    std::cout << "Before getSharedMemorySize\n";
     int32_t initialSharedMemorySize = 0;
     if (IntegerAttr sharedAttr =
             mod->getAttrOfType<IntegerAttr>("triton_gpu.shared"))

@@ -57,6 +62,7 @@ struct AllocateSharedMemory
                  IntegerAttr::get(IntegerType::get(ctx, 32),
                                   initialSharedMemorySize +
                                       allocation.getSharedMemorySize()));
+    std::cout << "AllocateSharedMemory End\n";
   }
 };
third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp
Lines changed: 0 additions & 2 deletions

@@ -1,7 +1,6 @@
 #include "PatternTritonGPUOpToLLVM.h"
 #include "TargetInfo.h"
 #include "Utility.h"
-#include <iostream>
 
 #include "intel/include/Analysis/Utility.h"
 #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"

@@ -112,7 +111,6 @@ struct ConvertLayoutOpConversion
     }
     if (auto dpasLayout = dyn_cast<DpasEncodingAttr>(layout)) {
       assert(rank == 2 || rank == 3);
-      std::cout << "!!!getMultiDimOffset: dpasLayout" << std::endl;
       auto multiDimBase = ::intel::emitBaseIndexForLayout(
           loc, rewriter, targetInfo, layout, type, false);
       SmallVector<SmallVector<unsigned>> offsets;

third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandDPAS.cpp
Lines changed: 2 additions & 1 deletion

@@ -2,6 +2,7 @@
 #include "../Utility.h"
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"
 #include "llvm/Support/ErrorHandling.h"
+#include <iostream>
 
 using ValueTable = std::map<std::array<int, 3>, Value>;
 using mlir::triton::gpu::getShapePerCTA;

@@ -334,7 +335,6 @@ Value loadOperand(ConversionPatternRewriter &rewriter, Location loc,
   SmallVector<Value> multiDimWarpId =
       LLVM::delinearize(rewriter, loc, warpId, warpsPerCTA, order);
 
-  // FIXME: Using opIdx as the dimIdx will be incorrect in 3D case.
   unsigned rank = shape.size();
   unsigned dimOuter = opIdx ? (rank - 1) : (rank - 2);
   unsigned ceilRes =

@@ -373,6 +373,7 @@ Value convertLayout(int opIdx, ConversionPatternRewriter &rewriter,
                     const SharedMemoryObject &smemObj,
                     const LLVMTypeConverter *typeConverter, Value threadId) {
   auto descTy = cast<MemDescType>(tensor.getType());
+  std::cout << "!!! SharedToDotOperandDPAS::intel::convertLayout\n";
   switch (opIdx) {
   case 0:
     return loadOperand<0>(rewriter, loc, descTy, encoding, smemObj,
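Context for the deleted FIXME in loadOperand: the surviving line `unsigned dimOuter = opIdx ? (rank - 1) : (rank - 2);` selects each operand's non-K dimension by rank rather than by opIdx directly (for rank == 2, dim 0 for operand A, dim 1 for operand B; for rank == 3 the batch dimension shifts both), which appears to address the 3D concern the old comment raised.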

third_party/intel/lib/TritonIntelGPUToLLVM/MemoryOpToLLVM.cpp
Lines changed: 18 additions & 0 deletions

@@ -1,3 +1,5 @@
+#include <iostream>
+
 #include "PatternTritonGPUOpToLLVM.h"
 #include "Utility.h"
 #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h"

@@ -57,6 +59,7 @@ struct LocalAllocOpConversion
   LogicalResult
   matchAndRewrite(triton::gpu::LocalAllocOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    std::cout << "LocalAllocOpConversion start\n";
     if (!op.isSharedMemoryAlloc())
       return failure();
     Location loc = op->getLoc();

@@ -91,6 +94,7 @@ struct LocalAllocOpConversion
     }
     auto retVal = getStructFromSharedMemoryObject(loc, smemObj, rewriter);
     rewriter.replaceOp(op, retVal);
+    std::cout << "LocalAllocOpConversion end\n";
     return success();
   }

@@ -122,17 +126,20 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
   LogicalResult
   matchAndRewrite(LocalLoadOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    std::cout << "LocalLoadOpConversion start\n";
     MemDescType srcTy = op.getSrc().getType();
     RankedTensorType dstTy = op.getType();
     Attribute srcLayout = srcTy.getEncoding();
     Attribute dstLayout = dstTy.getEncoding();
     if (isa<SharedEncodingAttr>(srcLayout) &&
         isa<BlockedEncodingAttr, MmaEncodingTrait, SliceEncodingAttr>(
            dstLayout)) {
+      std::cout << "shared -> distributed\n";
       return lowerSharedToDistributed(op, adaptor, getTypeConverter(),
                                       rewriter);
     }
     if (isa<DotOperandEncodingAttr>(dstLayout)) {
+      std::cout << "shared -> dot_operand\n";
       return lowerSharedToDotOperand(op, adaptor, getTypeConverter(), rewriter);
     }
     return failure();

@@ -154,6 +161,9 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
 
     auto smemObj = getSharedMemoryObjectFromStruct(loc, adaptor.getSrc(),
                                                    llvmElemTy, rewriter);
+    std::cout << "!!! smemObj strides rank: " << smemObj.getStrides().size()
+              << "\n";
+
     Value res;
     if (!isOuter) {
       res = SharedToDotOperandDPAS::intel::convertLayout(

@@ -176,6 +186,14 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
     auto sharedLayout =
         cast<SharedEncodingAttr>(op.getSrc().getType().getEncoding());
 
+    sharedLayout.dump();
+    std::cout << "!!! sharedLayout order: "
+              << "\n";
+    for (auto o : sharedLayout.getOrder()) {
+      std::cout << o << " ";
+    }
+    std::cout << std::endl;
+
     int K;
     if (dotLayout.getOpIdx() == 0) // $a
       K = op.getType().getShape()[sharedLayout.getOrder()[0]];
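One caveat with the block above: Attribute::dump() writes to llvm::errs(), while the surrounding prints go to std::cout, so the dumped layout can interleave unpredictably with the order line. A sketch that keeps everything on one stream (print(raw_ostream &) is the standard MLIR API; this is a suggestion, not what the commit does):

    // Keep the layout dump and the order list on the same stream.
    sharedLayout.print(llvm::errs());
    llvm::errs() << "\n!!! sharedLayout order: ";
    for (auto o : sharedLayout.getOrder())
      llvm::errs() << o << " ";
    llvm::errs() << "\n";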

third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h
Lines changed: 0 additions & 2 deletions

@@ -501,7 +501,6 @@ emitBaseIndexForLayoutImpl(Location loc, RewriterBase &rewriter,
   RewriterBase::InsertionGuard guard(rewriter);
   SmallVector<Value> result;
   if (auto dpasLayout = dyn_cast<DpasEncodingAttr>(layout)) {
-    printf("emitBaseIndexForLayoutImpl: dpasLayout\n");
     result = emitBaseIndexForDpasLayout(loc, rewriter, dpasLayout, type);
   } else if (auto sliceLayout = dyn_cast<SliceEncodingAttr>(layout)) {
     auto parentLayout = sliceLayout.getParent();

@@ -514,7 +513,6 @@ emitBaseIndexForLayoutImpl(Location loc, RewriterBase &rewriter,
     // CTAOffset has been added in emitBaseIndexForLayout of parentLayout
     return result;
   } else if (auto dotLayout = dyn_cast<DotOperandEncodingAttr>(layout)) {
-    printf("emitBaseIndexForLayoutImpl: DotOperandLayout\n");
     result = emitBaseIndexForDotOpLayout(loc, rewriter, dotLayout, type);
   } else {
     return mlir::emitBaseIndexForLayoutImpl(loc, rewriter, target, layout, type,
