
Commit 6b996cc

Cleanup: remove leftover std::cout debug statements from the allocation analysis, the TritonGPU dialect, and the Intel backend; enable debug info in IR dumps.
1 parent 77422cb commit 6b996cc

File tree

10 files changed (+1, -131 lines)


lib/Analysis/Allocation.cpp

Lines changed: 0 additions & 24 deletions
@@ -65,7 +65,6 @@ static SmallVector<unsigned> getRepShapeForCvt(RankedTensorType srcTy,
                                                RankedTensorType dstTy) {
   Attribute srcLayout = srcTy.getEncoding();
   Attribute dstLayout = dstTy.getEncoding();
-  std::cout << "- in getRepShapeForCvt\n";
 
   if (!cvtNeedsSharedMemory(srcTy, dstTy)) {
     return {};
@@ -82,10 +81,6 @@ static SmallVector<unsigned> getRepShapeForCvt(RankedTensorType srcTy,
   auto dstShapePerCTA = getShapePerCTA(dstTy);
   auto srcShapePerCTATile = getShapePerCTATile(srcLayout, srcTy.getShape());
   auto dstShapePerCTATile = getShapePerCTATile(dstLayout, dstTy.getShape());
-  std::cout << "!!!shapePerCTA: " << srcShapePerCTA.size() << " "
-            << dstShapePerCTA.size() << "\n";
-  std::cout << "!!!shapePerCTATile: " << srcShapePerCTATile.size() << " "
-            << dstShapePerCTATile.size() << "\n";
 
   unsigned rank = dstTy.getRank();
   SmallVector<unsigned> repShape(rank);
@@ -112,9 +107,7 @@ static SmallVector<unsigned> getRepShapeForAtomic(Value result) {
 ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
                                      RankedTensorType dstTy) {
   // Initialize vector sizes and stride
-  std::cout << "getRepShapeForCvt start\n";
   auto repShape = getRepShapeForCvt(srcTy, dstTy);
-  std::cout << "repShape rank: " << repShape.size() << "\n";
   if (repShape.empty())
     return ScratchConfig({}, {});
   ScratchConfig scratchConfig(repShape, repShape);
@@ -126,24 +119,13 @@ ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
 
   auto [inOrd, outOrd] = getCvtOrder(srcLayout, dstLayout);
   scratchConfig.order = outOrd;
-  std::cout << "inOrd: ";
-  for (auto i : inOrd) {
-    std::cout << i << " ";
-  }
-  std::cout << "rank: " << inOrd.size() << "\n";
-  std::cout << "outOrd: ";
-  for (auto i : outOrd) {
-    std::cout << i << " ";
-  }
-  std::cout << "rank: " << outOrd.size() << "\n";
 
   unsigned srcContigPerThread =
       getUniqueContigPerThread(srcLayout, srcTy.getShape())[inOrd[0]];
   unsigned dstContigPerThread =
       getUniqueContigPerThread(dstLayout, dstTy.getShape())[outOrd[0]];
   // TODO: Fix the legacy issue that ourOrd[0] == 0 always means
   // that we cannot do vectorization.
-  std::cout << "no index issue in getUniqueContigPerThread\n";
   unsigned innerDim = rank - 1;
   scratchConfig.inVec = outOrd[0] != innerDim ? 1
                         : inOrd[0] != innerDim ? 1
@@ -252,33 +234,27 @@ class AllocationAnalysis {
       maybeAddScratchBuffer<BufferT::BufferKind::Scratch>(op, bytes,
                                                           scratchAlignment);
     } else if (auto cvtLayout = dyn_cast<triton::gpu::ConvertLayoutOp>(op)) {
-      std::cout << "getScratchValueSize from ConvertLayoutOp\n";
       auto srcTy = cvtLayout.getSrc().getType();
       auto dstTy = cvtLayout.getType();
       auto srcEncoding = srcTy.getEncoding();
       auto dstEncoding = dstTy.getEncoding();
       if (mlir::isa<SharedEncodingAttr>(srcEncoding) ||
           mlir::isa<SharedEncodingAttr>(dstEncoding)) {
         // Conversions from/to shared memory do not need scratch memory.
-        std::cout << "-- ConvertLayoutOp from/to shared memory\n";
         return;
       }
       // ConvertLayoutOp with both input/output non-shared_layout
       // TODO: Besides of implementing ConvertLayoutOp via shared memory, it's
       // also possible to realize it with other approaches in restricted
       // conditions, such as warp-shuffle
-      std::cout << "-- getScratchConfigForCvt\n";
       auto scratchConfig = getScratchConfigForCvt(srcTy, dstTy);
-      std::cout << "-- getNumScratchElements\n";
       auto elems = getNumScratchElements(scratchConfig.paddedRepShape);
       auto bytes =
           isa<triton::PointerType>(srcTy.getElementType())
              ? elems * kPtrBitWidth / 8
              : elems * std::max<int>(8, srcTy.getElementTypeBitWidth()) / 8;
       maybeAddScratchBuffer<BufferT::BufferKind::Scratch>(op, bytes,
                                                           scratchAlignment);
-      std::cout << "-- ConvertLayoutOp from/to non-shared memory: " << bytes
-                << " bytes\n";
     } else if (isa<triton::AtomicRMWOp, triton::AtomicCASOp>(op)) {
       auto value = op->getOperand(0);
       // only scalar requires scratch memory
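Note on the removals above: every deleted line was ad-hoc std::cout tracing. As a point of reference, here is a minimal sketch of the idiomatic LLVM replacement, should this tracing ever be needed again; the DEBUG_TYPE tag and helper name are illustrative, not names from this repository:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "triton-allocation" // illustrative tag

// Hypothetical helper mirroring the removed repShape prints.
static void traceRepShape(llvm::ArrayRef<unsigned> repShape) {
  LLVM_DEBUG({
    llvm::dbgs() << "repShape:";
    for (unsigned d : repShape)
      llvm::dbgs() << " " << d;
    llvm::dbgs() << "\n";
  });
}

Unlike std::cout, this output compiles away in release builds and is gated at runtime with -debug-only=triton-allocation in assertion-enabled builds.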

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 0 additions & 10 deletions
@@ -384,16 +384,6 @@ SmallVector<unsigned> getCTAOrder(Attribute layout) {
 SmallVector<int64_t> getShapePerCTA(ArrayRef<unsigned> CTASplitNum,
                                     ArrayRef<int64_t> shape) {
   unsigned rank = shape.size();
-  std::cout << "!!!GPU dialect - getShapePerCTA\n";
-  std::cout << "CTASplitNum: ";
-  for (auto i : CTASplitNum) {
-    std::cout << i << " ";
-  }
-  std::cout << "\nshape: ";
-  for (auto i : shape) {
-    std::cout << i << " ";
-  }
-  std::cout << "\n";
 
   SmallVector<int64_t> shapePerCTA(rank);
   for (unsigned i = 0; i < rank; ++i) {

python/src/ir.cc

Lines changed: 1 addition & 1 deletion
@@ -1622,7 +1622,7 @@ void init_triton_ir(py::module &&m) {
       if (haveDump) {
         auto printingFlags = OpPrintingFlags();
         printingFlags.elideLargeElementsAttrs(16);
-        // printingFlags.enableDebugInfo();
+        printingFlags.enableDebugInfo();
         auto printAlways = [funcToDump](Pass *, Operation *op) -> bool {
           if (funcToDump.empty())
             return true;
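This hunk is the commit's only functional change: the flag was previously commented out. A minimal sketch of its effect, assuming standard MLIR printing APIs rather than code from this repository: with enableDebugInfo(), each op in a dump carries its loc(...) source location, so pass output can be traced back to the originating source line.

#include "mlir/IR/Operation.h"
#include "mlir/IR/OperationSupport.h"
#include "llvm/Support/raw_ostream.h"

// Hypothetical helper configured the same way as the dump code above.
void dumpWithLocations(mlir::Operation *op) {
  mlir::OpPrintingFlags flags;
  flags.elideLargeElementsAttrs(16); // elide constants over 16 elements
  flags.enableDebugInfo();           // print a trailing loc(...) per op
  op->print(llvm::outs(), flags);
}

With the flag on, a dumped op reads like %0 = arith.addf %a, %b : f32 loc("kernel.py":12:8) instead of stopping at the type.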

third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp

Lines changed: 0 additions & 49 deletions
@@ -56,8 +56,6 @@ LinearLayout identityND(StringAttr inDimName, ArrayRef<unsigned> shape,
   LinearLayout ret = LinearLayout::empty();
   for (int i = 0; i < shape.size(); i++) {
     // Start with the most-minor dimension, which is order[0].
-    // std::cout << "i: " << i << " shape[i]: " << shape[i]
-    //           << " order[i]: " << order[i] << std::endl;
     int dim = order[i];
     ret *= LinearLayout::identity1D(shape[dim], inDimName, outDimNames[dim]);
   }
@@ -291,16 +289,6 @@ LinearLayout ensureLayoutNotSmallerThan(
     assert(actualSize > desiredSize ||
            desiredSize % actualSize == 0 && "bad shape");
     ret *= LinearLayout::identity1D(desiredSize / actualSize, kDim, outDimName);
-    // std::cout << "actualSize: " << actualSize << " desiredSize: " <<
-    // desiredSize
-    //           << std::endl;
-    // std::cout << "outDimName: " << outDimName.str() << std::endl;
-    // std::cout << "identity1D: "
-    //           << LinearLayout::identity1D(desiredSize / actualSize, kDim,
-    //                                       outDimName)
-    //                  .toString()
-    //           << std::endl;
-    // std::cout << "ret: " << ret.toString() << std::endl;
     assert(ret.getOutDimSize(outDimName) >= desiredSize && "bad grow");
   }
   return ret;
@@ -324,12 +312,6 @@ LinearLayout combineCtaCgaWithShape(LinearLayout ctaLayout,
 
   SmallVector<StringAttr> outDimNames = standardOutDimNames(ctx, rank);
 
-  std::cout << "shape: ";
-  for (auto s : shape) {
-    std::cout << s << ", ";
-  }
-  std::cout << std::endl;
-
   llvm::SmallDenseMap<StringAttr, int64_t> labeledShape;
   for (auto [dim, size] : llvm::zip(outDimNames, shape)) {
     labeledShape[dim] = size;
@@ -338,7 +320,6 @@ LinearLayout combineCtaCgaWithShape(LinearLayout ctaLayout,
   LinearLayout cgaLayout =
       ensureLayoutNotLargerThan(makeCgaLayout(cgaLayoutAttr), labeledShape)
           .transposeOuts(llvm::to_vector(ctaLayout.getOutDimNames()));
-  // std::cout << "\ncgaLayout: " << cgaLayout.toString() << std::endl;
 
   // Calculate the shape of the ctaLayout, which is `shape` divided by the
   // cgaLayout's size.
@@ -347,32 +328,19 @@ LinearLayout combineCtaCgaWithShape(LinearLayout ctaLayout,
              llvm::to_vector(cgaLayout.getOutDimNames()) &&
          "bad layout");
 
-  // std::cout << "ctaShape: ";
   for (auto dim : ctaLayout.getOutDimNames()) {
     ctaShape[dim] =
         std::max(int64_t{1}, labeledShape[dim] / cgaLayout.getOutDimSize(dim));
-    // std::cout << ctaShape[dim] << ", ";
   }
-  // std::cout << std::endl;
 
-  std::cout << "ensureLayoutNotSmallerThan start" << std::endl;
   ctaLayout = ensureLayoutNotSmallerThan(ctaLayout, ctaShape);
-  // std::cout << "\nctaLayout not smaller than: " << ctaLayout.toString()
-  //           << std::endl;
-  std::cout << "ensureLayoutNotLargerThan start" << std::endl;
   ctaLayout = ensureLayoutNotLargerThan(ctaLayout, ctaShape);
-  // std::cout << "\nctaLayout not larger than: " << ctaLayout.toString()
-  //           << std::endl;
 
-  // std::cout << "\ncta * cga: " << (ctaLayout * cgaLayout).toString()
-  //           << std::endl;
   LinearLayout ret =
       (std::move(ctaLayout) * std::move(cgaLayout)).transposeOuts(outDimNames);
   for (auto dim : ret.getOutDimNames()) {
     assert(ret.getOutDimSize(dim) == labeledShape[dim] && "bad shape");
   }
-  // std::cout << "\ncombineCtaCgaWithShape: " << ret.toString() << std::endl;
-  std::cout << "combineCtaCgaWithShape end" << std::endl;
   return ret;
 }
 
@@ -569,7 +537,6 @@ LinearLayout DPAStoLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
         LinearLayout::identity1D(warpsPerCTA[0], kWarp, outDimNames[0]);
 
   } else if (opIdx == 1) { // Operand B
-    std::cout << "\nOperand B" << std::endl;
     auto regBasesB = DPASRegBasesB(opsPerChannel, executionSize, threadsPerWarp,
                                    systolicDepth);
     auto laneBasesB =
@@ -591,32 +558,20 @@ LinearLayout DPAStoLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
     tileLayout *=
         LinearLayout::identity1D(warpsPerCTA[0], kWarp, outDimNames[0]);
   } else { // opIdx=2 -> Operand C
-    std::cout << "\nOperand C" << std::endl;
     auto regBasesC = DPASRegBasesC(repeatCount, executionSize, threadsPerWarp);
     auto laneBasesC =
         DPASLaneBasesC(repeatCount, executionSize, threadsPerWarp);
     tileLayout = LinearLayout({{kRegister, regBasesC}, {kLane, laneBasesC}},
                               ArrayRef(outDimNames).take_back(2));
-    // std::cout << tileLayout.toString() << std::endl;
     // The per-inst layout is repeated at each repCluster.
     // Hence, multiply with the identity layouts starting from the
     // least significant dimension.
     dimNonK = rank - 2;
     dimK = rank - 1;
     tileLayout *= LinearLayout::identity1D(repCluster[dimK], kRegister,
                                            outDimNames[dimK]);
-    // std::cout << (LinearLayout::identity1D(repCluster[dimK], kRegister,
-    //                                        outDimNames[dimK])
-    //                   .toString())
-    //           << std::endl;
-    // std::cout << (tileLayout.toString()) << std::endl;
     tileLayout *= LinearLayout::identity1D(repCluster[dimNonK], kRegister,
                                            outDimNames[dimNonK]);
-    // std::cout << (LinearLayout::identity1D(repCluster[dimNonK], kRegister,
-    //                                        outDimNames[dimNonK])
-    //                   .toString())
-    //           << std::endl;
-    // std::cout << (tileLayout.toString()) << std::endl;
 
     // // The identical layout is repeated among warps
     tileLayout *=
@@ -626,7 +581,6 @@ LinearLayout DPAStoLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
     if (rank == 3)
       tileLayout *=
           LinearLayout::identity1D(warpsPerCTA[0], kWarp, outDimNames[0]);
-    // std::cout << (tileLayout.toString()) << std::endl;
   }
 
   // Lastly, the layout repeats to match the shape.
@@ -651,9 +605,6 @@ LinearLayout DPAStoLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
   if (rank == 3)
     tileLayout *=
         LinearLayout::identity1D(numReps[0], kRegister, outDimNames[0]);
-  // std::cout << "\ntileLayout with DPASRepetition: " <<
-  // (tileLayout.toString())
-  //           << std::endl;
 
   return combineCtaCgaWithShape(std::move(tileLayout),
                                 CTALayoutAttr::getDefault(ctx, rank), shape);
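The prints removed in this file were all dumping LinearLayout values via toString(). Here is a minimal sketch of the composition pattern these functions are built on, assuming Triton's LinearLayout utility (include/triton/Tools/LinearLayout.h) with the identity1D and operator* seen in the hunks above; dimension names are illustrative:

#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/MLIRContext.h"
#include "llvm/Support/raw_ostream.h"
#include "triton/Tools/LinearLayout.h"

void printTinyLayout(mlir::MLIRContext *ctx) {
  auto S = [&](llvm::StringRef s) { return mlir::StringAttr::get(ctx, s); };
  using mlir::triton::LinearLayout;
  // identity1D(n, in, out) maps input i to output coordinate i for i < n;
  // operator* concatenates bases, so the product assigns 2 * 4 = 8
  // "register" values to a 2x4 (dim0 x dim1) tile.
  LinearLayout layout =
      LinearLayout::identity1D(2, S("register"), S("dim0")) *
      LinearLayout::identity1D(4, S("register"), S("dim1"));
  llvm::errs() << layout.toString() << "\n";
}

This is the same pattern identityND and DPAStoLinearLayout follow: build a per-tile layout, then multiply by identity1D layouts to replicate it across registers, lanes, and warps.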

third_party/intel/lib/TritonIntelGPUToLLVM/AllocateSharedMemory.cpp

Lines changed: 0 additions & 6 deletions
@@ -20,21 +20,17 @@ struct AllocateSharedMemory
                           AllocateSharedMemory>::IntelAllocateSharedMemoryBase;
 
   void runOnOperation() override {
-    std::cout << "AllocateSharedMemory Start\n";
     ModuleOp mod = getOperation();
     MLIRContext *ctx = &getContext();
-    std::cout << "Before create Module Allocation\n";
     ModuleAllocation allocation(mod);
 
-    std::cout << "Before mod walk\n";
     mod.walk([&](FunctionOpInterface funcOp) {
       if (allocation.isRoot(funcOp) && allocation.getSharedMemorySize()) {
         LLVM::LLVMPointerType ptrTy = LLVM::LLVMPointerType::get(
             ctx, triton::TritonGEN::TritonGENMemorySpace::kWorkgroup);
         funcOp.insertArgument(funcOp.getNumArguments(), ptrTy, {},
                               funcOp.getLoc());
       }
-      std::cout << "Before funcOp walk\n";
       funcOp.walk([&](Operation *op) {
         auto *funcAllocation = allocation.getFuncData(funcOp);
         auto oBufferId = funcAllocation->getBufferId(op);
@@ -53,7 +49,6 @@ struct AllocateSharedMemory
                          IntegerAttr::get(IntegerType::get(ctx, 32), offset));
       });
     });
-    std::cout << "Before getSharedMemorySize\n";
     int32_t initialSharedMemorySize = 0;
     if (IntegerAttr sharedAttr =
             mod->getAttrOfType<IntegerAttr>("triton_gpu.shared"))
@@ -62,7 +57,6 @@ struct AllocateSharedMemory
                  IntegerAttr::get(IntegerType::get(ctx, 32),
                                   initialSharedMemorySize +
                                       allocation.getSharedMemorySize()));
-    std::cout << "AllocateSharedMemory End\n";
   }
 };
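For context on the pass whose tracing was stripped: it walks root functions and, when shared memory is allocated, appends a workgroup pointer argument. A minimal sketch of that pattern using the same FunctionOpInterface::insertArgument call visible in the hunk above; the helper name and the bare address-space constant are illustrative stand-ins for the TritonGEN enum:

#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
#include "mlir/Interfaces/FunctionInterfaces.h"

// Hypothetical helper mirroring the insertArgument call in the diff.
void appendSharedMemArg(mlir::FunctionOpInterface funcOp,
                        mlir::MLIRContext *ctx) {
  // The pass uses triton::TritonGEN::TritonGENMemorySpace::kWorkgroup;
  // 3 is the conventional shared/workgroup address space on GPU targets.
  auto ptrTy = mlir::LLVM::LLVMPointerType::get(ctx, /*addressSpace=*/3);
  funcOp.insertArgument(funcOp.getNumArguments(), ptrTy,
                        /*argAttrs=*/{}, funcOp.getLoc());
}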

third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 0 additions & 9 deletions
@@ -40,7 +40,6 @@ struct ConvertLayoutOpConversion
   LogicalResult
   matchAndRewrite(triton::gpu::ConvertLayoutOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    std::cout << "ConvertLayoutOpConversion" << std::endl;
     RankedTensorType srcTy = op.getSrc().getType();
     RankedTensorType dstTy = op.getType();
     Attribute srcLayout = srcTy.getEncoding();
@@ -66,7 +65,6 @@ struct ConvertLayoutOpConversion
                                  RankedTensorType type,
                                  ArrayRef<unsigned> multiDimCTAInRepId,
                                  ArrayRef<unsigned> shapePerCTATile) const {
-    std::cout << "getMultiDimOffset" << std::endl;
     auto shape = type.getShape();
     unsigned rank = shape.size();
     if (auto blockedLayout = dyn_cast<BlockedEncodingAttr>(layout)) {
@@ -143,7 +141,6 @@ struct ConvertLayoutOpConversion
                       ArrayRef<unsigned> origRepShape,
                       ArrayRef<unsigned> outOrd, SmallVector<Value> &vals,
                       Value smemBase) const {
-    std::cout << "processReplica" << std::endl;
     auto accumNumCTAsEachRep = product<unsigned>(numCTAsEachRep);
     auto layout = type.getEncoding();
     auto rank = type.getRank();
@@ -229,7 +226,6 @@ struct ConvertLayoutOpConversion
   lowerDistributedToDistributed(triton::gpu::ConvertLayoutOp op,
                                 OpAdaptor adaptor,
                                 ConversionPatternRewriter &rewriter) const {
-    std::cout << "lowerDistributedToDistributed" << std::endl;
     auto loc = op.getLoc();
     auto typeConverter = getTypeConverter();
     RankedTensorType srcTy = op.getSrc().getType();
@@ -329,7 +325,6 @@ struct ConvertLayoutOpConversion
                                         ConversionPatternRewriter &rewriter,
                                         Value vals,
                                         RankedTensorType srcType) const {
-    std::cout << "getValuesFromDpasLayoutStruct" << std::endl;
     SmallVector<Value> elems = unpackLLElements(loc, vals, rewriter);
     auto dpasLayout = dyn_cast<DpasEncodingAttr>(srcType.getEncoding());
 
@@ -374,7 +369,6 @@ struct ConvertLayoutOpConversion
   Value composeValuesToDotOperandLayoutStruct(
       Location loc, ConversionPatternRewriter &rewriter, const ValueTable &vals,
       RankedTensorType dstType) const {
-    std::cout << "composeValuesToDotOperandLayoutStruct" << std::endl;
     auto dotLayout = dyn_cast<DotOperandEncodingAttr>(dstType.getEncoding());
     auto dpasLayout = dyn_cast<DpasEncodingAttr>(dotLayout.getParent());
     unsigned opIdx = dotLayout.getOpIdx();
@@ -431,7 +425,6 @@ struct ConvertLayoutOpConversion
   LogicalResult
   lowerDpasToDotOperand(triton::gpu::ConvertLayoutOp op, OpAdaptor adaptor,
                         ConversionPatternRewriter &rewriter) const {
-    std::cout << "lowerDpasToDotOperand" << std::endl;
     Location loc = op.getLoc();
     RankedTensorType srcTy = op.getSrc().getType();
     RankedTensorType dstTy = op.getType();
@@ -464,7 +457,6 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
   LogicalResult
   matchAndRewrite(ConvertLayoutOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    std::cout << "ConvertLayoutOpUsingLinearLayoutsConversion" << std::endl;
     MLIRContext *ctx = op.getContext();
 
     const auto &shape = op.getType().getShape();
@@ -513,7 +505,6 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
   transferWithinThread(ConvertLayoutOp op, const LinearLayout &srcLayout,
                        const LinearLayout &dstLayout, OpAdaptor adaptor,
                        ConversionPatternRewriter &rewriter) const {
-    std::cout << "transferWithinThread" << std::endl;
     MLIRContext *ctx = op.getContext();
     auto loc = op.getLoc();
     StringAttr kRegister = str_attr("register");
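All nine hunks in this file remove entry tracing from conversion-pattern methods. If the goal was to see which lowering path fired, MLIR already has a gated mechanism: notifyMatchFailure records a reason that the dialect-conversion driver prints under -debug. A fragment-level sketch, with a hypothetical predicate standing in for real match logic:

// Fragment of a hypothetical pattern, for illustration only.
mlir::LogicalResult
matchAndRewrite(triton::gpu::ConvertLayoutOp op, OpAdaptor adaptor,
                mlir::ConversionPatternRewriter &rewriter) const override {
  if (!isSupportedConversion(op)) // hypothetical predicate
    return rewriter.notifyMatchFailure(op, "unsupported layout pair");
  // ... rewrite logic as in the methods above ...
  return mlir::success();
}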
