
Commit 19cbbbc

Cleanup
1 parent 77422cb commit 19cbbbc

File tree

12 files changed: +17 −176 lines


lib/Analysis/Allocation.cpp

Lines changed: 0 additions & 26 deletions
@@ -1,7 +1,6 @@
 #include "triton/Analysis/Allocation.h"
 
 #include <algorithm>
-#include <iostream>
 #include <limits>
 #include <numeric>
 
@@ -14,7 +13,6 @@
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/Triton/IR/Utility.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 
 using ::mlir::triton::gpu::AMDMfmaEncodingAttr;
@@ -65,7 +63,6 @@ static SmallVector<unsigned> getRepShapeForCvt(RankedTensorType srcTy,
                                                RankedTensorType dstTy) {
   Attribute srcLayout = srcTy.getEncoding();
   Attribute dstLayout = dstTy.getEncoding();
-  std::cout << "- in getRepShapeForCvt\n";
 
   if (!cvtNeedsSharedMemory(srcTy, dstTy)) {
     return {};
@@ -82,10 +79,6 @@ static SmallVector<unsigned> getRepShapeForCvt(RankedTensorType srcTy,
   auto dstShapePerCTA = getShapePerCTA(dstTy);
   auto srcShapePerCTATile = getShapePerCTATile(srcLayout, srcTy.getShape());
   auto dstShapePerCTATile = getShapePerCTATile(dstLayout, dstTy.getShape());
-  std::cout << "!!!shapePerCTA: " << srcShapePerCTA.size() << " "
-            << dstShapePerCTA.size() << "\n";
-  std::cout << "!!!shapePerCTATile: " << srcShapePerCTATile.size() << " "
-            << dstShapePerCTATile.size() << "\n";
 
   unsigned rank = dstTy.getRank();
   SmallVector<unsigned> repShape(rank);
@@ -112,9 +105,7 @@ static SmallVector<unsigned> getRepShapeForAtomic(Value result) {
 ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
                                      RankedTensorType dstTy) {
   // Initialize vector sizes and stride
-  std::cout << "getRepShapeForCvt start\n";
   auto repShape = getRepShapeForCvt(srcTy, dstTy);
-  std::cout << "repShape rank: " << repShape.size() << "\n";
   if (repShape.empty())
     return ScratchConfig({}, {});
   ScratchConfig scratchConfig(repShape, repShape);
@@ -126,24 +117,13 @@ ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
 
   auto [inOrd, outOrd] = getCvtOrder(srcLayout, dstLayout);
   scratchConfig.order = outOrd;
-  std::cout << "inOrd: ";
-  for (auto i : inOrd) {
-    std::cout << i << " ";
-  }
-  std::cout << "rank: " << inOrd.size() << "\n";
-  std::cout << "outOrd: ";
-  for (auto i : outOrd) {
-    std::cout << i << " ";
-  }
-  std::cout << "rank: " << outOrd.size() << "\n";
 
   unsigned srcContigPerThread =
       getUniqueContigPerThread(srcLayout, srcTy.getShape())[inOrd[0]];
   unsigned dstContigPerThread =
       getUniqueContigPerThread(dstLayout, dstTy.getShape())[outOrd[0]];
   // TODO: Fix the legacy issue that ourOrd[0] == 0 always means
   // that we cannot do vectorization.
-  std::cout << "no index issue in getUniqueContigPerThread\n";
   unsigned innerDim = rank - 1;
   scratchConfig.inVec = outOrd[0] != innerDim ? 1
                         : inOrd[0] != innerDim ? 1
@@ -252,33 +232,27 @@ class AllocationAnalysis {
       maybeAddScratchBuffer<BufferT::BufferKind::Scratch>(op, bytes,
                                                           scratchAlignment);
     } else if (auto cvtLayout = dyn_cast<triton::gpu::ConvertLayoutOp>(op)) {
-      std::cout << "getScratchValueSize from ConvertLayoutOp\n";
       auto srcTy = cvtLayout.getSrc().getType();
       auto dstTy = cvtLayout.getType();
       auto srcEncoding = srcTy.getEncoding();
       auto dstEncoding = dstTy.getEncoding();
       if (mlir::isa<SharedEncodingAttr>(srcEncoding) ||
           mlir::isa<SharedEncodingAttr>(dstEncoding)) {
         // Conversions from/to shared memory do not need scratch memory.
-        std::cout << "-- ConvertLayoutOp from/to shared memory\n";
         return;
       }
       // ConvertLayoutOp with both input/output non-shared_layout
       // TODO: Besides of implementing ConvertLayoutOp via shared memory, it's
       // also possible to realize it with other approaches in restricted
       // conditions, such as warp-shuffle
-      std::cout << "-- getScratchConfigForCvt\n";
       auto scratchConfig = getScratchConfigForCvt(srcTy, dstTy);
-      std::cout << "-- getNumScratchElements\n";
       auto elems = getNumScratchElements(scratchConfig.paddedRepShape);
       auto bytes =
           isa<triton::PointerType>(srcTy.getElementType())
              ? elems * kPtrBitWidth / 8
              : elems * std::max<int>(8, srcTy.getElementTypeBitWidth()) / 8;
       maybeAddScratchBuffer<BufferT::BufferKind::Scratch>(op, bytes,
                                                           scratchAlignment);
-      std::cout << "-- ConvertLayoutOp from/to non-shared memory: " << bytes
-                << " bytes\n";
     } else if (isa<triton::AtomicRMWOp, triton::AtomicCASOp>(op)) {
       auto value = op->getOperand(0);
       // only scalar requires scratch memory
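
For context, the scratch-buffer size kept by the last hunk rounds sub-byte element types up to one byte per element. A minimal standalone sketch of that arithmetic (the helper and values below are hypothetical, not part of this commit):

#include <algorithm>
#include <cstdio>

// Hypothetical stand-in for the computation in AllocationAnalysis:
// bytes = elems * max(8, elementBitWidth) / 8 (pointer types use kPtrBitWidth).
static unsigned scratchBytes(unsigned elems, unsigned elementBitWidth) {
  return elems * std::max(8u, elementBitWidth) / 8;
}

int main() {
  // e.g. 128 scratch elements of i1 still need one byte each.
  std::printf("%u\n", scratchBytes(128, 1));  // 128
  std::printf("%u\n", scratchBytes(128, 16)); // 256
}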

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 0 additions & 12 deletions
@@ -1,7 +1,6 @@
 #include "triton/Dialect/Triton/IR/Dialect.h"
 
 #include <cstdint>
-#include <iostream>
 #include <numeric>
 
 #include "mlir/IR/DialectImplementation.h"
@@ -384,17 +383,6 @@ SmallVector<unsigned> getCTAOrder(Attribute layout) {
 SmallVector<int64_t> getShapePerCTA(ArrayRef<unsigned> CTASplitNum,
                                     ArrayRef<int64_t> shape) {
   unsigned rank = shape.size();
-  std::cout << "!!!GPU dialect - getShapePerCTA\n";
-  std::cout << "CTASplitNum: ";
-  for (auto i : CTASplitNum) {
-    std::cout << i << " ";
-  }
-  std::cout << "\nshape: ";
-  for (auto i : shape) {
-    std::cout << i << " ";
-  }
-  std::cout << "\n";
-
   SmallVector<int64_t> shapePerCTA(rank);
   for (unsigned i = 0; i < rank; ++i) {
     // This wrapping rule must be consistent with emitCTAOffsetForLayout

python/src/ir.cc

Lines changed: 1 addition & 1 deletion
@@ -1622,7 +1622,7 @@ void init_triton_ir(py::module &&m) {
         if (haveDump) {
           auto printingFlags = OpPrintingFlags();
           printingFlags.elideLargeElementsAttrs(16);
-          // printingFlags.enableDebugInfo();
+          printingFlags.enableDebugInfo();
           auto printAlways = [funcToDump](Pass *, Operation *op) -> bool {
             if (funcToDump.empty())
               return true;
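
Un-commenting enableDebugInfo() makes the IR dumps carry source-location info alongside each op. A minimal sketch of the same flags applied to a single op print (assumes MLIR headers are available; dumpWithLocations and op are hypothetical, not from this commit):

#include "mlir/IR/Operation.h"
#include "mlir/IR/OperationSupport.h"
#include "llvm/Support/raw_ostream.h"

// Hypothetical helper: print one operation with the flags the dump path uses.
void dumpWithLocations(mlir::Operation *op) {
  mlir::OpPrintingFlags printingFlags;
  printingFlags.elideLargeElementsAttrs(16); // shorten large constant attributes
  printingFlags.enableDebugInfo();           // append loc(...) to printed ops
  op->print(llvm::errs(), printingFlags);
}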

third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp

Lines changed: 11 additions & 18 deletions
@@ -1,6 +1,5 @@
 #include "triton/Dialect/Triton/IR/Dialect.h"
 
-#include <cstdint>
 #include <numeric>
 
 #include "intel/include/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.h"
@@ -13,9 +12,7 @@
 
 #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.cpp.inc"
 
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/TypeSwitch.h"
-#include "llvm/Support/ErrorHandling.h"
 
 using namespace mlir;
 using namespace mlir::triton;
@@ -105,32 +102,32 @@ SmallVector<unsigned> DpasEncodingAttr::getDPASInstShapeC() const {
 };
 
 SmallVector<unsigned> DpasEncodingAttr::getShapeA() const {
-  auto shapeA = getDPASInstShapeA();
+  auto instShapeA = getDPASInstShapeA();
   auto repCluster = getRepCluster();
   size_t rank = repCluster.size();
   SmallVector<unsigned> resShape(rank, 1);
-  resShape[rank - 2] = shapeA[0] * repCluster[rank - 2];
-  resShape[rank - 1] = shapeA[1];
+  resShape[rank - 2] = instShapeA[0] * repCluster[rank - 2];
+  resShape[rank - 1] = instShapeA[1];
   return resShape;
 }
 
 SmallVector<unsigned> DpasEncodingAttr::getShapeB() const {
-  auto shapeB = getDPASInstShapeB();
+  auto instShapeB = getDPASInstShapeB();
   auto repCluster = getRepCluster();
   size_t rank = repCluster.size();
   SmallVector<unsigned> resShape(rank, 1);
-  resShape[rank - 2] = shapeB[0];
-  resShape[rank - 1] = shapeB[1] * repCluster[rank - 1];
+  resShape[rank - 2] = instShapeB[0];
+  resShape[rank - 1] = instShapeB[1] * repCluster[rank - 1];
   return resShape;
 }
 
 SmallVector<unsigned> DpasEncodingAttr::getShapeC() const {
-  auto shapeC = getDPASInstShapeC();
+  auto instShapeC = getDPASInstShapeC();
   auto repCluster = getRepCluster();
   size_t rank = repCluster.size();
   SmallVector<unsigned> resShape(rank, 1);
-  resShape[rank - 2] = shapeC[0] * repCluster[rank - 2];
-  resShape[rank - 1] = shapeC[1] * repCluster[rank - 1];
+  resShape[rank - 2] = instShapeC[0] * repCluster[rank - 2];
+  resShape[rank - 1] = instShapeC[1] * repCluster[rank - 1];
   return resShape;
 }
 
@@ -193,12 +190,8 @@ SmallVector<unsigned> DpasEncodingAttr::getCTASplitNum() const {
 
 SmallVector<unsigned> DpasEncodingAttr::getCTAOrder() const {
   size_t rank = getWarpsPerCTA().size();
-  // auto res = llvm::to_vector(llvm::reverse(llvm::seq<unsigned>(rank)));
-  // return res;
-  if (rank == 3)
-    return {2, 1, 0};
-  else
-    return {1, 0};
+  auto res = llvm::to_vector(llvm::reverse(llvm::seq<unsigned>(rank)));
+  return res;
 }
 
 SmallVector<unsigned> DpasEncodingAttr::getCTAsPerCGA() const {
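
The restored getCTAOrder body replaces the hard-coded rank-2/rank-3 orders with a reversed index sequence, which yields {rank-1, ..., 1, 0} for any rank. A tiny standard-C++ sketch of the equivalent computation (hypothetical, for illustration only):

#include <cstdio>
#include <numeric>
#include <vector>

// Equivalent of llvm::to_vector(llvm::reverse(llvm::seq<unsigned>(rank))):
// the dimension order {rank - 1, ..., 1, 0}.
std::vector<unsigned> ctaOrder(unsigned rank) {
  std::vector<unsigned> order(rank);
  std::iota(order.rbegin(), order.rend(), 0u); // fill back-to-front with 0,1,...
  return order;
}

int main() {
  for (unsigned i : ctaOrder(3)) // prints 2 1 0, matching the old rank==3 case
    std::printf("%u ", i);
}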

third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp

Lines changed: 0 additions & 61 deletions
@@ -1,4 +1,3 @@
-#include <iostream>
 #include <vector>
 
 #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
@@ -8,9 +7,7 @@
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Tools/LinearLayout.h"
 #include "triton/Tools/StrUtil.h"
-#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
@@ -56,8 +53,6 @@ LinearLayout identityND(StringAttr inDimName, ArrayRef<unsigned> shape,
   LinearLayout ret = LinearLayout::empty();
   for (int i = 0; i < shape.size(); i++) {
     // Start with the most-minor dimension, which is order[0].
-    // std::cout << "i: " << i << " shape[i]: " << shape[i]
-    //           << " order[i]: " << order[i] << std::endl;
     int dim = order[i];
     ret *= LinearLayout::identity1D(shape[dim], inDimName, outDimNames[dim]);
   }
@@ -280,7 +275,6 @@ LinearLayout ensureLayoutNotSmallerThan(
     return layout;
   }
 
-  // MLIRContext *ctx = shape.begin()->first.getContext();
   StringAttr kDim = *layout.getInDimNames().begin();
   assert(kDim == "register" || kDim == "offset" && "unexpected kDim");
 
@@ -291,16 +285,6 @@ LinearLayout ensureLayoutNotSmallerThan(
     assert(actualSize > desiredSize ||
            desiredSize % actualSize == 0 && "bad shape");
     ret *= LinearLayout::identity1D(desiredSize / actualSize, kDim, outDimName);
-    // std::cout << "actualSize: " << actualSize << " desiredSize: " <<
-    // desiredSize
-    //           << std::endl;
-    // std::cout << "outDimName: " << outDimName.str() << std::endl;
-    // std::cout << "identity1D: "
-    //           << LinearLayout::identity1D(desiredSize / actualSize, kDim,
-    //                                       outDimName)
-    //                  .toString()
-    //           << std::endl;
-    // std::cout << "ret: " << ret.toString() << std::endl;
     assert(ret.getOutDimSize(outDimName) >= desiredSize && "bad grow");
   }
   return ret;
@@ -324,12 +308,6 @@ LinearLayout combineCtaCgaWithShape(LinearLayout ctaLayout,
 
   SmallVector<StringAttr> outDimNames = standardOutDimNames(ctx, rank);
 
-  std::cout << "shape: ";
-  for (auto s : shape) {
-    std::cout << s << ", ";
-  }
-  std::cout << std::endl;
-
   llvm::SmallDenseMap<StringAttr, int64_t> labeledShape;
   for (auto [dim, size] : llvm::zip(outDimNames, shape)) {
     labeledShape[dim] = size;
@@ -338,41 +316,26 @@ LinearLayout combineCtaCgaWithShape(LinearLayout ctaLayout,
   LinearLayout cgaLayout =
       ensureLayoutNotLargerThan(makeCgaLayout(cgaLayoutAttr), labeledShape)
           .transposeOuts(llvm::to_vector(ctaLayout.getOutDimNames()));
-  // std::cout << "\ncgaLayout: " << cgaLayout.toString() << std::endl;
 
   // Calculate the shape of the ctaLayout, which is `shape` divided by the
   // cgaLayout's size.
   llvm::SmallDenseMap<StringAttr, int64_t> ctaShape;
   assert(llvm::to_vector(ctaLayout.getOutDimNames()) ==
              llvm::to_vector(cgaLayout.getOutDimNames()) &&
          "bad layout");
-
-  // std::cout << "ctaShape: ";
   for (auto dim : ctaLayout.getOutDimNames()) {
     ctaShape[dim] =
         std::max(int64_t{1}, labeledShape[dim] / cgaLayout.getOutDimSize(dim));
-    // std::cout << ctaShape[dim] << ", ";
   }
-  // std::cout << std::endl;
 
-  std::cout << "ensureLayoutNotSmallerThan start" << std::endl;
   ctaLayout = ensureLayoutNotSmallerThan(ctaLayout, ctaShape);
-  // std::cout << "\nctaLayout not smaller than: " << ctaLayout.toString()
-  //           << std::endl;
-  std::cout << "ensureLayoutNotLargerThan start" << std::endl;
   ctaLayout = ensureLayoutNotLargerThan(ctaLayout, ctaShape);
-  // std::cout << "\nctaLayout not larger than: " << ctaLayout.toString()
-  //           << std::endl;
 
-  // std::cout << "\ncta * cga: " << (ctaLayout * cgaLayout).toString()
-  //           << std::endl;
   LinearLayout ret =
       (std::move(ctaLayout) * std::move(cgaLayout)).transposeOuts(outDimNames);
   for (auto dim : ret.getOutDimNames()) {
     assert(ret.getOutDimSize(dim) == labeledShape[dim] && "bad shape");
   }
-  // std::cout << "\ncombineCtaCgaWithShape: " << ret.toString() << std::endl;
-  std::cout << "combineCtaCgaWithShape end" << std::endl;
   return ret;
 }
 
@@ -569,7 +532,6 @@ LinearLayout DPAStoLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
         LinearLayout::identity1D(warpsPerCTA[0], kWarp, outDimNames[0]);
 
   } else if (opIdx == 1) { // Operand B
-    std::cout << "\nOperand B" << std::endl;
     auto regBasesB = DPASRegBasesB(opsPerChannel, executionSize, threadsPerWarp,
                                    systolicDepth);
     auto laneBasesB =
@@ -591,32 +553,20 @@ LinearLayout DPAStoLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
     tileLayout *=
         LinearLayout::identity1D(warpsPerCTA[0], kWarp, outDimNames[0]);
   } else { // opIdx=2 -> Operand C
-    std::cout << "\nOperand C" << std::endl;
     auto regBasesC = DPASRegBasesC(repeatCount, executionSize, threadsPerWarp);
     auto laneBasesC =
        DPASLaneBasesC(repeatCount, executionSize, threadsPerWarp);
     tileLayout = LinearLayout({{kRegister, regBasesC}, {kLane, laneBasesC}},
                               ArrayRef(outDimNames).take_back(2));
-    // std::cout << tileLayout.toString() << std::endl;
     // The per-inst layout is repeated at each repCluster.
     // Hence, multiply with the identity layouts starting from the
     // least significant dimension.
     dimNonK = rank - 2;
     dimK = rank - 1;
     tileLayout *= LinearLayout::identity1D(repCluster[dimK], kRegister,
                                            outDimNames[dimK]);
-    // std::cout << (LinearLayout::identity1D(repCluster[dimK], kRegister,
-    //                                        outDimNames[dimK])
-    //                   .toString())
-    //           << std::endl;
-    // std::cout << (tileLayout.toString()) << std::endl;
     tileLayout *= LinearLayout::identity1D(repCluster[dimNonK], kRegister,
                                            outDimNames[dimNonK]);
-    // std::cout << (LinearLayout::identity1D(repCluster[dimNonK], kRegister,
-    //                                        outDimNames[dimNonK])
-    //                   .toString())
-    //           << std::endl;
-    // std::cout << (tileLayout.toString()) << std::endl;
 
     // // The identical layout is repeated among warps
     tileLayout *=
@@ -626,34 +576,23 @@ LinearLayout DPAStoLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
     if (rank == 3)
       tileLayout *=
           LinearLayout::identity1D(warpsPerCTA[0], kWarp, outDimNames[0]);
-    // std::cout << (tileLayout.toString()) << std::endl;
   }
 
   // Lastly, the layout repeats to match the shape.
   // Operand A/B repeats through the K-dimension first then repeats
   // through the non-K dimension.
   SmallVector<int64_t> numReps = dpas.getDPASRepetitions(shape, opIdx);
 
-  std::cout << "numReps: ";
-  for (auto numRep : numReps) {
-    std::cout << numRep << ", ";
-  }
-  std::cout << std::endl;
-
   // numReps is always 3D, we should add 1 to dim id when rank is 2
   int repDimK = rank == 2 ? dimK + 1 : dimK;
   int repDimNonK = rank == 2 ? dimNonK + 1 : dimNonK;
   tileLayout *=
       LinearLayout::identity1D(numReps[repDimK], kRegister, outDimNames[dimK]);
   tileLayout *= LinearLayout::identity1D(numReps[repDimNonK], kRegister,
                                          outDimNames[dimNonK]);
-  std::cout << "rank: " << rank << std::endl;
   if (rank == 3)
     tileLayout *=
         LinearLayout::identity1D(numReps[0], kRegister, outDimNames[0]);
-  // std::cout << "\ntileLayout with DPASRepetition: " <<
-  // (tileLayout.toString())
-  //           << std::endl;
 
   return combineCtaCgaWithShape(std::move(tileLayout),
                                 CTALayoutAttr::getDefault(ctx, rank), shape);
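
In the last kept hunk, numReps is always 3-D while the operand layout may be rank 2, so the K and non-K dimension ids are shifted by one before indexing it. A small sketch of that index mapping (repIndex is a hypothetical stand-in, values illustrative only):

#include <cassert>

// Mirrors `repDimK = rank == 2 ? dimK + 1 : dimK` from the diff: for a rank-2
// operand, dims {0, 1} map onto the trailing two entries of the 3-D numReps.
int repIndex(int rank, int dim) { return rank == 2 ? dim + 1 : dim; }

int main() {
  // rank 2: dimNonK = 0, dimK = 1 -> numReps[1], numReps[2]
  assert(repIndex(2, 0) == 1 && repIndex(2, 1) == 2);
  // rank 3: dims index numReps directly
  assert(repIndex(3, 2) == 2);
}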
