
Commit 19cbbbc

Cleanup
1 parent 77422cb commit 19cbbbc

File tree

12 files changed: +17 −176 lines


lib/Analysis/Allocation.cpp

Lines changed: 0 additions & 26 deletions
@@ -1,7 +1,6 @@
 #include "triton/Analysis/Allocation.h"
 
 #include <algorithm>
-#include <iostream>
 #include <limits>
 #include <numeric>
 
@@ -14,7 +13,6 @@
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/Triton/IR/Utility.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 
 using ::mlir::triton::gpu::AMDMfmaEncodingAttr;
@@ -65,7 +63,6 @@ static SmallVector<unsigned> getRepShapeForCvt(RankedTensorType srcTy,
                                                RankedTensorType dstTy) {
   Attribute srcLayout = srcTy.getEncoding();
   Attribute dstLayout = dstTy.getEncoding();
-  std::cout << "- in getRepShapeForCvt\n";
 
   if (!cvtNeedsSharedMemory(srcTy, dstTy)) {
     return {};
@@ -82,10 +79,6 @@ static SmallVector<unsigned> getRepShapeForCvt(RankedTensorType srcTy,
   auto dstShapePerCTA = getShapePerCTA(dstTy);
   auto srcShapePerCTATile = getShapePerCTATile(srcLayout, srcTy.getShape());
   auto dstShapePerCTATile = getShapePerCTATile(dstLayout, dstTy.getShape());
-  std::cout << "!!!shapePerCTA: " << srcShapePerCTA.size() << " "
-            << dstShapePerCTA.size() << "\n";
-  std::cout << "!!!shapePerCTATile: " << srcShapePerCTATile.size() << " "
-            << dstShapePerCTATile.size() << "\n";
 
   unsigned rank = dstTy.getRank();
   SmallVector<unsigned> repShape(rank);
@@ -112,9 +105,7 @@ static SmallVector<unsigned> getRepShapeForAtomic(Value result) {
 ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
                                      RankedTensorType dstTy) {
   // Initialize vector sizes and stride
-  std::cout << "getRepShapeForCvt start\n";
   auto repShape = getRepShapeForCvt(srcTy, dstTy);
-  std::cout << "repShape rank: " << repShape.size() << "\n";
   if (repShape.empty())
     return ScratchConfig({}, {});
   ScratchConfig scratchConfig(repShape, repShape);
@@ -126,24 +117,13 @@ ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
 
   auto [inOrd, outOrd] = getCvtOrder(srcLayout, dstLayout);
   scratchConfig.order = outOrd;
-  std::cout << "inOrd: ";
-  for (auto i : inOrd) {
-    std::cout << i << " ";
-  }
-  std::cout << "rank: " << inOrd.size() << "\n";
-  std::cout << "outOrd: ";
-  for (auto i : outOrd) {
-    std::cout << i << " ";
-  }
-  std::cout << "rank: " << outOrd.size() << "\n";
 
   unsigned srcContigPerThread =
       getUniqueContigPerThread(srcLayout, srcTy.getShape())[inOrd[0]];
   unsigned dstContigPerThread =
       getUniqueContigPerThread(dstLayout, dstTy.getShape())[outOrd[0]];
   // TODO: Fix the legacy issue that ourOrd[0] == 0 always means
   // that we cannot do vectorization.
-  std::cout << "no index issue in getUniqueContigPerThread\n";
   unsigned innerDim = rank - 1;
   scratchConfig.inVec = outOrd[0] != innerDim ? 1
                         : inOrd[0] != innerDim ? 1
@@ -252,33 +232,27 @@ class AllocationAnalysis {
       maybeAddScratchBuffer<BufferT::BufferKind::Scratch>(op, bytes,
                                                           scratchAlignment);
     } else if (auto cvtLayout = dyn_cast<triton::gpu::ConvertLayoutOp>(op)) {
-      std::cout << "getScratchValueSize from ConvertLayoutOp\n";
       auto srcTy = cvtLayout.getSrc().getType();
       auto dstTy = cvtLayout.getType();
       auto srcEncoding = srcTy.getEncoding();
       auto dstEncoding = dstTy.getEncoding();
       if (mlir::isa<SharedEncodingAttr>(srcEncoding) ||
           mlir::isa<SharedEncodingAttr>(dstEncoding)) {
         // Conversions from/to shared memory do not need scratch memory.
-        std::cout << "-- ConvertLayoutOp from/to shared memory\n";
         return;
       }
       // ConvertLayoutOp with both input/output non-shared_layout
       // TODO: Besides of implementing ConvertLayoutOp via shared memory, it's
       // also possible to realize it with other approaches in restricted
       // conditions, such as warp-shuffle
-      std::cout << "-- getScratchConfigForCvt\n";
       auto scratchConfig = getScratchConfigForCvt(srcTy, dstTy);
-      std::cout << "-- getNumScratchElements\n";
       auto elems = getNumScratchElements(scratchConfig.paddedRepShape);
       auto bytes =
           isa<triton::PointerType>(srcTy.getElementType())
              ? elems * kPtrBitWidth / 8
              : elems * std::max<int>(8, srcTy.getElementTypeBitWidth()) / 8;
       maybeAddScratchBuffer<BufferT::BufferKind::Scratch>(op, bytes,
                                                           scratchAlignment);
-      std::cout << "-- ConvertLayoutOp from/to non-shared memory: " << bytes
-                << " bytes\n";
     } else if (isa<triton::AtomicRMWOp, triton::AtomicCASOp>(op)) {
       auto value = op->getOperand(0);
       // only scalar requires scratch memory
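
For context, the scratch-buffer size kept by the last hunk rounds sub-byte element types up to one byte per element. A minimal standalone sketch of that arithmetic (the helper and values below are hypothetical, not part of this commit):

#include <algorithm>
#include <cstdio>

// Hypothetical stand-in for the computation in AllocationAnalysis:
// bytes = elems * max(8, elementBitWidth) / 8 (pointer types use kPtrBitWidth).
static unsigned scratchBytes(unsigned elems, unsigned elementBitWidth) {
  return elems * std::max(8u, elementBitWidth) / 8;
}

int main() {
  // e.g. 128 scratch elements of i1 still need one byte each.
  std::printf("%u\n", scratchBytes(128, 1));  // 128
  std::printf("%u\n", scratchBytes(128, 16)); // 256
}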

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 0 additions & 12 deletions
@@ -1,7 +1,6 @@
 #include "triton/Dialect/Triton/IR/Dialect.h"
 
 #include <cstdint>
-#include <iostream>
 #include <numeric>
 
 #include "mlir/IR/DialectImplementation.h"
@@ -384,17 +383,6 @@ SmallVector<unsigned> getCTAOrder(Attribute layout) {
 SmallVector<int64_t> getShapePerCTA(ArrayRef<unsigned> CTASplitNum,
                                     ArrayRef<int64_t> shape) {
   unsigned rank = shape.size();
-  std::cout << "!!!GPU dialect - getShapePerCTA\n";
-  std::cout << "CTASplitNum: ";
-  for (auto i : CTASplitNum) {
-    std::cout << i << " ";
-  }
-  std::cout << "\nshape: ";
-  for (auto i : shape) {
-    std::cout << i << " ";
-  }
-  std::cout << "\n";
-
   SmallVector<int64_t> shapePerCTA(rank);
   for (unsigned i = 0; i < rank; ++i) {
     // This wrapping rule must be consistent with emitCTAOffsetForLayout

python/src/ir.cc

Lines changed: 1 addition & 1 deletion
@@ -1622,7 +1622,7 @@ void init_triton_ir(py::module &&m) {
         if (haveDump) {
           auto printingFlags = OpPrintingFlags();
           printingFlags.elideLargeElementsAttrs(16);
-          // printingFlags.enableDebugInfo();
+          printingFlags.enableDebugInfo();
           auto printAlways = [funcToDump](Pass *, Operation *op) -> bool {
             if (funcToDump.empty())
               return true;
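
Un-commenting enableDebugInfo() makes the IR dumps carry source-location info alongside each op. A minimal sketch of the same flags applied to a single op print (assumes MLIR headers are available; dumpWithLocations and op are hypothetical, not from this commit):

#include "mlir/IR/Operation.h"
#include "mlir/IR/OperationSupport.h"
#include "llvm/Support/raw_ostream.h"

// Hypothetical helper: print one operation with the flags the dump path uses.
void dumpWithLocations(mlir::Operation *op) {
  mlir::OpPrintingFlags printingFlags;
  printingFlags.elideLargeElementsAttrs(16); // shorten large constant attributes
  printingFlags.enableDebugInfo();           // append loc(...) to printed ops
  op->print(llvm::errs(), printingFlags);
}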

third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp

Lines changed: 11 additions & 18 deletions
@@ -1,6 +1,5 @@
 #include "triton/Dialect/Triton/IR/Dialect.h"
 
-#include <cstdint>
 #include <numeric>
 
 #include "intel/include/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.h"
@@ -13,9 +12,7 @@
 
 #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.cpp.inc"
 
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/TypeSwitch.h"
-#include "llvm/Support/ErrorHandling.h"
 
 using namespace mlir;
 using namespace mlir::triton;
@@ -105,32 +102,32 @@ SmallVector<unsigned> DpasEncodingAttr::getDPASInstShapeC() const {
 };
 
 SmallVector<unsigned> DpasEncodingAttr::getShapeA() const {
-  auto shapeA = getDPASInstShapeA();
+  auto instShapeA = getDPASInstShapeA();
   auto repCluster = getRepCluster();
   size_t rank = repCluster.size();
   SmallVector<unsigned> resShape(rank, 1);
-  resShape[rank - 2] = shapeA[0] * repCluster[rank - 2];
-  resShape[rank - 1] = shapeA[1];
+  resShape[rank - 2] = instShapeA[0] * repCluster[rank - 2];
+  resShape[rank - 1] = instShapeA[1];
   return resShape;
 }
 
 SmallVector<unsigned> DpasEncodingAttr::getShapeB() const {
-  auto shapeB = getDPASInstShapeB();
+  auto instShapeB = getDPASInstShapeB();
   auto repCluster = getRepCluster();
   size_t rank = repCluster.size();
   SmallVector<unsigned> resShape(rank, 1);
-  resShape[rank - 2] = shapeB[0];
-  resShape[rank - 1] = shapeB[1] * repCluster[rank - 1];
+  resShape[rank - 2] = instShapeB[0];
+  resShape[rank - 1] = instShapeB[1] * repCluster[rank - 1];
   return resShape;
 }
 
 SmallVector<unsigned> DpasEncodingAttr::getShapeC() const {
-  auto shapeC = getDPASInstShapeC();
+  auto instShapeC = getDPASInstShapeC();
   auto repCluster = getRepCluster();
   size_t rank = repCluster.size();
   SmallVector<unsigned> resShape(rank, 1);
-  resShape[rank - 2] = shapeC[0] * repCluster[rank - 2];
-  resShape[rank - 1] = shapeC[1] * repCluster[rank - 1];
+  resShape[rank - 2] = instShapeC[0] * repCluster[rank - 2];
+  resShape[rank - 1] = instShapeC[1] * repCluster[rank - 1];
   return resShape;
 }
 
@@ -193,12 +190,8 @@ SmallVector<unsigned> DpasEncodingAttr::getCTASplitNum() const {
 
 SmallVector<unsigned> DpasEncodingAttr::getCTAOrder() const {
   size_t rank = getWarpsPerCTA().size();
-  // auto res = llvm::to_vector(llvm::reverse(llvm::seq<unsigned>(rank)));
-  // return res;
-  if (rank == 3)
-    return {2, 1, 0};
-  else
-    return {1, 0};
+  auto res = llvm::to_vector(llvm::reverse(llvm::seq<unsigned>(rank)));
+  return res;
 }
 
 SmallVector<unsigned> DpasEncodingAttr::getCTAsPerCGA() const {
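
The restored getCTAOrder body replaces the hard-coded rank-2/rank-3 orders with a reversed index sequence, which yields {rank-1, ..., 1, 0} for any rank. A tiny standard-C++ sketch of the equivalent computation (hypothetical, for illustration only):

#include <cstdio>
#include <numeric>
#include <vector>

// Equivalent of llvm::to_vector(llvm::reverse(llvm::seq<unsigned>(rank))):
// the dimension order {rank - 1, ..., 1, 0}.
std::vector<unsigned> ctaOrder(unsigned rank) {
  std::vector<unsigned> order(rank);
  std::iota(order.rbegin(), order.rend(), 0u); // fill back-to-front with 0,1,...
  return order;
}

int main() {
  for (unsigned i : ctaOrder(3)) // prints 2 1 0, matching the old rank==3 case
    std::printf("%u ", i);
}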

third_party/intel/lib/Dialect/TritonIntelGPU/IR/LinearLayoutConversions.cpp

Lines changed: 0 additions & 61 deletions
@@ -1,4 +1,3 @@
-#include <iostream>
 #include <vector>
 
 #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
@@ -8,9 +7,7 @@
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Tools/LinearLayout.h"
 #include "triton/Tools/StrUtil.h"
-#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
@@ -56,8 +53,6 @@ LinearLayout identityND(StringAttr inDimName, ArrayRef<unsigned> shape,
   LinearLayout ret = LinearLayout::empty();
   for (int i = 0; i < shape.size(); i++) {
     // Start with the most-minor dimension, which is order[0].
-    // std::cout << "i: " << i << " shape[i]: " << shape[i]
-    //           << " order[i]: " << order[i] << std::endl;
     int dim = order[i];
     ret *= LinearLayout::identity1D(shape[dim], inDimName, outDimNames[dim]);
   }
@@ -280,7 +275,6 @@ LinearLayout ensureLayoutNotSmallerThan(
     return layout;
   }
 
-  // MLIRContext *ctx = shape.begin()->first.getContext();
   StringAttr kDim = *layout.getInDimNames().begin();
   assert(kDim == "register" || kDim == "offset" && "unexpected kDim");
 
@@ -291,16 +285,6 @@ LinearLayout ensureLayoutNotSmallerThan(
     assert(actualSize > desiredSize ||
            desiredSize % actualSize == 0 && "bad shape");
     ret *= LinearLayout::identity1D(desiredSize / actualSize, kDim, outDimName);
-    // std::cout << "actualSize: " << actualSize << " desiredSize: " <<
-    // desiredSize
-    //           << std::endl;
-    // std::cout << "outDimName: " << outDimName.str() << std::endl;
-    // std::cout << "identity1D: "
-    //           << LinearLayout::identity1D(desiredSize / actualSize, kDim,
-    //                                       outDimName)
-    //                  .toString()
-    //           << std::endl;
-    // std::cout << "ret: " << ret.toString() << std::endl;
     assert(ret.getOutDimSize(outDimName) >= desiredSize && "bad grow");
   }
   return ret;
@@ -324,12 +308,6 @@ LinearLayout combineCtaCgaWithShape(LinearLayout ctaLayout,
 
   SmallVector<StringAttr> outDimNames = standardOutDimNames(ctx, rank);
 
-  std::cout << "shape: ";
-  for (auto s : shape) {
-    std::cout << s << ", ";
-  }
-  std::cout << std::endl;
-
   llvm::SmallDenseMap<StringAttr, int64_t> labeledShape;
   for (auto [dim, size] : llvm::zip(outDimNames, shape)) {
     labeledShape[dim] = size;
@@ -338,41 +316,26 @@ LinearLayout combineCtaCgaWithShape(LinearLayout ctaLayout,
   LinearLayout cgaLayout =
       ensureLayoutNotLargerThan(makeCgaLayout(cgaLayoutAttr), labeledShape)
           .transposeOuts(llvm::to_vector(ctaLayout.getOutDimNames()));
-  // std::cout << "\ncgaLayout: " << cgaLayout.toString() << std::endl;
 
   // Calculate the shape of the ctaLayout, which is `shape` divided by the
   // cgaLayout's size.
   llvm::SmallDenseMap<StringAttr, int64_t> ctaShape;
   assert(llvm::to_vector(ctaLayout.getOutDimNames()) ==
              llvm::to_vector(cgaLayout.getOutDimNames()) &&
          "bad layout");
-
-  // std::cout << "ctaShape: ";
   for (auto dim : ctaLayout.getOutDimNames()) {
     ctaShape[dim] =
         std::max(int64_t{1}, labeledShape[dim] / cgaLayout.getOutDimSize(dim));
-    // std::cout << ctaShape[dim] << ", ";
   }
-  // std::cout << std::endl;
 
-  std::cout << "ensureLayoutNotSmallerThan start" << std::endl;
   ctaLayout = ensureLayoutNotSmallerThan(ctaLayout, ctaShape);
-  // std::cout << "\nctaLayout not smaller than: " << ctaLayout.toString()
-  //           << std::endl;
-  std::cout << "ensureLayoutNotLargerThan start" << std::endl;
   ctaLayout = ensureLayoutNotLargerThan(ctaLayout, ctaShape);
-  // std::cout << "\nctaLayout not larger than: " << ctaLayout.toString()
-  //           << std::endl;
 
-  // std::cout << "\ncta * cga: " << (ctaLayout * cgaLayout).toString()
-  //           << std::endl;
   LinearLayout ret =
       (std::move(ctaLayout) * std::move(cgaLayout)).transposeOuts(outDimNames);
   for (auto dim : ret.getOutDimNames()) {
     assert(ret.getOutDimSize(dim) == labeledShape[dim] && "bad shape");
   }
-  // std::cout << "\ncombineCtaCgaWithShape: " << ret.toString() << std::endl;
-  std::cout << "combineCtaCgaWithShape end" << std::endl;
   return ret;
 }
 
@@ -569,7 +532,6 @@ LinearLayout DPAStoLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
         LinearLayout::identity1D(warpsPerCTA[0], kWarp, outDimNames[0]);
 
   } else if (opIdx == 1) { // Operand B
-    std::cout << "\nOperand B" << std::endl;
     auto regBasesB = DPASRegBasesB(opsPerChannel, executionSize, threadsPerWarp,
                                    systolicDepth);
     auto laneBasesB =
@@ -591,32 +553,20 @@ LinearLayout DPAStoLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
     tileLayout *=
         LinearLayout::identity1D(warpsPerCTA[0], kWarp, outDimNames[0]);
   } else { // opIdx=2 -> Operand C
-    std::cout << "\nOperand C" << std::endl;
     auto regBasesC = DPASRegBasesC(repeatCount, executionSize, threadsPerWarp);
     auto laneBasesC =
        DPASLaneBasesC(repeatCount, executionSize, threadsPerWarp);
     tileLayout = LinearLayout({{kRegister, regBasesC}, {kLane, laneBasesC}},
                               ArrayRef(outDimNames).take_back(2));
-    // std::cout << tileLayout.toString() << std::endl;
     // The per-inst layout is repeated at each repCluster.
     // Hence, multiply with the identity layouts starting from the
     // least significant dimension.
     dimNonK = rank - 2;
     dimK = rank - 1;
     tileLayout *= LinearLayout::identity1D(repCluster[dimK], kRegister,
                                            outDimNames[dimK]);
-    // std::cout << (LinearLayout::identity1D(repCluster[dimK], kRegister,
-    //                                        outDimNames[dimK])
-    //                   .toString())
-    //           << std::endl;
-    // std::cout << (tileLayout.toString()) << std::endl;
     tileLayout *= LinearLayout::identity1D(repCluster[dimNonK], kRegister,
                                            outDimNames[dimNonK]);
-    // std::cout << (LinearLayout::identity1D(repCluster[dimNonK], kRegister,
-    //                                        outDimNames[dimNonK])
-    //                   .toString())
-    //           << std::endl;
-    // std::cout << (tileLayout.toString()) << std::endl;
 
     // // The identical layout is repeated among warps
     tileLayout *=
@@ -626,34 +576,23 @@ LinearLayout DPAStoLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
     if (rank == 3)
       tileLayout *=
           LinearLayout::identity1D(warpsPerCTA[0], kWarp, outDimNames[0]);
-    // std::cout << (tileLayout.toString()) << std::endl;
   }
 
   // Lastly, the layout repeats to match the shape.
   // Operand A/B repeats through the K-dimension first then repeats
   // through the non-K dimension.
   SmallVector<int64_t> numReps = dpas.getDPASRepetitions(shape, opIdx);
 
-  std::cout << "numReps: ";
-  for (auto numRep : numReps) {
-    std::cout << numRep << ", ";
-  }
-  std::cout << std::endl;
-
   // numReps is always 3D, we should add 1 to dim id when rank is 2
   int repDimK = rank == 2 ? dimK + 1 : dimK;
   int repDimNonK = rank == 2 ? dimNonK + 1 : dimNonK;
   tileLayout *=
       LinearLayout::identity1D(numReps[repDimK], kRegister, outDimNames[dimK]);
   tileLayout *= LinearLayout::identity1D(numReps[repDimNonK], kRegister,
                                          outDimNames[dimNonK]);
-  std::cout << "rank: " << rank << std::endl;
   if (rank == 3)
     tileLayout *=
         LinearLayout::identity1D(numReps[0], kRegister, outDimNames[0]);
-  // std::cout << "\ntileLayout with DPASRepetition: " <<
-  // (tileLayout.toString())
-  //           << std::endl;
 
   return combineCtaCgaWithShape(std::move(tileLayout),
                                 CTALayoutAttr::getDefault(ctx, rank), shape);
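
In the last kept hunk, numReps is always 3-D while the operand layout may be rank 2, so the K and non-K dimension ids are shifted by one before indexing it. A small sketch of that index mapping (repIndex is a hypothetical stand-in, values illustrative only):

#include <cassert>

// Mirrors `repDimK = rank == 2 ? dimK + 1 : dimK` from the diff: for a rank-2
// operand, dims {0, 1} map onto the trailing two entries of the 3-D numReps.
int repIndex(int rank, int dim) { return rank == 2 ? dim + 1 : dim; }

int main() {
  // rank 2: dimNonK = 0, dimK = 1 -> numReps[1], numReps[2]
  assert(repIndex(2, 0) == 1 && repIndex(2, 1) == 2);
  // rank 3: dims index numReps directly
  assert(repIndex(3, 2) == 2);
}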
