
Commit 5d23561

[Intel] Remove emitIndices (#3533)
We no longer need the Intel-specific `emitIndices`, as the common `emitIndices` now uses linear layout.

Signed-off-by: Whitney Tsang <[email protected]>
1 parent 8375773 commit 5d23561

7 files changed: +17 -134 lines changed
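Every call site keeps its arguments and only drops the `::intel::` / `mlir::triton::intel::` qualification, so name lookup now resolves to the common, linear-layout-based helper. A minimal sketch of the resulting call pattern (the variable names are placeholders for values each conversion pattern already has in scope):

// Common helper shared across backends; indices are derived via LinearLayout.
// Returns an [elemsPerThread x rank] matrix of per-element coordinates.
SmallVector<SmallVector<Value>> indices =
    emitIndices(loc, rewriter, targetInfo, tensorType.getEncoding(),
                tensorType, /*withCTAOffset=*/true);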

third_party/intel/lib/TritonIntelGPUToLLVM/HistogramOpToLLVM.cpp

Lines changed: 2 additions & 2 deletions
@@ -189,8 +189,8 @@ struct HistogramOpConversion
         LLVM::getSharedMemoryBase(loc, rewriter, targetInfo, op.getOperation());
     auto dstType = op.getType();
     Attribute dstEncoding = dstType.getEncoding();
-    auto indices = ::intel::emitIndices(op.getLoc(), rewriter, targetInfo,
-                                        dstEncoding, dstType, true);
+    auto indices = emitIndices(op.getLoc(), rewriter, targetInfo, dstEncoding,
+                               dstType, true);
     SmallVector<Value> innerDimIndices;
     for (int i = 0; i < indices.size(); ++i)
       innerDimIndices.push_back(indices[i][0]);

third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 2 additions & 2 deletions
@@ -217,8 +217,8 @@ struct LoadStoreConversionBase {
     unsigned numElems = getTotalElemsPerThread(tensorType);
 
     // Get the LLVM values for indices in block
-    auto indices = mlir::triton::intel::emitIndices(
-        loc, rewriter, targetInfo, tensorType.getEncoding(), tensorType, true);
+    auto indices = emitIndices(loc, rewriter, targetInfo,
+                               tensorType.getEncoding(), tensorType, true);
 
     auto linearize =
         [](ArrayRef<Value> A, ArrayRef<Value> B, Value init,

third_party/intel/lib/TritonIntelGPUToLLVM/MakeRangeOpToLLVM.cpp

Lines changed: 1 addition & 2 deletions
@@ -24,8 +24,7 @@ struct MakeRangeOpConversion
     auto elemTy = ty.getElementType();
     assert(elemTy.isInteger(32));
     Value start = createIndexAttrConstant(rewriter, loc, elemTy, op.getStart());
-    auto idxs =
-        ::intel::emitIndices(loc, rewriter, targetInfo, layout, ty, true);
+    auto idxs = emitIndices(loc, rewriter, targetInfo, layout, ty, true);
     unsigned elems = idxs.size();
     SmallVector<Value> retVals(elems);
     // TODO: slice layout has more elements than expected.

third_party/intel/lib/TritonIntelGPUToLLVM/PrintOpToLLVM.cpp

Lines changed: 2 additions & 2 deletions
@@ -65,8 +65,8 @@ struct PrintOpConversion
       SmallVector<SmallVector<Value>> indices;
       if (auto rankedTy =
              dyn_cast<RankedTensorType>(op.getOperand(i).getType())) {
-        indices = ::intel::emitIndices(loc, rewriter, targetInfo,
-                                       rankedTy.getEncoding(), rankedTy, true);
+        indices = emitIndices(loc, rewriter, targetInfo, rankedTy.getEncoding(),
+                              rankedTy, true);
         for (int64_t dim : rankedTy.getShape()) {
           if (dim > 0) {
             dimWidths.push_back(static_cast<int>(std::ceil(std::log10(dim))));

third_party/intel/lib/TritonIntelGPUToLLVM/ReduceOpToLLVM.cpp

Lines changed: 6 additions & 7 deletions
@@ -127,7 +127,7 @@ struct ReduceOpConversion
     RankedTensorType operandType = op.getInputTypes()[0];
     // Assumes offsets don't actually depend on type
     SmallVector<SmallVector<unsigned>> offsets =
-        ::intel::emitOffsetForLayout(helper.getSrcLayout(), operandType);
+        emitOffsetForLayout(helper.getSrcLayout(), operandType);
 
     // Thread X might hold the same input value in two registers. Get the
     // indices in `offsets` that hold unique values, and only accumulate over
@@ -139,9 +139,8 @@ struct ReduceOpConversion
 
     unsigned srcElems = getTotalElemsPerThread(operandType);
     auto *combineOp = &op.getCombineOp();
-    auto srcIndices =
-        ::intel::emitIndices(op.getLoc(), rewriter, targetInfo,
-                             helper.getSrcLayout(), operandType, true);
+    auto srcIndices = emitIndices(op.getLoc(), rewriter, targetInfo,
+                                  helper.getSrcLayout(), operandType, true);
     // reduce within threads
     for (const auto &[_, i] : uniqueOffsets) {
       SmallVector<unsigned> key = offsets[i];
@@ -204,7 +203,7 @@ struct ReduceOpConversion
     auto resultLayout = cast<SliceEncodingAttr>(resultTy.getEncoding());
     unsigned resultElems = getTotalElemsPerThread(resultTy);
     SmallVector<SmallVector<unsigned>> resultOffset =
-        ::intel::emitOffsetForLayout(resultLayout, resultTy);
+        emitOffsetForLayout(resultLayout, resultTy);
     SmallVector<Value> resultVals;
     for (int j = 0; j < resultElems; j++) {
       auto key = resultOffset[j];
@@ -381,8 +380,8 @@ struct ReduceOpConversion
     // nd-tensor where n >= 1
     auto resultLayout = cast<SliceEncodingAttr>(resultTy.getEncoding());
     unsigned resultElems = getTotalElemsPerThread(resultTy);
-    auto resultIndices = ::intel::emitIndices(loc, rewriter, targetInfo,
-                                              resultLayout, resultTy, true);
+    auto resultIndices = emitIndices(loc, rewriter, targetInfo,
+                                     resultLayout, resultTy, true);
     auto resultShape = resultTy.getShape();
     auto resultCTATile = getShapePerCTATile(resultLayout);
     assert(resultIndices.size() == resultElems);

third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h

Lines changed: 0 additions & 111 deletions
@@ -411,117 +411,6 @@ emitBaseIndexForDpasLayout(Location loc, RewriterBase &rewriter,
 
 namespace mlir::triton::intel {
 
-inline SmallVector<SmallVector<unsigned>>
-emitOffsetForLayout(Attribute layout, RankedTensorType type);
-
-// -----------------------------------------------------------------------
-// Get offsets / indices for any layout
-// -----------------------------------------------------------------------
-
-inline SmallVector<Value>
-emitBaseIndexForLayoutImpl(Location loc, RewriterBase &rewriter,
-                           const TargetInfoBase &target, Attribute layout,
-                           RankedTensorType type, bool withCTAOffset) {
-  auto b = TritonLLVMOpBuilder(loc, rewriter);
-  auto shape = type.getShape();
-
-  SmallVector<Value> baseIndex;
-  RewriterBase::InsertionGuard guard(rewriter);
-  SmallVector<Value> result;
-  if (auto dpasLayout = dyn_cast<DpasEncodingAttr>(layout)) {
-    result = emitBaseIndexForDpasLayout(loc, rewriter, dpasLayout, type);
-  } else if (auto sliceLayout = dyn_cast<SliceEncodingAttr>(layout)) {
-    auto parentLayout = sliceLayout.getParent();
-    auto parentShape = sliceLayout.paddedShape(type.getShape());
-    RankedTensorType parentTy =
-        RankedTensorType::get(parentShape, type.getElementType(), parentLayout);
-    result = ::intel::emitBaseIndexForLayoutImpl(
-        loc, rewriter, target, parentLayout, parentTy, withCTAOffset);
-    result.erase(result.begin() + sliceLayout.getDim());
-    // CTAOffset has been added in emitBaseIndexForLayout of parentLayout
-    return result;
-  } else if (auto dotLayout = dyn_cast<DotOperandEncodingAttr>(layout)) {
-    result = emitBaseIndexForDotOpLayout(loc, rewriter, dotLayout, type);
-  } else {
-    return mlir::emitBaseIndexForLayoutImpl(loc, rewriter, target, layout, type,
-                                            withCTAOffset);
-  }
-  if (withCTAOffset) {
-    auto CTAOffset =
-        emitCTAOffsetForLayout(loc, rewriter, target, layout, shape);
-    assert(CTAOffset.size() == result.size() && "Rank mismatch");
-    for (unsigned k = 0; k < result.size(); ++k) {
-      // Individual elements of `result` may be null. In the caller
-      // (emitBaseIndexForLayout), we assert that all such dimensions are sliced
-      // off.
-      if (!result[k])
-        continue;
-      result[k] = b.add(result[k], CTAOffset[k]);
-    }
-  }
-  return result;
-}
-
-inline SmallVector<Value>
-emitBaseIndexForLayout(Location loc, RewriterBase &rewriter,
-                       const TargetInfoBase &target, Attribute layout,
-                       RankedTensorType type, bool withCTAOffset) {
-  SmallVector<Value> idx = ::intel::emitBaseIndexForLayoutImpl(
-      loc, rewriter, target, layout, type, withCTAOffset);
-
-  // Check that any null values were sliced out.
-  for (Value v : idx) {
-    if (!v) {
-      llvm::errs() << "Failed to generate indexing code, possibly due to bad "
-                      "#mma layout. Please rerun your program with "
-                      "MLIR_ENABLE_DUMP=1 and file a bug."
-                   << "\nloc: " << loc << "\nlayout: " << layout
-                   << "\ntype: " << type << "\nwithCTAOffset: " << withCTAOffset
-                   << "\n";
-      llvm::report_fatal_error("Failed to generate indexing code");
-    }
-  }
-
-  return idx;
-}
-
-inline SmallVector<SmallVector<unsigned>>
-emitOffsetForLayout(Attribute layout, RankedTensorType type) {
-  return mlir::emitOffsetForLayout(layout, type);
-}
-
-// Emit indices calculation within each ConversionPattern, and returns a
-// [elemsPerThread X rank] index matrix.
-inline SmallVector<SmallVector<Value>>
-emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
-            Attribute layout, RankedTensorType type, bool withCTAOffset) {
-  auto b = TritonLLVMOpBuilder(loc, rewriter);
-  MLIRContext *ctx = rewriter.getContext();
-  auto shape = type.getShape();
-  std::optional<LinearLayout> ll = triton::gpu::toLinearLayout(shape, layout);
-  if (ll.has_value())
-    return mlir::emitIndices(loc, rewriter, target, layout, type,
-                             withCTAOffset);
-
-  // step 1, delinearize threadId to get the base index
-  auto multiDimBase = ::intel::emitBaseIndexForLayout(
-      loc, rewriter, target, layout, type, withCTAOffset);
-  // step 2, get offset of each element
-  auto offset = intel::emitOffsetForLayout(layout, type);
-  // step 3, add offset to base, and reorder the sequence
-  // of indices to guarantee that elems in the same
-  // sizePerThread are adjacent in order
-  unsigned rank = shape.size();
-  unsigned elemsPerThread = offset.size();
-  SmallVector<SmallVector<Value>> multiDimIdx(elemsPerThread,
-                                              SmallVector<Value>(rank));
-  for (unsigned n = 0; n < elemsPerThread; ++n)
-    for (unsigned k = 0; k < rank; ++k)
-      multiDimIdx[n][k] = b.add(multiDimBase[k], b.i32_val(offset[n][k]));
-
-  return multiDimIdx;
-}
-
 Value convertBf16ToFp32(Location loc, ConversionPatternRewriter &rewriter,
                         Value v);
 Value convertFp32ToBf16(Location loc, ConversionPatternRewriter &rewriter,
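The deleted `emitIndices` diverged from the common implementation only when `triton::gpu::toLinearLayout` returned no value for a layout; its first branch already forwarded to `mlir::emitIndices`. Condensed from the removed code above:

// Taken from the removed helper: defer to the common implementation whenever
// the layout converts to a LinearLayout ...
std::optional<LinearLayout> ll = triton::gpu::toLinearLayout(shape, layout);
if (ll.has_value())
  return mlir::emitIndices(loc, rewriter, target, layout, type, withCTAOffset);
// ... and only otherwise fall back to base-plus-offset emission:
//   multiDimIdx[n][k] = multiDimBase[k] + offset[n][k]

The removal implies that every layout these patterns encounter (DPAS, slice, dot-operand, and the rest) now converts to a linear layout, so the fallback path is dead code and the whole helper can go.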

third_party/intel/lib/TritonIntelGPUToLLVM/ViewOpToLLVM.cpp

Lines changed: 4 additions & 8 deletions
@@ -253,10 +253,8 @@ struct ExpandDimsOpConversion : public ConvertOpToLLVMPattern<ExpandDimsOp> {
           loc, "ExpandDimsOp only supports SliceEncodingAttr as its input");
     }
     auto resultLayout = resultTy.getEncoding();
-    auto srcOffsets =
-        mlir::triton::intel::emitOffsetForLayout(srcLayout, srcTy);
-    auto resultOffsets =
-        mlir::triton::intel::emitOffsetForLayout(resultLayout, resultTy);
+    auto srcOffsets = emitOffsetForLayout(srcLayout, srcTy);
+    auto resultOffsets = emitOffsetForLayout(resultLayout, resultTy);
     std::map<SmallVector<unsigned>, Value> srcValues;
     for (size_t i = 0; i < srcOffsets.size(); i++) {
       srcValues[srcOffsets[i]] = srcVals[i];
@@ -338,10 +336,8 @@ struct BroadcastOpConversion
     auto typeConverter = getTypeConverter();
     assert(rank == resultTy.getRank());
     auto order = triton::gpu::getOrder(srcLayout);
-    auto srcOffsets =
-        mlir::triton::intel::emitOffsetForLayout(srcLayout, srcTy);
-    auto resultOffsets =
-        mlir::triton::intel::emitOffsetForLayout(resultLayout, resultTy);
+    auto srcOffsets = emitOffsetForLayout(srcLayout, srcTy);
+    auto resultOffsets = emitOffsetForLayout(resultLayout, resultTy);
     SmallVector<Value> srcVals = unpackLLElements(loc, src, rewriter);
     std::map<SmallVector<unsigned>, Value> srcValues;
     for (size_t i = 0; i < srcOffsets.size(); i++) {
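The `emitOffsetForLayout` call sites here and in ReduceOpToLLVM.cpp are an even simpler case: as the Utility.h hunk above shows, the removed Intel wrapper forwarded directly to the common function, so dropping the qualification cannot change behavior. The removed wrapper in its entirety:

inline SmallVector<SmallVector<unsigned>>
emitOffsetForLayout(Attribute layout, RankedTensorType type) {
  return mlir::emitOffsetForLayout(layout, type);
}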
