Commit 4480f86

[BACKEND] Drop all volta related code (#5099)
Only the `isVolta` interface is kept. If any contributors are willing to support Volta through the linear-layout approach, we can add it back later.
1 parent 0cea768 commit 4480f86

12 files changed, +21 −1376 lines changed

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 0 additions & 131 deletions
```diff
@@ -598,122 +598,6 @@ emitOffsetForBlockedLayout(const BlockedEncodingAttr &blockedLayout,
 // Mma layout indices
 // -----------------------------------------------------------------------
 
-inline SmallVector<Value>
-emitBaseIndexWithinCTAForMmaLayoutV1(Location loc, RewriterBase &rewriter,
-                                     const NvidiaMmaEncodingAttr &mmaLayout,
-                                     RankedTensorType type) {
-  auto shape = type.getShape();
-  auto wpt = mmaLayout.getWarpsPerCTA();
-  static constexpr std::array<int, 3> fpw{{2, 2, 1}};
-  auto [isARow, isBRow, isAVec4, isBVec4, _] =
-      mmaLayout.decodeVoltaLayoutStates();
-
-  Value thread = getThreadId(rewriter, loc);
-  auto *ctx = thread.getContext();
-  Value _1 = i32_val(1);
-  Value _2 = i32_val(2);
-  Value _4 = i32_val(4);
-  Value _16 = i32_val(16);
-  Value _32 = i32_val(32);
-  Value _fpw0 = i32_val(fpw[0]);
-  Value _fpw1 = i32_val(fpw[1]);
-
-  // A info
-  auto aRep = mmaLayout.getMMAv1Rep(0);
-  auto aSpw = mmaLayout.getMMAv1ShapePerWarp(0);
-  // B info
-  auto bSpw = mmaLayout.getMMAv1ShapePerWarp(1);
-  auto bRep = mmaLayout.getMMAv1Rep(1);
-
-  SmallVector<int, 2> rep({aRep[0], bRep[1]});
-  SmallVector<int, 2> spw({aSpw[0], bSpw[1]});
-  SmallVector<unsigned, 2> shapePerCTA({spw[0] * wpt[0], spw[1] * wpt[1]});
-
-  Value lane = urem(thread, _32);
-  Value warp = udiv(thread, _32);
-
-  Value warp0 = urem(warp, i32_val(wpt[0]));
-  Value warp12 = udiv(warp, i32_val(wpt[0]));
-  Value warp1 = urem(warp12, i32_val(wpt[1]));
-
-  // warp offset
-  Value offWarpM = mul(warp0, i32_val(spw[0]));
-  Value offWarpN = mul(warp1, i32_val(spw[1]));
-  // quad offset
-  Value offQuadM = mul(udiv(and_(lane, _16), _4), _fpw0);
-  Value offQuadN = mul(udiv(and_(lane, _16), _4), _fpw1);
-  // pair offset
-  Value offPairM = udiv(urem(lane, _16), _4);
-  offPairM = urem(offPairM, _fpw0);
-  offPairM = mul(offPairM, _4);
-  Value offPairN = udiv(urem(lane, _16), _4);
-  offPairN = udiv(offPairN, _fpw0);
-  offPairN = urem(offPairN, _fpw1);
-  offPairN = mul(offPairN, _4);
-  offPairM = mul(offPairM, i32_val(rep[0] / 2));
-  offQuadM = mul(offQuadM, i32_val(rep[0] / 2));
-  offPairN = mul(offPairN, i32_val(rep[1] / 2));
-  offQuadN = mul(offQuadN, i32_val(rep[1] / 2));
-  // quad pair offset
-  Value offLaneM = add(offPairM, offQuadM);
-  Value offLaneN = add(offPairN, offQuadN);
-  // a, b offset
-  Value offsetAM = add(offWarpM, offLaneM);
-  Value offsetBN = add(offWarpN, offLaneN);
-  // m indices
-  Value offsetCM = add(and_(lane, _1), offsetAM);
-  // n indices
-  Value offsetCN = add((and_(lane, _2)), (add(offWarpN, offPairN)));
-  return {offsetCM, offsetCN};
-}
-
-inline SmallVector<SmallVector<unsigned>>
-emitOffsetForMmaLayoutV1(const NvidiaMmaEncodingAttr &mmaLayout,
-                         RankedTensorType type) {
-  auto shape = type.getShape();
-
-  auto [isARow, isBRow, isAVec4, isBVec4, _] =
-      mmaLayout.decodeVoltaLayoutStates();
-
-  // TODO: seems like the pattern below to get `rep`/`spw` appears quite often
-  // A info
-  auto aRep = mmaLayout.getMMAv1Rep(0);
-  auto aSpw = mmaLayout.getMMAv1ShapePerWarp(0);
-  // B info
-  auto bSpw = mmaLayout.getMMAv1ShapePerWarp(1);
-  auto bRep = mmaLayout.getMMAv1Rep(1);
-
-  auto wpt = mmaLayout.getWarpsPerCTA();
-  static constexpr std::array<int, 3> fpw{{2, 2, 1}};
-  SmallVector<int, 2> rep({aRep[0], bRep[1]});
-  SmallVector<int, 2> spw({aSpw[0], bSpw[1]});
-  SmallVector<unsigned, 2> shapePerCTA({spw[0] * wpt[0], spw[1] * wpt[1]});
-
-  SmallVector<unsigned> idxM;
-  for (unsigned m = 0; m < shape[0]; m += shapePerCTA[0])
-    for (unsigned mm = 0; mm < rep[0]; ++mm)
-      idxM.push_back(m + mm * 2);
-
-  SmallVector<unsigned> idxN;
-  for (int n = 0; n < shape[1]; n += shapePerCTA[1]) {
-    for (int nn = 0; nn < rep[1]; ++nn) {
-      idxN.push_back(n + nn / 2 * 4 + (nn % 2) * 2 * fpw[1] * rep[1]);
-      idxN.push_back(n + nn / 2 * 4 + (nn % 2) * 2 * fpw[1] * rep[1] + 1);
-    }
-  }
-
-  SmallVector<SmallVector<unsigned>> ret;
-  for (unsigned x1 : idxN) {   // N
-    for (unsigned x0 : idxM) { // M
-      SmallVector<unsigned> idx(2);
-      idx[0] = x0; // M
-      idx[1] = x1; // N
-      ret.push_back(std::move(idx));
-    }
-  }
-  return ret;
-}
-
 inline SmallVector<SmallVector<unsigned>>
 emitOffsetForMmaLayoutV2(const NvidiaMmaEncodingAttr &mmaLayout,
                          RankedTensorType type) {
@@ -1179,9 +1063,6 @@ emitBaseIndexForLayoutImpl(Location loc, RewriterBase &rewriter,
     result = emitBaseIndexWithinCTAForBlockedLayout(loc, rewriter,
                                                     blockedLayout, type);
   } else if (auto mmaLayout = mlir::dyn_cast<NvidiaMmaEncodingAttr>(layout)) {
-    if (mmaLayout.isVolta())
-      result =
-          emitBaseIndexWithinCTAForMmaLayoutV1(loc, rewriter, mmaLayout, type);
     if (mmaLayout.isAmpere() || mmaLayout.isHopper())
       result = emitBaseIndexWithinCTAForMmaLayoutV2V3(loc, rewriter, mmaLayout,
                                                       type);
@@ -1536,18 +1417,6 @@ inline Value packLLVector(Location loc, ValueRange vals,
   return vec;
 }
 
-inline bool isLayoutMmaV1(Attribute layout) {
-  bool isMmaV1 = false;
-  if (auto mmaLayout = dyn_cast<NvidiaMmaEncodingAttr>(layout)) {
-    isMmaV1 = mmaLayout.isVolta();
-  }
-  if (auto sliceLayout = dyn_cast<SliceEncodingAttr>(layout)) {
-    isMmaV1 = isa<NvidiaMmaEncodingAttr>(sliceLayout.getParent()) &&
-              cast<NvidiaMmaEncodingAttr>(sliceLayout.getParent()).isVolta();
-  }
-  return isMmaV1;
-}
-
 } // namespace mlir
 
 #endif
```
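
For reference, the sketch below restates the lane-to-(m, n) index math that the deleted `emitBaseIndexWithinCTAForMmaLayoutV1` emitted as LLVM IR, as plain scalar C++. It is a minimal reconstruction, assuming `fpw = {2, 2, 1}` as in the removed code and taking `rep`/`spw` as inputs (in the original they came from `getMMAv1Rep`/`getMMAv1ShapePerWarp`, which this commit also deletes); it is not part of the codebase.

```cpp
#include <array>
#include <utility>

// Scalar reconstruction of the deleted MMAv1 (mma.884) base-index math.
// threadId is the flat thread id within the CTA; wpt is warpsPerCTA.
std::pair<int, int> mmaV1BaseOffset(int threadId, std::array<int, 2> wpt,
                                    std::array<int, 2> rep,
                                    std::array<int, 2> spw) {
  constexpr std::array<int, 3> fpw{{2, 2, 1}};
  int lane = threadId % 32;
  int warp = threadId / 32;
  int warp0 = warp % wpt[0];
  int warp1 = (warp / wpt[0]) % wpt[1];
  // warp offset within the CTA tile
  int offWarpM = warp0 * spw[0];
  int offWarpN = warp1 * spw[1];
  // quad offset: lane bit 4 selects the quad
  int offQuadM = ((lane & 16) / 4) * fpw[0] * (rep[0] / 2);
  // pair offset within the quad: lane bits 2-3
  int offPairM = (((lane % 16) / 4) % fpw[0]) * 4 * (rep[0] / 2);
  int offPairN = ((((lane % 16) / 4) / fpw[0]) % fpw[1]) * 4 * (rep[1] / 2);
  // m mixes in lane bit 0; n mixes in lane bit 1
  int offsetCM = (lane & 1) + offWarpM + offPairM + offQuadM;
  int offsetCN = (lane & 2) + offWarpN + offPairN;
  return {offsetCM, offsetCN};
}
```

As in the removed helper, the n component of the returned C index uses only the warp and pair offsets; the quad offset along n (`offQuadN`) and the B-operand offset (`offsetBN`) were computed but never fed into the returned value.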

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 3 additions & 97 deletions
```diff
@@ -346,21 +346,6 @@ compared to 1*64 when the hasLeadingOffset is false.
     // index of the inner dimension in `order`
     unsigned inner = (opIdx == 0) ? 0 : 1;
 
-    // ---- begin Volta ----
-    if (mmaEnc.isVolta()) {
-      int perPhase = 128 / (shapePerCTA[order[0]] * (typeWidthInBit / 8));
-      perPhase = std::max<int>(perPhase, 1);
-      bool is_row = order[0] != 0;
-      bool is_vec4 = opIdx == 0 ? !is_row && (shapePerCTA[order[0]] <= 16) :
-                                  is_row && (shapePerCTA[order[0]] <= 16);
-      int pack_size = opIdx == 0 ? ((is_row || is_vec4) ? 1 : 2) :
-                                   ((is_row && !is_vec4) ? 2 : 1);
-      int rep = 2 * pack_size;
-      int maxPhase = (order[inner] == 1 ? 8 : 4) / perPhase;
-      int vec = 2 * rep;
-      return get(context, vec, perPhase, maxPhase, order, CTALayout);
-    }
-
     // ---- begin Ampere & Hopper ----
     if (mmaEnc.isAmpere() || mmaEnc.isHopper()) {
       int perPhase = 128 / (shapePerCTA[order[0]] * 4 / dotOpEnc.getKWidth());
```
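
As a reading aid, here is the deleted Volta branch evaluated as standalone scalar code for one hypothetical input: an fp16 A operand (`opIdx = 0`), row-major `order = {1, 0}`, and a contiguous-dimension extent of 64. The input values are illustrative assumptions, not taken from the repository.

```cpp
#include <algorithm>
#include <cstdio>

int main() {
  // Hypothetical inputs: fp16 A operand, row-major, 64-wide contiguous dim.
  const int typeWidthInBit = 16;
  const int opIdx = 0;
  const int order[2] = {1, 0};
  const int contig = 64; // stands in for shapePerCTA[order[0]]
  const unsigned inner = (opIdx == 0) ? 0 : 1;

  // Same arithmetic as the deleted "---- begin Volta ----" branch.
  int perPhase = std::max(128 / (contig * (typeWidthInBit / 8)), 1);
  bool is_row = order[0] != 0;
  bool is_vec4 = opIdx == 0 ? !is_row && (contig <= 16)
                            : is_row && (contig <= 16);
  int pack_size = opIdx == 0 ? ((is_row || is_vec4) ? 1 : 2)
                             : ((is_row && !is_vec4) ? 2 : 1);
  int rep = 2 * pack_size;
  int maxPhase = (order[inner] == 1 ? 8 : 4) / perPhase;
  int vec = 2 * rep;
  std::printf("vec=%d perPhase=%d maxPhase=%d\n", vec, perPhase, maxPhase);
  // Prints: vec=4 perPhase=1 maxPhase=8
  return 0;
}
```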
```diff
@@ -771,7 +756,7 @@ for
 //===----------------------------------------------------------------------===//
 // MMA Layout Encoding
 //===----------------------------------------------------------------------===//
-// TODO: MMAv1 and MMAv2 should be two instances of the same class
+
 def MmaEncodingTrait : AttrInterface<"MmaEncodingTrait"> {
   let cppNamespace = "::mlir::triton::gpu";
   let methods = [
@@ -1139,92 +1124,13 @@ For example, the matrix L corresponding to blockTileSize=[32,16] is:
     ArrayRefParameter<"unsigned">:$instrShape
   );
 
-  let builders = [
-    // Specially for MMAV1(Volta)
-    AttrBuilder<(ins "int":$versionMajor,
-                     "int":$numWarps,
-                     "CTALayoutAttr":$CTALayout,
-                     "ArrayRef<unsigned>":$instrShape,
-                     "ArrayRef<int64_t>":$shapeC,
-                     "bool":$isARow,
-                     "bool":$isBRow,
-                     "bool":$isAVec4,
-                     "bool":$isBVec4,
-                     "int":$id), [{
-      assert(versionMajor == 1 && "This builder is specially for versionMajor==1");
-      // 4-bits to encode 4 booleans: [isARow, isBRow, isAVec4, isBVec4]
-      int versionMinor = (isARow * (1<<0)) |\
-                         (isBRow * (1<<1)) |\
-                         (isAVec4 * (1<<2)) |\
-                         (isBVec4 * (1<<3));
-
-      // TODO: Share code with
-      // DotOpMmaV1ConversionHelper::AParam/BParam, since same code to compute the
-      // rep,spw and fpw.
-      SmallVector<unsigned> wpt({1, 1});
-      SmallVector<unsigned> wpt_nm1;
-
-      SmallVector<int, 2> rep(2), spw(2);
-      std::array<int, 3> fpw{{2, 2, 1}};
-      int packSize0 = (isARow || isAVec4) ? 1 : 2;
-      rep[0] = 2 * packSize0;
-      spw[0] = fpw[0] * 4 * rep[0];
-
-      int packSize1 = (isBRow && !isBVec4) ? 2 : 1;
-      rep[1] = 2 * packSize1;
-      spw[1] = fpw[1] * 4 * rep[1];
-
-      do {
-        wpt_nm1 = wpt;
-        if (wpt[0] * wpt[1] < numWarps)
-          wpt[0] = std::clamp<int>(wpt[0] * 2, 1, shapeC[0] / spw[0]);
-        if (wpt[0] * wpt[1] < numWarps)
-          wpt[1] = std::clamp<int>(wpt[1] * 2, 1, shapeC[1] / spw[1]);
-      } while (wpt_nm1 != wpt);
-
-      return $_get(context, versionMajor, versionMinor, wpt, CTALayout, instrShape);
-    }]>,
-
-
-    AttrBuilder<(ins "int":$versionMajor,
-                     "int":$numWarps,
-                     "CTALayoutAttr":$CTALayout,
-                     "ArrayRef<unsigned>":$instrShape,
-                     "ArrayRef<int64_t>":$shapeA,
-                     "ArrayRef<int64_t>":$shapeB,
-                     "ArrayRef<int64_t>":$shapeC,
-                     "bool":$isARow,
-                     "bool":$isBRow,
-                     "int":$id), [{
-      assert(versionMajor == 1 && "This builder is specially for versionMajor==1");
-      bool isAVec4 = !isARow && (shapeA[isARow] <= 16);
-      bool isBVec4 = isBRow && (shapeB[isBRow] <= 16);
-      return get(context, versionMajor, numWarps, CTALayout, instrShape, shapeC, isARow, isBRow, isAVec4, isBVec4, id);
-    }]>
-  ];
 
   let extraClassDeclaration = extraDistributedDeclaration # [{
     bool isVolta() const;
     bool isTuring() const;
     bool isAmpere() const;
     bool isHopper() const;
 
-    // Get [isARow, isBRow, isAVec4, isBVec4, id] from versionMinor
-    std::tuple<bool, bool, bool, bool, int> decodeVoltaLayoutStates() const;
-
-    // Number of bits in versionMinor to hold the ID of the MMA encoding instance.
-    // Here 5 bits can hold 32 IDs in a single module.
-    static constexpr int numBitsToHoldMmaV1ID{5};
-
-    // For MMA v1, method `getMMAv1IsRow` returns whether e.g. the a operand is used
-    // in the context of an mma.884.row.col or an mma.884.col.col operation. See the PTX ISA documentation
-    // section 9.7.13.4.1 for more details.
-    bool getMMAv1IsRow(int opIdx) const;
-    bool getMMAv1IsVec4(int opIdx) const;
-    int getMMAv1NumOuter(ArrayRef<int64_t> shape, int opIdx) const;
-    SmallVector<int> getMMAv1Rep(int opIdx) const;
-    SmallVector<int> getMMAv1ShapePerWarp(int opIdx) const;
-    int getMMAv1Vec(int opIdx) const;
     SmallVector<int64_t> getRepForOperand(ArrayRef<int64_t> shape,
                                           int bitwidth, int opIdx) const;
     SmallVector<unsigned> getRepOrderForOperand(int opIdx) const;
@@ -1240,7 +1146,7 @@ For example, the matrix L corresponding to blockTileSize=[32,16] is:
     unsigned getTotalElemsPerThreadForOperand(ArrayRef<int64_t> shape, Type eltTy, int kWidth, int opIdx) const;
 
     SmallVector<unsigned> getContigPerThread() {
-      assert(isVolta() || isAmpere() || isHopper());
+      assert(isAmpere() || isHopper());
       auto rank = getWarpsPerCTA().size();
       SmallVector<unsigned> contigPerThread(rank, 1);
       contigPerThread[rank - 1] = 2;
@@ -1357,7 +1263,7 @@ vecIdx (index of the element in the quad; this is always along the k-dim)
                      "Type":$eltTy), [{
     NvidiaMmaEncodingAttr parentAttr = mlir::dyn_cast<NvidiaMmaEncodingAttr>(parent);
    if (!parentAttr || (!parentAttr.isAmpere() && !parentAttr.isHopper()))
-      return $_get(context, opIdx, parent, 0); // For MMAV1
+      return $_get(context, opIdx, parent, 0);
     // For MMAV2 and V3
     unsigned bitwidth = eltTy.getIntOrFloatBitWidth();
     unsigned kWidth = 32 / bitwidth;
```
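
The deleted builder and declarations encoded the Volta layout flags in `versionMinor`. Below is a minimal sketch of that packing and its inverse; the diff shows the four-flag packing and the 5-bit ID budget (`numBitsToHoldMmaV1ID`), but not where the ID sat, so placing it directly above the flags is an assumption of this sketch.

```cpp
#include <cassert>
#include <tuple>

// Sketch of the removed versionMinor scheme: bits 0-3 hold
// [isARow, isBRow, isAVec4, isBVec4]; 5 more bits hold the encoding ID
// (assumed to start at bit 4; the diff does not show its position).
constexpr int kMmaV1IdBits = 5; // was numBitsToHoldMmaV1ID

int encodeVoltaVersionMinor(bool isARow, bool isBRow, bool isAVec4,
                            bool isBVec4, int id) {
  assert(id >= 0 && id < (1 << kMmaV1IdBits) && "5 bits hold at most 32 IDs");
  return (isARow << 0) | (isBRow << 1) | (isAVec4 << 2) | (isBVec4 << 3) |
         (id << 4);
}

// Mirrors the tuple that decodeVoltaLayoutStates() returned:
// [isARow, isBRow, isAVec4, isBVec4, id].
std::tuple<bool, bool, bool, bool, int>
decodeVoltaVersionMinor(int versionMinor) {
  return {(versionMinor & 1) != 0, (versionMinor & 2) != 0,
          (versionMinor & 4) != 0, (versionMinor & 8) != 0,
          versionMinor >> 4};
}
```

With the MMAv1 builders gone, `versionMinor` no longer carries these flags for the remaining architectures, which is why `decodeVoltaLayoutStates` and the `getMMAv1*` helpers could be dropped wholesale.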

lib/Analysis/Allocation.cpp

Lines changed: 6 additions & 11 deletions
```diff
@@ -100,17 +100,12 @@ ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
                                      : srcContigPerThread;
   scratchConfig.outVec = outOrd[0] != innerDim ? 1 : dstContigPerThread;
 
-  if (auto mma = mlir::dyn_cast<gpu::NvidiaMmaEncodingAttr>(srcLayout)) {
-    if (mma.getVersionMajor() == 1) {
-      // For conversions to MmaV1 (Nvidia V100), this inVec is hardcoded in the
-      // codegen.
-      scratchConfig.inVec = srcContigPerThread;
-    } else if (mlir::isa<gpu::BlockedEncodingAttr>(dstLayout)) {
-      // when storing from mma layout and loading in blocked layout vectorizing
-      // the load back gives better performance even if there is a
-      // transposition.
-      scratchConfig.outVec = dstContigPerThread;
-    }
+  if (mlir::isa<gpu::NvidiaMmaEncodingAttr>(srcLayout) &&
+      mlir::isa<gpu::BlockedEncodingAttr>(dstLayout)) {
+    // when storing from mma layout and loading in blocked layout vectorizing
+    // the load back gives better performance even if there is a
+    // transposition.
+    scratchConfig.outVec = dstContigPerThread;
   }
 
   // No padding is required if the tensor is 1-D, or if all dimensions except
```

lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 1 addition & 3 deletions
```diff
@@ -16,7 +16,6 @@
 
 namespace {
 
-using ::mlir::isLayoutMmaV1;
 using ::mlir::LLVM::getMultiDimOffset;
 using ::mlir::LLVM::getSharedMemoryObjectFromStruct;
 using ::mlir::LLVM::getStridesFromShapeAndOrder;
@@ -56,8 +55,7 @@ struct ConvertLayoutOpConversion
     return isa<BlockedEncodingAttr, MmaEncodingTrait, SliceEncodingAttr>(
                srcLayout) &&
            isa<BlockedEncodingAttr, MmaEncodingTrait, SliceEncodingAttr>(
-               dstLayout) &&
-           !isLayoutMmaV1(srcLayout) && !isLayoutMmaV1(dstLayout);
+               dstLayout);
   }
 
   // shared memory rd/st for blocked or mma layout with data padding
```
