|
1 | | -#include "mlir/Support/LLVM.h" |
| 1 | +#include "triton/Conversion/TritonGPUToLLVM/FMADotUtility.h" |
2 | 2 | #include "triton/Conversion/TritonGPUToLLVM/Utility.h" |
3 | | -#include "triton/Dialect/TritonGPU/IR/Dialect.h" |
4 | | -#include "triton/Dialect/TritonGPU/Transforms/Utility.h" |
5 | 3 |
|
6 | 4 | using namespace mlir; |
7 | 5 | using namespace mlir::triton; |
8 | 6 | using namespace ::mlir::triton::gpu; |
9 | 7 |
|
10 | | -using ::mlir::LLVM::linearize; |
11 | | -using ::mlir::triton::gpu::expandMatrixOrderWithBatch; |
12 | | -using ::mlir::triton::gpu::expandMatrixShapeWithBatch; |
13 | | -using ::mlir::triton::gpu::getShapePerCTA; |
14 | | -using ::mlir::triton::gpu::getSizePerThread; |
15 | | - |
16 | | -/// \brief spatial position of repetition and register of a given value |
17 | | -struct OperandValueKey { |
18 | | - unsigned bRepIdx, nonKRepIdx; |
19 | | - unsigned bIdx, nonKIdx, kIdx; |
20 | | - |
21 | | - bool operator==(const OperandValueKey &other) const { |
22 | | - return (bRepIdx == other.bRepIdx && nonKRepIdx == other.nonKRepIdx && |
23 | | - bIdx == other.bIdx && nonKIdx == other.nonKIdx && |
24 | | - kIdx == other.kIdx); |
25 | | - } |
26 | | -}; |
27 | | - |
28 | | -template <> struct std::hash<OperandValueKey> { |
29 | | - std::size_t operator()(const OperandValueKey &k) const { |
30 | | - return llvm::hash_combine(k.bRepIdx, k.nonKRepIdx, k.bIdx, k.nonKIdx, |
31 | | - k.kIdx); |
| 8 | +namespace { |
| 9 | +class GenericFMAVectorMultiplier : public FMAVectorMultiplier { |
| 10 | + OpBuilder &builder; |
| 11 | + Location loc; |
| 12 | + |
| 13 | +public: |
| 14 | + GenericFMAVectorMultiplier(OpBuilder &builder, Location loc) |
| 15 | + : builder(builder), loc(loc) {} |
| 16 | + |
| 17 | + Value multiplyVectors(ArrayRef<Value> a, ArrayRef<Value> b, |
| 18 | + Value c) override { |
| 19 | + auto K = a.size(); |
| 20 | + assert(b.size() == K); |
| 21 | + Value accum = c; |
| 22 | + for (auto [aElem, bElem] : llvm::zip(a, b)) |
| 23 | + accum = builder.create<LLVM::FMulAddOp>(loc, aElem, bElem, accum); |
| 24 | + return accum; |
32 | 25 | } |
33 | 26 | }; |
34 | 27 |
|
35 | | -using ValueTableFMA = std::unordered_map<OperandValueKey, Value>; |
36 | | - |
37 | | -static ValueTableFMA getValueTableFromStructFMA( |
38 | | - Value val, ArrayRef<unsigned> perRepShape, ArrayRef<unsigned> repetitions, |
39 | | - unsigned kDim, unsigned nonKDim, ConversionPatternRewriter &rewriter, |
40 | | - Location loc, ArrayRef<unsigned> inRepOrder, ArrayRef<unsigned> repOrder) { |
41 | | - ValueTableFMA res; |
42 | | - auto elems = unpackLLElements(loc, val, rewriter); |
43 | | - assert(perRepShape.size() == 3); |
44 | | - auto numElemsRep = product(perRepShape); |
45 | | - assert(elems.size() == numElemsRep * product(repetitions)); |
46 | | - assert(kDim == 1 || kDim == 2); |
47 | | - assert(nonKDim == 1 || nonKDim == 2); |
48 | | - const unsigned bDim = 0; |
| 28 | +} // namespace |
49 | 29 |
|
50 | | - for (unsigned idx = 0; idx < elems.size(); ++idx) { |
51 | | - auto inRepLinearIdx = idx % numElemsRep; |
52 | | - auto repLinearIdx = idx / numElemsRep; |
53 | | - auto inRepSpatialIdx = |
54 | | - mlir::LLVM::delinearize(inRepLinearIdx, perRepShape, inRepOrder); |
55 | | - auto repSpatialIdx = |
56 | | - mlir::LLVM::delinearize(repLinearIdx, repetitions, repOrder); |
57 | | - OperandValueKey key{repSpatialIdx[0], repSpatialIdx[nonKDim], |
58 | | - inRepSpatialIdx[0], inRepSpatialIdx[nonKDim], |
59 | | - inRepSpatialIdx[kDim]}; |
60 | | - res[key] = elems[idx]; |
61 | | - } |
62 | | - return res; |
63 | | -} |
64 | | - |
65 | | -LogicalResult convertFMADot(triton::DotOp op, triton::DotOp::Adaptor adaptor, |
| 30 | +LogicalResult convertFMADot(DotOp op, DotOp::Adaptor adaptor, |
66 | 31 | const LLVMTypeConverter *typeConverter, |
67 | 32 | ConversionPatternRewriter &rewriter) { |
68 | 33 | auto *ctx = rewriter.getContext(); |
69 | 34 | auto loc = op.getLoc(); |
70 | | - |
71 | | - auto A = op.getA(); |
72 | | - auto D = op.getResult(); |
73 | | - |
74 | | - auto aTensorTy = cast<RankedTensorType>(A.getType()); |
75 | | - auto dTensorTy = cast<RankedTensorType>(D.getType()); |
76 | | - |
77 | | - SmallVector<int64_t> aShapePerCTA = |
78 | | - expandMatrixShapeWithBatch(ArrayRef(getShapePerCTA(aTensorTy))); |
79 | | - auto dShapePerCTA = |
80 | | - expandMatrixShapeWithBatch(ArrayRef(getShapePerCTA(dTensorTy))); |
81 | | - |
82 | | - BlockedEncodingAttr dLayout = |
83 | | - cast<BlockedEncodingAttr>(dTensorTy.getEncoding()); |
84 | | - // TODO process A and B operand separately |
85 | | - auto inRepOrder = expandMatrixOrderWithBatch(dLayout.getOrder()); |
86 | | - auto repOrder = expandMatrixOrderWithBatch(dLayout.getRepOrder()); |
87 | | - auto cc = unpackLLElements(loc, adaptor.getC(), rewriter); |
88 | | - |
89 | | - Value llA = adaptor.getA(); |
90 | | - Value llB = adaptor.getB(); |
91 | | - |
92 | | - auto sizePerThread = |
93 | | - expandMatrixShapeWithBatch(ArrayRef(getSizePerThread(dLayout))); |
94 | | - auto numElemsPerThread = product(sizePerThread); |
95 | | - auto shapePerCTATile = |
96 | | - expandMatrixShapeWithBatch(ArrayRef(getShapePerCTATile(dLayout))); |
97 | | - |
98 | | - unsigned K = aShapePerCTA[2]; |
99 | | - |
100 | | - unsigned threadTileShape[3]; |
101 | | - unsigned repetitions[3]; |
102 | | - for (int i = 0; i < 3; ++i) { |
103 | | - repetitions[i] = |
104 | | - ceil(dShapePerCTA[i], static_cast<int64_t>(shapePerCTATile[i])); |
105 | | - } |
106 | | - |
107 | | - auto has = getValueTableFromStructFMA( |
108 | | - llA, {sizePerThread[0], sizePerThread[1], K}, |
109 | | - {repetitions[0], repetitions[1], 1}, |
110 | | - /*kDim*/ 2, /*nonKDim*/ 1, rewriter, loc, inRepOrder, repOrder); |
111 | | - auto hbs = getValueTableFromStructFMA( |
112 | | - llB, {sizePerThread[0], K, sizePerThread[2]}, |
113 | | - {repetitions[0], 1, repetitions[2]}, |
114 | | - /*kDim*/ 1, /*nonKDim*/ 2, rewriter, loc, inRepOrder, repOrder); |
115 | | - |
116 | | - SmallVector<Value> acc = cc; |
117 | | - |
118 | | - for (unsigned bRep = 0; bRep < repetitions[0]; ++bRep) |
119 | | - for (unsigned mRep = 0; mRep < repetitions[1]; ++mRep) |
120 | | - for (unsigned nRep = 0; nRep < repetitions[2]; ++nRep) |
121 | | - for (unsigned b = 0; b < sizePerThread[0]; ++b) |
122 | | - for (unsigned m = 0; m < sizePerThread[1]; ++m) |
123 | | - for (unsigned n = 0; n < sizePerThread[2]; ++n) { |
124 | | - SmallVector<unsigned> multiDimAccumIdx = {b, m, n}; |
125 | | - unsigned linearInRepIdx = |
126 | | - linearize(multiDimAccumIdx, sizePerThread, inRepOrder); |
127 | | - SmallVector<unsigned> multiDimRepIdx = {bRep, mRep, nRep}; |
128 | | - unsigned linearRepIdx = |
129 | | - linearize(multiDimRepIdx, repetitions, repOrder); |
130 | | - unsigned linearAccumIdx = |
131 | | - linearInRepIdx + linearRepIdx * numElemsPerThread; |
132 | | - for (unsigned k = 0; k < K; ++k) { |
133 | | - auto aOp = has[{bRep, mRep, b, m, k}]; |
134 | | - auto bOp = hbs[{bRep, nRep, b, n, k}]; |
135 | | - acc[linearAccumIdx] = rewriter.create<LLVM::FMulAddOp>( |
136 | | - loc, aOp, bOp, acc[linearAccumIdx]); |
137 | | - } |
138 | | - } |
139 | | - |
140 | | - auto res = packLLElements(loc, typeConverter, acc, rewriter, dTensorTy); |
141 | | - rewriter.replaceOp(op, res); |
142 | | - |
143 | | - return success(); |
| 35 | + GenericFMAVectorMultiplier multiplier(rewriter, loc); |
| 36 | + return parametricConvertFMADot(op, adaptor, typeConverter, rewriter, |
| 37 | + multiplier); |
144 | 38 | } |
0 commit comments