@@ -24,11 +24,11 @@ namespace gpu {
 /// region.
 void moveScalarUniformCode(gpu::WarpExecuteOnLane0Op op);
 
-template <typename T>
 struct WarpDistributionPattern : OpRewritePattern<WarpExecuteOnLane0Op> {
   using OpRewritePattern<WarpExecuteOnLane0Op>::OpRewritePattern;
   virtual LogicalResult
-  matchAndRewrite(T op, PatternRewriter &rewriter) const override = 0;
+  matchAndRewrite(WarpExecuteOnLane0Op op,
+                  PatternRewriter &rewriter) const override = 0;
 
 protected:
   /// Return a value yielded by `warpOp` which satisfies the filter lambda
@@ -60,132 +60,6 @@ struct WarpDistributionPattern : OpRewritePattern<WarpExecuteOnLane0Op> {
       SmallVectorImpl<Value> &delinearizedIds);
 };
 
-template <typename T>
-WarpExecuteOnLane0Op
-WarpDistributionPattern<T>::moveRegionToNewWarpOpAndReplaceReturns(
-    RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
-    ValueRange newYieldedValues, TypeRange newReturnTypes) {
-  // Create a new op before the existing one, with the extra operands.
-  OpBuilder::InsertionGuard g(rewriter);
-  rewriter.setInsertionPoint(warpOp);
-  auto newWarpOp = rewriter.create<WarpExecuteOnLane0Op>(
-      warpOp.getLoc(), newReturnTypes, warpOp.getLaneid(), warpOp.getWarpSize(),
-      warpOp.getArgs(), warpOp.getBody()->getArgumentTypes());
-
-  Region &opBody = warpOp.getBodyRegion();
-  Region &newOpBody = newWarpOp.getBodyRegion();
-  Block &newOpFirstBlock = newOpBody.front();
-  rewriter.inlineRegionBefore(opBody, newOpBody, newOpBody.begin());
-  rewriter.eraseBlock(&newOpFirstBlock);
-  assert(newWarpOp.getWarpRegion().hasOneBlock() &&
-         "expected WarpOp with single block");
-
-  auto yield =
-      cast<gpu::YieldOp>(newOpBody.getBlocks().begin()->getTerminator());
-
-  rewriter.modifyOpInPlace(
-      yield, [&]() { yield.getValuesMutable().assign(newYieldedValues); });
-  return newWarpOp;
-}
-
-template <typename T>
-WarpExecuteOnLane0Op
-WarpDistributionPattern<T>::moveRegionToNewWarpOpAndAppendReturns(
-    RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
-    ValueRange newYieldedValues, TypeRange newReturnTypes,
-    llvm::SmallVector<size_t> &indices) {
-  SmallVector<Type> types(warpOp.getResultTypes().begin(),
-                          warpOp.getResultTypes().end());
-  auto yield = cast<gpu::YieldOp>(
-      warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
-  llvm::SmallSetVector<Value, 32> yieldValues(yield.getOperands().begin(),
-                                              yield.getOperands().end());
-  for (auto newRet : llvm::zip(newYieldedValues, newReturnTypes)) {
-    if (yieldValues.insert(std::get<0>(newRet))) {
-      types.push_back(std::get<1>(newRet));
-      indices.push_back(yieldValues.size() - 1);
-    } else {
-      // If the value already exit the region don't create a new output.
-      for (auto [idx, yieldOperand] :
-           llvm::enumerate(yieldValues.getArrayRef())) {
-        if (yieldOperand == std::get<0>(newRet)) {
-          indices.push_back(idx);
-          break;
-        }
-      }
-    }
-  }
-  yieldValues.insert(newYieldedValues.begin(), newYieldedValues.end());
-  WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndReplaceReturns(
-      rewriter, warpOp, yieldValues.getArrayRef(), types);
-  rewriter.replaceOp(warpOp,
-                     newWarpOp.getResults().take_front(warpOp.getNumResults()));
-  return newWarpOp;
-}
-
-template <typename T>
-OpOperand *WarpDistributionPattern<T>::getWarpResult(
-    WarpExecuteOnLane0Op warpOp, const std::function<bool(Operation *)> &fn) {
-  auto yield = cast<gpu::YieldOp>(
-      warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
-  for (OpOperand &yieldOperand : yield->getOpOperands()) {
-    Value yieldValues = yieldOperand.get();
-    Operation *definedOp = yieldValues.getDefiningOp();
-    if (definedOp && fn(definedOp)) {
-      if (!warpOp.getResult(yieldOperand.getOperandNumber()).use_empty())
-        return &yieldOperand;
-    }
-  }
-  return {};
-}
-
-template <typename T>
-bool WarpDistributionPattern<T>::delinearizeLaneId(
-    OpBuilder &builder, Location loc, ArrayRef<int64_t> originalShape,
-    ArrayRef<int64_t> distributedShape, int64_t warpSize, Value laneId,
-    SmallVectorImpl<Value> &delinearizedIds) {
-  // If the original shape and the distributed shape is the same, we don't
-  // distribute at all--every thread is handling the whole. For such case, we
-  // should not rely on lane IDs later. So just return an empty lane ID vector.
-  if (originalShape == distributedShape) {
-    delinearizedIds.clear();
-    return true;
-  }
-
-  SmallVector<int64_t> sizes;
-  for (auto [large, small] : llvm::zip_equal(originalShape, distributedShape)) {
-    if (large % small != 0)
-      return false;
-    sizes.push_back(large / small);
-  }
-  if (std::accumulate(sizes.begin(), sizes.end(), 1,
-                      std::multiplies<int64_t>()) != warpSize)
-    return false;
-
-  AffineExpr s0, s1;
-  bindSymbols(builder.getContext(), s0, s1);
-
-  int64_t usedThreads = 1;
-
-  Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);
-  delinearizedIds.assign(sizes.size(), zero);
-
-  for (int i = sizes.size() - 1; i >= 0; --i) {
-    usedThreads *= sizes[i];
-    if (usedThreads == warpSize) {
-      // We've used up all available threads. Don't need to perform modulo
-      // anymore. And we can stop the calculation for further dimensions.
-      delinearizedIds[i] = laneId;
-      break;
-    }
-    delinearizedIds[i] =
-        affine::makeComposedAffineApply(builder, loc, s0 % sizes[i], {laneId});
-    laneId = affine::makeComposedAffineApply(
-        builder, loc, s0.floorDiv(usedThreads), {laneId});
-  }
-  return true;
-}
-
 } // namespace gpu
 } // namespace mlir
 
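With this change `WarpDistributionPattern` is a plain abstract base class rather than a class template: concrete patterns override `matchAndRewrite` on `WarpExecuteOnLane0Op` directly and inherit the shared helpers (`getWarpResult`, `moveRegionToNewWarpOpAndAppendReturns`, `delinearizeLaneId`), whose definitions move out of this header. A minimal sketch of a derived pattern is below; the include path, the pattern name, and its body are illustrative assumptions, not code from this commit.

```cpp
// Hypothetical sketch of a pattern derived from the non-templated base class.
// The include path is assumed to be the header touched by this diff.
#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
#include "mlir/IR/PatternMatch.h"

namespace {
using namespace mlir;

/// Illustrative pattern: it only demonstrates the override signature. A real
/// pattern would select a yielded value (e.g. with the inherited
/// getWarpResult helper) and rebuild the warp op via
/// moveRegionToNewWarpOpAndAppendReturns before distributing the computation.
struct WarpOpDistributeSketch final : gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;

  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Nothing is rewritten in this sketch.
    return rewriter.notifyMatchFailure(warpOp, "sketch only");
  }
};
} // namespace
```

Registration would go through the usual `RewritePatternSet`, e.g. `patterns.add<WarpOpDistributeSketch>(patterns.getContext())`, since the base class still inherits the constructors of `OpRewritePattern<WarpExecuteOnLane0Op>`.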