1 | | -//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------*-===// |
2 | | -// |
3 | | -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | -// See https://llvm.org/LICENSE.txt for license information. |
5 | | -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | -// |
7 | | -//===----------------------------------------------------------------------===// |
8 | | -// |
9 | | -// This file implements a pass to automatically promote accessed memref regions |
10 | | -// to buffers in a faster memory space that is explicitly managed, with the |
11 | | -// necessary data movement operations performed through either regular |
12 | | -// point-wise load/store's or DMAs. Such explicit copying (also referred to as |
13 | | -// array packing/unpacking in the literature), when done on arrays that exhibit |
14 | | -// reuse, results in near elimination of conflict misses, TLB misses, reduced |
15 | | -// use of hardware prefetch streams, and reduced false sharing. It is also |
16 | | -// necessary for hardware that explicitly managed levels in the memory |
17 | | -// necessary for hardware with explicitly managed levels in the memory
18 | | -// hierarchy, where DMAs may have to be used. This optimization is often
19 | | -// |
20 | | -//===----------------------------------------------------------------------===// |
21 | | - |
22 | | -#include "mlir/Analysis/Utils.h" |
23 | | -#include "mlir/Dialect/AffineOps/AffineOps.h" |
24 | | -#include "mlir/IR/Builders.h" |
25 | | -#include "mlir/Pass/Pass.h" |
26 | | -#include "mlir/Transforms/LoopUtils.h" |
27 | | -#include "mlir/Transforms/Passes.h" |
28 | | -#include "mlir/Transforms/Utils.h" |
29 | | -#include "llvm/ADT/MapVector.h" |
30 | | -#include "llvm/Support/CommandLine.h" |
31 | | -#include "llvm/Support/Debug.h" |
32 | | -#include <algorithm> |
33 | | - |
34 | | -#define DEBUG_TYPE "affine-data-copy-generate" |
35 | | - |
36 | | -using namespace mlir; |
37 | | - |
38 | | -static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options"); |
39 | | - |
40 | | -static llvm::cl::opt<unsigned long long> clFastMemoryCapacity( |
41 | | - "affine-data-copy-generate-fast-mem-capacity", |
42 | | - llvm::cl::desc( |
43 | | - "Set fast memory space capacity in KiB (default: unlimited)"), |
44 | | - llvm::cl::cat(clOptionsCategory)); |
45 | | - |
46 | | -static llvm::cl::opt<bool> |
47 | | - clDma("affine-data-copy-generate-dma", |
48 | | - llvm::cl::desc("Generate DMA instead of point-wise copy"), |
49 | | - llvm::cl::cat(clOptionsCategory), llvm::cl::init(true)); |
50 | | - |
51 | | -static llvm::cl::opt<unsigned> clFastMemorySpace( |
52 | | - "affine-data-copy-generate-fast-mem-space", llvm::cl::init(1), |
53 | | - llvm::cl::desc( |
54 | | - "Fast memory space identifier for copy generation (default: 1)"), |
55 | | - llvm::cl::cat(clOptionsCategory)); |
56 | | - |
57 | | -static llvm::cl::opt<bool> clSkipNonUnitStrideLoop( |
58 | | - "affine-data-copy-generate-skip-non-unit-stride-loops", llvm::cl::Hidden, |
59 | | - llvm::cl::init(false), |
60 | | - llvm::cl::desc("Testing purposes: avoid non-unit stride loop choice depths " |
61 | | - "for copy placement"), |
62 | | - llvm::cl::cat(clOptionsCategory)); |
63 | | - |
64 | | -namespace { |
65 | | - |
66 | | -/// Replaces all loads and stores on memref's living in 'slowMemorySpace' by |
67 | | -/// introducing copy operations to transfer data into `fastMemorySpace` and |
68 | | -/// rewriting the original load's/store's to instead load/store from the |
69 | | -/// allocated fast memory buffers. Additional options specify the identifier |
70 | | -/// corresponding to the fast memory space and the amount of fast memory space |
71 | | -/// available. The pass traverses through the nesting structure, recursing to |
72 | | -/// inner levels if necessary to determine at what depth copies need to be |
73 | | -/// placed so that the allocated buffers fit within the memory capacity |
74 | | -/// provided. |
75 | | -// TODO(bondhugula): We currently can't generate copies correctly when stores |
76 | | -// are strided. Check for strided stores. |
77 | | -struct AffineDataCopyGeneration |
78 | | - : public FunctionPass<AffineDataCopyGeneration> { |
79 | | - explicit AffineDataCopyGeneration( |
80 | | - unsigned slowMemorySpace = 0, |
81 | | - unsigned fastMemorySpace = clFastMemorySpace, unsigned tagMemorySpace = 0, |
82 | | - int minDmaTransferSize = 1024, |
83 | | - uint64_t fastMemCapacityBytes = |
84 | | - (clFastMemoryCapacity.getNumOccurrences() > 0 |
85 | | - ? clFastMemoryCapacity * 1024 // cl-provided size is in KiB |
86 | | - : std::numeric_limits<uint64_t>::max()), |
87 | | - bool generateDma = clDma, |
88 | | - bool skipNonUnitStrideLoops = clSkipNonUnitStrideLoop) |
89 | | - : slowMemorySpace(slowMemorySpace), fastMemorySpace(fastMemorySpace), |
90 | | - tagMemorySpace(tagMemorySpace), minDmaTransferSize(minDmaTransferSize), |
91 | | - fastMemCapacityBytes(fastMemCapacityBytes), generateDma(generateDma), |
92 | | - skipNonUnitStrideLoops(skipNonUnitStrideLoops) {} |
93 | | - |
94 | | - explicit AffineDataCopyGeneration(const AffineDataCopyGeneration &other) |
95 | | - : slowMemorySpace(other.slowMemorySpace), |
96 | | - fastMemorySpace(other.fastMemorySpace), |
97 | | - tagMemorySpace(other.tagMemorySpace), |
98 | | - minDmaTransferSize(other.minDmaTransferSize), |
99 | | - fastMemCapacityBytes(other.fastMemCapacityBytes), |
100 | | - generateDma(other.generateDma), |
101 | | - skipNonUnitStrideLoops(other.skipNonUnitStrideLoops) {} |
102 | | - |
103 | | - void runOnFunction() override; |
104 | | - LogicalResult runOnBlock(Block *block, DenseSet<Operation *> ©Nests); |
105 | | - |
106 | | - // Slow memory space associated with copies. |
107 | | - const unsigned slowMemorySpace; |
108 | | - // Fast memory space associated with copies. |
109 | | - unsigned fastMemorySpace; |
110 | | - // Memory space associated with DMA tags. |
111 | | - unsigned tagMemorySpace; |
112 | | - // Minimum DMA transfer size supported by the target in bytes. |
113 | | - const int minDmaTransferSize; |
114 | | - // Capacity of the faster memory space. |
115 | | - uint64_t fastMemCapacityBytes; |
116 | | - |
117 | | - // If set, generate DMA operations instead of read/write. |
118 | | - bool generateDma; |
119 | | - |
120 | | - // If set, ignore loops with steps other than 1. |
121 | | - bool skipNonUnitStrideLoops; |
122 | | - |
123 | | - // Constant zero index to avoid too many duplicates. |
124 | | - Value zeroIndex = nullptr; |
| 1 | +struct A { |
| 2 | + int a; |
125 | 3 | }; |
126 | 4 |
127 | | -} // end anonymous namespace |
| 5 | +static A a; |
128 | 6 |
129 | | -/// Generates copies for memref's living in 'slowMemorySpace' into newly created |
130 | | -/// buffers in 'fastMemorySpace', and replaces memory operations to the former |
131 | | -/// by the latter. Only load op's are handled for now.
132 | | -/// TODO(bondhugula): extend this to store op's. |
133 | | -std::unique_ptr<OpPassBase<FuncOp>> mlir::createAffineDataCopyGenerationPass( |
134 | | - unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace, |
135 | | - int minDmaTransferSize, uint64_t fastMemCapacityBytes) { |
136 | | - return std::make_unique<AffineDataCopyGeneration>( |
137 | | - slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize, |
138 | | - fastMemCapacityBytes); |
139 | | -} |
140 | | - |
141 | | -/// Generate copies for this block. The block is partitioned into separate |
142 | | -/// ranges: each range is either a sequence of one or more operations starting |
143 | | -/// and ending with an affine load or store op, or just an affine.for op (which
144 | | -/// could have other affine.for ops nested within).
145 | | -LogicalResult |
146 | | -AffineDataCopyGeneration::runOnBlock(Block *block, |
147 | | - DenseSet<Operation *> ©Nests) { |
148 | | - if (block->empty()) |
149 | | - return success(); |
150 | | - |
151 | | - AffineCopyOptions copyOptions = {generateDma, slowMemorySpace, |
152 | | - fastMemorySpace, tagMemorySpace, |
153 | | - fastMemCapacityBytes}; |
154 | | - |
155 | | -  // Every affine.for op in the block starts and ends a block range for copying;
156 | | - // in addition, a contiguous sequence of operations starting with a |
157 | | - // load/store op but not including any copy nests themselves is also |
158 | | -  // identified as a copy block range. Straight-line code (a contiguous chunk
159 | | -  // of operations excluding AffineForOp's) is always assumed to not exhaust
160 | | - // memory. As a result, this approach is conservative in some cases at the |
161 | | - // moment; we do a check later and report an error with location info. |
162 | | -  // TODO(bondhugula): An 'affine.if' operation is currently treated like any
163 | | -  // other operation. 'affine.if' ops could have 'affine.for' ops nested in
164 | | -  // them; treat them separately.
165 | | - |
166 | | - // Get to the first load, store, or for op (that is not a copy nest itself). |
167 | | - auto curBegin = |
168 | | - std::find_if(block->begin(), block->end(), [&](Operation &op) { |
169 | | - return (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op) || |
170 | | - isa<AffineForOp>(op)) && |
171 | | - copyNests.count(&op) == 0; |
172 | | - }); |
173 | | - |
174 | | - // Create [begin, end) ranges. |
175 | | - auto it = curBegin; |
176 | | - while (it != block->end()) { |
177 | | - AffineForOp forOp; |
178 | | -    // If we hit a non-copy 'for' loop, we will split there.
179 | | - if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) { |
180 | | -      // Perform the copying up until this 'for' op first.
181 | | - affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions, |
182 | | - /*filterMemRef=*/llvm::None, copyNests); |
183 | | - |
184 | | - // Returns true if the footprint is known to exceed capacity. |
185 | | - auto exceedsCapacity = [&](AffineForOp forOp) { |
186 | | - Optional<int64_t> footprint = |
187 | | - getMemoryFootprintBytes(forOp, |
188 | | - /*memorySpace=*/0); |
189 | | - return (footprint.hasValue() && |
190 | | - static_cast<uint64_t>(footprint.getValue()) > |
191 | | - fastMemCapacityBytes); |
192 | | - }; |
193 | | - |
194 | | - // If the memory footprint of the 'affine.for' loop is higher than fast |
195 | | - // memory capacity (when provided), we recurse to copy at an inner level |
196 | | - // until we find a depth at which footprint fits in fast mem capacity. If |
197 | | - // the footprint can't be calculated, we assume for now it fits. Recurse |
198 | | - // inside if footprint for 'forOp' exceeds capacity, or when |
199 | | - // skipNonUnitStrideLoops is set and the step size is not one. |
200 | | - bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1 |
201 | | - : exceedsCapacity(forOp); |
202 | | - if (recurseInner) { |
203 | | -        // We'll recurse and do the copies at an inner level for 'forOp'.
204 | | -        // Recurse into the body of this loop.
205 | | - runOnBlock(forOp.getBody(), copyNests); |
206 | | - } else { |
207 | | - // We have enough capacity, i.e., copies will be computed for the |
208 | | - // portion of the block until 'it', and for 'it', which is 'forOp'. Note |
209 | | - // that for the latter, the copies are placed just before this loop (for |
210 | | - // incoming copies) and right after (for outgoing ones). |
211 | | - |
212 | | -        // Inner loop copies have their own scope - we thus don't update the
213 | | -        // consumed capacity. The footprint check above guarantees this inner
214 | | - // loop's footprint fits. |
215 | | - affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it), copyOptions, |
216 | | - /*filterMemRef=*/llvm::None, copyNests); |
217 | | - } |
218 | | - // Get to the next load or store op after 'forOp'. |
219 | | - curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) { |
220 | | - return (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op) || |
221 | | - isa<AffineForOp>(op)) && |
222 | | - copyNests.count(&op) == 0; |
223 | | - }); |
224 | | - it = curBegin; |
225 | | - } else { |
226 | | - assert(copyNests.count(&*it) == 0 && |
227 | | - "all copy nests generated should have been skipped above"); |
228 | | - // We simply include this op in the current range and continue for more. |
229 | | - ++it; |
230 | | - } |
231 | | - } |
232 | | - |
233 | | - // Generate the copy for the final block range. |
234 | | - if (curBegin != block->end()) { |
235 | | - // Can't be a terminator because it would have been skipped above. |
236 | | - assert(!curBegin->isKnownTerminator() && "can't be a terminator"); |
237 | | - // Exclude the affine terminator - hence, the std::prev. |
238 | | - affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/std::prev(block->end()), |
239 | | - copyOptions, /*filterMemRef=*/llvm::None, copyNests); |
240 | | - } |
241 | | - |
242 | | - return success(); |
243 | | -} |
244 | | - |
245 | | -void AffineDataCopyGeneration::runOnFunction() { |
246 | | - FuncOp f = getFunction(); |
247 | | - OpBuilder topBuilder(f.getBody()); |
248 | | - zeroIndex = topBuilder.create<ConstantIndexOp>(f.getLoc(), 0); |
249 | | - |
250 | | - // Nests that are copy-in's or copy-out's; the root AffineForOps of those |
251 | | - // nests are stored herein. |
252 | | - DenseSet<Operation *> copyNests; |
253 | | - |
254 | | - // Clear recorded copy nests. |
255 | | - copyNests.clear(); |
| 7 | +struct B { |
| 8 | + B(int b = a.a) {} |
| 9 | +}; |
256 | 10 |
257 | | - for (auto &block : f) |
258 | | - runOnBlock(&block, copyNests); |
259 | 11 |
260 | | - // Promote any single iteration loops in the copy nests. |
261 | | - for (auto nest : copyNests) { |
262 | | - nest->walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); }); |
263 | | - } |
| 12 | +void foo() { |
| 13 | + B(); |
264 | 14 | } |
265 | | - |
266 | | -static PassRegistration<AffineDataCopyGeneration> |
267 | | - pass("affine-data-copy-generate", |
268 | | - "Generate explicit copying for memory operations"); |
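
For reference, the pass removed above registers itself under the pass name 'affine-data-copy-generate' (see the PassRegistration at the bottom of the file), together with the 'affine-data-copy-generate-*' command-line options defined near the top. A minimal invocation sketch follows; the input file name and the chosen option values are illustrative assumptions, not taken from this commit:

    mlir-opt input.mlir \
        -affine-data-copy-generate \
        -affine-data-copy-generate-dma=false \
        -affine-data-copy-generate-fast-mem-space=2 \
        -affine-data-copy-generate-fast-mem-capacity=16

With these options, accesses to memrefs in the default slow memory space 0 would be promoted to buffers in memory space 2 using point-wise copy loops rather than DMAs, with the fast memory capacity capped at 16 KiB (the capacity option is interpreted in KiB and converted to bytes in the constructor above).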