|
1 | | -struct A { |
2 | | - int a; |
3 | | -}; |
| 1 | +//===- AffineDataCopyGeneration.cpp - Explicit memref copying pass ------===//
| 2 | +// |
| 3 | +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | +// See https://llvm.org/LICENSE.txt for license information. |
| 5 | +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | +// |
| 7 | +//===----------------------------------------------------------------------===// |
| 8 | +// |
| 9 | +// This file implements a pass to automatically promote accessed memref regions |
| 10 | +// to buffers in a faster memory space that is explicitly managed, with the |
| 11 | +// necessary data movement operations performed through either regular |
| 12 | +// point-wise loads/stores or DMAs. Such explicit copying (also referred to as
| 13 | +// array packing/unpacking in the literature), when done on arrays that exhibit |
| 14 | +// reuse, results in near elimination of conflict misses, TLB misses, reduced |
| 15 | +// use of hardware prefetch streams, and reduced false sharing. It is also |
| 16 | +// necessary for hardware with explicitly managed levels in the memory
| 17 | +// hierarchy, where DMAs may have to be used. This optimization is often
| 18 | +// performed on already tiled code. |
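+//
+// As a rough illustration (an editor's sketch, not actual pass output), given
+// a loop nest such as:
+//
+//   affine.for %i = 0 to 256 {
+//     %v = affine.load %A[%i] : memref<256xf32>
+//     "use"(%v) : (f32) -> ()
+//   }
+//
+// the accessed region of %A is copied into a buffer allocated in the fast
+// memory space (e.g. memref<256xf32, 1>), the load is rewritten to read from
+// that buffer, and the buffer is freed once the nest completes.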
| 19 | +// |
| 20 | +//===----------------------------------------------------------------------===// |
| 21 | + |
| 22 | +#include "mlir/Analysis/Utils.h" |
| 23 | +#include "mlir/Dialect/AffineOps/AffineOps.h" |
| 24 | +#include "mlir/IR/Builders.h" |
| 25 | +#include "mlir/Pass/Pass.h" |
| 26 | +#include "mlir/Transforms/LoopUtils.h" |
| 27 | +#include "mlir/Transforms/Passes.h" |
| 28 | +#include "mlir/Transforms/Utils.h" |
| 29 | +#include "llvm/ADT/MapVector.h" |
| 30 | +#include "llvm/Support/CommandLine.h" |
| 31 | +#include "llvm/Support/Debug.h" |
| 32 | +#include <algorithm> |
| 33 | + |
| 34 | +#define DEBUG_TYPE "affine-data-copy-generate" |
| 35 | + |
| 36 | +using namespace mlir; |
| 37 | + |
| 38 | +static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options"); |
| 39 | + |
| 40 | +static llvm::cl::opt<unsigned long long> clFastMemoryCapacity( |
| 41 | + "affine-data-copy-generate-fast-mem-capacity", |
| 42 | + llvm::cl::desc( |
| 43 | + "Set fast memory space capacity in KiB (default: unlimited)"), |
| 44 | + llvm::cl::cat(clOptionsCategory)); |
| 45 | + |
| 46 | +static llvm::cl::opt<bool> |
| 47 | + clDma("affine-data-copy-generate-dma", |
| 48 | + llvm::cl::desc("Generate DMA instead of point-wise copy"), |
| 49 | + llvm::cl::cat(clOptionsCategory), llvm::cl::init(true)); |
| 50 | + |
| 51 | +static llvm::cl::opt<unsigned> clFastMemorySpace( |
| 52 | + "affine-data-copy-generate-fast-mem-space", llvm::cl::init(1), |
| 53 | + llvm::cl::desc( |
| 54 | + "Fast memory space identifier for copy generation (default: 1)"), |
| 55 | + llvm::cl::cat(clOptionsCategory)); |
| 56 | + |
| 57 | +static llvm::cl::opt<bool> clSkipNonUnitStrideLoop( |
| 58 | + "affine-data-copy-generate-skip-non-unit-stride-loops", llvm::cl::Hidden, |
| 59 | + llvm::cl::init(false), |
| 60 | + llvm::cl::desc("Testing purposes: avoid non-unit stride loop choice depths " |
| 61 | + "for copy placement"), |
| 62 | + llvm::cl::cat(clOptionsCategory)); |
4 | 63 |
|
5 | | -static A a; |
| 64 | +namespace { |
6 | 65 |
|
7 | | -struct B { |
8 | | - B(int b = a.a) {} |
| 66 | +/// Replaces all loads and stores on memref's living in 'slowMemorySpace' by |
| 67 | +/// introducing copy operations to transfer data into 'fastMemorySpace' and
| 68 | +/// rewriting the original loads/stores to instead load/store from the
| 69 | +/// allocated fast memory buffers. Additional options specify the identifier |
| 70 | +/// corresponding to the fast memory space and the amount of fast memory space |
| 71 | +/// available. The pass traverses through the nesting structure, recursing to |
| 72 | +/// inner levels if necessary to determine at what depth copies need to be |
| 73 | +/// placed so that the allocated buffers fit within the memory capacity |
| 74 | +/// provided. |
| 75 | +// TODO(bondhugula): We currently can't generate copies correctly when stores |
| 76 | +// are strided. Check for strided stores. |
| 77 | +struct AffineDataCopyGeneration |
| 78 | + : public FunctionPass<AffineDataCopyGeneration> { |
| 79 | + explicit AffineDataCopyGeneration( |
| 80 | + unsigned slowMemorySpace = 0, |
| 81 | + unsigned fastMemorySpace = clFastMemorySpace, unsigned tagMemorySpace = 0, |
| 82 | + int minDmaTransferSize = 1024, |
| 83 | + uint64_t fastMemCapacityBytes = |
| 84 | + (clFastMemoryCapacity.getNumOccurrences() > 0 |
| 85 | + ? clFastMemoryCapacity * 1024 // cl-provided size is in KiB |
| 86 | + : std::numeric_limits<uint64_t>::max()), |
| 87 | + bool generateDma = clDma, |
| 88 | + bool skipNonUnitStrideLoops = clSkipNonUnitStrideLoop) |
| 89 | + : slowMemorySpace(slowMemorySpace), fastMemorySpace(fastMemorySpace), |
| 90 | + tagMemorySpace(tagMemorySpace), minDmaTransferSize(minDmaTransferSize), |
| 91 | + fastMemCapacityBytes(fastMemCapacityBytes), generateDma(generateDma), |
| 92 | + skipNonUnitStrideLoops(skipNonUnitStrideLoops) {} |
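+  // Note (editorial): the fast memory capacity flag is specified in KiB, so
+  // e.g. -affine-data-copy-generate-fast-mem-capacity=512 results in a
+  // fastMemCapacityBytes of 512 * 1024 = 524288 bytes.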
| 93 | + |
| 94 | + explicit AffineDataCopyGeneration(const AffineDataCopyGeneration &other) |
| 95 | + : slowMemorySpace(other.slowMemorySpace), |
| 96 | + fastMemorySpace(other.fastMemorySpace), |
| 97 | + tagMemorySpace(other.tagMemorySpace), |
| 98 | + minDmaTransferSize(other.minDmaTransferSize), |
| 99 | + fastMemCapacityBytes(other.fastMemCapacityBytes), |
| 100 | + generateDma(other.generateDma), |
| 101 | + skipNonUnitStrideLoops(other.skipNonUnitStrideLoops) {} |
| 102 | + |
| 103 | + void runOnFunction() override; |
| 104 | + LogicalResult runOnBlock(Block *block, DenseSet<Operation *> ©Nests); |
| 105 | + |
| 106 | + // Slow memory space associated with copies. |
| 107 | + const unsigned slowMemorySpace; |
| 108 | + // Fast memory space associated with copies. |
| 109 | + unsigned fastMemorySpace; |
| 110 | + // Memory space associated with DMA tags. |
| 111 | + unsigned tagMemorySpace; |
| 112 | + // Minimum DMA transfer size supported by the target in bytes. |
| 113 | + const int minDmaTransferSize; |
| 114 | + // Capacity of the faster memory space. |
| 115 | + uint64_t fastMemCapacityBytes; |
| 116 | + |
| 117 | + // If set, generate DMA operations instead of read/write. |
| 118 | + bool generateDma; |
| 119 | + |
| 120 | + // If set, ignore loops with steps other than 1. |
| 121 | + bool skipNonUnitStrideLoops; |
| 122 | + |
| 123 | + // Constant zero index to avoid too many duplicates. |
| 124 | + Value zeroIndex = nullptr; |
9 | 125 | }; |
10 | 126 |
|
| 127 | +} // end anonymous namespace |
11 | 128 |
|
12 | | -void foo() { |
13 | | - B(); |
| 129 | +/// Generates copies for memref's living in 'slowMemorySpace' into newly created |
| 130 | +/// buffers in 'fastMemorySpace', and replaces memory operations on the former
| 131 | +/// with operations on the latter. Only load ops are handled for now.
| 132 | +/// TODO(bondhugula): extend this to store op's. |
| 133 | +std::unique_ptr<OpPassBase<FuncOp>> mlir::createAffineDataCopyGenerationPass( |
| 134 | + unsigned slowMemorySpace, unsigned fastMemorySpace, unsigned tagMemorySpace, |
| 135 | + int minDmaTransferSize, uint64_t fastMemCapacityBytes) { |
| 136 | + return std::make_unique<AffineDataCopyGeneration>( |
| 137 | + slowMemorySpace, fastMemorySpace, tagMemorySpace, minDmaTransferSize, |
| 138 | + fastMemCapacityBytes); |
14 | 139 | } |
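+
+// A minimal usage sketch (editor's illustration; the surrounding pass manager
+// API is assumed and may differ in this MLIR revision):
+//
+//   PassManager pm(module.getContext());
+//   pm.addPass(createAffineDataCopyGenerationPass(
+//       /*slowMemorySpace=*/0, /*fastMemorySpace=*/1, /*tagMemorySpace=*/0,
+//       /*minDmaTransferSize=*/1024, /*fastMemCapacityBytes=*/16 * 1024));
+//   (void)pm.run(module);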
| 140 | + |
| 141 | +/// Generate copies for this block. The block is partitioned into separate |
| 142 | +/// ranges: each range is either a sequence of one or more operations starting |
| 143 | +/// and ending with an affine load or store op, or just an affine.for op (which
| 144 | +/// could have other affine.for ops nested within).
| 145 | +LogicalResult |
| 146 | +AffineDataCopyGeneration::runOnBlock(Block *block, |
| 147 | + DenseSet<Operation *> ©Nests) { |
| 148 | + if (block->empty()) |
| 149 | + return success(); |
| 150 | + |
| 151 | + AffineCopyOptions copyOptions = {generateDma, slowMemorySpace, |
| 152 | + fastMemorySpace, tagMemorySpace, |
| 153 | + fastMemCapacityBytes}; |
| 154 | + |
| 155 | +  // Every affine.for op in the block starts and ends a block range for copying;
| 156 | + // in addition, a contiguous sequence of operations starting with a |
| 157 | + // load/store op but not including any copy nests themselves is also |
| 158 | +  // identified as a copy block range. Straight-line code (a contiguous chunk
| 159 | +  // of operations excluding AffineForOps) is always assumed not to exhaust
| 160 | + // memory. As a result, this approach is conservative in some cases at the |
| 161 | + // moment; we do a check later and report an error with location info. |
| 162 | +  // TODO(bondhugula): An 'affine.if' operation is currently treated like any
| 163 | +  // other operation; 'affine.if's could have 'affine.for's nested within them
| 164 | +  // and should be handled separately.
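+  //
+  // For instance (editorial sketch), a block of the form
+  //
+  //   affine.load ...; addf ...; affine.store ...;   // one copy range
+  //   affine.for ... { ... }                         // a range of its own
+  //   affine.load ...; affine.store ...;             // another copy range
+  //
+  // is split into the three [begin, end) ranges indicated on the right.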
| 165 | + |
| 166 | + // Get to the first load, store, or for op (that is not a copy nest itself). |
| 167 | + auto curBegin = |
| 168 | + std::find_if(block->begin(), block->end(), [&](Operation &op) { |
| 169 | + return (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op) || |
| 170 | + isa<AffineForOp>(op)) && |
| 171 | + copyNests.count(&op) == 0; |
| 172 | + }); |
| 173 | + |
| 174 | + // Create [begin, end) ranges. |
| 175 | + auto it = curBegin; |
| 176 | + while (it != block->end()) { |
| 177 | + AffineForOp forOp; |
| 178 | +    // If we hit a non-copy 'for' loop, we split the range there.
| 179 | + if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) { |
| 180 | +      // Perform the copying for the range up until this 'for' op first.
| 181 | + affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions, |
| 182 | + /*filterMemRef=*/llvm::None, copyNests); |
| 183 | + |
| 184 | + // Returns true if the footprint is known to exceed capacity. |
| 185 | + auto exceedsCapacity = [&](AffineForOp forOp) { |
| 186 | + Optional<int64_t> footprint = |
| 187 | + getMemoryFootprintBytes(forOp, |
| 188 | + /*memorySpace=*/0); |
| 189 | + return (footprint.hasValue() && |
| 190 | + static_cast<uint64_t>(footprint.getValue()) > |
| 191 | + fastMemCapacityBytes); |
| 192 | + }; |
| 193 | + |
| 194 | + // If the memory footprint of the 'affine.for' loop is higher than fast |
| 195 | + // memory capacity (when provided), we recurse to copy at an inner level |
| 196 | + // until we find a depth at which footprint fits in fast mem capacity. If |
| 197 | + // the footprint can't be calculated, we assume for now it fits. Recurse |
| 198 | + // inside if footprint for 'forOp' exceeds capacity, or when |
| 199 | + // skipNonUnitStrideLoops is set and the step size is not one. |
| 200 | + bool recurseInner = skipNonUnitStrideLoops ? forOp.getStep() != 1 |
| 201 | + : exceedsCapacity(forOp); |
| 202 | + if (recurseInner) { |
| 203 | +        // We'll recurse and do the copies at an inner level for 'forOp'.
| 204 | + // Recurse onto the body of this loop. |
| 205 | + runOnBlock(forOp.getBody(), copyNests); |
| 206 | + } else { |
| 207 | + // We have enough capacity, i.e., copies will be computed for the |
| 208 | + // portion of the block until 'it', and for 'it', which is 'forOp'. Note |
| 209 | + // that for the latter, the copies are placed just before this loop (for |
| 210 | + // incoming copies) and right after (for outgoing ones). |
| 211 | + |
| 212 | +        // Inner loop copies have their own scope - we thus don't update the
| 213 | +        // consumed capacity. The footprint check above guarantees this inner
| 214 | + // loop's footprint fits. |
| 215 | + affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it), copyOptions, |
| 216 | + /*filterMemRef=*/llvm::None, copyNests); |
| 217 | + } |
| 218 | + // Get to the next load or store op after 'forOp'. |
| 219 | + curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) { |
| 220 | + return (isa<AffineLoadOp>(op) || isa<AffineStoreOp>(op) || |
| 221 | + isa<AffineForOp>(op)) && |
| 222 | + copyNests.count(&op) == 0; |
| 223 | + }); |
| 224 | + it = curBegin; |
| 225 | + } else { |
| 226 | + assert(copyNests.count(&*it) == 0 && |
| 227 | + "all copy nests generated should have been skipped above"); |
| 228 | + // We simply include this op in the current range and continue for more. |
| 229 | + ++it; |
| 230 | + } |
| 231 | + } |
| 232 | + |
| 233 | + // Generate the copy for the final block range. |
| 234 | + if (curBegin != block->end()) { |
| 235 | + // Can't be a terminator because it would have been skipped above. |
| 236 | + assert(!curBegin->isKnownTerminator() && "can't be a terminator"); |
| 237 | + // Exclude the affine terminator - hence, the std::prev. |
| 238 | + affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/std::prev(block->end()), |
| 239 | + copyOptions, /*filterMemRef=*/llvm::None, copyNests); |
| 240 | + } |
| 241 | + |
| 242 | + return success(); |
| 243 | +} |
| 244 | + |
| 245 | +void AffineDataCopyGeneration::runOnFunction() { |
| 246 | + FuncOp f = getFunction(); |
| 247 | + OpBuilder topBuilder(f.getBody()); |
| 248 | + zeroIndex = topBuilder.create<ConstantIndexOp>(f.getLoc(), 0); |
| 249 | + |
| 250 | +  // Nests that are copy-ins or copy-outs; the root AffineForOps of those
| 251 | + // nests are stored herein. |
| 252 | + DenseSet<Operation *> copyNests; |
| 253 | + |
| 254 | + // Clear recorded copy nests. |
| 255 | + copyNests.clear(); |
| 256 | + |
| 257 | + for (auto &block : f) |
| 258 | + runOnBlock(&block, copyNests); |
| 259 | + |
| 260 | + // Promote any single iteration loops in the copy nests. |
| 261 | + for (auto nest : copyNests) { |
| 262 | + nest->walk([](AffineForOp forOp) { promoteIfSingleIteration(forOp); }); |
| 263 | + } |
| 264 | +} |
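+
+// Editorial note: promoteIfSingleIteration (from LoopUtils) replaces a loop
+// known to have a single iteration, e.g. 'affine.for %i = 0 to 1 { ... }',
+// by its body, so trivially small generated copy loops aren't left wrapped
+// in loops.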
| 265 | + |
| 266 | +static PassRegistration<AffineDataCopyGeneration> |
| 267 | + pass("affine-data-copy-generate", |
| 268 | + "Generate explicit copying for memory operations"); |
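+
+// Example invocation (editorial; flag values are illustrative only):
+//
+//   mlir-opt -affine-data-copy-generate \
+//       -affine-data-copy-generate-fast-mem-space=2 \
+//       -affine-data-copy-generate-fast-mem-capacity=1024 input.mlir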