|
| 1 | +//===- SIConvertWaveSize.cpp - Convert wave32 kernels to wave64 ----------===//
| 2 | +//
| 3 | +// |
| 4 | +// Part of the LLVM Project, under the Apache License v2.0 WITH LLVM Exceptions. |
| 5 | +// See https://llvm.org/LICENSE.txt for license information. |
| 6 | +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 7 | +// |
| 8 | +//===----------------------------------------------------------------------===// |
| 9 | +// |
| 10 | +/// \file |
| 11 | +/// Small, short-lived kernels may become wave-slot limited. To work around
| 12 | +/// this problem, this pass converts such kernels from wave32 to wave64
| 13 | +/// automatically. These kernels shall conform to a strict set of limitations
| 14 | +/// and satisfy profitability conditions:
| 15 | +///
| 16 | +///  1. A kernel shall have no function calls, as we cannot analyze call stack
| 17 | +///     requirements (nor would such a kernel fall into the category of
| 18 | +///     short-lived kernels anyway).
| 19 | +///  2. A kernel itself shall not be called from a device enqueue call.
| 20 | +///  3. A kernel shall not attempt to access EXEC or VCC in any user-visible
| 21 | +///     way.
| 22 | +///  4. A kernel must not use readlane/readfirstlane or any cross-lane/DPP
| 23 | +///     operations in general.
| 24 | +///  5. A kernel shall not read the wavefront size or use ballot through
| 25 | +///     intrinsics (a use of the predefined frontend wave-size macro is deemed
| 26 | +///     permissible for now).
| 27 | +///  6. There shall be no atomic operations of any sort, as these may be used
| 28 | +///     for cross-thread communication.
| 29 | +///  7. There shall be no LDS access, as the allocation is usually tied to the
| 30 | +///     workgroup size and we generally cannot extend it. It also changes
| 31 | +///     occupancy, which is tied to the wave size.
| 32 | +///  8. There shall be no inline asm calls.
| 33 | +///  9. There shall be no dynamic VGPRs.
| 34 | +/// 10. Starting from GFX11, some instructions (such as WMMA on GFX11+ and
| 35 | +///     transpose loads on GFX12+) work differently (have different operands)
| 36 | +///     in wave32 and wave64. The kernel shall not use intrinsics that invoke
| 37 | +///     such instructions.
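| | +///
| | +/// The conversion itself only rewrites the kernel's "target-features"
| | +/// attribute (see changeWaveSizeAttr() below). As an illustration, assuming a
| | +/// hypothetical feature string, a qualifying kernel carrying
| | +///   "target-features"="+wavefrontsize32,..."
| | +/// ends up with
| | +///   "target-features"="+wavefrontsize64,..."
| | +/// and the intrinsic callees recorded along the way get the same rewrite.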
| 38 | + |
| 39 | +#include "SIConvertWaveSize.h" |
| 40 | +#include "AMDGPU.h" |
| 41 | +#include "GCNSubtarget.h" |
| 42 | +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
| 43 | +#include "llvm/Analysis/ScalarEvolutionExpressions.h" |
| 44 | +#include "llvm/IR/IntrinsicsAMDGPU.h" |
| 45 | +#include "llvm/InitializePasses.h" |
| 46 | + |
| 47 | +using namespace llvm; |
| 48 | + |
| 49 | +#define DEBUG_TYPE "si-convert-wave-size" |
| 50 | + |
| 51 | +namespace { |
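| | +/// Checks the restrictions listed in the file comment and, when a kernel
| | +/// qualifies and its estimated cost stays below MaxLatency, rewrites the
| | +/// wave-size target feature of the kernel and of the intrinsic callees it
| | +/// records.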
| 52 | +class SIConvertWaveSize { |
| 53 | + const TargetMachine *TM; |
| 54 | + const LoopInfo *LI; |
| 55 | + ScalarEvolution *SE; |
| 56 | + TargetTransformInfo *TTI; |
| 57 | + |
| 58 | + InstructionCost TotalCost = 0; |
| 59 | + |
| 60 | + static const unsigned MaxLatency = 2000; |
| 61 | + |
| 62 | + SmallVector<Function *> Callees; |
| 63 | + |
| 64 | +public: |
| 65 | + SIConvertWaveSize(const TargetMachine *TM, const LoopInfo *LI, |
| 66 | + ScalarEvolution *SE, TargetTransformInfo *TTI) |
| 67 | + : TM(TM), LI(LI), SE(SE), TTI(TTI) {} |
| 68 | + |
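| | + /// Runs all legality and profitability checks on \p F and, if they pass,
| | + /// converts it (and its recorded callees) to wave64. Returns true if any
| | + /// attribute was changed.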
| 69 | + bool run(Function &F); |
| 70 | + |
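| | + /// Replaces "+wavefrontsize32" with "+wavefrontsize64" in the
| | + /// "target-features" attribute of \p F. Returns true if the attribute was
| | + /// present and rewritten.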
| 71 | + bool changeWaveSizeAttr(Function *F); |
| 72 | +}; |
| 73 | + |
| 74 | +class SIConvertWaveSizeLegacy : public FunctionPass { |
| 75 | + const TargetMachine *TM; |
| 76 | + |
| 77 | +public: |
| 78 | + static char ID; |
| 79 | + SIConvertWaveSizeLegacy(const TargetMachine *TM) : FunctionPass(ID), TM(TM) {} |
| 80 | + bool runOnFunction(Function &F) override { |
| 81 | + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); |
| 82 | + auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); |
| 83 | + auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); |
| 84 | + SIConvertWaveSize Impl(TM, &LI, &SE, &TTI); |
| 85 | + return Impl.run(F); |
| 86 | + } |
| 87 | + StringRef getPassName() const override { return "SI convert wave size"; } |
| 88 | + void getAnalysisUsage(AnalysisUsage &AU) const override { |
| 89 | + AU.addRequired<LoopInfoWrapperPass>(); |
| 90 | + AU.addRequired<ScalarEvolutionWrapperPass>();
| | + AU.addRequired<TargetTransformInfoWrapperPass>();
| 91 | + AU.setPreservesAll(); |
| 92 | + FunctionPass::getAnalysisUsage(AU); |
| 93 | + } |
| 94 | +}; |
| 95 | +} // end anonymous namespace |
| 96 | + |
| 97 | +static void printFunctionAttributes(const Function &F) {
| 98 | + LLVM_DEBUG(dbgs() << "Function: " << F.getName() << "\n"); |
| 99 | + for (const auto &Attr : F.getAttributes()) { |
| 100 | + LLVM_DEBUG(dbgs() << " Attribute: " << Attr.getAsString() << "\n"); |
| 101 | + } |
| 102 | +} |
| 103 | + |
| 104 | +bool SIConvertWaveSize::run(Function &F) { |
| 105 | + LLVM_DEBUG(dbgs() << "Running SIConvertWaveSize on function: " << F.getName() << "\n"); |
| 106 | + LLVM_DEBUG(printFunctionAttributes(F)); |
| 107 | + |
| 108 | + const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F); |
| 109 | + if (ST.getGeneration() < AMDGPUSubtarget::GFX11) |
| 110 | + return false; |
| 111 | + |
| 112 | + // Check if the function is a kernel. |
| 113 | + if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL) |
| 114 | + return false; |
| 115 | + |
| 116 | + // Check if the kernel is wave32 |
| 117 | + if (F.hasFnAttribute("target-features")) { |
| 118 | + if (!F.getFnAttribute("target-features") |
| 119 | + .getValueAsString().contains("wavefrontsize32")) { |
| 120 | + LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Kernel is not wave32.\n"); |
| 121 | + return false; |
| 122 | + } |
| 123 | + } |
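| | + // Note: if the kernel has no "target-features" attribute at all, the wave
| | + // size comes from the subtarget default; such kernels pass this check but
| | + // are left unchanged, because changeWaveSizeAttr() will not find a
| | + // "+wavefrontsize32" feature to rewrite.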
| 124 | + |
| 125 | + // Check if the function is a device enqueue call. |
| 126 | + if (F.hasFnAttribute("amdgpu-device-enqueue")) { |
| 127 | + LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Device enqueue call detected.\n"); |
| 128 | + return false; |
| 129 | + } |
| 130 | + |
| 131 | + // Check that the trip count is a compile-time constant for every loop in
| 132 | + // the kernel, including nested loops; the cost model below relies on it.
| 133 | + for (Loop *L : LI->getLoopsInPreorder()) {
| 134 | + const SCEV *TripCountSCEV = SE->getBackedgeTakenCount(L); |
| 135 | + if (!isa<SCEVConstant>(TripCountSCEV)) { |
| 136 | + LLVM_DEBUG( |
| 137 | + dbgs() << "SIConvertWaveSize: Trip count is not a compile time " |
| 138 | + "constant.\n"); |
| 139 | + return false; |
| 140 | + } |
| 141 | + } |
| 142 | + |
| 143 | + for (const auto &BB : F) { |
| 144 | + InstructionCost BlockCost = 0; |
| 145 | + for (const auto &I : BB) { |
| 146 | + if (const CallBase *CB = dyn_cast<CallBase>(&I)) { |
| 147 | + // FIXME: No calls are allowed; only non-convergent intrinsic calls
| 148 | + // and amdgcn_s_barrier are exempt. InlineAsm and atomics are checked
| 149 | + // separately for debug purposes. This will be changed in the final
| 150 | + // version.
| 151 | + if (CB->isInlineAsm()) { |
| 152 | + // Inline assembly is not allowed. |
| 153 | + LLVM_DEBUG(dbgs() |
| 154 | + << "SIConvertWaveSize: Inline assembly detected.\n"); |
| 155 | + return false; |
| 156 | + } |
| 157 | + if (CB->isAtomic()) { |
| 158 | + // Atomic operations are not allowed. |
| 159 | + LLVM_DEBUG(dbgs() |
| 160 | + << "SIConvertWaveSize: Atomic operation detected.\n"); |
| 161 | + return false; |
| 162 | + } |
| 163 | + if (Function *Callee = CB->getCalledFunction()) { |
| 164 | + // Assuming readlane/readfirstlane and any cross-lane/DPP
| 165 | + // operations have "let isConvergent = 1" in IntrinsicsAMDGPU.td.
| 166 | + if (Callee->isIntrinsic()) { |
| 167 | + if (Callee->hasFnAttribute(Attribute::Convergent)) { |
| 168 | + if (Callee->getIntrinsicID() != Intrinsic::amdgcn_s_barrier) { |
| 169 | + // TODO: what else should go in a "white list" ? |
| 170 | + // Intrinsic::amdgcn_s_barrier_wavefront ? |
| 171 | + // Intrinsic::amdgcn_s_barrier_signal ? |
| 172 | + LLVM_DEBUG(dbgs() |
| 173 | + << "SIConvertWaveSize: Convergent intrinsic " |
| 174 | + << Callee->getName() << " detected.\n"); |
| 175 | + return false; |
| 176 | + } |
| 177 | + } |
| 178 | + |
| 179 | + if (Callee->getIntrinsicID() == Intrinsic::read_register) { |
| 180 | + if (const auto *MDVal = |
| 181 | + dyn_cast<MetadataAsValue>(CB->getArgOperand(0))) { |
| 182 | + Metadata *MD = MDVal->getMetadata(); |
| 183 | + if (auto *MDNodeVal = dyn_cast<MDNode>(MD)) { |
| 184 | + if (MDNodeVal->getNumOperands() >= 1) { |
| 185 | + if (auto *MDStr = |
| 186 | + dyn_cast<MDString>(MDNodeVal->getOperand(0))) { |
| 187 | + if (MDStr->getString().starts_with("exec") || |
| 188 | + MDStr->getString().starts_with("vcc")) { |
| 189 | + LLVM_DEBUG(dbgs() << "SIConvertWaveSize: read_register(" |
| 190 | + << MDStr->getString() |
| 191 | + << ") intrinsic detected.\n"); |
| 192 | + return false; |
| 193 | + } |
| 194 | + } |
| 195 | + } |
| 196 | + } |
| 197 | + } |
| 198 | + } |
| 199 | + |
| 200 | + // Save callee as a candidate for attribute change |
| 201 | + Callees.push_back(Callee); |
| 202 | + } |
| 203 | + } else { |
| 204 | + // General calls are not allowed. |
| 205 | + LLVM_DEBUG(dbgs() << "SIConvertWaveSize: function call detected.\n"); |
| 206 | + return false; |
| 207 | + } |
| 208 | + } |
| 209 | + // No LDS access is allowed |
| 210 | + if (auto *Load = dyn_cast<LoadInst>(&I)) {
| 211 | + if (Load->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
| 212 | + LLVM_DEBUG(dbgs() << "SIConvertWaveSize: LDS access detected.\n");
| 213 | + return false;
| 214 | + }
| 215 | + }
| 216 | + if (auto *Store = dyn_cast<StoreInst>(&I)) {
| 217 | + if (Store->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
| 218 | + LLVM_DEBUG(dbgs() << "SIConvertWaveSize: LDS access detected.\n");
| 219 | + return false;
| 220 | + }
| 221 | + }
| 222 | + // TODO: All atomics are not allowed? |
| 223 | + // if (auto AI = dyn_cast<AtomicRMWInst>(&I)) { |
| 224 | + // if (AI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { |
| 225 | + // LLVM_DEBUG(dbgs() << "SIConvertWaveSize: LDS access |
| 226 | + // detected.\n"); return false; |
| 227 | + // } |
| 228 | + // } |
| 229 | + |
| 230 | + // TODO: Dynamic VGPRS and GFX11+ special operations ??? |
| 231 | + BlockCost += |
| 232 | + TTI->getInstructionCost(&I, TargetTransformInfo::TCK_RecipThroughput); |
| 233 | + } |
| 234 | + if (auto L = LI->getLoopFor(&BB)) { |
| 235 | + const SCEV *TripCount = SE->getBackedgeTakenCount(L); |
| 236 | + if (auto *C = dyn_cast<SCEVConstant>(TripCount)) { |
| 237 | + uint64_t TC = C->getValue()->getZExtValue() + 1; |
| 238 | + size_t Depth = LI->getLoopDepth(&BB); |
| 239 | + BlockCost *= TC * Depth; |
| 240 | + } else |
| 241 | + llvm_unreachable("SIConvertWaveSize: only loops with compile time " |
| 242 | + "constant trip count could reach here!\n"); |
| 243 | + } |
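| | + // For illustration (hypothetical numbers): a block of cost 10 inside a
| | + // loop with a constant trip count of 100 at depth 1 contributes
| | + // 10 * 100 * 1 = 1000 to TotalCost, so two such blocks already reach the
| | + // MaxLatency threshold of 2000 and the kernel is rejected as unprofitable.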
| 244 | + TotalCost += BlockCost; |
| 245 | + if (TotalCost.isValid()) {
| 246 | + if (TotalCost.getValue().value() >= MaxLatency) {
| 247 | + LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Total latency of the kernel ["
| 248 | + << TotalCost.getValue().value() << "] exceeds the limit of "
| 249 | + << MaxLatency << " cycles - not profitable!\n");
| 250 | + return false;
| 251 | + }
| 252 | + } else
| 253 | + llvm_unreachable(
| 254 | + "SIConvertWaveSize: Cost model error - invalid state!\n");
| 256 | + } |
| 257 | + |
| 258 | + // Additional checks can be added here... |
| 259 | + |
| 260 | + // If all checks pass, convert the wave size from wave32 to wave64 by
| 261 | + // rewriting the target-features attribute.
| 262 | + bool Changed = changeWaveSizeAttr(&F); |
| 263 | + if (Changed) |
| 264 | + // Now take care of the intrinsic calls |
| 265 | + for (auto *C : Callees) {
| 266 | + // TODO: if we could not change Attr for one of the callee |
| 267 | + // we need to rollback all the changes! |
| 268 | + changeWaveSizeAttr(C); |
| 269 | + } |
| 270 | + |
| 271 | + return Changed; |
| 272 | +}
| 273 | + |
| 274 | +bool SIConvertWaveSize::changeWaveSizeAttr(Function *F) { |
| 275 | + auto Attr = F->getFnAttribute("target-features"); |
| 276 | + if (Attr.isValid()) { |
| 277 | + StringRef AttrStr = Attr.getValueAsString(); |
| 278 | + size_t Pos = AttrStr.find("+wavefrontsize32"); |
| 279 | + if (Pos != StringRef::npos) { |
| 280 | + // Build a new feature string with "+wavefrontsize32" replaced by
| 281 | + // "+wavefrontsize64".
| 282 | + std::string NewAttrStr =
| 283 | + AttrStr.substr(0, Pos).str() + "+wavefrontsize64" +
| 284 | + AttrStr.substr(Pos + strlen("+wavefrontsize32")).str();
| | + // Re-install the rewritten target-features attribute.
| 285 | + F->removeFnAttr("target-features"); |
| 286 | + F->addFnAttr("target-features", NewAttrStr); |
| 287 | + LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Converted wave size for " |
| 288 | + << F->getName() |
| 289 | + << " from wave32 " |
| 290 | + "to wave64.\n"); |
| 291 | + return true; |
| 292 | + } |
| 293 | + } |
| 294 | + return false; |
| 295 | +} |
| 296 | + |
| 297 | +INITIALIZE_PASS_BEGIN(SIConvertWaveSizeLegacy, DEBUG_TYPE, "SI convert wave size", |
| 298 | + false, false) |
| 299 | +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) |
| 300 | +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
| | +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
| 301 | +INITIALIZE_PASS_END(SIConvertWaveSizeLegacy, DEBUG_TYPE, "SI convert wave size", |
| 302 | + false, false) |
| 303 | + |
| 304 | +char SIConvertWaveSizeLegacy::ID = 0; |
| 305 | + |
| 306 | +char &llvm::SIConvertWaveSizeLegacyID = SIConvertWaveSizeLegacy::ID; |
| 307 | + |
| 308 | +FunctionPass *llvm::createSIConvertWaveSizeLegacyPass(const TargetMachine *TM) { |
| 309 | + return new SIConvertWaveSizeLegacy(TM); |
| 310 | +} |
| 311 | + |
| 312 | +PreservedAnalyses SIConvertWaveSizePass::run( |
| 313 | + Function &F, FunctionAnalysisManager &FAM) { |
| 314 | + auto &LI = FAM.getResult<LoopAnalysis>(F); |
| 315 | + auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(F); |
| 316 | + auto &TTI = FAM.getResult<TargetIRAnalysis>(F); |
| 317 | + |
| 318 | + SIConvertWaveSize Impl(TM, &LI, &SE, &TTI); |
| 319 | + bool Changed = Impl.run(F); |
| 320 | + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); |
| 321 | +} |
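| | +
| | +// A minimal lit-style sketch of exercising this pass (illustrative only; it
| | +// assumes the pass is registered with the new pass manager under the
| | +// DEBUG_TYPE name "si-convert-wave-size", which happens outside this file):
| | +//
| | +//   ; RUN: opt -mtriple=amdgcn -mcpu=gfx1100 -passes=si-convert-wave-size \
| | +//   ; RUN:     -S %s | FileCheck %s
| | +//   ; CHECK: "target-features"="{{.*}}+wavefrontsize64{{.*}}"
| | +//   define amdgpu_kernel void @small_kernel() #0 {
| | +//     ret void
| | +//   }
| | +//   attributes #0 = { "target-features"="+wavefrontsize32" }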