Skip to content

Commit 0e647db

Browse files
author
alex-t
committed
[AMDGPU] Automatic conversion from wave32 to wave64
1 parent 7f684c9 commit 0e647db

File tree

7 files changed

+480
-0
lines changed

7 files changed

+480
-0
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ FunctionPass *createSIMemoryLegalizerPass();
5151
FunctionPass *createSIInsertWaitcntsPass();
5252
FunctionPass *createSIPreAllocateWWMRegsLegacyPass();
5353
FunctionPass *createSIFormMemoryClausesLegacyPass();
54+
FunctionPass *createSIConvertWaveSizeLegacyPass(const TargetMachine *);
5455

5556
FunctionPass *createSIPostRABundlerPass();
5657
FunctionPass *createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *);
@@ -174,6 +175,9 @@ extern char &SIShrinkInstructionsLegacyID;
174175
void initializeSIFixSGPRCopiesLegacyPass(PassRegistry &);
175176
extern char &SIFixSGPRCopiesLegacyID;
176177

178+
void initializeSIConvertWaveSizeLegacyPass(PassRegistry &);
179+
extern char &SIConvertWaveSizeLegacyID;
180+
177181
void initializeSIFixVGPRCopiesLegacyPass(PassRegistry &);
178182
extern char &SIFixVGPRCopiesID;
179183

llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
6767
AMDGPUUnifyDivergentExitNodesPass())
6868
FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
6969
FUNCTION_PASS("si-annotate-control-flow", SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
70+
FUNCTION_PASS("si-convert-wave-size", SIConvertWaveSizePass(*static_cast<const GCNTargetMachine *>(this)))
7071
#undef FUNCTION_PASS
7172

7273
#ifndef FUNCTION_ANALYSIS

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
#include "R600TargetMachine.h"
4545
#include "SIFixSGPRCopies.h"
4646
#include "SIFixVGPRCopies.h"
47+
#include "SIConvertWaveSize.h"
4748
#include "SIFoldOperands.h"
4849
#include "SIFormMemoryClauses.h"
4950
#include "SILoadStoreOptimizer.h"
@@ -506,6 +507,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
506507
initializeSILowerSGPRSpillsLegacyPass(*PR);
507508
initializeSIFixSGPRCopiesLegacyPass(*PR);
508509
initializeSIFixVGPRCopiesLegacyPass(*PR);
510+
initializeSIConvertWaveSizeLegacyPass(*PR);
509511
initializeSIFoldOperandsLegacyPass(*PR);
510512
initializeSIPeepholeSDWALegacyPass(*PR);
511513
initializeSIShrinkInstructionsLegacyPass(*PR);

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ add_llvm_target(AMDGPUCodeGen
150150
SIAnnotateControlFlow.cpp
151151
SIFixSGPRCopies.cpp
152152
SIFixVGPRCopies.cpp
153+
SIConvertWaveSize.cpp
153154
SIFoldOperands.cpp
154155
SIFormMemoryClauses.cpp
155156
SIFrameLowering.cpp
Lines changed: 321 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,321 @@
1+
//===- SIConvertWaveSize.cpp - Automatically converts wave32 kernels to wave64
2+
//---------===//
3+
//
4+
// Part of the LLVM Project, under the Apache License v2.0 WITH LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
//
8+
//===----------------------------------------------------------------------===//
9+
//
10+
/// \file
11+
// Small, short-lived kernels may become wave-slot limited.
12+
// To work around the problem an optimization is proposed to convert such
13+
// kernels from wave32 to wave64 automatically. These kernels shall conform to a
14+
// strict set of limitations and satisfy profitability conditions.
15+
//
16+
// 1. A kernel shall have no function calls as we cannot analyze call stack
17+
// requirements (nor will it fall into a category of short living kernels
18+
// anyway).
19+
// 2. A kernel itself shall not be called from a device enqueue call.
20+
// 3. A kernel shall not attempt to access EXEC or VCC in any user visible
21+
// way.
22+
// 4. A kernel must not use readlane/readfirstlane or any cross-lane/DPP
23+
// operations in general.
24+
// 5. A kernel shall not read wavefront size or use ballot through
25+
// intrinsics (a use of pre-defined frontend wave size macro was deemed
26+
// permissible for now).
27+
// 6. There shall be no atomic operations of any sort as these may be used
28+
// for cross-thread communication.
29+
// 7. There shall be no LDS access as the allocation is usually tied to the
30+
// workgroup size and we generally cannot extend it. It is also changing
31+
// occupancy which is tied to the wave size.
32+
// 8. There shall be no inline asm calls.
33+
// 9. There shall be no dynamic VGPRs.
34+
// 10. Starting from GFX11 some instructions (such as WMMA on GFX11+ and
35+
// transpose loads on GFX12+) work differently (have different operands) in
36+
// wave32 and wave64. The kernel shall not have intrinsics to invoke such
37+
// instructions.
38+
39+
#include "SIConvertWaveSize.h"
40+
#include "AMDGPU.h"
41+
#include "GCNSubtarget.h"
42+
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
43+
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
44+
#include "llvm/IR/IntrinsicsAMDGPU.h"
45+
#include "llvm/InitializePasses.h"
46+
47+
using namespace llvm;
48+
49+
#define DEBUG_TYPE "si-convert-wave-size"
50+
51+
namespace {
52+
class SIConvertWaveSize {
53+
const TargetMachine *TM;
54+
const LoopInfo *LI;
55+
ScalarEvolution *SE;
56+
TargetTransformInfo *TTI;
57+
58+
InstructionCost TotalCost = 0;
59+
60+
static const unsigned MaxLatency = 2000;
61+
62+
SmallVector<Function *> Callees;
63+
64+
public:
65+
SIConvertWaveSize(const TargetMachine *TM, const LoopInfo *LI,
66+
ScalarEvolution *SE, TargetTransformInfo *TTI)
67+
: TM(TM), LI(LI), SE(SE), TTI(TTI) {}
68+
69+
bool run(Function &F);
70+
71+
bool changeWaveSizeAttr(Function *F);
72+
};
73+
74+
class SIConvertWaveSizeLegacy : public FunctionPass {
75+
const TargetMachine *TM;
76+
77+
public:
78+
static char ID;
79+
SIConvertWaveSizeLegacy(const TargetMachine *TM) : FunctionPass(ID), TM(TM) {}
80+
bool runOnFunction(Function &F) override {
81+
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
82+
auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
83+
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
84+
SIConvertWaveSize Impl(TM, &LI, &SE, &TTI);
85+
return Impl.run(F);
86+
}
87+
StringRef getPassName() const override { return "SI convert wave size"; }
88+
void getAnalysisUsage(AnalysisUsage &AU) const override {
89+
AU.addRequired<LoopInfoWrapperPass>();
90+
AU.addRequired<ScalarEvolutionWrapperPass>();
91+
AU.setPreservesAll();
92+
FunctionPass::getAnalysisUsage(AU);
93+
}
94+
};
95+
} // end anonymous namespace
96+
97+
void printFunctionAttributes(const Function &F) {
98+
LLVM_DEBUG(dbgs() << "Function: " << F.getName() << "\n");
99+
for (const auto &Attr : F.getAttributes()) {
100+
LLVM_DEBUG(dbgs() << " Attribute: " << Attr.getAsString() << "\n");
101+
}
102+
}
103+
104+
bool SIConvertWaveSize::run(Function &F) {
105+
LLVM_DEBUG(dbgs() << "Running SIConvertWaveSize on function: " << F.getName() << "\n");
106+
LLVM_DEBUG(printFunctionAttributes(F));
107+
108+
const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
109+
if (ST.getGeneration() < AMDGPUSubtarget::GFX11)
110+
return false;
111+
112+
// Check if the function is a kernel.
113+
if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
114+
return false;
115+
116+
// Check if the kernel is wave32
117+
if (F.hasFnAttribute("target-features")) {
118+
if (!F.getFnAttribute("target-features")
119+
.getValueAsString().contains("wavefrontsize32")) {
120+
LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Kernel is not wave32.\n");
121+
return false;
122+
}
123+
}
124+
125+
// Check if the function is a device enqueue call.
126+
if (F.hasFnAttribute("amdgpu-device-enqueue")) {
127+
LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Device enqueue call detected.\n");
128+
return false;
129+
}
130+
131+
// Check if a trip count is a compile time constant for all loops in the
132+
// kernel
133+
for (Loop *L : *LI) {
134+
const SCEV *TripCountSCEV = SE->getBackedgeTakenCount(L);
135+
if (!isa<SCEVConstant>(TripCountSCEV)) {
136+
LLVM_DEBUG(
137+
dbgs() << "SIConvertWaveSize: Trip count is not a compile time "
138+
"constant.\n");
139+
return false;
140+
}
141+
}
142+
143+
for (const auto &BB : F) {
144+
InstructionCost BlockCost = 0;
145+
for (const auto &I : BB) {
146+
if (const CallBase *CB = dyn_cast<CallBase>(&I)) {
147+
// FIXME: Any calls are not allowed. Only non-converged intrinsic clls
148+
// and amdgsn_s_barrier are exempt. InlineAsm and Atomics are checkedd
149+
// separately for debug purposes. This will be changed in the final
150+
// version.
151+
if (CB->isInlineAsm()) {
152+
// Inline assembly is not allowed.
153+
LLVM_DEBUG(dbgs()
154+
<< "SIConvertWaveSize: Inline assembly detected.\n");
155+
return false;
156+
}
157+
if (CB->isAtomic()) {
158+
// Atomic operations are not allowed.
159+
LLVM_DEBUG(dbgs()
160+
<< "SIConvertWaveSize: Atomic operation detected.\n");
161+
return false;
162+
}
163+
if (Function *Callee = CB->getCalledFunction()) {
164+
// assuming readlane/readfirstlane or any cross-lane/DPP
165+
// operations have "let isConvergent = 1" in IntrinsicsAMDGPU.td
166+
if (Callee->isIntrinsic()) {
167+
if (Callee->hasFnAttribute(Attribute::Convergent)) {
168+
if (Callee->getIntrinsicID() != Intrinsic::amdgcn_s_barrier) {
169+
// TODO: what else should go in a "white list" ?
170+
// Intrinsic::amdgcn_s_barrier_wavefront ?
171+
// Intrinsic::amdgcn_s_barrier_signal ?
172+
LLVM_DEBUG(dbgs()
173+
<< "SIConvertWaveSize: Convergent intrinsic "
174+
<< Callee->getName() << " detected.\n");
175+
return false;
176+
}
177+
}
178+
179+
if (Callee->getIntrinsicID() == Intrinsic::read_register) {
180+
if (const auto *MDVal =
181+
dyn_cast<MetadataAsValue>(CB->getArgOperand(0))) {
182+
Metadata *MD = MDVal->getMetadata();
183+
if (auto *MDNodeVal = dyn_cast<MDNode>(MD)) {
184+
if (MDNodeVal->getNumOperands() >= 1) {
185+
if (auto *MDStr =
186+
dyn_cast<MDString>(MDNodeVal->getOperand(0))) {
187+
if (MDStr->getString().starts_with("exec") ||
188+
MDStr->getString().starts_with("vcc")) {
189+
LLVM_DEBUG(dbgs() << "SIConvertWaveSize: read_register("
190+
<< MDStr->getString()
191+
<< ") intrinsic detected.\n");
192+
return false;
193+
}
194+
}
195+
}
196+
}
197+
}
198+
}
199+
200+
// Save callee as a candidate for attribute change
201+
Callees.push_back(Callee);
202+
}
203+
} else {
204+
// General calls are not allowed.
205+
LLVM_DEBUG(dbgs() << "SIConvertWaveSize: function call detected.\n");
206+
return false;
207+
}
208+
}
209+
// No LDS access is allowed
210+
if (auto LI = dyn_cast<LoadInst>(&I)) {
211+
if (LI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
212+
LLVM_DEBUG(dbgs() << "SIConvertWaveSize: LDS access detected.\n");
213+
return false;
214+
}
215+
}
216+
if (auto SI = dyn_cast<StoreInst>(&I)) {
217+
if (SI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
218+
LLVM_DEBUG(dbgs() << "SIConvertWaveSize: LDS access detected.\n");
219+
return false;
220+
}
221+
}
222+
// TODO: All atomics are not allowed?
223+
// if (auto AI = dyn_cast<AtomicRMWInst>(&I)) {
224+
// if (AI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
225+
// LLVM_DEBUG(dbgs() << "SIConvertWaveSize: LDS access
226+
// detected.\n"); return false;
227+
// }
228+
// }
229+
230+
// TODO: Dynamic VGPRS and GFX11+ special operations ???
231+
BlockCost +=
232+
TTI->getInstructionCost(&I, TargetTransformInfo::TCK_RecipThroughput);
233+
}
234+
if (auto L = LI->getLoopFor(&BB)) {
235+
const SCEV *TripCount = SE->getBackedgeTakenCount(L);
236+
if (auto *C = dyn_cast<SCEVConstant>(TripCount)) {
237+
uint64_t TC = C->getValue()->getZExtValue() + 1;
238+
size_t Depth = LI->getLoopDepth(&BB);
239+
BlockCost *= TC * Depth;
240+
} else
241+
llvm_unreachable("SIConvertWaveSize: only loops with compile time "
242+
"constant trip count could reach here!\n");
243+
}
244+
TotalCost += BlockCost;
245+
if (TotalCost.isValid()) {
246+
if (TotalCost.getValue().value() >= MaxLatency) {
247+
LLVM_DEBUG(
248+
dbgs() << "SIConvertWaveSize: Total latency of the kernel ["
249+
<< TotalCost.getValue().value()
250+
<< "] exceeds the limit of 2000 cycles - not profitable!\n");
251+
return false;
252+
}
253+
} else
254+
llvm_unreachable(
255+
"SIConvertWaveSize: Cost model error - invalid state!\n");
256+
}
257+
258+
// Additional checks can be added here...
259+
260+
// If all checks pass, convert wave size from wave32 to wave64.
261+
// Conversion logic goes here...
262+
bool Changed = changeWaveSizeAttr(&F);
263+
if (Changed)
264+
// Now take care of the intrinsic calls
265+
for (auto C : Callees) {
266+
// TODO: if we could not change Attr for one of the callee
267+
// we need to rollback all the changes!
268+
changeWaveSizeAttr(C);
269+
}
270+
271+
return Changed;
272+
}
273+
274+
/// Rewrites the "target-features" attribute of \p F so that the first
/// occurrence of "+wavefrontsize32" becomes "+wavefrontsize64"; the rest of
/// the feature string is kept as is.
///
/// \returns true if the attribute was rewritten, false when \p F has no valid
/// "target-features" attribute or it does not contain "+wavefrontsize32".
bool SIConvertWaveSize::changeWaveSizeAttr(Function *F) {
  const Attribute Features = F->getFnAttribute("target-features");
  if (!Features.isValid())
    return false;

  const StringRef OldFeatures = Features.getValueAsString();
  const StringRef Wave32 = "+wavefrontsize32";
  const size_t At = OldFeatures.find(Wave32);
  if (At == StringRef::npos)
    return false;

  // Splice: <text before> + "+wavefrontsize64" + <text after>.
  std::string NewFeatures = OldFeatures.substr(0, At).str();
  NewFeatures.append("+wavefrontsize64");
  NewFeatures += OldFeatures.substr(At + Wave32.size()).str();

  // Drop the old attribute before installing the rewritten one.
  F->removeFnAttr("target-features");
  F->addFnAttr("target-features", NewFeatures);
  LLVM_DEBUG(dbgs() << "SIConvertWaveSize: Converted wave size for "
                    << F->getName() << " from wave32 to wave64.\n");
  return true;
}
296+
297+
INITIALIZE_PASS_BEGIN(SIConvertWaveSizeLegacy, DEBUG_TYPE, "SI convert wave size",
298+
false, false)
299+
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
300+
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
301+
INITIALIZE_PASS_END(SIConvertWaveSizeLegacy, DEBUG_TYPE, "SI convert wave size",
302+
false, false)
303+
304+
char SIConvertWaveSizeLegacy::ID = 0;
305+
306+
char &llvm::SIConvertWaveSizeLegacyID = SIConvertWaveSizeLegacy::ID;
307+
308+
/// Factory for the legacy pass manager: returns a newly allocated
/// SIConvertWaveSizeLegacy constructed with \p TM.
FunctionPass *llvm::createSIConvertWaveSizeLegacyPass(const TargetMachine *TM) {
  FunctionPass *const NewPass = new SIConvertWaveSizeLegacy(TM);
  return NewPass;
}
311+
312+
/// New pass manager entry point: fetches loop info, scalar evolution and
/// target transform info for \p F from \p FAM and runs the shared
/// implementation. Reports that no analyses are preserved when \p F was
/// changed, and that all are preserved otherwise.
PreservedAnalyses SIConvertWaveSizePass::run(Function &F,
                                             FunctionAnalysisManager &FAM) {
  auto &Loops = FAM.getResult<LoopAnalysis>(F);
  auto &Evolution = FAM.getResult<ScalarEvolutionAnalysis>(F);
  auto &CostInfo = FAM.getResult<TargetIRAnalysis>(F);

  SIConvertWaveSize Converter(TM, &Loops, &Evolution, &CostInfo);
  if (Converter.run(F))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}

0 commit comments

Comments
 (0)