Skip to content

Commit dfa002e

Browse files
committed
revise AMDGPU vector idiom pass
1 parent 59fda7e commit dfa002e

File tree

2 files changed

+408
-96
lines changed

2 files changed

+408
-96
lines changed

llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.cpp

Lines changed: 57 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,14 @@
4646
#include "AMDGPU.h"
4747

4848
#include "llvm/ADT/SmallVector.h"
49-
#include "llvm/Analysis/AliasAnalysis.h"
5049
#include "llvm/Analysis/AssumptionCache.h"
5150
#include "llvm/Analysis/Loads.h"
51+
#include "llvm/Analysis/LoopInfo.h"
52+
#include "llvm/Analysis/PostDominators.h"
5253
#include "llvm/Analysis/TargetLibraryInfo.h"
5354
#include "llvm/Analysis/TargetTransformInfo.h"
5455
#include "llvm/Analysis/ValueTracking.h"
56+
#include "llvm/IR/Dominators.h"
5557
#include "llvm/IR/IRBuilder.h"
5658
#include "llvm/IR/InstIterator.h"
5759
#include "llvm/IR/Instructions.h"
@@ -122,7 +124,7 @@ static bool bothArmsSafeToSpeculateLoads(Value *A, Value *B, uint64_t Size,
122124
LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: false arm "
123125
<< "(B) not dereferenceable for " << Size
124126
<< " bytes at align(1)\n");
125-
LLVM_DEBUG(dbgs() << " false arm (B) value: " << *B << "\n");
127+
LLVM_DEBUG(dbgs() << " false arm (B) value: " << *B << '\n');
126128
return false;
127129
}
128130

@@ -132,7 +134,7 @@ static bool bothArmsSafeToSpeculateLoads(Value *A, Value *B, uint64_t Size,
132134
if (AlignB < Align(1)) {
133135
LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: known "
134136
<< "alignment of false arm (B) < 1: " << AlignB.value()
135-
<< "\n");
137+
<< '\n');
136138
return false;
137139
}
138140

@@ -141,7 +143,7 @@ static bool bothArmsSafeToSpeculateLoads(Value *A, Value *B, uint64_t Size,
141143
LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: true arm "
142144
<< "(A) not dereferenceable for " << Size
143145
<< " bytes at align(1)\n");
144-
LLVM_DEBUG(dbgs() << " true arm (A) value: " << *A << "\n");
146+
LLVM_DEBUG(dbgs() << " true arm (A) value: " << *A << '\n');
145147
return false;
146148
}
147149

@@ -151,28 +153,19 @@ static bool bothArmsSafeToSpeculateLoads(Value *A, Value *B, uint64_t Size,
151153
if (AlignA < Align(1)) {
152154
LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: known "
153155
<< "alignment of true arm (A) < 1: " << AlignA.value()
154-
<< "\n");
156+
<< '\n');
155157
return false;
156158
}
157159

158160
OutAlign = minAlign(AlignA, AlignB);
159161
LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Speculative loads allowed: "
160-
<< "minAlign=" << OutAlign.value() << "\n");
162+
<< "minAlign=" << OutAlign.value() << '\n');
161163
return true;
162164
}
163165

164-
// With opaque pointers, ensure address spaces match and otherwise return Ptr.
165-
// Assumes the address space is the only property to validate for this cast.
166-
static Value *castPtrTo(Value *Ptr, unsigned ExpectedAS) {
167-
auto *FromPTy = cast<PointerType>(Ptr->getType());
168-
unsigned AS = FromPTy->getAddressSpace();
169-
(void)ExpectedAS;
170-
assert(AS == ExpectedAS && "Address space mismatch for castPtrTo");
171-
return Ptr;
172-
}
173-
174166
struct AMDGPUVectorIdiomImpl {
175167
const unsigned MaxBytes;
168+
bool CFGChanged = false;
176169

177170
AMDGPUVectorIdiomImpl(unsigned MaxBytes) : MaxBytes(MaxBytes) {}
178171

@@ -182,12 +175,12 @@ struct AMDGPUVectorIdiomImpl {
182175
// calls. Assumptions:
183176
// - Non-volatile, constant length, within MaxBytes.
184177
// - Source and destination in the same address space.
185-
bool transformSelectMemcpySource(MemTransferInst &MT, SelectInst &Sel,
178+
bool transformSelectMemcpySource(MemCpyInst &MT, SelectInst &Sel,
186179
const DataLayout &DL,
187180
const DominatorTree *DT,
188181
AssumptionCache *AC) {
189182
LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Considering memcpy(select-src): "
190-
<< MT << "\n");
183+
<< MT << '\n');
191184
// Note: SelectInst has no volatility, but keep the check for consistency
192185
// with original logic; emit a debug message if it ever triggers.
193186
IRBuilder<> B(&MT);
@@ -235,7 +228,7 @@ struct AMDGPUVectorIdiomImpl {
235228
<< "dereferenceable(" << DerefBytes << ")"
236229
<< (HasDerefOrNull ? " (or null)" : "")
237230
<< (HasNonNull ? ", nonnull" : "") << ", align "
238-
<< ProvenSrcAlign.value() << "\n");
231+
<< ProvenSrcAlign.value() << '\n');
239232
if (DerefBytes >= N && (!HasDerefOrNull || HasNonNull)) {
240233
LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Using memcpy source operand "
241234
<< "attributes at this use; accepting speculation\n");
@@ -247,40 +240,34 @@ struct AMDGPUVectorIdiomImpl {
247240
<< "enough for speculation: need dereferenceable(" << N
248241
<< ") and nonnull; got dereferenceable(" << DerefBytes << ")"
249242
<< (HasDerefOrNull ? " (or null)" : "")
250-
<< (HasNonNull ? ", nonnull" : "") << "\n");
243+
<< (HasNonNull ? ", nonnull" : "") << '\n');
251244
}
252245
} else {
253246
LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] memcpy source param has no "
254247
<< "dereferenceable bytes attribute; align "
255-
<< ProvenSrcAlign.value() << "\n");
248+
<< ProvenSrcAlign.value() << '\n');
256249
}
257250
if (!CanSpeculate)
258251
CanSpeculate =
259252
bothArmsSafeToSpeculateLoads(A, Bv, N, AlignAB, DL, AC, DT, &MT);
260253

261-
unsigned AS = cast<PointerType>(A->getType())->getAddressSpace();
262-
263254
if (CanSpeculate) {
264255
Align MinAlign = std::min(AlignAB, DstAlign);
265256
LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Rewriting memcpy(select-src) "
266257
<< "with value-level select; N=" << N
267-
<< " minAlign=" << MinAlign.value() << "\n");
258+
<< " minAlign=" << MinAlign.value() << '\n');
268259

269260
Type *Ty = getIntOrVecTypeForSize(N, B.getContext(), MinAlign);
270261

271-
Value *PA = castPtrTo(A, AS);
272-
Value *PB = castPtrTo(Bv, AS);
273-
LoadInst *LA = B.CreateAlignedLoad(Ty, PA, MinAlign);
274-
LoadInst *LB = B.CreateAlignedLoad(Ty, PB, MinAlign);
262+
LoadInst *LA = B.CreateAlignedLoad(Ty, A, MinAlign);
263+
LoadInst *LB = B.CreateAlignedLoad(Ty, Bv, MinAlign);
275264
Value *V = B.CreateSelect(Sel.getCondition(), LA, LB);
276265

277-
Value *PDst =
278-
castPtrTo(Dst, cast<PointerType>(Dst->getType())->getAddressSpace());
279-
(void)B.CreateAlignedStore(V, PDst, DstAlign);
266+
(void)B.CreateAlignedStore(V, Dst, DstAlign);
280267

281268
LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Rewrote memcpy(select-src) to "
282269
"value-select loads/stores: "
283-
<< MT << "\n");
270+
<< MT << '\n');
284271
MT.eraseFromParent();
285272
return true;
286273
}
@@ -297,12 +284,12 @@ struct AMDGPUVectorIdiomImpl {
297284
// Rewrites memcpy when the destination is a select of pointers. To avoid
298285
// speculative stores, always splits the CFG and emits a memcpy per branch.
299286
// Assumptions mirror the source case.
300-
bool transformSelectMemcpyDest(MemTransferInst &MT, SelectInst &Sel) {
287+
bool transformSelectMemcpyDest(MemCpyInst &MT, SelectInst &Sel) {
301288
Value *DA = Sel.getTrueValue();
302289
Value *DB = Sel.getFalseValue();
303290
LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Rewriting memcpy(select-dst) via "
304291
<< "CFG split to avoid speculative stores: " << MT
305-
<< "\n");
292+
<< '\n');
306293

307294
splitCFGForMemcpy(MT, Sel.getCondition(), DA, DB, false);
308295
LLVM_DEBUG(
@@ -316,8 +303,10 @@ struct AMDGPUVectorIdiomImpl {
316303
// Assumptions:
317304
// - MT has constant length and is non-volatile.
318305
// - TruePtr/FalsePtr are correct replacements for the selected operand.
319-
void splitCFGForMemcpy(MemTransferInst &MT, Value *Cond, Value *TruePtr,
306+
void splitCFGForMemcpy(MemCpyInst &MT, Value *Cond, Value *TruePtr,
320307
Value *FalsePtr, bool IsSource) {
308+
CFGChanged = true;
309+
321310
Function *F = MT.getFunction();
322311
BasicBlock *Cur = MT.getParent();
323312
BasicBlock *ThenBB = BasicBlock::Create(F->getContext(), "memcpy.then", F);
@@ -373,19 +362,16 @@ AMDGPUVectorIdiomCombinePass::run(Function &F, FunctionAnalysisManager &FAM) {
373362
if (!AMDGPUVectorIdiomEnable)
374363
return PreservedAnalyses::all();
375364

376-
SmallVector<CallInst *, 8> Worklist;
365+
SmallVector<MemCpyInst *, 8> Worklist;
377366
for (Instruction &I : instructions(F)) {
378-
if (auto *CI = dyn_cast<CallInst>(&I)) {
379-
if (isa<MemTransferInst>(CI))
380-
Worklist.push_back(CI);
381-
}
367+
if (auto *MC = dyn_cast<MemCpyInst>(&I))
368+
Worklist.push_back(MC);
382369
}
383370

384371
bool Changed = false;
385372
AMDGPUVectorIdiomImpl Impl(MaxBytes);
386373

387-
for (CallInst *CI : Worklist) {
388-
auto *MT = cast<MemTransferInst>(CI);
374+
for (MemCpyInst *MT : Worklist) {
389375
Value *Dst = MT->getRawDest();
390376
Value *Src = MT->getRawSource();
391377
if (!isa<SelectInst>(Src) && !isa<SelectInst>(Dst))
@@ -399,48 +385,45 @@ AMDGPUVectorIdiomCombinePass::run(Function &F, FunctionAnalysisManager &FAM) {
399385
Value *LenV = MT->getLength();
400386

401387
auto dumpPtrForms = [&](StringRef Label, Value *V) {
402-
dbgs() << " " << Label << ": " << *V << "\n";
388+
dbgs() << " " << Label << ": " << *V << '\n';
403389

404-
// Strip pointer casts only
405390
Value *StripCasts = V->stripPointerCasts();
406391
if (StripCasts != V)
407-
dbgs() << " - stripCasts: " << *StripCasts << "\n";
392+
dbgs() << " - stripCasts: " << *StripCasts << '\n';
408393
else
409394
dbgs() << " - stripCasts: (no change)\n";
410395

411-
// Strip GEPs and casts to the base object (older API: no DataLayout
412-
// arg) Optionally increase MaxLookup if you want to chase deeper.
413-
Value *Underlying = getUnderlyingObject(V /*, MaxLookup=6*/);
396+
Value *Underlying = getUnderlyingObject(V);
414397
if (Underlying != V)
415-
dbgs() << " - underlying: " << *Underlying << "\n";
398+
dbgs() << " - underlying: " << *Underlying << '\n';
416399
else
417400
dbgs() << " - underlying: (no change)\n";
418401
};
419402

420403
auto dumpSelect = [&](StringRef Which, Value *V) {
421404
if (auto *SI = dyn_cast<SelectInst>(V)) {
422-
dbgs() << " - " << Which << " is Select: " << *SI << "\n";
423-
dbgs() << " cond: " << *SI->getCondition() << "\n";
405+
dbgs() << " - " << Which << " is Select: " << *SI << '\n';
406+
dbgs() << " cond: " << *SI->getCondition() << '\n';
424407
Value *T = SI->getTrueValue();
425408
Value *Fv = SI->getFalseValue();
426409
dumpPtrForms("true", T);
427410
dumpPtrForms("false", Fv);
428411
}
429412
};
430413

431-
dbgs() << "[AMDGPUVectorIdiom] Found memcpy: " << *MT << "\n"
432-
<< " in function: " << F.getName() << "\n"
433-
<< " - volatile=" << (MT->isVolatile() ? "true" : "false") << "\n"
414+
dbgs() << "[AMDGPUVectorIdiom] Found memcpy: " << *MT << '\n'
415+
<< " in function: " << F.getName() << '\n'
416+
<< " - volatile=" << (MT->isVolatile() ? "true" : "false") << '\n'
434417
<< " - sameAS=" << (DstAS == SrcAS ? "true" : "false")
435418
<< " (dstAS=" << DstAS << ", srcAS=" << SrcAS << ")\n"
436419
<< " - constLen=" << (isa<ConstantInt>(LenV) ? "true" : "false");
437420
if (auto *LCI = dyn_cast<ConstantInt>(LenV))
438421
dbgs() << " (N=" << LCI->getLimitedValue() << ")";
439-
dbgs() << "\n"
422+
dbgs() << '\n'
440423
<< " - srcIsSelect=" << (isa<SelectInst>(SrcV) ? "true" : "false")
441-
<< "\n"
424+
<< '\n'
442425
<< " - dstIsSelect=" << (isa<SelectInst>(DstV) ? "true" : "false")
443-
<< "\n";
426+
<< '\n';
444427

445428
// Detailed dumps
446429
dumpSelect("src", SrcV);
@@ -487,5 +470,21 @@ AMDGPUVectorIdiomCombinePass::run(Function &F, FunctionAnalysisManager &FAM) {
487470
<< "destination is a select of pointers\n");
488471
}
489472

490-
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
473+
if (!Changed)
474+
return PreservedAnalyses::all();
475+
476+
// Be conservative: preserve only analyses we know remain valid.
477+
PreservedAnalyses PA;
478+
PA.preserve<AssumptionAnalysis>();
479+
PA.preserve<TargetLibraryAnalysis>();
480+
PA.preserve<TargetIRAnalysis>();
481+
482+
// If we didn't change the CFG, we can keep DT/LI/PostDT.
483+
if (!Impl.CFGChanged) {
484+
PA.preserve<DominatorTreeAnalysis>();
485+
PA.preserve<LoopAnalysis>();
486+
PA.preserve<PostDominatorTreeAnalysis>();
487+
}
488+
489+
return PA;
491490
}

0 commit comments

Comments
 (0)