 #include "AMDGPU.h"
 
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
@@ -122,7 +124,7 @@ static bool bothArmsSafeToSpeculateLoads(Value *A, Value *B, uint64_t Size,
     LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: false arm "
                       << "(B) not dereferenceable for " << Size
                       << " bytes at align(1)\n");
-    LLVM_DEBUG(dbgs() << " false arm (B) value: " << *B << "\n");
+    LLVM_DEBUG(dbgs() << " false arm (B) value: " << *B << '\n');
     return false;
   }
 
@@ -132,7 +134,7 @@ static bool bothArmsSafeToSpeculateLoads(Value *A, Value *B, uint64_t Size,
   if (AlignB < Align(1)) {
     LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: known "
                       << "alignment of false arm (B) < 1: " << AlignB.value()
-                      << "\n");
+                      << '\n');
     return false;
   }
 
@@ -141,7 +143,7 @@ static bool bothArmsSafeToSpeculateLoads(Value *A, Value *B, uint64_t Size,
     LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: true arm "
                       << "(A) not dereferenceable for " << Size
                       << " bytes at align(1)\n");
-    LLVM_DEBUG(dbgs() << " true arm (A) value: " << *A << "\n");
+    LLVM_DEBUG(dbgs() << " true arm (A) value: " << *A << '\n');
     return false;
   }
 
@@ -151,28 +153,19 @@ static bool bothArmsSafeToSpeculateLoads(Value *A, Value *B, uint64_t Size,
   if (AlignA < Align(1)) {
     LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: known "
                       << "alignment of true arm (A) < 1: " << AlignA.value()
-                      << "\n");
+                      << '\n');
     return false;
   }
 
   OutAlign = minAlign(AlignA, AlignB);
   LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Speculative loads allowed: "
-                    << "minAlign=" << OutAlign.value() << "\n");
+                    << "minAlign=" << OutAlign.value() << '\n');
   return true;
 }
 
-// With opaque pointers, ensure address spaces match and otherwise return Ptr.
-// Assumes the address space is the only property to validate for this cast.
-static Value *castPtrTo(Value *Ptr, unsigned ExpectedAS) {
-  auto *FromPTy = cast<PointerType>(Ptr->getType());
-  unsigned AS = FromPTy->getAddressSpace();
-  (void)ExpectedAS;
-  assert(AS == ExpectedAS && "Address space mismatch for castPtrTo");
-  return Ptr;
-}
-
 struct AMDGPUVectorIdiomImpl {
   const unsigned MaxBytes;
+  bool CFGChanged = false;
 
   AMDGPUVectorIdiomImpl(unsigned MaxBytes) : MaxBytes(MaxBytes) {}
 
@@ -182,12 +175,12 @@ struct AMDGPUVectorIdiomImpl {
   // calls. Assumptions:
   // - Non-volatile, constant length, within MaxBytes.
   // - Source and destination in the same address space.
-  bool transformSelectMemcpySource(MemTransferInst &MT, SelectInst &Sel,
+  bool transformSelectMemcpySource(MemCpyInst &MT, SelectInst &Sel,
                                    const DataLayout &DL,
                                    const DominatorTree *DT,
                                    AssumptionCache *AC) {
     LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Considering memcpy(select-src): "
-                      << MT << "\n");
+                      << MT << '\n');
     // Note: SelectInst has no volatility, but keep the check for consistency
     // with original logic; emit a debug message if it ever triggers.
     IRBuilder<> B(&MT);
@@ -235,7 +228,7 @@ struct AMDGPUVectorIdiomImpl {
                         << "dereferenceable(" << DerefBytes << ")"
                         << (HasDerefOrNull ? " (or null)" : "")
                         << (HasNonNull ? ", nonnull" : "") << ", align "
-                        << ProvenSrcAlign.value() << "\n");
+                        << ProvenSrcAlign.value() << '\n');
       if (DerefBytes >= N && (!HasDerefOrNull || HasNonNull)) {
         LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Using memcpy source operand "
                           << "attributes at this use; accepting speculation\n");
@@ -247,40 +240,34 @@ struct AMDGPUVectorIdiomImpl {
                           << "enough for speculation: need dereferenceable(" << N
                           << ") and nonnull; got dereferenceable(" << DerefBytes << ")"
                           << (HasDerefOrNull ? " (or null)" : "")
-                          << (HasNonNull ? ", nonnull" : "") << "\n");
+                          << (HasNonNull ? ", nonnull" : "") << '\n');
       }
     } else {
       LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] memcpy source param has no "
                         << "dereferenceable bytes attribute; align "
-                        << ProvenSrcAlign.value() << "\n");
+                        << ProvenSrcAlign.value() << '\n');
     }
     if (!CanSpeculate)
       CanSpeculate =
           bothArmsSafeToSpeculateLoads(A, Bv, N, AlignAB, DL, AC, DT, &MT);
 
-    unsigned AS = cast<PointerType>(A->getType())->getAddressSpace();
-
     if (CanSpeculate) {
       Align MinAlign = std::min(AlignAB, DstAlign);
       LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Rewriting memcpy(select-src) "
                         << "with value-level select; N=" << N
-                        << " minAlign=" << MinAlign.value() << "\n");
+                        << " minAlign=" << MinAlign.value() << '\n');
 
       Type *Ty = getIntOrVecTypeForSize(N, B.getContext(), MinAlign);
 
-      Value *PA = castPtrTo(A, AS);
-      Value *PB = castPtrTo(Bv, AS);
-      LoadInst *LA = B.CreateAlignedLoad(Ty, PA, MinAlign);
-      LoadInst *LB = B.CreateAlignedLoad(Ty, PB, MinAlign);
+      LoadInst *LA = B.CreateAlignedLoad(Ty, A, MinAlign);
+      LoadInst *LB = B.CreateAlignedLoad(Ty, Bv, MinAlign);
       Value *V = B.CreateSelect(Sel.getCondition(), LA, LB);
 
-      Value *PDst =
-          castPtrTo(Dst, cast<PointerType>(Dst->getType())->getAddressSpace());
-      (void)B.CreateAlignedStore(V, PDst, DstAlign);
+      (void)B.CreateAlignedStore(V, Dst, DstAlign);
 
       LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Rewrote memcpy(select-src) to "
                            "value-select loads/stores: "
-                        << MT << "\n");
+                        << MT << '\n');
       MT.eraseFromParent();
       return true;
     }
@@ -297,12 +284,12 @@ struct AMDGPUVectorIdiomImpl {
   // Rewrites memcpy when the destination is a select of pointers. To avoid
   // speculative stores, always splits the CFG and emits a memcpy per branch.
   // Assumptions mirror the source case.
-  bool transformSelectMemcpyDest(MemTransferInst &MT, SelectInst &Sel) {
+  bool transformSelectMemcpyDest(MemCpyInst &MT, SelectInst &Sel) {
     Value *DA = Sel.getTrueValue();
     Value *DB = Sel.getFalseValue();
     LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Rewriting memcpy(select-dst) via "
                       << "CFG split to avoid speculative stores: " << MT
-                      << "\n");
+                      << '\n');
 
     splitCFGForMemcpy(MT, Sel.getCondition(), DA, DB, false);
     LLVM_DEBUG(
@@ -316,8 +303,10 @@ struct AMDGPUVectorIdiomImpl {
   // Assumptions:
   // - MT has constant length and is non-volatile.
   // - TruePtr/FalsePtr are correct replacements for the selected operand.
-  void splitCFGForMemcpy(MemTransferInst &MT, Value *Cond, Value *TruePtr,
+  void splitCFGForMemcpy(MemCpyInst &MT, Value *Cond, Value *TruePtr,
                          Value *FalsePtr, bool IsSource) {
+    CFGChanged = true;
+
     Function *F = MT.getFunction();
     BasicBlock *Cur = MT.getParent();
     BasicBlock *ThenBB = BasicBlock::Create(F->getContext(), "memcpy.then", F);
@@ -373,19 +362,16 @@ AMDGPUVectorIdiomCombinePass::run(Function &F, FunctionAnalysisManager &FAM) {
   if (!AMDGPUVectorIdiomEnable)
     return PreservedAnalyses::all();
 
-  SmallVector<CallInst *, 8> Worklist;
+  SmallVector<MemCpyInst *, 8> Worklist;
   for (Instruction &I : instructions(F)) {
-    if (auto *CI = dyn_cast<CallInst>(&I)) {
-      if (isa<MemTransferInst>(CI))
-        Worklist.push_back(CI);
-    }
+    if (auto *MC = dyn_cast<MemCpyInst>(&I))
+      Worklist.push_back(MC);
   }
 
   bool Changed = false;
   AMDGPUVectorIdiomImpl Impl(MaxBytes);
 
-  for (CallInst *CI : Worklist) {
-    auto *MT = cast<MemTransferInst>(CI);
+  for (MemCpyInst *MT : Worklist) {
     Value *Dst = MT->getRawDest();
     Value *Src = MT->getRawSource();
     if (!isa<SelectInst>(Src) && !isa<SelectInst>(Dst))
@@ -399,48 +385,45 @@ AMDGPUVectorIdiomCombinePass::run(Function &F, FunctionAnalysisManager &FAM) {
     Value *LenV = MT->getLength();
 
     auto dumpPtrForms = [&](StringRef Label, Value *V) {
-      dbgs() << " " << Label << ": " << *V << "\n";
+      dbgs() << " " << Label << ": " << *V << '\n';
 
-      // Strip pointer casts only
       Value *StripCasts = V->stripPointerCasts();
       if (StripCasts != V)
-        dbgs() << " - stripCasts: " << *StripCasts << "\n";
+        dbgs() << " - stripCasts: " << *StripCasts << '\n';
       else
         dbgs() << " - stripCasts: (no change)\n";
 
-      // Strip GEPs and casts to the base object (older API: no DataLayout
-      // arg) Optionally increase MaxLookup if you want to chase deeper.
-      Value *Underlying = getUnderlyingObject(V /*, MaxLookup=6*/);
+      Value *Underlying = getUnderlyingObject(V);
       if (Underlying != V)
-        dbgs() << " - underlying: " << *Underlying << "\n";
+        dbgs() << " - underlying: " << *Underlying << '\n';
       else
         dbgs() << " - underlying: (no change)\n";
     };
 
     auto dumpSelect = [&](StringRef Which, Value *V) {
       if (auto *SI = dyn_cast<SelectInst>(V)) {
-        dbgs() << " - " << Which << " is Select: " << *SI << "\n";
-        dbgs() << " cond: " << *SI->getCondition() << "\n";
+        dbgs() << " - " << Which << " is Select: " << *SI << '\n';
+        dbgs() << " cond: " << *SI->getCondition() << '\n';
         Value *T = SI->getTrueValue();
         Value *Fv = SI->getFalseValue();
         dumpPtrForms("true", T);
         dumpPtrForms("false", Fv);
       }
     };
 
-    dbgs() << "[AMDGPUVectorIdiom] Found memcpy: " << *MT << "\n"
-           << " in function: " << F.getName() << "\n"
-           << " - volatile=" << (MT->isVolatile() ? "true" : "false") << "\n"
+    dbgs() << "[AMDGPUVectorIdiom] Found memcpy: " << *MT << '\n'
+           << " in function: " << F.getName() << '\n'
+           << " - volatile=" << (MT->isVolatile() ? "true" : "false") << '\n'
            << " - sameAS=" << (DstAS == SrcAS ? "true" : "false")
           << " (dstAS=" << DstAS << ", srcAS=" << SrcAS << ")\n"
           << " - constLen=" << (isa<ConstantInt>(LenV) ? "true" : "false");
     if (auto *LCI = dyn_cast<ConstantInt>(LenV))
       dbgs() << " (N=" << LCI->getLimitedValue() << ")";
-    dbgs() << "\n"
+    dbgs() << '\n'
           << " - srcIsSelect=" << (isa<SelectInst>(SrcV) ? "true" : "false")
-           << "\n"
+           << '\n'
           << " - dstIsSelect=" << (isa<SelectInst>(DstV) ? "true" : "false")
-           << "\n";
+           << '\n';
 
     // Detailed dumps
     dumpSelect("src", SrcV);
@@ -487,5 +470,21 @@ AMDGPUVectorIdiomCombinePass::run(Function &F, FunctionAnalysisManager &FAM) {
                       << "destination is a select of pointers\n");
   }
 
-  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  // Be conservative: preserve only analyses we know remain valid.
+  PreservedAnalyses PA;
+  PA.preserve<AssumptionAnalysis>();
+  PA.preserve<TargetLibraryAnalysis>();
+  PA.preserve<TargetIRAnalysis>();
+
+  // If we didn't change the CFG, we can keep DT/LI/PostDT.
+  if (!Impl.CFGChanged) {
+    PA.preserve<DominatorTreeAnalysis>();
+    PA.preserve<LoopAnalysis>();
+    PA.preserve<PostDominatorTreeAnalysis>();
+  }
+
+  return PA;
 }
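
For readers skimming the diff, the following is a minimal, illustrative C++ analogue of the select-source rewrite that the patch performs at the IR level (CreateAlignedLoad / CreateSelect / CreateAlignedStore). It is a sketch, not part of the patch: the names Payload, beforeRewrite, and afterRewrite are hypothetical, and Payload merely stands in for the integer-or-vector type that getIntOrVecTypeForSize picks once both arms are proven dereferenceable.

// Illustrative sketch only; not taken from the patch.
#include <cstdint>
#include <cstring>

using Payload = uint64_t; // hypothetical stand-in for the N-byte transfer type

// Before: the memcpy source is a select of pointers.
void beforeRewrite(void *Dst, const void *A, const void *B, bool Cond) {
  std::memcpy(Dst, Cond ? A : B, sizeof(Payload));
}

// After: two speculative loads, a value-level select, one unconditional store.
void afterRewrite(void *Dst, const void *A, const void *B, bool Cond) {
  Payload LA, LB;
  std::memcpy(&LA, A, sizeof(Payload));  // load the true arm
  std::memcpy(&LB, B, sizeof(Payload));  // load the false arm
  Payload V = Cond ? LA : LB;            // select on values, not pointers
  std::memcpy(Dst, &V, sizeof(Payload)); // single unconditional store
}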