@@ -15,6 +15,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUTargetMachine.h"
 #include "SIModeRegisterDefaults.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
@@ -27,6 +28,7 @@
 #include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ValueHandle.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/KnownBits.h"
@@ -106,6 +108,7 @@ class AMDGPUCodeGenPrepareImpl
   bool FlowChanged = false;
   mutable Function *SqrtF32 = nullptr;
   mutable Function *LdexpF32 = nullptr;
+  mutable SmallVector<WeakVH> DeadVals;

   DenseMap<const PHINode *, bool> BreakPhiNodesCache;
@@ -242,6 +245,8 @@ class AMDGPUCodeGenPrepareImpl
   Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
                           FastMathFlags FMF) const;

+  bool tryNarrowMathIfNoOverflow(Instruction *I);
+
 public:
   bool visitFDiv(BinaryOperator &I);
@@ -281,28 +286,21 @@ bool AMDGPUCodeGenPrepareImpl::run() {
   BreakPhiNodesCache.clear();
   bool MadeChange = false;

-  Function::iterator NextBB;
-  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
-    BasicBlock *BB = &*FI;
-    NextBB = std::next(FI);
-
-    BasicBlock::iterator Next;
-    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
-         I = Next) {
-      Next = std::next(I);
-
-      MadeChange |= visit(*I);
-
-      if (Next != E) { // Control flow changed
-        BasicBlock *NextInstBB = Next->getParent();
-        if (NextInstBB != BB) {
-          BB = NextInstBB;
-          E = BB->end();
-          FE = F.end();
-        }
-      }
+  // Need to use make_early_inc_range because integer division expansion is
+  // handled by Transform/Utils, and it can delete instructions such as the
+  // terminator of the BB.
+  for (BasicBlock &BB : reverse(F)) {
+    for (Instruction &I : make_early_inc_range(reverse(BB))) {
+      if (!isInstructionTriviallyDead(&I, TLI))
+        MadeChange |= visit(I);
     }
   }
+
+  while (!DeadVals.empty()) {
+    if (auto *I = dyn_cast_or_null<Instruction>(DeadVals.pop_back_val()))
+      RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
+  }
+
   return MadeChange;
 }
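Note: this new run() loop is the heart of the change. Here is a minimal sketch of the pattern it adopts, pulled out of diff form, under the assumption that the visitor pushes replaced instructions onto the worklist instead of erasing them; `runDeferredDCE` and `VisitOne` are illustrative names, not part of the pass:

```cpp
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;

// Illustrative stand-in for the pass's run(): visit bottom-up, defer all
// deletions to a WeakVH worklist, then drain the worklist once.
static bool runDeferredDCE(Function &F, const TargetLibraryInfo *TLI,
                           SmallVectorImpl<WeakVH> &DeadVals,
                           function_ref<bool(Instruction &)> VisitOne) {
  bool MadeChange = false;

  // Reverse iteration visits a user before its operands, so use-driven
  // combines (e.g. rsq formation from fdiv + sqrt) see the rewritten user
  // first. make_early_inc_range keeps the walk valid even if VisitOne
  // removes the current instruction, as integer division expansion can.
  for (BasicBlock &BB : reverse(F))
    for (Instruction &I : make_early_inc_range(reverse(BB)))
      if (!isInstructionTriviallyDead(&I, TLI))
        MadeChange |= VisitOne(I);

  // Drain the worklist. A WeakVH nulls itself when its instruction dies,
  // so entries invalidated by an earlier recursive deletion are skipped.
  while (!DeadVals.empty())
    if (auto *I = dyn_cast_or_null<Instruction>(DeadVals.pop_back_val()))
      RecursivelyDeleteTriviallyDeadInstructions(I, TLI);

  return MadeChange;
}
```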
@@ -422,7 +420,7 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
   Value *NewVal = insertValues(Builder, Ty, ResultVals);
   NewVal->takeName(&I);
   I.replaceAllUsesWith(NewVal);
-  I.eraseFromParent();
+  DeadVals.push_back(&I);

   return true;
 }
@@ -496,10 +494,10 @@ bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
                                           FoldedT, FoldedF);
   NewSelect->takeName(&BO);
   BO.replaceAllUsesWith(NewSelect);
-  BO.eraseFromParent();
+  DeadVals.push_back(&BO);
   if (CastOp)
-    CastOp->eraseFromParent();
-  Sel->eraseFromParent();
+    DeadVals.push_back(CastOp);
+  DeadVals.push_back(Sel);
   return true;
 }
@@ -895,7 +893,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
   if (NewVal) {
     FDiv.replaceAllUsesWith(NewVal);
     NewVal->takeName(&FDiv);
-    RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLI);
+    DeadVals.push_back(&FDiv);
   }

   return true;
@@ -1302,10 +1300,7 @@ it will create `s_and_b32 s0, s0, 0xff`.
 We accept this change since the non-byte load assumes the upper bits
 within the byte are all 0.
 */
-static bool tryNarrowMathIfNoOverflow(Instruction *I,
-                                      const SITargetLowering *TLI,
-                                      const TargetTransformInfo &TTI,
-                                      const DataLayout &DL) {
+bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) {
   unsigned Opc = I->getOpcode();
   Type *OldType = I->getType();
@@ -1330,6 +1325,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I,
   NewType = I->getType()->getWithNewBitWidth(NewBit);

   // Old cost
+  const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
   InstructionCost OldCost =
       TTI.getArithmeticInstrCost(Opc, OldType, TTI::TCK_RecipThroughput);
   // New cost of new op
@@ -1360,7 +1356,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I,

   Value *Zext = Builder.CreateZExt(Arith, OldType);
   I->replaceAllUsesWith(Zext);
-  I->eraseFromParent();
+  DeadVals.push_back(I);
   return true;
 }
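Note: pulled out of diff form, the reshaped tryNarrowMathIfNoOverflow amounts to a cost-gated trunc → op → zext rewrite. A sketch under stated assumptions — the overflow proof and the choice of NewBit happen before this point, and `narrowIfCheaper` is an illustrative name:

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/ValueHandle.h"

using namespace llvm;

// Assumes the caller already proved the op cannot overflow in NewBit bits
// (via known-bits / no-wrap reasoning, as the real pass does).
static bool narrowIfCheaper(Instruction *I, unsigned NewBit,
                            const TargetTransformInfo &TTI,
                            SmallVectorImpl<WeakVH> &DeadVals) {
  unsigned Opc = I->getOpcode();
  Type *OldTy = I->getType();
  Type *NewTy = OldTy->getWithNewBitWidth(NewBit);

  // Rewrite only when the cost model says the narrow op is cheaper.
  InstructionCost OldCost = TTI.getArithmeticInstrCost(
      Opc, OldTy, TargetTransformInfo::TCK_RecipThroughput);
  InstructionCost NewCost = TTI.getArithmeticInstrCost(
      Opc, NewTy, TargetTransformInfo::TCK_RecipThroughput);
  if (NewCost >= OldCost)
    return false;

  // Truncate the operands, redo the op at the narrow width, and widen the
  // result back with zext; the upper bits are zero by the overflow proof.
  IRBuilder<> Builder(I);
  Value *LHS = Builder.CreateTrunc(I->getOperand(0), NewTy);
  Value *RHS = Builder.CreateTrunc(I->getOperand(1), NewTy);
  Value *Arith =
      Builder.CreateBinOp(static_cast<Instruction::BinaryOps>(Opc), LHS, RHS);
  Value *Zext = Builder.CreateZExt(Arith, OldTy);

  I->replaceAllUsesWith(Zext);
  DeadVals.push_back(I); // erased later by run()'s drain loop
  return true;
}
```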
@@ -1370,8 +1366,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {

   if (UseMul24Intrin && replaceMulWithMul24(I))
     return true;
-  if (tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(),
-                                TM.getTargetTransformInfo(F), DL))
+  if (tryNarrowMathIfNoOverflow(&I))
     return true;

   bool Changed = false;
@@ -1436,7 +1431,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {

       if (NewDiv) {
         I.replaceAllUsesWith(NewDiv);
-        I.eraseFromParent();
+        DeadVals.push_back(&I);
         Changed = true;
       }
     }
@@ -1492,7 +1487,7 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
     Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
     Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
     I.replaceAllUsesWith(ValOrig);
-    I.eraseFromParent();
+    DeadVals.push_back(&I);
     return true;
   }
@@ -1534,7 +1529,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {

   Fract->takeName(&I);
   I.replaceAllUsesWith(Fract);
-  RecursivelyDeleteTriviallyDeadInstructions(&I, TLI);
+  DeadVals.push_back(&I);
   return true;
 }
@@ -1822,7 +1817,7 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
   }

   I.replaceAllUsesWith(Vec);
-  I.eraseFromParent();
+  DeadVals.push_back(&I);
   return true;
 }
@@ -1903,7 +1898,7 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
   auto *Intrin = B.CreateIntrinsic(
       I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
   I.replaceAllUsesWith(Intrin);
-  I.eraseFromParent();
+  DeadVals.push_back(&I);
   return true;
 }
@@ -2000,16 +1995,10 @@ bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
   Value *Fract = applyFractPat(Builder, FractArg);
   Fract->takeName(&I);
   I.replaceAllUsesWith(Fract);
-
-  RecursivelyDeleteTriviallyDeadInstructions(&I, TLI);
+  DeadVals.push_back(&I);
   return true;
 }

-static bool isOneOrNegOne(const Value *Val) {
-  const APFloat *C;
-  return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0;
-}
-
 // Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
 bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
   Type *Ty = Sqrt.getType()->getScalarType();
@@ -2030,18 +2019,6 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
   if (ReqdAccuracy < 1.0f)
     return false;

-  // FIXME: This is an ugly hack for this pass using forward iteration instead
-  // of reverse. If it worked like a normal combiner, the rsq would form before
-  // we saw a sqrt call.
-  auto *FDiv =
-      dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser());
-  if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
-      FDiv->getFPAccuracy() >= 1.0f &&
-      canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) &&
-      // TODO: We should also handle the arcp case for the fdiv with non-1 value
-      isOneOrNegOne(FDiv->getOperand(0)))
-    return false;
-
   Value *SrcVal = Sqrt.getOperand(0);
   bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
@@ -2065,7 +2042,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
   Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
   NewSqrt->takeName(&Sqrt);
   Sqrt.replaceAllUsesWith(NewSqrt);
-  Sqrt.eraseFromParent();
+  DeadVals.push_back(&Sqrt);
   return true;
 }