Skip to content

Commit 8bacfb2

Browse files
paperchalice and arsenm authored
[AMDGPU] Remove UnsafeFPMath uses (#151079)
Remove `UnsafeFPMath` in AMDGPU part, it blocks some bugfixes related to clang and the ultimate goal is to remove `resetTargetOptions` method in `TargetMachine`, see FIXME in `resetTargetOptions`. See also https://discourse.llvm.org/t/rfc-honor-pragmas-with-ffp-contract-fast https://discourse.llvm.org/t/allowfpopfusion-vs-sdnodeflags-hasallowcontract --------- Co-authored-by: Matt Arsenault <[email protected]>
1 parent c10736a commit 8bacfb2

21 files changed

+1404
-2117
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Lines changed: 8 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,6 @@ static cl::opt<bool> DisableFDivExpand(
8989
cl::ReallyHidden,
9090
cl::init(false));
9191

92-
static bool hasUnsafeFPMath(const Function &F) {
93-
return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
94-
}
95-
9692
class AMDGPUCodeGenPrepareImpl
9793
: public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
9894
public:
@@ -104,7 +100,6 @@ class AMDGPUCodeGenPrepareImpl
104100
const DominatorTree *DT;
105101
const UniformityInfo &UA;
106102
const DataLayout &DL;
107-
const bool HasUnsafeFPMath;
108103
const bool HasFP32DenormalFlush;
109104
bool FlowChanged = false;
110105
mutable Function *SqrtF32 = nullptr;
@@ -117,7 +112,6 @@ class AMDGPUCodeGenPrepareImpl
117112
const DominatorTree *DT, const UniformityInfo &UA)
118113
: F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC),
119114
DT(DT), UA(UA), DL(F.getDataLayout()),
120-
HasUnsafeFPMath(hasUnsafeFPMath(F)),
121115
HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
122116
DenormalMode::getPreserveSign()) {}
123117

@@ -637,8 +631,7 @@ bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
637631
return false;
638632

639633
// v_rsq_f32 gives 1ulp
640-
return SqrtFMF.approxFunc() || HasUnsafeFPMath ||
641-
SqrtOp->getFPAccuracy() >= 1.0f;
634+
return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f;
642635
}
643636

644637
Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
@@ -664,7 +657,7 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
664657
IRBuilder<>::FastMathFlagGuard Guard(Builder);
665658
Builder.setFastMathFlags(DivFMF | SqrtFMF);
666659

667-
if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath ||
660+
if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
668661
canIgnoreDenormalInput(Den, CtxI)) {
669662
Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
670663
// -1.0 / sqrt(x) -> fneg(rsq(x))
@@ -680,7 +673,7 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
680673
// Optimize fdiv with rcp:
681674
//
682675
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
683-
// allowed with unsafe-fp-math or afn.
676+
// allowed with afn.
684677
//
685678
// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
686679
Value *
@@ -803,9 +796,9 @@ Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
803796
//
804797
// With rcp:
805798
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
806-
// allowed with unsafe-fp-math or afn.
799+
// allowed with afn.
807800
//
808-
// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
801+
// a/b -> a*rcp(b) when inaccurate rcp is allowed with afn.
809802
//
810803
// With fdiv.fast:
811804
// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
@@ -843,7 +836,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
843836
RsqOp = SqrtOp->getOperand(0);
844837
}
845838

846-
// Inaccurate rcp is allowed with unsafe-fp-math or afn.
839+
// Inaccurate rcp is allowed with afn.
847840
//
848841
// Defer to codegen to handle this.
849842
//
@@ -852,7 +845,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
852845
// expansion of afn to codegen. The current interpretation is so aggressive we
853846
// don't need any pre-consideration here when we have better information. A
854847
// more conservative interpretation could use handling here.
855-
const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
848+
const bool AllowInaccurateRcp = DivFMF.approxFunc();
856849
if (!RsqOp && AllowInaccurateRcp)
857850
return false;
858851

@@ -2026,7 +2019,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
20262019

20272020
// We're trying to handle the fast-but-not-that-fast case only. The lowering
20282021
// of fast llvm.sqrt will give the raw instruction anyway.
2029-
if (SqrtFMF.approxFunc() || HasUnsafeFPMath)
2022+
if (SqrtFMF.approxFunc())
20302023
return false;
20312024

20322025
const float ReqdAccuracy = FPOp->getFPAccuracy();

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2634,7 +2634,7 @@ bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
26342634
if (Flags.hasApproximateFuncs())
26352635
return true;
26362636
auto &Options = DAG.getTarget().Options;
2637-
return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
2637+
return Options.ApproxFuncFPMath;
26382638
}
26392639

26402640
bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
@@ -2757,7 +2757,7 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
27572757

27582758
const auto &Options = getTargetMachine().Options;
27592759
if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
2760-
Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
2760+
Options.ApproxFuncFPMath) {
27612761

27622762
if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
27632763
// Log and multiply in f32 is good enough for f16.
@@ -3585,7 +3585,7 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) con
35853585
if (N0.getValueType() == MVT::f32)
35863586
return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
35873587

3588-
if (getTargetMachine().Options.UnsafeFPMath) {
3588+
if (Op->getFlags().hasApproximateFuncs()) {
35893589
// There is a generic expand for FP_TO_FP16 with unsafe fast math.
35903590
return SDValue();
35913591
}

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,6 @@ def NoFP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode()
9494
def NoFP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()">;
9595
def IEEEModeEnabled : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">;
9696
def IEEEModeDisabled : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">;
97-
def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
9897
}
9998

10099
def FMA : Predicate<"Subtarget->hasFMA()">;

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3344,7 +3344,7 @@ static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
33443344
if (Flags & MachineInstr::FmAfn)
33453345
return true;
33463346
const auto &Options = MF.getTarget().Options;
3347-
return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3347+
return Options.ApproxFuncFPMath;
33483348
}
33493349

33503350
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
@@ -3450,7 +3450,7 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
34503450
static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
34513451

34523452
if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3453-
TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3453+
TM.Options.ApproxFuncFPMath) {
34543454
if (Ty == F16 && !ST.has16BitInsts()) {
34553455
Register LogVal = MRI.createGenericVirtualRegister(F32);
34563456
auto PromoteSrc = B.buildFPExt(F32, X);
@@ -4877,9 +4877,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
48774877
uint16_t Flags = MI.getFlags();
48784878
LLT ResTy = MRI.getType(Res);
48794879

4880-
const MachineFunction &MF = B.getMF();
4881-
bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4882-
MF.getTarget().Options.UnsafeFPMath;
4880+
bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
48834881

48844882
if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
48854883
if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
@@ -4939,9 +4937,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
49394937
uint16_t Flags = MI.getFlags();
49404938
LLT ResTy = MRI.getType(Res);
49414939

4942-
const MachineFunction &MF = B.getMF();
4943-
bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4944-
MI.getFlag(MachineInstr::FmAfn);
4940+
bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
49454941

49464942
if (!AllowInaccurateRcp)
49474943
return false;

llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,6 @@ class AMDGPULibCalls {
5353

5454
using FuncInfo = llvm::AMDGPULibFunc;
5555

56-
bool UnsafeFPMath = false;
57-
5856
// -fuse-native.
5957
bool AllNative = false;
6058

@@ -117,7 +115,6 @@ class AMDGPULibCalls {
117115
bool AllowStrictFP = false);
118116

119117
protected:
120-
bool isUnsafeMath(const FPMathOperator *FPOp) const;
121118
bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const;
122119

123120
bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const;
@@ -415,23 +412,17 @@ bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName,
415412
return AMDGPULibFunc::parse(FMangledName, FInfo);
416413
}
417414

418-
bool AMDGPULibCalls::isUnsafeMath(const FPMathOperator *FPOp) const {
419-
return UnsafeFPMath || FPOp->isFast();
420-
}
421-
422415
bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const {
423-
return UnsafeFPMath ||
424-
(FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs());
416+
return FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs();
425417
}
426418

427419
bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold(
428420
const FPMathOperator *FPOp) const {
429421
// TODO: Refine to approxFunc or contract
430-
return isUnsafeMath(FPOp);
422+
return FPOp->isFast();
431423
}
432424

433425
void AMDGPULibCalls::initFunction(Function &F, FunctionAnalysisManager &FAM) {
434-
UnsafeFPMath = F.getFnAttribute("unsafe-fp-math").getValueAsBool();
435426
AC = &FAM.getResult<AssumptionAnalysis>(F);
436427
TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
437428
DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -597,7 +597,6 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
597597
// Estimate all types may be fused with contract/unsafe flags
598598
const TargetOptions &Options = TLI->getTargetMachine().Options;
599599
if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
600-
Options.UnsafeFPMath ||
601600
(FAdd->hasAllowContract() && CxtI->hasAllowContract()))
602601
return TargetTransformInfo::TCC_Free;
603602
}
@@ -650,8 +649,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
650649
return LT.first * Cost * NElts;
651650
}
652651

653-
if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) ||
654-
TLI->getTargetMachine().Options.UnsafeFPMath)) {
652+
if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {
655653
// Fast unsafe fdiv lowering:
656654
// f32 rcp
657655
// f32 fmul

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7199,7 +7199,7 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
71997199
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
72007200
return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
72017201
}
7202-
if (getTargetMachine().Options.UnsafeFPMath) {
7202+
if (Op->getFlags().hasApproximateFuncs()) {
72037203
SDValue Flags = Op.getOperand(1);
72047204
SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
72057205
return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
@@ -11294,8 +11294,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
1129411294
EVT VT = Op.getValueType();
1129511295
const SDNodeFlags Flags = Op->getFlags();
1129611296

11297-
bool AllowInaccurateRcp =
11298-
Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
11297+
bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
1129911298

1130011299
if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
1130111300
// Without !fpmath accuracy information, we can't do more because we don't
@@ -11314,7 +11313,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
1131411313

1131511314
// 1.0 / sqrt(x) -> rsq(x)
1131611315

11317-
// XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
11316+
// XXX - Is afn sufficient to do this for f64? The maximum ULP
1131811317
// error seems really high at 2^29 ULP.
1131911318
// 1.0 / x -> rcp(x)
1132011319
return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
@@ -11348,8 +11347,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
1134811347
EVT VT = Op.getValueType();
1134911348
const SDNodeFlags Flags = Op->getFlags();
1135011349

11351-
bool AllowInaccurateDiv =
11352-
Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
11350+
bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
1135311351
if (!AllowInaccurateDiv)
1135411352
return SDValue();
1135511353

@@ -14601,7 +14599,7 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
1460114599
return ISD::FMAD;
1460214600

1460314601
const TargetOptions &Options = DAG.getTarget().Options;
14604-
if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14602+
if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
1460514603
(N0->getFlags().hasAllowContract() &&
1460614604
N1->getFlags().hasAllowContract())) &&
1460714605
isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
@@ -15724,9 +15722,9 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
1572415722

1572515723
// fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
1572615724
// regardless of the denorm mode setting. Therefore,
15727-
// unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
15725+
// fp-contract is sufficient to allow generating fdot2.
1572815726
const TargetOptions &Options = DAG.getTarget().Options;
15729-
if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
15727+
if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
1573015728
(N->getFlags().hasAllowContract() &&
1573115729
FMA->getFlags().hasAllowContract())) {
1573215730
Op1 = Op1.getOperand(0);

llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
157157
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
158158
%r0 = load half, ptr addrspace(1) %in1, align 4
159159
%r1 = load half, ptr addrspace(1) %gep2, align 4
160-
%r2 = frem half %r0, %r1
160+
%r2 = frem afn half %r0, %r1
161161
store half %r2, ptr addrspace(1) %out, align 4
162162
ret void
163163
}
@@ -311,7 +311,7 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
311311
%gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
312312
%r0 = load float, ptr addrspace(1) %in1, align 4
313313
%r1 = load float, ptr addrspace(1) %gep2, align 4
314-
%r2 = frem float %r0, %r1
314+
%r2 = frem afn float %r0, %r1
315315
store float %r2, ptr addrspace(1) %out, align 4
316316
ret void
317317
}
@@ -489,7 +489,7 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
489489
ptr addrspace(1) %in2) #1 {
490490
%r0 = load double, ptr addrspace(1) %in1, align 8
491491
%r1 = load double, ptr addrspace(1) %in2, align 8
492-
%r2 = frem double %r0, %r1
492+
%r2 = frem afn double %r0, %r1
493493
store double %r2, ptr addrspace(1) %out, align 8
494494
ret void
495495
}
@@ -1140,5 +1140,5 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
11401140
ret void
11411141
}
11421142

1143-
attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
1144-
attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
1143+
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
1144+
attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }

0 commit comments

Comments (0)