Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 8 additions & 15 deletions llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,6 @@ static cl::opt<bool> DisableFDivExpand(
cl::ReallyHidden,
cl::init(false));

static bool hasUnsafeFPMath(const Function &F) {
return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
}

class AMDGPUCodeGenPrepareImpl
: public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
public:
Expand All @@ -104,7 +100,6 @@ class AMDGPUCodeGenPrepareImpl
const DominatorTree *DT;
const UniformityInfo &UA;
const DataLayout &DL;
const bool HasUnsafeFPMath;
const bool HasFP32DenormalFlush;
bool FlowChanged = false;
mutable Function *SqrtF32 = nullptr;
Expand All @@ -117,7 +112,6 @@ class AMDGPUCodeGenPrepareImpl
const DominatorTree *DT, const UniformityInfo &UA)
: F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC),
DT(DT), UA(UA), DL(F.getDataLayout()),
HasUnsafeFPMath(hasUnsafeFPMath(F)),
HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
DenormalMode::getPreserveSign()) {}

Expand Down Expand Up @@ -637,8 +631,7 @@ bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
return false;

// v_rsq_f32 gives 1ulp
return SqrtFMF.approxFunc() || HasUnsafeFPMath ||
SqrtOp->getFPAccuracy() >= 1.0f;
return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f;
}

Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
Expand All @@ -664,7 +657,7 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
IRBuilder<>::FastMathFlagGuard Guard(Builder);
Builder.setFastMathFlags(DivFMF | SqrtFMF);

if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath ||
if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
canIgnoreDenormalInput(Den, CtxI)) {
Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
// -1.0 / sqrt(x) -> fneg(rsq(x))
Expand All @@ -680,7 +673,7 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
// Optimize fdiv with rcp:
//
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
// allowed with unsafe-fp-math or afn.
// allowed with afn.
//
// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
Value *
Expand Down Expand Up @@ -803,9 +796,9 @@ Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
//
// With rcp:
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
// allowed with unsafe-fp-math or afn.
// allowed with afn.
//
// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
// a/b -> a*rcp(b) when inaccurate rcp is allowed with afn.
//
// With fdiv.fast:
// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
Expand Down Expand Up @@ -843,7 +836,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
RsqOp = SqrtOp->getOperand(0);
}

// Inaccurate rcp is allowed with unsafe-fp-math or afn.
// Inaccurate rcp is allowed with afn.
//
// Defer to codegen to handle this.
//
Expand All @@ -852,7 +845,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
// expansion of afn to codegen. The current interpretation is so aggressive we
// don't need any pre-consideration here when we have better information. A
// more conservative interpretation could use handling here.
const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
const bool AllowInaccurateRcp = DivFMF.approxFunc();
if (!RsqOp && AllowInaccurateRcp)
return false;

Expand Down Expand Up @@ -2026,7 +2019,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {

// We're trying to handle the fast-but-not-that-fast case only. The lowering
// of fast llvm.sqrt will give the raw instruction anyway.
if (SqrtFMF.approxFunc() || HasUnsafeFPMath)
if (SqrtFMF.approxFunc())
return false;

const float ReqdAccuracy = FPOp->getFPAccuracy();
Expand Down
6 changes: 3 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2634,7 +2634,7 @@ bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
if (Flags.hasApproximateFuncs())
return true;
auto &Options = DAG.getTarget().Options;
return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
return Options.ApproxFuncFPMath;
}

bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
Expand Down Expand Up @@ -2757,7 +2757,7 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,

const auto &Options = getTargetMachine().Options;
if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
Options.ApproxFuncFPMath) {

if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
// Log and multiply in f32 is good enough for f16.
Expand Down Expand Up @@ -3585,7 +3585,7 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) con
if (N0.getValueType() == MVT::f32)
return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);

if (getTargetMachine().Options.UnsafeFPMath) {
if (Op->getFlags().hasApproximateFuncs()) {
// There is a generic expand for FP_TO_FP16 with unsafe fast math.
return SDValue();
}
Expand Down
1 change: 0 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,6 @@ def NoFP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode()
def NoFP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()">;
def IEEEModeEnabled : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">;
def IEEEModeDisabled : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">;
def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
}

def FMA : Predicate<"Subtarget->hasFMA()">;
Expand Down
12 changes: 4 additions & 8 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3344,7 +3344,7 @@ static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
if (Flags & MachineInstr::FmAfn)
return true;
const auto &Options = MF.getTarget().Options;
return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
return Options.ApproxFuncFPMath;
}

static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
Expand Down Expand Up @@ -3450,7 +3450,7 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
TM.Options.ApproxFuncFPMath) {
if (Ty == F16 && !ST.has16BitInsts()) {
Register LogVal = MRI.createGenericVirtualRegister(F32);
auto PromoteSrc = B.buildFPExt(F32, X);
Expand Down Expand Up @@ -4877,9 +4877,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
uint16_t Flags = MI.getFlags();
LLT ResTy = MRI.getType(Res);

const MachineFunction &MF = B.getMF();
bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
MF.getTarget().Options.UnsafeFPMath;
bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);

if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
Expand Down Expand Up @@ -4939,9 +4937,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
uint16_t Flags = MI.getFlags();
LLT ResTy = MRI.getType(Res);

const MachineFunction &MF = B.getMF();
bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
MI.getFlag(MachineInstr::FmAfn);
bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);

if (!AllowInaccurateRcp)
return false;
Expand Down
13 changes: 2 additions & 11 deletions llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,6 @@ class AMDGPULibCalls {

using FuncInfo = llvm::AMDGPULibFunc;

bool UnsafeFPMath = false;

// -fuse-native.
bool AllNative = false;

Expand Down Expand Up @@ -117,7 +115,6 @@ class AMDGPULibCalls {
bool AllowStrictFP = false);

protected:
bool isUnsafeMath(const FPMathOperator *FPOp) const;
bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const;

bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const;
Expand Down Expand Up @@ -415,23 +412,17 @@ bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName,
return AMDGPULibFunc::parse(FMangledName, FInfo);
}

bool AMDGPULibCalls::isUnsafeMath(const FPMathOperator *FPOp) const {
return UnsafeFPMath || FPOp->isFast();
}

bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const {
return UnsafeFPMath ||
(FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs());
return FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs();
}

bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold(
const FPMathOperator *FPOp) const {
// TODO: Refine to approxFunc or contract
return isUnsafeMath(FPOp);
return FPOp->isFast();
}

void AMDGPULibCalls::initFunction(Function &F, FunctionAnalysisManager &FAM) {
UnsafeFPMath = F.getFnAttribute("unsafe-fp-math").getValueAsBool();
AC = &FAM.getResult<AssumptionAnalysis>(F);
TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
Expand Down
4 changes: 1 addition & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -597,7 +597,6 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
// Estimate all types may be fused with contract/unsafe flags
const TargetOptions &Options = TLI->getTargetMachine().Options;
if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
Options.UnsafeFPMath ||
(FAdd->hasAllowContract() && CxtI->hasAllowContract()))
return TargetTransformInfo::TCC_Free;
}
Expand Down Expand Up @@ -650,8 +649,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
return LT.first * Cost * NElts;
}

if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) ||
TLI->getTargetMachine().Options.UnsafeFPMath)) {
if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {
// Fast unsafe fdiv lowering:
// f32 rcp
// f32 fmul
Expand Down
16 changes: 7 additions & 9 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7199,7 +7199,7 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
}
if (getTargetMachine().Options.UnsafeFPMath) {
if (Op->getFlags().hasApproximateFuncs()) {
SDValue Flags = Op.getOperand(1);
SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
Expand Down Expand Up @@ -11294,8 +11294,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
EVT VT = Op.getValueType();
const SDNodeFlags Flags = Op->getFlags();

bool AllowInaccurateRcp =
Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
bool AllowInaccurateRcp = Flags.hasApproximateFuncs();

if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
// Without !fpmath accuracy information, we can't do more because we don't
Expand All @@ -11314,7 +11313,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,

// 1.0 / sqrt(x) -> rsq(x)

// XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
// XXX - Is afn sufficient to do this for f64? The maximum ULP
// error seems really high at 2^29 ULP.
// 1.0 / x -> rcp(x)
return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
Expand Down Expand Up @@ -11348,8 +11347,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
EVT VT = Op.getValueType();
const SDNodeFlags Flags = Op->getFlags();

bool AllowInaccurateDiv =
Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath;
bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
if (!AllowInaccurateDiv)
return SDValue();

Expand Down Expand Up @@ -14601,7 +14599,7 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
return ISD::FMAD;

const TargetOptions &Options = DAG.getTarget().Options;
if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
(N0->getFlags().hasAllowContract() &&
N1->getFlags().hasAllowContract())) &&
isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
Expand Down Expand Up @@ -15724,9 +15722,9 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,

// fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
// regardless of the denorm mode setting. Therefore,
// unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
// fp-contract is sufficient to allow generating fdot2.
const TargetOptions &Options = DAG.getTarget().Options;
if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
(N->getFlags().hasAllowContract() &&
FMA->getFlags().hasAllowContract())) {
Op1 = Op1.getOperand(0);
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
%r1 = load half, ptr addrspace(1) %gep2, align 4
%r2 = frem half %r0, %r1
%r2 = frem afn half %r0, %r1
store half %r2, ptr addrspace(1) %out, align 4
ret void
}
Expand Down Expand Up @@ -311,7 +311,7 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
%gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
%r0 = load float, ptr addrspace(1) %in1, align 4
%r1 = load float, ptr addrspace(1) %gep2, align 4
%r2 = frem float %r0, %r1
%r2 = frem afn float %r0, %r1
store float %r2, ptr addrspace(1) %out, align 4
ret void
}
Expand Down Expand Up @@ -489,7 +489,7 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
ptr addrspace(1) %in2) #1 {
%r0 = load double, ptr addrspace(1) %in1, align 8
%r1 = load double, ptr addrspace(1) %in2, align 8
%r2 = frem double %r0, %r1
%r2 = frem afn double %r0, %r1
store double %r2, ptr addrspace(1) %out, align 8
ret void
}
Expand Down Expand Up @@ -1140,5 +1140,5 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
ret void
}

attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
Loading