diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 5f1983791cfae..a9278c1dc3a6a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -89,10 +89,6 @@ static cl::opt<bool> DisableFDivExpand( cl::ReallyHidden, cl::init(false)); -static bool hasUnsafeFPMath(const Function &F) { - return F.getFnAttribute("unsafe-fp-math").getValueAsBool(); -} - class AMDGPUCodeGenPrepareImpl : public InstVisitor<AMDGPUCodeGenPrepareImpl> { public: @@ -104,7 +100,6 @@ class AMDGPUCodeGenPrepareImpl const DominatorTree *DT; const UniformityInfo &UA; const DataLayout &DL; - const bool HasUnsafeFPMath; const bool HasFP32DenormalFlush; bool FlowChanged = false; mutable Function *SqrtF32 = nullptr; @@ -117,7 +112,6 @@ class AMDGPUCodeGenPrepareImpl const DominatorTree *DT, const UniformityInfo &UA) : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC), DT(DT), UA(UA), DL(F.getDataLayout()), - HasUnsafeFPMath(hasUnsafeFPMath(F)), HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals == DenormalMode::getPreserveSign()) {} @@ -637,8 +631,7 @@ bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp, return false; // v_rsq_f32 gives 1ulp - return SqrtFMF.approxFunc() || HasUnsafeFPMath || - SqrtOp->getFPAccuracy() >= 1.0f; + return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f; } Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( @@ -664,7 +657,7 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( IRBuilder<>::FastMathFlagGuard Guard(Builder); Builder.setFastMathFlags(DivFMF | SqrtFMF); - if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath || + if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || canIgnoreDenormalInput(Den, CtxI)) { Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den); // -1.0 / sqrt(x) -> fneg(rsq(x)) @@ -680,7 +673,7 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( // Optimize fdiv with rcp: // // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is -// allowed with unsafe-fp-math or afn. +// allowed with afn. // // a/b -> a*rcp(b) when arcp is allowed, and we only need to provide ULP 1.0 Value * @@ -803,9 +796,9 @@ Value *AMDGPUCodeGenPrepareImpl::visitFDivElement( // // With rcp: // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is -// allowed with unsafe-fp-math or afn. +// allowed with afn. // -// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn. +// a/b -> a*rcp(b) when inaccurate rcp is allowed with afn. // // With fdiv.fast: // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed. @@ -843,7 +836,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { RsqOp = SqrtOp->getOperand(0); } - // Inaccurate rcp is allowed with unsafe-fp-math or afn. + // Inaccurate rcp is allowed with afn. // // Defer to codegen to handle this. // // TODO: Decide on an interpretation for interactions between afn + arcp + // !fpmath, and make it consistent between here and codegen. For now, defer the // expansion of afn to codegen. The current interpretation is so aggressive we // don't need any pre-consideration here when we have better information. A // more conservative interpretation could use handling here.
- const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc(); + const bool AllowInaccurateRcp = DivFMF.approxFunc(); if (!RsqOp && AllowInaccurateRcp) return false; @@ -2026,7 +2019,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { // We're trying to handle the fast-but-not-that-fast case only. The lowering // of fast llvm.sqrt will give the raw instruction anyway. - if (SqrtFMF.approxFunc() || HasUnsafeFPMath) + if (SqrtFMF.approxFunc()) return false; const float ReqdAccuracy = FPOp->getFPAccuracy(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 61189337e5233..31c4f62d24dfe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2634,7 +2634,7 @@ bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG, if (Flags.hasApproximateFuncs()) return true; auto &Options = DAG.getTarget().Options; - return Options.UnsafeFPMath || Options.ApproxFuncFPMath; + return Options.ApproxFuncFPMath; } bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG, @@ -2757,7 +2757,7 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op, const auto &Options = getTargetMachine().Options; if (VT == MVT::f16 || Flags.hasApproximateFuncs() || - Options.ApproxFuncFPMath || Options.UnsafeFPMath) { + Options.ApproxFuncFPMath) { if (VT == MVT::f16 && !Subtarget->has16BitInsts()) { // Log and multiply in f32 is good enough for f16. @@ -3585,7 +3585,7 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) con if (N0.getValueType() == MVT::f32) return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0); - if (getTargetMachine().Options.UnsafeFPMath) { + if (Op->getFlags().hasApproximateFuncs()) { // There is a generic expand for FP_TO_FP16 with afn.
return SDValue(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 7a50923ffedc6..511fc6967da31 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -94,7 +94,6 @@ def NoFP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode() def NoFP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()">; def IEEEModeEnabled : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">; def IEEEModeDisabled : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().IEEE">; -def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; } def FMA : Predicate<"Subtarget->hasFMA()">; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 50da8fd7a47a1..1fdf272ee2191 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3344,7 +3344,7 @@ static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) { if (Flags & MachineInstr::FmAfn) return true; const auto &Options = MF.getTarget().Options; - return Options.UnsafeFPMath || Options.ApproxFuncFPMath; + return Options.ApproxFuncFPMath; } static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, @@ -3450,7 +3450,7 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) || - TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) { + TM.Options.ApproxFuncFPMath) { if (Ty == F16 && !ST.has16BitInsts()) { Register LogVal = MRI.createGenericVirtualRegister(F32); auto PromoteSrc = B.buildFPExt(F32, X); @@ -4877,9 +4877,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, uint16_t Flags = MI.getFlags(); LLT ResTy = MRI.getType(Res); - const MachineFunction &MF = B.getMF(); - bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) || - MF.getTarget().Options.UnsafeFPMath; + bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn); if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) { if (!AllowInaccurateRcp && ResTy != LLT::scalar(16)) @@ -4939,9 +4937,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, uint16_t Flags = MI.getFlags(); LLT ResTy = MRI.getType(Res); - const MachineFunction &MF = B.getMF(); - bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || - MI.getFlag(MachineInstr::FmAfn); + bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn); if (!AllowInaccurateRcp) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index 8767208d20ec9..aa755344d3325 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -53,8 +53,6 @@ class AMDGPULibCalls { using FuncInfo = llvm::AMDGPULibFunc; - bool UnsafeFPMath = false; - // -fuse-native.
bool AllNative = false; @@ -117,7 +115,6 @@ class AMDGPULibCalls { bool AllowStrictFP = false); protected: - bool isUnsafeMath(const FPMathOperator *FPOp) const; bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const; bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const; @@ -415,23 +412,17 @@ bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName, return AMDGPULibFunc::parse(FMangledName, FInfo); } -bool AMDGPULibCalls::isUnsafeMath(const FPMathOperator *FPOp) const { - return UnsafeFPMath || FPOp->isFast(); -} - bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const { - return UnsafeFPMath || - (FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs()); + return FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs(); } bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold( const FPMathOperator *FPOp) const { // TODO: Refine to approxFunc or contract - return isUnsafeMath(FPOp); + return FPOp->isFast(); } void AMDGPULibCalls::initFunction(Function &F, FunctionAnalysisManager &FAM) { - UnsafeFPMath = F.getFnAttribute("unsafe-fp-math").getValueAsBool(); AC = &FAM.getResult<AssumptionAnalysis>(F); TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F); DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 24f4df2aff9d1..a0c99b0ef0491 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -597,7 +597,6 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost( // Estimate all types may be fused with contract flags const TargetOptions &Options = TLI->getTargetMachine().Options; if (Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath || (FAdd->hasAllowContract() && CxtI->hasAllowContract())) return TargetTransformInfo::TCC_Free; } @@ -650,8 +649,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost( return LT.first * Cost * NElts; } - if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) || - TLI->getTargetMachine().Options.UnsafeFPMath)) { + if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) { // Fast unsafe fdiv lowering: // f32 rcp // f32 fmul diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index f4d74088b48d6..ad267576a84af 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7199,7 +7199,7 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16); return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc); } - if (getTargetMachine().Options.UnsafeFPMath) { + if (Op->getFlags().hasApproximateFuncs()) { SDValue Flags = Op.getOperand(1); SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags); return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags); @@ -11294,8 +11294,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); - bool AllowInaccurateRcp = - Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath; + bool AllowInaccurateRcp = Flags.hasApproximateFuncs(); if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { // Without !fpmath accuracy information, we can't do more because we don't @@ -11314,7 +11313,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, // 1.0 / sqrt(x) -> rsq(x) - // XXX - Is UnsafeFPMath sufficient to do this for f64?
The maximum ULP + // XXX - Is afn sufficient to do this for f64? The maximum ULP // error seems really high at 2^29 ULP. // 1.0 / x -> rcp(x) return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); @@ -11348,8 +11347,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op, EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); - bool AllowInaccurateDiv = - Flags.hasApproximateFuncs() || DAG.getTarget().Options.UnsafeFPMath; + bool AllowInaccurateDiv = Flags.hasApproximateFuncs(); if (!AllowInaccurateDiv) return SDValue(); @@ -14601,7 +14599,7 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, return ISD::FMAD; const TargetOptions &Options = DAG.getTarget().Options; - if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || + if ((Options.AllowFPOpFusion == FPOpFusion::Fast || (N0->getFlags().hasAllowContract() && N1->getFlags().hasAllowContract())) && isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) { @@ -15724,9 +15722,9 @@ SDValue SITargetLowering::performFMACombine(SDNode *N, // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero, // regardless of the denorm mode setting. Therefore, - // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2. + // fp-contract is sufficient to allow generating fdot2. const TargetOptions &Options = DAG.getTarget().Options; - if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || + if (Options.AllowFPOpFusion == FPOpFusion::Fast || (N->getFlags().hasAllowContract() && FMA->getFlags().hasAllowContract())) { Op1 = Op1.getOperand(0); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll index eafad58874141..2226fd20fb774 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -157,7 +157,7 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4 %r0 = load half, ptr addrspace(1) %in1, align 4 %r1 = load half, ptr addrspace(1) %gep2, align 4 - %r2 = frem half %r0, %r1 + %r2 = frem afn half %r0, %r1 store half %r2, ptr addrspace(1) %out, align 4 ret void } @@ -311,7 +311,7 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4 %r0 = load float, ptr addrspace(1) %in1, align 4 %r1 = load float, ptr addrspace(1) %gep2, align 4 - %r2 = frem float %r0, %r1 + %r2 = frem afn float %r0, %r1 store float %r2, ptr addrspace(1) %out, align 4 ret void } @@ -489,7 +489,7 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace( ptr addrspace(1) %in2) #1 { %r0 = load double, ptr addrspace(1) %in1, align 8 %r1 = load double, ptr addrspace(1) %in2, align 8 - %r2 = frem double %r0, %r1 + %r2 = frem afn double %r0, %r1 store double %r2, ptr addrspace(1) %out, align 8 ret void } @@ -1140,5 +1140,5 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ret void } -attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } -attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir index 1f9c059c2ac60..3fa73c23cbb28 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir @@ -2,9 +2,8 @@ # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -run-pass=legalizer %s -o - | FileCheck -check-prefix=SI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer %s -o - | FileCheck -check-prefix=VI %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX9 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=legalizer -enable-unsafe-fp-math -o - %s | FileCheck -check-prefix=GFX9-UNSAFE %s # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX10 %s -# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX11 %s --- name: test_fdiv_s16 @@ -99,17 +98,56 @@ body: | ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; - ; GFX9-UNSAFE-LABEL: name: test_fdiv_s16 - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX9-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) - ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC1]](s16) - ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[INT]] - ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL]](s16) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; GFX10-LABEL: name: test_fdiv_s16 + ; GFX10: liveins: $vgpr0, $vgpr1 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) + ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) + ; GFX10-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]] + ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) + ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] + ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]] + ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]] + ; GFX10-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]] + ; GFX10-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]] + ; GFX10-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]] + ; GFX10-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]] + ; GFX10-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]] + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608 + ; GFX10-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C]] + ; GFX10-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]] + ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32) + ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC1]](s16), [[TRUNC]](s16) + ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT 
[[INT1]](s16) + ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; + ; GFX11-LABEL: name: test_fdiv_s16 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX11-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) + ; GFX11-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) + ; GFX11-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]] + ; GFX11-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) + ; GFX11-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] + ; GFX11-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[FPEXT]] + ; GFX11-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT]], [[FMUL]] + ; GFX11-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA1]], [[FPEXT]] + ; GFX11-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMA2]], [[INT]] + ; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608 + ; GFX11-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL1]], [[C]] + ; GFX11-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FMA1]] + ; GFX11-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32) + ; GFX11-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC1]](s16), [[TRUNC]](s16) + ; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) + ; GFX11-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s16) = G_TRUNC %0 @@ -119,6 +157,90 @@ body: | $vgpr0 = COPY %5 ... +--- +name: test_fdiv_s16_afn +machineFunctionInfo: + mode: + fp32-input-denormals: true + fp32-output-denormals: true + fp64-fp16-input-denormals: true + fp64-fp16-output-denormals: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; SI-LABEL: name: test_fdiv_s16_afn + ; SI: liveins: $vgpr0, $vgpr1 + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; SI-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; SI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) + ; SI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) + ; SI-NEXT: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) + ; SI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = afn G_FMUL [[FPEXT]], [[INT]] + ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32) + ; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16) + ; SI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; + ; VI-LABEL: name: test_fdiv_s16_afn + ; VI: liveins: $vgpr0, $vgpr1 + ; VI-NEXT: {{ $}} + ; VI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; VI-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; VI-NEXT: [[INT:%[0-9]+]]:_(s16) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC1]](s16) + ; VI-NEXT: [[FMUL:%[0-9]+]]:_(s16) = afn G_FMUL [[TRUNC]], [[INT]] + ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL]](s16) + ; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; + ; GFX9-LABEL: name: test_fdiv_s16_afn + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: 
[[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s16) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC1]](s16) + ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s16) = afn G_FMUL [[TRUNC]], [[INT]] + ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL]](s16) + ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; + ; GFX10-LABEL: name: test_fdiv_s16_afn + ; GFX10: liveins: $vgpr0, $vgpr1 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s16) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC1]](s16) + ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s16) = afn G_FMUL [[TRUNC]], [[INT]] + ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL]](s16) + ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; + ; GFX11-LABEL: name: test_fdiv_s16_afn + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; GFX11-NEXT: [[INT:%[0-9]+]]:_(s16) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC1]](s16) + ; GFX11-NEXT: [[FMUL:%[0-9]+]]:_(s16) = afn G_FMUL [[TRUNC]], [[INT]] + ; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL]](s16) + ; GFX11-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = G_TRUNC %1 + %4:_(s16) = afn G_FDIV %2, %3 + %5:_(s32) = G_ANYEXT %4 + $vgpr0 = COPY %5 +... 
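For reference, the contract these afn tests encode, written as LLVM IR (a minimal sketch; the function name and f32 type are illustrative, not taken from the patch):

define float @div_afn(float %a, float %b) {
  ; The per-instruction afn flag is now the only gate for the inaccurate
  ; expansion; the removed "unsafe-fp-math"="true" function attribute no
  ; longer enables it.
  %q = fdiv afn float %a, %b   ; lowers to v_rcp_f32 + v_mul_f32
  ret float %q
}

Without afn on the fdiv itself, the full div.scale/div.fmas/div.fixup expansion checked below is used regardless of how the function was compiled.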
+ --- name: test_fdiv_s32_denorms_on machineFunctionInfo: @@ -192,15 +314,6 @@ body: | ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) ; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32) ; - ; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_denorms_on - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) - ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[INT]] - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMUL]](s32) - ; ; GFX10-LABEL: name: test_fdiv_s32_denorms_on ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -220,12 +333,96 @@ body: | ; GFX10-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) ; GFX10-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) ; GFX10-NEXT: $vgpr0 = COPY [[INT6]](s32) + ; + ; GFX11-LABEL: name: test_fdiv_s32_denorms_on + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX11-NEXT: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0 + ; GFX11-NEXT: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1 + ; GFX11-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32) + ; GFX11-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[INT]] + ; GFX11-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[INT4]], [[C]] + ; GFX11-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT4]], [[INT4]] + ; GFX11-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[INT2]], [[FMA1]] + ; GFX11-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[INT2]] + ; GFX11-NEXT: [[FMA3:%[0-9]+]]:_(s32) = G_FMA [[FMA2]], [[FMA1]], [[FMUL]] + ; GFX11-NEXT: [[FMA4:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA3]], [[INT2]] + ; GFX11-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) + ; GFX11-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[INT6]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = G_FDIV %0, %1 $vgpr0 = COPY %2 ... 
+--- +name: test_fdiv_s32_denorms_on_afn +machineFunctionInfo: + mode: + fp32-input-denormals: true + fp32-output-denormals: true + fp64-fp16-input-denormals: true + fp64-fp16-output-denormals: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; SI-LABEL: name: test_fdiv_s32_denorms_on_afn + ; SI: liveins: $vgpr0, $vgpr1 + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; SI-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; SI-NEXT: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) + ; SI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = afn G_FMUL [[COPY]], [[INT]] + ; SI-NEXT: $vgpr0 = COPY [[FMUL]](s32) + ; + ; VI-LABEL: name: test_fdiv_s32_denorms_on_afn + ; VI: liveins: $vgpr0, $vgpr1 + ; VI-NEXT: {{ $}} + ; VI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; VI-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; VI-NEXT: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) + ; VI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = afn G_FMUL [[COPY]], [[INT]] + ; VI-NEXT: $vgpr0 = COPY [[FMUL]](s32) + ; + ; GFX9-LABEL: name: test_fdiv_s32_denorms_on_afn + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) + ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = afn G_FMUL [[COPY]], [[INT]] + ; GFX9-NEXT: $vgpr0 = COPY [[FMUL]](s32) + ; + ; GFX10-LABEL: name: test_fdiv_s32_denorms_on_afn + ; GFX10: liveins: $vgpr0, $vgpr1 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) + ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = afn G_FMUL [[COPY]], [[INT]] + ; GFX10-NEXT: $vgpr0 = COPY [[FMUL]](s32) + ; + ; GFX11-LABEL: name: test_fdiv_s32_denorms_on_afn + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX11-NEXT: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) + ; GFX11-NEXT: [[FMUL:%[0-9]+]]:_(s32) = afn G_FMUL [[COPY]], [[INT]] + ; GFX11-NEXT: $vgpr0 = COPY [[FMUL]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = afn G_FDIV %0, %1 + $vgpr0 = COPY %2 +... 
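Reading the GFX11 FMA chain in the precise test above as math, with a and b the div.scale results and r_0 = rcp(b), the expansion is one Newton-Raphson refinement of the reciprocal followed by a refined quotient (an editorial sketch of the standard iteration, not text from the patch):

$$e_0 = 1 - b\,r_0, \quad r_1 = r_0 + r_0 e_0, \quad q_0 = a\,r_1, \quad q_1 = q_0 + r_1(a - b\,q_0)$$

These correspond to FMA/FMA1, FMUL, and FMA2/FMA3 in the checks, and the residual a - b*q_1 (FMA4) is handed to div.fmas/div.fixup for final scaling and special-case handling. The afn variant above skips the iteration entirely and accepts the roughly 1 ulp v_rcp_f32 result.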
+ + --- name: test_fdiv_s32_denorms_off machineFunctionInfo: @@ -305,15 +502,6 @@ body: | ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) ; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32) ; - ; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_denorms_off - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) - ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[INT]] - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMUL]](s32) - ; ; GFX10-LABEL: name: test_fdiv_s32_denorms_off ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -335,12 +523,97 @@ body: | ; GFX10-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) ; GFX10-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) ; GFX10-NEXT: $vgpr0 = COPY [[INT6]](s32) + ; + ; GFX11-LABEL: name: test_fdiv_s32_denorms_off + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX11-NEXT: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0 + ; GFX11-NEXT: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1 + ; GFX11-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32) + ; GFX11-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[INT]] + ; GFX11-NEXT: S_DENORM_MODE 15, implicit-def $mode, implicit $mode + ; GFX11-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[INT4]], [[C]] + ; GFX11-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT4]], [[INT4]] + ; GFX11-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[INT2]], [[FMA1]] + ; GFX11-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[INT2]] + ; GFX11-NEXT: [[FMA3:%[0-9]+]]:_(s32) = G_FMA [[FMA2]], [[FMA1]], [[FMUL]] + ; GFX11-NEXT: [[FMA4:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA3]], [[INT2]] + ; GFX11-NEXT: S_DENORM_MODE 12, implicit-def $mode, implicit $mode + ; GFX11-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) + ; GFX11-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[INT6]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = G_FDIV %0, %1 $vgpr0 = COPY %2 ... 
+--- +name: test_fdiv_s32_denorms_off_afn +machineFunctionInfo: + mode: + fp32-input-denormals: false + fp32-output-denormals: false + fp64-fp16-input-denormals: true + fp64-fp16-output-denormals: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; SI-LABEL: name: test_fdiv_s32_denorms_off_afn + ; SI: liveins: $vgpr0, $vgpr1 + ; SI-NEXT: {{ $}} + ; SI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; SI-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; SI-NEXT: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) + ; SI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = afn G_FMUL [[COPY]], [[INT]] + ; SI-NEXT: $vgpr0 = COPY [[FMUL]](s32) + ; + ; VI-LABEL: name: test_fdiv_s32_denorms_off_afn + ; VI: liveins: $vgpr0, $vgpr1 + ; VI-NEXT: {{ $}} + ; VI-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; VI-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; VI-NEXT: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) + ; VI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = afn G_FMUL [[COPY]], [[INT]] + ; VI-NEXT: $vgpr0 = COPY [[FMUL]](s32) + ; + ; GFX9-LABEL: name: test_fdiv_s32_denorms_off_afn + ; GFX9: liveins: $vgpr0, $vgpr1 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) + ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = afn G_FMUL [[COPY]], [[INT]] + ; GFX9-NEXT: $vgpr0 = COPY [[FMUL]](s32) + ; + ; GFX10-LABEL: name: test_fdiv_s32_denorms_off_afn + ; GFX10: liveins: $vgpr0, $vgpr1 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) + ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = afn G_FMUL [[COPY]], [[INT]] + ; GFX10-NEXT: $vgpr0 = COPY [[FMUL]](s32) + ; + ; GFX11-LABEL: name: test_fdiv_s32_denorms_off_afn + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX11-NEXT: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) + ; GFX11-NEXT: [[FMUL:%[0-9]+]]:_(s32) = afn G_FMUL [[COPY]], [[INT]] + ; GFX11-NEXT: $vgpr0 = COPY [[FMUL]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = afn G_FDIV %0, %1 + $vgpr0 = COPY %2 +... 
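The afn tests also pin down the denormal-mode behavior: in test_fdiv_s32_denorms_off the precise GFX11 expansion brackets its FMA chain with S_DENORM_MODE 15 / S_DENORM_MODE 12, while the afn form emits the identical rcp + fmul sequence whether fp32 denormals are flushed or not. A minimal IR reproducer (hypothetical kernel; the attribute string matches the frem.ll tests above):

define amdgpu_kernel void @fast_div(ptr addrspace(1) %out, float %a, float %b) #0 {
  %q = fdiv afn float %a, %b   ; rcp + fmul, no S_DENORM_MODE toggle
  store float %q, ptr addrspace(1) %out
  ret void
}
attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }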
+ --- name: test_fdiv_s32_denorms_off_arcp machineFunctionInfo: @@ -420,15 +693,6 @@ body: | ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) ; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32) ; - ; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_denorms_off_arcp - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32) - ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]] - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMUL]](s32) - ; ; GFX10-LABEL: name: test_fdiv_s32_denorms_off_arcp ; GFX10: liveins: $vgpr0, $vgpr1 ; GFX10-NEXT: {{ $}} @@ -450,6 +714,28 @@ body: | ; GFX10-NEXT: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) ; GFX10-NEXT: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) ; GFX10-NEXT: $vgpr0 = COPY [[INT6]](s32) + ; + ; GFX11-LABEL: name: test_fdiv_s32_denorms_off_arcp + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX11-NEXT: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0 + ; GFX11-NEXT: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1 + ; GFX11-NEXT: [[INT4:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32) + ; GFX11-NEXT: [[FNEG:%[0-9]+]]:_(s32) = arcp G_FNEG [[INT]] + ; GFX11-NEXT: S_DENORM_MODE 15, implicit-def $mode, implicit $mode + ; GFX11-NEXT: [[FMA:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[INT4]], [[C]] + ; GFX11-NEXT: [[FMA1:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA]], [[INT4]], [[INT4]] + ; GFX11-NEXT: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[INT2]], [[FMA1]] + ; GFX11-NEXT: [[FMA2:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMUL]], [[INT2]] + ; GFX11-NEXT: [[FMA3:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA2]], [[FMA1]], [[FMUL]] + ; GFX11-NEXT: [[FMA4:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMA3]], [[INT2]] + ; GFX11-NEXT: S_DENORM_MODE 12, implicit-def $mode, implicit $mode + ; GFX11-NEXT: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) + ; GFX11-NEXT: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[INT6]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = arcp G_FDIV %0, %1 @@ -536,23 +822,6 @@ body: | ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY1]](s64), [[COPY]](s64) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64) ; - ; GFX9-UNSAFE-LABEL: name: test_fdiv_s64 - ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 - ; GFX9-UNSAFE-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[COPY1]] - ; 
GFX9-UNSAFE-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00 - ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s64) - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT]], [[C]] - ; GFX9-UNSAFE-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMA]], [[INT]], [[INT]] - ; GFX9-UNSAFE-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C]] - ; GFX9-UNSAFE-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA2]], [[FMA1]], [[FMA1]] - ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY]], [[FMA3]] - ; GFX9-UNSAFE-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[COPY]] - ; GFX9-UNSAFE-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]] - ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[FMA5]](s64) - ; ; GFX10-LABEL: name: test_fdiv_s64 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} @@ -572,6 +841,26 @@ body: | ; GFX10-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1) ; GFX10-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY1]](s64), [[COPY]](s64) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64) + ; + ; GFX11-LABEL: name: test_fdiv_s64 + ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 + ; GFX11-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00 + ; GFX11-NEXT: [[INT:%[0-9]+]]:_(s64), [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s64), [[COPY1]](s64), 0 + ; GFX11-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[INT]] + ; GFX11-NEXT: [[INT2:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s64) + ; GFX11-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT2]], [[C]] + ; GFX11-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[INT2]], [[FMA]], [[INT2]] + ; GFX11-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C]] + ; GFX11-NEXT: [[INT3:%[0-9]+]]:_(s64), [[INT4:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s64), [[COPY1]](s64), 1 + ; GFX11-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA1]], [[FMA2]], [[FMA1]] + ; GFX11-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INT3]], [[FMA3]] + ; GFX11-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[INT3]] + ; GFX11-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1) + ; GFX11-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY1]](s64), [[COPY]](s64) + ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 %2:_(s64) = G_FDIV %0, %1 @@ -708,20 +997,6 @@ body: | ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; - ; GFX9-UNSAFE-LABEL: name: test_fdiv_v2s32 - ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; GFX9-UNSAFE-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; 
GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[UV2]](s32) - ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[INT]] - ; GFX9-UNSAFE-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[UV3]](s32) - ; GFX9-UNSAFE-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[INT1]] - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) - ; ; GFX10-LABEL: name: test_fdiv_v2s32 ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} @@ -760,6 +1035,45 @@ body: | ; GFX10-NEXT: [[INT13:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32) ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; + ; GFX11-LABEL: name: test_fdiv_v2s32 + ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 + ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; GFX11-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX11-NEXT: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[UV]](s32), [[UV2]](s32), 0 + ; GFX11-NEXT: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[UV]](s32), [[UV2]](s32), 1 + ; GFX11-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32) + ; GFX11-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[INT]] + ; GFX11-NEXT: S_DENORM_MODE 15, implicit-def $mode, implicit $mode + ; GFX11-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[INT4]], [[C]] + ; GFX11-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT4]], [[INT4]] + ; GFX11-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[INT2]], [[FMA1]] + ; GFX11-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[INT2]] + ; GFX11-NEXT: [[FMA3:%[0-9]+]]:_(s32) = G_FMA [[FMA2]], [[FMA1]], [[FMUL]] + ; GFX11-NEXT: [[FMA4:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA3]], [[INT2]] + ; GFX11-NEXT: S_DENORM_MODE 12, implicit-def $mode, implicit $mode + ; GFX11-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) + ; GFX11-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[UV2]](s32), [[UV]](s32) + ; GFX11-NEXT: [[INT7:%[0-9]+]]:_(s32), [[INT8:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[UV1]](s32), [[UV3]](s32), 0 + ; GFX11-NEXT: [[INT9:%[0-9]+]]:_(s32), [[INT10:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[UV1]](s32), [[UV3]](s32), 1 + ; GFX11-NEXT: [[INT11:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT7]](s32) + ; GFX11-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[INT7]] + ; GFX11-NEXT: S_DENORM_MODE 15, implicit-def $mode, implicit $mode + ; GFX11-NEXT: [[FMA5:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[INT11]], [[C]] + ; GFX11-NEXT: [[FMA6:%[0-9]+]]:_(s32) = G_FMA [[FMA5]], [[INT11]], [[INT11]] + ; GFX11-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT9]], [[FMA6]] + ; GFX11-NEXT: [[FMA7:%[0-9]+]]:_(s32) = G_FMA 
[[FNEG1]], [[FMUL1]], [[INT9]] + ; GFX11-NEXT: [[FMA8:%[0-9]+]]:_(s32) = G_FMA [[FMA7]], [[FMA6]], [[FMUL1]] + ; GFX11-NEXT: [[FMA9:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMA8]], [[INT9]] + ; GFX11-NEXT: S_DENORM_MODE 12, implicit-def $mode, implicit $mode + ; GFX11-NEXT: [[INT12:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA9]](s32), [[FMA6]](s32), [[FMA8]](s32), [[INT10]](s1) + ; GFX11-NEXT: [[INT13:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32) + ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32) + ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 %2:_(<2 x s32>) = G_FDIV %0, %1 @@ -877,20 +1191,6 @@ body: | ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; - ; GFX9-UNSAFE-LABEL: name: test_fdiv_v2s32_flags - ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) - ; GFX9-UNSAFE-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) - ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[UV2]](s32) - ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s32) = nnan G_FMUL [[UV]], [[INT]] - ; GFX9-UNSAFE-NEXT: [[INT1:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[UV3]](s32) - ; GFX9-UNSAFE-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = nnan G_FMUL [[UV1]], [[INT1]] - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) - ; ; GFX10-LABEL: name: test_fdiv_v2s32_flags ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX10-NEXT: {{ $}} @@ -925,6 +1225,41 @@ body: | ; GFX10-NEXT: [[INT13:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32) ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; + ; GFX11-LABEL: name: test_fdiv_v2s32_flags + ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr2_vgpr3 + ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>) + ; GFX11-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX11-NEXT: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[UV]](s32), [[UV2]](s32), 0 + ; GFX11-NEXT: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[UV]](s32), [[UV2]](s32), 1 + ; GFX11-NEXT: [[INT4:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32) + ; GFX11-NEXT: [[FNEG:%[0-9]+]]:_(s32) = nnan G_FNEG [[INT]] + ; GFX11-NEXT: [[FMA:%[0-9]+]]:_(s32) = nnan G_FMA [[FNEG]], [[INT4]], [[C]] + ; GFX11-NEXT: 
[[FMA1:%[0-9]+]]:_(s32) = nnan G_FMA [[FMA]], [[INT4]], [[INT4]] + ; GFX11-NEXT: [[FMUL:%[0-9]+]]:_(s32) = nnan G_FMUL [[INT2]], [[FMA1]] + ; GFX11-NEXT: [[FMA2:%[0-9]+]]:_(s32) = nnan G_FMA [[FNEG]], [[FMUL]], [[INT2]] + ; GFX11-NEXT: [[FMA3:%[0-9]+]]:_(s32) = nnan G_FMA [[FMA2]], [[FMA1]], [[FMUL]] + ; GFX11-NEXT: [[FMA4:%[0-9]+]]:_(s32) = nnan G_FMA [[FNEG]], [[FMA3]], [[INT2]] + ; GFX11-NEXT: [[INT5:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) + ; GFX11-NEXT: [[INT6:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[UV2]](s32), [[UV]](s32) + ; GFX11-NEXT: [[INT7:%[0-9]+]]:_(s32), [[INT8:%[0-9]+]]:_(s1) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[UV1]](s32), [[UV3]](s32), 0 + ; GFX11-NEXT: [[INT9:%[0-9]+]]:_(s32), [[INT10:%[0-9]+]]:_(s1) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[UV1]](s32), [[UV3]](s32), 1 + ; GFX11-NEXT: [[INT11:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT7]](s32) + ; GFX11-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = nnan G_FNEG [[INT7]] + ; GFX11-NEXT: [[FMA5:%[0-9]+]]:_(s32) = nnan G_FMA [[FNEG1]], [[INT11]], [[C]] + ; GFX11-NEXT: [[FMA6:%[0-9]+]]:_(s32) = nnan G_FMA [[FMA5]], [[INT11]], [[INT11]] + ; GFX11-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = nnan G_FMUL [[INT9]], [[FMA6]] + ; GFX11-NEXT: [[FMA7:%[0-9]+]]:_(s32) = nnan G_FMA [[FNEG1]], [[FMUL1]], [[INT9]] + ; GFX11-NEXT: [[FMA8:%[0-9]+]]:_(s32) = nnan G_FMA [[FMA7]], [[FMA6]], [[FMUL1]] + ; GFX11-NEXT: [[FMA9:%[0-9]+]]:_(s32) = nnan G_FMA [[FNEG1]], [[FMA8]], [[INT9]] + ; GFX11-NEXT: [[INT12:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA9]](s32), [[FMA6]](s32), [[FMA8]](s32), [[INT10]](s1) + ; GFX11-NEXT: [[INT13:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32) + ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32) + ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 %2:_(<2 x s32>) = nnan G_FDIV %0, %1 @@ -1078,22 +1413,6 @@ body: | ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32), [[INT20]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; - ; GFX9-UNSAFE-LABEL: name: test_fdiv_v3s32 - ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) - ; GFX9-UNSAFE-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>) - ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[UV3]](s32) - ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[UV]], [[INT]] - ; GFX9-UNSAFE-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[UV4]](s32) - ; GFX9-UNSAFE-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[INT1]] - ; GFX9-UNSAFE-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[UV5]](s32) - ; GFX9-UNSAFE-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[INT2]] - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = 
G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32), [[FMUL2]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) - ; ; GFX10-LABEL: name: test_fdiv_v3s32 ; GFX10: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 ; GFX10-NEXT: {{ $}} @@ -1140,6 +1459,53 @@ body: | ; GFX10-NEXT: [[INT20:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT19]](s32), [[UV5]](s32), [[UV2]](s32) ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32), [[INT20]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) + ; + ; GFX11-LABEL: name: test_fdiv_v3s32 + ; GFX11: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 + ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<3 x s32>) + ; GFX11-NEXT: [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<3 x s32>) + ; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX11-NEXT: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[UV]](s32), [[UV3]](s32), 0 + ; GFX11-NEXT: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[UV]](s32), [[UV3]](s32), 1 + ; GFX11-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32) + ; GFX11-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[INT]] + ; GFX11-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[INT4]], [[C]] + ; GFX11-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT4]], [[INT4]] + ; GFX11-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[INT2]], [[FMA1]] + ; GFX11-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[INT2]] + ; GFX11-NEXT: [[FMA3:%[0-9]+]]:_(s32) = G_FMA [[FMA2]], [[FMA1]], [[FMUL]] + ; GFX11-NEXT: [[FMA4:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA3]], [[INT2]] + ; GFX11-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) + ; GFX11-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[UV3]](s32), [[UV]](s32) + ; GFX11-NEXT: [[INT7:%[0-9]+]]:_(s32), [[INT8:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[UV1]](s32), [[UV4]](s32), 0 + ; GFX11-NEXT: [[INT9:%[0-9]+]]:_(s32), [[INT10:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[UV1]](s32), [[UV4]](s32), 1 + ; GFX11-NEXT: [[INT11:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT7]](s32) + ; GFX11-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[INT7]] + ; GFX11-NEXT: [[FMA5:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[INT11]], [[C]] + ; GFX11-NEXT: [[FMA6:%[0-9]+]]:_(s32) = G_FMA [[FMA5]], [[INT11]], [[INT11]] + ; GFX11-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[INT9]], [[FMA6]] + ; GFX11-NEXT: [[FMA7:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMUL1]], [[INT9]] + ; GFX11-NEXT: [[FMA8:%[0-9]+]]:_(s32) = G_FMA [[FMA7]], [[FMA6]], [[FMUL1]] + ; GFX11-NEXT: [[FMA9:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMA8]], [[INT9]] + ; GFX11-NEXT: [[INT12:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA9]](s32), [[FMA6]](s32), [[FMA8]](s32), [[INT10]](s1) + ; GFX11-NEXT: [[INT13:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV4]](s32), [[UV1]](s32) + ; GFX11-NEXT: 
[[INT14:%[0-9]+]]:_(s32), [[INT15:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[UV2]](s32), [[UV5]](s32), 0 + ; GFX11-NEXT: [[INT16:%[0-9]+]]:_(s32), [[INT17:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[UV2]](s32), [[UV5]](s32), 1 + ; GFX11-NEXT: [[INT18:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT14]](s32) + ; GFX11-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[INT14]] + ; GFX11-NEXT: [[FMA10:%[0-9]+]]:_(s32) = G_FMA [[FNEG2]], [[INT18]], [[C]] + ; GFX11-NEXT: [[FMA11:%[0-9]+]]:_(s32) = G_FMA [[FMA10]], [[INT18]], [[INT18]] + ; GFX11-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[INT16]], [[FMA11]] + ; GFX11-NEXT: [[FMA12:%[0-9]+]]:_(s32) = G_FMA [[FNEG2]], [[FMUL2]], [[INT16]] + ; GFX11-NEXT: [[FMA13:%[0-9]+]]:_(s32) = G_FMA [[FMA12]], [[FMA11]], [[FMUL2]] + ; GFX11-NEXT: [[FMA14:%[0-9]+]]:_(s32) = G_FMA [[FNEG2]], [[FMA13]], [[INT16]] + ; GFX11-NEXT: [[INT19:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA14]](s32), [[FMA11]](s32), [[FMA13]](s32), [[INT17]](s1) + ; GFX11-NEXT: [[INT20:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT19]](s32), [[UV5]](s32), [[UV2]](s32) + ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32), [[INT20]](s32) + ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 %1:_(<3 x s32>) = COPY $vgpr3_vgpr4_vgpr5 %2:_(<3 x s32>) = G_FDIV %0, %1 @@ -1271,35 +1637,6 @@ body: | ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[INT6]](s64), [[INT13]](s64) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; - ; GFX9-UNSAFE-LABEL: name: test_fdiv_v2s64 - ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) - ; GFX9-UNSAFE-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>) - ; GFX9-UNSAFE-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[UV2]] - ; GFX9-UNSAFE-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00 - ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[UV2]](s64) - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT]], [[C]] - ; GFX9-UNSAFE-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMA]], [[INT]], [[INT]] - ; GFX9-UNSAFE-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C]] - ; GFX9-UNSAFE-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA2]], [[FMA1]], [[FMA1]] - ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[UV]], [[FMA3]] - ; GFX9-UNSAFE-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[UV]] - ; GFX9-UNSAFE-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]] - ; GFX9-UNSAFE-NEXT: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[UV3]] - ; GFX9-UNSAFE-NEXT: [[INT1:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[UV3]](s64) - ; GFX9-UNSAFE-NEXT: [[FMA6:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[INT1]], [[C]] - ; GFX9-UNSAFE-NEXT: [[FMA7:%[0-9]+]]:_(s64) = G_FMA [[FMA6]], [[INT1]], [[INT1]] - ; GFX9-UNSAFE-NEXT: [[FMA8:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMA7]], [[C]] - ; GFX9-UNSAFE-NEXT: [[FMA9:%[0-9]+]]:_(s64) = G_FMA [[FMA8]], [[FMA7]], [[FMA7]] - ; GFX9-UNSAFE-NEXT: 
[[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[UV1]], [[FMA9]] - ; GFX9-UNSAFE-NEXT: [[FMA10:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMUL1]], [[UV1]] - ; GFX9-UNSAFE-NEXT: [[FMA11:%[0-9]+]]:_(s64) = G_FMA [[FMA10]], [[FMA9]], [[FMUL1]] - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FMA5]](s64), [[FMA11]](s64) - ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) - ; ; GFX10-LABEL: name: test_fdiv_v2s64 ; GFX10: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX10-NEXT: {{ $}} @@ -1334,6 +1671,41 @@ body: | ; GFX10-NEXT: [[INT13:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s64), [[UV3]](s64), [[UV1]](s64) ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[INT6]](s64), [[INT13]](s64) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; + ; GFX11-LABEL: name: test_fdiv_v2s64 + ; GFX11: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>) + ; GFX11-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY1]](<2 x s64>) + ; GFX11-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00 + ; GFX11-NEXT: [[INT:%[0-9]+]]:_(s64), [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[UV]](s64), [[UV2]](s64), 0 + ; GFX11-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[INT]] + ; GFX11-NEXT: [[INT2:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s64) + ; GFX11-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT2]], [[C]] + ; GFX11-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[INT2]], [[FMA]], [[INT2]] + ; GFX11-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C]] + ; GFX11-NEXT: [[INT3:%[0-9]+]]:_(s64), [[INT4:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[UV]](s64), [[UV2]](s64), 1 + ; GFX11-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA1]], [[FMA2]], [[FMA1]] + ; GFX11-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INT3]], [[FMA3]] + ; GFX11-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[INT3]] + ; GFX11-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1) + ; GFX11-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[UV2]](s64), [[UV]](s64) + ; GFX11-NEXT: [[INT7:%[0-9]+]]:_(s64), [[INT8:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[UV1]](s64), [[UV3]](s64), 0 + ; GFX11-NEXT: [[FNEG1:%[0-9]+]]:_(s64) = G_FNEG [[INT7]] + ; GFX11-NEXT: [[INT9:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT7]](s64) + ; GFX11-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[INT9]], [[C]] + ; GFX11-NEXT: [[FMA6:%[0-9]+]]:_(s64) = G_FMA [[INT9]], [[FMA5]], [[INT9]] + ; GFX11-NEXT: [[FMA7:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMA6]], [[C]] + ; GFX11-NEXT: [[INT10:%[0-9]+]]:_(s64), [[INT11:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[UV1]](s64), [[UV3]](s64), 1 + ; GFX11-NEXT: [[FMA8:%[0-9]+]]:_(s64) = G_FMA [[FMA6]], [[FMA7]], [[FMA6]] + ; GFX11-NEXT: [[FMUL1:%[0-9]+]]:_(s64) = G_FMUL [[INT10]], [[FMA8]] + ; GFX11-NEXT: [[FMA9:%[0-9]+]]:_(s64) = G_FMA [[FNEG1]], [[FMUL1]], [[INT10]] + ; GFX11-NEXT: [[INT12:%[0-9]+]]:_(s64) = 
G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA9]](s64), [[FMA8]](s64), [[FMUL1]](s64), [[INT11]](s1) + ; GFX11-NEXT: [[INT13:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s64), [[UV3]](s64), [[UV1]](s64) + ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[INT6]](s64), [[INT13]](s64) + ; GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(<2 x s64>) = COPY $vgpr4_vgpr5_vgpr6_vgpr7 %2:_(<2 x s64>) = G_FDIV %0, %1 @@ -1502,26 +1874,92 @@ body: | ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; - ; GFX9-UNSAFE-LABEL: name: test_fdiv_v2s16 - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 - ; GFX9-UNSAFE-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9-UNSAFE-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-UNSAFE-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-UNSAFE-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX9-UNSAFE-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9-UNSAFE-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC2]](s16) - ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[INT]] - ; GFX9-UNSAFE-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC3]](s16) - ; GFX9-UNSAFE-NEXT: [[FMUL1:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC1]], [[INT1]] - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMUL]](s16), [[FMUL1]](s16) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX10-LABEL: name: test_fdiv_v2s16 + ; GFX10: liveins: $vgpr0, $vgpr1 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) + ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) + ; GFX10-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]] + ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) + ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] + ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]] + ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]] + ; GFX10-NEXT: 
[[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]] + ; GFX10-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]] + ; GFX10-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]] + ; GFX10-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]] + ; GFX10-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]] + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608 + ; GFX10-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C1]] + ; GFX10-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]] + ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32) + ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC2]](s16), [[TRUNC]](s16) + ; GFX10-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) + ; GFX10-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) + ; GFX10-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]] + ; GFX10-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32) + ; GFX10-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] + ; GFX10-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FMUL5]] + ; GFX10-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FPEXT2]] + ; GFX10-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FADD4]], [[INT2]] + ; GFX10-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]] + ; GFX10-NEXT: [[FMUL8:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FADD5]] + ; GFX10-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FMUL8]], [[FPEXT2]] + ; GFX10-NEXT: [[FMUL9:%[0-9]+]]:_(s32) = G_FMUL [[FADD6]], [[INT2]] + ; GFX10-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL9]], [[C1]] + ; GFX10-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FADD5]] + ; GFX10-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32) + ; GFX10-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC3]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16) + ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; + ; GFX11-LABEL: name: test_fdiv_v2s16 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](<2 x s16>) + ; GFX11-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](<2 x s16>) + ; GFX11-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[UV]](s16) + ; GFX11-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[UV2]](s16) + ; GFX11-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]] + ; GFX11-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) + ; GFX11-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] + ; GFX11-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[FPEXT]] + ; GFX11-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT]], [[FMUL]] + ; GFX11-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA1]], [[FPEXT]] + ; GFX11-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMA2]], [[INT]] + ; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608 + ; GFX11-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL1]], [[C]] + ; GFX11-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FMA1]] + ; GFX11-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32) + ; GFX11-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), 
[[FPTRUNC]](s16), [[UV2]](s16), [[UV]](s16) + ; GFX11-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[UV1]](s16) + ; GFX11-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[UV3]](s16) + ; GFX11-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]] + ; GFX11-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32) + ; GFX11-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] + ; GFX11-NEXT: [[FMA3:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMUL2]], [[FPEXT2]] + ; GFX11-NEXT: [[FMA4:%[0-9]+]]:_(s32) = G_FMA [[FMA3]], [[INT2]], [[FMUL2]] + ; GFX11-NEXT: [[FMA5:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMA4]], [[FPEXT2]] + ; GFX11-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMA5]], [[INT2]] + ; GFX11-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL3]], [[C]] + ; GFX11-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FMA4]] + ; GFX11-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD1]](s32) + ; GFX11-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[UV3]](s16), [[UV1]](s16) + ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16) + ; GFX11-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 %2:_(<2 x s16>) = G_FDIV %0, %1 @@ -1756,37 +2194,133 @@ body: | ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32) ; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) ; - ; GFX9-UNSAFE-LABEL: name: test_fdiv_v3s16 - ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX9-UNSAFE-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9-UNSAFE-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-UNSAFE-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-UNSAFE-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX9-UNSAFE-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF - ; GFX9-UNSAFE-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) - ; GFX9-UNSAFE-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) - ; GFX9-UNSAFE-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX9-UNSAFE-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9-UNSAFE-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) - ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC3]](s16) - ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[INT]] - ; GFX9-UNSAFE-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC4]](s16) - ; GFX9-UNSAFE-NEXT: [[FMUL1:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC1]], [[INT1]] - ; GFX9-UNSAFE-NEXT: [[INT2:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC5]](s16) - ; GFX9-UNSAFE-NEXT: [[FMUL2:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC2]], [[INT2]] - ; 
GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL]](s16) - ; GFX9-UNSAFE-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL1]](s16) - ; GFX9-UNSAFE-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL2]](s16) - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32) - ; GFX9-UNSAFE-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; GFX10-LABEL: name: test_fdiv_v3s16 + ; GFX10: liveins: $vgpr0, $vgpr1 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) + ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX10-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) + ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX10-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) + ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) + ; GFX10-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]] + ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) + ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] + ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]] + ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]] + ; GFX10-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]] + ; GFX10-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]] + ; GFX10-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]] + ; GFX10-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]] + ; GFX10-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]] + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608 + ; GFX10-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C1]] + ; GFX10-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]] + ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32) + ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC3]](s16), [[TRUNC]](s16) + ; GFX10-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) + ; GFX10-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16) + ; GFX10-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]] + ; GFX10-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32) + ; GFX10-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] + ; GFX10-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FMUL5]] + ; GFX10-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FPEXT2]] + ; 
GFX10-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FADD4]], [[INT2]] + ; GFX10-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]] + ; GFX10-NEXT: [[FMUL8:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FADD5]] + ; GFX10-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FMUL8]], [[FPEXT2]] + ; GFX10-NEXT: [[FMUL9:%[0-9]+]]:_(s32) = G_FMUL [[FADD6]], [[INT2]] + ; GFX10-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL9]], [[C1]] + ; GFX10-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FADD5]] + ; GFX10-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32) + ; GFX10-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC4]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) + ; GFX10-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16) + ; GFX10-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]] + ; GFX10-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32) + ; GFX10-NEXT: [[FMUL10:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]] + ; GFX10-NEXT: [[FMUL11:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FMUL10]] + ; GFX10-NEXT: [[FADD8:%[0-9]+]]:_(s32) = G_FADD [[FMUL11]], [[FPEXT4]] + ; GFX10-NEXT: [[FMUL12:%[0-9]+]]:_(s32) = G_FMUL [[FADD8]], [[INT4]] + ; GFX10-NEXT: [[FADD9:%[0-9]+]]:_(s32) = G_FADD [[FMUL12]], [[FMUL10]] + ; GFX10-NEXT: [[FMUL13:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FADD9]] + ; GFX10-NEXT: [[FADD10:%[0-9]+]]:_(s32) = G_FADD [[FMUL13]], [[FPEXT4]] + ; GFX10-NEXT: [[FMUL14:%[0-9]+]]:_(s32) = G_FMUL [[FADD10]], [[INT4]] + ; GFX10-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[FMUL14]], [[C1]] + ; GFX10-NEXT: [[FADD11:%[0-9]+]]:_(s32) = G_FADD [[AND2]], [[FADD9]] + ; GFX10-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD11]](s32) + ; GFX10-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC5]](s16), [[TRUNC2]](s16) + ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) + ; GFX10-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16) + ; GFX10-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT5]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32) + ; GFX10-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) + ; + ; GFX11-LABEL: name: test_fdiv_v3s16 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) + ; GFX11-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF + ; GFX11-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>) + ; GFX11-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[UV]](s16) + ; GFX11-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[UV4]](s16) + ; GFX11-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]] + ; GFX11-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) + ; GFX11-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] + ; GFX11-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[FPEXT]] + ; GFX11-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT]], [[FMUL]] + ; GFX11-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA1]], [[FPEXT]] + ; GFX11-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMA2]], [[INT]] + ; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608 + 
; GFX11-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL1]], [[C]] + ; GFX11-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FMA1]] + ; GFX11-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32) + ; GFX11-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[UV4]](s16), [[UV]](s16) + ; GFX11-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[UV1]](s16) + ; GFX11-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[UV5]](s16) + ; GFX11-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]] + ; GFX11-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32) + ; GFX11-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] + ; GFX11-NEXT: [[FMA3:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMUL2]], [[FPEXT2]] + ; GFX11-NEXT: [[FMA4:%[0-9]+]]:_(s32) = G_FMA [[FMA3]], [[INT2]], [[FMUL2]] + ; GFX11-NEXT: [[FMA5:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMA4]], [[FPEXT2]] + ; GFX11-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMA5]], [[INT2]] + ; GFX11-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL3]], [[C]] + ; GFX11-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FMA4]] + ; GFX11-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD1]](s32) + ; GFX11-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[UV5]](s16), [[UV1]](s16) + ; GFX11-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[UV2]](s16) + ; GFX11-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[UV6]](s16) + ; GFX11-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]] + ; GFX11-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32) + ; GFX11-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]] + ; GFX11-NEXT: [[FMA6:%[0-9]+]]:_(s32) = G_FMA [[FNEG2]], [[FMUL4]], [[FPEXT4]] + ; GFX11-NEXT: [[FMA7:%[0-9]+]]:_(s32) = G_FMA [[FMA6]], [[INT4]], [[FMUL4]] + ; GFX11-NEXT: [[FMA8:%[0-9]+]]:_(s32) = G_FMA [[FNEG2]], [[FMA7]], [[FPEXT4]] + ; GFX11-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FMA8]], [[INT4]] + ; GFX11-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[FMUL5]], [[C]] + ; GFX11-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[AND2]], [[FMA7]] + ; GFX11-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD2]](s32) + ; GFX11-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[UV6]](s16), [[UV2]](s16) + ; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16) + ; GFX11-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16) + ; GFX11-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT5]](s16) + ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32) + ; GFX11-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>) %0:_(<3 x s16>) = G_IMPLICIT_DEF %1:_(<3 x s16>) = G_IMPLICIT_DEF %2:_(<3 x s16>) = G_FDIV %0, %1 @@ -2094,42 +2628,164 @@ body: | ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; - ; GFX9-UNSAFE-LABEL: name: test_fdiv_v4s16 - ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX9-UNSAFE-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 - ; GFX9-UNSAFE-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) - ; GFX9-UNSAFE-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: 
[[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9-UNSAFE-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-UNSAFE-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; GFX9-UNSAFE-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-UNSAFE-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX9-UNSAFE-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) - ; GFX9-UNSAFE-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9-UNSAFE-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) - ; GFX9-UNSAFE-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) - ; GFX9-UNSAFE-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) - ; GFX9-UNSAFE-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) - ; GFX9-UNSAFE-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) - ; GFX9-UNSAFE-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) - ; GFX9-UNSAFE-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) - ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC4]](s16) - ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[INT]] - ; GFX9-UNSAFE-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC5]](s16) - ; GFX9-UNSAFE-NEXT: [[FMUL1:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC1]], [[INT1]] - ; GFX9-UNSAFE-NEXT: [[INT2:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC6]](s16) - ; GFX9-UNSAFE-NEXT: [[FMUL2:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC2]], [[INT2]] - ; GFX9-UNSAFE-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC7]](s16) - ; GFX9-UNSAFE-NEXT: [[FMUL3:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC3]], [[INT3]] - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMUL]](s16), [[FMUL1]](s16) - ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMUL2]](s16), [[FMUL3]](s16) - ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX10-LABEL: name: test_fdiv_v4s16 + ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 + ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32) + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES 
[[COPY1]](<4 x s16>) + ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32) + ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32) + ; GFX10-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32) + ; GFX10-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32) + ; GFX10-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32) + ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16) + ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16) + ; GFX10-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]] + ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) + ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] + ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]] + ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]] + ; GFX10-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]] + ; GFX10-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]] + ; GFX10-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]] + ; GFX10-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]] + ; GFX10-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]] + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608 + ; GFX10-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C1]] + ; GFX10-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]] + ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32) + ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC4]](s16), [[TRUNC]](s16) + ; GFX10-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16) + ; GFX10-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16) + ; GFX10-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]] + ; GFX10-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32) + ; GFX10-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] + ; GFX10-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FMUL5]] + ; GFX10-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FPEXT2]] + ; GFX10-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FADD4]], [[INT2]] + ; GFX10-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]] + ; GFX10-NEXT: [[FMUL8:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FADD5]] + ; GFX10-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FMUL8]], [[FPEXT2]] + ; GFX10-NEXT: [[FMUL9:%[0-9]+]]:_(s32) = G_FMUL [[FADD6]], [[INT2]] + ; GFX10-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL9]], [[C1]] + ; GFX10-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FADD5]] + ; GFX10-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32) + ; GFX10-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC5]](s16), [[TRUNC1]](s16) + ; GFX10-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16) + ; GFX10-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC6]](s16) + ; GFX10-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]] + ; GFX10-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32) + ; GFX10-NEXT: [[FMUL10:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]] + ; GFX10-NEXT: [[FMUL11:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FMUL10]] + ; GFX10-NEXT: [[FADD8:%[0-9]+]]:_(s32) = G_FADD 
[[FMUL11]], [[FPEXT4]] + ; GFX10-NEXT: [[FMUL12:%[0-9]+]]:_(s32) = G_FMUL [[FADD8]], [[INT4]] + ; GFX10-NEXT: [[FADD9:%[0-9]+]]:_(s32) = G_FADD [[FMUL12]], [[FMUL10]] + ; GFX10-NEXT: [[FMUL13:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FADD9]] + ; GFX10-NEXT: [[FADD10:%[0-9]+]]:_(s32) = G_FADD [[FMUL13]], [[FPEXT4]] + ; GFX10-NEXT: [[FMUL14:%[0-9]+]]:_(s32) = G_FMUL [[FADD10]], [[INT4]] + ; GFX10-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[FMUL14]], [[C1]] + ; GFX10-NEXT: [[FADD11:%[0-9]+]]:_(s32) = G_FADD [[AND2]], [[FADD9]] + ; GFX10-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD11]](s32) + ; GFX10-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC6]](s16), [[TRUNC2]](s16) + ; GFX10-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16) + ; GFX10-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC7]](s16) + ; GFX10-NEXT: [[FNEG3:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT7]] + ; GFX10-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT7]](s32) + ; GFX10-NEXT: [[FMUL15:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]] + ; GFX10-NEXT: [[FMUL16:%[0-9]+]]:_(s32) = G_FMUL [[FNEG3]], [[FMUL15]] + ; GFX10-NEXT: [[FADD12:%[0-9]+]]:_(s32) = G_FADD [[FMUL16]], [[FPEXT6]] + ; GFX10-NEXT: [[FMUL17:%[0-9]+]]:_(s32) = G_FMUL [[FADD12]], [[INT6]] + ; GFX10-NEXT: [[FADD13:%[0-9]+]]:_(s32) = G_FADD [[FMUL17]], [[FMUL15]] + ; GFX10-NEXT: [[FMUL18:%[0-9]+]]:_(s32) = G_FMUL [[FNEG3]], [[FADD13]] + ; GFX10-NEXT: [[FADD14:%[0-9]+]]:_(s32) = G_FADD [[FMUL18]], [[FPEXT6]] + ; GFX10-NEXT: [[FMUL19:%[0-9]+]]:_(s32) = G_FMUL [[FADD14]], [[INT6]] + ; GFX10-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[FMUL19]], [[C1]] + ; GFX10-NEXT: [[FADD15:%[0-9]+]]:_(s32) = G_FADD [[AND3]], [[FADD13]] + ; GFX10-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD15]](s32) + ; GFX10-NEXT: [[INT7:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC3]](s16), [[TRUNC7]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT5]](s16), [[INT7]](s16) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; + ; GFX11-LABEL: name: test_fdiv_v4s16 + ; GFX11: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3 + ; GFX11-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](<4 x s16>) + ; GFX11-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>) + ; GFX11-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[UV]](s16) + ; GFX11-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[UV4]](s16) + ; GFX11-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]] + ; GFX11-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32) + ; GFX11-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]] + ; GFX11-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[FPEXT]] + ; GFX11-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT]], [[FMUL]] + ; GFX11-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA1]], [[FPEXT]] + ; GFX11-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL 
[[FMA2]], [[INT]] + ; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608 + ; GFX11-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL1]], [[C]] + ; GFX11-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FMA1]] + ; GFX11-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32) + ; GFX11-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[UV4]](s16), [[UV]](s16) + ; GFX11-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[UV1]](s16) + ; GFX11-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[UV5]](s16) + ; GFX11-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]] + ; GFX11-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32) + ; GFX11-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]] + ; GFX11-NEXT: [[FMA3:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMUL2]], [[FPEXT2]] + ; GFX11-NEXT: [[FMA4:%[0-9]+]]:_(s32) = G_FMA [[FMA3]], [[INT2]], [[FMUL2]] + ; GFX11-NEXT: [[FMA5:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMA4]], [[FPEXT2]] + ; GFX11-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMA5]], [[INT2]] + ; GFX11-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL3]], [[C]] + ; GFX11-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FMA4]] + ; GFX11-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD1]](s32) + ; GFX11-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[UV5]](s16), [[UV1]](s16) + ; GFX11-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[UV2]](s16) + ; GFX11-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[UV6]](s16) + ; GFX11-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]] + ; GFX11-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32) + ; GFX11-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]] + ; GFX11-NEXT: [[FMA6:%[0-9]+]]:_(s32) = G_FMA [[FNEG2]], [[FMUL4]], [[FPEXT4]] + ; GFX11-NEXT: [[FMA7:%[0-9]+]]:_(s32) = G_FMA [[FMA6]], [[INT4]], [[FMUL4]] + ; GFX11-NEXT: [[FMA8:%[0-9]+]]:_(s32) = G_FMA [[FNEG2]], [[FMA7]], [[FPEXT4]] + ; GFX11-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FMA8]], [[INT4]] + ; GFX11-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[FMUL5]], [[C]] + ; GFX11-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[AND2]], [[FMA7]] + ; GFX11-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD2]](s32) + ; GFX11-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[UV6]](s16), [[UV2]](s16) + ; GFX11-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[UV3]](s16) + ; GFX11-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[UV7]](s16) + ; GFX11-NEXT: [[FNEG3:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT7]] + ; GFX11-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT7]](s32) + ; GFX11-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]] + ; GFX11-NEXT: [[FMA9:%[0-9]+]]:_(s32) = G_FMA [[FNEG3]], [[FMUL6]], [[FPEXT6]] + ; GFX11-NEXT: [[FMA10:%[0-9]+]]:_(s32) = G_FMA [[FMA9]], [[INT6]], [[FMUL6]] + ; GFX11-NEXT: [[FMA11:%[0-9]+]]:_(s32) = G_FMA [[FNEG3]], [[FMA10]], [[FPEXT6]] + ; GFX11-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FMA11]], [[INT6]] + ; GFX11-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[FMUL7]], [[C]] + ; GFX11-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND3]], [[FMA10]] + ; GFX11-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32) + ; GFX11-NEXT: [[INT7:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC3]](s16), [[UV7]](s16), [[UV3]](s16) + ; GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16) + ; 
GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT5]](s16), [[INT7]](s16) + ; GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 %2:_(<4 x s16>) = G_FDIV %0, %1 @@ -2185,15 +2841,6 @@ body: | ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; - ; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_one_rcp - ; GFX9-UNSAFE: liveins: $vgpr0 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16) - ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; ; GFX10-LABEL: name: test_fdiv_s16_constant_one_rcp ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -2202,6 +2849,15 @@ body: | ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16) ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; + ; GFX11-LABEL: name: test_fdiv_s16_constant_one_rcp + ; GFX11: liveins: $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX11-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16) + ; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) + ; GFX11-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s16) = G_FCONSTANT half 1.0 %1:_(s32) = COPY $vgpr0 %2:_(s16) = G_TRUNC %1 @@ -2261,16 +2917,6 @@ body: | ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) ; - ; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_negative_one_rcp - ; GFX9-UNSAFE: liveins: $vgpr0 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) - ; GFX9-UNSAFE-NEXT: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC]] - ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16) - ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) - ; ; GFX10-LABEL: name: test_fdiv_s16_constant_negative_one_rcp ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -2280,6 +2926,16 @@ body: | ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16) ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) + ; + ; GFX11-LABEL: name: test_fdiv_s16_constant_negative_one_rcp + ; GFX11: liveins: $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; GFX11-NEXT: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC]] + ; GFX11-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16) + ; GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16) + ; GFX11-NEXT: $vgpr0 = COPY [[ANYEXT]](s32) %0:_(s16) = G_FCONSTANT half -1.0 %1:_(s32) = COPY $vgpr0 %2:_(s16) = G_TRUNC %1 @@ -2351,13 +3007,6 @@ body: | ; GFX9-NEXT: 
[[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32) ; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32) ; - ; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_constant_one_rcp - ; GFX9-UNSAFE: liveins: $vgpr0 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[INT]](s32) - ; ; GFX10-LABEL: name: test_fdiv_s32_constant_one_rcp ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -2376,6 +3025,25 @@ body: | ; GFX10-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) ; GFX10-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32) ; GFX10-NEXT: $vgpr0 = COPY [[INT6]](s32) + ; + ; GFX11-LABEL: name: test_fdiv_s32_constant_one_rcp + ; GFX11: liveins: $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[C]](s32), [[COPY]](s32), 0 + ; GFX11-NEXT: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[C]](s32), [[COPY]](s32), 1 + ; GFX11-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32) + ; GFX11-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[INT]] + ; GFX11-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[INT4]], [[C]] + ; GFX11-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT4]], [[INT4]] + ; GFX11-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[INT2]], [[FMA1]] + ; GFX11-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[INT2]] + ; GFX11-NEXT: [[FMA3:%[0-9]+]]:_(s32) = G_FMA [[FMA2]], [[FMA1]], [[FMUL]] + ; GFX11-NEXT: [[FMA4:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA3]], [[INT2]] + ; GFX11-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) + ; GFX11-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[INT6]](s32) %0:_(s32) = G_FCONSTANT float 1.0 %1:_(s32) = COPY $vgpr0 %2:_(s32) = G_FDIV %0, %1 @@ -2448,14 +3116,6 @@ body: | ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32) ; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32) ; - ; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_constant_negative_one_rcp - ; GFX9-UNSAFE: liveins: $vgpr0 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; GFX9-UNSAFE-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]] - ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s32) - ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[INT]](s32) - ; ; GFX10-LABEL: name: test_fdiv_s32_constant_negative_one_rcp ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} @@ -2475,6 +3135,26 @@ body: | ; GFX10-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) ; GFX10-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32) ; GFX10-NEXT: $vgpr0 = COPY [[INT6]](s32) + ; + ; GFX11-LABEL: name: 
test_fdiv_s32_constant_negative_one_rcp + ; GFX11: liveins: $vgpr0 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float -1.000000e+00 + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 + ; GFX11-NEXT: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[C]](s32), [[COPY]](s32), 0 + ; GFX11-NEXT: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[C]](s32), [[COPY]](s32), 1 + ; GFX11-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32) + ; GFX11-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[INT]] + ; GFX11-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[INT4]], [[C1]] + ; GFX11-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT4]], [[INT4]] + ; GFX11-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[INT2]], [[FMA1]] + ; GFX11-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[INT2]] + ; GFX11-NEXT: [[FMA3:%[0-9]+]]:_(s32) = G_FMA [[FMA2]], [[FMA1]], [[FMUL]] + ; GFX11-NEXT: [[FMA4:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA3]], [[INT2]] + ; GFX11-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1) + ; GFX11-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32) + ; GFX11-NEXT: $vgpr0 = COPY [[INT6]](s32) %0:_(s32) = G_FCONSTANT float -1.0 %1:_(s32) = COPY $vgpr0 %2:_(s32) = G_FDIV %0, %1 @@ -2558,22 +3238,6 @@ body: | ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64) ; - ; GFX9-UNSAFE-LABEL: name: test_fdiv_s64_constant_one_rcp - ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00 - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; GFX9-UNSAFE-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[COPY]] - ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY]](s64) - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT]], [[C]] - ; GFX9-UNSAFE-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMA]], [[INT]], [[INT]] - ; GFX9-UNSAFE-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C]] - ; GFX9-UNSAFE-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA2]], [[FMA1]], [[FMA1]] - ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[C]], [[FMA3]] - ; GFX9-UNSAFE-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[C]] - ; GFX9-UNSAFE-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]] - ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[FMA5]](s64) - ; ; GFX10-LABEL: name: test_fdiv_s64_constant_one_rcp ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -2592,6 +3256,25 @@ body: | ; GFX10-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1) ; GFX10-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64) + ; + ; GFX11-LABEL: name: test_fdiv_s64_constant_one_rcp + ; GFX11: liveins: $vgpr0_vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00 + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX11-NEXT: 
[[INT:%[0-9]+]]:_(s64), [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[C]](s64), [[COPY]](s64), 0 + ; GFX11-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[INT]] + ; GFX11-NEXT: [[INT2:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s64) + ; GFX11-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT2]], [[C]] + ; GFX11-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[INT2]], [[FMA]], [[INT2]] + ; GFX11-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C]] + ; GFX11-NEXT: [[INT3:%[0-9]+]]:_(s64), [[INT4:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[C]](s64), [[COPY]](s64), 1 + ; GFX11-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA1]], [[FMA2]], [[FMA1]] + ; GFX11-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INT3]], [[FMA3]] + ; GFX11-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[INT3]] + ; GFX11-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1) + ; GFX11-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64) + ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64) %0:_(s64) = G_FCONSTANT double 1.0 %1:_(s64) = COPY $vgpr0_vgpr1 %2:_(s64) = G_FDIV %0, %1 @@ -2678,23 +3361,6 @@ body: | ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64) ; - ; GFX9-UNSAFE-LABEL: name: test_fdiv_s64_constant_negative_one_rcp - ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1 - ; GFX9-UNSAFE-NEXT: {{ $}} - ; GFX9-UNSAFE-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double -1.000000e+00 - ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; GFX9-UNSAFE-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[COPY]] - ; GFX9-UNSAFE-NEXT: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00 - ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY]](s64) - ; GFX9-UNSAFE-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT]], [[C1]] - ; GFX9-UNSAFE-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[FMA]], [[INT]], [[INT]] - ; GFX9-UNSAFE-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C1]] - ; GFX9-UNSAFE-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA2]], [[FMA1]], [[FMA1]] - ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[C]], [[FMA3]] - ; GFX9-UNSAFE-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[C]] - ; GFX9-UNSAFE-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]] - ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[FMA5]](s64) - ; ; GFX10-LABEL: name: test_fdiv_s64_constant_negative_one_rcp ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} @@ -2714,6 +3380,26 @@ body: | ; GFX10-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1) ; GFX10-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64) + ; + ; GFX11-LABEL: name: test_fdiv_s64_constant_negative_one_rcp + ; GFX11: liveins: $vgpr0_vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double -1.000000e+00 + ; GFX11-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; GFX11-NEXT: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.000000e+00 + ; GFX11-NEXT: [[INT:%[0-9]+]]:_(s64), [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[C]](s64), 
[[COPY]](s64), 0 + ; GFX11-NEXT: [[FNEG:%[0-9]+]]:_(s64) = G_FNEG [[INT]] + ; GFX11-NEXT: [[INT2:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s64) + ; GFX11-NEXT: [[FMA:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[INT2]], [[C1]] + ; GFX11-NEXT: [[FMA1:%[0-9]+]]:_(s64) = G_FMA [[INT2]], [[FMA]], [[INT2]] + ; GFX11-NEXT: [[FMA2:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMA1]], [[C1]] + ; GFX11-NEXT: [[INT3:%[0-9]+]]:_(s64), [[INT4:%[0-9]+]]:_(s1) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[C]](s64), [[COPY]](s64), 1 + ; GFX11-NEXT: [[FMA3:%[0-9]+]]:_(s64) = G_FMA [[FMA1]], [[FMA2]], [[FMA1]] + ; GFX11-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[INT3]], [[FMA3]] + ; GFX11-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[INT3]] + ; GFX11-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1) + ; GFX11-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64) + ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64) %0:_(s64) = G_FCONSTANT double -1.0 %1:_(s64) = COPY $vgpr0_vgpr1 %2:_(s64) = G_FDIV %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll index 9ae9d1977bd11..210e09fd9169a 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -1702,7 +1702,7 @@ entry: %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext %a.val = load volatile half, ptr addrspace(1) %gep.a %b.val = load volatile half, ptr addrspace(1) %gep.b - %r.val = fdiv half %a.val, %b.val + %r.val = fdiv afn half %a.val, %b.val store half %r.val, ptr addrspace(1) %gep.r ret void } @@ -2475,4 +2475,4 @@ declare <2 x half> @llvm.sqrt.v2f16(<2 x half>) #2 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } -attributes #2 = { nounwind "unsafe-fp-math"="true" } +attributes #2 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index 57b4857776246..1d33c26686528 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -1101,62 +1101,21 @@ entry: define amdgpu_kernel void @fptrunc_f64_to_f16_afn( ; SI-SDAG-LABEL: fptrunc_f64_to_f16_afn: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 -; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 -; SI-SDAG-NEXT: s_mov_b32 s10, s2 -; SI-SDAG-NEXT: s_mov_b32 s11, s3 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 +; SI-SDAG-NEXT: s_mov_b32 s10, s6 +; SI-SDAG-NEXT: s_mov_b32 s11, s7 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: s_mov_b32 s8, s6 -; SI-SDAG-NEXT: s_mov_b32 s9, s7 +; SI-SDAG-NEXT: s_mov_b32 s8, s2 +; SI-SDAG-NEXT: s_mov_b32 s9, s3 ; SI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; SI-SDAG-NEXT: s_movk_i32 s0, 0x7e00 +; SI-SDAG-NEXT: s_mov_b32 s4, s0 +; SI-SDAG-NEXT: s_mov_b32 s5, s1 ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) -; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v1 -; SI-SDAG-NEXT: s_and_b32 s6, s1, 0x1ff -; SI-SDAG-NEXT: s_lshr_b32 s7, s1, 8 -; SI-SDAG-NEXT: s_bfe_u32 s8, s1, 0xb0014 -; SI-SDAG-NEXT: v_or_b32_e32 v0, s6, v0 -; SI-SDAG-NEXT: s_and_b32 s6, s7, 0xffe -; SI-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8 -; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13 -; SI-SDAG-NEXT: 
v_readfirstlane_b32 s7, v0 -; SI-SDAG-NEXT: v_readfirstlane_b32 s9, v1 -; SI-SDAG-NEXT: s_or_b32 s6, s6, s7 -; SI-SDAG-NEXT: s_or_b32 s7, s6, 0x1000 -; SI-SDAG-NEXT: s_lshr_b32 s10, s7, s9 -; SI-SDAG-NEXT: s_lshl_b32 s9, s10, s9 -; SI-SDAG-NEXT: s_cmp_lg_u32 s9, s7 -; SI-SDAG-NEXT: s_cselect_b32 s7, 1, 0 -; SI-SDAG-NEXT: s_addk_i32 s8, 0xfc10 -; SI-SDAG-NEXT: s_or_b32 s7, s10, s7 -; SI-SDAG-NEXT: s_lshl_b32 s9, s8, 12 -; SI-SDAG-NEXT: s_or_b32 s9, s6, s9 -; SI-SDAG-NEXT: s_cmp_lt_i32 s8, 1 -; SI-SDAG-NEXT: s_cselect_b32 s7, s7, s9 -; SI-SDAG-NEXT: s_and_b32 s9, s7, 7 -; SI-SDAG-NEXT: s_cmp_gt_i32 s9, 5 -; SI-SDAG-NEXT: s_cselect_b32 s10, 1, 0 -; SI-SDAG-NEXT: s_cmp_eq_u32 s9, 3 -; SI-SDAG-NEXT: s_cselect_b32 s9, 1, 0 -; SI-SDAG-NEXT: s_lshr_b32 s7, s7, 2 -; SI-SDAG-NEXT: s_or_b32 s9, s9, s10 -; SI-SDAG-NEXT: s_add_i32 s7, s7, s9 -; SI-SDAG-NEXT: s_cmp_lt_i32 s8, 31 -; SI-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00 -; SI-SDAG-NEXT: s_cmp_lg_u32 s6, 0 -; SI-SDAG-NEXT: s_cselect_b32 s0, s0, 0x7c00 -; SI-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f -; SI-SDAG-NEXT: s_cselect_b32 s0, s0, s7 -; SI-SDAG-NEXT: s_lshr_b32 s1, s1, 16 -; SI-SDAG-NEXT: s_and_b32 s1, s1, 0x8000 -; SI-SDAG-NEXT: s_or_b32 s6, s1, s0 -; SI-SDAG-NEXT: s_mov_b32 s0, s4 -; SI-SDAG-NEXT: s_mov_b32 s1, s5 -; SI-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; SI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: fptrunc_f64_to_f16_afn: @@ -1174,62 +1133,21 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn( ; ; VI-SDAG-LABEL: fptrunc_f64_to_f16_afn: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 -; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; VI-SDAG-NEXT: s_mov_b32 s2, -1 -; VI-SDAG-NEXT: s_mov_b32 s10, s2 -; VI-SDAG-NEXT: s_mov_b32 s11, s3 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; VI-SDAG-NEXT: s_mov_b32 s6, -1 +; VI-SDAG-NEXT: s_mov_b32 s10, s6 +; VI-SDAG-NEXT: s_mov_b32 s11, s7 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_mov_b32 s8, s6 -; VI-SDAG-NEXT: s_mov_b32 s9, s7 +; VI-SDAG-NEXT: s_mov_b32 s8, s2 +; VI-SDAG-NEXT: s_mov_b32 s9, s3 ; VI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; VI-SDAG-NEXT: s_mov_b32 s0, s4 -; VI-SDAG-NEXT: s_mov_b32 s1, s5 -; VI-SDAG-NEXT: s_movk_i32 s6, 0x7e00 +; VI-SDAG-NEXT: s_mov_b32 s4, s0 +; VI-SDAG-NEXT: s_mov_b32 s5, s1 ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) -; VI-SDAG-NEXT: v_readfirstlane_b32 s4, v1 -; VI-SDAG-NEXT: s_and_b32 s5, s4, 0x1ff -; VI-SDAG-NEXT: v_or_b32_e32 v0, s5, v0 -; VI-SDAG-NEXT: s_lshr_b32 s7, s4, 8 -; VI-SDAG-NEXT: s_bfe_u32 s8, s4, 0xb0014 -; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-SDAG-NEXT: s_and_b32 s5, s7, 0xffe -; VI-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; VI-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13 -; VI-SDAG-NEXT: v_readfirstlane_b32 s7, v0 -; VI-SDAG-NEXT: s_or_b32 s5, s5, s7 -; VI-SDAG-NEXT: v_readfirstlane_b32 s9, v1 -; VI-SDAG-NEXT: s_or_b32 s7, s5, 0x1000 -; VI-SDAG-NEXT: s_lshr_b32 s10, s7, s9 -; VI-SDAG-NEXT: s_lshl_b32 s9, s10, s9 -; VI-SDAG-NEXT: s_cmp_lg_u32 s9, s7 -; VI-SDAG-NEXT: s_cselect_b32 s7, 1, 0 -; VI-SDAG-NEXT: s_addk_i32 s8, 0xfc10 -; VI-SDAG-NEXT: s_lshl_b32 s9, s8, 12 -; VI-SDAG-NEXT: s_or_b32 s7, s10, s7 -; VI-SDAG-NEXT: s_or_b32 s9, s5, s9 -; VI-SDAG-NEXT: s_cmp_lt_i32 s8, 1 -; VI-SDAG-NEXT: s_cselect_b32 s7, s7, s9 -; 
VI-SDAG-NEXT: s_and_b32 s9, s7, 7 -; VI-SDAG-NEXT: s_cmp_gt_i32 s9, 5 -; VI-SDAG-NEXT: s_cselect_b32 s10, 1, 0 -; VI-SDAG-NEXT: s_cmp_eq_u32 s9, 3 -; VI-SDAG-NEXT: s_cselect_b32 s9, 1, 0 -; VI-SDAG-NEXT: s_lshr_b32 s7, s7, 2 -; VI-SDAG-NEXT: s_or_b32 s9, s9, s10 -; VI-SDAG-NEXT: s_add_i32 s7, s7, s9 -; VI-SDAG-NEXT: s_cmp_lt_i32 s8, 31 -; VI-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00 -; VI-SDAG-NEXT: s_cmp_lg_u32 s5, 0 -; VI-SDAG-NEXT: s_cselect_b32 s5, s6, 0x7c00 -; VI-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f -; VI-SDAG-NEXT: s_cselect_b32 s5, s5, s7 -; VI-SDAG-NEXT: s_lshr_b32 s4, s4, 16 -; VI-SDAG-NEXT: s_and_b32 s4, s4, 0x8000 -; VI-SDAG-NEXT: s_or_b32 s4, s4, s5 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: fptrunc_f64_to_f16_afn: @@ -1247,62 +1165,21 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn( ; ; GFX9-SDAG-LABEL: fptrunc_f64_to_f16_afn: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s6, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s7, s3 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s4, s10 -; GFX9-SDAG-NEXT: s_mov_b32 s5, s11 -; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s0, s8 -; GFX9-SDAG-NEXT: s_mov_b32 s1, s9 -; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7e00 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_readfirstlane_b32 s5, v1 -; GFX9-SDAG-NEXT: s_and_b32 s6, s5, 0x1ff -; GFX9-SDAG-NEXT: v_or_b32_e32 v0, s6, v0 -; GFX9-SDAG-NEXT: s_lshr_b32 s7, s5, 8 -; GFX9-SDAG-NEXT: s_bfe_u32 s8, s5, 0xb0014 -; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: s_and_b32 s6, s7, 0xffe -; GFX9-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13 -; GFX9-SDAG-NEXT: v_readfirstlane_b32 s7, v0 -; GFX9-SDAG-NEXT: s_or_b32 s6, s6, s7 -; GFX9-SDAG-NEXT: v_readfirstlane_b32 s9, v1 -; GFX9-SDAG-NEXT: s_or_b32 s7, s6, 0x1000 -; GFX9-SDAG-NEXT: s_lshr_b32 s10, s7, s9 -; GFX9-SDAG-NEXT: s_lshl_b32 s9, s10, s9 -; GFX9-SDAG-NEXT: s_cmp_lg_u32 s9, s7 -; GFX9-SDAG-NEXT: s_cselect_b32 s7, 1, 0 -; GFX9-SDAG-NEXT: s_addk_i32 s8, 0xfc10 -; GFX9-SDAG-NEXT: s_lshl_b32 s9, s8, 12 -; GFX9-SDAG-NEXT: s_or_b32 s7, s10, s7 -; GFX9-SDAG-NEXT: s_or_b32 s9, s6, s9 -; GFX9-SDAG-NEXT: s_cmp_lt_i32 s8, 1 -; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, s9 -; GFX9-SDAG-NEXT: s_and_b32 s9, s7, 7 -; GFX9-SDAG-NEXT: s_cmp_gt_i32 s9, 5 -; GFX9-SDAG-NEXT: s_cselect_b32 s10, 1, 0 -; GFX9-SDAG-NEXT: s_cmp_eq_u32 s9, 3 -; GFX9-SDAG-NEXT: s_cselect_b32 s9, 1, 0 -; GFX9-SDAG-NEXT: s_lshr_b32 s7, s7, 2 -; GFX9-SDAG-NEXT: s_or_b32 s9, s9, s10 -; GFX9-SDAG-NEXT: s_add_i32 s7, s7, s9 -; GFX9-SDAG-NEXT: s_cmp_lt_i32 s8, 31 -; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00 -; GFX9-SDAG-NEXT: s_cmp_lg_u32 s6, 0 -; GFX9-SDAG-NEXT: 
s_cselect_b32 s4, s4, 0x7c00 -; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f -; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, s7 -; GFX9-SDAG-NEXT: s_lshr_b32 s5, s5, 16 -; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 -; GFX9-SDAG-NEXT: s_or_b32 s4, s5, s4 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fptrunc_f64_to_f16_afn: @@ -1320,62 +1197,21 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn( ; ; GFX950-SDAG-LABEL: fptrunc_f64_to_f16_afn: ; GFX950-SDAG: ; %bb.0: ; %entry -; GFX950-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 -; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; GFX950-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX950-SDAG-NEXT: s_mov_b32 s6, s2 -; GFX950-SDAG-NEXT: s_mov_b32 s7, s3 +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX950-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX950-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX950-SDAG-NEXT: s_mov_b32 s11, s7 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: s_mov_b32 s4, s10 -; GFX950-SDAG-NEXT: s_mov_b32 s5, s11 -; GFX950-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; GFX950-SDAG-NEXT: s_mov_b32 s0, s8 -; GFX950-SDAG-NEXT: s_mov_b32 s1, s9 -; GFX950-SDAG-NEXT: s_movk_i32 s4, 0x7e00 +; GFX950-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX950-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX950-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; GFX950-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX950-SDAG-NEXT: s_mov_b32 s5, s1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX950-SDAG-NEXT: v_readfirstlane_b32 s5, v1 -; GFX950-SDAG-NEXT: s_and_b32 s6, s5, 0x1ff -; GFX950-SDAG-NEXT: v_or_b32_e32 v0, s6, v0 -; GFX950-SDAG-NEXT: s_lshr_b32 s7, s5, 8 -; GFX950-SDAG-NEXT: s_bfe_u32 s8, s5, 0xb0014 -; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX950-SDAG-NEXT: s_and_b32 s6, s7, 0xffe -; GFX950-SDAG-NEXT: s_sub_i32 s7, 0x3f1, s8 -; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX950-SDAG-NEXT: v_med3_i32 v1, s7, 0, 13 -; GFX950-SDAG-NEXT: v_readfirstlane_b32 s7, v0 -; GFX950-SDAG-NEXT: s_or_b32 s6, s6, s7 -; GFX950-SDAG-NEXT: v_readfirstlane_b32 s9, v1 -; GFX950-SDAG-NEXT: s_or_b32 s7, s6, 0x1000 -; GFX950-SDAG-NEXT: s_lshr_b32 s10, s7, s9 -; GFX950-SDAG-NEXT: s_lshl_b32 s9, s10, s9 -; GFX950-SDAG-NEXT: s_cmp_lg_u32 s9, s7 -; GFX950-SDAG-NEXT: s_cselect_b32 s7, 1, 0 -; GFX950-SDAG-NEXT: s_addk_i32 s8, 0xfc10 -; GFX950-SDAG-NEXT: s_lshl_b32 s9, s8, 12 -; GFX950-SDAG-NEXT: s_or_b32 s7, s10, s7 -; GFX950-SDAG-NEXT: s_or_b32 s9, s6, s9 -; GFX950-SDAG-NEXT: s_cmp_lt_i32 s8, 1 -; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, s9 -; GFX950-SDAG-NEXT: s_and_b32 s9, s7, 7 -; GFX950-SDAG-NEXT: s_cmp_gt_i32 s9, 5 -; GFX950-SDAG-NEXT: s_cselect_b32 s10, 1, 0 -; GFX950-SDAG-NEXT: s_cmp_eq_u32 s9, 3 -; GFX950-SDAG-NEXT: s_cselect_b32 s9, 1, 0 -; GFX950-SDAG-NEXT: s_lshr_b32 s7, s7, 2 -; GFX950-SDAG-NEXT: s_or_b32 s9, s9, s10 -; GFX950-SDAG-NEXT: s_add_i32 s7, s7, s9 -; GFX950-SDAG-NEXT: s_cmp_lt_i32 s8, 31 -; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, 0x7c00 -; GFX950-SDAG-NEXT: s_cmp_lg_u32 s6, 0 -; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00 -; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s8, 0x40f -; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, s7 -; GFX950-SDAG-NEXT: s_lshr_b32 s5, s5, 16 -; GFX950-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 -; GFX950-SDAG-NEXT: s_or_b32 s4, s5, s4 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 
s4 -; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX950-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: fptrunc_f64_to_f16_afn: @@ -1401,60 +1237,13 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn( ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2 ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3 -; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 -; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s3, s2, 0x1ff -; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s2, 8 -; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, s3, v0 -; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s3, s2, 0xb0014 -; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s4, 0x3f1, s3 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v1, s4, 0, 13 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s8, v1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, s5, s4 -; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s4, 0x1000 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s9, s5, s8 -; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s9, s8 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s8, s5 -; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 -; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s3, 0xfc10 -; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s9, s5 -; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s3, 12 -; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s4, s8 -; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 1 -; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, s8 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s8, s5, 7 -; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s8, 5 -; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, 1, 0 -; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s8, 3 -; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, 1, 0 -; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s5, 2 -; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s8, s9 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s5, s5, s8 -; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 31 -; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s8, 0x7e00 -; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, 0x7c00 -; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s4, 0 -; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s4, s8, 0x7c00 -; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s3, 0x40f -; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s4, s5 -; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 -; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s2, s2, 0x8000 +; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 -; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s2, s2, s3 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, 
s2 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-SDAG-TRUE16-NEXT: s_endpgm ; @@ -1468,60 +1257,13 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn( ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2 ; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3 -; GFX11-SDAG-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 -; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s2, v1 -; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s3, s2, 0x1ff -; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s2, 8 -; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, s3, v0 -; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s3, s2, 0xb0014 -; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s4, 0x3f1, s3 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v1, s4, 0, 13 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s8, v1 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v0 -; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s4, s5, s4 -; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s4, 0x1000 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s9, s5, s8 -; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s9, s8 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s8, s5 -; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 -; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s3, 0xfc10 -; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s9, s5 -; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s3, 12 -; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s4, s8 -; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 1 -; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, s8 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s8, s5, 7 -; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s8, 5 -; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, 1, 0 -; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s8, 3 -; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, 1, 0 -; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s5, 2 -; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s8, s9 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s5, s5, s8 -; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 31 -; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s8, 0x7e00 -; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, 0x7c00 -; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s4, 0 -; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s4, s8, 0x7c00 -; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s3, 0x40f -; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s4, s5 -; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0 -; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s2, s2, 0x8000 +; GFX11-SDAG-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 ; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 -; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s2, s2, s3 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; 
GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX11-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-SDAG-FAKE16-NEXT: s_endpgm ; @@ -3026,106 +2768,25 @@ entry: define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn( ; SI-SDAG-LABEL: fptrunc_v2f64_to_v2f16_afn: ; SI-SDAG: ; %bb.0: ; %entry -; SI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 -; SI-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; SI-SDAG-NEXT: s_mov_b32 s2, -1 -; SI-SDAG-NEXT: s_mov_b32 s10, s2 -; SI-SDAG-NEXT: s_mov_b32 s11, s3 +; SI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; SI-SDAG-NEXT: s_mov_b32 s6, -1 +; SI-SDAG-NEXT: s_mov_b32 s10, s6 +; SI-SDAG-NEXT: s_mov_b32 s11, s7 ; SI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SI-SDAG-NEXT: s_mov_b32 s8, s6 -; SI-SDAG-NEXT: s_mov_b32 s9, s7 +; SI-SDAG-NEXT: s_mov_b32 s8, s2 +; SI-SDAG-NEXT: s_mov_b32 s9, s3 ; SI-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; SI-SDAG-NEXT: s_movk_i32 s0, 0x7e00 +; SI-SDAG-NEXT: s_mov_b32 s4, s0 +; SI-SDAG-NEXT: s_mov_b32 s5, s1 ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) -; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v3 -; SI-SDAG-NEXT: v_readfirstlane_b32 s6, v1 -; SI-SDAG-NEXT: s_and_b32 s7, s1, 0x1ff -; SI-SDAG-NEXT: s_lshr_b32 s8, s1, 8 -; SI-SDAG-NEXT: s_bfe_u32 s9, s1, 0xb0014 -; SI-SDAG-NEXT: v_or_b32_e32 v1, s7, v2 -; SI-SDAG-NEXT: s_and_b32 s7, s8, 0xffe -; SI-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9 -; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; SI-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13 -; SI-SDAG-NEXT: v_readfirstlane_b32 s8, v1 -; SI-SDAG-NEXT: v_readfirstlane_b32 s10, v2 -; SI-SDAG-NEXT: s_or_b32 s7, s7, s8 -; SI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 -; SI-SDAG-NEXT: s_lshr_b32 s11, s8, s10 -; SI-SDAG-NEXT: s_lshl_b32 s10, s11, s10 -; SI-SDAG-NEXT: s_cmp_lg_u32 s10, s8 -; SI-SDAG-NEXT: s_cselect_b32 s8, 1, 0 -; SI-SDAG-NEXT: s_addk_i32 s9, 0xfc10 -; SI-SDAG-NEXT: s_or_b32 s8, s11, s8 -; SI-SDAG-NEXT: s_lshl_b32 s10, s9, 12 -; SI-SDAG-NEXT: s_or_b32 s10, s7, s10 -; SI-SDAG-NEXT: s_cmp_lt_i32 s9, 1 -; SI-SDAG-NEXT: s_cselect_b32 s8, s8, s10 -; SI-SDAG-NEXT: s_and_b32 s10, s8, 7 -; SI-SDAG-NEXT: s_cmp_gt_i32 s10, 5 -; SI-SDAG-NEXT: s_cselect_b32 s11, 1, 0 -; SI-SDAG-NEXT: s_cmp_eq_u32 s10, 3 -; SI-SDAG-NEXT: s_cselect_b32 s10, 1, 0 -; SI-SDAG-NEXT: s_lshr_b32 s8, s8, 2 -; SI-SDAG-NEXT: s_or_b32 s10, s10, s11 -; SI-SDAG-NEXT: s_add_i32 s8, s8, s10 -; SI-SDAG-NEXT: s_cmp_lt_i32 s9, 31 -; SI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 -; SI-SDAG-NEXT: s_cmp_lg_u32 s7, 0 -; SI-SDAG-NEXT: s_cselect_b32 s7, s0, 0x7c00 -; SI-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f -; SI-SDAG-NEXT: s_cselect_b32 s7, s7, s8 -; SI-SDAG-NEXT: s_lshr_b32 s1, s1, 16 -; SI-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff -; SI-SDAG-NEXT: s_lshr_b32 s9, s6, 8 -; SI-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014 -; SI-SDAG-NEXT: s_and_b32 s1, s1, 0x8000 -; SI-SDAG-NEXT: v_or_b32_e32 v0, s8, v0 -; SI-SDAG-NEXT: s_and_b32 s8, s9, 0xffe -; SI-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10 -; SI-SDAG-NEXT: s_or_b32 s1, s1, s7 -; SI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13 -; SI-SDAG-NEXT: s_lshl_b32 s1, s1, 16 -; SI-SDAG-NEXT: v_readfirstlane_b32 s7, v0 -; SI-SDAG-NEXT: v_readfirstlane_b32 s9, v1 -; SI-SDAG-NEXT: s_or_b32 s7, s8, s7 -; SI-SDAG-NEXT: 
s_or_b32 s8, s7, 0x1000 -; SI-SDAG-NEXT: s_lshr_b32 s11, s8, s9 -; SI-SDAG-NEXT: s_lshl_b32 s9, s11, s9 -; SI-SDAG-NEXT: s_cmp_lg_u32 s9, s8 -; SI-SDAG-NEXT: s_cselect_b32 s8, 1, 0 -; SI-SDAG-NEXT: s_addk_i32 s10, 0xfc10 -; SI-SDAG-NEXT: s_or_b32 s8, s11, s8 -; SI-SDAG-NEXT: s_lshl_b32 s9, s10, 12 -; SI-SDAG-NEXT: s_or_b32 s9, s7, s9 -; SI-SDAG-NEXT: s_cmp_lt_i32 s10, 1 -; SI-SDAG-NEXT: s_cselect_b32 s8, s8, s9 -; SI-SDAG-NEXT: s_and_b32 s9, s8, 7 -; SI-SDAG-NEXT: s_cmp_gt_i32 s9, 5 -; SI-SDAG-NEXT: s_cselect_b32 s11, 1, 0 -; SI-SDAG-NEXT: s_cmp_eq_u32 s9, 3 -; SI-SDAG-NEXT: s_cselect_b32 s9, 1, 0 -; SI-SDAG-NEXT: s_lshr_b32 s8, s8, 2 -; SI-SDAG-NEXT: s_or_b32 s9, s9, s11 -; SI-SDAG-NEXT: s_add_i32 s8, s8, s9 -; SI-SDAG-NEXT: s_cmp_lt_i32 s10, 31 -; SI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 -; SI-SDAG-NEXT: s_cmp_lg_u32 s7, 0 -; SI-SDAG-NEXT: s_cselect_b32 s0, s0, 0x7c00 -; SI-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f -; SI-SDAG-NEXT: s_cselect_b32 s0, s0, s8 -; SI-SDAG-NEXT: s_lshr_b32 s6, s6, 16 -; SI-SDAG-NEXT: s_and_b32 s6, s6, 0x8000 -; SI-SDAG-NEXT: s_or_b32 s0, s6, s0 -; SI-SDAG-NEXT: s_and_b32 s0, s0, 0xffff -; SI-SDAG-NEXT: s_or_b32 s6, s0, s1 -; SI-SDAG-NEXT: s_mov_b32 s0, s4 -; SI-SDAG-NEXT: s_mov_b32 s1, s5 -; SI-SDAG-NEXT: v_mov_b32_e32 v0, s6 -; SI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] +; SI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2 +; SI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-SDAG-NEXT: s_endpgm ; ; SI-GISEL-LABEL: fptrunc_v2f64_to_v2f16_afn: @@ -3147,106 +2808,24 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn( ; ; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16_afn: ; VI-SDAG: ; %bb.0: ; %entry -; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 -; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; VI-SDAG-NEXT: s_mov_b32 s2, -1 -; VI-SDAG-NEXT: s_mov_b32 s10, s2 -; VI-SDAG-NEXT: s_mov_b32 s11, s3 +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; VI-SDAG-NEXT: s_mov_b32 s6, -1 +; VI-SDAG-NEXT: s_mov_b32 s10, s6 +; VI-SDAG-NEXT: s_mov_b32 s11, s7 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_mov_b32 s8, s6 -; VI-SDAG-NEXT: s_mov_b32 s9, s7 +; VI-SDAG-NEXT: s_mov_b32 s8, s2 +; VI-SDAG-NEXT: s_mov_b32 s9, s3 ; VI-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; VI-SDAG-NEXT: s_mov_b32 s0, s4 -; VI-SDAG-NEXT: s_mov_b32 s1, s5 -; VI-SDAG-NEXT: s_movk_i32 s6, 0x7e00 +; VI-SDAG-NEXT: s_mov_b32 s4, s0 +; VI-SDAG-NEXT: s_mov_b32 s5, s1 ; VI-SDAG-NEXT: s_waitcnt vmcnt(0) -; VI-SDAG-NEXT: v_readfirstlane_b32 s4, v3 -; VI-SDAG-NEXT: s_and_b32 s7, s4, 0x1ff -; VI-SDAG-NEXT: v_readfirstlane_b32 s5, v1 -; VI-SDAG-NEXT: v_or_b32_e32 v1, s7, v2 -; VI-SDAG-NEXT: s_lshr_b32 s8, s4, 8 -; VI-SDAG-NEXT: s_bfe_u32 s9, s4, 0xb0014 -; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; VI-SDAG-NEXT: s_and_b32 s7, s8, 0xffe -; VI-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; VI-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13 -; VI-SDAG-NEXT: v_readfirstlane_b32 s8, v1 -; VI-SDAG-NEXT: s_or_b32 s7, s7, s8 -; VI-SDAG-NEXT: v_readfirstlane_b32 s10, v2 -; VI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 -; VI-SDAG-NEXT: s_lshr_b32 s11, s8, s10 -; VI-SDAG-NEXT: s_lshl_b32 s10, s11, s10 -; VI-SDAG-NEXT: s_cmp_lg_u32 s10, s8 -; VI-SDAG-NEXT: s_cselect_b32 s8, 1, 0 -; VI-SDAG-NEXT: s_addk_i32 s9, 0xfc10 -; 
VI-SDAG-NEXT: s_lshl_b32 s10, s9, 12 -; VI-SDAG-NEXT: s_or_b32 s8, s11, s8 -; VI-SDAG-NEXT: s_or_b32 s10, s7, s10 -; VI-SDAG-NEXT: s_cmp_lt_i32 s9, 1 -; VI-SDAG-NEXT: s_cselect_b32 s8, s8, s10 -; VI-SDAG-NEXT: s_and_b32 s10, s8, 7 -; VI-SDAG-NEXT: s_cmp_gt_i32 s10, 5 -; VI-SDAG-NEXT: s_cselect_b32 s11, 1, 0 -; VI-SDAG-NEXT: s_cmp_eq_u32 s10, 3 -; VI-SDAG-NEXT: s_cselect_b32 s10, 1, 0 -; VI-SDAG-NEXT: s_lshr_b32 s8, s8, 2 -; VI-SDAG-NEXT: s_or_b32 s10, s10, s11 -; VI-SDAG-NEXT: s_add_i32 s8, s8, s10 -; VI-SDAG-NEXT: s_cmp_lt_i32 s9, 31 -; VI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 -; VI-SDAG-NEXT: s_cmp_lg_u32 s7, 0 -; VI-SDAG-NEXT: s_cselect_b32 s7, s6, 0x7c00 -; VI-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f -; VI-SDAG-NEXT: s_cselect_b32 s7, s7, s8 -; VI-SDAG-NEXT: s_and_b32 s8, s5, 0x1ff -; VI-SDAG-NEXT: v_or_b32_e32 v0, s8, v0 -; VI-SDAG-NEXT: s_lshr_b32 s4, s4, 16 -; VI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-SDAG-NEXT: s_lshr_b32 s9, s5, 8 -; VI-SDAG-NEXT: s_bfe_u32 s10, s5, 0xb0014 -; VI-SDAG-NEXT: s_and_b32 s4, s4, 0x8000 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; VI-SDAG-NEXT: s_and_b32 s8, s9, 0xffe -; VI-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10 -; VI-SDAG-NEXT: s_or_b32 s4, s4, s7 -; VI-SDAG-NEXT: v_readfirstlane_b32 s7, v0 -; VI-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13 -; VI-SDAG-NEXT: s_or_b32 s7, s8, s7 -; VI-SDAG-NEXT: v_readfirstlane_b32 s9, v1 -; VI-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 -; VI-SDAG-NEXT: s_lshr_b32 s11, s8, s9 -; VI-SDAG-NEXT: s_lshl_b32 s4, s4, 16 -; VI-SDAG-NEXT: s_lshl_b32 s9, s11, s9 -; VI-SDAG-NEXT: s_cmp_lg_u32 s9, s8 -; VI-SDAG-NEXT: s_cselect_b32 s8, 1, 0 -; VI-SDAG-NEXT: s_addk_i32 s10, 0xfc10 -; VI-SDAG-NEXT: s_lshl_b32 s9, s10, 12 -; VI-SDAG-NEXT: s_or_b32 s8, s11, s8 -; VI-SDAG-NEXT: s_or_b32 s9, s7, s9 -; VI-SDAG-NEXT: s_cmp_lt_i32 s10, 1 -; VI-SDAG-NEXT: s_cselect_b32 s8, s8, s9 -; VI-SDAG-NEXT: s_and_b32 s9, s8, 7 -; VI-SDAG-NEXT: s_cmp_gt_i32 s9, 5 -; VI-SDAG-NEXT: s_cselect_b32 s11, 1, 0 -; VI-SDAG-NEXT: s_cmp_eq_u32 s9, 3 -; VI-SDAG-NEXT: s_cselect_b32 s9, 1, 0 -; VI-SDAG-NEXT: s_lshr_b32 s8, s8, 2 -; VI-SDAG-NEXT: s_or_b32 s9, s9, s11 -; VI-SDAG-NEXT: s_add_i32 s8, s8, s9 -; VI-SDAG-NEXT: s_cmp_lt_i32 s10, 31 -; VI-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 -; VI-SDAG-NEXT: s_cmp_lg_u32 s7, 0 -; VI-SDAG-NEXT: s_cselect_b32 s6, s6, 0x7c00 -; VI-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f -; VI-SDAG-NEXT: s_cselect_b32 s6, s6, s8 -; VI-SDAG-NEXT: s_lshr_b32 s5, s5, 16 -; VI-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 -; VI-SDAG-NEXT: s_or_b32 s5, s5, s6 -; VI-SDAG-NEXT: s_and_b32 s5, s5, 0xffff -; VI-SDAG-NEXT: s_or_b32 s4, s5, s4 -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; VI-SDAG-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f16_afn: @@ -3267,104 +2846,24 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn( ; ; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16_afn: ; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 -; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX9-SDAG-NEXT: s_mov_b32 s6, s2 -; GFX9-SDAG-NEXT: s_mov_b32 s7, s3 +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; 
GFX9-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s11, s7 ; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_mov_b32 s4, s10 -; GFX9-SDAG-NEXT: s_mov_b32 s5, s11 -; GFX9-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GFX9-SDAG-NEXT: s_mov_b32 s0, s8 -; GFX9-SDAG-NEXT: s_mov_b32 s1, s9 -; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7e00 +; GFX9-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX9-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX9-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GFX9-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX9-SDAG-NEXT: s_mov_b32 s5, s1 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; GFX9-SDAG-NEXT: s_and_b32 s7, s5, 0x1ff -; GFX9-SDAG-NEXT: v_readfirstlane_b32 s6, v1 -; GFX9-SDAG-NEXT: v_or_b32_e32 v1, s7, v2 -; GFX9-SDAG-NEXT: s_lshr_b32 s8, s5, 8 -; GFX9-SDAG-NEXT: s_bfe_u32 s9, s5, 0xb0014 -; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-SDAG-NEXT: s_and_b32 s7, s8, 0xffe -; GFX9-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13 -; GFX9-SDAG-NEXT: v_readfirstlane_b32 s8, v1 -; GFX9-SDAG-NEXT: s_or_b32 s7, s7, s8 -; GFX9-SDAG-NEXT: v_readfirstlane_b32 s10, v2 -; GFX9-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 -; GFX9-SDAG-NEXT: s_lshr_b32 s11, s8, s10 -; GFX9-SDAG-NEXT: s_lshl_b32 s10, s11, s10 -; GFX9-SDAG-NEXT: s_cmp_lg_u32 s10, s8 -; GFX9-SDAG-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-SDAG-NEXT: s_addk_i32 s9, 0xfc10 -; GFX9-SDAG-NEXT: s_lshl_b32 s10, s9, 12 -; GFX9-SDAG-NEXT: s_or_b32 s8, s11, s8 -; GFX9-SDAG-NEXT: s_or_b32 s10, s7, s10 -; GFX9-SDAG-NEXT: s_cmp_lt_i32 s9, 1 -; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, s10 -; GFX9-SDAG-NEXT: s_and_b32 s10, s8, 7 -; GFX9-SDAG-NEXT: s_cmp_gt_i32 s10, 5 -; GFX9-SDAG-NEXT: s_cselect_b32 s11, 1, 0 -; GFX9-SDAG-NEXT: s_cmp_eq_u32 s10, 3 -; GFX9-SDAG-NEXT: s_cselect_b32 s10, 1, 0 -; GFX9-SDAG-NEXT: s_lshr_b32 s8, s8, 2 -; GFX9-SDAG-NEXT: s_or_b32 s10, s10, s11 -; GFX9-SDAG-NEXT: s_add_i32 s8, s8, s10 -; GFX9-SDAG-NEXT: s_cmp_lt_i32 s9, 31 -; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 -; GFX9-SDAG-NEXT: s_cmp_lg_u32 s7, 0 -; GFX9-SDAG-NEXT: s_cselect_b32 s7, s4, 0x7c00 -; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f -; GFX9-SDAG-NEXT: s_cselect_b32 s7, s7, s8 -; GFX9-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff -; GFX9-SDAG-NEXT: v_or_b32_e32 v0, s8, v0 -; GFX9-SDAG-NEXT: s_lshr_b32 s5, s5, 16 -; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-SDAG-NEXT: s_lshr_b32 s9, s6, 8 -; GFX9-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014 -; GFX9-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-SDAG-NEXT: s_and_b32 s8, s9, 0xffe -; GFX9-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10 -; GFX9-SDAG-NEXT: s_or_b32 s5, s5, s7 -; GFX9-SDAG-NEXT: v_readfirstlane_b32 s7, v0 -; GFX9-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13 -; GFX9-SDAG-NEXT: s_or_b32 s7, s8, s7 -; GFX9-SDAG-NEXT: v_readfirstlane_b32 s9, v1 -; GFX9-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 -; GFX9-SDAG-NEXT: s_lshr_b32 s11, s8, s9 -; GFX9-SDAG-NEXT: s_lshl_b32 s9, s11, s9 -; GFX9-SDAG-NEXT: s_cmp_lg_u32 s9, s8 -; GFX9-SDAG-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-SDAG-NEXT: s_addk_i32 s10, 0xfc10 -; GFX9-SDAG-NEXT: s_lshl_b32 s9, s10, 12 -; GFX9-SDAG-NEXT: s_or_b32 s8, s11, s8 -; GFX9-SDAG-NEXT: s_or_b32 s9, s7, s9 -; GFX9-SDAG-NEXT: s_cmp_lt_i32 s10, 1 -; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, s9 -; GFX9-SDAG-NEXT: s_and_b32 s9, s8, 7 -; GFX9-SDAG-NEXT: s_cmp_gt_i32 s9, 5 -; GFX9-SDAG-NEXT: s_cselect_b32 s11, 1, 0 -; 
GFX9-SDAG-NEXT: s_cmp_eq_u32 s9, 3 -; GFX9-SDAG-NEXT: s_cselect_b32 s9, 1, 0 -; GFX9-SDAG-NEXT: s_lshr_b32 s8, s8, 2 -; GFX9-SDAG-NEXT: s_or_b32 s9, s9, s11 -; GFX9-SDAG-NEXT: s_add_i32 s8, s8, s9 -; GFX9-SDAG-NEXT: s_cmp_lt_i32 s10, 31 -; GFX9-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 -; GFX9-SDAG-NEXT: s_cmp_lg_u32 s7, 0 -; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00 -; GFX9-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f -; GFX9-SDAG-NEXT: s_cselect_b32 s4, s4, s8 -; GFX9-SDAG-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-SDAG-NEXT: s_and_b32 s6, s6, 0x8000 -; GFX9-SDAG-NEXT: s_or_b32 s4, s6, s4 -; GFX9-SDAG-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] +; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-SDAG-NEXT: s_endpgm ; ; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16_afn: @@ -3385,104 +2884,22 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn( ; ; GFX950-SDAG-LABEL: fptrunc_v2f64_to_v2f16_afn: ; GFX950-SDAG: ; %bb.0: ; %entry -; GFX950-SDAG-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 -; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; GFX950-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX950-SDAG-NEXT: s_mov_b32 s6, s2 -; GFX950-SDAG-NEXT: s_mov_b32 s7, s3 +; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX950-SDAG-NEXT: s_mov_b32 s7, 0xf000 +; GFX950-SDAG-NEXT: s_mov_b32 s6, -1 +; GFX950-SDAG-NEXT: s_mov_b32 s10, s6 +; GFX950-SDAG-NEXT: s_mov_b32 s11, s7 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: s_mov_b32 s4, s10 -; GFX950-SDAG-NEXT: s_mov_b32 s5, s11 -; GFX950-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GFX950-SDAG-NEXT: s_mov_b32 s0, s8 -; GFX950-SDAG-NEXT: s_mov_b32 s1, s9 -; GFX950-SDAG-NEXT: s_movk_i32 s4, 0x7e00 +; GFX950-SDAG-NEXT: s_mov_b32 s8, s2 +; GFX950-SDAG-NEXT: s_mov_b32 s9, s3 +; GFX950-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GFX950-SDAG-NEXT: s_mov_b32 s4, s0 +; GFX950-SDAG-NEXT: s_mov_b32 s5, s1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX950-SDAG-NEXT: v_readfirstlane_b32 s5, v3 -; GFX950-SDAG-NEXT: s_and_b32 s7, s5, 0x1ff -; GFX950-SDAG-NEXT: v_readfirstlane_b32 s6, v1 -; GFX950-SDAG-NEXT: v_or_b32_e32 v1, s7, v2 -; GFX950-SDAG-NEXT: s_lshr_b32 s8, s5, 8 -; GFX950-SDAG-NEXT: s_bfe_u32 s9, s5, 0xb0014 -; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX950-SDAG-NEXT: s_and_b32 s7, s8, 0xffe -; GFX950-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s9 -; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX950-SDAG-NEXT: v_med3_i32 v2, s8, 0, 13 -; GFX950-SDAG-NEXT: v_readfirstlane_b32 s8, v1 -; GFX950-SDAG-NEXT: s_or_b32 s7, s7, s8 -; GFX950-SDAG-NEXT: v_readfirstlane_b32 s10, v2 -; GFX950-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 -; GFX950-SDAG-NEXT: s_lshr_b32 s11, s8, s10 -; GFX950-SDAG-NEXT: s_lshl_b32 s10, s11, s10 -; GFX950-SDAG-NEXT: s_cmp_lg_u32 s10, s8 -; GFX950-SDAG-NEXT: s_cselect_b32 s8, 1, 0 -; GFX950-SDAG-NEXT: s_addk_i32 s9, 0xfc10 -; GFX950-SDAG-NEXT: s_lshl_b32 s10, s9, 12 -; GFX950-SDAG-NEXT: s_or_b32 s8, s11, s8 -; GFX950-SDAG-NEXT: s_or_b32 s10, s7, s10 -; GFX950-SDAG-NEXT: s_cmp_lt_i32 s9, 1 -; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, s10 -; GFX950-SDAG-NEXT: s_and_b32 s10, s8, 7 -; GFX950-SDAG-NEXT: s_cmp_gt_i32 s10, 5 -; GFX950-SDAG-NEXT: s_cselect_b32 s11, 1, 0 -; GFX950-SDAG-NEXT: s_cmp_eq_u32 s10, 3 -; 
GFX950-SDAG-NEXT: s_cselect_b32 s10, 1, 0 -; GFX950-SDAG-NEXT: s_lshr_b32 s8, s8, 2 -; GFX950-SDAG-NEXT: s_or_b32 s10, s10, s11 -; GFX950-SDAG-NEXT: s_add_i32 s8, s8, s10 -; GFX950-SDAG-NEXT: s_cmp_lt_i32 s9, 31 -; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 -; GFX950-SDAG-NEXT: s_cmp_lg_u32 s7, 0 -; GFX950-SDAG-NEXT: s_cselect_b32 s7, s4, 0x7c00 -; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s9, 0x40f -; GFX950-SDAG-NEXT: s_cselect_b32 s7, s7, s8 -; GFX950-SDAG-NEXT: s_and_b32 s8, s6, 0x1ff -; GFX950-SDAG-NEXT: v_or_b32_e32 v0, s8, v0 -; GFX950-SDAG-NEXT: s_lshr_b32 s5, s5, 16 -; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX950-SDAG-NEXT: s_lshr_b32 s9, s6, 8 -; GFX950-SDAG-NEXT: s_bfe_u32 s10, s6, 0xb0014 -; GFX950-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 -; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX950-SDAG-NEXT: s_and_b32 s8, s9, 0xffe -; GFX950-SDAG-NEXT: s_sub_i32 s9, 0x3f1, s10 -; GFX950-SDAG-NEXT: s_or_b32 s5, s5, s7 -; GFX950-SDAG-NEXT: v_readfirstlane_b32 s7, v0 -; GFX950-SDAG-NEXT: v_med3_i32 v1, s9, 0, 13 -; GFX950-SDAG-NEXT: s_or_b32 s7, s8, s7 -; GFX950-SDAG-NEXT: v_readfirstlane_b32 s9, v1 -; GFX950-SDAG-NEXT: s_or_b32 s8, s7, 0x1000 -; GFX950-SDAG-NEXT: s_lshr_b32 s11, s8, s9 -; GFX950-SDAG-NEXT: s_lshl_b32 s9, s11, s9 -; GFX950-SDAG-NEXT: s_cmp_lg_u32 s9, s8 -; GFX950-SDAG-NEXT: s_cselect_b32 s8, 1, 0 -; GFX950-SDAG-NEXT: s_addk_i32 s10, 0xfc10 -; GFX950-SDAG-NEXT: s_lshl_b32 s9, s10, 12 -; GFX950-SDAG-NEXT: s_or_b32 s8, s11, s8 -; GFX950-SDAG-NEXT: s_or_b32 s9, s7, s9 -; GFX950-SDAG-NEXT: s_cmp_lt_i32 s10, 1 -; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, s9 -; GFX950-SDAG-NEXT: s_and_b32 s9, s8, 7 -; GFX950-SDAG-NEXT: s_cmp_gt_i32 s9, 5 -; GFX950-SDAG-NEXT: s_cselect_b32 s11, 1, 0 -; GFX950-SDAG-NEXT: s_cmp_eq_u32 s9, 3 -; GFX950-SDAG-NEXT: s_cselect_b32 s9, 1, 0 -; GFX950-SDAG-NEXT: s_lshr_b32 s8, s8, 2 -; GFX950-SDAG-NEXT: s_or_b32 s9, s9, s11 -; GFX950-SDAG-NEXT: s_add_i32 s8, s8, s9 -; GFX950-SDAG-NEXT: s_cmp_lt_i32 s10, 31 -; GFX950-SDAG-NEXT: s_cselect_b32 s8, s8, 0x7c00 -; GFX950-SDAG-NEXT: s_cmp_lg_u32 s7, 0 -; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00 -; GFX950-SDAG-NEXT: s_cmpk_eq_i32 s10, 0x40f -; GFX950-SDAG-NEXT: s_cselect_b32 s4, s4, s8 -; GFX950-SDAG-NEXT: s_lshr_b32 s6, s6, 16 -; GFX950-SDAG-NEXT: s_and_b32 s6, s6, 0x8000 -; GFX950-SDAG-NEXT: s_or_b32 s4, s6, s4 -; GFX950-SDAG-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] +; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v0, v0, v2 +; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: fptrunc_v2f64_to_v2f16_afn: @@ -3511,109 +2928,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn( ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2 ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 ; GFX11-SDAG-TRUE16-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0 -; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s2, v3 -; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s3, s2, 0x1ff -; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s2, 8 -; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, s3, v2 -; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s3, s2, 0xb0014 -; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s4, 0x3f1, s3 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, s4, 0, 13 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s8, v3 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v2 -; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s4, s5, s4 -; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s4, 0x1000 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s9, s5, s8 -; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s9, s8 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s8, s5 -; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 -; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s3, 0xfc10 -; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s5, s9, s5 -; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s8, s3, 12 -; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s4, s8 -; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 1 -; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, s8 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s8, s5, 7 -; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s8, 5 -; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, 1, 0 -; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s8, 3 -; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, 1, 0 -; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s5, s5, 2 -; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s8, s8, s9 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s5, s5, s8 -; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s3, 31 -; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s8, 0x7e00 -; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s5, s5, 0x7c00 -; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s4, 0 -; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 -; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s9, s8, 0x7c00 -; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s3, 0x40f -; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s9, s5 -; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s5, s4, 0x1ff -; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s10, s4, 8 -; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, s5, v0 -; GFX11-SDAG-TRUE16-NEXT: s_bfe_u32 s5, s4, 0xb0014 -; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s10, s10, 0xffe -; GFX11-SDAG-TRUE16-NEXT: s_sub_i32 s9, 0x3f1, s5 -; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v1, s9, 0, 13 -; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s2, s2, 0x8000 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s2, s2, s3 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s11, v1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-TRUE16-NEXT: v_readfirstlane_b32 s9, v0 -; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s9, s10, s9 -; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s9, 0x1000 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s12, s10, s11 -; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s11, s12, s11 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; 
GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s11, s10 -; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX11-SDAG-TRUE16-NEXT: s_addk_i32 s5, 0xfc10 -; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s3, s12, s3 -; GFX11-SDAG-TRUE16-NEXT: s_lshl_b32 s10, s5, 12 -; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s9, s10 -; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s5, 1 -; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s3, s10 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s10, s3, 7 -; GFX11-SDAG-TRUE16-NEXT: s_cmp_gt_i32 s10, 5 -; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s11, 1, 0 -; GFX11-SDAG-TRUE16-NEXT: s_cmp_eq_u32 s10, 3 -; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s10, 1, 0 -; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s3, s3, 2 -; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s10, s10, s11 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SDAG-TRUE16-NEXT: s_add_i32 s3, s3, s10 -; GFX11-SDAG-TRUE16-NEXT: s_cmp_lt_i32 s5, 31 -; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s3, 0x7c00 -; GFX11-SDAG-TRUE16-NEXT: s_cmp_lg_u32 s9, 0 -; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s8, s8, 0x7c00 -; GFX11-SDAG-TRUE16-NEXT: s_cmpk_eq_i32 s5, 0x40f ; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 -; GFX11-SDAG-TRUE16-NEXT: s_cselect_b32 s3, s8, s3 -; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s4, s4, 16 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-TRUE16-NEXT: s_and_b32 s4, s4, 0x8000 -; GFX11-SDAG-TRUE16-NEXT: s_or_b32 s3, s4, s3 -; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 -; GFX11-SDAG-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v1, v[0:1] +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v2 +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l ; GFX11-SDAG-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-SDAG-TRUE16-NEXT: s_endpgm ; @@ -3627,109 +2952,17 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn( ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2 ; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0 ; GFX11-SDAG-FAKE16-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0 -; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s2, v3 -; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s3, s2, 0x1ff -; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s2, 8 -; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, s3, v2 -; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s3, s2, 0xb0014 -; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe -; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s4, 0x3f1, s3 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v3, s4, 0, 13 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s8, v3 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v2 -; 
GFX11-SDAG-FAKE16-NEXT: s_or_b32 s4, s5, s4 -; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s4, 0x1000 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s9, s5, s8 -; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s9, s8 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s8, s5 -; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 -; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s3, 0xfc10 -; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s5, s9, s5 -; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s8, s3, 12 -; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s4, s8 -; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 1 -; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, s8 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s8, s5, 7 -; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s8, 5 -; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, 1, 0 -; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s8, 3 -; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, 1, 0 -; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s5, s5, 2 -; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s8, s8, s9 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s5, s5, s8 -; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s3, 31 -; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s8, 0x7e00 -; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s5, s5, 0x7c00 -; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s4, 0 -; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 -; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s9, s8, 0x7c00 -; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s3, 0x40f -; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s9, s5 -; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s5, s4, 0x1ff -; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s10, s4, 8 -; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, s5, v0 -; GFX11-SDAG-FAKE16-NEXT: s_bfe_u32 s5, s4, 0xb0014 -; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s10, s10, 0xffe -; GFX11-SDAG-FAKE16-NEXT: s_sub_i32 s9, 0x3f1, s5 -; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-SDAG-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v1, s9, 0, 13 -; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s2, s2, 0x8000 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s2, s2, s3 -; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s11, v1 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-FAKE16-NEXT: v_readfirstlane_b32 s9, v0 -; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s9, s10, s9 -; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s9, 0x1000 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s12, s10, s11 -; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s11, s12, s11 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s11, s10 -; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, 1, 0 -; GFX11-SDAG-FAKE16-NEXT: s_addk_i32 s5, 0xfc10 -; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s3, s12, s3 -; GFX11-SDAG-FAKE16-NEXT: s_lshl_b32 s10, s5, 12 -; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s9, s10 -; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s5, 1 -; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s3, s10 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s10, s3, 7 -; GFX11-SDAG-FAKE16-NEXT: s_cmp_gt_i32 s10, 5 -; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s11, 1, 0 -; GFX11-SDAG-FAKE16-NEXT: s_cmp_eq_u32 s10, 3 -; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s10, 1, 0 -; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s3, s3, 2 -; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s10, s10, s11 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SDAG-FAKE16-NEXT: s_add_i32 s3, s3, s10 -; GFX11-SDAG-FAKE16-NEXT: s_cmp_lt_i32 s5, 31 -; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s3, 0x7c00 -; GFX11-SDAG-FAKE16-NEXT: s_cmp_lg_u32 s9, 0 -; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s8, s8, 0x7c00 -; GFX11-SDAG-FAKE16-NEXT: s_cmpk_eq_i32 s5, 0x40f ; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 -; GFX11-SDAG-FAKE16-NEXT: s_cselect_b32 s3, s8, s3 -; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s4, s4, 16 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SDAG-FAKE16-NEXT: s_and_b32 s4, s4, 0x8000 -; GFX11-SDAG-FAKE16-NEXT: s_or_b32 s3, s4, s3 -; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0 -; GFX11-SDAG-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s3, s2 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-SDAG-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-SDAG-FAKE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index 4f8eab1c2fec0..5d311776066e5 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -226,59 +226,59 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; -; VI-SAFE-SDAG-LABEL: fptrunc_f64_to_f16: -; VI-SAFE-SDAG: ; %bb.0: -; VI-SAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 -; VI-SAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; VI-SAFE-SDAG-NEXT: s_mov_b32 s2, -1 -; VI-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SAFE-SDAG-NEXT: s_mov_b32 s0, s4 -; VI-SAFE-SDAG-NEXT: s_lshr_b32 s4, s7, 8 -; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s4, 0xffe -; VI-SAFE-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff -; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s6 -; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0 -; VI-SAFE-SDAG-NEXT: s_mov_b32 s1, s5 -; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s4, v0 -; VI-SAFE-SDAG-NEXT: s_bfe_u32 s6, s7, 0xb0014 -; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s8, s4 -; VI-SAFE-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s6 -; VI-SAFE-SDAG-NEXT: v_med3_i32 v0, s8, 0, 13 -; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 -; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s8, v0 -; VI-SAFE-SDAG-NEXT: s_lshr_b32 s9, s5, s8 -; VI-SAFE-SDAG-NEXT: s_lshl_b32 s8, s9, s8 -; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s8, s5 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0 -; VI-SAFE-SDAG-NEXT: s_addk_i32 s6, 0xfc10 -; VI-SAFE-SDAG-NEXT: s_lshl_b32 s8, s6, 12 -; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s9, s5 -; 
VI-SAFE-SDAG-NEXT: s_or_b32 s8, s4, s8 -; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s6, 1 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s8 -; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s5, 7 -; VI-SAFE-SDAG-NEXT: s_cmp_gt_i32 s8, 5 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s9, 1, 0 -; VI-SAFE-SDAG-NEXT: s_cmp_eq_u32 s8, 3 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s8, 1, 0 -; VI-SAFE-SDAG-NEXT: s_or_b32 s8, s8, s9 -; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2 -; VI-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s8 -; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s6, 31 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00 -; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0 -; VI-SAFE-SDAG-NEXT: s_movk_i32 s4, 0x7e00 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00 -; VI-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s6, 0x40f -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, s5 -; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s7, 16 -; VI-SAFE-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 -; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s5, s4 -; VI-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; VI-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 -; VI-SAFE-SDAG-NEXT: s_endpgm +; VI-SDAG-LABEL: fptrunc_f64_to_f16: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 +; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; VI-SDAG-NEXT: s_mov_b32 s2, -1 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: s_mov_b32 s0, s4 +; VI-SDAG-NEXT: s_lshr_b32 s4, s7, 8 +; VI-SDAG-NEXT: s_and_b32 s8, s4, 0xffe +; VI-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff +; VI-SDAG-NEXT: s_or_b32 s4, s4, s6 +; VI-SDAG-NEXT: s_cmp_lg_u32 s4, 0 +; VI-SDAG-NEXT: s_mov_b32 s1, s5 +; VI-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; VI-SDAG-NEXT: v_readfirstlane_b32 s4, v0 +; VI-SDAG-NEXT: s_bfe_u32 s6, s7, 0xb0014 +; VI-SDAG-NEXT: s_or_b32 s4, s8, s4 +; VI-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s6 +; VI-SDAG-NEXT: v_med3_i32 v0, s8, 0, 13 +; VI-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 +; VI-SDAG-NEXT: v_readfirstlane_b32 s8, v0 +; VI-SDAG-NEXT: s_lshr_b32 s9, s5, s8 +; VI-SDAG-NEXT: s_lshl_b32 s8, s9, s8 +; VI-SDAG-NEXT: s_cmp_lg_u32 s8, s5 +; VI-SDAG-NEXT: s_cselect_b32 s5, 1, 0 +; VI-SDAG-NEXT: s_addk_i32 s6, 0xfc10 +; VI-SDAG-NEXT: s_lshl_b32 s8, s6, 12 +; VI-SDAG-NEXT: s_or_b32 s5, s9, s5 +; VI-SDAG-NEXT: s_or_b32 s8, s4, s8 +; VI-SDAG-NEXT: s_cmp_lt_i32 s6, 1 +; VI-SDAG-NEXT: s_cselect_b32 s5, s5, s8 +; VI-SDAG-NEXT: s_and_b32 s8, s5, 7 +; VI-SDAG-NEXT: s_cmp_gt_i32 s8, 5 +; VI-SDAG-NEXT: s_cselect_b32 s9, 1, 0 +; VI-SDAG-NEXT: s_cmp_eq_u32 s8, 3 +; VI-SDAG-NEXT: s_cselect_b32 s8, 1, 0 +; VI-SDAG-NEXT: s_or_b32 s8, s8, s9 +; VI-SDAG-NEXT: s_lshr_b32 s5, s5, 2 +; VI-SDAG-NEXT: s_add_i32 s5, s5, s8 +; VI-SDAG-NEXT: s_cmp_lt_i32 s6, 31 +; VI-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; VI-SDAG-NEXT: s_cmp_lg_u32 s4, 0 +; VI-SDAG-NEXT: s_movk_i32 s4, 0x7e00 +; VI-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00 +; VI-SDAG-NEXT: s_cmpk_eq_i32 s6, 0x40f +; VI-SDAG-NEXT: s_cselect_b32 s4, s4, s5 +; VI-SDAG-NEXT: s_lshr_b32 s5, s7, 16 +; VI-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 +; VI-SDAG-NEXT: s_or_b32 s4, s5, s4 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: fptrunc_f64_to_f16: ; VI-GISEL: ; %bb.0: @@ -331,68 +331,57 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-GISEL-NEXT: s_endpgm ; -; VI-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16: -; VI-UNSAFE-SDAG: ; %bb.0: -; VI-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-UNSAFE-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) -; VI-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] -; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1 -; VI-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 -; VI-UNSAFE-SDAG-NEXT: s_endpgm -; -; GFX10-SAFE-SDAG-LABEL: fptrunc_f64_to_f16: -; GFX10-SAFE-SDAG: ; %bb.0: -; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff -; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2 -; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s5, 0xffe -; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0 -; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX10-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014 -; GFX10-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2 -; GFX10-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13 -; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0 -; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 -; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6 -; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6 -; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5 -; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6 -; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6 -; GFX10-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7 -; GFX10-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0 -; GFX10-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7 -; GFX10-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6 -; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31 -; GFX10-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00 -; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00 -; GFX10-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, s4, s5 -; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-SAFE-SDAG-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2 -; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX10-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX10-SAFE-SDAG-NEXT: s_endpgm +; GFX10-SDAG-LABEL: fptrunc_f64_to_f16: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff +; GFX10-SDAG-NEXT: s_lshr_b32 s5, s3, 8 +; GFX10-SDAG-NEXT: s_or_b32 s2, s4, s2 +; GFX10-SDAG-NEXT: s_and_b32 s4, s5, 0xffe +; GFX10-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX10-SDAG-NEXT: s_cselect_b32 s2, -1, 0 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX10-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014 +; GFX10-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2 +; GFX10-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s6, v1 +; GFX10-SDAG-NEXT: s_or_b32 s4, s4, s5 +; GFX10-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 +; GFX10-SDAG-NEXT: s_lshr_b32 s7, s5, s6 +; 
GFX10-SDAG-NEXT: s_lshl_b32 s6, s7, s6 +; GFX10-SDAG-NEXT: s_cmp_lg_u32 s6, s5 +; GFX10-SDAG-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-SDAG-NEXT: s_addk_i32 s2, 0xfc10 +; GFX10-SDAG-NEXT: s_or_b32 s5, s7, s5 +; GFX10-SDAG-NEXT: s_lshl_b32 s6, s2, 12 +; GFX10-SDAG-NEXT: s_or_b32 s6, s4, s6 +; GFX10-SDAG-NEXT: s_cmp_lt_i32 s2, 1 +; GFX10-SDAG-NEXT: s_cselect_b32 s5, s5, s6 +; GFX10-SDAG-NEXT: s_and_b32 s6, s5, 7 +; GFX10-SDAG-NEXT: s_cmp_gt_i32 s6, 5 +; GFX10-SDAG-NEXT: s_cselect_b32 s7, 1, 0 +; GFX10-SDAG-NEXT: s_cmp_eq_u32 s6, 3 +; GFX10-SDAG-NEXT: s_cselect_b32 s6, 1, 0 +; GFX10-SDAG-NEXT: s_lshr_b32 s5, s5, 2 +; GFX10-SDAG-NEXT: s_or_b32 s6, s6, s7 +; GFX10-SDAG-NEXT: s_add_i32 s5, s5, s6 +; GFX10-SDAG-NEXT: s_cmp_lt_i32 s2, 31 +; GFX10-SDAG-NEXT: s_movk_i32 s6, 0x7e00 +; GFX10-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; GFX10-SDAG-NEXT: s_cmp_lg_u32 s4, 0 +; GFX10-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00 +; GFX10-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f +; GFX10-SDAG-NEXT: s_cselect_b32 s2, s4, s5 +; GFX10-SDAG-NEXT: s_lshr_b32 s3, s3, 16 +; GFX10-SDAG-NEXT: s_and_b32 s3, s3, 0x8000 +; GFX10-SDAG-NEXT: s_or_b32 s2, s3, s2 +; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX10-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX10-GISEL: ; %bb.0: @@ -445,76 +434,65 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX10-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX10-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16: -; GFX10-UNSAFE-SDAG: ; %bb.0: -; GFX10-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] -; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX10-UNSAFE-SDAG-NEXT: s_endpgm -; -; GFX11-SAFE-SDAG-LABEL: fptrunc_f64_to_f16: -; GFX11-SAFE-SDAG: ; %bb.0: -; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff -; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2 -; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s5, 0xffe -; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX11-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014 -; GFX11-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13 -; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6 -; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | 
instid1(SALU_CYCLE_1) -; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0 -; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5 -; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6 -; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7 -; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0 -; GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s6, 1, 0 -; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6 -; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31 -; GFX11-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00 -; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00 -; GFX11-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, s4, s5 -; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SAFE-SDAG-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2 -; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX11-SAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-SAFE-SDAG-NEXT: s_endpgm +; GFX11-SDAG-LABEL: fptrunc_f64_to_f16: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff +; GFX11-SDAG-NEXT: s_lshr_b32 s5, s3, 8 +; GFX11-SDAG-NEXT: s_or_b32 s2, s4, s2 +; GFX11-SDAG-NEXT: s_and_b32 s4, s5, 0xffe +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-SDAG-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014 +; GFX11-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s6, v1 +; GFX11-SDAG-NEXT: s_or_b32 s4, s4, s5 +; GFX11-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_lshr_b32 s7, s5, s6 +; GFX11-SDAG-NEXT: s_lshl_b32 s6, s7, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s6, s5 +; GFX11-SDAG-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-SDAG-NEXT: s_addk_i32 s2, 0xfc10 +; GFX11-SDAG-NEXT: s_or_b32 s5, s7, s5 +; GFX11-SDAG-NEXT: s_lshl_b32 s6, s2, 12 +; GFX11-SDAG-NEXT: s_or_b32 s6, s4, s6 +; GFX11-SDAG-NEXT: s_cmp_lt_i32 s2, 1 +; GFX11-SDAG-NEXT: s_cselect_b32 s5, s5, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_and_b32 s6, s5, 7 +; 
GFX11-SDAG-NEXT: s_cmp_gt_i32 s6, 5 +; GFX11-SDAG-NEXT: s_cselect_b32 s7, 1, 0 +; GFX11-SDAG-NEXT: s_cmp_eq_u32 s6, 3 +; GFX11-SDAG-NEXT: s_cselect_b32 s6, 1, 0 +; GFX11-SDAG-NEXT: s_lshr_b32 s5, s5, 2 +; GFX11-SDAG-NEXT: s_or_b32 s6, s6, s7 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_add_i32 s5, s5, s6 +; GFX11-SDAG-NEXT: s_cmp_lt_i32 s2, 31 +; GFX11-SDAG-NEXT: s_movk_i32 s6, 0x7e00 +; GFX11-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00 +; GFX11-SDAG-NEXT: s_cmp_lg_u32 s4, 0 +; GFX11-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00 +; GFX11-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f +; GFX11-SDAG-NEXT: s_cselect_b32 s2, s4, s5 +; GFX11-SDAG-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_and_b32 s3, s3, 0x8000 +; GFX11-SDAG-NEXT: s_or_b32 s2, s3, s2 +; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: fptrunc_f64_to_f16: ; GFX11-GISEL: ; %bb.0: @@ -570,30 +548,6 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-GISEL-NEXT: s_endpgm -; -; GFX11-UNSAFE-DAG-TRUE16-LABEL: fptrunc_f64_to_f16: -; GFX11-UNSAFE-DAG-TRUE16: ; %bb.0: -; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-UNSAFE-DAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] -; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-UNSAFE-DAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX11-UNSAFE-DAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_endpgm -; -; GFX11-UNSAFE-DAG-FAKE16-LABEL: fptrunc_f64_to_f16: -; GFX11-UNSAFE-DAG-FAKE16: ; %bb.0: -; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-UNSAFE-DAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] -; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-UNSAFE-DAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-UNSAFE-DAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_endpgm %result = fptrunc double %in to half %result_i16 = bitcast half %result to i16 store i16 %result_i16, ptr addrspace(1) %out @@ -603,111 +557,27 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) define amdgpu_kernel void @fptrunc_f64_to_f16_afn(ptr addrspace(1) %out, double %in) { ; SI-LABEL: fptrunc_f64_to_f16_afn: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_movk_i32 s2, 0x7e00 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s0, s7, 8 -; SI-NEXT: s_and_b32 s1, s7, 0x1ff -; SI-NEXT: s_and_b32 s8, s0, 0xffe -; SI-NEXT: s_or_b32 s0, s1, s6 -; SI-NEXT: s_cmp_lg_u32 s0, 0 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014 -; SI-NEXT: v_readfirstlane_b32 s1, 
v0 -; SI-NEXT: s_sub_i32 s6, 0x3f1, s0 -; SI-NEXT: s_or_b32 s1, s8, s1 -; SI-NEXT: v_med3_i32 v0, s6, 0, 13 -; SI-NEXT: s_or_b32 s6, s1, 0x1000 -; SI-NEXT: v_readfirstlane_b32 s8, v0 -; SI-NEXT: s_lshr_b32 s9, s6, s8 -; SI-NEXT: s_lshl_b32 s8, s9, s8 -; SI-NEXT: s_cmp_lg_u32 s8, s6 -; SI-NEXT: s_cselect_b32 s6, 1, 0 -; SI-NEXT: s_addk_i32 s0, 0xfc10 -; SI-NEXT: s_or_b32 s6, s9, s6 -; SI-NEXT: s_lshl_b32 s8, s0, 12 -; SI-NEXT: s_or_b32 s8, s1, s8 -; SI-NEXT: s_cmp_lt_i32 s0, 1 -; SI-NEXT: s_cselect_b32 s6, s6, s8 -; SI-NEXT: s_and_b32 s8, s6, 7 -; SI-NEXT: s_cmp_gt_i32 s8, 5 -; SI-NEXT: s_cselect_b32 s9, 1, 0 -; SI-NEXT: s_cmp_eq_u32 s8, 3 -; SI-NEXT: s_cselect_b32 s8, 1, 0 -; SI-NEXT: s_lshr_b32 s6, s6, 2 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_add_i32 s6, s6, s8 -; SI-NEXT: s_cmp_lt_i32 s0, 31 -; SI-NEXT: s_cselect_b32 s6, s6, 0x7c00 -; SI-NEXT: s_cmp_lg_u32 s1, 0 -; SI-NEXT: s_cselect_b32 s1, s2, 0x7c00 -; SI-NEXT: s_cmpk_eq_i32 s0, 0x40f -; SI-NEXT: s_cselect_b32 s0, s1, s6 -; SI-NEXT: s_lshr_b32 s1, s7, 16 -; SI-NEXT: s_and_b32 s1, s1, 0x8000 -; SI-NEXT: s_or_b32 s6, s1, s0 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; -; VI-SAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn: -; VI-SAFE-SDAG: ; %bb.0: -; VI-SAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 -; VI-SAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; VI-SAFE-SDAG-NEXT: s_mov_b32 s2, -1 -; VI-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SAFE-SDAG-NEXT: s_mov_b32 s0, s4 -; VI-SAFE-SDAG-NEXT: s_lshr_b32 s4, s7, 8 -; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s4, 0xffe -; VI-SAFE-SDAG-NEXT: s_and_b32 s4, s7, 0x1ff -; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s6 -; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0 -; VI-SAFE-SDAG-NEXT: s_mov_b32 s1, s5 -; VI-SAFE-SDAG-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s4, v0 -; VI-SAFE-SDAG-NEXT: s_bfe_u32 s6, s7, 0xb0014 -; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s8, s4 -; VI-SAFE-SDAG-NEXT: s_sub_i32 s8, 0x3f1, s6 -; VI-SAFE-SDAG-NEXT: v_med3_i32 v0, s8, 0, 13 -; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 -; VI-SAFE-SDAG-NEXT: v_readfirstlane_b32 s8, v0 -; VI-SAFE-SDAG-NEXT: s_lshr_b32 s9, s5, s8 -; VI-SAFE-SDAG-NEXT: s_lshl_b32 s8, s9, s8 -; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s8, s5 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0 -; VI-SAFE-SDAG-NEXT: s_addk_i32 s6, 0xfc10 -; VI-SAFE-SDAG-NEXT: s_lshl_b32 s8, s6, 12 -; VI-SAFE-SDAG-NEXT: s_or_b32 s5, s9, s5 -; VI-SAFE-SDAG-NEXT: s_or_b32 s8, s4, s8 -; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s6, 1 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s8 -; VI-SAFE-SDAG-NEXT: s_and_b32 s8, s5, 7 -; VI-SAFE-SDAG-NEXT: s_cmp_gt_i32 s8, 5 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s9, 1, 0 -; VI-SAFE-SDAG-NEXT: s_cmp_eq_u32 s8, 3 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s8, 1, 0 -; VI-SAFE-SDAG-NEXT: s_or_b32 s8, s8, s9 -; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2 -; VI-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s8 -; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s6, 31 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00 -; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0 -; VI-SAFE-SDAG-NEXT: s_movk_i32 s4, 0x7e00 -; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00 -; VI-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s6, 0x40f -; VI-SAFE-SDAG-NEXT: s_cselect_b32 
s4, s4, s5 -; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s7, 16 -; VI-SAFE-SDAG-NEXT: s_and_b32 s5, s5, 0x8000 -; VI-SAFE-SDAG-NEXT: s_or_b32 s4, s5, s4 -; VI-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s4 -; VI-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 -; VI-SAFE-SDAG-NEXT: s_endpgm +; VI-SDAG-LABEL: fptrunc_f64_to_f16_afn: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000 +; VI-SDAG-NEXT: s_mov_b32 s2, -1 +; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: fptrunc_f64_to_f16_afn: ; VI-GISEL: ; %bb.0: @@ -720,68 +590,16 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn(ptr addrspace(1) %out, double ; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-GISEL-NEXT: s_endpgm ; -; VI-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn: -; VI-UNSAFE-SDAG: ; %bb.0: -; VI-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] -; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000 -; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1 -; VI-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 -; VI-UNSAFE-SDAG-NEXT: s_endpgm -; -; GFX10-SAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn: -; GFX10-SAFE-SDAG: ; %bb.0: -; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff -; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2 -; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s5, 0xffe -; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0 -; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX10-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014 -; GFX10-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2 -; GFX10-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13 -; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0 -; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 -; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6 -; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6 -; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5 -; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6 -; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6 -; GFX10-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7 -; GFX10-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0 -; GFX10-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2 -; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7 -; GFX10-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6 -; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31 -; GFX10-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00 -; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0 -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00 -; GFX10-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f -; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, s4, s5 -; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-SAFE-SDAG-NEXT: s_and_b32 s3, s3, 0x8000 -; 
GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2 -; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX10-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX10-SAFE-SDAG-NEXT: s_endpgm +; GFX10-SDAG-LABEL: fptrunc_f64_to_f16_afn: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX10-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: fptrunc_f64_to_f16_afn: ; GFX10-GISEL: ; %bb.0: @@ -794,74 +612,15 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn(ptr addrspace(1) %out, double ; GFX10-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX10-GISEL-NEXT: s_endpgm ; -; GFX10-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn: -; GFX10-UNSAFE-SDAG: ; %bb.0: -; GFX10-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] -; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000 -; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX10-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0 -; GFX10-UNSAFE-SDAG-NEXT: s_endpgm -; ; GFX11-SAFE-SDAG-LABEL: fptrunc_f64_to_f16_afn: ; GFX11-SAFE-SDAG: ; %bb.0: ; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff -; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2 -; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s5, 0xffe -; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX11-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014 -; GFX11-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13 -; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s7, s5, s6 -; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s7, s6 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, s5 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, 1, 0 -; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s7, s5 -; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s2, 12 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s4, s6 -; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s6 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7 -; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, 1, 0 -; 
GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s6, 1, 0 -; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6 -; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31 -; GFX11-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00 -; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0 -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00 -; GFX11-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f -; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, s4, s5 -; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-SAFE-SDAG-NEXT: s_and_b32 s3, s3, 0x8000 -; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2 +; GFX11-SAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] ; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-SAFE-SDAG-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-SAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-SAFE-SDAG-NEXT: s_endpgm ; @@ -1833,4 +1592,8 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32_afn(ptr addrspace(1) %out, <8 } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GFX10-SAFE-GISEL: {{.*}} +; GFX10-SAFE-SDAG: {{.*}} +; GFX10-UNSAFE-SDAG: {{.*}} ; VI-SAFE-GISEL: {{.*}} +; VI-SAFE-SDAG: {{.*}} +; VI-UNSAFE-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll index 87c7cce854b11..f81950bde03e0 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll @@ -1294,13 +1294,13 @@ define float @v_sqrt_f32__enough_unsafe_attrs(float %x) #3 { ret float %result } -define float @v_sqrt_f32__unsafe_attr(float %x) #4 { +define float @v_sqrt_f32__unsafe_attr(float %x) { ; GCN-LABEL: v_sqrt_f32__unsafe_attr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_sqrt_f32_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] - %result = call nsz float @llvm.sqrt.f32(float %x) + %result = call afn nsz float @llvm.sqrt.f32(float %x) ret float %result } @@ -4763,7 +4763,6 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo attributes #1 = { convergent nounwind willreturn memory(none) } attributes #2 = { "approx-func-fp-math"="true" } attributes #3 = { "approx-func-fp-math"="true" "no-nans-fp-math"="true" "no-infs-fp-math"="true" } -attributes #4 = { "unsafe-fp-math"="true" } attributes #5 = { "no-infs-fp-math"="true" } !0 = !{float 0.5} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll index 425a8530afa97..477f0a610feec 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll @@ -51,7 +51,7 @@ define amdgpu_kernel void @safe_no_fp32_denormals_rcp_f32(ptr addrspace(1) %out, ; SI-NOT: [[RESULT]] ; SI: buffer_store_dword [[RESULT]] define amdgpu_kernel void @safe_f32_denormals_rcp_pat_f32(ptr addrspace(1) %out, float %src) #4 { - %rcp = fdiv float 1.0, %src, !fpmath !0 + %rcp = fdiv afn float 1.0, %src, !fpmath !0 store float %rcp, ptr addrspace(1) %out, align 4 ret void } @@ -105,8 +105,8 @@ define amdgpu_kernel void @safe_rsq_rcp_pat_amdgcn_sqrt_f32_nocontract(ptr addrs ; SI: v_sqrt_f32_e32 ; SI: 
v_rcp_f32_e32 define amdgpu_kernel void @unsafe_rsq_rcp_pat_f32(ptr addrspace(1) %out, float %src) #2 { - %sqrt = call float @llvm.sqrt.f32(float %src) - %rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt) + %sqrt = call afn float @llvm.sqrt.f32(float %src) + %rcp = call afn float @llvm.amdgcn.rcp.f32(float %sqrt) store float %rcp, ptr addrspace(1) %out, align 4 ret void } @@ -148,7 +148,7 @@ define amdgpu_kernel void @rcp_pat_f64(ptr addrspace(1) %out, double %src) #1 { ; SI: v_fma_f64 ; SI: v_fma_f64 define amdgpu_kernel void @unsafe_rcp_pat_f64(ptr addrspace(1) %out, double %src) #2 { - %rcp = fdiv double 1.0, %src + %rcp = fdiv afn double 1.0, %src store double %rcp, ptr addrspace(1) %out, align 8 ret void } @@ -214,9 +214,9 @@ define amdgpu_kernel void @unsafe_amdgcn_sqrt_rsq_rcp_pat_f64(ptr addrspace(1) % } attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } -attributes #2 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } -attributes #3 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="ieee,ieee" } -attributes #4 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="ieee,ieee" } +attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #2 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #3 = { nounwind "denormal-fp-math-f32"="ieee,ieee" } +attributes #4 = { nounwind "denormal-fp-math-f32"="ieee,ieee" } !0 = !{float 2.500000e+00} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 8c1e166babaf8..7151fee3cdc96 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -3227,72 +3227,6 @@ define float @v_exp_f32_fast(float %in) { ret float %result } -define float @v_exp_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { -; GCN-SDAG-LABEL: v_exp_f32_unsafe_math_attr: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc2aeac50 -; GCN-SDAG-NEXT: v_add_f32_e32 v1, 0x42800000, v0 -; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GCN-SDAG-NEXT: v_mul_f32_e32 v1, 0x114b4ea4, v0 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GCN-GISEL-LABEL: v_exp_f32_unsafe_math_attr: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2aeac50 -; GCN-GISEL-NEXT: v_add_f32_e32 v2, 0x42800000, v0 -; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mul_f32_e32 v1, 0x114b4ea4, v0 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; SI-SDAG-LABEL: v_exp_f32_unsafe_math_attr: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: s_mov_b32 s4, 0xc2aeac50 -; SI-SDAG-NEXT: v_add_f32_e32 v1, 0x42800000, v0 -; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x114b4ea4, v0 -; SI-SDAG-NEXT: 
v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp_f32_unsafe_math_attr: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2aeac50 -; SI-GISEL-NEXT: v_add_f32_e32 v2, 0x42800000, v0 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x114b4ea4, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; R600-LABEL: v_exp_f32_unsafe_math_attr: -; R600: ; %bb.0: -; R600-NEXT: CF_END -; R600-NEXT: PAD -; -; CM-LABEL: v_exp_f32_unsafe_math_attr: -; CM: ; %bb.0: -; CM-NEXT: CF_END -; CM-NEXT: PAD - %result = call float @llvm.exp.f32(float %in) - ret float %result -} - define float @v_exp_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" { ; GCN-SDAG-LABEL: v_exp_f32_approx_fn_attr: ; GCN-SDAG: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index edc505bdd6c1d..918b1b26aff2e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -3235,78 +3235,6 @@ define float @v_exp10_f32_fast(float %in) { ret float %result } -define float @v_exp10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { -; GCN-SDAG-LABEL: v_exp10_f32_unsafe_math_attr: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: s_mov_b32 s4, 0xc217b818 -; GCN-SDAG-NEXT: v_add_f32_e32 v1, 0x42000000, v0 -; GCN-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-SDAG-NEXT: v_mul_f32_e32 v1, 0x3a2784bc, v0 -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0 -; GCN-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GCN-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: v_mul_f32_e32 v1, 0xa4fb11f, v0 -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GCN-GISEL-LABEL: v_exp10_f32_unsafe_math_attr: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2aeac50 -; GCN-GISEL-NEXT: v_add_f32_e32 v2, 0x42800000, v0 -; GCN-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; GCN-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GCN-GISEL-NEXT: v_mul_f32_e32 v1, 0x114b4ea4, v0 -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; SI-SDAG-LABEL: v_exp10_f32_unsafe_math_attr: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: s_mov_b32 s4, 0xc217b818 -; SI-SDAG-NEXT: v_add_f32_e32 v1, 0x42000000, v0 -; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3a2784bc, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549000, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0xa4fb11f, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp10_f32_unsafe_math_attr: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2aeac50 
-; SI-GISEL-NEXT: v_add_f32_e32 v2, 0x42800000, v0 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x114b4ea4, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; R600-LABEL: v_exp10_f32_unsafe_math_attr: -; R600: ; %bb.0: -; R600-NEXT: CF_END -; R600-NEXT: PAD -; -; CM-LABEL: v_exp10_f32_unsafe_math_attr: -; CM: ; %bb.0: -; CM-NEXT: CF_END -; CM-NEXT: PAD - %result = call float @llvm.exp10.f32(float %in) - ret float %result -} - define float @v_exp10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" { ; GCN-SDAG-LABEL: v_exp10_f32_approx_fn_attr: ; GCN-SDAG: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index 38d1b4789cf45..307fa89003b4b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -3076,121 +3076,6 @@ define float @v_log_f32_fast(float %in) { ret float %result } -define float @v_log_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { -; SI-SDAG-LABEL: v_log_f32_unsafe_math_attr: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317218 -; SI-SDAG-NEXT: v_fma_f32 v0, v0, s4, v1 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log_f32_unsafe_math_attr: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_log_f32_e32 v2, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0xc1b17218 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317218 -; SI-GISEL-NEXT: v_fma_f32 v0, v2, v1, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; VI-SDAG-LABEL: v_log_f32_unsafe_math_attr: -; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc -; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; VI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; VI-GISEL-LABEL: v_log_f32_unsafe_math_attr: -; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_log_f32_e32 v2, v0 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0xc1b17218 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317218, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; VI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-SDAG-LABEL: v_log_f32_unsafe_math_attr: -; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: 
v_cndmask_b32_e64 v2, 0, 32, vcc -; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc1b17218 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317218 -; GFX900-SDAG-NEXT: v_fma_f32 v0, v0, s4, v1 -; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-GISEL-LABEL: v_log_f32_unsafe_math_attr: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0xc1b17218 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317218 -; GFX900-GISEL-NEXT: v_fma_f32 v0, v2, v1, v0 -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-SDAG-LABEL: v_log_f32_unsafe_math_attr: -; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc1b17218, vcc_lo -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3f317218, v1 -; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-GISEL-LABEL: v_log_f32_unsafe_math_attr: -; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v0 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 0xc1b17218, vcc_lo -; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v0, 0x3f317218, v1 -; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; R600-LABEL: v_log_f32_unsafe_math_attr: -; R600: ; %bb.0: -; R600-NEXT: CF_END -; R600-NEXT: PAD -; -; CM-LABEL: v_log_f32_unsafe_math_attr: -; CM: ; %bb.0: -; CM-NEXT: CF_END -; CM-NEXT: PAD - %result = call float @llvm.log.f32(float %in) - ret float %result -} - define float @v_log_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" { ; SI-SDAG-LABEL: v_log_f32_approx_fn_attr: ; SI-SDAG: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 058933f5481a0..5278589968d91 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -3076,121 +3076,6 @@ define float @v_log10_f32_fast(float %in) { ret float %result } -define float @v_log10_f32_unsafe_math_attr(float %in) "unsafe-fp-math"="true" { -; SI-SDAG-LABEL: v_log10_f32_unsafe_math_attr: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; SI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; SI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; SI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b -; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209b -; SI-SDAG-NEXT: v_fma_f32 v0, v0, s4, v1 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log10_f32_unsafe_math_attr: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_log_f32_e32 v2, v0 -; 
SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0xc11a209b -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209b -; SI-GISEL-NEXT: v_fma_f32 v0, v2, v1, v0 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; VI-SDAG-LABEL: v_log10_f32_unsafe_math_attr: -; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; VI-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; VI-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc -; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; VI-SDAG-NEXT: v_log_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b -; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0 -; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1 -; VI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; VI-GISEL-LABEL: v_log10_f32_unsafe_math_attr: -; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_log_f32_e32 v2, v0 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0xc11a209b -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209b, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; VI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-SDAG-LABEL: v_log10_f32_unsafe_math_attr: -; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x800000 -; GFX900-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX900-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc -; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc11a209b -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209b -; GFX900-SDAG-NEXT: v_fma_f32 v0, v0, s4, v1 -; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-GISEL-LABEL: v_log10_f32_unsafe_math_attr: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_log_f32_e32 v2, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0xc11a209b -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209b -; GFX900-GISEL-NEXT: v_fma_f32 v0, v2, v1, v0 -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-SDAG-LABEL: v_log10_f32_unsafe_math_attr: -; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0xc11a209b, vcc_lo -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_fmamk_f32 v0, v0, 0x3e9a209b, v1 -; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-GISEL-LABEL: v_log10_f32_unsafe_math_attr: -; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v0 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 0xc11a209b, vcc_lo -; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; 
GFX1100-GISEL-NEXT: v_fmac_f32_e32 v0, 0x3e9a209b, v1 -; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; R600-LABEL: v_log10_f32_unsafe_math_attr: -; R600: ; %bb.0: -; R600-NEXT: CF_END -; R600-NEXT: PAD -; -; CM-LABEL: v_log10_f32_unsafe_math_attr: -; CM: ; %bb.0: -; CM-NEXT: CF_END -; CM-NEXT: PAD - %result = call float @llvm.log10.f32(float %in) - ret float %result -} - define float @v_log10_f32_approx_fn_attr(float %in) "approx-func-fp-math"="true" { ; SI-SDAG-LABEL: v_log10_f32_approx_fn_attr: ; SI-SDAG: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll index 228420ef0acb0..9f0ffbcf6eff9 100644 --- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll +++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll @@ -56,7 +56,7 @@ define float @v_rcp_f32_ieee_unsafe(float %x) #4 { ; R600: ; %bb.0: ; R600-NEXT: CF_END ; R600-NEXT: PAD - %rcp = fdiv float 1.0, %x + %rcp = fdiv afn float 1.0, %x ret float %rcp } @@ -1411,10 +1411,10 @@ define amdgpu_kernel void @s_div_arcp_neg_k_x_pat_f32_daz(ptr addrspace(1) %out) declare float @llvm.fabs.f32(float) #1 declare float @llvm.sqrt.f32(float) #1 -attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone } -attributes #2 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #2 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #3 = { nounwind "denormal-fp-math-f32"="ieee,ieee" } -attributes #4 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="ieee,ieee" } +attributes #4 = { nounwind "denormal-fp-math-f32"="ieee,ieee" } !0 = !{float 2.500000e+00} diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll index f7e0388561104..f967e951b27a4 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-UNSAFE,SI-DAZ-UNSAFE %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-UNSAFE,SI-IEEE-UNSAFE %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-UNSAFE,SI-DAZ-UNSAFE %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-UNSAFE,SI-IEEE-UNSAFE %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=hawaii -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-UNSAFE,CI-DAZ-UNSAFE %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=hawaii -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-UNSAFE,CI-IEEE-UNSAFE %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=hawaii -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-UNSAFE,CI-DAZ-UNSAFE %s +; RUN: llc 
-amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=hawaii -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-UNSAFE,CI-IEEE-UNSAFE %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -65,8 +65,8 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace( ; GCN-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-UNSAFE-NEXT: s_endpgm %val = load float, ptr addrspace(1) %in, align 4 - %sqrt = call contract float @llvm.sqrt.f32(float %val) nounwind readnone - %div = fdiv contract float 1.0, %sqrt, !fpmath !0 + %sqrt = call afn contract float @llvm.sqrt.f32(float %val) nounwind readnone + %div = fdiv afn contract float 1.0, %sqrt, !fpmath !0 store float %div, ptr addrspace(1) %out, align 4 ret void }
@@ -103,8 +103,8 @@ define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %va ; GCN-UNSAFE-NEXT: s_mov_b32 s2, -1 ; GCN-UNSAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-UNSAFE-NEXT: s_endpgm - %sqrt = call contract float @llvm.sqrt.f32(float %val) nounwind readnone - %div = fdiv contract float 1.0, %sqrt, !fpmath !0 + %sqrt = call afn contract float @llvm.sqrt.f32(float %val) nounwind readnone + %div = fdiv afn contract float 1.0, %sqrt, !fpmath !0 store float %div, ptr addrspace(1) %out, align 4 ret void }
@@ -196,7 +196,7 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i %x = call contract float @llvm.sqrt.f32(float %a) %y = fmul contract float %x, %b - %z = fdiv arcp contract float %c, %y + %z = fdiv arcp afn contract float %c, %y store float %z, ptr addrspace(1) %out.gep ret void }
@@ -258,8 +258,8 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp ; GCN-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-UNSAFE-NEXT: s_endpgm %val = load float, ptr addrspace(1) %in, align 4 - %sqrt = call contract float @llvm.sqrt.f32(float %val) - %div = fdiv contract float -1.0, %sqrt, !fpmath !0 + %sqrt = call afn contract float @llvm.sqrt.f32(float %val) + %div = fdiv afn contract float -1.0, %sqrt, !fpmath !0 store float %div, ptr addrspace(1) %out, align 4 ret void }
@@ -322,8 +322,8 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad ; GCN-UNSAFE-NEXT: s_endpgm %val = load float, ptr addrspace(1) %in, align 4 %val.fneg = fneg float %val - %sqrt = call contract float @llvm.sqrt.f32(float %val.fneg) - %div = fdiv contract float -1.0, %sqrt, !fpmath !0 + %sqrt = call afn contract float @llvm.sqrt.f32(float %val.fneg) + %div = fdiv afn contract float -1.0, %sqrt, !fpmath !0 store float %div, ptr addrspace(1) %out, align 4 ret void }
@@ -343,8 +343,8 @@ define float @v_neg_rsq_neg_f32(float %val) { ; GCN-IEEE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] %val.fneg = fneg float %val - %sqrt = call contract float @llvm.sqrt.f32(float %val.fneg) - %div = fdiv contract float -1.0, %sqrt, !fpmath !0 + %sqrt = call afn contract float @llvm.sqrt.f32(float %val.fneg) + %div = fdiv afn contract float -1.0, %sqrt, !fpmath !0 ret float %div }
@@ -367,8 +367,8 @@ define <2 x float> @v_neg_rsq_neg_v2f32(<2 x float> %val) { ; GCN-IEEE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] %val.fneg = fneg <2 x float> %val - %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val.fneg) - %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0 + %sqrt = call afn contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val.fneg) + %div = fdiv afn contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0 ret <2 x float> %div }
@@ -387,8 +387,8 @@ define float @v_neg_rsq_neg_f32_foldable_user(float %val0, float %val1) { ; GCN-IEEE-NEXT: v_mul_f32_e64 v0, -v0, v1 ; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] %val0.neg = fneg float %val0 - %sqrt = call contract float @llvm.sqrt.f32(float %val0.neg) - %div = fdiv contract float -1.0, %sqrt, !fpmath !0 + %sqrt = call afn contract float @llvm.sqrt.f32(float %val0.neg) + %div = fdiv afn contract float -1.0, %sqrt, !fpmath !0 %user = fmul contract float %div, %val1 ret float %user }
@@ -412,8 +412,8 @@ define <2 x float> @v_neg_rsq_neg_v2f32_foldable_user(<2 x float> %val0, <2 x fl ; GCN-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v3 ; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] %val0.fneg = fneg <2 x float> %val0 - %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0.fneg) - %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0 + %sqrt = call afn contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0.fneg) + %div = fdiv afn contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0 %user = fmul contract <2 x float> %div, %val1 ret <2 x float> %user }
@@ -432,8 +432,8 @@ define float @v_neg_rsq_f32(float %val) { ; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0 ; GCN-IEEE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] - %sqrt = call contract float @llvm.sqrt.f32(float %val) - %div = fdiv contract float -1.0, %sqrt, !fpmath !0 + %sqrt = call afn contract float @llvm.sqrt.f32(float %val) + %div = fdiv afn contract float -1.0, %sqrt, !fpmath !0 ret float %div }
@@ -455,8 +455,8 @@ define <2 x float> @v_neg_rsq_v2f32(<2 x float> %val) { ; GCN-IEEE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-IEEE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 ; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] - %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val) - %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0 + %sqrt = call afn contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val) + %div = fdiv afn contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0 ret <2 x float> %div }
@@ -474,8 +474,8 @@ define float @v_neg_rsq_f32_foldable_user(float %val0, float %val1) { ; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0 ; GCN-IEEE-NEXT: v_mul_f32_e64 v0, -v0, v1 ; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] - %sqrt = call contract float @llvm.sqrt.f32(float %val0) - %div = fdiv contract float -1.0, %sqrt, !fpmath !0 + %sqrt = call afn contract float @llvm.sqrt.f32(float %val0) + %div = fdiv afn contract float -1.0, %sqrt, !fpmath !0 %user = fmul contract float %div, %val1 ret float %user }
@@ -643,8 +643,8 @@ define <2 x float> @v_neg_rsq_v2f32_foldable_user(<2 x float> %val0, <2 x float> ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2 ; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] - %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0) - %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0 + %sqrt = call afn contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0) + %div = fdiv afn contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0 %user = fmul contract <2 x float> %div, %val1 ret <2 x float> %user }
@@ -672,8 +672,8 @@ define float @v_rsq_f32(float %val) { ; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, 0, 12, vcc ; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v1 ; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] - %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1 - %div = fdiv contract float 1.0, %sqrt, !fpmath !1 + %sqrt = call afn contract float @llvm.sqrt.f32(float %val), !fpmath !1 + %div = fdiv afn contract float 1.0, %sqrt,
!fpmath !1 ret float %div } @@ -756,9 +756,9 @@ define { float, float } @v_rsq_f32_multi_use(float %val) { ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v2, vcc, 0, v2 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v1, v2 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] - %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1 + %sqrt = call afn contract float @llvm.sqrt.f32(float %val), !fpmath !1 %insert.0 = insertvalue { float, float } poison, float %sqrt, 0 - %div = fdiv arcp contract float 1.0, %sqrt, !fpmath !1 + %div = fdiv arcp afn contract float 1.0, %sqrt, !fpmath !1 %insert.1 = insertvalue { float, float } %insert.0, float %div, 1 ret { float, float } %insert.1 } @@ -838,8 +838,8 @@ define float @v_rsq_f32_missing_contract0(float %val) { ; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 ; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0 ; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31] - %sqrt = call float @llvm.sqrt.f32(float %val), !fpmath !1 - %div = fdiv arcp contract float 1.0, %sqrt, !fpmath !1 + %sqrt = call afn float @llvm.sqrt.f32(float %val), !fpmath !1 + %div = fdiv arcp afn contract float 1.0, %sqrt, !fpmath !1 ret float %div } @@ -855,8 +855,8 @@ define float @v_rsq_f32_missing_contract1(float %val) { ; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0 ; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] - %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1 - %div = fdiv arcp float 1.0, %sqrt, !fpmath !1 + %sqrt = call afn contract float @llvm.sqrt.f32(float %val), !fpmath !1 + %div = fdiv arcp afn float 1.0, %sqrt, !fpmath !1 ret float %div } @@ -876,8 +876,8 @@ define float @v_rsq_f32_contractable_user(float %val0, float %val1) { ; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0 ; GCN-IEEE-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] - %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1 - %div = fdiv contract float 1.0, %sqrt, !fpmath !1 + %sqrt = call afn contract float @llvm.sqrt.f32(float %val0), !fpmath !1 + %div = fdiv afn contract float 1.0, %sqrt, !fpmath !1 %add = fadd contract float %div, %val1 ret float %add } @@ -897,8 +897,8 @@ define float @v_rsq_f32_contractable_user_missing_contract0(float %val0, float % ; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0 ; GCN-IEEE-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] - %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1 - %div = fdiv contract float 1.0, %sqrt, !fpmath !1 + %sqrt = call afn contract float @llvm.sqrt.f32(float %val0), !fpmath !1 + %div = fdiv afn contract float 1.0, %sqrt, !fpmath !1 %add = fadd contract float %div, %val1 ret float %add } @@ -918,8 +918,8 @@ define float @v_rsq_f32_contractable_user_missing_contract1(float %val0, float % ; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0 ; GCN-IEEE-NEXT: v_add_f32_e32 v0, v0, v1 ; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] - %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1 - %div = fdiv contract float 1.0, %sqrt, !fpmath !1 + %sqrt = call afn contract float @llvm.sqrt.f32(float %val0), !fpmath !1 + %div = fdiv afn contract float 1.0, %sqrt, !fpmath !1 %add = fadd float %div, %val1 ret float %add } @@ -953,8 +953,8 @@ define float @v_rsq_f32_known_never_posdenormal(float nofpclass(psub) %val) { ; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0 ; GCN-IEEE-NEXT: s_setpc_b64 s[30:31] - %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1 - %div = fdiv contract float 1.0, %sqrt, !fpmath !1 + %sqrt = call afn contract 
float @llvm.sqrt.f32(float %val), !fpmath !1 + %div = fdiv afn contract float 1.0, %sqrt, !fpmath !1 ret float %div } diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll index b78cbb0ac29cf..4aac193d6aeab 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll @@ -4504,7 +4504,7 @@ define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) { ret <2 x double> %rsq } -define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 { +define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) { ; SI-SDAG-LABEL: s_rsq_f64_unsafe: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0 @@ -4648,8 +4648,8 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 { ; VI-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; VI-GISEL-NEXT: v_readfirstlane_b32 s1, v1 ; VI-GISEL-NEXT: ; return to shader part epilog - %rsq = call contract double @llvm.sqrt.f64(double %x) - %result = fdiv contract double 1.0, %rsq + %rsq = call contract afn double @llvm.sqrt.f64(double %x) + %result = fdiv contract afn double 1.0, %rsq %cast = bitcast double %result to <2 x i32> %cast.0 = extractelement <2 x i32> %cast, i32 0 %cast.1 = extractelement <2 x i32> %cast, i32 1 @@ -4660,7 +4660,7 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 { ret <2 x i32> %insert.1 } -define double @v_rsq_f64_unsafe(double %x) #0 { +define double @v_rsq_f64_unsafe(double %x) { ; SI-SDAG-LABEL: v_rsq_f64_unsafe: ; SI-SDAG: ; %bb.0: ; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4800,8 +4800,8 @@ define double @v_rsq_f64_unsafe(double %x) #0 { ; VI-GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 ; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] ; VI-GISEL-NEXT: s_setpc_b64 s[30:31] - %sqrt = call double @llvm.sqrt.f64(double %x) - %rsq = fdiv double 1.0, %sqrt + %sqrt = call afn contract double @llvm.sqrt.f64(double %x) + %rsq = fdiv afn contract double 1.0, %sqrt ret double %rsq } @@ -5737,7 +5737,6 @@ define double @v_div_const_contract_sqrt_f64(double %x) { ret double %rsq } -attributes #0 = { "unsafe-fp-math"="true" } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GCN: {{.*}} ; GISEL: {{.*}}
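
Note on the test churn above: every file in this patch follows the same migration. The function-level "unsafe-fp-math" attribute and the -enable-unsafe-fp-math llc flag no longer gate these lowerings, so tests that relied on them either gain per-instruction fast-math flags or are deleted outright. A minimal before/after sketch in LLVM IR (the function names here are hypothetical, not taken from any test above):

; Old style (removed): relaxation requested for the whole function
; via a string attribute.
define double @rsq_before(double %x) "unsafe-fp-math"="true" {
  %sqrt = call double @llvm.sqrt.f64(double %x)
  %rsq = fdiv double 1.0, %sqrt
  ret double %rsq
}

; New style: the relaxation is carried by the instructions themselves.
define double @rsq_after(double %x) {
  %sqrt = call afn contract double @llvm.sqrt.f64(double %x)
  %rsq = fdiv afn contract double 1.0, %sqrt
  ret double %rsq
}

declare double @llvm.sqrt.f64(double)

The per-instruction form keeps the relaxation local to the operations that opted in, which is why the rsq tests add afn to both the sqrt call and the fdiv: both halves of the pattern have to carry the flag for rsq formation to remain legal.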