diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index aba3c0f80a024..a9d94090cbcf4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16619,8 +16619,8 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
   if (!HasFMAD && !HasFMA)
     return SDValue();
 
-  bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
-                              Options.UnsafeFPMath || HasFMAD);
+  bool AllowFusionGlobally =
+      Options.AllowFPOpFusion == FPOpFusion::Fast || HasFMAD;
   // If the addition is not contractable, do not combine.
   if (!AllowFusionGlobally && !N->getFlags().hasAllowContract())
     return SDValue();
@@ -17826,6 +17826,7 @@ template <class MatchContextClass> SDValue DAGCombiner::visitFMA(SDNode *N) {
   SDValue N2 = N->getOperand(2);
   ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
   ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+  ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
   const TargetOptions &Options = DAG.getTarget().Options;
@@ -17855,11 +17856,17 @@ template <class MatchContextClass> SDValue DAGCombiner::visitFMA(SDNode *N) {
   }
 
   // FIXME: use fast math flags instead of Options.UnsafeFPMath
-  if (Options.UnsafeFPMath) {
-    if (N0CFP && N0CFP->isZero())
-      return N2;
-    if (N1CFP && N1CFP->isZero())
-      return N2;
+  // TODO: Finally migrate away from global TargetOptions.
+  if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
+      (Options.NoNaNsFPMath && Options.NoInfsFPMath) ||
+      (N->getFlags().hasNoNaNs() && N->getFlags().hasNoInfs())) {
+    if (Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros() ||
+        (N2CFP && !N2CFP->isExactlyValue(-0.0))) {
+      if (N0CFP && N0CFP->isZero())
+        return N2;
+      if (N1CFP && N1CFP->isZero())
+        return N2;
+    }
   }
 
   // FIXME: Support splat of constant.
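Note on the rewritten guard above: fma(+/-0.0, x, z) -> z is only exact when the product 0*x is itself a zero, so x must be neither NaN nor infinite (hence nnan+ninf), and returning z can still change the sign of a zero result when z is exactly -0.0 (hence the extra nsz / N2CFP check). A standalone C++ sketch of the NaN/Inf corner case -- illustrative only, not part of the patch:

    // fma_zero_fold.cpp: why fma(0.0, x, z) -> z needs no-NaNs and no-Infs.
    #include <cmath>
    #include <cstdio>
    #include <limits>

    int main() {
      const double inf = std::numeric_limits<double>::infinity();
      // 0.0 * inf is NaN, so the true result is NaN, not the addend 1.0.
      std::printf("%f\n", std::fma(0.0, inf, 1.0));
      // Likewise with a NaN multiplicand.
      std::printf("%f\n", std::fma(0.0, std::nan(""), 1.0));
      return 0;
    }

With nnan+ninf established, the remaining check that a constant addend is not exactly -0.0 admits the fold without requiring nsz.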
diff --git a/llvm/test/CodeGen/AArch64/arm64-fp-contract-zero.ll b/llvm/test/CodeGen/AArch64/arm64-fp-contract-zero.ll
index 9a753748a29ef..53e69b985cafb 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fp-contract-zero.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fp-contract-zero.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=arm64 -fp-contract=fast -o - %s | FileCheck %s
+; RUN: llc -mtriple=arm64 -o - %s | FileCheck %s
 
 
 ; Make sure we don't try to fold an fneg into +0.0, creating an illegal constant
@@ -7,12 +7,10 @@
 define double @test_fms_fold(double %a, double %b) {
 ; CHECK-LABEL: test_fms_fold:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi d2, #0000000000000000
-; CHECK-NEXT:    fmul d1, d1, d2
-; CHECK-NEXT:    fnmsub d0, d0, d2, d1
+; CHECK-NEXT:    movi {{d[0-9]+}}, #0000000000000000
 ; CHECK-NEXT:    ret
-  %mul = fmul double %a, 0.000000e+00
-  %mul1 = fmul double %b, 0.000000e+00
+  %mul = fmul fast double %a, 0.000000e+00
+  %mul1 = fmul fast double %b, 0.000000e+00
   %sub = fsub double %mul, %mul1
   ret double %sub
 }
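The AArch64 test above now states its assumption per instruction: the multiplies carry `fast`, so the multiply-by-zero fold still fires without the global -fp-contract=fast RUN flag. As a reminder of why a plain `fmul double %a, 0.0` cannot be constant-folded, a small C++ check (illustrative only, not part of the patch):

    // mul_by_zero.cpp: x * 0.0 is not always +0.0.
    #include <cstdio>
    #include <limits>

    int main() {
      const double inf = std::numeric_limits<double>::infinity();
      std::printf("%f\n", -1.0 * 0.0); // -0.0: the sign of the zero survives
      std::printf("%f\n", inf * 0.0);  // nan
      return 0;
    }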
diff --git a/llvm/test/CodeGen/AMDGPU/fdot2.ll b/llvm/test/CodeGen/AMDGPU/fdot2.ll
index 776816d6aa0e3..b61981bbcecec 100644
--- a/llvm/test/CodeGen/AMDGPU/fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdot2.ll
@@ -1,28 +1,53 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-CONTRACT
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DENORM-CONTRACT
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -mattr="+dot7-insts,-dot10-insts" -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DOT10-DISABLED
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -mattr="+dot7-insts,-dot10-insts" -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DOT10-DISABLED
 
 ; (fadd (fmul S1.x, S2.x), (fadd (fmul (S1.y, S2.y), z))) -> (fdot2 S1, S2, z)
 
 ; Tests to make sure fdot2 is not generated when vector elements of dot-product expressions
 ; are not converted from f16 to f32.
-; GCN-LABEL: {{^}}dotproduct_f16
+; GCN-LABEL: {{^}}dotproduct_f16_contract
 ; GFX900: v_fma_f16
 ; GFX900: v_fma_f16
 
-; GFX906: v_mul_f16_e32
-; GFX906: v_mul_f16_e32
-
 ; GFX906-DL-UNSAFE: v_fma_f16
 ; GFX10-CONTRACT: v_fmac_f16
 ; GFX906-CONTRACT: v_mac_f16_e32
 ; GFX906-DENORM-CONTRACT: v_fma_f16
 ; GFX906-DOT10-DISABLED: v_fma_f16
+
+define amdgpu_kernel void @dotproduct_f16_contract(ptr addrspace(1) %src1,
+                                                   ptr addrspace(1) %src2,
+                                                   ptr addrspace(1) nocapture %dst) {
+entry:
+  %src1.vec = load <2 x half>, ptr addrspace(1) %src1
+  %src2.vec = load <2 x half>, ptr addrspace(1) %src2
+
+  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
+  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
+
+  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
+  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
+
+  %mul2 = fmul contract half %src1.el2, %src2.el2
+  %mul1 = fmul contract half %src1.el1, %src2.el1
+  %acc = load half, ptr addrspace(1) %dst, align 2
+  %acc1 = fadd contract half %mul2, %acc
+  %acc2 = fadd contract half %mul1, %acc1
+  store half %acc2, ptr addrspace(1) %dst, align 2
+  ret void
+}
+
+; GCN-LABEL: {{^}}dotproduct_f16
+
+; GFX906: v_mul_f16_e32
+; GFX906: v_mul_f16_e32
+
 define amdgpu_kernel void @dotproduct_f16(ptr addrspace(1) %src1,
                                           ptr addrspace(1) %src2,
                                           ptr addrspace(1) nocapture %dst) {
@@ -45,18 +70,12 @@ entry:
   ret void
 }
 
-
 ; We only want to generate fdot2 if:
 ; - vector element of dot product is converted from f16 to f32, and
 ; - the vectors are of type <2 x half>, and
 ; - "dot10-insts" is enabled
 
-; GCN-LABEL: {{^}}dotproduct_f16_f32
-; GFX900: v_mad_mix_f32
-; GFX900: v_mad_mix_f32
-
-; GFX906: v_mad_f32
-; GFX906: v_mac_f32_e32
+; GCN-LABEL: {{^}}dotproduct_f16_f32_contract
 
 ; GFX906-DL-UNSAFE: v_dot2_f32_f16
 ; GFX10-DL-UNSAFE: v_dot2c_f32_f16
@@ -65,6 +84,39 @@ entry:
 ; GFX906-CONTRACT: v_dot2_f32_f16
 ; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
 ; GFX906-DOT10-DISABLED: v_fma_mix_f32
+define amdgpu_kernel void @dotproduct_f16_f32_contract(ptr addrspace(1) %src1,
+                                                       ptr addrspace(1) %src2,
+                                                       ptr addrspace(1) nocapture %dst) {
+entry:
+  %src1.vec = load <2 x half>, ptr addrspace(1) %src1
+  %src2.vec = load <2 x half>, ptr addrspace(1) %src2
+
+  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
+  %csrc1.el1 = fpext half %src1.el1 to float
+  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
+  %csrc2.el1 = fpext half %src2.el1 to float
+
+  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
+  %csrc1.el2 = fpext half %src1.el2 to float
+  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
+  %csrc2.el2 = fpext half %src2.el2 to float
+
+  %mul2 = fmul contract float %csrc1.el2, %csrc2.el2
+  %mul1 = fmul contract float %csrc1.el1, %csrc2.el1
+  %acc = load float, ptr addrspace(1) %dst, align 4
+  %acc1 = fadd contract float %mul2, %acc
+  %acc2 = fadd contract float %mul1, %acc1
+  store float %acc2, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}dotproduct_f16_f32
+; GFX900: v_mad_mix_f32
+; GFX900: v_mad_mix_f32
+
+; GFX906: v_mad_f32
+; GFX906: v_mac_f32_e32
+
 define amdgpu_kernel void @dotproduct_f16_f32(ptr addrspace(1) %src1,
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
@@ -96,6 +148,39 @@ entry:
 ; - the vectors are of type <2 x half>, and
 ; - "dot10-insts" is enabled
 
+; GCN-LABEL: {{^}}dotproduct_diffvecorder_contract
+; GFX906-DL-UNSAFE: v_dot2_f32_f16
+; GFX10-DL-UNSAFE: v_dot2c_f32_f16
+
+; GFX906-CONTRACT: v_dot2_f32_f16
+; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
+; GFX906-DOT10-DISABLED: v_fma_mix_f32
+define amdgpu_kernel void @dotproduct_diffvecorder_contract(ptr addrspace(1) %src1,
+                                                            ptr addrspace(1) %src2,
+                                                            ptr addrspace(1) nocapture %dst) {
+entry:
+  %src1.vec = load <2 x half>, ptr addrspace(1) %src1
+  %src2.vec = load <2 x half>, ptr addrspace(1) %src2
+
+  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
+  %csrc1.el1 = fpext half %src1.el1 to float
+  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
+  %csrc2.el1 = fpext half %src2.el1 to float
+
+  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
+  %csrc1.el2 = fpext half %src1.el2 to float
+  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
+  %csrc2.el2 = fpext half %src2.el2 to float
+
+  %mul2 = fmul contract float %csrc2.el2, %csrc1.el2
+  %mul1 = fmul contract float %csrc1.el1, %csrc2.el1
+  %acc = load float, ptr addrspace(1) %dst, align 4
+  %acc1 = fadd contract float %mul2, %acc
+  %acc2 = fadd contract float %mul1, %acc1
+  store float %acc2, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
 ; GCN-LABEL: {{^}}dotproduct_diffvecorder
 ; GFX900: v_mad_mix_f32
 ; GFX900: v_mad_mix_f32
@@ -103,12 +188,6 @@ entry:
 ; GFX906: v_mad_f32
 ; GFX906: v_mac_f32_e32
 
-; GFX906-DL-UNSAFE: v_dot2_f32_f16
-; GFX10-DL-UNSAFE: v_dot2c_f32_f16
-
-; GFX906-CONTRACT: v_dot2_f32_f16
-; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
-; GFX906-DOT10-DISABLED: v_fma_mix_f32
 define amdgpu_kernel void @dotproduct_diffvecorder(ptr addrspace(1) %src1,
                                                    ptr addrspace(1) %src2,
                                                    ptr addrspace(1) nocapture %dst) {
@@ -136,17 +215,45 @@ entry:
 }
 
 ; Tests to make sure dot product is not generated when the vectors are not of <2 x half>.
-; GCN-LABEL: {{^}}dotproduct_v4f16
-; GFX900: v_mad_mix_f32
-
-; GFX906: v_mad_f32
-; GFX906: v_mac_f32_e32
+; GCN-LABEL: {{^}}dotproduct_v4f16_contract
 
 ; GCN-DL-UNSAFE: v_fma_mix_f32
 
 ; GFX906-CONTRACT: v_fma_mix_f32
 ; GFX906-DENORM-CONTRACT: v_fma_mix_f32
 ; GFX906-DOT10-DISABLED: v_fma_mix_f32
+define amdgpu_kernel void @dotproduct_v4f16_contract(ptr addrspace(1) %src1,
+                                                     ptr addrspace(1) %src2,
+                                                     ptr addrspace(1) nocapture %dst) {
+entry:
+  %src1.vec = load <4 x half>, ptr addrspace(1) %src1
+  %src2.vec = load <4 x half>, ptr addrspace(1) %src2
+
+  %src1.el1 = extractelement <4 x half> %src1.vec, i64 0
+  %csrc1.el1 = fpext half %src1.el1 to float
+  %src2.el1 = extractelement <4 x half> %src2.vec, i64 0
+  %csrc2.el1 = fpext half %src2.el1 to float
+
+  %src1.el2 = extractelement <4 x half> %src1.vec, i64 1
+  %csrc1.el2 = fpext half %src1.el2 to float
+  %src2.el2 = extractelement <4 x half> %src2.vec, i64 1
+  %csrc2.el2 = fpext half %src2.el2 to float
+
+  %mul2 = fmul contract float %csrc1.el2, %csrc2.el2
+  %mul1 = fmul float %csrc1.el1, %csrc2.el1
+  %acc = load float, ptr addrspace(1) %dst, align 4
+  %acc1 = fadd contract float %mul2, %acc
+  %acc2 = fadd contract float %mul1, %acc1
+  store float %acc2, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}dotproduct_v4f16
+; GFX900: v_mad_mix_f32
+
+; GFX906: v_mad_f32
+; GFX906: v_mac_f32_e32
+
 define amdgpu_kernel void @dotproduct_v4f16(ptr addrspace(1) %src1,
                                             ptr addrspace(1) %src2,
                                             ptr addrspace(1) nocapture %dst) {
@@ -173,6 +280,39 @@ entry:
   ret void
 }
 
+; GCN-LABEL: {{^}}NotAdotproductContract
+
+; GCN-DL-UNSAFE: v_fma_mix_f32
+
+; GFX906-CONTRACT: v_fma_mix_f32
+; GFX906-DENORM-CONTRACT: v_fma_mix_f32
+; GFX906-DOT10-DISABLED: v_fma_mix_f32
+define amdgpu_kernel void @NotAdotproductContract(ptr addrspace(1) %src1,
+                                                  ptr addrspace(1) %src2,
+                                                  ptr addrspace(1) nocapture %dst) {
+entry:
+  %src1.vec = load <2 x half>, ptr addrspace(1) %src1
+  %src2.vec = load <2 x half>, ptr addrspace(1) %src2
+
+  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
+  %csrc1.el1 = fpext half %src1.el1 to float
+  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
+  %csrc2.el1 = fpext half %src2.el1 to float
+
+  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
+  %csrc1.el2 = fpext half %src1.el2 to float
+  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
+  %csrc2.el2 = fpext half %src2.el2 to float
+
+  %mul2 = fmul contract float %csrc1.el2, %csrc1.el1
+  %mul1 = fmul contract float %csrc2.el1, %csrc2.el2
+  %acc = load float, ptr addrspace(1) %dst, align 4
+  %acc1 = fadd contract float %mul2, %acc
+  %acc2 = fadd contract float %mul1, %acc1
+  store float %acc2, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
 ; GCN-LABEL: {{^}}NotAdotproduct
 ; GFX900: v_mad_mix_f32
 ; GFX900: v_mad_mix_f32
@@ -180,11 +320,6 @@ entry:
 ; GFX906: v_mad_f32
 ; GFX906: v_mac_f32_e32
 
-; GCN-DL-UNSAFE: v_fma_mix_f32
-
-; GFX906-CONTRACT: v_fma_mix_f32
-; GFX906-DENORM-CONTRACT: v_fma_mix_f32
-; GFX906-DOT10-DISABLED: v_fma_mix_f32
 define amdgpu_kernel void @NotAdotproduct(ptr addrspace(1) %src1,
                                           ptr addrspace(1) %src2,
                                           ptr addrspace(1) nocapture %dst) {
@@ -211,6 +346,39 @@ entry:
   ret void
 }
 
+; GCN-LABEL: {{^}}Diff_Idx_NotAdotproductContract
+
+; GCN-DL-UNSAFE: v_fma_mix_f32
+
+; GFX906-CONTRACT: v_fma_mix_f32
+; GFX906-DENORM-CONTRACT: v_fma_mix_f32
+; GFX906-DOT10-DISABLED: v_fma_mix_f32
+define amdgpu_kernel void @Diff_Idx_NotAdotproductContract(ptr addrspace(1) %src1,
+                                                           ptr addrspace(1) %src2,
+                                                           ptr addrspace(1) nocapture %dst) {
+entry:
+  %src1.vec = load <2 x half>, ptr addrspace(1) %src1
+  %src2.vec = load <2 x half>, ptr addrspace(1) %src2
+
+  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
+  %csrc1.el1 = fpext half %src1.el1 to float
+  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
+  %csrc2.el1 = fpext half %src2.el1 to float
+
+  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
+  %csrc1.el2 = fpext half %src1.el2 to float
+  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
+  %csrc2.el2 = fpext half %src2.el2 to float
+
+  %mul2 = fmul contract float %csrc1.el2, %csrc2.el1
+  %mul1 = fmul contract float %csrc1.el1, %csrc2.el2
+  %acc = load float, ptr addrspace(1) %dst, align 4
+  %acc1 = fadd contract float %mul2, %acc
+  %acc2 = fadd contract float %mul1, %acc1
+  store float %acc2, ptr addrspace(1) %dst, align 4
+  ret void
+}
+
 ; GCN-LABEL: {{^}}Diff_Idx_NotAdotproduct
 ; GFX900: v_mad_mix_f32
 ; GFX900: v_mad_mix_f32
@@ -218,11 +386,6 @@ entry:
 ; GFX906: v_mad_f32
 ; GFX906: v_mac_f32_e32
 
-; GCN-DL-UNSAFE: v_fma_mix_f32
-
-; GFX906-CONTRACT: v_fma_mix_f32
-; GFX906-DENORM-CONTRACT: v_fma_mix_f32
-; GFX906-DOT10-DISABLED: v_fma_mix_f32
 define amdgpu_kernel void @Diff_Idx_NotAdotproduct(ptr addrspace(1) %src1,
                                                    ptr addrspace(1) %src2,
                                                    ptr addrspace(1) nocapture %dst) {
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index f199db3ca12ca..462d7748b86cd 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -832,9 +832,9 @@ define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <
 ; GFX11-NSZ-TRUE16-NEXT:    ; return to shader part epilog
 .entry:
   %tmp7 = fdiv afn half 1.000000e+00, %tmp6
-  %tmp8 = fmul half 0.000000e+00, %tmp7
+  %tmp8 = fmul contract half 0.000000e+00, %tmp7
   %tmp9 = fmul reassoc nnan arcp contract half 0.000000e+00, %tmp8
-  %.i188 = fadd half %tmp9, 0.000000e+00
+  %.i188 = fadd nnan ninf contract half %tmp9, 0.000000e+00
   %tmp10 = fcmp uge half %.i188, %tmp2
   %tmp11 = fneg half %.i188
   %.i092 = select i1 %tmp10, half %tmp2, half %tmp11
@@ -6258,7 +6258,7 @@ declare <4 x half> @llvm.fmuladd.v4f16(<4 x half>, <4 x half>, <4 x half>) #1
 
 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
 attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind "unsafe-fp-math"="true" }
+attributes #2 = { nounwind }
 attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }
 attributes #4 = { nounwind "amdgpu-ieee"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
 
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
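The f16 test above now carries the relevant flags on the instructions themselves (contract on the multiply, nnan ninf contract on the add) instead of a function-wide "unsafe-fp-math" attribute. The signed-zero subtlety that the nsz variants of these fneg_fadd_0 tests guard, in plain C++ (illustrative only, not part of the patch):

    // add_zero_identity.cpp: x + 0.0 -> x is only sound under nsz, because
    // (-0.0) + (+0.0) rounds to +0.0; x + (-0.0) -> x, by contrast, always holds.
    #include <cmath>
    #include <cstdio>

    int main() {
      std::printf("%f\n", std::copysign(1.0, -0.0 + 0.0));  // 1.0: sign of zero flipped
      std::printf("%f\n", std::copysign(1.0, -0.0 + -0.0)); // -1.0: x preserved
      return 0;
    }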
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 46da9d33639b6..dc9942b7274ea 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,SI,SI-SAFE %s
-; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,SI,SI-NSZ %s
+; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=hawaii -mattr=+flat-for-global -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,SI,SI-NSZ %s
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-SAFE,VI,VI-SAFE %s
-; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,VI,VI-NSZ %s
+; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji -fp-contract=fast < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-NSZ,VI,VI-NSZ %s
 
 ; --------------------------------------------------------------------------------
 ; fadd tests
 ; --------------------------------------------------------------------------------
@@ -289,14 +289,18 @@ define amdgpu_ps float @fneg_fadd_0_f32(float inreg %tmp2, float inreg %tmp6, <4
 ; function attribute unsafe-fp-math automatically. Combine with the previous test
 ; when that is done.
 define amdgpu_ps float @fneg_fadd_0_nsz_f32(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) #2 {
-; SI-SAFE-LABEL: fneg_fadd_0_nsz_f32:
-; SI-SAFE:       ; %bb.0: ; %.entry
-; SI-SAFE-NEXT:    v_min_legacy_f32_e64 v0, 0, s0
-; SI-SAFE-NEXT:    s_brev_b32 s0, 1
-; SI-SAFE-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
-; SI-SAFE-NEXT:    v_cmp_ngt_f32_e32 vcc, s0, v0
-; SI-SAFE-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
-; SI-SAFE-NEXT:    ; return to shader part epilog
+; GCN-SAFE-LABEL: fneg_fadd_0_nsz_f32:
+; GCN-SAFE:       ; %bb.0: ; %.entry
+; GCN-SAFE-NEXT:    v_rcp_f32_e32 v0, s1
+; GCN-SAFE-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-SAFE-NEXT:    v_mul_f32_e32 v0, 0, v0
+; GCN-SAFE-NEXT:    v_add_f32_e32 v0, 0, v0
+; GCN-SAFE-NEXT:    v_cmp_ngt_f32_e32 vcc, s0, v0
+; GCN-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, vcc
+; GCN-SAFE-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GCN-SAFE-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
+; GCN-SAFE-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; GCN-SAFE-NEXT:    ; return to shader part epilog
 ;
 ; GCN-NSZ-LABEL: fneg_fadd_0_nsz_f32:
 ; GCN-NSZ:       ; %bb.0: ; %.entry
@@ -309,19 +313,6 @@ define amdgpu_ps float @fneg_fadd_0_nsz_f32(float inreg %tmp2, float inreg %tmp6
 ; GCN-NSZ-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
 ; GCN-NSZ-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
 ; GCN-NSZ-NEXT:    ; return to shader part epilog
-;
-; VI-SAFE-LABEL: fneg_fadd_0_nsz_f32:
-; VI-SAFE:       ; %bb.0: ; %.entry
-; VI-SAFE-NEXT:    v_rcp_f32_e32 v0, s1
-; VI-SAFE-NEXT:    v_mov_b32_e32 v1, s0
-; VI-SAFE-NEXT:    v_mul_f32_e32 v0, 0, v0
-; VI-SAFE-NEXT:    v_add_f32_e32 v0, 0, v0
-; VI-SAFE-NEXT:    v_cmp_ngt_f32_e32 vcc, s0, v0
-; VI-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, vcc
-; VI-SAFE-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
-; VI-SAFE-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
-; VI-SAFE-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
-; VI-SAFE-NEXT:    ; return to shader part epilog
 .entry:
   %tmp7 = fdiv afn float 1.000000e+00, %tmp6
   %tmp8 = fmul float 0.000000e+00, %tmp7
@@ -672,17 +663,28 @@ define amdgpu_ps double @fneg_fadd_0_f64(double inreg %tmp2, double inreg %tmp6,
 ; function attribute unsafe-fp-math automatically. Combine with the previous test
 ; when that is done.
 define amdgpu_ps double @fneg_fadd_0_nsz_f64(double inreg %tmp2, double inreg %tmp6, <4 x i32> %arg) #2 {
-; GCN-SAFE-LABEL: fneg_fadd_0_nsz_f64:
-; GCN-SAFE:       ; %bb.0: ; %.entry
-; GCN-SAFE-NEXT:    v_cmp_ngt_f64_e64 s[2:3], s[0:1], 0
-; GCN-SAFE-NEXT:    s_and_b64 s[2:3], s[2:3], exec
-; GCN-SAFE-NEXT:    s_cselect_b32 s1, s1, 0x80000000
-; GCN-SAFE-NEXT:    s_cselect_b32 s0, s0, 0
-; GCN-SAFE-NEXT:    v_cmp_ngt_f64_e64 s[0:1], s[0:1], 0
-; GCN-SAFE-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GCN-SAFE-NEXT:    s_cselect_b32 s1, 0, 0x7ff80000
-; GCN-SAFE-NEXT:    s_mov_b32 s0, 0
-; GCN-SAFE-NEXT:    ; return to shader part epilog
+; SI-SAFE-LABEL: fneg_fadd_0_nsz_f64:
+; SI-SAFE:       ; %bb.0: ; %.entry
+; SI-SAFE-NEXT:    v_rcp_f64_e32 v[0:1], s[2:3]
+; SI-SAFE-NEXT:    v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; SI-SAFE-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; SI-SAFE-NEXT:    v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; SI-SAFE-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; SI-SAFE-NEXT:    v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; SI-SAFE-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; SI-SAFE-NEXT:    v_mov_b32_e32 v2, s1
+; SI-SAFE-NEXT:    v_mul_f64 v[0:1], v[0:1], 0
+; SI-SAFE-NEXT:    v_mov_b32_e32 v3, s0
+; SI-SAFE-NEXT:    v_add_f64 v[0:1], v[0:1], 0
+; SI-SAFE-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[0:1]
+; SI-SAFE-NEXT:    v_xor_b32_e32 v4, 0x80000000, v1
+; SI-SAFE-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; SI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; SI-SAFE-NEXT:    v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
+; SI-SAFE-NEXT:    s_and_b64 s[0:1], vcc, exec
+; SI-SAFE-NEXT:    s_cselect_b32 s1, 0, 0x7ff80000
+; SI-SAFE-NEXT:    s_mov_b32 s0, 0
+; SI-SAFE-NEXT:    ; return to shader part epilog
 ;
 ; SI-NSZ-LABEL: fneg_fadd_0_nsz_f64:
 ; SI-NSZ:       ; %bb.0: ; %.entry
@@ -707,6 +709,29 @@ define amdgpu_ps double @fneg_fadd_0_nsz_f64(double inreg %tmp2, double inreg %t
 ; SI-NSZ-NEXT:    s_mov_b32 s0, 0
 ; SI-NSZ-NEXT:    ; return to shader part epilog
 ;
+; VI-SAFE-LABEL: fneg_fadd_0_nsz_f64:
+; VI-SAFE:       ; %bb.0: ; %.entry
+; VI-SAFE-NEXT:    v_rcp_f64_e32 v[0:1], s[2:3]
+; VI-SAFE-NEXT:    v_mov_b32_e32 v4, s0
+; VI-SAFE-NEXT:    v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; VI-SAFE-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; VI-SAFE-NEXT:    v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; VI-SAFE-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; VI-SAFE-NEXT:    v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
+; VI-SAFE-NEXT:    v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
+; VI-SAFE-NEXT:    v_mov_b32_e32 v2, s1
+; VI-SAFE-NEXT:    v_mul_f64 v[0:1], v[0:1], 0
+; VI-SAFE-NEXT:    v_add_f64 v[0:1], v[0:1], 0
+; VI-SAFE-NEXT:    v_cmp_ngt_f64_e32 vcc, s[0:1], v[0:1]
+; VI-SAFE-NEXT:    v_xor_b32_e32 v3, 0x80000000, v1
+; VI-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; VI-SAFE-NEXT:    v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
+; VI-SAFE-NEXT:    s_and_b64 s[0:1], vcc, exec
+; VI-SAFE-NEXT:    s_cselect_b32 s1, 0, 0x7ff80000
+; VI-SAFE-NEXT:    s_mov_b32 s0, 0
+; VI-SAFE-NEXT:    ; return to shader part epilog
+;
 ; VI-NSZ-LABEL: fneg_fadd_0_nsz_f64:
 ; VI-NSZ:       ; %bb.0: ; %.entry
 ; VI-NSZ-NEXT:    v_rcp_f64_e32 v[0:1], s[2:3]
@@ -4602,6 +4627,6 @@ declare half @llvm.amdgcn.rcp.f16(half) #1
 
 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
 attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind "unsafe-fp-math"="true" }
+attributes #2 = { nounwind }
 attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" }
 attributes #4 = { nounwind "amdgpu-ieee"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
index 9c11f169a89df..b10a740bf003b 100644
--- a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
+++ b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll
@@ -1,13 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -fp-contract=fast | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -fp-contract=fast | %ptxas-verify -arch=sm_80 %}
 
 ; Using FTZ should emit fma.ftz.relu for f16, not for bf16
-; RUN: llc < %s -denormal-fp-math-f32=preserve-sign -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s --check-prefixes=CHECK-FTZ
-; RUN: %if ptxas %{ llc < %s -denormal-fp-math-f32=preserve-sign -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
+; RUN: llc < %s -denormal-fp-math-f32=preserve-sign -mtriple=nvptx64 -mcpu=sm_80 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK-FTZ
+; RUN: %if ptxas %{ llc < %s -denormal-fp-math-f32=preserve-sign -fp-contract=fast -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify -arch=sm_80 %}
 
 ; SM < 80 or (which needs PTX version >= 70) should not emit fma{.ftz}.relu
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 | FileCheck %s --check-prefixes=CHECK-SM70
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK-SM70
 
 define half @fma_f16_expanded_no_nans(half %a, half %b, half %c) #0 {
 ; CHECK-LABEL: fma_f16_expanded_no_nans(
@@ -119,7 +119,7 @@ define half @fma_f16_expanded_no_nans_multiple_uses_of_fma(half %a, half %b, hal
   ret half %6
 }
 
-define half @fma_f16_expanded_unsafe_with_nans(half %a, half %b, half %c) #1 {
+define half @fma_f16_expanded_unsafe_with_nans(half %a, half %b, half %c) {
 ; CHECK-LABEL: fma_f16_expanded_unsafe_with_nans(
 ; CHECK: {
 ; CHECK-NEXT:    .reg .b16 %rs<7>;
@@ -216,7 +216,7 @@ define half @fma_f16_expanded_maxnum_no_nans(half %a, half %b, half %c) #0 {
   ret half %3
 }
 
-define bfloat @fma_bf16_expanded_unsafe_with_nans(bfloat %a, bfloat %b, bfloat %c) #1 {
+define bfloat @fma_bf16_expanded_unsafe_with_nans(bfloat %a, bfloat %b, bfloat %c) {
 ; CHECK-LABEL: fma_bf16_expanded_unsafe_with_nans(
 ; CHECK: {
 ; CHECK-NEXT:    .reg .b16 %rs<7>;
@@ -614,7 +614,7 @@ define <2 x half> @fma_f16x2_expanded_no_nans_multiple_uses_of_fma(<2 x half> %a
   ret <2 x half> %6
 }
 
-define <2 x half> @fma_f16x2_expanded_unsafe_with_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) #1 {
+define <2 x half> @fma_f16x2_expanded_unsafe_with_nans(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
 ; CHECK-LABEL: fma_f16x2_expanded_unsafe_with_nans(
 ; CHECK: {
 ; CHECK-NEXT:    .reg .b32 %r<7>;
@@ -720,7 +720,7 @@ define <2 x half> @fma_f16x2_expanded_maxnum_no_nans(<2 x half> %a, <2 x half> %
   ret <2 x half> %3
 }
 
-define <2 x bfloat> @fma_bf16x2_expanded_unsafe_with_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) #1 {
+define <2 x bfloat> @fma_bf16x2_expanded_unsafe_with_nans(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
 ; CHECK-LABEL: fma_bf16x2_expanded_unsafe_with_nans(
 ; CHECK: {
 ; CHECK-NEXT:    .reg .b32 %r<7>;
@@ -1126,5 +1126,4 @@ define <2 x bfloat> @fma_bf16x2_expanded_maxnum_no_nans(<2 x bfloat> %a, <2 x bf
   ret <2 x bfloat> %3
 }
 
-attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "unsafe-fp-math"="true" }
-attributes #1 = { "unsafe-fp-math"="true" }
+attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }
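The NVPTX runs above now get their contraction from -fp-contract=fast instead of the "unsafe-fp-math" function attribute, and the *_unsafe_with_nans variants drop the attribute entirely. The shape these tests feed the backend is, in scalar C++ terms, a sketch only (whether it becomes fma{.ftz}.relu is the backend's decision):

    // relu_fma.cpp: mul+add+max-with-zero, the pattern behind fma.relu.
    #include <cmath>

    float relu_fma(float a, float b, float c) {
      // IEEE maxnum returns the non-NaN operand, so std::fmax(NaN, 0.0f)
      // is 0.0f; if NaNs are possible, rewriting this chain can change
      // which value survives -- hence the no-nans test variants.
      return std::fmax(std::fma(a, b, c), 0.0f);
    }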
diff --git a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
index 4e72a5ac5ede3..e71f59c79ce4d 100644
--- a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
+++ b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
@@ -2,8 +2,8 @@
 ; REQUIRES: asserts
 ; RUN: llc < %s -mtriple=powerpc64le -debug-only=isel -o /dev/null 2>&1 | FileCheck %s --check-prefix=FMFDEBUG
 ; RUN: llc < %s -mtriple=powerpc64le | FileCheck %s --check-prefix=FMF
-; RUN: llc < %s -mtriple=powerpc64le -debug-only=isel -o /dev/null 2>&1 -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck %s --check-prefix=GLOBALDEBUG
-; RUN: llc < %s -mtriple=powerpc64le -enable-unsafe-fp-math -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math | FileCheck %s --check-prefix=GLOBAL
+; RUN: llc < %s -mtriple=powerpc64le -debug-only=isel -o /dev/null 2>&1 -enable-unsafe-fp-math -fp-contract=fast -enable-no-nans-fp-math | FileCheck %s --check-prefix=GLOBALDEBUG
+; RUN: llc < %s -mtriple=powerpc64le -enable-unsafe-fp-math -fp-contract=fast -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math | FileCheck %s --check-prefix=GLOBAL
 
 ; Test FP transforms using instruction/node-level fast-math-flags.
 ; We're also checking debug output to verify that FMF is propagated to the newly created nodes.
diff --git a/llvm/test/CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll b/llvm/test/CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll
index 96aa58000f9b7..539b563691723 100644
--- a/llvm/test/CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx-fma-mutate-trivial-copy.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -fp-contract=fast < %s | FileCheck %s
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
@@ -31,7 +31,7 @@ declare double @llvm.sqrt.f64(double) #1
 
 declare signext i32 @p_col_helper(...) #2
 
-attributes #0 = { nounwind "no-infs-fp-math"="true" "no-nans-fp-math"="true" "target-cpu"="pwr7" "unsafe-fp-math"="true" }
+attributes #0 = { nounwind "no-infs-fp-math"="true" "no-nans-fp-math"="true" "target-cpu"="pwr7" }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind }
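The new X86 test below pins down both sides of the visitFMA guard: with nnan+ninf+contract and a constant addend of 1.0 (which is not -0.0), x*0.0+1.0 folds to the constant 1.0, while the explicit llvm.fma with a -0.0 addend must stay an FMA. The arithmetic behind the second case, in plain C++ (illustrative only, not part of the patch):

    // fma_neg_zero_addend.cpp: for positive finite x, the exact result of
    // fma(x, 0.0, -0.0) is (+0.0) + (-0.0) == +0.0, so folding to the
    // addend -0.0 would flip the sign of the zero.
    #include <cmath>
    #include <cstdio>

    int main() {
      std::printf("%f\n", std::copysign(1.0, std::fma(2.0, 0.0, -0.0))); // 1.0
      return 0;
    }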
diff --git a/llvm/test/CodeGen/X86/dag-combiner-fma-folding.ll b/llvm/test/CodeGen/X86/dag-combiner-fma-folding.ll
new file mode 100644
index 0000000000000..2bd7dc445a02b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/dag-combiner-fma-folding.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=x86_64-- --start-before=x86-isel -mattr=+avx,+fma %s -o - | FileCheck %s
+
+define double @fma_folding(double %x) {
+; CHECK-LABEL: fma_folding:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
+; CHECK-NEXT:    retq
+  %prod = fmul contract ninf nnan double %x, 0.0
+  %fused = fadd contract ninf nnan double %prod, 1.0
+  ret double %fused
+}
+
+define double @fma_no_folding(double %x) {
+; CHECK-LABEL: fma_no_folding:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
+; CHECK-NEXT:    retq
+  %fused = call contract nnan ninf double @llvm.fma.f64(double %x, double 0.0, double -0.0)
+  ret double %fused
+}
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
index 2c7da100344b7..78df4f685f6e9 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -stop-after=finalize-isel 2>&1 | FileCheck %s
+; RUN: llc -fp-contract=fast < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -stop-after=finalize-isel 2>&1 | FileCheck %s
 
 declare float @llvm.sqrt.f32(float) #2
 
@@ -144,6 +144,6 @@ define float @rsqrt_daz(float %f) #1 {
   ret float %div
 }
 
-attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt:2" "denormal-fp-math"="ieee,ieee" }
-attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt:2" "denormal-fp-math"="ieee,preserve-sign" }
+attributes #0 = { "reciprocal-estimates"="sqrt:2" "denormal-fp-math"="ieee,ieee" }
+attributes #1 = { "reciprocal-estimates"="sqrt:2" "denormal-fp-math"="ieee,preserve-sign" }
 attributes #2 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index 9f420bcede110..5cd604c62a166 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -183,7 +183,7 @@ define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 {
   ret <4 x float> %call
 }
 
-define <4 x float> @sqrt_v4f32_check_denorms_ieee_ninf(<4 x float> %x) #3 {
+define <4 x float> @sqrt_v4f32_check_denorms_ieee_ninf(<4 x float> %x) #7 {
 ; SSE-LABEL: sqrt_v4f32_check_denorms_ieee_ninf:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    rsqrtps %xmm0, %xmm1
@@ -230,11 +230,11 @@ define <4 x float> @sqrt_v4f32_check_denorms_ieee_ninf(<4 x float> %x) #3 {
 ; AVX512-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
 ; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %call = tail call ninf afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
+  %call = tail call fast ninf afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
   ret <4 x float> %call
 }
 
-define <4 x float> @sqrt_v4f32_check_denorms_dynamic_ninf(<4 x float> %x) #6 {
+define <4 x float> @sqrt_v4f32_check_denorms_dynamic_ninf(<4 x float> %x) #8 {
 ; SSE-LABEL: sqrt_v4f32_check_denorms_dynamic_ninf:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    rsqrtps %xmm0, %xmm1
@@ -281,7 +281,7 @@ define <4 x float> @sqrt_v4f32_check_denorms_dynamic_ninf(<4 x float> %x) #6 {
 ; AVX512-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
 ; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
-  %call = tail call ninf afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
+  %call = tail call fast ninf afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
   ret <4 x float> %call
 }
 
@@ -1019,3 +1019,8 @@ attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt"
 attributes #4 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee,preserve-sign" }
 attributes #5 = { "unsafe-fp-math"="true" "reciprocal-estimates"="all:0" }
 attributes #6 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,dynamic" }
+
+; Attributes without "unsafe-fp-math"="true"
+; TODO: Merge with previous attributes when this attribute can be deleted.
+attributes #7 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,ieee" } ; #3
+attributes #8 = { "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,dynamic" } ; #6