From 8618eed01be92469daf652cf1e2df6f75424f5d7 Mon Sep 17 00:00:00 2001 From: PaperChalice Date: Fri, 10 Oct 2025 17:47:34 +0800 Subject: [PATCH 1/5] [SelectionDAG] Remove NoInfsFPMath uses --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 16 ++++------------ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c5c38661f1d71..0aea025530fa7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -17086,11 +17086,6 @@ static bool isContractableFMUL(const TargetOptions &Options, SDValue N) { N->getFlags().hasAllowContract(); } -// Returns true if `N` can assume no infinities involved in its computation. -static bool hasNoInfs(const TargetOptions &Options, SDValue N) { - return Options.NoInfsFPMath || N->getFlags().hasNoInfs(); -} - /// Try to perform FMA combining on a given FADD node. template SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { @@ -17666,7 +17661,7 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) { // The transforms below are incorrect when x == 0 and y == inf, because the // intermediate multiplication produces a nan. SDValue FAdd = N0.getOpcode() == ISD::FADD ? N0 : N1; - if (!hasNoInfs(Options, FAdd)) + if (!FAdd->getFlags().hasNoInfs()) return SDValue(); // Floating-point multiply-add without intermediate rounding. @@ -18343,7 +18338,7 @@ template SDValue DAGCombiner::visitFMA(SDNode *N) { return matcher.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2); } - if ((Options.NoNaNsFPMath && Options.NoInfsFPMath) || + if ((Options.NoNaNsFPMath && N->getFlags().hasNoInfs()) || (N->getFlags().hasNoNaNs() && N->getFlags().hasNoInfs())) { if (N->getFlags().hasNoSignedZeros() || (N2CFP && !N2CFP->isExactlyValue(-0.0))) { @@ -18533,7 +18528,6 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc DL(N); - const TargetOptions &Options = DAG.getTarget().Options; SDNodeFlags Flags = N->getFlags(); SelectionDAG::FlagInserter FlagsInserter(DAG, N); @@ -18644,7 +18638,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { } // Fold into a reciprocal estimate and multiply instead of a real divide. - if (Options.NoInfsFPMath || Flags.hasNoInfs()) + if (Flags.hasNoInfs()) if (SDValue RV = BuildDivEstimate(N0, N1, Flags)) return RV; } @@ -18721,12 +18715,10 @@ SDValue DAGCombiner::visitFREM(SDNode *N) { SDValue DAGCombiner::visitFSQRT(SDNode *N) { SDNodeFlags Flags = N->getFlags(); - const TargetOptions &Options = DAG.getTarget().Options; // Require 'ninf' flag since sqrt(+Inf) = +Inf, but the estimation goes as: // sqrt(+Inf) == rsqrt(+Inf) * +Inf = 0 * +Inf = NaN - if (!Flags.hasApproximateFuncs() || - (!Options.NoInfsFPMath && !Flags.hasNoInfs())) + if (!Flags.hasApproximateFuncs() || !Flags.hasNoInfs()) return SDValue(); SDValue N0 = N->getOperand(0); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 6ea2e2708c162..3d963103d7469 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5771,7 +5771,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, return true; const TargetOptions &Options = getTarget().Options; - return Options.NoNaNsFPMath || Options.NoInfsFPMath; + return Options.NoNaNsFPMath || Op->getFlags().hasNoInfs(); } case ISD::OR: From 1721b2dc8ecd681ea3069586b0ef8fce64a5a04d Mon Sep 17 00:00:00 2001 From: PaperChalice Date: Fri, 10 Oct 2025 17:59:34 +0800 Subject: [PATCH 2/5] regenerate fma-combine.ll --- llvm/test/CodeGen/AMDGPU/fma-combine.ll | 440 ++++++++++++++---------- 1 file changed, 250 insertions(+), 190 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll index d7cf411da0ccb..955d4abb25f90 100644 --- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA,TAHITI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA,VERDE %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -enable-no-infs-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-FMA %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-NOFMA %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee -enable-no-infs-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FMA %s @@ -648,53 +648,53 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) ; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z))) define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { -; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: -; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s6, 0 -; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; SI-FMA-NEXT: v_mov_b32_e32 v1, 0 -; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-FMA-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-FMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc -; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc -; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc -; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc -; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc -; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-FMA-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], -v[6:7] -; SI-FMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7] -; SI-FMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; SI-FMA-NEXT: s_endpgm +; SI-LABEL: aggressive_combine_to_fma_fsub_0_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: v_fma_f64 v[6:7], v[8:9], v[10:11], -v[6:7] +; SI-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm ; -; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: -; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 -; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], -v[4:5] -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GFX11-FMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] -; GFX11-FMA-NEXT: s_endpgm +; GFX11-LABEL: aggressive_combine_to_fma_fsub_0_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], -v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX11-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 @@ -718,55 +718,55 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) } define amdgpu_kernel void @no_aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { -; SI-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_0_f64: -; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s6, 0 -; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; SI-NOFMA-NEXT: v_mov_b32_e32 v1, 0 -; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-NOFMA-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NOFMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc -; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc -; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc -; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc -; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc -; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NOFMA-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11] -; SI-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9] -; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7] -; SI-NOFMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; SI-NOFMA-NEXT: s_endpgm +; SI-LABEL: no_aggressive_combine_to_fma_fsub_0_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11] +; SI-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[6:7] +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm ; -; GFX11-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_0_f64: -; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 -; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9] -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7] -; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] -; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] -; GFX11-NOFMA-NEXT: s_endpgm +; GFX11-LABEL: no_aggressive_combine_to_fma_fsub_0_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7] +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] +; GFX11-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 @@ -792,53 +792,53 @@ define amdgpu_kernel void @no_aggressive_combine_to_fma_fsub_0_f64(ptr addrspace ; fold (fsub x, (fma y, z, (fmul u, v))) ; -> (fma (fneg y), z, (fma (fneg u), v, x)) define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { -; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: -; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s6, 0 -; SI-FMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; SI-FMA-NEXT: v_mov_b32_e32 v1, 0 -; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-FMA-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-FMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc -; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc -; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc -; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc -; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc -; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-FMA-NEXT: v_fma_f64 v[2:3], -v[8:9], v[10:11], v[2:3] -; SI-FMA-NEXT: v_fma_f64 v[2:3], -v[4:5], v[6:7], v[2:3] -; SI-FMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; SI-FMA-NEXT: s_endpgm +; SI-LABEL: aggressive_combine_to_fma_fsub_1_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[10:11], v[2:3] +; SI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[6:7], v[2:3] +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm ; -; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: -; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 -; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[6:7], v[8:9], v[0:1] -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[4:5], v[0:1] -; GFX11-FMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] -; GFX11-FMA-NEXT: s_endpgm +; GFX11-LABEL: aggressive_combine_to_fma_fsub_1_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fma_f64 v[0:1], -v[6:7], v[8:9], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[0:1], -v[2:3], v[4:5], v[0:1] +; GFX11-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 @@ -862,55 +862,55 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ret void } define amdgpu_kernel void @no_aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 { -; SI-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_1_f64: -; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s6, 0 -; SI-NOFMA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; SI-NOFMA-NEXT: v_mov_b32_e32 v1, 0 -; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-NOFMA-NEXT: s_mov_b64 s[4:5], s[2:3] -; SI-NOFMA-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc -; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc -; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc -; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc -; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc -; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NOFMA-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11] -; SI-NOFMA-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9] -; SI-NOFMA-NEXT: v_add_f64 v[2:3], v[2:3], -v[4:5] -; SI-NOFMA-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; SI-NOFMA-NEXT: s_endpgm +; SI-LABEL: no_aggressive_combine_to_fma_fsub_1_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: v_mul_f64 v[8:9], v[8:9], v[10:11] +; SI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[4:5] +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm ; -; GFX11-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_1_f64: -; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NOFMA-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 -; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9] -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7] -; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] -; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] -; GFX11-NOFMA-NEXT: s_endpgm +; GFX11-LABEL: no_aggressive_combine_to_fma_fsub_1_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX11-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1 @@ -2123,6 +2123,66 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, } define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, +; TAHITI-LABEL: test_f64_interp: +; TAHITI: ; %bb.0: +; TAHITI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; TAHITI-NEXT: s_mov_b32 s11, 0xf000 +; TAHITI-NEXT: s_mov_b32 s10, -1 +; TAHITI-NEXT: s_mov_b32 s18, s10 +; TAHITI-NEXT: s_mov_b32 s19, s11 +; TAHITI-NEXT: s_waitcnt lgkmcnt(0) +; TAHITI-NEXT: s_mov_b32 s16, s4 +; TAHITI-NEXT: s_mov_b32 s17, s5 +; TAHITI-NEXT: s_mov_b32 s4, s6 +; TAHITI-NEXT: s_mov_b32 s5, s7 +; TAHITI-NEXT: s_mov_b32 s6, s10 +; TAHITI-NEXT: s_mov_b32 s7, s11 +; TAHITI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; TAHITI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 +; TAHITI-NEXT: s_mov_b32 s14, s10 +; TAHITI-NEXT: s_mov_b32 s12, s2 +; TAHITI-NEXT: s_mov_b32 s13, s3 +; TAHITI-NEXT: s_mov_b32 s15, s11 +; TAHITI-NEXT: buffer_load_dwordx2 v[4:5], off, s[12:15], 0 +; TAHITI-NEXT: s_mov_b32 s8, s0 +; TAHITI-NEXT: s_mov_b32 s9, s1 +; TAHITI-NEXT: s_waitcnt vmcnt(2) +; TAHITI-NEXT: v_add_f64 v[6:7], -v[0:1], 1.0 +; TAHITI-NEXT: s_waitcnt vmcnt(1) +; TAHITI-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7] +; TAHITI-NEXT: s_waitcnt vmcnt(0) +; TAHITI-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3] +; TAHITI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; TAHITI-NEXT: s_endpgm +; +; VERDE-LABEL: test_f64_interp: +; VERDE: ; %bb.0: +; VERDE-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; VERDE-NEXT: s_mov_b32 s11, 0xf000 +; VERDE-NEXT: s_mov_b32 s10, -1 +; VERDE-NEXT: s_mov_b32 s14, s10 +; VERDE-NEXT: s_mov_b32 s15, s11 +; VERDE-NEXT: s_waitcnt lgkmcnt(0) +; VERDE-NEXT: s_mov_b32 s12, s6 +; VERDE-NEXT: s_mov_b32 s13, s7 +; VERDE-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 +; VERDE-NEXT: s_mov_b32 s6, s10 +; VERDE-NEXT: s_mov_b32 s7, s11 +; VERDE-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; VERDE-NEXT: s_mov_b32 s4, s2 +; VERDE-NEXT: s_mov_b32 s5, s3 +; VERDE-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 +; VERDE-NEXT: s_mov_b32 s8, s0 +; VERDE-NEXT: s_mov_b32 s9, s1 +; VERDE-NEXT: s_waitcnt vmcnt(2) +; VERDE-NEXT: v_add_f64 v[6:7], -v[0:1], 1.0 +; VERDE-NEXT: s_waitcnt vmcnt(1) +; VERDE-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7] +; VERDE-NEXT: s_waitcnt vmcnt(0) +; VERDE-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3] +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; VERDE-NEXT: s_endpgm +; ; SI-FMA-LABEL: test_f64_interp: ; SI-FMA: ; %bb.0: ; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 From 87ed258e2e4dbc6453073f3adb3c971e5a35e255 Mon Sep 17 00:00:00 2001 From: PaperChalice Date: Fri, 10 Oct 2025 18:32:36 +0800 Subject: [PATCH 3/5] fix tests --- llvm/test/CodeGen/AMDGPU/fma-combine.ll | 1573 ++++++++++------- llvm/test/CodeGen/X86/fma_patterns.ll | 1837 +++++++++++--------- llvm/test/CodeGen/X86/fma_patterns_wide.ll | 1269 +++++++------- 3 files changed, 2616 insertions(+), 2063 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll index 955d4abb25f90..8fc6904f5009c 100644 --- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA,TAHITI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA,VERDE %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -enable-no-infs-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-FMA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-FMA %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-NOFMA %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee -enable-no-infs-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FMA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FMA %s ; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be ; beneficial even without fp32 denormals, but they do require no-infs-fp-math @@ -939,7 +939,58 @@ define amdgpu_kernel void @no_aggressive_combine_to_fma_fsub_1_f64(ptr addrspace ; define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, -; SI-NOFMA-LABEL: test_f32_mul_add_x_one_y: +; SI-LABEL: test_f32_mul_add_x_one_y: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; GFX11-LABEL: test_f32_mul_add_x_one_y: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm + ptr addrspace(1) %in1, + ptr addrspace(1) %in2) { + %x = load volatile float, ptr addrspace(1) %in1 + %y = load volatile float, ptr addrspace(1) %in2 + %a = fadd contract float %x, 1.0 + %m = fmul contract float %a, %y + store float %m, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_f32_mul_add_x_one_y_ninf(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_add_x_one_y_ninf: ; SI-NOFMA: ; %bb.0: ; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -958,12 +1009,11 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s4, s0 ; SI-NOFMA-NEXT: s_mov_b32 s5, s1 -; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NOFMA-NEXT: v_mac_f32_e32 v1, v0, v1 +; SI-NOFMA-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; -; SI-FMA-LABEL: test_f32_mul_add_x_one_y: +; SI-FMA-LABEL: test_f32_mul_add_x_one_y_ninf: ; SI-FMA: ; %bb.0: ; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -986,49 +1036,83 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out, ; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; -; GFX11-NOFMA-LABEL: test_f32_mul_add_x_one_y: -; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NOFMA-NEXT: s_endpgm +; GFX11-LABEL: test_f32_mul_add_x_one_y_ninf: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fmac_f32_e32 v2, v1, v2 +; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11-NEXT: s_endpgm + ptr addrspace(1) %in1, + ptr addrspace(1) %in2) { + %x = load volatile float, ptr addrspace(1) %in1 + %y = load volatile float, ptr addrspace(1) %in2 + %a = fadd contract ninf float %x, 1.0 + %m = fmul contract ninf float %a, %y + store float %m, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, +; SI-LABEL: test_f32_mul_y_add_x_one: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm ; -; GFX11-FMA-LABEL: test_f32_mul_add_x_one_y: -; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[0:1] -; GFX11-FMA-NEXT: s_endpgm +; GFX11-LABEL: test_f32_mul_y_add_x_one: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load volatile float, ptr addrspace(1) %in1 %y = load volatile float, ptr addrspace(1) %in2 %a = fadd contract float %x, 1.0 - %m = fmul contract float %a, %y + %m = fmul contract float %y, %a store float %m, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, -; SI-NOFMA-LABEL: test_f32_mul_y_add_x_one: +define amdgpu_kernel void @test_f32_mul_y_add_x_one_ninf(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_y_add_x_one_ninf: ; SI-NOFMA: ; %bb.0: ; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -1047,12 +1131,11 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) ; SI-NOFMA-NEXT: s_mov_b32 s4, s0 ; SI-NOFMA-NEXT: s_mov_b32 s5, s1 -; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NOFMA-NEXT: v_mac_f32_e32 v1, v0, v1 +; SI-NOFMA-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; -; SI-FMA-LABEL: test_f32_mul_y_add_x_one: +; SI-FMA-LABEL: test_f32_mul_y_add_x_one_ninf: ; SI-FMA: ; %bb.0: ; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -1075,126 +1158,72 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out, ; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; -; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_one: -; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NOFMA-NEXT: s_endpgm -; -; GFX11-FMA-LABEL: test_f32_mul_y_add_x_one: -; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[0:1] -; GFX11-FMA-NEXT: s_endpgm +; GFX11-LABEL: test_f32_mul_y_add_x_one_ninf: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fmac_f32_e32 v2, v1, v2 +; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load volatile float, ptr addrspace(1) %in1 %y = load volatile float, ptr addrspace(1) %in2 - %a = fadd contract float %x, 1.0 - %m = fmul contract float %y, %a + %a = fadd contract ninf float %x, 1.0 + %m = fmul contract ninf float %y, %a store float %m, ptr addrspace(1) %out ret void } define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, -; SI-NOFMA-LABEL: test_f32_mul_add_x_negone_y: -; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s6, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s6 -; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s12, s2 -; SI-NOFMA-NEXT: s_mov_b32 s13, s3 -; SI-NOFMA-NEXT: s_mov_b32 s15, s7 -; SI-NOFMA-NEXT: s_mov_b32 s10, s6 -; SI-NOFMA-NEXT: s_mov_b32 s11, s7 -; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s4, s0 -; SI-NOFMA-NEXT: s_mov_b32 s5, s1 -; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) -; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 -; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NOFMA-NEXT: s_endpgm -; -; SI-FMA-LABEL: test_f32_mul_add_x_negone_y: -; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s6, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s6 -; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s12, s2 -; SI-FMA-NEXT: s_mov_b32 s13, s3 -; SI-FMA-NEXT: s_mov_b32 s15, s7 -; SI-FMA-NEXT: s_mov_b32 s10, s6 -; SI-FMA-NEXT: s_mov_b32 s11, s7 -; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s4, s0 -; SI-FMA-NEXT: s_mov_b32 s5, s1 -; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-FMA-NEXT: s_endpgm -; -; GFX11-NOFMA-LABEL: test_f32_mul_add_x_negone_y: -; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) -; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1 -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NOFMA-NEXT: s_endpgm +; SI-LABEL: test_f32_mul_add_x_negone_y: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v0, -1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm ; -; GFX11-FMA-LABEL: test_f32_mul_add_x_negone_y: -; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-FMA-NEXT: s_endpgm +; GFX11-LABEL: test_f32_mul_add_x_negone_y: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_add_f32_e32 v1, -1.0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 @@ -1205,8 +1234,8 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out, ret void } -define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, -; SI-NOFMA-LABEL: test_f32_mul_y_add_x_negone: +define amdgpu_kernel void @test_f32_mul_add_x_negone_y_ninf(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_add_x_negone_y_ninf: ; SI-NOFMA: ; %bb.0: ; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -1223,14 +1252,12 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, ; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_mov_b32 s4, s0 ; SI-NOFMA-NEXT: s_mov_b32 s5, s1 -; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) -; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NOFMA-NEXT: v_mad_f32 v0, v0, v1, -v1 ; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; -; SI-FMA-LABEL: test_f32_mul_y_add_x_negone: +; SI-FMA-LABEL: test_f32_mul_add_x_negone_y_ninf: ; SI-FMA: ; %bb.0: ; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -1252,38 +1279,72 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, ; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; -; GFX11-NOFMA-LABEL: test_f32_mul_y_add_x_negone: -; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) -; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1 -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NOFMA-NEXT: s_endpgm +; GFX11-LABEL: test_f32_mul_add_x_negone_y_ninf: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fma_f32 v1, v1, v2, -v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm + ptr addrspace(1) %in1, + ptr addrspace(1) %in2) { + %x = load float, ptr addrspace(1) %in1 + %y = load float, ptr addrspace(1) %in2 + %a = fadd contract ninf float %x, -1.0 + %m = fmul contract ninf float %a, %y + store float %m, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, +; SI-LABEL: test_f32_mul_y_add_x_negone: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v0, -1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm ; -; GFX11-FMA-LABEL: test_f32_mul_y_add_x_negone: -; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-FMA-NEXT: s_endpgm +; GFX11-LABEL: test_f32_mul_y_add_x_negone: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_add_f32_e32 v1, -1.0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 @@ -1294,8 +1355,8 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out, ret void } -define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, -; SI-NOFMA-LABEL: test_f32_mul_sub_one_x_y: +define amdgpu_kernel void @test_f32_mul_y_add_x_negone_ninf(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_y_add_x_negone_ninf: ; SI-NOFMA: ; %bb.0: ; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -1312,14 +1373,12 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, ; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_mov_b32 s4, s0 ; SI-NOFMA-NEXT: s_mov_b32 s5, s1 -; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) -; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NOFMA-NEXT: v_mad_f32 v0, v0, v1, -v1 ; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; -; SI-FMA-LABEL: test_f32_mul_sub_one_x_y: +; SI-FMA-LABEL: test_f32_mul_y_add_x_negone_ninf: ; SI-FMA: ; %bb.0: ; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -1337,42 +1396,76 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, ; SI-FMA-NEXT: s_mov_b32 s4, s0 ; SI-FMA-NEXT: s_mov_b32 s5, s1 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1 +; SI-FMA-NEXT: v_fma_f32 v0, v0, v1, -v1 ; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; -; GFX11-NOFMA-LABEL: test_f32_mul_sub_one_x_y: -; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) -; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, 1.0, v1 -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NOFMA-NEXT: s_endpgm +; GFX11-LABEL: test_f32_mul_y_add_x_negone_ninf: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fma_f32 v1, v1, v2, -v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm + ptr addrspace(1) %in1, + ptr addrspace(1) %in2) { + %x = load float, ptr addrspace(1) %in1 + %y = load float, ptr addrspace(1) %in2 + %a = fadd contract ninf float %x, -1.0 + %m = fmul contract ninf float %y, %a + store float %m, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, +; SI-LABEL: test_f32_mul_sub_one_x_y: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_sub_f32_e32 v0, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm ; -; GFX11-FMA-LABEL: test_f32_mul_sub_one_x_y: -; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-FMA-NEXT: s_endpgm +; GFX11-LABEL: test_f32_mul_sub_one_x_y: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_sub_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 @@ -1383,85 +1476,100 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out, ret void } -define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, -; SI-NOFMA-LABEL: test_f32_mul_y_sub_one_x: -; SI-NOFMA: ; %bb.0: -; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NOFMA-NEXT: s_mov_b32 s7, 0xf000 -; SI-NOFMA-NEXT: s_mov_b32 s6, -1 -; SI-NOFMA-NEXT: s_mov_b32 s14, s6 -; SI-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-NOFMA-NEXT: s_mov_b32 s12, s2 -; SI-NOFMA-NEXT: s_mov_b32 s13, s3 -; SI-NOFMA-NEXT: s_mov_b32 s15, s7 -; SI-NOFMA-NEXT: s_mov_b32 s10, s6 -; SI-NOFMA-NEXT: s_mov_b32 s11, s7 -; SI-NOFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-NOFMA-NEXT: s_mov_b32 s4, s0 -; SI-NOFMA-NEXT: s_mov_b32 s5, s1 -; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) -; SI-NOFMA-NEXT: v_sub_f32_e32 v0, 1.0, v0 -; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NOFMA-NEXT: s_endpgm -; -; SI-FMA-LABEL: test_f32_mul_y_sub_one_x: -; SI-FMA: ; %bb.0: -; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-FMA-NEXT: s_mov_b32 s7, 0xf000 -; SI-FMA-NEXT: s_mov_b32 s6, -1 -; SI-FMA-NEXT: s_mov_b32 s14, s6 -; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) -; SI-FMA-NEXT: s_mov_b32 s12, s2 -; SI-FMA-NEXT: s_mov_b32 s13, s3 -; SI-FMA-NEXT: s_mov_b32 s15, s7 -; SI-FMA-NEXT: s_mov_b32 s10, s6 -; SI-FMA-NEXT: s_mov_b32 s11, s7 -; SI-FMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 -; SI-FMA-NEXT: s_mov_b32 s4, s0 -; SI-FMA-NEXT: s_mov_b32 s5, s1 -; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1 -; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-FMA-NEXT: s_endpgm +define amdgpu_kernel void @test_f32_mul_sub_one_x_y_ninf(ptr addrspace(1) %out, +; SI-LABEL: test_f32_mul_sub_one_x_y_ninf: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_sub_f32_e32 v0, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm ; -; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_one_x: -; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) -; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, 1.0, v1 -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NOFMA-NEXT: s_endpgm +; GFX11-LABEL: test_f32_mul_sub_one_x_y_ninf: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_sub_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm + ptr addrspace(1) %in1, + ptr addrspace(1) %in2) { + %x = load float, ptr addrspace(1) %in1 + %y = load float, ptr addrspace(1) %in2 + %s = fsub contract ninf float 1.0, %x + %m = fmul contract ninf float %s, %y + store float %m, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, +; SI-LABEL: test_f32_mul_y_sub_one_x: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_sub_f32_e32 v0, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm ; -; GFX11-FMA-LABEL: test_f32_mul_y_sub_one_x: -; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-FMA-NEXT: s_endpgm +; GFX11-LABEL: test_f32_mul_y_sub_one_x: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_sub_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 @@ -1472,8 +1580,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out, ret void } -define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, -; SI-NOFMA-LABEL: test_f32_mul_sub_negone_x_y: +define amdgpu_kernel void @test_f32_mul_y_sub_one_x_ninf(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_y_sub_one_x_ninf: ; SI-NOFMA: ; %bb.0: ; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -1490,14 +1598,12 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, ; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_mov_b32 s4, s0 ; SI-NOFMA-NEXT: s_mov_b32 s5, s1 -; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) -; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NOFMA-NEXT: v_mad_f32 v0, -v0, v1, v1 ; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; -; SI-FMA-LABEL: test_f32_mul_sub_negone_x_y: +; SI-FMA-LABEL: test_f32_mul_y_sub_one_x_ninf: ; SI-FMA: ; %bb.0: ; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -1515,42 +1621,76 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, ; SI-FMA-NEXT: s_mov_b32 s4, s0 ; SI-FMA-NEXT: s_mov_b32 s5, s1 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, -v1 +; SI-FMA-NEXT: v_fma_f32 v0, -v0, v1, v1 ; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; -; GFX11-NOFMA-LABEL: test_f32_mul_sub_negone_x_y: -; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) -; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, -1.0, v1 -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NOFMA-NEXT: s_endpgm +; GFX11-LABEL: test_f32_mul_y_sub_one_x_ninf: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fma_f32 v1, -v1, v2, v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm + ptr addrspace(1) %in1, + ptr addrspace(1) %in2) { + %x = load float, ptr addrspace(1) %in1 + %y = load float, ptr addrspace(1) %in2 + %s = fsub contract ninf float 1.0, %x + %m = fmul contract ninf float %y, %s + store float %m, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, +; SI-LABEL: test_f32_mul_sub_negone_x_y: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_sub_f32_e32 v0, -1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm ; -; GFX11-FMA-LABEL: test_f32_mul_sub_negone_x_y: -; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-FMA-NEXT: s_endpgm +; GFX11-LABEL: test_f32_mul_sub_negone_x_y: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_sub_f32_e32 v1, -1.0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 @@ -1561,8 +1701,112 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out, ret void } +define amdgpu_kernel void @test_f32_mul_sub_negone_x_y_ninf(ptr addrspace(1) %out, +; SI-LABEL: test_f32_mul_sub_negone_x_y_ninf: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_sub_f32_e32 v0, -1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; GFX11-LABEL: test_f32_mul_sub_negone_x_y_ninf: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_sub_f32_e32 v1, -1.0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm + ptr addrspace(1) %in1, + ptr addrspace(1) %in2) { + %x = load float, ptr addrspace(1) %in1 + %y = load float, ptr addrspace(1) %in2 + %s = fsub contract ninf float -1.0, %x + %m = fmul contract ninf float %s, %y + store float %m, ptr addrspace(1) %out + ret void +} + define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, -; SI-NOFMA-LABEL: test_f32_mul_y_sub_negone_x: +; SI-LABEL: test_f32_mul_y_sub_negone_x: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_sub_f32_e32 v0, -1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; GFX11-LABEL: test_f32_mul_y_sub_negone_x: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_sub_f32_e32 v1, -1.0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm + ptr addrspace(1) %in1, + ptr addrspace(1) %in2) { + %x = load float, ptr addrspace(1) %in1 + %y = load float, ptr addrspace(1) %in2 + %s = fsub contract float -1.0, %x + %m = fmul contract float %y, %s + store float %m, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_f32_mul_y_sub_negone_x_ninf(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_y_sub_negone_x_ninf: ; SI-NOFMA: ; %bb.0: ; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -1579,14 +1823,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, ; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_mov_b32 s4, s0 ; SI-NOFMA-NEXT: s_mov_b32 s5, s1 -; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) -; SI-NOFMA-NEXT: v_sub_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NOFMA-NEXT: v_mad_f32 v0, -v0, v1, -v1 ; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; -; SI-FMA-LABEL: test_f32_mul_y_sub_negone_x: +; SI-FMA-LABEL: test_f32_mul_y_sub_negone_x_ninf: ; SI-FMA: ; %bb.0: ; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -1608,50 +1850,84 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out, ; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; -; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_negone_x: -; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) -; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, -1.0, v1 -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NOFMA-NEXT: s_endpgm -; -; GFX11-FMA-LABEL: test_f32_mul_y_sub_negone_x: -; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-FMA-NEXT: s_endpgm +; GFX11-LABEL: test_f32_mul_y_sub_negone_x_ninf: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fma_f32 v1, -v1, v2, -v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 %y = load float, ptr addrspace(1) %in2 - %s = fsub contract float -1.0, %x - %m = fmul contract float %y, %s + %s = fsub contract ninf float -1.0, %x + %m = fmul contract ninf float %y, %s store float %m, ptr addrspace(1) %out ret void } define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, -; SI-NOFMA-LABEL: test_f32_mul_sub_x_one_y: +; SI-LABEL: test_f32_mul_sub_x_one_y: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v0, -1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; GFX11-LABEL: test_f32_mul_sub_x_one_y: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_add_f32_e32 v1, -1.0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm + ptr addrspace(1) %in1, + ptr addrspace(1) %in2) { + %x = load float, ptr addrspace(1) %in1 + %y = load float, ptr addrspace(1) %in2 + %s = fsub contract float %x, 1.0 + %m = fmul contract float %s, %y + store float %m, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_f32_mul_sub_x_one_y_ninf(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_sub_x_one_y_ninf: ; SI-NOFMA: ; %bb.0: ; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -1668,14 +1944,12 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, ; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_mov_b32 s4, s0 ; SI-NOFMA-NEXT: s_mov_b32 s5, s1 -; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) -; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NOFMA-NEXT: v_mad_f32 v0, v0, v1, -v1 ; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; -; SI-FMA-LABEL: test_f32_mul_sub_x_one_y: +; SI-FMA-LABEL: test_f32_mul_sub_x_one_y_ninf: ; SI-FMA: ; %bb.0: ; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -1697,50 +1971,84 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out, ; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; -; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_one_y: -; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) -; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1 -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NOFMA-NEXT: s_endpgm -; -; GFX11-FMA-LABEL: test_f32_mul_sub_x_one_y: -; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-FMA-NEXT: s_endpgm +; GFX11-LABEL: test_f32_mul_sub_x_one_y_ninf: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fma_f32 v1, v1, v2, -v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 %y = load float, ptr addrspace(1) %in2 - %s = fsub contract float %x, 1.0 - %m = fmul contract float %s, %y + %s = fsub contract ninf float %x, 1.0 + %m = fmul contract ninf float %s, %y store float %m, ptr addrspace(1) %out ret void } define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, -; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_one: +; SI-LABEL: test_f32_mul_y_sub_x_one: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v0, -1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; GFX11-LABEL: test_f32_mul_y_sub_x_one: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_add_f32_e32 v1, -1.0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm + ptr addrspace(1) %in1, + ptr addrspace(1) %in2) { + %x = load float, ptr addrspace(1) %in1 + %y = load float, ptr addrspace(1) %in2 + %s = fsub contract float %x, 1.0 + %m = fmul contract float %y, %s + store float %m, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_f32_mul_y_sub_x_one_ninf(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_one_ninf: ; SI-NOFMA: ; %bb.0: ; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -1757,14 +2065,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, ; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_mov_b32 s4, s0 ; SI-NOFMA-NEXT: s_mov_b32 s5, s1 -; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) -; SI-NOFMA-NEXT: v_add_f32_e32 v0, -1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NOFMA-NEXT: v_mad_f32 v0, v0, v1, -v1 ; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; -; SI-FMA-LABEL: test_f32_mul_y_sub_x_one: +; SI-FMA-LABEL: test_f32_mul_y_sub_x_one_ninf: ; SI-FMA: ; %bb.0: ; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -1786,50 +2092,84 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out, ; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; -; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_one: -; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) -; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1 -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NOFMA-NEXT: s_endpgm -; -; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_one: -; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-FMA-NEXT: s_endpgm +; GFX11-LABEL: test_f32_mul_y_sub_x_one_ninf: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fma_f32 v1, v1, v2, -v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 %y = load float, ptr addrspace(1) %in2 - %s = fsub contract float %x, 1.0 - %m = fmul contract float %y, %s + %s = fsub contract ninf float %x, 1.0 + %m = fmul contract ninf float %y, %s store float %m, ptr addrspace(1) %out ret void } define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, -; SI-NOFMA-LABEL: test_f32_mul_sub_x_negone_y: +; SI-LABEL: test_f32_mul_sub_x_negone_y: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; GFX11-LABEL: test_f32_mul_sub_x_negone_y: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm + ptr addrspace(1) %in1, + ptr addrspace(1) %in2) { + %x = load float, ptr addrspace(1) %in1 + %y = load float, ptr addrspace(1) %in2 + %s = fsub contract float %x, -1.0 + %m = fmul contract float %s, %y + store float %m, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_f32_mul_sub_x_negone_y_ninf(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_sub_x_negone_y_ninf: ; SI-NOFMA: ; %bb.0: ; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -1846,14 +2186,12 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, ; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_mov_b32 s4, s0 ; SI-NOFMA-NEXT: s_mov_b32 s5, s1 -; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) -; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v0, v1 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NOFMA-NEXT: v_mac_f32_e32 v1, v0, v1 +; SI-NOFMA-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; -; SI-FMA-LABEL: test_f32_mul_sub_x_negone_y: +; SI-FMA-LABEL: test_f32_mul_sub_x_negone_y_ninf: ; SI-FMA: ; %bb.0: ; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -1875,50 +2213,84 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out, ; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; -; GFX11-NOFMA-LABEL: test_f32_mul_sub_x_negone_y: -; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) -; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v1, v2 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NOFMA-NEXT: s_endpgm +; GFX11-LABEL: test_f32_mul_sub_x_negone_y_ninf: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fmac_f32_e32 v2, v1, v2 +; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11-NEXT: s_endpgm + ptr addrspace(1) %in1, + ptr addrspace(1) %in2) { + %x = load float, ptr addrspace(1) %in1 + %y = load float, ptr addrspace(1) %in2 + %s = fsub contract ninf float %x, -1.0 + %m = fmul contract ninf float %s, %y + store float %m, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, +; SI-LABEL: test_f32_mul_y_sub_x_negone: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v0, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm ; -; GFX11-FMA-LABEL: test_f32_mul_sub_x_negone_y: -; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[0:1] -; GFX11-FMA-NEXT: s_endpgm +; GFX11-LABEL: test_f32_mul_y_sub_x_negone: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 %y = load float, ptr addrspace(1) %in2 %s = fsub contract float %x, -1.0 - %m = fmul contract float %s, %y + %m = fmul contract float %y, %s store float %m, ptr addrspace(1) %out ret void } -define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, -; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_negone: +define amdgpu_kernel void @test_f32_mul_y_sub_x_negone_ninf(ptr addrspace(1) %out, +; SI-NOFMA-LABEL: test_f32_mul_y_sub_x_negone_ninf: ; SI-NOFMA: ; %bb.0: ; SI-NOFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-NOFMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -1935,14 +2307,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, ; SI-NOFMA-NEXT: buffer_load_dword v1, off, s[8:11], 0 ; SI-NOFMA-NEXT: s_mov_b32 s4, s0 ; SI-NOFMA-NEXT: s_mov_b32 s5, s1 -; SI-NOFMA-NEXT: s_waitcnt vmcnt(1) -; SI-NOFMA-NEXT: v_add_f32_e32 v0, 1.0, v0 ; SI-NOFMA-NEXT: s_waitcnt vmcnt(0) -; SI-NOFMA-NEXT: v_mul_f32_e32 v0, v1, v0 -; SI-NOFMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NOFMA-NEXT: v_mac_f32_e32 v1, v0, v1 +; SI-NOFMA-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NOFMA-NEXT: s_endpgm ; -; SI-FMA-LABEL: test_f32_mul_y_sub_x_negone: +; SI-FMA-LABEL: test_f32_mul_y_sub_x_negone_ninf: ; SI-FMA: ; %bb.0: ; SI-FMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; SI-FMA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd @@ -1964,44 +2334,26 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out, ; SI-FMA-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-FMA-NEXT: s_endpgm ; -; GFX11-NOFMA-LABEL: test_f32_mul_y_sub_x_negone: -; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NOFMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: s_clause 0x1 -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) -; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NOFMA-NEXT: v_mul_f32_e32 v1, v2, v1 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NOFMA-NEXT: s_endpgm -; -; GFX11-FMA-LABEL: test_f32_mul_y_sub_x_negone: -; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FMA-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: s_clause 0x1 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[0:1] -; GFX11-FMA-NEXT: s_endpgm +; GFX11-LABEL: test_f32_mul_y_sub_x_negone_ninf: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fmac_f32_e32 v2, v1, v2 +; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2) { %x = load float, ptr addrspace(1) %in1 %y = load float, ptr addrspace(1) %in2 - %s = fsub contract float %x, -1.0 - %m = fmul contract float %y, %s + %s = fsub contract ninf float %x, -1.0 + %m = fmul contract ninf float %y, %s store float %m, ptr addrspace(1) %out ret void } @@ -2048,66 +2400,51 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out, ; SI-FMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 ; SI-FMA-NEXT: s_mov_b32 s11, 0xf000 ; SI-FMA-NEXT: s_mov_b32 s10, -1 -; SI-FMA-NEXT: s_mov_b32 s18, s10 -; SI-FMA-NEXT: s_mov_b32 s19, s11 +; SI-FMA-NEXT: s_mov_b32 s14, s10 +; SI-FMA-NEXT: s_mov_b32 s15, s11 ; SI-FMA-NEXT: s_waitcnt lgkmcnt(0) ; SI-FMA-NEXT: s_mov_b32 s16, s4 ; SI-FMA-NEXT: s_mov_b32 s17, s5 -; SI-FMA-NEXT: s_mov_b32 s14, s10 -; SI-FMA-NEXT: s_mov_b32 s12, s2 -; SI-FMA-NEXT: s_mov_b32 s13, s3 -; SI-FMA-NEXT: s_mov_b32 s15, s11 ; SI-FMA-NEXT: s_mov_b32 s4, s6 ; SI-FMA-NEXT: s_mov_b32 s5, s7 ; SI-FMA-NEXT: s_mov_b32 s6, s10 ; SI-FMA-NEXT: s_mov_b32 s7, s11 -; SI-FMA-NEXT: buffer_load_dword v0, off, s[16:19], 0 -; SI-FMA-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-FMA-NEXT: s_mov_b32 s12, s2 +; SI-FMA-NEXT: s_mov_b32 s13, s3 +; SI-FMA-NEXT: s_mov_b32 s18, s10 +; SI-FMA-NEXT: s_mov_b32 s19, s11 +; SI-FMA-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-FMA-NEXT: buffer_load_dword v1, off, s[16:19], 0 ; SI-FMA-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; SI-FMA-NEXT: s_mov_b32 s8, s0 ; SI-FMA-NEXT: s_mov_b32 s9, s1 +; SI-FMA-NEXT: s_waitcnt vmcnt(2) +; SI-FMA-NEXT: v_sub_f32_e32 v3, 1.0, v0 ; SI-FMA-NEXT: s_waitcnt vmcnt(1) -; SI-FMA-NEXT: v_fma_f32 v0, -v1, v0, v0 +; SI-FMA-NEXT: v_mul_f32_e32 v1, v1, v3 ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: v_fma_f32 v0, v2, v1, v0 +; SI-FMA-NEXT: v_fma_f32 v0, v2, v0, v1 ; SI-FMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; -; GFX11-NOFMA-LABEL: test_f32_interp: -; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: s_clause 0x2 -; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] -; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5] -; GFX11-NOFMA-NEXT: global_load_b32 v3, v0, s[2:3] -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(2) -; GFX11-NOFMA-NEXT: v_sub_f32_e32 v4, 1.0, v1 -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NOFMA-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: v_fmac_f32_e32 v2, v3, v1 -; GFX11-NOFMA-NEXT: global_store_b32 v0, v2, s[0:1] -; GFX11-NOFMA-NEXT: s_endpgm -; -; GFX11-FMA-LABEL: test_f32_interp: -; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: s_clause 0x2 -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[4:5] -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[6:7] -; GFX11-FMA-NEXT: global_load_b32 v3, v0, s[2:3] -; GFX11-FMA-NEXT: s_waitcnt vmcnt(1) -; GFX11-FMA-NEXT: v_fma_f32 v1, -v2, v1, v1 -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FMA-NEXT: v_fmac_f32_e32 v1, v3, v2 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-FMA-NEXT: s_endpgm +; GFX11-LABEL: test_f32_interp: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] +; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v3, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_sub_f32_e32 v4, 1.0, v1 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v1 +; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) { @@ -2197,8 +2534,8 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, ; SI-FMA-NEXT: s_mov_b32 s5, s7 ; SI-FMA-NEXT: s_mov_b32 s6, s10 ; SI-FMA-NEXT: s_mov_b32 s7, s11 -; SI-FMA-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0 -; SI-FMA-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-FMA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-FMA-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 ; SI-FMA-NEXT: s_mov_b32 s14, s10 ; SI-FMA-NEXT: s_mov_b32 s12, s2 ; SI-FMA-NEXT: s_mov_b32 s13, s3 @@ -2206,48 +2543,33 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out, ; SI-FMA-NEXT: buffer_load_dwordx2 v[4:5], off, s[12:15], 0 ; SI-FMA-NEXT: s_mov_b32 s8, s0 ; SI-FMA-NEXT: s_mov_b32 s9, s1 +; SI-FMA-NEXT: s_waitcnt vmcnt(2) +; SI-FMA-NEXT: v_add_f64 v[6:7], -v[0:1], 1.0 ; SI-FMA-NEXT: s_waitcnt vmcnt(1) -; SI-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], v[0:1] +; SI-FMA-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7] ; SI-FMA-NEXT: s_waitcnt vmcnt(0) -; SI-FMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-FMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3] ; SI-FMA-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-FMA-NEXT: s_endpgm ; -; GFX11-NOFMA-LABEL: test_f64_interp: -; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NOFMA-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: s_clause 0x2 -; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v8, s[6:7] -; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v8, s[4:5] -; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v8, s[2:3] -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(2) -; GFX11-NOFMA-NEXT: v_add_f64 v[6:7], -v[0:1], 1.0 -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1) -; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NOFMA-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7] -; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3] -; GFX11-NOFMA-NEXT: global_store_b64 v8, v[0:1], s[0:1] -; GFX11-NOFMA-NEXT: s_endpgm -; -; GFX11-FMA-LABEL: test_f64_interp: -; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-FMA-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: s_clause 0x2 -; GFX11-FMA-NEXT: global_load_b64 v[0:1], v6, s[4:5] -; GFX11-FMA-NEXT: global_load_b64 v[2:3], v6, s[6:7] -; GFX11-FMA-NEXT: global_load_b64 v[4:5], v6, s[2:3] -; GFX11-FMA-NEXT: s_waitcnt vmcnt(1) -; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], v[0:1] -; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] -; GFX11-FMA-NEXT: global_store_b64 v6, v[0:1], s[0:1] -; GFX11-FMA-NEXT: s_endpgm +; GFX11-LABEL: test_f64_interp: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_load_b64 v[0:1], v8, s[6:7] +; GFX11-NEXT: global_load_b64 v[2:3], v8, s[4:5] +; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_add_f64 v[6:7], -v[0:1], 1.0 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3] +; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) { @@ -2416,3 +2738,6 @@ define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspac attributes #0 = { nounwind readnone } attributes #1 = { nounwind } attributes #2 = { nounwind "no-signed-zeros-fp-math"="true" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11-FMA: {{.*}} +; GFX11-NOFMA: {{.*}} diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll index be5e23cd4cce3..e185d72635503 100644 --- a/llvm/test/CodeGen/X86/fma_patterns.ll +++ b/llvm/test/CodeGen/X86/fma_patterns.ll @@ -3,10 +3,6 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma | FileCheck %s --check-prefixes=FMA4,FMA4-INFS ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 | FileCheck %s --check-prefixes=FMA4,FMA4-INFS ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA,FMA-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA4,FMA4-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA4,FMA4-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl -enable-no-infs-fp-math | FileCheck %s --check-prefixes=AVX512,AVX512-NOINFS ; ; Pattern: (fadd (fmul x, y), z) -> (fmadd x,y,z) @@ -561,753 +557,834 @@ define <2 x double> @test_2f64_fmsub_load(ptr %a0, <2 x double> %a1, <2 x double ; define <4 x float> @test_v4f32_mul_add_x_one_y(<4 x float> %x, <4 x float> %y) { -; FMA-INFS-LABEL: test_v4f32_mul_add_x_one_y: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v4f32_mul_add_x_one_y: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v4f32_mul_add_x_one_y: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v4f32_mul_add_x_one_y: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v4f32_mul_add_x_one_y: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v4f32_mul_add_x_one_y: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v4f32_mul_add_x_one_y: +; FMA: # %bb.0: +; FMA-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_add_x_one_y: +; FMA4: # %bb.0: +; FMA4-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA4-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_add_x_one_y: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %a = fadd contract <4 x float> %x, %m = fmul contract <4 x float> %a, %y ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_add_x_one_y_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_add_x_one_y_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_add_x_one_y_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_add_x_one_y_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 +; AVX512-NEXT: retq + %a = fadd contract ninf <4 x float> %x, + %m = fmul contract ninf <4 x float> %a, %y + ret <4 x float> %m +} + define <4 x float> @test_v4f32_mul_y_add_x_one(<4 x float> %x, <4 x float> %y) { -; FMA-INFS-LABEL: test_v4f32_mul_y_add_x_one: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v4f32_mul_y_add_x_one: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v4f32_mul_y_add_x_one: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v4f32_mul_y_add_x_one: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v4f32_mul_y_add_x_one: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v4f32_mul_y_add_x_one: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v4f32_mul_y_add_x_one: +; FMA: # %bb.0: +; FMA-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_add_x_one: +; FMA4: # %bb.0: +; FMA4-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_add_x_one: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq %a = fadd contract <4 x float> %x, %m = fmul contract <4 x float> %y, %a ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_y_add_x_one_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_add_x_one_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_add_x_one_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_add_x_one_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 +; AVX512-NEXT: retq + %a = fadd contract ninf <4 x float> %x, + %m = fmul contract ninf <4 x float> %y, %a + ret <4 x float> %m +} + define <4 x float> @test_v4f32_mul_y_add_x_one_undefs(<4 x float> %x, <4 x float> %y) { -; FMA-INFS-LABEL: test_v4f32_mul_y_add_x_one_undefs: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v4f32_mul_y_add_x_one_undefs: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v4f32_mul_y_add_x_one_undefs: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v4f32_mul_y_add_x_one_undefs: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v4f32_mul_y_add_x_one_undefs: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v4f32_mul_y_add_x_one_undefs: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v4f32_mul_y_add_x_one_undefs: +; FMA: # %bb.0: +; FMA-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_add_x_one_undefs: +; FMA4: # %bb.0: +; FMA4-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_add_x_one_undefs: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq %a = fadd contract <4 x float> %x, %m = fmul contract <4 x float> %y, %a ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_y_add_x_one_undefs_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_add_x_one_undefs_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_add_x_one_undefs_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_add_x_one_undefs_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 +; AVX512-NEXT: retq + %a = fadd contract ninf<4 x float> %x, + %m = fmul contract ninf<4 x float> %y, %a + ret <4 x float> %m +} + define <4 x float> @test_v4f32_mul_add_x_negone_y(<4 x float> %x, <4 x float> %y) { -; FMA-INFS-LABEL: test_v4f32_mul_add_x_negone_y: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v4f32_mul_add_x_negone_y: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v4f32_mul_add_x_negone_y: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v4f32_mul_add_x_negone_y: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v4f32_mul_add_x_negone_y: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v4f32_mul_add_x_negone_y: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v4f32_mul_add_x_negone_y: +; FMA: # %bb.0: +; FMA-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_add_x_negone_y: +; FMA4: # %bb.0: +; FMA4-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA4-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_add_x_negone_y: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %a = fadd contract <4 x float> %x, %m = fmul contract <4 x float> %a, %y ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_add_x_negone_y_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_add_x_negone_y_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_add_x_negone_y_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_add_x_negone_y_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 +; AVX512-NEXT: retq + %a = fadd contract ninf<4 x float> %x, + %m = fmul contract ninf<4 x float> %a, %y + ret <4 x float> %m +} + define <4 x float> @test_v4f32_mul_y_add_x_negone(<4 x float> %x, <4 x float> %y) { -; FMA-INFS-LABEL: test_v4f32_mul_y_add_x_negone: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v4f32_mul_y_add_x_negone: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v4f32_mul_y_add_x_negone: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v4f32_mul_y_add_x_negone: +; FMA: # %bb.0: +; FMA-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_add_x_negone: +; FMA4: # %bb.0: +; FMA4-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_add_x_negone: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq %a = fadd contract <4 x float> %x, %m = fmul contract <4 x float> %y, %a ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_y_add_x_negone_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_add_x_negone_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_add_x_negone_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_add_x_negone_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 +; AVX512-NEXT: retq + %a = fadd contract ninf<4 x float> %x, + %m = fmul contract ninf<4 x float> %y, %a + ret <4 x float> %m +} + define <4 x float> @test_v4f32_mul_y_add_x_negone_undefs(<4 x float> %x, <4 x float> %y) { -; FMA-INFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v4f32_mul_y_add_x_negone_undefs: +; FMA: # %bb.0: +; FMA-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_add_x_negone_undefs: +; FMA4: # %bb.0: +; FMA4-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_add_x_negone_undefs: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq %a = fadd contract <4 x float> %x, %m = fmul contract <4 x float> %y, %a ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_y_add_x_negone_undefs_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_add_x_negone_undefs_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_add_x_negone_undefs_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_add_x_negone_undefs_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 +; AVX512-NEXT: retq + %a = fadd contract ninf<4 x float> %x, + %m = fmul contract ninf<4 x float> %y, %a + ret <4 x float> %m +} + define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) { -; FMA-INFS-LABEL: test_v4f32_mul_sub_one_x_y: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v4f32_mul_sub_one_x_y: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v4f32_mul_sub_one_x_y: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX512-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v4f32_mul_sub_one_x_y: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v4f32_mul_sub_one_x_y: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v4f32_mul_sub_one_x_y: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v4f32_mul_sub_one_x_y: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_sub_one_x_y: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA4-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_sub_one_x_y: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %s = fsub contract <4 x float> , %x %m = fmul contract <4 x float> %s, %y ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_sub_one_x_y_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_sub_one_x_y_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_sub_one_x_y_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA4-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_sub_one_x_y_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %s = fsub contract ninf<4 x float> , %x + %m = fmul contract ninf<4 x float> %s, %y + ret <4 x float> %m +} + define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) { -; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_one_x: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v4f32_mul_y_sub_one_x: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_sub_one_x: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_sub_one_x: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq %s = fsub contract <4 x float> , %x %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_y_sub_one_x_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_sub_one_x_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_sub_one_x_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_sub_one_x_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 +; AVX512-NEXT: retq + %s = fsub contract ninf<4 x float> , %x + %m = fmul contract ninf<4 x float> %y, %s + ret <4 x float> %m +} + define <4 x float> @test_v4f32_mul_y_sub_one_x_undefs(<4 x float> %x, <4 x float> %y) { -; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v4f32_mul_y_sub_one_x_undefs: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_sub_one_x_undefs: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_sub_one_x_undefs: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq %s = fsub contract <4 x float> , %x %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_y_sub_one_x_undefs_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_sub_one_x_undefs_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_sub_one_x_undefs_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_sub_one_x_undefs_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 +; AVX512-NEXT: retq + %s = fsub contract ninf<4 x float> , %x + %m = fmul contract ninf<4 x float> %y, %s + ret <4 x float> %m +} + define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y) { -; FMA-INFS-LABEL: test_v4f32_mul_sub_negone_x_y: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v4f32_mul_sub_negone_x_y: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v4f32_mul_sub_negone_x_y: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; AVX512-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX512-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v4f32_mul_sub_negone_x_y: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v4f32_mul_sub_negone_x_y: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v4f32_mul_sub_negone_x_y: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v4f32_mul_sub_negone_x_y: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_sub_negone_x_y: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA4-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_sub_negone_x_y: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; AVX512-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %s = fsub contract <4 x float> , %x %m = fmul contract <4 x float> %s, %y ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_sub_negone_x_y_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_sub_negone_x_y_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_sub_negone_x_y_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA4-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_sub_negone_x_y_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; AVX512-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq + %s = fsub contract ninf<4 x float> , %x + %m = fmul contract ninf<4 x float> %s, %y + ret <4 x float> %m +} + define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y) { -; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_negone_x: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; AVX512-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v4f32_mul_y_sub_negone_x: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_sub_negone_x: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_sub_negone_x: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; AVX512-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq %s = fsub contract <4 x float> , %x %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_y_sub_negone_x_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_sub_negone_x_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_sub_negone_x_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_sub_negone_x_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1 +; AVX512-NEXT: retq + %s = fsub contract ninf<4 x float> , %x + %m = fmul contract ninf<4 x float> %y, %s + ret <4 x float> %m +} + define <4 x float> @test_v4f32_mul_y_sub_negone_x_undefs(<4 x float> %x, <4 x float> %y) { -; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; AVX512-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 -; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; AVX512-NEXT: vsubps %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq %s = fsub contract <4 x float> , %x %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_y_sub_negone_x_undefs_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_sub_negone_x_undefs_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_sub_negone_x_undefs_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_sub_negone_x_undefs_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1 +; AVX512-NEXT: retq + %s = fsub contract ninf<4 x float> , %x + %m = fmul contract ninf<4 x float> %y, %s + ret <4 x float> %m +} + define <4 x float> @test_v4f32_mul_sub_x_one_y(<4 x float> %x, <4 x float> %y) { -; FMA-INFS-LABEL: test_v4f32_mul_sub_x_one_y: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v4f32_mul_sub_x_one_y: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v4f32_mul_sub_x_one_y: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v4f32_mul_sub_x_one_y: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v4f32_mul_sub_x_one_y: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v4f32_mul_sub_x_one_y: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v4f32_mul_sub_x_one_y: +; FMA: # %bb.0: +; FMA-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_sub_x_one_y: +; FMA4: # %bb.0: +; FMA4-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA4-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_sub_x_one_y: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %s = fsub contract <4 x float> %x, %m = fmul contract <4 x float> %s, %y ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_sub_x_one_y_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_sub_x_one_y_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_sub_x_one_y_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_sub_x_one_y_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 +; AVX512-NEXT: retq + %s = fsub contract ninf<4 x float> %x, + %m = fmul contract ninf<4 x float> %s, %y + ret <4 x float> %m +} + define <4 x float> @test_v4f32_mul_y_sub_x_one(<4 x float> %x, <4 x float> %y) { -; FMA-INFS-LABEL: test_v4f32_mul_y_sub_x_one: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_x_one: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_x_one: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v4f32_mul_y_sub_x_one: +; FMA: # %bb.0: +; FMA-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_sub_x_one: +; FMA4: # %bb.0: +; FMA4-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_sub_x_one: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq %s = fsub contract <4 x float> %x, %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_y_sub_x_one_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_sub_x_one_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_sub_x_one_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_sub_x_one_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 +; AVX512-NEXT: retq + %s = fsub contract ninf<4 x float> %x, + %m = fmul contract ninf<4 x float> %y, %s + ret <4 x float> %m +} + define <4 x float> @test_v4f32_mul_y_sub_x_one_undefs(<4 x float> %x, <4 x float> %y) { -; FMA-INFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v4f32_mul_y_sub_x_one_undefs: +; FMA: # %bb.0: +; FMA-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_sub_x_one_undefs: +; FMA4: # %bb.0: +; FMA4-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_sub_x_one_undefs: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq %s = fsub contract <4 x float> %x, %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_y_sub_x_one_undefs_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_sub_x_one_undefs_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_sub_x_one_undefs_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_sub_x_one_undefs_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 +; AVX512-NEXT: retq + %s = fsub contract ninf<4 x float> %x, + %m = fmul contract ninf<4 x float> %y, %s + ret <4 x float> %m +} + define <4 x float> @test_v4f32_mul_sub_x_negone_y(<4 x float> %x, <4 x float> %y) { -; FMA-INFS-LABEL: test_v4f32_mul_sub_x_negone_y: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v4f32_mul_sub_x_negone_y: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v4f32_mul_sub_x_negone_y: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v4f32_mul_sub_x_negone_y: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v4f32_mul_sub_x_negone_y: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v4f32_mul_sub_x_negone_y: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v4f32_mul_sub_x_negone_y: +; FMA: # %bb.0: +; FMA-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_sub_x_negone_y: +; FMA4: # %bb.0: +; FMA4-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA4-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_sub_x_negone_y: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %s = fsub contract <4 x float> %x, %m = fmul contract <4 x float> %s, %y ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_sub_x_negone_y_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_sub_x_negone_y_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_sub_x_negone_y_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_sub_x_negone_y_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 +; AVX512-NEXT: retq + %s = fsub contract ninf<4 x float> %x, + %m = fmul contract ninf<4 x float> %s, %y + ret <4 x float> %m +} + define <4 x float> @test_v4f32_mul_y_sub_x_negone(<4 x float> %x, <4 x float> %y) { -; FMA-INFS-LABEL: test_v4f32_mul_y_sub_x_negone: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_x_negone: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_x_negone: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v4f32_mul_y_sub_x_negone: +; FMA: # %bb.0: +; FMA-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_sub_x_negone: +; FMA4: # %bb.0: +; FMA4-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_sub_x_negone: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq %s = fsub contract <4 x float> %x, %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_y_sub_x_negone_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_sub_x_negone_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_sub_x_negone_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_sub_x_negone_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 +; AVX512-NEXT: retq + %s = fsub contract ninf<4 x float> %x, + %m = fmul contract ninf<4 x float> %y, %s + ret <4 x float> %m +} + define <4 x float> @test_v4f32_mul_y_sub_x_negone_undefs(<4 x float> %x, <4 x float> %y) { -; FMA-INFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: +; FMA: # %bb.0: +; FMA-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: +; FMA4: # %bb.0: +; FMA4-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq %s = fsub contract <4 x float> %x, %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } +define <4 x float> @test_v4f32_mul_y_sub_x_negone_undefs_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_sub_x_negone_undefs_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 +; FMA-NEXT: retq ; -; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y)) +; FMA4-LABEL: test_v4f32_mul_y_sub_x_negone_undefs_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 +; FMA4-NEXT: retq ; +; AVX512-LABEL: test_v4f32_mul_y_sub_x_negone_undefs_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 +; AVX512-NEXT: retq + %s = fsub contract ninf<4 x float> %x, + %m = fmul contract ninf<4 x float> %y, %s + ret <4 x float> %m +} + define float @test_f32_interp(float %x, float %y, float %t) { -; FMA-INFS-LABEL: test_f32_interp: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] -; FMA-INFS-NEXT: vsubss %xmm2, %xmm3, %xmm3 -; FMA-INFS-NEXT: vmulss %xmm3, %xmm1, %xmm1 -; FMA-INFS-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_f32_interp: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] -; FMA4-INFS-NEXT: vsubss %xmm2, %xmm3, %xmm3 -; FMA4-INFS-NEXT: vmulss %xmm3, %xmm1, %xmm1 -; FMA4-INFS-NEXT: vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_f32_interp: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] -; AVX512-INFS-NEXT: vsubss %xmm2, %xmm3, %xmm3 -; AVX512-INFS-NEXT: vmulss %xmm3, %xmm1, %xmm1 -; AVX512-INFS-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_f32_interp: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA-NOINFS-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_f32_interp: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubss {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_f32_interp: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; AVX512-NOINFS-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_f32_interp: +; FMA: # %bb.0: +; FMA-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] +; FMA-NEXT: vsubss %xmm2, %xmm3, %xmm3 +; FMA-NEXT: vmulss %xmm3, %xmm1, %xmm1 +; FMA-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_f32_interp: +; FMA4: # %bb.0: +; FMA4-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] +; FMA4-NEXT: vsubss %xmm2, %xmm3, %xmm3 +; FMA4-NEXT: vmulss %xmm3, %xmm1, %xmm1 +; FMA4-NEXT: vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_f32_interp: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] +; AVX512-NEXT: vsubss %xmm2, %xmm3, %xmm3 +; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 +; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 +; AVX512-NEXT: retq %t1 = fsub contract nsz float 1.0, %t %tx = fmul contract nsz float %x, %t %ty = fmul contract nsz float %y, %t1 @@ -1315,48 +1392,55 @@ define float @test_f32_interp(float %x, float %y, float %t) { ret float %r } +define float @test_f32_interp_ninf(float %x, float %y, float %t) { +; FMA-LABEL: test_f32_interp_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 +; FMA-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_f32_interp_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmsubss {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 +; FMA4-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_f32_interp_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 +; AVX512-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; AVX512-NEXT: retq + %t1 = fsub contract ninf nsz float 1.0, %t + %tx = fmul contract ninf nsz float %x, %t + %ty = fmul contract ninf nsz float %y, %t1 + %r = fadd contract ninf nsz float %tx, %ty + ret float %r +} + define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float> %t) { -; FMA-INFS-LABEL: test_v4f32_interp: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-INFS-NEXT: vsubps %xmm2, %xmm3, %xmm3 -; FMA-INFS-NEXT: vmulps %xmm3, %xmm1, %xmm1 -; FMA-INFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v4f32_interp: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA4-INFS-NEXT: vsubps %xmm2, %xmm3, %xmm3 -; FMA4-INFS-NEXT: vmulps %xmm3, %xmm1, %xmm1 -; FMA4-INFS-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v4f32_interp: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-INFS-NEXT: vsubps %xmm2, %xmm3, %xmm3 -; AVX512-INFS-NEXT: vmulps %xmm3, %xmm1, %xmm1 -; AVX512-INFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v4f32_interp: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v4f32_interp: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v4f32_interp: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v4f32_interp: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-NEXT: vsubps %xmm2, %xmm3, %xmm3 +; FMA-NEXT: vmulps %xmm3, %xmm1, %xmm1 +; FMA-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_interp: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-NEXT: vsubps %xmm2, %xmm3, %xmm3 +; FMA4-NEXT: vmulps %xmm3, %xmm1, %xmm1 +; FMA4-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_interp: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512-NEXT: vsubps %xmm2, %xmm3, %xmm3 +; AVX512-NEXT: vmulps %xmm3, %xmm1, %xmm1 +; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 +; AVX512-NEXT: retq %t1 = fsub contract nsz <4 x float> , %t %tx = fmul contract nsz <4 x float> %x, %t %ty = fmul contract nsz <4 x float> %y, %t1 @@ -1364,48 +1448,55 @@ define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float ret <4 x float> %r } +define <4 x float> @test_v4f32_interp_ninf(<4 x float> %x, <4 x float> %y, <4 x float> %t) { +; FMA-LABEL: test_v4f32_interp_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmsub213ps {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 +; FMA-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f32_interp_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmsubps {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 +; FMA4-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f32_interp_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 +; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; AVX512-NEXT: retq + %t1 = fsub contract ninf nsz <4 x float> , %t + %tx = fmul contract ninf nsz <4 x float> %x, %t + %ty = fmul contract ninf nsz <4 x float> %y, %t1 + %r = fadd contract ninf nsz <4 x float> %tx, %ty + ret <4 x float> %r +} + define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float> %t) { -; FMA-INFS-LABEL: test_v8f32_interp: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-INFS-NEXT: vsubps %ymm2, %ymm3, %ymm3 -; FMA-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; FMA-INFS-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v8f32_interp: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA4-INFS-NEXT: vsubps %ymm2, %ymm3, %ymm3 -; FMA4-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; FMA4-INFS-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v8f32_interp: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-INFS-NEXT: vsubps %ymm2, %ymm3, %ymm3 -; AVX512-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; AVX512-INFS-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v8f32_interp: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v8f32_interp: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v8f32_interp: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v8f32_interp: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-NEXT: vsubps %ymm2, %ymm3, %ymm3 +; FMA-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; FMA-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v8f32_interp: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-NEXT: vsubps %ymm2, %ymm3, %ymm3 +; FMA4-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; FMA4-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v8f32_interp: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512-NEXT: vsubps %ymm2, %ymm3, %ymm3 +; AVX512-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; AVX512-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 +; AVX512-NEXT: retq %t1 = fsub contract nsz <8 x float> , %t %tx = fmul contract nsz <8 x float> %x, %t %ty = fmul contract nsz <8 x float> %y, %t1 @@ -1413,48 +1504,55 @@ define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float ret <8 x float> %r } +define <8 x float> @test_v8f32_interp_ninf(<8 x float> %x, <8 x float> %y, <8 x float> %t) { +; FMA-LABEL: test_v8f32_interp_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 +; FMA-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v8f32_interp_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 +; FMA4-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v8f32_interp_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 +; AVX512-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 +; AVX512-NEXT: retq + %t1 = fsub contract ninf nsz <8 x float> , %t + %tx = fmul contract ninf nsz <8 x float> %x, %t + %ty = fmul contract ninf nsz <8 x float> %y, %t1 + %r = fadd contract ninf nsz <8 x float> %tx, %ty + ret <8 x float> %r +} + define double @test_f64_interp(double %x, double %y, double %t) { -; FMA-INFS-LABEL: test_f64_interp: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovsd {{.*#+}} xmm3 = [1.0E+0,0.0E+0] -; FMA-INFS-NEXT: vsubsd %xmm2, %xmm3, %xmm3 -; FMA-INFS-NEXT: vmulsd %xmm3, %xmm1, %xmm1 -; FMA-INFS-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_f64_interp: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovsd {{.*#+}} xmm3 = [1.0E+0,0.0E+0] -; FMA4-INFS-NEXT: vsubsd %xmm2, %xmm3, %xmm3 -; FMA4-INFS-NEXT: vmulsd %xmm3, %xmm1, %xmm1 -; FMA4-INFS-NEXT: vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_f64_interp: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vmovsd {{.*#+}} xmm3 = [1.0E+0,0.0E+0] -; AVX512-INFS-NEXT: vsubsd %xmm2, %xmm3, %xmm3 -; AVX512-INFS-NEXT: vmulsd %xmm3, %xmm1, %xmm1 -; AVX512-INFS-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_f64_interp: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213sd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA-NOINFS-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_f64_interp: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubsd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: vfmsubsd {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_f64_interp: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213sd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; AVX512-NOINFS-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_f64_interp: +; FMA: # %bb.0: +; FMA-NEXT: vmovsd {{.*#+}} xmm3 = [1.0E+0,0.0E+0] +; FMA-NEXT: vsubsd %xmm2, %xmm3, %xmm3 +; FMA-NEXT: vmulsd %xmm3, %xmm1, %xmm1 +; FMA-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_f64_interp: +; FMA4: # %bb.0: +; FMA4-NEXT: vmovsd {{.*#+}} xmm3 = [1.0E+0,0.0E+0] +; FMA4-NEXT: vsubsd %xmm2, %xmm3, %xmm3 +; FMA4-NEXT: vmulsd %xmm3, %xmm1, %xmm1 +; FMA4-NEXT: vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_f64_interp: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovsd {{.*#+}} xmm3 = [1.0E+0,0.0E+0] +; AVX512-NEXT: vsubsd %xmm2, %xmm3, %xmm3 +; AVX512-NEXT: vmulsd %xmm3, %xmm1, %xmm1 +; AVX512-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 +; AVX512-NEXT: retq %t1 = fsub contract nsz double 1.0, %t %tx = fmul contract nsz double %x, %t %ty = fmul contract nsz double %y, %t1 @@ -1462,51 +1560,58 @@ define double @test_f64_interp(double %x, double %y, double %t) { ret double %r } +define double @test_f64_interp_ninf(double %x, double %y, double %t) { +; FMA-LABEL: test_f64_interp_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmsub213sd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 +; FMA-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_f64_interp_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmsubsd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 +; FMA4-NEXT: vfmsubsd {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_f64_interp_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmsub213sd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 +; AVX512-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; AVX512-NEXT: retq + %t1 = fsub contract ninf nsz double 1.0, %t + %tx = fmul contract ninf nsz double %x, %t + %ty = fmul contract ninf nsz double %y, %t1 + %r = fadd contract ninf nsz double %tx, %ty + ret double %r +} + define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x double> %t) { -; FMA-INFS-LABEL: test_v2f64_interp: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovddup {{.*#+}} xmm3 = [1.0E+0,1.0E+0] -; FMA-INFS-NEXT: # xmm3 = mem[0,0] -; FMA-INFS-NEXT: vsubpd %xmm2, %xmm3, %xmm3 -; FMA-INFS-NEXT: vmulpd %xmm3, %xmm1, %xmm1 -; FMA-INFS-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v2f64_interp: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovddup {{.*#+}} xmm3 = [1.0E+0,1.0E+0] -; FMA4-INFS-NEXT: # xmm3 = mem[0,0] -; FMA4-INFS-NEXT: vsubpd %xmm2, %xmm3, %xmm3 -; FMA4-INFS-NEXT: vmulpd %xmm3, %xmm1, %xmm1 -; FMA4-INFS-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v2f64_interp: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vmovddup {{.*#+}} xmm3 = [1.0E+0,1.0E+0] -; AVX512-INFS-NEXT: # xmm3 = mem[0,0] -; AVX512-INFS-NEXT: vsubpd %xmm2, %xmm3, %xmm3 -; AVX512-INFS-NEXT: vmulpd %xmm3, %xmm1, %xmm1 -; AVX512-INFS-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v2f64_interp: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v2f64_interp: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v2f64_interp: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v2f64_interp: +; FMA: # %bb.0: +; FMA-NEXT: vmovddup {{.*#+}} xmm3 = [1.0E+0,1.0E+0] +; FMA-NEXT: # xmm3 = mem[0,0] +; FMA-NEXT: vsubpd %xmm2, %xmm3, %xmm3 +; FMA-NEXT: vmulpd %xmm3, %xmm1, %xmm1 +; FMA-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v2f64_interp: +; FMA4: # %bb.0: +; FMA4-NEXT: vmovddup {{.*#+}} xmm3 = [1.0E+0,1.0E+0] +; FMA4-NEXT: # xmm3 = mem[0,0] +; FMA4-NEXT: vsubpd %xmm2, %xmm3, %xmm3 +; FMA4-NEXT: vmulpd %xmm3, %xmm1, %xmm1 +; FMA4-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v2f64_interp: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovddup {{.*#+}} xmm3 = [1.0E+0,1.0E+0] +; AVX512-NEXT: # xmm3 = mem[0,0] +; AVX512-NEXT: vsubpd %xmm2, %xmm3, %xmm3 +; AVX512-NEXT: vmulpd %xmm3, %xmm1, %xmm1 +; AVX512-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 +; AVX512-NEXT: retq %t1 = fsub contract nsz <2 x double> , %t %tx = fmul contract nsz <2 x double> %x, %t %ty = fmul contract nsz <2 x double> %y, %t1 @@ -1514,48 +1619,55 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x do ret <2 x double> %r } +define <2 x double> @test_v2f64_interp_ninf(<2 x double> %x, <2 x double> %y, <2 x double> %t) { +; FMA-LABEL: test_v2f64_interp_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmsub213pd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 +; FMA-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v2f64_interp_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmsubpd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 +; FMA4-NEXT: vfmsubpd {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v2f64_interp_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmsub213pd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1 +; AVX512-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1 +; AVX512-NEXT: retq + %t1 = fsub contract ninf nsz <2 x double> , %t + %tx = fmul contract ninf nsz <2 x double> %x, %t + %ty = fmul contract ninf nsz <2 x double> %y, %t1 + %r = fadd contract ninf nsz <2 x double> %tx, %ty + ret <2 x double> %r +} + define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x double> %t) { -; FMA-INFS-LABEL: test_v4f64_interp: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-INFS-NEXT: vsubpd %ymm2, %ymm3, %ymm3 -; FMA-INFS-NEXT: vmulpd %ymm3, %ymm1, %ymm1 -; FMA-INFS-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v4f64_interp: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA4-INFS-NEXT: vsubpd %ymm2, %ymm3, %ymm3 -; FMA4-INFS-NEXT: vmulpd %ymm3, %ymm1, %ymm1 -; FMA4-INFS-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v4f64_interp: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-INFS-NEXT: vsubpd %ymm2, %ymm3, %ymm3 -; AVX512-INFS-NEXT: vmulpd %ymm3, %ymm1, %ymm1 -; AVX512-INFS-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v4f64_interp: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v4f64_interp: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm1 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v4f64_interp: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v4f64_interp: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-NEXT: vsubpd %ymm2, %ymm3, %ymm3 +; FMA-NEXT: vmulpd %ymm3, %ymm1, %ymm1 +; FMA-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f64_interp: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-NEXT: vsubpd %ymm2, %ymm3, %ymm3 +; FMA4-NEXT: vmulpd %ymm3, %ymm1, %ymm1 +; FMA4-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f64_interp: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512-NEXT: vsubpd %ymm2, %ymm3, %ymm3 +; AVX512-NEXT: vmulpd %ymm3, %ymm1, %ymm1 +; AVX512-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 +; AVX512-NEXT: retq %t1 = fsub contract nsz <4 x double> , %t %tx = fmul contract nsz <4 x double> %x, %t %ty = fmul contract nsz <4 x double> %y, %t1 @@ -1563,6 +1675,31 @@ define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x do ret <4 x double> %r } +define <4 x double> @test_v4f64_interp_ninf(<4 x double> %x, <4 x double> %y, <4 x double> %t) { +; FMA-LABEL: test_v4f64_interp_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmsub213pd {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 +; FMA-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v4f64_interp_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmsubpd {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 +; FMA4-NEXT: vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v4f64_interp_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmsub213pd {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1 +; AVX512-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1 +; AVX512-NEXT: retq + %t1 = fsub contract ninf nsz <4 x double> , %t + %tx = fmul contract ninf nsz <4 x double> %x, %t + %ty = fmul contract ninf nsz <4 x double> %y, %t1 + %r = fadd contract ninf nsz <4 x double> %tx, %ty + ret <4 x double> %r +} + ; ; Pattern: (fneg (fma x, y, z)) -> (fma x, -y, -z) ; @@ -2022,3 +2159,7 @@ define float @fadd_fma_fmul_extra_use_3(float %a, float %b, float %c, float %d, %a2 = fadd contract fast float %n0, %a1 ret float %a2 } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX512-INFS: {{.*}} +; FMA-INFS: {{.*}} +; FMA4-INFS: {{.*}} diff --git a/llvm/test/CodeGen/X86/fma_patterns_wide.ll b/llvm/test/CodeGen/X86/fma_patterns_wide.ll index f0af3945ae959..4c16cf9a8550b 100644 --- a/llvm/test/CodeGen/X86/fma_patterns_wide.ll +++ b/llvm/test/CodeGen/X86/fma_patterns_wide.ll @@ -3,10 +3,6 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-INFS ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-INFS ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512-INFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA --check-prefix=FMA-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-NOINFS -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq -enable-no-infs-fp-math | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512-NOINFS ; ; Pattern: (fadd (fmul x, y), z) -> (fmadd x,y,z) @@ -257,617 +253,668 @@ define <8 x double> @test_8f64_fmsub_load(ptr %a0, <8 x double> %a1, <8 x double ; define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> %y) { -; FMA-INFS-LABEL: test_v16f32_mul_add_x_one_y: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 -; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 -; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; FMA-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v16f32_mul_add_x_one_y: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 -; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 -; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; FMA4-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v16f32_mul_add_x_one_y: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512-INFS-NEXT: vmulps %zmm1, %zmm0, %zmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v16f32_mul_add_x_one_y: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm2 -; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm3 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v16f32_mul_add_x_one_y: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 -; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} ymm1 = (ymm1 * ymm3) + ymm3 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v16f32_mul_add_x_one_y: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v16f32_mul_add_x_one_y: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-NEXT: vaddps %ymm4, %ymm1, %ymm1 +; FMA-NEXT: vaddps %ymm4, %ymm0, %ymm0 +; FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; FMA-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v16f32_mul_add_x_one_y: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-NEXT: vaddps %ymm4, %ymm1, %ymm1 +; FMA4-NEXT: vaddps %ymm4, %ymm0, %ymm0 +; FMA4-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; FMA4-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v16f32_mul_add_x_one_y: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq %a = fadd contract <16 x float> %x, %m = fmul contract <16 x float> %a, %y ret <16 x float> %m } +define <16 x float> @test_v16f32_mul_add_x_one_y_ninf(<16 x float> %x, <16 x float> %y) { +; FMA-LABEL: test_v16f32_mul_add_x_one_y_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm2 +; FMA-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm3 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v16f32_mul_add_x_one_y_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 +; FMA4-NEXT: vfmaddps {{.*#+}} ymm1 = (ymm1 * ymm3) + ymm3 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v16f32_mul_add_x_one_y_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm1 +; AVX512-NEXT: retq + %a = fadd contract ninf <16 x float> %x, + %m = fmul contract ninf <16 x float> %a, %y + ret <16 x float> %m +} + define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y) { -; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_one: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 -; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 -; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 -; FMA-INFS-NEXT: vmulpd %ymm1, %ymm3, %ymm1 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_one: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 -; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 -; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 -; FMA4-INFS-NEXT: vmulpd %ymm1, %ymm3, %ymm1 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v8f64_mul_y_add_x_one: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512-INFS-NEXT: vmulpd %zmm0, %zmm1, %zmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v8f64_mul_y_add_x_one: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm2 -; FMA-NOINFS-NEXT: vfmadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm3 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v8f64_mul_y_add_x_one: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 -; FMA4-NOINFS-NEXT: vfmaddpd {{.*#+}} ymm1 = (ymm1 * ymm3) + ymm3 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v8f64_mul_y_add_x_one: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v8f64_mul_y_add_x_one: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; FMA-NEXT: vaddpd %ymm4, %ymm0, %ymm0 +; FMA-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; FMA-NEXT: vmulpd %ymm1, %ymm3, %ymm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v8f64_mul_y_add_x_one: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; FMA4-NEXT: vaddpd %ymm4, %ymm0, %ymm0 +; FMA4-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; FMA4-NEXT: vmulpd %ymm1, %ymm3, %ymm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v8f64_mul_y_add_x_one: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NEXT: vmulpd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: retq %a = fadd contract <8 x double> %x, %m = fmul contract <8 x double> %y, %a ret <8 x double> %m } +define <8 x double> @test_v8f64_mul_y_add_x_one_ninf(<8 x double> %x, <8 x double> %y) { +; FMA-LABEL: test_v8f64_mul_y_add_x_one_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm2 +; FMA-NEXT: vfmadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm3 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v8f64_mul_y_add_x_one_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 +; FMA4-NEXT: vfmaddpd {{.*#+}} ymm1 = (ymm1 * ymm3) + ymm3 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v8f64_mul_y_add_x_one_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm1 +; AVX512-NEXT: retq + %a = fadd contract ninf <8 x double> %x, + %m = fmul contract ninf <8 x double> %y, %a + ret <8 x double> %m +} + define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float> %y) { -; FMA-INFS-LABEL: test_v16f32_mul_add_x_negone_y: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 -; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 -; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; FMA-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v16f32_mul_add_x_negone_y: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 -; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 -; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; FMA4-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v16f32_mul_add_x_negone_y: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512-INFS-NEXT: vmulps %zmm1, %zmm0, %zmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v16f32_mul_add_x_negone_y: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm2 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) - ymm3 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v16f32_mul_add_x_negone_y: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm2 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm3) - ymm3 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v16f32_mul_add_x_negone_y: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v16f32_mul_add_x_negone_y: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-NEXT: vaddps %ymm4, %ymm1, %ymm1 +; FMA-NEXT: vaddps %ymm4, %ymm0, %ymm0 +; FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; FMA-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v16f32_mul_add_x_negone_y: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-NEXT: vaddps %ymm4, %ymm1, %ymm1 +; FMA4-NEXT: vaddps %ymm4, %ymm0, %ymm0 +; FMA4-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; FMA4-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v16f32_mul_add_x_negone_y: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq %a = fadd contract <16 x float> %x, %m = fmul contract <16 x float> %a, %y ret <16 x float> %m } +define <16 x float> @test_v16f32_mul_add_x_negone_y_ninf(<16 x float> %x, <16 x float> %y) { +; FMA-LABEL: test_v16f32_mul_add_x_negone_y_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm2 +; FMA-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) - ymm3 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v16f32_mul_add_x_negone_y_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm2 +; FMA4-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm3) - ymm3 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v16f32_mul_add_x_negone_y_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm1 +; AVX512-NEXT: retq + %a = fadd contract ninf <16 x float> %x, + %m = fmul contract ninf <16 x float> %a, %y + ret <16 x float> %m +} + define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double> %y) { -; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_negone: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 -; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 -; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 -; FMA-INFS-NEXT: vmulpd %ymm1, %ymm3, %ymm1 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_negone: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 -; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 -; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 -; FMA4-INFS-NEXT: vmulpd %ymm1, %ymm3, %ymm1 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v8f64_mul_y_add_x_negone: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512-INFS-NEXT: vmulpd %zmm0, %zmm1, %zmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v8f64_mul_y_add_x_negone: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm2 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) - ymm3 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v8f64_mul_y_add_x_negone: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm2 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm1 = (ymm1 * ymm3) - ymm3 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v8f64_mul_y_add_x_negone: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v8f64_mul_y_add_x_negone: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; FMA-NEXT: vaddpd %ymm4, %ymm0, %ymm0 +; FMA-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; FMA-NEXT: vmulpd %ymm1, %ymm3, %ymm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v8f64_mul_y_add_x_negone: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; FMA4-NEXT: vaddpd %ymm4, %ymm0, %ymm0 +; FMA4-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; FMA4-NEXT: vmulpd %ymm1, %ymm3, %ymm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v8f64_mul_y_add_x_negone: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NEXT: vmulpd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: retq %a = fadd contract <8 x double> %x, %m = fmul contract <8 x double> %y, %a ret <8 x double> %m } +define <8 x double> @test_v8f64_mul_y_add_x_negone_ninf(<8 x double> %x, <8 x double> %y) { +; FMA-LABEL: test_v8f64_mul_y_add_x_negone_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm2 +; FMA-NEXT: vfmsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) - ymm3 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v8f64_mul_y_add_x_negone_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm2 +; FMA4-NEXT: vfmsubpd {{.*#+}} ymm1 = (ymm1 * ymm3) - ymm3 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v8f64_mul_y_add_x_negone_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm1 +; AVX512-NEXT: retq + %a = fadd contract ninf <8 x double> %x, + %m = fmul contract ninf <8 x double> %y, %a + ret <8 x double> %m +} + define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %y) { -; FMA-INFS-LABEL: test_v16f32_mul_sub_one_x_y: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1 -; FMA-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0 -; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; FMA-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v16f32_mul_sub_one_x_y: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA4-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1 -; FMA4-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0 -; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; FMA4-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v16f32_mul_sub_one_x_y: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-INFS-NEXT: vsubps %zmm0, %zmm2, %zmm0 -; AVX512-INFS-NEXT: vmulps %zmm1, %zmm0, %zmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v16f32_mul_sub_one_x_y: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2 -; FMA-NOINFS-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm3 * ymm1) + ymm3 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v16f32_mul_sub_one_x_y: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2 -; FMA4-NOINFS-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm1 * ymm3) + ymm3 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v16f32_mul_sub_one_x_y: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v16f32_mul_sub_one_x_y: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-NEXT: vsubps %ymm1, %ymm4, %ymm1 +; FMA-NEXT: vsubps %ymm0, %ymm4, %ymm0 +; FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; FMA-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v16f32_mul_sub_one_x_y: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-NEXT: vsubps %ymm1, %ymm4, %ymm1 +; FMA4-NEXT: vsubps %ymm0, %ymm4, %ymm0 +; FMA4-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; FMA4-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v16f32_mul_sub_one_x_y: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512-NEXT: vsubps %zmm0, %zmm2, %zmm0 +; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq %s = fsub contract <16 x float> , %x %m = fmul contract <16 x float> %s, %y ret <16 x float> %m } +define <16 x float> @test_v16f32_mul_sub_one_x_y_ninf(<16 x float> %x, <16 x float> %y) { +; FMA-LABEL: test_v16f32_mul_sub_one_x_y_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-NEXT: vsubps %ymm1, %ymm4, %ymm1 +; FMA-NEXT: vsubps %ymm0, %ymm4, %ymm0 +; FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; FMA-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v16f32_mul_sub_one_x_y_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-NEXT: vsubps %ymm1, %ymm4, %ymm1 +; FMA4-NEXT: vsubps %ymm0, %ymm4, %ymm0 +; FMA4-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; FMA4-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v16f32_mul_sub_one_x_y_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512-NEXT: vsubps %zmm0, %zmm2, %zmm0 +; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %s = fsub contract ninf <16 x float> , %x + %m = fmul contract ninf <16 x float> %s, %y + ret <16 x float> %m +} + define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y) { -; FMA-INFS-LABEL: test_v8f64_mul_y_sub_one_x: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1 -; FMA-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0 -; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 -; FMA-INFS-NEXT: vmulpd %ymm1, %ymm3, %ymm1 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_one_x: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA4-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1 -; FMA4-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0 -; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 -; FMA4-INFS-NEXT: vmulpd %ymm1, %ymm3, %ymm1 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v8f64_mul_y_sub_one_x: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vbroadcastsd {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-INFS-NEXT: vsubpd %zmm0, %zmm2, %zmm0 -; AVX512-INFS-NEXT: vmulpd %zmm0, %zmm1, %zmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v8f64_mul_y_sub_one_x: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfnmadd213pd {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2 -; FMA-NOINFS-NEXT: vfnmadd213pd {{.*#+}} ymm1 = -(ymm3 * ymm1) + ymm3 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v8f64_mul_y_sub_one_x: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfnmaddpd {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2 -; FMA4-NOINFS-NEXT: vfnmaddpd {{.*#+}} ymm1 = -(ymm1 * ymm3) + ymm3 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v8f64_mul_y_sub_one_x: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v8f64_mul_y_sub_one_x: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-NEXT: vsubpd %ymm1, %ymm4, %ymm1 +; FMA-NEXT: vsubpd %ymm0, %ymm4, %ymm0 +; FMA-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; FMA-NEXT: vmulpd %ymm1, %ymm3, %ymm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v8f64_mul_y_sub_one_x: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-NEXT: vsubpd %ymm1, %ymm4, %ymm1 +; FMA4-NEXT: vsubpd %ymm0, %ymm4, %ymm0 +; FMA4-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; FMA4-NEXT: vmulpd %ymm1, %ymm3, %ymm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v8f64_mul_y_sub_one_x: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastsd {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512-NEXT: vsubpd %zmm0, %zmm2, %zmm0 +; AVX512-NEXT: vmulpd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: retq %s = fsub contract <8 x double> , %x %m = fmul contract <8 x double> %y, %s ret <8 x double> %m } +define <8 x double> @test_v8f64_mul_y_sub_one_x_ninf(<8 x double> %x, <8 x double> %y) { +; FMA-LABEL: test_v8f64_mul_y_sub_one_x_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfnmadd213pd {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2 +; FMA-NEXT: vfnmadd213pd {{.*#+}} ymm1 = -(ymm3 * ymm1) + ymm3 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v8f64_mul_y_sub_one_x_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfnmaddpd {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2 +; FMA4-NEXT: vfnmaddpd {{.*#+}} ymm1 = -(ymm1 * ymm3) + ymm3 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v8f64_mul_y_sub_one_x_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm1 +; AVX512-NEXT: retq + %s = fsub contract ninf <8 x double> , %x + %m = fmul contract ninf <8 x double> %y, %s + ret <8 x double> %m +} + define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float> %y) { -; FMA-INFS-LABEL: test_v16f32_mul_sub_negone_x_y: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; FMA-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1 -; FMA-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0 -; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; FMA-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v16f32_mul_sub_negone_x_y: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; FMA4-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1 -; FMA4-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0 -; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; FMA4-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v16f32_mul_sub_negone_x_y: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} zmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; AVX512-INFS-NEXT: vsubps %zmm0, %zmm2, %zmm0 -; AVX512-INFS-NEXT: vmulps %zmm1, %zmm0, %zmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v16f32_mul_sub_negone_x_y: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfnmsub213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) - ymm2 -; FMA-NOINFS-NEXT: vfnmsub213ps {{.*#+}} ymm1 = -(ymm3 * ymm1) - ymm3 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v16f32_mul_sub_negone_x_y: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfnmsubps {{.*#+}} ymm0 = -(ymm0 * ymm2) - ymm2 -; FMA4-NOINFS-NEXT: vfnmsubps {{.*#+}} ymm1 = -(ymm1 * ymm3) - ymm3 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v16f32_mul_sub_negone_x_y: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v16f32_mul_sub_negone_x_y: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-NEXT: vsubps %ymm1, %ymm4, %ymm1 +; FMA-NEXT: vsubps %ymm0, %ymm4, %ymm0 +; FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; FMA-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v16f32_mul_sub_negone_x_y: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-NEXT: vsubps %ymm1, %ymm4, %ymm1 +; FMA4-NEXT: vsubps %ymm0, %ymm4, %ymm0 +; FMA4-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; FMA4-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v16f32_mul_sub_negone_x_y: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastss {{.*#+}} zmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; AVX512-NEXT: vsubps %zmm0, %zmm2, %zmm0 +; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq %s = fsub contract <16 x float> , %x %m = fmul contract <16 x float> %s, %y ret <16 x float> %m } +define <16 x float> @test_v16f32_mul_sub_negone_x_y_ninf(<16 x float> %x, <16 x float> %y) { +; FMA-LABEL: test_v16f32_mul_sub_negone_x_y_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-NEXT: vsubps %ymm1, %ymm4, %ymm1 +; FMA-NEXT: vsubps %ymm0, %ymm4, %ymm0 +; FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; FMA-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v16f32_mul_sub_negone_x_y_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-NEXT: vsubps %ymm1, %ymm4, %ymm1 +; FMA4-NEXT: vsubps %ymm0, %ymm4, %ymm0 +; FMA4-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; FMA4-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v16f32_mul_sub_negone_x_y_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastss {{.*#+}} zmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; AVX512-NEXT: vsubps %zmm0, %zmm2, %zmm0 +; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq + %s = fsub contract ninf <16 x float> , %x + %m = fmul contract ninf <16 x float> %s, %y + ret <16 x float> %m +} + define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double> %y) { -; FMA-INFS-LABEL: test_v8f64_mul_y_sub_negone_x: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; FMA-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1 -; FMA-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0 -; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 -; FMA-INFS-NEXT: vmulpd %ymm1, %ymm3, %ymm1 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_negone_x: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; FMA4-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1 -; FMA4-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0 -; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 -; FMA4-INFS-NEXT: vmulpd %ymm1, %ymm3, %ymm1 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v8f64_mul_y_sub_negone_x: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vbroadcastsd {{.*#+}} zmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; AVX512-INFS-NEXT: vsubpd %zmm0, %zmm2, %zmm0 -; AVX512-INFS-NEXT: vmulpd %zmm0, %zmm1, %zmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v8f64_mul_y_sub_negone_x: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfnmsub213pd {{.*#+}} ymm0 = -(ymm2 * ymm0) - ymm2 -; FMA-NOINFS-NEXT: vfnmsub213pd {{.*#+}} ymm1 = -(ymm3 * ymm1) - ymm3 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v8f64_mul_y_sub_negone_x: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfnmsubpd {{.*#+}} ymm0 = -(ymm0 * ymm2) - ymm2 -; FMA4-NOINFS-NEXT: vfnmsubpd {{.*#+}} ymm1 = -(ymm1 * ymm3) - ymm3 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v8f64_mul_y_sub_negone_x: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v8f64_mul_y_sub_negone_x: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-NEXT: vsubpd %ymm1, %ymm4, %ymm1 +; FMA-NEXT: vsubpd %ymm0, %ymm4, %ymm0 +; FMA-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; FMA-NEXT: vmulpd %ymm1, %ymm3, %ymm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v8f64_mul_y_sub_negone_x: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-NEXT: vsubpd %ymm1, %ymm4, %ymm1 +; FMA4-NEXT: vsubpd %ymm0, %ymm4, %ymm0 +; FMA4-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; FMA4-NEXT: vmulpd %ymm1, %ymm3, %ymm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v8f64_mul_y_sub_negone_x: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastsd {{.*#+}} zmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; AVX512-NEXT: vsubpd %zmm0, %zmm2, %zmm0 +; AVX512-NEXT: vmulpd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: retq %s = fsub contract <8 x double> , %x %m = fmul contract <8 x double> %y, %s ret <8 x double> %m } +define <8 x double> @test_v8f64_mul_y_sub_negone_x_ninf(<8 x double> %x, <8 x double> %y) { +; FMA-LABEL: test_v8f64_mul_y_sub_negone_x_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfnmsub213pd {{.*#+}} ymm0 = -(ymm2 * ymm0) - ymm2 +; FMA-NEXT: vfnmsub213pd {{.*#+}} ymm1 = -(ymm3 * ymm1) - ymm3 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v8f64_mul_y_sub_negone_x_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfnmsubpd {{.*#+}} ymm0 = -(ymm0 * ymm2) - ymm2 +; FMA4-NEXT: vfnmsubpd {{.*#+}} ymm1 = -(ymm1 * ymm3) - ymm3 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v8f64_mul_y_sub_negone_x_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm1 +; AVX512-NEXT: retq + %s = fsub contract ninf <8 x double> , %x + %m = fmul contract ninf <8 x double> %y, %s + ret <8 x double> %m +} + define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> %y) { -; FMA-INFS-LABEL: test_v16f32_mul_sub_x_one_y: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 -; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 -; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; FMA-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_one_y: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 -; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 -; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; FMA4-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v16f32_mul_sub_x_one_y: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512-INFS-NEXT: vmulps %zmm1, %zmm0, %zmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v16f32_mul_sub_x_one_y: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm2 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) - ymm3 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v16f32_mul_sub_x_one_y: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm2 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm3) - ymm3 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v16f32_mul_sub_x_one_y: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v16f32_mul_sub_x_one_y: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-NEXT: vaddps %ymm4, %ymm1, %ymm1 +; FMA-NEXT: vaddps %ymm4, %ymm0, %ymm0 +; FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; FMA-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v16f32_mul_sub_x_one_y: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-NEXT: vaddps %ymm4, %ymm1, %ymm1 +; FMA4-NEXT: vaddps %ymm4, %ymm0, %ymm0 +; FMA4-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; FMA4-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v16f32_mul_sub_x_one_y: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq %s = fsub contract <16 x float> %x, %m = fmul contract <16 x float> %s, %y ret <16 x float> %m } +define <16 x float> @test_v16f32_mul_sub_x_one_y_ninf(<16 x float> %x, <16 x float> %y) { +; FMA-LABEL: test_v16f32_mul_sub_x_one_y_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm2 +; FMA-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) - ymm3 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v16f32_mul_sub_x_one_y_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm2 +; FMA4-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm3) - ymm3 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v16f32_mul_sub_x_one_y_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm1 +; AVX512-NEXT: retq + %s = fsub contract ninf <16 x float> %x, + %m = fmul contract ninf <16 x float> %s, %y + ret <16 x float> %m +} + define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y) { -; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_one: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 -; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 -; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 -; FMA-INFS-NEXT: vmulpd %ymm1, %ymm3, %ymm1 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_one: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] -; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 -; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 -; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 -; FMA4-INFS-NEXT: vmulpd %ymm1, %ymm3, %ymm1 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v8f64_mul_y_sub_x_one: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512-INFS-NEXT: vmulpd %zmm0, %zmm1, %zmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v8f64_mul_y_sub_x_one: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm2 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) - ymm3 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v8f64_mul_y_sub_x_one: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm2 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm1 = (ymm1 * ymm3) - ymm3 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v8f64_mul_y_sub_x_one: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v8f64_mul_y_sub_x_one: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; FMA-NEXT: vaddpd %ymm4, %ymm0, %ymm0 +; FMA-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; FMA-NEXT: vmulpd %ymm1, %ymm3, %ymm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v8f64_mul_y_sub_x_one: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; FMA4-NEXT: vaddpd %ymm4, %ymm0, %ymm0 +; FMA4-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; FMA4-NEXT: vmulpd %ymm1, %ymm3, %ymm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v8f64_mul_y_sub_x_one: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NEXT: vmulpd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: retq %s = fsub contract <8 x double> %x, %m = fmul contract <8 x double> %y, %s ret <8 x double> %m } +define <8 x double> @test_v8f64_mul_y_sub_x_one_ninf(<8 x double> %x, <8 x double> %y) { +; FMA-LABEL: test_v8f64_mul_y_sub_x_one_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm2 +; FMA-NEXT: vfmsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) - ymm3 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v8f64_mul_y_sub_x_one_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm2 +; FMA4-NEXT: vfmsubpd {{.*#+}} ymm1 = (ymm1 * ymm3) - ymm3 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v8f64_mul_y_sub_x_one_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm1 +; AVX512-NEXT: retq + %s = fsub contract ninf <8 x double> %x, + %m = fmul contract ninf <8 x double> %y, %s + ret <8 x double> %m +} + define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float> %y) { -; FMA-INFS-LABEL: test_v16f32_mul_sub_x_negone_y: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 -; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 -; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; FMA-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_negone_y: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 -; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 -; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; FMA4-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v16f32_mul_sub_x_negone_y: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512-INFS-NEXT: vmulps %zmm1, %zmm0, %zmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v16f32_mul_sub_x_negone_y: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm2 -; FMA-NOINFS-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm3 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v16f32_mul_sub_x_negone_y: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 -; FMA4-NOINFS-NEXT: vfmaddps {{.*#+}} ymm1 = (ymm1 * ymm3) + ymm3 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v16f32_mul_sub_x_negone_y: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v16f32_mul_sub_x_negone_y: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-NEXT: vaddps %ymm4, %ymm1, %ymm1 +; FMA-NEXT: vaddps %ymm4, %ymm0, %ymm0 +; FMA-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; FMA-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v16f32_mul_sub_x_negone_y: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-NEXT: vaddps %ymm4, %ymm1, %ymm1 +; FMA4-NEXT: vaddps %ymm4, %ymm0, %ymm0 +; FMA4-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; FMA4-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v16f32_mul_sub_x_negone_y: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: retq %s = fsub contract <16 x float> %x, %m = fmul contract <16 x float> %s, %y ret <16 x float> %m } +define <16 x float> @test_v16f32_mul_sub_x_negone_y_ninf(<16 x float> %x, <16 x float> %y) { +; FMA-LABEL: test_v16f32_mul_sub_x_negone_y_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm2 +; FMA-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm3 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v16f32_mul_sub_x_negone_y_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 +; FMA4-NEXT: vfmaddps {{.*#+}} ymm1 = (ymm1 * ymm3) + ymm3 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v16f32_mul_sub_x_negone_y_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm1 +; AVX512-NEXT: retq + %s = fsub contract ninf <16 x float> %x, + %m = fmul contract ninf <16 x float> %s, %y + ret <16 x float> %m +} + define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double> %y) { -; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_negone: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 -; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 -; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 -; FMA-INFS-NEXT: vmulpd %ymm1, %ymm3, %ymm1 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_negone: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 -; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 -; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 -; FMA4-INFS-NEXT: vmulpd %ymm1, %ymm3, %ymm1 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v8f64_mul_y_sub_x_negone: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512-INFS-NEXT: vmulpd %zmm0, %zmm1, %zmm0 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v8f64_mul_y_sub_x_negone: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm2 -; FMA-NOINFS-NEXT: vfmadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm3 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v8f64_mul_y_sub_x_negone: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 -; FMA4-NOINFS-NEXT: vfmaddpd {{.*#+}} ymm1 = (ymm1 * ymm3) + ymm3 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v8f64_mul_y_sub_x_negone: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v8f64_mul_y_sub_x_negone: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; FMA-NEXT: vaddpd %ymm4, %ymm0, %ymm0 +; FMA-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; FMA-NEXT: vmulpd %ymm1, %ymm3, %ymm1 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v8f64_mul_y_sub_x_negone: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-NEXT: vaddpd %ymm4, %ymm1, %ymm1 +; FMA4-NEXT: vaddpd %ymm4, %ymm0, %ymm0 +; FMA4-NEXT: vmulpd %ymm0, %ymm2, %ymm0 +; FMA4-NEXT: vmulpd %ymm1, %ymm3, %ymm1 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v8f64_mul_y_sub_x_negone: +; AVX512: # %bb.0: +; AVX512-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NEXT: vmulpd %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: retq %s = fsub contract <8 x double> %x, %m = fmul contract <8 x double> %y, %s ret <8 x double> %m } +define <8 x double> @test_v8f64_mul_y_sub_x_negone_ninf(<8 x double> %x, <8 x double> %y) { +; FMA-LABEL: test_v8f64_mul_y_sub_x_negone_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm2 +; FMA-NEXT: vfmadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm3 +; FMA-NEXT: retq ; -; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y)) +; FMA4-LABEL: test_v8f64_mul_y_sub_x_negone_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2 +; FMA4-NEXT: vfmaddpd {{.*#+}} ymm1 = (ymm1 * ymm3) + ymm3 +; FMA4-NEXT: retq ; +; AVX512-LABEL: test_v8f64_mul_y_sub_x_negone_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm1 +; AVX512-NEXT: retq + %s = fsub contract ninf <8 x double> %x, + %m = fmul contract ninf <8 x double> %y, %s + ret <8 x double> %m +} + define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x float> %t) { -; FMA-INFS-LABEL: test_v16f32_interp: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-INFS-NEXT: vsubps %ymm4, %ymm6, %ymm7 -; FMA-INFS-NEXT: vsubps %ymm5, %ymm6, %ymm6 -; FMA-INFS-NEXT: vmulps %ymm6, %ymm3, %ymm3 -; FMA-INFS-NEXT: vmulps %ymm7, %ymm2, %ymm2 -; FMA-INFS-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm4 * ymm0) + ymm2 -; FMA-INFS-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm5 * ymm1) + ymm3 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v16f32_interp: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA4-INFS-NEXT: vsubps %ymm4, %ymm6, %ymm7 -; FMA4-INFS-NEXT: vsubps %ymm5, %ymm6, %ymm6 -; FMA4-INFS-NEXT: vmulps %ymm6, %ymm3, %ymm3 -; FMA4-INFS-NEXT: vmulps %ymm7, %ymm2, %ymm2 -; FMA4-INFS-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm2 -; FMA4-INFS-NEXT: vfmaddps {{.*#+}} ymm1 = (ymm1 * ymm5) + ymm3 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v16f32_interp: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} zmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-INFS-NEXT: vsubps %zmm2, %zmm3, %zmm3 -; AVX512-INFS-NEXT: vmulps %zmm3, %zmm1, %zmm1 -; AVX512-INFS-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm2 * zmm0) + zmm1 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v16f32_interp: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm2 -; FMA-NOINFS-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm5 * ymm1) - ymm3 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v16f32_interp: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm4) - ymm2 -; FMA4-NOINFS-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm5) - ymm3 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v16f32_interp: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} zmm1 = (zmm2 * zmm1) - zmm1 -; AVX512-NOINFS-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm2 * zmm0) - zmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v16f32_interp: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-NEXT: vsubps %ymm4, %ymm6, %ymm7 +; FMA-NEXT: vsubps %ymm5, %ymm6, %ymm6 +; FMA-NEXT: vmulps %ymm6, %ymm3, %ymm3 +; FMA-NEXT: vmulps %ymm7, %ymm2, %ymm2 +; FMA-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm4 * ymm0) + ymm2 +; FMA-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm5 * ymm1) + ymm3 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v16f32_interp: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-NEXT: vsubps %ymm4, %ymm6, %ymm7 +; FMA4-NEXT: vsubps %ymm5, %ymm6, %ymm6 +; FMA4-NEXT: vmulps %ymm6, %ymm3, %ymm3 +; FMA4-NEXT: vmulps %ymm7, %ymm2, %ymm2 +; FMA4-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm2 +; FMA4-NEXT: vfmaddps {{.*#+}} ymm1 = (ymm1 * ymm5) + ymm3 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v16f32_interp: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastss {{.*#+}} zmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512-NEXT: vsubps %zmm2, %zmm3, %zmm3 +; AVX512-NEXT: vmulps %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm2 * zmm0) + zmm1 +; AVX512-NEXT: retq %t1 = fsub contract nsz <16 x float> , %t %tx = fmul contract nsz <16 x float> %x, %t %ty = fmul contract nsz <16 x float> %y, %t1 @@ -875,58 +922,65 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x ret <16 x float> %r } +define <16 x float> @test_v16f32_interp_ninf(<16 x float> %x, <16 x float> %y, <16 x float> %t) { +; FMA-LABEL: test_v16f32_interp_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmsub213ps {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3 +; FMA-NEXT: vfmsub213ps {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2 +; FMA-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm2 +; FMA-NEXT: vfmsub213ps {{.*#+}} ymm1 = (ymm5 * ymm1) - ymm3 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v16f32_interp_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3 +; FMA4-NEXT: vfmsubps {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2 +; FMA4-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm4) - ymm2 +; FMA4-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm5) - ymm3 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v16f32_interp_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm1 = (zmm2 * zmm1) - zmm1 +; AVX512-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm2 * zmm0) - zmm1 +; AVX512-NEXT: retq + %t1 = fsub contract ninf nsz <16 x float> , %t + %tx = fmul contract ninf nsz <16 x float> %x, %t + %ty = fmul contract ninf nsz <16 x float> %y, %t1 + %r = fadd contract ninf nsz <16 x float> %tx, %ty + ret <16 x float> %r +} + define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x double> %t) { -; FMA-INFS-LABEL: test_v8f64_interp: -; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-INFS-NEXT: vsubpd %ymm4, %ymm6, %ymm7 -; FMA-INFS-NEXT: vsubpd %ymm5, %ymm6, %ymm6 -; FMA-INFS-NEXT: vmulpd %ymm6, %ymm3, %ymm3 -; FMA-INFS-NEXT: vmulpd %ymm7, %ymm2, %ymm2 -; FMA-INFS-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm4 * ymm0) + ymm2 -; FMA-INFS-NEXT: vfmadd213pd {{.*#+}} ymm1 = (ymm5 * ymm1) + ymm3 -; FMA-INFS-NEXT: retq -; -; FMA4-INFS-LABEL: test_v8f64_interp: -; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA4-INFS-NEXT: vsubpd %ymm4, %ymm6, %ymm7 -; FMA4-INFS-NEXT: vsubpd %ymm5, %ymm6, %ymm6 -; FMA4-INFS-NEXT: vmulpd %ymm6, %ymm3, %ymm3 -; FMA4-INFS-NEXT: vmulpd %ymm7, %ymm2, %ymm2 -; FMA4-INFS-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm2 -; FMA4-INFS-NEXT: vfmaddpd {{.*#+}} ymm1 = (ymm1 * ymm5) + ymm3 -; FMA4-INFS-NEXT: retq -; -; AVX512-INFS-LABEL: test_v8f64_interp: -; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vbroadcastsd {{.*#+}} zmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-INFS-NEXT: vsubpd %zmm2, %zmm3, %zmm3 -; AVX512-INFS-NEXT: vmulpd %zmm3, %zmm1, %zmm1 -; AVX512-INFS-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm2 * zmm0) + zmm1 -; AVX512-INFS-NEXT: retq -; -; FMA-NOINFS-LABEL: test_v8f64_interp: -; FMA-NOINFS: # %bb.0: -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm2 -; FMA-NOINFS-NEXT: vfmsub213pd {{.*#+}} ymm1 = (ymm5 * ymm1) - ymm3 -; FMA-NOINFS-NEXT: retq -; -; FMA4-NOINFS-LABEL: test_v8f64_interp: -; FMA4-NOINFS: # %bb.0: -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm4) - ymm2 -; FMA4-NOINFS-NEXT: vfmsubpd {{.*#+}} ymm1 = (ymm1 * ymm5) - ymm3 -; FMA4-NOINFS-NEXT: retq -; -; AVX512-NOINFS-LABEL: test_v8f64_interp: -; AVX512-NOINFS: # %bb.0: -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} zmm1 = (zmm2 * zmm1) - zmm1 -; AVX512-NOINFS-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm2 * zmm0) - zmm1 -; AVX512-NOINFS-NEXT: retq +; FMA-LABEL: test_v8f64_interp: +; FMA: # %bb.0: +; FMA-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-NEXT: vsubpd %ymm4, %ymm6, %ymm7 +; FMA-NEXT: vsubpd %ymm5, %ymm6, %ymm6 +; FMA-NEXT: vmulpd %ymm6, %ymm3, %ymm3 +; FMA-NEXT: vmulpd %ymm7, %ymm2, %ymm2 +; FMA-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm4 * ymm0) + ymm2 +; FMA-NEXT: vfmadd213pd {{.*#+}} ymm1 = (ymm5 * ymm1) + ymm3 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v8f64_interp: +; FMA4: # %bb.0: +; FMA4-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-NEXT: vsubpd %ymm4, %ymm6, %ymm7 +; FMA4-NEXT: vsubpd %ymm5, %ymm6, %ymm6 +; FMA4-NEXT: vmulpd %ymm6, %ymm3, %ymm3 +; FMA4-NEXT: vmulpd %ymm7, %ymm2, %ymm2 +; FMA4-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm2 +; FMA4-NEXT: vfmaddpd {{.*#+}} ymm1 = (ymm1 * ymm5) + ymm3 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v8f64_interp: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastsd {{.*#+}} zmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512-NEXT: vsubpd %zmm2, %zmm3, %zmm3 +; AVX512-NEXT: vmulpd %zmm3, %zmm1, %zmm1 +; AVX512-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm2 * zmm0) + zmm1 +; AVX512-NEXT: retq %t1 = fsub contract nsz <8 x double> , %t %tx = fmul contract nsz <8 x double> %x, %t %ty = fmul contract nsz <8 x double> %y, %t1 @@ -934,6 +988,35 @@ define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x do ret <8 x double> %r } +define <8 x double> @test_v8f64_interp_ninf(<8 x double> %x, <8 x double> %y, <8 x double> %t) { +; FMA-LABEL: test_v8f64_interp_ninf: +; FMA: # %bb.0: +; FMA-NEXT: vfmsub213pd {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3 +; FMA-NEXT: vfmsub213pd {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2 +; FMA-NEXT: vfmsub213pd {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm2 +; FMA-NEXT: vfmsub213pd {{.*#+}} ymm1 = (ymm5 * ymm1) - ymm3 +; FMA-NEXT: retq +; +; FMA4-LABEL: test_v8f64_interp_ninf: +; FMA4: # %bb.0: +; FMA4-NEXT: vfmsubpd {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3 +; FMA4-NEXT: vfmsubpd {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2 +; FMA4-NEXT: vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm4) - ymm2 +; FMA4-NEXT: vfmsubpd {{.*#+}} ymm1 = (ymm1 * ymm5) - ymm3 +; FMA4-NEXT: retq +; +; AVX512-LABEL: test_v8f64_interp_ninf: +; AVX512: # %bb.0: +; AVX512-NEXT: vfmsub213pd {{.*#+}} zmm1 = (zmm2 * zmm1) - zmm1 +; AVX512-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm2 * zmm0) - zmm1 +; AVX512-NEXT: retq + %t1 = fsub contract ninf nsz <8 x double> , %t + %tx = fmul contract ninf nsz <8 x double> %x, %t + %ty = fmul contract ninf nsz <8 x double> %y, %t1 + %r = fadd contract ninf nsz <8 x double> %tx, %ty + ret <8 x double> %r +} + ; ; Pattern: (fneg (fma x, y, z)) -> (fma x, -y, -z) ; @@ -1168,3 +1251,7 @@ define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> % } attributes #0 = { "unsafe-fp-math"="true" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX512-INFS: {{.*}} +; FMA-INFS: {{.*}} +; FMA4-INFS: {{.*}} From 189805a098fb9889ceedcec6885bef59ce810382 Mon Sep 17 00:00:00 2001 From: PaperChalice Date: Fri, 10 Oct 2025 18:59:58 +0800 Subject: [PATCH 4/5] use poison instead of undef --- llvm/test/CodeGen/X86/fma_patterns.ll | 120 +++++++++++++------------- 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll index e185d72635503..d16522eebcee2 100644 --- a/llvm/test/CodeGen/X86/fma_patterns.ll +++ b/llvm/test/CodeGen/X86/fma_patterns.ll @@ -642,45 +642,45 @@ define <4 x float> @test_v4f32_mul_y_add_x_one_ninf(<4 x float> %x, <4 x float> ret <4 x float> %m } -define <4 x float> @test_v4f32_mul_y_add_x_one_undefs(<4 x float> %x, <4 x float> %y) { -; FMA-LABEL: test_v4f32_mul_y_add_x_one_undefs: +define <4 x float> @test_v4f32_mul_y_add_x_one_poisons(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_add_x_one_poisons: ; FMA: # %bb.0: ; FMA-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA-NEXT: retq ; -; FMA4-LABEL: test_v4f32_mul_y_add_x_one_undefs: +; FMA4-LABEL: test_v4f32_mul_y_add_x_one_poisons: ; FMA4: # %bb.0: ; FMA4-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA4-NEXT: retq ; -; AVX512-LABEL: test_v4f32_mul_y_add_x_one_undefs: +; AVX512-LABEL: test_v4f32_mul_y_add_x_one_poisons: ; AVX512: # %bb.0: ; AVX512-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %a = fadd contract <4 x float> %x, + %a = fadd contract <4 x float> %x, %m = fmul contract <4 x float> %y, %a ret <4 x float> %m } -define <4 x float> @test_v4f32_mul_y_add_x_one_undefs_ninf(<4 x float> %x, <4 x float> %y) { -; FMA-LABEL: test_v4f32_mul_y_add_x_one_undefs_ninf: +define <4 x float> @test_v4f32_mul_y_add_x_one_poisons_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_add_x_one_poisons_ninf: ; FMA: # %bb.0: ; FMA-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 ; FMA-NEXT: retq ; -; FMA4-LABEL: test_v4f32_mul_y_add_x_one_undefs_ninf: +; FMA4-LABEL: test_v4f32_mul_y_add_x_one_poisons_ninf: ; FMA4: # %bb.0: ; FMA4-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 ; FMA4-NEXT: retq ; -; AVX512-LABEL: test_v4f32_mul_y_add_x_one_undefs_ninf: +; AVX512-LABEL: test_v4f32_mul_y_add_x_one_poisons_ninf: ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 ; AVX512-NEXT: retq - %a = fadd contract ninf<4 x float> %x, + %a = fadd contract ninf<4 x float> %x, %m = fmul contract ninf<4 x float> %y, %a ret <4 x float> %m } @@ -771,45 +771,45 @@ define <4 x float> @test_v4f32_mul_y_add_x_negone_ninf(<4 x float> %x, <4 x floa ret <4 x float> %m } -define <4 x float> @test_v4f32_mul_y_add_x_negone_undefs(<4 x float> %x, <4 x float> %y) { -; FMA-LABEL: test_v4f32_mul_y_add_x_negone_undefs: +define <4 x float> @test_v4f32_mul_y_add_x_negone_poisons(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_add_x_negone_poisons: ; FMA: # %bb.0: ; FMA-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA-NEXT: retq ; -; FMA4-LABEL: test_v4f32_mul_y_add_x_negone_undefs: +; FMA4-LABEL: test_v4f32_mul_y_add_x_negone_poisons: ; FMA4: # %bb.0: ; FMA4-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA4-NEXT: retq ; -; AVX512-LABEL: test_v4f32_mul_y_add_x_negone_undefs: +; AVX512-LABEL: test_v4f32_mul_y_add_x_negone_poisons: ; AVX512: # %bb.0: ; AVX512-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %a = fadd contract <4 x float> %x, + %a = fadd contract <4 x float> %x, %m = fmul contract <4 x float> %y, %a ret <4 x float> %m } -define <4 x float> @test_v4f32_mul_y_add_x_negone_undefs_ninf(<4 x float> %x, <4 x float> %y) { -; FMA-LABEL: test_v4f32_mul_y_add_x_negone_undefs_ninf: +define <4 x float> @test_v4f32_mul_y_add_x_negone_poisons_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_add_x_negone_poisons_ninf: ; FMA: # %bb.0: ; FMA-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 ; FMA-NEXT: retq ; -; FMA4-LABEL: test_v4f32_mul_y_add_x_negone_undefs_ninf: +; FMA4-LABEL: test_v4f32_mul_y_add_x_negone_poisons_ninf: ; FMA4: # %bb.0: ; FMA4-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm1 ; FMA4-NEXT: retq ; -; AVX512-LABEL: test_v4f32_mul_y_add_x_negone_undefs_ninf: +; AVX512-LABEL: test_v4f32_mul_y_add_x_negone_poisons_ninf: ; AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 ; AVX512-NEXT: retq - %a = fadd contract ninf<4 x float> %x, + %a = fadd contract ninf<4 x float> %x, %m = fmul contract ninf<4 x float> %y, %a ret <4 x float> %m } @@ -912,48 +912,48 @@ define <4 x float> @test_v4f32_mul_y_sub_one_x_ninf(<4 x float> %x, <4 x float> ret <4 x float> %m } -define <4 x float> @test_v4f32_mul_y_sub_one_x_undefs(<4 x float> %x, <4 x float> %y) { -; FMA-LABEL: test_v4f32_mul_y_sub_one_x_undefs: +define <4 x float> @test_v4f32_mul_y_sub_one_x_poisons(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_sub_one_x_poisons: ; FMA: # %bb.0: ; FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA-NEXT: retq ; -; FMA4-LABEL: test_v4f32_mul_y_sub_one_x_undefs: +; FMA4-LABEL: test_v4f32_mul_y_sub_one_x_poisons: ; FMA4: # %bb.0: ; FMA4-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA4-NEXT: retq ; -; AVX512-LABEL: test_v4f32_mul_y_sub_one_x_undefs: +; AVX512-LABEL: test_v4f32_mul_y_sub_one_x_poisons: ; AVX512: # %bb.0: ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX512-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %s = fsub contract <4 x float> , %x + %s = fsub contract <4 x float> , %x %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } -define <4 x float> @test_v4f32_mul_y_sub_one_x_undefs_ninf(<4 x float> %x, <4 x float> %y) { -; FMA-LABEL: test_v4f32_mul_y_sub_one_x_undefs_ninf: +define <4 x float> @test_v4f32_mul_y_sub_one_x_poisons_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_sub_one_x_poisons_ninf: ; FMA: # %bb.0: ; FMA-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 ; FMA-NEXT: retq ; -; FMA4-LABEL: test_v4f32_mul_y_sub_one_x_undefs_ninf: +; FMA4-LABEL: test_v4f32_mul_y_sub_one_x_poisons_ninf: ; FMA4: # %bb.0: ; FMA4-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 ; FMA4-NEXT: retq ; -; AVX512-LABEL: test_v4f32_mul_y_sub_one_x_undefs_ninf: +; AVX512-LABEL: test_v4f32_mul_y_sub_one_x_poisons_ninf: ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1 ; AVX512-NEXT: retq - %s = fsub contract ninf<4 x float> , %x + %s = fsub contract ninf<4 x float> , %x %m = fmul contract ninf<4 x float> %y, %s ret <4 x float> %m } @@ -1056,48 +1056,48 @@ define <4 x float> @test_v4f32_mul_y_sub_negone_x_ninf(<4 x float> %x, <4 x floa ret <4 x float> %m } -define <4 x float> @test_v4f32_mul_y_sub_negone_x_undefs(<4 x float> %x, <4 x float> %y) { -; FMA-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: +define <4 x float> @test_v4f32_mul_y_sub_negone_x_poisons(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_sub_negone_x_poisons: ; FMA: # %bb.0: ; FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA-NEXT: retq ; -; FMA4-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: +; FMA4-LABEL: test_v4f32_mul_y_sub_negone_x_poisons: ; FMA4: # %bb.0: ; FMA4-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA4-NEXT: retq ; -; AVX512-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: +; AVX512-LABEL: test_v4f32_mul_y_sub_negone_x_poisons: ; AVX512: # %bb.0: ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; AVX512-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %s = fsub contract <4 x float> , %x + %s = fsub contract <4 x float> , %x %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } -define <4 x float> @test_v4f32_mul_y_sub_negone_x_undefs_ninf(<4 x float> %x, <4 x float> %y) { -; FMA-LABEL: test_v4f32_mul_y_sub_negone_x_undefs_ninf: +define <4 x float> @test_v4f32_mul_y_sub_negone_x_poisons_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_sub_negone_x_poisons_ninf: ; FMA: # %bb.0: ; FMA-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1 ; FMA-NEXT: retq ; -; FMA4-LABEL: test_v4f32_mul_y_sub_negone_x_undefs_ninf: +; FMA4-LABEL: test_v4f32_mul_y_sub_negone_x_poisons_ninf: ; FMA4: # %bb.0: ; FMA4-NEXT: vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm1 ; FMA4-NEXT: retq ; -; AVX512-LABEL: test_v4f32_mul_y_sub_negone_x_undefs_ninf: +; AVX512-LABEL: test_v4f32_mul_y_sub_negone_x_poisons_ninf: ; AVX512: # %bb.0: ; AVX512-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1 ; AVX512-NEXT: retq - %s = fsub contract ninf<4 x float> , %x + %s = fsub contract ninf<4 x float> , %x %m = fmul contract ninf<4 x float> %y, %s ret <4 x float> %m } @@ -1188,45 +1188,45 @@ define <4 x float> @test_v4f32_mul_y_sub_x_one_ninf(<4 x float> %x, <4 x float> ret <4 x float> %m } -define <4 x float> @test_v4f32_mul_y_sub_x_one_undefs(<4 x float> %x, <4 x float> %y) { -; FMA-LABEL: test_v4f32_mul_y_sub_x_one_undefs: +define <4 x float> @test_v4f32_mul_y_sub_x_one_poisons(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_sub_x_one_poisons: ; FMA: # %bb.0: ; FMA-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA-NEXT: retq ; -; FMA4-LABEL: test_v4f32_mul_y_sub_x_one_undefs: +; FMA4-LABEL: test_v4f32_mul_y_sub_x_one_poisons: ; FMA4: # %bb.0: ; FMA4-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA4-NEXT: retq ; -; AVX512-LABEL: test_v4f32_mul_y_sub_x_one_undefs: +; AVX512-LABEL: test_v4f32_mul_y_sub_x_one_poisons: ; AVX512: # %bb.0: ; AVX512-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %s = fsub contract <4 x float> %x, + %s = fsub contract <4 x float> %x, %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } -define <4 x float> @test_v4f32_mul_y_sub_x_one_undefs_ninf(<4 x float> %x, <4 x float> %y) { -; FMA-LABEL: test_v4f32_mul_y_sub_x_one_undefs_ninf: +define <4 x float> @test_v4f32_mul_y_sub_x_one_poisons_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_sub_x_one_poisons_ninf: ; FMA: # %bb.0: ; FMA-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 ; FMA-NEXT: retq ; -; FMA4-LABEL: test_v4f32_mul_y_sub_x_one_undefs_ninf: +; FMA4-LABEL: test_v4f32_mul_y_sub_x_one_poisons_ninf: ; FMA4: # %bb.0: ; FMA4-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm1 ; FMA4-NEXT: retq ; -; AVX512-LABEL: test_v4f32_mul_y_sub_x_one_undefs_ninf: +; AVX512-LABEL: test_v4f32_mul_y_sub_x_one_poisons_ninf: ; AVX512: # %bb.0: ; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1 ; AVX512-NEXT: retq - %s = fsub contract ninf<4 x float> %x, + %s = fsub contract ninf<4 x float> %x, %m = fmul contract ninf<4 x float> %y, %s ret <4 x float> %m } @@ -1317,45 +1317,45 @@ define <4 x float> @test_v4f32_mul_y_sub_x_negone_ninf(<4 x float> %x, <4 x floa ret <4 x float> %m } -define <4 x float> @test_v4f32_mul_y_sub_x_negone_undefs(<4 x float> %x, <4 x float> %y) { -; FMA-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: +define <4 x float> @test_v4f32_mul_y_sub_x_negone_poisons(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_sub_x_negone_poisons: ; FMA: # %bb.0: ; FMA-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA-NEXT: retq ; -; FMA4-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: +; FMA4-LABEL: test_v4f32_mul_y_sub_x_negone_poisons: ; FMA4: # %bb.0: ; FMA4-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; FMA4-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA4-NEXT: retq ; -; AVX512-LABEL: test_v4f32_mul_y_sub_x_negone_undefs: +; AVX512-LABEL: test_v4f32_mul_y_sub_x_negone_poisons: ; AVX512: # %bb.0: ; AVX512-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq - %s = fsub contract <4 x float> %x, + %s = fsub contract <4 x float> %x, %m = fmul contract <4 x float> %y, %s ret <4 x float> %m } -define <4 x float> @test_v4f32_mul_y_sub_x_negone_undefs_ninf(<4 x float> %x, <4 x float> %y) { -; FMA-LABEL: test_v4f32_mul_y_sub_x_negone_undefs_ninf: +define <4 x float> @test_v4f32_mul_y_sub_x_negone_poisons_ninf(<4 x float> %x, <4 x float> %y) { +; FMA-LABEL: test_v4f32_mul_y_sub_x_negone_poisons_ninf: ; FMA: # %bb.0: ; FMA-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 ; FMA-NEXT: retq ; -; FMA4-LABEL: test_v4f32_mul_y_sub_x_negone_undefs_ninf: +; FMA4-LABEL: test_v4f32_mul_y_sub_x_negone_poisons_ninf: ; FMA4: # %bb.0: ; FMA4-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 ; FMA4-NEXT: retq ; -; AVX512-LABEL: test_v4f32_mul_y_sub_x_negone_undefs_ninf: +; AVX512-LABEL: test_v4f32_mul_y_sub_x_negone_poisons_ninf: ; AVX512: # %bb.0: ; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1 ; AVX512-NEXT: retq - %s = fsub contract ninf<4 x float> %x, + %s = fsub contract ninf<4 x float> %x, %m = fmul contract ninf<4 x float> %y, %s ret <4 x float> %m } From 2438adcdf914e87525f8f0ade6c882937a2e397b Mon Sep 17 00:00:00 2001 From: PaperChalice Date: Sat, 11 Oct 2025 13:30:29 +0800 Subject: [PATCH 5/5] remove hasPoisonGeneratingFlags covered check --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 3d963103d7469..67e4f1911328a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5767,11 +5767,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, // even if the nonan flag is dropped somewhere. unsigned CCOp = Opcode == ISD::SETCC ? 2 : 4; ISD::CondCode CCCode = cast(Op.getOperand(CCOp))->get(); - if (((unsigned)CCCode & 0x10U)) - return true; - - const TargetOptions &Options = getTarget().Options; - return Options.NoNaNsFPMath || Op->getFlags().hasNoInfs(); + return ((unsigned)CCCode & 0x10U); } case ISD::OR: