diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 1ef5dc2863eb6..3ee6aeb34c838 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10789,6 +10789,10 @@ static SDValue combineShiftToMULH(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, SDValue LeftOp = ShiftOperand.getOperand(0); SDValue RightOp = ShiftOperand.getOperand(1); + if (LeftOp.getOpcode() != ISD::SIGN_EXTEND && + LeftOp.getOpcode() != ISD::ZERO_EXTEND) + std::swap(LeftOp, RightOp); + bool IsSignExt = LeftOp.getOpcode() == ISD::SIGN_EXTEND; bool IsZeroExt = LeftOp.getOpcode() == ISD::ZERO_EXTEND; @@ -10821,18 +10825,17 @@ static SDValue combineShiftToMULH(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, } SDValue MulhRightOp; - if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) { - unsigned ActiveBits = IsSignExt - ? Constant->getAPIntValue().getSignificantBits() - : Constant->getAPIntValue().getActiveBits(); - if (ActiveBits > NarrowVTSize) + if (LeftOp.getOpcode() != RightOp.getOpcode()) { + if (IsZeroExt && ShiftOperand.hasOneUse() && + DAG.computeKnownBits(RightOp).countMaxActiveBits() <= NarrowVTSize) { + MulhRightOp = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, RightOp); + } else if (IsSignExt && ShiftOperand.hasOneUse() && + DAG.ComputeMaxSignificantBits(RightOp) <= NarrowVTSize) { + MulhRightOp = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, RightOp); + } else { return SDValue(); - MulhRightOp = DAG.getConstant( - Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL, - NarrowVT); + } } else { - if (LeftOp.getOpcode() != RightOp.getOpcode()) - return SDValue(); // Check that the two extend nodes are the same type. if (NarrowVT != RightOp.getOperand(0).getValueType()) return SDValue(); diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 71f5a94a7f245..2d2468ea1c5e6 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -571,7 +571,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_lo_u32 v3, v3, v2 ; GCN-NEXT: v_mul_hi_u32 v3, v2, v3 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_mul_hi_u32 v2, v1, v2 +; GCN-NEXT: v_mul_hi_u32 v2, v2, v1 ; GCN-NEXT: v_mul_u32_u24_e32 v3, v2, v0 ; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v2 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 @@ -598,7 +598,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v2 ; GCN-IR-NEXT: v_mul_hi_u32 v3, v2, v3 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-IR-NEXT: v_mul_hi_u32 v2, v1, v2 +; GCN-IR-NEXT: v_mul_hi_u32 v2, v2, v1 ; GCN-IR-NEXT: v_mul_u32_u24_e32 v3, v2, v0 ; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 1, v2 ; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index fd461ac80ea55..6bc5577aec407 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -515,7 +515,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s3 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: s_mul_i32 s0, s0, s8 ; GCN-NEXT: s_sub_i32 s0, s3, s0 @@ -551,7 +551,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_mov_b32 
s5, s1 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s3 ; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-IR-NEXT: s_mul_i32 s0, s0, s8 ; GCN-IR-NEXT: s_sub_i32 s0, s3, s0 @@ -595,7 +595,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-NEXT: s_lshr_b32 s2, s3, 1 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s2 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: s_mul_i32 s0, s0, s8 @@ -633,7 +633,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s2 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-IR-NEXT: s_mul_i32 s0, s0, s8 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 137dc1fe42294..ad601d8e75973 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -467,7 +467,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-NEXT: s_lshr_b32 s2, s3, 1 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s2 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 @@ -502,7 +502,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-IR-NEXT: s_lshr_b32 s2, s3, 1 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s2 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_readfirstlane_b32 s0, v0 @@ -544,7 +544,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-NEXT: s_lshr_b32 s1, s9, 1 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_readfirstlane_b32 s2, v0 @@ -562,7 +562,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_hi_u32 v0, v1, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_mul_hi_u32 v2, s7, v0 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s7 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v3, v1 @@ -599,7 +599,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-IR-NEXT: s_lshr_b32 s1, s9, 1 ; GCN-IR-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s1, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-IR-NEXT: v_readfirstlane_b32 s2, v0 @@ -617,7 +617,7 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_mul_hi_u32 v0, v1, v0 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-IR-NEXT: v_mul_hi_u32 v2, s7, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v2, v0, s7 ; GCN-IR-NEXT: 
v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 @@ -728,7 +728,7 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GCN-NEXT: v_mul_lo_u32 v1, v1, s4 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s7 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0x7fffff, v1 ; GCN-NEXT: v_readfirstlane_b32 s4, v0 @@ -775,7 +775,7 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6 ; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, s4 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s7, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, v0, s7 ; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 ; GCN-IR-NEXT: v_and_b32_e32 v2, 0x7fffff, v1 ; GCN-IR-NEXT: v_readfirstlane_b32 s4, v0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmulhu-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vmulhu-sdnode.ll index 3fd7f5be860cf..c0c9b1797f91f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmulhu-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmulhu-sdnode.ll @@ -48,18 +48,11 @@ define @vmulhu_vi_nxv1i32_0( %va) { } define @vmulhu_vi_nxv1i32_1( %va) { -; RV32-LABEL: vmulhu_vi_nxv1i32_1: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e32, mf2, ta, ma -; RV32-NEXT: vsrl.vi v8, v8, 28 -; RV32-NEXT: ret -; -; RV64-LABEL: vmulhu_vi_nxv1i32_1: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 16 -; RV64-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; RV64-NEXT: vmulhu.vx v8, v8, a0 -; RV64-NEXT: ret +; CHECK-LABEL: vmulhu_vi_nxv1i32_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsrl.vi v8, v8, 28 +; CHECK-NEXT: ret %vb = zext splat (i32 16) to %vc = zext %va to %vd = mul %vb, %vc @@ -114,18 +107,11 @@ define @vmulhu_vi_nxv2i32_0( %va) { } define @vmulhu_vi_nxv2i32_1( %va) { -; RV32-LABEL: vmulhu_vi_nxv2i32_1: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma -; RV32-NEXT: vsrl.vi v8, v8, 28 -; RV32-NEXT: ret -; -; RV64-LABEL: vmulhu_vi_nxv2i32_1: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 16 -; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV64-NEXT: vmulhu.vx v8, v8, a0 -; RV64-NEXT: ret +; CHECK-LABEL: vmulhu_vi_nxv2i32_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vsrl.vi v8, v8, 28 +; CHECK-NEXT: ret %vb = zext splat (i32 16) to %vc = zext %va to %vd = mul %vb, %vc @@ -180,18 +166,11 @@ define @vmulhu_vi_nxv4i32_0( %va) { } define @vmulhu_vi_nxv4i32_1( %va) { -; RV32-LABEL: vmulhu_vi_nxv4i32_1: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; RV32-NEXT: vsrl.vi v8, v8, 28 -; RV32-NEXT: ret -; -; RV64-LABEL: vmulhu_vi_nxv4i32_1: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 16 -; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV64-NEXT: vmulhu.vx v8, v8, a0 -; RV64-NEXT: ret +; CHECK-LABEL: vmulhu_vi_nxv4i32_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vsrl.vi v8, v8, 28 +; CHECK-NEXT: ret %vb = zext splat (i32 16) to %vc = zext %va to %vd = mul %vb, %vc @@ -246,18 +225,11 @@ define @vmulhu_vi_nxv8i32_0( %va) { } define @vmulhu_vi_nxv8i32_1( %va) { -; RV32-LABEL: vmulhu_vi_nxv8i32_1: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e32, m4, ta, ma -; RV32-NEXT: vsrl.vi v8, v8, 28 -; RV32-NEXT: ret -; -; RV64-LABEL: vmulhu_vi_nxv8i32_1: -; RV64: # %bb.0: -; RV64-NEXT: li a0, 16 -; RV64-NEXT: vsetvli a1, zero, e32, m4, ta, 
ma -; RV64-NEXT: vmulhu.vx v8, v8, a0 -; RV64-NEXT: ret +; CHECK-LABEL: vmulhu_vi_nxv8i32_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vsrl.vi v8, v8, 28 +; CHECK-NEXT: ret %vb = zext splat (i32 16) to %vc = zext %va to %vd = mul %vb, %vc @@ -265,3 +237,6 @@ define @vmulhu_vi_nxv8i32_1( %va) { %vf = trunc %ve to ret %vf } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} diff --git a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll index 32648b6b449a8..8d8e5e9f48ab8 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll @@ -793,23 +793,11 @@ entry: define arm_aapcs_vfpcc <4 x i32> @vmulhs_kb_v4i32(<4 x i32> %s0, <4 x i64> %s1) { ; CHECK-LABEL: vmulhs_kb_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov r1, s9 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov.f32 s6, s3 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: smmul r0, r0, r1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: smmul r1, r1, r2 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s11 -; CHECK-NEXT: smmul r0, r0, r1 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: smmul r1, r1, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vmov.f32 s6, s9 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmulh.s32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = sext <4 x i32> %s0 to <4 x i64> @@ -823,23 +811,11 @@ entry: define arm_aapcs_vfpcc <4 x i32> @vmulhu_kb_v4i32(<4 x i32> %s0, <4 x i64> %s1) { ; CHECK-LABEL: vmulhu_kb_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov r1, s9 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov.f32 s6, s3 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: umull r0, r1, r0, r1 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: umull r0, r2, r0, r2 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 -; CHECK-NEXT: vmov r1, s11 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: umull r0, r1, r0, r1 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: umull r0, r2, r0, r2 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vmov.f32 s6, s9 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmulh.u32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = zext <4 x i32> %s0 to <4 x i64> @@ -853,23 +829,11 @@ entry: define arm_aapcs_vfpcc <4 x i32> @vmulhs_kbc_v4i32(<4 x i32> %s0, <4 x i64> %s1) { ; CHECK-LABEL: vmulhs_kbc_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov r1, s9 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov.f32 s6, s3 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: smmul r0, r1, r0 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: smmul r1, r2, r1 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s11 -; CHECK-NEXT: smmul r0, r1, r0 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: smmul r1, r2, r1 -; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vmov.f32 s6, s9 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmulh.s32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = sext <4 x i32> %s0 to <4 x i64> @@ -883,23 +847,11 @@ entry: define arm_aapcs_vfpcc <4 x 
i32> @vmulhu_kbc_v4i32(<4 x i32> %s0, <4 x i64> %s1) { ; CHECK-LABEL: vmulhu_kbc_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vmov r1, s9 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vmov.f32 s6, s3 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: umull r0, r2, r2, r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 -; CHECK-NEXT: vmov r1, s11 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: umull r0, r2, r2, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vmov.f32 s6, s9 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmulh.u32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = zext <4 x i32> %s0 to <4 x i64> @@ -913,25 +865,17 @@ entry: define arm_aapcs_vfpcc <8 x i16> @vmulhs_kb_v8i16(<8 x i16> %s0, <8 x i32> %s1) { ; CHECK-LABEL: vmulhs_kb_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmovlt.s16 q4, q0 ; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: vmov.f32 s5, s6 ; CHECK-NEXT: vmov.f32 s14, s9 ; CHECK-NEXT: vmov.f32 s15, s11 ; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: vshr.s32 q3, q3, #16 -; CHECK-NEXT: vmov.f32 s7, s10 -; CHECK-NEXT: vmul.i32 q3, q4, q3 -; CHECK-NEXT: vshr.s32 q1, q1, #16 ; CHECK-NEXT: vshr.u32 q3, q3, #16 -; CHECK-NEXT: vmul.i32 q0, q0, q1 -; CHECK-NEXT: vshr.u32 q0, q0, #16 -; CHECK-NEXT: vmovnt.i32 q0, q3 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vshr.u32 q1, q1, #16 +; CHECK-NEXT: vmovnt.i32 q1, q3 +; CHECK-NEXT: vmulh.s16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = sext <8 x i16> %s0 to <8 x i32> @@ -945,25 +889,17 @@ entry: define arm_aapcs_vfpcc <8 x i16> @vmulhu_kb_v8i16(<8 x i16> %s0, <8 x i32> %s1) { ; CHECK-LABEL: vmulhu_kb_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmovlt.u16 q4, q0 ; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmovlb.u16 q0, q0 ; CHECK-NEXT: vmov.f32 s5, s6 ; CHECK-NEXT: vmov.f32 s14, s9 ; CHECK-NEXT: vmov.f32 s15, s11 ; CHECK-NEXT: vmov.f32 s6, s8 ; CHECK-NEXT: vshr.u32 q3, q3, #16 ; CHECK-NEXT: vmov.f32 s7, s10 -; CHECK-NEXT: vmul.i32 q3, q4, q3 ; CHECK-NEXT: vshr.u32 q1, q1, #16 -; CHECK-NEXT: vshr.u32 q3, q3, #16 -; CHECK-NEXT: vmul.i32 q0, q0, q1 -; CHECK-NEXT: vshr.u32 q0, q0, #16 -; CHECK-NEXT: vmovnt.i32 q0, q3 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vmovnt.i32 q1, q3 +; CHECK-NEXT: vmulh.u16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = zext <8 x i16> %s0 to <8 x i32> @@ -977,25 +913,17 @@ entry: define arm_aapcs_vfpcc <8 x i16> @vmulhs_kbc_v8i16(<8 x i16> %s0, <8 x i32> %s1) { ; CHECK-LABEL: vmulhs_kbc_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmovlt.s16 q4, q0 ; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: vmov.f32 s5, s6 ; CHECK-NEXT: vmov.f32 s14, s9 ; CHECK-NEXT: vmov.f32 s15, s11 ; CHECK-NEXT: vmov.f32 s6, s8 -; CHECK-NEXT: vshr.s32 q3, q3, #16 -; CHECK-NEXT: vmov.f32 s7, s10 -; CHECK-NEXT: vmul.i32 q3, q3, q4 -; CHECK-NEXT: vshr.s32 q1, q1, #16 ; CHECK-NEXT: vshr.u32 q3, q3, #16 -; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: vshr.u32 q0, q0, #16 -; CHECK-NEXT: vmovnt.i32 q0, 
q3 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vshr.u32 q1, q1, #16 +; CHECK-NEXT: vmovnt.i32 q1, q3 +; CHECK-NEXT: vmulh.s16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = sext <8 x i16> %s0 to <8 x i32> @@ -1009,25 +937,17 @@ entry: define arm_aapcs_vfpcc <8 x i16> @vmulhu_kbc_v8i16(<8 x i16> %s0, <8 x i32> %s1) { ; CHECK-LABEL: vmulhu_kbc_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmovlt.u16 q4, q0 ; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmovlb.u16 q0, q0 ; CHECK-NEXT: vmov.f32 s5, s6 ; CHECK-NEXT: vmov.f32 s14, s9 ; CHECK-NEXT: vmov.f32 s15, s11 ; CHECK-NEXT: vmov.f32 s6, s8 ; CHECK-NEXT: vshr.u32 q3, q3, #16 ; CHECK-NEXT: vmov.f32 s7, s10 -; CHECK-NEXT: vmul.i32 q3, q3, q4 ; CHECK-NEXT: vshr.u32 q1, q1, #16 -; CHECK-NEXT: vshr.u32 q3, q3, #16 -; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: vshr.u32 q0, q0, #16 -; CHECK-NEXT: vmovnt.i32 q0, q3 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vmovnt.i32 q1, q3 +; CHECK-NEXT: vmulh.u16 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = zext <8 x i16> %s0 to <8 x i32> diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll index ff5329c637251..0b5ef7e3a514c 100644 --- a/llvm/test/CodeGen/X86/combine-pmuldq.ll +++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll @@ -330,22 +330,27 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) { ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; SSE-NEXT: pxor %xmm0, %xmm0 ; SSE-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0,0] ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: .p2align 4 ; SSE-NEXT: .LBB7_1: # %loop ; SSE-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE-NEXT: pmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; SSE-NEXT: pmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero -; SSE-NEXT: pmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero -; SSE-NEXT: pmovzxdq {{.*#+}} xmm6 = mem[0],zero,mem[1],zero -; SSE-NEXT: pmuludq %xmm2, %xmm6 +; SSE-NEXT: movdqu 2097152(%rdi,%rax), %xmm4 +; SSE-NEXT: movdqu 2097168(%rdi,%rax), %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE-NEXT: pmuludq %xmm3, %xmm6 ; SSE-NEXT: pmuludq %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm6[1,3] -; SSE-NEXT: paddd %xmm5, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; SSE-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] +; SSE-NEXT: paddd %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; SSE-NEXT: pmuludq %xmm3, %xmm5 ; SSE-NEXT: pmuludq %xmm2, %xmm4 -; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm3[1,3] -; SSE-NEXT: paddd %xmm4, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] +; SSE-NEXT: paddd %xmm4, %xmm0 ; SSE-NEXT: subq $-128, %rax ; SSE-NEXT: jne .LBB7_1 ; SSE-NEXT: # %bb.2: # %end @@ -356,27 +361,33 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) { ; AVX1-NEXT: movl %esi, %eax ; AVX1-NEXT: vmovq %rax, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 +; 
AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm4[1],xmm5[1] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm5[0] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm4[1],xmm5[1] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; AVX1-NEXT: .p2align 4 ; AVX1-NEXT: .LBB7_1: # %loop ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = mem[0],zero,mem[1],zero -; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm6 -; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5 -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3],xmm6[1,3] -; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3],xmm4[1,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqu 2097152(%rdi,%rax), %xmm5 +; AVX1-NEXT: vmovdqu 2097168(%rdi,%rax), %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm1, %xmm7, %xmm7 +; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5],xmm7[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; AVX1-NEXT: vpmuludq %xmm3, %xmm7, %xmm7 +; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 +; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 ; AVX1-NEXT: subq $-128, %rax ; AVX1-NEXT: jne .LBB7_1 ; AVX1-NEXT: # %bb.2: # %end @@ -389,16 +400,19 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) { ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm1 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] ; AVX2-NEXT: .p2align 4 ; AVX2-NEXT: .LBB7_1: # %loop ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm3 -; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovdqu 2097152(%rdi,%rax), %ymm3 +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[1,1,3,3,5,5,7,7] +; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm4 +; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3 +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] +; AVX2-NEXT: vpaddd %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: subq $-128, %rax ; AVX2-NEXT: jne .LBB7_1 ; AVX2-NEXT: # %bb.2: # %end @@ -410,14 +424,18 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) { ; AVX512VL-NEXT: vpbroadcastq %rax, %zmm1 ; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: movq $-2097152, %rax # imm = 
0xFFE00000 +; AVX512VL-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] ; AVX512VL-NEXT: .p2align 4 ; AVX512VL-NEXT: .LBB7_1: # %loop ; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX512VL-NEXT: vpmuludq %zmm2, %zmm1, %zmm2 -; AVX512VL-NEXT: vpsrlq $32, %zmm2, %zmm2 -; AVX512VL-NEXT: vpmovqd %zmm2, %ymm2 -; AVX512VL-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vmovdqu 2097152(%rdi,%rax), %ymm3 +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[1,1,3,3,5,5,7,7] +; AVX512VL-NEXT: vpmuludq %ymm2, %ymm4, %ymm4 +; AVX512VL-NEXT: vpmuludq %ymm1, %ymm3, %ymm3 +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] +; AVX512VL-NEXT: vpaddd %ymm0, %ymm3, %ymm0 ; AVX512VL-NEXT: subq $-128, %rax ; AVX512VL-NEXT: jne .LBB7_1 ; AVX512VL-NEXT: # %bb.2: # %end @@ -429,14 +447,18 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) { ; AVX512DQVL-NEXT: vpbroadcastq %rax, %zmm1 ; AVX512DQVL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512DQVL-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; AVX512DQVL-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] ; AVX512DQVL-NEXT: .p2align 4 ; AVX512DQVL-NEXT: .LBB7_1: # %loop ; AVX512DQVL-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512DQVL-NEXT: vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX512DQVL-NEXT: vpmuludq %zmm2, %zmm1, %zmm2 -; AVX512DQVL-NEXT: vpsrlq $32, %zmm2, %zmm2 -; AVX512DQVL-NEXT: vpmovqd %zmm2, %ymm2 -; AVX512DQVL-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX512DQVL-NEXT: vmovdqu 2097152(%rdi,%rax), %ymm3 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[1,1,3,3,5,5,7,7] +; AVX512DQVL-NEXT: vpmuludq %ymm2, %ymm4, %ymm4 +; AVX512DQVL-NEXT: vpmuludq %ymm1, %ymm3, %ymm3 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7] +; AVX512DQVL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] +; AVX512DQVL-NEXT: vpaddd %ymm0, %ymm3, %ymm0 ; AVX512DQVL-NEXT: subq $-128, %rax ; AVX512DQVL-NEXT: jne .LBB7_1 ; AVX512DQVL-NEXT: # %bb.2: # %end @@ -531,27 +553,33 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) { ; AVX1-NEXT: movslq %esi, %rax ; AVX1-NEXT: vmovq %rax, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4 ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm4[1],xmm5[1] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm5[0] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm4[1],xmm5[1] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; AVX1-NEXT: .p2align 4 ; AVX1-NEXT: .LBB8_1: # %loop ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovsxdq 2097152(%rdi,%rax), %xmm3 -; AVX1-NEXT: vpmovsxdq 2097160(%rdi,%rax), %xmm4 -; AVX1-NEXT: vpmovsxdq 2097168(%rdi,%rax), %xmm5 -; AVX1-NEXT: vpmovsxdq 2097176(%rdi,%rax), %xmm6 -; AVX1-NEXT: vpmuldq %xmm6, %xmm2, %xmm6 -; AVX1-NEXT: vpmuldq %xmm5, %xmm1, %xmm5 -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3],xmm6[1,3] -; AVX1-NEXT: vpmuldq 
%xmm4, %xmm2, %xmm4 -; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3],xmm4[1,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqu 2097152(%rdi,%rax), %xmm5 +; AVX1-NEXT: vmovdqu 2097168(%rdi,%rax), %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] +; AVX1-NEXT: vpmuldq %xmm1, %xmm7, %xmm7 +; AVX1-NEXT: vpmuldq %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5],xmm7[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] +; AVX1-NEXT: vpmuldq %xmm3, %xmm7, %xmm7 +; AVX1-NEXT: vpmuldq %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 +; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0 ; AVX1-NEXT: subq $-128, %rax ; AVX1-NEXT: jne .LBB8_1 ; AVX1-NEXT: # %bb.2: # %end @@ -564,16 +592,19 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) { ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm1 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,0,2,4,6,4,6] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] ; AVX2-NEXT: .p2align 4 ; AVX2-NEXT: .LBB8_1: # %loop ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vpmovsxdq 2097152(%rdi,%rax), %ymm2 -; AVX2-NEXT: vpmovsxdq 2097168(%rdi,%rax), %ymm3 -; AVX2-NEXT: vpmuldq %ymm3, %ymm1, %ymm3 -; AVX2-NEXT: vpmuldq %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm3[1,3],ymm2[5,7],ymm3[5,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vmovdqu 2097152(%rdi,%rax), %ymm3 +; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[1,1,3,3,5,5,7,7] +; AVX2-NEXT: vpmuldq %ymm2, %ymm4, %ymm4 +; AVX2-NEXT: vpmuldq %ymm1, %ymm3, %ymm3 +; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] +; AVX2-NEXT: vpaddd %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: subq $-128, %rax ; AVX2-NEXT: jne .LBB8_1 ; AVX2-NEXT: # %bb.2: # %end @@ -585,14 +616,18 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) { ; AVX512VL-NEXT: vpbroadcastq %rax, %zmm1 ; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; AVX512VL-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] ; AVX512VL-NEXT: .p2align 4 ; AVX512VL-NEXT: .LBB8_1: # %loop ; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX512VL-NEXT: vpmuldq %zmm2, %zmm1, %zmm2 -; AVX512VL-NEXT: vpsrlq $32, %zmm2, %zmm2 -; AVX512VL-NEXT: vpmovqd %zmm2, %ymm2 -; AVX512VL-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vmovdqu 2097152(%rdi,%rax), %ymm3 +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[1,1,3,3,5,5,7,7] +; AVX512VL-NEXT: vpmuldq %ymm2, %ymm4, %ymm4 +; AVX512VL-NEXT: vpmuldq %ymm1, %ymm3, %ymm3 +; AVX512VL-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7] +; AVX512VL-NEXT: vpblendd 
{{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] +; AVX512VL-NEXT: vpaddd %ymm0, %ymm3, %ymm0 ; AVX512VL-NEXT: subq $-128, %rax ; AVX512VL-NEXT: jne .LBB8_1 ; AVX512VL-NEXT: # %bb.2: # %end @@ -604,14 +639,18 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) { ; AVX512DQVL-NEXT: vpbroadcastq %rax, %zmm1 ; AVX512DQVL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512DQVL-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; AVX512DQVL-NEXT: vpmovqd %zmm1, %ymm1 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] ; AVX512DQVL-NEXT: .p2align 4 ; AVX512DQVL-NEXT: .LBB8_1: # %loop ; AVX512DQVL-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512DQVL-NEXT: vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX512DQVL-NEXT: vpmuldq %zmm2, %zmm1, %zmm2 -; AVX512DQVL-NEXT: vpsrlq $32, %zmm2, %zmm2 -; AVX512DQVL-NEXT: vpmovqd %zmm2, %ymm2 -; AVX512DQVL-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX512DQVL-NEXT: vmovdqu 2097152(%rdi,%rax), %ymm3 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[1,1,3,3,5,5,7,7] +; AVX512DQVL-NEXT: vpmuldq %ymm2, %ymm4, %ymm4 +; AVX512DQVL-NEXT: vpmuldq %ymm1, %ymm3, %ymm3 +; AVX512DQVL-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7] +; AVX512DQVL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] +; AVX512DQVL-NEXT: vpaddd %ymm0, %ymm3, %ymm0 ; AVX512DQVL-NEXT: subq $-128, %rax ; AVX512DQVL-NEXT: jne .LBB8_1 ; AVX512DQVL-NEXT: # %bb.2: # %end
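Note on the DAGCombiner.cpp change above: combineShiftToMULH previously gave up unless the non-extended multiplicand was a constant/constant splat or the two multiplicands used the same extend opcode. The patch first commutes the operands so that an extend, if present, sits on the left, and then, when the opcodes differ, accepts any right-hand operand whose known active bits (zero-extend case, via computeKnownBits) or significant bits (sign-extend case, via ComputeMaxSignificantBits) fit in the narrow type, truncating it for the MULHU/MULHS. The IR below is an illustrative, hand-written sketch of the kind of pattern this now covers; the function name and the mask used to establish the known-bits bound are assumptions for illustration and are not taken from the patch or its tests. On a target with a legal 32-bit MULHU (e.g. AMDGPU's v_mul_hi_u32, as in the updated checks above) this can now lower to a single high-multiply.

; Hypothetical example (not from the patch): %b.lo is not a zero-extend, but the
; mask bounds its known active bits to 32, so (zext %a) * %b.lo followed by a
; logical shift right by 32 and a truncate can become a 32-bit MULHU of %a and
; trunc(%b.lo) after this change.
define i32 @umulh_known_bits_sketch(i32 %a, i64 %b) {
  %a.ext = zext i32 %a to i64
  %b.lo  = and i64 %b, 4294967295      ; countMaxActiveBits(%b.lo) == 32
  %mul   = mul i64 %a.ext, %b.lo       ; single use, as required by the combine
  %hi    = lshr i64 %mul, 32           ; take the high 32 bits of the product
  %res   = trunc i64 %hi to i32
  ret i32 %res
}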