diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a16ec19e7a688..0360c1bd76f00 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -797,6 +797,16 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
       return Op.getOperand(1);
     break;
   }
+  case ISD::ADD: {
+    RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+    if (RHSKnown.isZero())
+      return Op.getOperand(0);
+
+    LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+    if (LHSKnown.isZero())
+      return Op.getOperand(1);
+    break;
+  }
   case ISD::SHL: {
     // If we are only demanding sign bits then we can use the shift source
     // directly.
diff --git a/llvm/test/CodeGen/AArch64/srem-lkk.ll b/llvm/test/CodeGen/AArch64/srem-lkk.ll
index d9f91449dffb8..1223ae3a15e7b 100644
--- a/llvm/test/CodeGen/AArch64/srem-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/srem-lkk.ll
@@ -23,12 +23,11 @@ define i32 @fold_srem_positive_even(i32 %x) {
 ; CHECK-LABEL: fold_srem_positive_even:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #36849 // =0x8ff1
+; CHECK-NEXT: mov w9, #1060 // =0x424
 ; CHECK-NEXT: movk w8, #15827, lsl #16
 ; CHECK-NEXT: smull x8, w0, w8
-; CHECK-NEXT: lsr x9, x8, #63
 ; CHECK-NEXT: asr x8, x8, #40
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: mov w9, #1060 // =0x424
+; CHECK-NEXT: add w8, w8, w8, lsr #31
 ; CHECK-NEXT: msub w0, w8, w9, w0
 ; CHECK-NEXT: ret
   %1 = srem i32 %x, 1060
@@ -40,12 +39,11 @@ define i32 @fold_srem_negative_odd(i32 %x) {
 ; CHECK-LABEL: fold_srem_negative_odd:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #65445 // =0xffa5
+; CHECK-NEXT: mov w9, #-723 // =0xfffffd2d
 ; CHECK-NEXT: movk w8, #42330, lsl #16
 ; CHECK-NEXT: smull x8, w0, w8
-; CHECK-NEXT: lsr x9, x8, #63
 ; CHECK-NEXT: asr x8, x8, #40
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: mov w9, #-723 // =0xfffffd2d
+; CHECK-NEXT: add w8, w8, w8, lsr #31
 ; CHECK-NEXT: msub w0, w8, w9, w0
 ; CHECK-NEXT: ret
   %1 = srem i32 %x, -723
@@ -57,12 +55,11 @@ define i32 @fold_srem_negative_even(i32 %x) {
 ; CHECK-LABEL: fold_srem_negative_even:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #62439 // =0xf3e7
+; CHECK-NEXT: mov w9, #-22981 // =0xffffa63b
 ; CHECK-NEXT: movk w8, #64805, lsl #16
 ; CHECK-NEXT: smull x8, w0, w8
-; CHECK-NEXT: lsr x9, x8, #63
 ; CHECK-NEXT: asr x8, x8, #40
-; CHECK-NEXT: add w8, w8, w9
-; CHECK-NEXT: mov w9, #-22981 // =0xffffa63b
+; CHECK-NEXT: add w8, w8, w8, lsr #31
 ; CHECK-NEXT: msub w0, w8, w9, w0
 ; CHECK-NEXT: ret
   %1 = srem i32 %x, -22981
diff --git a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
index a74f0c86fe185..b165ac0d56d20 100644
--- a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll
@@ -263,16 +263,14 @@ define <2 x i32> @fold_srem_v2i32(<2 x i32> %x) {
 ; CHECK-LABEL: fold_srem_v2i32:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #26215 // =0x6667
-; CHECK-NEXT: movi v3.2s, #10
+; CHECK-NEXT: movi v2.2s, #10
 ; CHECK-NEXT: movk w8, #26214, lsl #16
 ; CHECK-NEXT: dup v1.2s, w8
 ; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: ushr v2.2d, v1.2d, #63
 ; CHECK-NEXT: sshr v1.2d, v1.2d, #34
-; CHECK-NEXT: xtn v2.2s, v2.2d
 ; CHECK-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEXT: add v1.2s, v1.2s, v2.2s
-; CHECK-NEXT: mls v0.2s, v1.2s, v3.2s
+; CHECK-NEXT: usra v1.2s, v1.2s, #31
+; CHECK-NEXT: mls v0.2s, v1.2s, v2.2s
 ; CHECK-NEXT: ret
   %1 = srem <2 x i32> %x,
   ret <2 x i32> %1
diff --git
a/llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll b/llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll index 35b478017383f..ae23520094db6 100644 --- a/llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll +++ b/llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll @@ -11,23 +11,18 @@ define dso_local fastcc void @BuildVectorICE() unnamed_addr { ; 32BIT-NEXT: stwu 1, -48(1) ; 32BIT-NEXT: .cfi_def_cfa_offset 48 ; 32BIT-NEXT: lxvw4x 34, 0, 3 -; 32BIT-NEXT: li 3, .LCPI0_0@l -; 32BIT-NEXT: lis 4, .LCPI0_0@ha ; 32BIT-NEXT: li 5, 0 -; 32BIT-NEXT: xxlxor 36, 36, 36 -; 32BIT-NEXT: lxvw4x 35, 4, 3 ; 32BIT-NEXT: addi 3, 1, 16 ; 32BIT-NEXT: addi 4, 1, 32 -; 32BIT-NEXT: .p2align 4 +; 32BIT-NEXT: xxspltw 35, 34, 1 +; 32BIT-NEXT: .p2align 5 ; 32BIT-NEXT: .LBB0_1: # %while.body ; 32BIT-NEXT: # ; 32BIT-NEXT: stw 5, 16(1) -; 32BIT-NEXT: lxvw4x 37, 0, 3 -; 32BIT-NEXT: vperm 5, 5, 4, 3 -; 32BIT-NEXT: vadduwm 5, 2, 5 -; 32BIT-NEXT: xxspltw 32, 37, 1 -; 32BIT-NEXT: vadduwm 5, 5, 0 -; 32BIT-NEXT: stxvw4x 37, 0, 4 +; 32BIT-NEXT: lxvw4x 36, 0, 3 +; 32BIT-NEXT: vadduwm 4, 2, 4 +; 32BIT-NEXT: vadduwm 4, 4, 3 +; 32BIT-NEXT: stxvw4x 36, 0, 4 ; 32BIT-NEXT: lwz 5, 32(1) ; 32BIT-NEXT: b .LBB0_1 ; @@ -35,21 +30,16 @@ define dso_local fastcc void @BuildVectorICE() unnamed_addr { ; 64BIT: # %bb.0: # %entry ; 64BIT-NEXT: lxvw4x 34, 0, 3 ; 64BIT-NEXT: li 3, 0 -; 64BIT-NEXT: rldimi 3, 3, 32, 0 -; 64BIT-NEXT: mtfprd 0, 3 -; 64BIT-NEXT: li 3, 0 -; 64BIT-NEXT: .p2align 4 +; 64BIT-NEXT: xxspltw 35, 34, 1 +; 64BIT-NEXT: .p2align 5 ; 64BIT-NEXT: .LBB0_1: # %while.body ; 64BIT-NEXT: # -; 64BIT-NEXT: li 4, 0 -; 64BIT-NEXT: rldimi 4, 3, 32, 0 -; 64BIT-NEXT: mtfprd 1, 4 -; 64BIT-NEXT: xxmrghd 35, 1, 0 -; 64BIT-NEXT: vadduwm 3, 2, 3 -; 64BIT-NEXT: xxspltw 36, 35, 1 -; 64BIT-NEXT: vadduwm 3, 3, 4 -; 64BIT-NEXT: xxsldwi 1, 35, 35, 3 -; 64BIT-NEXT: mffprwz 3, 1 +; 64BIT-NEXT: sldi 3, 3, 32 +; 64BIT-NEXT: mtvsrd 36, 3 +; 64BIT-NEXT: vadduwm 4, 2, 4 +; 64BIT-NEXT: vadduwm 4, 4, 3 +; 64BIT-NEXT: xxsldwi 0, 36, 36, 3 +; 64BIT-NEXT: mffprwz 3, 0 ; 64BIT-NEXT: b .LBB0_1 entry: br label %while.body diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index 5802f45d311b3..bd556909f8be5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -13487,7 +13487,6 @@ define <32 x i64> @mgather_strided_split(ptr %base) { ; RV32ZVE32F-NEXT: vid.v v8 ; RV32ZVE32F-NEXT: vsll.vi v8, v8, 4 ; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 ; RV32ZVE32F-NEXT: lw a3, 0(a1) ; RV32ZVE32F-NEXT: sw a3, 252(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: lw a1, 4(a1) @@ -13587,10 +13586,10 @@ define <32 x i64> @mgather_strided_split(ptr %base) { ; RV32ZVE32F-NEXT: lw s9, 4(a1) ; RV32ZVE32F-NEXT: lw s10, 0(a2) ; RV32ZVE32F-NEXT: lw s11, 4(a2) -; RV32ZVE32F-NEXT: lw t5, 0(a3) -; RV32ZVE32F-NEXT: lw t6, 4(a3) -; RV32ZVE32F-NEXT: lw s2, 0(a4) -; RV32ZVE32F-NEXT: lw s3, 4(a4) +; RV32ZVE32F-NEXT: lw s4, 0(a3) +; RV32ZVE32F-NEXT: lw s5, 4(a3) +; RV32ZVE32F-NEXT: lw s6, 0(a4) +; RV32ZVE32F-NEXT: lw s7, 4(a4) ; RV32ZVE32F-NEXT: lw a2, 336(sp) ; RV32ZVE32F-NEXT: lw a4, 340(sp) ; RV32ZVE32F-NEXT: lw a5, 344(sp) @@ -13607,8 +13606,8 @@ define <32 x i64> @mgather_strided_split(ptr %base) { ; RV32ZVE32F-NEXT: lw a6, 356(sp) ; RV32ZVE32F-NEXT: lw t3, 360(sp) ; RV32ZVE32F-NEXT: lw t4, 364(sp) -; RV32ZVE32F-NEXT: lw s4, 0(a5) -; RV32ZVE32F-NEXT: sw s4, 116(sp) # 4-byte Folded Spill +; RV32ZVE32F-NEXT: 
lw t5, 0(a5) +; RV32ZVE32F-NEXT: sw t5, 116(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: lw a5, 4(a5) ; RV32ZVE32F-NEXT: sw a5, 112(sp) # 4-byte Folded Spill ; RV32ZVE32F-NEXT: lw a5, 0(a6) @@ -13626,10 +13625,10 @@ define <32 x i64> @mgather_strided_split(ptr %base) { ; RV32ZVE32F-NEXT: lw a6, 372(sp) ; RV32ZVE32F-NEXT: lw t3, 376(sp) ; RV32ZVE32F-NEXT: lw t4, 380(sp) -; RV32ZVE32F-NEXT: lw s4, 0(a5) -; RV32ZVE32F-NEXT: lw s5, 4(a5) -; RV32ZVE32F-NEXT: lw s6, 0(a6) -; RV32ZVE32F-NEXT: lw s7, 4(a6) +; RV32ZVE32F-NEXT: lw t5, 0(a5) +; RV32ZVE32F-NEXT: lw t6, 4(a5) +; RV32ZVE32F-NEXT: lw s2, 0(a6) +; RV32ZVE32F-NEXT: lw s3, 4(a6) ; RV32ZVE32F-NEXT: lw a5, 0(t3) ; RV32ZVE32F-NEXT: lw a6, 4(t3) ; RV32ZVE32F-NEXT: lw t3, 0(t4) @@ -13642,10 +13641,10 @@ define <32 x i64> @mgather_strided_split(ptr %base) { ; RV32ZVE32F-NEXT: sw t0, 164(a0) ; RV32ZVE32F-NEXT: sw t1, 168(a0) ; RV32ZVE32F-NEXT: sw t2, 172(a0) -; RV32ZVE32F-NEXT: sw t5, 144(a0) -; RV32ZVE32F-NEXT: sw t6, 148(a0) -; RV32ZVE32F-NEXT: sw s2, 152(a0) -; RV32ZVE32F-NEXT: sw s3, 156(a0) +; RV32ZVE32F-NEXT: sw s4, 144(a0) +; RV32ZVE32F-NEXT: sw s5, 148(a0) +; RV32ZVE32F-NEXT: sw s6, 152(a0) +; RV32ZVE32F-NEXT: sw s7, 156(a0) ; RV32ZVE32F-NEXT: sw s8, 128(a0) ; RV32ZVE32F-NEXT: sw s9, 132(a0) ; RV32ZVE32F-NEXT: sw s10, 136(a0) @@ -13686,10 +13685,10 @@ define <32 x i64> @mgather_strided_split(ptr %base) { ; RV32ZVE32F-NEXT: sw a6, 244(a0) ; RV32ZVE32F-NEXT: sw t3, 248(a0) ; RV32ZVE32F-NEXT: sw t4, 252(a0) -; RV32ZVE32F-NEXT: sw s4, 224(a0) -; RV32ZVE32F-NEXT: sw s5, 228(a0) -; RV32ZVE32F-NEXT: sw s6, 232(a0) -; RV32ZVE32F-NEXT: sw s7, 236(a0) +; RV32ZVE32F-NEXT: sw t5, 224(a0) +; RV32ZVE32F-NEXT: sw t6, 228(a0) +; RV32ZVE32F-NEXT: sw s2, 232(a0) +; RV32ZVE32F-NEXT: sw s3, 236(a0) ; RV32ZVE32F-NEXT: sw ra, 208(a0) ; RV32ZVE32F-NEXT: lw a1, 108(sp) # 4-byte Folded Reload ; RV32ZVE32F-NEXT: sw a1, 212(a0) diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll index 2a52b9eabc7b4..4efde4b8d7539 100644 --- a/llvm/test/CodeGen/X86/combine-pmuldq.ll +++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll @@ -203,46 +203,39 @@ define i32 @PR43159(ptr %a0) { ; SSE-LABEL: PR43159: ; SSE: # %bb.0: # %entry ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $1, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE-NEXT: psubd %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrld $1, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7] +; SSE-NEXT: psubd %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] -; SSE-NEXT: paddd %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 ; SSE-NEXT: psrld $7, %xmm0 -; SSE-NEXT: psrld $6, %xmm2 -; SSE-NEXT: movd %xmm2, %edi +; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE-NEXT: psrld $6, %xmm1 +; SSE-NEXT: movd %xmm1, %edi ; SSE-NEXT: pextrd $1, %xmm0, %esi -; SSE-NEXT: pextrd $2, %xmm2, 
%edx +; SSE-NEXT: pextrd $2, %xmm1, %edx ; SSE-NEXT: pextrd $3, %xmm0, %ecx ; SSE-NEXT: jmp foo # TAILCALL ; ; AVX1-LABEL: PR43159: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsrld $7, %xmm1, %xmm1 +; AVX1-NEXT: vpsrld $1, %xmm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $7, %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpsrld $6, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %edi ; AVX1-NEXT: vpextrd $1, %xmm1, %esi diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index 2b392e69297f0..2f19d14ef4256 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -2657,37 +2657,36 @@ define <8 x i16> @combine_vec_sdiv_nonuniform4(<8 x i16> %x) { define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) { ; SSE2-LABEL: combine_vec_sdiv_nonuniform5: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1] -; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32639,54613,19945,21846,2979,5243,32897,32833] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32639,54613,19945,21846,2979,5243,32897,32833] +; SSE2-NEXT: pmulhw %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [65535,0,65535,0,0,0,1,1] ; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: pandn %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psraw $4, %xmm3 +; SSE2-NEXT: psraw $8, %xmm3 ; SSE2-NEXT: pandn %xmm3, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: psraw $2, %xmm2 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psraw $4, %xmm3 +; SSE2-NEXT: pandn %xmm3, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} 
xmm2 = [65535,0,65535,65535,65535,65535,0,65535] ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: psraw $1, %xmm1 +; SSE2-NEXT: psraw $2, %xmm1 ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: psraw $1, %xmm2 +; SSE2-NEXT: pandn %xmm2, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: psrlw $15, %xmm0 -; SSE2-NEXT: paddw %xmm2, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_sdiv_nonuniform5: @@ -2695,41 +2694,40 @@ define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) { ; SSE41-NEXT: pmovsxbw {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1] ; SSE41-NEXT: pmullw %xmm0, %xmm1 ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32639,54613,19945,21846,2979,5243,32897,32833] +; SSE41-NEXT: paddw %xmm0, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [256,16384,4096,u,u,u,512,256] +; SSE41-NEXT: pmulhw %xmm1, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] +; SSE41-NEXT: psraw $1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5],xmm2[6,7] +; SSE41-NEXT: psrlw $15, %xmm1 ; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [256,16384,4096,u,u,u,512,256] -; SSE41-NEXT: pmulhw %xmm0, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psraw $1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] -; SSE41-NEXT: psrlw $15, %xmm0 -; SSE41-NEXT: paddw %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_nonuniform5: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,0,65535,0,0,0,1,1] ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32639,54613,19945,21846,2979,5243,32897,32833] +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [256,16384,4096,u,u,u,512,256] +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] +; AVX1-NEXT: vpsraw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5],xmm2[6,7] +; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm1 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [256,16384,4096,u,u,u,512,256] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX1-NEXT: vpsraw $1, %xmm0, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] -; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 -; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_vec_sdiv_nonuniform5: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,0,65535,0,0,0,1,1] ; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32639,54613,19945,21846,2979,5243,32897,32833] +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [256,16384,4096,u,u,u,512,256] +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] +; AVX2-NEXT: vpsraw $1, %xmm0, %xmm0 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5],xmm2[6,7] +; AVX2-NEXT: vpsrlw $15, %xmm1, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [256,16384,4096,u,u,u,512,256] -; 
AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX2-NEXT: vpsraw $1, %xmm0, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] -; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 -; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: combine_vec_sdiv_nonuniform5: @@ -2770,33 +2768,33 @@ define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) { define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) { ; SSE2-LABEL: combine_vec_sdiv_nonuniform6: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0] -; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32767,32767,32703,0,0,32897,32769,16385] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,1,1,1,0] +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32767,32767,32703,0,0,32897,32769,16385] ; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $6, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,0,65535,65535] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,65535,0] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: psraw $12, %xmm5 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psraw $8, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: psraw $6, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,65535,65535] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,0,65535,0] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: psraw $12, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm5 +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,0] -; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psraw $1, %xmm3 -; SSE2-NEXT: pandn %xmm3, %xmm1 +; SSE2-NEXT: psraw $1, %xmm4 +; SSE2-NEXT: pandn %xmm4, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: psrlw $15, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2809,13 +2807,13 @@ define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) { ; SSE41-NEXT: pmullw %xmm0, %xmm1 ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32767,32767,32703,0,0,32897,32769,16385] ; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,256,256,u,u,512,256,8] -; SSE41-NEXT: pmulhw %xmm0, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; SSE41-NEXT: psrlw $15, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] -; SSE41-NEXT: paddw %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrlw $15, %xmm2 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pblendw 
{{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] +; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,256,256,u,u,512,256,8] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] +; SSE41-NEXT: paddw %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_nonuniform6: @@ -2823,12 +2821,12 @@ define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) { ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,65535,65535,65535,1,1,1,0] ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32767,32767,32703,0,0,32897,32769,16385] ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [4,256,256,u,u,512,256,8] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] -; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] +; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4,256,256,u,u,512,256,8] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_vec_sdiv_nonuniform6: @@ -2836,12 +2834,12 @@ define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) { ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [65535,65535,65535,65535,1,1,1,0] ; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32767,32767,32703,0,0,32897,32769,16385] ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [4,256,256,u,u,512,256,8] -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] -; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm2 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] +; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4,256,256,u,u,512,256,8] +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] +; AVX2-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: combine_vec_sdiv_nonuniform6: @@ -2928,15 +2926,14 @@ define <16 x i8> @pr38658(<16 x i8> %x) { ; SSE2-LABEL: pr38658: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,0,0,0,0,37632] -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: packuswb %xmm2, %xmm3 -; SSE2-NEXT: paddb %xmm3, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT: 
pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [0,0,0,0,0,0,0,37632] +; SSE2-NEXT: psrlw $8, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; SSE2-NEXT: psraw $8, %xmm1 @@ -2956,20 +2953,21 @@ define <16 x i8> @pr38658(<16 x i8> %x) { ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,0,0,0,0,37632] ; SSE41-NEXT: psrlw $8, %xmm2 ; SSE41-NEXT: packuswb %xmm2, %xmm1 -; SSE41-NEXT: paddb %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE41-NEXT: psraw $8, %xmm1 +; SSE41-NEXT: paddb %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psllw $6, %xmm2 -; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: packuswb %xmm1, %xmm2 -; SSE41-NEXT: psrlw $7, %xmm0 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: paddb %xmm2, %xmm0 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE41-NEXT: psraw $8, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psllw $6, %xmm3 +; SSE41-NEXT: psllw $8, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7] +; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: psrlw $7, %xmm1 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: paddb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: pr38658: @@ -2979,18 +2977,18 @@ define <16 x i8> @pr38658(<16 x i8> %x) { ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,0,0,0,0,37632] ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $6, %xmm1, %xmm2 -; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $6, %xmm2, %xmm3 +; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7] +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: pr38658: diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll index d5a481549f851..55715197830b1 100644 --- a/llvm/test/CodeGen/X86/combine-udiv.ll +++ b/llvm/test/CodeGen/X86/combine-udiv.ll @@ -412,19 +412,19 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) { ; ; AVX1-LABEL: combine_vec_udiv_by_shl_pow2b: ; AVX1: # %bb.0: -; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4 +; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_vec_udiv_by_shl_pow2b: @@ -683,21 +683,19 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) { define <8 x i16> @pr38477(<8 x i16> %a0) { ; SSE2-LABEL: pr38477: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [u,4957,57457,4103,16385,35545,2048,2115] -; SSE2-NEXT: pmulhuw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubw %xmm1, %xmm2 -; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [u,32768,0,0,0,0,0,32768] -; SSE2-NEXT: paddw %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [u,4957,57457,4103,16385,35545,2048,2115] +; SSE2-NEXT: pmulhuw %xmm0, %xmm3 +; SSE2-NEXT: psubw %xmm3, %xmm0 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [u,32768,0,0,0,0,0,32768] +; SSE2-NEXT: paddw %xmm3, %xmm0 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: por %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: pr38477: diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll index b124bd5165e11..dfae853f9961e 100644 --- a/llvm/test/CodeGen/X86/dpbusd_const.ll +++ b/llvm/test/CodeGen/X86/dpbusd_const.ll @@ -10,8 +10,8 @@ define i32 @mul_4xi8_zc_exceed(<4 x i8> %a, i32 %c) { ; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,128,0] ; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; ALL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; ALL-NEXT: vmovd %xmm0, %eax ; ALL-NEXT: addl %edi, %eax ; ALL-NEXT: retq @@ -55,7 +55,7 @@ define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) { ; AVX512VLVNNI-NEXT: retq entry: %0 = zext <4 x i8> %a to <4 x i32> - %1 = mul nsw <4 x i32> %0, + %1 = mul nsw <4 x i32> %0, %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1) %op.extra = add nsw i32 %2, %c ret i32 %op.extra @@ -97,7 +97,7 @@ define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) { ; AVX512VLVNNI-NEXT: retq entry: %0 = zext <4 x i4> %a to <4 x i32> - %1 = mul nsw <4 x i32> , %0 + %1 = mul nsw <4 x i32> , %0 %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1) %op.extra = add nsw i32 %2, %c ret i32 %op.extra @@ -108,7 +108,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { ; AVXVNNI: # %bb.0: # %entry ; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVXVNNI-NEXT: vmovd {{.*#+}} xmm2 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVXVNNI-NEXT: vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] ; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1 ; AVXVNNI-NEXT: vmovd %xmm1, %eax ; AVXVNNI-NEXT: addl %edi, %eax @@ -118,7 +118,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { ; AVX512VNNI: # %bb.0: # %entry ; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VNNI-NEXT: vmovd {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VNNI-NEXT: vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2 ; AVX512VNNI-NEXT: vmovd %xmm2, %eax @@ -130,7 +130,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { ; AVX512VLVNNI: # %bb.0: # %entry ; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vmovd {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVNNI-NEXT: vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2 ; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax @@ -138,7 +138,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { ; AVX512VLVNNI-NEXT: retq entry: %0 = sext <4 x i8> %a to <4 x i32> - %1 = mul nsw <4 x i32> , %0 + %1 = mul nsw <4 x i32> , %0 %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1) %op.extra = add nsw i32 %2, %c ret i32 %op.extra @@ -151,8 +151,8 @@ define i32 @mul_4xi8_cs_exceed(<4 x i8> %a, i32 %c) { ; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,256,0] ; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; 
ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; ALL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; ALL-NEXT: vmovd %xmm0, %eax ; ALL-NEXT: addl %edi, %eax ; ALL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll index 1b13cee628df6..e595b3f3449e2 100644 --- a/llvm/test/CodeGen/X86/pr62286.ll +++ b/llvm/test/CodeGen/X86/pr62286.ll @@ -8,18 +8,17 @@ define i64 @PR62286(i32 %a) { ; SSE-LABEL: PR62286: ; SSE: # %bb.0: ; SSE-NEXT: movd %edi, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,1,0] -; SSE-NEXT: paddd %xmm0, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,0] +; SSE-NEXT: paddd %xmm1, %xmm1 ; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: paddq %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,0] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: paddq %xmm1, %xmm0 ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr67333.ll b/llvm/test/CodeGen/X86/pr67333.ll index 64c7f4fb143bf..946380971988c 100644 --- a/llvm/test/CodeGen/X86/pr67333.ll +++ b/llvm/test/CodeGen/X86/pr67333.ll @@ -18,42 +18,42 @@ define void @SHA256_Compress_Generic(ptr noundef %ctx) #1 { ; CHECK-NEXT: vpsrld $19, %xmm2, %xmm3 ; CHECK-NEXT: vpslld $13, %xmm2, %xmm4 ; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vpxor %xmm3, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; CHECK-NEXT: vmovd %ecx, %xmm3 -; CHECK-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; CHECK-NEXT: vpxor %xmm3, %xmm0, %xmm3 +; CHECK-NEXT: vpxor %xmm2, %xmm3, %xmm0 +; CHECK-NEXT: vmovd %ecx, %xmm4 +; CHECK-NEXT: vpshufb %xmm1, %xmm4, %xmm1 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm1 ; CHECK-NEXT: vpsrld $17, %xmm1, %xmm0 -; CHECK-NEXT: vpslld $15, %xmm1, %xmm3 -; CHECK-NEXT: vpor %xmm0, %xmm3, %xmm0 -; CHECK-NEXT: vpsrld $19, %xmm1, %xmm3 -; CHECK-NEXT: vpslld $13, %xmm1, %xmm4 -; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; CHECK-NEXT: vpslld $15, %xmm1, %xmm4 +; CHECK-NEXT: vpor %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vpsrld $19, %xmm1, %xmm4 +; CHECK-NEXT: vpslld $13, %xmm1, %xmm5 +; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vpxor %xmm4, %xmm0, %xmm0 ; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; CHECK-NEXT: vpsrld $17, %xmm0, %xmm3 -; CHECK-NEXT: vpslld $15, %xmm0, %xmm4 -; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vpsrld $19, %xmm0, %xmm4 -; CHECK-NEXT: vpslld $13, %xmm0, %xmm5 +; CHECK-NEXT: vpsrld $17, %xmm0, %xmm4 +; CHECK-NEXT: vpslld $15, %xmm0, %xmm5 ; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4 -; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; CHECK-NEXT: vpsrld $19, %xmm0, %xmm5 +; CHECK-NEXT: vpslld $13, %xmm0, %xmm6 +; CHECK-NEXT: vpor %xmm5, %xmm6, %xmm5 +; CHECK-NEXT: vpxor %xmm5, %xmm4, %xmm4 ; CHECK-NEXT: vpsrld 
$10, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vpxor %xmm0, %xmm4, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; CHECK-NEXT: vpsrld $17, %xmm0, %xmm3 -; CHECK-NEXT: vpslld $15, %xmm0, %xmm4 -; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vpsrld $19, %xmm0, %xmm4 -; CHECK-NEXT: vpslld $13, %xmm0, %xmm5 +; CHECK-NEXT: vpsrld $17, %xmm0, %xmm4 +; CHECK-NEXT: vpslld $15, %xmm0, %xmm5 ; CHECK-NEXT: vpor %xmm4, %xmm5, %xmm4 -; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; CHECK-NEXT: vpsrld $10, %xmm0, %xmm4 -; CHECK-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3] +; CHECK-NEXT: vpsrld $19, %xmm0, %xmm5 +; CHECK-NEXT: vpslld $13, %xmm0, %xmm6 +; CHECK-NEXT: vpor %xmm5, %xmm6, %xmm5 +; CHECK-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; CHECK-NEXT: vpsrld $10, %xmm0, %xmm5 +; CHECK-NEXT: vpxor %xmm5, %xmm4, %xmm4 +; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] ; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,3] -; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-NEXT: vpaddd %xmm4, %xmm2, %xmm2 ; CHECK-NEXT: vpsrld $17, %xmm2, %xmm3 ; CHECK-NEXT: vpslld $15, %xmm2, %xmm4 ; CHECK-NEXT: vpor %xmm3, %xmm4, %xmm3 diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll index 245516974d15b..c74440d7ec021 100644 --- a/llvm/test/CodeGen/X86/sad.ll +++ b/llvm/test/CodeGen/X86/sad.ll @@ -986,26 +986,56 @@ define dso_local i32 @sad_unroll_nonzero_initial(ptr %arg, ptr %arg1, ptr %arg2, ; SSE2-NEXT: movdqu (%rcx), %xmm2 ; SSE2-NEXT: psadbw %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: incl %eax ; SSE2-NEXT: retq ; -; AVX-LABEL: sad_unroll_nonzero_initial: -; AVX: # %bb.0: # %bb -; AVX-NEXT: vmovdqu (%rdi), %xmm0 -; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu (%rdx), %xmm1 -; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq +; AVX1-LABEL: sad_unroll_nonzero_initial: +; AVX1: # %bb.0: # %bb +; AVX1-NEXT: vmovdqu (%rdi), %xmm0 +; AVX1-NEXT: vpsadbw (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu (%rdx), %xmm1 +; AVX1-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: sad_unroll_nonzero_initial: +; AVX2: # %bb.0: # %bb +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: vpsadbw (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu (%rdx), %xmm1 +; AVX2-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: retq +; +; AVX512-LABEL: sad_unroll_nonzero_initial: +; 
AVX512: # %bb.0: # %bb +; AVX512-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512-NEXT: vpsadbw (%rsi), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu (%rdx), %xmm1 +; AVX512-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: retq bb: %tmp = load <16 x i8>, ptr %arg, align 1 %tmp4 = load <16 x i8>, ptr %arg1, align 1 diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll index a2bcadd104a7b..08d9183bd30b6 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -2068,14 +2068,12 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou ; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrlq $32, %xmm1 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2141,14 +2139,12 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no ; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrlq $32, %xmm1 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
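
The only functional change in this patch is the TargetLowering.cpp hunk at the top: SimplifyMultipleUseDemandedBits now looks through an ISD::ADD when computeKnownBits proves one addend is zero, returning the other operand directly. Everything after it is regenerated CHECK lines that pick up the resulting folds (for example, the AArch64 srem lowering now folds the sign-bit rounding into a single add w8, w8, w8, lsr #31). The sketch below models that reasoning in standalone form; ToyKnownBits and simplifyAdd are illustrative stand-ins, not LLVM's KnownBits or SelectionDAG API.

// Minimal standalone sketch of the new ISD::ADD handling: if every bit of one
// addend is known to be zero, the add contributes nothing and can be replaced
// by the other operand.
#include <cassert>
#include <cstdint>
#include <optional>

struct ToyKnownBits {
  uint32_t Zero = 0; // mask of bits proven to be 0
  uint32_t One = 0;  // mask of bits proven to be 1
  // The value is provably 0 when every bit is known to be zero.
  bool isZero() const { return One == 0 && Zero == 0xFFFFFFFFu; }
};

// Given known bits for both addends of `add LHS, RHS`, report which operand
// the add can be folded to: 0 = fold to LHS, 1 = fold to RHS, nullopt = keep.
std::optional<int> simplifyAdd(const ToyKnownBits &LHSKnown,
                               const ToyKnownBits &RHSKnown) {
  if (RHSKnown.isZero())
    return 0; // add x, 0 --> x
  if (LHSKnown.isZero())
    return 1; // add 0, x --> x
  return std::nullopt; // neither addend is provably zero
}

int main() {
  ToyKnownBits Unknown;                    // nothing known about this value
  ToyKnownBits KnownZero{0xFFFFFFFFu, 0u}; // constant 0: all bits known zero
  assert(simplifyAdd(Unknown, KnownZero) == 0); // add x, 0 simplifies to x
  assert(simplifyAdd(KnownZero, Unknown) == 1); // add 0, x simplifies to x
  assert(!simplifyAdd(Unknown, Unknown));       // no simplification possible
  return 0;
}

Both operand orders are checked because ADD is commutative, mirroring the RHSKnown/LHSKnown pair of computeKnownBits queries in the patch.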