diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index f21233abfa4f5..a16ec19e7a688 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -2030,6 +2030,21 @@ bool TargetLowering::SimplifyDemandedBits( Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth); } + // If we are only demanding sign bits then we can use the shift source + // directly. + if (std::optional MaxSA = + TLO.DAG.getValidMaximumShiftAmount(Op, DemandedElts, Depth + 1)) { + unsigned ShAmt = *MaxSA; + // Must already be signbits in DemandedBits bounds, and can't demand any + // shifted in zeroes. + if (DemandedBits.countl_zero() >= ShAmt) { + unsigned NumSignBits = + TLO.DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1); + if (DemandedBits.countr_zero() >= (BitWidth - NumSignBits)) + return TLO.CombineTo(Op, Op0); + } + } + // Try to match AVG patterns (after shift simplification). if (SDValue AVG = combineShiftToAVG(Op, TLO, *this, DemandedBits, DemandedElts, Depth + 1)) diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll index a4be81a1973a3..ee289c4faab50 100644 --- a/llvm/test/CodeGen/NVPTX/load-store.ll +++ b/llvm/test/CodeGen/NVPTX/load-store.ll @@ -167,25 +167,25 @@ define void @generic_4xi8(ptr %a) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; -; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520; ; CHECK-NEXT: st.u32 [%rd1], %r12; ; CHECK-NEXT: ret; %a.load = load <4 x i8>, ptr %a @@ -511,25 +511,25 @@ define void @generic_volatile_4xi8(ptr %a) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi8_param_0]; ; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; 
CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; -; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520; ; CHECK-NEXT: st.volatile.u32 [%rd1], %r12; ; CHECK-NEXT: ret; %a.load = load volatile <4 x i8>, ptr %a @@ -1416,25 +1416,25 @@ define void @global_4xi8(ptr addrspace(1) %a) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi8_param_0]; ; CHECK-NEXT: ld.global.u32 %r1, [%rd1]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; -; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520; ; CHECK-NEXT: st.global.u32 [%rd1], %r12; ; CHECK-NEXT: ret; %a.load = load <4 x i8>, ptr addrspace(1) %a @@ -1741,25 +1741,25 @@ define void @global_volatile_4xi8(ptr addrspace(1) %a) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi8_param_0]; ; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; -; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520; ; CHECK-NEXT: st.volatile.global.u32 [%rd1], %r12; ; CHECK-NEXT: ret; %a.load = load volatile <4 x i8>, ptr addrspace(1) %a @@ -2788,25 +2788,25 @@ define void @shared_4xi8(ptr addrspace(3) %a) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: 
ld.param.u64 %rd1, [shared_4xi8_param_0]; ; CHECK-NEXT: ld.shared.u32 %r1, [%rd1]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; -; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520; ; CHECK-NEXT: st.shared.u32 [%rd1], %r12; ; CHECK-NEXT: ret; %a.load = load <4 x i8>, ptr addrspace(3) %a @@ -3113,25 +3113,25 @@ define void @shared_volatile_4xi8(ptr addrspace(3) %a) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi8_param_0]; ; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; -; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520; ; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r12; ; CHECK-NEXT: ret; %a.load = load volatile <4 x i8>, ptr addrspace(3) %a @@ -4018,25 +4018,25 @@ define void @local_4xi8(ptr addrspace(5) %a) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi8_param_0]; ; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; 
CHECK-NEXT: add.s16 %rs6, %rs5, 1; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; -; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520; ; CHECK-NEXT: st.local.u32 [%rd1], %r12; ; CHECK-NEXT: ret; %a.load = load <4 x i8>, ptr addrspace(5) %a @@ -4343,25 +4343,25 @@ define void @local_volatile_4xi8(ptr addrspace(5) %a) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi8_param_0]; ; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; ; CHECK-NEXT: add.s16 %rs4, %rs3, 1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 13120; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: add.s16 %rs6, %rs5, 1; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; ; CHECK-NEXT: add.s16 %rs8, %rs7, 1; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; -; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520; ; CHECK-NEXT: st.local.u32 [%rd1], %r12; ; CHECK-NEXT: ret; %a.load = load volatile <4 x i8>, ptr addrspace(5) %a diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll index 537e05310dbea..874913629e9e3 100644 --- a/llvm/test/CodeGen/X86/scmp.ll +++ b/llvm/test/CodeGen/X86/scmp.ll @@ -1764,153 +1764,146 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx ; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d -; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r12d -; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; SSE2-NEXT: addb %dil, %dil -; SSE2-NEXT: sarb %dil -; SSE2-NEXT: addb %sil, %sil -; SSE2-NEXT: sarb %sil -; SSE2-NEXT: cmpb %dil, %sil -; SSE2-NEXT: setl %sil -; SSE2-NEXT: setg %dil -; SSE2-NEXT: subb %sil, %dil -; SSE2-NEXT: movsbq %dil, %rdi -; SSE2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movq %rdi, (%rax) -; SSE2-NEXT: sarq $63, %rdi -; SSE2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: addb %r11b, %r11b -; SSE2-NEXT: sarb %r11b -; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %esi +; SSE2-NEXT: addb %r15b, %r15b +; SSE2-NEXT: sarb %r15b ; SSE2-NEXT: addb %sil, %sil ; SSE2-NEXT: sarb %sil -; SSE2-NEXT: 
cmpb %r11b, %sil +; SSE2-NEXT: cmpb %r15b, %sil ; SSE2-NEXT: setl %sil -; SSE2-NEXT: setg %r11b -; SSE2-NEXT: subb %sil, %r11b -; SSE2-NEXT: movsbq %r11b, %r11 -; SSE2-NEXT: movq %r11, %r14 -; SSE2-NEXT: sarq $63, %r14 -; SSE2-NEXT: addb %r12b, %r12b -; SSE2-NEXT: sarb %r12b +; SSE2-NEXT: setg %r15b +; SSE2-NEXT: subb %sil, %r15b +; SSE2-NEXT: movsbq %r15b, %rsi +; SSE2-NEXT: movq %rsi, (%rax) +; SSE2-NEXT: movq %rsi, %xmm0 +; SSE2-NEXT: sarq $63, %rsi +; SSE2-NEXT: addb %r14b, %r14b +; SSE2-NEXT: sarb %r14b +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d +; SSE2-NEXT: addb %r15b, %r15b +; SSE2-NEXT: sarb %r15b +; SSE2-NEXT: cmpb %r14b, %r15b +; SSE2-NEXT: setl %r14b +; SSE2-NEXT: setg %r15b +; SSE2-NEXT: subb %r14b, %r15b +; SSE2-NEXT: movsbq %r15b, %r14 +; SSE2-NEXT: movq %r14, %r15 +; SSE2-NEXT: sarq $63, %r15 +; SSE2-NEXT: addb %bpl, %bpl +; SSE2-NEXT: sarb %bpl ; SSE2-NEXT: addb %dl, %dl ; SSE2-NEXT: sarb %dl -; SSE2-NEXT: cmpb %r12b, %dl +; SSE2-NEXT: cmpb %bpl, %dl ; SSE2-NEXT: setl %dl -; SSE2-NEXT: setg %sil -; SSE2-NEXT: subb %dl, %sil -; SSE2-NEXT: movsbq %sil, %r13 -; SSE2-NEXT: movq %r13, %rdi -; SSE2-NEXT: sarq $63, %rdi -; SSE2-NEXT: addb %r15b, %r15b -; SSE2-NEXT: sarb %r15b +; SSE2-NEXT: setg %bpl +; SSE2-NEXT: subb %dl, %bpl +; SSE2-NEXT: movsbq %bpl, %rdx +; SSE2-NEXT: movq %rdx, %r12 +; SSE2-NEXT: sarq $63, %r12 +; SSE2-NEXT: addb %bl, %bl +; SSE2-NEXT: sarb %bl ; SSE2-NEXT: addb %cl, %cl ; SSE2-NEXT: sarb %cl -; SSE2-NEXT: cmpb %r15b, %cl +; SSE2-NEXT: cmpb %bl, %cl ; SSE2-NEXT: setl %cl -; SSE2-NEXT: setg %dl -; SSE2-NEXT: subb %cl, %dl -; SSE2-NEXT: movsbq %dl, %r15 -; SSE2-NEXT: movq %r15, %rcx +; SSE2-NEXT: setg %bl +; SSE2-NEXT: subb %cl, %bl +; SSE2-NEXT: movsbq %bl, %rbx +; SSE2-NEXT: movq %rbx, %rcx ; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: addb %bpl, %bpl -; SSE2-NEXT: sarb %bpl +; SSE2-NEXT: addb %r11b, %r11b +; SSE2-NEXT: sarb %r11b ; SSE2-NEXT: addb %r8b, %r8b ; SSE2-NEXT: sarb %r8b -; SSE2-NEXT: cmpb %bpl, %r8b -; SSE2-NEXT: setl %dl -; SSE2-NEXT: setg %r8b -; SSE2-NEXT: subb %dl, %r8b -; SSE2-NEXT: movsbq %r8b, %r8 -; SSE2-NEXT: movq %r8, %r12 -; SSE2-NEXT: sarq $63, %r12 -; SSE2-NEXT: addb %bl, %bl -; SSE2-NEXT: sarb %bl -; SSE2-NEXT: addb %r9b, %r9b -; SSE2-NEXT: sarb %r9b -; SSE2-NEXT: cmpb %bl, %r9b -; SSE2-NEXT: setl %dl -; SSE2-NEXT: setg %r9b -; SSE2-NEXT: subb %dl, %r9b -; SSE2-NEXT: movsbq %r9b, %rsi -; SSE2-NEXT: movq %rsi, %r9 -; SSE2-NEXT: sarq $63, %r9 +; SSE2-NEXT: cmpb %r11b, %r8b +; SSE2-NEXT: setl %r8b +; SSE2-NEXT: setg %r11b +; SSE2-NEXT: subb %r8b, %r11b +; SSE2-NEXT: movsbq %r11b, %r8 +; SSE2-NEXT: movq %r8, %r11 +; SSE2-NEXT: sarq $63, %r11 ; SSE2-NEXT: addb %r10b, %r10b ; SSE2-NEXT: sarb %r10b -; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %edx -; SSE2-NEXT: addb %dl, %dl -; SSE2-NEXT: sarb %dl -; SSE2-NEXT: cmpb %r10b, %dl -; SSE2-NEXT: setl %dl +; SSE2-NEXT: addb %r9b, %r9b +; SSE2-NEXT: sarb %r9b +; SSE2-NEXT: cmpb %r10b, %r9b +; SSE2-NEXT: setl %r9b ; SSE2-NEXT: setg %r10b -; SSE2-NEXT: subb %dl, %r10b -; SSE2-NEXT: movsbq %r10b, %r10 -; SSE2-NEXT: movq %r10, %rdx -; SSE2-NEXT: sarq $63, %rdx -; SSE2-NEXT: movl %edx, 96(%rax) +; SSE2-NEXT: subb %r9b, %r10b +; SSE2-NEXT: movsbq %r10b, %r9 +; SSE2-NEXT: movq %r9, %r10 +; SSE2-NEXT: sarq $63, %r10 +; SSE2-NEXT: addb %dil, %dil +; SSE2-NEXT: sarb %dil +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp +; SSE2-NEXT: addb %bpl, %bpl +; SSE2-NEXT: sarb %bpl +; SSE2-NEXT: cmpb %dil, %bpl +; SSE2-NEXT: setl %dil +; SSE2-NEXT: setg %bpl +; SSE2-NEXT: subb %dil, %bpl +; SSE2-NEXT: movsbq %bpl, %rdi +; 
SSE2-NEXT: movq %rdi, %r13 +; SSE2-NEXT: sarq $63, %r13 +; SSE2-NEXT: movl %r13d, 96(%rax) ; SSE2-NEXT: movabsq $2251799813685247, %rbp # imm = 0x7FFFFFFFFFFFF -; SSE2-NEXT: andq %rdx, %rbp -; SSE2-NEXT: shldq $62, %r10, %rdx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload -; SSE2-NEXT: # xmm0 = mem[0],zero -; SSE2-NEXT: movq %r9, %rbx -; SSE2-NEXT: shldq $20, %rsi, %rbx -; SSE2-NEXT: movq %rdx, 88(%rax) -; SSE2-NEXT: movq %r12, %rdx -; SSE2-NEXT: shldq $31, %r8, %rdx -; SSE2-NEXT: movq %rbx, 64(%rax) -; SSE2-NEXT: movq %rcx, %rbx -; SSE2-NEXT: shldq $42, %r15, %rbx -; SSE2-NEXT: movq %rdx, 48(%rax) -; SSE2-NEXT: movq %rbx, 32(%rax) -; SSE2-NEXT: movabsq $9007199254738944, %rbx # imm = 0x1FFFFFFFFFF800 -; SSE2-NEXT: andq %rdi, %rbx -; SSE2-NEXT: shldq $53, %r13, %rdi -; SSE2-NEXT: movq %rdi, 16(%rax) -; SSE2-NEXT: movq %rbp, %rdx -; SSE2-NEXT: shrq $48, %rdx -; SSE2-NEXT: movb %dl, 102(%rax) +; SSE2-NEXT: andq %r13, %rbp +; SSE2-NEXT: shldq $62, %rdi, %r13 +; SSE2-NEXT: movq %r13, 88(%rax) +; SSE2-NEXT: movq %r10, %r13 +; SSE2-NEXT: shldq $20, %r9, %r13 +; SSE2-NEXT: movq %r13, 64(%rax) +; SSE2-NEXT: movq %r11, %r13 +; SSE2-NEXT: shldq $31, %r8, %r13 +; SSE2-NEXT: movq %r13, 48(%rax) +; SSE2-NEXT: movq %rcx, %r13 +; SSE2-NEXT: shldq $42, %rbx, %r13 +; SSE2-NEXT: movq %r13, 32(%rax) +; SSE2-NEXT: movabsq $9007199254738944, %r13 # imm = 0x1FFFFFFFFFF800 +; SSE2-NEXT: andq %r12, %r13 +; SSE2-NEXT: shldq $53, %rdx, %r12 +; SSE2-NEXT: movq %r12, 16(%rax) +; SSE2-NEXT: movq %rbp, %r12 +; SSE2-NEXT: shrq $48, %r12 +; SSE2-NEXT: movb %r12b, 102(%rax) ; SSE2-NEXT: shrq $32, %rbp -; SSE2-NEXT: movabsq $9007199254740991, %rdx # imm = 0x1FFFFFFFFFFFFF -; SSE2-NEXT: andq %rdx, %r14 -; SSE2-NEXT: shldq $9, %r11, %r14 ; SSE2-NEXT: movw %bp, 100(%rax) -; SSE2-NEXT: shlq $62, %r10 +; SSE2-NEXT: movabsq $9007199254740991, %r12 # imm = 0x1FFFFFFFFFFFFF +; SSE2-NEXT: andq %r12, %r15 +; SSE2-NEXT: shldq $9, %r14, %r15 +; SSE2-NEXT: shlq $62, %rdi +; SSE2-NEXT: orq %r15, %rdi +; SSE2-NEXT: movq %rdi, 80(%rax) +; SSE2-NEXT: shlq $42, %rbx +; SSE2-NEXT: shrq $11, %r13 +; SSE2-NEXT: orq %rbx, %r13 +; SSE2-NEXT: movq %r13, 24(%rax) +; SSE2-NEXT: shlq $9, %r14 +; SSE2-NEXT: andl $511, %r10d # imm = 0x1FF ; SSE2-NEXT: orq %r14, %r10 -; SSE2-NEXT: movq %r10, 80(%rax) -; SSE2-NEXT: shlq $42, %r15 -; SSE2-NEXT: shrq $11, %rbx -; SSE2-NEXT: orq %r15, %rbx -; SSE2-NEXT: movq %rbx, 24(%rax) -; SSE2-NEXT: shlq $9, %r11 -; SSE2-NEXT: shrq $44, %r9 -; SSE2-NEXT: andl $511, %r9d # imm = 0x1FF -; SSE2-NEXT: orq %r11, %r9 -; SSE2-NEXT: movq %r9, 72(%rax) -; SSE2-NEXT: shlq $20, %rsi -; SSE2-NEXT: shrq $33, %r12 -; SSE2-NEXT: andl $1048575, %r12d # imm = 0xFFFFF -; SSE2-NEXT: orq %rsi, %r12 -; SSE2-NEXT: movq %r12, 56(%rax) +; SSE2-NEXT: movq %r10, 72(%rax) +; SSE2-NEXT: shlq $20, %r9 +; SSE2-NEXT: andl $1048575, %r11d # imm = 0xFFFFF +; SSE2-NEXT: orq %r9, %r11 +; SSE2-NEXT: movq %r11, 56(%rax) ; SSE2-NEXT: shlq $31, %r8 -; SSE2-NEXT: shrq $22, %rcx ; SSE2-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF ; SSE2-NEXT: orq %r8, %rcx ; SSE2-NEXT: movq %rcx, 40(%rax) -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Folded Reload -; SSE2-NEXT: # xmm1 = mem[0],zero +; SSE2-NEXT: movq %rsi, %xmm1 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rcx -; SSE2-NEXT: andq %rdx, %rcx -; SSE2-NEXT: shlq $53, %r13 -; SSE2-NEXT: orq %rcx, %r13 -; SSE2-NEXT: movq %r13, 8(%rax) +; SSE2-NEXT: andq %r12, %rcx +; SSE2-NEXT: shlq 
$53, %rdx +; SSE2-NEXT: orq %rcx, %rdx +; SSE2-NEXT: movq %rdx, 8(%rax) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -1927,11 +1920,11 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE4-NEXT: pushq %r13 ; SSE4-NEXT: pushq %r12 ; SSE4-NEXT: pushq %rbx -; SSE4-NEXT: movq %rdi, %rbx +; SSE4-NEXT: movq %rdi, %rax ; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %ebx ; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %ebp ; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r15d ; SSE4-NEXT: movzbl {{[0-9]+}}(%rsp), %r14d @@ -1944,7 +1937,7 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE4-NEXT: setg %r14b ; SSE4-NEXT: subb %sil, %r14b ; SSE4-NEXT: movsbq %r14b, %r14 -; SSE4-NEXT: movq %r14, (%rbx) +; SSE4-NEXT: movq %r14, (%rax) ; SSE4-NEXT: sarq $63, %r14 ; SSE4-NEXT: addb %r15b, %r15b ; SSE4-NEXT: sarb %r15b @@ -1955,8 +1948,8 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE4-NEXT: setl %sil ; SSE4-NEXT: setg %r15b ; SSE4-NEXT: subb %sil, %r15b -; SSE4-NEXT: movsbq %r15b, %r15 -; SSE4-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE4-NEXT: movsbq %r15b, %rsi +; SSE4-NEXT: movq %rsi, %r15 ; SSE4-NEXT: sarq $63, %r15 ; SSE4-NEXT: addb %bpl, %bpl ; SSE4-NEXT: sarb %bpl @@ -1969,16 +1962,16 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE4-NEXT: movsbq %bpl, %r12 ; SSE4-NEXT: movq %r12, %r13 ; SSE4-NEXT: sarq $63, %r13 -; SSE4-NEXT: addb %al, %al -; SSE4-NEXT: sarb %al +; SSE4-NEXT: addb %bl, %bl +; SSE4-NEXT: sarb %bl ; SSE4-NEXT: addb %cl, %cl ; SSE4-NEXT: sarb %cl -; SSE4-NEXT: cmpb %al, %cl +; SSE4-NEXT: cmpb %bl, %cl ; SSE4-NEXT: setl %cl ; SSE4-NEXT: setg %dl ; SSE4-NEXT: subb %cl, %dl -; SSE4-NEXT: movsbq %dl, %rsi -; SSE4-NEXT: movq %rsi, %rcx +; SSE4-NEXT: movsbq %dl, %rbx +; SSE4-NEXT: movq %rbx, %rcx ; SSE4-NEXT: sarq $63, %rcx ; SSE4-NEXT: addb %r11b, %r11b ; SSE4-NEXT: sarb %r11b @@ -2014,61 +2007,55 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; SSE4-NEXT: movsbq %r11b, %rdi ; SSE4-NEXT: movq %rdi, %rbp ; SSE4-NEXT: sarq $63, %rbp -; SSE4-NEXT: movl %ebp, 96(%rbx) -; SSE4-NEXT: movq %rbp, %rax -; SSE4-NEXT: shldq $62, %rdi, %rax +; SSE4-NEXT: movl %ebp, 96(%rax) ; SSE4-NEXT: movabsq $2251799813685247, %r11 # imm = 0x7FFFFFFFFFFFF ; SSE4-NEXT: andq %rbp, %r11 +; SSE4-NEXT: shldq $62, %rdi, %rbp +; SSE4-NEXT: movq %rbp, 88(%rax) ; SSE4-NEXT: movq %r10, %rbp ; SSE4-NEXT: shldq $20, %r9, %rbp -; SSE4-NEXT: movq %rax, 88(%rbx) -; SSE4-NEXT: movq %r8, %rax -; SSE4-NEXT: shldq $31, %rdx, %rax -; SSE4-NEXT: movq %rbp, 64(%rbx) +; SSE4-NEXT: movq %rbp, 64(%rax) +; SSE4-NEXT: movq %r8, %rbp +; SSE4-NEXT: shldq $31, %rdx, %rbp +; SSE4-NEXT: movq %rbp, 48(%rax) ; SSE4-NEXT: movq %rcx, %rbp -; SSE4-NEXT: shldq $42, %rsi, %rbp -; SSE4-NEXT: movq %rax, 48(%rbx) -; SSE4-NEXT: movq %rbp, 32(%rbx) -; SSE4-NEXT: movabsq $9007199254738944, %rax # imm = 0x1FFFFFFFFFF800 -; SSE4-NEXT: andq %r13, %rax +; SSE4-NEXT: shldq $42, %rbx, %rbp +; SSE4-NEXT: movq %rbp, 32(%rax) +; SSE4-NEXT: movabsq $9007199254738944, %rbp # imm = 0x1FFFFFFFFFF800 +; SSE4-NEXT: andq %r13, %rbp ; SSE4-NEXT: shldq $53, %r12, %r13 -; SSE4-NEXT: movq %r13, 16(%rbx) +; SSE4-NEXT: movq %r13, 16(%rax) ; SSE4-NEXT: movq %r11, %r13 ; SSE4-NEXT: shrq $48, %r13 -; SSE4-NEXT: movb 
%r13b, 102(%rbx) +; SSE4-NEXT: movb %r13b, 102(%rax) ; SSE4-NEXT: shrq $32, %r11 -; SSE4-NEXT: movabsq $9007199254740991, %r13 # imm = 0x1FFFFFFFFFFFFF -; SSE4-NEXT: andq %r13, %r15 -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE4-NEXT: shldq $9, %rbp, %r15 -; SSE4-NEXT: movw %r11w, 100(%rbx) +; SSE4-NEXT: movw %r11w, 100(%rax) +; SSE4-NEXT: movabsq $9007199254740991, %r11 # imm = 0x1FFFFFFFFFFFFF +; SSE4-NEXT: andq %r11, %r15 +; SSE4-NEXT: shldq $9, %rsi, %r15 ; SSE4-NEXT: shlq $62, %rdi ; SSE4-NEXT: orq %r15, %rdi -; SSE4-NEXT: movq %rdi, 80(%rbx) -; SSE4-NEXT: andq %r13, %r14 +; SSE4-NEXT: movq %rdi, 80(%rax) +; SSE4-NEXT: andq %r11, %r14 ; SSE4-NEXT: shlq $53, %r12 ; SSE4-NEXT: orq %r14, %r12 -; SSE4-NEXT: movq %r12, 8(%rbx) -; SSE4-NEXT: shlq $42, %rsi -; SSE4-NEXT: shrq $11, %rax -; SSE4-NEXT: orq %rsi, %rax -; SSE4-NEXT: movq %rax, 24(%rbx) -; SSE4-NEXT: shlq $9, %rbp -; SSE4-NEXT: shrq $44, %r10 +; SSE4-NEXT: movq %r12, 8(%rax) +; SSE4-NEXT: shlq $42, %rbx +; SSE4-NEXT: shrq $11, %rbp +; SSE4-NEXT: orq %rbx, %rbp +; SSE4-NEXT: movq %rbp, 24(%rax) +; SSE4-NEXT: shlq $9, %rsi ; SSE4-NEXT: andl $511, %r10d # imm = 0x1FF -; SSE4-NEXT: orq %rbp, %r10 -; SSE4-NEXT: movq %r10, 72(%rbx) +; SSE4-NEXT: orq %rsi, %r10 +; SSE4-NEXT: movq %r10, 72(%rax) ; SSE4-NEXT: shlq $20, %r9 -; SSE4-NEXT: shrq $33, %r8 ; SSE4-NEXT: andl $1048575, %r8d # imm = 0xFFFFF ; SSE4-NEXT: orq %r9, %r8 -; SSE4-NEXT: movq %r8, 56(%rbx) +; SSE4-NEXT: movq %r8, 56(%rax) ; SSE4-NEXT: shlq $31, %rdx -; SSE4-NEXT: shrq $22, %rcx ; SSE4-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF ; SSE4-NEXT: orq %rdx, %rcx -; SSE4-NEXT: movq %rcx, 40(%rbx) -; SSE4-NEXT: movq %rbx, %rax +; SSE4-NEXT: movq %rcx, 40(%rax) ; SSE4-NEXT: popq %rbx ; SSE4-NEXT: popq %r12 ; SSE4-NEXT: popq %r13 @@ -2114,8 +2101,8 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; AVX-NEXT: setg %r15b ; AVX-NEXT: subb %sil, %r15b ; AVX-NEXT: movsbq %r15b, %rsi -; AVX-NEXT: movq %rsi, %r15 -; AVX-NEXT: sarq $63, %r15 +; AVX-NEXT: movq %rsi, %r12 +; AVX-NEXT: sarq $63, %r12 ; AVX-NEXT: addb %bpl, %bpl ; AVX-NEXT: sarb %bpl ; AVX-NEXT: addb %dl, %dl @@ -2124,8 +2111,8 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; AVX-NEXT: setl %dl ; AVX-NEXT: setg %bpl ; AVX-NEXT: subb %dl, %bpl -; AVX-NEXT: movsbq %bpl, %r12 -; AVX-NEXT: movq %r12, %r13 +; AVX-NEXT: movsbq %bpl, %r15 +; AVX-NEXT: movq %r15, %r13 ; AVX-NEXT: sarq $63, %r13 ; AVX-NEXT: addb %bl, %bl ; AVX-NEXT: sarb %bl @@ -2186,9 +2173,9 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; AVX-NEXT: movq %rcx, %rbp ; AVX-NEXT: shldq $42, %rbx, %rbp ; AVX-NEXT: movq %rbp, 32(%rax) -; AVX-NEXT: movl $53, %ebp +; AVX-NEXT: movb $42, %bpl ; AVX-NEXT: bzhiq %rbp, %r13, %rbp -; AVX-NEXT: shldq $53, %r12, %r13 +; AVX-NEXT: shldq $53, %r15, %r13 ; AVX-NEXT: movq %r13, 16(%rax) ; AVX-NEXT: movq %r11, %r13 ; AVX-NEXT: shrq $48, %r13 @@ -2196,31 +2183,27 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; AVX-NEXT: shrq $32, %r11 ; AVX-NEXT: movw %r11w, 100(%rax) ; AVX-NEXT: movb $53, %r11b -; AVX-NEXT: bzhiq %r11, %r15, %r15 -; AVX-NEXT: shldq $9, %rsi, %r15 +; AVX-NEXT: bzhiq %r11, %r12, %r12 +; AVX-NEXT: shldq $9, %rsi, %r12 ; AVX-NEXT: shlq $62, %rdi -; AVX-NEXT: orq %r15, %rdi +; AVX-NEXT: orq %r12, %rdi ; AVX-NEXT: movq %rdi, 80(%rax) -; AVX-NEXT: bzhiq %r11, %r14, %rdi -; AVX-NEXT: shlq $53, %r12 -; AVX-NEXT: orq %rdi, %r12 -; AVX-NEXT: movq %r12, 8(%rax) ; 
AVX-NEXT: shlq $42, %rbx -; AVX-NEXT: shrq $11, %rbp -; AVX-NEXT: orq %rbx, %rbp -; AVX-NEXT: movq %rbp, 24(%rax) +; AVX-NEXT: orq %rbp, %rbx +; AVX-NEXT: movq %rbx, 24(%rax) +; AVX-NEXT: bzhiq %r11, %r14, %rdi +; AVX-NEXT: shlq $53, %r15 +; AVX-NEXT: orq %rdi, %r15 +; AVX-NEXT: movq %r15, 8(%rax) ; AVX-NEXT: shlq $9, %rsi -; AVX-NEXT: shrq $44, %r10 ; AVX-NEXT: andl $511, %r10d # imm = 0x1FF ; AVX-NEXT: orq %rsi, %r10 ; AVX-NEXT: movq %r10, 72(%rax) ; AVX-NEXT: shlq $20, %r9 -; AVX-NEXT: shrq $33, %r8 ; AVX-NEXT: andl $1048575, %r8d # imm = 0xFFFFF ; AVX-NEXT: orq %r9, %r8 ; AVX-NEXT: movq %r8, 56(%rax) ; AVX-NEXT: shlq $31, %rdx -; AVX-NEXT: shrq $22, %rcx ; AVX-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF ; AVX-NEXT: orq %rdx, %rcx ; AVX-NEXT: movq %rcx, 40(%rax) @@ -2263,170 +2246,164 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; X86-NEXT: addb %al, %al ; X86-NEXT: sarb %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movb {{[0-9]+}}(%esp), %dh +; X86-NEXT: addb %dh, %dh +; X86-NEXT: sarb %dh +; X86-NEXT: movb {{[0-9]+}}(%esp), %dl +; X86-NEXT: addb %dl, %dl +; X86-NEXT: sarb %dl +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addb %al, %al +; X86-NEXT: sarb %al +; X86-NEXT: movb {{[0-9]+}}(%esp), %ah +; X86-NEXT: addb %ah, %ah +; X86-NEXT: sarb %ah +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addb %cl, %cl +; X86-NEXT: sarb %cl +; X86-NEXT: movb {{[0-9]+}}(%esp), %ch +; X86-NEXT: addb %ch, %ch +; X86-NEXT: sarb %ch ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: addb %bl, %bl ; X86-NEXT: sarb %bl ; X86-NEXT: movb {{[0-9]+}}(%esp), %bh ; X86-NEXT: addb %bh, %bh ; X86-NEXT: sarb %bh -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: addb %dl, %dl -; X86-NEXT: sarb %dl -; X86-NEXT: movb {{[0-9]+}}(%esp), %ch -; X86-NEXT: addb %ch, %ch -; X86-NEXT: sarb %ch -; X86-NEXT: movb {{[0-9]+}}(%esp), %cl -; X86-NEXT: addb %cl, %cl -; X86-NEXT: sarb %cl -; X86-NEXT: movb {{[0-9]+}}(%esp), %ah -; X86-NEXT: addb %ah, %ah -; X86-NEXT: sarb %ah -; X86-NEXT: movb {{[0-9]+}}(%esp), %al -; X86-NEXT: addb %al, %al -; X86-NEXT: sarb %al -; X86-NEXT: movb {{[0-9]+}}(%esp), %dh -; X86-NEXT: addb %dh, %dh -; X86-NEXT: sarb %dh -; X86-NEXT: cmpb %al, %dh -; X86-NEXT: setl %al -; X86-NEXT: setg %dh -; X86-NEXT: subb %al, %dh -; X86-NEXT: movsbl %dh, %esi +; X86-NEXT: cmpb %bl, %bh +; X86-NEXT: setl %bl +; X86-NEXT: setg %bh +; X86-NEXT: subb %bl, %bh +; X86-NEXT: movsbl %bh, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sarl $31, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: andl $2097151, %esi # imm = 0x1FFFFF ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpb %cl, %ah +; X86-NEXT: cmpb %cl, %ch +; X86-NEXT: setl %cl +; X86-NEXT: setg %ch +; X86-NEXT: subb %cl, %ch +; X86-NEXT: movsbl %ch, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: andl $2097151, %ecx # imm = 0x1FFFFF +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpb %al, %ah ; X86-NEXT: setl %al ; X86-NEXT: setg %cl ; X86-NEXT: subb %al, %cl -; X86-NEXT: movsbl %cl, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movsbl %cl, 
%ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %ecx, (%edi) +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl $2097151, %eax # imm = 0x1FFFFF ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpb %dl, %ch +; X86-NEXT: cmpb %dh, %dl ; X86-NEXT: setl %al -; X86-NEXT: setg %cl -; X86-NEXT: subb %al, %cl -; X86-NEXT: movsbl %cl, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, (%ecx) -; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: andl $2097151, %ecx # imm = 0x1FFFFF -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpb %bl, %bh -; X86-NEXT: setl %cl ; X86-NEXT: setg %dl -; X86-NEXT: subb %cl, %dl -; X86-NEXT: movsbl %dl, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sarl $31, %edi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload -; X86-NEXT: setl %cl +; X86-NEXT: subb %al, %dl +; X86-NEXT: movsbl %dl, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sarl $31, %ebp +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload +; X86-NEXT: setl %al ; X86-NEXT: setg %dl -; X86-NEXT: subb %cl, %dl -; X86-NEXT: movsbl %dl, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sarl $31, %edx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload -; X86-NEXT: setl %cl -; X86-NEXT: setg %ch -; X86-NEXT: subb %cl, %ch -; X86-NEXT: movsbl %ch, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload -; X86-NEXT: setl %cl -; X86-NEXT: setg %ch -; X86-NEXT: subb %cl, %ch -; X86-NEXT: movsbl %ch, %esi +; X86-NEXT: subb %al, %dl +; X86-NEXT: movsbl %dl, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %esi, 96(%ecx) -; X86-NEXT: movl %esi, 92(%ecx) -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %ebp, 80(%ecx) -; X86-NEXT: movl %ebx, 68(%ecx) -; X86-NEXT: movl %ebx, 64(%ecx) -; X86-NEXT: movl %edx, 52(%ecx) -; X86-NEXT: movl %edx, 48(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: movl %ebp, 36(%ecx) -; X86-NEXT: movl %edi, 24(%ecx) -; X86-NEXT: movl %edi, 20(%ecx) -; X86-NEXT: movl %eax, 8(%ecx) -; X86-NEXT: movl %eax, 4(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movw %ax, 100(%ecx) +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload +; X86-NEXT: setl %al +; X86-NEXT: setg %dl +; X86-NEXT: subb %al, %dl +; X86-NEXT: movsbl %dl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload +; X86-NEXT: cmpb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload +; X86-NEXT: setl %dl +; X86-NEXT: setg %dh +; X86-NEXT: subb %dl, %dh +; X86-NEXT: movsbl %dh, %ebx +; X86-NEXT: movl %ebx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: movl %ebx, 96(%edi) +; X86-NEXT: movl %ebx, 92(%edi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, 80(%edi) +; X86-NEXT: movl %eax, 68(%edi) +; X86-NEXT: movl %eax, 64(%edi) +; X86-NEXT: movl %esi, 52(%edi) +; X86-NEXT: movl %esi, 48(%edi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, 36(%edi) +; X86-NEXT: movl %ebp, 24(%edi) +; X86-NEXT: movl %ebp, 20(%edi) +; X86-NEXT: movl %ecx, 8(%edi) +; X86-NEXT: movl %ecx, 4(%edi) +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movw %cx, 100(%edi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $30, %edx, %ecx +; X86-NEXT: movl %ecx, 88(%edi) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $30, %ecx, %eax -; X86-NEXT: movl %eax, 88(%esi) +; X86-NEXT: shldl $9, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $9, %edx, %ecx +; X86-NEXT: movl %ecx, 76(%edi) +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $20, %edx, %ecx +; X86-NEXT: movl %ecx, 60(%edi) +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $31, %edx, %ecx +; X86-NEXT: movl %ecx, 44(%edi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shldl $10, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $10, %edx, %ecx +; X86-NEXT: movl %ecx, 32(%edi) +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $21, %ebx, %ecx +; X86-NEXT: movl %ecx, 16(%edi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shll $9, %ecx +; X86-NEXT: andl $511, %eax # imm = 0x1FF +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, 72(%edi) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl $9, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: shldl $9, %ebp, %eax -; X86-NEXT: movl %eax, 76(%esi) -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: shldl $20, %ebp, %eax -; X86-NEXT: movl %eax, 60(%esi) -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: shldl $31, %ebp, %eax -; X86-NEXT: movl %eax, 44(%esi) +; X86-NEXT: shll $20, %eax +; X86-NEXT: andl $1048575, %esi # imm = 0xFFFFF +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl %esi, 56(%edi) +; X86-NEXT: shll $10, %edx +; X86-NEXT: andl $1023, %ebp # imm = 0x3FF +; X86-NEXT: orl %edx, %ebp +; X86-NEXT: movl %ebp, 28(%edi) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl $10, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: shldl $10, %ebp, %eax -; X86-NEXT: movl %eax, 32(%esi) -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: shldl $21, %ebp, %eax -; X86-NEXT: movl %eax, 16(%esi) -; X86-NEXT: shll $21, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; 
X86-NEXT: movl %ebp, 12(%esi) +; X86-NEXT: shll $21, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, 12(%edi) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: andl $7, %eax -; X86-NEXT: movb %al, 102(%esi) +; X86-NEXT: movb %al, 102(%edi) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shll $30, %eax ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, 84(%esi) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shll $9, %eax -; X86-NEXT: shrl $12, %ebx -; X86-NEXT: andl $511, %ebx # imm = 0x1FF -; X86-NEXT: orl %eax, %ebx -; X86-NEXT: movl %ebx, 72(%esi) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shll $20, %eax -; X86-NEXT: shrl %edx -; X86-NEXT: andl $1048575, %edx # imm = 0xFFFFF -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, 56(%esi) +; X86-NEXT: movl %eax, 84(%edi) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shll $31, %eax ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, 40(%esi) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shll $10, %eax -; X86-NEXT: shrl $11, %edi -; X86-NEXT: andl $1023, %edi # imm = 0x3FF -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl %edi, 28(%esi) -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %eax, 40(%edi) +; X86-NEXT: movl %edi, %eax ; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi
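
The TargetLowering.cpp hunk above relies on a bit-level identity: when every demanded bit of a right-shift result lies inside the sign-extended region of the shift source, and no demanded bit can be one of the shifted-in zeroes (the hunk's own comment, so this sits in the logical right-shift handling), the shift result and the shift source agree on all demanded bits, and SimplifyDemandedBits may return the source directly. The listing below is an illustrative, standalone sanity check of that identity on a scalar uint32_t stand-in for i32; it is not part of the patch, numSignBits is a hypothetical analogue of SelectionDAG::ComputeNumSignBits, and the patch itself additionally requires a proven upper bound on the shift amount (getValidMaximumShiftAmount) so that the fold holds for every value the shift amount can take.

// sign_bit_shift_fold_check.cpp -- brute-force check of the identity used by
// the new SimplifyDemandedBits fold, on uint32_t as a stand-in for i32.
// This is an illustrative sketch, not code from the patch.
// Build: clang++ -std=c++20 sign_bit_shift_fold_check.cpp
#include <bit>
#include <cassert>
#include <cstdint>

// Scalar analogue of ComputeNumSignBits: how many of the top bits of X are
// copies of its sign bit (bit 31).
static unsigned numSignBits(uint32_t X) {
  return static_cast<unsigned>(std::countl_zero((X >> 31) ? ~X : X));
}

int main() {
  for (uint64_t Seed = 0; Seed < 1000000; ++Seed) {
    // Pseudo-random source value and demanded-bits mask.
    uint32_t X = static_cast<uint32_t>((Seed * 0x9E3779B97F4A7C15ULL) >> 32);
    uint32_t Demanded =
        static_cast<uint32_t>((Seed * 0xD1B54A32D192ED03ULL) >> 32);
    for (unsigned ShAmt = 0; ShAmt < 32; ++ShAmt) {
      // Precondition mirrored from the patch: the top ShAmt bits (the
      // shifted-in zeroes of a logical right shift) are not demanded...
      if (static_cast<unsigned>(std::countl_zero(Demanded)) < ShAmt)
        continue;
      // ...and every demanded bit falls inside the sign-bit region of X.
      if (static_cast<unsigned>(std::countr_zero(Demanded)) <
          32 - numSignBits(X))
        continue;
      // Under those preconditions the shifted value and the shift source
      // agree on all demanded bits, so the shift can be removed.
      assert(((X >> ShAmt) & Demanded) == (X & Demanded));
    }
  }
  return 0;
}

The removed shifts are the visible payoff in the regenerated tests: the X86 scmp checks drop the shrq $44 / shrq $33 / shrq $22 (and the 32-bit shrl) instructions that previously fed plain and masks, and the NVPTX <4 x i8> tests re-lower the now-simpler byte-insert chain with prmt.b32 permutes instead of bfi.b32 field inserts.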