diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c97300d64d455..fbc63d8eb6d40 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10197,6 +10197,27 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
     }
   }
 
+  // fold (not (or A, (or B, C))) -> (and (not A), (and (not B), (not C)))
+  if (TLI.hasAndNot(SDValue(N, 0))) {
+    // If the target has an AndNot instruction, it is profitable to apply
+    // De Morgan's law here so that each inverted operand can be folded into
+    // that instruction.
+    SDValue A, B, C;
+    APInt Cst;
+    if (sd_match(N, m_Xor(m_Or(m_Value(A), m_Or(m_Value(B), m_Value(C))),
+                          m_ConstInt(Cst))) &&
+        Cst.isAllOnes()) {
+      SDValue AllOnes = DAG.getConstant(Cst, DL, VT);
+
+      SDValue NotA = DAG.getNode(ISD::XOR, DL, VT, A, AllOnes);
+      SDValue NotB = DAG.getNode(ISD::XOR, DL, VT, B, AllOnes);
+      SDValue NotC = DAG.getNode(ISD::XOR, DL, VT, C, AllOnes);
+
+      return DAG.getNode(ISD::AND, DL, VT, NotA,
+                         DAG.getNode(ISD::AND, DL, VT, NotB, NotC));
+    }
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a0b64ff370b10..16f6d31728717 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -55615,11 +55615,6 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
 
   // Folds for better commutativity:
   if (N1->hasOneUse()) {
-    // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
-    if (SDValue Not = IsNOT(N1, DAG))
-      return DAG.getNOT(
-          DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
-
     // ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
     // Zero out elements by setting the PSHUFB mask value to 0xFF.
     if (DAG.ComputeNumSignBits(N0) == EltSizeInBits) {
diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll
index 04124609eec74..f459cc2d78442 100644
--- a/llvm/test/CodeGen/AArch64/ctlz.ll
+++ b/llvm/test/CodeGen/AArch64/ctlz.ll
@@ -276,18 +276,23 @@ define <2 x i64> @v2i64(<2 x i64> %d) {
 ; CHECK-SD-LABEL: v2i64:
 ; CHECK-SD: // %bb.0: // %entry
 ; CHECK-SD-NEXT: ushr v1.2d, v0.2d, #1
-; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-SD-NEXT: ushr v1.2d, v0.2d, #2
-; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-SD-NEXT: ushr v1.2d, v0.2d, #4
-; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-SD-NEXT: ushr v1.2d, v0.2d, #8
-; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-SD-NEXT: ushr v1.2d, v0.2d, #16
-; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-SD-NEXT: ushr v1.2d, v0.2d, #32
-; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: orr v2.16b, v0.16b, v1.16b
 ; CHECK-SD-NEXT: mvn v0.16b, v0.16b
+; CHECK-SD-NEXT: ushr v3.2d, v2.2d, #2
+; CHECK-SD-NEXT: bic v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: orr v2.16b, v2.16b, v3.16b
+; CHECK-SD-NEXT: bic v0.16b, v0.16b, v3.16b
+; CHECK-SD-NEXT: ushr v4.2d, v2.2d, #4
+; CHECK-SD-NEXT: orr v2.16b, v2.16b, v4.16b
+; CHECK-SD-NEXT: bic v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT: ushr v1.2d, v2.2d, #8
+; CHECK-SD-NEXT: orr v2.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT: bic v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: ushr v3.2d, v2.2d, #16
+; CHECK-SD-NEXT: orr v1.16b, v2.16b, v3.16b
+; CHECK-SD-NEXT: bic v0.16b, v0.16b, v3.16b
+; CHECK-SD-NEXT: ushr v1.2d, v1.2d, #32
+; CHECK-SD-NEXT: bic v0.16b, v0.16b, v1.16b
 ; CHECK-SD-NEXT: cnt v0.16b, v0.16b
 ; CHECK-SD-NEXT: uaddlp v0.8h, v0.16b
 ; CHECK-SD-NEXT: uaddlp v0.4s, v0.8h
@@ -314,34 +319,44 @@ define <3 x i64> @v3i64(<3 x
i64> %d) { ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-SD-NEXT: ushr v4.2d, v2.2d, #1 ; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: orr v6.16b, v2.16b, v4.16b +; CHECK-SD-NEXT: mvn v2.16b, v2.16b ; CHECK-SD-NEXT: ushr v1.2d, v0.2d, #1 -; CHECK-SD-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-SD-NEXT: ushr v1.2d, v2.2d, #1 -; CHECK-SD-NEXT: ushr v3.2d, v0.2d, #2 -; CHECK-SD-NEXT: orr v1.16b, v2.16b, v1.16b -; CHECK-SD-NEXT: orr v0.16b, v0.16b, v3.16b -; CHECK-SD-NEXT: ushr v2.2d, v1.2d, #2 -; CHECK-SD-NEXT: ushr v3.2d, v0.2d, #4 -; CHECK-SD-NEXT: orr v1.16b, v1.16b, v2.16b -; CHECK-SD-NEXT: orr v0.16b, v0.16b, v3.16b -; CHECK-SD-NEXT: ushr v2.2d, v1.2d, #4 -; CHECK-SD-NEXT: ushr v3.2d, v0.2d, #8 -; CHECK-SD-NEXT: orr v1.16b, v1.16b, v2.16b -; CHECK-SD-NEXT: orr v0.16b, v0.16b, v3.16b -; CHECK-SD-NEXT: ushr v2.2d, v1.2d, #8 -; CHECK-SD-NEXT: ushr v3.2d, v0.2d, #16 -; CHECK-SD-NEXT: orr v1.16b, v1.16b, v2.16b -; CHECK-SD-NEXT: orr v0.16b, v0.16b, v3.16b -; CHECK-SD-NEXT: ushr v2.2d, v1.2d, #16 -; CHECK-SD-NEXT: ushr v3.2d, v0.2d, #32 -; CHECK-SD-NEXT: orr v1.16b, v1.16b, v2.16b -; CHECK-SD-NEXT: orr v0.16b, v0.16b, v3.16b -; CHECK-SD-NEXT: ushr v2.2d, v1.2d, #32 +; CHECK-SD-NEXT: ushr v7.2d, v6.2d, #2 +; CHECK-SD-NEXT: bic v2.16b, v2.16b, v4.16b +; CHECK-SD-NEXT: orr v3.16b, v0.16b, v1.16b ; CHECK-SD-NEXT: mvn v0.16b, v0.16b -; CHECK-SD-NEXT: orr v1.16b, v1.16b, v2.16b +; CHECK-SD-NEXT: orr v6.16b, v6.16b, v7.16b +; CHECK-SD-NEXT: bic v2.16b, v2.16b, v7.16b +; CHECK-SD-NEXT: ushr v5.2d, v3.2d, #2 +; CHECK-SD-NEXT: bic v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ushr v17.2d, v6.2d, #4 +; CHECK-SD-NEXT: orr v3.16b, v3.16b, v5.16b +; CHECK-SD-NEXT: bic v0.16b, v0.16b, v5.16b +; CHECK-SD-NEXT: orr v6.16b, v6.16b, v17.16b +; CHECK-SD-NEXT: bic v2.16b, v2.16b, v17.16b +; CHECK-SD-NEXT: ushr v16.2d, v3.2d, #4 +; CHECK-SD-NEXT: ushr v4.2d, v6.2d, #8 +; CHECK-SD-NEXT: orr v3.16b, v3.16b, v16.16b +; CHECK-SD-NEXT: bic v0.16b, v0.16b, v16.16b +; CHECK-SD-NEXT: orr v6.16b, v6.16b, v4.16b +; CHECK-SD-NEXT: bic v2.16b, v2.16b, v4.16b +; CHECK-SD-NEXT: ushr v1.2d, v3.2d, #8 +; CHECK-SD-NEXT: orr v3.16b, v3.16b, v1.16b +; CHECK-SD-NEXT: bic v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ushr v5.2d, v3.2d, #16 +; CHECK-SD-NEXT: orr v1.16b, v3.16b, v5.16b +; CHECK-SD-NEXT: ushr v3.2d, v6.2d, #16 +; CHECK-SD-NEXT: bic v0.16b, v0.16b, v5.16b +; CHECK-SD-NEXT: ushr v1.2d, v1.2d, #32 +; CHECK-SD-NEXT: orr v4.16b, v6.16b, v3.16b +; CHECK-SD-NEXT: bic v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: bic v1.16b, v2.16b, v3.16b +; CHECK-SD-NEXT: ushr v2.2d, v4.2d, #32 ; CHECK-SD-NEXT: cnt v0.16b, v0.16b -; CHECK-SD-NEXT: mvn v1.16b, v1.16b +; CHECK-SD-NEXT: bic v1.16b, v1.16b, v2.16b ; CHECK-SD-NEXT: cnt v1.16b, v1.16b ; CHECK-SD-NEXT: uaddlp v0.8h, v0.16b ; CHECK-SD-NEXT: uaddlp v0.4s, v0.8h @@ -377,30 +392,40 @@ define <4 x i64> @v4i64(<4 x i64> %d) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: ushr v2.2d, v0.2d, #1 ; CHECK-SD-NEXT: ushr v3.2d, v1.2d, #1 -; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b -; CHECK-SD-NEXT: orr v1.16b, v1.16b, v3.16b -; CHECK-SD-NEXT: ushr v2.2d, v0.2d, #2 -; CHECK-SD-NEXT: ushr v3.2d, v1.2d, #2 -; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b -; CHECK-SD-NEXT: orr v1.16b, v1.16b, v3.16b -; CHECK-SD-NEXT: ushr v2.2d, v0.2d, #4 -; CHECK-SD-NEXT: ushr v3.2d, v1.2d, #4 -; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b -; CHECK-SD-NEXT: orr v1.16b, v1.16b, v3.16b -; CHECK-SD-NEXT: ushr 
v2.2d, v0.2d, #8 -; CHECK-SD-NEXT: ushr v3.2d, v1.2d, #8 -; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b -; CHECK-SD-NEXT: orr v1.16b, v1.16b, v3.16b -; CHECK-SD-NEXT: ushr v2.2d, v0.2d, #16 -; CHECK-SD-NEXT: ushr v3.2d, v1.2d, #16 -; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b -; CHECK-SD-NEXT: orr v1.16b, v1.16b, v3.16b -; CHECK-SD-NEXT: ushr v2.2d, v0.2d, #32 -; CHECK-SD-NEXT: ushr v3.2d, v1.2d, #32 -; CHECK-SD-NEXT: orr v0.16b, v0.16b, v2.16b -; CHECK-SD-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: orr v4.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: orr v5.16b, v1.16b, v3.16b ; CHECK-SD-NEXT: mvn v0.16b, v0.16b ; CHECK-SD-NEXT: mvn v1.16b, v1.16b +; CHECK-SD-NEXT: ushr v6.2d, v4.2d, #2 +; CHECK-SD-NEXT: ushr v7.2d, v5.2d, #2 +; CHECK-SD-NEXT: bic v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: bic v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: orr v4.16b, v4.16b, v6.16b +; CHECK-SD-NEXT: orr v5.16b, v5.16b, v7.16b +; CHECK-SD-NEXT: bic v0.16b, v0.16b, v6.16b +; CHECK-SD-NEXT: bic v1.16b, v1.16b, v7.16b +; CHECK-SD-NEXT: ushr v16.2d, v4.2d, #4 +; CHECK-SD-NEXT: ushr v17.2d, v5.2d, #4 +; CHECK-SD-NEXT: orr v4.16b, v4.16b, v16.16b +; CHECK-SD-NEXT: orr v5.16b, v5.16b, v17.16b +; CHECK-SD-NEXT: bic v0.16b, v0.16b, v16.16b +; CHECK-SD-NEXT: bic v1.16b, v1.16b, v17.16b +; CHECK-SD-NEXT: ushr v2.2d, v4.2d, #8 +; CHECK-SD-NEXT: ushr v3.2d, v5.2d, #8 +; CHECK-SD-NEXT: orr v4.16b, v4.16b, v2.16b +; CHECK-SD-NEXT: orr v5.16b, v5.16b, v3.16b +; CHECK-SD-NEXT: bic v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: bic v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: ushr v6.2d, v4.2d, #16 +; CHECK-SD-NEXT: ushr v7.2d, v5.2d, #16 +; CHECK-SD-NEXT: orr v2.16b, v4.16b, v6.16b +; CHECK-SD-NEXT: orr v3.16b, v5.16b, v7.16b +; CHECK-SD-NEXT: bic v0.16b, v0.16b, v6.16b +; CHECK-SD-NEXT: bic v1.16b, v1.16b, v7.16b +; CHECK-SD-NEXT: ushr v2.2d, v2.2d, #32 +; CHECK-SD-NEXT: ushr v3.2d, v3.2d, #32 +; CHECK-SD-NEXT: bic v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: bic v1.16b, v1.16b, v3.16b ; CHECK-SD-NEXT: cnt v0.16b, v0.16b ; CHECK-SD-NEXT: cnt v1.16b, v1.16b ; CHECK-SD-NEXT: uaddlp v0.8h, v0.16b diff --git a/llvm/test/CodeGen/AArch64/eon.ll b/llvm/test/CodeGen/AArch64/eon.ll index 8b31cbfe16b1a..f939b4901be09 100644 --- a/llvm/test/CodeGen/AArch64/eon.ll +++ b/llvm/test/CodeGen/AArch64/eon.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s ; RUN: llc %s -pass-remarks-missed=gisel* -mtriple=aarch64-none-linux-gnu -global-isel -o - 2>&1 | FileCheck %s @@ -6,8 +7,9 @@ ; Check that the eon instruction is generated instead of eor,movn define i64 @test1(i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: test1: -; CHECK: eon -; CHECK: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eon x0, x0, x1, lsl #4 +; CHECK-NEXT: ret entry: %shl = shl i64 %b, 4 %neg = xor i64 %a, -1 @@ -18,10 +20,11 @@ entry: ; Same check with multiple uses of %neg define i64 @test2(i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: test2: -; CHECK: eon -; CHECK: eon -; CHECK: lsl -; CHECK: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eon x8, x0, x1, lsl #4 +; CHECK-NEXT: eon x9, x2, x1, lsl #4 +; CHECK-NEXT: lsl x0, x8, x9 +; CHECK-NEXT: ret entry: %shl = shl i64 %b, 4 %neg = xor i64 %shl, -1 @@ -34,8 +37,9 @@ entry: ; Check that eon is generated if the xor is a disjoint or. 
define i64 @disjoint_or(i64 %a, i64 %b) { ; CHECK-LABEL: disjoint_or: -; CHECK: eon -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: eon x0, x0, x1 +; CHECK-NEXT: ret %or = or disjoint i64 %a, %b %eon = xor i64 %or, -1 ret i64 %eon @@ -44,9 +48,10 @@ define i64 @disjoint_or(i64 %a, i64 %b) { ; Check that eon is *not* generated if the or is not disjoint. define i64 @normal_or(i64 %a, i64 %b) { ; CHECK-LABEL: normal_or: -; CHECK: orr -; CHECK: mvn -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: orr x8, x0, x1 +; CHECK-NEXT: mvn x0, x8 +; CHECK-NEXT: ret %or = or i64 %a, %b %not = xor i64 %or, -1 ret i64 %not diff --git a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll index 27be02c50f1c7..4c5eab036dbb4 100644 --- a/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll +++ b/llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll @@ -21,13 +21,15 @@ define i8 @test_ctlz_i8(i8 %a) nounwind { ; LA32R: # %bb.0: ; LA32R-NEXT: andi $a1, $a0, 254 ; LA32R-NEXT: srli.w $a1, $a1, 1 +; LA32R-NEXT: nor $a2, $a0, $a1 ; LA32R-NEXT: or $a0, $a0, $a1 ; LA32R-NEXT: andi $a1, $a0, 252 ; LA32R-NEXT: srli.w $a1, $a1, 2 +; LA32R-NEXT: andn $a2, $a2, $a1 ; LA32R-NEXT: or $a0, $a0, $a1 -; LA32R-NEXT: andi $a1, $a0, 240 -; LA32R-NEXT: srli.w $a1, $a1, 4 -; LA32R-NEXT: nor $a0, $a0, $a1 +; LA32R-NEXT: andi $a0, $a0, 240 +; LA32R-NEXT: srli.w $a0, $a0, 4 +; LA32R-NEXT: andn $a0, $a2, $a0 ; LA32R-NEXT: srli.w $a1, $a0, 1 ; LA32R-NEXT: andi $a1, $a1, 85 ; LA32R-NEXT: sub.w $a0, $a0, $a1 @@ -60,23 +62,28 @@ define i8 @test_ctlz_i8(i8 %a) nounwind { define i16 @test_ctlz_i16(i16 %a) nounwind { ; LA32R-LABEL: test_ctlz_i16: ; LA32R: # %bb.0: +; LA32R-NEXT: srli.w $a1, $a0, 1 +; LA32R-NEXT: lu12i.w $a2, 7 +; LA32R-NEXT: ori $a2, $a2, 4095 +; LA32R-NEXT: and $a1, $a1, $a2 +; LA32R-NEXT: nor $a2, $a0, $zero +; LA32R-NEXT: andn $a2, $a2, $a1 +; LA32R-NEXT: or $a0, $a0, $a1 ; LA32R-NEXT: lu12i.w $a1, 15 -; LA32R-NEXT: ori $a2, $a1, 4094 -; LA32R-NEXT: and $a2, $a0, $a2 -; LA32R-NEXT: srli.w $a2, $a2, 1 -; LA32R-NEXT: or $a0, $a0, $a2 -; LA32R-NEXT: ori $a2, $a1, 4092 -; LA32R-NEXT: and $a2, $a0, $a2 -; LA32R-NEXT: srli.w $a2, $a2, 2 -; LA32R-NEXT: or $a0, $a0, $a2 -; LA32R-NEXT: ori $a2, $a1, 4080 -; LA32R-NEXT: and $a2, $a0, $a2 -; LA32R-NEXT: srli.w $a2, $a2, 4 -; LA32R-NEXT: or $a0, $a0, $a2 +; LA32R-NEXT: ori $a3, $a1, 4092 +; LA32R-NEXT: and $a3, $a0, $a3 +; LA32R-NEXT: srli.w $a3, $a3, 2 +; LA32R-NEXT: andn $a2, $a2, $a3 +; LA32R-NEXT: or $a0, $a0, $a3 +; LA32R-NEXT: ori $a3, $a1, 4080 +; LA32R-NEXT: and $a3, $a0, $a3 +; LA32R-NEXT: srli.w $a3, $a3, 4 +; LA32R-NEXT: andn $a2, $a2, $a3 +; LA32R-NEXT: or $a0, $a0, $a3 ; LA32R-NEXT: ori $a1, $a1, 3840 -; LA32R-NEXT: and $a1, $a0, $a1 -; LA32R-NEXT: srli.w $a1, $a1, 8 -; LA32R-NEXT: nor $a0, $a0, $a1 +; LA32R-NEXT: and $a0, $a0, $a1 +; LA32R-NEXT: srli.w $a0, $a0, 8 +; LA32R-NEXT: andn $a0, $a2, $a0 ; LA32R-NEXT: srli.w $a1, $a0, 1 ; LA32R-NEXT: lu12i.w $a2, 5 ; LA32R-NEXT: ori $a2, $a2, 1365 @@ -117,15 +124,19 @@ define i32 @test_ctlz_i32(i32 %a) nounwind { ; LA32R-LABEL: test_ctlz_i32: ; LA32R: # %bb.0: ; LA32R-NEXT: srli.w $a1, $a0, 1 +; LA32R-NEXT: nor $a2, $a0, $a1 ; LA32R-NEXT: or $a0, $a0, $a1 ; LA32R-NEXT: srli.w $a1, $a0, 2 +; LA32R-NEXT: andn $a2, $a2, $a1 ; LA32R-NEXT: or $a0, $a0, $a1 ; LA32R-NEXT: srli.w $a1, $a0, 4 +; LA32R-NEXT: andn $a2, $a2, $a1 ; LA32R-NEXT: or $a0, $a0, $a1 ; LA32R-NEXT: srli.w $a1, $a0, 8 +; LA32R-NEXT: andn $a2, $a2, $a1 ; LA32R-NEXT: or $a0, $a0, $a1 -; LA32R-NEXT: srli.w $a1, $a0, 16 -; LA32R-NEXT: nor 
$a0, $a0, $a1 +; LA32R-NEXT: srli.w $a0, $a0, 16 +; LA32R-NEXT: andn $a0, $a2, $a0 ; LA32R-NEXT: srli.w $a1, $a0, 1 ; LA32R-NEXT: lu12i.w $a2, 349525 ; LA32R-NEXT: ori $a2, $a2, 1365 @@ -175,15 +186,19 @@ define i64 @test_ctlz_i64(i64 %a) nounwind { ; LA32R-NEXT: bne $a1, $zero, .LBB3_2 ; LA32R-NEXT: # %bb.1: ; LA32R-NEXT: srli.w $a1, $a0, 1 +; LA32R-NEXT: nor $a6, $a0, $a1 ; LA32R-NEXT: or $a0, $a0, $a1 ; LA32R-NEXT: srli.w $a1, $a0, 2 +; LA32R-NEXT: andn $a6, $a6, $a1 ; LA32R-NEXT: or $a0, $a0, $a1 ; LA32R-NEXT: srli.w $a1, $a0, 4 +; LA32R-NEXT: andn $a6, $a6, $a1 ; LA32R-NEXT: or $a0, $a0, $a1 ; LA32R-NEXT: srli.w $a1, $a0, 8 +; LA32R-NEXT: andn $a6, $a6, $a1 ; LA32R-NEXT: or $a0, $a0, $a1 -; LA32R-NEXT: srli.w $a1, $a0, 16 -; LA32R-NEXT: nor $a0, $a0, $a1 +; LA32R-NEXT: srli.w $a0, $a0, 16 +; LA32R-NEXT: andn $a0, $a6, $a0 ; LA32R-NEXT: srli.w $a1, $a0, 1 ; LA32R-NEXT: and $a1, $a1, $a5 ; LA32R-NEXT: sub.w $a0, $a0, $a1 @@ -201,15 +216,19 @@ define i64 @test_ctlz_i64(i64 %a) nounwind { ; LA32R-NEXT: ret ; LA32R-NEXT: .LBB3_2: ; LA32R-NEXT: srli.w $a0, $a1, 1 +; LA32R-NEXT: nor $a6, $a1, $a0 ; LA32R-NEXT: or $a0, $a1, $a0 ; LA32R-NEXT: srli.w $a1, $a0, 2 +; LA32R-NEXT: andn $a6, $a6, $a1 ; LA32R-NEXT: or $a0, $a0, $a1 ; LA32R-NEXT: srli.w $a1, $a0, 4 +; LA32R-NEXT: andn $a6, $a6, $a1 ; LA32R-NEXT: or $a0, $a0, $a1 ; LA32R-NEXT: srli.w $a1, $a0, 8 +; LA32R-NEXT: andn $a6, $a6, $a1 ; LA32R-NEXT: or $a0, $a0, $a1 -; LA32R-NEXT: srli.w $a1, $a0, 16 -; LA32R-NEXT: nor $a0, $a0, $a1 +; LA32R-NEXT: srli.w $a0, $a0, 16 +; LA32R-NEXT: andn $a0, $a6, $a0 ; LA32R-NEXT: srli.w $a1, $a0, 1 ; LA32R-NEXT: and $a1, $a1, $a5 ; LA32R-NEXT: sub.w $a0, $a0, $a1 @@ -250,14 +269,17 @@ define i8 @test_not_ctlz_i8(i8 %a) nounwind { ; LA32R: # %bb.0: ; LA32R-NEXT: ori $a1, $zero, 254 ; LA32R-NEXT: andn $a1, $a1, $a0 +; LA32R-NEXT: nor $a2, $a0, $zero ; LA32R-NEXT: srli.w $a1, $a1, 1 +; LA32R-NEXT: nor $a2, $a2, $a1 ; LA32R-NEXT: orn $a0, $a1, $a0 ; LA32R-NEXT: andi $a1, $a0, 252 ; LA32R-NEXT: srli.w $a1, $a1, 2 +; LA32R-NEXT: andn $a2, $a2, $a1 ; LA32R-NEXT: or $a0, $a0, $a1 -; LA32R-NEXT: andi $a1, $a0, 240 -; LA32R-NEXT: srli.w $a1, $a1, 4 -; LA32R-NEXT: nor $a0, $a0, $a1 +; LA32R-NEXT: andi $a0, $a0, 240 +; LA32R-NEXT: srli.w $a0, $a0, 4 +; LA32R-NEXT: andn $a0, $a2, $a0 ; LA32R-NEXT: srli.w $a1, $a0, 1 ; LA32R-NEXT: andi $a1, $a1, 85 ; LA32R-NEXT: sub.w $a0, $a0, $a1 @@ -293,19 +315,22 @@ define i16 @test_not_ctlz_i16(i16 %a) nounwind { ; LA32R-NEXT: ori $a2, $a1, 4094 ; LA32R-NEXT: andn $a2, $a2, $a0 ; LA32R-NEXT: srli.w $a2, $a2, 1 +; LA32R-NEXT: andn $a3, $a0, $a2 ; LA32R-NEXT: orn $a0, $a2, $a0 ; LA32R-NEXT: ori $a2, $a1, 4092 ; LA32R-NEXT: and $a2, $a0, $a2 ; LA32R-NEXT: srli.w $a2, $a2, 2 +; LA32R-NEXT: andn $a3, $a3, $a2 ; LA32R-NEXT: or $a0, $a0, $a2 ; LA32R-NEXT: ori $a2, $a1, 4080 ; LA32R-NEXT: and $a2, $a0, $a2 ; LA32R-NEXT: srli.w $a2, $a2, 4 +; LA32R-NEXT: andn $a3, $a3, $a2 ; LA32R-NEXT: or $a0, $a0, $a2 ; LA32R-NEXT: ori $a1, $a1, 3840 -; LA32R-NEXT: and $a1, $a0, $a1 -; LA32R-NEXT: srli.w $a1, $a1, 8 -; LA32R-NEXT: nor $a0, $a0, $a1 +; LA32R-NEXT: and $a0, $a0, $a1 +; LA32R-NEXT: srli.w $a0, $a0, 8 +; LA32R-NEXT: andn $a0, $a3, $a0 ; LA32R-NEXT: srli.w $a1, $a0, 1 ; LA32R-NEXT: lu12i.w $a2, 5 ; LA32R-NEXT: ori $a2, $a2, 1365 @@ -345,16 +370,20 @@ define i32 @test_not_ctlz_i32(i32 %a) nounwind { ; LA32R-LABEL: test_not_ctlz_i32: ; LA32R: # %bb.0: ; LA32R-NEXT: nor $a1, $a0, $zero -; LA32R-NEXT: srli.w $a1, $a1, 1 -; LA32R-NEXT: orn $a0, $a1, $a0 -; LA32R-NEXT: srli.w $a1, $a0, 2 -; 
LA32R-NEXT: or $a0, $a0, $a1 -; LA32R-NEXT: srli.w $a1, $a0, 4 -; LA32R-NEXT: or $a0, $a0, $a1 -; LA32R-NEXT: srli.w $a1, $a0, 8 -; LA32R-NEXT: or $a0, $a0, $a1 -; LA32R-NEXT: srli.w $a1, $a0, 16 -; LA32R-NEXT: nor $a0, $a0, $a1 +; LA32R-NEXT: srli.w $a2, $a1, 1 +; LA32R-NEXT: nor $a1, $a1, $a2 +; LA32R-NEXT: orn $a0, $a2, $a0 +; LA32R-NEXT: srli.w $a2, $a0, 2 +; LA32R-NEXT: andn $a1, $a1, $a2 +; LA32R-NEXT: or $a0, $a0, $a2 +; LA32R-NEXT: srli.w $a2, $a0, 4 +; LA32R-NEXT: andn $a1, $a1, $a2 +; LA32R-NEXT: or $a0, $a0, $a2 +; LA32R-NEXT: srli.w $a2, $a0, 8 +; LA32R-NEXT: andn $a1, $a1, $a2 +; LA32R-NEXT: or $a0, $a0, $a2 +; LA32R-NEXT: srli.w $a0, $a0, 16 +; LA32R-NEXT: andn $a0, $a1, $a0 ; LA32R-NEXT: srli.w $a1, $a0, 1 ; LA32R-NEXT: lu12i.w $a2, 349525 ; LA32R-NEXT: ori $a2, $a2, 1365 @@ -406,16 +435,20 @@ define i64 @test_not_ctlz_i64(i64 %a) nounwind { ; LA32R-NEXT: bne $a6, $zero, .LBB7_2 ; LA32R-NEXT: # %bb.1: ; LA32R-NEXT: nor $a1, $a0, $zero -; LA32R-NEXT: srli.w $a1, $a1, 1 -; LA32R-NEXT: orn $a0, $a1, $a0 -; LA32R-NEXT: srli.w $a1, $a0, 2 -; LA32R-NEXT: or $a0, $a0, $a1 -; LA32R-NEXT: srli.w $a1, $a0, 4 -; LA32R-NEXT: or $a0, $a0, $a1 -; LA32R-NEXT: srli.w $a1, $a0, 8 -; LA32R-NEXT: or $a0, $a0, $a1 -; LA32R-NEXT: srli.w $a1, $a0, 16 -; LA32R-NEXT: nor $a0, $a0, $a1 +; LA32R-NEXT: srli.w $a6, $a1, 1 +; LA32R-NEXT: nor $a1, $a1, $a6 +; LA32R-NEXT: orn $a0, $a6, $a0 +; LA32R-NEXT: srli.w $a6, $a0, 2 +; LA32R-NEXT: andn $a1, $a1, $a6 +; LA32R-NEXT: or $a0, $a0, $a6 +; LA32R-NEXT: srli.w $a6, $a0, 4 +; LA32R-NEXT: andn $a1, $a1, $a6 +; LA32R-NEXT: or $a0, $a0, $a6 +; LA32R-NEXT: srli.w $a6, $a0, 8 +; LA32R-NEXT: andn $a1, $a1, $a6 +; LA32R-NEXT: or $a0, $a0, $a6 +; LA32R-NEXT: srli.w $a0, $a0, 16 +; LA32R-NEXT: andn $a0, $a1, $a0 ; LA32R-NEXT: srli.w $a1, $a0, 1 ; LA32R-NEXT: and $a1, $a1, $a5 ; LA32R-NEXT: sub.w $a0, $a0, $a1 @@ -433,15 +466,19 @@ define i64 @test_not_ctlz_i64(i64 %a) nounwind { ; LA32R-NEXT: ret ; LA32R-NEXT: .LBB7_2: ; LA32R-NEXT: srli.w $a0, $a6, 1 +; LA32R-NEXT: nor $a6, $a6, $a0 ; LA32R-NEXT: orn $a0, $a0, $a1 ; LA32R-NEXT: srli.w $a1, $a0, 2 +; LA32R-NEXT: andn $a6, $a6, $a1 ; LA32R-NEXT: or $a0, $a0, $a1 ; LA32R-NEXT: srli.w $a1, $a0, 4 +; LA32R-NEXT: andn $a6, $a6, $a1 ; LA32R-NEXT: or $a0, $a0, $a1 ; LA32R-NEXT: srli.w $a1, $a0, 8 +; LA32R-NEXT: andn $a6, $a6, $a1 ; LA32R-NEXT: or $a0, $a0, $a1 -; LA32R-NEXT: srli.w $a1, $a0, 16 -; LA32R-NEXT: nor $a0, $a0, $a1 +; LA32R-NEXT: srli.w $a0, $a0, 16 +; LA32R-NEXT: andn $a0, $a6, $a0 ; LA32R-NEXT: srli.w $a1, $a0, 1 ; LA32R-NEXT: and $a1, $a1, $a5 ; LA32R-NEXT: sub.w $a0, $a0, $a1 diff --git a/llvm/test/CodeGen/PowerPC/xxeval-eqv-nor-or-xor.ll b/llvm/test/CodeGen/PowerPC/xxeval-eqv-nor-or-xor.ll index 6616a1e6e7e9f..e391228fc95a9 100644 --- a/llvm/test/CodeGen/PowerPC/xxeval-eqv-nor-or-xor.ll +++ b/llvm/test/CodeGen/PowerPC/xxeval-eqv-nor-or-xor.ll @@ -32,7 +32,10 @@ entry: define dso_local <8 x i16> @eqvA_B_C(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) local_unnamed_addr #0 { ; CHECK-LABEL: eqvA_B_C: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xxeval v2, v2, v3, v4, 150 +; CHECK-NEXT: xxleqv vs1, vs1, vs1 +; CHECK-NEXT: xxland vs0, v3, v4 +; CHECK-NEXT: xxeval vs1, v3, v4, vs1, 96 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 ; CHECK-NEXT: blr entry: %and = and <8 x i16> %B, %C @@ -113,7 +116,9 @@ entry: define dso_local <4 x i32> @norA_B_C(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) local_unnamed_addr #0 { ; CHECK-LABEL: norA_B_C: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xxeval v2, v2, v3, v4, 128 +; CHECK-NEXT: xxlnor 
vs0, v4, v4 +; CHECK-NEXT: xxlnor vs1, v3, v3 +; CHECK-NEXT: xxeval v2, v2, vs1, vs0, 16 ; CHECK-NEXT: blr entry: %or = or <4 x i32> %B, %C diff --git a/llvm/test/CodeGen/SystemZ/scalar-ctlz-02.ll b/llvm/test/CodeGen/SystemZ/scalar-ctlz-02.ll index 2c3bf944cdf89..9ff15f946d2d6 100644 --- a/llvm/test/CodeGen/SystemZ/scalar-ctlz-02.ll +++ b/llvm/test/CodeGen/SystemZ/scalar-ctlz-02.ll @@ -11,25 +11,31 @@ define i128 @f1(i128 %a) { ; CHECK-NEXT: vl %v0, 0(%r3), 3 ; CHECK-NEXT: vrepib %v1, 1 ; CHECK-NEXT: vsrl %v1, %v0, %v1 +; CHECK-NEXT: vno %v2, %v0, %v1 ; CHECK-NEXT: vo %v0, %v0, %v1 ; CHECK-NEXT: vrepib %v1, 2 ; CHECK-NEXT: vsrl %v1, %v0, %v1 +; CHECK-NEXT: vnc %v2, %v2, %v1 ; CHECK-NEXT: vo %v0, %v0, %v1 ; CHECK-NEXT: vrepib %v1, 4 ; CHECK-NEXT: vsrl %v1, %v0, %v1 +; CHECK-NEXT: vnc %v2, %v2, %v1 ; CHECK-NEXT: vo %v0, %v0, %v1 ; CHECK-NEXT: vrepib %v1, 8 ; CHECK-NEXT: vsrlb %v1, %v0, %v1 +; CHECK-NEXT: vnc %v2, %v2, %v1 ; CHECK-NEXT: vo %v0, %v0, %v1 ; CHECK-NEXT: vrepib %v1, 16 ; CHECK-NEXT: vsrlb %v1, %v0, %v1 +; CHECK-NEXT: vnc %v2, %v2, %v1 ; CHECK-NEXT: vo %v0, %v0, %v1 ; CHECK-NEXT: vrepib %v1, 32 ; CHECK-NEXT: vsrlb %v1, %v0, %v1 +; CHECK-NEXT: vnc %v2, %v2, %v1 ; CHECK-NEXT: vo %v0, %v0, %v1 ; CHECK-NEXT: vrepib %v1, 64 -; CHECK-NEXT: vsrlb %v1, %v0, %v1 -; CHECK-NEXT: vno %v0, %v0, %v1 +; CHECK-NEXT: vsrlb %v0, %v0, %v1 +; CHECK-NEXT: vnc %v0, %v2, %v0 ; CHECK-NEXT: vpopct %v0, %v0, 0 ; CHECK-NEXT: vgbm %v1, 0 ; CHECK-NEXT: vsumb %v0, %v0, %v1 @@ -47,25 +53,31 @@ define i128 @f2(i128 %a) { ; CHECK-NEXT: vl %v0, 0(%r3), 3 ; CHECK-NEXT: vrepib %v1, 1 ; CHECK-NEXT: vsrl %v1, %v0, %v1 +; CHECK-NEXT: vno %v2, %v0, %v1 ; CHECK-NEXT: vo %v0, %v0, %v1 ; CHECK-NEXT: vrepib %v1, 2 ; CHECK-NEXT: vsrl %v1, %v0, %v1 +; CHECK-NEXT: vnc %v2, %v2, %v1 ; CHECK-NEXT: vo %v0, %v0, %v1 ; CHECK-NEXT: vrepib %v1, 4 ; CHECK-NEXT: vsrl %v1, %v0, %v1 +; CHECK-NEXT: vnc %v2, %v2, %v1 ; CHECK-NEXT: vo %v0, %v0, %v1 ; CHECK-NEXT: vrepib %v1, 8 ; CHECK-NEXT: vsrlb %v1, %v0, %v1 +; CHECK-NEXT: vnc %v2, %v2, %v1 ; CHECK-NEXT: vo %v0, %v0, %v1 ; CHECK-NEXT: vrepib %v1, 16 ; CHECK-NEXT: vsrlb %v1, %v0, %v1 +; CHECK-NEXT: vnc %v2, %v2, %v1 ; CHECK-NEXT: vo %v0, %v0, %v1 ; CHECK-NEXT: vrepib %v1, 32 ; CHECK-NEXT: vsrlb %v1, %v0, %v1 +; CHECK-NEXT: vnc %v2, %v2, %v1 ; CHECK-NEXT: vo %v0, %v0, %v1 ; CHECK-NEXT: vrepib %v1, 64 -; CHECK-NEXT: vsrlb %v1, %v0, %v1 -; CHECK-NEXT: vno %v0, %v0, %v1 +; CHECK-NEXT: vsrlb %v0, %v0, %v1 +; CHECK-NEXT: vnc %v0, %v2, %v0 ; CHECK-NEXT: vpopct %v0, %v0, 0 ; CHECK-NEXT: vgbm %v1, 0 ; CHECK-NEXT: vsumb %v0, %v0, %v1 diff --git a/llvm/test/CodeGen/SystemZ/vec-eval.ll b/llvm/test/CodeGen/SystemZ/vec-eval.ll index bcdedcd3a407b..417fcb90af9a3 100644 --- a/llvm/test/CodeGen/SystemZ/vec-eval.ll +++ b/llvm/test/CodeGen/SystemZ/vec-eval.ll @@ -1889,7 +1889,9 @@ entry: define <16 x i8> @eval128(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval128: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: veval %v24, %v26, %v24, %v28, 128 +; CHECK-NEXT: vno %v0, %v24, %v24 +; CHECK-NEXT: vno %v1, %v26, %v26 +; CHECK-NEXT: veval %v24, %v1, %v0, %v28, 2 ; CHECK-NEXT: br %r14 entry: %and.demorgan = or <16 x i8> %src2, %src1 @@ -1901,9 +1903,10 @@ entry: define <16 x i8> @eval129(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval129: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vo %v0, %v26, %v24 +; CHECK-NEXT: vgbm %v0, 65535 +; CHECK-NEXT: veval %v0, %v24, %v0, %v26, 40 ; CHECK-NEXT: vn %v1, %v26, %v24 -; CHECK-NEXT: veval %v24, %v1, %v28, %v0, 139 +; CHECK-NEXT: vsel 
%v24, %v1, %v0, %v28 ; CHECK-NEXT: br %r14 entry: %and.demorgan = or <16 x i8> %src2, %src1 @@ -2034,8 +2037,10 @@ entry: define <16 x i8> @eval138(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval138: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: veval %v0, %v26, %v24, %v28, 127 -; CHECK-NEXT: veval %v24, %v24, %v28, %v0, 174 +; CHECK-NEXT: vgbm %v0, 65535 +; CHECK-NEXT: veval %v0, %v24, %v0, %v26, 40 +; CHECK-NEXT: vnc %v1, %v24, %v28 +; CHECK-NEXT: veval %v24, %v1, %v0, %v28, 47 ; CHECK-NEXT: br %r14 entry: %not2 = xor <16 x i8> %src3, splat(i8 -1) @@ -2050,9 +2055,10 @@ entry: define <16 x i8> @eval139(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval139: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vo %v0, %v26, %v24 +; CHECK-NEXT: vgbm %v0, 65535 +; CHECK-NEXT: veval %v0, %v24, %v0, %v26, 40 ; CHECK-NEXT: veval %v1, %v24, %v26, %v28, 11 -; CHECK-NEXT: veval %v24, %v1, %v0, %v28, 143 +; CHECK-NEXT: veval %v24, %v1, %v0, %v28, 47 ; CHECK-NEXT: br %r14 entry: %0 = or <16 x i8> %src2, %src1 @@ -2068,8 +2074,10 @@ entry: define <16 x i8> @eval140(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval140: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: veval %v0, %v24, %v28, %v26, 127 -; CHECK-NEXT: veval %v24, %v24, %v26, %v0, 174 +; CHECK-NEXT: vgbm %v0, 65535 +; CHECK-NEXT: veval %v0, %v28, %v0, %v24, 40 +; CHECK-NEXT: vnc %v1, %v24, %v26 +; CHECK-NEXT: veval %v24, %v1, %v0, %v26, 47 ; CHECK-NEXT: br %r14 entry: %not1 = xor <16 x i8> %src2, splat(i8 -1) @@ -2084,10 +2092,11 @@ entry: define <16 x i8> @eval141(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval141: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vgbm %v0, 65535 ; CHECK-NEXT: veval %v1, %v26, %v24, %v28, 1 -; CHECK-NEXT: vo %v0, %v26, %v24 +; CHECK-NEXT: veval %v0, %v24, %v0, %v26, 40 ; CHECK-NEXT: veval %v1, %v1, %v24, %v26, 47 -; CHECK-NEXT: veval %v24, %v1, %v0, %v28, 143 +; CHECK-NEXT: veval %v24, %v1, %v0, %v28, 47 ; CHECK-NEXT: br %r14 entry: %not1 = xor <16 x i8> %src2, splat(i8 -1) @@ -2105,9 +2114,10 @@ entry: define <16 x i8> @eval142(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval142: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: veval %v0, %v26, %v24, %v28, 127 -; CHECK-NEXT: vn %v1, %v28, %v26 -; CHECK-NEXT: veval %v24, %v24, %v1, %v0, 174 +; CHECK-NEXT: vgbm %v0, 65535 +; CHECK-NEXT: veval %v0, %v24, %v0, %v26, 40 +; CHECK-NEXT: veval %v1, %v24, %v28, %v26, 14 +; CHECK-NEXT: veval %v24, %v1, %v0, %v28, 47 ; CHECK-NEXT: br %r14 entry: %0 = or <16 x i8> %src2, %src1 @@ -2441,8 +2451,10 @@ entry: define <16 x i8> @eval162(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval162: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: veval %v0, %v28, %v24, %v26, 127 -; CHECK-NEXT: veval %v24, %v26, %v28, %v0, 174 +; CHECK-NEXT: vgbm %v1, 65535 +; CHECK-NEXT: vno %v0, %v28, %v28 +; CHECK-NEXT: veval %v1, %v24, %v1, %v28, 40 +; CHECK-NEXT: vsel %v24, %v0, %v1, %v26 ; CHECK-NEXT: br %r14 entry: %not2 = xor <16 x i8> %src3, splat(i8 -1) @@ -2457,9 +2469,10 @@ entry: define <16 x i8> @eval163(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3) { ; CHECK-LABEL: eval163: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vo %v0, %v26, %v24 +; CHECK-NEXT: vgbm %v0, 65535 +; CHECK-NEXT: veval %v0, %v24, %v0, %v26, 40 ; CHECK-NEXT: veval %v1, %v26, %v24, %v28, 11 -; CHECK-NEXT: veval %v24, %v1, %v0, %v28, 143 +; CHECK-NEXT: veval %v24, %v1, %v0, %v28, 47 ; CHECK-NEXT: br %r14 entry: %0 = or <16 x i8> %src2, %src1 diff --git 
a/llvm/test/CodeGen/X86/bmi-rewrite-demorgan.ll b/llvm/test/CodeGen/X86/bmi-rewrite-demorgan.ll new file mode 100644 index 0000000000000..a1ace1b6ca157 --- /dev/null +++ b/llvm/test/CodeGen/X86/bmi-rewrite-demorgan.ll @@ -0,0 +1,171 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefix=X86-WITH-BMI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefix=X64-WITH-BMI +; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86-WITHOUT-BMI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64-WITHOUT-BMI + +define i32 @not_rewrite_demorgan_i32(i32 %a, i32 %b) nounwind { +; X86-WITH-BMI-LABEL: not_rewrite_demorgan_i32: +; X86-WITH-BMI: # %bb.0: +; X86-WITH-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-WITH-BMI-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-WITH-BMI-NEXT: notl %eax +; X86-WITH-BMI-NEXT: retl +; +; X64-WITH-BMI-LABEL: not_rewrite_demorgan_i32: +; X64-WITH-BMI: # %bb.0: +; X64-WITH-BMI-NEXT: movl %edi, %eax +; X64-WITH-BMI-NEXT: orl %esi, %eax +; X64-WITH-BMI-NEXT: notl %eax +; X64-WITH-BMI-NEXT: retq +; +; X86-WITHOUT-BMI-LABEL: not_rewrite_demorgan_i32: +; X86-WITHOUT-BMI: # %bb.0: +; X86-WITHOUT-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-WITHOUT-BMI-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-WITHOUT-BMI-NEXT: notl %eax +; X86-WITHOUT-BMI-NEXT: retl +; +; X64-WITHOUT-BMI-LABEL: not_rewrite_demorgan_i32: +; X64-WITHOUT-BMI: # %bb.0: +; X64-WITHOUT-BMI-NEXT: movl %edi, %eax +; X64-WITHOUT-BMI-NEXT: orl %esi, %eax +; X64-WITHOUT-BMI-NEXT: notl %eax +; X64-WITHOUT-BMI-NEXT: retq + %temp = or i32 %b, %a + %res = xor i32 %temp, -1 + ret i32 %res +} + +define i32 @rewrite_demorgan_i32(i32 %a, i32 %b, i32 %c) nounwind { +; X86-WITH-BMI-LABEL: rewrite_demorgan_i32: +; X86-WITH-BMI: # %bb.0: +; X86-WITH-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-WITH-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-WITH-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-WITH-BMI-NEXT: notl %edx +; X86-WITH-BMI-NEXT: andnl %edx, %ecx, %ecx +; X86-WITH-BMI-NEXT: andnl %ecx, %eax, %eax +; X86-WITH-BMI-NEXT: retl +; +; X64-WITH-BMI-LABEL: rewrite_demorgan_i32: +; X64-WITH-BMI: # %bb.0: +; X64-WITH-BMI-NEXT: notl %edi +; X64-WITH-BMI-NEXT: andnl %edi, %esi, %eax +; X64-WITH-BMI-NEXT: andnl %eax, %edx, %eax +; X64-WITH-BMI-NEXT: retq +; +; X86-WITHOUT-BMI-LABEL: rewrite_demorgan_i32: +; X86-WITHOUT-BMI: # %bb.0: +; X86-WITHOUT-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-WITHOUT-BMI-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-WITHOUT-BMI-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-WITHOUT-BMI-NEXT: notl %eax +; X86-WITHOUT-BMI-NEXT: retl +; +; X64-WITHOUT-BMI-LABEL: rewrite_demorgan_i32: +; X64-WITHOUT-BMI: # %bb.0: +; X64-WITHOUT-BMI-NEXT: movl %edi, %eax +; X64-WITHOUT-BMI-NEXT: orl %esi, %eax +; X64-WITHOUT-BMI-NEXT: orl %edx, %eax +; X64-WITHOUT-BMI-NEXT: notl %eax +; X64-WITHOUT-BMI-NEXT: retq + %and.demorgan = or i32 %b, %a + %and3.demorgan = or i32 %and.demorgan, %c + %and3 = xor i32 %and3.demorgan, -1 + ret i32 %and3 +} + +define i64 @not_rewrite_demorgan_i64(i64 %a, i64 %b) nounwind { +; X86-WITH-BMI-LABEL: not_rewrite_demorgan_i64: +; X86-WITH-BMI: # %bb.0: +; X86-WITH-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-WITH-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-WITH-BMI-NEXT: orl {{[0-9]+}}(%esp), %edx +; X86-WITH-BMI-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-WITH-BMI-NEXT: notl %eax +; X86-WITH-BMI-NEXT: notl %edx +; 
X86-WITH-BMI-NEXT: retl +; +; X64-WITH-BMI-LABEL: not_rewrite_demorgan_i64: +; X64-WITH-BMI: # %bb.0: +; X64-WITH-BMI-NEXT: movq %rdi, %rax +; X64-WITH-BMI-NEXT: orq %rsi, %rax +; X64-WITH-BMI-NEXT: notq %rax +; X64-WITH-BMI-NEXT: retq +; +; X86-WITHOUT-BMI-LABEL: not_rewrite_demorgan_i64: +; X86-WITHOUT-BMI: # %bb.0: +; X86-WITHOUT-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-WITHOUT-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-WITHOUT-BMI-NEXT: orl {{[0-9]+}}(%esp), %edx +; X86-WITHOUT-BMI-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-WITHOUT-BMI-NEXT: notl %eax +; X86-WITHOUT-BMI-NEXT: notl %edx +; X86-WITHOUT-BMI-NEXT: retl +; +; X64-WITHOUT-BMI-LABEL: not_rewrite_demorgan_i64: +; X64-WITHOUT-BMI: # %bb.0: +; X64-WITHOUT-BMI-NEXT: movq %rdi, %rax +; X64-WITHOUT-BMI-NEXT: orq %rsi, %rax +; X64-WITHOUT-BMI-NEXT: notq %rax +; X64-WITHOUT-BMI-NEXT: retq + %temp = or i64 %b, %a + %res = xor i64 %temp, -1 + ret i64 %res +} + +define i64 @rewrite_demorgan_i64(i64 %a, i64 %b, i64 %c) nounwind { +; X86-WITH-BMI-LABEL: rewrite_demorgan_i64: +; X86-WITH-BMI: # %bb.0: +; X86-WITH-BMI-NEXT: pushl %ebx +; X86-WITH-BMI-NEXT: pushl %edi +; X86-WITH-BMI-NEXT: pushl %esi +; X86-WITH-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-WITH-BMI-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-WITH-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-WITH-BMI-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-WITH-BMI-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-WITH-BMI-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-WITH-BMI-NEXT: notl %edi +; X86-WITH-BMI-NEXT: andnl %edi, %edx, %edx +; X86-WITH-BMI-NEXT: andnl %edx, %eax, %eax +; X86-WITH-BMI-NEXT: notl %ebx +; X86-WITH-BMI-NEXT: andnl %ebx, %esi, %edx +; X86-WITH-BMI-NEXT: andnl %edx, %ecx, %edx +; X86-WITH-BMI-NEXT: popl %esi +; X86-WITH-BMI-NEXT: popl %edi +; X86-WITH-BMI-NEXT: popl %ebx +; X86-WITH-BMI-NEXT: retl +; +; X64-WITH-BMI-LABEL: rewrite_demorgan_i64: +; X64-WITH-BMI: # %bb.0: +; X64-WITH-BMI-NEXT: notq %rdi +; X64-WITH-BMI-NEXT: andnq %rdi, %rsi, %rax +; X64-WITH-BMI-NEXT: andnq %rax, %rdx, %rax +; X64-WITH-BMI-NEXT: retq +; +; X86-WITHOUT-BMI-LABEL: rewrite_demorgan_i64: +; X86-WITHOUT-BMI: # %bb.0: +; X86-WITHOUT-BMI-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-WITHOUT-BMI-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-WITHOUT-BMI-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-WITHOUT-BMI-NEXT: orl {{[0-9]+}}(%esp), %edx +; X86-WITHOUT-BMI-NEXT: orl {{[0-9]+}}(%esp), %edx +; X86-WITHOUT-BMI-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-WITHOUT-BMI-NEXT: notl %eax +; X86-WITHOUT-BMI-NEXT: notl %edx +; X86-WITHOUT-BMI-NEXT: retl +; +; X64-WITHOUT-BMI-LABEL: rewrite_demorgan_i64: +; X64-WITHOUT-BMI: # %bb.0: +; X64-WITHOUT-BMI-NEXT: movq %rdi, %rax +; X64-WITHOUT-BMI-NEXT: orq %rsi, %rax +; X64-WITHOUT-BMI-NEXT: orq %rdx, %rax +; X64-WITHOUT-BMI-NEXT: notq %rax +; X64-WITHOUT-BMI-NEXT: retq + %and.demorgan = or i64 %b, %a + %and3.demorgan = or i64 %and.demorgan, %c + %and3 = xor i64 %and3.demorgan, -1 + ret i64 %and3 +} diff --git a/llvm/test/CodeGen/X86/bool-ext-inc.ll b/llvm/test/CodeGen/X86/bool-ext-inc.ll index 088b0ce857f20..d89893f94bdae 100644 --- a/llvm/test/CodeGen/X86/bool-ext-inc.ll +++ b/llvm/test/CodeGen/X86/bool-ext-inc.ll @@ -88,8 +88,11 @@ define <4 x i32> @bool_logic_and_math_vec(<4 x i32> %a, <4 x i32> %b, <4 x i32> ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm1 -; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; 
CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] +; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %cmp1 = icmp ne <4 x i32> %a, %b %cmp2 = icmp ne <4 x i32> %c, %d diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll index 7bc90534dcc6e..21657bf67f233 100644 --- a/llvm/test/CodeGen/X86/combine-srl.ll +++ b/llvm/test/CodeGen/X86/combine-srl.ll @@ -437,12 +437,13 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) { ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $8, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 diff --git a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll index dbfa69d497698..905d1648564fb 100644 --- a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll +++ b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll @@ -1487,12 +1487,13 @@ define <4 x i32> @vp_ctlz_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psrld $8, %xmm1 +; SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE-NEXT: pxor %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pandn %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: psrlw $1, %xmm1 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 diff --git a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll index 23dcf334124c0..f59e53687ff74 100644 --- a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll +++ b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll @@ -640,8 +640,8 @@ define <4 x i1> @ne_and_to_abs_vec4x64(<4 x i64> %x) { ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18446744073709551487,18446744073709551487,18446744073709551487,18446744073709551487] ; AVX2-NEXT: vpcmpeqq %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -650,17 +650,18 @@ define <4 x i1> @ne_and_to_abs_vec4x64(<4 x i64> %x) { ; SSE41-LABEL: ne_and_to_abs_vec4x64: ; SSE41: # %bb.0: ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = [129,129] -; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: pcmpeqq %xmm2, %xmm3 -; SSE41-NEXT: pcmpeqq %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE41-NEXT: pmovsxwq {{.*#+}} xmm5 = [18446744073709551487,18446744073709551487] -; SSE41-NEXT: pcmpeqq %xmm5, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm0, %xmm2 +; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE41-NEXT: pmovsxwq {{.*#+}} xmm4 = 
[18446744073709551487,18446744073709551487] +; SSE41-NEXT: pcmpeqq %xmm4, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm0 ; SSE41-NEXT: packssdw %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pandn %xmm0, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; SSE2-LABEL: ne_and_to_abs_vec4x64: @@ -681,8 +682,9 @@ define <4 x i1> @ne_and_to_abs_vec4x64(<4 x i64> %x) { ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm1[1,3] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE2-NEXT: andps %xmm4, %xmm0 -; SSE2-NEXT: orps %xmm2, %xmm0 ; SSE2-NEXT: xorps %xmm3, %xmm0 +; SSE2-NEXT: andnps %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq %cmp1 = icmp ne <4 x i64> %x, %cmp2 = icmp ne <4 x i64> %x, @@ -706,51 +708,51 @@ define <4 x i64> @ne_and_to_abs_vec4x64_sext(<4 x i64> %x) { ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18446744073709551487,18446744073709551487,18446744073709551487,18446744073709551487] ; AVX2-NEXT: vpcmpeqq %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: ne_and_to_abs_vec4x64_sext: ; SSE41: # %bb.0: ; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = [129,129] -; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: pcmpeqq %xmm2, %xmm3 -; SSE41-NEXT: pcmpeqq %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE41-NEXT: pmovsxwq {{.*#+}} xmm5 = [18446744073709551487,18446744073709551487] -; SSE41-NEXT: pcmpeqq %xmm5, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm0, %xmm2 +; SSE41-NEXT: packssdw %xmm3, %xmm2 +; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 +; SSE41-NEXT: pmovsxwq {{.*#+}} xmm4 = [18446744073709551487,18446744073709551487] +; SSE41-NEXT: pcmpeqq %xmm4, %xmm1 +; SSE41-NEXT: pcmpeqq %xmm4, %xmm0 ; SSE41-NEXT: packssdw %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm0 -; SSE41-NEXT: pmovsxdq %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: pandn %xmm0, %xmm2 +; SSE41-NEXT: pmovsxdq %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] ; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; SSE2-LABEL: ne_and_to_abs_vec4x64_sext: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [129,129] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [129,129] ; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm3[1,3] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] -; SSE2-NEXT: andps %xmm4, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; SSE2-NEXT: andps %xmm4, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709551487,18446744073709551487] ; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm1[1,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE2-NEXT: andps %xmm4, %xmm0 -; SSE2-NEXT: orps %xmm2, %xmm0 -; SSE2-NEXT: 
xorps %xmm3, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] +; SSE2-NEXT: andps %xmm4, %xmm2 +; SSE2-NEXT: xorps %xmm3, %xmm2 +; SSE2-NEXT: andnps %xmm2, %xmm0 ; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] @@ -868,8 +870,9 @@ define <4 x i1> @ne_and_to_abs_vec4x32(<4 x i32> %x) { ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq %cmp1 = icmp ne <4 x i32> %x, %cmp2 = icmp ne <4 x i32> %x, @@ -909,8 +912,9 @@ define <4 x i32> @ne_and_to_abs_vec4x32_sext(<4 x i32> %x) { ; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq %cmp1 = icmp ne <4 x i32> %x, %cmp2 = icmp ne <4 x i32> %x, @@ -1031,8 +1035,8 @@ define <4 x i1> @ne_and_to_abs_vec4x8(<4 x i8> %x) { ; AVX2-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -1042,21 +1046,22 @@ define <4 x i1> @ne_and_to_abs_vec4x8(<4 x i8> %x) { ; SSE41-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE41-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 +; SSE41-NEXT: pandn %xmm0, %xmm1 +; SSE41-NEXT: pmovsxbd %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; SSE2-LABEL: ne_and_to_abs_vec4x8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [88,88,88,88,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq %cmp1 = icmp ne <4 x i8> %x, %cmp2 = icmp ne <4 x i8> %x, @@ -1087,8 +1092,8 @@ define <4 x i16> @ne_and_to_abs_vec4x16_sext(<4 x i16> %x) { ; AVX2-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: ne_and_to_abs_vec4x16_sext: @@ -1097,8 +1102,9 @@ define <4 x i16> @ne_and_to_abs_vec4x16_sext(<4 x i16> %x) { ; SSE41-NEXT: pcmpeqw %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE41-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pandn %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; SSE2-LABEL: ne_and_to_abs_vec4x16_sext: @@ 
-1107,8 +1113,9 @@ define <4 x i16> @ne_and_to_abs_vec4x16_sext(<4 x i16> %x) { ; SSE2-NEXT: pcmpeqw %xmm0, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq %cmp1 = icmp ne <4 x i16> %x, %cmp2 = icmp ne <4 x i16> %x, diff --git a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll index dada1726be424..3fc2a323b5dc1 100644 --- a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll +++ b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll @@ -151,7 +151,7 @@ define <8 x i1> @andnot_ne_v8i16_todo_no_splat(<8 x i16> %x) nounwind { ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm2 ; AVX512-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $54, %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm2 & (xmm0 ^ xmm1) ; AVX512-NEXT: retq ; ; AVX2-LABEL: andnot_ne_v8i16_todo_no_splat: @@ -159,18 +159,19 @@ define <8 x i1> @andnot_ne_v8i16_todo_no_splat(<8 x i16> %x) nounwind { ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpandn %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; SSE-LABEL: andnot_ne_v8i16_todo_no_splat: ; SSE: # %bb.0: -; SSE-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pcmpeqw %xmm1, %xmm2 +; SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pcmpeqw %xmm2, %xmm1 ; SSE-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq %cmp1 = icmp ne <8 x i16> %x, %cmp2 = icmp ne <8 x i16> %x, @@ -184,7 +185,7 @@ define <8 x i1> @andnot_ne_v8i16(<8 x i16> %x) nounwind { ; AVX512-NEXT: vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: retq ; ; AVX2-LABEL: andnot_ne_v8i16: @@ -215,28 +216,29 @@ define <16 x i1> @andnot_ne_v16i8_fail_max_not_n1(<16 x i8> %x) nounwind { ; AVX512-LABEL: andnot_ne_v16i8_fail_max_not_n1: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $54, %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq ; ; AVX2-LABEL: andnot_ne_v16i8_fail_max_not_n1: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: 
vpandn %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; SSE-LABEL: andnot_ne_v16i8_fail_max_not_n1: ; SSE: # %bb.0: ; SSE-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pcmpeqb %xmm1, %xmm2 -; SSE-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; SSE-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq %cmp1 = icmp ne <16 x i8> %x, %cmp2 = icmp ne <16 x i8> %x, @@ -250,7 +252,7 @@ define <16 x i1> @andnot_ne_v16i8(<16 x i8> %x) nounwind { ; AVX512-NEXT: vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0 ; AVX512-NEXT: retq ; ; AVX2-LABEL: andnot_ne_v16i8: @@ -309,7 +311,7 @@ define <8 x i1> @addand_ne_v8i16_fail(<8 x i16> %x) nounwind { ; AVX512-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $86, %xmm2, %xmm1, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm1 & (xmm0 ^ xmm2) ; AVX512-NEXT: retq ; ; AVX2-LABEL: addand_ne_v8i16_fail: @@ -317,8 +319,8 @@ define <8 x i1> @addand_ne_v8i16_fail(<8 x i16> %x) nounwind { ; AVX2-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: addand_ne_v8i16_fail: @@ -327,8 +329,9 @@ define <8 x i1> @addand_ne_v8i16_fail(<8 x i16> %x) nounwind { ; SSE41-NEXT: pcmpeqw %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE41-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm0 +; SSE41-NEXT: pandn %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; SSE2-LABEL: addand_ne_v8i16_fail: @@ -337,8 +340,9 @@ define <8 x i1> @addand_ne_v8i16_fail(<8 x i16> %x) nounwind { ; SSE2-NEXT: pcmpeqw %xmm0, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm0 +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq %cmp1 = icmp ne <8 x i16> %x, %cmp2 = icmp ne <8 x i16> %x, diff --git a/llvm/test/CodeGen/X86/mul-cmp.ll b/llvm/test/CodeGen/X86/mul-cmp.ll index 0ee4601acf694..4fffb42bdc672 100644 --- a/llvm/test/CodeGen/X86/mul-cmp.ll +++ b/llvm/test/CodeGen/X86/mul-cmp.ll @@ -119,21 +119,21 @@ define <4 x i1> @mul_nsw_ne0_v4i32(<4 x i32> %x, <4 x i32> %y) { ; SSE-LABEL: mul_nsw_ne0_v4i32: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: pcmpeqd %xmm2, %xmm1 ; SSE-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: pcmpeqd %xmm2, %xmm1 +; SSE-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE-NEXT: pxor %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: mul_nsw_ne0_v4i32: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpcmpeqd 
%xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %m = mul nsw <4 x i32> %x, %y %r = icmp ne <4 x i32> %m, zeroinitializer diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll index b12be7cb129d3..37bc8ded142c1 100644 --- a/llvm/test/CodeGen/X86/sat-add.ll +++ b/llvm/test/CodeGen/X86/sat-add.ll @@ -1004,9 +1004,10 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_min(<4 x i32> %x, <4 x i32> ; SSE2-NEXT: pxor %xmm1, %xmm4 ; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm4, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1147,9 +1148,10 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_min(<2 x i64> %x, <2 x i64> ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] ; SSE2-NEXT: por %xmm3, %xmm4 ; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm4, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: paddq %xmm1, %xmm0 ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/setcc-logic.ll b/llvm/test/CodeGen/X86/setcc-logic.ll index c98aae7fbf405..4b1225c7ac1d8 100644 --- a/llvm/test/CodeGen/X86/setcc-logic.ll +++ b/llvm/test/CodeGen/X86/setcc-logic.ll @@ -541,9 +541,10 @@ define <4 x i32> @and_icmps_const_1bit_diff_vec(<4 x i32> %x) { ; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [44,60,44,60] ; CHECK-NEXT: pcmpeqd %xmm0, %xmm1 ; CHECK-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 +; CHECK-NEXT: pxor %xmm0, %xmm2 +; CHECK-NEXT: pandn %xmm2, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %a = icmp ne <4 x i32> %x, %b = icmp ne <4 x i32> %x, diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll index 2d0778853fecd..aad6abfa78c23 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -2401,16 +2401,16 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX1-NEXT: vpor %xmm5, %xmm3, %xmm3 ; CHECK-AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqb %xmm2, %xmm4, %xmm3 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpcmpeqb %xmm2, %xmm4, %xmm4 +; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3 ; CHECK-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; CHECK-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; CHECK-AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3 ; CHECK-AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; CHECK-AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 -; CHECK-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; CHECK-AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; CHECK-AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: pr51133: @@ -2450,10 +2450,10 @@ define 
<32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX2-NEXT: vpor %ymm3, %ymm4, %ymm3 ; CHECK-AVX2-NEXT: vpsubb %ymm3, %ymm0, %ymm0 ; CHECK-AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 +; CHECK-AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; CHECK-AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0 ; CHECK-AVX2-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 -; CHECK-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 -; CHECK-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; CHECK-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; CHECK-AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: pr51133: diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll index 10dee14bdd1a0..82c157c207375 100644 --- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll @@ -37,9 +37,9 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; X64-NEXT: pand %xmm2, %xmm0 ; X64-NEXT: pxor %xmm5, %xmm5 ; X64-NEXT: pcmpgtd %xmm4, %xmm5 -; X64-NEXT: por %xmm2, %xmm5 -; X64-NEXT: pcmpeqd %xmm2, %xmm2 -; X64-NEXT: pxor %xmm5, %xmm2 +; X64-NEXT: pcmpeqd %xmm4, %xmm4 +; X64-NEXT: pxor %xmm5, %xmm4 +; X64-NEXT: pandn %xmm4, %xmm2 ; X64-NEXT: por %xmm0, %xmm2 ; X64-NEXT: pandn %xmm2, %xmm1 ; X64-NEXT: por %xmm3, %xmm1 diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll index 58fd6492f2ed5..00d122838dbc5 100644 --- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll @@ -127,14 +127,21 @@ define <4 x i32> @in_constant_varx_mone_invmask(ptr%px, ptr%py, ptr%pmask) { ; ; CHECK-SSE2-LABEL: in_constant_varx_mone_invmask: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movaps (%rdi), %xmm0 -; CHECK-SSE2-NEXT: orps (%rdx), %xmm0 +; CHECK-SSE2-NEXT: movdqa (%rdi), %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; CHECK-SSE2-NEXT: movdqa (%rdx), %xmm2 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-XOP-LABEL: in_constant_varx_mone_invmask: ; CHECK-XOP: # %bb.0: -; CHECK-XOP-NEXT: vmovaps (%rdi), %xmm0 -; CHECK-XOP-NEXT: vorps (%rdx), %xmm0, %xmm0 +; CHECK-XOP-NEXT: vmovdqa (%rdi), %xmm0 +; CHECK-XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; CHECK-XOP-NEXT: vpxor (%rdx), %xmm1, %xmm2 +; CHECK-XOP-NEXT: vpandn %xmm2, %xmm0, %xmm0 +; CHECK-XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; CHECK-XOP-NEXT: retq %x = load <4 x i32>, ptr%px, align 16 %y = load <4 x i32>, ptr%py, align 16 diff --git a/llvm/test/CodeGen/X86/vec_cmp_sint-128.ll b/llvm/test/CodeGen/X86/vec_cmp_sint-128.ll index ac4b25be5eb65..63e08de7fdf53 100644 --- a/llvm/test/CodeGen/X86/vec_cmp_sint-128.ll +++ b/llvm/test/CodeGen/X86/vec_cmp_sint-128.ll @@ -155,7 +155,7 @@ define <2 x i64> @ne_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX512-LABEL: ne_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -194,7 +194,7 @@ define <4 x i32> @ne_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; AVX512-LABEL: ne_v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 
killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -233,7 +233,7 @@ define <8 x i16> @ne_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX512-LABEL: ne_v8i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -272,7 +272,7 @@ define <16 x i8> @ne_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX512-LABEL: ne_v16i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -349,7 +349,7 @@ define <2 x i64> @ge_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX512-LABEL: ge_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -388,7 +388,7 @@ define <4 x i32> @ge_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; AVX512-LABEL: ge_v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -427,7 +427,7 @@ define <8 x i16> @ge_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX512-LABEL: ge_v8i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -466,7 +466,7 @@ define <16 x i8> @ge_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX512-LABEL: ge_v16i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -657,7 +657,7 @@ define <2 x i64> @le_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX512-LABEL: le_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -696,7 +696,7 @@ define <4 x i32> @le_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; AVX512-LABEL: le_v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -735,7 +735,7 @@ define <8 x i16> @le_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX512-LABEL: le_v8i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -774,7 +774,7 @@ define <16 x i8> @le_v16i8(<16 x i8> %a, <16 x 
i8> %b) nounwind { ; AVX512-LABEL: le_v16i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll b/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll index 9a0756edbce32..9d65ff94061b0 100644 --- a/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll +++ b/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll @@ -155,7 +155,7 @@ define <2 x i64> @ne_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX512-LABEL: ne_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -194,7 +194,7 @@ define <4 x i32> @ne_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; AVX512-LABEL: ne_v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -233,7 +233,7 @@ define <8 x i16> @ne_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX512-LABEL: ne_v8i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -272,7 +272,7 @@ define <16 x i8> @ne_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX512-LABEL: ne_v16i8: ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -535,7 +535,7 @@ define <2 x i64> @gt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -594,7 +594,7 @@ define <4 x i32> @gt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -653,7 +653,7 @@ define <8 x i16> @gt_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -696,7 +696,7 @@ define <16 x i8> @gt_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; 
AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -960,7 +960,7 @@ define <2 x i64> @lt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1 ; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1020,7 +1020,7 @@ define <4 x i32> @lt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1080,7 +1080,7 @@ define <8 x i16> @lt_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1123,7 +1123,7 @@ define <16 x i8> @lt_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_ctbits.ll b/llvm/test/CodeGen/X86/vec_ctbits.ll index 370f88d644b57..4a3bcbb0a96a4 100644 --- a/llvm/test/CodeGen/X86/vec_ctbits.ll +++ b/llvm/test/CodeGen/X86/vec_ctbits.ll @@ -49,12 +49,13 @@ define <2 x i64> @foolz(<2 x i64> %a) nounwind { ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlq $16, %xmm1 +; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 +; CHECK-NEXT: pxor %xmm1, %xmm2 +; CHECK-NEXT: movdqa %xmm0, %xmm3 +; CHECK-NEXT: pandn %xmm2, %xmm3 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: psrlq $32, %xmm1 -; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: psrlq $32, %xmm0 +; CHECK-NEXT: pandn %xmm3, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $1, %xmm1 ; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -151,12 +152,13 @@ define <2 x i32> @promlz(<2 x i32> %a) nounwind { ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrld $8, %xmm1 +; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 +; CHECK-NEXT: pxor %xmm1, %xmm2 +; CHECK-NEXT: movdqa %xmm0, %xmm3 +; CHECK-NEXT: pandn %xmm2, %xmm3 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: psrld $16, %xmm1 -; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: psrld $16, %xmm0 +; CHECK-NEXT: pandn %xmm3, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $1, %xmm1 ; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll index cfb5fac2fd7aa..716090abf1c4a 100644 --- 
a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll @@ -30,12 +30,13 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlq $16, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlq $32, %xmm1 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -70,12 +71,13 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlq $16, %xmm1 +; SSE3-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE3-NEXT: pxor %xmm1, %xmm2 +; SSE3-NEXT: movdqa %xmm0, %xmm3 +; SSE3-NEXT: pandn %xmm2, %xmm3 ; SSE3-NEXT: por %xmm1, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlq $32, %xmm1 -; SSE3-NEXT: por %xmm1, %xmm0 -; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE3-NEXT: pxor %xmm1, %xmm0 +; SSE3-NEXT: psrlq $32, %xmm0 +; SSE3-NEXT: pandn %xmm3, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -308,12 +310,13 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlq $16, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlq $32, %xmm1 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -348,12 +351,13 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlq $16, %xmm1 +; SSE3-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE3-NEXT: pxor %xmm1, %xmm2 +; SSE3-NEXT: movdqa %xmm0, %xmm3 +; SSE3-NEXT: pandn %xmm2, %xmm3 ; SSE3-NEXT: por %xmm1, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlq $32, %xmm1 -; SSE3-NEXT: por %xmm1, %xmm0 -; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE3-NEXT: pxor %xmm1, %xmm0 +; SSE3-NEXT: psrlq $32, %xmm0 +; SSE3-NEXT: pandn %xmm3, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -583,12 +587,13 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $8, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -625,12 +630,13 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { 
; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrld $8, %xmm1 +; SSE3-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE3-NEXT: pxor %xmm1, %xmm2 +; SSE3-NEXT: movdqa %xmm0, %xmm3 +; SSE3-NEXT: pandn %xmm2, %xmm3 ; SSE3-NEXT: por %xmm1, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrld $16, %xmm1 -; SSE3-NEXT: por %xmm1, %xmm0 -; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE3-NEXT: pxor %xmm1, %xmm0 +; SSE3-NEXT: psrld $16, %xmm0 +; SSE3-NEXT: pandn %xmm3, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -832,12 +838,13 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrld $8, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -874,12 +881,13 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrld $8, %xmm1 +; SSE3-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE3-NEXT: pxor %xmm1, %xmm2 +; SSE3-NEXT: movdqa %xmm0, %xmm3 +; SSE3-NEXT: pandn %xmm2, %xmm3 ; SSE3-NEXT: por %xmm1, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrld $16, %xmm1 -; SSE3-NEXT: por %xmm1, %xmm0 -; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE3-NEXT: pxor %xmm1, %xmm0 +; SSE3-NEXT: psrld $16, %xmm0 +; SSE3-NEXT: pandn %xmm3, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -1078,12 +1086,13 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -1114,12 +1123,13 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE3-NEXT: pxor %xmm1, %xmm2 +; SSE3-NEXT: movdqa %xmm0, %xmm3 +; SSE3-NEXT: pandn %xmm2, %xmm3 ; SSE3-NEXT: por %xmm1, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $8, %xmm1 -; SSE3-NEXT: por %xmm1, %xmm0 -; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE3-NEXT: pxor %xmm1, %xmm0 +; SSE3-NEXT: psrlw $8, %xmm0 +; SSE3-NEXT: pandn %xmm3, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -1286,12 +1296,13 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; 
SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -1322,12 +1333,13 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; SSE3-NEXT: por %xmm1, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE3-NEXT: pxor %xmm1, %xmm2 +; SSE3-NEXT: movdqa %xmm0, %xmm3 +; SSE3-NEXT: pandn %xmm2, %xmm3 ; SSE3-NEXT: por %xmm1, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $8, %xmm1 -; SSE3-NEXT: por %xmm1, %xmm0 -; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE3-NEXT: pxor %xmm1, %xmm0 +; SSE3-NEXT: psrlw $8, %xmm0 +; SSE3-NEXT: pandn %xmm3, %xmm0 ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $1, %xmm1 ; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -1493,28 +1505,29 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $2, %xmm1 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: psubb %xmm2, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: paddb %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrlw $4, %xmm2 +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv16i8: @@ -1526,28 +1539,29 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $2, %xmm1 ; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE3-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE3-NEXT: pxor %xmm1, %xmm2 +; SSE3-NEXT: movdqa %xmm0, %xmm3 +; SSE3-NEXT: pandn %xmm2, %xmm3 ; SSE3-NEXT: por %xmm1, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE3-NEXT: pand %xmm2, %xmm1 -; SSE3-NEXT: por %xmm1, %xmm0 -; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 
-; SSE3-NEXT: pxor %xmm1, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm1, %xmm0 +; SSE3-NEXT: pandn %xmm3, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: psrlw $1, %xmm2 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE3-NEXT: psubb %xmm2, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm3 -; SSE3-NEXT: pand %xmm1, %xmm3 +; SSE3-NEXT: pand %xmm2, %xmm3 ; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm3, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: pand %xmm2, %xmm0 +; SSE3-NEXT: paddb %xmm3, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: psrlw $4, %xmm2 +; SSE3-NEXT: paddb %xmm2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv16i8: @@ -1656,28 +1670,29 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrlw $2, %xmm1 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pandn %xmm2, %xmm3 ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: psubb %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pandn %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE2-NEXT: psubb %xmm2, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: paddb %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: paddb %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: psrlw $4, %xmm2 +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: testv16i8u: @@ -1689,28 +1704,29 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; SSE3-NEXT: movdqa %xmm0, %xmm1 ; SSE3-NEXT: psrlw $2, %xmm1 ; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE3-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE3-NEXT: pxor %xmm1, %xmm2 +; SSE3-NEXT: movdqa %xmm0, %xmm3 +; SSE3-NEXT: pandn %xmm2, %xmm3 ; SSE3-NEXT: por %xmm1, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: movdqa {{.*#+}} xmm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; SSE3-NEXT: pand %xmm2, %xmm1 -; SSE3-NEXT: por %xmm1, %xmm0 -; SSE3-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE3-NEXT: pxor %xmm1, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $1, %xmm1 -; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: psubb %xmm1, %xmm0 -; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: psrlw $4, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE3-NEXT: pand %xmm1, %xmm0 +; SSE3-NEXT: pandn %xmm3, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: psrlw $1, %xmm2 +; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE3-NEXT: psubb %xmm2, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] ; SSE3-NEXT: movdqa %xmm0, %xmm3 -; SSE3-NEXT: pand %xmm1, %xmm3 +; SSE3-NEXT: pand %xmm2, %xmm3 ; SSE3-NEXT: psrlw $2, %xmm0 -; SSE3-NEXT: pand %xmm1, %xmm0 -; SSE3-NEXT: paddb %xmm3, %xmm0 -; SSE3-NEXT: movdqa %xmm0, %xmm1 -; SSE3-NEXT: psrlw $4, %xmm1 -; SSE3-NEXT: paddb %xmm1, %xmm0 ; SSE3-NEXT: pand %xmm2, %xmm0 +; SSE3-NEXT: paddb %xmm3, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: psrlw $4, %xmm2 +; SSE3-NEXT: paddb %xmm2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: testv16i8u: diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll index d35a365508d54..8fe00afe0c0bb 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll @@ -26,18 +26,17 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512BW-NEXT: vpsrlq $8, %zmm0, %zmm1 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlq $16, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpsrlq $32, %zmm2, %zmm3 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm2 = zmm4 & ~(zmm2 | zmm3) +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpshufb %zmm2, %zmm5, %zmm2 +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm3 = ~(zmm3 | zmm0 | zmm1) +; AVX512BW-NEXT: vpsrlw $4, %zmm3, %zmm0 +; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm0 +; 
AVX512BW-NEXT: vpshufb %zmm0, %zmm5, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 @@ -54,30 +53,31 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512DQ-NEXT: vpsrlq $8, %zmm0, %zmm1 ; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrlq $16, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpsrlq $32, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = ~(zmm3 | zmm0 | zmm1) +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm0 ; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm4 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512DQ-NEXT: vpaddb %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpsadbw %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 ; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512DQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512DQ-NEXT: vpsadbw %ymm4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpsrlq $32, %ymm2, %ymm6 +; AVX512DQ-NEXT: vpor %ymm6, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpandn %ymm1, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512DQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpsadbw %ymm4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq %out = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %in, i1 0) ret <8 x i64> %out @@ -105,18 +105,17 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512BW-NEXT: vpsrlq $8, %zmm0, %zmm1 ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrlq $16, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, 
%zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpsrlq $32, %zmm2, %zmm3 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm2 = zmm4 & ~(zmm2 | zmm3) +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpshufb %zmm2, %zmm5, %zmm2 +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm3 = ~(zmm3 | zmm0 | zmm1) +; AVX512BW-NEXT: vpsrlw $4, %zmm3, %zmm0 +; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufb %zmm0, %zmm5, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 @@ -133,30 +132,31 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512DQ-NEXT: vpsrlq $8, %zmm0, %zmm1 ; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrlq $16, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm1 -; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpsrlq $32, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = ~(zmm3 | zmm0 | zmm1) +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm0 ; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm4 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512DQ-NEXT: vpaddb %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpsadbw %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 ; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512DQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512DQ-NEXT: vpsadbw %ymm4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpsrlq $32, %ymm2, %ymm6 +; AVX512DQ-NEXT: vpor %ymm6, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpandn %ymm1, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512DQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpsadbw %ymm4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq %out = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %in, i1 -1) ret <8 x i64> %out @@ -182,18 +182,17 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: vpsrld $4, %zmm0, %zmm1 ; 
AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $8, %zmm0, %zmm1 -; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm1 -; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpsrld $16, %zmm2, %zmm3 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm2 = zmm4 & ~(zmm2 | zmm3) +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpshufb %zmm2, %zmm5, %zmm2 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm3 = ~(zmm3 | zmm0 | zmm1) +; AVX512BW-NEXT: vpsrlw $4, %zmm3, %zmm0 +; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufb %zmm0, %zmm5, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] @@ -212,38 +211,38 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512DQ-NEXT: vpsrld $4, %zmm0, %zmm1 ; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $8, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpandn %ymm0, %ymm1, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = ~zmm1 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm4 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512DQ-NEXT: vpaddb %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX512DQ-NEXT: vpsadbw %ymm4, %ymm6, %ymm6 -; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX512DQ-NEXT: vpsadbw %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpackuswb 
%ymm6, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpsrld $16, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm0 = ~(zmm0 | zmm3 | zmm1) +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm5 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512DQ-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512DQ-NEXT: vpaddb %ymm5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm7 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] +; AVX512DQ-NEXT: vpsadbw %ymm5, %ymm7, %ymm7 +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] +; AVX512DQ-NEXT: vpsadbw %ymm5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpackuswb %ymm7, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpandn %ymm4, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 ; AVX512DQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] -; AVX512DQ-NEXT: vpsadbw %ymm4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5] -; AVX512DQ-NEXT: vpsadbw %ymm4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[6],ymm5[6],ymm0[7],ymm5[7] +; AVX512DQ-NEXT: vpsadbw %ymm5, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5] +; AVX512DQ-NEXT: vpsadbw %ymm5, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq %out = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %in, i1 0) ret <16 x i32> %out @@ -269,18 +268,17 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: vpsrld $4, %zmm0, %zmm1 ; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsrld $8, %zmm0, %zmm1 -; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm1 -; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0 -; AVX512BW-NEXT: 
vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpsrld $16, %zmm2, %zmm3 +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm2 = zmm4 & ~(zmm2 | zmm3) +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpshufb %zmm2, %zmm5, %zmm2 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm3 = ~(zmm3 | zmm0 | zmm1) +; AVX512BW-NEXT: vpsrlw $4, %zmm3, %zmm0 +; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufb %zmm0, %zmm5, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] @@ -299,38 +297,38 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512DQ-NEXT: vpsrld $4, %zmm0, %zmm1 ; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $8, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpandn %ymm0, %ymm1, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = ~zmm1 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm4 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm5, %ymm3 -; AVX512DQ-NEXT: vpaddb %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm6 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] -; AVX512DQ-NEXT: vpsadbw %ymm4, %ymm6, %ymm6 -; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] -; AVX512DQ-NEXT: vpsadbw %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpackuswb %ymm6, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpsrld $16, %zmm2, %zmm3 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm0 = ~(zmm0 | zmm3 | zmm1) +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm5 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512DQ-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512DQ-NEXT: 
vpshufb %ymm1, %ymm6, %ymm1 +; AVX512DQ-NEXT: vpaddb %ymm5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm7 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] +; AVX512DQ-NEXT: vpsadbw %ymm5, %ymm7, %ymm7 +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] +; AVX512DQ-NEXT: vpsadbw %ymm5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpackuswb %ymm7, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpandn %ymm4, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 ; AVX512DQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7] -; AVX512DQ-NEXT: vpsadbw %ymm4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5] -; AVX512DQ-NEXT: vpsadbw %ymm4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[6],ymm5[6],ymm0[7],ymm5[7] +; AVX512DQ-NEXT: vpsadbw %ymm5, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5] +; AVX512DQ-NEXT: vpsadbw %ymm5, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq %out = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %in, i1 -1) ret <16 x i32> %out diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-sub128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-sub128.ll index a1b277efde6ff..555d033ac5ee4 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-sub128.ll @@ -17,12 +17,13 @@ define <2 x i32> @illegal_ctlz(<2 x i32> %v1) { ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrld $8, %xmm1 +; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 +; CHECK-NEXT: pxor %xmm1, %xmm2 +; CHECK-NEXT: movdqa %xmm0, %xmm3 +; CHECK-NEXT: pandn %xmm2, %xmm3 ; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: movdqa %xmm0, %xmm1 -; CHECK-NEXT: psrld $16, %xmm1 -; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: pcmpeqd %xmm1, %xmm1 -; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: psrld $16, %xmm0 +; CHECK-NEXT: pandn %xmm3, %xmm0 ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrlw $1, %xmm1 ; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 diff --git a/llvm/test/CodeGen/X86/vsplit-and.ll b/llvm/test/CodeGen/X86/vsplit-and.ll index 833db0efbda89..90bbde645cd08 100644 --- a/llvm/test/CodeGen/X86/vsplit-and.ll +++ b/llvm/test/CodeGen/X86/vsplit-and.ll @@ -7,9 +7,9 @@ define void @t0(ptr %dst, <2 x i64> %src1, <2 x i64> %src2) nounwind readonly { ; CHECK-NEXT: pxor %xmm2, %xmm2 ; CHECK-NEXT: pcmpeqq %xmm2, %xmm0 ; CHECK-NEXT: pcmpeqq %xmm2, %xmm1 -; CHECK-NEXT: por %xmm0, %xmm1 -; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 -; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 +; CHECK-NEXT: pxor %xmm1, %xmm2 +; CHECK-NEXT: pandn %xmm2, %xmm0 ; CHECK-NEXT: movdqa %xmm0, (%rdi) ; CHECK-NEXT: retq %cmp1 = icmp ne <2 x i64> %src1, zeroinitializer @@ -32,19 +32,19 @@ define void @t2(ptr %dst, <3 x i64> %src1, <3 x i64> %src2) nounwind readonly { ; CHECK-NEXT: movq %rcx, %xmm0 ; CHECK-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; 
CHECK-NEXT: pxor %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqq %xmm4, %xmm2 ; CHECK-NEXT: pcmpeqq %xmm4, %xmm0 -; CHECK-NEXT: pcmpeqd %xmm5, %xmm5 -; CHECK-NEXT: pcmpeqq %xmm4, %xmm1 -; CHECK-NEXT: por %xmm2, %xmm1 +; CHECK-NEXT: pcmpeqq %xmm4, %xmm2 +; CHECK-NEXT: packssdw %xmm0, %xmm2 +; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 ; CHECK-NEXT: pcmpeqq %xmm4, %xmm3 -; CHECK-NEXT: por %xmm0, %xmm3 +; CHECK-NEXT: pcmpeqq %xmm4, %xmm1 ; CHECK-NEXT: packssdw %xmm3, %xmm1 -; CHECK-NEXT: pxor %xmm5, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] +; CHECK-NEXT: pxor %xmm0, %xmm1 +; CHECK-NEXT: pandn %xmm1, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2] ; CHECK-NEXT: pslld $31, %xmm0 ; CHECK-NEXT: psrad $31, %xmm0 -; CHECK-NEXT: pmovsxdq %xmm1, %xmm1 +; CHECK-NEXT: pmovsxdq %xmm2, %xmm1 ; CHECK-NEXT: movdqa %xmm1, (%rdi) ; CHECK-NEXT: movq %xmm0, 16(%rdi) ; CHECK-NEXT: retq
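The X86 test updates above are all the same rewrite seen from different tests: a NOT of a (possibly nested) OR is re-associated into AND-NOT chains so that PANDN/VPANDN (or BIC on AArch64) can absorb the inversion. A reduced standalone reproducer, not taken from the patch (function and value names are invented here):

  ; not(or(a, or(b, c))) -> and(not(a), and(not(b), not(c)))
  define <4 x i32> @not_or_or(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
    %bc  = or <4 x i32> %b, %c
    %abc = or <4 x i32> %a, %bc
    %not = xor <4 x i32> %abc, <i32 -1, i32 -1, i32 -1, i32 -1>
    ret <4 x i32> %not
  }

Fed through llc (e.g. -mtriple=x86_64-- -mattr=+avx), this should now favour vpandn-based sequences over the old shape of two vpor plus a NOT materialized with vpcmpeqd/vpxor.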
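Several hunks (vec_cmp_sint-128.ll, vec_cmp_uint-128.ll) do not change the generated code at all: vpternlogq with immediate 15 (0x0F) is the ternary truth table for NOT of the first source, and the checks merely switch from the raw $15 operand to the decoded {{.*#+}} zmm0 = ~zmm0 comment form. For reference, a sketch of IR that is expected to select exactly this single-instruction vector NOT on AVX-512 (names invented, not from the patch):

  define <8 x i64> @not_v8i64(<8 x i64> %a) {
    ; xor with all-ones is the canonical vector NOT; AVX-512 has no plain
    ; vector-not instruction, so this matches vpternlogq $0x0F, whose
    ; truth table sets every output bit to the complement of source 1
    %not = xor <8 x i64> %a, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
    ret <8 x i64> %not
  }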
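The ctlz tests (vec_ctbits.ll, vector-lzcnt-128.ll, vector-lzcnt-512.ll, vector-lzcnt-sub128.ll) reach the same fold indirectly: the generic expansion smears the leading set bit rightward (x |= x>>1, x |= x>>2, ... up to half the element width), then inverts and popcounts, so its tail is precisely a NOT of nested ORs and the final smear steps fold into pandn or multi-input vpternlog expressions, as the hunks above show. A sketch of a driver that triggers this lowering on targets without a native vector leading-zero count (assumed setup, not from the patch):

  declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)

  define <4 x i32> @clz_v4i32(<4 x i32> %x) {
    ; the 'i1 false' flag asks for a defined result (32) on zero inputs
    %r = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x, i1 false)
    ret <4 x i32> %r
  }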