diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index cee609ed1e2f6..424f18ba4d822 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15841,11 +15841,27 @@ static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
     return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
   }
 
-  // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
-  // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
+  // Results of setcc operations get widened to 128 bits for xor reductions
+  // if their input operands are 128 bits wide; otherwise, vectors narrower
+  // than 64 bits get widened to neatly fit a 64-bit register, so e.g.
+  // <4 x i1> gets lowered to either <4 x i16> or <4 x i32>. Sign extending to
   // this element size leads to the best codegen, since e.g. setcc results
   // might need to be truncated otherwise.
-  EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));
+  unsigned ExtendedWidth = 64;
+  if (ScalarOpcode == ISD::XOR && Vec.getOpcode() == ISD::SETCC &&
+      Vec.getOperand(0).getValueSizeInBits() >= 128) {
+    ExtendedWidth = 128;
+  }
+  EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
+
+  // Negate the vector for reduce-and operations that are lowered to fcmp
+  // below, turning the all-lanes-set check into a no-lanes-set check.
+  if (ScalarOpcode == ISD::AND && NumElems < 16) {
+    Vec = DAG.getNode(
+        ISD::XOR, DL, VecVT, Vec,
+        DAG.getSplatVector(
+            VecVT, DL, DAG.getConstant(APInt::getAllOnes(32), DL, MVT::i32)));
+  }
 
   // any_ext doesn't work with umin/umax, so only use it for uadd.
   unsigned ExtendOp =
@@ -15854,10 +15870,36 @@ static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
       ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
   switch (ScalarOpcode) {
   case ISD::AND:
-    Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
+    if (NumElems < 16) {
+      // Check if all lanes of the negated bool vector value are zero by
+      // comparing against 0.0 with the 'ordered and equal' predicate. The
+      // only non-zero bit pattern that compares ordered and equal to 0.0 is
+      // -0.0, where only the sign bit is set. However, the bool vector is
+      // sign-extended, so every lane is either all zeros or all ones, which
+      // makes the bit pattern of -0.0 impossible.
+      assert(Extended.getValueSizeInBits() == 64);
+      Extended = DAG.getBitcast(MVT::f64, Extended);
+      Result =
+          DAG.getSetCC(DL, MVT::i32, Extended,
+                       DAG.getConstantFP(0.0, DL, MVT::f64), ISD::SETOEQ);
+    } else {
+      Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
+    }
     break;
   case ISD::OR:
-    Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
+    if (NumElems < 16) {
+      // Check if any lane of the bool vector is set by comparing against 0.0.
+      // NaN bit patterns are handled by using the 'unordered or not equal'
+      // predicate. As in the reduce-and case, -0.0 doesn't have to be handled
+      // here (see the explanation above).
+      assert(Extended.getValueSizeInBits() == 64);
+      Extended = DAG.getBitcast(MVT::f64, Extended);
+      Result =
+          DAG.getSetCC(DL, MVT::i32, Extended,
+                       DAG.getConstantFP(0.0, DL, MVT::f64), ISD::SETUNE);
+    } else {
+      Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
+    }
     break;
   case ISD::XOR:
     Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
index a48a4e0e723eb..fb366564723db 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
@@ -5,10 +5,8 @@ define i1 @combine_setcc_eq_vecreduce_or_v8i1(<8 x i8> %a) {
 ; CHECK-LABEL: combine_setcc_eq_vecreduce_or_v8i1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmeq v0.8b, v0.8b, #0
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    umaxv b0, v0.8b
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    bic w0, w8, w9
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cmp1 = icmp eq <8 x i8> %a, zeroinitializer
   %cast = bitcast <8 x i1> %cmp1 to i8
@@ -73,9 +71,8 @@ define i1 @combine_setcc_ne_vecreduce_or_v8i1(<8 x i8> %a) {
 ; CHECK-LABEL: combine_setcc_ne_vecreduce_or_v8i1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmtst v0.8b, v0.8b, v0.8b
-; CHECK-NEXT:    umaxv b0, v0.8b
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %cmp1 = icmp ne <8 x i8> %a, zeroinitializer
   %cast = bitcast <8 x i1> %cmp1 to i8
@@ -132,10 +129,9 @@ define i1 @combine_setcc_ne_vecreduce_or_v64i1(<64 x i8> %a) {
 define i1 @combine_setcc_eq_vecreduce_and_v8i1(<8 x i8> %a) {
 ; CHECK-LABEL: combine_setcc_eq_vecreduce_and_v8i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmeq v0.8b, v0.8b, #0
-; CHECK-NEXT:    uminv b0, v0.8b
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    cmtst v0.8b, v0.8b, v0.8b
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %cmp1 = icmp eq <8 x i8> %a, zeroinitializer
   %cast = bitcast <8 x i1> %cmp1 to i8
@@ -192,11 +188,9 @@ define i1 @combine_setcc_eq_vecreduce_and_v64i1(<64 x i8> %a) {
 define i1 @combine_setcc_ne_vecreduce_and_v8i1(<8 x i8> %a) {
 ; CHECK-LABEL: combine_setcc_ne_vecreduce_and_v8i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmtst v0.8b, v0.8b, v0.8b
-; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    uminv b0, v0.8b
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    bic w0, w8, w9
+; CHECK-NEXT:    cmeq v0.8b, v0.8b, #0
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %cmp1 = icmp ne <8 x i8> %a, zeroinitializer
   %cast = bitcast <8 x i1> %cmp1 to i8
diff --git a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
index 767ca91a58bb1..5374d4823034f 100644
--- a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
+++ b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
@@ -9,13 +9,11 @@ define i1 @unordered_floating_point_compare_on_v8f32(<8 x float> %a_vec) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fcmgt v1.4s, v1.4s, #0.0
 ; CHECK-NEXT:    fcmgt v0.4s, v0.4s, #0.0
-; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
 ; CHECK-NEXT:    xtn v0.8b, v0.8h
-; CHECK-NEXT:    umaxv b0, v0.8b
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    bic w0, w8, w9
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %a_cmp = fcmp ule <8 x float> %a_vec, zeroinitializer
   %cmp_result = bitcast <8 x i1> %a_cmp to i8
diff --git a/llvm/test/CodeGen/AArch64/reduce-and.ll b/llvm/test/CodeGen/AArch64/reduce-and.ll
index 8ca521327c2e3..62f3e8d184d24 100644
--- a/llvm/test/CodeGen/AArch64/reduce-and.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-and.ll
@@ -20,11 +20,11 @@ define i1 @test_redand_v1i1(<1 x i1> %a) {
 define i1 @test_redand_v2i1(<2 x i1> %a) {
 ; CHECK-LABEL: test_redand_v2i1:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn v0.8b, v0.8b
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #31
 ; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
-; CHECK-NEXT:    uminp v0.2s, v0.2s, v0.2s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test_redand_v2i1:
@@ -42,11 +42,11 @@ define i1 @test_redand_v2i1(<2 x i1> %a) {
 define i1 @test_redand_v4i1(<4 x i1> %a) {
 ; CHECK-LABEL: test_redand_v4i1:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn v0.8b, v0.8b
 ; CHECK-NEXT:    shl v0.4h, v0.4h, #15
 ; CHECK-NEXT:    cmlt v0.4h, v0.4h, #0
-; CHECK-NEXT:    uminv h0, v0.4h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test_redand_v4i1:
@@ -68,11 +68,11 @@ define i1 @test_redand_v4i1(<4 x i1> %a) {
 define i1 @test_redand_v8i1(<8 x i1> %a) {
 ; CHECK-LABEL: test_redand_v8i1:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn v0.8b, v0.8b
 ; CHECK-NEXT:    shl v0.8b, v0.8b, #7
 ; CHECK-NEXT:    cmlt v0.8b, v0.8b, #0
-; CHECK-NEXT:    uminv b0, v0.8b
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test_redand_v8i1:
diff --git a/llvm/test/CodeGen/AArch64/reduce-or.ll b/llvm/test/CodeGen/AArch64/reduce-or.ll
index aac31ce8b71b7..485cb7c916140 100644
--- a/llvm/test/CodeGen/AArch64/reduce-or.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-or.ll
@@ -22,9 +22,8 @@ define i1 @test_redor_v2i1(<2 x i1> %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #31
 ; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
-; CHECK-NEXT:    umaxp v0.2s, v0.2s, v0.2s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test_redor_v2i1:
@@ -44,9 +43,8 @@ define i1 @test_redor_v4i1(<4 x i1> %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v0.4h, v0.4h, #15
 ; CHECK-NEXT:    cmlt v0.4h, v0.4h, #0
-; CHECK-NEXT:    umaxv h0, v0.4h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test_redor_v4i1:
@@ -70,9 +68,8 @@ define i1 @test_redor_v8i1(<8 x i1> %a) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v0.8b, v0.8b, #7
 ; CHECK-NEXT:    cmlt v0.8b, v0.8b, #0
-; CHECK-NEXT:    umaxv b0, v0.8b
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test_redor_v8i1:
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
index 7fa416e0dbcd5..fd81deeb7d913 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
@@ -139,11 +139,11 @@ define i32 @test_v3i32(<3 x i32> %a) nounwind {
 define i1 @test_v4i1(<4 x i1> %a) nounwind {
 ; CHECK-LABEL: test_v4i1:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mvn v0.8b, v0.8b
 ; CHECK-NEXT:    shl v0.4h, v0.4h, #15
 ; CHECK-NEXT:    cmlt v0.4h, v0.4h, #0
-; CHECK-NEXT:    uminv h0, v0.4h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
   %b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a)
   ret i1 %b
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
index 58020d28702b2..10a3ef1658a96 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
@@ -15,8 +15,15 @@ declare i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a)
 declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a)
 declare i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %a)
 
-define i32 @reduce_and_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v1:
+declare i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> %a)
+
+define i32 @reduce_and_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v1i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    smov w8, v0.b[0]
@@ -29,16 +36,14 @@ define i32 @reduce_and_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_and_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v2:
+define i32 @reduce_and_v2i8(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v2i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #24
 ; CHECK-NEXT:    sshr v0.2s, v0.2s, #24
-; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
-; CHECK-NEXT:    uminp v0.2s, v0.2s, v0.2s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    tst w8, #0x1
-; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    cmge v0.2s, v0.2s, #0
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    csel w0, w0, w1, eq
 ; CHECK-NEXT:    ret
   %x = icmp slt <2 x i8> %a0, zeroinitializer
   %y = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %x)
@@ -46,16 +51,14 @@ define i32 @reduce_and_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_and_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v4:
+define i32 @reduce_and_v4i8(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v4i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-NEXT:    sshr v0.4h, v0.4h, #8
-; CHECK-NEXT:    cmlt v0.4h, v0.4h, #0
-; CHECK-NEXT:    uminv h0, v0.4h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    tst w8, #0x1
-; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    cmge v0.4h, v0.4h, #0
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    csel w0, w0, w1, eq
 ; CHECK-NEXT:    ret
   %x = icmp slt <4 x i8> %a0, zeroinitializer
   %y = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %x)
@@ -63,14 +66,12 @@ define i32 @reduce_and_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_and_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v8:
+define i32 @reduce_and_v8i8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmlt v0.8b, v0.8b, #0
-; CHECK-NEXT:    uminv b0, v0.8b
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    tst w8, #0x1
-; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    cmge v0.8b, v0.8b, #0
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    csel w0, w0, w1, eq
 ; CHECK-NEXT:    ret
   %x = icmp slt <8 x i8> %a0, zeroinitializer
   %y = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %x)
@@ -78,8 +79,8 @@ define i32 @reduce_and_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_and_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v16:
+define i32 @reduce_and_v16i8(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
 ; CHECK-NEXT:    uminv b0, v0.16b
@@ -93,8 +94,8 @@ define i32 @reduce_and_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_and_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v32:
+define i32 @reduce_and_v32i8(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v32i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
@@ -109,8 +110,182 @@ define i32 @reduce_and_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_or_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v1:
+define i32 @reduce_and_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v1i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    smov w8, v0.h[0]
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v2i16(<2 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-NEXT:    cmge v0.2s, v0.2s, #0
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    csel w0, w0, w1, eq
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v4i16(<4 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.4h, v0.4h, #0
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    csel w0, w0, w1, eq
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v8i16(<8 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.8h, v0.8h, #0
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    csel w0, w0, w1, eq
+; CHECK-NEXT:    ret
+  %x = icmp slt <8 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v16i16(<16 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    uminv b0, v0.16b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <16 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v1i32(<1 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v2i32(<2 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.2s, v0.2s, #0
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    csel w0, w0, w1, eq
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v4i32(<4 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.4s, v0.4s, #0
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    csel w0, w0, w1, eq
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v8i32(<8 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v1.4s, v1.4s, #0
+; CHECK-NEXT:    cmge v0.4s, v0.4s, #0
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    csel w0, w0, w1, eq
+; CHECK-NEXT:    ret
+  %x = icmp slt <8 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v1i64(<1 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v2i64(<2 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v0.2d, v0.2d, #0
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    csel w0, w0, w1, eq
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v4i64(<4 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmge v1.2d, v1.2d, #0
+; CHECK-NEXT:    cmge v0.2d, v0.2d, #0
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    csel w0, w0, w1, eq
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v1i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    smov w8, v0.b[0]
@@ -123,15 +298,13 @@ define i32 @reduce_or_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_or_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v2:
+define i32 @reduce_or_v2i8(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v2i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #24
 ; CHECK-NEXT:    sshr v0.2s, v0.2s, #24
 ; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
-; CHECK-NEXT:    umaxp v0.2s, v0.2s, v0.2s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    fcmp d0, #0.0
 ; CHECK-NEXT:    csel w0, w0, w1, ne
 ; CHECK-NEXT:    ret
   %x = icmp slt <2 x i8> %a0, zeroinitializer
@@ -140,15 +313,13 @@ define i32 @reduce_or_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_or_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v4:
+define i32 @reduce_or_v4i8(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v4i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-NEXT:    sshr v0.4h, v0.4h, #8
 ; CHECK-NEXT:    cmlt v0.4h, v0.4h, #0
-; CHECK-NEXT:    umaxv h0, v0.4h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    fcmp d0, #0.0
 ; CHECK-NEXT:    csel w0, w0, w1, ne
 ; CHECK-NEXT:    ret
   %x = icmp slt <4 x i8> %a0, zeroinitializer
@@ -157,13 +328,11 @@ define i32 @reduce_or_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_or_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v8:
+define i32 @reduce_or_v8i8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v8i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmlt v0.8b, v0.8b, #0
-; CHECK-NEXT:    umaxv b0, v0.8b
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    fcmp d0, #0.0
 ; CHECK-NEXT:    csel w0, w0, w1, ne
 ; CHECK-NEXT:    ret
   %x = icmp slt <8 x i8> %a0, zeroinitializer
@@ -172,8 +341,8 @@ define i32 @reduce_or_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_or_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v16:
+define i32 @reduce_or_v16i8(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
 ; CHECK-NEXT:    umaxv b0, v0.16b
@@ -187,8 +356,8 @@ define i32 @reduce_or_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_or_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v32:
+define i32 @reduce_or_v32i8(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v32i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
@@ -202,3 +371,457 @@ define i32 @reduce_or_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   %z = select i1 %y, i32 %a1, i32 %a2
   ret i32 %z
 }
+
+define i32 @reduce_or_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v1i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    smov w8, v0.h[0]
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v2i16(<2 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v4i16(<4 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4h, v0.4h, #0
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v8i16(<8 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <8 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v16i16(<16 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    umaxv b0, v0.16b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <16 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v1i32(<1 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v2i32(<2 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v4i32(<4 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v8i32(<8 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.4s, v1.4s, #0
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <8 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v1i64(<1 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v2i64(<2 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v4i64(<4 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.2d, v1.2d, #0
+; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v1i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    smov w8, v0.b[0]
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i8> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v2i8(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-NEXT:    sshr v0.2s, v0.2s, #24
+; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i8> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v4i8(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-NEXT:    cmlt v0.4h, v0.4h, #0
+; CHECK-NEXT:    addv h0, v0.4h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i8> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v8i8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.8b, v0.8b, #0
+; CHECK-NEXT:    addv b0, v0.8b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <8 x i8> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v16i8(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT:    addv b0, v0.16b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <16 x i8> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v32i8(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.16b, v1.16b, #0
+; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    addv b0, v0.16b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <32 x i8> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v1i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    smov w8, v0.h[0]
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v2i16(<2 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v4i16(<4 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4h, v0.4h, #0
+; CHECK-NEXT:    addv h0, v0.4h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v8i16(<8 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    addv h0, v0.8h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <8 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v16i16(<16 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    addv b0, v0.16b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <16 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v1i32(<1 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v2i32(<2 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v4i32(<4 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v8i32(<8 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.4s, v1.4s, #0
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    addv h0, v0.8h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <8 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v1i64(<1 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v2i64(<2 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT:    addp d0, v0.2d
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v4i64(<4 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.2d, v1.2d, #0
+; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
index 809a6d6556a7b..2a21cc8d7c611 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
@@ -202,9 +202,8 @@ define i1 @test_v4i1(<4 x i1> %a) nounwind {
 ; CHECK-SD:       // %bb.0:
 ; CHECK-SD-NEXT:    shl v0.4h, v0.4h, #15
 ; CHECK-SD-NEXT:    cmlt v0.4h, v0.4h, #0
-; CHECK-SD-NEXT:    umaxv h0, v0.4h
-; CHECK-SD-NEXT:    fmov w8, s0
-; CHECK-SD-NEXT:    and w0, w8, #0x1
+; CHECK-SD-NEXT:    fcmp d0, #0.0
+; CHECK-SD-NEXT:    cset w0, ne
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: test_v4i1:
diff --git a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
index c0f1720e1cf8b..593c9db090a26 100644
--- a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
+++ b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
@@ -62,13 +62,11 @@ define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) {
 ; NEON-FIXED-NEXT:    str q0, [sp]
 ; NEON-FIXED-NEXT:    xtn v1.8b, v1.8h
 ; NEON-FIXED-NEXT:    and v2.8b, v1.8b, v2.8b
-; NEON-FIXED-NEXT:    umaxv b1, v1.8b
+; NEON-FIXED-NEXT:    fcmp d1, #0.0
 ; NEON-FIXED-NEXT:    umaxv b2, v2.8b
 ; NEON-FIXED-NEXT:    fmov w8, s2
 ; NEON-FIXED-NEXT:    bfi x9, x8, #1, #3
 ; NEON-FIXED-NEXT:    ldrh w8, [x9]
-; NEON-FIXED-NEXT:    fmov w9, s1
-; NEON-FIXED-NEXT:    tst w9, #0x1
 ; NEON-FIXED-NEXT:    csel w0, w8, w0, ne
 ; NEON-FIXED-NEXT:    add sp, sp, #16
 ; NEON-FIXED-NEXT:    ret
@@ -83,13 +81,11 @@ define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) {
 ; SVE-FIXED-NEXT:    str q0, [sp]
 ; SVE-FIXED-NEXT:    xtn v1.8b, v1.8h
 ; SVE-FIXED-NEXT:    and v2.8b, v1.8b, v2.8b
-; SVE-FIXED-NEXT:    umaxv b1, v1.8b
+; SVE-FIXED-NEXT:    fcmp d1, #0.0
 ; SVE-FIXED-NEXT:    umaxv b2, v2.8b
 ; SVE-FIXED-NEXT:    fmov w8, s2
 ; SVE-FIXED-NEXT:    bfi x9, x8, #1, #3
 ; SVE-FIXED-NEXT:    ldrh w8, [x9]
-; SVE-FIXED-NEXT:    fmov w9, s1
-; SVE-FIXED-NEXT:    tst w9, #0x1
 ; SVE-FIXED-NEXT:    csel w0, w8, w0, ne
 ; SVE-FIXED-NEXT:    add sp, sp, #16
 ; SVE-FIXED-NEXT:    ret
@@ -110,13 +106,11 @@ define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) {
 ; NEON-FIXED-NEXT:    str q0, [sp]
 ; NEON-FIXED-NEXT:    xtn v1.4h, v1.4s
 ; NEON-FIXED-NEXT:    and v2.8b, v1.8b, v2.8b
-; NEON-FIXED-NEXT:    umaxv h1, v1.4h
+; NEON-FIXED-NEXT:    fcmp d1, #0.0
 ; NEON-FIXED-NEXT:    umaxv h2, v2.4h
 ; NEON-FIXED-NEXT:    fmov w8, s2
 ; NEON-FIXED-NEXT:    bfi x9, x8, #2, #2
 ; NEON-FIXED-NEXT:    ldr w8, [x9]
-; NEON-FIXED-NEXT:    fmov w9, s1
-; NEON-FIXED-NEXT:    tst w9, #0x1
 ; NEON-FIXED-NEXT:    csel w0, w8, w0, ne
 ; NEON-FIXED-NEXT:    add sp, sp, #16
 ; NEON-FIXED-NEXT:    ret
@@ -131,13 +125,11 @@ define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) {
 ; SVE-FIXED-NEXT:    str q0, [sp]
 ; SVE-FIXED-NEXT:    xtn v1.4h, v1.4s
 ; SVE-FIXED-NEXT:    and v2.8b, v1.8b, v2.8b
-; SVE-FIXED-NEXT:    umaxv h1, v1.4h
+; SVE-FIXED-NEXT:    fcmp d1, #0.0
 ; SVE-FIXED-NEXT:    umaxv h2, v2.4h
 ; SVE-FIXED-NEXT:    fmov w8, s2
 ; SVE-FIXED-NEXT:    bfi x9, x8, #2, #2
 ; SVE-FIXED-NEXT:    ldr w8, [x9]
-; SVE-FIXED-NEXT:    fmov w9, s1
-; SVE-FIXED-NEXT:    tst w9, #0x1
 ; SVE-FIXED-NEXT:    csel w0, w8, w0, ne
 ; SVE-FIXED-NEXT:    add sp, sp, #16
 ; SVE-FIXED-NEXT:    ret
@@ -158,13 +150,11 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
 ; NEON-FIXED-NEXT:    str q0, [sp]
 ; NEON-FIXED-NEXT:    xtn v1.2s, v1.2d
 ; NEON-FIXED-NEXT:    and v2.8b, v1.8b, v2.8b
-; NEON-FIXED-NEXT:    umaxp v1.2s, v1.2s, v1.2s
+; NEON-FIXED-NEXT:    fcmp d1, #0.0
 ; NEON-FIXED-NEXT:    umaxp v2.2s, v2.2s, v2.2s
 ; NEON-FIXED-NEXT:    fmov w8, s2
 ; NEON-FIXED-NEXT:    bfi x9, x8, #3, #1
 ; NEON-FIXED-NEXT:    ldr x8, [x9]
-; NEON-FIXED-NEXT:    fmov w9, s1
-; NEON-FIXED-NEXT:    tst w9, #0x1
 ; NEON-FIXED-NEXT:    csel x0, x8, x0, ne
 ; NEON-FIXED-NEXT:    add sp, sp, #16
 ; NEON-FIXED-NEXT:    ret
@@ -179,13 +169,11 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
 ; SVE-FIXED-NEXT:    str q0, [sp]
 ; SVE-FIXED-NEXT:    xtn v1.2s, v1.2d
 ; SVE-FIXED-NEXT:    and v2.8b, v1.8b, v2.8b
-; SVE-FIXED-NEXT:    umaxp v1.2s, v1.2s, v1.2s
+; SVE-FIXED-NEXT:    fcmp d1, #0.0
 ; SVE-FIXED-NEXT:    umaxp v2.2s, v2.2s, v2.2s
 ; SVE-FIXED-NEXT:    fmov w8, s2
 ; SVE-FIXED-NEXT:    bfi x9, x8, #3, #1
 ; SVE-FIXED-NEXT:    ldr x8, [x9]
-; SVE-FIXED-NEXT:    fmov w9, s1
-; SVE-FIXED-NEXT:    tst w9, #0x1
 ; SVE-FIXED-NEXT:    csel x0, x8, x0, ne
 ; SVE-FIXED-NEXT:    add sp, sp, #16
 ; SVE-FIXED-NEXT:    ret
@@ -206,13 +194,11 @@ define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %pass
 ; NEON-FIXED-NEXT:    str q0, [sp]
 ; NEON-FIXED-NEXT:    xtn v1.4h, v1.4s
 ; NEON-FIXED-NEXT:    and v3.8b, v1.8b, v3.8b
-; NEON-FIXED-NEXT:    umaxv h1, v1.4h
+; NEON-FIXED-NEXT:    fcmp d1, #0.0
 ; NEON-FIXED-NEXT:    umaxv h3, v3.4h
 ; NEON-FIXED-NEXT:    fmov w8, s3
 ; NEON-FIXED-NEXT:    bfi x9, x8, #2, #2
-; NEON-FIXED-NEXT:    fmov w8, s1
 ; NEON-FIXED-NEXT:    ldr s0, [x9]
-; NEON-FIXED-NEXT:    tst w8, #0x1
 ; NEON-FIXED-NEXT:    fcsel s0, s0, s2, ne
 ; NEON-FIXED-NEXT:    add sp, sp, #16
 ; NEON-FIXED-NEXT:    ret
@@ -227,13 +213,11 @@ define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %pass
 ; SVE-FIXED-NEXT:    str q0, [sp]
 ; SVE-FIXED-NEXT:    xtn v1.4h, v1.4s
 ; SVE-FIXED-NEXT:    and v3.8b, v1.8b, v3.8b
-; SVE-FIXED-NEXT:    umaxv h1, v1.4h
+; SVE-FIXED-NEXT:    fcmp d1, #0.0
 ; SVE-FIXED-NEXT:    umaxv h3, v3.4h
 ; SVE-FIXED-NEXT:    fmov w8, s3
 ; SVE-FIXED-NEXT:    bfi x9, x8, #2, #2
-; SVE-FIXED-NEXT:    fmov w8, s1
 ; SVE-FIXED-NEXT:    ldr s0, [x9]
-; SVE-FIXED-NEXT:    tst w8, #0x1
 ; SVE-FIXED-NEXT:    fcsel s0, s0, s2, ne
 ; SVE-FIXED-NEXT:    add sp, sp, #16
 ; SVE-FIXED-NEXT:    ret
@@ -254,13 +238,11 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %
 ; NEON-FIXED-NEXT:    str q0, [sp]
 ; NEON-FIXED-NEXT:    xtn v1.2s, v1.2d
 ; NEON-FIXED-NEXT:    and v3.8b, v1.8b, v3.8b
-; NEON-FIXED-NEXT:    umaxp v1.2s, v1.2s, v1.2s
+; NEON-FIXED-NEXT:    fcmp d1, #0.0
 ; NEON-FIXED-NEXT:    umaxp v3.2s, v3.2s, v3.2s
 ; NEON-FIXED-NEXT:    fmov w8, s3
 ; NEON-FIXED-NEXT:    bfi x9, x8, #3, #1
-; NEON-FIXED-NEXT:    fmov w8, s1
 ; NEON-FIXED-NEXT:    ldr d0, [x9]
-; NEON-FIXED-NEXT:    tst w8, #0x1
 ; NEON-FIXED-NEXT:    fcsel d0, d0, d2, ne
 ; NEON-FIXED-NEXT:    add sp, sp, #16
 ; NEON-FIXED-NEXT:    ret
@@ -275,13 +257,11 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %
 ; SVE-FIXED-NEXT:    str q0, [sp]
 ; SVE-FIXED-NEXT:    xtn v1.2s, v1.2d
 ; SVE-FIXED-NEXT:    and v3.8b, v1.8b, v3.8b
-; SVE-FIXED-NEXT:    umaxp v1.2s, v1.2s, v1.2s
+; SVE-FIXED-NEXT:    fcmp d1, #0.0
 ; SVE-FIXED-NEXT:    umaxp v3.2s, v3.2s, v3.2s
 ; SVE-FIXED-NEXT:    fmov w8, s3
 ; SVE-FIXED-NEXT:    bfi x9, x8, #3, #1
-; SVE-FIXED-NEXT:    fmov w8, s1
 ; SVE-FIXED-NEXT:    ldr d0, [x9]
-; SVE-FIXED-NEXT:    tst w8, #0x1
 ; SVE-FIXED-NEXT:    fcsel d0, d0, d2, ne
 ; SVE-FIXED-NEXT:    add sp, sp, #16
 ; SVE-FIXED-NEXT:    ret
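
Reviewer note (editorial addendum, not part of the patch): the comments in
getVectorBitwiseReduce argue that a sub-16-lane bool reduction can be lowered
to a bitcast of the sign-extended mask to f64 plus an fcmp against 0.0,
because a mask whose lanes are all zeros or all ones can never produce the
-0.0 bit pattern, and NaN bit patterns fall on the correct side of both
predicates (SETOEQ for reduce-and after the mask is negated, SETUNE for
reduce-or). The standalone C++ sketch below checks that argument exhaustively
for an 8-lane mask; it is illustrative only, and the helper names noLaneSet
and anyLaneSet are invented here rather than taken from LLVM.

#include <cassert>
#include <cstdint>
#include <cstring>

// Mirrors "fcmp d0, #0.0; cset w0, eq" on a sign-extended <8 x i1> mask:
// true iff no lane is set. This is the ISD::SETOEQ path the patch uses for
// reduce-and, applied after the mask has been negated.
static bool noLaneSet(uint64_t maskBits) {
  double d;
  std::memcpy(&d, &maskBits, sizeof d); // the bitcast to MVT::f64
  return d == 0.0;
}

// Mirrors "fcmp d0, #0.0; cset w0, ne": true iff some lane is set. This is
// the ISD::SETUNE path used for reduce-or; NaN patterns also compare true,
// which is correct since a NaN pattern implies a non-zero lane.
static bool anyLaneSet(uint64_t maskBits) {
  double d;
  std::memcpy(&d, &maskBits, sizeof d);
  return d != 0.0;
}

int main() {
  // Enumerate every 8-lane bool pattern; each lane is 0x00 or 0xFF.
  for (unsigned m = 0; m < 256; ++m) {
    uint64_t bits = 0;
    for (int lane = 0; lane < 8; ++lane)
      if (m & (1u << lane))
        bits |= UINT64_C(0xFF) << (8 * lane);
    // +0.0 needs all 64 bits clear; -0.0 would need a lone 0x80 byte, which
    // no 0x00/0xFF lane pattern can produce; NaN patterns such as
    // 0xFFFF000000000000 compare unequal to 0.0, as required.
    assert(noLaneSet(bits) == (m == 0));
    assert(anyLaneSet(bits) == (m != 0));
  }
  return 0;
}

The 16-lane cases keep using UMINV/UMAXV in the patch because the extended
mask no longer fits in a 64-bit d register, which matches the NumElems < 16
guards and the unchanged v16i8 test output above.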