diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index fa104e4f69d7f..d386da4438641 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -15556,6 +15556,89 @@ SDValue PPCTargetLowering::combineSetCC(SDNode *N, SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1)); return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); } + + // Optimization: Fold i128 equality/inequality compares of two loads into a + // vectorized compare using vcmpequb.p when VSX is available. + // + // Rationale: + // A scalar i128 SETCC (eq/ne) normally lowers to multiple scalar ops. + // On VSX-capable subtargets, we can instead reinterpret the i128 loads + // as v16i8 vectors and use the Altivec/VSX vcmpequb.p instruction to + // perform a full 128-bit equality check in a single vector compare. + + if (Subtarget.hasVSX()) { + if (LHS.getOpcode() == ISD::LOAD && RHS.getOpcode() == ISD::LOAD && + LHS.hasOneUse() && RHS.hasOneUse() && + LHS.getValueType() == MVT::i128 && RHS.getValueType() == MVT::i128) { + SDLoc DL(N); + SelectionDAG &DAG = DCI.DAG; + auto *LA = dyn_cast<LoadSDNode>(LHS); + auto *LB = dyn_cast<LoadSDNode>(RHS); + if (!LA || !LB) + return DAGCombineTruncBoolExt(N, DCI); + + // If either memory operation (LA or LB) is volatile, do not perform any + // optimization or transformation. Volatile operations must be preserved + // as written to ensure correct program behavior, so we bail out to the + // default combine without transforming the node. + if (LA->isVolatile() || LB->isVolatile()) + return DAGCombineTruncBoolExt(N, DCI); + + // Only combine loads if both use the unindexed addressing mode. + // PowerPC AltiVec/VMX does not support vector loads or stores with + // pre/post-increment addressing. Indexed modes may imply implicit + // pointer updates, which are not compatible with AltiVec vector + // instructions. 
+ if (LA->getAddressingMode() != ISD::UNINDEXED || + LB->getAddressingMode() != ISD::UNINDEXED) + return DAGCombineTruncBoolExt(N, DCI); + + // Only combine loads if both are non-extending loads + // (ISD::NON_EXTLOAD). Extending loads (such as ISD::ZEXTLOAD or + // ISD::SEXTLOAD) perform zero or sign extension, which may change the + // loaded value's semantics and are not compatible with vector loads. + if (LA->getExtensionType() != ISD::NON_EXTLOAD || + LB->getExtensionType() != ISD::NON_EXTLOAD) + return DAGCombineTruncBoolExt(N, DCI); + + // Following code transforms the DAG + // t0: ch,glue = EntryToken + // t2: i64,ch = CopyFromReg t0, Register:i64 %0 + // t3: i128,ch = load<(load (s128) from %ir.a, align 1)> t0, t2, + // undef:i64 t4: i64,ch = CopyFromReg t0, Register:i64 %1 t5: i128,ch = + // load<(load (s128) from %ir.b, align 1)> t0, t4, undef:i64 t6: i1 = + // setcc t3, t5, setne:ch + // + // ----> + // + // t0: ch,glue = EntryToken + // t2: i64,ch = CopyFromReg t0, Register:i64 %0 + // t3: v16i8,ch = load<(load (s128) from %ir.a, align 1)> t0, t2, + // undef:i64 t4: i64,ch = CopyFromReg t0, Register:i64 %1 t5: v16i8,ch = + // load<(load (s128) from %ir.b, align 1)> t0, t4, undef:i64 t6: i32 = + // llvm.ppc.altivec.vcmpequb.p TargetConstant:i32<10505>, + // Constant:i32<2>, t3, t5 t7: i1 = setcc t6, Constant:i32<0>, seteq:ch + + SDValue LHSVec = DAG.getLoad(MVT::v16i8, DL, LA->getChain(), + LA->getBasePtr(), LA->getMemOperand()); + SDValue RHSVec = DAG.getLoad(MVT::v16i8, DL, LB->getChain(), + LB->getBasePtr(), LB->getMemOperand()); + + SDValue IntrID = + DAG.getTargetConstant(Intrinsic::ppc_altivec_vcmpequb_p, DL, + Subtarget.isPPC64() ? 
MVT::i64 : MVT::i32); + SDValue CRSel = + DAG.getConstant(2, DL, MVT::i32); // which CR6 predicate field + SDValue PredResult = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, + IntrID, CRSel, LHSVec, RHSVec); + + // ppc_altivec_vcmpequb_p returns 1 when two vectors are the same, + // so we need to invert the CC opcode. + return DAG.getSetCC(DL, N->getValueType(0), PredResult, + DAG.getConstant(0, DL, MVT::i32), + CC == ISD::SETNE ? ISD::SETEQ : ISD::SETNE); + } + } } return DAGCombineTruncBoolExt(N, DCI); diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 2fba090f2d501..93b7308f3fdab 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -439,7 +439,7 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) const { PPCTTIImpl::TTI::MemCmpExpansionOptions PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { TTI::MemCmpExpansionOptions Options; - Options.LoadSizes = {8, 4, 2, 1}; + Options.LoadSizes = {16, 8, 4, 2, 1}; Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); return Options; } diff --git a/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll b/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll index 1da40d46aa773..7c4cf7265ff6a 100644 --- a/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll +++ b/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll @@ -35,18 +35,13 @@ define signext i32 @zeroEqualityTest02(ptr %x, ptr %y) { define signext i32 @zeroEqualityTest01(ptr %x, ptr %y) { ; CHECK-LABEL: zeroEqualityTest01: ; CHECK: # %bb.0: -; CHECK-NEXT: ld 5, 0(3) -; CHECK-NEXT: ld 6, 0(4) -; CHECK-NEXT: cmpld 5, 6 -; CHECK-NEXT: bne 0, .LBB1_2 -; CHECK-NEXT: # %bb.1: # %loadbb1 -; CHECK-NEXT: ld 5, 8(3) -; CHECK-NEXT: ld 4, 8(4) -; CHECK-NEXT: li 3, 0 -; CHECK-NEXT: cmpld 5, 4 -; CHECK-NEXT: beqlr 0 -; CHECK-NEXT: .LBB1_2: # 
%res_block -; CHECK-NEXT: li 3, 1 +; CHECK-NEXT: lxvd2x 34, 0, 4 +; CHECK-NEXT: lxvd2x 35, 0, 3 +; CHECK-NEXT: vcmpequb. 2, 3, 2 +; CHECK-NEXT: mfocrf 3, 2 +; CHECK-NEXT: rlwinm 3, 3, 25, 31, 31 +; CHECK-NEXT: cntlzw 3, 3 +; CHECK-NEXT: srwi 3, 3, 5 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(ptr %x, ptr %y, i64 16) %not.tobool = icmp ne i32 %call, 0 @@ -85,7 +80,7 @@ define signext i32 @zeroEqualityTest03(ptr %x, ptr %y) { ; Validate with > 0 define signext i32 @zeroEqualityTest04() { ; CHECK-LABEL: zeroEqualityTest04: -; CHECK: # %bb.0: # %loadbb +; CHECK: # %bb.0: ; CHECK-NEXT: li 3, 0 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(ptr @zeroEqualityTest02.buffer1, ptr @zeroEqualityTest02.buffer2, i64 16) @@ -97,7 +92,7 @@ define signext i32 @zeroEqualityTest04() { ; Validate with < 0 define signext i32 @zeroEqualityTest05() { ; CHECK-LABEL: zeroEqualityTest05: -; CHECK: # %bb.0: # %loadbb +; CHECK: # %bb.0: ; CHECK-NEXT: li 3, 0 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(ptr @zeroEqualityTest03.buffer1, ptr @zeroEqualityTest03.buffer2, i64 16) @@ -109,7 +104,7 @@ define signext i32 @zeroEqualityTest05() { ; Validate with memcmp()?: define signext i32 @equalityFoldTwoConstants() { ; CHECK-LABEL: equalityFoldTwoConstants: -; CHECK: # %bb.0: # %loadbb +; CHECK: # %bb.0: ; CHECK-NEXT: li 3, 1 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(ptr @zeroEqualityTest04.buffer1, ptr @zeroEqualityTest04.buffer2, i64 16) @@ -122,23 +117,17 @@ define signext i32 @equalityFoldOneConstant(ptr %X) { ; CHECK-LABEL: equalityFoldOneConstant: ; CHECK: # %bb.0: ; CHECK-NEXT: li 5, 1 -; CHECK-NEXT: ld 4, 0(3) +; CHECK-NEXT: ld 4, 8(3) +; CHECK-NEXT: ld 3, 0(3) ; CHECK-NEXT: rldic 5, 5, 32, 31 -; CHECK-NEXT: cmpld 4, 5 -; CHECK-NEXT: bne 0, .LBB6_2 -; CHECK-NEXT: # %bb.1: # %loadbb1 +; CHECK-NEXT: xor 3, 3, 5 ; CHECK-NEXT: lis 5, -32768 -; CHECK-NEXT: ld 4, 8(3) -; CHECK-NEXT: li 3, 0 ; CHECK-NEXT: ori 5, 5, 1 ; CHECK-NEXT: rldic 5, 5, 1, 30 -; 
CHECK-NEXT: cmpld 4, 5 -; CHECK-NEXT: beq 0, .LBB6_3 -; CHECK-NEXT: .LBB6_2: # %res_block -; CHECK-NEXT: li 3, 1 -; CHECK-NEXT: .LBB6_3: # %endblock -; CHECK-NEXT: cntlzw 3, 3 -; CHECK-NEXT: srwi 3, 3, 5 +; CHECK-NEXT: xor 4, 4, 5 +; CHECK-NEXT: or 3, 3, 4 +; CHECK-NEXT: cntlzd 3, 3 +; CHECK-NEXT: rldicl 3, 3, 58, 63 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(ptr @zeroEqualityTest04.buffer1, ptr %X, i64 16) %not.tobool = icmp eq i32 %call, 0 diff --git a/llvm/test/CodeGen/PowerPC/memcmp32_fixsize.ll b/llvm/test/CodeGen/PowerPC/memcmp32_fixsize.ll index f5483ad2a7c3f..7dfaac1a8ae37 100644 --- a/llvm/test/CodeGen/PowerPC/memcmp32_fixsize.ll +++ b/llvm/test/CodeGen/PowerPC/memcmp32_fixsize.ll @@ -14,110 +14,38 @@ define dso_local signext range(i32 0, 2) i32 @cmpeq16(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b) { ; CHECK-AIX32-P8-LABEL: cmpeq16: ; CHECK-AIX32-P8: # %bb.0: # %entry -; CHECK-AIX32-P8-NEXT: lwz r5, 4(r3) -; CHECK-AIX32-P8-NEXT: lwz r6, 0(r3) -; CHECK-AIX32-P8-NEXT: lwz r7, 4(r4) -; CHECK-AIX32-P8-NEXT: lwz r8, 0(r4) -; CHECK-AIX32-P8-NEXT: xor r6, r6, r8 -; CHECK-AIX32-P8-NEXT: xor r5, r5, r7 -; CHECK-AIX32-P8-NEXT: or. r5, r5, r6 -; CHECK-AIX32-P8-NEXT: bne cr0, L..BB0_2 -; CHECK-AIX32-P8-NEXT: # %bb.1: # %loadbb1 -; CHECK-AIX32-P8-NEXT: lwz r5, 12(r3) -; CHECK-AIX32-P8-NEXT: lwz r3, 8(r3) -; CHECK-AIX32-P8-NEXT: lwz r6, 12(r4) -; CHECK-AIX32-P8-NEXT: lwz r4, 8(r4) -; CHECK-AIX32-P8-NEXT: xor r3, r3, r4 -; CHECK-AIX32-P8-NEXT: xor r4, r5, r6 -; CHECK-AIX32-P8-NEXT: or. r3, r4, r3 -; CHECK-AIX32-P8-NEXT: li r3, 0 -; CHECK-AIX32-P8-NEXT: beq cr0, L..BB0_3 -; CHECK-AIX32-P8-NEXT: L..BB0_2: # %res_block -; CHECK-AIX32-P8-NEXT: li r3, 1 -; CHECK-AIX32-P8-NEXT: L..BB0_3: # %endblock -; CHECK-AIX32-P8-NEXT: cntlzw r3, r3 -; CHECK-AIX32-P8-NEXT: rlwinm r3, r3, 27, 31, 31 +; CHECK-AIX32-P8-NEXT: lxvw4x vs34, 0, r4 +; CHECK-AIX32-P8-NEXT: lxvw4x vs35, 0, r3 +; CHECK-AIX32-P8-NEXT: vcmpequb. 
v2, v3, v2 +; CHECK-AIX32-P8-NEXT: mfocrf r3, 2 +; CHECK-AIX32-P8-NEXT: rlwinm r3, r3, 25, 31, 31 ; CHECK-AIX32-P8-NEXT: blr ; ; CHECK-AIX32-P10-LABEL: cmpeq16: ; CHECK-AIX32-P10: # %bb.0: # %entry -; CHECK-AIX32-P10-NEXT: lwz r5, 4(r3) -; CHECK-AIX32-P10-NEXT: lwz r6, 0(r3) -; CHECK-AIX32-P10-NEXT: lwz r7, 4(r4) -; CHECK-AIX32-P10-NEXT: xor r5, r5, r7 -; CHECK-AIX32-P10-NEXT: lwz r8, 0(r4) -; CHECK-AIX32-P10-NEXT: xor r6, r6, r8 -; CHECK-AIX32-P10-NEXT: or. r5, r5, r6 -; CHECK-AIX32-P10-NEXT: bne cr0, L..BB0_2 -; CHECK-AIX32-P10-NEXT: # %bb.1: # %loadbb1 -; CHECK-AIX32-P10-NEXT: lwz r5, 12(r3) -; CHECK-AIX32-P10-NEXT: lwz r3, 8(r3) -; CHECK-AIX32-P10-NEXT: lwz r6, 12(r4) -; CHECK-AIX32-P10-NEXT: lwz r4, 8(r4) -; CHECK-AIX32-P10-NEXT: xor r3, r3, r4 -; CHECK-AIX32-P10-NEXT: xor r4, r5, r6 -; CHECK-AIX32-P10-NEXT: or. r3, r4, r3 -; CHECK-AIX32-P10-NEXT: li r3, 0 -; CHECK-AIX32-P10-NEXT: beq cr0, L..BB0_3 -; CHECK-AIX32-P10-NEXT: L..BB0_2: # %res_block -; CHECK-AIX32-P10-NEXT: li r3, 1 -; CHECK-AIX32-P10-NEXT: L..BB0_3: # %endblock -; CHECK-AIX32-P10-NEXT: cntlzw r3, r3 -; CHECK-AIX32-P10-NEXT: rlwinm r3, r3, 27, 31, 31 +; CHECK-AIX32-P10-NEXT: lxv vs34, 0(r4) +; CHECK-AIX32-P10-NEXT: lxv vs35, 0(r3) +; CHECK-AIX32-P10-NEXT: vcmpequb. v2, v3, v2 +; CHECK-AIX32-P10-NEXT: setbc r3, 4*cr6+lt ; CHECK-AIX32-P10-NEXT: blr ; ; CHECK-LINUX32-P8-LABEL: cmpeq16: ; CHECK-LINUX32-P8: # %bb.0: # %entry -; CHECK-LINUX32-P8-NEXT: lwz r5, 0(r3) -; CHECK-LINUX32-P8-NEXT: lwz r6, 4(r3) -; CHECK-LINUX32-P8-NEXT: lwz r7, 0(r4) -; CHECK-LINUX32-P8-NEXT: lwz r8, 4(r4) -; CHECK-LINUX32-P8-NEXT: xor r6, r6, r8 -; CHECK-LINUX32-P8-NEXT: xor r5, r5, r7 -; CHECK-LINUX32-P8-NEXT: or. 
r5, r5, r6 -; CHECK-LINUX32-P8-NEXT: bne cr0, .LBB0_2 -; CHECK-LINUX32-P8-NEXT: # %bb.1: # %loadbb1 -; CHECK-LINUX32-P8-NEXT: lwz r5, 8(r3) -; CHECK-LINUX32-P8-NEXT: lwz r3, 12(r3) -; CHECK-LINUX32-P8-NEXT: lwz r6, 8(r4) -; CHECK-LINUX32-P8-NEXT: lwz r4, 12(r4) -; CHECK-LINUX32-P8-NEXT: xor r3, r3, r4 -; CHECK-LINUX32-P8-NEXT: xor r4, r5, r6 -; CHECK-LINUX32-P8-NEXT: or. r3, r4, r3 -; CHECK-LINUX32-P8-NEXT: li r3, 0 -; CHECK-LINUX32-P8-NEXT: beq cr0, .LBB0_3 -; CHECK-LINUX32-P8-NEXT: .LBB0_2: # %res_block -; CHECK-LINUX32-P8-NEXT: li r3, 1 -; CHECK-LINUX32-P8-NEXT: .LBB0_3: # %endblock -; CHECK-LINUX32-P8-NEXT: cntlzw r3, r3 -; CHECK-LINUX32-P8-NEXT: rlwinm r3, r3, 27, 31, 31 +; CHECK-LINUX32-P8-NEXT: lxvd2x vs0, 0, r4 +; CHECK-LINUX32-P8-NEXT: xxswapd vs34, vs0 +; CHECK-LINUX32-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LINUX32-P8-NEXT: xxswapd vs35, vs0 +; CHECK-LINUX32-P8-NEXT: vcmpequb. v2, v3, v2 +; CHECK-LINUX32-P8-NEXT: mfocrf r3, 2 +; CHECK-LINUX32-P8-NEXT: rlwinm r3, r3, 25, 31, 31 ; CHECK-LINUX32-P8-NEXT: blr ; ; CHECK-LINUX32-P10-LABEL: cmpeq16: ; CHECK-LINUX32-P10: # %bb.0: # %entry -; CHECK-LINUX32-P10-NEXT: lwz r5, 0(r3) -; CHECK-LINUX32-P10-NEXT: lwz r6, 4(r3) -; CHECK-LINUX32-P10-NEXT: lwz r7, 0(r4) -; CHECK-LINUX32-P10-NEXT: xor r5, r5, r7 -; CHECK-LINUX32-P10-NEXT: lwz r8, 4(r4) -; CHECK-LINUX32-P10-NEXT: xor r6, r6, r8 -; CHECK-LINUX32-P10-NEXT: or. r5, r5, r6 -; CHECK-LINUX32-P10-NEXT: bne cr0, .LBB0_2 -; CHECK-LINUX32-P10-NEXT: # %bb.1: # %loadbb1 -; CHECK-LINUX32-P10-NEXT: lwz r5, 8(r3) -; CHECK-LINUX32-P10-NEXT: lwz r3, 12(r3) -; CHECK-LINUX32-P10-NEXT: lwz r6, 8(r4) -; CHECK-LINUX32-P10-NEXT: lwz r4, 12(r4) -; CHECK-LINUX32-P10-NEXT: xor r3, r3, r4 -; CHECK-LINUX32-P10-NEXT: xor r4, r5, r6 -; CHECK-LINUX32-P10-NEXT: or. 
r3, r4, r3 -; CHECK-LINUX32-P10-NEXT: li r3, 0 -; CHECK-LINUX32-P10-NEXT: beq cr0, .LBB0_3 -; CHECK-LINUX32-P10-NEXT: .LBB0_2: # %res_block -; CHECK-LINUX32-P10-NEXT: li r3, 1 -; CHECK-LINUX32-P10-NEXT: .LBB0_3: # %endblock -; CHECK-LINUX32-P10-NEXT: cntlzw r3, r3 -; CHECK-LINUX32-P10-NEXT: rlwinm r3, r3, 27, 31, 31 +; CHECK-LINUX32-P10-NEXT: lxv vs34, 0(r4) +; CHECK-LINUX32-P10-NEXT: lxv vs35, 0(r3) +; CHECK-LINUX32-P10-NEXT: vcmpequb. v2, v3, v2 +; CHECK-LINUX32-P10-NEXT: setbc r3, 4*cr6+lt ; CHECK-LINUX32-P10-NEXT: blr entry: %bcmp = tail call i32 @bcmp(ptr noundef nonnull dereferenceable(16) %a, ptr noundef nonnull dereferenceable(16) %b, i32 16) diff --git a/llvm/test/CodeGen/PowerPC/memcmp64_fixsize.ll b/llvm/test/CodeGen/PowerPC/memcmp64_fixsize.ll index 216b7638642d4..bd703b9d35cf7 100644 --- a/llvm/test/CodeGen/PowerPC/memcmp64_fixsize.ll +++ b/llvm/test/CodeGen/PowerPC/memcmp64_fixsize.ll @@ -14,78 +14,36 @@ define dso_local signext range(i32 0, 2) i32 @cmpeq16(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b) { ; CHECK-AIX64-32-P8-LABEL: cmpeq16: ; CHECK-AIX64-32-P8: # %bb.0: # %entry -; CHECK-AIX64-32-P8-NEXT: ld r5, 0(r3) -; CHECK-AIX64-32-P8-NEXT: ld r6, 0(r4) -; CHECK-AIX64-32-P8-NEXT: cmpld r5, r6 -; CHECK-AIX64-32-P8-NEXT: bne cr0, L..BB0_2 -; CHECK-AIX64-32-P8-NEXT: # %bb.1: # %loadbb1 -; CHECK-AIX64-32-P8-NEXT: ld r5, 8(r3) -; CHECK-AIX64-32-P8-NEXT: ld r4, 8(r4) -; CHECK-AIX64-32-P8-NEXT: li r3, 0 -; CHECK-AIX64-32-P8-NEXT: cmpld r5, r4 -; CHECK-AIX64-32-P8-NEXT: beq cr0, L..BB0_3 -; CHECK-AIX64-32-P8-NEXT: L..BB0_2: # %res_block -; CHECK-AIX64-32-P8-NEXT: li r3, 1 -; CHECK-AIX64-32-P8-NEXT: L..BB0_3: # %endblock -; CHECK-AIX64-32-P8-NEXT: cntlzw r3, r3 -; CHECK-AIX64-32-P8-NEXT: srwi r3, r3, 5 +; CHECK-AIX64-32-P8-NEXT: lxvw4x vs34, 0, r4 +; CHECK-AIX64-32-P8-NEXT: lxvw4x vs35, 0, r3 +; CHECK-AIX64-32-P8-NEXT: vcmpequb. 
v2, v3, v2 +; CHECK-AIX64-32-P8-NEXT: mfocrf r3, 2 +; CHECK-AIX64-32-P8-NEXT: rlwinm r3, r3, 25, 31, 31 ; CHECK-AIX64-32-P8-NEXT: blr ; ; CHECK-AIX64-32-P10-LABEL: cmpeq16: ; CHECK-AIX64-32-P10: # %bb.0: # %entry -; CHECK-AIX64-32-P10-NEXT: ld r5, 0(r3) -; CHECK-AIX64-32-P10-NEXT: ld r6, 0(r4) -; CHECK-AIX64-32-P10-NEXT: cmpld r5, r6 -; CHECK-AIX64-32-P10-NEXT: bne cr0, L..BB0_2 -; CHECK-AIX64-32-P10-NEXT: # %bb.1: # %loadbb1 -; CHECK-AIX64-32-P10-NEXT: ld r5, 8(r3) -; CHECK-AIX64-32-P10-NEXT: ld r4, 8(r4) -; CHECK-AIX64-32-P10-NEXT: li r3, 0 -; CHECK-AIX64-32-P10-NEXT: cmpld r5, r4 -; CHECK-AIX64-32-P10-NEXT: beq cr0, L..BB0_3 -; CHECK-AIX64-32-P10-NEXT: L..BB0_2: # %res_block -; CHECK-AIX64-32-P10-NEXT: li r3, 1 -; CHECK-AIX64-32-P10-NEXT: L..BB0_3: # %endblock -; CHECK-AIX64-32-P10-NEXT: cntlzw r3, r3 -; CHECK-AIX64-32-P10-NEXT: rlwinm r3, r3, 27, 31, 31 +; CHECK-AIX64-32-P10-NEXT: lxv vs34, 0(r4) +; CHECK-AIX64-32-P10-NEXT: lxv vs35, 0(r3) +; CHECK-AIX64-32-P10-NEXT: vcmpequb. v2, v3, v2 +; CHECK-AIX64-32-P10-NEXT: setbc r3, 4*cr6+lt ; CHECK-AIX64-32-P10-NEXT: blr ; ; CHECK-LINUX64-P8-LABEL: cmpeq16: ; CHECK-LINUX64-P8: # %bb.0: # %entry -; CHECK-LINUX64-P8-NEXT: ld r5, 0(r3) -; CHECK-LINUX64-P8-NEXT: ld r6, 0(r4) -; CHECK-LINUX64-P8-NEXT: cmpld r5, r6 -; CHECK-LINUX64-P8-NEXT: bne cr0, .LBB0_2 -; CHECK-LINUX64-P8-NEXT: # %bb.1: # %loadbb1 -; CHECK-LINUX64-P8-NEXT: ld r5, 8(r3) -; CHECK-LINUX64-P8-NEXT: ld r4, 8(r4) -; CHECK-LINUX64-P8-NEXT: li r3, 0 -; CHECK-LINUX64-P8-NEXT: cmpld r5, r4 -; CHECK-LINUX64-P8-NEXT: beq cr0, .LBB0_3 -; CHECK-LINUX64-P8-NEXT: .LBB0_2: # %res_block -; CHECK-LINUX64-P8-NEXT: li r3, 1 -; CHECK-LINUX64-P8-NEXT: .LBB0_3: # %endblock -; CHECK-LINUX64-P8-NEXT: cntlzw r3, r3 -; CHECK-LINUX64-P8-NEXT: srwi r3, r3, 5 +; CHECK-LINUX64-P8-NEXT: lxvd2x vs34, 0, r4 +; CHECK-LINUX64-P8-NEXT: lxvd2x vs35, 0, r3 +; CHECK-LINUX64-P8-NEXT: vcmpequb. 
v2, v3, v2 +; CHECK-LINUX64-P8-NEXT: mfocrf r3, 2 +; CHECK-LINUX64-P8-NEXT: rlwinm r3, r3, 25, 31, 31 ; CHECK-LINUX64-P8-NEXT: blr ; ; CHECK-LINUX64-P10-LABEL: cmpeq16: ; CHECK-LINUX64-P10: # %bb.0: # %entry -; CHECK-LINUX64-P10-NEXT: ld r5, 0(r3) -; CHECK-LINUX64-P10-NEXT: ld r6, 0(r4) -; CHECK-LINUX64-P10-NEXT: cmpld r5, r6 -; CHECK-LINUX64-P10-NEXT: bne cr0, .LBB0_2 -; CHECK-LINUX64-P10-NEXT: # %bb.1: # %loadbb1 -; CHECK-LINUX64-P10-NEXT: ld r5, 8(r3) -; CHECK-LINUX64-P10-NEXT: ld r4, 8(r4) -; CHECK-LINUX64-P10-NEXT: li r3, 0 -; CHECK-LINUX64-P10-NEXT: cmpld r5, r4 -; CHECK-LINUX64-P10-NEXT: beq cr0, .LBB0_3 -; CHECK-LINUX64-P10-NEXT: .LBB0_2: # %res_block -; CHECK-LINUX64-P10-NEXT: li r3, 1 -; CHECK-LINUX64-P10-NEXT: .LBB0_3: # %endblock -; CHECK-LINUX64-P10-NEXT: cntlzw r3, r3 -; CHECK-LINUX64-P10-NEXT: rlwinm r3, r3, 27, 31, 31 +; CHECK-LINUX64-P10-NEXT: lxv vs34, 0(r4) +; CHECK-LINUX64-P10-NEXT: lxv vs35, 0(r3) +; CHECK-LINUX64-P10-NEXT: vcmpequb. v2, v3, v2 +; CHECK-LINUX64-P10-NEXT: setbc r3, 4*cr6+lt ; CHECK-LINUX64-P10-NEXT: blr entry: %bcmp = tail call i32 @bcmp(ptr noundef nonnull dereferenceable(16) %a, ptr noundef nonnull dereferenceable(16) %b, i64 16) diff --git a/llvm/test/CodeGen/PowerPC/memcmpIR.ll b/llvm/test/CodeGen/PowerPC/memcmpIR.ll index b57d2b5116b77..995ecb64d4bdd 100644 --- a/llvm/test/CodeGen/PowerPC/memcmpIR.ll +++ b/llvm/test/CodeGen/PowerPC/memcmpIR.ll @@ -4,48 +4,19 @@ define signext i32 @test1(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2) { entry: ; CHECK-LABEL: @test1( - ; CHECK-LABEL: res_block:{{.*}} - ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64 - ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 - ; CHECK-NEXT: br label %endblock - - ; CHECK-LABEL: loadbb:{{.*}} - ; CHECK: [[LOAD1:%[0-9]+]] = load i64, ptr - ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr - ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]]) - ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 
@llvm.bswap.i64(i64 [[LOAD2]]) - ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[BSWAP1]], [[BSWAP2]] - ; CHECK-NEXT: br i1 [[ICMP]], label %loadbb1, label %res_block - - ; CHECK-LABEL: loadbb1:{{.*}} - ; CHECK-NEXT: [[GEP1:%[0-9]+]] = getelementptr i8, ptr {{.*}}, i64 8 - ; CHECK-NEXT: [[GEP2:%[0-9]+]] = getelementptr i8, ptr {{.*}}, i64 8 - ; CHECK-NEXT: [[LOAD1:%[0-9]+]] = load i64, ptr [[GEP1]] - ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr [[GEP2]] - ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]]) - ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]]) - ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[BSWAP1]], [[BSWAP2]] - ; CHECK-NEXT: br i1 [[ICMP]], label %endblock, label %res_block - + ; CHECK: [[LOAD0:%[0-9]+]] = load i128, ptr %buffer1, align 1 + ; CHECK-NEXT: [[LOAD1:%[0-9]+]] = load i128, ptr %buffer2, align 1 + ; CHECK-NEXT: [[CALL1:%[0-9]+]] = call i128 @llvm.bswap.i128(i128 [[LOAD0]]) + ; CHECK-NEXT: [[CALL2:%[0-9]+]] = call i128 @llvm.bswap.i128(i128 [[LOAD1]]) + ; CHECK-NEXT: [[CALL3:%[0-9]+]] = call i32 @llvm.ucmp.i32.i128(i128 [[CALL1]], i128 [[CALL2]]) + ; CHECK-NEXT: ret i32 [[CALL3]] + + ; CHECK-BE-LABEL: @test1( - ; CHECK-BE-LABEL: res_block:{{.*}} - ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64 - ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1 - ; CHECK-BE-NEXT: br label %endblock - - ; CHECK-BE-LABEL: loadbb:{{.*}} - ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, ptr - ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr - ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[LOAD1]], [[LOAD2]] - ; CHECK-BE-NEXT: br i1 [[ICMP]], label %loadbb1, label %res_block - - ; CHECK-BE-LABEL: loadbb1:{{.*}} - ; CHECK-BE-NEXT: [[GEP1:%[0-9]+]] = getelementptr i8, ptr {{.*}}, i64 8 - ; CHECK-BE-NEXT: [[GEP2:%[0-9]+]] = getelementptr i8, ptr {{.*}}, i64 8 - ; CHECK-BE-NEXT: [[LOAD1:%[0-9]+]] = load i64, ptr [[GEP1]] - ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr [[GEP2]] - ; 
CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[LOAD1]], [[LOAD2]] - ; CHECK-BE-NEXT: br i1 [[ICMP]], label %endblock, label %res_block + ; CHECK-BE: [[LOAD0:%[0-9]+]] = load i128, ptr %buffer1, align 1 + ; CHECK-BE-NEXT: [[LOAD1:%[0-9]+]] = load i128, ptr %buffer2, align 1 + ; CHECK-BE-NEXT: [[CALL0:%[0-9]+]] = call i32 @llvm.ucmp.i32.i128(i128 [[LOAD0]], i128 [[LOAD1]]) + ; CHECK-BE-NEXT: ret i32 [[CALL0]] %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 16) ret i32 %call @@ -156,7 +127,7 @@ entry: define signext i32 @test4(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2) { entry: - %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 65) + %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 165) ret i32 %call }