Merged
133 changes: 133 additions & 0 deletions llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -15587,6 +15587,123 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
ShiftCst);
}

// Check whether the i128 operands of a compare can be converted to v16i8
// loads for vcmpequb.
static bool canConvertToVcmpequb(SDValue &LHS, SDValue &RHS) {

auto isValidForConvert = [](SDValue &Operand) {
if (!Operand.hasOneUse())
return false;

if (Operand.getValueType() != MVT::i128)
return false;

if (Operand.getOpcode() == ISD::Constant)
return true;

auto *LoadNode = dyn_cast<LoadSDNode>(Operand);
if (!LoadNode)
return false;

Collaborator:
Since you already checked for opcode ISD::LOAD, this can be an assert.

Contributor Author:
I keep the code and remove the earlier check:

 if (Operand.getOpcode() != ISD::LOAD)
      return false;

// If the memory operation is volatile, do not perform any optimization or
// transformation. Volatile operations must be preserved as written to
// ensure correct program behavior, so return false to indicate no action.

if (LoadNode->isVolatile())
return false;

// Only combine loads if both use the unindexed addressing mode.
// PowerPC AltiVec/VMX does not support vector loads or stores with
// pre/post-increment addressing. Indexed modes may imply implicit
// pointer updates, which are not compatible with AltiVec vector
// instructions.
if (LoadNode->getAddressingMode() != ISD::UNINDEXED)
return false;

// Only combine loads if both are non-extending loads
// (ISD::NON_EXTLOAD). Extending loads (such as ISD::ZEXTLOAD or
// ISD::SEXTLOAD) perform zero or sign extension, which may change the
// loaded value's semantics and are not compatible with vector loads.
if (LoadNode->getExtensionType() != ISD::NON_EXTLOAD)
return false;

return true;
};

return (isValidForConvert(LHS) && isValidForConvert(RHS));
}
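For illustration, a minimal sketch of source patterns that should pass or fail the guard above, assuming the GCC/Clang unsigned __int128 extension and that 16-byte memcmp expansion (enabled in the PPCTargetTransformInfo change below) produces the i128 loads; the function names are hypothetical:

    #include <cstring>

    // Plain 16-byte equality: expands to two unindexed, non-extending i128
    // loads feeding a SETCC, so isValidForConvert accepts both operands.
    bool accepted(const void *a, const void *b) {
      return std::memcmp(a, b, 16) == 0;
    }

    // Volatile access: LoadNode->isVolatile() is true, so the combine bails.
    bool rejected(const volatile unsigned __int128 *p, unsigned __int128 v) {
      return *p == v;
    }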

static SDValue convertTwoLoadsAndCmpToVCMPEQUB(SelectionDAG &DAG, SDNode *N,
const SDLoc &DL) {

assert(N->getOpcode() == ISD::SETCC && "Should be called with a SETCC node");

ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
assert((CC == ISD::SETNE || CC == ISD::SETEQ) &&
"CC must be ISD::SETNE or ISD::SETEQ");

auto getV16i8Load = [&](const SDValue &Operand) {
if (Operand.getOpcode() == ISD::Constant)
return DAG.getBitcast(MVT::v16i8, Operand);

assert(Operand.getOpcode() == ISD::LOAD && "Must be LoadSDNode here.");

auto *LoadNode = cast<LoadSDNode>(Operand);
return DAG.getLoad(MVT::v16i8, DL, LoadNode->getChain(),
LoadNode->getBasePtr(), LoadNode->getMemOperand());
};

// Following code transforms the DAG
//   t0: ch,glue = EntryToken
//   t2: i64,ch = CopyFromReg t0, Register:i64 %0
//   t3: i128,ch = load<(load (s128) from %ir.a, align 1)> t0, t2, undef:i64
//   t4: i64,ch = CopyFromReg t0, Register:i64 %1
//   t5: i128,ch = load<(load (s128) from %ir.b, align 1)> t0, t4, undef:i64
//   t6: i1 = setcc t3, t5, setne:ch
//
// ---->
//
//   t0: ch,glue = EntryToken
//   t2: i64,ch = CopyFromReg t0, Register:i64 %0
//   t3: v16i8,ch = load<(load (s128) from %ir.a, align 1)> t0, t2, undef:i64
//   t4: i64,ch = CopyFromReg t0, Register:i64 %1
//   t5: v16i8,ch = load<(load (s128) from %ir.b, align 1)> t0, t4, undef:i64
//   t6: i32 = llvm.ppc.altivec.vcmpequb.p TargetConstant:i32<10505>,
//             Constant:i32<2>, t3, t5
//   t7: i1 = setcc t6, Constant:i32<0>, seteq:ch

// Or transforms the DAG
//   t5: i128,ch = load<(load (s128) from %ir.X, align 1)> t0, t2, undef:i64
//   t8: i1 = setcc Constant:i128<237684487579686500932345921536>, t5,
//            setne:ch
//
// --->
//
//   t5: v16i8,ch = load<(load (s128) from %ir.X, align 1)> t0, t2, undef:i64
//   t6: v16i8 = bitcast Constant:i128<237684487579686500932345921536>
//   t7: i32 = llvm.ppc.altivec.vcmpequb.p Constant:i32<10962>,
//             Constant:i32<2>, t5, t6

SDValue LHSVec = getV16i8Load(N->getOperand(0));
SDValue RHSVec = getV16i8Load(N->getOperand(1));

SDValue IntrID =
DAG.getConstant(Intrinsic::ppc_altivec_vcmpequb_p, DL, MVT::i32);
SDValue CRSel = DAG.getConstant(2, DL, MVT::i32); // CR6 predicate selector (2 = all elements equal)
SDValue PredResult = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
IntrID, CRSel, LHSVec, RHSVec);
// ppc_altivec_vcmpequb_p returns 1 when two vectors are the same,
// so we need to invert the CC opcode.
return DAG.getSetCC(DL, N->getValueType(0), PredResult,
DAG.getConstant(0, DL, MVT::i32),
CC == ISD::SETNE ? ISD::SETEQ : ISD::SETNE);
}
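As a reader's aid, a hedged sketch of the semantics this combine implements, written in plain C++ rather than SelectionDAG nodes (unsigned __int128 is a GCC/Clang extension used only for illustration):

    #include <cstdint>
    #include <cstring>

    // Before the combine: a scalar i128 equality compare of two 16-byte loads.
    bool scalarEq(const void *A, const void *B) {
      unsigned __int128 X, Y;
      std::memcpy(&X, A, 16);
      std::memcpy(&Y, B, 16);
      return X == Y;
    }

    // After the combine: one byte-wise vector compare. vcmpequb.p reports 1
    // iff all 16 byte lanes match, which is why the result is compared
    // against 0 with the inverted condition code above.
    bool vectorEq(const void *A, const void *B) {
      uint8_t X[16], Y[16];
      std::memcpy(X, A, 16);
      std::memcpy(Y, B, 16);
      bool AllEq = true;
      for (int I = 0; I != 16; ++I) // models vcmpequb across the 16 lanes
        AllEq &= (X[I] == Y[I]);
      return AllEq;
    }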

SDValue PPCTargetLowering::combineSetCC(SDNode *N,
DAGCombinerInfo &DCI) const {
assert(N->getOpcode() == ISD::SETCC &&
@@ -15613,6 +15730,22 @@ SDValue PPCTargetLowering::combineSetCC(SDNode *N,
SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
}

// Optimization: Fold i128 equality/inequality compares of two loads into a
// vectorized compare using vcmpequb.p when Altivec is available.
//
// Rationale:
// A scalar i128 SETCC (eq/ne) normally lowers to multiple scalar ops.
// On Altivec-capable subtargets, we can instead reinterpret the i128 loads
// as v16i8 vectors and use the Altivec vcmpequb.p instruction to
// perform a full 128-bit equality check in a single vector compare.
//
// Example result:
// This transformation replaces memcmp(a, b, 16) with two vector loads
// and one vector compare instruction.

if (Subtarget.hasAltivec() && canConvertToVcmpequb(LHS, RHS))
return convertTwoLoadsAndCmpToVCMPEQUB(DCI.DAG, N, SDLoc(N));
}

return DAGCombineTruncBoolExt(N, DCI);
6 changes: 5 additions & 1 deletion llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -439,7 +439,11 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) const {
PPCTTIImpl::TTI::MemCmpExpansionOptions
PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
TTI::MemCmpExpansionOptions Options;
Options.LoadSizes = {8, 4, 2, 1};
if (getST()->hasAltivec())
Options.LoadSizes = {16, 8, 4, 2, 1};
else
Options.LoadSizes = {8, 4, 2, 1};

Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
return Options;
}
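A brief illustration of what the widened LoadSizes buy, assuming an Altivec-enabled subtarget; the function names are hypothetical:

    #include <cstring>

    // With LoadSizes = {16, 8, 4, 2, 1}, this zero-equality compare can
    // expand to a single pair of 16-byte loads plus one vcmpequb., instead
    // of two 8-byte compare-and-branch blocks (see the SETCC combine above).
    bool eq16(const void *A, const void *B) { return std::memcmp(A, B, 16) == 0; }

    // Larger sizes are covered greedily, e.g. 24 bytes as one 16-byte chunk
    // plus one 8-byte chunk, subject to Options.MaxNumLoads.
    bool eq24(const void *A, const void *B) { return std::memcmp(A, B, 24) == 0; }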
50 changes: 17 additions & 33 deletions llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
@@ -35,18 +35,13 @@ define signext i32 @zeroEqualityTest02(ptr %x, ptr %y) {
define signext i32 @zeroEqualityTest01(ptr %x, ptr %y) {
; CHECK-LABEL: zeroEqualityTest01:
; CHECK: # %bb.0:
; CHECK-NEXT: ld 5, 0(3)
; CHECK-NEXT: ld 6, 0(4)
; CHECK-NEXT: cmpld 5, 6
; CHECK-NEXT: bne 0, .LBB1_2
; CHECK-NEXT: # %bb.1: # %loadbb1
; CHECK-NEXT: ld 5, 8(3)
; CHECK-NEXT: ld 4, 8(4)
; CHECK-NEXT: li 3, 0
; CHECK-NEXT: cmpld 5, 4
; CHECK-NEXT: beqlr 0
; CHECK-NEXT: .LBB1_2: # %res_block
; CHECK-NEXT: li 3, 1
; CHECK-NEXT: lxvd2x 34, 0, 4
; CHECK-NEXT: lxvd2x 35, 0, 3
; CHECK-NEXT: vcmpequb. 2, 3, 2
; CHECK-NEXT: mfocrf 3, 2
; CHECK-NEXT: rlwinm 3, 3, 25, 31, 31
; CHECK-NEXT: cntlzw 3, 3
; CHECK-NEXT: srwi 3, 3, 5
Collaborator:
Extra instruction? I think isolating and flipping the bit can just be rlwinm/xori.

Contributor Author:
In the patch, we just make

#include <string.h>
int cmp16(const void *a, const void *b)
{
    return memcmp(a, b, 16) == 0;
}

equivalent to

#include <altivec.h>
#include <stdbool.h>
bool cmpeq16_2(const void *a, const void *b)
{
    const vector unsigned char va = vec_xl(0, (const unsigned char *)a);
    const vector unsigned char vb = vec_xl(0, (const unsigned char *)b);
    return vec_all_eq(va, vb);
}

that is

Following code transforms the DAG

  t0: ch,glue = EntryToken
  t2: i64,ch = CopyFromReg t0, Register:i64 %0
  t3: i128,ch = load<(load (s128) from %ir.a, align 1)> t0, t2, undef:i64
  t4: i64,ch = CopyFromReg t0, Register:i64 %1
  t5: i128,ch = load<(load (s128) from %ir.b, align 1)> t0, t4, undef:i64
  t6: i1 = setcc t3, t5, setne:ch

---->

  t0: ch,glue = EntryToken
  t2: i64,ch = CopyFromReg t0, Register:i64 %0
  t3: v16i8,ch = load<(load (s128) from %ir.a, align 1)> t0, t2, undef:i64
  t4: i64,ch = CopyFromReg t0, Register:i64 %1
  t5: v16i8,ch = load<(load (s128) from %ir.b, align 1)> t0, t4, undef:i64
  t6: i32 = llvm.ppc.altivec.vcmpequb.p TargetConstant:i32<10505>,
            Constant:i32<2>, t3, t5
  t7: i1 = setcc t6, Constant:i32<0>, seteq:ch

I think we can have another patch to let

  t6: i32 = llvm.ppc.altivec.vcmpequb.p TargetConstant:i32<10505>,
            Constant:i32<2>, t3, t5
  t7: i1 = setcc t6, Constant:i32<0>, seteq:ch

convert to your instructions.

Collaborator:
I am ok with addressing this in a following patch.

Contributor Author:
On further consideration, I do not think we can transform

     rlwinm r3, r3, 25, 31, 31
     cntlzw r3, r3
     srwi r3, r3, 5

into

     rlwinm r3, r3, 25, 31, 31
     xori r3, (value).

since

     cntlzw r3, r3
     srwi r3, r3, 5

check whether only bit 31 of r3 is 1. I do not think we have a single xori instruction or other single instruction to achieve it.

RolandF77 (Collaborator) commented Nov 5, 2025:
Count leading zero / shift by 5 is a test for zero. Because of the prior rlwinm, this is comparing a 0/1 value. Compared to zero, 0/1 => 1/0. Can use xori with 1.
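To make that concrete, a small self-contained C++ check (Cntlzw models the PPC instruction; this is illustration, not target code) that the two tails agree on both possible inputs:

    #include <cassert>
    #include <cstdint>

    // Software model of cntlzw: count leading zeros of a 32-bit value
    // (returns 32 for 0).
    static uint32_t Cntlzw(uint32_t X) {
      for (int I = 31; I >= 0; --I)
        if (X & (1u << I))
          return 31 - I;
      return 32;
    }

    int main() {
      // After the rlwinm, the register holds the isolated predicate bit: 0 or 1.
      for (uint32_t R3 : {0u, 1u}) {
        uint32_t Current = Cntlzw(R3) >> 5; // cntlzw r3,r3 ; srwi r3,r3,5
        uint32_t Proposed = R3 ^ 1;         // xori r3,r3,1
        assert(Current == Proposed);        // both map 0 -> 1 and 1 -> 0
      }
      return 0;
    }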

Contributor Author:
That means we have to check a long list of IR,


  t32: v16i8,glue = PPCISD::VCMP_rec t42, t40, Constant:i32<6>
  t34: i32 = PPCISD::MFOCRF Register:i32 $cr6, t32:1
  t36: i32 = srl t34, Constant:i32<7>
  t38: i32 = and t36, Constant:i32<1>
  t28: i1 = setcc t38, Constant:i32<0>, seteq:ch

and convert to

  t32: v16i8,glue = PPCISD::VCMP_rec t42, t40, Constant:i32<6>
  t34: i32 = PPCISD::MFOCRF Register:i32 $cr6, t32:1
  t36: i32 = srl t34, Constant:i32<7>
  t38: i32 = and t36, Constant:i32<1>
  t28: i1 = xor t38, Constant:i32<1>

I am not sure whether it is worth checking for such a long list of IR for this optimization; it will increase compile time.

Collaborator:
You don't need to look for that whole sequence. The and / setcc is enough. But also, in general I don't think looking for a sequence of 5 instructions is anything to worry about.
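Taking that suggestion, a hedged sketch (explicitly not part of this patch) of the follow-up fold being discussed — rewrite (setcc (and X, 1), 0, seteq) as (xor (and X, 1), 1) — with legality and one-use checks elided; the helper name is made up:

    // Assumes the usual PPCISelLowering.cpp includes.
    static SDValue foldAndSetccToXor(SelectionDAG &DAG, SDNode *N,
                                     const SDLoc &DL) {
      SDValue LHS = N->getOperand(0);
      SDValue RHS = N->getOperand(1);
      ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
      if (CC != ISD::SETEQ || !isNullConstant(RHS) ||
          LHS.getOpcode() != ISD::AND || !isOneConstant(LHS.getOperand(1)))
        return SDValue();
      // (and X, 1) is 0 or 1, so comparing it to zero just flips the bit.
      EVT VT = LHS.getValueType();
      SDValue Xor =
          DAG.getNode(ISD::XOR, DL, VT, LHS, DAG.getConstant(1, DL, VT));
      return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), Xor);
    }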

diggerlin (Contributor Author) commented Nov 7, 2025:
Agree that the and / setcc is enough, since the and is with 0x1. I will create a separate patch for the optimization later.

; CHECK-NEXT: blr
%call = tail call signext i32 @memcmp(ptr %x, ptr %y, i64 16)
%not.tobool = icmp ne i32 %call, 0
@@ -85,7 +80,7 @@ define signext i32 @zeroEqualityTest03(ptr %x, ptr %y) {
; Validate with > 0
define signext i32 @zeroEqualityTest04() {
; CHECK-LABEL: zeroEqualityTest04:
; CHECK: # %bb.0: # %loadbb
; CHECK: # %bb.0:
; CHECK-NEXT: li 3, 0
; CHECK-NEXT: blr
%call = tail call signext i32 @memcmp(ptr @zeroEqualityTest02.buffer1, ptr @zeroEqualityTest02.buffer2, i64 16)
@@ -97,7 +92,7 @@ define signext i32 @zeroEqualityTest04() {
; Validate with < 0
define signext i32 @zeroEqualityTest05() {
; CHECK-LABEL: zeroEqualityTest05:
; CHECK: # %bb.0: # %loadbb
; CHECK: # %bb.0:
; CHECK-NEXT: li 3, 0
; CHECK-NEXT: blr
%call = tail call signext i32 @memcmp(ptr @zeroEqualityTest03.buffer1, ptr @zeroEqualityTest03.buffer2, i64 16)
@@ -109,7 +104,7 @@ define signext i32 @zeroEqualityTest05() {
; Validate with memcmp()?:
define signext i32 @equalityFoldTwoConstants() {
; CHECK-LABEL: equalityFoldTwoConstants:
; CHECK: # %bb.0: # %loadbb
; CHECK: # %bb.0:
; CHECK-NEXT: li 3, 1
; CHECK-NEXT: blr
%call = tail call signext i32 @memcmp(ptr @zeroEqualityTest04.buffer1, ptr @zeroEqualityTest04.buffer2, i64 16)
@@ -121,24 +116,13 @@ define signext i32 @equalityFoldOneConstant(ptr %X) {
define signext i32 @equalityFoldOneConstant(ptr %X) {
; CHECK-LABEL: equalityFoldOneConstant:
; CHECK: # %bb.0:
; CHECK-NEXT: li 5, 1
; CHECK-NEXT: ld 4, 0(3)
; CHECK-NEXT: rldic 5, 5, 32, 31
; CHECK-NEXT: cmpld 4, 5
; CHECK-NEXT: bne 0, .LBB6_2
; CHECK-NEXT: # %bb.1: # %loadbb1
; CHECK-NEXT: lis 5, -32768
; CHECK-NEXT: ld 4, 8(3)
; CHECK-NEXT: li 3, 0
; CHECK-NEXT: ori 5, 5, 1
; CHECK-NEXT: rldic 5, 5, 1, 30
; CHECK-NEXT: cmpld 4, 5
; CHECK-NEXT: beq 0, .LBB6_3
; CHECK-NEXT: .LBB6_2: # %res_block
; CHECK-NEXT: li 3, 1
; CHECK-NEXT: .LBB6_3: # %endblock
; CHECK-NEXT: cntlzw 3, 3
; CHECK-NEXT: srwi 3, 3, 5
; CHECK-NEXT: lxvd2x 34, 0, 3
; CHECK-NEXT: addis 3, 2, .LCPI6_0@toc@ha
; CHECK-NEXT: addi 3, 3, .LCPI6_0@toc@l
; CHECK-NEXT: lxvd2x 35, 0, 3
; CHECK-NEXT: vcmpequb. 2, 2, 3
; CHECK-NEXT: mfocrf 3, 2
; CHECK-NEXT: rlwinm 3, 3, 25, 31, 31
; CHECK-NEXT: blr
%call = tail call signext i32 @memcmp(ptr @zeroEqualityTest04.buffer1, ptr %X, i64 16)
%not.tobool = icmp eq i32 %call, 0
112 changes: 20 additions & 92 deletions llvm/test/CodeGen/PowerPC/memcmp32_fixsize.ll
@@ -14,110 +14,38 @@
define dso_local signext range(i32 0, 2) i32 @cmpeq16(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b) {
; CHECK-AIX32-P8-LABEL: cmpeq16:
; CHECK-AIX32-P8: # %bb.0: # %entry
; CHECK-AIX32-P8-NEXT: lwz r5, 4(r3)
; CHECK-AIX32-P8-NEXT: lwz r6, 0(r3)
; CHECK-AIX32-P8-NEXT: lwz r7, 4(r4)
; CHECK-AIX32-P8-NEXT: lwz r8, 0(r4)
; CHECK-AIX32-P8-NEXT: xor r6, r6, r8
; CHECK-AIX32-P8-NEXT: xor r5, r5, r7
; CHECK-AIX32-P8-NEXT: or. r5, r5, r6
; CHECK-AIX32-P8-NEXT: bne cr0, L..BB0_2
; CHECK-AIX32-P8-NEXT: # %bb.1: # %loadbb1
; CHECK-AIX32-P8-NEXT: lwz r5, 12(r3)
; CHECK-AIX32-P8-NEXT: lwz r3, 8(r3)
; CHECK-AIX32-P8-NEXT: lwz r6, 12(r4)
; CHECK-AIX32-P8-NEXT: lwz r4, 8(r4)
; CHECK-AIX32-P8-NEXT: xor r3, r3, r4
; CHECK-AIX32-P8-NEXT: xor r4, r5, r6
; CHECK-AIX32-P8-NEXT: or. r3, r4, r3
; CHECK-AIX32-P8-NEXT: li r3, 0
; CHECK-AIX32-P8-NEXT: beq cr0, L..BB0_3
; CHECK-AIX32-P8-NEXT: L..BB0_2: # %res_block
; CHECK-AIX32-P8-NEXT: li r3, 1
; CHECK-AIX32-P8-NEXT: L..BB0_3: # %endblock
; CHECK-AIX32-P8-NEXT: cntlzw r3, r3
; CHECK-AIX32-P8-NEXT: rlwinm r3, r3, 27, 31, 31
; CHECK-AIX32-P8-NEXT: lxvw4x vs34, 0, r4
; CHECK-AIX32-P8-NEXT: lxvw4x vs35, 0, r3
; CHECK-AIX32-P8-NEXT: vcmpequb. v2, v3, v2
; CHECK-AIX32-P8-NEXT: mfocrf r3, 2
; CHECK-AIX32-P8-NEXT: rlwinm r3, r3, 25, 31, 31
; CHECK-AIX32-P8-NEXT: blr
;
; CHECK-AIX32-P10-LABEL: cmpeq16:
; CHECK-AIX32-P10: # %bb.0: # %entry
; CHECK-AIX32-P10-NEXT: lwz r5, 4(r3)
; CHECK-AIX32-P10-NEXT: lwz r6, 0(r3)
; CHECK-AIX32-P10-NEXT: lwz r7, 4(r4)
; CHECK-AIX32-P10-NEXT: xor r5, r5, r7
; CHECK-AIX32-P10-NEXT: lwz r8, 0(r4)
; CHECK-AIX32-P10-NEXT: xor r6, r6, r8
; CHECK-AIX32-P10-NEXT: or. r5, r5, r6
; CHECK-AIX32-P10-NEXT: bne cr0, L..BB0_2
; CHECK-AIX32-P10-NEXT: # %bb.1: # %loadbb1
; CHECK-AIX32-P10-NEXT: lwz r5, 12(r3)
; CHECK-AIX32-P10-NEXT: lwz r3, 8(r3)
; CHECK-AIX32-P10-NEXT: lwz r6, 12(r4)
; CHECK-AIX32-P10-NEXT: lwz r4, 8(r4)
; CHECK-AIX32-P10-NEXT: xor r3, r3, r4
; CHECK-AIX32-P10-NEXT: xor r4, r5, r6
; CHECK-AIX32-P10-NEXT: or. r3, r4, r3
; CHECK-AIX32-P10-NEXT: li r3, 0
; CHECK-AIX32-P10-NEXT: beq cr0, L..BB0_3
; CHECK-AIX32-P10-NEXT: L..BB0_2: # %res_block
; CHECK-AIX32-P10-NEXT: li r3, 1
; CHECK-AIX32-P10-NEXT: L..BB0_3: # %endblock
; CHECK-AIX32-P10-NEXT: cntlzw r3, r3
; CHECK-AIX32-P10-NEXT: rlwinm r3, r3, 27, 31, 31
; CHECK-AIX32-P10-NEXT: lxv vs34, 0(r4)
; CHECK-AIX32-P10-NEXT: lxv vs35, 0(r3)
; CHECK-AIX32-P10-NEXT: vcmpequb. v2, v3, v2
; CHECK-AIX32-P10-NEXT: setbc r3, 4*cr6+lt
; CHECK-AIX32-P10-NEXT: blr
;
; CHECK-LINUX32-P8-LABEL: cmpeq16:
; CHECK-LINUX32-P8: # %bb.0: # %entry
; CHECK-LINUX32-P8-NEXT: lwz r5, 0(r3)
; CHECK-LINUX32-P8-NEXT: lwz r6, 4(r3)
; CHECK-LINUX32-P8-NEXT: lwz r7, 0(r4)
; CHECK-LINUX32-P8-NEXT: lwz r8, 4(r4)
; CHECK-LINUX32-P8-NEXT: xor r6, r6, r8
; CHECK-LINUX32-P8-NEXT: xor r5, r5, r7
; CHECK-LINUX32-P8-NEXT: or. r5, r5, r6
; CHECK-LINUX32-P8-NEXT: bne cr0, .LBB0_2
; CHECK-LINUX32-P8-NEXT: # %bb.1: # %loadbb1
; CHECK-LINUX32-P8-NEXT: lwz r5, 8(r3)
; CHECK-LINUX32-P8-NEXT: lwz r3, 12(r3)
; CHECK-LINUX32-P8-NEXT: lwz r6, 8(r4)
; CHECK-LINUX32-P8-NEXT: lwz r4, 12(r4)
; CHECK-LINUX32-P8-NEXT: xor r3, r3, r4
; CHECK-LINUX32-P8-NEXT: xor r4, r5, r6
; CHECK-LINUX32-P8-NEXT: or. r3, r4, r3
; CHECK-LINUX32-P8-NEXT: li r3, 0
; CHECK-LINUX32-P8-NEXT: beq cr0, .LBB0_3
; CHECK-LINUX32-P8-NEXT: .LBB0_2: # %res_block
; CHECK-LINUX32-P8-NEXT: li r3, 1
; CHECK-LINUX32-P8-NEXT: .LBB0_3: # %endblock
; CHECK-LINUX32-P8-NEXT: cntlzw r3, r3
; CHECK-LINUX32-P8-NEXT: rlwinm r3, r3, 27, 31, 31
; CHECK-LINUX32-P8-NEXT: lxvd2x vs0, 0, r4
; CHECK-LINUX32-P8-NEXT: xxswapd vs34, vs0
; CHECK-LINUX32-P8-NEXT: lxvd2x vs0, 0, r3
; CHECK-LINUX32-P8-NEXT: xxswapd vs35, vs0
; CHECK-LINUX32-P8-NEXT: vcmpequb. v2, v3, v2
; CHECK-LINUX32-P8-NEXT: mfocrf r3, 2
; CHECK-LINUX32-P8-NEXT: rlwinm r3, r3, 25, 31, 31
; CHECK-LINUX32-P8-NEXT: blr
;
; CHECK-LINUX32-P10-LABEL: cmpeq16:
; CHECK-LINUX32-P10: # %bb.0: # %entry
; CHECK-LINUX32-P10-NEXT: lwz r5, 0(r3)
; CHECK-LINUX32-P10-NEXT: lwz r6, 4(r3)
; CHECK-LINUX32-P10-NEXT: lwz r7, 0(r4)
; CHECK-LINUX32-P10-NEXT: xor r5, r5, r7
; CHECK-LINUX32-P10-NEXT: lwz r8, 4(r4)
; CHECK-LINUX32-P10-NEXT: xor r6, r6, r8
; CHECK-LINUX32-P10-NEXT: or. r5, r5, r6
; CHECK-LINUX32-P10-NEXT: bne cr0, .LBB0_2
; CHECK-LINUX32-P10-NEXT: # %bb.1: # %loadbb1
; CHECK-LINUX32-P10-NEXT: lwz r5, 8(r3)
; CHECK-LINUX32-P10-NEXT: lwz r3, 12(r3)
; CHECK-LINUX32-P10-NEXT: lwz r6, 8(r4)
; CHECK-LINUX32-P10-NEXT: lwz r4, 12(r4)
; CHECK-LINUX32-P10-NEXT: xor r3, r3, r4
; CHECK-LINUX32-P10-NEXT: xor r4, r5, r6
; CHECK-LINUX32-P10-NEXT: or. r3, r4, r3
; CHECK-LINUX32-P10-NEXT: li r3, 0
; CHECK-LINUX32-P10-NEXT: beq cr0, .LBB0_3
; CHECK-LINUX32-P10-NEXT: .LBB0_2: # %res_block
; CHECK-LINUX32-P10-NEXT: li r3, 1
; CHECK-LINUX32-P10-NEXT: .LBB0_3: # %endblock
; CHECK-LINUX32-P10-NEXT: cntlzw r3, r3
; CHECK-LINUX32-P10-NEXT: rlwinm r3, r3, 27, 31, 31
; CHECK-LINUX32-P10-NEXT: lxv vs34, 0(r4)
; CHECK-LINUX32-P10-NEXT: lxv vs35, 0(r3)
; CHECK-LINUX32-P10-NEXT: vcmpequb. v2, v3, v2
; CHECK-LINUX32-P10-NEXT: setbc r3, 4*cr6+lt
; CHECK-LINUX32-P10-NEXT: blr
entry:
%bcmp = tail call i32 @bcmp(ptr noundef nonnull dereferenceable(16) %a, ptr noundef nonnull dereferenceable(16) %b, i32 16)