-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[PowerPC] Implement a more efficient memcmp in cases where the length is known. #158657
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
cfbf70c
cc66c46
46d907a
bff468f
6044203
24bb9f4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15556,6 +15556,89 @@ SDValue PPCTargetLowering::combineSetCC(SDNode *N, | |
SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1)); | ||
return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); | ||
} | ||
|
||
// Optimization: Fold i128 equality/inequality compares of two loads into a | ||
// vectorized compare using vcmpequb.p when VSX is available. | ||
// | ||
// Rationale: | ||
// A scalar i128 SETCC (eq/ne) normally lowers to multiple scalar ops. | ||
// On VSX-capable subtargets, we can instead reinterpret the i128 loads | ||
// as v16i8 vectors and use the Altivec/VSX vcmpequb.p instruction to | ||
// perform a full 128-bit equality check in a single vector compare. | ||
|
||
if (Subtarget.hasVSX()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add a documentation as to the type of optimization that is being done in this block. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. vcmpequb is an Altivec vector instruction, not VSX. |
||
if (LHS.getOpcode() == ISD::LOAD && RHS.getOpcode() == ISD::LOAD && | ||
LHS.hasOneUse() && RHS.hasOneUse() && | ||
LHS.getValueType() == MVT::i128 && RHS.getValueType() == MVT::i128) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why the type restriction? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. in the pass expand-memcmp
is changed to
but in original code, the
in 64-bit mode, it is not efficient with two we want to i128 to be converted to vector load. so there is type restriction. |
||
SDLoc DL(N); | ||
SelectionDAG &DAG = DCI.DAG; | ||
auto *LA = dyn_cast<LoadSDNode>(LHS); | ||
auto *LB = dyn_cast<LoadSDNode>(RHS); | ||
if (!LA || !LB) | ||
return DAGCombineTruncBoolExt(N, DCI); | ||
|
||
// If either memory operation (LA or LB) is volatile, do not perform any | ||
// optimization or transformation. Volatile operations must be preserved | ||
// as written to ensure correct program behavior, so we return an empty | ||
// SDValue to indicate no action. | ||
if (LA->isVolatile() || LB->isVolatile()) | ||
return DAGCombineTruncBoolExt(N, DCI); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This return pattern is going to make it hard to further modify the combineSetCC function. I think this code should be outlined to a separate function. |
||
|
||
// Only combine loads if both use the unindexed addressing mode. | ||
// PowerPC AltiVec/VMX does not support vector loads or stores with | ||
// pre/post-increment addressing. Indexed modes may imply implicit | ||
// pointer updates, which are not compatible with AltiVec vector | ||
// instructions. | ||
if (LA->getAddressingMode() != ISD::UNINDEXED || | ||
LB->getAddressingMode() != ISD::UNINDEXED) | ||
return DAGCombineTruncBoolExt(N, DCI); | ||
|
||
// Only combine loads if both are non-extending loads | ||
// (ISD::NON_EXTLOAD). Extending loads (such as ISD::ZEXTLOAD or | ||
// ISD::SEXTLOAD) perform zero or sign extension, which may change the | ||
// loaded value's semantics and are not compatible with vector loads. | ||
if (LA->getExtensionType() != ISD::NON_EXTLOAD || | ||
LB->getExtensionType() != ISD::NON_EXTLOAD) | ||
return DAGCombineTruncBoolExt(N, DCI); | ||
|
||
// Following code transforms the DAG | ||
// t0: ch,glue = EntryToken | ||
// t2: i64,ch = CopyFromReg t0, Register:i64 %0 | ||
// t3: i128,ch = load<(load (s128) from %ir.a, align 1)> t0, t2, | ||
// undef:i64 t4: i64,ch = CopyFromReg t0, Register:i64 %1 t5: i128,ch = | ||
// load<(load (s128) from %ir.b, align 1)> t0, t4, undef:i64 t6: i1 = | ||
// setcc t3, t5, setne:ch | ||
// | ||
// ----> | ||
// | ||
// t0: ch,glue = EntryToken | ||
// t2: i64,ch = CopyFromReg t0, Register:i64 %0 | ||
// t3: v16i8,ch = load<(load (s128) from %ir.a, align 1)> t0, t2, | ||
// undef:i64 t4: i64,ch = CopyFromReg t0, Register:i64 %1 t5: v16i8,ch = | ||
// load<(load (s128) from %ir.b, align 1)> t0, t4, undef:i64 t6: i32 = | ||
// llvm.ppc.altivec.vcmpequb.p TargetConstant:i32<10505>, | ||
// Constant:i32<2>, t3, t5 t7: i1 = setcc t6, Constant:i32<0>, seteq:ch | ||
|
||
SDValue LHSVec = DAG.getLoad(MVT::v16i8, DL, LA->getChain(), | ||
LA->getBasePtr(), LA->getMemOperand()); | ||
SDValue RHSVec = DAG.getLoad(MVT::v16i8, DL, LB->getChain(), | ||
LB->getBasePtr(), LB->getMemOperand()); | ||
|
||
SDValue IntrID = | ||
DAG.getTargetConstant(Intrinsic::ppc_altivec_vcmpequb_p, DL, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can just use getConstant. |
||
Subtarget.isPPC64() ? MVT::i64 : MVT::i32); | ||
SDValue CRSel = | ||
DAG.getConstant(2, DL, MVT::i32); // which CR6 predicate field | ||
SDValue PredResult = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, | ||
IntrID, CRSel, LHSVec, RHSVec); | ||
|
||
// ppc_altivec_vcmpequb_p returns 1 when two vectors are the same, | ||
// so we need to invert the CC opcode. | ||
return DAG.getSetCC(DL, N->getValueType(0), PredResult, | ||
DAG.getConstant(0, DL, MVT::i32), | ||
CC == ISD::SETNE ? ISD::SETEQ : ISD::SETNE); | ||
} | ||
} | ||
} | ||
|
||
return DAGCombineTruncBoolExt(N, DCI); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,18 +35,13 @@ define signext i32 @zeroEqualityTest02(ptr %x, ptr %y) { | |
define signext i32 @zeroEqualityTest01(ptr %x, ptr %y) { | ||
; CHECK-LABEL: zeroEqualityTest01: | ||
; CHECK: # %bb.0: | ||
; CHECK-NEXT: ld 5, 0(3) | ||
; CHECK-NEXT: ld 6, 0(4) | ||
; CHECK-NEXT: cmpld 5, 6 | ||
; CHECK-NEXT: bne 0, .LBB1_2 | ||
; CHECK-NEXT: # %bb.1: # %loadbb1 | ||
; CHECK-NEXT: ld 5, 8(3) | ||
; CHECK-NEXT: ld 4, 8(4) | ||
; CHECK-NEXT: li 3, 0 | ||
; CHECK-NEXT: cmpld 5, 4 | ||
; CHECK-NEXT: beqlr 0 | ||
; CHECK-NEXT: .LBB1_2: # %res_block | ||
; CHECK-NEXT: li 3, 1 | ||
; CHECK-NEXT: lxvd2x 34, 0, 4 | ||
; CHECK-NEXT: lxvd2x 35, 0, 3 | ||
; CHECK-NEXT: vcmpequb. 2, 3, 2 | ||
; CHECK-NEXT: mfocrf 3, 2 | ||
; CHECK-NEXT: rlwinm 3, 3, 25, 31, 31 | ||
; CHECK-NEXT: cntlzw 3, 3 | ||
; CHECK-NEXT: srwi 3, 3, 5 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Extra instruction? I think isolating and flipping the bit can just be rlwinm/xori. |
||
; CHECK-NEXT: blr | ||
%call = tail call signext i32 @memcmp(ptr %x, ptr %y, i64 16) | ||
%not.tobool = icmp ne i32 %call, 0 | ||
|
@@ -85,7 +80,7 @@ define signext i32 @zeroEqualityTest03(ptr %x, ptr %y) { | |
; Validate with > 0 | ||
define signext i32 @zeroEqualityTest04() { | ||
; CHECK-LABEL: zeroEqualityTest04: | ||
; CHECK: # %bb.0: # %loadbb | ||
; CHECK: # %bb.0: | ||
; CHECK-NEXT: li 3, 0 | ||
; CHECK-NEXT: blr | ||
%call = tail call signext i32 @memcmp(ptr @zeroEqualityTest02.buffer1, ptr @zeroEqualityTest02.buffer2, i64 16) | ||
|
@@ -97,7 +92,7 @@ define signext i32 @zeroEqualityTest04() { | |
; Validate with < 0 | ||
define signext i32 @zeroEqualityTest05() { | ||
; CHECK-LABEL: zeroEqualityTest05: | ||
; CHECK: # %bb.0: # %loadbb | ||
; CHECK: # %bb.0: | ||
; CHECK-NEXT: li 3, 0 | ||
; CHECK-NEXT: blr | ||
%call = tail call signext i32 @memcmp(ptr @zeroEqualityTest03.buffer1, ptr @zeroEqualityTest03.buffer2, i64 16) | ||
|
@@ -109,7 +104,7 @@ define signext i32 @zeroEqualityTest05() { | |
; Validate with memcmp()?: | ||
define signext i32 @equalityFoldTwoConstants() { | ||
; CHECK-LABEL: equalityFoldTwoConstants: | ||
; CHECK: # %bb.0: # %loadbb | ||
; CHECK: # %bb.0: | ||
; CHECK-NEXT: li 3, 1 | ||
; CHECK-NEXT: blr | ||
%call = tail call signext i32 @memcmp(ptr @zeroEqualityTest04.buffer1, ptr @zeroEqualityTest04.buffer2, i64 16) | ||
|
@@ -122,23 +117,17 @@ define signext i32 @equalityFoldOneConstant(ptr %X) { | |
; CHECK-LABEL: equalityFoldOneConstant: | ||
; CHECK: # %bb.0: | ||
; CHECK-NEXT: li 5, 1 | ||
; CHECK-NEXT: ld 4, 0(3) | ||
; CHECK-NEXT: ld 4, 8(3) | ||
; CHECK-NEXT: ld 3, 0(3) | ||
; CHECK-NEXT: rldic 5, 5, 32, 31 | ||
; CHECK-NEXT: cmpld 4, 5 | ||
; CHECK-NEXT: bne 0, .LBB6_2 | ||
; CHECK-NEXT: # %bb.1: # %loadbb1 | ||
; CHECK-NEXT: xor 3, 3, 5 | ||
; CHECK-NEXT: lis 5, -32768 | ||
; CHECK-NEXT: ld 4, 8(3) | ||
; CHECK-NEXT: li 3, 0 | ||
; CHECK-NEXT: ori 5, 5, 1 | ||
; CHECK-NEXT: rldic 5, 5, 1, 30 | ||
; CHECK-NEXT: cmpld 4, 5 | ||
; CHECK-NEXT: beq 0, .LBB6_3 | ||
; CHECK-NEXT: .LBB6_2: # %res_block | ||
; CHECK-NEXT: li 3, 1 | ||
; CHECK-NEXT: .LBB6_3: # %endblock | ||
; CHECK-NEXT: cntlzw 3, 3 | ||
; CHECK-NEXT: srwi 3, 3, 5 | ||
; CHECK-NEXT: xor 4, 4, 5 | ||
; CHECK-NEXT: or 3, 3, 4 | ||
; CHECK-NEXT: cntlzd 3, 3 | ||
; CHECK-NEXT: rldicl 3, 3, 58, 63 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we not change this sequence? It seems like a side effect and I'm not sure it's faster or slower. |
||
; CHECK-NEXT: blr | ||
%call = tail call signext i32 @memcmp(ptr @zeroEqualityTest04.buffer1, ptr %X, i64 16) | ||
%not.tobool = icmp eq i32 %call, 0 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe say something about this gets inline memcmp.