Skip to content

Commit 6a425f1

Browse files
authored
[ARM] Have custom lowering for ucmp and scmp (#149315)
The scmp lowering is limited to non-Thumb1 targets for now, since there is no good way to implement it on Thumb1.
1 parent 0bdd312 commit 6a425f1

File tree

6 files changed

+893
-268
lines changed

6 files changed

+893
-268
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -802,6 +802,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
802802
setOperationAction(ISD::BSWAP, VT, Expand);
803803
}
804804

805+
if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps())
806+
setOperationAction(ISD::SCMP, MVT::i32, Custom);
807+
808+
if (!Subtarget->hasV8_1MMainlineOps())
809+
setOperationAction(ISD::UCMP, MVT::i32, Custom);
810+
805811
setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
806812
setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
807813

@@ -1634,6 +1640,10 @@ bool ARMTargetLowering::useSoftFloat() const {
16341640
return Subtarget->useSoftFloat();
16351641
}
16361642

1643+
/// Returns true only when the subtarget is not Thumb1-only and \p VT is at
/// most 32 bits wide; Thumb1-only subtargets always get false.
/// NOTE(review): presumably this steers the generic scmp/ucmp expansion
/// towards a select-based sequence -- confirm against the TargetLowering hook
/// documentation.
bool ARMTargetLowering::shouldExpandCmpUsingSelects(EVT VT) const {
  if (Subtarget->isThumb1Only())
    return false;

  return VT.getSizeInBits() <= 32;
}
1646+
16371647
// FIXME: It might make sense to define the representative register class as the
16381648
// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
16391649
// a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
@@ -10612,6 +10622,133 @@ SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op,
1061210622
return DAG.getBitcast(MVT::i32, Res);
1061310623
}
1061410624

10625+
/// Custom lowering for the three-way compare nodes ISD::SCMP and ISD::UCMP.
///
/// The computed i32 value is -1 when LHS < RHS, 0 when LHS == RHS and 1 when
/// LHS > RHS (signed ordering for SCMP, unsigned ordering otherwise). If the
/// node's result type is not i32 the value is sign-extended or truncated to
/// that type before being returned.
///
/// \param Op  The SCMP/UCMP node; operand 0 is LHS and operand 1 is RHS.
/// \param DAG The SelectionDAG the replacement nodes are created in.
/// \returns The value that replaces \p Op.
SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  // SCMP is the signed form; every other opcode routed here is handled as the
  // unsigned form.
  bool IsSigned = (Op.getOpcode() == ISD::SCMP);

  // Thumb1 gets a dedicated branch-free sequence, for UCMP only.
  if (!IsSigned && Subtarget->isThumb1Only()) {
    // Target sequence:
    //   subs r2, r0, r1   ; r2 = LHS - RHS, sets flags
    //   sbc  r2, r2       ; r2 = r2 - r2 - !carry
    //   cmp  r1, r0       ; compare RHS with LHS
    //   sbc  r1, r1       ; r1 = r1 - r1 - !carry
    //   subs r0, r2, r1   ; r0 = r2 - r1 (final result)

    // LHS - RHS, keeping the flags for the following subtract-with-carry.
    SDValue Sub1WithFlags = DAG.getNode(
        ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
    SDValue Sub1Result = Sub1WithFlags.getValue(0);
    SDValue Flags1 = Sub1WithFlags.getValue(1);

    // Sub1Result - Sub1Result - !carry:
    //   0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned).
    SDValue Sbc1 =
        DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT),
                    Sub1Result, Sub1Result, Flags1);
    SDValue Sbc1Result = Sbc1.getValue(0);

    // Same trick with the operands swapped: compare RHS against LHS.
    SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS);

    // RHS - RHS - !carry:
    //   0 if RHS >= LHS (unsigned), -1 if RHS < LHS (unsigned).
    SDValue Sbc2 = DAG.getNode(
        ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags);
    SDValue Sbc2Result = Sbc2.getValue(0);

    // (LHS < RHS ? -1 : 0) - (LHS > RHS ? -1 : 0) yields -1, 0 or 1. The flags
    // of this last subtraction are not needed, so a plain SUB is enough.
    SDValue Result =
        DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
    if (Op.getValueType() != MVT::i32)
      Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());

    return Result;
  }

  // General case. Target sequence:
  //   subs  r0, r0, r1  ; LHS - RHS, sets flags (adds when RHS is a negation)
  //   movgt r0, #1      ; LHS > RHS -> 1   (GT for signed, HI for unsigned)
  //   mvnlt r0, #0      ; LHS < RHS -> -1  (LT for signed, LO for unsigned)
  //                     ; LHS == RHS -> the 0 left behind by the subs

  // If RHS is a negation (0 - X), LHS - (0 - X) can be emitted as LHS + X with
  // ADDC, provided the flags of the addition match those of the subtraction.
  unsigned Opcode = ARMISD::SUBC;
  if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0))) {
    SDValue Negated = RHS.getOperand(1);

    // SCMP: X must never be INT_MIN, otherwise 0 - X overflows. Either the
    //       negation carries nsw or known-bits rules INT_MIN out.
    // UCMP: X must be known non-zero.
    bool CanUseAdd =
        IsSigned ? (RHS->getFlags().hasNoSignedWrap() ||
                    !DAG.computeKnownBits(Negated)
                         .getSignedMinValue()
                         .isMinSignedValue())
                 : DAG.isKnownNeverZero(Negated);

    if (CanUseAdd) {
      Opcode = ARMISD::ADDC;
      RHS = Negated; // LHS + X instead of LHS - (0 - X)
    }
  }

  // One flag-setting arithmetic node; only its opcode (SUBC or ADDC) varies.
  SDValue OpWithFlags =
      DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
  SDValue OpResult = OpWithFlags.getValue(0); // Arithmetic result.
  SDValue Flags = OpWithFlags.getValue(1);    // Condition flags.

  // Values selected by the two conditional moves.
  SDValue One = DAG.getConstant(1, dl, MVT::i32);
  SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32);

  // Condition codes depend on the signedness of the comparison.
  ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI;
  ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO;

  // Greater-than: replace the arithmetic result with 1.
  SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32);
  SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One,
                                GTCondValue, Flags);

  // Less-than: replace it with -1. When neither condition holds the operands
  // are equal and the arithmetic result itself supplies the 0.
  SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32);
  SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
                                LTCondValue, Flags);

  if (Op.getValueType() != MVT::i32)
    Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());

  return Result2;
}
10751+
1061510752
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
1061610753
LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
1061710754
switch (Op.getOpcode()) {
@@ -10740,6 +10877,9 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
1074010877
case ISD::FP_TO_BF16:
1074110878
return LowerFP_TO_BF16(Op, DAG);
1074210879
case ARMISD::WIN__DBZCHK: return SDValue();
10880+
case ISD::UCMP:
10881+
case ISD::SCMP:
10882+
return LowerCMP(Op, DAG);
1074310883
}
1074410884
}
1074510885

llvm/lib/Target/ARM/ARMISelLowering.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -607,6 +607,8 @@ class VectorType;
607607

608608
bool preferZeroCompareBranch() const override { return true; }
609609

610+
bool shouldExpandCmpUsingSelects(EVT VT) const override;
611+
610612
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
611613

612614
bool hasAndNotCompare(SDValue V) const override {
@@ -904,6 +906,7 @@ class VectorType;
904906
void LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
905907
SelectionDAG &DAG) const;
906908
SDValue LowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
909+
SDValue LowerCMP(SDValue Op, SelectionDAG &DAG) const;
907910

908911
Register getRegisterByName(const char* RegName, LLT VT,
909912
const MachineFunction &MF) const override;

llvm/test/CodeGen/ARM/scmp.ll

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,9 @@
44
define i8 @scmp_8_8(i8 signext %x, i8 signext %y) nounwind {
55
; CHECK-LABEL: scmp_8_8:
66
; CHECK: @ %bb.0:
7-
; CHECK-NEXT: cmp r0, r1
8-
; CHECK-NEXT: mov r0, #0
9-
; CHECK-NEXT: mov r2, #0
10-
; CHECK-NEXT: movwlt r0, #1
11-
; CHECK-NEXT: movwgt r2, #1
12-
; CHECK-NEXT: sub r0, r2, r0
7+
; CHECK-NEXT: subs r0, r0, r1
8+
; CHECK-NEXT: movwgt r0, #1
9+
; CHECK-NEXT: mvnlt r0, #0
1310
; CHECK-NEXT: bx lr
1411
%1 = call i8 @llvm.scmp(i8 %x, i8 %y)
1512
ret i8 %1
@@ -18,12 +15,9 @@ define i8 @scmp_8_8(i8 signext %x, i8 signext %y) nounwind {
1815
define i8 @scmp_8_16(i16 signext %x, i16 signext %y) nounwind {
1916
; CHECK-LABEL: scmp_8_16:
2017
; CHECK: @ %bb.0:
21-
; CHECK-NEXT: cmp r0, r1
22-
; CHECK-NEXT: mov r0, #0
23-
; CHECK-NEXT: mov r2, #0
24-
; CHECK-NEXT: movwlt r0, #1
25-
; CHECK-NEXT: movwgt r2, #1
26-
; CHECK-NEXT: sub r0, r2, r0
18+
; CHECK-NEXT: subs r0, r0, r1
19+
; CHECK-NEXT: movwgt r0, #1
20+
; CHECK-NEXT: mvnlt r0, #0
2721
; CHECK-NEXT: bx lr
2822
%1 = call i8 @llvm.scmp(i16 %x, i16 %y)
2923
ret i8 %1
@@ -32,12 +26,9 @@ define i8 @scmp_8_16(i16 signext %x, i16 signext %y) nounwind {
3226
define i8 @scmp_8_32(i32 %x, i32 %y) nounwind {
3327
; CHECK-LABEL: scmp_8_32:
3428
; CHECK: @ %bb.0:
35-
; CHECK-NEXT: cmp r0, r1
36-
; CHECK-NEXT: mov r0, #0
37-
; CHECK-NEXT: mov r2, #0
38-
; CHECK-NEXT: movwlt r0, #1
39-
; CHECK-NEXT: movwgt r2, #1
40-
; CHECK-NEXT: sub r0, r2, r0
29+
; CHECK-NEXT: subs r0, r0, r1
30+
; CHECK-NEXT: movwgt r0, #1
31+
; CHECK-NEXT: mvnlt r0, #0
4132
; CHECK-NEXT: bx lr
4233
%1 = call i8 @llvm.scmp(i32 %x, i32 %y)
4334
ret i8 %1
@@ -92,17 +83,26 @@ define i8 @scmp_8_128(i128 %x, i128 %y) nounwind {
9283
define i32 @scmp_32_32(i32 %x, i32 %y) nounwind {
9384
; CHECK-LABEL: scmp_32_32:
9485
; CHECK: @ %bb.0:
95-
; CHECK-NEXT: cmp r0, r1
96-
; CHECK-NEXT: mov r0, #0
97-
; CHECK-NEXT: mov r2, #0
98-
; CHECK-NEXT: movwlt r0, #1
99-
; CHECK-NEXT: movwgt r2, #1
100-
; CHECK-NEXT: sub r0, r2, r0
86+
; CHECK-NEXT: subs r0, r0, r1
87+
; CHECK-NEXT: movwgt r0, #1
88+
; CHECK-NEXT: mvnlt r0, #0
10189
; CHECK-NEXT: bx lr
10290
%1 = call i32 @llvm.scmp(i32 %x, i32 %y)
10391
ret i32 %1
10492
}
10593

94+
define i32 @scmp_neg(i32 %x, i32 %y) nounwind {
95+
; CHECK-LABEL: scmp_neg:
96+
; CHECK: @ %bb.0:
97+
; CHECK-NEXT: adds r0, r0, r1
98+
; CHECK-NEXT: movwgt r0, #1
99+
; CHECK-NEXT: mvnlt r0, #0
100+
; CHECK-NEXT: bx lr
101+
%yy = sub nsw i32 0, %y
102+
%1 = call i32 @llvm.scmp(i32 %x, i32 %yy)
103+
ret i32 %1
104+
}
105+
106106
define i32 @scmp_32_64(i64 %x, i64 %y) nounwind {
107107
; CHECK-LABEL: scmp_32_64:
108108
; CHECK: @ %bb.0:

llvm/test/CodeGen/ARM/ucmp.ll

Lines changed: 12 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,9 @@
44
define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind {
55
; CHECK-LABEL: ucmp_8_8:
66
; CHECK: @ %bb.0:
7-
; CHECK-NEXT: cmp r0, r1
8-
; CHECK-NEXT: mov r0, #0
9-
; CHECK-NEXT: mov r2, #0
10-
; CHECK-NEXT: movwlo r0, #1
11-
; CHECK-NEXT: movwhi r2, #1
12-
; CHECK-NEXT: sub r0, r2, r0
7+
; CHECK-NEXT: subs r0, r0, r1
8+
; CHECK-NEXT: movwhi r0, #1
9+
; CHECK-NEXT: mvnlo r0, #0
1310
; CHECK-NEXT: bx lr
1411
%1 = call i8 @llvm.ucmp(i8 %x, i8 %y)
1512
ret i8 %1
@@ -18,12 +15,9 @@ define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind {
1815
define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind {
1916
; CHECK-LABEL: ucmp_8_16:
2017
; CHECK: @ %bb.0:
21-
; CHECK-NEXT: cmp r0, r1
22-
; CHECK-NEXT: mov r0, #0
23-
; CHECK-NEXT: mov r2, #0
24-
; CHECK-NEXT: movwlo r0, #1
25-
; CHECK-NEXT: movwhi r2, #1
26-
; CHECK-NEXT: sub r0, r2, r0
18+
; CHECK-NEXT: subs r0, r0, r1
19+
; CHECK-NEXT: movwhi r0, #1
20+
; CHECK-NEXT: mvnlo r0, #0
2721
; CHECK-NEXT: bx lr
2822
%1 = call i8 @llvm.ucmp(i16 %x, i16 %y)
2923
ret i8 %1
@@ -32,12 +26,9 @@ define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind {
3226
define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind {
3327
; CHECK-LABEL: ucmp_8_32:
3428
; CHECK: @ %bb.0:
35-
; CHECK-NEXT: cmp r0, r1
36-
; CHECK-NEXT: mov r0, #0
37-
; CHECK-NEXT: mov r2, #0
38-
; CHECK-NEXT: movwlo r0, #1
39-
; CHECK-NEXT: movwhi r2, #1
40-
; CHECK-NEXT: sub r0, r2, r0
29+
; CHECK-NEXT: subs r0, r0, r1
30+
; CHECK-NEXT: movwhi r0, #1
31+
; CHECK-NEXT: mvnlo r0, #0
4132
; CHECK-NEXT: bx lr
4233
%1 = call i8 @llvm.ucmp(i32 %x, i32 %y)
4334
ret i8 %1
@@ -92,12 +83,9 @@ define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind {
9283
define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind {
9384
; CHECK-LABEL: ucmp_32_32:
9485
; CHECK: @ %bb.0:
95-
; CHECK-NEXT: cmp r0, r1
96-
; CHECK-NEXT: mov r0, #0
97-
; CHECK-NEXT: mov r2, #0
98-
; CHECK-NEXT: movwlo r0, #1
99-
; CHECK-NEXT: movwhi r2, #1
100-
; CHECK-NEXT: sub r0, r2, r0
86+
; CHECK-NEXT: subs r0, r0, r1
87+
; CHECK-NEXT: movwhi r0, #1
88+
; CHECK-NEXT: mvnlo r0, #0
10189
; CHECK-NEXT: bx lr
10290
%1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
10391
ret i32 %1

0 commit comments

Comments
 (0)