Commit 56c708c (1 parent: 6cb942c)

[ARM] Only change mask if demanded bits says we can optimize

Also add a switch (arm-enable-logical-imm) to turn the optimization off.

8 files changed, +121 -55 lines changed
llvm/lib/Target/ARM/ARMISelLowering.cpp
Lines changed: 77 additions & 26 deletions

@@ -118,6 +118,7 @@ using namespace llvm;
 #define DEBUG_TYPE "arm-isel"
 
 STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
 STATISTIC(NumConstpoolPromoted,
@@ -142,6 +143,12 @@ static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
     cl::desc("Maximum size of ALL constants to promote into a constant pool"),
     cl::init(128));
 
+static cl::opt<bool>
+    EnableOptimizeLogicalImm("arm-enable-logical-imm", cl::Hidden,
+                             cl::desc("Enable ARM logical imm instruction "
+                                      "optimization"),
+                             cl::init(true));
+
 cl::opt<unsigned>
 MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
   cl::desc("Maximum interleave factor for MVE VLDn to generate."),
@@ -20138,6 +20145,16 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
   }
 }
 
+static bool isLegalLogicalImmediate(unsigned Imm,
+                                    const ARMSubtarget *Subtarget) {
+  if (!Subtarget->isThumb())
+    return ARM_AM::getSOImmVal(Imm) != -1;
+  if (Subtarget->isThumb2())
+    return ARM_AM::getT2SOImmVal(Imm) != -1;
+  // Thumb1 only has 8-bit unsigned immediate.
+  return Imm <= 255;
+}
+
 bool ARMTargetLowering::targetShrinkDemandedConstant(
     SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
     TargetLoweringOpt &TLO) const {
@@ -20146,8 +20163,7 @@ bool ARMTargetLowering::targetShrinkDemandedConstant(
   if (!TLO.LegalOps)
     return false;
 
-  // Only optimize AND for now.
-  if (Op.getOpcode() != ISD::AND)
+  if (!EnableOptimizeLogicalImm)
     return false;
 
   EVT VT = Op.getValueType();
@@ -20158,28 +20174,28 @@ bool ARMTargetLowering::targetShrinkDemandedConstant(
 
   assert(VT == MVT::i32 && "Unexpected integer type");
 
+  // Exit early if we demand all bits.
+  if (DemandedBits.popcount() == 32)
+    return false;
+
+  // Only optimize AND for now.
+  if (Op.getOpcode() != ISD::AND)
+    return false;
+
   // Make sure the RHS really is a constant.
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
   if (!C)
     return false;
 
   unsigned Mask = C->getZExtValue();
 
+  if (Mask == 0 || Mask == ~0U)
+    return false;
+
   unsigned Demanded = DemandedBits.getZExtValue();
   unsigned ShrunkMask = Mask & Demanded;
   unsigned ExpandedMask = Mask | ~Demanded;
 
-  // If the mask is all zeros, let the target-independent code replace the
-  // result with zero.
-  if (ShrunkMask == 0)
-    return false;
-
-  // If the mask is all ones, erase the AND. (Currently, the target-independent
-  // code won't do this, so we have to do it explicitly to avoid an infinite
-  // loop in obscure cases.)
-  if (ExpandedMask == ~0U)
-    return TLO.CombineTo(Op, Op.getOperand(0));
-
   auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
     return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
   };
@@ -20192,30 +20208,65 @@ bool ARMTargetLowering::targetShrinkDemandedConstant(
     return TLO.CombineTo(Op, NewOp);
   };
 
-  // Prefer uxtb mask.
-  if (IsLegalMask(0xFF))
-    return UseMask(0xFF);
+  // If the mask is all zeros, let the target-independent code replace the
+  // result with zero.
+  if (ShrunkMask == 0) {
+    ++NumOptimizedImms;
+    return UseMask(ShrunkMask);
+  }
+
+  // If the mask is all ones, erase the AND. (Currently, the target-independent
+  // code won't do this, so we have to do it explicitly to avoid an infinite
+  // loop in obscure cases.)
+  if (ExpandedMask == ~0U) {
+    ++NumOptimizedImms;
+    return UseMask(ExpandedMask);
+  }
+
+  // If thumb, check for uxth and uxtb masks first and foremost.
+  if (Subtarget->isThumb1Only() && Subtarget->hasV6Ops()) {
+    if (IsLegalMask(0xFF)) {
+      ++NumOptimizedImms;
+      return UseMask(0xFF);
+    }
+
+    if (IsLegalMask(0xFFFF)) {
+      ++NumOptimizedImms;
+      return UseMask(0xFFFF);
+    }
+  }
+
+  // Don't optimize if it is legal already.
+  if (isLegalLogicalImmediate(Mask, Subtarget))
+    return false;
 
-  // Prefer uxth mask.
-  if (IsLegalMask(0xFFFF))
-    return UseMask(0xFFFF);
+  if (isLegalLogicalImmediate(~Mask, Subtarget))
+    return UseMask(Mask); // FIXME: Returning false causes infinite loop.
 
-  // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
-  // FIXME: Prefer a contiguous sequence of bits for other optimizations.
-  if (ShrunkMask < 256)
+
+  if (isLegalLogicalImmediate(ShrunkMask, Subtarget)) {
+    ++NumOptimizedImms;
     return UseMask(ShrunkMask);
+  }
+
+  // FIXME: The check for v6 is because this interferes with some ubfx
+  // optimizations
+  if (!Subtarget->hasV6Ops() &&
+      isLegalLogicalImmediate(~ExpandedMask, Subtarget)) {
+    ++NumOptimizedImms;
+    return UseMask(ExpandedMask);
+  }
 
-  // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
-  // FIXME: Prefer a contiguous sequence of bits for other optimizations.
-  if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
+  if ((~ExpandedMask) < 256) {
+    ++NumOptimizedImms;
     return UseMask(ExpandedMask);
+  }
 
   // Potential improvements:
   //
   // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
   // We could try to prefer Thumb1 immediates which can be lowered to a
   // two-instruction sequence.
-  // We could try to recognize more legal ARM/Thumb2 immediates here.
 
   return false;
 }
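
The new flag and statistic make the change easy to A/B. Hidden cl::opt options are still accepted on the llc command line, so in a build with statistics enabled something like the following would compare codegen with the hook turned off (the input file name is hypothetical):

  llc -mtriple=armv7 -stats -arm-enable-logical-imm=false input.ll -o -

Dropping the flag (it defaults to true) re-enables the path, and -stats then reports the new NumOptimizedImms counter.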

llvm/test/CodeGen/ARM/funnel-shift-rot.ll
Lines changed: 2 additions & 3 deletions

@@ -19,7 +19,7 @@ declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
 define i8 @rotl_i8_const_shift(i8 %x) {
 ; CHECK-LABEL: rotl_i8_const_shift:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    uxtb r1, r0
+; CHECK-NEXT:    and r1, r0, #224
 ; CHECK-NEXT:    lsl r0, r0, #3
 ; CHECK-NEXT:    orr r0, r0, r1, lsr #5
 ; CHECK-NEXT:    bx lr
@@ -161,8 +161,7 @@ define <4 x i32> @rotl_v4i32_rotl_const_shift(<4 x i32> %x) {
 define i8 @rotr_i8_const_shift(i8 %x) {
 ; CHECK-LABEL: rotr_i8_const_shift:
 ; CHECK:       @ %bb.0:
-; CHECK-NEXT:    uxtb r1, r0
-; CHECK-NEXT:    lsr r1, r1, #3
+; CHECK-NEXT:    ubfx r1, r0, #3, #5
 ; CHECK-NEXT:    orr r0, r1, r0, lsl #5
 ; CHECK-NEXT:    bx lr
   %f = call i8 @llvm.fshr.i8(i8 %x, i8 %x, i8 3)
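
Both hunks here come from the new "don't optimize if it is legal already" rule: 0xFF is itself a legal immediate, so the hook now declines, and the target-independent code is free to shrink the constant to just the demanded bits. For rotl-by-3 of an i8 only bits 7:5 of the zero-extended source survive the lsr #5, so the mask becomes 0xFF & 0xE0 = 0xE0 = 224; in the rotr case, once the forced uxtb is gone, the and+lsr pair matches the ubfx bit-field extract instead.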

llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
Lines changed: 36 additions & 20 deletions

@@ -21,9 +21,9 @@ define i1 @scalar_i8_signbit_eq(i8 %x, i8 %y) nounwind {
 ; ARM-LABEL: scalar_i8_signbit_eq:
 ; ARM:       @ %bb.0:
 ; ARM-NEXT:    uxtb r1, r1
-; ARM-NEXT:    lsl r0, r0, r1
+; ARM-NEXT:    mov r2, #128
+; ARM-NEXT:    and r0, r2, r0, lsl r1
 ; ARM-NEXT:    mov r1, #1
-; ARM-NEXT:    uxtb r0, r0
 ; ARM-NEXT:    eor r0, r1, r0, lsr #7
 ; ARM-NEXT:    bx lr
 ;
@@ -42,7 +42,7 @@ define i1 @scalar_i8_signbit_eq(i8 %x, i8 %y) nounwind {
 ; THUMB78-NEXT:    uxtb r1, r1
 ; THUMB78-NEXT:    lsls r0, r1
 ; THUMB78-NEXT:    movs r1, #1
-; THUMB78-NEXT:    uxtb r0, r0
+; THUMB78-NEXT:    and r0, r0, #128
 ; THUMB78-NEXT:    eor.w r0, r1, r0, lsr #7
 ; THUMB78-NEXT:    bx lr
   %t0 = lshr i8 128, %y
@@ -122,9 +122,9 @@ define i1 @scalar_i16_signbit_eq(i16 %x, i16 %y) nounwind {
 ; ARM-LABEL: scalar_i16_signbit_eq:
 ; ARM:       @ %bb.0:
 ; ARM-NEXT:    uxth r1, r1
-; ARM-NEXT:    lsl r0, r0, r1
+; ARM-NEXT:    mov r2, #32768
+; ARM-NEXT:    and r0, r2, r0, lsl r1
 ; ARM-NEXT:    mov r1, #1
-; ARM-NEXT:    uxth r0, r0
 ; ARM-NEXT:    eor r0, r1, r0, lsr #15
 ; ARM-NEXT:    bx lr
 ;
@@ -144,7 +144,7 @@ define i1 @scalar_i16_signbit_eq(i16 %x, i16 %y) nounwind {
 ; THUMB78-NEXT:    uxth r1, r1
 ; THUMB78-NEXT:    lsls r0, r1
 ; THUMB78-NEXT:    movs r1, #1
-; THUMB78-NEXT:    uxth r0, r0
+; THUMB78-NEXT:    and r0, r0, #32768
 ; THUMB78-NEXT:    eor.w r0, r1, r0, lsr #15
 ; THUMB78-NEXT:    bx lr
   %t0 = lshr i16 32768, %y
@@ -862,21 +862,35 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwind {
 ;------------------------------------------------------------------------------;
 
 define i1 @scalar_i8_signbit_ne(i8 %x, i8 %y) nounwind {
-; ARM-LABEL: scalar_i8_signbit_ne:
-; ARM:       @ %bb.0:
-; ARM-NEXT:    uxtb r1, r1
-; ARM-NEXT:    lsl r0, r0, r1
-; ARM-NEXT:    uxtb r0, r0
-; ARM-NEXT:    lsr r0, r0, #7
-; ARM-NEXT:    bx lr
+; ARM6-LABEL: scalar_i8_signbit_ne:
+; ARM6:       @ %bb.0:
+; ARM6-NEXT:    uxtb r1, r1
+; ARM6-NEXT:    mov r2, #128
+; ARM6-NEXT:    and r0, r2, r0, lsl r1
+; ARM6-NEXT:    lsr r0, r0, #7
+; ARM6-NEXT:    bx lr
 ;
-; THUMB-LABEL: scalar_i8_signbit_ne:
-; THUMB:       @ %bb.0:
-; THUMB-NEXT:    uxtb r1, r1
-; THUMB-NEXT:    lsls r0, r1
-; THUMB-NEXT:    uxtb r0, r0
-; THUMB-NEXT:    lsrs r0, r0, #7
-; THUMB-NEXT:    bx lr
+; ARM78-LABEL: scalar_i8_signbit_ne:
+; ARM78:       @ %bb.0:
+; ARM78-NEXT:    uxtb r1, r1
+; ARM78-NEXT:    lsl r0, r0, r1
+; ARM78-NEXT:    ubfx r0, r0, #7, #1
+; ARM78-NEXT:    bx lr
+;
+; THUMB6-LABEL: scalar_i8_signbit_ne:
+; THUMB6:       @ %bb.0:
+; THUMB6-NEXT:    uxtb r1, r1
+; THUMB6-NEXT:    lsls r0, r1
+; THUMB6-NEXT:    uxtb r0, r0
+; THUMB6-NEXT:    lsrs r0, r0, #7
+; THUMB6-NEXT:    bx lr
+;
+; THUMB78-LABEL: scalar_i8_signbit_ne:
+; THUMB78:       @ %bb.0:
+; THUMB78-NEXT:    uxtb r1, r1
+; THUMB78-NEXT:    lsls r0, r1
+; THUMB78-NEXT:    ubfx r0, r0, #7, #1
+; THUMB78-NEXT:    bx lr
   %t0 = lshr i8 128, %y
   %t1 = and i8 %t0, %x
   %res = icmp ne i8 %t1, 0 ; we are perfectly happy with 'ne' predicate
@@ -1051,3 +1065,5 @@ define i1 @scalar_i8_signbit_eq_with_nonzero(i8 %x, i8 %y) nounwind {
   %res = icmp eq i8 %t1, 1 ; should be comparing with 0
   ret i1 %res
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; THUMB: {{.*}}
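
The signbit tests show the shrunk-mask case: the AND result feeds lsr #7 (or #15), so only the sign bit is demanded and the old uxtb/uxth mask shrinks to 128 (or 32768). On the ARM run lines the constant is materialized with mov so the lsl can fold into the and's register-shifted operand; v7 targets prefer ubfx for the ne variant, which is why the shared ARM/THUMB prefixes split into ARM6/ARM78 and THUMB6/THUMB78 (the trailing NOTE and the now-unused THUMB prefix line are emitted by the update_llc_test_checks autogenerator).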

llvm/test/CodeGen/ARM/va_arg.ll
Lines changed: 1 addition & 1 deletion

@@ -35,7 +35,7 @@ define double @test2(i32 %a, ptr %b, ...) nounwind optsize {
 ; CHECK-NEXT:    add r0, sp, #4
 ; CHECK-NEXT:    stmib sp, {r2, r3}
 ; CHECK-NEXT:    add r0, r0, #11
-; CHECK-NEXT:    bic r0, r0, #3
+; CHECK-NEXT:    bic r0, r0, #7
 ; CHECK-NEXT:    str r2, [r1]
 ; CHECK-NEXT:    add r1, r0, #8
 ; CHECK-NEXT:    str r1, [sp]
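
No shrinking here; this looks like the non-churn half of the patch. va_arg of a double rounds the pointer up to 8 bytes, for which the natural mask is ~7. The old hook appears to have rewritten that to ~3 using bits the demanded-bits analysis reported dead; the new code keeps the source constant, since its complement 7 is already a legal immediate (the UseMask(Mask) path carrying the FIXME about returning false).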

llvm/test/CodeGen/Thumb/branch-to-return.ll
Lines changed: 1 addition & 1 deletion

@@ -26,7 +26,7 @@ define i32 @foo(ptr %x, i32 %n) {
 ; CHECK-NEXT:    ldr.w r0, [r12]
 ; CHECK-NEXT:  .LBB0_6: @ %for.body.preheader1
 ; CHECK-NEXT:    subs r3, r1, r3
-; CHECK-NEXT:    mvn r2, #12
+; CHECK-NEXT:    mvn r2, #15
 ; CHECK-NEXT:    and.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    add r1, r12
 ; CHECK-NEXT:  .LBB0_7: @ %for.body
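
Another keep-the-constant case: the other AND operand is r1, lsl #2, whose low two bits are known zero, which is presumably what let the old code rewrite ~15 into ~12 (setting dead bits stays within the old [-256, -2] range). The new code leaves ~15 alone because its complement 15 is already a legal immediate; either way it is a single mvn, so nothing is lost.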

llvm/test/CodeGen/Thumb2/active_lane_mask.ll
Lines changed: 1 addition & 1 deletion

@@ -283,7 +283,7 @@ define void @test_width2(ptr nocapture readnone %x, ptr nocapture %y, i8 zeroext
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT:    adds r0, r2, #1
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    bic r0, r0, #1
+; CHECK-NEXT:    and r0, r0, #510
 ; CHECK-NEXT:    subs r0, #2
 ; CHECK-NEXT:    add.w r0, r3, r0, lsr #1
 ; CHECK-NEXT:    dls lr, r0
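
Here the constant 510 = 0x1FE (0xFF shifted left by 1) is already a legal Thumb2 modified immediate, so the new "don't optimize if it is legal already" check returns early and keeps it. The old code rewrote it to -2 (bic #1) by setting the high bits known to be zero in this context (r2 is an i8, so r2 + 1 fits in 9 bits), reaching the old movs+bics range.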

llvm/test/CodeGen/Thumb2/mve-tailpred-nonzerostart.ll
Lines changed: 1 addition & 1 deletion

@@ -204,7 +204,7 @@ define arm_aapcs_vfpcc void @startSmod4(i32 %S, ptr nocapture readonly %x, ptr n
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:  .LBB3_1: @ %vector.ph
 ; CHECK-NEXT:    vmov r12, s0
-; CHECK-NEXT:    mvn r4, #12
+; CHECK-NEXT:    mvn r4, #15
 ; CHECK-NEXT:    and.w r4, r4, r0, lsl #2
 ; CHECK-NEXT:    add r1, r4
 ; CHECK-NEXT:    add r2, r4
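
Same non-churn as in branch-to-return.ll above: the mask ~15 feeding an and.w with an lsl #2 operand survives instead of being rewritten to ~12.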

llvm/test/CodeGen/Thumb2/shift_parts.ll
Lines changed: 2 additions & 2 deletions

@@ -457,7 +457,7 @@ entry:
 define i32 @ashr_demand_bottommask2(i64 %x) {
 ; CHECK-LABEL: ashr_demand_bottommask2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    mvn r0, #2
+; CHECK-NEXT:    mvn r0, #3
 ; CHECK-NEXT:    and.w r0, r0, r1, lsl #1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -470,7 +470,7 @@ entry:
 define i32 @lshr_demand_bottommask2(i64 %x) {
 ; CHECK-LABEL: lshr_demand_bottommask2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    mvn r0, #2
+; CHECK-NEXT:    mvn r0, #3
 ; CHECK-NEXT:    and.w r0, r0, r1, lsl #1
 ; CHECK-NEXT:    bx lr
 entry:
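
And the same again for both bottom-mask tests: r1, lsl #1 has bit 0 known zero, so the old hook expanded ~3 to ~2, while the new hook keeps ~3 because its complement 3 is a legal immediate. mvn #2 and mvn #3 cost the same; the only visible effect is that the source constant survives.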
