Skip to content

Commit 90582ad

Browse files
authored
[ARM] shouldFoldMaskToVariableShiftPair should be true for scalars up to the biggest legal type (#158070)
For ARM, we want to do this for scalars up to 32 bits. Otherwise the code ends up bigger and bloated.
1 parent dd668aa commit 90582ad

File tree

4 files changed

+7443
-0
lines changed

4 files changed

+7443
-0
lines changed

llvm/lib/Target/ARM/ARMISelLowering.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -775,6 +775,16 @@ class VectorType;
775775
bool shouldFoldConstantShiftPairToMask(const SDNode *N,
776776
CombineLevel Level) const override;
777777

778+
/// Return true if it is profitable to fold a pair of shifts into a mask.
779+
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override {
780+
EVT VT = Y.getValueType();
781+
782+
if (VT.isVector())
783+
return false;
784+
785+
return VT.getScalarSizeInBits() <= 32;
786+
}
787+
778788
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT,
779789
unsigned SelectOpcode, SDValue X,
780790
SDValue Y) const override;
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=thumbv7m-eabi %s -o - | FileCheck %s --check-prefix V7M
3+
; RUN: llc -mtriple=armv7a-eabi %s -o - | FileCheck %s --check-prefix V7A
4+
; RUN: llc -mtriple=thumbv7a-eabi %s -o - | FileCheck %s --check-prefix V7A-T
5+
; RUN: llc -mtriple=armv6m-eabi %s -o - | FileCheck %s --check-prefix V6M
6+
7+
; i32 case: at 32 bits the hook fires, so (-1 << y) & x is folded into a
; variable shift pair — every subtarget below emits just lsr then lsl
; (i.e. (x >> y) << y) with no materialized mask.
define i32 @mask_pair(i32 %x, i32 %y) {
8+
; V7M-LABEL: mask_pair:
9+
; V7M: @ %bb.0:
10+
; V7M-NEXT: lsrs r0, r1
11+
; V7M-NEXT: lsls r0, r1
12+
; V7M-NEXT: bx lr
13+
;
14+
; V7A-LABEL: mask_pair:
15+
; V7A: @ %bb.0:
16+
; V7A-NEXT: lsr r0, r0, r1
17+
; V7A-NEXT: lsl r0, r0, r1
18+
; V7A-NEXT: bx lr
19+
;
20+
; V7A-T-LABEL: mask_pair:
21+
; V7A-T: @ %bb.0:
22+
; V7A-T-NEXT: lsrs r0, r1
23+
; V7A-T-NEXT: lsls r0, r1
24+
; V7A-T-NEXT: bx lr
25+
;
26+
; V6M-LABEL: mask_pair:
27+
; V6M: @ %bb.0:
28+
; V6M-NEXT: lsrs r0, r1
29+
; V6M-NEXT: lsls r0, r1
30+
; V6M-NEXT: bx lr
31+
; -1 << y builds an all-ones-above-y mask; the and then clears the low y bits.
%shl = shl nsw i32 -1, %y
32+
%and = and i32 %shl, %x
33+
ret i32 %and
34+
}
35+
36+
; i64 case: wider than the 32-bit limit in shouldFoldMaskToVariableShiftPair,
; so no shift-pair fold — the mask (-1 << y) is materialized and and'ed
; (V6M even calls the __aeabi_llsl libcall for the 64-bit shift).
define i64 @mask_pair_64(i64 %x, i64 %y) {
37+
; V7M-LABEL: mask_pair_64:
38+
; V7M: @ %bb.0:
39+
; V7M-NEXT: mov.w r3, #-1
40+
; V7M-NEXT: lsl.w r12, r3, r2
41+
; V7M-NEXT: subs r2, #32
42+
; V7M-NEXT: it pl
43+
; V7M-NEXT: movpl.w r12, #0
44+
; V7M-NEXT: it pl
45+
; V7M-NEXT: lslpl r3, r2
46+
; V7M-NEXT: and.w r0, r0, r12
47+
; V7M-NEXT: ands r1, r3
48+
; V7M-NEXT: bx lr
49+
;
50+
; V7A-LABEL: mask_pair_64:
51+
; V7A: @ %bb.0:
52+
; V7A-NEXT: subs r12, r2, #32
53+
; V7A-NEXT: mvn r3, #0
54+
; V7A-NEXT: lsl r2, r3, r2
55+
; V7A-NEXT: lslpl r3, r3, r12
56+
; V7A-NEXT: movwpl r2, #0
57+
; V7A-NEXT: and r1, r3, r1
58+
; V7A-NEXT: and r0, r2, r0
59+
; V7A-NEXT: bx lr
60+
;
61+
; V7A-T-LABEL: mask_pair_64:
62+
; V7A-T: @ %bb.0:
63+
; V7A-T-NEXT: mov.w r3, #-1
64+
; V7A-T-NEXT: lsl.w r12, r3, r2
65+
; V7A-T-NEXT: subs r2, #32
66+
; V7A-T-NEXT: it pl
67+
; V7A-T-NEXT: movpl.w r12, #0
68+
; V7A-T-NEXT: it pl
69+
; V7A-T-NEXT: lslpl r3, r2
70+
; V7A-T-NEXT: and.w r0, r0, r12
71+
; V7A-T-NEXT: ands r1, r3
72+
; V7A-T-NEXT: bx lr
73+
;
74+
; V6M-LABEL: mask_pair_64:
75+
; V6M: @ %bb.0:
76+
; V6M-NEXT: .save {r4, r5, r7, lr}
77+
; V6M-NEXT: push {r4, r5, r7, lr}
78+
; V6M-NEXT: mov r4, r1
79+
; V6M-NEXT: mov r5, r0
80+
; V6M-NEXT: movs r0, #0
81+
; V6M-NEXT: mvns r0, r0
82+
; V6M-NEXT: mov r1, r0
83+
; V6M-NEXT: bl __aeabi_llsl
84+
; V6M-NEXT: ands r0, r5
85+
; V6M-NEXT: ands r1, r4
86+
; V6M-NEXT: pop {r4, r5, r7, pc}
87+
; Same mask pattern as @mask_pair, but at i64 width.
%shl = shl nsw i64 -1, %y
88+
%and = and i64 %shl, %x
89+
ret i64 %and
90+
}

0 commit comments

Comments
 (0)