Skip to content

Commit d294d46

Browse files
[AArch64] Fix vectorToScalarBitmask BE (#156312)
1 parent c8d7a73 commit d294d46

File tree

2 files changed

+97
-3
lines changed

2 files changed

+97
-3
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24168,14 +24168,18 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
2416824168
// Ensure that all elements' bits are either 0s or 1s.
2416924169
ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
2417024170

24171+
bool IsLE = DAG.getDataLayout().isLittleEndian();
2417124172
SmallVector<SDValue, 16> MaskConstants;
2417224173
if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
2417324174
VecVT == MVT::v16i8) {
2417424175
// v16i8 is a special case, as we have 16 entries but only 8 positional bits
2417524176
// per entry. We split it into two halves, apply the mask, zip the halves to
2417624177
// create 8x 16-bit values, and then perform the vector reduce.
2417724178
for (unsigned Half = 0; Half < 2; ++Half) {
24178-
for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
24179+
for (unsigned I = 0; I < 8; ++I) {
24180+
// On big-endian targets, the lane order in sub-byte vector elements
24181+
// gets reversed, so we need to flip the bit index.
24182+
unsigned MaskBit = IsLE ? (1u << I) : (1u << (7 - I));
2417924183
MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
2418024184
}
2418124185
}
@@ -24193,8 +24197,9 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
2419324197
}
2419424198

2419524199
// All other vector sizes.
24196-
unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
24197-
for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
24200+
unsigned NumEl = VecVT.getVectorNumElements();
24201+
for (unsigned I = 0; I < NumEl; ++I) {
24202+
unsigned MaskBit = IsLE ? (1u << I) : (1u << (NumEl - 1 - I));
2419824203
MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
2419924204
}
2420024205

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
; RUN: llc -O0 -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK-LE
2+
; RUN: llc -O0 -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK-BE
3+
4+
@haystack4 = internal unnamed_addr constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 4
5+
@haystack16 = internal unnamed_addr constant [16 x i8] [i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15], align 16
6+
7+
8+
; test4: compares a <4 x i32> loaded from @haystack4 against splat(2), stores
; the resulting <4 x i1> mask to a stack slot, reloads it, and returns lane 2
; zero-extended to i8. With the index of 0 used here, lane 2 of the loaded
; vector holds the value 2, so that lane of the compare is true.
; NOTE(review): the store of the <4 x i1> compare result is presumably what
; exercises vectorToScalarBitmask, and the .LCPI0_0 constant pool checked
; further down presumably belongs to this function - confirm.
define i8 @test4() {
9+
%matches = alloca <4 x i1>, align 1
10+
%index_ptr = alloca i64, align 8
11+
; The index is written to and read back from a stack slot instead of being
; used as an immediate.
store i64 0, ptr %index_ptr, align 8
12+
%index_val = load i64, ptr %index_ptr, align 8
13+
%haystack = getelementptr inbounds i32, ptr getelementptr inbounds (i8, ptr @haystack4, i64 0), i64 %index_val
14+
%h_vec = load <4 x i32>, ptr %haystack, align 4
15+
%cmp_vec = icmp eq <4 x i32> %h_vec, <i32 2, i32 2, i32 2, i32 2>
16+
; Round-trip the i1 vector through memory before extracting a lane.
store <4 x i1> %cmp_vec, ptr %matches, align 1
17+
%cmp_load = load <4 x i1>, ptr %matches, align 1
18+
%extr = extractelement <4 x i1> %cmp_load, i64 2
19+
%ret = zext i1 %extr to i8
20+
ret i8 %ret
21+
}
22+
23+
; test16: compares a <16 x i8> loaded from @haystack16 against splat(2), stores
; the resulting <16 x i1> mask to a stack slot, reloads it, and returns lane 7
; zero-extended to i8. With the index of 0 used here, lane 7 of the loaded
; vector holds the value 7, so that lane of the compare is false.
; NOTE(review): this presumably covers the v16i8 special case of
; vectorToScalarBitmask, and the .LCPI1_0 constant pool checked further down
; presumably belongs to this function - confirm.
define i8 @test16() {
24+
%matches = alloca <16 x i1>, align 2
25+
%index_ptr = alloca i64, align 8
26+
; The index is written to and read back from a stack slot instead of being
; used as an immediate.
store i64 0, ptr %index_ptr, align 8
27+
%index_val = load i64, ptr %index_ptr, align 8
28+
%haystack = getelementptr inbounds i8, ptr getelementptr inbounds (i8, ptr @haystack16, i64 0), i64 %index_val
29+
%h_vec = load <16 x i8>, ptr %haystack, align 16
30+
%cmp_vec = icmp eq <16 x i8> %h_vec, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
31+
; Round-trip the i1 vector through memory before extracting a lane.
store <16 x i1> %cmp_vec, ptr %matches, align 2
32+
%cmp_load = load <16 x i1>, ptr %matches, align 2
33+
%extr = extractelement <16 x i1> %cmp_load, i64 7
34+
%ret = zext i1 %extr to i8
35+
ret i8 %ret
36+
}
37+
38+
; Little endian
39+
40+
; CHECK-LE-LABEL: .LCPI0_0:
41+
; CHECK-LE-NEXT: .word 1
42+
; CHECK-LE-NEXT: .word 2
43+
; CHECK-LE-NEXT: .word 4
44+
; CHECK-LE-NEXT: .word 8
45+
46+
; CHECK-LE-LABEL: .LCPI1_0:
47+
; CHECK-LE-NEXT: .byte 1
48+
; CHECK-LE-NEXT: .byte 2
49+
; CHECK-LE-NEXT: .byte 4
50+
; CHECK-LE-NEXT: .byte 8
51+
; CHECK-LE-NEXT: .byte 16
52+
; CHECK-LE-NEXT: .byte 32
53+
; CHECK-LE-NEXT: .byte 64
54+
; CHECK-LE-NEXT: .byte 128
55+
; CHECK-LE-NEXT: .byte 1
56+
; CHECK-LE-NEXT: .byte 2
57+
; CHECK-LE-NEXT: .byte 4
58+
; CHECK-LE-NEXT: .byte 8
59+
; CHECK-LE-NEXT: .byte 16
60+
; CHECK-LE-NEXT: .byte 32
61+
; CHECK-LE-NEXT: .byte 64
62+
; CHECK-LE-NEXT: .byte 128
63+
64+
65+
; Big endian
66+
67+
; CHECK-BE-LABEL: .LCPI0_0:
68+
; CHECK-BE-NEXT: .word 8
69+
; CHECK-BE-NEXT: .word 4
70+
; CHECK-BE-NEXT: .word 2
71+
; CHECK-BE-NEXT: .word 1
72+
73+
; CHECK-BE-LABEL: .LCPI1_0:
74+
; CHECK-BE-NEXT: .byte 128
75+
; CHECK-BE-NEXT: .byte 64
76+
; CHECK-BE-NEXT: .byte 32
77+
; CHECK-BE-NEXT: .byte 16
78+
; CHECK-BE-NEXT: .byte 8
79+
; CHECK-BE-NEXT: .byte 4
80+
; CHECK-BE-NEXT: .byte 2
81+
; CHECK-BE-NEXT: .byte 1
82+
; CHECK-BE-NEXT: .byte 128
83+
; CHECK-BE-NEXT: .byte 64
84+
; CHECK-BE-NEXT: .byte 32
85+
; CHECK-BE-NEXT: .byte 16
86+
; CHECK-BE-NEXT: .byte 8
87+
; CHECK-BE-NEXT: .byte 4
88+
; CHECK-BE-NEXT: .byte 2
89+
; CHECK-BE-NEXT: .byte 1

0 commit comments

Comments
 (0)