Skip to content

Commit 85921fc

Browse files
committed
[AArch64][SVE] Improve VLS imm addressing modes.
When compiling VLS SVE, the compiler often replaces VL-based offsets with immediate-based ones. This leads to a mismatch during isel since SVE loads/stores generally expect immediate offsets relative to VL. For example, given:

```c
svfloat64_t foo(const double *x) {
  svbool_t pg = svptrue_b64();
  return svld1_f64(pg, x+svcntd());
}
```

When compiled with `-msve-vector-bits=128`, we currently generate:

```gas
foo:
        ptrue   p0.d
        mov     x8, #2
        ld1d    { z0.d }, p0/z, [x0, x8, lsl #3]
        ret
```

In practice, we could instead be generating:

```gas
foo:
        ldr     z0, [x0, #1, mul vl]
        ret
```

Likewise for other types, stores, and other VLS lengths.
1 parent 624d1e9 commit 85921fc

File tree

4 files changed

+105
-134
lines changed

4 files changed

+105
-134
lines changed

clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,9 @@
1313

1414
void func(int *restrict a, int *restrict b) {
1515
// CHECK-LABEL: func
16-
// CHECK256-COUNT-1: str
17-
// CHECK256-COUNT-7: st1w
18-
// CHECK512-COUNT-1: str
19-
// CHECK512-COUNT-3: st1w
20-
// CHECK1024-COUNT-1: str
21-
// CHECK1024-COUNT-1: st1w
16+
// CHECK256-COUNT-8: str
17+
// CHECK512-COUNT-4: str
18+
// CHECK1024-COUNT-2: str
2219
// CHECK2048-COUNT-1: st1w
2320
#pragma clang loop vectorize(enable)
2421
for (int i = 0; i < 64; ++i)

llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7379,13 +7379,27 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
73797379
if (N.getOpcode() != ISD::ADD)
73807380
return false;
73817381

7382-
SDValue VScale = N.getOperand(1);
7383-
if (VScale.getOpcode() != ISD::VSCALE)
7382+
int64_t MulImm = std::numeric_limits<int64_t>::max();
7383+
if (SDValue VScale = N.getOperand(1); VScale.getOpcode() == ISD::VSCALE)
7384+
MulImm = cast<ConstantSDNode>(VScale.getOperand(0))->getSExtValue();
7385+
else if (auto C = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
7386+
int64_t ByteOffset = C->getSExtValue();
7387+
constexpr auto SVEBitsPerBlock = AArch64::SVEBitsPerBlock;
7388+
auto MinVScale = Subtarget->getMinSVEVectorSizeInBits() / SVEBitsPerBlock;
7389+
auto MaxVScale = Subtarget->getMaxSVEVectorSizeInBits() / SVEBitsPerBlock;
7390+
7391+
if (!MaxVScale || MinVScale != MaxVScale || ByteOffset % MaxVScale != 0)
7392+
return false;
7393+
7394+
MulImm = ByteOffset / MaxVScale;
7395+
} else
73847396
return false;
73857397

7398+
assert(MulImm != std::numeric_limits<int64_t>::max() &&
7399+
"Uninitialized MulImm.");
7400+
73867401
TypeSize TS = MemVT.getSizeInBits();
73877402
int64_t MemWidthBytes = static_cast<int64_t>(TS.getKnownMinValue()) / 8;
7388-
int64_t MulImm = cast<ConstantSDNode>(VScale.getOperand(0))->getSExtValue();
73897403

73907404
if ((MulImm % MemWidthBytes) != 0)
73917405
return false;

llvm/test/CodeGen/AArch64/sve-fixed-length-offsets.ll

Lines changed: 40 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -17,42 +17,32 @@ define void @nxv16i8(ptr %ldptr, ptr %stptr) {
1717
;
1818
; CHECK-128-LABEL: nxv16i8:
1919
; CHECK-128: // %bb.0:
20-
; CHECK-128-NEXT: ptrue p0.b
21-
; CHECK-128-NEXT: mov w8, #256 // =0x100
22-
; CHECK-128-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
23-
; CHECK-128-NEXT: st1b { z0.b }, p0, [x1, x8]
20+
; CHECK-128-NEXT: ldr z0, [x0, #16, mul vl]
21+
; CHECK-128-NEXT: str z0, [x1, #16, mul vl]
2422
; CHECK-128-NEXT: ret
2523
;
2624
; CHECK-256-LABEL: nxv16i8:
2725
; CHECK-256: // %bb.0:
28-
; CHECK-256-NEXT: ptrue p0.b
29-
; CHECK-256-NEXT: mov w8, #256 // =0x100
30-
; CHECK-256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
31-
; CHECK-256-NEXT: st1b { z0.b }, p0, [x1, x8]
26+
; CHECK-256-NEXT: ldr z0, [x0, #8, mul vl]
27+
; CHECK-256-NEXT: str z0, [x1, #8, mul vl]
3228
; CHECK-256-NEXT: ret
3329
;
3430
; CHECK-512-LABEL: nxv16i8:
3531
; CHECK-512: // %bb.0:
36-
; CHECK-512-NEXT: ptrue p0.b
37-
; CHECK-512-NEXT: mov w8, #256 // =0x100
38-
; CHECK-512-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
39-
; CHECK-512-NEXT: st1b { z0.b }, p0, [x1, x8]
32+
; CHECK-512-NEXT: ldr z0, [x0, #4, mul vl]
33+
; CHECK-512-NEXT: str z0, [x1, #4, mul vl]
4034
; CHECK-512-NEXT: ret
4135
;
4236
; CHECK-1024-LABEL: nxv16i8:
4337
; CHECK-1024: // %bb.0:
44-
; CHECK-1024-NEXT: ptrue p0.b
45-
; CHECK-1024-NEXT: mov w8, #256 // =0x100
46-
; CHECK-1024-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
47-
; CHECK-1024-NEXT: st1b { z0.b }, p0, [x1, x8]
38+
; CHECK-1024-NEXT: ldr z0, [x0, #2, mul vl]
39+
; CHECK-1024-NEXT: str z0, [x1, #2, mul vl]
4840
; CHECK-1024-NEXT: ret
4941
;
5042
; CHECK-2048-LABEL: nxv16i8:
5143
; CHECK-2048: // %bb.0:
52-
; CHECK-2048-NEXT: ptrue p0.b
53-
; CHECK-2048-NEXT: mov w8, #256 // =0x100
54-
; CHECK-2048-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
55-
; CHECK-2048-NEXT: st1b { z0.b }, p0, [x1, x8]
44+
; CHECK-2048-NEXT: ldr z0, [x0, #1, mul vl]
45+
; CHECK-2048-NEXT: str z0, [x1, #1, mul vl]
5646
; CHECK-2048-NEXT: ret
5747
%ldoff = getelementptr inbounds nuw i8, ptr %ldptr, i64 256
5848
%stoff = getelementptr inbounds nuw i8, ptr %stptr, i64 256
@@ -72,42 +62,32 @@ define void @nxv8i16(ptr %ldptr, ptr %stptr) {
7262
;
7363
; CHECK-128-LABEL: nxv8i16:
7464
; CHECK-128: // %bb.0:
75-
; CHECK-128-NEXT: ptrue p0.h
76-
; CHECK-128-NEXT: mov x8, #128 // =0x80
77-
; CHECK-128-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
78-
; CHECK-128-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
65+
; CHECK-128-NEXT: ldr z0, [x0, #16, mul vl]
66+
; CHECK-128-NEXT: str z0, [x1, #16, mul vl]
7967
; CHECK-128-NEXT: ret
8068
;
8169
; CHECK-256-LABEL: nxv8i16:
8270
; CHECK-256: // %bb.0:
83-
; CHECK-256-NEXT: ptrue p0.h
84-
; CHECK-256-NEXT: mov x8, #128 // =0x80
85-
; CHECK-256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
86-
; CHECK-256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
71+
; CHECK-256-NEXT: ldr z0, [x0, #8, mul vl]
72+
; CHECK-256-NEXT: str z0, [x1, #8, mul vl]
8773
; CHECK-256-NEXT: ret
8874
;
8975
; CHECK-512-LABEL: nxv8i16:
9076
; CHECK-512: // %bb.0:
91-
; CHECK-512-NEXT: ptrue p0.h
92-
; CHECK-512-NEXT: mov x8, #128 // =0x80
93-
; CHECK-512-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
94-
; CHECK-512-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
77+
; CHECK-512-NEXT: ldr z0, [x0, #4, mul vl]
78+
; CHECK-512-NEXT: str z0, [x1, #4, mul vl]
9579
; CHECK-512-NEXT: ret
9680
;
9781
; CHECK-1024-LABEL: nxv8i16:
9882
; CHECK-1024: // %bb.0:
99-
; CHECK-1024-NEXT: ptrue p0.h
100-
; CHECK-1024-NEXT: mov x8, #128 // =0x80
101-
; CHECK-1024-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
102-
; CHECK-1024-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
83+
; CHECK-1024-NEXT: ldr z0, [x0, #2, mul vl]
84+
; CHECK-1024-NEXT: str z0, [x1, #2, mul vl]
10385
; CHECK-1024-NEXT: ret
10486
;
10587
; CHECK-2048-LABEL: nxv8i16:
10688
; CHECK-2048: // %bb.0:
107-
; CHECK-2048-NEXT: ptrue p0.h
108-
; CHECK-2048-NEXT: mov x8, #128 // =0x80
109-
; CHECK-2048-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
110-
; CHECK-2048-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1]
89+
; CHECK-2048-NEXT: ldr z0, [x0, #1, mul vl]
90+
; CHECK-2048-NEXT: str z0, [x1, #1, mul vl]
11191
; CHECK-2048-NEXT: ret
11292
%ldoff = getelementptr inbounds nuw i16, ptr %ldptr, i64 128
11393
%stoff = getelementptr inbounds nuw i16, ptr %stptr, i64 128
@@ -127,42 +107,32 @@ define void @nxv4i32(ptr %ldptr, ptr %stptr) {
127107
;
128108
; CHECK-128-LABEL: nxv4i32:
129109
; CHECK-128: // %bb.0:
130-
; CHECK-128-NEXT: ptrue p0.s
131-
; CHECK-128-NEXT: mov x8, #64 // =0x40
132-
; CHECK-128-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
133-
; CHECK-128-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
110+
; CHECK-128-NEXT: ldr z0, [x0, #16, mul vl]
111+
; CHECK-128-NEXT: str z0, [x1, #16, mul vl]
134112
; CHECK-128-NEXT: ret
135113
;
136114
; CHECK-256-LABEL: nxv4i32:
137115
; CHECK-256: // %bb.0:
138-
; CHECK-256-NEXT: ptrue p0.s
139-
; CHECK-256-NEXT: mov x8, #64 // =0x40
140-
; CHECK-256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
141-
; CHECK-256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
116+
; CHECK-256-NEXT: ldr z0, [x0, #8, mul vl]
117+
; CHECK-256-NEXT: str z0, [x1, #8, mul vl]
142118
; CHECK-256-NEXT: ret
143119
;
144120
; CHECK-512-LABEL: nxv4i32:
145121
; CHECK-512: // %bb.0:
146-
; CHECK-512-NEXT: ptrue p0.s
147-
; CHECK-512-NEXT: mov x8, #64 // =0x40
148-
; CHECK-512-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
149-
; CHECK-512-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
122+
; CHECK-512-NEXT: ldr z0, [x0, #4, mul vl]
123+
; CHECK-512-NEXT: str z0, [x1, #4, mul vl]
150124
; CHECK-512-NEXT: ret
151125
;
152126
; CHECK-1024-LABEL: nxv4i32:
153127
; CHECK-1024: // %bb.0:
154-
; CHECK-1024-NEXT: ptrue p0.s
155-
; CHECK-1024-NEXT: mov x8, #64 // =0x40
156-
; CHECK-1024-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
157-
; CHECK-1024-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
128+
; CHECK-1024-NEXT: ldr z0, [x0, #2, mul vl]
129+
; CHECK-1024-NEXT: str z0, [x1, #2, mul vl]
158130
; CHECK-1024-NEXT: ret
159131
;
160132
; CHECK-2048-LABEL: nxv4i32:
161133
; CHECK-2048: // %bb.0:
162-
; CHECK-2048-NEXT: ptrue p0.s
163-
; CHECK-2048-NEXT: mov x8, #64 // =0x40
164-
; CHECK-2048-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
165-
; CHECK-2048-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2]
134+
; CHECK-2048-NEXT: ldr z0, [x0, #1, mul vl]
135+
; CHECK-2048-NEXT: str z0, [x1, #1, mul vl]
166136
; CHECK-2048-NEXT: ret
167137
%ldoff = getelementptr inbounds nuw i32, ptr %ldptr, i64 64
168138
%stoff = getelementptr inbounds nuw i32, ptr %stptr, i64 64
@@ -182,42 +152,32 @@ define void @nxv2i64(ptr %ldptr, ptr %stptr) {
182152
;
183153
; CHECK-128-LABEL: nxv2i64:
184154
; CHECK-128: // %bb.0:
185-
; CHECK-128-NEXT: ptrue p0.d
186-
; CHECK-128-NEXT: mov x8, #32 // =0x20
187-
; CHECK-128-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
188-
; CHECK-128-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
155+
; CHECK-128-NEXT: ldr z0, [x0, #16, mul vl]
156+
; CHECK-128-NEXT: str z0, [x1, #16, mul vl]
189157
; CHECK-128-NEXT: ret
190158
;
191159
; CHECK-256-LABEL: nxv2i64:
192160
; CHECK-256: // %bb.0:
193-
; CHECK-256-NEXT: ptrue p0.d
194-
; CHECK-256-NEXT: mov x8, #32 // =0x20
195-
; CHECK-256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
196-
; CHECK-256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
161+
; CHECK-256-NEXT: ldr z0, [x0, #8, mul vl]
162+
; CHECK-256-NEXT: str z0, [x1, #8, mul vl]
197163
; CHECK-256-NEXT: ret
198164
;
199165
; CHECK-512-LABEL: nxv2i64:
200166
; CHECK-512: // %bb.0:
201-
; CHECK-512-NEXT: ptrue p0.d
202-
; CHECK-512-NEXT: mov x8, #32 // =0x20
203-
; CHECK-512-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
204-
; CHECK-512-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
167+
; CHECK-512-NEXT: ldr z0, [x0, #4, mul vl]
168+
; CHECK-512-NEXT: str z0, [x1, #4, mul vl]
205169
; CHECK-512-NEXT: ret
206170
;
207171
; CHECK-1024-LABEL: nxv2i64:
208172
; CHECK-1024: // %bb.0:
209-
; CHECK-1024-NEXT: ptrue p0.d
210-
; CHECK-1024-NEXT: mov x8, #32 // =0x20
211-
; CHECK-1024-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
212-
; CHECK-1024-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
173+
; CHECK-1024-NEXT: ldr z0, [x0, #2, mul vl]
174+
; CHECK-1024-NEXT: str z0, [x1, #2, mul vl]
213175
; CHECK-1024-NEXT: ret
214176
;
215177
; CHECK-2048-LABEL: nxv2i64:
216178
; CHECK-2048: // %bb.0:
217-
; CHECK-2048-NEXT: ptrue p0.d
218-
; CHECK-2048-NEXT: mov x8, #32 // =0x20
219-
; CHECK-2048-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
220-
; CHECK-2048-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3]
179+
; CHECK-2048-NEXT: ldr z0, [x0, #1, mul vl]
180+
; CHECK-2048-NEXT: str z0, [x1, #1, mul vl]
221181
; CHECK-2048-NEXT: ret
222182
%ldoff = getelementptr inbounds nuw i64, ptr %ldptr, i64 32
223183
%stoff = getelementptr inbounds nuw i64, ptr %stptr, i64 32

llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll

Lines changed: 45 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -30,64 +30,64 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang
3030
; CHECK-NEXT: // %bb.1: // %vector.body
3131
; CHECK-NEXT: mov z0.b, #0 // =0x0
3232
; CHECK-NEXT: ptrue p0.s
33-
; CHECK-NEXT: mov x9, #8 // =0x8
34-
; CHECK-NEXT: mov x10, #24 // =0x18
33+
; CHECK-NEXT: mov x9, #24 // =0x18
3534
; CHECK-NEXT: umov w8, v0.b[8]
36-
; CHECK-NEXT: mov v1.16b, v0.16b
37-
; CHECK-NEXT: mov v1.b[1], v0.b[1]
38-
; CHECK-NEXT: fmov s2, w8
39-
; CHECK-NEXT: mov x8, #16 // =0x10
40-
; CHECK-NEXT: mov v2.b[1], v0.b[9]
41-
; CHECK-NEXT: mov v1.b[2], v0.b[2]
42-
; CHECK-NEXT: mov v2.b[2], v0.b[10]
43-
; CHECK-NEXT: mov v1.b[3], v0.b[3]
44-
; CHECK-NEXT: mov v2.b[3], v0.b[11]
45-
; CHECK-NEXT: mov v1.b[4], v0.b[4]
46-
; CHECK-NEXT: mov v2.b[4], v0.b[12]
47-
; CHECK-NEXT: mov v1.b[5], v0.b[5]
48-
; CHECK-NEXT: mov v2.b[5], v0.b[13]
49-
; CHECK-NEXT: mov v1.b[6], v0.b[6]
50-
; CHECK-NEXT: mov v2.b[6], v0.b[14]
51-
; CHECK-NEXT: mov v1.b[7], v0.b[7]
52-
; CHECK-NEXT: mov v2.b[7], v0.b[15]
53-
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16
54-
; CHECK-NEXT: uunpklo z1.h, z1.b
55-
; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
56-
; CHECK-NEXT: uunpklo z0.h, z0.b
35+
; CHECK-NEXT: mov v2.16b, v0.16b
36+
; CHECK-NEXT: mov z3.d, z0.d
37+
; CHECK-NEXT: mov v2.b[1], v0.b[1]
38+
; CHECK-NEXT: ext z3.b, z3.b, z0.b, #16
39+
; CHECK-NEXT: fmov s1, w8
40+
; CHECK-NEXT: mov x8, #8 // =0x8
41+
; CHECK-NEXT: ext v4.16b, v3.16b, v3.16b, #8
42+
; CHECK-NEXT: mov v1.b[1], v0.b[9]
43+
; CHECK-NEXT: mov v2.b[2], v0.b[2]
44+
; CHECK-NEXT: mov v1.b[2], v0.b[10]
45+
; CHECK-NEXT: mov v2.b[3], v0.b[3]
46+
; CHECK-NEXT: mov v1.b[3], v0.b[11]
47+
; CHECK-NEXT: mov v2.b[4], v0.b[4]
48+
; CHECK-NEXT: mov v1.b[4], v0.b[12]
49+
; CHECK-NEXT: mov v2.b[5], v0.b[5]
50+
; CHECK-NEXT: mov v1.b[5], v0.b[13]
51+
; CHECK-NEXT: mov v2.b[6], v0.b[6]
52+
; CHECK-NEXT: mov v1.b[6], v0.b[14]
53+
; CHECK-NEXT: mov v2.b[7], v0.b[7]
54+
; CHECK-NEXT: mov v1.b[7], v0.b[15]
5755
; CHECK-NEXT: uunpklo z2.h, z2.b
58-
; CHECK-NEXT: uunpklo z1.s, z1.h
59-
; CHECK-NEXT: uunpklo z3.h, z3.b
60-
; CHECK-NEXT: uunpklo z0.s, z0.h
56+
; CHECK-NEXT: uunpklo z0.h, z1.b
57+
; CHECK-NEXT: uunpklo z1.h, z3.b
58+
; CHECK-NEXT: uunpklo z3.h, z4.b
6159
; CHECK-NEXT: uunpklo z2.s, z2.h
62-
; CHECK-NEXT: lsl z1.s, z1.s, #31
60+
; CHECK-NEXT: uunpklo z0.s, z0.h
61+
; CHECK-NEXT: uunpklo z1.s, z1.h
6362
; CHECK-NEXT: uunpklo z3.s, z3.h
64-
; CHECK-NEXT: lsl z0.s, z0.s, #31
65-
; CHECK-NEXT: asr z1.s, z1.s, #31
6663
; CHECK-NEXT: lsl z2.s, z2.s, #31
67-
; CHECK-NEXT: asr z0.s, z0.s, #31
68-
; CHECK-NEXT: and z1.s, z1.s, #0x1
64+
; CHECK-NEXT: lsl z0.s, z0.s, #31
65+
; CHECK-NEXT: lsl z1.s, z1.s, #31
6966
; CHECK-NEXT: lsl z3.s, z3.s, #31
7067
; CHECK-NEXT: asr z2.s, z2.s, #31
71-
; CHECK-NEXT: and z0.s, z0.s, #0x1
72-
; CHECK-NEXT: cmpne p4.s, p0/z, z1.s, #0
73-
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
68+
; CHECK-NEXT: asr z0.s, z0.s, #31
69+
; CHECK-NEXT: asr z1.s, z1.s, #31
7470
; CHECK-NEXT: asr z3.s, z3.s, #31
7571
; CHECK-NEXT: and z2.s, z2.s, #0x1
76-
; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0
77-
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
72+
; CHECK-NEXT: and z0.s, z0.s, #0x1
73+
; CHECK-NEXT: and z1.s, z1.s, #0x1
7874
; CHECK-NEXT: and z3.s, z3.s, #0x1
79-
; CHECK-NEXT: cmpne p2.s, p0/z, z2.s, #0
80-
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x9, lsl #2]
81-
; CHECK-NEXT: mov z1.s, p4/m, #0 // =0x0
75+
; CHECK-NEXT: cmpne p4.s, p0/z, z2.s, #0
76+
; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0]
77+
; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0
78+
; CHECK-NEXT: cmpne p2.s, p0/z, z1.s, #0
8279
; CHECK-NEXT: cmpne p3.s, p0/z, z3.s, #0
83-
; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2]
80+
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
81+
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, #1, mul vl]
82+
; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, x9, lsl #2]
83+
; CHECK-NEXT: mov z2.s, p4/m, #0 // =0x0
8484
; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0
85-
; CHECK-NEXT: mov z2.s, p2/m, #0 // =0x0
86-
; CHECK-NEXT: st1w { z1.s }, p0, [x0]
87-
; CHECK-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
85+
; CHECK-NEXT: mov z1.s, p2/m, #0 // =0x0
8886
; CHECK-NEXT: mov z3.s, p3/m, #0 // =0x0
89-
; CHECK-NEXT: st1w { z2.s }, p0, [x0, x9, lsl #2]
90-
; CHECK-NEXT: st1w { z3.s }, p0, [x0, x10, lsl #2]
87+
; CHECK-NEXT: st1w { z2.s }, p0, [x0]
88+
; CHECK-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
89+
; CHECK-NEXT: st1w { z1.s }, p0, [x0, #1, mul vl]
90+
; CHECK-NEXT: st1w { z3.s }, p0, [x0, x9, lsl #2]
9191
; CHECK-NEXT: .LBB1_2: // %exit
9292
; CHECK-NEXT: ret
9393
%broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer

0 commit comments

Comments
 (0)