Skip to content

Commit 02d3507

Browse files
committed
Fixups
1 parent 39f9815 commit 02d3507

File tree

2 files changed

+171
-7
lines changed

2 files changed

+171
-7
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 22 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -345,9 +345,28 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
345345

346346
bool isLegalMaskedCompressStore(Type *DataType,
347347
Align Alignment) const override {
348-
ElementCount EC = cast<VectorType>(DataType)->getElementCount();
349-
if (EC.getKnownMinValue() != 2 && EC.getKnownMinValue() != 4)
350-
return false;
348+
auto VecTy = cast<VectorType>(DataType);
349+
Type *ElTy = VecTy->getScalarType();
350+
unsigned ElSizeInBits = ElTy->getScalarSizeInBits();
351+
TypeSize VecSizeInBits = VecTy->getPrimitiveSizeInBits();
352+
353+
if (isa<FixedVectorType>(VecTy)) {
354+
// Each 128-bit segment must contain 2 or 4 elements (packed).
355+
if (ElSizeInBits != 32 && ElSizeInBits != 64)
356+
return false;
357+
if (VecSizeInBits % 128 != 0 ||
358+
VecSizeInBits > std::max(128U, ST->getMinSVEVectorSizeInBits()))
359+
return false;
360+
} else {
361+
// Each segment must contain 2 or 4 elements, but the segments can be
362+
// < 128-bits for unpacked vector types.
363+
if (VecSizeInBits.getKnownMinValue() > 128)
364+
return false;
365+
unsigned ElementsPerSegment =
366+
VecSizeInBits.getKnownMinValue() / ElSizeInBits;
367+
if (ElementsPerSegment != 2 && ElementsPerSegment != 4)
368+
return false;
369+
}
351370

352371
if (!isElementTypeLegalForCompressStore(DataType->getScalarType()))
353372
return false;

llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll

Lines changed: 149 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2-
; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s
2+
; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,CHECK-BASE
3+
; RUN: llc -mtriple=aarch64 -aarch64-sve-vector-bits-min=256 -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,CHECK-VL256
34

45
;; Full SVE vectors (supported with +sve)
56

@@ -42,7 +43,6 @@ define void @test_compressstore_nxv4f32(ptr %p, <vscale x 4 x float> %vec, <vsca
4243
ret void
4344
}
4445

45-
; TODO: Legal and nonstreaming check
4646
define void @test_compressstore_nxv2f64(ptr %p, <vscale x 2 x double> %vec, <vscale x 2 x i1> %mask) {
4747
; CHECK-LABEL: test_compressstore_nxv2f64:
4848
; CHECK: // %bb.0:
@@ -56,6 +56,21 @@ define void @test_compressstore_nxv2f64(ptr %p, <vscale x 2 x double> %vec, <vsc
5656
ret void
5757
}
5858

59+
;; Unpacked SVE vector types
60+
61+
define void @test_compressstore_nxv2f32(ptr %p, <vscale x 2 x float> %vec, <vscale x 2 x i1> %mask) {
62+
; CHECK-LABEL: test_compressstore_nxv2f32:
63+
; CHECK: // %bb.0:
64+
; CHECK-NEXT: ptrue p1.d
65+
; CHECK-NEXT: compact z0.d, p0, z0.d
66+
; CHECK-NEXT: cntp x8, p1, p0.d
67+
; CHECK-NEXT: whilelo p0.d, xzr, x8
68+
; CHECK-NEXT: st1w { z0.d }, p0, [x0]
69+
; CHECK-NEXT: ret
70+
tail call void @llvm.masked.compressstore.nxv2f32(<vscale x 2 x float> %vec, ptr align 4 %p, <vscale x 2 x i1> %mask)
71+
ret void
72+
}
73+
5974
;; SVE vector types promoted to 32/64-bit (non-exhaustive)
6075

6176
define void @test_compressstore_nxv2i8(ptr %p, <vscale x 2 x i8> %vec, <vscale x 2 x i1> %mask) {
@@ -86,8 +101,8 @@ define void @test_compressstore_nxv4i16(ptr %p, <vscale x 4 x i16> %vec, <vscale
86101

87102
;; NEON vector types (promoted to SVE)
88103

89-
define void @test_compressstore_v2f32(ptr %p, <2 x double> %vec, <2 x i1> %mask) {
90-
; CHECK-LABEL: test_compressstore_v2f32:
104+
define void @test_compressstore_v2f64(ptr %p, <2 x double> %vec, <2 x i1> %mask) {
105+
; CHECK-LABEL: test_compressstore_v2f64:
91106
; CHECK: // %bb.0:
92107
; CHECK-NEXT: ushll v1.2d, v1.2s, #0
93108
; CHECK-NEXT: ptrue p0.d, vl2
@@ -139,3 +154,133 @@ define void @test_compressstore_v2i64(ptr %p, <2 x i64> %vec, <2 x i1> %mask) {
139154
tail call void @llvm.masked.compressstore.v2i64(<2 x i64> %vec, ptr align 8 %p, <2 x i1> %mask)
140155
ret void
141156
}
157+
158+
define void @test_compressstore_v8i32(ptr %p, <8 x i32> %vec, <8 x i1> %mask) {
159+
; CHECK-BASE-LABEL: test_compressstore_v8i32:
160+
; CHECK-BASE: // %bb.0:
161+
; CHECK-BASE-NEXT: shl v2.8b, v2.8b, #7
162+
; CHECK-BASE-NEXT: adrp x8, .LCPI10_0
163+
; CHECK-BASE-NEXT: ldr d3, [x8, :lo12:.LCPI10_0]
164+
; CHECK-BASE-NEXT: cmlt v2.8b, v2.8b, #0
165+
; CHECK-BASE-NEXT: and v2.8b, v2.8b, v3.8b
166+
; CHECK-BASE-NEXT: addv b2, v2.8b
167+
; CHECK-BASE-NEXT: fmov w8, s2
168+
; CHECK-BASE-NEXT: tbnz w8, #0, .LBB10_9
169+
; CHECK-BASE-NEXT: // %bb.1: // %else
170+
; CHECK-BASE-NEXT: tbnz w8, #1, .LBB10_10
171+
; CHECK-BASE-NEXT: .LBB10_2: // %else2
172+
; CHECK-BASE-NEXT: tbnz w8, #2, .LBB10_11
173+
; CHECK-BASE-NEXT: .LBB10_3: // %else5
174+
; CHECK-BASE-NEXT: tbnz w8, #3, .LBB10_12
175+
; CHECK-BASE-NEXT: .LBB10_4: // %else8
176+
; CHECK-BASE-NEXT: tbnz w8, #4, .LBB10_13
177+
; CHECK-BASE-NEXT: .LBB10_5: // %else11
178+
; CHECK-BASE-NEXT: tbnz w8, #5, .LBB10_14
179+
; CHECK-BASE-NEXT: .LBB10_6: // %else14
180+
; CHECK-BASE-NEXT: tbnz w8, #6, .LBB10_15
181+
; CHECK-BASE-NEXT: .LBB10_7: // %else17
182+
; CHECK-BASE-NEXT: tbnz w8, #7, .LBB10_16
183+
; CHECK-BASE-NEXT: .LBB10_8: // %else20
184+
; CHECK-BASE-NEXT: ret
185+
; CHECK-BASE-NEXT: .LBB10_9: // %cond.store
186+
; CHECK-BASE-NEXT: st1 { v0.s }[0], [x0], #4
187+
; CHECK-BASE-NEXT: tbz w8, #1, .LBB10_2
188+
; CHECK-BASE-NEXT: .LBB10_10: // %cond.store1
189+
; CHECK-BASE-NEXT: st1 { v0.s }[1], [x0], #4
190+
; CHECK-BASE-NEXT: tbz w8, #2, .LBB10_3
191+
; CHECK-BASE-NEXT: .LBB10_11: // %cond.store4
192+
; CHECK-BASE-NEXT: st1 { v0.s }[2], [x0], #4
193+
; CHECK-BASE-NEXT: tbz w8, #3, .LBB10_4
194+
; CHECK-BASE-NEXT: .LBB10_12: // %cond.store7
195+
; CHECK-BASE-NEXT: st1 { v0.s }[3], [x0], #4
196+
; CHECK-BASE-NEXT: tbz w8, #4, .LBB10_5
197+
; CHECK-BASE-NEXT: .LBB10_13: // %cond.store10
198+
; CHECK-BASE-NEXT: st1 { v1.s }[0], [x0], #4
199+
; CHECK-BASE-NEXT: tbz w8, #5, .LBB10_6
200+
; CHECK-BASE-NEXT: .LBB10_14: // %cond.store13
201+
; CHECK-BASE-NEXT: st1 { v1.s }[1], [x0], #4
202+
; CHECK-BASE-NEXT: tbz w8, #6, .LBB10_7
203+
; CHECK-BASE-NEXT: .LBB10_15: // %cond.store16
204+
; CHECK-BASE-NEXT: st1 { v1.s }[2], [x0], #4
205+
; CHECK-BASE-NEXT: tbz w8, #7, .LBB10_8
206+
; CHECK-BASE-NEXT: .LBB10_16: // %cond.store19
207+
; CHECK-BASE-NEXT: st1 { v1.s }[3], [x0]
208+
; CHECK-BASE-NEXT: ret
209+
;
210+
; CHECK-VL256-LABEL: test_compressstore_v8i32:
211+
; CHECK-VL256: // %bb.0:
212+
; CHECK-VL256-NEXT: // kill: def $d2 killed $d2 def $z2
213+
; CHECK-VL256-NEXT: ptrue p0.s, vl8
214+
; CHECK-VL256-NEXT: // kill: def $q0 killed $q0 def $z0
215+
; CHECK-VL256-NEXT: // kill: def $q1 killed $q1 def $z1
216+
; CHECK-VL256-NEXT: uunpklo z2.h, z2.b
217+
; CHECK-VL256-NEXT: ptrue p1.s, vl4
218+
; CHECK-VL256-NEXT: splice z0.s, p1, z0.s, z1.s
219+
; CHECK-VL256-NEXT: ptrue p1.s
220+
; CHECK-VL256-NEXT: uunpklo z2.s, z2.h
221+
; CHECK-VL256-NEXT: lsl z2.s, z2.s, #31
222+
; CHECK-VL256-NEXT: asr z2.s, z2.s, #31
223+
; CHECK-VL256-NEXT: cmpne p0.s, p0/z, z2.s, #0
224+
; CHECK-VL256-NEXT: cntp x8, p1, p0.s
225+
; CHECK-VL256-NEXT: compact z0.s, p0, z0.s
226+
; CHECK-VL256-NEXT: whilelo p0.s, xzr, x8
227+
; CHECK-VL256-NEXT: st1w { z0.s }, p0, [x0]
228+
; CHECK-VL256-NEXT: ret
229+
tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %vec, ptr align 4 %p, <8 x i1> %mask)
230+
ret void
231+
}
232+
233+
define void @test_compressstore_v4i64(ptr %p, <4 x i64> %vec, <4 x i1> %mask) {
234+
; CHECK-BASE-LABEL: test_compressstore_v4i64:
235+
; CHECK-BASE: // %bb.0:
236+
; CHECK-BASE-NEXT: shl v2.4h, v2.4h, #15
237+
; CHECK-BASE-NEXT: adrp x8, .LCPI11_0
238+
; CHECK-BASE-NEXT: ldr d3, [x8, :lo12:.LCPI11_0]
239+
; CHECK-BASE-NEXT: cmlt v2.4h, v2.4h, #0
240+
; CHECK-BASE-NEXT: and v2.8b, v2.8b, v3.8b
241+
; CHECK-BASE-NEXT: addv h2, v2.4h
242+
; CHECK-BASE-NEXT: fmov w8, s2
243+
; CHECK-BASE-NEXT: tbnz w8, #0, .LBB11_5
244+
; CHECK-BASE-NEXT: // %bb.1: // %else
245+
; CHECK-BASE-NEXT: tbnz w8, #1, .LBB11_6
246+
; CHECK-BASE-NEXT: .LBB11_2: // %else2
247+
; CHECK-BASE-NEXT: tbnz w8, #2, .LBB11_7
248+
; CHECK-BASE-NEXT: .LBB11_3: // %else5
249+
; CHECK-BASE-NEXT: tbnz w8, #3, .LBB11_8
250+
; CHECK-BASE-NEXT: .LBB11_4: // %else8
251+
; CHECK-BASE-NEXT: ret
252+
; CHECK-BASE-NEXT: .LBB11_5: // %cond.store
253+
; CHECK-BASE-NEXT: st1 { v0.d }[0], [x0], #8
254+
; CHECK-BASE-NEXT: tbz w8, #1, .LBB11_2
255+
; CHECK-BASE-NEXT: .LBB11_6: // %cond.store1
256+
; CHECK-BASE-NEXT: st1 { v0.d }[1], [x0], #8
257+
; CHECK-BASE-NEXT: tbz w8, #2, .LBB11_3
258+
; CHECK-BASE-NEXT: .LBB11_7: // %cond.store4
259+
; CHECK-BASE-NEXT: st1 { v1.d }[0], [x0], #8
260+
; CHECK-BASE-NEXT: tbz w8, #3, .LBB11_4
261+
; CHECK-BASE-NEXT: .LBB11_8: // %cond.store7
262+
; CHECK-BASE-NEXT: st1 { v1.d }[1], [x0]
263+
; CHECK-BASE-NEXT: ret
264+
;
265+
; CHECK-VL256-LABEL: test_compressstore_v4i64:
266+
; CHECK-VL256: // %bb.0:
267+
; CHECK-VL256-NEXT: // kill: def $d2 killed $d2 def $z2
268+
; CHECK-VL256-NEXT: ptrue p0.d, vl4
269+
; CHECK-VL256-NEXT: // kill: def $q0 killed $q0 def $z0
270+
; CHECK-VL256-NEXT: // kill: def $q1 killed $q1 def $z1
271+
; CHECK-VL256-NEXT: uunpklo z2.s, z2.h
272+
; CHECK-VL256-NEXT: ptrue p1.d, vl2
273+
; CHECK-VL256-NEXT: splice z0.d, p1, z0.d, z1.d
274+
; CHECK-VL256-NEXT: ptrue p1.d
275+
; CHECK-VL256-NEXT: uunpklo z2.d, z2.s
276+
; CHECK-VL256-NEXT: lsl z2.d, z2.d, #63
277+
; CHECK-VL256-NEXT: asr z2.d, z2.d, #63
278+
; CHECK-VL256-NEXT: cmpne p0.d, p0/z, z2.d, #0
279+
; CHECK-VL256-NEXT: cntp x8, p1, p0.d
280+
; CHECK-VL256-NEXT: compact z0.d, p0, z0.d
281+
; CHECK-VL256-NEXT: whilelo p0.d, xzr, x8
282+
; CHECK-VL256-NEXT: st1d { z0.d }, p0, [x0]
283+
; CHECK-VL256-NEXT: ret
284+
tail call void @llvm.masked.compressstore.v4i64(<4 x i64> %vec, ptr align 8 %p, <4 x i1> %mask)
285+
ret void
286+
}

0 commit comments

Comments
 (0)