Skip to content

Commit 416289e

Browse files
committed
Fixups
1 parent a980f8a commit 416289e

File tree

4 files changed

+89
-118
lines changed

4 files changed

+89
-118
lines changed

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10607,20 +10607,25 @@ TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask,
1060710607
assert(DataVT.getVectorElementCount() == MaskVT.getVectorElementCount() &&
1060810608
"Incompatible types of Data and Mask");
1060910609
if (IsCompressedMemory) {
10610-
if (DataVT.isScalableVector())
10611-
report_fatal_error(
10612-
"Cannot currently handle compressed memory with scalable vectors");
1061310610
// Increment the pointer according to the number of '1's in the mask.
10614-
EVT MaskIntVT = EVT::getIntegerVT(*DAG.getContext(), MaskVT.getSizeInBits());
10615-
SDValue MaskInIntReg = DAG.getBitcast(MaskIntVT, Mask);
10616-
if (MaskIntVT.getSizeInBits() < 32) {
10617-
MaskInIntReg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskInIntReg);
10618-
MaskIntVT = MVT::i32;
10611+
if (DataVT.isScalableVector()) {
10612+
EVT MaskExtVT = MaskVT.changeElementType(MVT::i32);
10613+
SDValue MaskExt = DAG.getNode(ISD::ZERO_EXTEND, DL, MaskExtVT, Mask);
10614+
Increment = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, MaskExt);
10615+
} else {
10616+
EVT MaskIntVT =
10617+
EVT::getIntegerVT(*DAG.getContext(), MaskVT.getSizeInBits());
10618+
SDValue MaskInIntReg = DAG.getBitcast(MaskIntVT, Mask);
10619+
if (MaskIntVT.getSizeInBits() < 32) {
10620+
MaskInIntReg =
10621+
DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskInIntReg);
10622+
MaskIntVT = MVT::i32;
10623+
}
10624+
Increment = DAG.getNode(ISD::CTPOP, DL, MaskIntVT, MaskInIntReg);
1061910625
}
1062010626

10621-
// Count '1's with POPCNT.
10622-
Increment = DAG.getNode(ISD::CTPOP, DL, MaskIntVT, MaskInIntReg);
1062310627
Increment = DAG.getZExtOrTrunc(Increment, DL, AddrVT);
10628+
1062410629
// Scale is an element size in bytes.
1062510630
SDValue Scale = DAG.getConstant(DataVT.getScalarSizeInBits() / 8, DL,
1062610631
AddrVT);

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1986,11 +1986,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
19861986
for (auto VT : {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64,
19871987
MVT::nxv2f32, MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16,
19881988
MVT::nxv4i32, MVT::nxv4f32}) {
1989-
setOperationAction(ISD::MSTORE, VT, Custom);
1989+
setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
19901990
// Use a custom lowering for masked stores that could be a supported
19911991
// compressing store. Note: These types still use the normal (Legal)
19921992
// lowering for non-compressing masked stores.
1993-
setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
1993+
setOperationAction(ISD::MSTORE, VT, Custom);
19941994
}
19951995

19961996
// If we have SVE, we can use SVE logic for legal (or smaller than legal)

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 7 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -333,45 +333,20 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
333333
}
334334

335335
bool isElementTypeLegalForCompressStore(Type *Ty) const {
336-
if (Ty->isFloatTy() || Ty->isDoubleTy())
337-
return true;
338-
339-
if (Ty->isIntegerTy(8) || Ty->isIntegerTy(16) || Ty->isIntegerTy(32) ||
340-
Ty->isIntegerTy(64))
341-
return true;
342-
343-
return false;
336+
return Ty->isFloatTy() || Ty->isDoubleTy() || Ty->isIntegerTy(32) ||
337+
Ty->isIntegerTy(64);
344338
}
345339

346340
bool isLegalMaskedCompressStore(Type *DataType,
347341
Align Alignment) const override {
348-
auto VecTy = cast<VectorType>(DataType);
349-
Type *ElTy = VecTy->getScalarType();
350-
unsigned ElSizeInBits = ElTy->getScalarSizeInBits();
351-
TypeSize VecSizeInBits = VecTy->getPrimitiveSizeInBits();
352-
353-
if (isa<FixedVectorType>(VecTy)) {
354-
// Each 128-bit segment must contain 2 or 4 elements (packed).
355-
if (ElSizeInBits != 32 && ElSizeInBits != 64)
356-
return false;
357-
if (VecSizeInBits % 128 != 0 ||
358-
VecSizeInBits > std::max(128U, ST->getMinSVEVectorSizeInBits()))
359-
return false;
360-
} else {
361-
// Each segment must contain 2 or 4 elements, but the segments can be
362-
// < 128-bits for unpacked vector types.
363-
if (VecSizeInBits.getKnownMinValue() > 128)
364-
return false;
365-
unsigned ElementsPerSegment =
366-
VecSizeInBits.getKnownMinValue() / ElSizeInBits;
367-
if (ElementsPerSegment != 2 && ElementsPerSegment != 4)
368-
return false;
369-
}
342+
if (!ST->isSVEAvailable())
343+
return false;
370344

371-
if (!isElementTypeLegalForCompressStore(DataType->getScalarType()))
345+
if (isa<FixedVectorType>(DataType) &&
346+
DataType->getPrimitiveSizeInBits() < 128)
372347
return false;
373348

374-
return isLegalMaskedLoadStore(DataType, Alignment);
349+
return isElementTypeLegalForCompressStore(DataType->getScalarType());
375350
}
376351

377352
bool isLegalMaskedGatherScatter(Type *DataType) const {

llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll

Lines changed: 65 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,26 @@ define void @test_compressstore_nxv2f64(ptr %p, <vscale x 2 x double> %vec, <vsc
5252
ret void
5353
}
5454

55+
;; SVE vectors that will be split
56+
57+
define void @test_compressstore_nxv8i32(ptr %p, <vscale x 8 x i32> %vec, <vscale x 8 x i1> %mask) {
58+
; CHECK-LABEL: test_compressstore_nxv8i32:
59+
; CHECK: // %bb.0:
60+
; CHECK-NEXT: punpkhi p1.h, p0.b
61+
; CHECK-NEXT: punpklo p0.h, p0.b
62+
; CHECK-NEXT: cntp x8, p1, p1.s
63+
; CHECK-NEXT: compact z1.s, p1, z1.s
64+
; CHECK-NEXT: cntp x9, p0, p0.s
65+
; CHECK-NEXT: compact z0.s, p0, z0.s
66+
; CHECK-NEXT: whilelo p0.s, xzr, x8
67+
; CHECK-NEXT: whilelo p1.s, xzr, x9
68+
; CHECK-NEXT: st1w { z1.s }, p0, [x0, x9, lsl #2]
69+
; CHECK-NEXT: st1w { z0.s }, p1, [x0]
70+
; CHECK-NEXT: ret
71+
tail call void @llvm.masked.compressstore.nxv8i32(<vscale x 8 x i32> %vec, ptr align 4 %p, <vscale x 8 x i1> %mask)
72+
ret void
73+
}
74+
5575
;; Unpacked SVE vector types
5676

5777
define void @test_compressstore_nxv2f32(ptr %p, <vscale x 2 x float> %vec, <vscale x 2 x i1> %mask) {
@@ -148,53 +168,29 @@ define void @test_compressstore_v2i64(ptr %p, <2 x i64> %vec, <2 x i1> %mask) {
148168
define void @test_compressstore_v8i32(ptr %p, <8 x i32> %vec, <8 x i1> %mask) {
149169
; CHECK-BASE-LABEL: test_compressstore_v8i32:
150170
; CHECK-BASE: // %bb.0:
151-
; CHECK-BASE-NEXT: shl v2.8b, v2.8b, #7
152-
; CHECK-BASE-NEXT: adrp x8, .LCPI10_0
153-
; CHECK-BASE-NEXT: ldr d3, [x8, :lo12:.LCPI10_0]
154-
; CHECK-BASE-NEXT: cmlt v2.8b, v2.8b, #0
155-
; CHECK-BASE-NEXT: and v2.8b, v2.8b, v3.8b
156-
; CHECK-BASE-NEXT: addv b2, v2.8b
157-
; CHECK-BASE-NEXT: fmov w8, s2
158-
; CHECK-BASE-NEXT: tbnz w8, #0, .LBB10_9
159-
; CHECK-BASE-NEXT: // %bb.1: // %else
160-
; CHECK-BASE-NEXT: tbnz w8, #1, .LBB10_10
161-
; CHECK-BASE-NEXT: .LBB10_2: // %else2
162-
; CHECK-BASE-NEXT: tbnz w8, #2, .LBB10_11
163-
; CHECK-BASE-NEXT: .LBB10_3: // %else5
164-
; CHECK-BASE-NEXT: tbnz w8, #3, .LBB10_12
165-
; CHECK-BASE-NEXT: .LBB10_4: // %else8
166-
; CHECK-BASE-NEXT: tbnz w8, #4, .LBB10_13
167-
; CHECK-BASE-NEXT: .LBB10_5: // %else11
168-
; CHECK-BASE-NEXT: tbnz w8, #5, .LBB10_14
169-
; CHECK-BASE-NEXT: .LBB10_6: // %else14
170-
; CHECK-BASE-NEXT: tbnz w8, #6, .LBB10_15
171-
; CHECK-BASE-NEXT: .LBB10_7: // %else17
172-
; CHECK-BASE-NEXT: tbnz w8, #7, .LBB10_16
173-
; CHECK-BASE-NEXT: .LBB10_8: // %else20
174-
; CHECK-BASE-NEXT: ret
175-
; CHECK-BASE-NEXT: .LBB10_9: // %cond.store
176-
; CHECK-BASE-NEXT: st1 { v0.s }[0], [x0], #4
177-
; CHECK-BASE-NEXT: tbz w8, #1, .LBB10_2
178-
; CHECK-BASE-NEXT: .LBB10_10: // %cond.store1
179-
; CHECK-BASE-NEXT: st1 { v0.s }[1], [x0], #4
180-
; CHECK-BASE-NEXT: tbz w8, #2, .LBB10_3
181-
; CHECK-BASE-NEXT: .LBB10_11: // %cond.store4
182-
; CHECK-BASE-NEXT: st1 { v0.s }[2], [x0], #4
183-
; CHECK-BASE-NEXT: tbz w8, #3, .LBB10_4
184-
; CHECK-BASE-NEXT: .LBB10_12: // %cond.store7
185-
; CHECK-BASE-NEXT: st1 { v0.s }[3], [x0], #4
186-
; CHECK-BASE-NEXT: tbz w8, #4, .LBB10_5
187-
; CHECK-BASE-NEXT: .LBB10_13: // %cond.store10
188-
; CHECK-BASE-NEXT: st1 { v1.s }[0], [x0], #4
189-
; CHECK-BASE-NEXT: tbz w8, #5, .LBB10_6
190-
; CHECK-BASE-NEXT: .LBB10_14: // %cond.store13
191-
; CHECK-BASE-NEXT: st1 { v1.s }[1], [x0], #4
192-
; CHECK-BASE-NEXT: tbz w8, #6, .LBB10_7
193-
; CHECK-BASE-NEXT: .LBB10_15: // %cond.store16
194-
; CHECK-BASE-NEXT: st1 { v1.s }[2], [x0], #4
195-
; CHECK-BASE-NEXT: tbz w8, #7, .LBB10_8
196-
; CHECK-BASE-NEXT: .LBB10_16: // %cond.store19
197-
; CHECK-BASE-NEXT: st1 { v1.s }[3], [x0]
171+
; CHECK-BASE-NEXT: // kill: def $q0 killed $q0 def $z0
172+
; CHECK-BASE-NEXT: zip2 v3.8b, v2.8b, v0.8b
173+
; CHECK-BASE-NEXT: zip1 v2.8b, v2.8b, v0.8b
174+
; CHECK-BASE-NEXT: // kill: def $q1 killed $q1 def $z1
175+
; CHECK-BASE-NEXT: movi v4.4s, #1
176+
; CHECK-BASE-NEXT: ptrue p0.s, vl4
177+
; CHECK-BASE-NEXT: ushll v3.4s, v3.4h, #0
178+
; CHECK-BASE-NEXT: ushll v2.4s, v2.4h, #0
179+
; CHECK-BASE-NEXT: shl v3.4s, v3.4s, #31
180+
; CHECK-BASE-NEXT: shl v5.4s, v2.4s, #31
181+
; CHECK-BASE-NEXT: and v2.16b, v2.16b, v4.16b
182+
; CHECK-BASE-NEXT: cmpne p1.s, p0/z, z3.s, #0
183+
; CHECK-BASE-NEXT: cmpne p0.s, p0/z, z5.s, #0
184+
; CHECK-BASE-NEXT: addv s2, v2.4s
185+
; CHECK-BASE-NEXT: fmov w10, s2
186+
; CHECK-BASE-NEXT: cntp x8, p1, p1.s
187+
; CHECK-BASE-NEXT: compact z1.s, p1, z1.s
188+
; CHECK-BASE-NEXT: compact z0.s, p0, z0.s
189+
; CHECK-BASE-NEXT: cntp x9, p0, p0.s
190+
; CHECK-BASE-NEXT: whilelo p0.s, xzr, x8
191+
; CHECK-BASE-NEXT: whilelo p1.s, xzr, x9
192+
; CHECK-BASE-NEXT: st1w { z1.s }, p0, [x0, x10, lsl #2]
193+
; CHECK-BASE-NEXT: st1w { z0.s }, p1, [x0]
198194
; CHECK-BASE-NEXT: ret
199195
;
200196
; CHECK-VL256-LABEL: test_compressstore_v8i32:
@@ -222,33 +218,28 @@ define void @test_compressstore_v8i32(ptr %p, <8 x i32> %vec, <8 x i1> %mask) {
222218
define void @test_compressstore_v4i64(ptr %p, <4 x i64> %vec, <4 x i1> %mask) {
223219
; CHECK-BASE-LABEL: test_compressstore_v4i64:
224220
; CHECK-BASE: // %bb.0:
225-
; CHECK-BASE-NEXT: shl v2.4h, v2.4h, #15
226-
; CHECK-BASE-NEXT: adrp x8, .LCPI11_0
227-
; CHECK-BASE-NEXT: ldr d3, [x8, :lo12:.LCPI11_0]
228-
; CHECK-BASE-NEXT: cmlt v2.4h, v2.4h, #0
229-
; CHECK-BASE-NEXT: and v2.8b, v2.8b, v3.8b
230-
; CHECK-BASE-NEXT: addv h2, v2.4h
231-
; CHECK-BASE-NEXT: fmov w8, s2
232-
; CHECK-BASE-NEXT: tbnz w8, #0, .LBB11_5
233-
; CHECK-BASE-NEXT: // %bb.1: // %else
234-
; CHECK-BASE-NEXT: tbnz w8, #1, .LBB11_6
235-
; CHECK-BASE-NEXT: .LBB11_2: // %else2
236-
; CHECK-BASE-NEXT: tbnz w8, #2, .LBB11_7
237-
; CHECK-BASE-NEXT: .LBB11_3: // %else5
238-
; CHECK-BASE-NEXT: tbnz w8, #3, .LBB11_8
239-
; CHECK-BASE-NEXT: .LBB11_4: // %else8
240-
; CHECK-BASE-NEXT: ret
241-
; CHECK-BASE-NEXT: .LBB11_5: // %cond.store
242-
; CHECK-BASE-NEXT: st1 { v0.d }[0], [x0], #8
243-
; CHECK-BASE-NEXT: tbz w8, #1, .LBB11_2
244-
; CHECK-BASE-NEXT: .LBB11_6: // %cond.store1
245-
; CHECK-BASE-NEXT: st1 { v0.d }[1], [x0], #8
246-
; CHECK-BASE-NEXT: tbz w8, #2, .LBB11_3
247-
; CHECK-BASE-NEXT: .LBB11_7: // %cond.store4
248-
; CHECK-BASE-NEXT: st1 { v1.d }[0], [x0], #8
249-
; CHECK-BASE-NEXT: tbz w8, #3, .LBB11_4
250-
; CHECK-BASE-NEXT: .LBB11_8: // %cond.store7
251-
; CHECK-BASE-NEXT: st1 { v1.d }[1], [x0]
221+
; CHECK-BASE-NEXT: ushll v2.4s, v2.4h, #0
222+
; CHECK-BASE-NEXT: movi v5.2s, #1
223+
; CHECK-BASE-NEXT: // kill: def $q1 killed $q1 def $z1
224+
; CHECK-BASE-NEXT: // kill: def $q0 killed $q0 def $z0
225+
; CHECK-BASE-NEXT: ptrue p0.d, vl2
226+
; CHECK-BASE-NEXT: ushll2 v3.2d, v2.4s, #0
227+
; CHECK-BASE-NEXT: ushll v4.2d, v2.2s, #0
228+
; CHECK-BASE-NEXT: and v2.8b, v2.8b, v5.8b
229+
; CHECK-BASE-NEXT: shl v3.2d, v3.2d, #63
230+
; CHECK-BASE-NEXT: shl v4.2d, v4.2d, #63
231+
; CHECK-BASE-NEXT: addp v2.2s, v2.2s, v2.2s
232+
; CHECK-BASE-NEXT: cmpne p1.d, p0/z, z3.d, #0
233+
; CHECK-BASE-NEXT: cmpne p0.d, p0/z, z4.d, #0
234+
; CHECK-BASE-NEXT: fmov w10, s2
235+
; CHECK-BASE-NEXT: cntp x8, p1, p1.d
236+
; CHECK-BASE-NEXT: compact z1.d, p1, z1.d
237+
; CHECK-BASE-NEXT: compact z0.d, p0, z0.d
238+
; CHECK-BASE-NEXT: cntp x9, p0, p0.d
239+
; CHECK-BASE-NEXT: whilelo p0.d, xzr, x8
240+
; CHECK-BASE-NEXT: whilelo p1.d, xzr, x9
241+
; CHECK-BASE-NEXT: st1d { z1.d }, p0, [x0, x10, lsl #3]
242+
; CHECK-BASE-NEXT: st1d { z0.d }, p1, [x0]
252243
; CHECK-BASE-NEXT: ret
253244
;
254245
; CHECK-VL256-LABEL: test_compressstore_v4i64:

0 commit comments

Comments
 (0)