diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 5684e0e4c26c4..521d8f07434e6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10607,23 +10607,26 @@ TargetLowering::IncrementMemoryAddress(SDValue Addr, SDValue Mask,
   assert(DataVT.getVectorElementCount() == MaskVT.getVectorElementCount() &&
          "Incompatible types of Data and Mask");
   if (IsCompressedMemory) {
-    if (DataVT.isScalableVector())
-      report_fatal_error(
-          "Cannot currently handle compressed memory with scalable vectors");
     // Incrementing the pointer according to number of '1's in the mask.
-    EVT MaskIntVT = EVT::getIntegerVT(*DAG.getContext(), MaskVT.getSizeInBits());
-    SDValue MaskInIntReg = DAG.getBitcast(MaskIntVT, Mask);
-    if (MaskIntVT.getSizeInBits() < 32) {
-      MaskInIntReg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskInIntReg);
-      MaskIntVT = MVT::i32;
+    if (DataVT.isScalableVector()) {
+      EVT MaskExtVT = MaskVT.changeElementType(MVT::i32);
+      SDValue MaskExt = DAG.getNode(ISD::ZERO_EXTEND, DL, MaskExtVT, Mask);
+      Increment = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, MaskExt);
+    } else {
+      EVT MaskIntVT =
+          EVT::getIntegerVT(*DAG.getContext(), MaskVT.getSizeInBits());
+      SDValue MaskInIntReg = DAG.getBitcast(MaskIntVT, Mask);
+      if (MaskIntVT.getSizeInBits() < 32) {
+        MaskInIntReg =
+            DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskInIntReg);
+        MaskIntVT = MVT::i32;
+      }
+      Increment = DAG.getNode(ISD::CTPOP, DL, MaskIntVT, MaskInIntReg);
     }
-
-    // Count '1's with POPCNT.
-    Increment = DAG.getNode(ISD::CTPOP, DL, MaskIntVT, MaskInIntReg);
-    Increment = DAG.getZExtOrTrunc(Increment, DL, AddrVT);
     // Scale is an element size in bytes.
     SDValue Scale = DAG.getConstant(DataVT.getScalarSizeInBits() / 8, DL,
                                     AddrVT);
+    Increment = DAG.getZExtOrTrunc(Increment, DL, AddrVT);
     Increment = DAG.getNode(ISD::MUL, DL, AddrVT, Increment, Scale);
   } else if (DataVT.isScalableVector()) {
     Increment = DAG.getVScale(DL, AddrVT,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e91f5a877b35b..3c0c83d532771 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1983,10 +1983,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);

    // We can lower types that have <vscale x {2|4}> elements to compact.
-    for (auto VT :
-         {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
-          MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
+    for (auto VT : {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64,
+                    MVT::nxv2f32, MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16,
+                    MVT::nxv4i32, MVT::nxv4f32}) {
       setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
+      // Use a custom lowering for masked stores that could be a supported
+      // compressing store. Note: These types still use the normal (Legal)
+      // lowering for non-compressing masked stores.
+      setOperationAction(ISD::MSTORE, VT, Custom);
+    }

    // If we have SVE, we can use SVE logic for legal (or smaller than legal)
    // NEON vectors in the lowest bits of the SVE register.
@@ -7932,7 +7937,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
   case ISD::STORE:
     return LowerSTORE(Op, DAG);
   case ISD::MSTORE:
-    return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
+    return LowerMSTORE(Op, DAG);
   case ISD::MGATHER:
     return LowerMGATHER(Op, DAG);
   case ISD::MSCATTER:
@@ -30400,6 +30405,43 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
                             Store->isTruncatingStore());
 }

+SDValue AArch64TargetLowering::LowerMSTORE(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  auto *Store = cast<MaskedStoreSDNode>(Op);
+  EVT VT = Store->getValue().getValueType();
+  if (VT.isFixedLengthVector())
+    return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
+
+  if (!Store->isCompressingStore())
+    return SDValue();
+
+  EVT MaskVT = Store->getMask().getValueType();
+  EVT MaskExtVT = getPromotedVTForPredicate(MaskVT);
+  EVT MaskReduceVT = MaskExtVT.getScalarType();
+  SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+
+  SDValue MaskExt =
+      DAG.getNode(ISD::ZERO_EXTEND, DL, MaskExtVT, Store->getMask());
+  SDValue CntActive =
+      DAG.getNode(ISD::VECREDUCE_ADD, DL, MaskReduceVT, MaskExt);
+  if (MaskReduceVT != MVT::i64)
+    CntActive = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CntActive);
+
+  SDValue CompressedValue =
+      DAG.getNode(ISD::VECTOR_COMPRESS, DL, VT, Store->getValue(),
+                  Store->getMask(), DAG.getPOISON(VT));
+  SDValue CompressedMask =
+      DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, MaskVT, Zero, CntActive);
+
+  return DAG.getMaskedStore(Store->getChain(), DL, CompressedValue,
+                            Store->getBasePtr(), Store->getOffset(),
+                            CompressedMask, Store->getMemoryVT(),
+                            Store->getMemOperand(), Store->getAddressingMode(),
+                            Store->isTruncatingStore(),
+                            /*isCompressing=*/false);
+}
+
 SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
     SDValue Op, SelectionDAG &DAG) const {
   auto *Store = cast<MaskedStoreSDNode>(Op);
@@ -30414,7 +30456,8 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
   return DAG.getMaskedStore(
       Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
       Mask, Store->getMemoryVT(), Store->getMemOperand(),
-      Store->getAddressingMode(), Store->isTruncatingStore());
+      Store->getAddressingMode(), Store->isTruncatingStore(),
+      Store->isCompressingStore());
 }

 SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index ca08eb40c956a..32aa913181a21 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -761,6 +761,7 @@ class AArch64TargetLowering : public TargetLowering {
   SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerMSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 881f7707f0eb7..681f1871ae692 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -645,29 +645,34 @@ def nontrunc_masked_store :
           (masked_st node:$val, node:$ptr, undef, node:$pred), [{
   return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
          cast<MaskedStoreSDNode>(N)->isUnindexed() &&
-         !cast<MaskedStoreSDNode>(N)->isNonTemporal();
+         !cast<MaskedStoreSDNode>(N)->isNonTemporal() &&
+         !cast<MaskedStoreSDNode>(N)->isCompressingStore();
 }]>;
 // truncating masked store fragments.
 def trunc_masked_store :
   PatFrag<(ops node:$val, node:$ptr, node:$pred),
           (masked_st node:$val, node:$ptr, undef, node:$pred), [{
   return cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
-         cast<MaskedStoreSDNode>(N)->isUnindexed();
+         cast<MaskedStoreSDNode>(N)->isUnindexed() &&
+         !cast<MaskedStoreSDNode>(N)->isCompressingStore();
 }]>;
 def trunc_masked_store_i8 :
   PatFrag<(ops node:$val, node:$ptr, node:$pred),
           (trunc_masked_store node:$val, node:$ptr, node:$pred), [{
-  return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+  return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8 &&
+         !cast<MaskedStoreSDNode>(N)->isCompressingStore();
 }]>;
 def trunc_masked_store_i16 :
   PatFrag<(ops node:$val, node:$ptr, node:$pred),
           (trunc_masked_store node:$val, node:$ptr, node:$pred), [{
-  return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+  return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16 &&
+         !cast<MaskedStoreSDNode>(N)->isCompressingStore();
 }]>;
 def trunc_masked_store_i32 :
   PatFrag<(ops node:$val, node:$ptr, node:$pred),
           (trunc_masked_store node:$val, node:$ptr, node:$pred), [{
-  return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+  return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32 &&
+         !cast<MaskedStoreSDNode>(N)->isCompressingStore();
 }]>;

 def non_temporal_store :
@@ -675,7 +680,8 @@ def non_temporal_store :
           (masked_st node:$val, node:$ptr, undef, node:$pred), [{
   return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
          cast<MaskedStoreSDNode>(N)->isUnindexed() &&
-         cast<MaskedStoreSDNode>(N)->isNonTemporal();
+         cast<MaskedStoreSDNode>(N)->isNonTemporal() &&
+         !cast<MaskedStoreSDNode>(N)->isCompressingStore();
 }]>;

 multiclass masked_gather_scatter {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 6cc4987428567..82856d605a56f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -332,6 +332,23 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
     return isLegalMaskedLoadStore(DataType, Alignment);
   }

+  bool isElementTypeLegalForCompressStore(Type *Ty) const {
+    return Ty->isFloatTy() || Ty->isDoubleTy() || Ty->isIntegerTy(32) ||
+           Ty->isIntegerTy(64);
+  }
+
+  bool isLegalMaskedCompressStore(Type *DataType,
+                                  Align Alignment) const override {
+    if (!ST->isSVEAvailable())
+      return false;
+
+    if (isa<FixedVectorType>(DataType) &&
+        DataType->getPrimitiveSizeInBits() < 128)
+      return false;
+
+    return isElementTypeLegalForCompressStore(DataType->getScalarType());
+  }
+
   bool isLegalMaskedGatherScatter(Type *DataType) const {
     if (!ST->isSVEAvailable())
       return false;
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-compressstore-sve2p2.ll b/llvm/test/CodeGen/AArch64/sve-masked-compressstore-sve2p2.ll
new file mode 100644
index 0000000000000..92ecc3c83e2c5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-compressstore-sve2p2.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=aarch64 -mattr=+sve2p2 < %s
+
+;; These masked.compressstore operations could be natively supported with +sve2p2
+;; (or by promoting to 32/64 bit elements + a truncstore), but currently are not
+;; supported.
+
+; XFAIL: *
+
+define void @test_compressstore_nxv8i16(ptr %p, <vscale x 8 x i16> %vec, <vscale x 8 x i1> %mask) {
+  tail call void @llvm.masked.compressstore.nxv8i16(<vscale x 8 x i16> %vec, ptr align 2 %p, <vscale x 8 x i1> %mask)
+  ret void
+}
+
+define void @test_compressstore_nxv16i8(ptr %p, <vscale x 16 x i8> %vec, <vscale x 16 x i1> %mask) {
+  tail call void @llvm.masked.compressstore.nxv16i8(<vscale x 16 x i8> %vec, ptr align 1 %p, <vscale x 16 x i1> %mask)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll b/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
new file mode 100644
index 0000000000000..c698658afc8c4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-masked-compressstore.ll
@@ -0,0 +1,280 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,CHECK-BASE
+; RUN: llc -mtriple=aarch64 -aarch64-sve-vector-bits-min=256 -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,CHECK-VL256
+
+;; Full SVE vectors (supported with +sve)
+
+define void @test_compressstore_nxv4i32(ptr %p, <vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntp x8, p0, p0.s
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    whilelo p0.s, xzr, x8
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  tail call void @llvm.masked.compressstore.nxv4i32(<vscale x 4 x i32> %vec, ptr align 4 %p, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+define void @test_compressstore_nxv2i64(ptr %p, <vscale x 2 x i64> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntp x8, p0, p0.d
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    whilelo p0.d, xzr, x8
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  tail call void @llvm.masked.compressstore.nxv2i64(<vscale x 2 x i64> %vec, ptr align 8 %p, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @test_compressstore_nxv4f32(ptr %p, <vscale x 4 x float> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntp x8, p0, p0.s
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    whilelo p0.s, xzr, x8
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  tail call void @llvm.masked.compressstore.nxv4f32(<vscale x 4 x float> %vec, ptr align 4 %p, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+define void @test_compressstore_nxv2f64(ptr %p, <vscale x 2 x double> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntp x8, p0, p0.d
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    whilelo p0.d, xzr, x8
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  tail call void @llvm.masked.compressstore.nxv2f64(<vscale x 2 x double> %vec, ptr align 8 %p, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+;; SVE vectors that will be split
+
+define void @test_compressstore_nxv8i32(ptr %p, <vscale x 8 x i32> %vec, <vscale x 8 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    punpkhi p1.h, p0.b
+; CHECK-NEXT:    punpklo p0.h, p0.b
+; CHECK-NEXT:    cntp x8, p1, p1.s
+; CHECK-NEXT:    compact z1.s, p1, z1.s
+; CHECK-NEXT:    cntp x9, p0, p0.s
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    whilelo p0.s, xzr, x8
+; CHECK-NEXT:    whilelo p1.s, xzr, x9
+; CHECK-NEXT:    st1w { z1.s }, p0, [x0, x9, lsl #2]
+; CHECK-NEXT:    st1w { z0.s }, p1, [x0]
+; CHECK-NEXT:    ret
+  tail call void @llvm.masked.compressstore.nxv8i32(<vscale x 8 x i32> %vec, ptr align 4 %p, <vscale x 8 x i1> %mask)
+  ret void
+}
+
+;; Unpacked SVE vector types
+
+define void @test_compressstore_nxv2f32(ptr %p, <vscale x 2 x float> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntp x8, p0, p0.d
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    whilelo p0.d, xzr, x8
+; CHECK-NEXT:    st1w { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  tail call void @llvm.masked.compressstore.nxv2f32(<vscale x 2 x float> %vec, ptr align 4 %p, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+;; SVE vector types promoted to 32/64-bit (non-exhaustive)
+
+define void @test_compressstore_nxv2i8(ptr %p, <vscale x 2 x i8> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntp x8, p0, p0.d
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    whilelo p0.d, xzr, x8
+; CHECK-NEXT:    st1b { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  tail call void @llvm.masked.compressstore.nxv2i8(<vscale x 2 x i8> %vec, ptr align 1 %p, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @test_compressstore_nxv4i16(ptr %p, <vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntp x8, p0, p0.s
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    whilelo p0.s, xzr, x8
+; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  tail call void @llvm.masked.compressstore.nxv4i16(<vscale x 4 x i16> %vec, ptr align 2 %p, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+;; NEON vector types (promoted to SVE)
+
+define void @test_compressstore_v2f64(ptr %p, <2 x double> %vec, <2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    shl v1.2d, v1.2d, #63
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT:    cntp x8, p0, p0.d
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    whilelo p0.d, xzr, x8
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  tail call void @llvm.masked.compressstore.v2f64(<2 x double> %vec, ptr align 8 %p, <2 x i1> %mask)
+  ret void
+}
+
+define void @test_compressstore_v4i32(ptr %p, <4 x i32> %vec, <4 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-NEXT:    ptrue p0.s, vl4
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    shl v1.4s, v1.4s, #31
+; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
+; CHECK-NEXT:    cntp x8, p0, p0.s
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    whilelo p0.s, xzr, x8
+; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-NEXT:    ret
+  tail call void @llvm.masked.compressstore.v4i32(<4 x i32> %vec, ptr align 4 %p, <4 x i1> %mask)
+  ret void
+}
+
+define void @test_compressstore_v2i64(ptr %p, <2 x i64> %vec, <2 x i1> %mask) {
+; CHECK-LABEL: test_compressstore_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-NEXT:    ptrue p0.d, vl2
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    shl v1.2d, v1.2d, #63
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT:    cntp x8, p0, p0.d
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    whilelo p0.d, xzr, x8
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-NEXT:    ret
+  tail call void @llvm.masked.compressstore.v2i64(<2 x i64> %vec, ptr align 8 %p, <2 x i1> %mask)
+  ret void
+}
+
+define void @test_compressstore_v8i32(ptr %p, <8 x i32> %vec, <8 x i1> %mask) {
+; CHECK-BASE-LABEL: test_compressstore_v8i32:
+; CHECK-BASE:       // %bb.0:
+; CHECK-BASE-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-BASE-NEXT:    zip1 v3.8b, v2.8b, v0.8b
+; CHECK-BASE-NEXT:    adrp x8, .LCPI11_0
+; CHECK-BASE-NEXT:    zip2 v2.8b, v2.8b, v0.8b
+; CHECK-BASE-NEXT:    ldr d5, [x8, :lo12:.LCPI11_0]
+; CHECK-BASE-NEXT:    ptrue p0.s
+; CHECK-BASE-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-BASE-NEXT:    ptrue p1.s, vl4
+; CHECK-BASE-NEXT:    shl v4.4h, v3.4h, #15
+; CHECK-BASE-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-BASE-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-BASE-NEXT:    cmlt v4.4h, v4.4h, #0
+; CHECK-BASE-NEXT:    shl v2.4s, v2.4s, #31
+; CHECK-BASE-NEXT:    shl v3.4s, v3.4s, #31
+; CHECK-BASE-NEXT:    and v4.8b, v4.8b, v5.8b
+; CHECK-BASE-NEXT:    addv h4, v4.4h
+; CHECK-BASE-NEXT:    fmov w8, s4
+; CHECK-BASE-NEXT:    and w8, w8, #0xf
+; CHECK-BASE-NEXT:    fmov s4, w8
+; CHECK-BASE-NEXT:    cnt z4.s, p0/m, z4.s
+; CHECK-BASE-NEXT:    cmpne p0.s, p1/z, z2.s, #0
+; CHECK-BASE-NEXT:    cmpne p1.s, p1/z, z3.s, #0
+; CHECK-BASE-NEXT:    cntp x8, p0, p0.s
+; CHECK-BASE-NEXT:    compact z1.s, p0, z1.s
+; CHECK-BASE-NEXT:    compact z0.s, p1, z0.s
+; CHECK-BASE-NEXT:    cntp x9, p1, p1.s
+; CHECK-BASE-NEXT:    fmov w10, s4
+; CHECK-BASE-NEXT:    whilelo p0.s, xzr, x8
+; CHECK-BASE-NEXT:    whilelo p1.s, xzr, x9
+; CHECK-BASE-NEXT:    st1w { z1.s }, p0, [x0, x10, lsl #2]
+; CHECK-BASE-NEXT:    st1w { z0.s }, p1, [x0]
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-VL256-LABEL: test_compressstore_v8i32:
+; CHECK-VL256:       // %bb.0:
+; CHECK-VL256-NEXT:    // kill: def $d2 killed $d2 def $z2
+; CHECK-VL256-NEXT:    ptrue p0.s, vl8
+; CHECK-VL256-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-VL256-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-VL256-NEXT:    uunpklo z2.h, z2.b
+; CHECK-VL256-NEXT:    ptrue p1.s, vl4
+; CHECK-VL256-NEXT:    splice z0.s, p1, z0.s, z1.s
+; CHECK-VL256-NEXT:    uunpklo z2.s, z2.h
+; CHECK-VL256-NEXT:    lsl z2.s, z2.s, #31
+; CHECK-VL256-NEXT:    asr z2.s, z2.s, #31
+; CHECK-VL256-NEXT:    cmpne p0.s, p0/z, z2.s, #0
+; CHECK-VL256-NEXT:    cntp x8, p0, p0.s
+; CHECK-VL256-NEXT:    compact z0.s, p0, z0.s
+; CHECK-VL256-NEXT:    whilelo p0.s, xzr, x8
+; CHECK-VL256-NEXT:    st1w { z0.s }, p0, [x0]
+; CHECK-VL256-NEXT:    ret
+  tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %vec, ptr align 4 %p, <8 x i1> %mask)
+  ret void
+}
+
+define void @test_compressstore_v4i64(ptr %p, <4 x i64> %vec, <4 x i1> %mask) {
+; CHECK-BASE-LABEL: test_compressstore_v4i64:
+; CHECK-BASE:       // %bb.0:
+; CHECK-BASE-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-BASE-NEXT:    index z4.s, #1, #1
+; CHECK-BASE-NEXT:    ptrue p0.s
+; CHECK-BASE-NEXT:    ptrue p1.d, vl2
+; CHECK-BASE-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-BASE-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-BASE-NEXT:    shl v3.2s, v2.2s, #31
+; CHECK-BASE-NEXT:    cmlt v3.2s, v3.2s, #0
+; CHECK-BASE-NEXT:    and v3.8b, v3.8b, v4.8b
+; CHECK-BASE-NEXT:    ushll2 v4.2d, v2.4s, #0
+; CHECK-BASE-NEXT:    ushll v2.2d, v2.2s, #0
+; CHECK-BASE-NEXT:    addp v3.2s, v3.2s, v3.2s
+; CHECK-BASE-NEXT:    shl v2.2d, v2.2d, #63
+; CHECK-BASE-NEXT:    fmov w8, s3
+; CHECK-BASE-NEXT:    shl v3.2d, v4.2d, #63
+; CHECK-BASE-NEXT:    and w8, w8, #0x3
+; CHECK-BASE-NEXT:    fmov s4, w8
+; CHECK-BASE-NEXT:    cnt z4.s, p0/m, z4.s
+; CHECK-BASE-NEXT:    cmpne p0.d, p1/z, z3.d, #0
+; CHECK-BASE-NEXT:    cmpne p1.d, p1/z, z2.d, #0
+; CHECK-BASE-NEXT:    cntp x8, p0, p0.d
+; CHECK-BASE-NEXT:    compact z1.d, p0, z1.d
+; CHECK-BASE-NEXT:    compact z0.d, p1, z0.d
+; CHECK-BASE-NEXT:    cntp x9, p1, p1.d
+; CHECK-BASE-NEXT:    fmov w10, s4
+; CHECK-BASE-NEXT:    whilelo p0.d, xzr, x8
+; CHECK-BASE-NEXT:    whilelo p1.d, xzr, x9
+; CHECK-BASE-NEXT:    st1d { z1.d }, p0, [x0, x10, lsl #3]
+; CHECK-BASE-NEXT:    st1d { z0.d }, p1, [x0]
+; CHECK-BASE-NEXT:    ret
+;
+; CHECK-VL256-LABEL: test_compressstore_v4i64:
+; CHECK-VL256:       // %bb.0:
+; CHECK-VL256-NEXT:    // kill: def $d2 killed $d2 def $z2
+; CHECK-VL256-NEXT:    ptrue p0.d, vl4
+; CHECK-VL256-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-VL256-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-VL256-NEXT:    uunpklo z2.s, z2.h
+; CHECK-VL256-NEXT:    ptrue p1.d, vl2
+; CHECK-VL256-NEXT:    splice z0.d, p1, z0.d, z1.d
+; CHECK-VL256-NEXT:    uunpklo z2.d, z2.s
+; CHECK-VL256-NEXT:    lsl z2.d, z2.d, #63
+; CHECK-VL256-NEXT:    asr z2.d, z2.d, #63
+; CHECK-VL256-NEXT:    cmpne p0.d, p0/z, z2.d, #0
+; CHECK-VL256-NEXT:    cntp x8, p0, p0.d
+; CHECK-VL256-NEXT:    compact z0.d, p0, z0.d
+; CHECK-VL256-NEXT:    whilelo p0.d, xzr, x8
+; CHECK-VL256-NEXT:    st1d { z0.d }, p0, [x0]
+; CHECK-VL256-NEXT:    ret
+  tail call void @llvm.masked.compressstore.v4i64(<4 x i64> %vec, ptr align 8 %p, <4 x i1> %mask)
+  ret void
+}
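
For readers of the patch, the scalable-vector path added in LowerMSTORE corresponds roughly to the IR-level rewrite sketched below (illustration only: the real transform operates on SelectionDAG nodes such as ISD::VECTOR_COMPRESS and ISD::GET_ACTIVE_LANE_MASK, and the function name here is hypothetical). Counting the active lanes, compressing the value, and storing under a fresh lane mask is what produces the cntp/compact/whilelo/st1w sequences checked in the tests above.

; Sketch (assumed-equivalent IR) of the lowering for a compressing masked
; store of <vscale x 4 x i32>; not part of the patch.
define void @compressstore_as_ir(ptr %p, <vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask) {
  ; Count the active lanes (lowers to cntp).
  %ext = zext <vscale x 4 x i1> %mask to <vscale x 4 x i32>
  %cnt = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %ext)
  %cnt.zext = zext i32 %cnt to i64
  ; Compress the data vector (lowers to compact).
  %comp = call <vscale x 4 x i32> @llvm.experimental.vector.compress.nxv4i32(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> poison)
  ; Store the first %cnt lanes under a fresh mask (lowers to whilelo + st1w).
  %storemask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %cnt.zext)
  call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %comp, ptr %p, i32 4, <vscale x 4 x i1> %storemask)
  ret void
}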