diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7519ac5260a64..99c0707913f58 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13430,6 +13430,30 @@ static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { return true; } +/// isDUPQMask - matches a splat of equivalent lanes within 128b segments in +/// the first vector operand. +static std::optional<unsigned> isDUPQMask(ArrayRef<int> M, EVT VT) { + assert(VT.getFixedSizeInBits() % 128 == 0 && "Unsupported SVE vector size"); + unsigned Lane = (unsigned)M[0]; + unsigned Segments = VT.getFixedSizeInBits() / 128; + unsigned SegmentElts = VT.getVectorNumElements() / Segments; + + // Make sure there's no size changes. + if (SegmentElts * Segments != M.size()) + return std::nullopt; + + // Check the first index corresponds to one of the lanes in the first segment. + if (Lane >= SegmentElts) + return std::nullopt; + + // Check that all lanes match the first, adjusted for segment. + for (unsigned I = 0; I < M.size(); ++I) + if ((unsigned)M[I] != (Lane + ((I / SegmentElts) * SegmentElts))) + return std::nullopt; + + return Lane; +} + /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 
@@ -30013,6 +30037,19 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( return convertFromScalableVector( DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1)); } + + if (Subtarget->hasSVE2p1()) { + if (std::optional<unsigned> Lane = isDUPQMask(ShuffleMask, VT)) { + SDValue IID = + DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64); + return convertFromScalableVector( + DAG, VT, + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT, + {IID, Op1, + DAG.getConstant(*Lane, DL, MVT::i64, + /*isTarget=*/true)})); + } + } } // Try to widen the shuffle before generating a possibly expensive SVE TBL. diff --git a/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll new file mode 100644 index 0000000000000..40d4d0ff60148 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll @@ -0,0 +1,115 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s + +define void @dupq_i8_256b(ptr %addr) #0 { +; CHECK-LABEL: dupq_i8_256b: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: dupq z0.b, z0.b[15] +; CHECK-NEXT: str z0, [x0] +; CHECK-NEXT: ret + %load = load <32 x i8>, ptr %addr + %splat.lanes = shufflevector <32 x i8> %load, <32 x i8> poison, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> + store <32 x i8> %splat.lanes, ptr %addr + ret void +} + +define void @dupq_i16_256b(ptr %addr) #0 { +; CHECK-LABEL: dupq_i16_256b: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: dupq z0.h, z0.h[2] +; CHECK-NEXT: str z0, [x0] +; CHECK-NEXT: ret + %load = load <16 x i16>, ptr %addr + %splat.lanes = shufflevector <16 x i16> %load, <16 x i16> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10> + store <16 x i16> %splat.lanes, ptr %addr + ret void +} + +define void @dupq_i32_256b(ptr %addr) #0 { +; CHECK-LABEL: dupq_i32_256b: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: dupq z0.s, z0.s[3] +; CHECK-NEXT: str z0, [x0] +; 
CHECK-NEXT: ret + %load = load <8 x i32>, ptr %addr + %splat.lanes = shufflevector <8 x i32> %load, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7> + store <8 x i32> %splat.lanes, ptr %addr + ret void +} + +define void @dupq_i64_256b(ptr %addr) #0 { +; CHECK-LABEL: dupq_i64_256b: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: trn1 z0.d, z0.d, z0.d +; CHECK-NEXT: str z0, [x0] +; CHECK-NEXT: ret + %load = load <4 x i64>, ptr %addr + %splat.lanes = shufflevector <4 x i64> %load, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2> + store <4 x i64> %splat.lanes, ptr %addr + ret void +} + +define void @dupq_f16_256b(ptr %addr) #0 { +; CHECK-LABEL: dupq_f16_256b: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: dupq z0.h, z0.h[2] +; CHECK-NEXT: str z0, [x0] +; CHECK-NEXT: ret + %load = load <16 x half>, ptr %addr + %splat.lanes = shufflevector <16 x half> %load, <16 x half> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10> + store <16 x half> %splat.lanes, ptr %addr + ret void +} + +define void @dupq_bf16_256b(ptr %addr) #0 { +; CHECK-LABEL: dupq_bf16_256b: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: dup v0.8h, v0.h[2] +; CHECK-NEXT: dup v1.8h, v1.h[2] +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + %load = load <16 x bfloat>, ptr %addr + %splat.lanes = shufflevector <16 x bfloat> %load, <16 x bfloat> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10> + store <16 x bfloat> %splat.lanes, ptr %addr + ret void +} + +define void @dupq_f32_256b(ptr %addr) #0 { +; CHECK-LABEL: dupq_f32_256b: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: dupq z0.s, z0.s[3] +; CHECK-NEXT: str z0, [x0] +; CHECK-NEXT: ret + %load = load <8 x float>, ptr %addr + %splat.lanes = shufflevector <8 x float> %load, <8 x float> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7> + store <8 x float> %splat.lanes, ptr %addr + ret void +} + +define void @dupq_f64_256b(ptr %addr) #0 { +; CHECK-LABEL: dupq_f64_256b: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: trn1 z0.d, z0.d, z0.d +; CHECK-NEXT: str z0, [x0] +; CHECK-NEXT: ret + %load = load <4 x double>, 
ptr %addr + %splat.lanes = shufflevector <4 x double> %load, <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2> + store <4 x double> %splat.lanes, ptr %addr + ret void +} + +attributes #0 = { noinline vscale_range(2,2) "target-features"="+sve2p1,+bf16" }