48 changes: 48 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13430,6 +13430,28 @@ static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
return true;
}

/// isDUPQMask - matches when the shuffle splats the same lane within each
/// 128-bit segment, as the SVE2.1 DUPQ instruction does. On success,
/// WhichResult holds the in-segment lane index.
static bool isDUPQMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  WhichResult = (unsigned)M[0];
  unsigned Segments = VT.getFixedSizeInBits() / 128;
  unsigned SegmentElts = VT.getVectorNumElements() / Segments;
  if (SegmentElts * Segments != M.size())
    return false;
  // The splatted lane must lie within a single 128-bit segment.
  if (WhichResult >= SegmentElts)
    return false;

  for (unsigned I = 0; I < Segments; ++I) {
    // Every segment must splat the same in-segment lane as the first one.
    unsigned Broadcast = WhichResult + I * SegmentElts;
    for (unsigned J = 0; J < SegmentElts; ++J) {
      int Idx = M[(I * SegmentElts) + J];
      if ((unsigned)Idx != Broadcast)
        return false;
    }
  }

  return true;
}

/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
@@ -30013,6 +30035,32 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
      return convertFromScalableVector(
          DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
    }

    if (Subtarget->hasSVE2p1()) {
      if (isDUPQMask(ShuffleMask, VT, WhichResult)) {
        unsigned DupOp;
        switch (VT.getScalarSizeInBits()) {
        default:
          llvm_unreachable("Unsupported scalar size");
        case 8:
          DupOp = AArch64ISD::DUPLANEQ8;
          break;
        case 16:
          DupOp = AArch64ISD::DUPLANEQ16;
          break;
        case 32:
          DupOp = AArch64ISD::DUPLANEQ32;
          break;
        case 64:
          DupOp = AArch64ISD::DUPLANEQ64;
          break;
        }
        return convertFromScalableVector(
            DAG, VT, DAG.getNode(DupOp, DL, ContainerVT, Op1,
                                 DAG.getConstant(WhichResult, DL, MVT::i32,
                                                 /*isTarget=*/true)));
      }
    }
  }

// Try to widen the shuffle before generating a possibly expensive SVE TBL.
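For readers skimming the lowering change above, here is a minimal standalone sketch (not part of the patch) of the mask shape the DUPQ matching is meant to accept: the fixed-length vector is viewed as 128-bit segments, and every segment must splat the same in-segment lane, which then becomes the DUPQ lane immediate. The helper name matchDUPQLane and the example masks are illustrative only.

#include <cassert>
#include <cstdio>
#include <vector>

// Toy restatement of the mask check: returns the splatted in-segment lane,
// or -1 if the mask is not a per-128-bit-segment splat of a single lane.
static int matchDUPQLane(const std::vector<int> &Mask, unsigned SegmentElts) {
  if (Mask.empty() || SegmentElts == 0 || Mask.size() % SegmentElts != 0)
    return -1;
  int Lane = Mask[0];
  if (Lane < 0 || unsigned(Lane) >= SegmentElts)
    return -1;
  unsigned Segments = Mask.size() / SegmentElts;
  for (unsigned I = 0; I < Segments; ++I)
    for (unsigned J = 0; J < SegmentElts; ++J)
      if (Mask[I * SegmentElts + J] != Lane + int(I * SegmentElts))
        return -1;
  return Lane;
}

int main() {
  // 256-bit vector of i32: two 128-bit segments of four elements each.
  // <3,3,3,3, 7,7,7,7> splats lane 3 of every segment -> dupq z0.s, z0.s[3].
  assert(matchDUPQLane({3, 3, 3, 3, 7, 7, 7, 7}, 4) == 3);
  // <3,3,3,3, 6,6,6,6> splats different lanes per segment -> not a DUPQ.
  assert(matchDUPQLane({3, 3, 3, 3, 6, 6, 6, 6}, 4) == -1);
  std::puts("ok");
  return 0;
}

The <8 x i32> case here corresponds to the dupq_i32_256b test in the new .ll file below, where the mask <3,3,3,3,7,7,7,7> is selected to dupq z0.s, z0.s[3].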
6 changes: 6 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -855,6 +855,12 @@ def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>;
def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>;
def AArch64duplane128 : SDNode<"AArch64ISD::DUPLANE128", SDT_AArch64DupLane>;

// Per-128-bit-segment lane duplication (SVE2.1 DUPQ)
def AArch64duplaneq8 : SDNode<"AArch64ISD::DUPLANEQ8", SDT_AArch64DupLane>;
def AArch64duplaneq16 : SDNode<"AArch64ISD::DUPLANEQ16", SDT_AArch64DupLane>;
def AArch64duplaneq32 : SDNode<"AArch64ISD::DUPLANEQ32", SDT_AArch64DupLane>;
def AArch64duplaneq64 : SDNode<"AArch64ISD::DUPLANEQ64", SDT_AArch64DupLane>;

def AArch64insr : SDNode<"AArch64ISD::INSR", SDT_AArch64Insr>;

// Vector shuffles
17 changes: 17 additions & 0 deletions llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -10613,6 +10613,23 @@ multiclass sve2p1_dupq<string mnemonic, SDPatternOperator Op> {
  def : SVE_2_Op_Imm_Pat<nxv4f32, Op, nxv4f32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _S)>;
  def : SVE_2_Op_Imm_Pat<nxv2f64, Op, nxv2f64, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _D)>;
  def : SVE_2_Op_Imm_Pat<nxv8bf16, Op, nxv8bf16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _H)>;

  def : Pat<(nxv16i8 (AArch64duplaneq8 nxv16i8:$Op1, VectorIndexB32b_timm:$imm)),
            (!cast<Instruction>(NAME # _B) $Op1, $imm)>;
  def : Pat<(nxv8i16 (AArch64duplaneq16 nxv8i16:$Op1, VectorIndexH32b_timm:$imm)),
            (!cast<Instruction>(NAME # _H) $Op1, $imm)>;
  def : Pat<(nxv4i32 (AArch64duplaneq32 nxv4i32:$Op1, VectorIndexS32b_timm:$imm)),
            (!cast<Instruction>(NAME # _S) $Op1, $imm)>;
  def : Pat<(nxv2i64 (AArch64duplaneq64 nxv2i64:$Op1, VectorIndexD32b_timm:$imm)),
            (!cast<Instruction>(NAME # _D) $Op1, $imm)>;
  def : Pat<(nxv8f16 (AArch64duplaneq16 nxv8f16:$Op1, VectorIndexH32b_timm:$imm)),
            (!cast<Instruction>(NAME # _H) $Op1, $imm)>;
  def : Pat<(nxv4f32 (AArch64duplaneq32 nxv4f32:$Op1, VectorIndexS32b_timm:$imm)),
            (!cast<Instruction>(NAME # _S) $Op1, $imm)>;
  def : Pat<(nxv2f64 (AArch64duplaneq64 nxv2f64:$Op1, VectorIndexD32b_timm:$imm)),
            (!cast<Instruction>(NAME # _D) $Op1, $imm)>;
  def : Pat<(nxv8bf16 (AArch64duplaneq16 nxv8bf16:$Op1, VectorIndexH32b_timm:$imm)),
            (!cast<Instruction>(NAME # _H) $Op1, $imm)>;
}


Expand Down
115 changes: 115 additions & 0 deletions llvm/test/CodeGen/AArch64/sve2p1-vector-shuffles.ll
@@ -0,0 +1,115 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s

define void @dupq_i8_256b(ptr %addr) #0 {
; CHECK-LABEL: dupq_i8_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: dupq z0.b, z0.b[11]
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%load = load <32 x i8>, ptr %addr
%splat.lanes = shufflevector <32 x i8> %load, <32 x i8> poison, <32 x i32> <i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11,
i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27, i32 27>
Collaborator: Maybe choose 15 and 31 here as indices to test boundaries?

Collaborator (Author): I figured a mix would work, as I have the 3,7 indices for i32 values, but sure.

Collaborator (Author): done

store <32 x i8> %splat.lanes, ptr %addr
ret void
}

define void @dupq_i16_256b(ptr %addr) #0 {
; CHECK-LABEL: dupq_i16_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: dupq z0.h, z0.h[2]
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%load = load <16 x i16>, ptr %addr
%splat.lanes = shufflevector <16 x i16> %load, <16 x i16> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
store <16 x i16> %splat.lanes, ptr %addr
ret void
}

define void @dupq_i32_256b(ptr %addr) #0 {
; CHECK-LABEL: dupq_i32_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: dupq z0.s, z0.s[3]
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%load = load <8 x i32>, ptr %addr
%splat.lanes = shufflevector <8 x i32> %load, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3,
i32 7, i32 7, i32 7, i32 7>
store <8 x i32> %splat.lanes, ptr %addr
ret void
}

define void @dupq_i64_256b(ptr %addr) #0 {
; CHECK-LABEL: dupq_i64_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: trn1 z0.d, z0.d, z0.d
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%load = load <4 x i64>, ptr %addr
%splat.lanes = shufflevector <4 x i64> %load, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
store <4 x i64> %splat.lanes, ptr %addr
ret void
}

define void @dupq_f16_256b(ptr %addr) #0 {
; CHECK-LABEL: dupq_f16_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: dupq z0.h, z0.h[2]
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%load = load <16 x half>, ptr %addr
%splat.lanes = shufflevector <16 x half> %load, <16 x half> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
store <16 x half> %splat.lanes, ptr %addr
ret void
}

define void @dupq_bf16_256b(ptr %addr) #0 {
; CHECK-LABEL: dupq_bf16_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: dup v0.8h, v0.h[2]
; CHECK-NEXT: dup v1.8h, v1.h[2]
; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ret
%load = load <16 x bfloat>, ptr %addr
%splat.lanes = shufflevector <16 x bfloat> %load, <16 x bfloat> poison, <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
store <16 x bfloat> %splat.lanes, ptr %addr
ret void
}

define void @dupq_f32_256b(ptr %addr) #0 {
; CHECK-LABEL: dupq_f32_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: dupq z0.s, z0.s[3]
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%load = load <8 x float>, ptr %addr
%splat.lanes = shufflevector <8 x float> %load, <8 x float> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3,
i32 7, i32 7, i32 7, i32 7>
store <8 x float> %splat.lanes, ptr %addr
ret void
}

define void @dupq_f64_256b(ptr %addr) #0 {
; CHECK-LABEL: dupq_f64_256b:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: trn1 z0.d, z0.d, z0.d
; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
%load = load <4 x double>, ptr %addr
%splat.lanes = shufflevector <4 x double> %load, <4 x double> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
store <4 x double> %splat.lanes, ptr %addr
ret void
}

attributes #0 = { noinline vscale_range(2,2) "target-features"="+sve2p1,+bf16" }