Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18602,6 +18602,30 @@ static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
SeenZExtOrSExt = true;
}

// Avoid the said use of vector SExt/ZExt in case all vector elements are
// consumed and each shuffle's mask uses same index (== homogeneous), in order
// to permit use of indexed OP (aka by-element) instruction variant. Example
// OPs: MLA, MUL.
EVT ExtendType = Extend->getValueType(0);
if (ExtendType.isVector() && !ExtendType.isScalableVT()) {
SmallBitVector UsedElements(ExtendType.getVectorNumElements(), false);
for (SDNode *User : Extend.getNode()->users()) {
// look for shuffles whose first operand is our Extend
if (User->getOpcode() != ISD::VECTOR_SHUFFLE ||
User->getOperand(0) != Extend)
continue;
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(User)->getMask();
const int Idx = Mask[0];
// early exit if a shuffle mask isn't homogeneous
if (!all_of(Mask, [Idx](int M) { return M == Idx; }))
break;
UsedElements.set(Idx);
Comment on lines +18617 to +18622
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(User)->getMask();
const int Idx = Mask[0];
// early exit if a shuffle mask isn't homogeneous
if (!all_of(Mask, [Idx](int M) { return M == Idx; }))
break;
UsedElements.set(Idx);
auto *Shuffle = cast<ShuffleVectorSDNode>(User);
if (!Shuffle->isSplat())
break;
UsedElements.set(Shuffle->getSplatIndex());

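I think you also need to check that the splat index refers to an element of operand 0, i.e. that it is smaller than the number of elements in the Extend vector: a shuffle mask can also select lanes from operand 1, and such an index would fall outside UsedElements.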

}
// Verified relevant shuffles cover all elements of the Extend vector
if (UsedElements.all())
return SDValue();
}

SDValue NBV;
SDLoc DL(BV);
if (BV.getOpcode() == ISD::BUILD_VECTOR) {
Expand Down
218 changes: 218 additions & 0 deletions llvm/test/CodeGen/AArch64/aarch64-ext-shuffle.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple aarch64-none-linux-gnu | FileCheck %s --check-prefixes=CHECK-SD
; RUN: llc < %s -mtriple aarch64-none-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=CHECK-GI
; All-lanes case: %l is sign-extended once to <4 x i32> and each of its four
; lanes (0-3) is splatted by its own shufflevector. Every splat feeds a mul by
; %a followed by an add of %b.
; The expected output of both llc runs is a single sshll followed by four
; by-element mla instructions (v0.s[0] .. v0.s[3]).
define <4 x i32> @ext_shuffle_v4i16_v4i32(<4 x i16> %l, <4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: ext_shuffle_v4i16_v4i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: mov v3.16b, v2.16b
; CHECK-SD-NEXT: mov v4.16b, v2.16b
; CHECK-SD-NEXT: mov v5.16b, v2.16b
; CHECK-SD-NEXT: mla v3.4s, v1.4s, v0.s[0]
; CHECK-SD-NEXT: mla v4.4s, v1.4s, v0.s[1]
; CHECK-SD-NEXT: mla v2.4s, v1.4s, v0.s[3]
; CHECK-SD-NEXT: mla v5.4s, v1.4s, v0.s[2]
; CHECK-SD-NEXT: sub v0.4s, v3.4s, v4.4s
; CHECK-SD-NEXT: sub v1.4s, v2.4s, v5.4s
; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: ext_shuffle_v4i16_v4i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-NEXT: mov v3.16b, v2.16b
; CHECK-GI-NEXT: mov v4.16b, v2.16b
; CHECK-GI-NEXT: mov v5.16b, v2.16b
; CHECK-GI-NEXT: mla v3.4s, v1.4s, v0.s[0]
; CHECK-GI-NEXT: mla v4.4s, v1.4s, v0.s[1]
; CHECK-GI-NEXT: mla v2.4s, v1.4s, v0.s[3]
; CHECK-GI-NEXT: mla v5.4s, v1.4s, v0.s[2]
; CHECK-GI-NEXT: sub v0.4s, v3.4s, v4.4s
; CHECK-GI-NEXT: sub v1.4s, v5.4s, v2.4s
; CHECK-GI-NEXT: sub v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: ret
; The one extend shared by all four splat shuffles below.
%lanes = sext <4 x i16> %l to <4 x i32>
; Lane 0 splat -> mul -> add.
%shf0 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> zeroinitializer
%mul0 = mul <4 x i32> %shf0, %a
%add0 = add <4 x i32> %mul0, %b
; Lane 1 splat -> mul -> add.
%shf1 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%mul1 = mul <4 x i32> %shf1, %a
%add1 = add <4 x i32> %mul1, %b
; Lane 2 splat -> mul -> add.
%shf2 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
%mul2 = mul <4 x i32> %shf2, %a
%add2 = add <4 x i32> %mul2, %b
; Lane 3 splat -> mul -> add.
%shf3 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul3 = mul <4 x i32> %shf3, %a
%add3 = add <4 x i32> %mul3, %b
; Fold the four per-lane results into the returned value.
%sub1 = sub <4 x i32> %add0, %add1
%sub2 = sub <4 x i32> %add2, %add3
%sub3 = sub <4 x i32> %sub1, %sub2
ret <4 x i32> %sub3
}

; Partial case: only lanes 0, 1 and 2 of the sign-extended vector are splatted;
; lane 3 is never referenced. Each splat feeds a mul by %a and an add of %b.
; In the expected output of the default llc run each 16-bit lane is duplicated
; first (dup .4h) and extended separately (three sshll), followed by
; vector-form mla. The -global-isel run expects one sshll and by-element mla.
define <4 x i32> @ext_shuffle_v4i16_v4i32_partial(<4 x i16> %l, <4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: ext_shuffle_v4i16_v4i32_partial:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: dup v3.4h, v0.h[0]
; CHECK-SD-NEXT: dup v4.4h, v0.h[1]
; CHECK-SD-NEXT: mov v5.16b, v2.16b
; CHECK-SD-NEXT: dup v0.4h, v0.h[2]
; CHECK-SD-NEXT: mov v6.16b, v2.16b
; CHECK-SD-NEXT: sshll v3.4s, v3.4h, #0
; CHECK-SD-NEXT: sshll v4.4s, v4.4h, #0
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: mla v5.4s, v3.4s, v1.4s
; CHECK-SD-NEXT: mla v6.4s, v4.4s, v1.4s
; CHECK-SD-NEXT: mla v2.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: sub v0.4s, v5.4s, v6.4s
; CHECK-SD-NEXT: sub v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: ext_shuffle_v4i16_v4i32_partial:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-NEXT: mov v3.16b, v2.16b
; CHECK-GI-NEXT: mov v4.16b, v2.16b
; CHECK-GI-NEXT: mla v3.4s, v1.4s, v0.s[0]
; CHECK-GI-NEXT: mla v4.4s, v1.4s, v0.s[1]
; CHECK-GI-NEXT: mla v2.4s, v1.4s, v0.s[2]
; CHECK-GI-NEXT: sub v0.4s, v3.4s, v4.4s
; CHECK-GI-NEXT: sub v0.4s, v0.4s, v2.4s
; CHECK-GI-NEXT: ret
; The one extend shared by the three splat shuffles below.
%lanes = sext <4 x i16> %l to <4 x i32>
; Lane 0 splat -> mul -> add.
%shf0 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> zeroinitializer
%mul0 = mul <4 x i32> %shf0, %a
%add0 = add <4 x i32> %mul0, %b
; Lane 1 splat -> mul -> add.
%shf1 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%mul1 = mul <4 x i32> %shf1, %a
%add1 = add <4 x i32> %mul1, %b
; Lane 2 splat -> mul -> add. There is deliberately no lane 3 splat.
%shf2 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
%mul2 = mul <4 x i32> %shf2, %a
%add2 = add <4 x i32> %mul2, %b
; Fold the three per-lane results into the returned value.
%sub1 = sub <4 x i32> %add0, %add1
%sub3 = sub <4 x i32> %sub1, %add2
ret <4 x i32> %sub3
}

; Add-only case: all four lanes of the sign-extended vector are splatted, but
; each splat feeds an add of %b directly. There is no mul and %a is unused.
; The expected output of both llc runs is one sshll, four dup .4s taken from
; the extended vector, and plain vector adds.
define <4 x i32> @ext_shuffle_v4i16_v4i32_add(<4 x i16> %l, <4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: ext_shuffle_v4i16_v4i32_add:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: dup v1.4s, v0.s[0]
; CHECK-SD-NEXT: dup v3.4s, v0.s[1]
; CHECK-SD-NEXT: dup v4.4s, v0.s[2]
; CHECK-SD-NEXT: dup v0.4s, v0.s[3]
; CHECK-SD-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-SD-NEXT: add v3.4s, v3.4s, v2.4s
; CHECK-SD-NEXT: add v4.4s, v4.4s, v2.4s
; CHECK-SD-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: sub v1.4s, v1.4s, v3.4s
; CHECK-SD-NEXT: sub v0.4s, v0.4s, v4.4s
; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: ext_shuffle_v4i16_v4i32_add:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-NEXT: dup v1.4s, v0.s[0]
; CHECK-GI-NEXT: dup v3.4s, v0.s[1]
; CHECK-GI-NEXT: dup v4.4s, v0.s[2]
; CHECK-GI-NEXT: dup v0.4s, v0.s[3]
; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-GI-NEXT: add v3.4s, v3.4s, v2.4s
; CHECK-GI-NEXT: add v4.4s, v4.4s, v2.4s
; CHECK-GI-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-GI-NEXT: sub v1.4s, v1.4s, v3.4s
; CHECK-GI-NEXT: sub v0.4s, v4.4s, v0.4s
; CHECK-GI-NEXT: sub v0.4s, v1.4s, v0.4s
; CHECK-GI-NEXT: ret
; The one extend shared by all four splat shuffles below.
%lanes = sext <4 x i16> %l to <4 x i32>
; Lane 0 splat -> add.
%shf0 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> zeroinitializer
%add0 = add <4 x i32> %shf0, %b
; Lane 1 splat -> add.
%shf1 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%add1 = add <4 x i32> %shf1, %b
; Lane 2 splat -> add.
%shf2 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
%add2 = add <4 x i32> %shf2, %b
; Lane 3 splat -> add.
%shf3 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%add3 = add <4 x i32> %shf3, %b
; Fold the four per-lane results into the returned value.
%sub1 = sub <4 x i32> %add0, %add1
%sub2 = sub <4 x i32> %add2, %add3
%sub3 = sub <4 x i32> %sub1, %sub2
ret <4 x i32> %sub3
}

; Single-lane case: only lane 3 of the sign-extended vector is splatted, then
; multiplied by %a and added to %b.
; The expected output of the default llc run duplicates the 16-bit lane first
; (dup v0.4h, v0.h[3]), then extends it and uses vector-form mla. The
; -global-isel run expects the extend first and a by-element mla on lane 3.
define <4 x i32> @ext_shuffle_v4i16_v4i32_one(<4 x i16> %l, <4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: ext_shuffle_v4i16_v4i32_one:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT: dup v0.4h, v0.h[3]
; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT: mla v2.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: mov v0.16b, v2.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: ext_shuffle_v4i16_v4i32_one:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: sshll v3.4s, v0.4h, #0
; CHECK-GI-NEXT: mov v0.16b, v2.16b
; CHECK-GI-NEXT: mla v0.4s, v1.4s, v3.s[3]
; CHECK-GI-NEXT: ret
; Extend with exactly one user: the lane 3 splat below.
%lanes = sext <4 x i16> %l to <4 x i32>
; Lane 3 splat -> mul -> add.
%shf3 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul3 = mul <4 x i32> %shf3, %a
%add3 = add <4 x i32> %mul3, %b
ret <4 x i32> %add3
}

; Extra-user case: all four lanes of the sign-extended vector are splatted and
; fed through mul by %a and add of %b, and in addition %lanes itself is used
; directly (unshuffled) by the final sub.
; The expected output of both llc runs is a single sshll, four by-element mla
; instructions, and an ssubw that consumes the original narrow input (v0.4h).
define <4 x i32> @ext_shuffle_v4i16_v4i32_extra_user(<4 x i16> %l, <4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: ext_shuffle_v4i16_v4i32_extra_user:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: sshll v3.4s, v0.4h, #0
; CHECK-SD-NEXT: mov v4.16b, v2.16b
; CHECK-SD-NEXT: mov v5.16b, v2.16b
; CHECK-SD-NEXT: mov v6.16b, v2.16b
; CHECK-SD-NEXT: mla v4.4s, v1.4s, v3.s[0]
; CHECK-SD-NEXT: mla v5.4s, v1.4s, v3.s[1]
; CHECK-SD-NEXT: mla v2.4s, v1.4s, v3.s[3]
; CHECK-SD-NEXT: mla v6.4s, v1.4s, v3.s[2]
; CHECK-SD-NEXT: sub v1.4s, v4.4s, v5.4s
; CHECK-SD-NEXT: sub v2.4s, v2.4s, v6.4s
; CHECK-SD-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-SD-NEXT: ssubw v0.4s, v1.4s, v0.4h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: ext_shuffle_v4i16_v4i32_extra_user:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: sshll v3.4s, v0.4h, #0
; CHECK-GI-NEXT: mov v4.16b, v2.16b
; CHECK-GI-NEXT: mov v5.16b, v2.16b
; CHECK-GI-NEXT: mov v6.16b, v2.16b
; CHECK-GI-NEXT: mla v4.4s, v1.4s, v3.s[0]
; CHECK-GI-NEXT: mla v5.4s, v1.4s, v3.s[1]
; CHECK-GI-NEXT: mla v2.4s, v1.4s, v3.s[3]
; CHECK-GI-NEXT: mla v6.4s, v1.4s, v3.s[2]
; CHECK-GI-NEXT: sub v1.4s, v4.4s, v5.4s
; CHECK-GI-NEXT: sub v2.4s, v6.4s, v2.4s
; CHECK-GI-NEXT: sub v1.4s, v1.4s, v2.4s
; CHECK-GI-NEXT: ssubw v0.4s, v1.4s, v0.4h
; CHECK-GI-NEXT: ret
; Extend used by the four splat shuffles and by the final sub.
%lanes = sext <4 x i16> %l to <4 x i32>
; Lane 0 splat -> mul -> add.
%shf0 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> zeroinitializer
%mul0 = mul <4 x i32> %shf0, %a
%add0 = add <4 x i32> %mul0, %b
; Lane 1 splat -> mul -> add.
%shf1 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%mul1 = mul <4 x i32> %shf1, %a
%add1 = add <4 x i32> %mul1, %b
; Lane 2 splat -> mul -> add.
%shf2 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
%mul2 = mul <4 x i32> %shf2, %a
%add2 = add <4 x i32> %mul2, %b
; Lane 3 splat -> mul -> add.
%shf3 = shufflevector <4 x i32> %lanes, <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
%mul3 = mul <4 x i32> %shf3, %a
%add3 = add <4 x i32> %mul3, %b
; Fold the four per-lane results together.
%sub1 = sub <4 x i32> %add0, %add1
%sub2 = sub <4 x i32> %add2, %add3
%sub3 = sub <4 x i32> %sub1, %sub2
; The additional, non-shuffle use of the extended vector.
%sub4 = sub <4 x i32> %sub3, %lanes
ret <4 x i32> %sub4
}
Loading