-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[GlobalISel][AArch64] Legalize G_EXTRACT_SUBVECTOR for SVE #114519
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
|
@llvm/pr-subscribers-llvm-globalisel Author: Thorsten Schütt (tschuett) ChangesWe use stores because return is not supported for smaller granules. {nxv2s16, nxv4s16} fails with: LLVM ERROR: cannot select: %0:zpr(<vscale x 4 x s16>) = G_TRUNC %2:fpr(<vscale x 4 x s32>) (in function: extract_nxv2i16_nxv4i16_1) Full diff: https://github.com/llvm/llvm-project/pull/114519.diff 4 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
index 471a7f70dd546c..a61943f29d18fb 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -267,6 +267,9 @@ class LegalizationArtifactCombiner {
const LLT DstTy = MRI.getType(DstReg);
Register SrcReg = lookThroughCopyInstrs(MI.getOperand(1).getReg());
+ if (DstTy.isScalableVector())
+ return false;
+
// Try to fold trunc(g_constant) when the smaller constant type is legal.
auto *SrcMI = MRI.getVRegDef(SrcReg);
if (SrcMI->getOpcode() == TargetOpcode::G_CONSTANT) {
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
index b7541effafe5ce..93e716a22814ca 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
@@ -196,6 +196,8 @@ LegalityPredicate LegalityPredicates::sameSize(unsigned TypeIdx0,
LegalityPredicate LegalityPredicates::memSizeInBytesNotPow2(unsigned MMOIdx) {
return [=](const LegalityQuery &Query) {
+ if (Query.MMODescrs[MMOIdx].MemoryTy.isScalableVector())
+ return true;
return !llvm::has_single_bit<uint32_t>(
Query.MMODescrs[MMOIdx].MemoryTy.getSizeInBytes());
};
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index f162d1c2973cbc..4a1f3555584fcb 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -61,11 +61,19 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
const LLT v2s64 = LLT::fixed_vector(2, 64);
const LLT v2p0 = LLT::fixed_vector(2, p0);
+ // 128 bit
const LLT nxv16s8 = LLT::scalable_vector(16, s8);
const LLT nxv8s16 = LLT::scalable_vector(8, s16);
const LLT nxv4s32 = LLT::scalable_vector(4, s32);
const LLT nxv2s64 = LLT::scalable_vector(2, s64);
+ // 64 bit
+ const LLT nxv4s16 = LLT::scalable_vector(4, s16);
+ const LLT nxv2s32 = LLT::scalable_vector(2, s32);
+
+ // 32 bit
+ const LLT nxv2s16 = LLT::scalable_vector(2, s16);
+
std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
v16s8, v8s16, v4s32,
v2s64, v2p0,
@@ -442,16 +450,20 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
{p0, p0, s64, 8}, {s128, p0, s128, 8}, {v16s8, p0, s128, 8},
{v8s8, p0, s64, 8}, {v4s16, p0, s64, 8}, {v8s16, p0, s128, 8},
{v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}})
- .legalForTypesWithMemDesc({
- // SVE vscale x 128 bit base sizes
- // TODO: Add nxv2p0. Consider bitcastIf.
- // See #92130
- // https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461
- {nxv16s8, p0, nxv16s8, 8},
- {nxv8s16, p0, nxv8s16, 8},
- {nxv4s32, p0, nxv4s32, 8},
- {nxv2s64, p0, nxv2s64, 8},
- })
+ .legalForTypesWithMemDesc(
+ {// SVE vscale x 128 bit base sizes
+ // TODO: Add nxv2p0. Consider bitcastIf.
+ // See #92130
+ // https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461
+ {nxv16s8, p0, nxv16s8, 8},
+ {nxv8s16, p0, nxv8s16, 8},
+ {nxv4s32, p0, nxv4s32, 8},
+ {nxv2s64, p0, nxv2s64, 8},
+ // SVE vscale x 64 bit base sizes
+ {nxv2s32, p0, nxv2s32, 8},
+ {nxv4s16, p0, nxv4s16, 8},
+ // SVE vscale x 32 bit base sizes
+ {nxv2s16, p0, nxv2s16, 8}})
.clampScalar(0, s8, s64)
.lowerIf([=](const LegalityQuery &Query) {
return Query.Types[0].isScalar() &&
@@ -639,17 +651,20 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder(G_TRUNC)
.legalFor({{v2s32, v2s64}, {v4s16, v4s32}, {v8s8, v8s16}})
+ .legalFor(HasSVE, {{nxv4s16, nxv4s32}})
.moreElementsToNextPow2(0)
.clampMaxNumElements(0, s8, 8)
.clampMaxNumElements(0, s16, 4)
.clampMaxNumElements(0, s32, 2)
.minScalarOrEltIf(
- [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
+ [=](const LegalityQuery &Query) {
+ return Query.Types[0].isFixedVector();
+ },
0, s8)
.lowerIf([=](const LegalityQuery &Query) {
LLT DstTy = Query.Types[0];
LLT SrcTy = Query.Types[1];
- return DstTy.isVector() && SrcTy.getSizeInBits() > 128 &&
+ return DstTy.isFixedVector() && SrcTy.getSizeInBits() > 128 &&
DstTy.getScalarSizeInBits() * 2 <= SrcTy.getScalarSizeInBits();
})
.clampMinNumElements(0, s8, 8)
@@ -1315,8 +1330,11 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder({G_SCMP, G_UCMP}).lower();
+ // FIXME: {nxv2s16, nxv4s16}
getActionDefinitionsBuilder(G_EXTRACT_SUBVECTOR)
.legalFor({{v8s8, v16s8}, {v4s16, v8s16}, {v2s32, v4s32}})
+ .legalFor(HasSVE,
+ {{nxv2s16, nxv8s16}, {nxv4s16, nxv8s16}, {nxv2s32, nxv4s32}})
.widenScalarOrEltToNextPow2(0)
.immIdx(0); // Inform verifier imm idx 0 is handled.
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/extract_subvector.ll b/llvm/test/CodeGen/AArch64/GlobalISel/extract_subvector.ll
new file mode 100644
index 00000000000000..ab302071b815a3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/extract_subvector.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple aarch64 -mattr=+sve | FileCheck %s
+; RUN: llc < %s -mtriple aarch64 -mattr=+sve -global-isel -aarch64-enable-gisel-sve=1 | FileCheck %s
+
+;; RUN: llc -global-isel -mtriple=aarch64-linux-gnu -mattr=+sve -O0 -aarch64-enable-gisel-sve=1 -stop-after=irtranslator < %s | FileCheck %s
+
+define void @extract_nxv2i32_nxv4i32_1(<vscale x 4 x i32> %arg, ptr %p) {
+; CHECK-LABEL: extract_nxv2i32_nxv4i32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: st1w { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ %ext = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> %arg, i64 0)
+ store <vscale x 2 x i32> %ext, ptr %p
+ ret void
+}
+
+define void @extract_nxv4i16_nxv8i16_1(<vscale x 8 x i16> %arg, ptr %p) {
+; CHECK-LABEL: extract_nxv4i16_nxv8i16_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: st1h { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ %ext = call <vscale x 4 x i16> @llvm.vector.extract.nxv4i16.nxv8i16(<vscale x 8 x i16> %arg, i64 0)
+ store <vscale x 4 x i16> %ext, ptr %p
+ ret void
+}
+
+define void @extract_nxv2i16_nxv8i16_1(<vscale x 8 x i16> %arg, ptr %p) {
+; CHECK-LABEL: extract_nxv2i16_nxv8i16_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ %ext = call <vscale x 2 x i16> @llvm.vector.extract.nxv2i16.nxv8i16(<vscale x 8 x i16> %arg, i64 2)
+ store <vscale x 2 x i16> %ext, ptr %p
+ ret void
+}
+
+define void @extract_nxv2i16_nxv8i16(<vscale x 8 x i16> %arg, ptr %p) {
+; CHECK-LABEL: extract_nxv2i16_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ %ext = call <vscale x 2 x i16> @llvm.vector.extract.nxv2i16.nxv8i16(<vscale x 8 x i16> %arg, i64 0)
+ store <vscale x 2 x i16> %ext, ptr %p
+ ret void
+}
|
|
@llvm/pr-subscribers-backend-aarch64 Author: Thorsten Schütt (tschuett) ChangesWe use stores because return is not supported for smaller granules. {nxv2s16, nxv4s16} fails with: LLVM ERROR: cannot select: %0:zpr(<vscale x 4 x s16>) = G_TRUNC %2:fpr(<vscale x 4 x s32>) (in function: extract_nxv2i16_nxv4i16_1) Full diff: https://github.com/llvm/llvm-project/pull/114519.diff 4 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
index 471a7f70dd546c..a61943f29d18fb 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h
@@ -267,6 +267,9 @@ class LegalizationArtifactCombiner {
const LLT DstTy = MRI.getType(DstReg);
Register SrcReg = lookThroughCopyInstrs(MI.getOperand(1).getReg());
+ if (DstTy.isScalableVector())
+ return false;
+
// Try to fold trunc(g_constant) when the smaller constant type is legal.
auto *SrcMI = MRI.getVRegDef(SrcReg);
if (SrcMI->getOpcode() == TargetOpcode::G_CONSTANT) {
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
index b7541effafe5ce..93e716a22814ca 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
@@ -196,6 +196,8 @@ LegalityPredicate LegalityPredicates::sameSize(unsigned TypeIdx0,
LegalityPredicate LegalityPredicates::memSizeInBytesNotPow2(unsigned MMOIdx) {
return [=](const LegalityQuery &Query) {
+ if (Query.MMODescrs[MMOIdx].MemoryTy.isScalableVector())
+ return true;
return !llvm::has_single_bit<uint32_t>(
Query.MMODescrs[MMOIdx].MemoryTy.getSizeInBytes());
};
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index f162d1c2973cbc..4a1f3555584fcb 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -61,11 +61,19 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
const LLT v2s64 = LLT::fixed_vector(2, 64);
const LLT v2p0 = LLT::fixed_vector(2, p0);
+ // 128 bit
const LLT nxv16s8 = LLT::scalable_vector(16, s8);
const LLT nxv8s16 = LLT::scalable_vector(8, s16);
const LLT nxv4s32 = LLT::scalable_vector(4, s32);
const LLT nxv2s64 = LLT::scalable_vector(2, s64);
+ // 64 bit
+ const LLT nxv4s16 = LLT::scalable_vector(4, s16);
+ const LLT nxv2s32 = LLT::scalable_vector(2, s32);
+
+ // 32 bit
+ const LLT nxv2s16 = LLT::scalable_vector(2, s16);
+
std::initializer_list<LLT> PackedVectorAllTypeList = {/* Begin 128bit types */
v16s8, v8s16, v4s32,
v2s64, v2p0,
@@ -442,16 +450,20 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
{p0, p0, s64, 8}, {s128, p0, s128, 8}, {v16s8, p0, s128, 8},
{v8s8, p0, s64, 8}, {v4s16, p0, s64, 8}, {v8s16, p0, s128, 8},
{v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}})
- .legalForTypesWithMemDesc({
- // SVE vscale x 128 bit base sizes
- // TODO: Add nxv2p0. Consider bitcastIf.
- // See #92130
- // https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461
- {nxv16s8, p0, nxv16s8, 8},
- {nxv8s16, p0, nxv8s16, 8},
- {nxv4s32, p0, nxv4s32, 8},
- {nxv2s64, p0, nxv2s64, 8},
- })
+ .legalForTypesWithMemDesc(
+ {// SVE vscale x 128 bit base sizes
+ // TODO: Add nxv2p0. Consider bitcastIf.
+ // See #92130
+ // https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461
+ {nxv16s8, p0, nxv16s8, 8},
+ {nxv8s16, p0, nxv8s16, 8},
+ {nxv4s32, p0, nxv4s32, 8},
+ {nxv2s64, p0, nxv2s64, 8},
+ // SVE vscale x 64 bit base sizes
+ {nxv2s32, p0, nxv2s32, 8},
+ {nxv4s16, p0, nxv4s16, 8},
+ // SVE vscale x 32 bit base sizes
+ {nxv2s16, p0, nxv2s16, 8}})
.clampScalar(0, s8, s64)
.lowerIf([=](const LegalityQuery &Query) {
return Query.Types[0].isScalar() &&
@@ -639,17 +651,20 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder(G_TRUNC)
.legalFor({{v2s32, v2s64}, {v4s16, v4s32}, {v8s8, v8s16}})
+ .legalFor(HasSVE, {{nxv4s16, nxv4s32}})
.moreElementsToNextPow2(0)
.clampMaxNumElements(0, s8, 8)
.clampMaxNumElements(0, s16, 4)
.clampMaxNumElements(0, s32, 2)
.minScalarOrEltIf(
- [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
+ [=](const LegalityQuery &Query) {
+ return Query.Types[0].isFixedVector();
+ },
0, s8)
.lowerIf([=](const LegalityQuery &Query) {
LLT DstTy = Query.Types[0];
LLT SrcTy = Query.Types[1];
- return DstTy.isVector() && SrcTy.getSizeInBits() > 128 &&
+ return DstTy.isFixedVector() && SrcTy.getSizeInBits() > 128 &&
DstTy.getScalarSizeInBits() * 2 <= SrcTy.getScalarSizeInBits();
})
.clampMinNumElements(0, s8, 8)
@@ -1315,8 +1330,11 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
getActionDefinitionsBuilder({G_SCMP, G_UCMP}).lower();
+ // FIXME: {nxv2s16, nxv4s16}
getActionDefinitionsBuilder(G_EXTRACT_SUBVECTOR)
.legalFor({{v8s8, v16s8}, {v4s16, v8s16}, {v2s32, v4s32}})
+ .legalFor(HasSVE,
+ {{nxv2s16, nxv8s16}, {nxv4s16, nxv8s16}, {nxv2s32, nxv4s32}})
.widenScalarOrEltToNextPow2(0)
.immIdx(0); // Inform verifier imm idx 0 is handled.
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/extract_subvector.ll b/llvm/test/CodeGen/AArch64/GlobalISel/extract_subvector.ll
new file mode 100644
index 00000000000000..ab302071b815a3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/extract_subvector.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple aarch64 -mattr=+sve | FileCheck %s
+; RUN: llc < %s -mtriple aarch64 -mattr=+sve -global-isel -aarch64-enable-gisel-sve=1 | FileCheck %s
+
+;; RUN: llc -global-isel -mtriple=aarch64-linux-gnu -mattr=+sve -O0 -aarch64-enable-gisel-sve=1 -stop-after=irtranslator < %s | FileCheck %s
+
+define void @extract_nxv2i32_nxv4i32_1(<vscale x 4 x i32> %arg, ptr %p) {
+; CHECK-LABEL: extract_nxv2i32_nxv4i32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: st1w { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ %ext = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> %arg, i64 0)
+ store <vscale x 2 x i32> %ext, ptr %p
+ ret void
+}
+
+define void @extract_nxv4i16_nxv8i16_1(<vscale x 8 x i16> %arg, ptr %p) {
+; CHECK-LABEL: extract_nxv4i16_nxv8i16_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: st1h { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ %ext = call <vscale x 4 x i16> @llvm.vector.extract.nxv4i16.nxv8i16(<vscale x 8 x i16> %arg, i64 0)
+ store <vscale x 4 x i16> %ext, ptr %p
+ ret void
+}
+
+define void @extract_nxv2i16_nxv8i16_1(<vscale x 8 x i16> %arg, ptr %p) {
+; CHECK-LABEL: extract_nxv2i16_nxv8i16_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpkhi z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ %ext = call <vscale x 2 x i16> @llvm.vector.extract.nxv2i16.nxv8i16(<vscale x 8 x i16> %arg, i64 2)
+ store <vscale x 2 x i16> %ext, ptr %p
+ ret void
+}
+
+define void @extract_nxv2i16_nxv8i16(<vscale x 8 x i16> %arg, ptr %p) {
+; CHECK-LABEL: extract_nxv2i16_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ %ext = call <vscale x 2 x i16> @llvm.vector.extract.nxv2i16.nxv8i16(<vscale x 8 x i16> %arg, i64 0)
+ store <vscale x 2 x i16> %ext, ptr %p
+ ret void
+}
|
|
We select G_TRUNC in C++, I bet zpr is missing:
|
|
Please review the last commit. |
We use stores because return is not supported for smaller granules.
{nxv2s16, nxv4s16} fails with:
LLVM ERROR: cannot select: %0:zpr(<vscale x 4 x s16>) = G_TRUNC %2:fpr(<vscale x 4 x s32>) (in function: extract_nxv2i16_nxv4i16_1)
| if (DstTy.isScalableVector()) | ||
| return false; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can hoist this up to the getType
|
I'm not sure if the smaller vectors should be considered legal in SVE. There are nxv2i32->nxv4i32 extending loads / truncating stores, but the types of the vectors returned are still vscale x 128 bit. SDAG had some smaller legal fp types, but that might work differently in GISel where we don't have the fp vs int type information, and we might be able to do the legalization earlier, not during selection. |
|
For some context, I scraped the extract_subvector patterns from llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td. There are cases where extract returns small vectors: The stores and loads play around the fact that we cannot return non 128-bit vectors. |
We use stores because return is not supported for smaller granules.
{nxv2s16, nxv4s16} fails with:
LLVM ERROR: cannot select: %0:zpr(<vscale x 4 x s16>) = G_TRUNC %2:fpr(<vscale x 4 x s32>) (in function: extract_nxv2i16_nxv4i16_1)