
Commit 8c33ea3

david-arm authored and tstellar committed
[SVE][CodeGen] Bail out for scalable vectors in AArch64TargetLowering::ReconstructShuffle
Previously, the code in AArch64TargetLowering::ReconstructShuffle assumed the input vectors were always fixed-width. However, this is not always the case, since you can extract elements from scalable vectors and insert them into fixed-width ones. We were hitting crashes here for two different cases:

1. When lowering a fixed-length vector extract from a scalable vector with i1 element types. This happens because the i1 elements get promoted to larger integer types for fixed-width vectors, which leads to sequences of INSERT_VECTOR_ELT and EXTRACT_VECTOR_ELT nodes. In this case AArch64TargetLowering::ReconstructShuffle still fails to make a transformation, but at least it no longer crashes.

2. When lowering a sequence of extractelement/insertelement operations on mixed fixed-width/scalable vectors.

For now, I've just changed AArch64TargetLowering::ReconstructShuffle to bail out if it finds a scalable vector.

Tests for both cases described above have been added here:

(1) CodeGen/AArch64/sve-extract-fixed-vector.ll
(2) CodeGen/AArch64/sve-fixed-length-reshuffle.ll

Differential Revision: https://reviews.llvm.org/D116602

(cherry picked from commit a57a7f3)
1 parent 1362f8b commit 8c33ea3
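
As context for the crashes described in the commit message above, the following is a minimal standalone C++ sketch of the failure mode. SimpleVT and canUseAsShuffleSource are hypothetical stand-ins, not LLVM's EVT/TypeSize API or the real ReconstructShuffle; the sketch only models why a routine that assumes fixed-width sources has to reject scalable vector types before asking for a fixed size in bits.

// Minimal standalone sketch, not LLVM's real classes: SimpleVT and
// canUseAsShuffleSource are hypothetical stand-ins that model why a lowering
// routine assuming fixed-width inputs must bail out on scalable vectors
// before querying a fixed size in bits.
#include <cassert>
#include <cstdint>
#include <iostream>

struct SimpleVT {
  uint64_t MinSizeInBits; // known-minimum size in bits
  bool Scalable;          // true for <vscale x N x ...> types

  bool isScalableVector() const { return Scalable; }

  // Mirrors the contract of a fixed-size query: only valid for fixed-width
  // types, so calling it on a scalable type is a hard error.
  uint64_t getFixedSizeInBits() const {
    assert(!Scalable && "fixed-size query on a scalable type");
    return MinSizeInBits;
  }
};

// The bail-out: reject scalable sources up front so every later fixed-size
// query is safe.
bool canUseAsShuffleSource(const SimpleVT &SrcVT, uint64_t ResultBits) {
  if (SrcVT.isScalableVector())
    return false; // "Reshuffle failed": only fixed-width sources are handled.
  return SrcVT.getFixedSizeInBits() == ResultBits;
}

int main() {
  SimpleVT Fixed{128, /*Scalable=*/false};   // e.g. <4 x i32>
  SimpleVT Scalable{128, /*Scalable=*/true}; // e.g. <vscale x 4 x i32>
  std::cout << canUseAsShuffleSource(Fixed, 128) << '\n';    // prints 1
  std::cout << canUseAsShuffleSource(Scalable, 128) << '\n'; // prints 0
  return 0;
}

The actual change in the diff below adds this kind of early bail-out to AArch64TargetLowering::ReconstructShuffle.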

File tree

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll

3 files changed: +145 -7 lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 8 additions & 7 deletions

@@ -8990,12 +8990,13 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
     if (V.isUndef())
       continue;
     else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-             !isa<ConstantSDNode>(V.getOperand(1))) {
+             !isa<ConstantSDNode>(V.getOperand(1)) ||
+             V.getOperand(0).getValueType().isScalableVector()) {
       LLVM_DEBUG(
           dbgs() << "Reshuffle failed: "
                     "a shuffle can only come from building a vector from "
-                    "various elements of other vectors, provided their "
-                    "indices are constant\n");
+                    "various elements of other fixed-width vectors, provided "
+                    "their indices are constant\n");
       return SDValue();
     }

@@ -9039,8 +9040,8 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
   for (auto &Src : Sources) {
     EVT SrcVT = Src.ShuffleVec.getValueType();

-    uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
-    if (SrcVTSize == VTSize)
+    TypeSize SrcVTSize = SrcVT.getSizeInBits();
+    if (SrcVTSize == TypeSize::Fixed(VTSize))
      continue;

    // This stage of the search produces a source with the same element type as

@@ -9049,7 +9050,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
    unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);

-    if (SrcVTSize < VTSize) {
+    if (SrcVTSize.getFixedValue() < VTSize) {
      assert(2 * SrcVTSize == VTSize);
      // We can pad out the smaller vector for free, so if it's part of a
      // shuffle...

@@ -9059,7 +9060,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
      continue;
    }

-    if (SrcVTSize != 2 * VTSize) {
+    if (SrcVTSize.getFixedValue() != 2 * VTSize) {
      LLVM_DEBUG(
          dbgs() << "Reshuffle failed: result vector too small to extract\n");
      return SDValue();

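A side note on the size checks in the hunks above: the raw uint64_t fixed-size query is replaced with a TypeSize-based comparison, and in TypeSize-style semantics a scalable size never compares equal to a fixed size with the same minimum bit count. The standalone sketch below models that comparison; MyTypeSize is a hypothetical simplification, not llvm::TypeSize itself.

// Hypothetical MyTypeSize, a simplified model of the comparison idiom used
// above: equality requires both the minimum value and the scalable flag to
// match, so a fixed-size query is never needed just to compare sizes.
#include <cstdint>
#include <iostream>

struct MyTypeSize {
  uint64_t MinValue; // minimum size in bits
  bool Scalable;     // scaled by vscale at run time if true

  static MyTypeSize Fixed(uint64_t Bits) { return {Bits, false}; }

  friend bool operator==(const MyTypeSize &L, const MyTypeSize &R) {
    return L.MinValue == R.MinValue && L.Scalable == R.Scalable;
  }
};

int main() {
  MyTypeSize FixedSrc{128, false};   // e.g. <4 x i32>
  MyTypeSize ScalableSrc{128, true}; // e.g. <vscale x 4 x i32>
  std::cout << (FixedSrc == MyTypeSize::Fixed(128)) << '\n';    // prints 1
  std::cout << (ScalableSrc == MyTypeSize::Fixed(128)) << '\n'; // prints 0
  return 0;
}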
llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll

Lines changed: 105 additions & 0 deletions

@@ -361,6 +361,106 @@ define <16 x i8> @extract_v16i8_nxv2i8_idx16(<vscale x 2 x i8> %vec) nounwind #1
   ret <16 x i8> %retval
 }

+
+; Predicates
+
+define <2 x i1> @extract_v2i1_nxv2i1(<vscale x 2 x i1> %inmask) {
+; CHECK-LABEL: extract_v2i1_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, p0/z, #1 // =0x1
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    mov v0.s[1], w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %mask = call <2 x i1> @llvm.experimental.vector.extract.v2i1.nxv2i1(<vscale x 2 x i1> %inmask, i64 0)
+  ret <2 x i1> %mask
+}
+
+define <4 x i1> @extract_v4i1_nxv4i1(<vscale x 4 x i1> %inmask) {
+; CHECK-LABEL: extract_v4i1_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.s, p0/z, #1 // =0x1
+; CHECK-NEXT:    mov w8, v1.s[1]
+; CHECK-NEXT:    mov w9, v1.s[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.h[1], w8
+; CHECK-NEXT:    mov w8, v1.s[3]
+; CHECK-NEXT:    mov v0.h[2], w9
+; CHECK-NEXT:    mov v0.h[3], w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %mask = call <4 x i1> @llvm.experimental.vector.extract.v4i1.nxv4i1(<vscale x 4 x i1> %inmask, i64 0)
+  ret <4 x i1> %mask
+}
+
+define <8 x i1> @extract_v8i1_nxv8i1(<vscale x 8 x i1> %inmask) {
+; CHECK-LABEL: extract_v8i1_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.h, p0/z, #1 // =0x1
+; CHECK-NEXT:    umov w8, v1.h[1]
+; CHECK-NEXT:    umov w9, v1.h[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.b[1], w8
+; CHECK-NEXT:    umov w8, v1.h[3]
+; CHECK-NEXT:    mov v0.b[2], w9
+; CHECK-NEXT:    umov w9, v1.h[4]
+; CHECK-NEXT:    mov v0.b[3], w8
+; CHECK-NEXT:    umov w8, v1.h[5]
+; CHECK-NEXT:    mov v0.b[4], w9
+; CHECK-NEXT:    umov w9, v1.h[6]
+; CHECK-NEXT:    mov v0.b[5], w8
+; CHECK-NEXT:    umov w8, v1.h[7]
+; CHECK-NEXT:    mov v0.b[6], w9
+; CHECK-NEXT:    mov v0.b[7], w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %mask = call <8 x i1> @llvm.experimental.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> %inmask, i64 0)
+  ret <8 x i1> %mask
+}
+
+define <16 x i1> @extract_v16i1_nxv16i1(<vscale x 16 x i1> %inmask) {
+; CHECK-LABEL: extract_v16i1_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.b, p0/z, #1 // =0x1
+; CHECK-NEXT:    umov w8, v1.b[1]
+; CHECK-NEXT:    umov w9, v1.b[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.b[1], w8
+; CHECK-NEXT:    umov w8, v1.b[3]
+; CHECK-NEXT:    mov v0.b[2], w9
+; CHECK-NEXT:    umov w9, v1.b[4]
+; CHECK-NEXT:    mov v0.b[3], w8
+; CHECK-NEXT:    umov w8, v1.b[5]
+; CHECK-NEXT:    mov v0.b[4], w9
+; CHECK-NEXT:    umov w9, v1.b[6]
+; CHECK-NEXT:    mov v0.b[5], w8
+; CHECK-NEXT:    umov w8, v1.b[7]
+; CHECK-NEXT:    mov v0.b[6], w9
+; CHECK-NEXT:    umov w9, v1.b[8]
+; CHECK-NEXT:    mov v0.b[7], w8
+; CHECK-NEXT:    umov w8, v1.b[9]
+; CHECK-NEXT:    mov v0.b[8], w9
+; CHECK-NEXT:    umov w9, v1.b[10]
+; CHECK-NEXT:    mov v0.b[9], w8
+; CHECK-NEXT:    umov w8, v1.b[11]
+; CHECK-NEXT:    mov v0.b[10], w9
+; CHECK-NEXT:    umov w9, v1.b[12]
+; CHECK-NEXT:    mov v0.b[11], w8
+; CHECK-NEXT:    umov w8, v1.b[13]
+; CHECK-NEXT:    mov v0.b[12], w9
+; CHECK-NEXT:    umov w9, v1.b[14]
+; CHECK-NEXT:    mov v0.b[13], w8
+; CHECK-NEXT:    umov w8, v1.b[15]
+; CHECK-NEXT:    mov v0.b[14], w9
+; CHECK-NEXT:    mov v0.b[15], w8
+; CHECK-NEXT:    ret
+  %mask = call <16 x i1> @llvm.experimental.vector.extract.v16i1.nxv16i1(<vscale x 16 x i1> %inmask, i64 0)
+  ret <16 x i1> %mask
+}
+
+
 ; Fixed length clamping

 define <2 x i64> @extract_fixed_v2i64_nxv2i64(<vscale x 2 x i64> %vec) nounwind #0 {

@@ -441,4 +541,9 @@ declare <16 x i8> @llvm.experimental.vector.extract.v16i8.nxv8i8(<vscale x 8 x i
 declare <16 x i8> @llvm.experimental.vector.extract.v16i8.nxv4i8(<vscale x 4 x i8>, i64)
 declare <16 x i8> @llvm.experimental.vector.extract.v16i8.nxv2i8(<vscale x 2 x i8>, i64)

+declare <2 x i1> @llvm.experimental.vector.extract.v2i1.nxv2i1(<vscale x 2 x i1>, i64)
+declare <4 x i1> @llvm.experimental.vector.extract.v4i1.nxv4i1(<vscale x 4 x i1>, i64)
+declare <8 x i1> @llvm.experimental.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1>, i64)
+declare <16 x i1> @llvm.experimental.vector.extract.v16i1.nxv16i1(<vscale x 16 x i1>, i64)
+
 declare <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64>, i64)
llvm/test/CodeGen/AArch64/sve-fixed-length-reshuffle.ll

Lines changed: 32 additions & 0 deletions

@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; == Matching first N elements ==
+
+define <4 x i1> @reshuffle_v4i1_nxv4i1(<vscale x 4 x i1> %a) #0 {
+; CHECK-LABEL: reshuffle_v4i1_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z1.s, p0/z, #1 // =0x1
+; CHECK-NEXT:    mov w8, v1.s[1]
+; CHECK-NEXT:    mov w9, v1.s[2]
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    mov v0.h[1], w8
+; CHECK-NEXT:    mov w8, v1.s[3]
+; CHECK-NEXT:    mov v0.h[2], w9
+; CHECK-NEXT:    mov v0.h[3], w8
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+  %el0 = extractelement <vscale x 4 x i1> %a, i32 0
+  %el1 = extractelement <vscale x 4 x i1> %a, i32 1
+  %el2 = extractelement <vscale x 4 x i1> %a, i32 2
+  %el3 = extractelement <vscale x 4 x i1> %a, i32 3
+  %v0 = insertelement <4 x i1> undef, i1 %el0, i32 0
+  %v1 = insertelement <4 x i1> %v0, i1 %el1, i32 1
+  %v2 = insertelement <4 x i1> %v1, i1 %el2, i32 2
+  %v3 = insertelement <4 x i1> %v2, i1 %el3, i32 3
+  ret <4 x i1> %v3
+}
+
+attributes #0 = { "target-features"="+sve" }
