Skip to content
46 changes: 41 additions & 5 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20735,17 +20735,53 @@ static SDValue performBuildVectorCombine(SDNode *N,
return SDValue();
}

static SDValue performTruncateCombine(SDNode *N,
SelectionDAG &DAG) {
static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
N0.getOpcode() == AArch64ISD::DUP) {
SDValue Op = N0.getOperand(0);
if (VT.getScalarType() == MVT::i32 &&
N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op);
return DAG.getNode(N0.getOpcode(), DL, VT, Op);
}

// Performing the following combine produces a preferable form for ISEL.
// i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2)
if (DCI.isAfterLegalizeDAG() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
SDValue Op = N0.getOperand(0);
SDValue ExtractIndexNode = N0.getOperand(1);
if (!isa<ConstantSDNode>(ExtractIndexNode))
return SDValue();

// For a legal DAG, EXTRACT_VECTOR_ELT can only have produced an i32 or i64.
// So we can only expect: i32 (trunc (i64 (extract Vi64, idx))).
assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
"Unexpected legalisation result!");

EVT SrcVectorType = Op.getValueType();
// We also assume that SrcVectorType cannot be a V64 (see
// LowerEXTRACT_VECTOR_ELT).
assert((SrcVectorType.getScalarType() == MVT::i64 &&
SrcVectorType != MVT::v1i64) &&
"Unexpected legalisation result!");

// If the i64 we are extracting has uses other than this truncation, the
// upper half of this value must still be live so we prefer to extract it
// all at once.
if (!N0.hasOneUse())
return SDValue();

unsigned ExtractIndex =
cast<ConstantSDNode>(ExtractIndexNode)->getZExtValue();
MVT CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;

Op = DAG.getNode(AArch64ISD::NVCAST, DL, CastVT, Op);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
DAG.getConstant(ExtractIndex * 2, DL, MVT::i64));
}

return SDValue();
Expand Down Expand Up @@ -25992,7 +26028,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::BUILD_VECTOR:
return performBuildVectorCombine(N, DCI, DAG);
case ISD::TRUNCATE:
return performTruncateCombine(N, DAG);
return performTruncateCombine(N, DAG, DCI);
case AArch64ISD::ANDS:
return performFlagSettingCombine(N, DCI, ISD::AND);
case AArch64ISD::ADC:
Expand Down
58 changes: 20 additions & 38 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -6954,6 +6954,12 @@ def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
(DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;

// Also covers DUP (truncate i64 to i32)
def : Pat<(v2i32 (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))),
(DUPv2i32lane V128:$Rn, imm:$idx)>;
def : Pat<(v4i32 (AArch64dup (i32 (extractelt (v4i32 V128:$Rn), imm:$idx)))),
(DUPv4i32lane V128:$Rn, imm:$idx)>;

// If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane
// instruction even if the types don't match: we just have to remap the lane
// carefully. N.b. this trick only applies to truncations.
Expand All @@ -6967,44 +6973,20 @@ def VecIndex_x8 : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(8 * N->getZExtValue(), SDLoc(N), MVT::i64);
}]>;

multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT,
ValueType Src128VT, ValueType ScalVT,
Instruction DUP, SDNodeXForm IdxXFORM> {
def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn),
imm:$idx)))),
(DUP V128:$Rn, (IdxXFORM imm:$idx))>;

def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn),
imm:$idx)))),
(DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
}

defm : DUPWithTruncPats<v8i8, v4i16, v8i16, i32, DUPv8i8lane, VecIndex_x2>;
defm : DUPWithTruncPats<v8i8, v2i32, v4i32, i32, DUPv8i8lane, VecIndex_x4>;
defm : DUPWithTruncPats<v4i16, v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>;

defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;

multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
SDNodeXForm IdxXFORM> {
def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn),
imm:$idx))))),
(DUP V128:$Rn, (IdxXFORM imm:$idx))>;

def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v1i64 V64:$Rn),
imm:$idx))))),
(DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
}

defm : DUPWithTrunci64Pats<v8i8, DUPv8i8lane, VecIndex_x8>;
defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane, VecIndex_x4>;
defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane, VecIndex_x2>;

defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>;
defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>;
defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>;
class DUPWithTruncPat<ValueType ResVT, ValueType SrcVT, ValueType ScalVT,
Instruction DUP, SDNodeXForm IdxXFORM>
: Pat<(ResVT (AArch64dup (ScalVT (vector_extract (SrcVT V128:$Rn), imm:$idx)))),
(DUP V128:$Rn, (IdxXFORM imm:$idx))>;

// DUP (truncate i16 to i8)
def : DUPWithTruncPat<v8i8, v8i16, i32, DUPv8i8lane, VecIndex_x2>;
def : DUPWithTruncPat<v16i8, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
// DUP (truncate i32/i64 to i8)
def : DUPWithTruncPat<v8i8, v4i32, i32, DUPv8i8lane, VecIndex_x4>;
def : DUPWithTruncPat<v16i8, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
// DUP (truncate i32/i64 to i16)
def : DUPWithTruncPat<v4i16, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
def : DUPWithTruncPat<v8i16, v4i32, i32, DUPv8i16lane, VecIndex_x2>;

// SMOV and UMOV definitions, with some extra patterns for convenience
defm SMOV : SMov;
Expand Down
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
Original file line number Diff line number Diff line change
Expand Up @@ -384,9 +384,9 @@ define void @insert_vec_v4i16_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: mov.h v1[0], v0[0]
; CHECK-NEXT: ushll.4s v0, v1, #0
; CHECK-NEXT: ucvtf.4s v0, v0
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: ucvtf.4s v1, v1
; CHECK-NEXT: str q1, [x0]
; CHECK-NEXT: ret

entry:
Expand All @@ -403,13 +403,13 @@ define void @insert_vec_v16i16_uaddlv_from_v4i32(ptr %0) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: movi.2d v2, #0000000000000000
; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: stp q2, q2, [x0, #32]
; CHECK-NEXT: mov.h v1[0], v0[0]
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: stp q0, q0, [x0, #32]
; CHECK-NEXT: ucvtf.4s v1, v1
; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: stp q1, q2, [x0]
; CHECK-NEXT: ret

entry:
Expand All @@ -430,9 +430,9 @@ define void @insert_vec_v8i8_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: mov.h v1[0], v0[0]
; CHECK-NEXT: bic.4h v1, #255, lsl #8
; CHECK-NEXT: ushll.4s v0, v1, #0
; CHECK-NEXT: ucvtf.4s v0, v0
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: ucvtf.4s v1, v1
; CHECK-NEXT: str q1, [x0]
; CHECK-NEXT: ret

entry:
Expand All @@ -449,14 +449,14 @@ define void @insert_vec_v16i8_uaddlv_from_v4i32(ptr %0) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: movi.2d v1, #0000000000000000
; CHECK-NEXT: movi.2d v2, #0000000000000000
; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: stp q2, q2, [x0, #32]
; CHECK-NEXT: mov.h v1[0], v0[0]
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: bic.4h v1, #255, lsl #8
; CHECK-NEXT: stp q0, q0, [x0, #32]
; CHECK-NEXT: ushll.4s v1, v1, #0
; CHECK-NEXT: ucvtf.4s v1, v1
; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: stp q1, q2, [x0]
; CHECK-NEXT: ret

entry:
Expand Down
136 changes: 136 additions & 0 deletions llvm/test/CodeGen/AArch64/neon-ins-trunc-elt.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+neon < %s | FileCheck %s

; Inserting a truncated (i64 to i32) element from the bottom 128-bits of any vector type into a NEON vector should use INS (element) of the
; truncated size to avoid pointless GPR trips.


define <2 x i32> @test_s_trunc_d_lane0(<2 x i32> %a, <1 x i64> %b) {
; CHECK-LABEL: test_s_trunc_d_lane0:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mov v0.s[0], v1.s[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%c = extractelement <1 x i64> %b, i32 0
%d = trunc i64 %c to i32
%e = insertelement <2 x i32> %a, i32 %d, i64 0
ret <2 x i32> %e
}

define <2 x i32> @test_s_trunc_d_qlane1(<2 x i32> %a, <2 x i64> %b) {
; CHECK-LABEL: test_s_trunc_d_qlane1:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov v0.s[0], v1.s[2]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%c = extractelement <2 x i64> %b, i32 1
%d = trunc i64 %c to i32
%e = insertelement <2 x i32> %a, i32 %d, i64 0
ret <2 x i32> %e
}

define <4 x i32> @test_qs_trunc_d_lane0(<4 x i32> %a, <1 x i64> %b) {
; CHECK-LABEL: test_qs_trunc_d_lane0:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mov v0.s[0], v1.s[0]
; CHECK-NEXT: ret
%c = extractelement <1 x i64> %b, i32 0
%d = trunc i64 %c to i32
%e = insertelement <4 x i32> %a, i32 %d, i64 0
ret <4 x i32> %e
}

define <4 x i32> @test_qs_trunc_d_qlane1(<4 x i32> %a, <2 x i64> %b) {
; CHECK-LABEL: test_qs_trunc_d_qlane1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.s[3], v1.s[2]
; CHECK-NEXT: ret
%c = extractelement <2 x i64> %b, i32 1
%d = trunc i64 %c to i32
%e = insertelement <4 x i32> %a, i32 %d, i64 3
ret <4 x i32> %e
}

; ---- From the bottom 128b of an SVE vector

define <2 x i32> @test_s_trunc_dsve_lane0(<2 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_s_trunc_dsve_lane0:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov v0.s[0], v1.s[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%c = extractelement <vscale x 2 x i64> %b, i32 0
%d = trunc i64 %c to i32
%e = insertelement <2 x i32> %a, i32 %d, i64 0
ret <2 x i32> %e
}

define <2 x i32> @test_s_trunc_dsve_lane1(<2 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_s_trunc_dsve_lane1:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov v0.s[1], v1.s[2]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%c = extractelement <vscale x 2 x i64> %b, i32 1
%d = trunc i64 %c to i32
%e = insertelement <2 x i32> %a, i32 %d, i64 1
ret <2 x i32> %e
}

; (negative test) Extracted element is not within V-register.
define <2 x i32> @test_s_trunc_dsve_lane2(<2 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_s_trunc_dsve_lane2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.s, z1.s[4]
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: mov v0.s[1], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%c = extractelement <vscale x 2 x i64> %b, i32 2
%d = trunc i64 %c to i32
%e = insertelement <2 x i32> %a, i32 %d, i64 1
ret <2 x i32> %e
}

define <4 x i32> @test_qs_trunc_dsve_lane0(<4 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_qs_trunc_dsve_lane0:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.s[0], v1.s[0]
; CHECK-NEXT: ret
%c = extractelement <vscale x 2 x i64> %b, i32 0
%d = trunc i64 %c to i32
%e = insertelement <4 x i32> %a, i32 %d, i64 0
ret <4 x i32> %e
}

define <4 x i32> @test_qs_trunc_dsve_lane1(<4 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_qs_trunc_dsve_lane1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov v0.s[3], v1.s[2]
; CHECK-NEXT: ret
%c = extractelement <vscale x 2 x i64> %b, i32 1
%d = trunc i64 %c to i32
%e = insertelement <4 x i32> %a, i32 %d, i64 3
ret <4 x i32> %e
}

; (negative test) Extracted element is not within V-register.
define <4 x i32> @test_qs_trunc_dsve_lane2(<4 x i32> %a, <vscale x 2 x i64> %b) {
; CHECK-LABEL: test_qs_trunc_dsve_lane2:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z1.s, z1.s[4]
; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: mov v0.s[3], w8
; CHECK-NEXT: ret
%c = extractelement <vscale x 2 x i64> %b, i32 2
%d = trunc i64 %c to i32
%e = insertelement <4 x i32> %a, i32 %d, i64 3
ret <4 x i32> %e
}
9 changes: 3 additions & 6 deletions llvm/test/CodeGen/AArch64/sve-doublereduct.ll
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,7 @@ define i32 @add_i32(<vscale x 8 x i32> %a, <vscale x 4 x i32> %b) {
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: add z0.s, z0.s, z2.s
; CHECK-NEXT: uaddv d0, p0, z0.s
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%r1 = call i32 @llvm.vector.reduce.add.i32.nxv8i32(<vscale x 8 x i32> %a)
%r2 = call i32 @llvm.vector.reduce.add.i32.nxv4i32(<vscale x 4 x i32> %b)
Expand All @@ -112,8 +111,7 @@ define i16 @add_ext_i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-NEXT: add z1.h, z1.h, z3.h
; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: uaddv d0, p0, z0.h
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%ae = zext <vscale x 16 x i8> %a to <vscale x 16 x i16>
%be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
Expand All @@ -139,8 +137,7 @@ define i16 @add_ext_v32i16(<vscale x 32 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-NEXT: add z1.h, z2.h, z5.h
; CHECK-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEXT: uaddv d0, p0, z0.h
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
%ae = zext <vscale x 32 x i8> %a to <vscale x 32 x i16>
%be = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AArch64/sve-extract-element.ll
Original file line number Diff line number Diff line change
Expand Up @@ -644,8 +644,8 @@ define i1 @test_lane4_2xi1(<vscale x 2 x i1> %a) #0 {
; CHECK-LABEL: test_lane4_2xi1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1
; CHECK-NEXT: mov z0.d, z0.d[4]
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: mov z0.s, z0.s[8]
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: ret
%b = extractelement <vscale x 2 x i1> %a, i32 4
Expand Down
7 changes: 2 additions & 5 deletions llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll
Original file line number Diff line number Diff line change
Expand Up @@ -238,11 +238,8 @@ define <2 x i1> @extract_v2i1_nxv2i1(<vscale x 2 x i1> %inmask) {
; CHECK-LABEL: extract_v2i1_nxv2i1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: mov x8, v0.d[1]
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: mov v0.s[1], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: mov v0.s[1], v0.s[2]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%mask = call <2 x i1> @llvm.vector.extract.v2i1.nxv2i1(<vscale x 2 x i1> %inmask, i64 0)
ret <2 x i1> %mask
Expand Down
Loading
Loading