Skip to content

Commit 2947c37

Browse files
committed
[AArch64][Codegen] Improve small shufflevector/concat lowering for SME
Avoid using TBL for small vectors that can be lowered with a couple of ZIP1s, and fold redundant ZIP1s.
1 parent 64e3466 commit 2947c37

File tree

3 files changed

+67
-34
lines changed

3 files changed

+67
-34
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24722,6 +24722,49 @@ static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
2472224722
Op0ExtV, Op1ExtV, Op->getOperand(2));
2472324723
}
2472424724

24725+
/// Look through at most one ISD::BITCAST on \p Op and return the resulting
/// value if it is a vector whose element width (in bits) equals the element
/// width of \p VT. Returns an empty SDValue otherwise.
///
/// \p VT is only queried for its vector element type once the (possibly
/// unwrapped) operand is known to be a vector.
static SDValue skipElementSizePreservingCast(SDValue Op, EVT VT) {
  // Peel a single bitcast layer, if there is one.
  SDValue Src = Op->getOpcode() == ISD::BITCAST ? Op->getOperand(0) : Op;

  EVT SrcVT = Src.getValueType();
  if (!SrcVT.isVector())
    return SDValue();

  // Only accept the value when the cast did not change the lane width.
  if (SrcVT.getVectorElementType().getSizeInBits() !=
      VT.getVectorElementType().getSizeInBits())
    return SDValue();

  return Src;
}
24734+
24735+
/// DAG combine for AArch64ISD::ZIP1.
///
/// Folds a ZIP1 whose two operands each insert (at lane 0 of an undef vector)
/// lane 0 and lane 1 of the same source vector back into that source vector:
///
///   zip1(insert_vector_elt(undef, extract_vector_elt(vec, 0), 0),
///        insert_vector_elt(undef, extract_vector_elt(vec, 1), 0))
///   -> vec
///
/// A single element-size-preserving bitcast is looked through on the ZIP1
/// operands and on the extract sources. Returns an empty SDValue when the
/// pattern does not match.
static SDValue performZIP1Combine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);

  SDValue Op0 = skipElementSizePreservingCast(N->getOperand(0), VT);
  SDValue Op1 = skipElementSizePreservingCast(N->getOperand(1), VT);
  if (!Op0 || !Op1 || Op0->getOpcode() != ISD::INSERT_VECTOR_ELT ||
      Op1->getOpcode() != ISD::INSERT_VECTOR_ELT)
    return SDValue();

  // Both inserts must write lane 0 of an undef vector. The index operand is
  // tested with isNullConstant rather than getConstantOperandVal so that a
  // non-constant index simply fails the match instead of asserting.
  if (!Op0->getOperand(0).isUndef() || !Op1->getOperand(0).isUndef() ||
      !isNullConstant(Op0->getOperand(2)) ||
      !isNullConstant(Op1->getOperand(2)))
    return SDValue();

  // The inserted scalars must be lane 0 and lane 1 extracts respectively;
  // again, non-constant extract indices just fail the match.
  SDValue Op01 = Op0->getOperand(1);
  SDValue Op11 = Op1->getOperand(1);
  if (Op01->getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      Op11->getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      !isNullConstant(Op01->getOperand(1)) ||
      !isOneConstant(Op11->getOperand(1)))
    return SDValue();

  // Both extracts must read from the very same source vector.
  SDValue Op010 = skipElementSizePreservingCast(Op01->getOperand(0), VT);
  SDValue Op110 = skipElementSizePreservingCast(Op11->getOperand(0), VT);
  if (!Op010 || Op010 != Op110)
    return SDValue();

  // The helper only guarantees matching element widths; a bitcast additionally
  // needs the total sizes to agree, so bail out if they do not.
  if (Op010.getValueType().getSizeInBits() != VT.getSizeInBits())
    return SDValue();

  return DAG.getBitcast(VT, Op010);
}
24767+
2472524768
static SDValue
2472624769
performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
2472724770
SelectionDAG &DAG) {
@@ -26163,6 +26206,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
2616326206

2616426207
break;
2616526208
}
26209+
case AArch64ISD::ZIP1:
26210+
return performZIP1Combine(N, DAG);
2616626211
case ISD::XOR:
2616726212
return performXorCombine(N, DAG, DCI, Subtarget);
2616826213
case ISD::MUL:
@@ -29032,7 +29077,14 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
2903229077
if (!IsSingleOp && !Subtarget.hasSVE2())
2903329078
return SDValue();
2903429079

29080+
// Small vectors (with few extracts) can be lowered more efficiently as a
29081+
// sequence of ZIPs.
2903529082
EVT VTOp1 = Op.getOperand(0).getValueType();
29083+
unsigned NumElts = VT.getVectorNumElements();
29084+
if (VT.isPow2VectorType() && VT.getFixedSizeInBits() <= 128 &&
29085+
(NumElts <= 2 || (NumElts <= 4 && !Op2.isUndef())))
29086+
return SDValue();
29087+
2903629088
unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
2903729089
unsigned IndexLen = MinSVESize / BitsPerElt;
2903829090
unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();

llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll

Lines changed: 9 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
3-
; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,SME
2+
; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK
3+
; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK
44
; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
55

66
target triple = "aarch64-unknown-linux-gnu"
@@ -406,33 +406,13 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) {
406406
;
407407

408408
define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) {
409-
; SVE2-LABEL: concat_v4f16:
410-
; SVE2: // %bb.0:
411-
; SVE2-NEXT: cnth x8
412-
; SVE2-NEXT: adrp x9, .LCPI15_0
413-
; SVE2-NEXT: adrp x10, .LCPI15_1
414-
; SVE2-NEXT: mov z2.h, w8
415-
; SVE2-NEXT: ldr q3, [x9, :lo12:.LCPI15_0]
416-
; SVE2-NEXT: ldr q4, [x10, :lo12:.LCPI15_1]
417-
; SVE2-NEXT: ptrue p0.h, vl8
418-
; SVE2-NEXT: // kill: def $d1 killed $d1 killed $z0_z1 def $z0_z1
419-
; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0_z1 def $z0_z1
420-
; SVE2-NEXT: mad z2.h, p0/m, z3.h, z4.h
421-
; SVE2-NEXT: tbl z0.h, { z0.h, z1.h }, z2.h
422-
; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0
423-
; SVE2-NEXT: ret
424-
;
425-
; SME-LABEL: concat_v4f16:
426-
; SME: // %bb.0:
427-
; SME-NEXT: // kill: def $d1 killed $d1 def $z1
428-
; SME-NEXT: // kill: def $d0 killed $d0 def $z0
429-
; SME-NEXT: mov z2.h, z1.h[1]
430-
; SME-NEXT: mov z3.h, z0.h[1]
431-
; SME-NEXT: zip1 z1.h, z1.h, z2.h
432-
; SME-NEXT: zip1 z0.h, z0.h, z3.h
433-
; SME-NEXT: zip1 z0.s, z0.s, z1.s
434-
; SME-NEXT: // kill: def $d0 killed $d0 killed $z0
435-
; SME-NEXT: ret
409+
; CHECK-LABEL: concat_v4f16:
410+
; CHECK: // %bb.0:
411+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
412+
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
413+
; CHECK-NEXT: zip1 z0.s, z0.s, z1.s
414+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
415+
; CHECK-NEXT: ret
436416
;
437417
; NONEON-NOSVE-LABEL: concat_v4f16:
438418
; NONEON-NOSVE: // %bb.0:

llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -643,11 +643,12 @@ define void @test_revhv32i16(ptr %a) {
643643
define void @test_rev_elts_fail(ptr %a) {
644644
; CHECK-LABEL: test_rev_elts_fail:
645645
; CHECK: // %bb.0:
646-
; CHECK-NEXT: index z0.d, #1, #-1
647-
; CHECK-NEXT: ldp q1, q2, [x0]
648-
; CHECK-NEXT: tbl z1.d, { z1.d }, z0.d
649-
; CHECK-NEXT: tbl z0.d, { z2.d }, z0.d
650-
; CHECK-NEXT: stp q1, q0, [x0]
646+
; CHECK-NEXT: ldp q0, q1, [x0]
647+
; CHECK-NEXT: mov z2.d, z0.d[1]
648+
; CHECK-NEXT: mov z3.d, z1.d[1]
649+
; CHECK-NEXT: zip1 z0.d, z2.d, z0.d
650+
; CHECK-NEXT: zip1 z1.d, z3.d, z1.d
651+
; CHECK-NEXT: stp q0, q1, [x0]
651652
; CHECK-NEXT: ret
652653
;
653654
; NONEON-NOSVE-LABEL: test_rev_elts_fail:

0 commit comments

Comments
 (0)