Commit f354ca2
[AArch64] Scalarize extracted vector loads. (#159714)
Given a vector load that is only extracted from, it is more efficient to perform individual scalar loads than a single vector load and many extracts. This adds a late optimization that scalarizes extracted vector loads which have no other uses and will not be more efficiently kept in FPR registers.
1 parent 3dddaa3 commit f354ca2
17 files changed: +741 -806 lines
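
As a rough sketch of the effect (hypothetical IR, not one of the tests below), consider a vector load whose only uses are extracts:

define i32 @extract_only(ptr %p) {
  ; Hypothetical example: %v has no uses other than the extract below.
  %v = load <4 x i32>, ptr %p, align 4
  %e = extractelement <4 x i32> %v, i64 2
  ret i32 %e
}

Previously this selected a 128-bit vector load plus a cross-bank lane move (ldr q0, [x0] then mov w0, v0.s[2]); with this combine it becomes a single scalar ldr w0, [x0, #8], element 2 of a <4 x i32> sitting at byte offset 2 * 4 = 8.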

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 63 additions & 0 deletions
@@ -20467,6 +20467,69 @@ performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
     }
   }
 
+  // Given an extract(load) or extract(extend(load)), produce a scalar load
+  // instead to avoid the cross-register-bank copies.
+  if (DCI.isAfterLegalizeDAG() && Subtarget->isLittleEndian() &&
+      VT.isInteger() && isa<ConstantSDNode>(N1)) {
+    SDValue LoadN0 = N0;
+    // Look through sext/zext and extract_subvector / insert_subvector if
+    // required.
+    if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
+         N0.getOpcode() == ISD::SIGN_EXTEND ||
+         N0.getOpcode() == ISD::ANY_EXTEND) &&
+        N0.getOperand(0).hasOneUse())
+      LoadN0 = N0.getOperand(0);
+    unsigned OffsetElts = 0;
+    if (LoadN0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+      OffsetElts = LoadN0.getConstantOperandVal(1);
+      LoadN0 = LoadN0.getOperand(0);
+    }
+    if (LoadN0.getOpcode() == ISD::INSERT_SUBVECTOR &&
+        LoadN0.getOperand(0).isUndef() &&
+        isNullConstant(LoadN0.getOperand(2)) &&
+        LoadN0.getOperand(1).hasOneUse())
+      LoadN0 = LoadN0.getOperand(1);
+
+    // Check all the uses are valid and can be scalarized. We check that all the
+    // uses are extracts and those extracts are not re-inserted into an
+    // operation best treated as a vector register.
+    auto Load = dyn_cast<LoadSDNode>(LoadN0);
+    if (Load && Load->isSimple() && ISD::isNormalLoad(Load) &&
+        Load->getMemoryVT().isByteSized() &&
+        all_of(N0->uses(), [&](const SDUse &U) {
+          return U.getResNo() != N0.getResNo() ||
+                 (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+                  !any_of(U.getUser()->uses(), [](const SDUse &U2) {
+                    return U2.getUser()->getOpcode() ==
+                               ISD::INSERT_VECTOR_ELT ||
+                           U2.getUser()->getOpcode() == ISD::BUILD_VECTOR ||
+                           U2.getUser()->getOpcode() == ISD::SCALAR_TO_VECTOR;
+                  }));
+        })) {
+
+      SDLoc DL(Load);
+
+      // Generate a new scalar load.
+      unsigned Offset = (OffsetElts + N->getConstantOperandVal(1)) *
+                        Load->getValueType(0).getScalarSizeInBits() / 8;
+      SDValue BasePtr = DAG.getObjectPtrOffset(
+          DL, Load->getBasePtr(), DAG.getConstant(Offset, DL, MVT::i64));
+      ISD::LoadExtType ExtType =
+          N0.getOpcode() == ISD::ZERO_EXTEND
+              ? ISD::ZEXTLOAD
+              : (N0.getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD
+                                                    : ISD::EXTLOAD);
+      SDValue ScalarLoad =
+          DAG.getExtLoad(ExtType, DL, VT, Load->getChain(), BasePtr,
+                         Load->getPointerInfo().getWithOffset(Offset),
+                         Load->getValueType(0).getScalarType(),
+                         commonAlignment(Load->getAlign(), Offset),
+                         Load->getMemOperand()->getFlags(), Load->getAAInfo());
+      DAG.makeEquivalentMemoryOrdering(Load, ScalarLoad);
+      return ScalarLoad;
+    }
+  }
+
   return SDValue();
 }
 
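
A loose worked example of the extend path above, assuming the combine fires as written (hypothetical IR, not part of the patch's tests):

define i32 @extract_zext(ptr %p) {
  ; Hypothetical example: extract lane 3 of a zero-extended byte vector.
  %v = load <8 x i8>, ptr %p, align 1
  %x = zext <8 x i8> %v to <8 x i16>
  %e = extractelement <8 x i16> %x, i64 3
  %r = zext i16 %e to i32
  ret i32 %r
}

Here Offset = (0 + 3) * 8 / 8 = 3, since the byte offset is computed from the pre-extension element type of the load, and N0 being a ZERO_EXTEND selects ExtType = ZEXTLOAD. The extract should therefore collapse to a zero-extending byte load (an ldrb from [x0, #3]) instead of a vector load followed by a lane move out of the FPR bank.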

llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll

Lines changed: 7 additions & 10 deletions
@@ -53,18 +53,15 @@ define <4 x half> @uitofp_v4i64_to_v4f16(ptr %ptr) {
 define <4 x bfloat> @uitofp_v4i64_to_v4bf16(ptr %ptr) {
 ; CHECK-LABEL: uitofp_v4i64_to_v4bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp q0, q2, [x0]
-; CHECK-NEXT:    mov x8, v0.d[1]
-; CHECK-NEXT:    fmov x9, d0
-; CHECK-NEXT:    ucvtf s1, x9
-; CHECK-NEXT:    mov x9, v2.d[1]
-; CHECK-NEXT:    ucvtf s0, x8
-; CHECK-NEXT:    fmov x8, d2
-; CHECK-NEXT:    ucvtf s2, x8
+; CHECK-NEXT:    ldp x8, x9, [x0]
+; CHECK-NEXT:    movi v2.4s, #127, msl #8
+; CHECK-NEXT:    ucvtf s0, x9
+; CHECK-NEXT:    ucvtf s1, x8
+; CHECK-NEXT:    ldp x8, x9, [x0, #16]
 ; CHECK-NEXT:    mov v1.s[1], v0.s[0]
+; CHECK-NEXT:    ucvtf s0, x8
+; CHECK-NEXT:    mov v1.s[2], v0.s[0]
 ; CHECK-NEXT:    ucvtf s0, x9
-; CHECK-NEXT:    mov v1.s[2], v2.s[0]
-; CHECK-NEXT:    movi v2.4s, #127, msl #8
 ; CHECK-NEXT:    mov v1.s[3], v0.s[0]
 ; CHECK-NEXT:    movi v0.4s, #1
 ; CHECK-NEXT:    ushr v3.4s, v1.4s, #16

llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll

Lines changed: 2 additions & 4 deletions
@@ -4,10 +4,8 @@
 define i32 @foo(ptr %__a) nounwind {
 ; CHECK-LABEL: foo:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    umov.h w8, v0[0]
-; CHECK-NEXT:    umov.h w9, v0[0]
-; CHECK-NEXT:    add w0, w9, w8, uxth #1
+; CHECK-NEXT:    ldrh w8, [x0]
+; CHECK-NEXT:    add w0, w8, w8, lsl #1
 ; CHECK-NEXT:    ret
   %tmp18 = load <4 x i16>, ptr %__a, align 8
   %vget_lane = extractelement <4 x i16> %tmp18, i32 0

llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll

Lines changed: 7 additions & 7 deletions
@@ -105,13 +105,13 @@ define i32 @ldr_int_volatile(ptr %a) nounwind {
 ; CHECK: Cluster ld/st SU(1) - SU(3)
 ; CHECK: SU(1): %{{[0-9]+}}:fpr128 = LDRQui
 ; CHECK: SU(3): %{{[0-9]+}}:fpr128 = LDRQui
-define <2 x i64> @ldq_cluster(ptr %p) {
-  %tmp1 = load <2 x i64>, < 2 x i64>* %p, align 8
+define <4 x i32> @ldq_cluster(ptr %p) {
+  %tmp1 = load <4 x i32>, ptr %p, align 8
   %add.ptr2 = getelementptr inbounds i64, ptr %p, i64 2
-  %tmp2 = add nsw <2 x i64> %tmp1, %tmp1
-  %tmp3 = load <2 x i64>, ptr %add.ptr2, align 8
-  %res = mul nsw <2 x i64> %tmp2, %tmp3
-  ret <2 x i64> %res
+  %tmp2 = add nsw <4 x i32> %tmp1, %tmp1
+  %tmp3 = load <4 x i32>, ptr %add.ptr2, align 8
+  %res = mul nsw <4 x i32> %tmp2, %tmp3
+  ret <4 x i32> %res
 }
 
 ; CHECK: ********** MI Scheduling **********
@@ -215,7 +215,7 @@ exit:
 ; CHECK: ********** MI Scheduling **********
 ; CHECK: LDURXi_LDRXui:%bb.0 entry
 ; CHECK: Cluster ld/st SU(3) - SU(4)
-; CHECK: SU(3):   %{{[0-9]+}}:gpr64 = LDURXi
+; CHECK: SU(3): %{{[0-9]+}}:gpr64 = LDURXi
 ; CHECK: SU(4): %{{[0-9]+}}:gpr64 = LDRXui
 ;
 define void @LDURXi_LDRXui(ptr nocapture readonly %arg, ptr nocapture readonly %wa, ptr nocapture readonly %wb) {

llvm/test/CodeGen/AArch64/complex-int-to-fp.ll

Lines changed: 3 additions & 5 deletions
@@ -4,11 +4,9 @@
 define void @autogen_SD19655(ptr %addr, ptr %addrfloat) {
 ; CHECK-LABEL: autogen_SD19655:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov.d x8, v0[1]
-; CHECK-NEXT:    fmov x9, d0
-; CHECK-NEXT:    scvtf s1, x9
-; CHECK-NEXT:    scvtf s0, x8
+; CHECK-NEXT:    ldp x8, x9, [x0]
+; CHECK-NEXT:    scvtf s0, x9
+; CHECK-NEXT:    scvtf s1, x8
 ; CHECK-NEXT:    mov.s v1[1], v0[0]
 ; CHECK-NEXT:    str d1, [x1]
 ; CHECK-NEXT:    ret

llvm/test/CodeGen/AArch64/extract-vector-elt.ll

Lines changed: 4 additions & 10 deletions
@@ -1114,16 +1114,10 @@ entry:
 }
 
 define ptr @v3ext(<3 x ptr> %a, <3 x ptr> %b, <3 x ptr> %x) {
-; CHECK-SD-LABEL: v3ext:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ldr d0, [sp]
-; CHECK-SD-NEXT:    fmov x0, d0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: v3ext:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    ldr x0, [sp]
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: v3ext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr x0, [sp]
+; CHECK-NEXT:    ret
 entry:
   %c = extractelement <3 x ptr> %x, i32 2
   ret ptr %c
