Skip to content

Commit 69b4190

Browse files
authored
[AArch64] Optimize extending loads of small vectors (#163064)
Reduces the total number of loads and the number of moves between SIMD and general-purpose registers.
1 parent c1c22cd commit 69b4190

25 files changed

+567
-348
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 115 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1427,12 +1427,24 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
14271427
setOperationAction(ISD::BITCAST, MVT::v2i16, Custom);
14281428
setOperationAction(ISD::BITCAST, MVT::v4i8, Custom);
14291429

1430-
setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1430+
setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1431+
setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1432+
setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1433+
setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1434+
setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1435+
setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1436+
setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
14311437
setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
14321438
setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1433-
setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1439+
setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
14341440
setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
14351441
setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1442+
setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1443+
setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1444+
setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1445+
setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1446+
setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1447+
setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
14361448

14371449
// ADDP custom lowering
14381450
for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
@@ -6728,8 +6740,34 @@ bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
67286740
return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
67296741
}
67306742

6743+
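/// Returns true if \p LD is a non-volatile load of a v2i8, v4i8 or v2i16 memory type that can be optimized: NEON must be available and, on strict-alignment targets, the load must be aligned to its store size.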
6744+
static bool isEligibleForSmallVectorLoadOpt(LoadSDNode *LD,
6745+
const AArch64Subtarget &Subtarget) {
6746+
if (!Subtarget.isNeonAvailable())
6747+
return false;
6748+
if (LD->isVolatile())
6749+
return false;
6750+
6751+
EVT MemVT = LD->getMemoryVT();
6752+
if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8 && MemVT != MVT::v2i16)
6753+
return false;
6754+
6755+
Align Alignment = LD->getAlign();
6756+
Align RequiredAlignment = Align(MemVT.getStoreSize().getFixedValue());
6757+
if (Subtarget.requiresStrictAlign() && Alignment < RequiredAlignment)
6758+
return false;
6759+
6760+
return true;
6761+
}
6762+
67316763
bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
67326764
EVT ExtVT = ExtVal.getValueType();
6765+
// Extending loads of small, illegal vector types are desirable: they can be loaded once and then extended in-register.
6766+
if (auto *Load = dyn_cast<LoadSDNode>(ExtVal.getOperand(0))) {
6767+
if (ExtVT.isFixedLengthVector() && ExtVT.getStoreSizeInBits() <= 128 &&
6768+
isEligibleForSmallVectorLoadOpt(Load, *Subtarget))
6769+
return true;
6770+
}
67336771
if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
67346772
return false;
67356773

@@ -7188,12 +7226,86 @@ SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
71887226
return Result;
71897227
}
71907228

7229+
/// Helper function to optimize extending loads of small vectors.
7230+
/// These patterns would otherwise get scalarized into inefficient sequences.
7231+
static SDValue tryLowerSmallVectorExtLoad(LoadSDNode *Load, SelectionDAG &DAG) {
7232+
const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
7233+
if (!isEligibleForSmallVectorLoadOpt(Load, Subtarget))
7234+
return SDValue();
7235+
7236+
EVT MemVT = Load->getMemoryVT();
7237+
EVT ResVT = Load->getValueType(0);
7238+
unsigned NumElts = ResVT.getVectorNumElements();
7239+
unsigned DstEltBits = ResVT.getScalarSizeInBits();
7240+
unsigned SrcEltBits = MemVT.getScalarSizeInBits();
7241+
7242+
unsigned ExtOpcode;
7243+
switch (Load->getExtensionType()) {
7244+
case ISD::EXTLOAD:
7245+
case ISD::ZEXTLOAD:
7246+
ExtOpcode = ISD::ZERO_EXTEND;
7247+
break;
7248+
case ISD::SEXTLOAD:
7249+
ExtOpcode = ISD::SIGN_EXTEND;
7250+
break;
7251+
case ISD::NON_EXTLOAD:
7252+
return SDValue();
7253+
}
7254+
7255+
SDLoc DL(Load);
7256+
SDValue Chain = Load->getChain();
7257+
SDValue BasePtr = Load->getBasePtr();
7258+
const MachinePointerInfo &PtrInfo = Load->getPointerInfo();
7259+
Align Alignment = Load->getAlign();
7260+
7261+
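// Load the whole vector as a single FP scalar of the same bit width, presumably so the data stays on the SIMD side rather than being moved from a general-purpose register.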
7262+
unsigned LoadBits = MemVT.getStoreSizeInBits();
7263+
MVT ScalarLoadType = MVT::getFloatingPointVT(LoadBits);
7264+
SDValue ScalarLoad =
7265+
DAG.getLoad(ScalarLoadType, DL, Chain, BasePtr, PtrInfo, Alignment);
7266+
7267+
MVT ScalarToVecTy = MVT::getVectorVT(ScalarLoadType, 128 / LoadBits);
7268+
SDValue ScalarToVec =
7269+
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ScalarToVecTy, ScalarLoad);
7270+
MVT BitcastTy =
7271+
MVT::getVectorVT(MVT::getIntegerVT(SrcEltBits), 128 / SrcEltBits);
7272+
SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, BitcastTy, ScalarToVec);
7273+
7274+
SDValue Res = Bitcast;
7275+
unsigned CurrentEltBits = Res.getValueType().getScalarSizeInBits();
7276+
unsigned CurrentNumElts = Res.getValueType().getVectorNumElements();
7277+
while (CurrentEltBits < DstEltBits) {
7278+
if (Res.getValueSizeInBits() >= 128) {
7279+
CurrentNumElts = CurrentNumElts / 2;
7280+
MVT ExtractVT =
7281+
MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
7282+
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Res,
7283+
DAG.getConstant(0, DL, MVT::i64));
7284+
}
7285+
CurrentEltBits = CurrentEltBits * 2;
7286+
MVT ExtVT =
7287+
MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
7288+
Res = DAG.getNode(ExtOpcode, DL, ExtVT, Res);
7289+
}
7290+
7291+
if (CurrentNumElts != NumElts) {
7292+
MVT FinalVT = MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), NumElts);
7293+
Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FinalVT, Res,
7294+
DAG.getConstant(0, DL, MVT::i64));
7295+
}
7296+
7297+
return DAG.getMergeValues({Res, ScalarLoad.getValue(1)}, DL);
7298+
}
7299+
71917300
SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
71927301
SelectionDAG &DAG) const {
71937302
SDLoc DL(Op);
71947303
LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
71957304
assert(LoadNode && "Expected custom lowering of a load node");
71967305

7306+
if (SDValue Result = tryLowerSmallVectorExtLoad(LoadNode, DAG))
7307+
return Result;
7308+
71977309
if (LoadNode->getMemoryVT() == MVT::i64x8) {
71987310
SmallVector<SDValue, 8> Ops;
71997311
SDValue Base = LoadNode->getBasePtr();
@@ -7212,37 +7324,7 @@ SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
72127324
return DAG.getMergeValues({Loaded, Chain}, DL);
72137325
}
72147326

7215-
// Custom lowering for extending v4i8 vector loads.
7216-
EVT VT = Op->getValueType(0);
7217-
assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
7218-
7219-
if (LoadNode->getMemoryVT() != MVT::v4i8)
7220-
return SDValue();
7221-
7222-
// Avoid generating unaligned loads.
7223-
if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
7224-
return SDValue();
7225-
7226-
unsigned ExtType;
7227-
if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
7228-
ExtType = ISD::SIGN_EXTEND;
7229-
else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
7230-
LoadNode->getExtensionType() == ISD::EXTLOAD)
7231-
ExtType = ISD::ZERO_EXTEND;
7232-
else
7233-
return SDValue();
7234-
7235-
SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
7236-
LoadNode->getBasePtr(), MachinePointerInfo());
7237-
SDValue Chain = Load.getValue(1);
7238-
SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
7239-
SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
7240-
SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
7241-
Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
7242-
DAG.getConstant(0, DL, MVT::i64));
7243-
if (VT == MVT::v4i32)
7244-
Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
7245-
return DAG.getMergeValues({Ext, Chain}, DL);
7327+
return SDValue();
72467328
}
72477329

72487330
SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,

0 commit comments

Comments
 (0)