@@ -1427,12 +1427,24 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
14271427 setOperationAction(ISD::BITCAST, MVT::v2i16, Custom);
14281428 setOperationAction(ISD::BITCAST, MVT::v4i8, Custom);
14291429
1430- setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1430+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1431+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1432+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1433+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1434+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1435+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1436+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
14311437 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
14321438 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1433- setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1439+ setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
14341440 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
14351441 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1442+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1443+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1444+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1445+ setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1446+ setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1447+ setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
14361448
14371449 // ADDP custom lowering
14381450 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
@@ -6728,8 +6740,34 @@ bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
67286740 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
67296741}
67306742
6743+ /// Helper function to check if a small vector load can be optimized.
6744+ static bool isEligibleForSmallVectorLoadOpt(LoadSDNode *LD,
6745+ const AArch64Subtarget &Subtarget) {
6746+ if (!Subtarget.isNeonAvailable())
6747+ return false;
6748+ if (LD->isVolatile())
6749+ return false;
6750+
6751+ EVT MemVT = LD->getMemoryVT();
6752+ if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8 && MemVT != MVT::v2i16)
6753+ return false;
6754+
6755+ Align Alignment = LD->getAlign();
6756+ Align RequiredAlignment = Align(MemVT.getStoreSize().getFixedValue());
6757+ if (Subtarget.requiresStrictAlign() && Alignment < RequiredAlignment)
6758+ return false;
6759+
6760+ return true;
6761+ }
6762+
67316763bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
67326764 EVT ExtVT = ExtVal.getValueType();
6765+ // Small, illegal vectors can be extended inreg.
6766+ if (auto *Load = dyn_cast<LoadSDNode>(ExtVal.getOperand(0))) {
6767+ if (ExtVT.isFixedLengthVector() && ExtVT.getStoreSizeInBits() <= 128 &&
6768+ isEligibleForSmallVectorLoadOpt(Load, *Subtarget))
6769+ return true;
6770+ }
67336771 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
67346772 return false;
67356773
@@ -7188,12 +7226,86 @@ SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
71887226 return Result;
71897227}
71907228
7229+ /// Helper function to optimize loads of extended small vectors.
7230+ /// These patterns would otherwise get scalarized into inefficient sequences.
7231+ static SDValue tryLowerSmallVectorExtLoad(LoadSDNode *Load, SelectionDAG &DAG) {
7232+ const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
7233+ if (!isEligibleForSmallVectorLoadOpt(Load, Subtarget))
7234+ return SDValue();
7235+
7236+ EVT MemVT = Load->getMemoryVT();
7237+ EVT ResVT = Load->getValueType(0);
7238+ unsigned NumElts = ResVT.getVectorNumElements();
7239+ unsigned DstEltBits = ResVT.getScalarSizeInBits();
7240+ unsigned SrcEltBits = MemVT.getScalarSizeInBits();
7241+
7242+ unsigned ExtOpcode;
7243+ switch (Load->getExtensionType()) {
7244+ case ISD::EXTLOAD:
7245+ case ISD::ZEXTLOAD:
7246+ ExtOpcode = ISD::ZERO_EXTEND;
7247+ break;
7248+ case ISD::SEXTLOAD:
7249+ ExtOpcode = ISD::SIGN_EXTEND;
7250+ break;
7251+ case ISD::NON_EXTLOAD:
7252+ return SDValue();
7253+ }
7254+
7255+ SDLoc DL(Load);
7256+ SDValue Chain = Load->getChain();
7257+ SDValue BasePtr = Load->getBasePtr();
7258+ const MachinePointerInfo &PtrInfo = Load->getPointerInfo();
7259+ Align Alignment = Load->getAlign();
7260+
7261+ // Load the data as an FP scalar to avoid issues with integer loads.
7262+ unsigned LoadBits = MemVT.getStoreSizeInBits();
7263+ MVT ScalarLoadType = MVT::getFloatingPointVT(LoadBits);
7264+ SDValue ScalarLoad =
7265+ DAG.getLoad(ScalarLoadType, DL, Chain, BasePtr, PtrInfo, Alignment);
7266+
7267+ MVT ScalarToVecTy = MVT::getVectorVT(ScalarLoadType, 128 / LoadBits);
7268+ SDValue ScalarToVec =
7269+ DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ScalarToVecTy, ScalarLoad);
7270+ MVT BitcastTy =
7271+ MVT::getVectorVT(MVT::getIntegerVT(SrcEltBits), 128 / SrcEltBits);
7272+ SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, BitcastTy, ScalarToVec);
7273+
7274+ SDValue Res = Bitcast;
7275+ unsigned CurrentEltBits = Res.getValueType().getScalarSizeInBits();
7276+ unsigned CurrentNumElts = Res.getValueType().getVectorNumElements();
7277+ while (CurrentEltBits < DstEltBits) {
7278+ if (Res.getValueSizeInBits() >= 128) {
7279+ CurrentNumElts = CurrentNumElts / 2;
7280+ MVT ExtractVT =
7281+ MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
7282+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Res,
7283+ DAG.getConstant(0, DL, MVT::i64));
7284+ }
7285+ CurrentEltBits = CurrentEltBits * 2;
7286+ MVT ExtVT =
7287+ MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
7288+ Res = DAG.getNode(ExtOpcode, DL, ExtVT, Res);
7289+ }
7290+
7291+ if (CurrentNumElts != NumElts) {
7292+ MVT FinalVT = MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), NumElts);
7293+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FinalVT, Res,
7294+ DAG.getConstant(0, DL, MVT::i64));
7295+ }
7296+
7297+ return DAG.getMergeValues({Res, ScalarLoad.getValue(1)}, DL);
7298+ }
7299+
71917300SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
71927301 SelectionDAG &DAG) const {
71937302 SDLoc DL(Op);
71947303 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
71957304 assert(LoadNode && "Expected custom lowering of a load node");
71967305
7306+ if (SDValue Result = tryLowerSmallVectorExtLoad(LoadNode, DAG))
7307+ return Result;
7308+
71977309 if (LoadNode->getMemoryVT() == MVT::i64x8) {
71987310 SmallVector<SDValue, 8> Ops;
71997311 SDValue Base = LoadNode->getBasePtr();
@@ -7212,37 +7324,7 @@ SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
72127324 return DAG.getMergeValues({Loaded, Chain}, DL);
72137325 }
72147326
7215- // Custom lowering for extending v4i8 vector loads.
7216- EVT VT = Op->getValueType(0);
7217- assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
7218-
7219- if (LoadNode->getMemoryVT() != MVT::v4i8)
7220- return SDValue();
7221-
7222- // Avoid generating unaligned loads.
7223- if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
7224- return SDValue();
7225-
7226- unsigned ExtType;
7227- if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
7228- ExtType = ISD::SIGN_EXTEND;
7229- else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
7230- LoadNode->getExtensionType() == ISD::EXTLOAD)
7231- ExtType = ISD::ZERO_EXTEND;
7232- else
7233- return SDValue();
7234-
7235- SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
7236- LoadNode->getBasePtr(), MachinePointerInfo());
7237- SDValue Chain = Load.getValue(1);
7238- SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
7239- SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
7240- SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
7241- Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
7242- DAG.getConstant(0, DL, MVT::i64));
7243- if (VT == MVT::v4i32)
7244- Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
7245- return DAG.getMergeValues({Ext, Chain}, DL);
7327+ return SDValue();
72467328}
72477329
72487330SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
0 commit comments