@@ -23300,6 +23300,99 @@ static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) {
2330023300 return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
2330123301}
2330223302
23303+ // Helper function to optimize small vector load + extension patterns.
23304+ // These patterns would otherwise be scalarized into inefficient sequences.
23305+ static SDValue performSmallVectorLoadExtCombine(SDNode *N, SelectionDAG &DAG) {
23306+ // Don't optimize if NEON is not available. Without NEON, the backend
23307+ // will need to scalarize these operations anyway.
23308+ const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
23309+ if (!Subtarget.isNeonAvailable())
23310+ return SDValue();
23311+ // Don't optimize if SVE is being used for fixed-length vectors, because it
23312+ // has native support for these patterns.
23313+ if (Subtarget.useSVEForFixedLengthVectors())
23314+ return SDValue();
23315+
23316+ unsigned Opcode = N->getOpcode();
23317+ if (Opcode != ISD::ZERO_EXTEND && Opcode != ISD::SIGN_EXTEND &&
23318+ Opcode != ISD::ANY_EXTEND)
23319+ return SDValue();
23320+
23321+ SDValue Op = N->getOperand(0);
23322+ if (Op.getOpcode() != ISD::LOAD)
23323+ return SDValue();
23324+ LoadSDNode *LD = cast<LoadSDNode>(Op);
23325+ if (LD->getExtensionType() != ISD::NON_EXTLOAD || !LD->hasOneUse() ||
23326+ LD->isVolatile())
23327+ return SDValue();
23328+
23329+ EVT MemVT = LD->getMemoryVT();
23330+ EVT ResVT = N->getValueType(0);
23331+ // Check if this is a small vector pattern we want to optimize.
23332+ if (MemVT != MVT::v2i8 && MemVT != MVT::v2i16)
23333+ return SDValue();
23334+
23335+ unsigned NumElts = MemVT.getVectorNumElements();
23336+ unsigned SrcEltBits = MemVT.getScalarSizeInBits();
23337+ unsigned DstEltBits = ResVT.getScalarSizeInBits();
23338+ unsigned LoadBits = NumElts * SrcEltBits;
23339+
23340+ // Check alignment: the optimization loads a larger scalar, which may be
23341+ // unaligned, compared to what the original load will be legalized into.
23342+ Align Alignment = LD->getAlign();
23343+ if (Subtarget.requiresStrictAlign() && Alignment < LoadBits)
23344+ return SDValue();
23345+
23346+ // The transformation strategy:
23347+ // 1. Load the memory as a large scalar and turn it into a 64-bit vector.
23348+ // 2. Bitcast to a narrow type (v8i8 or v4i16) that has efficient NEON extend.
23349+ // 3. Extend using ushll/sshll, extract subvector, repeat as needed.
23350+
23351+ // For ANY_EXTEND, we can choose either sign or zero extend - zero is
23352+ // typically cheaper.
23353+ if (Opcode == ISD::ANY_EXTEND)
23354+ Opcode = ISD::ZERO_EXTEND;
23355+
23356+ SDLoc DL(N);
23357+ SDValue Chain = LD->getChain();
23358+ SDValue BasePtr = LD->getBasePtr();
23359+ const MachinePointerInfo &PtrInfo = LD->getPointerInfo();
23360+ MVT LoadTy = MVT::getIntegerVT(LoadBits);
23361+ SDValue Load = DAG.getLoad(LoadTy, DL, Chain, BasePtr, PtrInfo, Alignment);
23362+
23363+ // SCALAR_TO_VECTOR needs to create a 64-bit vector for NEON instructions.
23364+ // The scalar load is inserted into the lower bits of a 64-bit register.
23365+ // We determine the appropriate 64-bit vector type based on load size,
23366+ // then bitcast to v8i8 or v4i16 for efficient ushll/sshll extends.
23367+ MVT ScalarVecVT = MVT::getVectorVT(LoadTy, 64 / LoadBits);
23368+ MVT NarrowVT = MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
23369+ 64 / MemVT.getScalarSizeInBits());
23370+
23371+ SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ScalarVecVT, Load);
23372+ Vec = DAG.getNode(ISD::BITCAST, DL, NarrowVT, Vec);
23373+ // Extend iteratively: each extend doubles the element size.
23374+ // We extend the full 64-bit vector to leverage NEON ushll/sshll instructions.
23375+ while (Vec.getScalarValueSizeInBits() < DstEltBits) {
23376+ MVT CurVT = Vec.getSimpleValueType();
23377+ unsigned NextBits = CurVT.getScalarSizeInBits() * 2;
23378+ MVT WideVT = MVT::getVectorVT(MVT::getIntegerVT(NextBits),
23379+ CurVT.getVectorNumElements());
23380+ Vec = DAG.getNode(Opcode, DL, WideVT, Vec);
23381+
23382+ // Extract only when: excess elements + still wide + done extending.
23383+ bool HasExcess = WideVT.getVectorNumElements() > NumElts;
23384+ bool StaysWide = WideVT.getSizeInBits() >= 64;
23385+ bool IsDone = NextBits >= DstEltBits;
23386+ if (HasExcess && StaysWide && IsDone) {
23387+ MVT ExtractVT = MVT::getVectorVT(WideVT.getScalarType(), NumElts);
23388+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Vec,
23389+ DAG.getConstant(0, DL, MVT::i64));
23390+ }
23391+ }
23392+
23393+ return DAG.getMergeValues({Vec, Load.getValue(1)}, DL);
23394+ }
23395+
2330323396static SDValue performExtendCombine(SDNode *N,
2330423397 TargetLowering::DAGCombinerInfo &DCI,
2330523398 SelectionDAG &DAG) {
@@ -23349,6 +23442,12 @@ static SDValue performExtendCombine(SDNode *N,
2334923442 NewAnyExtend);
2335023443 }
2335123444
23445+ // Try to optimize small vector load + extension patterns
23448+ if (SDValue Result = performSmallVectorLoadExtCombine(N, DAG))
23449+ return Result;
23450+
2335223451 return SDValue();
2335323452}
2335423453
0 commit comments