Skip to content

Commit 18e919b

Browse files
committed
[AArch64] Optimize extending loads of small vectors
Reduces the total number of loads and the number of moves between SIMD registers and general-purpose registers.
1 parent ed53c41 commit 18e919b

File tree

5 files changed

+360
-52
lines changed

5 files changed

+360
-52
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23300,6 +23300,99 @@ static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) {
2330023300
return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
2330123301
}
2330223302

23303+
// Helper function to optimize small vector load + extension patterns.
23304+
// These patterns would otherwise be scalarized into inefficient sequences.
23305+
static SDValue performSmallVectorLoadExtCombine(SDNode *N, SelectionDAG &DAG) {
23306+
// Don't optimize if NEON is not available. Without NEON, the backend
23307+
// will need to scalarize these operations anyway.
23308+
const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
23309+
if (!Subtarget.isNeonAvailable())
23310+
return SDValue();
23311+
// Don't optimize if SVE is being used for fixed-length vectors, because it
23312+
// has native support for these patterns.
23313+
if (Subtarget.useSVEForFixedLengthVectors())
23314+
return SDValue();
23315+
23316+
unsigned Opcode = N->getOpcode();
23317+
if (Opcode != ISD::ZERO_EXTEND && Opcode != ISD::SIGN_EXTEND &&
23318+
Opcode != ISD::ANY_EXTEND)
23319+
return SDValue();
23320+
23321+
SDValue Op = N->getOperand(0);
23322+
if (Op.getOpcode() != ISD::LOAD)
23323+
return SDValue();
23324+
LoadSDNode *LD = cast<LoadSDNode>(Op);
23325+
if (LD->getExtensionType() != ISD::NON_EXTLOAD || !LD->hasOneUse() ||
23326+
LD->isVolatile())
23327+
return SDValue();
23328+
23329+
EVT MemVT = LD->getMemoryVT();
23330+
EVT ResVT = N->getValueType(0);
23331+
// Check if this is a small vector pattern we want to optimize.
23332+
if (MemVT != MVT::v2i8 && MemVT != MVT::v2i16)
23333+
return SDValue();
23334+
23335+
unsigned NumElts = MemVT.getVectorNumElements();
23336+
unsigned SrcEltBits = MemVT.getScalarSizeInBits();
23337+
unsigned DstEltBits = ResVT.getScalarSizeInBits();
23338+
unsigned LoadBits = NumElts * SrcEltBits;
23339+
23340+
// Check alignment: the optimization loads a larger scalar, which may be
23341+
// unaligned, compared to what the original load will be legalized into.
23342+
Align Alignment = LD->getAlign();
23343+
if (Subtarget.requiresStrictAlign() && Alignment < LoadBits)
23344+
return SDValue();
23345+
23346+
// The transformation strategy:
23347+
// 1. Load the memory as a large scalar and turn it into a 64-bit vector.
23348+
// 2. Bitcast to a narrow type (v8i8 or v4i16) that has efficient NEON extend.
23349+
// 3. Extend using ushll/sshll, extract subvector, repeat as needed.
23350+
23351+
// For ANY_EXTEND, we can choose either sign or zero extend - zero is
23352+
// typically cheaper.
23353+
if (Opcode == ISD::ANY_EXTEND)
23354+
Opcode = ISD::ZERO_EXTEND;
23355+
23356+
SDLoc DL(N);
23357+
SDValue Chain = LD->getChain();
23358+
SDValue BasePtr = LD->getBasePtr();
23359+
const MachinePointerInfo &PtrInfo = LD->getPointerInfo();
23360+
MVT LoadTy = MVT::getIntegerVT(LoadBits);
23361+
SDValue Load = DAG.getLoad(LoadTy, DL, Chain, BasePtr, PtrInfo, Alignment);
23362+
23363+
// SCALAR_TO_VECTOR needs to create a 64-bit vector for NEON instructions.
23364+
// The scalar load is inserted into the lower bits of a 64-bit register.
23365+
// We determine the appropriate 64-bit vector type based on load size,
23366+
// then bitcast to v8i8 or v4i16 for efficient ushll/sshll extends.
23367+
MVT ScalarVecVT = MVT::getVectorVT(LoadTy, 64 / LoadBits);
23368+
MVT NarrowVT = MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
23369+
64 / MemVT.getScalarSizeInBits());
23370+
23371+
SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ScalarVecVT, Load);
23372+
Vec = DAG.getNode(ISD::BITCAST, DL, NarrowVT, Vec);
23373+
// Extend iteratively: each extend doubles the element size.
23374+
// We extend the full 64-bit vector to leverage NEON ushll/sshll instructions.
23375+
while (Vec.getScalarValueSizeInBits() < DstEltBits) {
23376+
MVT CurVT = Vec.getSimpleValueType();
23377+
unsigned NextBits = CurVT.getScalarSizeInBits() * 2;
23378+
MVT WideVT = MVT::getVectorVT(MVT::getIntegerVT(NextBits),
23379+
CurVT.getVectorNumElements());
23380+
Vec = DAG.getNode(Opcode, DL, WideVT, Vec);
23381+
23382+
// Extract only when: excess elements + still wide + done extending.
23383+
bool HasExcess = WideVT.getVectorNumElements() > NumElts;
23384+
bool StaysWide = WideVT.getSizeInBits() >= 64;
23385+
bool IsDone = NextBits >= DstEltBits;
23386+
if (HasExcess && StaysWide && IsDone) {
23387+
MVT ExtractVT = MVT::getVectorVT(WideVT.getScalarType(), NumElts);
23388+
Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Vec,
23389+
DAG.getConstant(0, DL, MVT::i64));
23390+
}
23391+
}
23392+
23393+
return DAG.getMergeValues({Vec, Load.getValue(1)}, DL);
23394+
}
23395+
2330323396
static SDValue performExtendCombine(SDNode *N,
2330423397
TargetLowering::DAGCombinerInfo &DCI,
2330523398
SelectionDAG &DAG) {
@@ -23349,6 +23442,12 @@ static SDValue performExtendCombine(SDNode *N,
2334923442
NewAnyExtend);
2335023443
}
2335123444

23445+
// Try to optimize small vector load + extension patterns
23446+
23447+
// Try to optimize small vector load + extension patterns
23448+
if (SDValue Result = performSmallVectorLoadExtCombine(N, DAG))
23449+
return Result;
23450+
2335223451
return SDValue();
2335323452
}
2335423453

0 commit comments

Comments
 (0)