@@ -23654,6 +23654,28 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
2365423654 return DAG.getMergeValues({Extract, TokenFactor}, DL);
2365523655}
2365623656
23657+ // Replace packed scalable loads with fixed loads when vscale_range(1, 1).
23658+ // This enables further optimisations such as LDP folds.
23659+ static SDValue combineVScale1Load(LoadSDNode *LD, SelectionDAG &DAG,
23660+ TargetLowering::DAGCombinerInfo &DCI,
23661+ const AArch64Subtarget *Subtarget) {
23662+ EVT MemVT = LD->getMemoryVT();
23663+ if (!DCI.isBeforeLegalize() || !Subtarget->hasNEON() ||
23664+ !MemVT.isScalableVector() || LD->getExtensionType() != ISD::NON_EXTLOAD ||
23665+ MemVT.getSizeInBits().getKnownMinValue() != 128 ||
23666+ Subtarget->getMaxSVEVectorSizeInBits() != 128)
23667+ return SDValue();
23668+
23669+ SDLoc DL(LD);
23670+ MVT NewVT = MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
23671+ MemVT.getVectorMinNumElements());
23672+ SDValue NewLoad = DAG.getLoad(
23673+ NewVT, DL, LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
23674+ LD->getOriginalAlign(), LD->getMemOperand()->getFlags(), LD->getAAInfo());
23675+ SDValue Insert = convertToScalableVector(DAG, MemVT, NewLoad);
23676+ return DAG.getMergeValues({Insert, SDValue(cast<SDNode>(NewLoad), 1)}, DL);
23677+ }
23678+
2365723679// Perform TBI simplification if supported by the target and try to break up
2365823680// nontemporal loads larger than 256-bits loads for odd types so LDNPQ 256-bit
2365923681// load instructions can be selected.
@@ -23691,6 +23713,9 @@ static SDValue performLOADCombine(SDNode *N,
2369123713 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
2369223714 return Res;
2369323715
23716+ if (SDValue Res = combineVScale1Load(LD, DAG, DCI, Subtarget))
23717+ return Res;
23718+
2369423719 if (!LD->isNonTemporal())
2369523720 return SDValue(N, 0);
2369623721
@@ -23949,6 +23974,29 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
2394923974 return Chain;
2395023975}
2395123976
23977+ // Replace packed scalable stores with fixed stores when vscale_range(1, 1).
23978+ static SDValue combineVScale1Store(StoreSDNode *ST, SelectionDAG &DAG,
23979+ TargetLowering::DAGCombinerInfo &DCI,
23980+ const AArch64Subtarget *Subtarget) {
23981+ SDValue Value = ST->getValue();
23982+ EVT ValueVT = Value.getValueType();
23983+ if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
23984+ !DCI.isBeforeLegalize() || !Subtarget->hasNEON() ||
23985+ !ValueVT.isScalableVector() || ST->isTruncatingStore() ||
23986+ ValueVT.getSizeInBits().getKnownMinValue() != 128 ||
23987+ Subtarget->getMaxSVEVectorSizeInBits() != 128)
23988+ return SDValue();
23989+
23990+ SDLoc DL(ST);
23991+ MVT NewVT = MVT::getVectorVT(ValueVT.getVectorElementType().getSimpleVT(),
23992+ ValueVT.getVectorMinNumElements());
23993+ SDValue NewValue = convertFromScalableVector(DAG, NewVT, Value);
23994+ SDValue NewStore = DAG.getStore(
23995+ ST->getChain(), DL, NewValue, ST->getBasePtr(), ST->getPointerInfo(),
23996+ ST->getOriginalAlign(), ST->getMemOperand()->getFlags(), ST->getAAInfo());
23997+ return NewStore;
23998+ }
23999+
2395224000static unsigned getFPSubregForVT(EVT VT) {
2395324001 assert(VT.isSimple() && "Expected simple VT");
2395424002 switch (VT.getSimpleVT().SimpleTy) {
@@ -23997,6 +24045,9 @@ static SDValue performSTORECombine(SDNode *N,
2399724045 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
2399824046 return Res;
2399924047
24048+ if (SDValue Res = combineVScale1Store(ST, DAG, DCI, Subtarget))
24049+ return Res;
24050+
2400024051 // If this is an FP_ROUND followed by a store, fold this into a truncating
2400124052 // store. We can do this even if this is already a truncstore.
2400224053 // We purposefully don't care about legality of the nodes here as we know
0 commit comments