@@ -1394,9 +1394,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
13941394 }
13951395 }
13961396
1397- // v1i64 -> v1i8 truncstore represents a bsub FPR8 store.
1398- setTruncStoreAction(MVT::v1i64, MVT::v1i8, Legal);
1399-
14001397 for (auto Op :
14011398 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
14021399 ISD::FROUND, ISD::FROUNDEVEN, ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
@@ -23936,6 +23933,8 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
2393623933static unsigned getFPSubregForVT(EVT VT) {
2393723934 assert(VT.isSimple() && "Expected simple VT");
2393823935 switch (VT.getSimpleVT().SimpleTy) {
23936+ case MVT::aarch64mfp8:
23937+ return AArch64::bsub;
2393923938 case MVT::f16:
2394023939 return AArch64::hsub;
2394123940 case MVT::f32:
@@ -23947,22 +23946,6 @@ static unsigned getFPSubregForVT(EVT VT) {
2394723946 }
2394823947}
2394923948
23950- static EVT get64BitVector(EVT ElVT) {
23951- assert(ElVT.isSimple() && "Expected simple VT");
23952- switch (ElVT.getSimpleVT().SimpleTy) {
23953- case MVT::i8:
23954- return MVT::v8i8;
23955- case MVT::i16:
23956- return MVT::v4i16;
23957- case MVT::i32:
23958- return MVT::v2i32;
23959- case MVT::i64:
23960- return MVT::v1i64;
23961- default:
23962- llvm_unreachable("Unexpected VT!");
23963- }
23964- }
23965-
2396623949static SDValue performSTORECombine(SDNode *N,
2396723950 TargetLowering::DAGCombinerInfo &DCI,
2396823951 SelectionDAG &DAG,
@@ -24041,72 +24024,63 @@ static SDValue performSTORECombine(SDNode *N,
2404124024 SDValue ExtIdx = Value.getOperand(1);
2404224025 EVT VectorVT = Vector.getValueType();
2404324026 EVT ElemVT = VectorVT.getVectorElementType();
24027+
2404424028 if (!ValueVT.isInteger())
2404524029 return SDValue();
2404624030 if (ValueVT != MemVT && !ST->isTruncatingStore())
2404724031 return SDValue();
2404824032
24049- if (MemVT == MVT::i8) {
24050- auto *ExtCst = dyn_cast<ConstantSDNode>(ExtIdx);
24051- if (Subtarget->isNeonAvailable() &&
24052- (VectorVT == MVT::v8i8 || VectorVT == MVT::v16i8) && ExtCst &&
24053- !ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD) {
24054- // These can lower to st1.b, which is preferable if we're unlikely to
24055- // fold the addressing into the store.
24056- return SDValue();
24057- }
24058-
24059- // Lower as truncstore of v1i64 -> v1i8 (which can lower to a bsub store).
24060- SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
24061- SDValue ExtVector;
24062- EVT VecVT64 = get64BitVector(ElemVT);
24063- if (ExtCst && ExtCst->isZero()) {
24064- ExtVector =
24065- DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT64, Vector, Zero);
24066- } else {
24067- SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
24068- Value.getValueType(), Vector, ExtIdx);
24069- ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT64,
24070- DAG.getUNDEF(VecVT64), Ext, Zero);
24071- }
24072-
24073- SDValue Cast = DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, ExtVector);
24074- return DAG.getTruncStore(ST->getChain(), DL, Cast, ST->getBasePtr(),
24075- MVT::v1i8, ST->getMemOperand());
24076- }
24077-
24078- // TODO: Handle storing i8s to wider types.
24079- if (ElemVT == MVT::i8)
24033+ // This could generate an additional extract if the index is non-zero and
24034+ // the extracted value has multiple uses.
24035+ auto *ExtCst = dyn_cast<ConstantSDNode>(ExtIdx);
24036+ if ((!ExtCst || !ExtCst->isZero()) && !Value.hasOneUse())
2408024037 return SDValue();
2408124038
24082- // Heuristic: If there are other users of integer scalars extracted from
24083- // this vector that won't fold into the store -- abandon folding. Applying
24084- // this fold may extend the vector lifetime and disrupt paired stores.
24085- for (const auto &Use : Vector->uses()) {
24086- if (Use.getResNo() != Vector.getResNo())
24087- continue;
24088- const SDNode *User = Use.getUser();
24089- if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
24090- (!User->hasOneUse() ||
24091- (*User->user_begin())->getOpcode() != ISD::STORE))
24092- return SDValue();
24039+ if (Subtarget->isNeonAvailable() && ElemVT == MemVT &&
24040+ (VectorVT.is64BitVector() || VectorVT.is128BitVector()) && ExtCst &&
24041+ !ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD) {
24042+ // These can lower to st1, which is preferable if we're unlikely to fold
24043+ // the addressing into the store.
24044+ return SDValue();
2409324045 }
2409424046
24095- EVT FPElemVT = EVT::getFloatingPointVT(ElemVT.getSizeInBits());
24096- EVT FPVectorVT = VectorVT.changeVectorElementType(FPElemVT);
24097- SDValue Cast = DAG.getNode(ISD::BITCAST, DL, FPVectorVT, Vector);
24098- SDValue Ext =
24099- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, FPElemVT, Cast, ExtIdx);
24047+ if (MemVT == MVT::i64 || MemVT == MVT::i32) {
24048+ // Heuristic: If there are other users of w/x integer scalars extracted
24049+ // from this vector that won't fold into the store -- abandon folding.
24050+ // Applying this fold may disrupt paired stores.
24051+ for (const auto &Use : Vector->uses()) {
24052+ if (Use.getResNo() != Vector.getResNo())
24053+ continue;
24054+ const SDNode *User = Use.getUser();
24055+ if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
24056+ (!User->hasOneUse() ||
24057+ (*User->user_begin())->getOpcode() != ISD::STORE))
24058+ return SDValue();
24059+ }
24060+ }
2410024061
24101- EVT FPMemVT = EVT::getFloatingPointVT(MemVT.getSizeInBits());
24102- if (ST->isTruncatingStore() && FPMemVT != FPElemVT) {
24103- SDValue Trunc = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL,
24104- FPMemVT, Ext);
24105- return DAG.getStore(ST->getChain(), DL, Trunc, ST->getBasePtr(),
24106- ST->getMemOperand());
24062+ SDValue ExtVector = Vector;
24063+ if (!ExtCst || !ExtCst->isZero()) {
24064+ // Handle extracting from lanes != 0.
24065+ SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
24066+ Value.getValueType(), Vector, ExtIdx);
24067+ // FIXME: Using a fixed-size vector for the insertion should not be
24068+ // necessary, but SVE ISEL is missing some folds to avoid fmovs.
24069+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
24070+ EVT InsertVectorVT = EVT::getVectorVT(
24071+ *DAG.getContext(), ElemVT,
24072+ VectorVT.getVectorElementCount().getKnownMinValue(), false);
24073+ ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, InsertVectorVT,
24074+ DAG.getUNDEF(InsertVectorVT), Ext, Zero);
2410724075 }
2410824076
24109- return DAG.getStore(ST->getChain(), DL, Ext, ST->getBasePtr(),
24077+ EVT FPMemVT = MemVT == MVT::i8
24078+ ? MVT::aarch64mfp8
24079+ : EVT::getFloatingPointVT(MemVT.getSizeInBits());
24080+ SDValue FPSubreg = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL,
24081+ FPMemVT, ExtVector);
24082+
24083+ return DAG.getStore(ST->getChain(), DL, FPSubreg, ST->getBasePtr(),
2411024084 ST->getMemOperand());
2411124085 }
2411224086
@@ -28861,10 +28835,6 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
2886128835 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
2886228836 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
2886328837
28864- // Can be lowered to a bsub store in ISEL.
28865- if (VT == MVT::v1i64 && MemVT == MVT::v1i8)
28866- return SDValue();
28867-
2886828838 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
2886928839 EVT TruncVT = ContainerVT.changeVectorElementType(
2887028840 Store->getMemoryVT().getVectorElementType());
0 commit comments