@@ -18328,10 +18328,11 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
1832818328 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
1832918329 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
1833018330 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18331- // Only use AdvSIMD to implement memset of 32-byte and above. It would have
18331+ // For zero memset, only use AdvSIMD for 32-byte and above. It would have
1833218332 // taken one instruction to materialize the v2i64 zero and one store (with
1833318333 // restrictive addressing mode). Just do i64 stores.
18334- bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18334+ // For non-zero memset, use NEON even for smaller sizes as dup is efficient.
18335+ bool IsSmallZeroMemset = Op.isMemset() && Op.size() < 32 && Op.isZeroMemset();
1833518336 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
1833618337 if (Op.isAligned(AlignCheck))
1833718338 return true;
@@ -18341,10 +18342,11 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
1834118342 Fast;
1834218343 };
1834318344
18344- if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18345- AlignmentIsAcceptable(MVT::v16i8, Align(16)))
18345+ // For non-zero memset, use NEON even for smaller sizes as dup + scalar store is efficient.
18346+ if (CanUseNEON && Op.isMemset() && !IsSmallZeroMemset)
1834618347 return MVT::v16i8;
18347- if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18348+ if (CanUseFP && !IsSmallZeroMemset &&
18349+ AlignmentIsAcceptable(MVT::f128, Align(16)))
1834818350 return MVT::f128;
1834918351 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
1835018352 return MVT::i64;
@@ -18358,10 +18360,11 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT(
1835818360 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
1835918361 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
1836018362 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18361- // Only use AdvSIMD to implement memset of 32-byte and above. It would have
18363+ // For zero memset, only use AdvSIMD for 32-byte and above. It would have
1836218364 // taken one instruction to materialize the v2i64 zero and one store (with
1836318365 // restrictive addressing mode). Just do i64 stores.
18364- bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18366+ // For non-zero memset, use NEON even for smaller sizes as dup is efficient.
18367+ bool IsSmallZeroMemset = Op.isMemset() && Op.size() < 32 && Op.isZeroMemset();
1836518368 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
1836618369 if (Op.isAligned(AlignCheck))
1836718370 return true;
@@ -18371,10 +18374,12 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT(
1837118374 Fast;
1837218375 };
1837318376
18374- if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18375- AlignmentIsAcceptable(MVT::v2i64, Align(16)))
18377+ // For non-zero memset, use NEON for all sizes where it's beneficial.
18378+ // NEON dup + scalar store works for any alignment and is efficient.
18379+ if (CanUseNEON && Op.isMemset() && !IsSmallZeroMemset)
1837618380 return LLT::fixed_vector(2, 64);
18377- if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18381+ if (CanUseFP && !IsSmallZeroMemset &&
18382+ AlignmentIsAcceptable(MVT::f128, Align(16)))
1837818383 return LLT::scalar(128);
1837918384 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
1838018385 return LLT::scalar(64);
0 commit comments