Skip to content

Commit e2a1ee3

Browse files
committed
Enable NEON for non-zero memset of all sizes on AArch64
Allow getOptimalMemOpType and getOptimalMemOpLLT to return a 128-bit vector type (v16i8 and v2i64, respectively) for non-zero memset operations of any size when NEON is available; zero memsets smaller than 32 bytes are still excluded from the vector path. This enables the DUP optimization to work for memset_16. This complements the SelectionDAG.cpp change that generates vector splats for 4 and 8 byte memset operations. Signed-off-by: Osama Abdelkader <[email protected]>
1 parent 3ee2152 commit e2a1ee3

File tree

1 file changed

+15
-10
lines changed

1 file changed

+15
-10
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18328,10 +18328,11 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
1832818328
bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
1832918329
bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
1833018330
bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18331-
// Only use AdvSIMD to implement memset of 32-byte and above. It would have
18331+
// For zero memset, only use AdvSIMD for 32-byte and above. It would have
1833218332
// taken one instruction to materialize the v2i64 zero and one store (with
1833318333
// restrictive addressing mode). Just do i64 stores.
18334-
bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18334+
// For non-zero memset, use NEON even for smaller sizes as dup is efficient.
18335+
bool IsSmallZeroMemset = Op.isMemset() && Op.size() < 32 && Op.isZeroMemset();
1833518336
auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
1833618337
if (Op.isAligned(AlignCheck))
1833718338
return true;
@@ -18341,10 +18342,11 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
1834118342
Fast;
1834218343
};
1834318344

18344-
if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18345-
AlignmentIsAcceptable(MVT::v16i8, Align(16)))
18345+
// Use NEON for any memset except a small zero memset; for non-zero values, dup + scalar store is efficient even at small sizes.
18346+
if (CanUseNEON && Op.isMemset() && !IsSmallZeroMemset)
1834618347
return MVT::v16i8;
18347-
if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18348+
if (CanUseFP && !IsSmallZeroMemset &&
18349+
AlignmentIsAcceptable(MVT::f128, Align(16)))
1834818350
return MVT::f128;
1834918351
if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
1835018352
return MVT::i64;
@@ -18358,10 +18360,11 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT(
1835818360
bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
1835918361
bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
1836018362
bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18361-
// Only use AdvSIMD to implement memset of 32-byte and above. It would have
18363+
// For zero memset, only use AdvSIMD for 32-byte and above. It would have
1836218364
// taken one instruction to materialize the v2i64 zero and one store (with
1836318365
// restrictive addressing mode). Just do i64 stores.
18364-
bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18366+
// For non-zero memset, use NEON even for smaller sizes as dup is efficient.
18367+
bool IsSmallZeroMemset = Op.isMemset() && Op.size() < 32 && Op.isZeroMemset();
1836518368
auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
1836618369
if (Op.isAligned(AlignCheck))
1836718370
return true;
@@ -18371,10 +18374,12 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT(
1837118374
Fast;
1837218375
};
1837318376

18374-
if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18375-
AlignmentIsAcceptable(MVT::v2i64, Align(16)))
18377+
// For non-zero memset, use NEON for all sizes where it's beneficial.
18378+
// NEON dup + scalar store works for any alignment and is efficient.
18379+
if (CanUseNEON && Op.isMemset() && !IsSmallZeroMemset)
1837618380
return LLT::fixed_vector(2, 64);
18377-
if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18381+
if (CanUseFP && !IsSmallZeroMemset &&
18382+
AlignmentIsAcceptable(MVT::f128, Align(16)))
1837818383
return LLT::scalar(128);
1837918384
if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
1838018385
return LLT::scalar(64);

0 commit comments

Comments
 (0)