-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[MemCpyOpt] Drop dead memmove calls on memset'd source data
#101930
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[MemCpyOpt] Drop dead memmove calls on memset'd source data
#101930
Conversation
|
@llvm/pr-subscribers-llvm-transforms Author: Antonio Frighetto (antoniofrighetto) ChangesWhen a memmove happens to clobber source data, and such data have been previously memset'd, the memmove may be redundant. Fixes: #101680. Full diff: https://github.com/llvm/llvm-project/pull/101930.diff 3 Files Affected:
diff --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
index 8ed03d7f3ddbf..1fbd29ea8b32a 100644
--- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -66,7 +66,7 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
BasicBlock::iterator &BBI);
bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
bool processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI);
- bool processMemMove(MemMoveInst *M);
+ bool processMemMove(MemMoveInst *M, BasicBlock::iterator &BBI);
bool performCallSlotOptzn(Instruction *cpyLoad, Instruction *cpyStore,
Value *cpyDst, Value *cpySrc, TypeSize cpyLen,
Align cpyAlign, BatchAAResults &BAA,
@@ -85,6 +85,7 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
bool performStackMoveOptzn(Instruction *Load, Instruction *Store,
AllocaInst *DestAlloca, AllocaInst *SrcAlloca,
TypeSize Size, BatchAAResults &BAA);
+ bool mayBeMemMoveMemSetDependency(MemMoveInst *M);
void eraseInstruction(Instruction *I);
bool iterateOnFunction(Function &F);
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index cee34f0a6da1f..30d16ca3f2f03 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -69,6 +69,7 @@ static cl::opt<bool> EnableMemCpyOptWithoutLibcalls(
cl::desc("Enable memcpyopt even when libcalls are disabled"));
STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
+STATISTIC(NumMemMoveInstr, "Number of memmove instructions deleted");
STATISTIC(NumMemSetInfer, "Number of memsets inferred");
STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
@@ -1854,12 +1855,54 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
return false;
}
+/// Memmove calls with overlapping src/dest buffers that come after a memset may
+/// be removed.
+bool MemCpyOptPass::mayBeMemMoveMemSetDependency(MemMoveInst *M) {
+ MemoryUseOrDef *MemMoveAccess = MSSA->getMemoryAccess(M);
+ if (!MemMoveAccess)
+ return false;
+
+ BatchAAResults BAA(*AA);
+ MemoryAccess *FirstDef = MemMoveAccess->getDefiningAccess();
+ MemoryLocation SourceLoc = MemoryLocation::getForSource(M);
+ MemoryAccess *SourceClobber =
+ MSSA->getWalker()->getClobberingMemoryAccess(FirstDef, SourceLoc, BAA);
+
+ MemSetInst *MS = nullptr;
+ if (auto *Def = dyn_cast<MemoryDef>(SourceClobber))
+ if (auto *I = dyn_cast<Instruction>(Def->getMemoryInst()))
+ MS = dyn_cast_or_null<MemSetInst>(I);
+
+ if (!MS || MS->getParent() != M->getParent())
+ return false;
+
+ // The destination buffer must have been memset'd.
+ if (!BAA.isMustAlias(MS->getDest(), M->getDest()))
+ return false;
+
+ // No clobbering writes in between.
+ if (writtenBetween(MSSA, BAA, SourceLoc, MSSA->getMemoryAccess(MS),
+ MemMoveAccess))
+ return false;
+ return true;
+}
+
/// Transforms memmove calls to memcpy calls when the src/dst are guaranteed
/// not to alias.
-bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
+bool MemCpyOptPass::processMemMove(MemMoveInst *M, BasicBlock::iterator &BBI) {
// See if the source could be modified by this memmove potentially.
- if (isModSet(AA->getModRefInfo(M, MemoryLocation::getForSource(M))))
+ if (isModSet(AA->getModRefInfo(M, MemoryLocation::getForSource(M)))) {
+ // On the off-chance the memmove clobbers src with previously memset'd
+ // bytes, the memmove may be redundant.
+ if (mayBeMemMoveMemSetDependency(M)) {
+ LLVM_DEBUG(dbgs() << "Removed redundant memmove.\n");
+ ++BBI;
+ eraseInstruction(M);
+ ++NumMemMoveInstr;
+ return true;
+ }
return false;
+ }
LLVM_DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
<< "\n");
@@ -2068,7 +2111,7 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) {
else if (auto *M = dyn_cast<MemCpyInst>(I))
RepeatInstruction = processMemCpy(M, BI);
else if (auto *M = dyn_cast<MemMoveInst>(I))
- RepeatInstruction = processMemMove(M);
+ RepeatInstruction = processMemMove(M, BI);
else if (auto *CB = dyn_cast<CallBase>(I)) {
for (unsigned i = 0, e = CB->arg_size(); i != e; ++i) {
if (CB->isByValArgument(i))
diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memmove-redundant-memmove.ll b/llvm/test/Transforms/MemCpyOpt/memset-memmove-redundant-memmove.ll
new file mode 100644
index 0000000000000..6017f39a90c86
--- /dev/null
+++ b/llvm/test/Transforms/MemCpyOpt/memset-memmove-redundant-memmove.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=memcpyopt -S %s -verify-memoryssa | FileCheck %s
+
+; Redundant memmove.
+define i32 @test() {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: [[ARRAY:%.*]] = alloca [26 x i32], align 16
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 [[ARRAY]], i8 0, i64 104, i1 false)
+; CHECK-NEXT: [[ARRAY_IDX:%.*]] = getelementptr inbounds i8, ptr [[ARRAY]], i64 4
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ARRAY]], align 16
+; CHECK-NEXT: ret i32 [[VAL]]
+;
+ %array = alloca [26 x i32], align 16
+ call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 %array, i8 0, i64 104, i1 false)
+ %array.idx = getelementptr inbounds i8, ptr %array, i64 4
+ call void @llvm.memmove.p0.p0.i64(ptr noundef nonnull align 16 %array, ptr noundef nonnull align 4 %array.idx, i64 100, i1 false)
+ %val = load i32, ptr %array, align 16
+ ret i32 %val
+}
+
+; Used memmove, buffer is reset to zero.
+define i32 @test1() {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: [[ARRAY:%.*]] = alloca [26 x i32], align 16
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 [[ARRAY]], i8 0, i64 104, i1 false)
+; CHECK-NEXT: [[ARRAY_IDX:%.*]] = getelementptr inbounds i8, ptr [[ARRAY]], i64 4
+; CHECK-NEXT: store i32 1, ptr [[ARRAY_IDX]], align 4
+; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr noundef nonnull align 16 [[ARRAY]], ptr noundef nonnull align 4 [[ARRAY_IDX]], i64 100, i1 false)
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ARRAY_IDX]], align 4
+; CHECK-NEXT: ret i32 [[VAL]]
+;
+ %array = alloca [26 x i32], align 16
+ call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 %array, i8 0, i64 104, i1 false)
+ %array.idx = getelementptr inbounds i8, ptr %array, i64 4
+ store i32 1, ptr %array.idx
+ call void @llvm.memmove.p0.p0.i64(ptr noundef nonnull align 16 %array, ptr noundef nonnull align 4 %array.idx, i64 100, i1 false)
+ %val = load i32, ptr %array.idx, align 4
+ ret i32 %val
+}
+
+; Used memmove, buffer clobbered by opaque.
+define i32 @test2() {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: [[ARRAY:%.*]] = alloca [25 x i32], align 16
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(100) [[ARRAY]], i8 0, i64 100, i1 false)
+; CHECK-NEXT: call void @opaque(ptr noundef nonnull [[ARRAY]])
+; CHECK-NEXT: [[ARRAY_IDX:%.*]] = getelementptr inbounds i8, ptr [[ARRAY]], i64 4
+; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(96) [[ARRAY]], ptr noundef nonnull align 4 dereferenceable(96) [[ARRAY_IDX]], i64 96, i1 false)
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ARRAY]], align 16
+; CHECK-NEXT: ret i32 [[VAL]]
+;
+ %array = alloca [25 x i32], align 16
+ call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 dereferenceable(100) %array, i8 0, i64 100, i1 false)
+ call void @opaque(ptr noundef nonnull %array)
+ %array.idx = getelementptr inbounds i8, ptr %array, i64 4
+ call void @llvm.memmove.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(96) %array, ptr noundef nonnull align 4 dereferenceable(96) %array.idx, i64 96, i1 false)
+ %val = load i32, ptr %array, align 16
+ ret i32 %val
+}
+
+declare void @opaque(ptr)
+declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1)
+declare void @llvm.memmove.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind
|
dianqk
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there any real-world code written like this?
|
|
||
| /// Memmove calls with overlapping src/dest buffers that come after a memset may | ||
| /// be removed. | ||
| bool MemCpyOptPass::mayBeMemMoveMemSetDependency(MemMoveInst *M) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess this should be "must"?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it's implicit in the fact that the memmove comes after the memset?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
may in the function name is not strong enough, it should be something like isMemMove...
While the usefulness of the patch is indeed questionable, dubious at the very least, I thought it could still make sense considering that the underlying optimization is quite trivial/basic per se, and has a low compile-time. I feel like compilers should still perform at their best if possible, while clearly trading for efficiency. That said, I advocate for optimizations that turn out to be actually profitable, so I do not oppose if this is an unnecessary addition. |
See https://github.com/dtcxzyw/llvm-opt-benchmark/pull/1135/files |
|
Kind ping. |
|
precommitting the test would be nice so we can see the effect of this change |
|
also could you run this through https://llvm-compile-time-tracker.com/? |
Tests are already in a separate commit (2573b26).
Compile-time looks OK: https://llvm-compile-time-tracker.com/compare.php?from=2573b26d9dc2dcb53fe0492eccecd2f6f8486295&to=fd47d7f80291f5384c2ace760463cd39791f81ff&stat=instructions:u. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| if (isMemMoveMemSetDependency(M)) { | |
| if (!M->isVolatile() && isMemMoveMemSetDependency(M)) { |
Should bail out volatile memmoves.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add a negative test for MS->getParent() != M->getParent(). Why can't we do this if the memset and memmove are not in the same block?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I had limited this to the same block mostly so as to avoid unnecessary queries to the cached AA results, but I think it should be fine as long as there are no clobbering writes in between. Maybe worth understanding if this is the correct way to pursue, before dropping it.
b2dd2b9 to
ff821fb
Compare
| ; CHECK-NEXT: ret i32 [[VAL]] | ||
| ; | ||
| %array = alloca [26 x i32], align 16 | ||
| call void @llvm.memset.p0.i64(ptr noundef nonnull align 16 %array, i8 0, i64 104, i1 false) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please remove irrelevant attributes like noundef nonnull.
| return false; | ||
|
|
||
| // The destination buffer must have been memset'd. | ||
| if (!BAA.isMustAlias(MS->getDest(), M->getDest())) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You also need to make sure that the source location is a subset of the memset memory. Currently, you just find a potentially clobbering memset, and then assume that it is fully clobbering. I think this is why https://github.com/dtcxzyw/llvm-opt-benchmark/pull/1135/files#diff-532ba66b66620dd644c14e91d18a3293194f3bfadfef06f698e87588e3d060ed is miscompiled.
|
|
||
| // No clobbering writes in between. | ||
| if (writtenBetween(MSSA, BAA, SourceLoc, MSSA->getMemoryAccess(MS), | ||
| MemMoveAccess)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This check doesn't make sense to me. You have already checked above that SourceLoc is not clobbered using getClobberingMemoryAccess. What you need to verify now is that the destination location is not clobbered either. In your tests that would be store i32 1, ptr %array instead of store i32 1, ptr %array.idx.
It would probably be cleanest if you a) first checked that the memmove is of the form memmove(x, x+A, B) and then checked the location x with LocationSize A+B to cover both at the same time. Then you don't need writtenBefore and can handle the cross-BB case for free.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see, indeed the first dominating clobbering access for the destination location should be the memset too. Could you please check that now this is more aligned with what you thought? Though, shouldn't the LocationSize be just B to check the source region (and not A+B)?
ff821fb to
26e1089
Compare
|
Does this look on track? |
| /// be removed. | ||
| bool MemCpyOptPass::isMemMoveMemSetDependency(MemMoveInst *M) { | ||
| const auto &DL = M->getDataLayout(); | ||
| MemSetInst *MS = nullptr; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Move MS declaration down.
| if (!Source) | ||
| return false; | ||
| APInt Offset(DL.getIndexTypeSizeInBits(Source->getType()), 0); | ||
| auto MemMoveSize = MemoryLocation::getForSource(M).Size; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Already have SourceLoc above.
| // The first dominating clobbering MemoryAccess for the destination location | ||
| // needs to be the memset as well. | ||
| MemoryAccess *DestClobber = | ||
| MSSA->getWalker()->getClobberingMemoryAccess(FirstDef, DestLoc, BAA); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we do only a single MSSA query with the combined location instead of two?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| auto MemMoveSize = SourceLoc.Size; | |
| LocationSize MemMoveSize = SourceLoc.Size; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can move the getPointerOperand() check first, it's cheaper than accumulateConstantOffset...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should have done earlier.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think these getModRefInfo() queries are needed? This is implied by the isMustAlias call already.
A check I am missing though is that the memset size is sufficiently large. I think currently your transform would trigger on a partial memset?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, that one was missing, added a test for it too, thanks. While at it, I also added the case (and test for it) where x + A appears as a constantexpr (not sure if it's a case really needed though).
693081c to
aaaa51f
Compare
| // The memmove is of form memmove(x, x + A, B). | ||
| MemoryLocation SourceLoc = MemoryLocation::getForSource(M); | ||
| auto *MemMoveSourceOp = M->getSource(); | ||
| auto *Source = dyn_cast<GetElementPtrInst>(MemMoveSourceOp); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| auto *Source = dyn_cast<GetElementPtrInst>(MemMoveSourceOp); | |
| auto *Source = dyn_cast<GEPOperator>(MemMoveSourceOp); |
handles both instruction and constexpr, no need to do both.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, updated.
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Offset.isNegative() needs to be checked after accumulateConstantOffset.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ugh, overlooked it before.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| const uint64_t MemMoveSize = MemMoveLocSize.getValue(); | |
| uint64_t MemMoveSize = MemMoveLocSize.getValue(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| MemoryAccess *DestClobber = MSSA->getWalker()->getClobberingMemoryAccess( | |
| FirstDef, CombinedDestLoc, BAA); | |
| auto *DestClobber = dyn_cast<MemoryDef>(MSSA->getWalker()->getClobberingMemoryAccess( | |
| FirstDef, CombinedDestLoc, BAA)); | |
| if (!DestClobber) | |
| return nullptr; |
And then you can directly assign MS after that.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you please add a test where the memset size is not constant? We should use dyn_cast here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added, and used dyn_cast.
nikic
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unused variable.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| MemoryLocation CombinedDestLoc(M->getDest(), TotalSize); | |
| MemoryLocation CombinedLoc(M->getDest(), TotalSize); |
It's now both the Dest and Source Loc :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you please add a negative test where this check fails?
1db88b4 to
381f9ab
Compare
When a memmove happens to clobber source data, and such data have been previously memset'd, the memmove may be redundant.
381f9ab to
1d6ab18
Compare
When a memmove happens to clobber source data, and such data have been previously memset'd, the memmove may be redundant.
Fixes: #101680.