[AggressiveInstCombine] Memchr inline #130525
```diff
@@ -54,10 +54,9 @@ static cl::opt<unsigned> StrNCmpInlineThreshold(
     cl::desc("The maximum length of a constant string for a builtin string cmp "
              "call eligible for inlining. The default value is 3."));
 
-static cl::opt<unsigned>
-    MemChrInlineThreshold("memchr-inline-threshold", cl::init(3), cl::Hidden,
-                          cl::desc("The maximum length of a constant string to "
-                                   "inline a memchr call."));
+static cl::opt<unsigned> MemChrInlineThreshold(
+    "memchr-inline-threshold", cl::init(6), cl::Hidden,
+    cl::desc("Size threshold for inlining memchr calls"));
 
 /// Match a pattern for a bitwise funnel/rotate operation that partially guards
 /// against undefined behavior by branching around the funnel-shift/rotation
```
```diff
@@ -1106,79 +1105,46 @@ void StrNCmpInliner::inlineCompare(Value *LHS, StringRef RHS, uint64_t N,
   }
 }
 
 /// Convert memchr with a small constant string into a switch
 static bool foldMemChr(CallInst *Call, DomTreeUpdater *DTU,
                        const DataLayout &DL) {
-  if (isa<Constant>(Call->getArgOperand(1)))
-    return false;
+  Value *Ptr = Call->getArgOperand(0);
+  Value *Val = Call->getArgOperand(1);
+  Value *Len = Call->getArgOperand(2);
 
-  StringRef Str;
-  Value *Base = Call->getArgOperand(0);
-  if (!getConstantStringInfo(Base, Str, /*TrimAtNul=*/false))
-    return false;
+  // If length is not a constant, we can't do the optimization
+  auto *LenC = dyn_cast<ConstantInt>(Len);
+  if (!LenC)
+    return false;
 
-  uint64_t N = Str.size();
-  if (auto *ConstInt = dyn_cast<ConstantInt>(Call->getArgOperand(2))) {
-    uint64_t Val = ConstInt->getZExtValue();
-    // Ignore the case that n is larger than the size of string.
-    if (Val > N)
-      return false;
-    N = Val;
-  } else
-    return false;
-
-  if (N > MemChrInlineThreshold)
-    return false;
-
-  BasicBlock *BB = Call->getParent();
-  BasicBlock *BBNext = SplitBlock(BB, Call, DTU);
-  IRBuilder<> IRB(BB);
-  IntegerType *ByteTy = IRB.getInt8Ty();
-  BB->getTerminator()->eraseFromParent();
-  SwitchInst *SI = IRB.CreateSwitch(
-      IRB.CreateTrunc(Call->getArgOperand(1), ByteTy), BBNext, N);
-  Type *IndexTy = DL.getIndexType(Call->getType());
-  SmallVector<DominatorTree::UpdateType, 8> Updates;
-
-  BasicBlock *BBSuccess = BasicBlock::Create(
-      Call->getContext(), "memchr.success", BB->getParent(), BBNext);
-  IRB.SetInsertPoint(BBSuccess);
-  PHINode *IndexPHI = IRB.CreatePHI(IndexTy, N, "memchr.idx");
-  Value *FirstOccursLocation = IRB.CreateInBoundsPtrAdd(Base, IndexPHI);
-  IRB.CreateBr(BBNext);
-  if (DTU)
-    Updates.push_back({DominatorTree::Insert, BBSuccess, BBNext});
-
-  SmallPtrSet<ConstantInt *, 4> Cases;
-  for (uint64_t I = 0; I < N; ++I) {
-    ConstantInt *CaseVal = ConstantInt::get(ByteTy, Str[I]);
-    if (!Cases.insert(CaseVal).second)
-      continue;
-
-    BasicBlock *BBCase = BasicBlock::Create(Call->getContext(), "memchr.case",
-                                            BB->getParent(), BBSuccess);
-    SI->addCase(CaseVal, BBCase);
-    IRB.SetInsertPoint(BBCase);
-    IndexPHI->addIncoming(ConstantInt::get(IndexTy, I), BBCase);
-    IRB.CreateBr(BBSuccess);
-    if (DTU) {
-      Updates.push_back({DominatorTree::Insert, BB, BBCase});
-      Updates.push_back({DominatorTree::Insert, BBCase, BBSuccess});
-    }
-  }
+  uint64_t Length = LenC->getZExtValue();
+
+  // Check if this is a small memchr we should inline
+  if (Length <= MemChrInlineThreshold) {
+    IRBuilder<> IRB(Call);
+
+    // Truncate the search value to i8
+    Value *ByteVal = IRB.CreateTrunc(Val, IRB.getInt8Ty());
+
+    // Initialize result to null
+    Value *Result = ConstantPointerNull::get(cast<PointerType>(Call->getType()));
+
+    // For each byte up to Length
+    for (unsigned i = 0; i < Length; i++) {
+      Value *CurPtr = i == 0 ? Ptr :
+          IRB.CreateGEP(IRB.getInt8Ty(), Ptr,
+                        ConstantInt::get(DL.getIndexType(Call->getType()), i));
+      Value *CurByte = IRB.CreateLoad(IRB.getInt8Ty(), CurPtr);
```
Member
I think it is dangerous to put these load instructions in a basic block. It is well-defined because it returns at the first occurrence. To avoid UB, we should transform this call into an if chain. BTW, your implementation returns the last occurrence of ch :(
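To make the reviewer's two points concrete, here is a source-level analogue of what the select-chain expansion computes for a length-2 call, next to the early-exit behaviour that memchr guarantees. This is an editorial illustration only, not code from the patch or from LLVM; the function names are invented.

```cpp
// Analogue of the select-chain expansion: every byte is loaded
// unconditionally, and a later match overwrites an earlier one, so a
// duplicate byte yields the LAST occurrence and bytes beyond the first
// match are still dereferenced.
const char *selectChainMemChr2(const char *P, char C) {
  char B0 = P[0];               // always loaded
  char B1 = P[1];               // always loaded, even if B0 == C
  const char *R = nullptr;
  R = (B0 == C) ? P : R;
  R = (B1 == C) ? P + 1 : R;    // clobbers the earlier match
  return R;
}

// What memchr guarantees: stop at the first match and never touch later
// bytes, which is why an inlined form needs a branch (if chain) per byte.
const char *earlyExitMemChr2(const char *P, char C) {
  if (P[0] == C)
    return P;
  if (P[1] == C)
    return P + 1;
  return nullptr;
}
```

With the target byte present at both positions, selectChainMemChr2 returns P + 1 where memchr would return P, and it still reads P[1] even when P[0] already matched.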
```diff
+      Value *CmpRes = IRB.CreateICmpEQ(CurByte, ByteVal);
+      Result = IRB.CreateSelect(CmpRes, CurPtr, Result);
+    }
+
+    // Replace the call with our expanded version
+    Call->replaceAllUsesWith(Result);
+    Call->eraseFromParent();
+    return true;
+  }
 
-  PHINode *PHI =
-      PHINode::Create(Call->getType(), 2, Call->getName(), BBNext->begin());
-  PHI->addIncoming(Constant::getNullValue(Call->getType()), BB);
-  PHI->addIncoming(FirstOccursLocation, BBSuccess);
-
-  Call->replaceAllUsesWith(PHI);
-  Call->eraseFromParent();
-
-  if (DTU)
-    DTU->applyUpdates(Updates);
-
-  return true;
+  return false;
 }
 
 static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI,
```
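If the review suggestion is adopted, the expansion needs real control flow rather than a flat select chain. Below is a rough IRBuilder sketch of one possible shape, not the reviewer's or the author's actual code: the helper name emitMemChrIfChain is invented, the threshold and constant-length checks are assumed to have already been done by the caller, and DominatorTree updates are omitted for brevity. It assumes it would sit next to foldMemChr so the usual LLVM headers and types are in scope.

```cpp
static void emitMemChrIfChain(CallInst *Call, uint64_t Length,
                              const DataLayout &DL) {
  IRBuilder<> IRB(Call);
  Value *Ptr = Call->getArgOperand(0);
  Value *ByteVal = IRB.CreateTrunc(Call->getArgOperand(1), IRB.getInt8Ty());
  Type *IndexTy = DL.getIndexType(Call->getType());

  // Everything from the call onwards moves into the join block; the branch
  // that splitBasicBlock inserts is removed so we can build our own CFG.
  BasicBlock *BB = Call->getParent();
  BasicBlock *Join = BB->splitBasicBlock(Call, "memchr.join");
  BB->getTerminator()->eraseFromParent();

  // One incoming value per byte plus the "not found" path.
  PHINode *Res = PHINode::Create(Call->getType(), Length + 1, "memchr.res",
                                 Join->begin());

  IRB.SetInsertPoint(BB);
  for (uint64_t I = 0; I != Length; ++I) {
    Value *CurPtr =
        IRB.CreateInBoundsPtrAdd(Ptr, ConstantInt::get(IndexTy, I));
    Value *Byte = IRB.CreateLoad(IRB.getInt8Ty(), CurPtr);
    Value *IsMatch = IRB.CreateICmpEQ(Byte, ByteVal);

    // On a match jump straight to the join block with the current pointer;
    // otherwise fall through and test the next byte, so no byte is ever
    // loaded once an earlier one has matched.
    BasicBlock *Next = BasicBlock::Create(Call->getContext(), "memchr.next",
                                          BB->getParent(), Join);
    Res->addIncoming(CurPtr, IRB.GetInsertBlock());
    IRB.CreateCondBr(IsMatch, Join, Next);
    IRB.SetInsertPoint(Next);
  }
  // No byte matched: the result is null.
  Res->addIncoming(Constant::getNullValue(Call->getType()),
                   IRB.GetInsertBlock());
  IRB.CreateBr(Join);

  Call->replaceAllUsesWith(Res);
  Call->eraseFromParent();
}
```

Each byte is loaded only if all earlier bytes failed to match, so the expansion preserves memchr's first-occurrence result and never touches memory past the match.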
New test file:

```diff
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=aggressive-instcombine --memchr-inline-threshold=2 < %s | FileCheck %s
+
+declare ptr @memchr(ptr, i32, i64)
+
+define ptr @test_memchr_small(ptr %p, i32 %val) {
+; CHECK-LABEL: define ptr @test_memchr_small(
+; CHECK-SAME: ptr [[P:%.*]], i32 [[VAL:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VAL]] to i8
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[P]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], ptr [[P]], ptr null
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[P]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i8 [[TMP5]], [[TMP0]]
+; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], ptr [[TMP4]], ptr [[TMP3]]
+; CHECK-NEXT: ret ptr [[TMP7]]
+;
+entry:
+  %res = call ptr @memchr(ptr %p, i32 %val, i64 2)
+  ret ptr %res
+}
```
Reviewer:
What is the purpose of increasing this threshold to 6?

Author:
I initially set the threshold to 6 to match the test cases in memchr.ll, which test the inlining optimization for lengths up to 5 bytes. Would you recommend keeping it at 3 to be more conservative?

Reviewer:
I suggest keeping it at 3 unless you have performance data on llvm-test-suite/SPEC.