-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[Offload][PGO] Fix PGO on NVPTX targets #143568
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -391,6 +391,13 @@ class InstrLowerer final { | |
| /// Create INSTR_PROF_DATA variable for counters and bitmaps. | ||
| void createDataVariable(InstrProfCntrInstBase *Inc); | ||
|
|
||
| /// Creates delayed initialiation function for data relative offsets | ||
| /// This is only relevant on NVPTX targets where circular constant structures | ||
| /// are not allowed | ||
| bool | ||
| emitDataDelayedInit(SmallVector<Function *> &Kernels, | ||
| SmallVector<const InstrProfCntrInstBase *> &ValueSites); | ||
|
|
||
| /// Get the counters for virtual table values, creating them if necessary. | ||
| void getOrCreateVTableProfData(GlobalVariable *GV); | ||
|
|
||
|
|
@@ -947,11 +954,18 @@ bool InstrLowerer::lower() { | |
| if (!ContainsProfiling && !CoverageNamesVar) | ||
| return MadeChange; | ||
|
|
||
| // Cached info for generating delayed offset calculations | ||
| // This is only relevant on NVPTX targets | ||
| SmallVector<Function *> Kernels; | ||
| SmallVector<const InstrProfCntrInstBase *> ValueSites; | ||
|
|
||
| // We did not know how many value sites there would be inside | ||
| // the instrumented function. This is counting the number of instrumented | ||
| // target value sites to enter it as field in the profile data variable. | ||
| for (Function &F : M) { | ||
| InstrProfCntrInstBase *FirstProfInst = nullptr; | ||
| if (F.getCallingConv() == CallingConv::PTX_Kernel) | ||
| Kernels.push_back(&F); | ||
| for (BasicBlock &BB : F) { | ||
| for (auto I = BB.begin(), E = BB.end(); I != E; I++) { | ||
| if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(I)) | ||
|
|
@@ -971,9 +985,12 @@ bool InstrLowerer::lower() { | |
| // Also create the data variable based on the MCDCParams. | ||
| if (FirstProfInst != nullptr) { | ||
| static_cast<void>(getOrCreateRegionCounters(FirstProfInst)); | ||
| ValueSites.push_back(FirstProfInst); | ||
| } | ||
| } | ||
|
|
||
| MadeChange |= emitDataDelayedInit(Kernels, ValueSites); | ||
|
|
||
| if (EnableVTableValueProfiling) | ||
| for (GlobalVariable &GV : M.globals()) | ||
| // Global variables with type metadata are virtual table variables. | ||
|
|
@@ -1734,6 +1751,13 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) { | |
| return PD.RegionCounters; | ||
| } | ||
|
|
||
| // Calculates difference between two global variable addresses as an integer | ||
| Constant *globalVarDiff(Module &M, GlobalVariable *A, GlobalVariable *B) { | ||
| auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext()); | ||
| return ConstantExpr::getSub(ConstantExpr::getPtrToInt(A, IntPtrTy), | ||
| ConstantExpr::getPtrToInt(B, IntPtrTy)); | ||
| } | ||
|
|
||
| void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { | ||
| // When debug information is correlated to profile data, a data variable | ||
| // is not needed. | ||
|
|
@@ -1854,13 +1878,12 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { | |
| // Reference the counter variable with a label difference (link-time | ||
| // constant). | ||
| DataSectionKind = IPSK_data; | ||
| RelativeCounterPtr = | ||
| ConstantExpr::getSub(ConstantExpr::getPtrToInt(CounterPtr, IntPtrTy), | ||
| ConstantExpr::getPtrToInt(Data, IntPtrTy)); | ||
| const Triple T(M.getTargetTriple()); | ||
| RelativeCounterPtr = T.isNVPTX() ? ConstantInt::get(IntPtrTy, 0) | ||
| : globalVarDiff(M, CounterPtr, Data); | ||
| if (BitmapPtr != nullptr) | ||
| RelativeBitmapPtr = | ||
| ConstantExpr::getSub(ConstantExpr::getPtrToInt(BitmapPtr, IntPtrTy), | ||
| ConstantExpr::getPtrToInt(Data, IntPtrTy)); | ||
| RelativeBitmapPtr = T.isNVPTX() ? ConstantInt::get(IntPtrTy, 0) | ||
| : globalVarDiff(M, BitmapPtr, Data); | ||
| } | ||
|
|
||
| Constant *DataVals[] = { | ||
|
|
@@ -1887,6 +1910,51 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { | |
| ReferencedNames.push_back(NamePtr); | ||
| } | ||
|
|
||
| bool InstrLowerer::emitDataDelayedInit( | ||
| SmallVector<Function *> &Kernels, | ||
| SmallVector<const InstrProfCntrInstBase *> &ValueSites) { | ||
| const Triple T(M.getTargetTriple()); | ||
| if (!T.isNVPTX() || ProfileCorrelate == InstrProfCorrelator::BINARY || | ||
| Kernels.empty() || ValueSites.empty()) { | ||
| return false; | ||
| } | ||
|
|
||
| auto *VoidTy = Type::getVoidTy(M.getContext()); | ||
| auto *Int32Ty = Type::getInt32Ty(M.getContext()); | ||
| auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext()); | ||
| auto *DelayedInitFTy = FunctionType::get(VoidTy, false); | ||
| auto *DelayedInitF = | ||
| Function::Create(DelayedInitFTy, GlobalValue::InternalLinkage, | ||
| getInstrProfDelayedInitFuncName(), M); | ||
|
|
||
| IRBuilder<> IRB(BasicBlock::Create(M.getContext(), "", DelayedInitF)); | ||
|
|
||
| for (const auto *ValueSite : ValueSites) { | ||
| GlobalVariable *NamePtr = ValueSite->getName(); | ||
| auto &PD = ProfileDataMap[NamePtr]; | ||
| auto *RelativeCounter = globalVarDiff(M, PD.RegionCounters, PD.DataVar); | ||
| auto *RelativeCounterPtr = | ||
| IRB.CreateGEP(IntPtrTy, PD.DataVar, {ConstantInt::get(Int32Ty, 2)}); | ||
| IRB.CreateStore(RelativeCounter, RelativeCounterPtr); | ||
| if (PD.RegionBitmaps != nullptr) { | ||
| auto *RelativeBitmap = globalVarDiff(M, PD.RegionBitmaps, PD.DataVar); | ||
| auto *RelativeBitmapPtr = | ||
| IRB.CreateGEP(IntPtrTy, PD.DataVar, {ConstantInt::get(Int32Ty, 3)}); | ||
| IRB.CreateStore(RelativeBitmap, RelativeBitmapPtr); | ||
| } | ||
| } | ||
|
|
||
| IRB.CreateRetVoid(); | ||
|
|
||
| for (auto *Kernel : Kernels) { | ||
| auto &KernelEntry = Kernel->getEntryBlock(); | ||
| IRB.SetInsertPoint(KernelEntry.getFirstNonPHIIt()); | ||
| IRB.CreateCall(DelayedInitF); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One of my main worries: this will initialize the prof globals in each kernel execution, right? Potentially in parallel across multiple kernels, and also all threads in each kernel?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Some level of overhead is expected for PGO, but we definitely want to minimize it if possible. Do you think it would be better to try calling
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm more worried about the fact that multiple kernels, and thus multiple threads within each kernel, will be writing/reading the same memory positions, even though they write the same values. Probably the cleanest way would be to mimic the effect of a constructor (i.e.,
Seems reasonable. Are these globals different in each translation unit? If so, you will have several functions to execute. |
||
| } | ||
|
|
||
| return true; | ||
| } | ||
|
|
||
| void InstrLowerer::emitVNodes() { | ||
| if (!ValueProfileStaticAlloc) | ||
| return; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you remind me why we need this for NVPTX and not AMDGPU?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These values are used by
emitDataDelayedInitto generate the bitmap and counter pointers. We only need them on NVPTX because circular initial values work fine on AMD: #94268 (comment).There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, seems like a surprising amount of code though. I would've just expected it to split up the initializer somehow. I.e.
Could be
or something?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jhuber6 Could you provide more detail on the solution you are proposing?
I'm not familiar with this code, but it seems that the case is similar to the one below, where the address of @__profd is used in the initializer. I don't see how to implement it cleanly in the initializers.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Self-referential initializers are busted in PTX because whoever programmed it decided that it didn't work. The only way to work around it is to change how it's initialized to no longer be self-referential.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Correct. But I was referring to your comment from June, where you mentioned a way to split the initializers and avoid the initialization function implemented by Ethan. I don't see how that way could be implemented in the case I mention above, where we need the addresses.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, in your example you require the address of the global itself, see if you can hack around it with aliases as those technically work on NVPTX. If not, this is unsolvable.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh yeah, that doesn't work as well since NVPTX aliases don't work on globals. Guess you'll need to bring this up w/ NVIDIA or do dynamic fixup in an initializer.