Skip to content

Commit ff6fb08

Browse files
[Offload][PGO] Fix PGO on NVPTX targets
1 parent bdcbe67 commit ff6fb08

File tree

6 files changed

+83
-44
lines changed

6 files changed

+83
-44
lines changed

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6247,9 +6247,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
62476247
Args.AddLastArg(CmdArgs, options::OPT_fconvergent_functions,
62486248
options::OPT_fno_convergent_functions);
62496249

6250-
// NVPTX doesn't support PGO or coverage
6251-
if (!Triple.isNVPTX())
6252-
addPGOAndCoverageFlags(TC, C, JA, Output, Args, SanitizeArgs, CmdArgs);
6250+
addPGOAndCoverageFlags(TC, C, JA, Output, Args, SanitizeArgs, CmdArgs);
62536251

62546252
Args.AddLastArg(CmdArgs, options::OPT_fclang_abi_compat_EQ);
62556253

clang/test/Driver/cuda-no-pgo-or-coverage.cu

Lines changed: 0 additions & 33 deletions
This file was deleted.

llvm/include/llvm/ProfileData/InstrProf.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,12 @@ inline StringRef getInstrProfRegFuncsName() {
171171
return "__llvm_profile_register_functions";
172172
}
173173

174+
/// Return the name of function that initializes self-referential datavar values
175+
/// on NVPTX targets
176+
inline StringRef getInstrProfDelayedInitFuncName() {
177+
return "__llvm_profile_delayed_data_var_init";
178+
}
179+
174180
/// Return the name of the runtime interface that registers per-function control
175181
/// data for one instrumented function.
176182
inline StringRef getInstrProfRegFuncName() {

llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp

Lines changed: 74 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,13 @@ class InstrLowerer final {
391391
/// Create INSTR_PROF_DATA variable for counters and bitmaps.
392392
void createDataVariable(InstrProfCntrInstBase *Inc);
393393

394+
/// Creates delayed initialization function for data relative offsets
395+
/// This is only relevant on NVPTX targets where circular constant structures
396+
/// are not allowed
397+
bool
398+
emitDataDelayedInit(SmallVector<Function *> &Kernels,
399+
SmallVector<const InstrProfCntrInstBase *> &ValueSites);
400+
394401
/// Get the counters for virtual table values, creating them if necessary.
395402
void getOrCreateVTableProfData(GlobalVariable *GV);
396403

@@ -947,11 +954,18 @@ bool InstrLowerer::lower() {
947954
if (!ContainsProfiling && !CoverageNamesVar)
948955
return MadeChange;
949956

957+
// Cached info for generating delayed offset calculations
958+
// This is only relevant on NVPTX targets
959+
SmallVector<Function *> Kernels;
960+
SmallVector<const InstrProfCntrInstBase *> ValueSites;
961+
950962
// We did not know how many value sites there would be inside
951963
// the instrumented function. This is counting the number of instrumented
952964
// target value sites to enter it as field in the profile data variable.
953965
for (Function &F : M) {
954966
InstrProfCntrInstBase *FirstProfInst = nullptr;
967+
if (F.getCallingConv() == CallingConv::PTX_Kernel)
968+
Kernels.push_back(&F);
955969
for (BasicBlock &BB : F) {
956970
for (auto I = BB.begin(), E = BB.end(); I != E; I++) {
957971
if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(I))
@@ -971,9 +985,12 @@ bool InstrLowerer::lower() {
971985
// Also create the data variable based on the MCDCParams.
972986
if (FirstProfInst != nullptr) {
973987
static_cast<void>(getOrCreateRegionCounters(FirstProfInst));
988+
ValueSites.push_back(FirstProfInst);
974989
}
975990
}
976991

992+
MadeChange |= emitDataDelayedInit(Kernels, ValueSites);
993+
977994
if (EnableVTableValueProfiling)
978995
for (GlobalVariable &GV : M.globals())
979996
// Global variables with type metadata are virtual table variables.
@@ -1734,6 +1751,13 @@ InstrLowerer::getOrCreateRegionCounters(InstrProfCntrInstBase *Inc) {
17341751
return PD.RegionCounters;
17351752
}
17361753

1754+
// Calculates difference between two global variable addresses as an integer
1755+
Constant *globalVarDiff(Module &M, GlobalVariable *A, GlobalVariable *B) {
1756+
auto *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext());
1757+
return ConstantExpr::getSub(ConstantExpr::getPtrToInt(A, IntPtrTy),
1758+
ConstantExpr::getPtrToInt(B, IntPtrTy));
1759+
}
1760+
17371761
void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
17381762
// When debug information is correlated to profile data, a data variable
17391763
// is not needed.
@@ -1854,13 +1878,12 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
18541878
// Reference the counter variable with a label difference (link-time
18551879
// constant).
18561880
DataSectionKind = IPSK_data;
1857-
RelativeCounterPtr =
1858-
ConstantExpr::getSub(ConstantExpr::getPtrToInt(CounterPtr, IntPtrTy),
1859-
ConstantExpr::getPtrToInt(Data, IntPtrTy));
1881+
const Triple T(M.getTargetTriple());
1882+
RelativeCounterPtr = T.isNVPTX() ? ConstantInt::get(IntPtrTy, 0)
1883+
: globalVarDiff(M, CounterPtr, Data);
18601884
if (BitmapPtr != nullptr)
1861-
RelativeBitmapPtr =
1862-
ConstantExpr::getSub(ConstantExpr::getPtrToInt(BitmapPtr, IntPtrTy),
1863-
ConstantExpr::getPtrToInt(Data, IntPtrTy));
1885+
RelativeBitmapPtr = T.isNVPTX() ? ConstantInt::get(IntPtrTy, 0)
1886+
: globalVarDiff(M, BitmapPtr, Data);
18641887
}
18651888

18661889
Constant *DataVals[] = {
@@ -1887,6 +1910,51 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
18871910
ReferencedNames.push_back(NamePtr);
18881911
}
18891912

1913+
/// Emit a function that stores the relative counter/bitmap offsets into each
/// profile data variable at run time, and insert a call to it at the start of
/// every kernel that has a body.
///
/// On NVPTX, createDataVariable() emits zero for these fields instead of a
/// label difference (circular constant structures are not allowed there), so
/// the real values are written by the function generated here.
///
/// \param Kernels functions that were collected as kernel entry points.
/// \param ValueSites one profiling instruction per instrumented function; its
///        name variable is the key used to look up that function's profile
///        data.
/// \returns true if the module was modified, false if nothing was emitted.
bool InstrLowerer::emitDataDelayedInit(
    SmallVector<Function *> &Kernels,
    SmallVector<const InstrProfCntrInstBase *> &ValueSites) {
  const Triple T(M.getTargetTriple());
  if (!T.isNVPTX() || ProfileCorrelate == InstrProfCorrelator::BINARY ||
      Kernels.empty() || ValueSites.empty())
    return false;

  // A kernel that is only declared in this module has no entry block to insert
  // the call into; calling getEntryBlock() on it would be invalid.
  SmallVector<Function *> DefinedKernels;
  for (Function *Kernel : Kernels)
    if (!Kernel->isDeclaration())
      DefinedKernels.push_back(Kernel);
  if (DefinedKernels.empty())
    return false;

  auto &Ctx = M.getContext();
  auto *Int32Ty = Type::getInt32Ty(Ctx);
  auto *IntPtrTy = M.getDataLayout().getIntPtrType(Ctx);
  auto *DelayedInitFTy = FunctionType::get(Type::getVoidTy(Ctx), false);
  auto *DelayedInitF =
      Function::Create(DelayedInitFTy, GlobalValue::InternalLinkage,
                       getInstrProfDelayedInitFuncName(), M);

  IRBuilder<> IRB(BasicBlock::Create(Ctx, "", DelayedInitF));

  // Slots of the data variable, counted in IntPtrTy-sized units, that receive
  // the relative counter and relative bitmap offsets.
  // NOTE(review): indexing by IntPtrTy assumes every field preceding these
  // slots is pointer-sized -- confirm against the data variable's layout.
  constexpr unsigned RelativeCounterIdx = 2;
  constexpr unsigned RelativeBitmapIdx = 3;

  for (const auto *ValueSite : ValueSites) {
    // Use find() so a missing entry is skipped rather than default-inserted.
    auto It = ProfileDataMap.find(ValueSite->getName());
    if (It == ProfileDataMap.end())
      continue;
    auto &PD = It->second;

    // createDataVariable() does not create a data variable when debug
    // information is correlated to profile data; nothing to initialize then.
    if (!PD.DataVar || !PD.RegionCounters)
      continue;

    auto *RelativeCounter = globalVarDiff(M, PD.RegionCounters, PD.DataVar);
    auto *RelativeCounterPtr = IRB.CreateGEP(
        IntPtrTy, PD.DataVar, {ConstantInt::get(Int32Ty, RelativeCounterIdx)});
    IRB.CreateStore(RelativeCounter, RelativeCounterPtr);

    if (PD.RegionBitmaps != nullptr) {
      auto *RelativeBitmap = globalVarDiff(M, PD.RegionBitmaps, PD.DataVar);
      auto *RelativeBitmapPtr = IRB.CreateGEP(
          IntPtrTy, PD.DataVar, {ConstantInt::get(Int32Ty, RelativeBitmapIdx)});
      IRB.CreateStore(RelativeBitmap, RelativeBitmapPtr);
    }
  }

  IRB.CreateRetVoid();

  // Run the initializer at the very start of every defined kernel.
  for (Function *Kernel : DefinedKernels) {
    auto &KernelEntry = Kernel->getEntryBlock();
    IRB.SetInsertPoint(KernelEntry.getFirstNonPHI());
    IRB.CreateCall(DelayedInitF);
  }

  return true;
}
1957+
18901958
void InstrLowerer::emitVNodes() {
18911959
if (!ValueProfileStaticAlloc)
18921960
return;

offload/test/offloading/gpupgo/pgo_device_and_host.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@
4848
// RUN: %target_triple.%basename_t.hfdi.profraw \
4949
// RUN: | %fcheck-generic --check-prefix="LLVM-DEVICE"
5050

51-
// REQUIRES: amdgpu
51+
// REQUIRES: gpu
5252
// REQUIRES: pgo
5353

5454
int main() {

offload/test/offloading/gpupgo/pgo_device_only.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
// RUN: %target_triple.%basename_t.clang.profraw | \
1515
// RUN: %fcheck-generic --check-prefix="CLANG-PGO"
1616

17-
// REQUIRES: amdgpu
17+
// REQUIRES: gpu
1818
// REQUIRES: pgo
1919

2020
int test1(int a) { return a / 2; }

0 commit comments

Comments
 (0)