Skip to content

Commit 9c86077

Browse files
authored
[OpenMP][Offload][AMDGPU] Enable OMP runtime to trace kernel execution time (llvm#1465)
2 parents b81a29f + 85943e3 commit 9c86077

File tree

3 files changed

+117
-20
lines changed

3 files changed

+117
-20
lines changed

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 104 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -856,6 +856,10 @@ struct AMDGPUKernelTy : public GenericKernelTy {
856856
return BlockSize <= ConstWGSize;
857857
}
858858

859+
/// Return the launch id currently stored in KernelLaunchId. That member is
/// declared static thread_local, so the value read is the calling thread's.
uint32_t getKernelLaunchId() const {
  return KernelLaunchId;
}
860+
861+
/// Store \p Id as the calling thread's launch id. The method can be
/// const-qualified and still assign because KernelLaunchId is a static
/// thread_local member, not per-object state.
void setKernelLaunchId(uint32_t Id) const {
  KernelLaunchId = Id;
}
862+
859863
/// Envar to enable occupancy-based optimization for SPMD kernel.
860864
BoolEnvar OMPX_SPMDOccupancyBasedOpt;
861865

@@ -887,6 +891,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
887891
/// CodeGen generate WGSize
888892
uint16_t ConstWGSize;
889893

894+
static thread_local uint32_t KernelLaunchId;
895+
890896
/// Lower number of threads if tripcount is low. This should produce
891897
/// a larger number of teams if allowed by other constraints.
892898
std::pair<bool, uint32_t> adjustNumThreadsForLowTripCount(
@@ -1333,6 +1339,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
13331339
uint64_t numTeams) const override;
13341340
};
13351341

1342+
thread_local uint32_t AMDGPUKernelTy::KernelLaunchId = 0;
1343+
13361344
/// Class representing an HSA signal. Signals are used to define dependencies
13371345
/// between asynchronous operations: kernel launches and memory transfers.
13381346
struct AMDGPUSignalTy {
@@ -1691,6 +1699,20 @@ struct AMDGPUStreamTy {
16911699
NumThreads(0), KernelRunRecords(nullptr) {}
16921700
};
16931701

1702+
struct KernelDurationTracingArgsTy {
1703+
hsa_agent_t Agent;
1704+
AMDGPUSignalTy *Signal;
1705+
double TicksToTime;
1706+
int32_t DeviceId;
1707+
uint32_t LaunchId;
1708+
uint32_t NumTeams;
1709+
uint32_t NumThreads;
1710+
1711+
KernelDurationTracingArgsTy()
1712+
: Agent{0}, Signal(nullptr), TicksToTime(setTicksToTime()), DeviceId(0),
1713+
LaunchId(0), NumTeams(0), NumThreads(0) {}
1714+
};
1715+
16941716
using AMDGPUStreamCallbackTy = Error(void *Data);
16951717

16961718
/// The stream is composed of N stream's slots. The struct below represents
@@ -1892,6 +1914,9 @@ struct AMDGPUStreamTy {
18921914
/// Arguments for the callback function.
18931915
PostKernelRunProcessingArgsTy PostKernelRunProcessingArgs;
18941916

1917+
/// Arguments for callback function to collect kernel duration.
1918+
KernelDurationTracingArgsTy KernelDurationTracingArgs;
1919+
18951920
/// Return the current number of asynchronous operations on the stream.
18961921
uint32_t size() const { return NextSlot; }
18971922

@@ -2052,9 +2077,9 @@ struct AMDGPUStreamTy {
20522077
return Plugin::success();
20532078
}
20542079

2055-
static uint64_t getKernelDuration(PostKernelRunProcessingArgsTy *Args) {
2080+
template <typename Ty> static uint64_t getKernelDuration(Ty *Args) {
20562081
assert(Args->Signal &&
2057-
"Invalid AMDGPUSignal Pointer in post kernel run processing");
2082+
"Invalid AMDGPUSignal Pointer for obtaining kernel duration");
20582083
hsa_amd_profiling_dispatch_time_t TimeRec;
20592084
hsa_amd_profiling_get_dispatch_time(Args->Agent, Args->Signal->get(),
20602085
&TimeRec);
@@ -2074,7 +2099,8 @@ struct AMDGPUStreamTy {
20742099
KernelRunRecordTy *KernelRecord = Args->KernelRunRecords;
20752100
assert(KernelRecord && "KernelRunRecord is null!");
20762101

2077-
uint64_t KernelDuration = getKernelDuration(Args);
2102+
uint64_t KernelDuration =
2103+
getKernelDuration<PostKernelRunProcessingArgsTy>(Args);
20782104
KernelRecord->addEntry(Args->KernelName, Args->NumTeams, Args->NumThreads,
20792105
KernelDuration);
20802106

@@ -2088,6 +2114,24 @@ struct AMDGPUStreamTy {
20882114
return Plugin::success();
20892115
}
20902116

2117+
/// Callback function to generate traces for kernel runtime.
2118+
static Error KernelDurationTracingAction(void *Data) {
2119+
assert(Data && "Invalid data pointer for tracing kernel duration");
2120+
KernelDurationTracingArgsTy *Args =
2121+
reinterpret_cast<KernelDurationTracingArgsTy *>(Data);
2122+
2123+
uint64_t KernelDuration =
2124+
getKernelDuration<KernelDurationTracingArgsTy>(Args);
2125+
2126+
fprintf(
2127+
stderr,
2128+
"DeviceID: %2d LaunchID: %2d TeamsXthrds:(%4uX%4d) Duration(ns): %lu\n",
2129+
Args->DeviceId, Args->LaunchId, Args->NumTeams, Args->NumThreads,
2130+
KernelDuration);
2131+
2132+
return Plugin::success();
2133+
}
2134+
20912135
#ifdef OMPT_SUPPORT
20922136
static Error timeKernelInNsAsync(void *Data) {
20932137
assert(Data && "Invalid data pointer in OMPT profiling");
@@ -2187,6 +2231,21 @@ struct AMDGPUStreamTy {
21872231
}
21882232
}
21892233

2234+
// When LIBOMPTARGET_KERNEL_EXE_TIME is set, register the callback function
2235+
// to get the kernel duration.
2236+
if (Device.enableKernelDurationTracing()) {
2237+
KernelDurationTracingArgs.Agent = Agent;
2238+
KernelDurationTracingArgs.Signal = OutputSignal;
2239+
KernelDurationTracingArgs.DeviceId = Device.getDeviceId();
2240+
KernelDurationTracingArgs.LaunchId = Kernel.getKernelLaunchId();
2241+
KernelDurationTracingArgs.NumTeams = NumBlocks[0];
2242+
KernelDurationTracingArgs.NumThreads = NumThreads[0];
2243+
2244+
if (auto Err = Slots[Curr].schedCallback(KernelDurationTracingAction,
2245+
&KernelDurationTracingArgs))
2246+
return Err;
2247+
}
2248+
21902249
// Push the kernel with the output signal and an input signal (optional)
21912250
DP("Using Queue: %p with HSA Queue: %p\n", Queue, Queue->getHsaQueue());
21922251
// If we are running an RPC server we want to wake up the server thread
@@ -2626,9 +2685,10 @@ struct AMDGPUStreamManagerTy final
26262685
OMPX_EnableQueueProfiling("LIBOMPTARGET_AMDGPU_ENABLE_QUEUE_PROFILING",
26272686
false),
26282687
NextQueue(0), Agent(HSAAgent) {
2629-
// If OMPX_ENABLE_RUNTIME_AUTOTUNING is enabled,
2630-
// set queue profiling to true.
2631-
if (Device.enableRuntimeAutotuning()) {
2688+
// If OMPX_ENABLE_RUNTIME_AUTOTUNING or LIBOMPTARGET_KERNEL_EXE_TIME is
2689+
// enabled, set queue profiling to true.
2690+
if (Device.enableRuntimeAutotuning() ||
2691+
Device.enableKernelDurationTracing()) {
26322692
OMPX_EnableQueueProfiling = true;
26332693
}
26342694
}
@@ -5162,19 +5222,42 @@ void AMDGPUKernelTy::printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice,
51625222
auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount;
51635223
// auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize;
51645224

5165-
// This line should print exactly as the one in the old plugin.
5166-
fprintf(
5167-
stderr,
5168-
"DEVID: %2d SGN:%d ConstWGSize:%-4d args:%2d teamsXthrds:(%4uX%4d) "
5169-
"reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u agpr_count:%u "
5170-
"sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d "
5171-
"md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
5172-
"%d%% n:%s\n",
5173-
GenericDevice.getDeviceId(), getExecutionModeFlags(), ConstWGSize,
5174-
KernelArgs.NumArgs, NumBlocks[0], NumThreads[0], 0, 0, GroupSegmentSize,
5175-
SGPRCount, VGPRCount, AGPRCount, SGPRSpillCount, VGPRSpillCount,
5176-
KernelArgs.Tripcount, HasRPC, isMultiDeviceKernel(), MultiDeviceLB,
5177-
MultiDeviceUB, MaxOccupancy, AchievedOccupancy, getName());
5225+
if (GenericDevice.enableKernelDurationTracing()) {
5226+
uint32_t LaunchId = GenericDevice.getAndIncrementLaunchId();
5227+
setKernelLaunchId(LaunchId);
5228+
5229+
// Print Launch Id after Device Id.
5230+
fprintf(stderr,
5231+
"DEVID: %2d LaunchId: %u SGN:%d ConstWGSize:%-4d args:%2d "
5232+
"teamsXthrds:(%4uX%4d) "
5233+
"reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
5234+
"agpr_count:%u "
5235+
"sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d "
5236+
"md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
5237+
"%d%% n:%s\n",
5238+
GenericDevice.getDeviceId(), LaunchId, getExecutionModeFlags(),
5239+
ConstWGSize, KernelArgs.NumArgs, NumBlocks[0], NumThreads[0], 0, 0,
5240+
GroupSegmentSize, SGPRCount, VGPRCount, AGPRCount, SGPRSpillCount,
5241+
VGPRSpillCount, KernelArgs.Tripcount, HasRPC, isMultiDeviceKernel(),
5242+
MultiDeviceLB, MultiDeviceUB, MaxOccupancy, AchievedOccupancy,
5243+
getName());
5244+
} else {
5245+
5246+
// This line should print exactly as the one in the old plugin.
5247+
fprintf(stderr,
5248+
"DEVID: %2d SGN:%d ConstWGSize:%-4d args:%2d teamsXthrds:(%4uX%4d) "
5249+
"reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
5250+
"agpr_count:%u "
5251+
"sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d "
5252+
"md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
5253+
"%d%% n:%s\n",
5254+
GenericDevice.getDeviceId(), getExecutionModeFlags(), ConstWGSize,
5255+
KernelArgs.NumArgs, NumBlocks[0], NumThreads[0], 0, 0,
5256+
GroupSegmentSize, SGPRCount, VGPRCount, AGPRCount, SGPRSpillCount,
5257+
VGPRSpillCount, KernelArgs.Tripcount, HasRPC, isMultiDeviceKernel(),
5258+
MultiDeviceLB, MultiDeviceUB, MaxOccupancy, AchievedOccupancy,
5259+
getName());
5260+
}
51785261
}
51795262

51805263
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
@@ -5186,7 +5269,8 @@ Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
51865269
// When LIBOMPTARGET_KERNEL_TRACE is set, print the single-line kernel trace
51875270
// info present in the old ASO plugin, and continue with the upstream 2-line
51885271
// info, should LIBOMPTARGET_INFO be a meaningful value, otherwise return.
5189-
if (getInfoLevel() & OMP_INFOTYPE_AMD_KERNEL_TRACE)
5272+
if ((getInfoLevel() & OMP_INFOTYPE_AMD_KERNEL_TRACE) ||
5273+
GenericDevice.enableKernelDurationTracing())
51905274
printAMDOneLineKernelTrace(GenericDevice, KernelArgs, NumThreads, NumBlocks,
51915275
MultiDeviceLB, MultiDeviceUB);
51925276

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1192,6 +1192,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
11921192
/// Destroy Argbufs and clear the cache. Used as part of device destructor
11931193
void clear_ArgBufs();
11941194

1195+
bool enableKernelDurationTracing() const {
1196+
return OMPX_KernelDurationTracing;
1197+
}
1198+
1199+
/// Hand out the next launch id: returns the current value of the atomic
/// LaunchId counter and advances the counter by one.
uint32_t getAndIncrementLaunchId() {
  return LaunchId++;
}
1200+
11951201
private:
11961202
/// Get and set the stack size and heap size for the device. If not used, the
11971203
/// plugin can implement the setters as no-op and setting the output
@@ -1239,6 +1245,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
12391245
BoolEnvar OMPX_ReuseBlocksForHighTripCount =
12401246
BoolEnvar("LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT", true);
12411247

1248+
/// Atomic counter used to assign an id to each kernel launch on this device.
1249+
std::atomic<uint32_t> LaunchId = 0;
1250+
12421251
protected:
12431252
/// Environment variables defined by the LLVM OpenMP implementation
12441253
/// regarding the initial number of streams and events.
@@ -1286,6 +1295,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
12861295
/// Structs for functions and data used in runtime autotuning.
12871296
KernelRunRecordTy *KernelRunRecords;
12881297

1298+
/// Variable to enable kernel duration tracing.
1299+
BoolEnvar OMPX_KernelDurationTracing;
1300+
12891301
private:
12901302
#ifdef OMPT_SUPPORT
12911303
/// OMPT callback functions

offload/plugins-nextgen/common/src/PluginInterface.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -947,6 +947,7 @@ GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
947947
OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 1),
948948
OMPX_NumMultiDevices("LIBOMPTARGET_NUM_MULTI_DEVICES", 0),
949949
OMPX_EnableRuntimeAutotuning("OMPX_ENABLE_RUNTIME_AUTOTUNING", false),
950+
OMPX_KernelDurationTracing("LIBOMPTARGET_KERNEL_EXE_TIME", false),
950951
DeviceId(DeviceId), GridValues(OMPGridValues),
951952
PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(),
952953
PinnedAllocs(*this), RPCServer(nullptr), KernelRunRecords(nullptr) {

0 commit comments

Comments
 (0)