Skip to content

Commit dcc6421

Browse files
committed
[OpenMP][Offload] New OMP runtime envar to trace kernel duration and generated launch id
1 parent 02c015f commit dcc6421

File tree

3 files changed

+116
-19
lines changed

3 files changed

+116
-19
lines changed

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 103 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -856,6 +856,10 @@ struct AMDGPUKernelTy : public GenericKernelTy {
856856
return BlockSize <= ConstWGSize;
857857
}
858858

859+
/// Return the launch id last stored via setKernelLaunchId() on the calling
/// thread (the backing KernelLaunchId member is static and thread_local).
uint32_t getKernelLaunchId() const { return KernelLaunchId; }
860+
861+
/// Store \p Id as the calling thread's current launch id. The method can be
/// const because KernelLaunchId is a static thread_local member rather than
/// per-object state.
void setKernelLaunchId(uint32_t Id) const { KernelLaunchId = Id; }
862+
859863
/// Envar to enable occupancy-based optimization for SPMD kernel.
860864
BoolEnvar OMPX_SPMDOccupancyBasedOpt;
861865

@@ -887,6 +891,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
887891
/// CodeGen generate WGSize
888892
uint16_t ConstWGSize;
889893

894+
/// Launch id written by setKernelLaunchId() and read back through
/// getKernelLaunchId(). Being static and thread_local, there is one value per
/// thread, shared by every kernel object.
static thread_local uint32_t KernelLaunchId;
895+
890896
/// Lower number of threads if tripcount is low. This should produce
891897
/// a larger number of teams if allowed by other constraints.
892898
std::pair<bool, uint32_t> adjustNumThreadsForLowTripCount(
@@ -1338,6 +1344,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
13381344
uint64_t numTeams) const override;
13391345
};
13401346

1347+
thread_local uint32_t AMDGPUKernelTy::KernelLaunchId = 0;
1348+
13411349
/// Class representing an HSA signal. Signals are used to define dependencies
13421350
/// between asynchronous operations: kernel launches and memory transfers.
13431351
struct AMDGPUSignalTy {
@@ -1696,6 +1704,20 @@ struct AMDGPUStreamTy {
16961704
NumThreads(0), KernelRunRecords(nullptr) {}
16971705
};
16981706

1707+
struct KernelDurationTracingArgsTy {
1708+
hsa_agent_t Agent;
1709+
AMDGPUSignalTy *Signal;
1710+
double TicksToTime;
1711+
int32_t DeviceId;
1712+
uint32_t LaunchId;
1713+
uint32_t NumTeams;
1714+
uint32_t NumThreads;
1715+
1716+
KernelDurationTracingArgsTy()
1717+
: Agent{0}, Signal(nullptr), TicksToTime(setTicksToTime()), DeviceId(0),
1718+
LaunchId(0), NumTeams(0), NumThreads(0) {}
1719+
};
1720+
16991721
using AMDGPUStreamCallbackTy = Error(void *Data);
17001722

17011723
/// The stream is composed of N stream's slots. The struct below represents
@@ -1896,6 +1918,9 @@ struct AMDGPUStreamTy {
18961918
/// Arguments for the callback function.
18971919
PostKernelRunProcessingArgsTy PostKernelRunProcessingArgs;
18981920

1921+
/// Arguments for callback function to collect kernel duration.
1922+
KernelDurationTracingArgsTy KernelDurationTracingArgs;
1923+
18991924
/// Return the current number of asynchronous operations on the stream.
19001925
uint32_t size() const { return NextSlot; }
19011926

@@ -2056,9 +2081,9 @@ struct AMDGPUStreamTy {
20562081
return Plugin::success();
20572082
}
20582083

2059-
static uint64_t getKernelDuration(PostKernelRunProcessingArgsTy *Args) {
2084+
template <typename Ty> static uint64_t getKernelDuration(Ty *Args) {
20602085
assert(Args->Signal &&
2061-
"Invalid AMDGPUSignal Pointer in post kernel run processing");
2086+
"Invalid AMDGPUSignal Pointer for obtaining kernel duration");
20622087
hsa_amd_profiling_dispatch_time_t TimeRec;
20632088
hsa_amd_profiling_get_dispatch_time(Args->Agent, Args->Signal->get(),
20642089
&TimeRec);
@@ -2078,7 +2103,8 @@ struct AMDGPUStreamTy {
20782103
KernelRunRecordTy *KernelRecord = Args->KernelRunRecords;
20792104
assert(KernelRecord && "KernelRunRecord is null!");
20802105

2081-
uint64_t KernelDuration = getKernelDuration(Args);
2106+
uint64_t KernelDuration =
2107+
getKernelDuration<PostKernelRunProcessingArgsTy>(Args);
20822108
KernelRecord->addEntry(Args->KernelName, Args->NumTeams, Args->NumThreads,
20832109
KernelDuration);
20842110

@@ -2092,6 +2118,24 @@ struct AMDGPUStreamTy {
20922118
return Plugin::success();
20932119
}
20942120

2121+
/// Callback function to generate traces for kernel runtime.
2122+
static Error KernelDurationTracingAction(void *Data) {
2123+
assert(Data && "Invalid data pointer for tracing kernel duration");
2124+
KernelDurationTracingArgsTy *Args =
2125+
reinterpret_cast<KernelDurationTracingArgsTy *>(Data);
2126+
2127+
uint64_t KernelDuration =
2128+
getKernelDuration<KernelDurationTracingArgsTy>(Args);
2129+
2130+
fprintf(
2131+
stderr,
2132+
"DeviceID: %2d LaunchID: %2d TeamsXthrds:(%4uX%4d) Duration(ns): %lu\n",
2133+
Args->DeviceId, Args->LaunchId, Args->NumTeams, Args->NumThreads,
2134+
KernelDuration);
2135+
2136+
return Plugin::success();
2137+
}
2138+
20952139
#ifdef OMPT_SUPPORT
20962140
static Error timeKernelInNsAsync(void *Data) {
20972141
assert(Data && "Invalid data pointer in OMPT profiling");
@@ -2191,6 +2235,21 @@ struct AMDGPUStreamTy {
21912235
}
21922236
}
21932237

2238+
// When LIBOMPTARGET_EXE_TIME is set, register the callback function to get
2239+
// the kernel duration.
2240+
if (Device.enableKernelDurationTracing()) {
2241+
KernelDurationTracingArgs.Agent = Agent;
2242+
KernelDurationTracingArgs.Signal = OutputSignal;
2243+
KernelDurationTracingArgs.DeviceId = Device.getDeviceId();
2244+
KernelDurationTracingArgs.LaunchId = Kernel.getKernelLaunchId();
2245+
KernelDurationTracingArgs.NumTeams = NumBlocks[0];
2246+
KernelDurationTracingArgs.NumThreads = NumThreads[0];
2247+
2248+
if (auto Err = Slots[Curr].schedCallback(KernelDurationTracingAction,
2249+
&KernelDurationTracingArgs))
2250+
return Err;
2251+
}
2252+
21942253
// Push the kernel with the output signal and an input signal (optional)
21952254
DP("Using Queue: %p with HSA Queue: %p\n", Queue, Queue->getHsaQueue());
21962255
// If we are running an RPC server we want to wake up the server thread
@@ -2630,9 +2689,10 @@ struct AMDGPUStreamManagerTy final
26302689
OMPX_EnableQueueProfiling("LIBOMPTARGET_AMDGPU_ENABLE_QUEUE_PROFILING",
26312690
false),
26322691
NextQueue(0), Agent(HSAAgent) {
2633-
// If OMPX_ENABLE_RUNTIME_AUTOTUNING is enabled,
2692+
// If OMPX_ENABLE_RUNTIME_AUTOTUNING or LIBOMPTARGET_EXE_TIME is enabled,
26342693
// set queue profiling to true.
2635-
if (Device.enableRuntimeAutotuning()) {
2694+
if (Device.enableRuntimeAutotuning() ||
2695+
Device.enableKernelDurationTracing()) {
26362696
OMPX_EnableQueueProfiling = true;
26372697
}
26382698
}
@@ -5164,19 +5224,42 @@ void AMDGPUKernelTy::printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice,
51645224
auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount;
51655225
// auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize;
51665226

5167-
// This line should print exactly as the one in the old plugin.
5168-
fprintf(
5169-
stderr,
5170-
"DEVID: %2d SGN:%d ConstWGSize:%-4d args:%2d teamsXthrds:(%4uX%4d) "
5171-
"reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u agpr_count:%u "
5172-
"sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d "
5173-
"md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
5174-
"%d%% n:%s\n",
5175-
GenericDevice.getDeviceId(), getExecutionModeFlags(), ConstWGSize,
5176-
KernelArgs.NumArgs, NumBlocks[0], NumThreads[0], 0, 0, GroupSegmentSize,
5177-
SGPRCount, VGPRCount, AGPRCount, SGPRSpillCount, VGPRSpillCount,
5178-
KernelArgs.Tripcount, HasRPC, isMultiDeviceKernel(), MultiDeviceLB,
5179-
MultiDeviceUB, MaxOccupancy, AchievedOccupancy, getName());
5227+
if (GenericDevice.enableKernelDurationTracing()) {
5228+
uint32_t LaunchId = GenericDevice.getAndIncrementLaunchId();
5229+
setKernelLaunchId(LaunchId);
5230+
5231+
// Print Launch Id after Device Id.
5232+
fprintf(stderr,
5233+
"DEVID: %2d LaunchId: %u SGN:%d ConstWGSize:%-4d args:%2d "
5234+
"teamsXthrds:(%4uX%4d) "
5235+
"reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
5236+
"agpr_count:%u "
5237+
"sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d "
5238+
"md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
5239+
"%d%% n:%s\n",
5240+
GenericDevice.getDeviceId(), LaunchId, getExecutionModeFlags(),
5241+
ConstWGSize, KernelArgs.NumArgs, NumBlocks[0], NumThreads[0], 0, 0,
5242+
GroupSegmentSize, SGPRCount, VGPRCount, AGPRCount, SGPRSpillCount,
5243+
VGPRSpillCount, KernelArgs.Tripcount, HasRPC, isMultiDeviceKernel(),
5244+
MultiDeviceLB, MultiDeviceUB, MaxOccupancy, AchievedOccupancy,
5245+
getName());
5246+
} else {
5247+
5248+
// This line should print exactly as the one in the old plugin.
5249+
fprintf(stderr,
5250+
"DEVID: %2d SGN:%d ConstWGSize:%-4d args:%2d teamsXthrds:(%4uX%4d) "
5251+
"reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
5252+
"agpr_count:%u "
5253+
"sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d "
5254+
"md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
5255+
"%d%% n:%s\n",
5256+
GenericDevice.getDeviceId(), getExecutionModeFlags(), ConstWGSize,
5257+
KernelArgs.NumArgs, NumBlocks[0], NumThreads[0], 0, 0,
5258+
GroupSegmentSize, SGPRCount, VGPRCount, AGPRCount, SGPRSpillCount,
5259+
VGPRSpillCount, KernelArgs.Tripcount, HasRPC, isMultiDeviceKernel(),
5260+
MultiDeviceLB, MultiDeviceUB, MaxOccupancy, AchievedOccupancy,
5261+
getName());
5262+
}
51805263
}
51815264

51825265
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
@@ -5188,7 +5271,8 @@ Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
51885271
// When LIBOMPTARGET_KERNEL_TRACE is set, print the single-line kernel trace
51895272
// info present in the old ASO plugin, and continue with the upstream 2-line
51905273
// info, should LIBOMPTARGET_INFO be a meaningful value, otherwise return.
5191-
if (getInfoLevel() & OMP_INFOTYPE_AMD_KERNEL_TRACE)
5274+
if ((getInfoLevel() & OMP_INFOTYPE_AMD_KERNEL_TRACE) ||
5275+
GenericDevice.enableKernelDurationTracing())
51925276
printAMDOneLineKernelTrace(GenericDevice, KernelArgs, NumThreads, NumBlocks,
51935277
MultiDeviceLB, MultiDeviceUB);
51945278

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1192,6 +1192,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
11921192
/// Destroy Argbufs and clear the cache. Used as part of device destructor
11931193
void clear_ArgBufs();
11941194

1195+
/// Whether kernel duration tracing is enabled, i.e. the current value of the
/// OMPX_KernelDurationTracing envar member.
bool enableKernelDurationTracing() const {
1196+
return OMPX_KernelDurationTracing;
1197+
}
1198+
1199+
/// Atomically add one to the device's LaunchId counter and return the value
/// it held before the increment.
uint32_t getAndIncrementLaunchId() { return LaunchId.fetch_add(1); }
1200+
11951201
private:
11961202
/// Get and set the stack size and heap size for the device. If not used, the
11971203
/// plugin can implement the setters as no-op and setting the output
@@ -1239,6 +1245,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
12391245
BoolEnvar OMPX_ReuseBlocksForHighTripCount =
12401246
BoolEnvar("LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT", true);
12411247

1248+
/// Per-device counter used to hand out kernel launch ids; it is incremented
/// atomically each time an id is requested.
1249+
std::atomic<uint32_t> LaunchId = 0;
1250+
12421251
protected:
12431252
/// Environment variables defined by the LLVM OpenMP implementation
12441253
/// regarding the initial number of streams and events.
@@ -1286,6 +1295,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
12861295
/// Structs for functions and data used in runtime autotuning.
12871296
KernelRunRecordTy *KernelRunRecords;
12881297

1298+
/// Envar (LIBOMPTARGET_EXE_TIME) that enables kernel duration tracing.
1299+
BoolEnvar OMPX_KernelDurationTracing;
1300+
12891301
private:
12901302
#ifdef OMPT_SUPPORT
12911303
/// OMPT callback functions

offload/plugins-nextgen/common/src/PluginInterface.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -947,6 +947,7 @@ GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
947947
OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 1),
948948
OMPX_NumMultiDevices("LIBOMPTARGET_NUM_MULTI_DEVICES", 0),
949949
OMPX_EnableRuntimeAutotuning("OMPX_ENABLE_RUNTIME_AUTOTUNING", false),
950+
OMPX_KernelDurationTracing("LIBOMPTARGET_EXE_TIME", false),
950951
DeviceId(DeviceId), GridValues(OMPGridValues),
951952
PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(),
952953
PinnedAllocs(*this), RPCServer(nullptr), KernelRunRecords(nullptr) {

0 commit comments

Comments
 (0)