@@ -856,6 +856,10 @@ struct AMDGPUKernelTy : public GenericKernelTy {
856
856
return BlockSize <= ConstWGSize;
857
857
}
858
858
859
+ uint32_t getKernelLaunchId () const { return KernelLaunchId; }
860
+
861
+ void setKernelLaunchId (uint32_t Id) const { KernelLaunchId = Id; }
862
+
859
863
// / Envar to enable occupancy-based optimization for SPMD kernel.
860
864
BoolEnvar OMPX_SPMDOccupancyBasedOpt;
861
865
@@ -887,6 +891,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
887
891
// / CodeGen generate WGSize
888
892
uint16_t ConstWGSize;
889
893
894
+ static thread_local uint32_t KernelLaunchId;
895
+
890
896
// / Lower number of threads if tripcount is low. This should produce
891
897
// / a larger number of teams if allowed by other constraints.
892
898
std::pair<bool , uint32_t > adjustNumThreadsForLowTripCount (
@@ -1338,6 +1344,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1338
1344
uint64_t numTeams) const override ;
1339
1345
};
1340
1346
1347
+ thread_local uint32_t AMDGPUKernelTy::KernelLaunchId = 0 ;
1348
+
1341
1349
// / Class representing an HSA signal. Signals are used to define dependencies
1342
1350
// / between asynchronous operations: kernel launches and memory transfers.
1343
1351
struct AMDGPUSignalTy {
@@ -1696,6 +1704,20 @@ struct AMDGPUStreamTy {
1696
1704
NumThreads (0 ), KernelRunRecords(nullptr ) {}
1697
1705
};
1698
1706
1707
+ struct KernelDurationTracingArgsTy {
1708
+ hsa_agent_t Agent;
1709
+ AMDGPUSignalTy *Signal;
1710
+ double TicksToTime;
1711
+ int32_t DeviceId;
1712
+ uint32_t LaunchId;
1713
+ uint32_t NumTeams;
1714
+ uint32_t NumThreads;
1715
+
1716
+ KernelDurationTracingArgsTy ()
1717
+ : Agent{0 }, Signal(nullptr ), TicksToTime(setTicksToTime()), DeviceId(0 ),
1718
+ LaunchId (0 ), NumTeams(0 ), NumThreads(0 ) {}
1719
+ };
1720
+
1699
1721
using AMDGPUStreamCallbackTy = Error(void *Data);
1700
1722
1701
1723
// / The stream is composed of N stream's slots. The struct below represents
@@ -1896,6 +1918,9 @@ struct AMDGPUStreamTy {
1896
1918
// / Arguments for the callback function.
1897
1919
PostKernelRunProcessingArgsTy PostKernelRunProcessingArgs;
1898
1920
1921
+ // / Arguments for callback function to collect kernel duration.
1922
+ KernelDurationTracingArgsTy KernelDurationTracingArgs;
1923
+
1899
1924
// / Return the current number of asynchronous operations on the stream.
1900
1925
uint32_t size () const { return NextSlot; }
1901
1926
@@ -2056,9 +2081,9 @@ struct AMDGPUStreamTy {
2056
2081
return Plugin::success ();
2057
2082
}
2058
2083
2059
- static uint64_t getKernelDuration (PostKernelRunProcessingArgsTy *Args) {
2084
+ template < typename Ty> static uint64_t getKernelDuration (Ty *Args) {
2060
2085
assert (Args->Signal &&
2061
- " Invalid AMDGPUSignal Pointer in post kernel run processing " );
2086
+ " Invalid AMDGPUSignal Pointer for obtaining kernel duration " );
2062
2087
hsa_amd_profiling_dispatch_time_t TimeRec;
2063
2088
hsa_amd_profiling_get_dispatch_time (Args->Agent , Args->Signal ->get (),
2064
2089
&TimeRec);
@@ -2078,7 +2103,8 @@ struct AMDGPUStreamTy {
2078
2103
KernelRunRecordTy *KernelRecord = Args->KernelRunRecords ;
2079
2104
assert (KernelRecord && " KernelRunRecord is null!" );
2080
2105
2081
- uint64_t KernelDuration = getKernelDuration (Args);
2106
+ uint64_t KernelDuration =
2107
+ getKernelDuration<PostKernelRunProcessingArgsTy>(Args);
2082
2108
KernelRecord->addEntry (Args->KernelName , Args->NumTeams , Args->NumThreads ,
2083
2109
KernelDuration);
2084
2110
@@ -2092,6 +2118,24 @@ struct AMDGPUStreamTy {
2092
2118
return Plugin::success ();
2093
2119
}
2094
2120
2121
+ // / Callback function to generate traces for kernel runtime.
2122
+ static Error KernelDurationTracingAction (void *Data) {
2123
+ assert (Data && " Invalid data pointer for tracing kernel duration" );
2124
+ KernelDurationTracingArgsTy *Args =
2125
+ reinterpret_cast <KernelDurationTracingArgsTy *>(Data);
2126
+
2127
+ uint64_t KernelDuration =
2128
+ getKernelDuration<KernelDurationTracingArgsTy>(Args);
2129
+
2130
+ fprintf (
2131
+ stderr,
2132
+ " DeviceID: %2d LaunchID: %2d TeamsXthrds:(%4uX%4d) Duration(ns): %lu\n " ,
2133
+ Args->DeviceId , Args->LaunchId , Args->NumTeams , Args->NumThreads ,
2134
+ KernelDuration);
2135
+
2136
+ return Plugin::success ();
2137
+ }
2138
+
2095
2139
#ifdef OMPT_SUPPORT
2096
2140
static Error timeKernelInNsAsync (void *Data) {
2097
2141
assert (Data && " Invalid data pointer in OMPT profiling" );
@@ -2191,6 +2235,21 @@ struct AMDGPUStreamTy {
2191
2235
}
2192
2236
}
2193
2237
2238
+ // When LIBOMPTARGET_EXE_TIME is set, register the callback function to get
2239
+ // the kernel duration.
2240
+ if (Device.enableKernelDurationTracing ()) {
2241
+ KernelDurationTracingArgs.Agent = Agent;
2242
+ KernelDurationTracingArgs.Signal = OutputSignal;
2243
+ KernelDurationTracingArgs.DeviceId = Device.getDeviceId ();
2244
+ KernelDurationTracingArgs.LaunchId = Kernel.getKernelLaunchId ();
2245
+ KernelDurationTracingArgs.NumTeams = NumBlocks[0 ];
2246
+ KernelDurationTracingArgs.NumThreads = NumThreads[0 ];
2247
+
2248
+ if (auto Err = Slots[Curr].schedCallback (KernelDurationTracingAction,
2249
+ &KernelDurationTracingArgs))
2250
+ return Err;
2251
+ }
2252
+
2194
2253
// Push the kernel with the output signal and an input signal (optional)
2195
2254
DP (" Using Queue: %p with HSA Queue: %p\n " , Queue, Queue->getHsaQueue ());
2196
2255
// If we are running an RPC server we want to wake up the server thread
@@ -2630,9 +2689,10 @@ struct AMDGPUStreamManagerTy final
2630
2689
OMPX_EnableQueueProfiling(" LIBOMPTARGET_AMDGPU_ENABLE_QUEUE_PROFILING" ,
2631
2690
false ),
2632
2691
NextQueue(0 ), Agent(HSAAgent) {
2633
- // If OMPX_ENABLE_RUNTIME_AUTOTUNING is enabled,
2692
+ // If OMPX_ENABLE_RUNTIME_AUTOTUNING or LIBOMPTARGET_EXE_TIME is enabled,
2634
2693
// set queue profiling to true.
2635
- if (Device.enableRuntimeAutotuning ()) {
2694
+ if (Device.enableRuntimeAutotuning () ||
2695
+ Device.enableKernelDurationTracing ()) {
2636
2696
OMPX_EnableQueueProfiling = true ;
2637
2697
}
2638
2698
}
@@ -5164,19 +5224,42 @@ void AMDGPUKernelTy::printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice,
5164
5224
auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount ;
5165
5225
// auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize;
5166
5226
5167
- // This line should print exactly as the one in the old plugin.
5168
- fprintf (
5169
- stderr,
5170
- " DEVID: %2d SGN:%d ConstWGSize:%-4d args:%2d teamsXthrds:(%4uX%4d) "
5171
- " reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u agpr_count:%u "
5172
- " sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d "
5173
- " md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
5174
- " %d%% n:%s\n " ,
5175
- GenericDevice.getDeviceId (), getExecutionModeFlags (), ConstWGSize,
5176
- KernelArgs.NumArgs , NumBlocks[0 ], NumThreads[0 ], 0 , 0 , GroupSegmentSize,
5177
- SGPRCount, VGPRCount, AGPRCount, SGPRSpillCount, VGPRSpillCount,
5178
- KernelArgs.Tripcount , HasRPC, isMultiDeviceKernel (), MultiDeviceLB,
5179
- MultiDeviceUB, MaxOccupancy, AchievedOccupancy, getName ());
5227
+ if (GenericDevice.enableKernelDurationTracing ()) {
5228
+ uint32_t LaunchId = GenericDevice.getAndIncrementLaunchId ();
5229
+ setKernelLaunchId (LaunchId);
5230
+
5231
+ // Print Launch Id after Device Id.
5232
+ fprintf (stderr,
5233
+ " DEVID: %2d LaunchId: %u SGN:%d ConstWGSize:%-4d args:%2d "
5234
+ " teamsXthrds:(%4uX%4d) "
5235
+ " reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
5236
+ " agpr_count:%u "
5237
+ " sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d "
5238
+ " md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
5239
+ " %d%% n:%s\n " ,
5240
+ GenericDevice.getDeviceId (), LaunchId, getExecutionModeFlags (),
5241
+ ConstWGSize, KernelArgs.NumArgs , NumBlocks[0 ], NumThreads[0 ], 0 , 0 ,
5242
+ GroupSegmentSize, SGPRCount, VGPRCount, AGPRCount, SGPRSpillCount,
5243
+ VGPRSpillCount, KernelArgs.Tripcount , HasRPC, isMultiDeviceKernel (),
5244
+ MultiDeviceLB, MultiDeviceUB, MaxOccupancy, AchievedOccupancy,
5245
+ getName ());
5246
+ } else {
5247
+
5248
+ // This line should print exactly as the one in the old plugin.
5249
+ fprintf (stderr,
5250
+ " DEVID: %2d SGN:%d ConstWGSize:%-4d args:%2d teamsXthrds:(%4uX%4d) "
5251
+ " reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
5252
+ " agpr_count:%u "
5253
+ " sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d "
5254
+ " md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
5255
+ " %d%% n:%s\n " ,
5256
+ GenericDevice.getDeviceId (), getExecutionModeFlags (), ConstWGSize,
5257
+ KernelArgs.NumArgs , NumBlocks[0 ], NumThreads[0 ], 0 , 0 ,
5258
+ GroupSegmentSize, SGPRCount, VGPRCount, AGPRCount, SGPRSpillCount,
5259
+ VGPRSpillCount, KernelArgs.Tripcount , HasRPC, isMultiDeviceKernel (),
5260
+ MultiDeviceLB, MultiDeviceUB, MaxOccupancy, AchievedOccupancy,
5261
+ getName ());
5262
+ }
5180
5263
}
5181
5264
5182
5265
Error AMDGPUKernelTy::printLaunchInfoDetails (GenericDeviceTy &GenericDevice,
@@ -5188,7 +5271,8 @@ Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
5188
5271
// When LIBOMPTARGET_KERNEL_TRACE is set, print the single-line kernel trace
5189
5272
// info present in the old ASO plugin, and continue with the upstream 2-line
5190
5273
// info, should LIBOMPTARGET_INFO be a meaningful value, otherwise return.
5191
- if (getInfoLevel () & OMP_INFOTYPE_AMD_KERNEL_TRACE)
5274
+ if ((getInfoLevel () & OMP_INFOTYPE_AMD_KERNEL_TRACE) ||
5275
+ GenericDevice.enableKernelDurationTracing ())
5192
5276
printAMDOneLineKernelTrace (GenericDevice, KernelArgs, NumThreads, NumBlocks,
5193
5277
MultiDeviceLB, MultiDeviceUB);
5194
5278
0 commit comments