@@ -856,6 +856,10 @@ struct AMDGPUKernelTy : public GenericKernelTy {
856
856
return BlockSize <= ConstWGSize;
857
857
}
858
858
859
/// Return the value currently held in KernelLaunchId. That member is declared
/// static thread_local, so the result is specific to the calling thread.
+ uint32_t getKernelLaunchId () const { return KernelLaunchId; }
860
+
861
/// Store Id in KernelLaunchId for the calling thread. The method can be
/// const-qualified because KernelLaunchId is a static thread_local member
/// rather than per-object state.
+ void setKernelLaunchId (uint32_t Id) const { KernelLaunchId = Id; }
862
+
859
863
// / Envar to enable occupancy-based optimization for SPMD kernel.
860
864
BoolEnvar OMPX_SPMDOccupancyBasedOpt;
861
865
@@ -887,6 +891,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
887
891
// / CodeGen generate WGSize
888
892
uint16_t ConstWGSize;
889
893
894
/// Launch id with one copy per thread (static thread_local), not one per
/// kernel object; written by setKernelLaunchId and read by getKernelLaunchId.
+ static thread_local uint32_t KernelLaunchId;
895
+
890
896
// / Lower number of threads if tripcount is low. This should produce
891
897
// / a larger number of teams if allowed by other constraints.
892
898
std::pair<bool , uint32_t > adjustNumThreadsForLowTripCount (
@@ -1333,6 +1339,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1333
1339
uint64_t numTeams) const override ;
1334
1340
};
1335
1341
1342
/// Out-of-class definition of the static thread_local launch id; each
/// thread's copy is initialized to 0.
+ thread_local uint32_t AMDGPUKernelTy::KernelLaunchId = 0 ;
1343
+
1336
1344
// / Class representing an HSA signal. Signals are used to define dependencies
1337
1345
// / between asynchronous operations: kernel launches and memory transfers.
1338
1346
struct AMDGPUSignalTy {
@@ -1691,6 +1699,20 @@ struct AMDGPUStreamTy {
1691
1699
NumThreads (0 ), KernelRunRecords(nullptr ) {}
1692
1700
};
1693
1701
1702
/// Argument bundle handed (as void *) to KernelDurationTracingAction. The
/// launch path fills the fields before scheduling the callback; the callback
/// reads them to query the dispatch time and print one trace line.
+ struct KernelDurationTracingArgsTy {
1703
+ hsa_agent_t Agent; // Agent passed to hsa_amd_profiling_get_dispatch_time.
1704
+ AMDGPUSignalTy *Signal; // Signal of the launch; must be non-null when the duration is queried.
1705
+ double TicksToTime; // Initialized from setTicksToTime(); presumably a tick-to-time scale factor -- TODO confirm.
1706
+ int32_t DeviceId; // Device id printed in the trace line.
1707
+ uint32_t LaunchId; // Launch id printed in the trace line.
1708
+ uint32_t NumTeams; // Set from NumBlocks[0] at launch.
1709
+ uint32_t NumThreads; // Set from NumThreads[0] at launch.
1710
+
1711
// Zero/null-initialize every field except TicksToTime, which is taken from
// setTicksToTime().
+ KernelDurationTracingArgsTy ()
1712
+ : Agent{0 }, Signal(nullptr ), TicksToTime(setTicksToTime()), DeviceId(0 ),
1713
+ LaunchId (0 ), NumTeams(0 ), NumThreads(0 ) {}
1714
+ };
1715
+
1694
1716
using AMDGPUStreamCallbackTy = Error(void *Data);
1695
1717
1696
1718
// / The stream is composed of N stream's slots. The struct below represents
@@ -1892,6 +1914,9 @@ struct AMDGPUStreamTy {
1892
1914
// / Arguments for the callback function.
1893
1915
PostKernelRunProcessingArgsTy PostKernelRunProcessingArgs;
1894
1916
1917
+ // / Arguments for callback function to collect kernel duration.
// NOTE(review): the launch path refills this one object and passes its
// address to schedCallback, so a second launch that refills it before the
// first callback has run would change what that callback prints -- confirm
// whether callbacks are guaranteed to run before the next launch.
1918
+ KernelDurationTracingArgsTy KernelDurationTracingArgs;
1919
+
1895
1920
// / Return the current number of asynchronous operations on the stream.
1896
1921
uint32_t size () const { return NextSlot; }
1897
1922
@@ -2052,9 +2077,9 @@ struct AMDGPUStreamTy {
2052
2077
return Plugin::success ();
2053
2078
}
2054
2079
2055
- static uint64_t getKernelDuration (PostKernelRunProcessingArgsTy *Args) {
2080
+ template < typename Ty> static uint64_t getKernelDuration (Ty *Args) {
2056
2081
assert (Args->Signal &&
2057
- " Invalid AMDGPUSignal Pointer in post kernel run processing " );
2082
+ " Invalid AMDGPUSignal Pointer for obtaining kernel duration " );
2058
2083
hsa_amd_profiling_dispatch_time_t TimeRec;
2059
2084
hsa_amd_profiling_get_dispatch_time (Args->Agent , Args->Signal ->get (),
2060
2085
&TimeRec);
@@ -2074,7 +2099,8 @@ struct AMDGPUStreamTy {
2074
2099
KernelRunRecordTy *KernelRecord = Args->KernelRunRecords ;
2075
2100
assert (KernelRecord && " KernelRunRecord is null!" );
2076
2101
2077
- uint64_t KernelDuration = getKernelDuration (Args);
2102
+ uint64_t KernelDuration =
2103
+ getKernelDuration<PostKernelRunProcessingArgsTy>(Args);
2078
2104
KernelRecord->addEntry (Args->KernelName , Args->NumTeams , Args->NumThreads ,
2079
2105
KernelDuration);
2080
2106
@@ -2088,6 +2114,24 @@ struct AMDGPUStreamTy {
2088
2114
return Plugin::success ();
2089
2115
}
2090
2116
2117
+ /// Callback function to generate traces for kernel runtime.
+ ///
+ /// \param Data must point to a KernelDurationTracingArgsTy (asserted
+ /// non-null). The duration is obtained through getKernelDuration and
+ /// written to stderr on one line together with the device id, launch
+ /// id and launch dimensions stored in the arguments.
+ /// \returns Plugin::success() unconditionally.
2118
+ static Error KernelDurationTracingAction (void *Data) {
2119
+ assert (Data && " Invalid data pointer for tracing kernel duration" );
2120
+ KernelDurationTracingArgsTy *Args =
2121
+ static_cast <KernelDurationTracingArgsTy *>(Data);
2122
+
2123
+ uint64_t KernelDuration =
2124
+ getKernelDuration<KernelDurationTracingArgsTy>(Args);
2125
+
+ // LaunchId, NumTeams and NumThreads are uint32_t, so they are printed with
+ // unsigned conversions. The 64-bit duration is cast to unsigned long long
+ // so that %llu matches regardless of how uint64_t is defined.
2126
+ fprintf (
2127
+ stderr,
2128
+ " DeviceID: %2d LaunchID: %2u TeamsXthrds:(%4uX%4u) Duration(ns): %llu\n " ,
2129
+ Args->DeviceId , Args->LaunchId , Args->NumTeams , Args->NumThreads ,
2130
+ static_cast <unsigned long long >(KernelDuration));
2131
+
2132
+ return Plugin::success ();
2133
+ }
2134
+
2091
2135
#ifdef OMPT_SUPPORT
2092
2136
static Error timeKernelInNsAsync (void *Data) {
2093
2137
assert (Data && " Invalid data pointer in OMPT profiling" );
@@ -2187,6 +2231,21 @@ struct AMDGPUStreamTy {
2187
2231
}
2188
2232
}
2189
2233
2234
+ // When LIBOMPTARGET_KERNEL_EXE_TIME is set, register the callback function
2235
+ // to get the kernel duration.
2236
+ if (Device.enableKernelDurationTracing ()) {
2237
+ KernelDurationTracingArgs.Agent = Agent;
2238
+ KernelDurationTracingArgs.Signal = OutputSignal;
2239
+ KernelDurationTracingArgs.DeviceId = Device.getDeviceId ();
2240
+ KernelDurationTracingArgs.LaunchId = Kernel.getKernelLaunchId ();
2241
+ KernelDurationTracingArgs.NumTeams = NumBlocks[0 ];
2242
+ KernelDurationTracingArgs.NumThreads = NumThreads[0 ];
2243
+
2244
+ if (auto Err = Slots[Curr].schedCallback (KernelDurationTracingAction,
2245
+ &KernelDurationTracingArgs))
2246
+ return Err;
2247
+ }
2248
+
2190
2249
// Push the kernel with the output signal and an input signal (optional)
2191
2250
DP (" Using Queue: %p with HSA Queue: %p\n " , Queue, Queue->getHsaQueue ());
2192
2251
// If we are running an RPC server we want to wake up the server thread
@@ -2626,9 +2685,10 @@ struct AMDGPUStreamManagerTy final
2626
2685
OMPX_EnableQueueProfiling(" LIBOMPTARGET_AMDGPU_ENABLE_QUEUE_PROFILING" ,
2627
2686
false ),
2628
2687
NextQueue(0 ), Agent(HSAAgent) {
2629
- // If OMPX_ENABLE_RUNTIME_AUTOTUNING is enabled,
2630
- // set queue profiling to true.
2631
- if (Device.enableRuntimeAutotuning ()) {
2688
+ // If OMPX_ENABLE_RUNTIME_AUTOTUNING or LIBOMPTARGET_KERNEL_EXE_TIME is
2689
+ // enabled, set queue profiling to true.
2690
+ if (Device.enableRuntimeAutotuning () ||
2691
+ Device.enableKernelDurationTracing ()) {
2632
2692
OMPX_EnableQueueProfiling = true ;
2633
2693
}
2634
2694
}
@@ -5162,19 +5222,42 @@ void AMDGPUKernelTy::printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice,
5162
5222
auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount ;
5163
5223
// auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize;
5164
5224
5165
- // This line should print exactly as the one in the old plugin.
5166
- fprintf (
5167
- stderr,
5168
- " DEVID: %2d SGN:%d ConstWGSize:%-4d args:%2d teamsXthrds:(%4uX%4d) "
5169
- " reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u agpr_count:%u "
5170
- " sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d "
5171
- " md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
5172
- " %d%% n:%s\n " ,
5173
- GenericDevice.getDeviceId (), getExecutionModeFlags (), ConstWGSize,
5174
- KernelArgs.NumArgs , NumBlocks[0 ], NumThreads[0 ], 0 , 0 , GroupSegmentSize,
5175
- SGPRCount, VGPRCount, AGPRCount, SGPRSpillCount, VGPRSpillCount,
5176
- KernelArgs.Tripcount , HasRPC, isMultiDeviceKernel (), MultiDeviceLB,
5177
- MultiDeviceUB, MaxOccupancy, AchievedOccupancy, getName ());
5225
+ if (GenericDevice.enableKernelDurationTracing ()) {
5226
+ uint32_t LaunchId = GenericDevice.getAndIncrementLaunchId ();
5227
+ setKernelLaunchId (LaunchId);
5228
+
5229
+ // Print Launch Id after Device Id.
5230
+ fprintf (stderr,
5231
+ " DEVID: %2d LaunchId: %u SGN:%d ConstWGSize:%-4d args:%2d "
5232
+ " teamsXthrds:(%4uX%4d) "
5233
+ " reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
5234
+ " agpr_count:%u "
5235
+ " sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d "
5236
+ " md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
5237
+ " %d%% n:%s\n " ,
5238
+ GenericDevice.getDeviceId (), LaunchId, getExecutionModeFlags (),
5239
+ ConstWGSize, KernelArgs.NumArgs , NumBlocks[0 ], NumThreads[0 ], 0 , 0 ,
5240
+ GroupSegmentSize, SGPRCount, VGPRCount, AGPRCount, SGPRSpillCount,
5241
+ VGPRSpillCount, KernelArgs.Tripcount , HasRPC, isMultiDeviceKernel (),
5242
+ MultiDeviceLB, MultiDeviceUB, MaxOccupancy, AchievedOccupancy,
5243
+ getName ());
5244
+ } else {
5245
+
5246
+ // This line should print exactly as the one in the old plugin.
5247
+ fprintf (stderr,
5248
+ " DEVID: %2d SGN:%d ConstWGSize:%-4d args:%2d teamsXthrds:(%4uX%4d) "
5249
+ " reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
5250
+ " agpr_count:%u "
5251
+ " sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d "
5252
+ " md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
5253
+ " %d%% n:%s\n " ,
5254
+ GenericDevice.getDeviceId (), getExecutionModeFlags (), ConstWGSize,
5255
+ KernelArgs.NumArgs , NumBlocks[0 ], NumThreads[0 ], 0 , 0 ,
5256
+ GroupSegmentSize, SGPRCount, VGPRCount, AGPRCount, SGPRSpillCount,
5257
+ VGPRSpillCount, KernelArgs.Tripcount , HasRPC, isMultiDeviceKernel (),
5258
+ MultiDeviceLB, MultiDeviceUB, MaxOccupancy, AchievedOccupancy,
5259
+ getName ());
5260
+ }
5178
5261
}
5179
5262
5180
5263
Error AMDGPUKernelTy::printLaunchInfoDetails (GenericDeviceTy &GenericDevice,
@@ -5186,7 +5269,8 @@ Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
5186
5269
// When LIBOMPTARGET_KERNEL_TRACE is set, print the single-line kernel trace
5187
5270
// info present in the old ASO plugin, and continue with the upstream 2-line
5188
5271
// info, should LIBOMPTARGET_INFO be a meaningful value, otherwise return.
5189
- if (getInfoLevel () & OMP_INFOTYPE_AMD_KERNEL_TRACE)
5272
+ if ((getInfoLevel () & OMP_INFOTYPE_AMD_KERNEL_TRACE) ||
5273
+ GenericDevice.enableKernelDurationTracing ())
5190
5274
printAMDOneLineKernelTrace (GenericDevice, KernelArgs, NumThreads, NumBlocks,
5191
5275
MultiDeviceLB, MultiDeviceUB);
5192
5276
0 commit comments