@@ -856,6 +856,10 @@ struct AMDGPUKernelTy : public GenericKernelTy {
856856 return BlockSize <= ConstWGSize;
857857 }
858858
/// Return the launch ID currently held in the calling thread's copy of the
/// static thread_local KernelLaunchId (0 until something stores a value).
859+ uint32_t getKernelLaunchId () const { return KernelLaunchId; }
860+
/// Store \p Id in the calling thread's copy of KernelLaunchId. The method can
/// be const because KernelLaunchId is a static thread_local member rather
/// than per-object state.
861+ void setKernelLaunchId (uint32_t Id) const { KernelLaunchId = Id; }
862+
859863 /// Envar to enable occupancy-based optimization for SPMD kernel.
860864 BoolEnvar OMPX_SPMDOccupancyBasedOpt;
861865
@@ -887,6 +891,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
887891 // / CodeGen generate WGSize
888892 uint16_t ConstWGSize;
889893
/// Launch ID that is shared by all AMDGPUKernelTy objects but kept separately
/// per thread (static thread_local). Read via getKernelLaunchId() and written
/// via setKernelLaunchId().
894+ static thread_local uint32_t KernelLaunchId;
895+
890896 // / Lower number of threads if tripcount is low. This should produce
891897 // / a larger number of teams if allowed by other constraints.
892898 std::pair<bool , uint32_t > adjustNumThreadsForLowTripCount (
@@ -1333,6 +1339,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
13331339 uint64_t numTeams) const override ;
13341340};
13351341
/// Out-of-class definition of the per-thread launch ID; each thread's copy
/// starts at 0.
1342+ thread_local uint32_t AMDGPUKernelTy::KernelLaunchId = 0 ;
1343+
13361344/// Class representing an HSA signal. Signals are used to define dependencies
13371345/// between asynchronous operations: kernel launches and memory transfers.
13381346struct AMDGPUSignalTy {
@@ -1691,6 +1699,20 @@ struct AMDGPUStreamTy {
16911699 NumThreads (0 ), KernelRunRecords(nullptr ) {}
16921700 };
16931701
1702+ struct KernelDurationTracingArgsTy {
1703+ hsa_agent_t Agent;
1704+ AMDGPUSignalTy *Signal;
1705+ double TicksToTime;
1706+ int32_t DeviceId;
1707+ uint32_t LaunchId;
1708+ uint32_t NumTeams;
1709+ uint32_t NumThreads;
1710+
1711+ KernelDurationTracingArgsTy ()
1712+ : Agent{0 }, Signal(nullptr ), TicksToTime(setTicksToTime()), DeviceId(0 ),
1713+ LaunchId (0 ), NumTeams(0 ), NumThreads(0 ) {}
1714+ };
1715+
16941716 using AMDGPUStreamCallbackTy = Error(void *Data);
16951717
16961718 // / The stream is composed of N stream's slots. The struct below represents
@@ -1892,6 +1914,9 @@ struct AMDGPUStreamTy {
18921914 // / Arguments for the callback function.
18931915 PostKernelRunProcessingArgsTy PostKernelRunProcessingArgs;
18941916
1917+ // / Arguments for callback function to collect kernel duration.
1918+ KernelDurationTracingArgsTy KernelDurationTracingArgs;
1919+
18951920 // / Return the current number of asynchronous operations on the stream.
18961921 uint32_t size () const { return NextSlot; }
18971922
@@ -2052,9 +2077,9 @@ struct AMDGPUStreamTy {
20522077 return Plugin::success ();
20532078 }
20542079
2055- static uint64_t getKernelDuration (PostKernelRunProcessingArgsTy *Args) {
2080+ template < typename Ty> static uint64_t getKernelDuration (Ty *Args) {
20562081 assert (Args->Signal &&
2057- " Invalid AMDGPUSignal Pointer in post kernel run processing " );
2082+ " Invalid AMDGPUSignal Pointer for obtaining kernel duration " );
20582083 hsa_amd_profiling_dispatch_time_t TimeRec;
20592084 hsa_amd_profiling_get_dispatch_time (Args->Agent , Args->Signal ->get (),
20602085 &TimeRec);
@@ -2074,7 +2099,8 @@ struct AMDGPUStreamTy {
20742099 KernelRunRecordTy *KernelRecord = Args->KernelRunRecords ;
20752100 assert (KernelRecord && " KernelRunRecord is null!" );
20762101
2077- uint64_t KernelDuration = getKernelDuration (Args);
2102+ uint64_t KernelDuration =
2103+ getKernelDuration<PostKernelRunProcessingArgsTy>(Args);
20782104 KernelRecord->addEntry (Args->KernelName , Args->NumTeams , Args->NumThreads ,
20792105 KernelDuration);
20802106
@@ -2088,6 +2114,24 @@ struct AMDGPUStreamTy {
20882114 return Plugin::success ();
20892115 }
20902116
2117+ // / Callback function to generate traces for kernel runtime.
2118+ static Error KernelDurationTracingAction (void *Data) {
2119+ assert (Data && " Invalid data pointer for tracing kernel duration" );
2120+ KernelDurationTracingArgsTy *Args =
2121+ reinterpret_cast <KernelDurationTracingArgsTy *>(Data);
2122+
2123+ uint64_t KernelDuration =
2124+ getKernelDuration<KernelDurationTracingArgsTy>(Args);
2125+
2126+ fprintf (
2127+ stderr,
2128+ " DeviceID: %2d LaunchID: %2d TeamsXthrds:(%4uX%4d) Duration(ns): %lu\n " ,
2129+ Args->DeviceId , Args->LaunchId , Args->NumTeams , Args->NumThreads ,
2130+ KernelDuration);
2131+
2132+ return Plugin::success ();
2133+ }
2134+
20912135#ifdef OMPT_SUPPORT
20922136 static Error timeKernelInNsAsync (void *Data) {
20932137 assert (Data && " Invalid data pointer in OMPT profiling" );
@@ -2187,6 +2231,21 @@ struct AMDGPUStreamTy {
21872231 }
21882232 }
21892233
2234+ // When LIBOMPTARGET_KERNEL_EXE_TIME is set, register the callback function
2235+ // to get the kernel duration.
// The callback receives a pointer to the stream's single
// KernelDurationTracingArgs member, which is overwritten on every pass through
// this branch.
// NOTE(review): if a second kernel can be pushed to this stream before the
// first callback has run, the first callback would observe the second
// launch's values - confirm against the stream's synchronization rules.
// NOTE(review): the launch ID is read from a thread_local, so it presumably
// must have been set earlier on this same thread (the one-line kernel trace
// does so) - confirm the call order.
2236+ if (Device.enableKernelDurationTracing ()) {
2237+ KernelDurationTracingArgs.Agent = Agent;
2238+ KernelDurationTracingArgs.Signal = OutputSignal;
2239+ KernelDurationTracingArgs.DeviceId = Device.getDeviceId ();
2240+ KernelDurationTracingArgs.LaunchId = Kernel.getKernelLaunchId ();
2241+ KernelDurationTracingArgs.NumTeams = NumBlocks[0 ];
2242+ KernelDurationTracingArgs.NumThreads = NumThreads[0 ];
2243+
// Propagate a scheduling failure to the caller instead of launching.
2244+ if (auto Err = Slots[Curr].schedCallback (KernelDurationTracingAction,
2245+ &KernelDurationTracingArgs))
2246+ return Err;
2247+ }
2248+
21902249 // Push the kernel with the output signal and an input signal (optional)
21912250 DP (" Using Queue: %p with HSA Queue: %p\n " , Queue, Queue->getHsaQueue ());
21922251 // If we are running an RPC server we want to wake up the server thread
@@ -2626,9 +2685,10 @@ struct AMDGPUStreamManagerTy final
26262685 OMPX_EnableQueueProfiling(" LIBOMPTARGET_AMDGPU_ENABLE_QUEUE_PROFILING" ,
26272686 false ),
26282687 NextQueue(0 ), Agent(HSAAgent) {
2629- // If OMPX_ENABLE_RUNTIME_AUTOTUNING is enabled,
2630- // set queue profiling to true.
2631- if (Device.enableRuntimeAutotuning ()) {
2688+ // If OMPX_ENABLE_RUNTIME_AUTOTUNING or LIBOMPTARGET_KERNEL_EXE_TIME is
2689+ // enabled, set queue profiling to true.
2690+ if (Device.enableRuntimeAutotuning () ||
2691+ Device.enableKernelDurationTracing ()) {
26322692 OMPX_EnableQueueProfiling = true ;
26332693 }
26342694 }
@@ -5162,19 +5222,42 @@ void AMDGPUKernelTy::printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice,
51625222 auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount ;
51635223 // auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize;
51645224
5165- // This line should print exactly as the one in the old plugin.
5166- fprintf (
5167- stderr,
5168- " DEVID: %2d SGN:%d ConstWGSize:%-4d args:%2d teamsXthrds:(%4uX%4d) "
5169- " reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u agpr_count:%u "
5170- " sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d "
5171- " md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
5172- " %d%% n:%s\n " ,
5173- GenericDevice.getDeviceId (), getExecutionModeFlags (), ConstWGSize,
5174- KernelArgs.NumArgs , NumBlocks[0 ], NumThreads[0 ], 0 , 0 , GroupSegmentSize,
5175- SGPRCount, VGPRCount, AGPRCount, SGPRSpillCount, VGPRSpillCount,
5176- KernelArgs.Tripcount , HasRPC, isMultiDeviceKernel (), MultiDeviceLB,
5177- MultiDeviceUB, MaxOccupancy, AchievedOccupancy, getName ());
// Emit the one-line kernel trace on stderr. With kernel duration tracing
// enabled, a launch ID is obtained from the device, remembered through
// setKernelLaunchId() and printed right after the device ID; otherwise the
// line is printed without it. Apart from the extra "LaunchId: %u" field the
// two format strings and argument lists are the same.
5225+ if (GenericDevice.enableKernelDurationTracing ()) {
5226+ uint32_t LaunchId = GenericDevice.getAndIncrementLaunchId ();
// Stored in the thread_local launch ID so getKernelLaunchId() can return it
// later on this thread.
5227+ setKernelLaunchId (LaunchId);
5228+
5229+ // Print Launch Id after Device Id.
5230+ fprintf (stderr,
5231+ " DEVID: %2d LaunchId: %u SGN:%d ConstWGSize:%-4d args:%2d "
5232+ " teamsXthrds:(%4uX%4d) "
5233+ " reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
5234+ " agpr_count:%u "
5235+ " sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d "
5236+ " md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
5237+ " %d%% n:%s\n " ,
5238+ GenericDevice.getDeviceId (), LaunchId, getExecutionModeFlags (),
5239+ ConstWGSize, KernelArgs.NumArgs , NumBlocks[0 ], NumThreads[0 ], 0 , 0 ,
5240+ GroupSegmentSize, SGPRCount, VGPRCount, AGPRCount, SGPRSpillCount,
5241+ VGPRSpillCount, KernelArgs.Tripcount , HasRPC, isMultiDeviceKernel (),
5242+ MultiDeviceLB, MultiDeviceUB, MaxOccupancy, AchievedOccupancy,
5243+ getName ());
5244+ } else {
5245+
5246+ // This line should print exactly as the one in the old plugin.
5247+ fprintf (stderr,
5248+ " DEVID: %2d SGN:%d ConstWGSize:%-4d args:%2d teamsXthrds:(%4uX%4d) "
5249+ " reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
5250+ " agpr_count:%u "
5251+ " sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d "
5252+ " md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: "
5253+ " %d%% n:%s\n " ,
5254+ GenericDevice.getDeviceId (), getExecutionModeFlags (), ConstWGSize,
5255+ KernelArgs.NumArgs , NumBlocks[0 ], NumThreads[0 ], 0 , 0 ,
5256+ GroupSegmentSize, SGPRCount, VGPRCount, AGPRCount, SGPRSpillCount,
5257+ VGPRSpillCount, KernelArgs.Tripcount , HasRPC, isMultiDeviceKernel (),
5258+ MultiDeviceLB, MultiDeviceUB, MaxOccupancy, AchievedOccupancy,
5259+ getName ());
5260+ }
51785261}
51795262
51805263Error AMDGPUKernelTy::printLaunchInfoDetails (GenericDeviceTy &GenericDevice,
@@ -5186,7 +5269,8 @@ Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
51865269 // When LIBOMPTARGET_KERNEL_TRACE is set, print the single-line kernel trace
51875270 // info present in the old ASO plugin, and continue with the upstream 2-line
51885271 // info, should LIBOMPTARGET_INFO be a meaningful value, otherwise return.
5189- if (getInfoLevel () & OMP_INFOTYPE_AMD_KERNEL_TRACE)
5272+ if ((getInfoLevel () & OMP_INFOTYPE_AMD_KERNEL_TRACE) ||
5273+ GenericDevice.enableKernelDurationTracing ())
51905274 printAMDOneLineKernelTrace (GenericDevice, KernelArgs, NumThreads, NumBlocks,
51915275 MultiDeviceLB, MultiDeviceUB);
51925276
0 commit comments