@@ -749,7 +749,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
749749 OMPX_BigJumpLoopOccupancyBasedOpt(
750750 " OMPX_BIGJUMPLOOP_OCCUPANCY_BASED_OPT" , false ),
751751 OMPX_XTeamReductionOccupancyBasedOpt(
752- " OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT" , false ) {}
752+ " OMPX_XTEAMREDUCTION_OCCUPANCY_BASED_OPT" , false ),
753+ OMPX_EnableRuntimeAutotuning(" OMPX_ENABLE_RUNTIME_AUTOTUNING" , false ) {}
753754
754755 // / Initialize the AMDGPU kernel.
755756 Error initImpl (GenericDeviceTy &Device, DeviceImageTy &Image) override {
@@ -885,6 +886,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
885886 // / Envar to enable occupancy-based optimization for cross team reduction.
886887 BoolEnvar OMPX_XTeamReductionOccupancyBasedOpt;
887888
889+ // / Envar to enable runtime tuning.
890+ BoolEnvar OMPX_EnableRuntimeAutotuning;
891+
888892private:
889893 // / The kernel object to execute.
890894 uint64_t KernelObject;
@@ -1683,6 +1687,13 @@ struct AMDGPUStreamTy {
16831687 double TicksToTime;
16841688 };
16851689
1690+ // / Utility struct holding arguments for post kernel run processing.
1691+ struct PostKernelRunProcessingArgsTy {
1692+ hsa_agent_t Agent;
1693+ AMDGPUSignalTy *Signal;
1694+ double TicksToTime;
1695+ };
1696+
16861697 using AMDGPUStreamCallbackTy = Error(void *Data);
16871698
16881699 // / The stream is composed of N stream's slots. The struct below represents
@@ -1881,6 +1892,9 @@ struct AMDGPUStreamTy {
18811892 // / Use synchronous copy back.
18821893 bool UseSyncCopyBack;
18831894
1895+ // / Arguments for the callback function.
1896+ PostKernelRunProcessingArgsTy PostKernelRunProcessingArgs;
1897+
18841898 // / Return the current number of asychronous operations on the stream.
18851899 uint32_t size () const { return NextSlot; }
18861900
@@ -2042,6 +2056,31 @@ struct AMDGPUStreamTy {
20422056 return Plugin::success ();
20432057 }
20442058
2059+ static uint64_t getKernelDuration (PostKernelRunProcessingArgsTy *Args) {
2060+ assert (Args->Signal &&
2061+ " Invalid AMDGPUSignal Pointer in post kernel run processing" );
2062+ hsa_amd_profiling_dispatch_time_t TimeRec;
2063+ hsa_status_t Status = hsa_amd_profiling_get_dispatch_time (
2064+ Args->Agent , Args->Signal ->get (), &TimeRec);
2065+
2066+ uint64_t StartTime = TimeRec.start * Args->TicksToTime ;
2067+ uint64_t EndTime = TimeRec.end * Args->TicksToTime ;
2068+
2069+ return EndTime - StartTime;
2070+ }
2071+
2072+ // / Callback funtion to process the data for each kernel run.
2073+ static Error postKernelRunProcessingAction (void *Data) {
2074+ assert (Data && " Invalid data pointer for post kernel run processing" );
2075+ PostKernelRunProcessingArgsTy *Args =
2076+ reinterpret_cast <PostKernelRunProcessingArgsTy *>(Data);
2077+
2078+ uint64_t KernelDuration = getKernelDuration (Args);
2079+ fprintf (stderr, " Kernel Duration: %lu ns\n " , KernelDuration);
2080+
2081+ return Plugin::success ();
2082+ }
2083+
20452084#ifdef OMPT_SUPPORT
20462085 static Error timeKernelInNsAsync (void *Data) {
20472086 assert (Data && " Invalid data pointer in OMPT profiling" );
@@ -2124,6 +2163,18 @@ struct AMDGPUStreamTy {
21242163 }
21252164#endif
21262165
2166+ // If runtime autotuning is enabled, set up the callback function to process
2167+ // the data after the kernel has completed.
2168+ if (Kernel.OMPX_EnableRuntimeAutotuning ) {
2169+ PostKernelRunProcessingArgs.Agent = Agent;
2170+ PostKernelRunProcessingArgs.Signal = OutputSignal;
2171+ PostKernelRunProcessingArgs.TicksToTime = 1.0 ;
2172+
2173+ if (auto Err = Slots[Curr].schedCallback (postKernelRunProcessingAction,
2174+ &PostKernelRunProcessingArgs))
2175+ return Err;
2176+ }
2177+
21272178 // Push the kernel with the output signal and an input signal (optional)
21282179 DP (" Using Queue: %p with HSA Queue: %p\n " , Queue, Queue->getHsaQueue ());
21292180 return Queue->pushKernelLaunch (Kernel, KernelArgs, NumThreads, NumBlocks,
0 commit comments