@@ -1382,9 +1382,9 @@ struct AMDGPUSignalTy {
13821382 }
13831383
13841384 // / Wait until the signal gets a zero value.
1385- Error wait (const uint64_t ActiveTimeout = 0 , RPCServerTy *RPCServer = nullptr ,
1385+ Error wait (const uint64_t ActiveTimeout = 0 ,
13861386 GenericDeviceTy *Device = nullptr ) const {
1387- if (ActiveTimeout && !RPCServer ) {
1387+ if (ActiveTimeout) {
13881388 hsa_signal_value_t Got = 1 ;
13891389 Got = hsa_signal_wait_scacquire (HSASignal, HSA_SIGNAL_CONDITION_EQ, 0 ,
13901390 ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
@@ -1393,14 +1393,11 @@ struct AMDGPUSignalTy {
13931393 }
13941394
13951395 // If there is an RPC device attached to this stream we run it as a server.
1396- uint64_t Timeout = RPCServer ? 8192 : UINT64_MAX;
1397- auto WaitState = RPCServer ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED;
1396+ uint64_t Timeout = UINT64_MAX;
1397+ auto WaitState = HSA_WAIT_STATE_BLOCKED;
13981398 while (hsa_signal_wait_scacquire (HSASignal, HSA_SIGNAL_CONDITION_EQ, 0 ,
1399- Timeout, WaitState) != 0 ) {
1400- if (RPCServer && Device)
1401- if (auto Err = RPCServer->runServer (*Device))
1402- return Err;
1403- }
1399+ Timeout, WaitState) != 0 )
1400+ ;
14041401 return Plugin::success ();
14051402 }
14061403
@@ -1895,11 +1892,6 @@ struct AMDGPUStreamTy {
18951892 // / operation that was already finalized in a previous stream synchronize.
18961893 uint32_t SyncCycle;
18971894
1898- // / A pointer associated with an RPC server running on the given device. If
1899- // / RPC is not being used this will be a null pointer. Otherwise, this
1900- // / indicates that an RPC server is expected to be run on this stream.
1901- RPCServerTy *RPCServer;
1902-
19031895 // / Mutex to protect stream's management.
19041896 mutable std::mutex Mutex;
19051897
@@ -2136,9 +2128,6 @@ struct AMDGPUStreamTy {
21362128
21372129 hsa_queue_t *getHsaQueue () { return Queue->getHsaQueue (); }
21382130
2139- // / Attach an RPC server to this stream.
2140- void setRPCServer (RPCServerTy *Server) { RPCServer = Server; }
2141-
21422131 // / Push an asynchronous kernel to the stream. The kernel arguments must be
21432132 // / placed in a special allocation for kernel args and must keep alive until
21442133 // / the kernel finalizes. Once the kernel is finished, the stream will release
@@ -2194,9 +2183,30 @@ struct AMDGPUStreamTy {
21942183
21952184 // Push the kernel with the output signal and an input signal (optional)
21962185 DP (" Using Queue: %p with HSA Queue: %p\n " , Queue, Queue->getHsaQueue ());
2197- return Queue->pushKernelLaunch (Kernel, KernelArgs, NumThreads, NumBlocks,
2198- GroupSize, StackSize, OutputSignal,
2199- InputSignal);
2186+ // If we are running an RPC server we want to wake up the server thread
2187+ // whenever there is a kernel running and let it sleep otherwise.
2188+ if (Device.getRPCServer ())
2189+ Device.Plugin .getRPCServer ().Thread ->notify ();
2190+
2191+ // Push the kernel with the output signal and an input signal (optional)
2192+ if (auto Err = Queue->pushKernelLaunch (Kernel, KernelArgs, NumThreads,
2193+ NumBlocks, GroupSize, StackSize,
2194+ OutputSignal, InputSignal))
2195+ return Err;
2196+
2197+ // Register a callback to indicate when the kernel is complete.
2198+ if (Device.getRPCServer ()) {
2199+ if (auto Err = Slots[Curr].schedCallback (
2200+ [](void *Data) -> llvm::Error {
2201+ GenericPluginTy &Plugin =
2202+ *reinterpret_cast <GenericPluginTy *>(Data);
2203+ Plugin.getRPCServer ().Thread ->finish ();
2204+ return Error::success ();
2205+ },
2206+ &Device.Plugin ))
2207+ return Err;
2208+ }
2209+ return Plugin::success ();
22002210 }
22012211
22022212 // / Push an asynchronous memory copy between pinned memory buffers.
@@ -2268,9 +2278,8 @@ struct AMDGPUStreamTy {
22682278
22692279 // Wait for kernel to finish before scheduling the asynchronous copy.
22702280 if (UseSyncCopyBack && InputSignal && InputSignal->load ())
2271- if (auto Err = InputSignal->wait (StreamBusyWaitMicroseconds, RPCServer, &Device))
2281+ if (auto Err = InputSignal->wait (StreamBusyWaitMicroseconds, &Device))
22722282 return Err;
2273-
22742283#ifdef OMPT_SUPPORT
22752284
22762285 if (OmptInfo) {
@@ -2457,8 +2466,8 @@ struct AMDGPUStreamTy {
24572466 return Plugin::success ();
24582467
24592468 // Wait until all previous operations on the stream have completed.
2460- if (auto Err = Slots[ last ()]. Signal -> wait (StreamBusyWaitMicroseconds,
2461- RPCServer , &Device))
2469+ if (auto Err =
2470+ Slots[ last ()]. Signal -> wait (StreamBusyWaitMicroseconds , &Device))
24622471 return Err;
24632472
24642473 // Reset the stream and perform all pending post actions.
@@ -4701,7 +4710,7 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
47014710 : Agent(Device.getAgent()), Queue(nullptr ),
47024711 SignalManager (Device.getSignalManager()), Device(Device),
47034712 // Initialize the std::deque with some empty positions.
4704- Slots(32 ), NextSlot(0 ), SyncCycle(0 ), RPCServer( nullptr ),
4713+ Slots(32 ), NextSlot(0 ), SyncCycle(0 ),
47054714 StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
47064715 UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()),
47074716 UseSyncCopyBack(Device.syncCopyBack()) {}
@@ -5117,10 +5126,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
51175126 DP (" No hostrpc buffer or service thread required\n " );
51185127 }
51195128
5120- // If this kernel requires an RPC server we attach its pointer to the stream.
5121- if (GenericDevice.getRPCServer ())
5122- Stream->setRPCServer (GenericDevice.getRPCServer ());
5123-
51245129 // Only COV5 implicitargs needs to be set. COV4 implicitargs are not used.
51255130 if (ImplArgs &&
51265131 getImplicitArgsSize () == sizeof (hsa_utils::AMDGPUImplicitArgsTy)) {
0 commit comments