@@ -621,9 +621,9 @@ struct AMDGPUSignalTy {
621621 }
622622
623623 // / Wait until the signal gets a zero value.
624- Error wait (const uint64_t ActiveTimeout = 0 , RPCServerTy *RPCServer = nullptr ,
624+ Error wait (const uint64_t ActiveTimeout = 0 ,
625625 GenericDeviceTy *Device = nullptr ) const {
626- if (ActiveTimeout && !RPCServer ) {
626+ if (ActiveTimeout) {
627627 hsa_signal_value_t Got = 1 ;
628628 Got = hsa_signal_wait_scacquire (HSASignal, HSA_SIGNAL_CONDITION_EQ, 0 ,
629629 ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
@@ -632,14 +632,11 @@ struct AMDGPUSignalTy {
632632 }
633633
634634 // If there is an RPC device attached to this stream we run it as a server.
635- uint64_t Timeout = RPCServer ? 8192 : UINT64_MAX;
636- auto WaitState = RPCServer ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED;
635+ uint64_t Timeout = UINT64_MAX;
636+ auto WaitState = HSA_WAIT_STATE_BLOCKED;
637637 while (hsa_signal_wait_scacquire (HSASignal, HSA_SIGNAL_CONDITION_EQ, 0 ,
638- Timeout, WaitState) != 0 ) {
639- if (RPCServer && Device)
640- if (auto Err = RPCServer->runServer (*Device))
641- return Err;
642- }
638+ Timeout, WaitState) != 0 )
639+ ;
643640 return Plugin::success ();
644641 }
645642
@@ -1048,11 +1045,6 @@ struct AMDGPUStreamTy {
10481045 // / operation that was already finalized in a previous stream synchronize.
10491046 uint32_t SyncCycle;
10501047
1051- // / A pointer associated with an RPC server running on the given device. If
1052- // / RPC is not being used this will be a null pointer. Otherwise, this
1053- // / indicates that an RPC server is expected to be run on this stream.
1054- RPCServerTy *RPCServer;
1055-
10561048 // / Mutex to protect stream's management.
10571049 mutable std::mutex Mutex;
10581050
@@ -1232,9 +1224,6 @@ struct AMDGPUStreamTy {
12321224 // / Deinitialize the stream's signals.
12331225 Error deinit () { return Plugin::success (); }
12341226
1235- // / Attach an RPC server to this stream.
1236- void setRPCServer (RPCServerTy *Server) { RPCServer = Server; }
1237-
12381227 // / Push an asynchronous kernel to the stream. The kernel arguments must be
12391228 // / placed in a special allocation for kernel args and must keep alive until
12401229 // / the kernel finalizes. Once the kernel is finished, the stream will release
@@ -1262,10 +1251,30 @@ struct AMDGPUStreamTy {
12621251 if (auto Err = Slots[Curr].schedReleaseBuffer (KernelArgs, MemoryManager))
12631252 return Err;
12641253
1254+ // If we are running an RPC server we want to wake up the server thread
1255+ // whenever there is a kernel running and let it sleep otherwise.
1256+ if (Device.getRPCServer ())
1257+ Device.Plugin .getRPCServer ().Thread ->notify ();
1258+
12651259 // Push the kernel with the output signal and an input signal (optional)
1266- return Queue->pushKernelLaunch (Kernel, KernelArgs, NumThreads, NumBlocks,
1267- GroupSize, StackSize, OutputSignal,
1268- InputSignal);
1260+ if (auto Err = Queue->pushKernelLaunch (Kernel, KernelArgs, NumThreads,
1261+ NumBlocks, GroupSize, StackSize,
1262+ OutputSignal, InputSignal))
1263+ return Err;
1264+
1265+ // Register a callback to indicate when the kernel is complete.
1266+ if (Device.getRPCServer ()) {
1267+ if (auto Err = Slots[Curr].schedCallback (
1268+ [](void *Data) -> llvm::Error {
1269+ GenericPluginTy &Plugin =
1270+ *reinterpret_cast <GenericPluginTy *>(Data);
1271+ Plugin.getRPCServer ().Thread ->finish ();
1272+ return Error::success ();
1273+ },
1274+ &Device.Plugin ))
1275+ return Err;
1276+ }
1277+ return Plugin::success ();
12691278 }
12701279
12711280 // / Push an asynchronous memory copy between pinned memory buffers.
@@ -1475,8 +1484,8 @@ struct AMDGPUStreamTy {
14751484 return Plugin::success ();
14761485
14771486 // Wait until all previous operations on the stream have completed.
1478- if (auto Err = Slots[ last ()]. Signal -> wait (StreamBusyWaitMicroseconds,
1479- RPCServer , &Device))
1487+ if (auto Err =
1488+ Slots[ last ()]. Signal -> wait (StreamBusyWaitMicroseconds , &Device))
14801489 return Err;
14811490
14821491 // Reset the stream and perform all pending post actions.
@@ -3025,7 +3034,7 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
30253034 : Agent(Device.getAgent()), Queue(nullptr ),
30263035 SignalManager (Device.getSignalManager()), Device(Device),
30273036 // Initialize the std::deque with some empty positions.
3028- Slots(32 ), NextSlot(0 ), SyncCycle(0 ), RPCServer( nullptr ),
3037+ Slots(32 ), NextSlot(0 ), SyncCycle(0 ),
30293038 StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
30303039 UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
30313040
@@ -3378,10 +3387,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
33783387 if (auto Err = AMDGPUDevice.getStream (AsyncInfoWrapper, Stream))
33793388 return Err;
33803389
3381- // If this kernel requires an RPC server we attach its pointer to the stream.
3382- if (GenericDevice.getRPCServer ())
3383- Stream->setRPCServer (GenericDevice.getRPCServer ());
3384-
33853390 // Only COV5 implicitargs needs to be set. COV4 implicitargs are not used.
33863391 if (ImplArgs &&
33873392 getImplicitArgsSize () == sizeof (hsa_utils::AMDGPUImplicitArgsTy)) {
0 commit comments