@@ -626,9 +626,9 @@ struct AMDGPUSignalTy {
626626 }
627627
628628 // / Wait until the signal gets a zero value.
629- Error wait (const uint64_t ActiveTimeout = 0 , RPCServerTy *RPCServer = nullptr ,
629+ Error wait (const uint64_t ActiveTimeout = 0 ,
630630 GenericDeviceTy *Device = nullptr ) const {
631- if (ActiveTimeout && !RPCServer ) {
631+ if (ActiveTimeout) {
632632 hsa_signal_value_t Got = 1 ;
633633 Got = hsa_signal_wait_scacquire (HSASignal, HSA_SIGNAL_CONDITION_EQ, 0 ,
634634 ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
@@ -637,14 +637,11 @@ struct AMDGPUSignalTy {
637637 }
638638
639639 // If there is an RPC device attached to this stream we run it as a server.
640- uint64_t Timeout = RPCServer ? 8192 : UINT64_MAX;
641- auto WaitState = RPCServer ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED;
640+ uint64_t Timeout = UINT64_MAX;
641+ auto WaitState = HSA_WAIT_STATE_BLOCKED;
642642 while (hsa_signal_wait_scacquire (HSASignal, HSA_SIGNAL_CONDITION_EQ, 0 ,
643- Timeout, WaitState) != 0 ) {
644- if (RPCServer && Device)
645- if (auto Err = RPCServer->runServer (*Device))
646- return Err;
647- }
643+ Timeout, WaitState) != 0 )
644+ ;
648645 return Plugin::success ();
649646 }
650647
@@ -1052,11 +1049,6 @@ struct AMDGPUStreamTy {
10521049 // / operation that was already finalized in a previous stream synchronize.
10531050 uint32_t SyncCycle;
10541051
1055- // / A pointer associated with an RPC server running on the given device. If
1056- // / RPC is not being used this will be a null pointer. Otherwise, this
1057- // / indicates that an RPC server is expected to be run on this stream.
1058- RPCServerTy *RPCServer;
1059-
10601052 // / Mutex to protect stream's management.
10611053 mutable std::mutex Mutex;
10621054
@@ -1236,9 +1228,6 @@ struct AMDGPUStreamTy {
12361228 // / Deinitialize the stream's signals.
12371229 Error deinit () { return Plugin::success (); }
12381230
1239- // / Attach an RPC server to this stream.
1240- void setRPCServer (RPCServerTy *Server) { RPCServer = Server; }
1241-
12421231 // / Push an asynchronous kernel to the stream. The kernel arguments must be
12431232 // / placed in a special allocation for kernel args and must keep alive until
12441233 // / the kernel finalizes. Once the kernel is finished, the stream will release
@@ -1266,10 +1255,30 @@ struct AMDGPUStreamTy {
12661255 if (auto Err = Slots[Curr].schedReleaseBuffer (KernelArgs, MemoryManager))
12671256 return Err;
12681257
1258+ // If we are running an RPC server we want to wake up the server thread
1259+ // whenever there is a kernel running and let it sleep otherwise.
1260+ if (Device.getRPCServer ())
1261+ Device.Plugin .getRPCServer ().Thread ->notify ();
1262+
12691263 // Push the kernel with the output signal and an input signal (optional)
1270- return Queue->pushKernelLaunch (Kernel, KernelArgs, NumThreads, NumBlocks,
1271- GroupSize, StackSize, OutputSignal,
1272- InputSignal);
1264+ if (auto Err = Queue->pushKernelLaunch (Kernel, KernelArgs, NumThreads,
1265+ NumBlocks, GroupSize, StackSize,
1266+ OutputSignal, InputSignal))
1267+ return Err;
1268+
1269+ // Register a callback to indicate when the kernel is complete.
1270+ if (Device.getRPCServer ()) {
1271+ if (auto Err = Slots[Curr].schedCallback (
1272+ [](void *Data) -> llvm::Error {
1273+ GenericPluginTy &Plugin =
1274+ *reinterpret_cast <GenericPluginTy *>(Data);
1275+ Plugin.getRPCServer ().Thread ->finish ();
1276+ return Error::success ();
1277+ },
1278+ &Device.Plugin ))
1279+ return Err;
1280+ }
1281+ return Plugin::success ();
12731282 }
12741283
12751284 // / Push an asynchronous memory copy between pinned memory buffers.
@@ -1479,8 +1488,8 @@ struct AMDGPUStreamTy {
14791488 return Plugin::success ();
14801489
14811490 // Wait until all previous operations on the stream have completed.
1482- if (auto Err = Slots[ last ()]. Signal -> wait (StreamBusyWaitMicroseconds,
1483- RPCServer , &Device))
1491+ if (auto Err =
1492+ Slots[ last ()]. Signal -> wait (StreamBusyWaitMicroseconds , &Device))
14841493 return Err;
14851494
14861495 // Reset the stream and perform all pending post actions.
@@ -3024,7 +3033,7 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
30243033 : Agent(Device.getAgent()), Queue(nullptr ),
30253034 SignalManager (Device.getSignalManager()), Device(Device),
30263035 // Initialize the std::deque with some empty positions.
3027- Slots(32 ), NextSlot(0 ), SyncCycle(0 ), RPCServer( nullptr ),
3036+ Slots(32 ), NextSlot(0 ), SyncCycle(0 ),
30283037 StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
30293038 UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
30303039
@@ -3377,10 +3386,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
33773386 if (auto Err = AMDGPUDevice.getStream (AsyncInfoWrapper, Stream))
33783387 return Err;
33793388
3380- // If this kernel requires an RPC server we attach its pointer to the stream.
3381- if (GenericDevice.getRPCServer ())
3382- Stream->setRPCServer (GenericDevice.getRPCServer ());
3383-
33843389 // Only COV5 implicitargs needs to be set. COV4 implicitargs are not used.
33853390 if (ImplArgs &&
33863391 getImplicitArgsSize () == sizeof (hsa_utils::AMDGPUImplicitArgsTy)) {
0 commit comments