@@ -1857,6 +1857,10 @@ struct AMDGPUStreamTy {
18571857 // / Use synchronous copy back.
18581858 bool UseSyncCopyBack;
18591859
1860+ // / When copying data from one host buffer to another, only do it
1861+ // / asynchronously if `OMPX_MinHostToHostAsyncCopySize <= CopySize` (bytes).
1862+ UInt32Envar OMPX_MinHostToHostAsyncCopySize;
1863+
18601864 // / Arguments for the callback function.
18611865 PostKernelRunProcessingArgsTy PostKernelRunProcessingArgs;
18621866
@@ -2281,6 +2285,14 @@ struct AMDGPUStreamTy {
22812285 return Err;
22822286 }
22832287
2288+ if (CopySize < OMPX_MinHostToHostAsyncCopySize) {
2289+ if (auto Err =
2290+ OutputSignals[0 ]->wait (StreamBusyWaitMicroseconds, &Device))
2291+ return Err;
2292+ std::memcpy (Dst, Inter, CopySize);
2293+ return Error::success ();
2294+ }
2295+
22842296 // Consume another stream slot and compute dependencies.
22852297 std::tie (Curr, InputSignal) = consume (OutputSignals[1 ]);
22862298 assert (InputSignal && " Invalid input signal" );
@@ -4679,7 +4691,9 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
46794691 Slots(32 ), NextSlot(0 ), SyncCycle(0 ),
46804692 StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
46814693 UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()),
4682- UseSyncCopyBack(Device.syncCopyBack()) {}
4694+ UseSyncCopyBack(Device.syncCopyBack()),
4695+ OMPX_MinHostToHostAsyncCopySize(
4696+ " LIBOMPTARGET_AMDGPU_MIN_HOST_TO_HOST_ASYNC_COPY_SIZE" , 2048 ) {}
46834697
46844698// / Class implementing the AMDGPU-specific functionalities of the global
46854699// / handler.
0 commit comments