@@ -1904,6 +1904,10 @@ struct AMDGPUStreamTy {
1904
1904
/// Use synchronous copy back.
1905
1905
bool UseSyncCopyBack;
1906
1906
1907
+ /// When copying data from one host buffer to another, only do it
1908
+ /// asynchronously if `OMPX_MinHostToHostAsyncCopySize <= size`.
1909
+ UInt32Envar OMPX_MinHostToHostAsyncCopySize;
1910
+
1907
1911
/// Arguments for the callback function.
1908
1912
PostKernelRunProcessingArgsTy PostKernelRunProcessingArgs;
1909
1913
@@ -2306,6 +2310,14 @@ struct AMDGPUStreamTy {
2306
2310
return Err;
2307
2311
}
2308
2312
2313
+ if (CopySize < OMPX_MinHostToHostAsyncCopySize) {
2314
+ if (auto Err =
2315
+ OutputSignals[0 ]->wait (StreamBusyWaitMicroseconds, &Device))
2316
+ return Err;
2317
+ std::memcpy (Dst, Inter, CopySize);
2318
+ return Error::success ();
2319
+ }
2320
+
2309
2321
// Consume another stream slot and compute dependencies.
2310
2322
std::tie (Curr, InputSignal) = consume (OutputSignals[1 ]);
2311
2323
assert (InputSignal && " Invalid input signal" );
@@ -4713,7 +4725,9 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
4713
4725
Slots(32 ), NextSlot(0 ), SyncCycle(0 ),
4714
4726
StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
4715
4727
UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()),
4716
- UseSyncCopyBack(Device.syncCopyBack()) {}
4728
+ UseSyncCopyBack(Device.syncCopyBack()),
4729
+ OMPX_MinHostToHostAsyncCopySize(
4730
+ " LIBOMPTARGET_AMDGPU_MIN_HOST_TO_HOST_ASYNC_COPY_SIZE" , 2048 ) {}
4717
4731
4718
4732
/// Class implementing the AMDGPU-specific functionalities of the global
4719
4733
/// handler.
0 commit comments