Skip to content

Commit 3037f5d

Browse files
macurtis-amd authored and ronlieb committed
[offload][amdgpu] Do small host mem copies synchronously
1 parent dbd8049 commit 3037f5d

File tree

1 file changed

+15
-1
lines changed
  • offload/plugins-nextgen/amdgpu/src

1 file changed

+15
-1
lines changed

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1904,6 +1904,10 @@ struct AMDGPUStreamTy {
19041904
/// Use synchronous copy back.
19051905
bool UseSyncCopyBack;
19061906

1907+
/// When copying data from one host buffer to another, only do it
1908+
/// asynchronously if `MinHostToHostAsyncCopySize <= size`.
1909+
UInt32Envar OMPX_MinHostToHostAsyncCopySize;
1910+
19071911
/// Arguments for the callback function.
19081912
PostKernelRunProcessingArgsTy PostKernelRunProcessingArgs;
19091913

@@ -2306,6 +2310,14 @@ struct AMDGPUStreamTy {
23062310
return Err;
23072311
}
23082312

2313+
if (CopySize < OMPX_MinHostToHostAsyncCopySize) {
2314+
if (auto Err =
2315+
OutputSignals[0]->wait(StreamBusyWaitMicroseconds, &Device))
2316+
return Err;
2317+
std::memcpy(Dst, Inter, CopySize);
2318+
return Error::success();
2319+
}
2320+
23092321
// Consume another stream slot and compute dependencies.
23102322
std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
23112323
assert(InputSignal && "Invalid input signal");
@@ -4713,7 +4725,9 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
47134725
Slots(32), NextSlot(0), SyncCycle(0),
47144726
StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
47154727
UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()),
4716-
UseSyncCopyBack(Device.syncCopyBack()) {}
4728+
UseSyncCopyBack(Device.syncCopyBack()),
4729+
OMPX_MinHostToHostAsyncCopySize(
4730+
"LIBOMPTARGET_AMDGPU_MIN_HOST_TO_HOST_ASYNC_COPY_SIZE", 2048) {}
47174731

47184732
/// Class implementing the AMDGPU-specific functionalities of the global
47194733
/// handler.

0 commit comments

Comments
 (0)