From b85804f95598d77f960a7cead3ede93ab9bb6687 Mon Sep 17 00:00:00 2001 From: Alex Duran Date: Fri, 15 Aug 2025 04:47:14 -0700 Subject: [PATCH 1/2] [Offload] Introduce `dataFence` plugin interface. The purpose of this fence is to ensure that any `dataSubmit`s inserted into a queue before a `dataFence` finish before finish before any `dataSubmit`s inserted after it begin. This is a no-op for most queues, since they are in-order, and by design any operations inserted into them occur in order. But the interface is supposed to be functional for out-of-order queues. The addition of the interface means that any operations that rely on such ordering (like ATTACH map-type support in #149036) can invoke it, without worrying about whether the underlying queue is in-order or out-of-order. Once a plugin supports out-of-order queues, the plugins can implement this function, without requiring any change at the libomptarget level. --- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 4 ++++ .../plugins-nextgen/common/include/PluginInterface.h | 8 ++++++++ .../plugins-nextgen/common/src/PluginInterface.cpp | 12 ++++++++++++ offload/plugins-nextgen/cuda/src/rtl.cpp | 4 ++++ offload/plugins-nextgen/host/src/rtl.cpp | 4 ++++ 5 files changed, 32 insertions(+) diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 796182075ff3d..9232f79e9c576 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2537,6 +2537,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { getAgent(), (uint64_t)Size); } + Error dataFence(__tgt_async_info *Async) override { + return Plugin::success(); + } + /// Initialize the async info for interoperability purposes. Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { // TODO: Implement this function. diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index c9ab34b024b77..1b110f6b74553 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -944,6 +944,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy { virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + /// Instert a data fence between previous data operations and the following + /// operations if necessary for the device + virtual Error dataFence(__tgt_async_info *AsyncInfo) = 0; + /// Exchange data between devices (device to device transfer). Calling this /// function is only valid if GenericPlugin::isDataExchangable() passing the /// two devices returns true. @@ -1448,6 +1452,10 @@ struct GenericPluginTy { int DstDeviceId, void *DstPtr, int64_t Size, __tgt_async_info *AsyncInfo); + /// Places a fence between previous data movements and following data + /// movements if necessary on the device + int32_t data_fence(int32_t DeviceId, __tgt_async_info *AsyncInfo); + /// Begin executing a kernel on the given device. int32_t launch_kernel(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, KernelArgsTy *KernelArgs, diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 083d41659a469..c99871452efe9 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -2324,3 +2324,15 @@ int32_t GenericPluginTy::async_barrier(omp_interop_val_t *Interop) { } return OFFLOAD_SUCCESS; } + +int32_t GenericPluginTy::data_fence(int32_t DeviceId, + __tgt_async_info *AsyncInfo) { + auto Err = getDevice(DeviceId).dataFence(AsyncInfo); + if (Err) { + REPORT("Failure to place data fence on device %d: %s\n", DeviceId, + toString(std::move(Err)).data()); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; +} diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index 82c9f9b706cbd..426db75be790b 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -856,6 +856,10 @@ struct CUDADeviceTy : public GenericDeviceTy { return Plugin::success(); } + Error dataFence(__tgt_async_info *Async) override { + return Plugin::success(); + } + /// Initialize the device info for interoperability purposes. Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override { assert(Context && "Context is null"); diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index ed5213531999d..8b61179223b7c 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -295,6 +295,10 @@ struct GenELF64DeviceTy : public GenericDeviceTy { "dataExchangeImpl not supported"); } + Error dataFence(__tgt_async_info *Async) override { + return Plugin::success(); + } + /// All functions are already synchronous. No need to do anything on this /// synchronization function. Error synchronizeImpl(__tgt_async_info &AsyncInfo, From 2a043b294a026817cdb42f10bc52d328d9eeca01 Mon Sep 17 00:00:00 2001 From: Abhinav Gaba Date: Fri, 15 Aug 2025 05:08:52 -0700 Subject: [PATCH 2/2] Add some comments, change casing of error message. --- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 3 +++ offload/plugins-nextgen/common/src/PluginInterface.cpp | 2 +- offload/plugins-nextgen/cuda/src/rtl.cpp | 3 +++ offload/plugins-nextgen/host/src/rtl.cpp | 3 +++ 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 9232f79e9c576..eab0fda9ff0d0 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2537,6 +2537,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { getAgent(), (uint64_t)Size); } + /// Insert a data fence between previous data operations and the following + /// operations. This is a no-op for AMDGPU devices as operations inserted into + /// a queue are in-order. Error dataFence(__tgt_async_info *Async) override { return Plugin::success(); } diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index c99871452efe9..5a7d8e936313f 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -2329,7 +2329,7 @@ int32_t GenericPluginTy::data_fence(int32_t DeviceId, __tgt_async_info *AsyncInfo) { auto Err = getDevice(DeviceId).dataFence(AsyncInfo); if (Err) { - REPORT("Failure to place data fence on device %d: %s\n", DeviceId, + REPORT("failure to place data fence on device %d: %s\n", DeviceId, toString(std::move(Err)).data()); return OFFLOAD_FAIL; } diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index 426db75be790b..f3b2229daf372 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -856,6 +856,9 @@ struct CUDADeviceTy : public GenericDeviceTy { return Plugin::success(); } + /// Insert a data fence between previous data operations and the following + /// operations. This is a no-op for CUDA devices as operations inserted into + /// a queue are in-order. Error dataFence(__tgt_async_info *Async) override { return Plugin::success(); } diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index 8b61179223b7c..b88435aa9e02b 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -295,6 +295,9 @@ struct GenELF64DeviceTy : public GenericDeviceTy { "dataExchangeImpl not supported"); } + /// Insert a data fence between previous data operations and the following + /// operations. This is a no-op for Host devices as operations inserted into + /// a queue are in-order. Error dataFence(__tgt_async_info *Async) override { return Plugin::success(); }