[Offload] Implement olMemFill (#154102)

callumfare · web-flow · commit 0b18d2da7009 · 2025-08-22T14:31:16.000+01:00
Implement olMemFill to support filling device memory with arbitrary
length patterns. AMDGPU support will be added in a follow-up PR.
diff --git a/offload/liboffload/API/Memory.td b/offload/liboffload/API/Memory.td
@@ -59,3 +59,23 @@ def olMemcpy : Function {
     ];
     let returns = [];
 }
+
+def : Function {
+  let name = "olMemFill";
+  let desc = "Fill memory with copies of the given pattern";
+  let details = [
+    "Filling with patterns larger than 4 bytes may be less performant",
+    "The destination pointer and queue must be associated with the same device",
+    "The fill size must be a multiple of the pattern size",
+  ];
+  let params = [
+      Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN_OPTIONAL>,
+      Param<"void*", "Ptr", "destination pointer to start filling at", PARAM_IN>,
+      Param<"size_t", "PatternSize", "the size of the pattern in bytes", PARAM_IN>,
+      Param<"const void*", "PatternPtr", "", PARAM_IN>,
+      Param<"size_t", "FillSize", "number of bytes to fill", PARAM_IN>,
+  ];
+  let returns = [
+    Return<"OL_ERRC_INVALID_SIZE", ["`FillSize % PatternSize != 0`"]>
+  ];
+}
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
@@ -760,6 +760,12 @@ Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
   return Error::success();
 }
 
+Error olMemFill_impl(ol_queue_handle_t Queue, void *Ptr, size_t PatternSize,
+                     const void *PatternPtr, size_t FillSize) {
+  return Queue->Device->Device->dataFill(Ptr, PatternPtr, PatternSize, FillSize,
+                                         Queue->AsyncInfo);
+}
+
 Error olCreateProgram_impl(ol_device_handle_t Device, const void *ProgData,
                            size_t ProgDataSize, ol_program_handle_t *Program) {
   // Make a copy of the program binary in case it is released by the caller.
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2608,6 +2608,31 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return Plugin::success();
   }
 
+  Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+                     int64_t Size,
+                     AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    hsa_status_t Status;
+
+    // We can use hsa_amd_memory_fill for this size, but it's not async so the
+    // queue needs to be synchronized first
+    if (PatternSize == 4) {
+      if (AsyncInfoWrapper.hasQueue())
+        if (auto Err = synchronize(AsyncInfoWrapper))
+          return Err;
+      Status = hsa_amd_memory_fill(TgtPtr,
+                                   *static_cast<const uint32_t *>(PatternPtr),
+                                   Size / PatternSize);
+
+      if (auto Err =
+              Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n"))
+        return Err;
+    } else {
+      // TODO: Implement for AMDGPU. Most likely by doing the fill in pinned
+      // memory and copying to the device in one go.
+      return Plugin::error(ErrorCode::UNSUPPORTED, "Unsupported fill size");
+    }
+  }
+
   /// Initialize the async info for interoperability purposes.
   Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
     // TODO: Implement this function.
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -960,6 +960,13 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
                                  void *DstPtr, int64_t Size,
                                  AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
 
+  /// Fill data on the device with a pattern from the host
+  Error dataFill(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+                 int64_t Size, __tgt_async_info *AsyncInfo);
+  virtual Error dataFillImpl(void *TgtPtr, const void *PatternPtr,
+                             int64_t PatternSize, int64_t Size,
+                             AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
   /// Run the kernel associated with \p EntryPtr
   Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
                      KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1540,6 +1540,16 @@ Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
   return Err;
 }
 
+Error GenericDeviceTy::dataFill(void *TgtPtr, const void *PatternPtr,
+                                int64_t PatternSize, int64_t Size,
+                                __tgt_async_info *AsyncInfo) {
+  AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
+  auto Err =
+      dataFillImpl(TgtPtr, PatternPtr, PatternSize, Size, AsyncInfoWrapper);
+  AsyncInfoWrapper.finalize(Err);
+  return Err;
+}
+
 Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
                                     ptrdiff_t *ArgOffsets,
                                     KernelArgsTy &KernelArgs,
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -53,6 +53,13 @@ DLWRAP(cuMemcpyDtoHAsync, 4)
 DLWRAP(cuMemcpyHtoD, 3)
 DLWRAP(cuMemcpyHtoDAsync, 4)
 
+DLWRAP(cuMemsetD8Async, 4)
+DLWRAP(cuMemsetD16Async, 4)
+DLWRAP(cuMemsetD32Async, 4)
+DLWRAP(cuMemsetD2D8Async, 6)
+DLWRAP(cuMemsetD2D16Async, 6)
+DLWRAP(cuMemsetD2D32Async, 6)
+
 DLWRAP(cuMemFree, 1)
 DLWRAP(cuMemFreeHost, 1)
 DLWRAP(cuMemFreeAsync, 2)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -322,6 +322,16 @@ CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream);
 CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t);
 CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);
 
+CUresult cuMemsetD8Async(CUdeviceptr, unsigned int, size_t, CUstream);
+CUresult cuMemsetD16Async(CUdeviceptr, unsigned int, size_t, CUstream);
+CUresult cuMemsetD32Async(CUdeviceptr, unsigned int, size_t, CUstream);
+CUresult cuMemsetD2D8Async(CUdeviceptr, size_t, unsigned int, size_t, size_t,
+                           CUstream);
+CUresult cuMemsetD2D16Async(CUdeviceptr, size_t, unsigned int, size_t, size_t,
+                            CUstream);
+CUresult cuMemsetD2D32Async(CUdeviceptr, size_t, unsigned int, size_t, size_t,
+                            CUstream);
+
 CUresult cuMemFree(CUdeviceptr);
 CUresult cuMemFreeHost(void *);
 CUresult cuMemFreeAsync(CUdeviceptr, CUstream);
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -858,6 +858,64 @@ struct CUDADeviceTy : public GenericDeviceTy {
                          void *DstPtr, int64_t Size,
                          AsyncInfoWrapperTy &AsyncInfoWrapper) override;
 
+  Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+                     int64_t Size,
+                     AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    if (auto Err = setContext())
+      return Err;
+
+    CUstream Stream;
+    if (auto Err = getStream(AsyncInfoWrapper, Stream))
+      return Err;
+
+    CUresult Res;
+    size_t N = Size / PatternSize;
+    if (PatternSize == 1) {
+      Res = cuMemsetD8Async((CUdeviceptr)TgtPtr,
+                            *(static_cast<const uint8_t *>(PatternPtr)), N,
+                            Stream);
+    } else if (PatternSize == 2) {
+      Res = cuMemsetD16Async((CUdeviceptr)TgtPtr,
+                             *(static_cast<const uint16_t *>(PatternPtr)), N,
+                             Stream);
+    } else if (PatternSize == 4) {
+      Res = cuMemsetD32Async((CUdeviceptr)TgtPtr,
+                             *(static_cast<const uint32_t *>(PatternPtr)), N,
+                             Stream);
+    } else {
+      // For larger patterns we can do a series of strided fills to copy the
+      // pattern efficiently
+      int64_t MemsetSize = PatternSize % 4u == 0u   ? 4u
+                           : PatternSize % 2u == 0u ? 2u
+                                                    : 1u;
+
+      int64_t NumberOfSteps = PatternSize / MemsetSize;
+      int64_t Pitch = NumberOfSteps * MemsetSize;
+      int64_t Height = Size / PatternSize;
+
+      for (auto Step = 0u; Step < NumberOfSteps; ++Step) {
+        if (MemsetSize == 4) {
+          Res = cuMemsetD2D32Async(
+              (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch,
+              *(static_cast<const uint32_t *>(PatternPtr) + Step), 1u, Height,
+              Stream);
+        } else if (MemsetSize == 2) {
+          Res = cuMemsetD2D16Async(
+              (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch,
+              *(static_cast<const uint16_t *>(PatternPtr) + Step), 1u, Height,
+              Stream);
+        } else {
+          Res = cuMemsetD2D8Async(
+              (CUdeviceptr)TgtPtr + Step * MemsetSize, Pitch,
+              *(static_cast<const uint8_t *>(PatternPtr) + Step), 1u, Height,
+              Stream);
+        }
+      }
+    }
+
+    return Plugin::check(Res, "error in cuMemset: %s");
+  }
+
   /// Initialize the async info for interoperability purposes.
   Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
     if (auto Err = setContext())
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -310,6 +310,21 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
     return Plugin::success();
   }
 
+  Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
+                     int64_t Size,
+                     AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    if (PatternSize == 1) {
+      std::memset(TgtPtr, *static_cast<const char *>(PatternPtr), Size);
+    } else {
+      for (unsigned int Step = 0; Step < Size; Step += PatternSize) {
+        auto *Dst = static_cast<char *>(TgtPtr) + Step;
+        std::memcpy(Dst, PatternPtr, PatternSize);
+      }
+    }
+
+    return Plugin::success();
+  }
+
   /// All functions are already synchronous. No need to do anything on this
   /// synchronization function.
   Error synchronizeImpl(__tgt_async_info &AsyncInfo,
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -25,6 +25,7 @@ add_offload_unittest("kernel"
 
 add_offload_unittest("memory"
     memory/olMemAlloc.cpp
+    memory/olMemFill.cpp
     memory/olMemFree.cpp
     memory/olMemcpy.cpp)
 
diff --git a/offload/unittests/OffloadAPI/memory/olMemFill.cpp b/offload/unittests/OffloadAPI/memory/olMemFill.cpp
@@ -0,0 +1,134 @@
+//===------- Offload API tests - olMemFill --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olMemFillTest = OffloadQueueTest;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olMemFillTest);
+
+TEST_P(olMemFillTest, Success8) {
+  constexpr size_t Size = 1024;
+  void *Alloc;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  uint8_t Pattern = 0x42;
+  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+  olSyncQueue(Queue);
+
+  size_t N = Size / sizeof(Pattern);
+  for (size_t i = 0; i < N; i++) {
+    uint8_t *AllocPtr = reinterpret_cast<uint8_t *>(Alloc);
+    ASSERT_EQ(AllocPtr[i], Pattern);
+  }
+
+  olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, Success16) {
+  constexpr size_t Size = 1024;
+  void *Alloc;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  uint16_t Pattern = 0x4242;
+  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+  olSyncQueue(Queue);
+
+  size_t N = Size / sizeof(Pattern);
+  for (size_t i = 0; i < N; i++) {
+    uint16_t *AllocPtr = reinterpret_cast<uint16_t *>(Alloc);
+    ASSERT_EQ(AllocPtr[i], Pattern);
+  }
+
+  olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, Success32) {
+  constexpr size_t Size = 1024;
+  void *Alloc;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  uint32_t Pattern = 0xDEADBEEF;
+  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+  olSyncQueue(Queue);
+
+  size_t N = Size / sizeof(Pattern);
+  for (size_t i = 0; i < N; i++) {
+    uint32_t *AllocPtr = reinterpret_cast<uint32_t *>(Alloc);
+    ASSERT_EQ(AllocPtr[i], Pattern);
+  }
+
+  olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, SuccessLarge) {
+  constexpr size_t Size = 1024;
+  void *Alloc;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  struct PatternT {
+    uint64_t A;
+    uint64_t B;
+  } Pattern{UINT64_MAX, UINT64_MAX};
+
+  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+  olSyncQueue(Queue);
+
+  size_t N = Size / sizeof(Pattern);
+  for (size_t i = 0; i < N; i++) {
+    PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
+    ASSERT_EQ(AllocPtr[i].A, UINT64_MAX);
+    ASSERT_EQ(AllocPtr[i].B, UINT64_MAX);
+  }
+
+  olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, SuccessLargeByteAligned) {
+  constexpr size_t Size = 17 * 64;
+  void *Alloc;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  struct __attribute__((packed)) PatternT {
+    uint64_t A;
+    uint64_t B;
+    uint8_t C;
+  } Pattern{UINT64_MAX, UINT64_MAX, 255};
+
+  ASSERT_SUCCESS(olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+  olSyncQueue(Queue);
+
+  size_t N = Size / sizeof(Pattern);
+  for (size_t i = 0; i < N; i++) {
+    PatternT *AllocPtr = reinterpret_cast<PatternT *>(Alloc);
+    ASSERT_EQ(AllocPtr[i].A, UINT64_MAX);
+    ASSERT_EQ(AllocPtr[i].B, UINT64_MAX);
+    ASSERT_EQ(AllocPtr[i].C, 255);
+  }
+
+  olMemFree(Alloc);
+}
+
+TEST_P(olMemFillTest, InvalidPatternSize) {
+  constexpr size_t Size = 1025;
+  void *Alloc;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, Size, &Alloc));
+
+  uint16_t Pattern = 0x4242;
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olMemFill(Queue, Alloc, sizeof(Pattern), &Pattern, Size));
+
+  olSyncQueue(Queue);
+  olMemFree(Alloc);
+}