Skip to content

Commit 0b18d2d

Browse files
authored
[Offload] Implement olMemFill (#154102)
Implement olMemFill to support filling device memory with arbitrary-length patterns. On AMDGPU only 4-byte patterns are supported for now; support for other pattern sizes will be added in a follow-up PR.
1 parent 3768ec3 commit 0b18d2d

File tree

11 files changed

+293
-0
lines changed

11 files changed

+293
-0
lines changed

offload/liboffload/API/Memory.td

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,23 @@ def olMemcpy : Function {
5959
];
6060
let returns = [];
6161
}
62+
63+
// olMemFill: fill a device allocation with repeated copies of a host-provided
// pattern. The queue is a required parameter: the implementation derives the
// target device from it and there is no separate device argument, so a null
// queue cannot be serviced.
def : Function {
  let name = "olMemFill";
  let desc = "Fill memory with copies of the given pattern";
  let details = [
    "Filling with patterns larger than 4 bytes may be less performant",
    "The destination pointer and queue must be associated with the same device",
    "The fill size must be a multiple of the pattern size",
  ];
  let params = [
    Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>,
    Param<"void*", "Ptr", "destination pointer to start filling at", PARAM_IN>,
    Param<"size_t", "PatternSize", "the size of the pattern in bytes", PARAM_IN>,
    Param<"const void*", "PatternPtr", "pointer to the pattern to fill with", PARAM_IN>,
    Param<"size_t", "FillSize", "number of bytes to fill", PARAM_IN>,
  ];
  // The zero check is listed first so that the modulo in the second condition
  // is never evaluated with a zero divisor.
  let returns = [
    Return<"OL_ERRC_INVALID_SIZE", ["`PatternSize == 0`", "`FillSize % PatternSize != 0`"]>
  ];
}

offload/liboffload/src/OffloadImpl.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -760,6 +760,12 @@ Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
760760
return Error::success();
761761
}
762762

763+
/// Implementation of olMemFill: forwards the fill request to the device that
/// owns \p Queue, enqueuing it on the queue's async info.
///
/// \p Queue is dereferenced unconditionally, so it must not be null here.
/// Note the argument order change: the device-level dataFill takes the pattern
/// pointer before the pattern size.
Error olMemFill_impl(ol_queue_handle_t Queue, void *Ptr, size_t PatternSize,
                     const void *PatternPtr, size_t FillSize) {
  auto &TargetDevice = *Queue->Device->Device;
  return TargetDevice.dataFill(Ptr, PatternPtr, PatternSize, FillSize,
                               Queue->AsyncInfo);
}
768+
763769
Error olCreateProgram_impl(ol_device_handle_t Device, const void *ProgData,
764770
size_t ProgDataSize, ol_program_handle_t *Program) {
765771
// Make a copy of the program binary in case it is released by the caller.

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2608,6 +2608,31 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
26082608
return Plugin::success();
26092609
}
26102610

2611+
/// Fill \p Size bytes of device memory at \p TgtPtr with repeated copies of
/// the \p PatternSize byte pattern read from \p PatternPtr.
///
/// Only 4-byte patterns are currently supported; any other pattern size
/// yields an UNSUPPORTED error.
///
/// \returns success if the fill completed, otherwise the synchronization or
/// HSA error.
Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
                   int64_t Size,
                   AsyncInfoWrapperTy &AsyncInfoWrapper) override {
  // TODO: Implement for AMDGPU. Most likely by doing the fill in pinned
  // memory and copying to the device in one go.
  if (PatternSize != 4)
    return Plugin::error(ErrorCode::UNSUPPORTED, "Unsupported fill size");

  // We can use hsa_amd_memory_fill for this size, but it's not async so the
  // queue needs to be synchronized first
  if (AsyncInfoWrapper.hasQueue())
    if (auto Err = synchronize(AsyncInfoWrapper))
      return Err;

  // The count passed to hsa_amd_memory_fill is the number of 32-bit values,
  // not the number of bytes.
  hsa_status_t Status = hsa_amd_memory_fill(
      TgtPtr, *static_cast<const uint32_t *>(PatternPtr), Size / PatternSize);

  // Every path must return a value: Plugin::check yields the error for a
  // failed status and is returned directly so the successful case no longer
  // falls off the end of the function.
  return Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n");
}
2635+
26112636
/// Initialize the async info for interoperability purposes.
26122637
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
26132638
// TODO: Implement this function.

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -960,6 +960,13 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
960960
void *DstPtr, int64_t Size,
961961
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
962962

963+
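/// Fill \p Size bytes of device memory starting at \p TgtPtr with repeated
/// copies of the \p PatternSize byte pattern read from the host at
/// \p PatternPtr.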
964+
Error dataFill(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
965+
int64_t Size, __tgt_async_info *AsyncInfo);
966+
virtual Error dataFillImpl(void *TgtPtr, const void *PatternPtr,
967+
int64_t PatternSize, int64_t Size,
968+
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
969+
963970
/// Run the kernel associated with \p EntryPtr
964971
Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
965972
KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);

offload/plugins-nextgen/common/src/PluginInterface.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1540,6 +1540,16 @@ Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
15401540
return Err;
15411541
}
15421542

1543+
/// Generic entry point for pattern fills: wraps \p AsyncInfo, dispatches to
/// the plugin-specific dataFillImpl, hands the outcome to the wrapper's
/// finalize step and then returns it to the caller.
Error GenericDeviceTy::dataFill(void *TgtPtr, const void *PatternPtr,
                                int64_t PatternSize, int64_t Size,
                                __tgt_async_info *AsyncInfo) {
  AsyncInfoWrapperTy Wrapper(*this, AsyncInfo);

  Error FillErr = dataFillImpl(TgtPtr, PatternPtr, PatternSize, Size, Wrapper);

  // finalize must see the result of the fill before it is returned.
  Wrapper.finalize(FillErr);
  return FillErr;
}
1552+
15431553
Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
15441554
ptrdiff_t *ArgOffsets,
15451555
KernelArgsTy &KernelArgs,

offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,13 @@ DLWRAP(cuMemcpyDtoHAsync, 4)
5353
DLWRAP(cuMemcpyHtoD, 3)
5454
DLWRAP(cuMemcpyHtoDAsync, 4)
5555

56+
// DLWRAP entries for the asynchronous memset functions declared in cuda.h:
// the linear 8/16/32-bit variants followed by the 2D (pitched) variants. The
// numeric argument matches the parameter count of each declaration (4 for the
// linear fills, 6 for the 2D fills).
DLWRAP(cuMemsetD8Async, 4)
57+
DLWRAP(cuMemsetD16Async, 4)
58+
DLWRAP(cuMemsetD32Async, 4)
59+
DLWRAP(cuMemsetD2D8Async, 6)
60+
DLWRAP(cuMemsetD2D16Async, 6)
61+
DLWRAP(cuMemsetD2D32Async, 6)
62+
5663
DLWRAP(cuMemFree, 1)
5764
DLWRAP(cuMemFreeHost, 1)
5865
DLWRAP(cuMemFreeAsync, 2)

offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,16 @@ CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream);
322322
CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t);
323323
CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);
324324

325+
// Asynchronous memset entry points. The value parameter type follows the CUDA
// Driver API: unsigned char for the 8-bit variants, unsigned short for the
// 16-bit variants and unsigned int for the 32-bit variants.
//
// Linear fills: (destination, value, element count, stream).
CUresult cuMemsetD8Async(CUdeviceptr, unsigned char, size_t, CUstream);
CUresult cuMemsetD16Async(CUdeviceptr, unsigned short, size_t, CUstream);
CUresult cuMemsetD32Async(CUdeviceptr, unsigned int, size_t, CUstream);
// 2D fills: (destination, pitch in bytes, value, width in elements, height in
// rows, stream).
CUresult cuMemsetD2D8Async(CUdeviceptr, size_t, unsigned char, size_t, size_t,
                           CUstream);
CUresult cuMemsetD2D16Async(CUdeviceptr, size_t, unsigned short, size_t, size_t,
                            CUstream);
CUresult cuMemsetD2D32Async(CUdeviceptr, size_t, unsigned int, size_t, size_t,
                            CUstream);
334+
325335
CUresult cuMemFree(CUdeviceptr);
326336
CUresult cuMemFreeHost(void *);
327337
CUresult cuMemFreeAsync(CUdeviceptr, CUstream);

offload/plugins-nextgen/cuda/src/rtl.cpp

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -858,6 +858,64 @@ struct CUDADeviceTy : public GenericDeviceTy {
858858
void *DstPtr, int64_t Size,
859859
AsyncInfoWrapperTy &AsyncInfoWrapper) override;
860860

861+
/// Fill \p Size bytes of device memory at \p TgtPtr with repeated copies of
/// the \p PatternSize byte pattern read from host memory at \p PatternPtr.
/// All fills are enqueued on the stream obtained from \p AsyncInfoWrapper.
///
/// Patterns of 1, 2 or 4 bytes map to a single linear memset. Larger patterns
/// are written with one strided 2D memset per pattern element, using the
/// widest element (4, 2 or 1 bytes) that divides the pattern size.
///
/// NOTE(review): assumes PatternSize > 0 and that PatternPtr is suitably
/// aligned for the element width that is read through it - confirm that the
/// callers guarantee both.
///
/// \returns the first error reported by context setup, stream acquisition or
/// any of the memset calls; success otherwise.
Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
                   int64_t Size,
                   AsyncInfoWrapperTy &AsyncInfoWrapper) override {
  if (auto Err = setContext())
    return Err;

  CUstream Stream;
  if (auto Err = getStream(AsyncInfoWrapper, Stream))
    return Err;

  const CUdeviceptr Dst = (CUdeviceptr)TgtPtr;
  const size_t NumPatterns = Size / PatternSize;

  // Pattern sizes with a native memset width need only one linear fill.
  if (PatternSize == 1)
    return Plugin::check(
        cuMemsetD8Async(Dst, *static_cast<const uint8_t *>(PatternPtr),
                        NumPatterns, Stream),
        "error in cuMemset: %s");
  if (PatternSize == 2)
    return Plugin::check(
        cuMemsetD16Async(Dst, *static_cast<const uint16_t *>(PatternPtr),
                         NumPatterns, Stream),
        "error in cuMemset: %s");
  if (PatternSize == 4)
    return Plugin::check(
        cuMemsetD32Async(Dst, *static_cast<const uint32_t *>(PatternPtr),
                         NumPatterns, Stream),
        "error in cuMemset: %s");

  // For larger patterns we can do a series of strided fills to copy the
  // pattern efficiently: treat the destination as a 2D array whose rows are
  // one pattern long, and fill one element-wide column per step.
  const int64_t MemsetSize = PatternSize % 4 == 0   ? 4
                             : PatternSize % 2 == 0 ? 2
                                                    : 1;
  const int64_t NumberOfSteps = PatternSize / MemsetSize;
  // Consecutive rows start one full pattern apart.
  const size_t Pitch = NumberOfSteps * MemsetSize;
  const size_t Height = NumPatterns;

  for (int64_t Step = 0; Step < NumberOfSteps; ++Step) {
    const CUdeviceptr Column = Dst + Step * MemsetSize;

    CUresult Res;
    if (MemsetSize == 4)
      Res = cuMemsetD2D32Async(Column, Pitch,
                               static_cast<const uint32_t *>(PatternPtr)[Step],
                               1u, Height, Stream);
    else if (MemsetSize == 2)
      Res = cuMemsetD2D16Async(Column, Pitch,
                               static_cast<const uint16_t *>(PatternPtr)[Step],
                               1u, Height, Stream);
    else
      Res = cuMemsetD2D8Async(Column, Pitch,
                              static_cast<const uint8_t *>(PatternPtr)[Step],
                              1u, Height, Stream);

    // Check every step: a failure must not be masked by a later step that
    // happens to succeed.
    if (auto Err = Plugin::check(Res, "error in cuMemset: %s"))
      return Err;
  }

  return Plugin::success();
}
918+
861919
/// Initialize the async info for interoperability purposes.
862920
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
863921
if (auto Err = setContext())

offload/plugins-nextgen/host/src/rtl.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,21 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
310310
return Plugin::success();
311311
}
312312

313+
/// Fill \p Size bytes of host memory at \p TgtPtr with repeated copies of the
/// \p PatternSize byte pattern at \p PatternPtr. The fill is performed
/// immediately on the calling thread; \p AsyncInfoWrapper is not used.
///
/// Only whole copies of the pattern are written, so nothing is stored past
/// \p Size bytes even if \p Size is not a multiple of \p PatternSize.
///
/// NOTE(review): assumes PatternSize > 0 - a zero pattern size would never
/// advance the copy loop; confirm that callers reject it.
Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
                   int64_t Size,
                   AsyncInfoWrapperTy &AsyncInfoWrapper) override {
  // Single-byte patterns are a plain memset.
  if (PatternSize == 1) {
    std::memset(TgtPtr, *static_cast<const char *>(PatternPtr), Size);
    return Plugin::success();
  }

  // Use a 64-bit offset: Size is 64-bit, and a narrower counter would wrap
  // around for fills of 4 GiB or more.
  auto *Dst = static_cast<char *>(TgtPtr);
  for (int64_t Offset = 0; Offset + PatternSize <= Size; Offset += PatternSize)
    std::memcpy(Dst + Offset, PatternPtr, PatternSize);

  return Plugin::success();
}
327+
313328
/// All functions are already synchronous. No need to do anything on this
314329
/// synchronization function.
315330
Error synchronizeImpl(__tgt_async_info &AsyncInfo,

offload/unittests/OffloadAPI/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ add_offload_unittest("kernel"
2525

2626
add_offload_unittest("memory"
2727
memory/olMemAlloc.cpp
28+
memory/olMemFill.cpp
2829
memory/olMemFree.cpp
2930
memory/olMemcpy.cpp)
3031

0 commit comments

Comments
 (0)