
Commit 0d6ace8

[Offload] Full AMD support for olMemFill

1 parent 7dfd5ba
3 files changed: +212 −54 lines

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 90 additions & 25 deletions
@@ -924,6 +924,7 @@ struct AMDGPUStreamTy {
     void *Dst;
     const void *Src;
     size_t Size;
+    size_t NumTimes;
   };

   /// Utility struct holding arguments for freeing buffers to memory managers.
@@ -974,9 +975,14 @@ struct AMDGPUStreamTy {
     StreamSlotTy() : Signal(nullptr), Callbacks({}), ActionArgs({}) {}

     /// Schedule a host memory copy action on the slot.
-    Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) {
+    ///
+    /// NumTimes repeats the copy that many times, sequentially in the
+    /// destination buffer.
+    Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size,
+                              size_t NumTimes = 1) {
       Callbacks.emplace_back(memcpyAction);
-      ActionArgs.emplace_back().MemcpyArgs = MemcpyArgsTy{Dst, Src, Size};
+      ActionArgs.emplace_back().MemcpyArgs =
+          MemcpyArgsTy{Dst, Src, Size, NumTimes};
       return Plugin::success();
     }
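
Since NumTimes defaults to 1, existing callers of schedHostMemoryCopy (and of pushMemoryCopyH2DAsync below) keep their single-copy behaviour; only the new fill path passes a larger count, computed as Size / PatternSize in dataFillImpl further down.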

@@ -1216,7 +1222,12 @@ struct AMDGPUStreamTy {
     assert(Args->Dst && "Invalid destination buffer");
     assert(Args->Src && "Invalid source buffer");

-    std::memcpy(Args->Dst, Args->Src, Args->Size);
+    auto BasePtr = Args->Dst;
+    for (size_t I = 0; I < Args->NumTimes; I++) {
+      std::memcpy(BasePtr, Args->Src, Args->Size);
+      BasePtr = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(BasePtr) +
+                                         Args->Size);
+    }

     return Plugin::success();
   }
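
For reference, the loop above is a plain repeat-copy fill: write the Size-byte pattern once, then keep appending it NumTimes times. A minimal standalone sketch of the same idea (an illustrative helper, not part of the plugin; it uses a char * cursor instead of the uintptr_t round-trip):

#include <cassert>
#include <cstddef>
#include <cstring>

// Lay out NumTimes back-to-back copies of the Size-byte pattern at Src,
// starting at Dst. Dst must have room for Size * NumTimes bytes.
static void repeatCopy(void *Dst, const void *Src, size_t Size,
                       size_t NumTimes) {
  char *Cursor = static_cast<char *>(Dst);
  for (size_t I = 0; I < NumTimes; I++) {
    std::memcpy(Cursor, Src, Size);
    Cursor += Size;
  }
}

int main() {
  const unsigned char Pattern[2] = {0xAB, 0xCD};
  unsigned char Buffer[8];
  repeatCopy(Buffer, Pattern, sizeof(Pattern), 4);
  assert(Buffer[0] == 0xAB && Buffer[1] == 0xCD && Buffer[6] == 0xAB &&
         Buffer[7] == 0xCD);
  return 0;
}
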
@@ -1421,7 +1432,8 @@ struct AMDGPUStreamTy {
   /// manager once the operation completes.
   Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter,
                                uint64_t CopySize,
-                               AMDGPUMemoryManagerTy &MemoryManager) {
+                               AMDGPUMemoryManagerTy &MemoryManager,
+                               size_t NumTimes = 1) {
     // Retrieve available signals for the operation's outputs.
     AMDGPUSignalTy *OutputSignals[2] = {};
     if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
@@ -1443,7 +1455,8 @@ struct AMDGPUStreamTy {
     // The std::memcpy is done asynchronously using an async handler. We store
     // the function's information in the action but it is not actually a
     // post action.
-    if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize))
+    if (auto Err =
+            Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize, NumTimes))
       return Err;

     // Make changes on this slot visible to the async handler's thread.
@@ -1464,7 +1477,12 @@ struct AMDGPUStreamTy {
       std::tie(Curr, InputSignal) = consume(OutputSignal);
     } else {
       // All preceding operations completed, copy the memory synchronously.
-      std::memcpy(Inter, Src, CopySize);
+      auto *InterPtr = Inter;
+      for (size_t I = 0; I < NumTimes; I++) {
+        std::memcpy(InterPtr, Src, CopySize);
+        InterPtr = reinterpret_cast<void *>(
+            reinterpret_cast<uintptr_t>(InterPtr) + CopySize);
+      }

       // Return the second signal because it will not be used.
       OutputSignals[1]->decreaseUseCount();
@@ -1481,11 +1499,11 @@ struct AMDGPUStreamTy {
     if (InputSignal && InputSignal->load()) {
       hsa_signal_t InputSignalRaw = InputSignal->get();
       return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
-                                     Agent, CopySize, 1, &InputSignalRaw,
-                                     OutputSignal->get());
+                                     Agent, CopySize * NumTimes, 1,
+                                     &InputSignalRaw, OutputSignal->get());
     }
     return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
-                                   Agent, CopySize, 0, nullptr,
+                                   Agent, CopySize * NumTimes, 0, nullptr,
                                    OutputSignal->get());
   }
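
Taken together, the pushMemoryCopyH2DAsync changes let a single host-to-device copy double as a device fill: the host side lays the CopySize-byte pattern out NumTimes in the pinned intermediate buffer, and the device side then moves CopySize * NumTimes bytes in one asyncMemCopy. As a worked example (numbers chosen for illustration), an 8-byte pattern filling 4096 bytes gives NumTimes = 512: 512 small memcpys into the staging buffer, followed by one 4096-byte copy to the GPU.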

@@ -2611,26 +2629,73 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
                      int64_t Size,
                      AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    hsa_status_t Status;
+    // Fast case, where we can use the 4-byte hsa_amd_memory_fill
+    if (Size % 4 == 0 &&
+        (PatternSize == 4 || PatternSize == 2 || PatternSize == 1)) {
+      uint32_t Pattern;
+      if (PatternSize == 1) {
+        auto *Byte = reinterpret_cast<const uint8_t *>(PatternPtr);
+        Pattern = *Byte | *Byte << 8 | *Byte << 16 | *Byte << 24;
+      } else if (PatternSize == 2) {
+        auto *Word = reinterpret_cast<const uint16_t *>(PatternPtr);
+        Pattern = *Word | (*Word << 16);
+      } else if (PatternSize == 4) {
+        Pattern = *reinterpret_cast<const uint32_t *>(PatternPtr);
+      } else {
+        // Shouldn't be here if the pattern size is outwith those values
+        std::terminate();
+      }

-    // We can use hsa_amd_memory_fill for this size, but it's not async so the
-    // queue needs to be synchronized first
-    if (PatternSize == 4) {
-      if (AsyncInfoWrapper.hasQueue())
-        if (auto Err = synchronize(AsyncInfoWrapper))
+      if (hasPendingWorkImpl(AsyncInfoWrapper)) {
+        AMDGPUStreamTy *Stream = nullptr;
+        if (auto Err = getStream(AsyncInfoWrapper, Stream))
           return Err;
-      Status = hsa_amd_memory_fill(TgtPtr,
-                                   *static_cast<const uint32_t *>(PatternPtr),
-                                   Size / PatternSize);

-      if (auto Err =
-              Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n"))
-        return Err;
-    } else {
-      // TODO: Implement for AMDGPU. Most likely by doing the fill in pinned
-      // memory and copying to the device in one go.
-      return Plugin::error(ErrorCode::UNSUPPORTED, "Unsupported fill size");
+        struct MemFillArgsTy {
+          void *Dst;
+          uint32_t Pattern;
+          int64_t Size;
+        };
+        auto *Args = new MemFillArgsTy{TgtPtr, Pattern, Size / 4};
+        auto Fill = [](void *Data) {
+          MemFillArgsTy *Args = reinterpret_cast<MemFillArgsTy *>(Data);
+          assert(Args && "Invalid arguments");
+
+          auto Status =
+              hsa_amd_memory_fill(Args->Dst, Args->Pattern, Args->Size);
+          delete Args;
+          auto Err =
+              Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n");
+          if (Err) {
+            FATAL_MESSAGE(1, "error performing async fill: %s",
+                          toString(std::move(Err)).data());
+          }
+        };
+
+        // hsa_amd_memory_fill doesn't signal completion using a signal, so use
+        // the existing host callback logic to handle that instead
+        return Stream->pushHostCallback(Fill, Args);
+      } else {
+        // If there is no pending work, do the fill synchronously
+        auto Status = hsa_amd_memory_fill(TgtPtr, Pattern, Size / 4);
+        return Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n");
+      }
     }
+
+    // Slow case; allocate an appropriately sized pinned buffer and enqueue
+    // copies
+    void *PinnedPtr = nullptr;
+    AMDGPUMemoryManagerTy &PinnedMemoryManager =
+        HostDevice.getPinnedMemoryManager();
+    if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
+      return Err;
+
+    AMDGPUStreamTy *Stream = nullptr;
+    if (auto Err = getStream(AsyncInfoWrapper, Stream))
+      return Err;
+
+    return Stream->pushMemoryCopyH2DAsync(TgtPtr, PatternPtr, PinnedPtr,
+                                          PatternSize, PinnedMemoryManager,
+                                          Size / PatternSize);
   }

   /// Initialize the async info for interoperability purposes.
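
The fast path works because hsa_amd_memory_fill takes a single 32-bit value, which is why Size must be a multiple of 4 and why 1- and 2-byte patterns are widened by replication before the call. A standalone sketch of that widening (illustrative only; it mirrors the shifts in dataFillImpl above):

#include <cassert>
#include <cstdint>

// Replicate a 1- or 2-byte pattern into the 32-bit value expected by
// hsa_amd_memory_fill; 4-byte patterns are used as-is.
static uint32_t widenPattern(const void *PatternPtr, int64_t PatternSize) {
  if (PatternSize == 1) {
    uint32_t Byte = *static_cast<const uint8_t *>(PatternPtr);
    return Byte | Byte << 8 | Byte << 16 | Byte << 24;
  }
  if (PatternSize == 2) {
    uint32_t Word = *static_cast<const uint16_t *>(PatternPtr);
    return Word | (Word << 16);
  }
  return *static_cast<const uint32_t *>(PatternPtr);
}

int main() {
  uint8_t B = 0xAB;
  uint16_t W = 0xBEEF;
  assert(widenPattern(&B, 1) == 0xABABABABu);
  assert(widenPattern(&W, 2) == 0xBEEFBEEFu);
  return 0;
}

Any other pattern size (or a Size that is not a multiple of 4) falls through to the slow case, which replaces the old UNSUPPORTED error: the pattern is staged in pinned host memory and pushed to the device with the NumTimes-aware pushMemoryCopyH2DAsync shown earlier.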

offload/unittests/OffloadAPI/common/Fixtures.hpp

Lines changed: 34 additions & 0 deletions
@@ -89,6 +89,40 @@ template <typename Fn> inline void threadify(Fn body) {
   }
 }

+/// Enqueues a task to the queue that can be manually resolved.
+/// It will block until `trigger` is called.
+struct ManuallyTriggeredTask {
+  std::mutex M;
+  std::condition_variable CV;
+  bool Flag = false;
+  ol_event_handle_t CompleteEvent;
+
+  ol_result_t enqueue(ol_queue_handle_t Queue) {
+    if (auto Err = olLaunchHostFunction(
+            Queue,
+            [](void *That) {
+              static_cast<ManuallyTriggeredTask *>(That)->wait();
+            },
+            this))
+      return Err;
+
+    return olCreateEvent(Queue, &CompleteEvent);
+  }
+
+  void wait() {
+    std::unique_lock<std::mutex> lk(M);
+    CV.wait_for(lk, std::chrono::milliseconds(1000), [&] { return Flag; });
+    EXPECT_TRUE(Flag);
+  }
+
+  ol_result_t trigger() {
+    Flag = true;
+    CV.notify_one();
+
+    return olSyncEvent(CompleteEvent);
+  }
+};
+
 struct OffloadTest : ::testing::Test {
   ol_device_handle_t Host = TestEnvironment::getHostDevice();
 };
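
ManuallyTriggeredTask lets a test keep real work pending on a queue, which is what the hasPendingWorkImpl branch of the new dataFillImpl needs in order to be exercised. A rough sketch of how a fill test might use it; everything below is illustrative only: the olMemFillTest fixture, the olMemAlloc/olMemFill/olSyncQueue signatures, and the ASSERT_SUCCESS macro are assumptions rather than taken from this page (the real olMemFill tests are presumably in the third changed file, which is not shown here).

// Illustrative sketch only -- fixture name and API signatures are assumptions.
TEST_P(olMemFillTest, SuccessWithPendingWork) {
  // Block the queue so the fill is enqueued behind pending work.
  ManuallyTriggeredTask Blocker;
  ASSERT_SUCCESS(Blocker.enqueue(Queue));

  void *Ptr = nullptr;
  constexpr size_t Size = 1024;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, Size, &Ptr));

  uint32_t Pattern = 0xDEADBEEF;
  ASSERT_SUCCESS(olMemFill(Queue, Ptr, sizeof(Pattern), &Pattern, Size));

  // Release the blocker, then wait for the queue (and the fill) to finish.
  ASSERT_SUCCESS(Blocker.trigger());
  ASSERT_SUCCESS(olSyncQueue(Queue));

  ASSERT_SUCCESS(olMemFree(Ptr));
}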
