@@ -924,6 +924,7 @@ struct AMDGPUStreamTy {
     void *Dst;
     const void *Src;
     size_t Size;
+    size_t NumTimes;
   };

   /// Utility struct holding arguments for freeing buffers to memory managers.
@@ -974,9 +975,14 @@ struct AMDGPUStreamTy {
     StreamSlotTy() : Signal(nullptr), Callbacks({}), ActionArgs({}) {}

     /// Schedule a host memory copy action on the slot.
-    Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) {
+    ///
+    /// NumTimes repeats the copy that many times, laying the copies out
+    /// sequentially in the destination buffer.
+    Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size,
+                              size_t NumTimes = 1) {
       Callbacks.emplace_back(memcpyAction);
-      ActionArgs.emplace_back().MemcpyArgs = MemcpyArgsTy{Dst, Src, Size};
+      ActionArgs.emplace_back().MemcpyArgs =
+          MemcpyArgsTy{Dst, Src, Size, NumTimes};
       return Plugin::success();
     }

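For example, scheduling with CopySize = 8 and NumTimes = 512 queues a single action that writes 512 consecutive 8-byte copies of the source, 4096 bytes in total, into the destination buffer; a standalone sketch of this tiling follows the memcpyAction hunk below.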
@@ -1216,7 +1222,12 @@ struct AMDGPUStreamTy {
     assert(Args->Dst && "Invalid destination buffer");
     assert(Args->Src && "Invalid source buffer");

-    std::memcpy(Args->Dst, Args->Src, Args->Size);
+    auto BasePtr = Args->Dst;
+    for (size_t I = 0; I < Args->NumTimes; I++) {
+      std::memcpy(BasePtr, Args->Src, Args->Size);
+      BasePtr = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(BasePtr) +
+                                         Args->Size);
+    }

     return Plugin::success();
   }
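As a hedged standalone sketch of what the new loop computes (the `tilePattern` helper name is hypothetical, not part of the patch), the action now tiles the source buffer back to back across the destination:

```cpp
#include <cassert>
#include <cstddef>
#include <cstring>

// Tile a Size-byte source pattern NumTimes into Dst, back to back.
// Dst must hold at least Size * NumTimes bytes.
static void tilePattern(void *Dst, const void *Src, size_t Size,
                        size_t NumTimes) {
  auto *Out = static_cast<char *>(Dst);
  for (size_t I = 0; I < NumTimes; ++I)
    std::memcpy(Out + I * Size, Src, Size);
}

int main() {
  const unsigned char Pattern[2] = {0x12, 0x34};
  unsigned char Buffer[8];
  tilePattern(Buffer, Pattern, sizeof(Pattern), 4);
  // The destination now holds the pattern repeated four times.
  for (size_t I = 0; I < sizeof(Buffer); ++I)
    assert(Buffer[I] == Pattern[I % 2]);
}
```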
@@ -1421,7 +1432,8 @@ struct AMDGPUStreamTy {
   /// manager once the operation completes.
   Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter,
                                uint64_t CopySize,
-                               AMDGPUMemoryManagerTy &MemoryManager) {
+                               AMDGPUMemoryManagerTy &MemoryManager,
+                               size_t NumTimes = 1) {
     // Retrieve available signals for the operation's outputs.
     AMDGPUSignalTy *OutputSignals[2] = {};
     if (auto Err = SignalManager.getResources(/*Num=*/2, OutputSignals))
@@ -1443,7 +1455,8 @@ struct AMDGPUStreamTy {
     // The std::memcpy is done asynchronously using an async handler. We store
     // the function's information in the action but it is not actually a
     // post action.
-    if (auto Err = Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize))
+    if (auto Err =
+            Slots[Curr].schedHostMemoryCopy(Inter, Src, CopySize, NumTimes))
       return Err;

     // Make changes on this slot visible to the async handler's thread.
@@ -1464,7 +1477,12 @@ struct AMDGPUStreamTy {
       std::tie(Curr, InputSignal) = consume(OutputSignal);
     } else {
       // All preceding operations completed, copy the memory synchronously.
-      std::memcpy(Inter, Src, CopySize);
+      auto *InterPtr = Inter;
+      for (size_t I = 0; I < NumTimes; I++) {
+        std::memcpy(InterPtr, Src, CopySize);
+        InterPtr = reinterpret_cast<void *>(
+            reinterpret_cast<uintptr_t>(InterPtr) + CopySize);
+      }

       // Return the second signal because it will not be used.
       OutputSignals[1]->decreaseUseCount();
@@ -1481,11 +1499,11 @@ struct AMDGPUStreamTy {
     if (InputSignal && InputSignal->load()) {
       hsa_signal_t InputSignalRaw = InputSignal->get();
       return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
-                                     Agent, CopySize, 1, &InputSignalRaw,
-                                     OutputSignal->get());
+                                     Agent, CopySize * NumTimes, 1,
+                                     &InputSignalRaw, OutputSignal->get());
     }
     return hsa_utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
-                                   Agent, CopySize, 0, nullptr,
+                                   Agent, CopySize * NumTimes, 0, nullptr,
                                    OutputSignal->get());
   }

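Since the intermediate buffer now holds the tiled pattern, the single device transfer moves `CopySize * NumTimes` bytes; tiling on the host keeps the device side to one DMA transfer rather than NumTimes small ones. A hedged caller-side fragment, mirroring the `dataFillImpl` change later in this diff (`Stream`, `PinnedMemoryManager`, `TgtPtr`, `PatternPtr`, `PatternSize`, and `Size` assumed in scope):

```cpp
// Stage Size bytes of pinned host memory, have the stream tile the
// PatternSize-byte pattern (Size / PatternSize) times into it, then issue
// one H2D copy of PatternSize * (Size / PatternSize) == Size bytes.
void *PinnedPtr = nullptr;
if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
  return Err;
return Stream->pushMemoryCopyH2DAsync(TgtPtr, PatternPtr, PinnedPtr,
                                      PatternSize, PinnedMemoryManager,
                                      Size / PatternSize);
```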
@@ -2611,26 +2629,73 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   Error dataFillImpl(void *TgtPtr, const void *PatternPtr, int64_t PatternSize,
                      int64_t Size,
                      AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    hsa_status_t Status;
+    // Fast case, where we can use the 4-byte hsa_amd_memory_fill.
+    if (Size % 4 == 0 &&
+        (PatternSize == 4 || PatternSize == 2 || PatternSize == 1)) {
+      uint32_t Pattern;
+      if (PatternSize == 1) {
+        auto *Byte = reinterpret_cast<const uint8_t *>(PatternPtr);
+        Pattern = *Byte | *Byte << 8 | *Byte << 16 | *Byte << 24;
+      } else if (PatternSize == 2) {
+        auto *Word = reinterpret_cast<const uint16_t *>(PatternPtr);
+        Pattern = *Word | (*Word << 16);
+      } else {
+        // Shouldn't get here if the pattern size is outside those values.
+        std::terminate();
+      }

-    // We can use hsa_amd_memory_fill for this size, but it's not async so the
-    // queue needs to be synchronized first
-    if (PatternSize == 4) {
-      if (AsyncInfoWrapper.hasQueue())
-        if (auto Err = synchronize(AsyncInfoWrapper))
+      if (hasPendingWorkImpl(AsyncInfoWrapper)) {
+        AMDGPUStreamTy *Stream = nullptr;
+        if (auto Err = getStream(AsyncInfoWrapper, Stream))
           return Err;
-      Status = hsa_amd_memory_fill(TgtPtr,
-                                   *static_cast<const uint32_t *>(PatternPtr),
-                                   Size / PatternSize);

-      if (auto Err =
-              Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n"))
-        return Err;
-    } else {
-      // TODO: Implement for AMDGPU. Most likely by doing the fill in pinned
-      // memory and copying to the device in one go.
-      return Plugin::error(ErrorCode::UNSUPPORTED, "Unsupported fill size");
+        struct MemFillArgsTy {
+          void *Dst;
+          uint32_t Pattern;
+          int64_t Size;
+        };
+        auto *Args = new MemFillArgsTy{TgtPtr, Pattern, Size / 4};
+        auto Fill = [](void *Data) {
+          MemFillArgsTy *Args = reinterpret_cast<MemFillArgsTy *>(Data);
+          assert(Args && "Invalid arguments");
+
+          auto Status =
+              hsa_amd_memory_fill(Args->Dst, Args->Pattern, Args->Size);
+          delete Args;
+          auto Err =
+              Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n");
+          if (Err) {
+            FATAL_MESSAGE(1, "error performing async fill: %s",
+                          toString(std::move(Err)).data());
+          }
+        };
+
+        // hsa_amd_memory_fill doesn't report completion through an HSA
+        // signal, so use the existing host callback logic to handle that
+        // instead.
+        return Stream->pushHostCallback(Fill, Args);
+      } else {
+        // There is no pending work, so do the fill synchronously.
+        auto Status = hsa_amd_memory_fill(TgtPtr, Pattern, Size / 4);
+        return Plugin::check(Status, "error in hsa_amd_memory_fill: %s\n");
+      }
     }
+
+    // Slow case: allocate a pinned buffer of the full fill size and enqueue
+    // the tiled copies.
+    void *PinnedPtr = nullptr;
+    AMDGPUMemoryManagerTy &PinnedMemoryManager =
+        HostDevice.getPinnedMemoryManager();
+    if (auto Err = PinnedMemoryManager.allocate(Size, &PinnedPtr))
+      return Err;
+
+    AMDGPUStreamTy *Stream = nullptr;
+    if (auto Err = getStream(AsyncInfoWrapper, Stream))
+      return Err;
+
+    return Stream->pushMemoryCopyH2DAsync(TgtPtr, PatternPtr, PinnedPtr,
+                                          PatternSize, PinnedMemoryManager,
+                                          Size / PatternSize);
   }

   /// Initialize the async info for interoperability purposes.
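The fast path's byte/word widening can be checked in isolation. A minimal standalone sketch (the `widenPattern` helper name and the test values are assumptions, not part of the patch) reproducing the replication into the 32-bit word that `hsa_amd_memory_fill` consumes:

```cpp
#include <cassert>
#include <cstdint>

// Widen a 1-, 2-, or 4-byte pattern into the 32-bit fill word, as the
// fast path above does.
static uint32_t widenPattern(const void *PatternPtr, int64_t PatternSize) {
  if (PatternSize == 1) {
    uint32_t Byte = *static_cast<const uint8_t *>(PatternPtr);
    return Byte | Byte << 8 | Byte << 16 | Byte << 24;
  }
  if (PatternSize == 2) {
    uint32_t Word = *static_cast<const uint16_t *>(PatternPtr);
    return Word | Word << 16;
  }
  return *static_cast<const uint32_t *>(PatternPtr);
}

int main() {
  uint8_t Byte = 0xAB;
  assert(widenPattern(&Byte, 1) == 0xABABABABu);
  uint16_t Word = 0xCDEF;
  assert(widenPattern(&Word, 2) == 0xCDEFCDEFu);
}
```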