@@ -379,6 +379,7 @@ bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMe
379379 return true ;
380380}
381381
382+ // ================================================================================================
382383bool DmaBlitManager::copyBufferRect (device::Memory& srcMemory, device::Memory& dstMemory,
383384 const amd::BufferRect& srcRect, const amd::BufferRect& dstRect,
384385 const amd::Coord3D& size, bool entire) const {
@@ -435,8 +436,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
435436 }
436437
437438 if (isSubwindowRectCopy ) {
438- const hsa_signal_value_t kInitVal = 1 ;
439- hsa_signal_store_relaxed (completion_signal_, kInitVal );
439+ hsa_signal_store_relaxed (completion_signal_, kInitSignalValueOne );
440440
441441 // Copy memory line by line
442442 hsa_status_t status =
@@ -447,10 +447,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
447447 return false ;
448448 }
449449
450-
451- hsa_signal_value_t val = hsa_signal_wait_scacquire (completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0 ,
452- uint64_t (-1 ), HSA_WAIT_STATE_BLOCKED);
453- if (val != 0 ) {
450+ if (!WaitForSignal (completion_signal_)) {
454451 LogError (" Async copy failed" );
455452 return false ;
456453 }
@@ -476,9 +473,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
476473 }
477474 }
478475
479- hsa_signal_value_t val = hsa_signal_wait_scacquire (completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0 ,
480- uint64_t (-1 ), HSA_WAIT_STATE_BLOCKED);
481- if (val != 0 ) {
476+ if (!WaitForSignal (completion_signal_)) {
482477 LogError (" Async copy failed" );
483478 return false ;
484479 }
@@ -488,6 +483,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
488483 return true ;
489484}
490485
486+ // ================================================================================================
491487bool DmaBlitManager::copyImageToBuffer (device::Memory& srcMemory, device::Memory& dstMemory,
492488 const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
493489 const amd::Coord3D& size, bool entire, size_t rowPitch,
@@ -598,6 +594,7 @@ bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMem
598594 return result;
599595}
600596
597+ // ================================================================================================
601598bool DmaBlitManager::hsaCopy (const Memory& srcMemory, const Memory& dstMemory,
602599 const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
603600 const amd::Coord3D& size, bool enableCopyRect, bool flushDMA) const {
@@ -639,8 +636,7 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
639636 srcAgent = dstAgent = dev ().getBackendDevice ();
640637 }
641638
642- const hsa_signal_value_t kInitVal = 1 ;
643- hsa_signal_store_relaxed (completion_signal_, kInitVal );
639+ hsa_signal_store_relaxed (completion_signal_, kInitSignalValueOne );
644640
645641 // Use SDMA to transfer the data
646642 status = hsa_amd_memory_async_copy (dst, dstAgent, src, srcAgent, size[0 ], 0 , nullptr ,
@@ -649,21 +645,7 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
649645 if (status == HSA_STATUS_SUCCESS) {
650646 hsa_signal_value_t val;
651647
652- // Use ACTIVE wait for small transfers.
653- // Might want to be dependent on also having an idle GPU
654- // or, if queue is busy, may want to enqueue a blank barrier
655- // before this and wait BLOCKED on its completion signal, followed
656- // by ACTIVE on this.
657-
658- constexpr size_t small_transfer_size = 4 * Mi;
659- if (size[0 ] < small_transfer_size) {
660- val = hsa_signal_wait_scacquire (completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0 ,
661- std::numeric_limits<uint64_t >::max (), HSA_WAIT_STATE_ACTIVE);
662- } else {
663- val = hsa_signal_wait_scacquire (completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0 ,
664- std::numeric_limits<uint64_t >::max (), HSA_WAIT_STATE_BLOCKED);
665- }
666- if (val != (kInitVal - 1 )) {
648+ if (!WaitForSignal (completion_signal_)) {
667649 LogError (" Async copy failed" );
668650 status = HSA_STATUS_ERROR;
669651 } else {
@@ -676,6 +658,7 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
676658 return (status == HSA_STATUS_SUCCESS);
677659}
678660
661+ // ================================================================================================
679662bool DmaBlitManager::hsaCopyStaged (const_address hostSrc, address hostDst, size_t size,
680663 address staging, bool hostToDev) const {
681664 // No allocation is necessary for Full Profile
@@ -693,12 +676,10 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
693676
694677 address hsaBuffer = staging;
695678
696- const hsa_signal_value_t kInitVal = 1 ;
697-
698679 // Allocate requested size of memory
699680 while (totalSize > 0 ) {
700681 size = std::min (totalSize, dev ().settings ().stagedXferSize_ );
701- hsa_signal_silent_store_relaxed (completion_signal_, kInitVal );
682+ hsa_signal_silent_store_relaxed (completion_signal_, kInitSignalValueOne );
702683
703684 // Copy data from Host to Device
704685 if (hostToDev) {
@@ -712,10 +693,7 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
712693 status = hsa_amd_memory_async_copy (hostDst + offset, dev ().getBackendDevice (), hsaBuffer,
713694 srcAgent, size, 0 , nullptr , completion_signal_);
714695 if (status == HSA_STATUS_SUCCESS) {
715- hsa_signal_value_t val = hsa_signal_wait_scacquire (
716- completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0 , uint64_t (-1 ), HSA_WAIT_STATE_BLOCKED);
717-
718- if (val != (kInitVal - 1 )) {
696+ if (!WaitForSignal (completion_signal_)) {
719697 LogError (" Async copy failed" );
720698 return false ;
721699 }
@@ -739,10 +717,7 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
739717 hsa_amd_memory_async_copy (hsaBuffer, dstAgent, hostSrc + offset,
740718 dev ().getBackendDevice (), size, 0 , nullptr , completion_signal_);
741719 if (status == HSA_STATUS_SUCCESS) {
742- hsa_signal_value_t val = hsa_signal_wait_scacquire (completion_signal_, HSA_SIGNAL_CONDITION_EQ,
743- 0 , uint64_t (-1 ), HSA_WAIT_STATE_BLOCKED);
744-
745- if (val != (kInitVal - 1 )) {
720+ if (!WaitForSignal (completion_signal_)) {
746721 LogError (" Async copy failed" );
747722 return false ;
748723 }
@@ -760,6 +735,7 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
760735 return true ;
761736}
762737
738+ // ================================================================================================
763739KernelBlitManager::KernelBlitManager (VirtualGPU& gpu, Setup setup)
764740 : DmaBlitManager(gpu, setup),
765741 program_(nullptr ),
@@ -1659,6 +1635,7 @@ bool KernelBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory
16591635 return result;
16601636}
16611637
1638+ // ================================================================================================
16621639bool KernelBlitManager::readBuffer (device::Memory& srcMemory, void * dstHost,
16631640 const amd::Coord3D& origin, const amd::Coord3D& size,
16641641 bool entire) const {
@@ -1667,12 +1644,13 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
16671644
16681645 if (dev ().info ().largeBar_ && size[0 ] <= kMaxD2hMemcpySize ) {
16691646 if ((srcMemory.owner ()->getHostMem () == nullptr ) && (srcMemory.owner ()->getSvmPtr () != nullptr )) {
1670- // CPU read ahead, hence release GPU memory
1671- gpu ().releaseGpuMemoryFence ();
1647+ // CPU read ahead, hence release GPU memory and force a barrier to make sure L2 is flushed
1648+ constexpr bool ForceBarrier = true ;
1649+ gpu ().releaseGpuMemoryFence (ForceBarrier);
16721650 char * src = reinterpret_cast <char *>(srcMemory.owner ()->getSvmPtr ());
16731651 std::memcpy (dstHost, src + origin[0 ], size[0 ]);
1674- // Set HASPENDINGDISPATCH_ FLAG. That will force L2 invalidation on flush
1675- gpu ().hasPendingDispatch ();
1652+ // The first dispatch will invalidate L2
1653+ gpu ().addSystemScope ();
16761654 return true ;
16771655 }
16781656 }
@@ -1717,6 +1695,7 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
17171695 return result;
17181696}
17191697
1698+ // ================================================================================================
17201699bool KernelBlitManager::readBufferRect (device::Memory& srcMemory, void * dstHost,
17211700 const amd::BufferRect& bufRect,
17221701 const amd::BufferRect& hostRect, const amd::Coord3D& size,
@@ -1764,6 +1743,7 @@ bool KernelBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
17641743 return result;
17651744}
17661745
1746+ // ================================================================================================
17671747bool KernelBlitManager::writeBuffer (const void * srcHost, device::Memory& dstMemory,
17681748 const amd::Coord3D& origin, const amd::Coord3D& size,
17691749 bool entire) const {
@@ -1773,12 +1753,13 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
17731753 if (dev ().info ().largeBar_ && size[0 ] <= kMaxH2dMemcpySize ) {
17741754 if ((dstMemory.owner ()->getHostMem () == nullptr ) && (dstMemory.owner ()->getSvmPtr () != nullptr )) {
17751755 // CPU write ahead, hence release GPU memory
1776- gpu ().releaseGpuMemoryFence ();
1756+ constexpr bool ForceBarrier = true ;
1757+ gpu ().releaseGpuMemoryFence (ForceBarrier);
17771758 char * dst = reinterpret_cast <char *>(dstMemory.owner ()->getSvmPtr ());
17781759 std::memcpy (dst + origin[0 ], srcHost, size[0 ]);
17791760 // Set HASPENDINGDISPATCH_ FLAG. Then releaseGpuMemoryFence() will use barrier to invalidate cache
17801761 gpu ().hasPendingDispatch ();
1781- gpu ().releaseGpuMemoryFence ();
1762+ gpu ().releaseGpuMemoryFence (ForceBarrier );
17821763 return true ;
17831764 }
17841765 }
@@ -1825,6 +1806,7 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
18251806 return result;
18261807}
18271808
1809+ // ================================================================================================
18281810bool KernelBlitManager::writeBufferRect (const void * srcHost, device::Memory& dstMemory,
18291811 const amd::BufferRect& hostRect,
18301812 const amd::BufferRect& bufRect, const amd::Coord3D& size,
@@ -2284,6 +2266,7 @@ address KernelBlitManager::captureArguments(const amd::Kernel* kernel) const {
22842266void KernelBlitManager::releaseArguments (address args) const {
  // Empty body: nothing is done with `args` in this implementation.
22852267}
22862268
2269+ // ================================================================================================
22872270bool KernelBlitManager::runScheduler (uint64_t vqVM, amd::Memory* schedulerParam,
22882271 hsa_queue_t * schedulerQueue,
22892272 hsa_signal_t & schedulerSignal,
@@ -2310,7 +2293,7 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
23102293 sp->child_queue = reinterpret_cast <uint64_t >(schedulerQueue);
23112294 sp->complete_signal = schedulerSignal;
23122295
2313- hsa_signal_store_relaxed (schedulerSignal, 1 );
2296+ hsa_signal_store_relaxed (schedulerSignal, kInitSignalValueOne );
23142297
23152298 sp->scheduler_aql .header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
23162299 (1 << HSA_PACKET_HEADER_BARRIER) |
@@ -2346,15 +2329,15 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
23462329 }
23472330 releaseArguments (parameters);
23482331
2349- if (hsa_signal_wait_scacquire (schedulerSignal, HSA_SIGNAL_CONDITION_LT, 1 , (-1 ),
2350- HSA_WAIT_STATE_BLOCKED) != 0 ) {
2332+ if (!WaitForSignal (schedulerSignal)) {
23512333 LogWarning (" Failed schedulerSignal wait" );
23522334 return false ;
23532335 }
23542336
23552337 return true ;
23562338}
23572339
2340+ // ================================================================================================
23582341bool KernelBlitManager::RunGwsInit (
23592342 uint32_t value) const {
23602343 amd::ScopedLock k (lockXferOps_);
0 commit comments