Skip to content
This repository was archived by the owner on Jan 26, 2024. It is now read-only.

Commit fde2d84

Browse files
committed
Add option to skip AQL barrier
The change reuses HSA signals for dispatches as a wait signal. Skipping the barrier requires disabling the L2 cache for sysmem allocations and extra tracking for HDP access with the large BAR. ROC_BARRIER_SYNC=0 activates the new logic; barrier sync is still used by default. ROC_ACTIVE_WAIT=1 enables unconditional active wait in ROCr. The change also consolidates the ROCr wait logic under a single function. Change-Id: I6bd1be30aa88258da1b1f9de319ef5a45852afd8
1 parent 1fbba04 commit fde2d84

File tree

7 files changed

+166
-98
lines changed

7 files changed

+166
-98
lines changed

device/rocm/rocblit.cpp

Lines changed: 29 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,7 @@ bool DmaBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMe
379379
return true;
380380
}
381381

382+
// ================================================================================================
382383
bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory,
383384
const amd::BufferRect& srcRect, const amd::BufferRect& dstRect,
384385
const amd::Coord3D& size, bool entire) const {
@@ -435,8 +436,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
435436
}
436437

437438
if (isSubwindowRectCopy ) {
438-
const hsa_signal_value_t kInitVal = 1;
439-
hsa_signal_store_relaxed(completion_signal_, kInitVal);
439+
hsa_signal_store_relaxed(completion_signal_, kInitSignalValueOne);
440440

441441
// Copy memory line by line
442442
hsa_status_t status =
@@ -447,10 +447,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
447447
return false;
448448
}
449449

450-
451-
hsa_signal_value_t val = hsa_signal_wait_scacquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0,
452-
uint64_t(-1), HSA_WAIT_STATE_BLOCKED);
453-
if (val != 0) {
450+
if (!WaitForSignal(completion_signal_)) {
454451
LogError("Async copy failed");
455452
return false;
456453
}
@@ -476,9 +473,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
476473
}
477474
}
478475

479-
hsa_signal_value_t val = hsa_signal_wait_scacquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0,
480-
uint64_t(-1), HSA_WAIT_STATE_BLOCKED);
481-
if (val != 0) {
476+
if (!WaitForSignal(completion_signal_)) {
482477
LogError("Async copy failed");
483478
return false;
484479
}
@@ -488,6 +483,7 @@ bool DmaBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& d
488483
return true;
489484
}
490485

486+
// ================================================================================================
491487
bool DmaBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
492488
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
493489
const amd::Coord3D& size, bool entire, size_t rowPitch,
@@ -598,6 +594,7 @@ bool DmaBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMem
598594
return result;
599595
}
600596

597+
// ================================================================================================
601598
bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
602599
const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
603600
const amd::Coord3D& size, bool enableCopyRect, bool flushDMA) const {
@@ -639,8 +636,7 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
639636
srcAgent = dstAgent = dev().getBackendDevice();
640637
}
641638

642-
const hsa_signal_value_t kInitVal = 1;
643-
hsa_signal_store_relaxed(completion_signal_, kInitVal);
639+
hsa_signal_store_relaxed(completion_signal_, kInitSignalValueOne);
644640

645641
// Use SDMA to transfer the data
646642
status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, size[0], 0, nullptr,
@@ -649,21 +645,7 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
649645
if (status == HSA_STATUS_SUCCESS) {
650646
hsa_signal_value_t val;
651647

652-
// Use ACTIVE wait for small transfers.
653-
// Might want to be dependent on also having an idle GPU
654-
// or, if queue is busy, may want to enqueue a blank barrier
655-
// before this and wait BLOCKED on its completion signal, followed
656-
// by ACTIVE on this.
657-
658-
constexpr size_t small_transfer_size = 4 * Mi;
659-
if (size[0] < small_transfer_size) {
660-
val = hsa_signal_wait_scacquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0,
661-
std::numeric_limits<uint64_t>::max(), HSA_WAIT_STATE_ACTIVE);
662-
} else {
663-
val = hsa_signal_wait_scacquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0,
664-
std::numeric_limits<uint64_t>::max(), HSA_WAIT_STATE_BLOCKED);
665-
}
666-
if (val != (kInitVal - 1)) {
648+
if (!WaitForSignal(completion_signal_)) {
667649
LogError("Async copy failed");
668650
status = HSA_STATUS_ERROR;
669651
} else {
@@ -676,6 +658,7 @@ bool DmaBlitManager::hsaCopy(const Memory& srcMemory, const Memory& dstMemory,
676658
return (status == HSA_STATUS_SUCCESS);
677659
}
678660

661+
// ================================================================================================
679662
bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_t size,
680663
address staging, bool hostToDev) const {
681664
// No allocation is necessary for Full Profile
@@ -693,12 +676,10 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
693676

694677
address hsaBuffer = staging;
695678

696-
const hsa_signal_value_t kInitVal = 1;
697-
698679
// Allocate requested size of memory
699680
while (totalSize > 0) {
700681
size = std::min(totalSize, dev().settings().stagedXferSize_);
701-
hsa_signal_silent_store_relaxed(completion_signal_, kInitVal);
682+
hsa_signal_silent_store_relaxed(completion_signal_, kInitSignalValueOne);
702683

703684
// Copy data from Host to Device
704685
if (hostToDev) {
@@ -712,10 +693,7 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
712693
status = hsa_amd_memory_async_copy(hostDst + offset, dev().getBackendDevice(), hsaBuffer,
713694
srcAgent, size, 0, nullptr, completion_signal_);
714695
if (status == HSA_STATUS_SUCCESS) {
715-
hsa_signal_value_t val = hsa_signal_wait_scacquire(
716-
completion_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1), HSA_WAIT_STATE_BLOCKED);
717-
718-
if (val != (kInitVal - 1)) {
696+
if (!WaitForSignal(completion_signal_)) {
719697
LogError("Async copy failed");
720698
return false;
721699
}
@@ -739,10 +717,7 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
739717
hsa_amd_memory_async_copy(hsaBuffer, dstAgent, hostSrc + offset,
740718
dev().getBackendDevice(), size, 0, nullptr, completion_signal_);
741719
if (status == HSA_STATUS_SUCCESS) {
742-
hsa_signal_value_t val = hsa_signal_wait_scacquire(completion_signal_, HSA_SIGNAL_CONDITION_EQ,
743-
0, uint64_t(-1), HSA_WAIT_STATE_BLOCKED);
744-
745-
if (val != (kInitVal - 1)) {
720+
if (!WaitForSignal(completion_signal_)) {
746721
LogError("Async copy failed");
747722
return false;
748723
}
@@ -760,6 +735,7 @@ bool DmaBlitManager::hsaCopyStaged(const_address hostSrc, address hostDst, size_
760735
return true;
761736
}
762737

738+
// ================================================================================================
763739
KernelBlitManager::KernelBlitManager(VirtualGPU& gpu, Setup setup)
764740
: DmaBlitManager(gpu, setup),
765741
program_(nullptr),
@@ -1659,6 +1635,7 @@ bool KernelBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory
16591635
return result;
16601636
}
16611637

1638+
// ================================================================================================
16621639
bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
16631640
const amd::Coord3D& origin, const amd::Coord3D& size,
16641641
bool entire) const {
@@ -1667,12 +1644,13 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
16671644

16681645
if (dev().info().largeBar_ && size[0] <= kMaxD2hMemcpySize) {
16691646
if ((srcMemory.owner()->getHostMem() == nullptr) && (srcMemory.owner()->getSvmPtr() != nullptr)) {
1670-
// CPU read ahead, hence release GPU memory
1671-
gpu().releaseGpuMemoryFence();
1647+
// CPU read ahead, hence release GPU memory and force barrier to make sure L2 flush
1648+
constexpr bool ForceBarrier = true;
1649+
gpu().releaseGpuMemoryFence(ForceBarrier);
16721650
char* src = reinterpret_cast<char*>(srcMemory.owner()->getSvmPtr());
16731651
std::memcpy(dstHost, src + origin[0], size[0]);
1674-
// Set HASPENDINGDISPATCH_ FLAG. That will force L2 invalidation on flush
1675-
gpu().hasPendingDispatch();
1652+
// The first dispatch will invalidate L2
1653+
gpu().addSystemScope();
16761654
return true;
16771655
}
16781656
}
@@ -1717,6 +1695,7 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
17171695
return result;
17181696
}
17191697

1698+
// ================================================================================================
17201699
bool KernelBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
17211700
const amd::BufferRect& bufRect,
17221701
const amd::BufferRect& hostRect, const amd::Coord3D& size,
@@ -1764,6 +1743,7 @@ bool KernelBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
17641743
return result;
17651744
}
17661745

1746+
// ================================================================================================
17671747
bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
17681748
const amd::Coord3D& origin, const amd::Coord3D& size,
17691749
bool entire) const {
@@ -1773,12 +1753,13 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
17731753
if (dev().info().largeBar_ && size[0] <= kMaxH2dMemcpySize) {
17741754
if ((dstMemory.owner()->getHostMem() == nullptr) && (dstMemory.owner()->getSvmPtr() != nullptr)) {
17751755
// CPU read ahead, hence release GPU memory
1776-
gpu().releaseGpuMemoryFence();
1756+
constexpr bool ForceBarrier = true;
1757+
gpu().releaseGpuMemoryFence(ForceBarrier);
17771758
char* dst = reinterpret_cast<char*>(dstMemory.owner()->getSvmPtr());
17781759
std::memcpy(dst + origin[0], srcHost, size[0]);
17791760
// Set HASPENDINGDISPATCH_ FLAG. Then releaseGpuMemoryFence() will use barrier to invalidate cache
17801761
gpu().hasPendingDispatch();
1781-
gpu().releaseGpuMemoryFence();
1762+
gpu().releaseGpuMemoryFence(ForceBarrier);
17821763
return true;
17831764
}
17841765
}
@@ -1825,6 +1806,7 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
18251806
return result;
18261807
}
18271808

1809+
// ================================================================================================
18281810
bool KernelBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMemory,
18291811
const amd::BufferRect& hostRect,
18301812
const amd::BufferRect& bufRect, const amd::Coord3D& size,
@@ -2284,6 +2266,7 @@ address KernelBlitManager::captureArguments(const amd::Kernel* kernel) const {
22842266
void KernelBlitManager::releaseArguments(address args) const {
22852267
}
22862268

2269+
// ================================================================================================
22872270
bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
22882271
hsa_queue_t* schedulerQueue,
22892272
hsa_signal_t& schedulerSignal,
@@ -2310,7 +2293,7 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
23102293
sp->child_queue = reinterpret_cast<uint64_t>(schedulerQueue);
23112294
sp->complete_signal = schedulerSignal;
23122295

2313-
hsa_signal_store_relaxed(schedulerSignal, 1);
2296+
hsa_signal_store_relaxed(schedulerSignal, kInitSignalValueOne);
23142297

23152298
sp->scheduler_aql.header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
23162299
(1 << HSA_PACKET_HEADER_BARRIER) |
@@ -2346,15 +2329,15 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
23462329
}
23472330
releaseArguments(parameters);
23482331

2349-
if (hsa_signal_wait_scacquire(schedulerSignal, HSA_SIGNAL_CONDITION_LT, 1, (-1),
2350-
HSA_WAIT_STATE_BLOCKED) != 0) {
2332+
if (!WaitForSignal(schedulerSignal)) {
23512333
LogWarning("Failed schedulerSignal wait");
23522334
return false;
23532335
}
23542336

23552337
return true;
23562338
}
23572339

2340+
// ================================================================================================
23582341
bool KernelBlitManager::RunGwsInit(
23592342
uint32_t value) const {
23602343
amd::ScopedLock k(lockXferOps_);

device/rocm/rocdevice.cpp

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -718,7 +718,7 @@ bool Device::create() {
718718
}
719719

720720
// Create signal for HMM prefetch operation on device
721-
if (HSA_STATUS_SUCCESS != hsa_signal_create(InitSignalValue, 0, nullptr, &prefetch_signal_)) {
721+
if (HSA_STATUS_SUCCESS != hsa_signal_create(kInitSignalValueOne, 0, nullptr, &prefetch_signal_)) {
722722
return false;
723723
}
724724

@@ -1476,6 +1476,12 @@ bool Device::populateOCLDeviceConstants() {
14761476
device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) {
14771477
amd::ScopedLock lock(vgpusAccess());
14781478

1479+
// If barrier is disabled, then profiling should be enabled to make sure HSA signal is
1480+
// attached for every dispatch
1481+
if (!settings().barrier_sync_) {
1482+
queue->properties().set(CL_QUEUE_PROFILING_ENABLE);
1483+
}
1484+
14791485
bool profiling = (queue != nullptr) && queue->properties().test(CL_QUEUE_PROFILING_ENABLE);
14801486
bool cooperative = false;
14811487

@@ -1732,9 +1738,11 @@ device::Memory* Device::createMemory(amd::Memory& owner) const {
17321738
return memory;
17331739
}
17341740

1741+
// ================================================================================================
17351742
void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const {
17361743
void* ptr = nullptr;
1737-
const hsa_amd_memory_pool_t segment = (!atomics)
1744+
// If runtime disables barrier, then all host allocations must have L2 disabled
1745+
const hsa_amd_memory_pool_t segment = (!atomics && settings().barrier_sync_)
17381746
? (system_coarse_segment_.handle != 0) ? system_coarse_segment_ : system_segment_
17391747
: system_segment_;
17401748
assert(segment.handle != 0);
@@ -1754,10 +1762,12 @@ void* Device::hostAlloc(size_t size, size_t alignment, bool atomics) const {
17541762
return ptr;
17551763
}
17561764

1765+
// ================================================================================================
17571766
void* Device::hostAgentAlloc(size_t size, const AgentInfo& agentInfo, bool atomics) const {
17581767
void* ptr = nullptr;
17591768
const hsa_amd_memory_pool_t segment =
1760-
(!atomics) ?
1769+
// If runtime disables barrier, then all host allocations must have L2 disabled
1770+
(!atomics && settings().barrier_sync_) ?
17611771
(agentInfo.coarse_grain_pool.handle != 0) ?
17621772
agentInfo.coarse_grain_pool : agentInfo.fine_grain_pool
17631773
: agentInfo.fine_grain_pool;
@@ -1778,6 +1788,7 @@ void* Device::hostAgentAlloc(size_t size, const AgentInfo& agentInfo, bool atomi
17781788
return ptr;
17791789
}
17801790

1791+
// ================================================================================================
17811792
void* Device::hostNumaAlloc(size_t size, size_t alignment, bool atomics) const {
17821793
void* ptr = nullptr;
17831794
#ifndef ROCCLR_SUPPORT_NUMA_POLICY
@@ -2215,7 +2226,7 @@ bool Device::SvmAllocInit(void* memory, size_t size) const {
22152226

22162227
#if AMD_HMM_SUPPORT
22172228
// Initialize signal for the barrier
2218-
hsa_signal_store_relaxed(prefetch_signal_, InitSignalValue);
2229+
hsa_signal_store_relaxed(prefetch_signal_, kInitSignalValueOne);
22192230

22202231
// Initiate a prefetch command which should force memory update in HMM
22212232
hsa_status_t status = hsa_amd_svm_prefetch_async(memory, size, getBackendDevice(),
@@ -2226,8 +2237,7 @@ bool Device::SvmAllocInit(void* memory, size_t size) const {
22262237
}
22272238

22282239
// Wait for the prefetch
2229-
if (hsa_signal_wait_scacquire(prefetch_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1),
2230-
HSA_WAIT_STATE_BLOCKED) != 0) {
2240+
if (!WaitForSignal(prefetch_signal_)) {
22312241
LogError("Barrier packet submission failed");
22322242
return false;
22332243
}

device/rocm/rocsettings.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ Settings::Settings() {
9090
hmmFlags_ = (!flagIsDefault(ROC_HMM_FLAGS)) ? ROC_HMM_FLAGS : Hmm::EnableSystemMemory;
9191

9292
rocr_backend_ = true;
93+
barrier_sync_ = (!flagIsDefault(ROC_BARRIER_SYNC)) ? ROC_BARRIER_SYNC : true;
9394
}
9495

9596
bool Settings::create(bool fullProfile, int gfxipMajor, int gfxipMinor, bool coop_groups) {

device/rocm/rocsettings.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ class Settings : public device::Settings {
5151
uint stagedXferRead_ : 1; //!< Uses a staged buffer read
5252
uint stagedXferWrite_ : 1; //!< Uses a staged buffer write
5353
uint imageBufferWar_ : 1; //!< Image buffer workaround for Gfx10
54-
uint reserved_ : 24;
54+
uint barrier_sync_ : 1; //!< Use AQL barrier command to sync with CPU
55+
uint reserved_ : 23;
5556
};
5657
uint value_;
5758
};
@@ -82,7 +83,7 @@ class Settings : public device::Settings {
8283

8384
size_t sdmaCopyThreshold_; //!< Use SDMA to copy above this size
8485

85-
uint32_t hmmFlags_; //!< HMM functionality control flags
86+
uint32_t hmmFlags_; //!< HMM functionality control flags
8687

8788
//! Default constructor
8889
Settings();

0 commit comments

Comments (0)