Skip to content
This repository was archived by the owner on Jan 26, 2024. It is now read-only.

Commit 9e354e2

Browse files
gandryey authored and amd-aakash committed
Reduce the number of allocated signals
Enable this optimization only when the barrier is disabled, since reusing a signal requires a signal wait. Use the number of pending AQL signals as the size of the signal pool. Change-Id: I2754a0f8b67e19d2601c58945e10fdf0e8be1624 (cherry picked from commit 86fa372)
1 parent 3e91378 commit 9e354e2

File tree

2 files changed

+39
-5
lines changed

2 files changed

+39
-5
lines changed

device/rocm/rocvirtual.cpp

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -455,8 +455,21 @@ bool VirtualGPU::dispatchGenericAqlPacket(
455455

456456
// TODO: placeholder to setup the kernel to populate start and end timestamp.
457457
if (timestamp_ != nullptr) {
458+
// Pool size must grow to the size of pending AQL packets
459+
const uint32_t pool_size = index - read;
460+
if (pool_size >= signal_pool_.size()) {
461+
ProfilingSignal profiling_signal = {};
462+
if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profiling_signal.signal_)) {
463+
LogPrintfError("Failed signal allocation id = %d", pool_size);
464+
return false;
465+
}
466+
signal_pool_.push_back(profiling_signal);
467+
assert(queueSize >= signal_pool_.size() && "Pool will be reallocated!");
468+
}
469+
// Move index inside the valid pool
470+
++current_signal_ %= signal_pool_.size();
458471
// Find signal slot
459-
ProfilingSignal* profilingSignal = &signal_pool_[index & queueMask];
472+
ProfilingSignal* profilingSignal = &signal_pool_[current_signal_];
460473
// Make sure we save the old results in the TS structure
461474
if (profilingSignal->ts_ != nullptr) {
462475
profilingSignal->ts_->checkGpuTime();
@@ -536,6 +549,8 @@ bool VirtualGPU::dispatchGenericAqlPacket(
536549
LogPrintfError("Failed signal [0x%lx] wait", signal.handle);
537550
return false;
538551
}
552+
// Reset the pool of signals
553+
current_signal_ = 0;
539554
}
540555

541556
return true;
@@ -620,8 +635,14 @@ void VirtualGPU::ResetQueueStates() {
620635
// Release all memory dependencies
621636
memoryDependency().clear();
622637

623-
// Release the pool, since runtime just completed a barrier
624-
resetKernArgPool();
638+
if (dev().settings().barrier_sync_) {
639+
// Release the pool, since runtime just completed a barrier
640+
// @note: Runtime can reset kernel arg pool only if the barrier with L2 invalidation was issued
641+
resetKernArgPool();
642+
} else {
643+
// Reset the pool of signals
644+
current_signal_ = 0;
645+
}
625646
}
626647

627648
// ================================================================================================
@@ -833,8 +854,17 @@ bool VirtualGPU::initPool(size_t kernarg_pool_size, uint signal_pool_count) {
833854
}
834855

835856
if (signal_pool_count != 0) {
836-
signal_pool_.resize(signal_pool_count);
837-
for (uint i = 0; i < signal_pool_count; ++i) {
857+
// Reserve signal pool for all entries in the queue, since profiling logic will save the
858+
// pointer in timestamp info for the future references
859+
signal_pool_.reserve(signal_pool_count);
860+
// If the barrier is disabled, then allocate a small portion of all signals and grow the array later.
861+
// @note: the optimization requires a wait for signal on reuse, which is only available when
862+
// the barrier is disabled
863+
constexpr uint32_t kDefaultSignalPoolSize = 32;
864+
const uint32_t default_signal_pool_size = (dev().settings().barrier_sync_) ?
865+
signal_pool_count : kDefaultSignalPoolSize;
866+
signal_pool_.resize(default_signal_pool_size);
867+
for (uint i = 0; i < default_signal_pool_size; ++i) {
838868
ProfilingSignal profilingSignal;
839869
if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profilingSignal.signal_)) {
840870
return false;
@@ -881,6 +911,9 @@ void* VirtualGPU::allocKernArg(size_t size, size_t alignment) {
881911
}
882912

883913
resetKernArgPool();
914+
915+
// Reset the pool of signals
916+
current_signal_ = 0;
884917
}
885918
} while (true);
886919

device/rocm/rocvirtual.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,7 @@ class VirtualGPU : public device::VirtualDevice {
388388
uint kernarg_pool_cur_offset_;
389389

390390
std::vector<ProfilingSignal> signal_pool_; //!< Pool of signals for profiling
391+
uint32_t current_signal_ = 0; //!< Index of the current available signal in the pool
391392
friend class Timestamp;
392393

393394
// PM4 packet for gfx8 performance counter

0 commit comments

Comments
 (0)