@@ -455,8 +455,21 @@ bool VirtualGPU::dispatchGenericAqlPacket(
455455
456456 // TODO: placeholder to setup the kernel to populate start and end timestamp.
457457 if (timestamp_ != nullptr ) {
458+ // Pool size must grow to the size of pending AQL packets
459+ const uint32_t pool_size = index - read;
460+ if (pool_size >= signal_pool_.size ()) {
461+ ProfilingSignal profiling_signal = {};
462+ if (HSA_STATUS_SUCCESS != hsa_signal_create (0 , 0 , nullptr , &profiling_signal.signal_ )) {
463+ LogPrintfError (" Failed signal allocation id = %d" , pool_size);
464+ return false ;
465+ }
466+ signal_pool_.push_back (profiling_signal);
467+ assert (queueSize >= signal_pool_.size () && " Pool will be reallocated!" );
468+ }
469+ // Move index inside the valid pool
470+ ++current_signal_ %= signal_pool_.size ();
458471 // Find signal slot
459- ProfilingSignal* profilingSignal = &signal_pool_[index & queueMask ];
472+ ProfilingSignal* profilingSignal = &signal_pool_[current_signal_ ];
460473 // Make sure we save the old results in the TS structure
461474 if (profilingSignal->ts_ != nullptr ) {
462475 profilingSignal->ts_ ->checkGpuTime ();
@@ -536,6 +549,8 @@ bool VirtualGPU::dispatchGenericAqlPacket(
536549 LogPrintfError (" Failed signal [0x%lx] wait" , signal.handle );
537550 return false ;
538551 }
552+ // Reset the pool of signals
553+ current_signal_ = 0 ;
539554 }
540555
541556 return true ;
@@ -620,8 +635,14 @@ void VirtualGPU::ResetQueueStates() {
620635 // Release all memory dependencies
621636 memoryDependency ().clear ();
622637
623- // Release the pool, since runtime just completed a barrier
624- resetKernArgPool ();
638+ if (dev ().settings ().barrier_sync_ ) {
639+ // Release the pool, since runtime just completed a barrier
640+ // @note: Runtime can reset kernel arg pool only if the barrier with L2 invalidation was issued
641+ resetKernArgPool ();
642+ } else {
643+ // Reset the pool of signals
644+ current_signal_ = 0 ;
645+ }
625646}
626647
627648// ================================================================================================
@@ -833,8 +854,17 @@ bool VirtualGPU::initPool(size_t kernarg_pool_size, uint signal_pool_count) {
833854 }
834855
835856 if (signal_pool_count != 0 ) {
836- signal_pool_.resize (signal_pool_count);
837- for (uint i = 0 ; i < signal_pool_count; ++i) {
857+ // Reserve signal pool for all entries in the queue, since profiling logic will save the
858+ // pointer in timestamp info for the future references
859+ signal_pool_.reserve (signal_pool_count);
860+ // If barrier is disabled, then allocate a small portion of all signals and grow the array later.
861+ // @note: the optimization requires a wait for signal on reuse, which is only available when
862+ // the barrier is disabled
863+ constexpr uint32_t kDefaultSignalPoolSize = 32 ;
864+ const uint32_t default_signal_pool_size = (dev ().settings ().barrier_sync_ ) ?
865+ signal_pool_count : kDefaultSignalPoolSize ;
866+ signal_pool_.resize (default_signal_pool_size);
867+ for (uint i = 0 ; i < default_signal_pool_size; ++i) {
838868 ProfilingSignal profilingSignal;
839869 if (HSA_STATUS_SUCCESS != hsa_signal_create (0 , 0 , nullptr , &profilingSignal.signal_ )) {
840870 return false ;
@@ -881,6 +911,9 @@ void* VirtualGPU::allocKernArg(size_t size, size_t alignment) {
881911 }
882912
883913 resetKernArgPool ();
914+
915+ // Reset the pool of signals
916+ current_signal_ = 0 ;
884917 }
885918 } while (true );
886919
0 commit comments