diff --git a/projects/rocprofiler-sdk/cmake/rocprofiler_options.cmake b/projects/rocprofiler-sdk/cmake/rocprofiler_options.cmake index 08d10326351..57b5235b5a1 100644 --- a/projects/rocprofiler-sdk/cmake/rocprofiler_options.cmake +++ b/projects/rocprofiler-sdk/cmake/rocprofiler_options.cmake @@ -34,6 +34,13 @@ rocprofiler_add_option( ROCPROFILER_BUILD_CI "Enable continuous integration default values for options" OFF ADVANCED) +rocprofiler_add_option(ROCPROFILER_GPU_TARGETS "Targets for building tests and samples" + "") +if(NOT ROCPROFILER_GPU_TARGETS OR ROCPROFILER_GPU_TARGETS STREQUAL "") + set(ROCPROFILER_GPU_TARGETS + "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1102") +endif() + rocprofiler_add_option(ROCPROFILER_BUILD_TESTS "Enable building the tests" ${ROCPROFILER_BUILD_CI}) rocprofiler_add_option(ROCPROFILER_BUILD_SAMPLES "Enable building the code samples" diff --git a/projects/rocprofiler-sdk/samples/counter_collection/device_serialized_main.cpp b/projects/rocprofiler-sdk/samples/counter_collection/device_serialized_main.cpp index e260ad3324e..86b8a5be104 100644 --- a/projects/rocprofiler-sdk/samples/counter_collection/device_serialized_main.cpp +++ b/projects/rocprofiler-sdk/samples/counter_collection/device_serialized_main.cpp @@ -21,6 +21,9 @@ // SOFTWARE. #include +#include +#include +#include #include "client.hpp" @@ -48,6 +51,92 @@ kernelA(int devid, volatile int* wait_on, int value, int* no_opt) printf("[device=%i][return] Wait on %i: %i (%i)\n", devid, value, *wait_on, *no_opt); } +// Force assert to work even in Release builds for this test +#ifdef NDEBUG +# undef NDEBUG +# include +# define NDEBUG +#endif + +__global__ void +check_order_kernel(int expected, int* actual) +{ + // Note: We do not use atomics here on purpose to ensure that the barrier + // being injected has proper fencing set. + if(*actual != expected) + { + printf("[error] Expected %i but got %i\n", expected, *actual); + } + // Assert will now work in both Debug and Release builds + assert(*actual == expected); + (*actual)++; +} + +class DualStreamExecutor +{ +private: + hipStream_t stream1_ = {}; + hipStream_t stream2_ = {}; + int device_ = {0}; + +public: + DualStreamExecutor(int device = 0) + : device_(device) + { + HIP_CALL(hipSetDevice(device_)); + HIP_CALL(hipStreamCreate(&stream1_)); + HIP_CALL(hipStreamCreate(&stream2_)); + std::cout << "Created dual streams on device " << device_ << std::endl; + } + + ~DualStreamExecutor() + { + HIP_CALL(hipStreamDestroy(stream1_)); + HIP_CALL(hipStreamDestroy(stream2_)); + } + + // Function template to launch any kernel on both streams + template + void launch_kernel_on_both_streams(KernelFunc kernel, + dim3 gridSize, + dim3 blockSize, + size_t sharedMem, + Args... args) + { + hipLaunchKernelGGL(kernel, gridSize, blockSize, sharedMem, stream1_, args...); + hipLaunchKernelGGL(kernel, gridSize, blockSize, sharedMem, stream2_, args...); + } + + // Synchronize both streams + void synchronize() + { + HIP_CALL(hipStreamSynchronize(stream1_)); + HIP_CALL(hipStreamSynchronize(stream2_)); + std::cout << "Both streams synchronized" << std::endl; + } + + // Get stream handles + hipStream_t get_stream1() const { return stream1_; } + hipStream_t get_stream2() const { return stream2_; } + + // Execute async memory operations on both streams + void async_memcpy_to_device(void* dst1, void* dst2, const void* src, size_t size) + { + HIP_CALL(hipMemcpyAsync(dst1, src, size, hipMemcpyHostToDevice, stream1_)); + HIP_CALL(hipMemcpyAsync(dst2, src, size, hipMemcpyHostToDevice, stream2_)); + } + + void async_memcpy_to_host(void* dst1, + void* dst2, + const void* src1, + const void* src2, + size_t size) + { + HIP_CALL(hipMemcpyAsync(dst1, src1, size, hipMemcpyDeviceToHost, stream1_)); + HIP_CALL(hipMemcpyAsync(dst2, src2, size, hipMemcpyDeviceToHost, stream2_)); + } +}; + int main(int, char**) { @@ -76,5 +165,27 @@ main(int, char**) HIP_CALL(hipSetDevice(0)); HIP_CALL(hipDeviceSynchronize()); + // Validate that kernels are being processed in order on the same device + HIP_CALL(hipSetDevice(0)); + DualStreamExecutor executor(0); + *no_opt_0 = 0; + // Use reproducible seed for deterministic testing and easier debugging + srand(12345); + + for(int i = 0; i < 10000; i++) + { + if(rand() & 1) + { + hipLaunchKernelGGL( + check_order_kernel, dim3(1), dim3(1), 0, executor.get_stream1(), i, no_opt_0); + } + else + { + hipLaunchKernelGGL( + check_order_kernel, dim3(1), dim3(1), 0, executor.get_stream2(), i, no_opt_0); + } + } + executor.synchronize(); + HIP_CALL(hipDeviceSynchronize()); std::cerr << "Run complete\n"; } diff --git a/projects/rocprofiler-sdk/samples/counter_collection/main.cpp b/projects/rocprofiler-sdk/samples/counter_collection/main.cpp index 05b26fadf53..9a278ab9fa3 100644 --- a/projects/rocprofiler-sdk/samples/counter_collection/main.cpp +++ b/projects/rocprofiler-sdk/samples/counter_collection/main.cpp @@ -77,6 +77,16 @@ launchKernels(const long NUM_LAUNCH, const long SYNC_INTERVAL, const int DEV_ID) hipLaunchKernelGGL(kernelA, dim3(1), dim3(1), 0, 0, 1, 2); hipLaunchKernelGGL(kernelB, dim3(1), dim3(1), 0, 0, 1, 2); if(i % SYNC_INTERVAL == (SYNC_INTERVAL - 1)) HIP_CALL(hipDeviceSynchronize()); + + // Progress indicator every 1000 iterations + if(i % 1000 == 999) + { + printf("[PROGRESS] Device %d: Completed %ld / %ld iterations\n", + DEV_ID, + i + 1, + NUM_LAUNCH); + fflush(stdout); + } } const int NElems = 512 * 512; @@ -101,6 +111,16 @@ launchKernels(const long NUM_LAUNCH, const long SYNC_INTERVAL, const int DEV_ID) { hipLaunchKernelGGL(kernelC, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, NElems); if(i % SYNC_INTERVAL == (SYNC_INTERVAL - 1)) HIP_CALL(hipDeviceSynchronize()); + + // Progress indicator every 1000 iterations + if(i % 1000 == 999) + { + printf("[PROGRESS] Device %d: KernelC completed %ld / %ld iterations\n", + DEV_ID, + i + 1, + NUM_LAUNCH); + fflush(stdout); + } } HIP_CALL(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); HIP_CALL(hipDeviceSynchronize()); diff --git a/projects/rocprofiler-sdk/samples/openmp_target/CMakeLists.txt b/projects/rocprofiler-sdk/samples/openmp_target/CMakeLists.txt index f3e09ee1966..aae16ff1ae4 100644 --- a/projects/rocprofiler-sdk/samples/openmp_target/CMakeLists.txt +++ b/projects/rocprofiler-sdk/samples/openmp_target/CMakeLists.txt @@ -30,8 +30,7 @@ target_link_libraries( PRIVATE rocprofiler-sdk::rocprofiler-sdk rocprofiler-sdk::samples-build-flags rocprofiler-sdk::samples-common-library) -set(DEFAULT_GPU_TARGETS "gfx906" "gfx908" "gfx90a" "gfx942" "gfx950" "gfx1100" "gfx1101" - "gfx1102") +set(DEFAULT_GPU_TARGETS ${ROCPROFILER_GPU_TARGETS}) set(OPENMP_GPU_TARGETS "${DEFAULT_GPU_TARGETS}" diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/evaluate_ast.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/evaluate_ast.cpp index 49920effb45..704a00b7157 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/evaluate_ast.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/evaluate_ast.cpp @@ -755,7 +755,7 @@ EvaluateAST::read_pkt(const aql::CounterPacketConstruct* pkt_gen, hsa::AQLPacket if(status != HSA_STATUS_SUCCESS) { - ROCP_ERROR << "AqlProfile could not decode packet"; + ROCP_ERROR << "AqlProfile could not decode packet " << status; } return ret; } diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/hsa_barrier.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/hsa_barrier.cpp index a99d0acb011..d546673780d 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/hsa_barrier.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/hsa_barrier.cpp @@ -25,6 +25,7 @@ #include "lib/rocprofiler-sdk/registration.hpp" #include +#include namespace rocprofiler { @@ -161,5 +162,47 @@ hsa_barrier::clear_barrier() ROCP_TRACE << "Barrier (handle: " << _barrier_signal.handle << ") is now cleared."; } +std::string +hsa_barrier::to_string() const +{ + std::ostringstream oss; + oss << "hsa_barrier{"; + + // _barrier_finished (function pointer - just indicate if it's set) + oss << "_barrier_finished: " << (_barrier_finished ? "set" : "null") << ", "; + + // _barrier_signal + oss << "_barrier_signal: {handle: " << _barrier_signal.handle << "}, "; + + // _queue_waiting + oss << "_queue_waiting: {"; + _queue_waiting.rlock([&](const auto& queue_waiting) { + bool first = true; + for(const auto& [queue_id, count] : queue_waiting) + { + if(!first) oss << ", "; + oss << queue_id << ": " << count; + first = false; + } + }); + oss << "}, "; + + // _barrier_enqueued + oss << "_barrier_enqueued: {"; + _barrier_enqueued.rlock([&](const auto& barrier_enqueued) { + bool first = true; + for(const auto& queue_id : barrier_enqueued) + { + if(!first) oss << ", "; + oss << queue_id; + first = false; + } + }); + oss << "}"; + + oss << "}"; + return oss.str(); +} + } // namespace hsa } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/hsa_barrier.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/hsa_barrier.hpp index 9c78fe75b4b..56aea490a25 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/hsa_barrier.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/hsa_barrier.hpp @@ -31,9 +31,12 @@ #include #include +#include + #include #include #include +#include #include #include @@ -72,6 +75,9 @@ class hsa_barrier // If this is the last queue waiting, clears the barrier and marks it as complete. void remove_queue(const Queue* queue); + // Returns a string containing all class variable data + std::string to_string() const; + private: std::function _barrier_finished = {}; CoreApiTable _core_api = {}; @@ -86,3 +92,23 @@ class hsa_barrier } // namespace hsa } // namespace rocprofiler + +namespace fmt +{ +// fmt::format support for hsa_barrier +template <> +struct formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + auto format(rocprofiler::hsa::hsa_barrier const& barrier, Ctx& ctx) const + { + return fmt::format_to(ctx.out(), "{}", barrier.to_string()); + } +}; +} // namespace fmt diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/profile_serializer.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/profile_serializer.cpp index 94d48897183..37fd05b43a4 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/profile_serializer.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/profile_serializer.cpp @@ -20,28 +20,83 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. +/** + * @file profile_serializer.cpp + * @brief HSA Kernel Execution Serialization for Profiling + * + * This file implements kernel execution serialization to ensure only one kernel + * executes at a time across all profiled queues. This is necessary for accurate + * profiling measurements, especially for hardware performance counters. + * + * SERIALIZATION MECHANISM OVERVIEW: + * ================================ + * + * The profiler_serializer implements a two-stage queue-based serialization system: + * + * 1. **Single Executor Rule**: Only one queue can execute kernels at any given time + * 2. **Ready-State Verification**: Queues must signal ready before they can execute + * 3. **FIFO Ordering with Ready Check**: Queues are granted execution in FIFO order, + * but only if they have reached their blocking barrier + * 4. **Dual-Signal Control**: Uses block_signal for execution control and ready_signal + * for barrier notification + * + * KEY COMPONENTS: + * - _dispatch_queue: Currently executing queue (nullptr = no queue executing) + * - _enqueued_kernels: FIFO queue of kernels enqueued for execution + * - _ready_queues: Multiset of queues that have reached their blocking barrier + * - _seen_queues: Tracks which queues have ready_signal handlers registered + * - _barrier: List of active barriers (transitions to/from serialized execution) + * - block_signal: HSA signal controlling kernel execution (0=execute, 1=blocked) + * - ready_signal: HSA signal indicating queue reached barrier (-1 triggers handler) + * + * EXECUTION FLOW: + * 1. Kernel submitted → Queue added to _enqueued_kernels + * 2. Barrier reached → ready_signal triggers, queue added to _ready_queues + * 3. If no queue executing → Select first queue in _enqueued_kernels that's also ready + * 4. Kernel completes → Select next ready queue from _enqueued_kernels + * 5. Repeat until no ready queues available + * + * SIGNAL VALUES: + * - block_signal: RELEASE_BARRIER (0) allows execution, STOP_BARRIER (1) blocks + * - ready_signal: -1 triggers handler, reset to 0 after processing + */ #include "lib/rocprofiler-sdk/hsa/profile_serializer.hpp" - +#include "lib/common/logging.hpp" #include "lib/rocprofiler-sdk/hsa/queue_controller.hpp" +#include + namespace rocprofiler { namespace hsa { namespace { -bool -profiler_serializer_ready_signal_handler(hsa_signal_value_t /* signal_value */, void* data) -{ - auto* hsa_queue = static_cast(data); - const auto* queue = CHECK_NOTNULL(get_queue_controller())->get_queue(*hsa_queue); - CHECK(queue); - CHECK_NOTNULL(get_queue_controller())->serializer(queue).wlock([&](auto& serializer) { - serializer.queue_ready(hsa_queue, *queue); - }); - return true; -} - +/** + * @brief Signal value to release/unblock barrier packets + * + * When a queue's block_signal is set to RELEASE_BARRIER (0), the HSA barrier + * packets depending on this signal will proceed, allowing kernels to execute. + */ +constexpr int64_t RELEASE_BARRIER = 0; + +/** + * @brief Signal value to stop/block barrier packets + * + * When a queue's block_signal is set to STOP_BARRIER (1), the HSA barrier + * packets depending on this signal will be blocked, preventing kernel execution. + */ +constexpr int64_t STOP_BARRIER = 1; + +/** + * @brief Remove completed barriers from the barrier queue + * + * This function cleans up barriers that have finished their synchronization. + * Completed barriers are removed from the front of the deque to maintain + * proper barrier lifecycle management. + * + * @param barriers Reference to the barrier deque to clean up + */ void clear_complete_barriers(std::deque& barriers) { @@ -58,184 +113,439 @@ clear_complete_barriers(std::deque& bar } } +bool +profiler_serializer_ready_signal_handler(hsa_signal_value_t /* signal_value */, void* data) +{ + auto* hsa_queue = static_cast(data); + const auto* queue = CHECK_NOTNULL(get_queue_controller())->get_queue(*hsa_queue); + CHECK(queue); + CHECK_NOTNULL(get_queue_controller())->serializer(queue).wlock([&](auto& serializer) { + serializer.queue_ready(hsa_queue, *queue); + }); + return true; +} + } // namespace -void -profiler_serializer::add_queue(hsa_queue_t** hsa_queues, const Queue& queue) +/** + * @brief Select the first ready queue and grant it execution permission + * + * This function encapsulates the common logic for finding the first queue + * in _dispatch_ready that is also in _ready_queues, removing it from both + * containers, and granting it execution permission. + * + * @return Pointer to the selected queue, or nullptr if no ready queue found + */ +const Queue* +profiler_serializer::select_and_grant_ready_queue() { - hsa_signal_t signal = queue.ready_signal; - hsa_status_t status = - CHECK_NOTNULL(get_queue_controller()) - ->get_ext_table() - .hsa_amd_signal_async_handler_fn(signal, - HSA_SIGNAL_CONDITION_EQ, - -1, - profiler_serializer_ready_signal_handler, - *hsa_queues); - if(status != HSA_STATUS_SUCCESS) ROCP_FATAL << "hsa_amd_signal_async_handler failed"; + ROCP_INFO << "[select_and_grant] Starting - _enqueued_kernels.size=" << _enqueued_kernels.size() + << ", _ready_queues.size=" << _ready_queues.size(); + + if(_enqueued_kernels.empty()) + { + ROCP_INFO << "[select_and_grant] No enqueued kernels, returning nullptr"; + return nullptr; + } + + const auto& controller = *CHECK_NOTNULL(get_queue_controller()); + + // Find the first queue in _enqueued_kernels that is also ready + for(auto it = _enqueued_kernels.begin(); it != _enqueued_kernels.end(); ++it) + { + auto queue_id = (*it)->get_id().handle; + auto ready_count = _ready_queues.count(*it); + + ROCP_INFO << "[select_and_grant] Checking queue " << queue_id + << " - ready_count=" << ready_count; + + if(ready_count > 0) + { + const Queue* queue_to_run = *it; + + // Remove from both containers + _enqueued_kernels.erase(it); + _ready_queues.erase(_ready_queues.find(queue_to_run)); + + // Grant execution permission + _dispatch_queue = queue_to_run; + controller.get_core_table().hsa_signal_store_screlease_fn(queue_to_run->block_signal, + RELEASE_BARRIER); + + ROCP_INFO << "[select_and_grant] Queue " << queue_to_run->get_id().handle + << " SELECTED AND GRANTED EXECUTION" + << ", block_signal set to 0" + << ", _enqueued_kernels.size now=" << _enqueued_kernels.size() + << ", _ready_queues.size now=" << _ready_queues.size(); + + return queue_to_run; + } + } + + ROCP_INFO << "[select_and_grant] No ready queue found in _enqueued_kernels"; + return nullptr; } +/** + * @brief Handle kernel completion and manage dispatch queue transitions + * + * This is the core function that handles the serialization state machine when + * a kernel completes execution. It performs the following critical operations: + * + * 1. Clean up completed barriers + * 2. Update packet completion counters + * 3. Determine the current serialization state + * 4. Block the completed queue from further execution + * 5. Find the first queue in _enqueued_kernels that is also in _ready_queues + * 6. Grant execution permission to the selected ready queue (if found) + * 7. Update the dispatch queue pointer + * + * SERIALIZATION STATE MACHINE: + * Current queue completes → Block current queue → Find ready queue → Grant permission + * + * The key difference from simple FIFO is that we only grant execution to queues + * that have signaled they are ready (reached their blocking barrier). + * + * @param completed The queue that just finished executing a kernel + */ void profiler_serializer::kernel_completion_signal(const Queue& completed) { - // We do not want to track kernel compleiton signals before we have reached the barrier + ROCP_INFO << "[kernel_completion] Queue " << completed.get_id().handle + << " completed, _dispatch_queue=" + << (_dispatch_queue ? _dispatch_queue->get_id().handle : 0) + << ", _enqueued_kernels.size=" << _enqueued_kernels.size() + << ", _ready_queues.size=" << _ready_queues.size(); + + // Clean up any barriers that have completed their synchronization clear_complete_barriers(_barrier); - // Find the state of this barrier + // Track the number of completed packets for debugging/monitoring + _completed_packets++; + + // Determine the current serialization state from active barriers + // If no barriers are active, use the serializer's global state auto state = _serializer_status.load(); bool found = false; + // Check all active barriers to see if this completion affects them for(auto& barrier : _barrier) { - // Register completion of the kernel. Each queue has a number of kernels it is - // waiting on to complete for each barrier. If more than one barrier is present - // that has this queue, then it will contain a count that is the sum of all previous - // kernel packets in the queue. Thus we must register completion with every barrier. - // The state of the queue at this time is the state of the first barrier (or the state - // of the serializer if no barriers are present). + // Register completion of the kernel with each barrier + // Each barrier tracks how many kernels from each queue are still pending + // If multiple barriers exist, they accumulate the counts from previous barriers + // The state of the first barrier that recognizes this queue determines our action if(barrier.barrier->register_completion(&completed) && !found) { + // Use the state from the first barrier that recognized this completion state = barrier.state; found = true; } } - if(state == Status::DISABLED) return; - - CHECK(_dispatch_queue); - _dispatch_queue = nullptr; - CHECK_NOTNULL(get_queue_controller()) - ->get_core_table() - .hsa_signal_store_screlease_fn(completed.block_signal, 1); - CHECK_NOTNULL(get_queue_controller()) - ->get_core_table() - .hsa_signal_store_screlease_fn(completed.ready_signal, 0); - if(!_dispatch_ready.empty()) + // If serialization is disabled, don't manage dispatch queue transitions + if(state == Status::DISABLED) { - const auto* queue = _dispatch_ready.front(); - _dispatch_ready.erase(_dispatch_ready.begin()); - CHECK_NOTNULL(get_queue_controller()) - ->get_core_table() - .hsa_signal_store_screlease_fn(queue->block_signal, 0); - _dispatch_queue = queue; + ROCP_INFO << "[kernel_completion] Serialization DISABLED, returning"; + return; } -} -void -profiler_serializer::queue_ready(hsa_queue_t* hsa_queue, const Queue& queue) -{ - { - ROCP_TRACE << "Obtaining queue mutex lock..."; - std::lock_guard cv_lock(queue.cv_mutex); - ROCP_TRACE << "Queue mutex lock obtained"; - if(queue.get_state() == queue_state::to_destroy) - { - ROCP_TRACE << "Setting queue state to done_destroy..."; - CHECK_NOTNULL(get_queue_controller()) - ->set_queue_state(queue_state::done_destroy, hsa_queue); - ROCP_TRACE << "Destroying ready signal..."; - CHECK_NOTNULL(get_queue_controller()) - ->get_core_table() - .hsa_signal_destroy_fn(queue.ready_signal); - ROCP_TRACE << "Notifying queue condition variable..."; - queue.cv_ready_signal.notify_one(); - return; - } - } + const auto& controller = *CHECK_NOTNULL(get_queue_controller()); - ROCP_TRACE << "setting queue ready signal to 1..."; - CHECK_NOTNULL(get_queue_controller()) - ->get_core_table() - .hsa_signal_store_screlease_fn(queue.ready_signal, 1); + // Verify we have a currently executing queue (should be the completed one) + CHECK(_dispatch_queue); - if(_dispatch_queue == nullptr) + ROCP_INFO << "[kernel_completion] Blocking queue " << completed.get_id().handle + << " (setting block_signal to 1)"; + + // STEP 1: Block the completed queue from executing more kernels + controller.get_core_table().hsa_signal_store_screlease_fn(completed.block_signal, STOP_BARRIER); + + // STEP 2: Handle dispatch queue transition - use extracted function + ROCP_INFO << "[kernel_completion] Looking for next ready queue to execute"; + if(!select_and_grant_ready_queue()) { - CHECK_NOTNULL(get_queue_controller()) - ->get_core_table() - .hsa_signal_store_screlease_fn(queue.block_signal, 0); - _dispatch_queue = &queue; + // No ready queue found or no queues waiting + _dispatch_queue = nullptr; + ROCP_INFO + << "[kernel_completion] NO READY QUEUE AVAILABLE - _dispatch_queue set to nullptr"; } else { - _dispatch_ready.push_back(&queue); + ROCP_INFO << "[kernel_completion] Next queue selected: " + << _dispatch_queue->get_id().handle; } } +/** + * @brief Generate HSA barrier packets for kernel dispatch serialization + * + * This function is called when a kernel is about to be dispatched. It generates + * the necessary HSA barrier packets that will be inserted into the queue to + * control execution timing based on the serialization state. + * + * PACKET GENERATION LOGIC: + * 1. If serialization is DISABLED: Return empty packet list (no serialization) + * 2. If serialization is ENABLED: Generate barrier packets that depend on block_signal + * 3. Handle dispatch queue assignment (immediate execution vs. queuing for later) + * + * The generated barrier packets will block kernel execution until the queue's + * block_signal is set to RELEASE_BARRIER (0). + * + * @param queue The queue that is about to dispatch a kernel + * @return Vector of HSA barrier packets to insert before the kernel + */ common::container::small_vector -profiler_serializer::kernel_dispatch(const Queue& queue) const +profiler_serializer::kernel_dispatch(const Queue& queue) { common::container::small_vector ret; + + // Helper lambda to create properly configured HSA barrier packets auto&& CreateBarrierPacket = [](hsa_signal_t* dependency_signal, hsa_signal_t* completion_signal) { hsa::rocprofiler_packet barrier{}; + + // Set packet type to barrier with AND operation barrier.barrier_and.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; + + // Configure memory fence scopes for system-wide synchronization barrier.barrier_and.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE; barrier.barrier_and.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE; + + // Mark as barrier packet barrier.barrier_and.header |= 1 << HSA_PACKET_HEADER_BARRIER; + + // Set dependency signal (packet waits for this signal) if(dependency_signal != nullptr) barrier.barrier_and.dep_signal[0] = *dependency_signal; + + // Set completion signal (packet signals this when done) if(completion_signal != nullptr) barrier.barrier_and.completion_signal = *completion_signal; + return barrier; }; + // Track the number of packets being enqueued for monitoring + _enqueued_packets++; + + // If there are active barriers, add any barrier-specific packets if(!_barrier.empty()) { + // Check if the most recent barrier needs to add packets for this queue if(auto maybe_barrier = _barrier.back().barrier->enqueue_packet(&queue)) { ret.push_back(*maybe_barrier); } } + // Generate serialization packets based on current state switch(_serializer_status) { - case Status::DISABLED: return ret; + case Status::DISABLED: + // Serialization disabled: return any barrier packets but no serialization control + ROCP_INFO << "[kernel_dispatch] Queue " << queue.get_id().handle + << " - serialization DISABLED, returning " << ret.size() + << " barrier packets"; + return ret; + case Status::ENABLED: { - hsa_signal_t ready_signal = queue.ready_signal; + // SERIALIZATION ENABLED: Create barrier packets to control execution timing + ROCP_INFO << "[kernel_dispatch] Queue " << queue.get_id().handle + << " - serialization ENABLED, _dispatch_queue=" + << (_dispatch_queue ? _dispatch_queue->get_id().handle : 0) + << ", _enqueued_kernels.size=" << _enqueued_kernels.size() + << ", _ready_queues.size=" << _ready_queues.size(); + hsa_signal_t block_signal = queue.block_signal; + hsa_signal_t ready_signal = queue.ready_signal; + + // Register ready signal handler if not already done + register_queue_ready_handler(queue); + ret.push_back(CreateBarrierPacket(&ready_signal, &ready_signal)); + // Create barrier packet that depends on the queue's block_signal + // This packet will block until block_signal == 0 (RELEASE_BARRIER) ret.push_back(CreateBarrierPacket(&block_signal, &block_signal)); + + ROCP_INFO << "[kernel_dispatch] Queue " << queue.get_id().handle + << " adding to _enqueued_kernels"; + _enqueued_kernels.push_back(&queue); break; - }; + } } + ROCP_INFO << "[kernel_dispatch] Queue " << queue.get_id().handle << " - returning " + << ret.size() << " total packets"; return ret; } +/** + * @brief Handle notification that a queue has reached the blocking barrier + * + * This function is called when a queue's ready_signal is triggered, indicating + * that the queue has reached the blocking barrier and is ready to execute once + * the block_signal is released. This provides visibility into the serialization + * state and confirms that releasing the block signal will immediately trigger + * kernel execution. + * + * @param hsa_queue The HSA queue handle + * @param queue The Queue object that has reached the ready state + */ +/** + * @brief Check if queue has been seen before and set up signal handler if new + * + * This function checks if we've encountered this queue before by its ID. + * If it's a new queue, it sets up an async signal handler for the ready_signal + * that will be triggered when the signal value reaches -1. + * + * @param queue The Queue object to check and potentially register + */ +void +profiler_serializer::register_queue_ready_handler(const Queue& queue) +{ + uint64_t queue_id = queue.get_id().handle; + + // Check if we've already seen this queue + if(_seen_queues.find(queue_id) == _seen_queues.end()) + { + ROCP_INFO << "[register_ready_handler] Queue " << queue_id + << " - NEW QUEUE, registering ready_signal handler"; + + // New queue - register the signal handler + _seen_queues.insert(queue_id); + + const auto& controller = *CHECK_NOTNULL(get_queue_controller()); + + // Set up async signal handler for ready_signal when it equals -1 + // The handler will be called when ready_signal transitions to -1 + hsa_status_t status = controller.get_ext_table().hsa_amd_signal_async_handler_fn( + queue.ready_signal, + HSA_SIGNAL_CONDITION_EQ, + -1, // Trigger value + profiler_serializer_ready_signal_handler, + const_cast(queue.intercept_queue())); + + if(status != HSA_STATUS_SUCCESS) + { + ROCP_ERROR + << "[register_ready_handler] FAILED to register ready signal handler for queue " + << queue_id << ", status: " << status; + } + else + { + ROCP_INFO << "[register_ready_handler] Successfully registered ready signal handler " + "for queue " + << queue_id << " (signal handle: " << queue.ready_signal.handle << ")"; + } + } + else + { + ROCP_INFO << "[register_ready_handler] Queue " << queue_id + << " - already has ready_signal handler registered"; + } +} + +void +profiler_serializer::queue_ready(hsa_queue_t* /* hsa_queue */, const Queue& queue) +{ + ROCP_INFO << "[queue_ready] Queue " << queue.get_id().handle + << " SIGNALED READY - _dispatch_queue=" + << (_dispatch_queue ? _dispatch_queue->get_id().handle : 0) + << ", _enqueued_kernels.size=" << _enqueued_kernels.size() + << ", _ready_queues.size=" << _ready_queues.size(); + + // Reset the ready_signal back to 0 + const auto& controller = *CHECK_NOTNULL(get_queue_controller()); + controller.get_core_table().hsa_signal_store_screlease_fn(queue.ready_signal, 0); + + ROCP_INFO << "[queue_ready] Queue " << queue.get_id().handle << " - ready_signal reset to 0"; + + // If serialization is disabled, nothing to do + if(_serializer_status == Status::DISABLED) + { + ROCP_INFO << "[queue_ready] Serialization DISABLED, returning"; + return; + } + + // Always mark this queue as ready + _ready_queues.insert(&queue); + ROCP_INFO << "[queue_ready] Queue " << queue.get_id().handle + << " added to _ready_queues (count now: " << _ready_queues.count(&queue) + << ", total ready: " << _ready_queues.size() << ")"; + + // Check if there is a dispatch currently executing + if(_dispatch_queue == nullptr) + { + ROCP_INFO << "[queue_ready] NO QUEUE EXECUTING - checking for ready queue to grant"; + // No queue is currently executing - use extracted function to find and grant ready queue + if(!select_and_grant_ready_queue()) + { + ROCP_INFO + << "[queue_ready] No queue in _enqueued_kernels is ready to execute" + << " (this can happen if queue_ready fired before kernel_dispatch added to list)"; + } + else + { + ROCP_INFO << "[queue_ready] Granted execution to queue " + << _dispatch_queue->get_id().handle; + } + } + else + { + ROCP_INFO << "[queue_ready] Queue " << _dispatch_queue->get_id().handle + << " is currently executing - this queue will wait"; + } +} + void profiler_serializer::destroy_queue(hsa_queue_t* id, const Queue& queue) { ROCP_INFO << "destroying queue..."; - /*Deletes the queue to be destructed from the dispatch ready.*/ + uint64_t queue_id = queue.get_id().handle; + const Queue* queue_ptr = &queue; + + // Remove from barriers for(auto& barriers : _barrier) { barriers.barrier->remove_queue(&queue); } - _dispatch_ready.erase( - std::remove_if( - _dispatch_ready.begin(), - _dispatch_ready.end(), - [&](auto& it) { - /*Deletes the queue to be destructed from the dispatch ready.*/ - if(it->get_id().handle == queue.get_id().handle) - { - if(_dispatch_queue && _dispatch_queue->get_id().handle == queue.get_id().handle) - { - // insert fatal condition here - // ToDO [srnagara]: Need to find a solution rather than abort. - ROCP_FATAL - << "Queue is being destroyed while kernel launch is still active"; - } - return true; - } - return false; - }), - _dispatch_ready.end()); - CHECK_NOTNULL(get_queue_controller())->set_queue_state(queue_state::to_destroy, id); - CHECK_NOTNULL(get_queue_controller()) - ->get_core_table() - .hsa_signal_store_screlease_fn(queue.ready_signal, 0); + // Check if queue is currently executing + if(_dispatch_queue != nullptr && _dispatch_queue->get_id().handle == queue_id) + { + ROCP_FATAL << "Queue is being destroyed while kernel launch is still active"; + } + + // Check if queue is in _enqueued_kernels and report error + auto dispatch_it = std::find_if( + _enqueued_kernels.begin(), _enqueued_kernels.end(), [queue_id](const Queue* q) { + return q->get_id().handle == queue_id; + }); + if(dispatch_it != _enqueued_kernels.end()) + { + ROCP_ERROR << "Queue " << queue_id + << " found in enqueued_kernels during destruction - removing"; + _enqueued_kernels.erase(dispatch_it); + } + + // Check if queue is in _ready_queues and report error + if(_ready_queues.count(queue_ptr) > 0) + { + size_t count = _ready_queues.count(queue_ptr); + ROCP_ERROR << "Queue " << queue_id << " found " << count + << " times in ready_queues during destruction - removing all"; + auto ready_range = _ready_queues.equal_range(queue_ptr); + _ready_queues.erase(ready_range.first, ready_range.second); + } + + // Remove from _seen_queues + _seen_queues.erase(queue_id); + + // Finalize queue destruction + auto* controller = CHECK_NOTNULL(get_queue_controller()); + controller->set_queue_state(queue_state::to_destroy, id); + controller->get_core_table().hsa_signal_store_screlease_fn(queue.block_signal, RELEASE_BARRIER); ROCP_INFO << "queue destroyed"; } @@ -247,18 +557,16 @@ profiler_serializer::enable(const hsa_barrier::queue_map_ptr_t& queues) if(_serializer_status == Status::ENABLED) return; ROCP_INFO << "Enabling profiler serialization..."; - _serializer_status = Status::ENABLED; + if(queues.empty()) return; clear_complete_barriers(_barrier); - _barrier.emplace_back(Status::DISABLED, std::make_unique( [] {}, CHECK_NOTNULL(get_queue_controller())->get_core_table())); - _serializer_status = Status::ENABLED; - _barrier.back().barrier->set_barrier(queues); + ROCP_INFO << "Profiler serialization enabled"; } @@ -269,20 +577,109 @@ profiler_serializer::disable(const hsa_barrier::queue_map_ptr_t& queues) if(_serializer_status == Status::DISABLED) return; ROCP_INFO << "Disabling profiler serialization..."; - _serializer_status = Status::DISABLED; + if(queues.empty()) return; clear_complete_barriers(_barrier); - _barrier.emplace_back(Status::ENABLED, std::make_unique( [] {}, CHECK_NOTNULL(get_queue_controller())->get_core_table())); - _serializer_status = Status::DISABLED; - _barrier.back().barrier->set_barrier(queues); + ROCP_INFO << "Profiler serialization disabled"; } +std::string +profiler_serializer::to_string() const +{ + std::ostringstream oss; + oss << "profiler_serializer{"; + + // _dispatch_queue + oss << "_dispatch_queue: "; + if(_dispatch_queue) + { + oss << "{id: " << _dispatch_queue->get_id().handle << "}"; + } + else + { + oss << "null"; + } + oss << ", "; + + // _enqueued_kernels + oss << "_enqueued_kernels: {"; + oss << "count: " << _enqueued_kernels.size(); + if(!_enqueued_kernels.empty()) + { + oss << ", queue_ids: ["; + bool first = true; + for(const auto* queue : _enqueued_kernels) + { + if(!first) oss << ", "; + oss << (queue ? queue->get_id().handle : 0); + first = false; + } + oss << "]"; + } + oss << "}, "; + + // _ready_queues + oss << "_ready_queues: {"; + oss << "count: " << _ready_queues.size(); + if(!_ready_queues.empty()) + { + oss << ", queue_ids: ["; + bool first = true; + for(const auto* queue : _ready_queues) + { + if(!first) oss << ", "; + oss << (queue ? queue->get_id().handle : 0); + first = false; + } + oss << "]"; + } + oss << "}, "; + + // _serializer_status + oss << "_serializer_status: "; + switch(_serializer_status.load()) + { + case Status::ENABLED: oss << "ENABLED"; break; + case Status::DISABLED: oss << "DISABLED"; break; + default: oss << "UNKNOWN"; break; + } + oss << ", "; + + // _barrier + oss << "_barrier: {"; + oss << "count: " << _barrier.size(); + if(!_barrier.empty()) + { + oss << ", barriers: ["; + bool first = true; + for(const auto& barrier : _barrier) + { + if(!first) oss << ", "; + oss << "{state: "; + switch(barrier.state) + { + case Status::ENABLED: oss << "ENABLED"; break; + case Status::DISABLED: oss << "DISABLED"; break; + default: oss << "UNKNOWN"; break; + } + oss << ", complete: " << (barrier.barrier ? barrier.barrier->complete() : false) << "}"; + first = false; + } + oss << "]"; + } + oss << "}"; + oss << " Completed Packets: " << _completed_packets.load() << ", "; + oss << "Enqueued Packets: " << _enqueued_packets.load(); + oss << "}"; + return oss.str(); +} + } // namespace hsa } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/profile_serializer.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/profile_serializer.hpp index d689e928c89..a78642cf8e8 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/profile_serializer.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/profile_serializer.hpp @@ -29,27 +29,84 @@ #include #include +#include + #include +#include #include +#include namespace rocprofiler { namespace hsa { -/*This is a profiler serializer. It should be instantiated -only once for the profiler. The following is the -description of each field. -1. _dispatch_queue - The queue to which the currently dispatched kernel - belongs to. - At any given time, in serialization only one kernel - can be executing. -2. _dispatch_ready- It is a software data structure which holds - the queues which have a kernel ready to be dispatched. - This stores the queues in FIFO order. -3. serializer_mutex - The mutex is used for thread synchronization - while accessing the singleton instance of this structure. -Currently, in case of profiling kernels are serialized by default. -*/ +/** + * HOW THE SERIALIZATION MECHANISM WORKS: + * ===================================== + * + * OVERVIEW: + * The profiler_serializer ensures that only ONE kernel executes at a time across + * ALL profiled queues on a device. This is critical for accurate hardware counter measurements. + * + * KEY DATA STRUCTURES: + * - _dispatch_queue: Pointer to the currently executing queue (nullptr = none executing) + * - _enqueued_kernels: FIFO deque of queues with kernels enqueued for execution + * - _ready_queues: Multiset of queues that have reached their blocking barrier and are ready + * - _seen_queues: Set tracking which queues have ready_signal handlers registered + * - _barrier: List of active synchronization barriers + * - block_signal: Per-queue HSA signal controlling kernel execution permission + * - ready_signal: Per-queue HSA signal indicating queue has reached blocking barrier + * + * EXECUTION FLOW: + * + * 1. KERNEL SUBMISSION (kernel_dispatch): + * - Generate barrier packets that depend on queue.block_signal + * - If no queue executing: Grant immediate permission (block_signal = 0) + * - If queue executing: Add to _enqueued_kernels list (block_signal remains 1) + * - Register ready_signal handler for new queues (triggers at -1) + * + * 2. BARRIER PACKETS INJECTED INTO QUEUE: + * - HSA barrier packets are inserted before the actual kernel + * - When barrier is reached, ready_signal is set to -1 (triggers queue_ready) + * - Barriers wait for block_signal == 0 before proceeding + * - If block_signal == 1, kernels are blocked at hardware level + * + * 3. READY SIGNAL NOTIFICATION (queue_ready): + * - Called when queue reaches blocking barrier (ready_signal == -1) + * - Reset ready_signal to 0 + * - Add queue to _ready_queues multiset + * - If no queue executing: Find first queue in _enqueued_kernels that's also in _ready_queues + * - Grant execution to selected queue if found + * + * 4. KERNEL COMPLETION (kernel_completion_signal): + * - Current queue completes: Set its block_signal = 1 (block further kernels) + * - Find first queue in _enqueued_kernels that's also in _ready_queues + * - If found: Remove from both lists, set block_signal = 0, update _dispatch_queue + * - If not found: Set _dispatch_queue = nullptr + * + * 5. HARDWARE SYNCHRONIZATION: + * - HSA barrier packets provide hardware-level synchronization + * - Ready signal provides notification when barriers are reached + * - No software polling or busy-waiting required + * - Automatic wake-up when signal conditions are met + * + * SERIALIZATION GUARANTEES: + * - Only one kernel executes across all profiled queues + * - Only queues that have reached their barrier can execute + * - FIFO ordering with ready-state verification + * - Hardware-level blocking prevents race conditions + * - Automatic cleanup when queues are destroyed + * + * EXAMPLE EXECUTION SEQUENCE: + * 1. Queue A submits kernel → Gets immediate execution (_dispatch_queue = A, A.block_signal = 0) + * 2. Queue B submits kernel → Added to _enqueued_kernels (B.block_signal = 1) + * 3. Queue B reaches barrier → Added to _ready_queues (B.ready_signal triggers) + * 4. Queue C submits kernel → Added to _enqueued_kernels (C.block_signal = 1) + * 5. Queue A completes → Checks _enqueued_kernels for ready queues + * 6. Queue B selected → B.block_signal = 0, _dispatch_queue = B (C still waiting) + * 7. Queue C reaches barrier → Added to _ready_queues + * 8. Queue B completes → Queue C selected and executes + */ class profiler_serializer { public: @@ -72,9 +129,14 @@ class profiler_serializer void kernel_completion_signal(const Queue&); // Signal a kernel dispatch is taking place, generates packets needed to be // inserted to support kernel dispatch - common::container::small_vector kernel_dispatch(const Queue&) const; + common::container::small_vector kernel_dispatch(const Queue&); + // Signal that a queue has reached the blocking barrier and is ready to execute void queue_ready(hsa_queue_t* hsa_queue, const Queue& queue); + + // Check if queue has been seen before and set up signal handler if new + void register_queue_ready_handler(const Queue& queue); + // Enable the serializer void enable(const hsa_barrier::queue_map_ptr_t& queues); // Disable the serializer @@ -82,14 +144,42 @@ class profiler_serializer void destroy_queue(hsa_queue_t* id, const Queue& queue); - static void add_queue(hsa_queue_t** hsa_queues, const Queue& queue); + // Returns a string containing all class variable data + std::string to_string() const; private: - const Queue* _dispatch_queue{nullptr}; - std::deque _dispatch_ready; + // Extract common queue selection and execution logic + const Queue* select_and_grant_ready_queue(); + + const Queue* _dispatch_queue{nullptr}; + std::deque _enqueued_kernels; // FIFO queue of kernels enqueued for execution + std::unordered_multiset _ready_queues; // Queues that have signaled ready + std::unordered_set _seen_queues; // Track queue IDs we've registered handlers for std::atomic _serializer_status{Status::DISABLED}; std::deque _barrier; + mutable std::atomic _enqueued_packets{0}; + mutable std::atomic _completed_packets{0}; }; } // namespace hsa } // namespace rocprofiler + +namespace fmt +{ +// fmt::format support for profiler_serializer +template <> +struct formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + auto format(rocprofiler::hsa::profiler_serializer const& serializer, Ctx& ctx) const + { + return fmt::format_to(ctx.out(), "{}", serializer.to_string()); + } +}; +} // namespace fmt diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.cpp index bf691443e3e..a48a97fe952 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.cpp @@ -44,6 +44,7 @@ #include #include +#include // static assert for rocprofiler_packet ABI compatibility static_assert(sizeof(hsa_ext_amd_aql_pm4_packet_t) == sizeof(hsa_kernel_dispatch_packet_t), @@ -411,7 +412,7 @@ WriteInterceptor(const void* packets, inserted_before = true; CHECK_NOTNULL(hsa::get_queue_controller()) ->serializer(&queue) - .rlock([&](const auto& serializer) { + .wlock([&](auto& serializer) { for(auto& s_pkt : serializer.kernel_dispatch(queue)) transformed_packets.emplace_back(s_pkt.ext_amd_aql_pm4); }); @@ -669,5 +670,58 @@ Queue::set_state(queue_state state) { _state = state; } + +std::string +Queue::to_string() const +{ + std::ostringstream oss; + oss << "Queue{"; + + // _notifiers + oss << "_notifiers: " << _notifiers.load() << ", "; + + // _active_async_packets + oss << "_active_async_packets: " << _active_async_packets.load() << ", "; + + // _callbacks (show count) + oss << "_callbacks: {"; + _callbacks.rlock([&](const auto& callbacks) { oss << "count: " << callbacks.size(); }); + oss << "}, "; + + // _intercept_queue (show ID if not null) + oss << "_intercept_queue: "; + if(_intercept_queue) + { + oss << "{id: " << _intercept_queue->id << "}"; + } + else + { + oss << "null"; + } + oss << ", "; + + // _state + oss << "_state: "; + switch(_state) + { + case queue_state::normal: oss << "normal"; break; + case queue_state::to_destroy: oss << "to_destroy"; break; + case queue_state::done_destroy: oss << "done_destroy"; break; + default: oss << "unknown"; break; + } + oss << ", "; + + // _active_kernels signal + oss << "_active_kernels: {handle: " << _active_kernels.handle << "}, "; + + // block_signal and ready_signal with their values + oss << "block_signal: {handle: " << block_signal.handle + << ", value: " << _core_api.hsa_signal_load_relaxed_fn(block_signal) << "}, "; + oss << "ready_signal: {handle: " << ready_signal.handle + << ", value: " << _core_api.hsa_signal_load_relaxed_fn(ready_signal) << "}"; + oss << "}"; + return oss.str(); +} + } // namespace hsa } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.hpp index e913d090e1b..b0f2200494d 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.hpp @@ -42,10 +42,13 @@ #include #include +#include + #include #include #include #include +#include #include namespace rocprofiler @@ -151,6 +154,9 @@ class Queue queue_state get_state() const; void set_state(queue_state state); + // Returns a string containing all class variable data + std::string to_string() const; + private: std::atomic _notifiers = {0}; std::atomic _active_async_packets = {0}; @@ -186,3 +192,23 @@ Queue::lock_queue(FuncT&& func) } } // namespace hsa } // namespace rocprofiler + +namespace fmt +{ +// fmt::format support for Queue +template <> +struct formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + auto format(rocprofiler::hsa::Queue const& queue, Ctx& ctx) const + { + return fmt::format_to(ctx.out(), "{}", queue.to_string()); + } +}; +} // namespace fmt diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp index 557cfd1dd56..e0251564c12 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp @@ -21,6 +21,7 @@ // THE SOFTWARE. #include "lib/rocprofiler-sdk/hsa/queue_controller.hpp" +#include "lib/common/logging.hpp" #include "lib/common/static_object.hpp" #include "lib/rocprofiler-sdk/agent.hpp" #include "lib/rocprofiler-sdk/context/context.hpp" @@ -29,6 +30,7 @@ #include #include +#include namespace rocprofiler { @@ -62,10 +64,6 @@ create_queue(hsa_agent_t agent, controller->get_core_table(), controller->get_ext_table(), queue); - - controller->serializer(new_queue.get()).wlock([&](auto& serializer) { - serializer.add_queue(queue, *new_queue); - }); controller->add_queue(*queue, std::move(new_queue)); return HSA_STATUS_SUCCESS; @@ -428,6 +426,80 @@ QueueController::get_supported_agents() return _supported_agents; } +std::string +QueueController::to_string() const +{ + std::ostringstream oss; + oss << "QueueController{"; + + // _queues (show count) + oss << "_queues: {"; + _queues.rlock([&](const auto& queues) { + oss << "count: " << queues.size(); + if(!queues.empty()) + { + oss << ", queue_ids: ["; + bool first = true; + for(const auto& [hsa_queue, queue_ptr] : queues) + { + if(!first) oss << ", "; + oss << "[" << (hsa_queue ? hsa_queue->id : 0) << fmt::format(",{}", *queue_ptr) + << "]"; + first = false; + } + oss << "]"; + } + }); + oss << "}, "; + + // _callback_cache (show count) + oss << "_callback_cache: {"; + _callback_cache.rlock([&](const auto& callbacks) { oss << "count: " << callbacks.size(); }); + oss << "}, "; + + // _supported_agents (show count and node_ids) + oss << "_supported_agents: {"; + oss << "count: " << _supported_agents.size(); + if(!_supported_agents.empty()) + { + oss << ", node_ids: ["; + bool first = true; + for(const auto& [node_id, agent_cache] : _supported_agents) + { + if(!first) oss << ", "; + oss << node_id; + first = false; + } + oss << "]"; + } + oss << "}, "; + + // _serialized_enabled + oss << "_serialized_enabled: " << (_serialized_enabled.load() ? "true" : "false") << ", "; + + // _profiler_serializer (show count) + oss << "_profiler_serializer: {"; + _profiler_serializer.rlock([&](const auto& serializers) { + for(const auto& [id, serializer] : serializers) + { + auto serializer_data = serializer->rlock([](const auto& s) { return s.to_string(); }); + oss << "[" << id.handle << ": " << serializer_data << "]"; + } + }); + oss << "}"; + +#if !defined(NDEBUG) + // _debug_signals (show count if in debug mode) + oss << ", _debug_signals: {"; + _debug_signals.rlock( + [&](const auto& debug_signals) { oss << "count: " << debug_signals.size(); }); + oss << "}"; +#endif + + oss << "}"; + return oss.str(); +} + QueueController* get_queue_controller() { @@ -482,3 +554,17 @@ queue_controller_fini() } } // namespace hsa } // namespace rocprofiler + +extern "C" void +rocprofiler_debug_print_queue_controller_state() +{ + auto* controller = rocprofiler::hsa::get_queue_controller(); + if(controller) + { + ROCP_ERROR << "QueueController Debug State: " << controller->to_string(); + } + else + { + ROCP_ERROR << "QueueController Debug State: controller is null"; + } +} diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue_controller.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue_controller.hpp index e07ec3a62a5..418746219ad 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue_controller.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue_controller.hpp @@ -28,9 +28,12 @@ #include #include +#include + #include #include #include +#include #include #include @@ -98,6 +101,9 @@ class QueueController // serialization related signals if not compiled in debug mode. void print_debug_signals() const; + // Returns a string containing all class variable data + std::string to_string() const; + #if !defined(NDEBUG) // Tracks the creation of all signals in queues, used for debugging and disabled // in release mode (adds locking around signal creation). @@ -139,3 +145,35 @@ void profiler_serializer_kernel_completion_signal(hsa_signal_t queue_block_signal); } // namespace hsa } // namespace rocprofiler + +#ifdef __cplusplus +extern "C" { +#endif + +// Debug function to print QueueController state to ROCP_ERROR log +void +rocprofiler_debug_print_queue_controller_state(); + +#ifdef __cplusplus +} +#endif + +namespace fmt +{ +// fmt::format support for QueueController +template <> +struct formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + auto format(rocprofiler::hsa::QueueController const& controller, Ctx& ctx) const + { + return fmt::format_to(ctx.out(), "{}", controller.to_string()); + } +}; +} // namespace fmt diff --git a/projects/rocprofiler-sdk/tests/bin/hsa-code-object/hsa_code_object_app.cpp b/projects/rocprofiler-sdk/tests/bin/hsa-code-object/hsa_code_object_app.cpp index a7645132fd7..3341e2d0d20 100644 --- a/projects/rocprofiler-sdk/tests/bin/hsa-code-object/hsa_code_object_app.cpp +++ b/projects/rocprofiler-sdk/tests/bin/hsa-code-object/hsa_code_object_app.cpp @@ -236,15 +236,15 @@ main() // Barrier on queue 1 waiting for signal_2 (from queue2's Kernel B) submit_barrier_packet(obj, queue1, completion_signal_2); + // Second dispatch packet on queue 1, Kernel C (waits for barrier above) + submit_kernel_packet(obj, queue1, copyC, args, completion_signal_3); + // Barrier on queue 2 waiting for signal_1 (from queue1's Kernel A) submit_barrier_packet(obj, queue2, completion_signal_1); // Kernel B on queue 2 (waits for barrier above) submit_kernel_packet(obj, queue2, copyB, args, completion_signal_2); - // Second dispatch packet on queue 1, Kernel C (waits for barrier above) - submit_kernel_packet(obj, queue1, copyC, args, completion_signal_3); - // Set up arguments for second batch args_memory->a = c; args_memory->b = d; diff --git a/projects/rocprofiler-sdk/tests/bin/openmp/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/openmp/CMakeLists.txt index 4b1e222d60b..b3f31129e53 100644 --- a/projects/rocprofiler-sdk/tests/bin/openmp/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/bin/openmp/CMakeLists.txt @@ -23,11 +23,8 @@ project(rocprofiler-sdk-tests-bin-openmp LANGUAGES CXX) find_package(rocprofiler-sdk REQUIRED) -set(DEFAULT_GPU_TARGETS "gfx906" "gfx908" "gfx90a" "gfx942" "gfx950" "gfx1100" "gfx1101" - "gfx1102") - set(OPENMP_GPU_TARGETS - "${DEFAULT_GPU_TARGETS}" + "${ROCPROFILER_GPU_TARGETS}" CACHE STRING "GPU targets to compile for") add_subdirectory(target) diff --git a/projects/rocprofiler-sdk/tests/common/CMakeLists.txt b/projects/rocprofiler-sdk/tests/common/CMakeLists.txt index 7e3c26b1d96..0912908b94e 100644 --- a/projects/rocprofiler-sdk/tests/common/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/common/CMakeLists.txt @@ -12,18 +12,7 @@ set(ROCPROFILER_DEFAULT_FAIL_REGEX "threw an exception|Permission denied|Could not create logging file|failed with error code|Subprocess aborted" CACHE INTERNAL "Default FAIL_REGULAR_EXPRESSION for tests") -set(DEFAULT_GPU_TARGETS - "gfx900" - "gfx906" - "gfx908" - "gfx90a" - "gfx942" - "gfx950" - "gfx1030" - "gfx1010" - "gfx1100" - "gfx1101" - "gfx1102") +set(DEFAULT_GPU_TARGETS ${ROCPROFILER_GPU_TARGETS}) set(GPU_TARGETS "${DEFAULT_GPU_TARGETS}"