diff --git a/projects/rocprofiler-sdk/samples/CMakeLists.txt b/projects/rocprofiler-sdk/samples/CMakeLists.txt index 1790a85cde1..a794a3f7003 100644 --- a/projects/rocprofiler-sdk/samples/CMakeLists.txt +++ b/projects/rocprofiler-sdk/samples/CMakeLists.txt @@ -48,3 +48,4 @@ add_subdirectory(external_correlation_id_request) add_subdirectory(pc_sampling) add_subdirectory(openmp_target) add_subdirectory(thread_trace) +add_subdirectory(spm_counter_collection) diff --git a/projects/rocprofiler-sdk/samples/spm_counter_collection/CMakeLists.txt b/projects/rocprofiler-sdk/samples/spm_counter_collection/CMakeLists.txt new file mode 100644 index 00000000000..c72853b8cad --- /dev/null +++ b/projects/rocprofiler-sdk/samples/spm_counter_collection/CMakeLists.txt @@ -0,0 +1,111 @@ +# +# +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +if(NOT CMAKE_HIP_COMPILER) + find_program( + amdclangpp_EXECUTABLE + NAMES amdclang++ + HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATH_SUFFIXES bin llvm/bin NO_CACHE) + mark_as_advanced(amdclangpp_EXECUTABLE) + + if(amdclangpp_EXECUTABLE) + set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}") + endif() +endif() + +project(rocprofiler-sdk-samples-spm-counter-collection LANGUAGES CXX HIP) + +foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) + if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") + set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}") + endif() +endforeach() + +if(NOT ROCPROFILER_MEMCHECK STREQUAL "") + set(SANITIZER True) +else() + set(SANITIZER False) +endif() + +find_package(rocprofiler-sdk REQUIRED) + +rocprofiler_sdk_spm_disabled(IS_SPM_DISABLED) +if(${SANITIZER}) + set(IS_SPM_DISABLED True) +endif() +set(IS_DISABLED False) + +add_library(spm-counter-collection-buffer-client SHARED) +target_sources(spm-counter-collection-buffer-client PRIVATE buffered_client.cpp + client.hpp) +target_link_libraries( + spm-counter-collection-buffer-client + PUBLIC rocprofiler-sdk::samples-build-flags + PRIVATE rocprofiler-sdk::rocprofiler-sdk rocprofiler-sdk::samples-common-library) + +set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP) +add_executable(spm-counter-collection-buffer) +target_sources(spm-counter-collection-buffer PRIVATE main.cpp) +target_link_libraries(spm-counter-collection-buffer + PRIVATE spm-counter-collection-buffer-client Threads::Threads) + +rocprofiler_samples_get_ld_library_path_env(LIBRARY_PATH_ENV) +rocprofiler_samples_get_preload_env(PRELOAD_ENV spm-counter-collection-buffer-client) + +set(SPM_ENV ROCPROFILER_SPM_BETA_ENABLED=True) +set(spm-counter-collection-buffer-env "${SPM_ENV}" "${PRELOAD_ENV}" "${LIBRARY_PATH_ENV}") + +add_test(NAME spm-counter-collection-buffer + COMMAND $) + +set_tests_properties( + spm-counter-collection-buffer + PROPERTIES TIMEOUT + 120 + LABELS + "samples" + ENVIRONMENT + "${spm-counter-collection-buffer-env}" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + DISABLED + ${IS_DISABLED}) + +add_library(spm-counter-collection-callback-client SHARED) +target_sources(spm-counter-collection-callback-client PRIVATE callback_client.cpp + client.hpp) +target_link_libraries( + spm-counter-collection-callback-client + PUBLIC rocprofiler-sdk::samples-build-flags + PRIVATE rocprofiler-sdk::rocprofiler-sdk rocprofiler-sdk::samples-common-library) + +set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP) +add_executable(spm-counter-collection-callback) +target_sources(spm-counter-collection-callback PRIVATE main.cpp) +target_link_libraries(spm-counter-collection-callback + PRIVATE spm-counter-collection-callback-client Threads::Threads) + +rocprofiler_samples_get_preload_env(PRELOAD_ENV spm-counter-collection-callback-client) + +set(spm-counter-collection-callback-env "${SPM_ENV}" "${PRELOAD_ENV}" + "${LIBRARY_PATH_ENV}") + +add_test(NAME spm-counter-collection-callback + COMMAND $) + +set_tests_properties( + spm-counter-collection-callback + PROPERTIES TIMEOUT + 120 + LABELS + "samples" + ENVIRONMENT + "${spm-counter-collection-callback-env}" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + DISABLED + ${IS_DISABLED}) diff --git a/projects/rocprofiler-sdk/samples/spm_counter_collection/buffered_client.cpp b/projects/rocprofiler-sdk/samples/spm_counter_collection/buffered_client.cpp new file mode 100644 index 00000000000..6dec0e9edd9 --- /dev/null +++ b/projects/rocprofiler-sdk/samples/spm_counter_collection/buffered_client.cpp @@ -0,0 +1,451 @@ +// MIT License +// +// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "client.hpp" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ROCPROFILER_CALL(result, msg) \ + { \ + rocprofiler_status_t CHECKSTATUS = result; \ + if(CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) \ + { \ + std::string status_msg = rocprofiler_get_status_string(CHECKSTATUS); \ + std::cerr << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg \ + << " failed with error code " << CHECKSTATUS << ": " << status_msg \ + << std::endl; \ + std::stringstream errmsg{}; \ + errmsg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg " failure (" \ + << status_msg << ")"; \ + throw std::runtime_error(errmsg.str()); \ + } \ + } + +int +start() +{ + return 1; +} + +namespace +{ +rocprofiler_context_id_t& +get_client_ctx() +{ + static rocprofiler_context_id_t ctx{0}; + return ctx; +} + +rocprofiler_buffer_id_t& +get_buffer() +{ + static rocprofiler_buffer_id_t buf = {}; + return buf; +} + +std::unordered_map>** +dimension_cache() +{ + static std::unordered_map>* + cache; + return &cache; +} + +/** + * For a given counter, query the dimensions that it has. Typically you will + * want to call this function once to get the dimensions and cache them. + */ +std::vector +counter_dimensions(rocprofiler_counter_id_t counter) +{ + if(*dimension_cache() == nullptr) return {}; + + if((*dimension_cache())->count(counter.handle) > 0) + { + return (*dimension_cache())->at(counter.handle); + } + + return {}; +} + +void +fill_dimension_cache(rocprofiler_counter_id_t counter) +{ + assert(*dimension_cache() != nullptr); + std::vector dims; + rocprofiler_counter_info_v1_t info; + ROCPROFILER_CALL(rocprofiler_query_counter_info( + counter, ROCPROFILER_COUNTER_INFO_VERSION_1, static_cast(&info)), + "Could not query info for counter"); + + (*dimension_cache()) + ->emplace(counter.handle, + std::vector{ + *info.dimensions, *info.dimensions + info.dimensions_count}); +} + +/** + * buffered_callback (set in rocprofiler_create_buffer in tool_init) is called when the + * buffer is full (or when the buffer is flushed). The callback is responsible for processing + * the records in the buffer. The records are returned in the headers array. The headers + * can contain counter records as well as other records (such as tracing). These + * records need to be filtered based on the category type. For counter collection, + * they should be filtered by category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS. + */ +void +buffered_callback(rocprofiler_context_id_t, + rocprofiler_buffer_id_t, + rocprofiler_record_header_t** headers, + size_t num_headers, + void* user_data, + uint64_t) +{ + std::stringstream ss; + // Iterate through the returned records + for(size_t i = 0; i < num_headers; ++i) + { + auto* header = headers[i]; + if(header->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS && + header->kind == ROCPROFILER_COUNTER_RECORD_PROFILE_COUNTING_DISPATCH_HEADER) + { + // Print the returned counter data. + auto* record = + static_cast(header->payload); + ss << "[Dispatch_Id: " << record->dispatch_info.dispatch_id + << " Kernel_ID: " << record->dispatch_info.kernel_id + << " Corr_Id: " << record->correlation_id.internal << ")]\n"; + } + else if(header->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS && + header->kind == ROCPROFILER_COUNTER_RECORD_VALUE) + { + // Print the returned counter data. + auto* record = static_cast(header->payload); + rocprofiler_counter_id_t counter_id = {.handle = 0}; + + rocprofiler_query_record_counter_id(record->id, &counter_id); + + ss << " (Dispatch_Id: " << record->dispatch_id << " Counter_Id: " << counter_id.handle + << " Record_Id: " << record->id << " Dimensions: ["; + + for(auto& dim : counter_dimensions(counter_id)) + { + size_t pos = 0; + rocprofiler_query_record_dimension_position(record->id, dim.id, &pos); + ss << "{" << dim.name << ": " << pos << "},"; + } + ss << "] Value [D]: " << record->value << "),"; + } + } + + auto* output_stream = static_cast(user_data); + if(!output_stream) throw std::runtime_error{"nullptr to output stream"}; + + *output_stream << "[" << __FUNCTION__ << "] " << ss.str() << "\n"; +} + +/** + * Cache to store the profile configs for each agent. This is used to prevent + * constructing the same profile config multiple times. Used by dispatch_callback + * to select the profile config (and in turn counters) to use when a kernel dispatch + * is received. + */ +std::unordered_map& +get_profile_cache() +{ + static std::unordered_map profile_cache; + return profile_cache; +} + +/** + * Callback from rocprofiler when an kernel dispatch is enqueued into the HSA queue. + * rocprofiler_counter_config_id_t* is a return to specify what counters to collect + * for this dispatch (dispatch_packet). This example function creates a profile + * to collect the counter SQ_WAVES for all kernel dispatch packets. + */ +void +dispatch_callback(const rocprofiler_spm_dispatch_counting_service_data_t* dispatch_data, + rocprofiler_counter_config_id_t* config, + rocprofiler_user_data_t* /* user_data*/, + void* /* callback_data_args*/) +{ + /** + * This simple example uses the same profile counter set for all agents. + * We store this in a cache to prevent constructing many identical profile counter + * sets. + */ + auto search_cache = [&]() { + if(auto pos = get_profile_cache().find(dispatch_data->dispatch_info.agent_id.handle); + pos != get_profile_cache().end()) + { + *config = pos->second; + return true; + } + return false; + }; + + if(!search_cache()) + { + std::cerr << "No profile for agent found in cache\n"; + exit(-1); + } +} + +/** + * Construct a profile config for an agent. This function takes an agent (obtained from + * get_gpu_device_agents()) and a set of counter names to collect. It returns a profile + * that can be used when a dispatch is received for the agent to collect the specified + * counters. Note: while you can dynamically create these profiles, it is more efficient + * to consturct them once in advance (i.e. in tool_init()) since there are non-trivial + * costs associated with constructing the profile. + */ +rocprofiler_counter_config_id_t +build_profile_for_agent(rocprofiler_agent_id_t agent, + const std::set& counters_to_collect) +{ + std::vector gpu_counters; + + // Iterate all the counters on the agent and store them in gpu_counters. + ROCPROFILER_CALL(rocprofiler_iterate_spm_supported_counters( + agent, + [](rocprofiler_agent_id_t, + rocprofiler_counter_id_t* counters, + size_t num_counters, + void* user_data) { + std::vector* vec = + static_cast*>(user_data); + for(size_t i = 0; i < num_counters; i++) + { + vec->push_back(counters[i]); + } + return ROCPROFILER_STATUS_SUCCESS; + }, + static_cast(&gpu_counters)), + "Could not fetch supported counters"); + + // Find the counters we actually want to collect (i.e. those in counters_to_collect) + std::vector collect_counters; + for(auto& counter : gpu_counters) + { + rocprofiler_counter_info_v0_t info; + ROCPROFILER_CALL( + rocprofiler_query_counter_info( + counter, ROCPROFILER_COUNTER_INFO_VERSION_0, static_cast(&info)), + "Could not query info for counter"); + if(counters_to_collect.count(std::string(info.name)) > 0) + { + std::clog << "Counter: " << counter.handle << " " << info.name << "\n"; + collect_counters.push_back(counter); + fill_dimension_cache(counter); + } + } + + // Create and return the profile + rocprofiler_counter_config_id_t profile = {.handle = 0}; + auto params = rocprofiler_spm_configuration_t{}; + params.frequency = 1.0; + params.buffer_size = 32768; + params.timeout = 0; + ROCPROFILER_CALL( + rocprofiler_spm_create_counter_config( + agent, collect_counters.data(), collect_counters.size(), ¶ms, &profile), + "Could not construct profile cfg"); + + return profile; +} + +/** + * Returns all GPU agents visible to rocprofiler on the system + */ +std::vector +get_gpu_device_agents() +{ + std::vector agents; + + // Callback used by rocprofiler_query_available_agents to return + // agents on the device. This can include CPU agents as well. We + // select GPU agents only (i.e. type == ROCPROFILER_AGENT_TYPE_GPU) + rocprofiler_query_available_agents_cb_t iterate_cb = [](rocprofiler_agent_version_t agents_ver, + const void** agents_arr, + size_t num_agents, + void* udata) { + if(agents_ver != ROCPROFILER_AGENT_INFO_VERSION_0) + throw std::runtime_error{"unexpected rocprofiler agent version"}; + auto* agents_v = static_cast*>(udata); + for(size_t i = 0; i < num_agents; ++i) + { + const auto* agent = static_cast(agents_arr[i]); + if(agent->type == ROCPROFILER_AGENT_TYPE_GPU) agents_v->emplace_back(*agent); + } + return ROCPROFILER_STATUS_SUCCESS; + }; + + // Query the agents, only a single callback is made that contains a vector + // of all agents. + ROCPROFILER_CALL( + rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0, + iterate_cb, + sizeof(rocprofiler_agent_t), + const_cast(static_cast(&agents))), + "query available agents"); + return agents; +} + +/** + * Initialize the tool. This function is called once when the tool is loaded. + * The function is responsible for creating the context, buffer, profile configs + * (details counters to collect on each agent), configuring the dispatch profile + * counting service, and starting the context. + */ +int +tool_init(rocprofiler_client_finalize_t, void* user_data) +{ + ROCPROFILER_CALL(rocprofiler_create_context(&get_client_ctx()), "context creation failed"); + ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), + 4096, + 2048, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + buffered_callback, + user_data, + &get_buffer()), + "buffer creation failed"); + + // Get a vector of all GPU devices on the system. + auto agents = get_gpu_device_agents(); + + if(agents.empty()) + { + std::cerr << "No agents found" << std::endl; + return 1; + } + + // Construct the profiles in advance for each agent that is a GPU + for(const auto& agent : agents) + { + // get_profile_cache() is a map that can be accessed by dispatch_callback + // below to select the profile config to use when a kernel dispatch is + // recieved. + get_profile_cache().emplace( + agent.id.handle, build_profile_for_agent(agent.id, std::set{"TCC_HIT"})); + } + + auto client_thread = rocprofiler_callback_thread_t{}; + // Create the callback thread + ROCPROFILER_CALL(rocprofiler_create_callback_thread(&client_thread), + "failure creating callback thread"); + // Create the buffer and assign the callback thread to the buffer, when the buffer is full + // a callback will be issued (to client_thread) + ROCPROFILER_CALL(rocprofiler_assign_callback_thread(get_buffer(), client_thread), + "failed to assign thread for buffer"); + + // Setup the dispatch profile counting service. This service will trigger the dispatch_callback + // when a kernel dispatch is enqueued into the HSA queue. The callback will specify what + // counters to collect by returning a profile config id. In this example, we create the profile + // configs above and store them in the map get_profile_cache() so we can look them up at + // dispatch. + ROCPROFILER_CALL(rocprofiler_configure_buffer_spm_dispatch_service( + get_client_ctx(), get_buffer(), dispatch_callback, nullptr), + "Could not setup buffered service"); + + // Start the context (start intercepting kernel dispatches). + ROCPROFILER_CALL(rocprofiler_start_context(get_client_ctx()), "start context"); + + // no errors + return 0; +} + +void +tool_fini(void* user_data) +{ + std::clog << "In tool fini\n"; + + // Flush the buffer and stop the context + ROCPROFILER_CALL(rocprofiler_flush_buffer(get_buffer()), "buffer flush"); + rocprofiler_stop_context(get_client_ctx()); + + auto* output_stream = static_cast(user_data); + *output_stream << std::flush; + if(output_stream != &std::cout && output_stream != &std::cerr) delete output_stream; + + auto* tmp_ptr = *dimension_cache(); + *dimension_cache() = nullptr; + delete tmp_ptr; +} +} // namespace + +extern "C" rocprofiler_tool_configure_result_t* +rocprofiler_configure(uint32_t version, + const char* runtime_version, + uint32_t priority, + rocprofiler_client_id_t* id) +{ + // set the client name + id->name = "SPMCounterClientSample"; + + // compute major/minor/patch version info + uint32_t major = version / 10000; + uint32_t minor = (version % 10000) / 100; + uint32_t patch = version % 100; + + // generate info string + auto info = std::stringstream{}; + info << id->name << " (priority=" << priority << ") is using rocprofiler-sdk v" << major << "." + << minor << "." << patch << " (" << runtime_version << ")"; + + std::clog << info.str() << std::endl; + + std::ostream* output_stream = nullptr; + std::string filename = "spm_buffer_dispatch_counter_collection.log"; + if(auto* outfile = getenv("ROCPROFILER_SAMPLE_OUTPUT_FILE"); outfile) filename = outfile; + if(filename == "stdout") + output_stream = &std::cout; + else if(filename == "stderr") + output_stream = &std::cerr; + else + output_stream = new std::ofstream{filename}; + + // create configure data + static auto cfg = + rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t), + &tool_init, + &tool_fini, + static_cast(output_stream)}; + + *dimension_cache() = + new std::unordered_map>(); + + // return pointer to configure data + return &cfg; +} diff --git a/projects/rocprofiler-sdk/samples/spm_counter_collection/callback_client.cpp b/projects/rocprofiler-sdk/samples/spm_counter_collection/callback_client.cpp new file mode 100644 index 00000000000..d28e96b2044 --- /dev/null +++ b/projects/rocprofiler-sdk/samples/spm_counter_collection/callback_client.cpp @@ -0,0 +1,270 @@ +// MIT License +// +// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "client.hpp" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ROCPROFILER_CALL(result, msg) \ + { \ + rocprofiler_status_t CHECKSTATUS = result; \ + if(CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) \ + { \ + std::string status_msg = rocprofiler_get_status_string(CHECKSTATUS); \ + std::cerr << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg \ + << " failed with error code " << CHECKSTATUS << ": " << status_msg \ + << std::endl; \ + std::stringstream errmsg{}; \ + errmsg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg " failure (" \ + << status_msg << ")"; \ + throw std::runtime_error(errmsg.str()); \ + } \ + } + +int +start() +{ + return 1; +} + +namespace +{ +struct tool_data_t +{ + std::mutex mut{}; + std::ostream* output_stream{nullptr}; +}; + +rocprofiler_context_id_t& +get_client_ctx() +{ + static rocprofiler_context_id_t ctx{0}; + return ctx; +} + +void +record_callback(const rocprofiler_spm_dispatch_counting_service_data_t* dispatch_data, + const rocprofiler_spm_counter_record_t** records, + size_t record_count, + int /* flags*/, + rocprofiler_user_data_t /* user_data*/, + void* callback_data_args) +{ + std::stringstream ss; + ss << "Dispatch_Id=" << dispatch_data->dispatch_info.dispatch_id + << ", Kernel_id=" << dispatch_data->dispatch_info.kernel_id + << ", Corr_Id=" << dispatch_data->correlation_id.internal << ": "; + for(size_t i = 0; i < record_count; ++i) + ss << "(Id: " << records[i]->id << " Value [D]: " << records[i]->value << "),"; + + auto* tool = static_cast(callback_data_args); + if(!tool || !tool->output_stream) throw std::runtime_error{"nullptr to output stream"}; + + auto _lk = std::unique_lock{tool->mut}; + *tool->output_stream << "[" << __FUNCTION__ << "] " << ss.str() << "\n"; +} + +/** + * Callback from rocprofiler when an kernel dispatch is enqueued into the HSA queue. + * rocprofiler_counter_spm_config_id_t* is a return to specify what counters to collect + * for this dispatch (dispatch_packet). This example function creates a profile + * to collect the counter SQ_WAVES for all kernel dispatch packets. + */ +void +dispatch_callback(const rocprofiler_spm_dispatch_counting_service_data_t* dispatch_data, + rocprofiler_counter_config_id_t* config, + rocprofiler_user_data_t* /* user_data*/, + void* /*callback_data_args*/) +{ + /** + * This simple example uses the same profile counter set for all agents. + * We store this in a cache to prevent constructing many identical profile counter + * sets. We first check the cache to see if we have already constructed a counter" + * set for the agent. If we have, return it. Otherwise, construct a new profile counter + * set. + */ + static std::shared_mutex m_mutex = {}; + static std::unordered_map profile_cache = {}; + + auto search_cache = [&]() { + if(auto pos = profile_cache.find(dispatch_data->dispatch_info.agent_id.handle); + pos != profile_cache.end()) + { + *config = pos->second; + return true; + } + return false; + }; + + { + auto rlock = std::shared_lock{m_mutex}; + if(search_cache()) return; + } + + auto wlock = std::unique_lock{m_mutex}; + if(search_cache()) return; + + // Counters we want to collect (here its SQ_WAVES) + std::set counters_to_collect = {"SQ_WAVES"}; + // GPU Counter IDs + std::vector gpu_counters; + + // Iterate through the agents and get the counters available on that agent + ROCPROFILER_CALL(rocprofiler_iterate_spm_supported_counters( + dispatch_data->dispatch_info.agent_id, + [](rocprofiler_agent_id_t, + rocprofiler_counter_id_t* counters, + size_t num_counters, + void* user_data) { + std::vector* vec = + static_cast*>(user_data); + for(size_t i = 0; i < num_counters; i++) + { + vec->push_back(counters[i]); + } + return ROCPROFILER_STATUS_SUCCESS; + }, + static_cast(&gpu_counters)), + "Could not fetch supported counters"); + + std::vector collect_counters; + // Look for the counters contained in counters_to_collect in gpu_counters + for(auto& counter : gpu_counters) + { + rocprofiler_counter_info_v0_t info; + ROCPROFILER_CALL( + rocprofiler_query_counter_info( + counter, ROCPROFILER_COUNTER_INFO_VERSION_0, static_cast(&info)), + "Could not query info"); + if(counters_to_collect.count(std::string(info.name)) > 0) + { + std::clog << "Counter: " << counter.handle << " " << info.name << "\n"; + collect_counters.push_back(counter); + } + } + + // Create a colleciton profile for the counters + rocprofiler_counter_config_id_t profile = {.handle = 0}; + auto params = rocprofiler_spm_configuration_t{}; + params.frequency = 1.0; + params.buffer_size = 32768; + params.timeout = 0; + ROCPROFILER_CALL(rocprofiler_spm_create_counter_config(dispatch_data->dispatch_info.agent_id, + collect_counters.data(), + collect_counters.size(), + ¶ms, + &profile), + "Could not construct profile cfg"); + + profile_cache.emplace(dispatch_data->dispatch_info.agent_id.handle, profile); + // Return the profile to collect those counters for this dispatch + *config = profile; +} + +int +tool_init(rocprofiler_client_finalize_t, void* user_data) +{ + ROCPROFILER_CALL(rocprofiler_create_context(&get_client_ctx()), "context creation failed"); + + ROCPROFILER_CALL(rocprofiler_configure_callback_spm_dispatch_service( + get_client_ctx(), dispatch_callback, nullptr, record_callback, user_data), + "Could not setup counting service"); + ROCPROFILER_CALL(rocprofiler_start_context(get_client_ctx()), "start context"); + + // no errors + return 0; +} + +void +tool_fini(void* user_data) +{ + assert(user_data); + std::clog << "In tool fini\n"; + rocprofiler_stop_context(get_client_ctx()); + auto* tool_data = static_cast(user_data); + + { + auto _lk = std::unique_lock{tool_data->mut}; + auto* output_stream = tool_data->output_stream; + + *output_stream << std::flush; + if(output_stream != &std::cout && output_stream != &std::cerr) delete output_stream; + } + + delete tool_data; +} +} // namespace + +extern "C" rocprofiler_tool_configure_result_t* +rocprofiler_configure(uint32_t version, + const char* runtime_version, + uint32_t priority, + rocprofiler_client_id_t* id) +{ + // set the client name + id->name = "SPMCounterClientSample"; + + // compute major/minor/patch version info + uint32_t major = version / 10000; + uint32_t minor = (version % 10000) / 100; + uint32_t patch = version % 100; + + // generate info string + auto info = std::stringstream{}; + info << id->name << " (priority=" << priority << ") is using rocprofiler-sdk v" << major << "." + << minor << "." << patch << " (" << runtime_version << ")"; + + std::clog << info.str() << std::endl; + + auto* tool_data = new tool_data_t{}; + + std::string filename = "spm_callback_dispatch_counter_collection.log"; + if(auto* outfile = getenv("ROCPROFILER_SAMPLE_OUTPUT_FILE"); outfile) filename = outfile; + if(filename == "stdout") + tool_data->output_stream = &std::cout; + else if(filename == "stderr") + tool_data->output_stream = &std::cerr; + else + tool_data->output_stream = new std::ofstream{filename}; + + // create configure data + static auto cfg = + rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t), + &tool_init, + &tool_fini, + static_cast(tool_data)}; + + // return pointer to configure data + return &cfg; +} diff --git a/projects/rocprofiler-sdk/samples/spm_counter_collection/client.hpp b/projects/rocprofiler-sdk/samples/spm_counter_collection/client.hpp new file mode 100644 index 00000000000..debc238b45e --- /dev/null +++ b/projects/rocprofiler-sdk/samples/spm_counter_collection/client.hpp @@ -0,0 +1,30 @@ +// MIT License +// +// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include + +#define CLIENT_API __attribute__((visibility("default"))) + +int +start() CLIENT_API; diff --git a/projects/rocprofiler-sdk/samples/spm_counter_collection/main.cpp b/projects/rocprofiler-sdk/samples/spm_counter_collection/main.cpp new file mode 100644 index 00000000000..00a6e216a23 --- /dev/null +++ b/projects/rocprofiler-sdk/samples/spm_counter_collection/main.cpp @@ -0,0 +1,159 @@ +// MIT License +// +// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include + +#include +#include "client.hpp" + +#define HIP_CALL(call) \ + do \ + { \ + hipError_t err = call; \ + if(err != hipSuccess) \ + { \ + fprintf(stderr, "%s\n", hipGetErrorString(err)); \ + abort(); \ + } \ + } while(0) + +__global__ void +kernelA(int x, int y) +{ + x = x + y; +} + +__global__ void +kernelB(int x, int y) +{ + x = x + y; +} + +template +__global__ void +kernelC(T* C_d, const T* A_d, size_t N) +{ + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + for(size_t i = offset; i < N; i += stride) + { + C_d[i] = A_d[i] * A_d[i]; + } +} + +void +launchKernels(const long NUM_LAUNCH, const long SYNC_INTERVAL, const int DEV_ID) +{ + // Normal HIP Calls + HIP_CALL(hipSetDevice(DEV_ID)); + [[maybe_unused]] hipDeviceProp_t devProp; + HIP_CALL(hipGetDeviceProperties(&devProp, DEV_ID)); + + int* gpuMem = nullptr; + HIP_CALL(hipMalloc((void**) &gpuMem, 1 * sizeof(int))); + + for(long i = 0; i < NUM_LAUNCH; i++) + { + // KernelA and KernelB to be profiled as part of the session + hipLaunchKernelGGL(kernelA, dim3(1), dim3(1), 0, 0, 1, 2); + hipLaunchKernelGGL(kernelB, dim3(1), dim3(1), 0, 0, 1, 2); + if(i % SYNC_INTERVAL == (SYNC_INTERVAL - 1)) HIP_CALL(hipDeviceSynchronize()); + } + + const int NElems = 512 * 512; + const int Nbytes = NElems * sizeof(int); + int * A_d, *C_d; + int A_h[NElems], C_h[NElems]; + + for(int i = 0; i < NElems; i++) + { + A_h[i] = i; + } + + HIP_CALL(hipDeviceSynchronize()); + + HIP_CALL(hipMalloc(&A_d, Nbytes)); + HIP_CALL(hipMalloc(&C_d, Nbytes)); + HIP_CALL(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); + HIP_CALL(hipDeviceSynchronize()); + const unsigned blocks = 512; + const unsigned threadsPerBlock = 256; + for(long i = 0; i < NUM_LAUNCH; i++) + { + hipLaunchKernelGGL(kernelC, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, NElems); + if(i % SYNC_INTERVAL == (SYNC_INTERVAL - 1)) HIP_CALL(hipDeviceSynchronize()); + } + HIP_CALL(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + HIP_CALL(hipDeviceSynchronize()); + HIP_CALL(hipFree(gpuMem)); + HIP_CALL(hipFree(A_d)); + HIP_CALL(hipFree(C_d)); + HIP_CALL(hipDeviceReset()); +} + +int +main(int argc, char** argv) +{ + auto* exe_name = basename(argv[0]); + + int ntotdevice = 0; + HIP_CALL(hipGetDeviceCount(&ntotdevice)); + + long nitr = 50; + long nsync = 50; + long ndevice = 0; + + for(int i = 1; i < argc; ++i) + { + auto _arg = std::string{argv[i]}; + if(_arg == "?" || _arg == "-h" || _arg == "--help") + { + fprintf(stderr, + "usage: %s [NUM_ITERATION (%li)] [SYNC_EVERY_N_ITERATIONS (%li)] " + "[NUMBER_OF_DEVICES (%li)]\n\n\tBy default, 0 for the number of devices means " + "use all device available", + exe_name, + nitr, + nsync, + ndevice); + exit(EXIT_SUCCESS); + } + } + + if(argc > 1) nitr = atol(argv[1]); + if(argc > 2) nsync = atoll(argv[2]); + if(argc > 3) ndevice = atol(argv[3]); + + if(ndevice > ntotdevice) ndevice = ntotdevice; + if(ndevice < 1) ndevice = ntotdevice; + + printf("[%s] Number of devices used: %li\n", exe_name, ndevice); + printf("[%s] Number of iterations: %li\n", exe_name, nitr); + printf("[%s] Syncing every %li iterations\n", exe_name, nsync); + std::cout << std::flush; + + start(); + for(long devid = 0; devid < ndevice; ++devid) + launchKernels(nitr, nsync, devid); + + std::cerr << "Run complete\n" << std::flush; +} diff --git a/projects/rocprofiler-sdk/source/bin/rocprofv3-avail.py b/projects/rocprofiler-sdk/source/bin/rocprofv3-avail.py index d57d20c6ebf..f4eca3e940f 100755 --- a/projects/rocprofiler-sdk/source/bin/rocprofv3-avail.py +++ b/projects/rocprofiler-sdk/source/bin/rocprofv3-avail.py @@ -82,6 +82,9 @@ def add_list_options(subparsers): "list", help="List options for hw counters, agents and pc-sampling support" ) add_parser_bool_argument(list_command, "--pmc", help="List counters") + add_parser_bool_argument( + list_command, "--spm", help="List counters supporting SPM" + ) add_parser_bool_argument( list_command, "--agent", help="List basic info of agents" ) @@ -97,6 +100,7 @@ def add_info_options(subparsers): ) info_command.add_argument("--pmc", nargs="*", help="PMC info") + info_command.add_argument("--spm", nargs="*", help="SPM info") info_command.add_argument( "--pc-sampling", nargs="*", help="Detailed PC Sampling info" ) @@ -198,6 +202,10 @@ def list_basic_agent(args, list_counters): from rocprofv3 import avail def print_agent_counter(counters): + if len(counters) == 0: + msg = "No PMC counters supported" + print("{:30}\n".format(msg)) + return names_len = [len(counter.name) for counter in counters] names = [ "{name:{width}}".format(name=counter.name, width=max(names_len)) @@ -271,10 +279,57 @@ def info_pc_sampling(args): print("\n") -def listing(args): +def listing_spm(args): + from rocprofv3 import avail + + def print_agent_counter(counters): + if len(counters) == 0: + print("{:30}\n".format("No SPM counters supported")) + return + names_len = [len(counter.name) for counter in counters] + names = [ + "{name:{width}}".format(name=counter.name, width=max(names_len)) + for counter in counters + ] + columns = get_number_columns(max(names_len)) + print("{:30}:\n".format("SPM")) + for idx in range(0, len(names), columns): + print("{:30}".format(" ".join(names[idx : (idx + columns)]))) + + agent_spm_counters = avail.get_spm_counters() + agent_info_map = avail.get_agent_info_map() + + for agent, info in dict(sorted(agent_info_map.items())).items(): + if ( + info["type"] == 2 + and args.device is not None + and info["logical_node_type_id"] == args.device + ): + print( + "{:30}:\t{}\n{:30}:\t{}".format( + "GPU", info["logical_node_type_id"], "Name", info["name"] + ) + ) + print_agent_counter(agent_spm_counters[agent]) + print("\n") + break + elif info["type"] == 2 and args.device is None: + print( + "{:30}:\t{}\n{:30}:\t{}".format( + "GPU", info["logical_node_type_id"], "Name", info["name"] + ) + ) + print_agent_counter(agent_spm_counters[agent]) + print("\n") + + +def listing_pmc(args): from rocprofv3 import avail def print_agent_counter(counters): + if len(counters) == 0: + print("{:30}\n".format("No PMC counters supported")) + return names_len = [len(counter.name) for counter in counters] names = [ "{name:{width}}".format(name=counter.name, width=max(names_len)) @@ -312,24 +367,65 @@ def print_agent_counter(counters): print("\n") +def info_spm(args): + from rocprofv3 import avail + + agent_info_map = avail.get_agent_info_map() + + def print_spm_info(_args, agent): + + if not _args.spm: + spm_counters = avail.get_spm_counters()[agent] + for counter in spm_counters: + print(counter) + print("\n") + else: + spm_counters = avail.get_spm_counters()[agent] + for spm in _args.spm: + for counter in spm_counters: + if spm == counter.get_as_dict()["Counter_Name"]: + print(counter) + + for agent, info in dict(sorted(agent_info_map.items())).items(): + if ( + info["type"] == 2 + and args.device is not None + and info["logical_node_type_id"] == args.device + ): + print( + "{}:{}\n{}:{}".format( + "GPU", info["logical_node_type_id"], "Name", info["name"] + ) + ) + print_spm_info(args, agent) + break + elif info["type"] == 2 and args.device is None: + print( + "{}:{}\n{}:{}".format( + "GPU", info["logical_node_type_id"], "Name", info["name"] + ) + ) + print_spm_info(args, agent) + + def info_pmc(args): from rocprofv3 import avail - agent_counters = avail.get_counters() agent_info_map = avail.get_agent_info_map() - def print_pmc_info(args, pmc_counters): + def print_pmc_info(_args, agent): - if not args.pmc: + if not _args.pmc: + pmc_counters = avail.get_counters()[agent] for counter in pmc_counters: print(counter) print("\n") else: - for pmc in args.pmc: + pmc_counters = avail.get_counters()[agent] + for pmc in _args.pmc: for counter in pmc_counters: if pmc == counter.get_as_dict()["Counter_Name"]: print(counter) - print("\n") for agent, info in dict(sorted(agent_info_map.items())).items(): if ( @@ -342,7 +438,7 @@ def print_pmc_info(args, pmc_counters): "GPU", info["logical_node_type_id"], "Name", info["name"] ) ) - print_pmc_info(args, agent_counters[agent]) + print_pmc_info(args, agent) break elif info["type"] == 2 and args.device is None: print( @@ -350,26 +446,36 @@ def print_pmc_info(args, pmc_counters): "GPU", info["logical_node_type_id"], "Name", info["name"] ) ) - print_pmc_info(args, agent_counters[agent]) + print_pmc_info(args, agent) def process_info(args): - if args.pmc is None and args.pc_sampling is None: + + if args.pmc is None and args.pc_sampling is None and args.spm is None: list_basic_agent(args, True) if args.pmc is not None: info_pmc(args) + if args.spm is not None: + info_spm(args) if args.pc_sampling is not None: os.environ["ROCPROFILER_PC_SAMPLING_BETA_ENABLED"] = "on" info_pc_sampling(args) def process_list(args): - if args.agent is None and args.pc_sampling is None and args.pmc is None: - listing(args) + if ( + args.agent is None + and args.pc_sampling is None + and args.pmc is None + and args.spm is None + ): + listing_pmc(args) if args.agent: list_basic_agent(args, False) if args.pmc: - listing(args) + listing_pmc(args) + if args.spm: + listing_spm(args) if args.pc_sampling: os.environ["ROCPROFILER_PC_SAMPLING_BETA_ENABLED"] = "on" list_pc_sampling(args) diff --git a/projects/rocprofiler-sdk/source/bin/rocprofv3.py b/projects/rocprofiler-sdk/source/bin/rocprofv3.py index 42158285ac5..9dfc0f57335 100755 --- a/projects/rocprofiler-sdk/source/bin/rocprofv3.py +++ b/projects/rocprofiler-sdk/source/bin/rocprofv3.py @@ -470,6 +470,44 @@ def add_parser_bool_argument(gparser, *args, **kwargs): action="append", ) + spm_options = parser.add_argument_group("Streaming Performance Monitor(SPM) options") + + add_parser_bool_argument( + spm_options, + "--spm-beta-enabled", + help="enable SPM; beta version", + ) + + spm_options.add_argument( + "--spm", + help=( + "Specify SPM events to collect(comma OR space separated in case of more than 1 counters). " + "Note: job will fail if entire set of counters cannot be collected in single pass" + ), + default=None, + nargs="*", + ) + + spm_options.add_argument( + "--spm-buffer-size", + help="SPM Buffer size in kilobytes. Default value is set to 32768 KB in tool", + default=None, + type=str, + ) + + spm_options.add_argument( + "--spm-timeout", + help="Timeout for SPM, in ms. Default value is set to 0 ms in tool", + default=None, + type=int, + ) + + spm_options.add_argument( + "--spm-frequency", + help="Frequency in Ghz. This is estimated to shader clock count. Default is set to 0.5GHz in tool.", + default=None, + type=str, + ) pc_sampling_options = parser.add_argument_group("PC sampling options") add_parser_bool_argument( @@ -1143,6 +1181,20 @@ def is_filtered(key): return patch_args(dotdict(data)) +def int_auto(num_str): + if isinstance(num_str, str): + if "0x" in num_str: + return int(num_str, 16) + else: + return int(num_str, 10) + elif isinstance(num_str, int): + return num_str + else: + raise ValueError( + f"{type(num_str)} is not supported. {num_str} should be of type integer or string." + ) + + def run(app_args, args, **kwargs): app_env = dict(os.environ) @@ -1740,6 +1792,48 @@ def log_config(_env): update_env("ROCPROF_PC_SAMPLING_METHOD", args.pc_sampling_method) update_env("ROCPROF_PC_SAMPLING_INTERVAL", args.pc_sampling_interval) + if args.spm or args.spm_buffer_size or args.spm_timeout or args.spm_frequency: + + if ( + not args.spm_beta_enabled + and os.environ.get("ROCPROFILER_SPM_BETA_ENABLED", None) is None + ): + fatal_error( + "SPM unavailable. The feature is implicitly disabled. To enable it, use --spm-beta-enabled option" + ) + + update_env("ROCPROFILER_SPM_BETA_ENABLED", True, overwrite=True) + update_env("ROCPROF_SPM_COUNTER_COLLECTION", True, overwrite=True) + + if ( + args.pmc + or args.pc_sampling_beta_enabled + or os.environ.get("ROCPROFILER_PC_SAMPLING_BETA_ENABLED", None) is not None + ): + fatal_error( + "SPM feature cannot be enabled along with pc sampling or pmc counter collection" + ) + + if args.spm is None: + fatal_error("Please input list of counters to be sampled") + + update_env( + "ROCPROF_SPM_COUNTERS", + "spm: {}".format(" ".join(args.spm)), + overwrite=True, + ) + + if args.spm_buffer_size: + update_env( + "ROCPROF_SPM_BUFFER_SIZE", int_auto(args.spm_buffer_size), overwrite=True + ) + + if args.spm_timeout: + update_env("ROCPROF_SPM_TIMEOUT_MS", args.spm_timeout, overwrite=True) + + if args.spm_frequency: + update_env("ROCPROF_SPM_FREQUENCY", float(args.spm_frequency), overwrite=True) + if args.disable_signal_handlers is not None: update_env("ROCPROF_SIGNAL_HANDLERS", not args.disable_signal_handlers) @@ -1751,19 +1845,6 @@ def log_config(_env): if args.advanced_thread_trace: - def int_auto(num_str): - if isinstance(num_str, str): - if "0x" in num_str: - return int(num_str, 16) - else: - return int(num_str, 10) - elif isinstance(num_str, int): - return num_str - else: - raise ValueError( - f"{type(num_str)} is not supported. {num_str} should be of type integer or string." - ) - update_env("ROCPROF_ADVANCED_THREAD_TRACE", True, overwrite=True) if args.att_target_cu is not None: diff --git a/projects/rocprofiler-sdk/source/docs/api-reference/spm.rst b/projects/rocprofiler-sdk/source/docs/api-reference/spm.rst new file mode 100644 index 00000000000..4d3506b96f1 --- /dev/null +++ b/projects/rocprofiler-sdk/source/docs/api-reference/spm.rst @@ -0,0 +1,229 @@ +.. meta:: + :description: ROCprofiler-SDK is a tooling infrastructure for profiling general-purpose GPU compute applications running on the ROCm software + :keywords: ROCprofiler-SDK API reference, Streaming Performance Monitor, SPM + +.. _SPM: + +ROCprofiler-SDK Streaming Performance Monitor method +=================================== + +Streaming Performance Monitor (SPM) sampling is a profiling method that samples hardware performance counters at regular intervals. This provides a granual insight into behavior of kernel during its execution. +.. warning:: + + Risk acknowledgment: The SPM feature is under development and might not be completely stable. Use this beta feature cautiously. It may affect your system's stability and performance. Proceed at your own risk. + + By activating this feature through ``ROCPROFILER_SPM_BETA_ENABLED`` environment variable, you acknowledge and accept the following potential risks: + + - System reboot: This beta feature could cause your hardware to restart unexpectedly. + +ROCprofiler-SDK SPM service +------------------------------------ + +This section describes how to use ROCProfiler-SDK SPM API to configure and use SPM service. For fully functional examples, see `Samples `_. + +Currently, **Dispatch counting** is supported for SPM. Please refer to :ref:`counter collection services` for information on dispatch counting, counters and profile configuration. + +The set up for SPM service is similar to counter collection services. + +SPM counter service cannot be enabled together with PMC or PC sampling service. + +kernels are serialized in dispatch counting SPM service. + +tool_init() setup +++++++++++++++++++ + +Here are the steps to set up ``tool_init()``: + +.. code-block:: cpp + + auto ctx = rocprofiler_context_id_t{0}; + auto buff = rocprofiler_buffer_id_t{}; + ROCPROFILER_CALL(rocprofiler_create_context(&ctx), "context creation failed"); + ROCPROFILER_CALL(rocprofiler_create_buffer(ctx, + 8192, + 8192, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + spm_sampling_callback, // Callback to process PC samples + user_data, + &buff), + "buffer creation failed"); + +For more details on buffer creation, see :ref:`buffered-services`. + +.. code-block:: cpp + + /* For Dispatch Counting */ + // Setup the dispatch profile counting service. This service will trigger the dispatch_callback + // when a kernel dispatch is enqueued into the HSA queue. The callback will specify what + // counters to collect by returning a profile config id. + ROCPROFILER_CALL(rocprofiler_configure_buffer_spm_dispatch_service( + ctx, buff, spm_dispatch_callback, nullptr), + "Could not setup buffered service"); + + + + +.. code-block:: cpp + + std::vector agents; + + // Callback used by rocprofiler_query_available_agents to return + // agents on the device. This can include CPU agents as well. + // Select GPU agents only (type == ROCPROFILER_AGENT_TYPE_GPU) + rocprofiler_query_available_agents_cb_t iterate_cb = [](rocprofiler_agent_version_t agents_ver, + const void** agents_arr, + size_t num_agents, + void* udata) { + if(agents_ver != ROCPROFILER_AGENT_INFO_VERSION_0) + throw std::runtime_error{"unexpected rocprofiler agent version"}; + auto* agents_v = static_cast*>(udata); + for(size_t i = 0; i < num_agents; ++i) + { + const auto* agent = static_cast(agents_arr[i]); + if(agent->type == ROCPROFILER_AGENT_TYPE_GPU) agents_v->emplace_back(*agent); + } + return ROCPROFILER_STATUS_SUCCESS; + }; + + // Query the agents. Only a single callback is made that contains a vector + // of all agents. + ROCPROFILER_CALL( + rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0, + iterate_cb, + sizeof(rocprofiler_agent_t), + const_cast(static_cast(&agents))), + "query available agents"); + +Profile Setup +------------- + +1. The first step in constructing a SPM counter collection profile is to find the GPU agents on the machine. + +.. code-block:: cpp + + std::vector agents; + + // Callback used by rocprofiler_query_available_agents to return + // agents on the device. This can include CPU agents as well. We + // select GPU agents only (i.e. type == ROCPROFILER_AGENT_TYPE_GPU) + rocprofiler_query_available_agents_cb_t iterate_cb = [](rocprofiler_agent_version_t agents_ver, + const void** agents_arr, + size_t num_agents, + void* udata) { + if(agents_ver != ROCPROFILER_AGENT_INFO_VERSION_0) + throw std::runtime_error{"unexpected rocprofiler agent version"}; + auto* agents_v = static_cast*>(udata); + for(size_t i = 0; i < num_agents; ++i) + { + const auto* agent = static_cast(agents_arr[i]); + if(agent->type == ROCPROFILER_AGENT_TYPE_GPU) agents_v->emplace_back(*agent); + } + return ROCPROFILER_STATUS_SUCCESS; + }; + + // Query the agents, only a single callback is made that contains a vector + // of all agents. + ROCPROFILER_CALL( + rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0, + iterate_cb, + sizeof(rocprofiler_agent_t), + const_cast(static_cast(&agents))), + "query available agents"); + +2. To identify the counters supported by an agent, query the available counters with ``rocprofiler_iterate_agent_spm_supported_counters``. Here is an example of a single agent returning the available counters in ``gpu_counters``: + +.. code-block:: cpp + + std::vector gpu_counters; + + // Iterate all the counters on the agent and store them in gpu_counters. + ROCPROFILER_CALL(rocprofiler_iterate_agent_spm_supported_counters( + agent, + [](rocprofiler_agent_id_t, + rocprofiler_counter_id_t* counters, + size_t num_counters, + void* user_data) { + std::vector* vec = + static_cast*>(user_data); + for(size_t i = 0; i < num_counters; i++) + { + vec->push_back(counters[i]); + } + return ROCPROFILER_STATUS_SUCCESS; + }, + static_cast(&gpu_counters)), + "Could not fetch supported counters"); + +3. After identifying the counters to be sampled, construct a profile by passing a list of these counters and input parameters to ``rocprofiler_spm_create_counter_config``. + +.. code-block:: cpp + + rocprofiler_counter_config_id_t profile = {.handle = 0}; + auto params = rocprofiler_spm_configuration_t{}; + params.frequency = 1.0; + params.buffer_size = 32768; + params.timeout = 15; + + // Create and return the profile + rocprofiler_counter_config_id_t profile; + ROCPROFILER_CALL(rocprofiler_spm_create_counter_config( + agent, counters_array, counters_array_count, ¶ms, &profile), + "Could not construct profile cfg"); + +Dispatch Counting Callback +-------------------------- + +When a kernel is dispatched, a dispatch callback is issued to the tool for supplying a profile. + +.. code-block:: cpp + + void + spm_dispatch_callback(rocprofiler_spm_dispatch_counting_service_data_t dispatch_data, + rocprofiler_counter_config_id_t* config, + rocprofiler_user_data_t* user_data, + void* /*callback_data_args*/) + +``dispatch_data`` contains information about the dispatch being launched such as its name. ``config`` is used by the tool to specify the profile, which allows counter sampling for the dispatch. If no profile is supplied, no counters are collected for this dispatch. ``user_data`` contains user data supplied to ``rocprofiler_configure_buffer_spm_dispatch_service``. + + +Processing SPM samples +---------------------- + +SPM buffered dispatch service asynchronously delivers samples via a dedicated callback ``(spm_sampling_callback)``. The following code snippet outlines the process of iterating over samples. + +.. code-block:: cpp + + void + spm_sampling_callback(rocprofiler_context_id_t ctx, + rocprofiler_buffer_id_t buff, + rocprofiler_record_header_t** headers, + size_t num_headers, + void* data, + uint64_t drop_count) + { + for(size_t i = 0; i < num_headers; i++) + { + auto* cur_header = headers[i]; + + if(cur_header->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS) + { + if(cur_header->kind == ROCPROFILER_COUNTER_RECORD_PROFILE_COUNTING_DISPATCH_HEADE) + { + //Process the header + auto* header = + static_cast(header->payload); + } + else if(cur_header->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS) + { + if(cur_header->kind == ROCPROFILER_COUNTER_RECORD_VALUE) + //process a sample + auto* record = static_cast(header->payload); + } + } + } + } + +For more information on the data comprising a single sample, see `spm.h `_. + +.. note:: + A user can synchronously flush buffers via ``rocprofiler_buffer_flush`` that triggers ``spm_sampling_callback``. diff --git a/projects/rocprofiler-sdk/source/docs/data/rocpd-to-csv-spm.csv b/projects/rocprofiler-sdk/source/docs/data/rocpd-to-csv-spm.csv new file mode 100644 index 00000000000..505430bc388 --- /dev/null +++ b/projects/rocprofiler-sdk/source/docs/data/rocpd-to-csv-spm.csv @@ -0,0 +1,5 @@ +"Guid","Correlation_Id","Dispatch_Id","Agent_Id","Queue_Id","Process_Id","Thread_Id","Grid_Size","Kernel_Id","Kernel_Name","Workgroup_Size","Lds_Block_Size","Scratch_Size","Vgpr_Count","Accum_Vgpr_Count","Sgpr_Count","Counter_Name","Counter_Value","Timestamp","Start_Timestamp","End_Timestamp" +"00000b06-12c3-72c3-990d-a087f50db010",1,1,"Agent 2",1,3675,3675,1048576,11,"void addition_kernel(float*, float const*, float const*, int, int)",64,0,0,8,0,16,"SQ_WAVES",0.0,18480887641576,0,0 +"00000b06-12c3-72c3-990d-a087f50db010",1,1,"Agent 2",1,3675,3675,1048576,11,"void addition_kernel(float*, float const*, float const*, int, int)",64,0,0,8,0,16,"SQ_WAVES",0.0,18480887641578,0,0 +"00000b06-12c3-72c3-990d-a087f50db010",1,1,"Agent 2",1,3675,3675,1048576,11,"void addition_kernel(float*, float const*, float const*, int, int)",64,0,0,8,0,16,"SQ_WAVES",0.0,18480887641586,0,0 +"00000b06-12c3-72c3-990d-a087f50db010",1,1,"Agent 2",1,3675,3675,1048576,11,"void addition_kernel(float*, float const*, float const*, int, int)",64,0,0,8,0,16,"SQ_WAVES",0.0,18480887641587,0,0 diff --git a/projects/rocprofiler-sdk/source/docs/how-to/using-spm.rst b/projects/rocprofiler-sdk/source/docs/how-to/using-spm.rst new file mode 100644 index 00000000000..c7cd71dfa24 --- /dev/null +++ b/projects/rocprofiler-sdk/source/docs/how-to/using-spm.rst @@ -0,0 +1,320 @@ +.. meta:: + :description: Documentation of the usage of streaming performance monitor(SPM) with rocprofv3 command-line tool + :keywords: Sampling counters, streaming performance monitors, rocprofv3, rocprofv3 tool usage, Using rocprofv3, ROCprofiler-SDK command line tool, SPM + +.. _using-spm: + +================== +Using SPM +================== + +SPM (Streaming Performance Monitor) sampling service for GPU profiling is a profiling technique to periodically sample performance counters with GPU timestamp. + +Here are the benefits of using SPM to sample counters: + +- Identify performance bottlenecks +- Understand kernel execution behavior +- fine-grained, time-resolved performance data. + +To try out the SPM, you can use the command-line tool ``rocprofv3`` or the ROCprofiler-SDK library. + +SPM availability and configuration +=========================================== + +To check counters that can be sampled, use: + +.. code-block:: bash + + rocprofv3 -L + +Or + +.. code-block:: bash + + rocprofv3 --list-avail + +The output lists if ``rocprofv3`` supports SPM + +.. code-block:: bash + + Counter_Name : TCC_MISS + Description : Number of cache misses. UC reads count as misses. + Block : TCC + SPM : Supported + Dimensions : DIMENSION_INSTANCE[0:15] DIMENSION_XCC[0:7] + +The preceding output shows that the TCC_MISS counter can be sampled. + +Use the following command to use SPM: + +.. code-block:: bash + + rocprofv3 --spm-beta-enabled --spm SQ_WAVES --spm-timeout 10 --spm-frequency 0.5 --spm-buffer-size 32768 -- + +The preceding command enables SPM for SQ_WAVES and a timeout of 10ms with a buffer size of 32768KB and spm frequency of 0.5GHZ. Replace ```` with the path to the application you want to profile. +This generates results.db file prefixed with the process ID. + +.. code-block:: bash + + rocpd convert -i results.db --output-format csv + +The preceeding command generates the CSV file with the following output fields. + +"Guid","Correlation_Id","Dispatch_Id","Agent_Id","Queue_Id","Process_Id","Thread_Id","Grid_Size","Kernel_Id","Kernel_Name","Workgroup_Size","Lds_Block_Size","Scratch_Size","Vgpr_Count","Accum_Vgpr_Count","Sgpr_Count","Counter_Name","Counter_Value","Timestamp","Start_Timestamp","End_Timestamp" +"0000065f-d21a-721a-b048-a3f39c58ee71",1,1,"Agent 2",1,9082,9082,1048576,13,"void addition_kernel(float*, float const*, float const*, int, int)",64,0,0,8,0,16,"SQ_WAVES",0.0,10691960212168,0,0 + +Along with kernel dispatch information following fields are generated by SPM + + - ``Timestamp``: Timestamp when sample is generated + - ``Counter_Value``: sampled counter value + - ``Counter_Name`` : name of the counter sampled + +Input parameters +=================== + +Here are the input parameters used to configure SPM + + - ``Metrics List``: The list of counters that you want to sample with SPM. + - ``Timeout in ms``: The buffer that collects the sample is flushed when it is either full or when timeout is exceeded. Default set to 0 ms. + - ``SPM Frequency in GHz``: This is the frequency at which counters are sampled (the frequency is approximately converted to number of shader clocks internally). Default set to 0.5 GHz. + - ``Buffer Size in KB``: size of the buffer that collects the samples. Default set to 32768 KB. + +.. code-block:: bash + + rocprofv3 --spm-beta-enabled --spm SQ_WAVES --spm-timeout 10 --spm-frequency 0.2 --spm-buffer-size 32768 --output-format json -- + +The preceding command generates a JSON file with the comprehensive output. + +.. code-block:: text + + +"SPM": { + "type": "array", + "description": "SPM Counter collection records.", + "items": { + "type": "object", + "properties": { + "dispatch_data": { + "type": "object", + "description": "Dispatch data details.", + "properties": { + "size": { + "type": "integer", + "description": "Size of the dispatch data." + }, + "correlation_id": { + "type": "object", + "description": "Correlation ID information.", + "properties": { + "internal": { + "type": "integer", + "description": "Internal correlation ID." + }, + "external": { + "type": "integer", + "description": "External correlation ID." + } + }, + "required": [ + "internal", + "external" + ] + }, + "dispatch_info": { + "type": "object", + "description": "Dispatch information details.", + "properties": { + "size": { + "type": "integer", + "description": "Size of the dispatch information." + }, + "agent_id": { + "type": "object", + "description": "Agent ID information.", + "properties": { + "handle": { + "type": "integer", + "description": "Handle of the agent." + } + }, + "required": [ + "handle" + ] + }, + "queue_id": { + "type": "object", + "description": "Queue ID information.", + "properties": { + "handle": { + "type": "integer", + "description": "Handle of the queue." + } + }, + "required": [ + "handle" + ] + }, + "kernel_id": { + "type": "integer", + "description": "ID of the kernel." + }, + "dispatch_id": { + "type": "integer", + "description": "ID of the dispatch." + }, + "private_segment_size": { + "type": "integer", + "description": "Size of the private segment." + }, + "group_segment_size": { + "type": "integer", + "description": "Size of the group segment." + }, + "workgroup_size": { + "type": "object", + "description": "Workgroup size information.", + "properties": { + "x": { + "type": "integer", + "description": "X dimension." + }, + "y": { + "type": "integer", + "description": "Y dimension." + }, + "z": { + "type": "integer", + "description": "Z dimension." + } + }, + "required": [ + "x", + "y", + "z" + ] + }, + "grid_size": { + "type": "object", + "description": "Grid size information.", + "properties": { + "x": { + "type": "integer", + "description": "X dimension." + }, + "y": { + "type": "integer", + "description": "Y dimension." + }, + "z": { + "type": "integer", + "description": "Z dimension." + } + }, + "required": [ + "x", + "y", + "z" + ] + } + }, + "required": [ + "size", + "agent_id", + "queue_id", + "kernel_id", + "dispatch_id", + "private_segment_size", + "group_segment_size", + "workgroup_size", + "grid_size" + ] + } + }, + "required": [ + "size", + "correlation_id", + "dispatch_info" + ] + }, + "records": { + "type": "array", + "description": "Records.", + "items": { + "type": "object", + "properties": { + "counter_id": { + "type": "object", + "description": "Counter ID information.", + "properties": { + "handle": { + "type": "integer", + "description": "Handle of the counter." + } + }, + "required": [ + "handle" + ] + }, + "value": { + "type": "number", + "description": "Value of the counter." + }, + "instance_id": { + "type": "object", + "description": "Counter ID information.", + "properties": { + "handle": { + "type": "integer", + "description": "Handle of the counter instance." + } + }, + "required": [ + "handle" + ] + }, + "timestamp": { + "type": "object", + "description": "Timestamp.", + "properties": { + "handle": { + "type": "integer", + "description": "timestamp of the sample" + } + } + } + }, + "required": [ + "counter_id", + "value", + "instance_id", + "timestamp" + ] + } + }, + "thread_id": { + "type": "integer", + "description": "Thread ID." + }, + "arch_vgpr_count": { + "type": "integer", + "description": "Count of VGPRs." + }, + "sgpr_count": { + "type": "integer", + "description": "Count of SGPRs." + }, + "lds_block_size_v": { + "type": "integer", + "description": "Size of LDS block." + } + }, + "required": [ + "dispatch_data", + "records", + "thread_id", + "arch_vgpr_count", + "sgpr_count", + "lds_block_size_v" + ] + } + } diff --git a/projects/rocprofiler-sdk/source/docs/rocprofv3-schema.json b/projects/rocprofiler-sdk/source/docs/rocprofv3-schema.json index 918ca0a53b1..d9f37a7b2ce 100644 --- a/projects/rocprofiler-sdk/source/docs/rocprofv3-schema.json +++ b/projects/rocprofiler-sdk/source/docs/rocprofv3-schema.json @@ -1035,6 +1035,236 @@ ] } }, + "SPM": { + "type": "array", + "description": "SPM Counter collection records.", + "items": { + "type": "object", + "properties": { + "dispatch_data": { + "type": "object", + "description": "Dispatch data details.", + "properties": { + "size": { + "type": "integer", + "description": "Size of the dispatch data." + }, + "correlation_id": { + "type": "object", + "description": "Correlation ID information.", + "properties": { + "internal": { + "type": "integer", + "description": "Internal correlation ID." + }, + "external": { + "type": "integer", + "description": "External correlation ID." + } + }, + "required": [ + "internal", + "external" + ] + }, + "dispatch_info": { + "type": "object", + "description": "Dispatch information details.", + "properties": { + "size": { + "type": "integer", + "description": "Size of the dispatch information." + }, + "agent_id": { + "type": "object", + "description": "Agent ID information.", + "properties": { + "handle": { + "type": "integer", + "description": "Handle of the agent." + } + }, + "required": [ + "handle" + ] + }, + "queue_id": { + "type": "object", + "description": "Queue ID information.", + "properties": { + "handle": { + "type": "integer", + "description": "Handle of the queue." + } + }, + "required": [ + "handle" + ] + }, + "kernel_id": { + "type": "integer", + "description": "ID of the kernel." + }, + "dispatch_id": { + "type": "integer", + "description": "ID of the dispatch." + }, + "private_segment_size": { + "type": "integer", + "description": "Size of the private segment." + }, + "group_segment_size": { + "type": "integer", + "description": "Size of the group segment." + }, + "workgroup_size": { + "type": "object", + "description": "Workgroup size information.", + "properties": { + "x": { + "type": "integer", + "description": "X dimension." + }, + "y": { + "type": "integer", + "description": "Y dimension." + }, + "z": { + "type": "integer", + "description": "Z dimension." + } + }, + "required": [ + "x", + "y", + "z" + ] + }, + "grid_size": { + "type": "object", + "description": "Grid size information.", + "properties": { + "x": { + "type": "integer", + "description": "X dimension." + }, + "y": { + "type": "integer", + "description": "Y dimension." + }, + "z": { + "type": "integer", + "description": "Z dimension." + } + }, + "required": [ + "x", + "y", + "z" + ] + } + }, + "required": [ + "size", + "agent_id", + "queue_id", + "kernel_id", + "dispatch_id", + "private_segment_size", + "group_segment_size", + "workgroup_size", + "grid_size" + ] + } + }, + "required": [ + "size", + "correlation_id", + "dispatch_info" + ] + }, + "records": { + "type": "array", + "description": "Records.", + "items": { + "type": "object", + "properties": { + "counter_id": { + "type": "object", + "description": "Counter ID information.", + "properties": { + "handle": { + "type": "integer", + "description": "Handle of the counter." + } + }, + "required": [ + "handle" + ] + }, + "value": { + "type": "number", + "description": "Value of the counter." + }, + "instance_id": { + "type": "object", + "description": "Counter ID information.", + "properties": { + "handle": { + "type": "integer", + "description": "Handle of the counter instance." + } + }, + "required": [ + "handle" + ] + }, + "timestamp": { + "type": "object", + "description": "Timestamp.", + "properties": { + "handle": { + "type": "integer", + "description": "timestamp of the sample" + } + } + } + }, + "required": [ + "counter_id", + "value", + "instance_id", + "timestamp" + ] + } + }, + "thread_id": { + "type": "integer", + "description": "Thread ID." + }, + "arch_vgpr_count": { + "type": "integer", + "description": "Count of VGPRs." + }, + "sgpr_count": { + "type": "integer", + "description": "Count of SGPRs." + }, + "lds_block_size_v": { + "type": "integer", + "description": "Size of LDS block." + } + }, + "required": [ + "dispatch_data", + "records", + "thread_id", + "arch_vgpr_count", + "sgpr_count", + "lds_block_size_v" + ] + } + }, "pc_sample_host_trap": { "type": "array", "description": "Host Trap PC Sampling records.", diff --git a/projects/rocprofiler-sdk/source/docs/rocprofv3_input_schema.json b/projects/rocprofiler-sdk/source/docs/rocprofv3_input_schema.json index e507bf5e090..31c1ef1469f 100644 --- a/projects/rocprofiler-sdk/source/docs/rocprofv3_input_schema.json +++ b/projects/rocprofiler-sdk/source/docs/rocprofv3_input_schema.json @@ -22,7 +22,7 @@ }, "pmc_group_interval": { "type" : "integer", - "description": "Number of kernel launches between selecting the next group of counters to collect" + "description": "Number of kernel launches between selecting the next group of counters to collect" }, "kernel_include_regex":{ @@ -193,6 +193,26 @@ "pc-sampling-beta-enabled": { "type": "boolean", "description": "enable pc sampling support; beta version" + }, + "spm_beta_enabled": { + "type": "boolean", + "description": "enable streaming performance monitors; beta version" + }, + "spm_frequency": { + "type" : "string", + "description": "This is the frequency at which counters are sampled (the frequency is approximately converted to number of shader clocks internally)" + }, + "spm_timeout": { + "type" : "integer", + "description": "size of the buffer that collects the samples" + }, + "spm_buffer_size": { + "type" : "string", + "description": "size of the buffer that collects the samples" + }, + "spm": { + "type" : "array", + "description": "list of SPM counters to collect" } } } diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/counters.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/counters.h index 9562ed8e077..04572194f08 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/counters.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/counters.h @@ -122,6 +122,7 @@ typedef struct ROCPROFILER_SDK_EXPERIMENTAL rocprofiler_counter_info_v1_t const char* expression; ///< Counter expression (derived counters only) uint8_t is_constant : 1; ///< If this counter is HW constant uint8_t is_derived : 1; ///< If this counter is a derived counter + uint8_t spm_support : 1; ///< If this counter supports SPM uint64_t dimensions_count; const rocprofiler_counter_record_dimension_info_t** dimensions; diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/enum_string.hpp b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/enum_string.hpp index 3a3aa72f26a..3d052d72784 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/enum_string.hpp +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/enum_string.hpp @@ -1588,7 +1588,7 @@ ROCPROFILER_ENUM_LABEL(ROCPROFILER_STATUS_ERROR_AST_NOT_FOUND); ROCPROFILER_ENUM_LABEL(ROCPROFILER_STATUS_ERROR_AQL_NO_EVENT_COORD); ROCPROFILER_ENUM_LABEL(ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL); ROCPROFILER_ENUM_LABEL(ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES); -ROCPROFILER_ENUM_LABEL(ROCPROFILER_STATUS_ERROR_PROFILE_NOT_FOUND); +ROCPROFILER_ENUM_LABEL(ROCPROFILER_STATUS_ERROR_CONFIG_NOT_FOUND); ROCPROFILER_ENUM_LABEL(ROCPROFILER_STATUS_ERROR_AGENT_DISPATCH_CONFLICT); ROCPROFILER_ENUM_LABEL(ROCPROFILER_STATUS_INTERNAL_NO_AGENT_CONTEXT); ROCPROFILER_ENUM_LABEL(ROCPROFILER_STATUS_ERROR_SAMPLE_RATE_EXCEEDED); diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization/save.hpp b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization/save.hpp index 2410e69b038..325d84d60cf 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization/save.hpp +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/serialization/save.hpp @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -524,6 +525,15 @@ save(ArchiveT& ar, rocprofiler_dispatch_counting_service_data_t data) ROCP_SDK_SAVE_DATA_FIELD(dispatch_info); } +template +void +save(ArchiveT& ar, rocprofiler_spm_dispatch_counting_service_data_t data) +{ + ROCP_SDK_SAVE_DATA_FIELD(size); + ROCP_SDK_SAVE_DATA_FIELD(correlation_id); + ROCP_SDK_SAVE_DATA_FIELD(dispatch_info); +} + template void save(ArchiveT& ar, rocprofiler_dispatch_counting_service_record_t data) @@ -577,6 +587,17 @@ save(ArchiveT& ar, rocprofiler_counter_record_t data) ROCP_SDK_SAVE_DATA_FIELD(dispatch_id); } +template +void +save(ArchiveT& ar, rocprofiler_spm_counter_record_t data) +{ + ROCP_SDK_SAVE_DATA_FIELD(dispatch_id); + ROCP_SDK_SAVE_DATA_FIELD(id); + ROCP_SDK_SAVE_DATA_FIELD(agent_id); + ROCP_SDK_SAVE_DATA_FIELD(timestamp); + ROCP_SDK_SAVE_DATA_FIELD(value); +} + template void save(ArchiveT& ar, rocprofiler_buffer_tracing_hip_api_record_t data) diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/experimental/CMakeLists.txt b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/experimental/CMakeLists.txt index 4c7e382461c..d45d67bcd6a 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/experimental/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/experimental/CMakeLists.txt @@ -2,7 +2,7 @@ # Experimental components of the ROCProfiler SDK API. # -set(ROCPROFILER_EXPERIMENTAL_HEADER_FILES counters.h registration.h thread_trace.h) +set(ROCPROFILER_EXPERIMENTAL_HEADER_FILES counters.h registration.h thread_trace.h spm.h) install( FILES ${ROCPROFILER_EXPERIMENTAL_HEADER_FILES} diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/experimental/spm.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/experimental/spm.h new file mode 100644 index 00000000000..5b83a25944f --- /dev/null +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/experimental/spm.h @@ -0,0 +1,269 @@ +// MIT License +// +// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include +#include +#include +#include + +ROCPROFILER_EXTERN_C_INIT + +/** + * @brief (experimental) SPM parameter type and value. + * + **/ +typedef struct ROCPROFILER_SDK_EXPERIMENTAL rocprofiler_spm_configuration_t +{ + size_t size; ///< Size of this struct + double frequency; ///< Input frequency (in GHz) is estimated to number of scclock count. Used + ///< to determine sample interval. + uint64_t buffer_size; ///< Buffer size of user mode buffer in KB + uint64_t timeout; ///< Timeout for the user mode buffer in ms +} rocprofiler_spm_configuration_t; + +/** + * @brief (experimental) Create SPM Counter Configuration. A config is bound to an agent but can + * be used across many contexts. The config has a fixed set of counters + * that are collected (and specified by counter_list) and parameters. The available + * counters for an agent can be queried using + * ::rocprofiler_iterate_spm_supported_counters. An existing config + * may be supplied via config_id to use as a base for the new config. + * All counters and parameters in the existing config will be copied over to the new + * config. The existing config will remain unmodified and usable with + * the new config id being returned in config_id. + * + * @param [in] agent_id Agent identifier + * @param [in] counters_list List of GPU counters + * @param [in] counters_count Size of counters list + * @param [in] parameters SPM parameter configuration + * @param [in,out] config_id Identifier for GPU SPM counters group. If an existing + config is supplied, that profiles counters and parameters will be copied + over to a new config (returned via this id) + * @return ::rocprofiler_status_t + * @retval ROCPROFILER_STATUS_SUCCESS if config created + * @retval ROCPROFILER_STATUS_ERROR if config could not be created + * @retval ROCPROFILER_STATUS_ERROR_METRIC_NOT_VALID_FOR_AGENT if agent does not support an input + counter + * @retval ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT if counters count is zero and no existing + config is supplied + * @retval ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_ABI incompatible aqlprofile version is used + * @retval ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED if the ROCPROFILER_SPM_BETA_ENABLED is not set + * @retval ROCPROFILER_STATUS_ERROR_EXCEEDS_HW_LIMIT if input counters exceed the hardware limit + * @retval ROCPROFILER_STATUS_ERROR_AGENT_NOT_FOUND if agent not found + * @retval ROCPROFILER_STATUS_ERROR_COUNTER_NOT_FOUND if an input counter is not found in metrics + file + */ +ROCPROFILER_SDK_EXPERIMENTAL +rocprofiler_status_t +rocprofiler_spm_create_counter_config(rocprofiler_agent_id_t agent_id, + rocprofiler_counter_id_t* counters_list, + size_t counters_count, + rocprofiler_spm_configuration_t* parameters, + rocprofiler_counter_config_id_t* config_id) + ROCPROFILER_API ROCPROFILER_NONNULL(2); + +/** + * @brief (experimental) Destroy SPM Profile Configuration. + * + * @param [in] config_id + * @return ::rocprofiler_status_t + * @retval ROCPROFILER_STATUS_SUCCESS if config destroyed + * @retval ROCPROFILER_STATUS_ERROR_PROFILE_NOT_FOUND if the profile is not found + * @retval ROCPROFILER_STATUS_ERROR if config could not be destroyed + */ +ROCPROFILER_SDK_EXPERIMENTAL +rocprofiler_status_t +rocprofiler_spm_destroy_counter_config(rocprofiler_counter_config_id_t config_id) + ROCPROFILER_API; + +/** + * @brief (experimental) SPM record flags. + * + **/ +typedef enum ROCPROFILER_SDK_EXPERIMENTAL rocprofiler_spm_record_flag_t +{ + ROCPROFILER_SPM_RECORD_FLAG_DATA_NONE = 0, ///< records with data loss + ROCPROFILER_SPM_RECORD_FLAG_DATA, ///< records with data + ROCPROFILER_SPM_RECORD_FLAG_END, ///< End of agent service + ROCPROFILER_SPM_RECORD_FLAG_DATA_LOST, ///< flag value none + ROCPROFILER_SPM_RECORD_FLAG_LAST, +} rocprofiler_spm_record_flag_t; + +/** + * @brief (experimental) Kernel dispatch data for profile counting callbacks. + * + */ +typedef struct ROCPROFILER_SDK_EXPERIMENTAL rocprofiler_spm_dispatch_counting_service_data_t +{ + uint64_t size; ///< Size of this struct + rocprofiler_async_correlation_id_t correlation_id; ///< Correlation ID for this dispatch + rocprofiler_kernel_dispatch_info_t dispatch_info; ///< Dispatch info +} rocprofiler_spm_dispatch_counting_service_data_t; + +typedef struct ROCPROFILER_SDK_EXPERIMENTAL rocprofiler_spm_counter_record_t +{ + uint64_t size; ///< Size of this structure. Used for versioning and validation. + rocprofiler_dispatch_id_t + dispatch_id; ///< dispatch id used to determine the dispatch this record belongs to. + rocprofiler_counter_instance_id_t id; ///< Counter instance id + rocprofiler_agent_id_t agent_id; ///< Agent on which the record is collected + rocprofiler_timestamp_t timestamp; ///< timestamp of the sample + uint64_t value; ///< SPM sample for the counter with counter instance id: id + /// @var id + /// @brief counter id, ROCPROFILER_DIMENSION_XCC, + /// ROCPROFILER_DIMENSION_INSTANCE, ROCPROFILER_DIMENSION_SHADER_ENGINE embedded in it + /// @var agent id + /// @brief identifies the agent on which SPM data was collected + /// @var timestamp + /// @brief GPU timestamp when sample was collected + /// @var value + /// @brief sampled counter value + /// @var dispatch_id + /// @brief A value greater than zero indicates that this counter record is associated with a + /// specific dispatch. + /// + /// This value can be mapped to a dispatch via the `dispatch_info` field (@see + /// ::rocprofiler_kernel_dispatch_info_t) of a ::rocprofiler_spm_dispatch_counting_service_data_t + /// ::rocprofiler_spm_dispatch_counting_service_record_t records (which will be insert into the + /// buffer prior to the associated ::rocprofiler_spm_counter_record_t records). +} rocprofiler_spm_counter_record_t; + +/** + * @brief (experimental) Counting record callback. This is a callback is invoked when the kernel + * execution is complete and contains the counter profile data requested in + * ::rocprofiler_spm_dispatch_counting_service_cb_t. Only used with + * ::rocprofiler_configure_callback_spm_dispatch_service + * + * @param [in] dispatch_data kernel dispatch data + * @param [in] records array of pointers to the rocprofiler_spm_counter_record_t. + Memory of records is managed by the SDK. It is valid only within this callback + * @param [in] record_count size of the record array + * @param [in] flags rocprofiler_spm_record_flag_t + * @param [in] userdata user data supplied by dispatch callback + * @param [in] record Callback data supplied via dispatch configure service + + */ +ROCPROFILER_SDK_EXPERIMENTAL +typedef void (*rocprofiler_spm_dispatch_counting_record_cb_t)( + const rocprofiler_spm_dispatch_counting_service_data_t* dispatch_data, + const rocprofiler_spm_counter_record_t** records, + size_t record_count, + int flags, + rocprofiler_user_data_t userdata, + void* record_callback_args); +/** + * @brief (experimental) Kernel Dispatch Callback. This is a callback that is invoked before the + * kernel is enqueued into the HSA queue. What counters to collect for a kernel are set via passing + * back a profile config (config) in this callback. These counters will be collected and emplaced in + * the buffer with ::rocprofiler_buffer_id_t used when setting up this callback or will be returned via + * a callback used when setting up this callback + * + * @param [in] dispatch_data kernel dispatch data + * @param [out] config spm counter config + * @param [out] user_data User data unique to this dispatch. Returned in record callback + * @param [in] callback_data_args Callback supplied via dispatch configure service + */ +ROCPROFILER_SDK_EXPERIMENTAL +typedef void (*rocprofiler_spm_dispatch_counting_service_cb_t)( + const rocprofiler_spm_dispatch_counting_service_data_t* dispatch_data, + rocprofiler_counter_config_id_t* config, + rocprofiler_user_data_t* user_data, + void* callback_data_args); + +/** + * @brief (experimental) Query Agent SPM Counters Availability. + * + * @param [in] agent_id GPU agent identifier + * @param [in] cb callback to caller to get counters + * @param [in] user_data data to pass into the callback + * @return ::rocprofiler_status_t + * @retval ROCPROFILER_STATUS_SUCCESS if all counters found for agent + * @retval ROCPROFILER_STATUS_ERROR_AGENT_NOT_FOUND invalid agent + * @retval ROCPROFILER_STATUS_ERROR_AGENT_ARCH_NOT_SUPPORTED agent has no supported SPM counter + */ +ROCPROFILER_SDK_EXPERIMENTAL rocprofiler_status_t +rocprofiler_iterate_spm_supported_counters(rocprofiler_agent_id_t agent_id, + rocprofiler_available_counters_cb_t cb, + void* user_data) ROCPROFILER_API ROCPROFILER_NONNULL(2); + +/** + * @brief (experimental) Configure callback dispatch profile Counting Service. + * Collects the counters in dispatch packets and calls a callback + * with the counters collected during that dispatch. + * + * @param [in] context_id context id + * @param [in] dispatch_callback callback to perform when dispatch is enqueued + * @param [in] dispatch_callback_args callback data for dispatch callback + * @param [in] record_callback Record callback for completed profile data + * @param [in] record_callback_args Callback args for record callback + * @return ::rocprofiler_status_t + * + * @return ::rocprofiler_status_t + * @retval ROCPROFILER_STATUS_SUCCESS if the context can be configured for SPM dispatch service + * @retval ROCPROFILER_STATUS_ERROR if the context cannot be configured for SPM dispatch service + * @retval ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED if the ROCPROFILER_SPM_BETA_ENABLED is not set + * @retval ROCPROFILER_STATUS_ERROR_CONFIGURATION_LOCKED for configuration locked + * @retval ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_ABI incompatible aqlprofile version is used + * @retval ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID invalid input context has not already been + * created + * @retval ROCPROFILER_STATUS_ERROR_CONTEXT_CONFLICT conflicting services being enabled in the + * context + */ + +ROCPROFILER_SDK_EXPERIMENTAL rocprofiler_status_t +rocprofiler_configure_callback_spm_dispatch_service( + rocprofiler_context_id_t context_id, + rocprofiler_spm_dispatch_counting_service_cb_t dispatch_callback, + void* dispatch_callback_args, + rocprofiler_spm_dispatch_counting_record_cb_t record_callback, + void* record_callback_args) ROCPROFILER_API ROCPROFILER_NONNULL(2, 4); +; + +/** + * @brief (experimental) Configure buffered dispatch spm service. + * Collects the counters in dispatch packets and stores them + * in a buffer with @p buffer_id. The buffer may contain packets from more than + * one dispatch (denoted by correlation id). Will trigger the + * callback based on the parameters setup in buffer_id_t. + * + * @param [in] context_id context id + * @param [in] buffer_id id of the buffer to use for the counting service + * @param [in] callback callback to perform when dispatch is enqueued + * @param [in] callback_data_args callback data + * @return ::rocprofiler_status_t + * @retval ROCPROFILER_STATUS_ERROR_BUFFER_NOT_FOUND + * @retval ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID invalid input context has not already been + * @retval ROCPROFILER_STATUS_ERROR_AGENT_DISPATCH_CONFLICT + * @retval ROCPROFILER_STATUS_SUCCESS if the context can be configured for SPM buffer dispatch + * service + */ + +rocprofiler_status_t +rocprofiler_configure_buffer_spm_dispatch_service( + rocprofiler_context_id_t context_id, + rocprofiler_buffer_id_t buffer_id, + rocprofiler_spm_dispatch_counting_service_cb_t callback, + void* callback_data_args) ROCPROFILER_API ROCPROFILER_NONNULL(3); + +ROCPROFILER_EXTERN_C_FINI diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h index 57a60a05b6e..71ea39e121a 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/fwd.h @@ -89,7 +89,7 @@ typedef enum rocprofiler_status_t // NOLINT(performance-enum-size) ///< service that report incompatibility ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES, ///< The given resources are ///< insufficient to complete operation - ROCPROFILER_STATUS_ERROR_PROFILE_NOT_FOUND, ///< Could not find the counter profile + ROCPROFILER_STATUS_ERROR_CONFIG_NOT_FOUND, ///< Could not find the counter profile ROCPROFILER_STATUS_ERROR_AGENT_DISPATCH_CONFLICT, ///< Cannot enable both agent and dispatch ///< counting in the same context. ROCPROFILER_STATUS_INTERNAL_NO_AGENT_CONTEXT, ///< No agent context found, may not be an error @@ -107,6 +107,7 @@ typedef enum rocprofiler_status_t // NOLINT(performance-enum-size) ///< incompatible. Late-start profiling ///< requires ROCm 7.0+. ROCPROFILER_STATUS_LAST, + ROCPROFILER_STATUS_ERROR_PROFILE_NOT_FOUND = ROCPROFILER_STATUS_ERROR_CONFIG_NOT_FOUND, ///< ROCPROFILER_STATUS_ERROR_PROFILE_NOT_FOUND is deprecated } rocprofiler_status_t; /** diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/spm.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/spm.h index 314e329949f..4557781db9f 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/spm.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/spm.h @@ -22,48 +22,8 @@ #pragma once -#include -#include +#if defined(ROCPROFILER_SDK_EXPERIMENTAL_WARNINGS) +# warning "rocprofiler-sdk/experimental/spm.h should be included for now" +#endif -ROCPROFILER_EXTERN_C_INIT - -/** - * @defgroup SPM_SERVICE SPM Service - * @brief Streaming Performance Monitoring - * - * @{ - */ - -/** - * @brief (experimental) ROCProfiler SPM Record. - * - */ -typedef struct ROCPROFILER_SDK_EXPERIMENTAL rocprofiler_spm_record_t -{ - /** - * Counters, including identifiers to get counter information and Counters - * values - */ - rocprofiler_counter_record_t* counters; - uint64_t counters_count; -} rocprofiler_spm_record_t; - -/** - * @brief Configure SPM Service. - * - * @param [in] context_id - * @param [in] buffer_id - * @param [in] counter_config - * @param [in] interval - * @return ::rocprofiler_status_t - */ -ROCPROFILER_SDK_EXPERIMENTAL -rocprofiler_status_t -rocprofiler_configure_spm_service(rocprofiler_context_id_t context_id, - rocprofiler_buffer_id_t buffer_id, - rocprofiler_counter_config_id_t counter_config, - uint64_t interval) ROCPROFILER_API; - -/** @} */ - -ROCPROFILER_EXTERN_C_FINI +#include diff --git a/projects/rocprofiler-sdk/source/lib/output/buffered_output.hpp b/projects/rocprofiler-sdk/source/lib/output/buffered_output.hpp index 9fe89ee66ab..d323da7cd43 100644 --- a/projects/rocprofiler-sdk/source/lib/output/buffered_output.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/buffered_output.hpp @@ -186,6 +186,10 @@ using rccl_buffered_output_t = buffered_output; using counter_collection_buffered_output_t = buffered_output; +using spm_counter_collection_buffered_output_t = + buffered_output; +using spm_counter_records_buffered_output_t = + buffered_output; using scratch_memory_buffered_output_t = buffered_output; diff --git a/projects/rocprofiler-sdk/source/lib/output/counter_info.cpp b/projects/rocprofiler-sdk/source/lib/output/counter_info.cpp index 5cb5441ec00..bdc8bf4800d 100644 --- a/projects/rocprofiler-sdk/source/lib/output/counter_info.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/counter_info.cpp @@ -38,14 +38,15 @@ namespace rocprofiler { namespace tool { -constexpr auto type = domain_type::COUNTER_VALUES; +constexpr auto counter_value = domain_type::COUNTER_VALUES; +constexpr auto spm_counter_value = domain_type::SPM_COUNTER_VALUES; tool_counter_record_t::container_type tool_counter_record_t::read() const { if(!record.fpos) return container_type{}; - auto& _tmp_file = CHECK_NOTNULL(get_tmp_file_buffer(type))->file; + auto& _tmp_file = CHECK_NOTNULL(get_tmp_file_buffer(counter_value))->file; return _tmp_file.read(*record.fpos); } @@ -54,8 +55,28 @@ tool_counter_record_t::write(const tool_counter_record_t::container_type& _data) { if(_data.empty()) return; - auto& _tmp_file = CHECK_NOTNULL(get_tmp_file_buffer(type))->file; + auto& _tmp_file = CHECK_NOTNULL(get_tmp_file_buffer(counter_value))->file; record.fpos = _tmp_file.write(_data.data(), _data.size()); } + +tool_spm_counter_record_t::container_type +tool_spm_counter_record_t::read() const +{ + if(!record.fpos) return container_type{}; + + auto& _tmp_file = + CHECK_NOTNULL(get_tmp_file_buffer(spm_counter_value))->file; + return _tmp_file.read(*record.fpos); +} + +void +tool_spm_counter_record_t::write(const tool_spm_counter_record_t::container_type& _data) +{ + if(_data.empty()) return; + + auto& _tmp_file = + CHECK_NOTNULL(get_tmp_file_buffer(spm_counter_value))->file; + record.fpos = _tmp_file.write(_data.data(), _data.size()); +} } // namespace tool } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/output/counter_info.hpp b/projects/rocprofiler-sdk/source/lib/output/counter_info.hpp index b57a7c96102..41b3dc01147 100644 --- a/projects/rocprofiler-sdk/source/lib/output/counter_info.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/counter_info.hpp @@ -113,6 +113,44 @@ struct tool_counter_record_t container_type read() const; void write(const container_type& data); }; + +struct tool_spm_counter_value_t +{ + rocprofiler_counter_id_t id = {}; + uint64_t value = 0; + rocprofiler_timestamp_t timestamp = 0; + rocprofiler_counter_instance_id_t instance_id = {}; + + template + void save(ArchiveT& ar) const + { + ar(cereal::make_nvp("counter_id", id)); + ar(cereal::make_nvp("value", value)); + ar(cereal::make_nvp("timestamp", timestamp)); + ar(cereal::make_nvp("instance_id", instance_id)); + } +}; + +struct tool_spm_counter_record_t +{ + using container_type = std::vector; + + uint64_t thread_id = 0; + serialized_counter_record_t record = {}; + rocprofiler_spm_dispatch_counting_service_data_t dispatch_data = {}; + + template + void save(ArchiveT& ar) const + { + auto tmp = read(); + ar(cereal::make_nvp("thread_id", thread_id)); + ar(cereal::make_nvp("dispatch_data", dispatch_data)); + ar(cereal::make_nvp("records", tmp)); + } + + container_type read() const; + void write(const container_type& data); +}; } // namespace tool } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/output/domain_type.cpp b/projects/rocprofiler-sdk/source/lib/output/domain_type.cpp index 1bb927e3c1b..abf2cfeecd5 100644 --- a/projects/rocprofiler-sdk/source/lib/output/domain_type.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/domain_type.cpp @@ -68,6 +68,14 @@ DEFINE_BUFFER_TYPE_NAME(PC_SAMPLING_STOCHASTIC, "PC_SAMPLING_STOCHASTIC", "pc_sampling_stochastic", "pc_sampling_stochastic_stats") +DEFINE_BUFFER_TYPE_NAME(SPM_COUNTER_COLLECTION, + "SPM_COUNTER_COLLECTION", + "spm_counter_collection", + "spm_counter_collection_stats") +DEFINE_BUFFER_TYPE_NAME(SPM_COUNTER_VALUES, + "SPM_COUNTER_VALUES", + "SPM_counter_values", + "SPM_counter_values") // unused #undef DEFINE_BUFFER_TYPE_NAME diff --git a/projects/rocprofiler-sdk/source/lib/output/domain_type.hpp b/projects/rocprofiler-sdk/source/lib/output/domain_type.hpp index 7850f514374..2c7dab81c0c 100644 --- a/projects/rocprofiler-sdk/source/lib/output/domain_type.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/domain_type.hpp @@ -41,6 +41,8 @@ enum class domain_type ROCJPEG, PC_SAMPLING_STOCHASTIC, KFD, + SPM_COUNTER_COLLECTION, + SPM_COUNTER_VALUES, LAST, }; diff --git a/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp b/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp index 024da2d44ca..c420a6bd22e 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateCSV.cpp @@ -632,7 +632,6 @@ generate_csv(const output_config& cfg, const auto* kernel_info = tool_metadata.get_kernel_symbol(kernel_id); auto lds_block_size_v = (kernel_info->group_segment_size + (lds_block_size - 1)) & ~(lds_block_size - 1); - auto magnitude = [](rocprofiler_dim3_t dims) { return (dims.x * dims.y * dims.z); }; auto row_ss = std::stringstream{}; for(auto& [counter_id, counter_value] : counter_id_value) @@ -1021,5 +1020,13 @@ generate_csv(const output_config& cfg, ofs << _row.str() << std::flush; } } + +void +generate_csv(const output_config& /* cfg*/, + const metadata& /*tool_metadata*/, + const generator& /*data*/, + const stats_entry_t& /*stats*/) +{} + } // namespace tool } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/output/generateCSV.hpp b/projects/rocprofiler-sdk/source/lib/output/generateCSV.hpp index 225667fa81f..790b5b54e00 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateCSV.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateCSV.hpp @@ -124,6 +124,12 @@ generate_csv(const output_config& const generator& data, const stats_entry_t& stats); +void +generate_csv(const output_config& cfg, + const metadata& tool_metadata, + const generator& data, + const stats_entry_t& stats); + void generate_csv(const output_config& cfg, const metadata& tool_metadata, diff --git a/projects/rocprofiler-sdk/source/lib/output/generateJSON.cpp b/projects/rocprofiler-sdk/source/lib/output/generateJSON.cpp index 009486e3b99..24fb944ea10 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateJSON.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateJSON.cpp @@ -200,7 +200,8 @@ write_json( const generator& rocdecode_api_gen, const generator& rocjpeg_api_gen, const generator& pc_sampling_host_trap_gen, - const generator& pc_sampling_stochastic_gen) + const generator& pc_sampling_stochastic_gen, + const generator& spm_gen) { // summary { @@ -226,6 +227,7 @@ write_json( json_ar.setNextName("callback_records"); json_ar.startNode(); json_ar(cereal::make_nvp("counter_collection", counter_collection_gen)); + json_ar(cereal::make_nvp("SPM", spm_gen)); json_ar.finishNode(); } diff --git a/projects/rocprofiler-sdk/source/lib/output/generateJSON.hpp b/projects/rocprofiler-sdk/source/lib/output/generateJSON.hpp index 938c11bdb36..0920dfddbd5 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateJSON.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateJSON.hpp @@ -99,6 +99,7 @@ write_json( const generator& rocdecode_api_gen, const generator& rocjpeg_api_gen, const generator& pc_sampling_host_trap_gen, - const generator& pc_sampling_stochastic_gen); + const generator& pc_sampling_stochastic_gen, + const generator& spm_gen); } // namespace tool } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/output/generateRocpd.cpp b/projects/rocprofiler-sdk/source/lib/output/generateRocpd.cpp index 0bfaeed99c5..debdf363a6e 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateRocpd.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateRocpd.cpp @@ -667,7 +667,8 @@ write_rocpd( const generator& kfd_gen, const generator& rccl_api_gen, const generator& rocdecode_api_gen, - const generator& counter_collection_gen) + const generator& counter_collection_gen, + const generator& spm_collection_gen) { static auto get_simple_timer = [](std::string_view label) { return common::simple_timer{fmt::format("SQLite3 generation :: {:24}", label)}; @@ -1074,6 +1075,7 @@ write_rocpd( insert_value("expression", _expression, allow_empty_string{}), insert_value("is_constant", aitr.is_constant), insert_value("is_derived", aitr.is_derived), + insert_value("spm_support", aitr.spm_support), insert_value("extdata", json_data), }); @@ -1141,6 +1143,8 @@ write_rocpd( auto agent_node_id = tool_metadata.get_agent(info.agent_id)->node_id; + get_thread_id(thread_id); + // Insert into kernel dispatch table auto stmt = get_insert_statement( "rocpd_kernel_dispatch{{uuid}}", @@ -1205,6 +1209,39 @@ write_rocpd( ); } } + + for(auto pctr : spm_collection_gen) + { + auto _deferred = sql::deferred_transaction{conn}; + for(const auto& record : spm_collection_gen.get(pctr)) + { + const auto& dispatch_data = record.dispatch_data; + const auto& info = dispatch_data.dispatch_info; + + // Register thread ID + get_thread_id(record.thread_id); + + // Use buffer category for kernel dispatches + auto kind = + tool_metadata.buffer_names.at(ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH); + + // Process this dispatch (SPM dispatch timestamps are not available) + process_dispatch(info.dispatch_id, // dispatch_id + info.kernel_id, // kernel_id + dispatch_data.correlation_id, // corr_id + info, // info + kind, // kind + record.thread_id, // thread_id + get_queue_id(info.queue_id), // queue_id + get_stream_id(rocprofiler_stream_id_t{.handle = 0}), + 0, // start_timestamp + 0, // end_timestamp + info.grid_size, // grid + info.workgroup_size, // workgroup + false // enable_duplicate_check + ); + } + } } else { @@ -1214,7 +1251,7 @@ write_rocpd( for(auto itr : kernel_dispatch_gen.get(pitr)) { // Register thread ID - get_thread_id(itr.thread_id); + if(itr.thread_id != 0) get_thread_id(itr.thread_id); // Process this dispatch process_dispatch(itr.dispatch_info.dispatch_id, // dispatch_id @@ -1236,8 +1273,10 @@ write_rocpd( } }; - auto insert_pmc_event_data = [&conn, &tool_metadata, &counter_collection_gen]( - auto& dispatch_evt_ids) { + auto insert_pmc_event_data = [&conn, + &tool_metadata, + &counter_collection_gen, + &spm_collection_gen](auto& dispatch_evt_ids) { auto _sqlgenperf_rocpd = get_simple_timer("rocpd_pmc_event"); size_t idx = tool_metadata.pmc_event_offset; for(auto ditr : counter_collection_gen) @@ -1263,6 +1302,30 @@ write_rocpd( } } } + for(auto ditr : spm_collection_gen) + { + auto _deferred = sql::deferred_transaction{conn}; + for(const auto& record : spm_collection_gen.get(ditr)) + { + const auto& info = record.dispatch_data.dispatch_info; + auto dispatch_id = info.dispatch_id; + + auto evt_id = dispatch_evt_ids.at(dispatch_id); + for(const auto& count : record.read()) + { + auto stmt = get_insert_statement("rocpd_pmc_event{{uuid}}", + { + insert_value("id", idx++), + insert_value("event_id", evt_id), + insert_value("pmc_id", count.id.handle), + insert_value("value", count.value), + insert_value("timestamp", count.timestamp), + }); + + execute_raw_sql_statements(conn, stmt); + } + } + } }; auto insert_memory_copy_data = diff --git a/projects/rocprofiler-sdk/source/lib/output/generateRocpd.hpp b/projects/rocprofiler-sdk/source/lib/output/generateRocpd.hpp index 497a50ad3e3..7e27c6d7377 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateRocpd.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateRocpd.hpp @@ -51,7 +51,8 @@ write_rocpd( const generator& kfd_gen, const generator& rccl_api_gen, const generator& rocdecode_api_gen, - const generator& counter_collection_gen); + const generator& counter_collection_gen, + const generator& spm_collection_gen); // used in schema generation struct argument_info diff --git a/projects/rocprofiler-sdk/source/lib/output/generateStats.cpp b/projects/rocprofiler-sdk/source/lib/output/generateStats.cpp index 90009a054a9..eb2a1a1b1c8 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateStats.cpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateStats.cpp @@ -274,6 +274,14 @@ generate_stats(const output_config& /*cfg*/, return get_stats(rocjpeg_stats); } +stats_entry_t +generate_stats(const output_config& /*cfg*/, + const metadata& /*tool_metadata*/, + const generator& /*data*/) +{ + return stats_entry_t{}; +} + namespace { void diff --git a/projects/rocprofiler-sdk/source/lib/output/generateStats.hpp b/projects/rocprofiler-sdk/source/lib/output/generateStats.hpp index 5c4e5681cd1..d04c698f121 100644 --- a/projects/rocprofiler-sdk/source/lib/output/generateStats.hpp +++ b/projects/rocprofiler-sdk/source/lib/output/generateStats.hpp @@ -63,6 +63,10 @@ generate_stats(const output_config& cfg, const generator& data); stats_entry_t +generate_stats(const output_config& cfg, + const metadata& tool_metadata, + const generator& data); +stats_entry_t generate_stats(const output_config& cfg, const metadata& tool_metadata, const generator& data); diff --git a/projects/rocprofiler-sdk/source/lib/python/rocpd/csv.py b/projects/rocprofiler-sdk/source/lib/python/rocpd/csv.py index d92d0d66ee3..d707b83baf5 100644 --- a/projects/rocprofiler-sdk/source/lib/python/rocpd/csv.py +++ b/projects/rocprofiler-sdk/source/lib/python/rocpd/csv.py @@ -340,6 +340,7 @@ def write_counters_csv(importData, config) -> None: "sgpr_count", "counter_name", "value AS Counter_Value", + "timestamp AS Timestamp", "start AS Start_Timestamp", "end AS End_Timestamp", ] diff --git a/projects/rocprofiler-sdk/source/lib/python/rocprofv3/avail.py b/projects/rocprofiler-sdk/source/lib/python/rocprofv3/avail.py index 21e3d7972ba..f6e7257fa42 100644 --- a/projects/rocprofiler-sdk/source/lib/python/rocprofv3/avail.py +++ b/projects/rocprofiler-sdk/source/lib/python/rocprofv3/avail.py @@ -38,7 +38,8 @@ def build_counter_string(obj): counter_str = "\n".join( ["{:20}:\t{}".format(key, itr) for key, itr in obj.get_as_dict().items()] ) - + spm_support = "Supported" if obj.spm_support else "Not Supported" + counter_str += "\n" + "{:20}:\t{}".format("SPM", spm_support) counter_str += "\n" + "{:20}:\t".format("Dimensions") counter_str += " ".join(dim.__str__() for dim in obj.dimensions) return counter_str @@ -74,12 +75,14 @@ def __init__( counter_description, counter_dimensions, is_hw_constant, + spm_support, ): self.name = counter_name self.counter_handle = counter_handle self.description = counter_description self.dimensions = counter_dimensions self.is_hw_constant = is_hw_constant + self.spm_support = spm_support def get_as_dict(self): return dict(zip((self.columns), [self.name, self.description])) @@ -102,6 +105,7 @@ def __init__( counter_expression, counter_dimensions, is_hw_constant, + spm_support, ): super().__init__( counter_handle, @@ -109,6 +113,7 @@ def __init__( counter_description, counter_dimensions, is_hw_constant, + spm_support, ) self.expression = counter_expression @@ -131,6 +136,7 @@ def __init__( counter_block, counter_dimensions, is_hw_constant, + spm_support, ): super().__init__( counter_handle, @@ -138,6 +144,7 @@ def __init__( counter_description, counter_dimensions, is_hw_constant, + spm_support, ) self.block = counter_block @@ -338,6 +345,24 @@ def get_counters(): return agent_counters +def get_spm_counters(): + agent_counters = {} + agents = get_agent_handles() + agent_counters = {} + agent_info_map = get_agent_info_map() + for agent in agents: + if agent_info_map[agent]["type"] != 2: + continue + agent_counters.setdefault(agent, []) + counters = get_agent_counter_handles(agent) + if counters: + for counter_id in list(counters): + counter_info = get_counter_info(counter_id) + if counter_info.spm_support.value: + agent_counters[agent].append(counter_info) + return agent_counters + + def get_pc_sample_configs(): agent_pc_sample_config = {} agents = get_agent_handles() @@ -358,17 +383,20 @@ def get_counter_info(counter_handle): ctypes.POINTER(ctypes.c_char_p), ctypes.POINTER(ctypes.c_uint), ctypes.POINTER(ctypes.c_uint), + ctypes.POINTER(ctypes.c_uint), ] counter_name = ctypes.c_char_p() counter_description = ctypes.c_char_p() is_derived = ctypes.c_uint() is_hw_constant = ctypes.c_uint() + spm_support = ctypes.c_uint() lib.counter_info( counter_handle, ctypes.byref(counter_name), ctypes.byref(counter_description), ctypes.byref(is_derived), ctypes.byref(is_hw_constant), + ctypes.byref(spm_support), ) if is_derived.value == 1: @@ -386,6 +414,7 @@ def get_counter_info(counter_handle): get_string_value(expression), dimensions, is_hw_constant, + spm_support, ) elif not is_hw_constant.value: @@ -400,6 +429,7 @@ def get_counter_info(counter_handle): get_string_value(block), dimensions, is_hw_constant, + spm_support, ) else: return counter( @@ -408,6 +438,7 @@ def get_counter_info(counter_handle): get_string_value(counter_description), [], is_hw_constant.value, + spm_support, ) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp index f172d74c1be..f2c8be580cc 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.cpp @@ -189,7 +189,7 @@ parse_att_counters(std::string line) } std::set -parse_counters(std::string line) +parse_counters(std::string line, const std::string& qualifier) { auto counters = std::set{}; @@ -205,14 +205,13 @@ parse_counters(std::string line) // check to see if comment stripping + trim resulted in empty line if(line.empty()) return counters; - constexpr auto pmc_qualifier = std::string_view{"pmc:"}; - auto pos = std::string::npos; + auto pos = std::string::npos; // should we handle an "pmc:" not being present? Seems like it should be a fatal error - if((pos = line.find(pmc_qualifier)) != std::string::npos) + if((pos = line.find(qualifier)) != std::string::npos) { // strip out pmc qualifier - line = line.substr(pos + pmc_qualifier.length()); + line = line.substr(pos + qualifier.length()); handle_special_chars(line); @@ -223,7 +222,7 @@ parse_counters(std::string line) input_ss >> counter; if(counter.empty()) break; - else if(counter != pmc_qualifier && has_counter_format(counter)) + else if(counter != qualifier && has_counter_format(counter)) counters.emplace(counter); } } @@ -236,7 +235,7 @@ parse_counter_envs() { if(auto single_counter = get_env("ROCPROF_COUNTERS", std::string{}); !single_counter.empty()) { - return {parse_counters(single_counter)}; + return {parse_counters(single_counter, "pmc:")}; } if(auto group_counters = get_env("ROCPROF_COUNTER_GROUPS", std::string{}); @@ -245,7 +244,7 @@ parse_counter_envs() auto counters = std::vector>{}; for(const auto& group : rocprofiler::sdk::parse::tokenize(group_counters, "\n")) { - counters.emplace_back(parse_counters(group)); + counters.emplace_back(parse_counters(group, "pmc:")); } return counters; } @@ -258,6 +257,7 @@ config::config() , kernel_filter_range{get_kernel_filter_range( get_env("ROCPROF_KERNEL_FILTER_RANGE", std::string{}))} , counters{parse_counter_envs()} +, spm_counters({parse_counters(get_env("ROCPROF_SPM_COUNTERS", std::string()), "spm:")}) , att_param_perfcounters{ parse_att_counters(get_env("ROCPROF_ATT_PARAM_PERFCOUNTERS", std::string{}))} { diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp index e03abfe0f0a..774278fa1a6 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/config.hpp @@ -135,6 +135,7 @@ struct config : output_config bool list_metrics = get_env("ROCPROF_LIST_METRICS", false); bool list_metrics_output_file = get_env("ROCPROF_OUTPUT_LIST_METRICS_FILE", false); bool advanced_thread_trace = get_env("ROCPROF_ADVANCED_THREAD_TRACE", false); + bool spm_counter_collection = get_env("ROCPROF_SPM_COUNTER_COLLECTION", false); bool att_serialize_all = get_env("ROCPROF_ATT_PARAM_SERIALIZE_ALL", false); bool enable_signal_handlers = get_env("ROCPROF_SIGNAL_HANDLERS", true); bool enable_process_sync = get_env("ROCPROF_PROCESS_SYNC", false); @@ -159,6 +160,10 @@ struct config : output_config bool att_param_target_only = get_env("ROCPROF_ATT_PARAM_TARGET_ONLY", 0) != 0; uint64_t att_consecutive_kernels = get_env("ROCPROF_ATT_CONSECUTIVE_KERNELS", 0); + size_t spm_buffer_size_kb = get_env("ROCPROF_SPM_BUFFER_SIZE", 1 << 15); + size_t spm_timeout_ms = get_env("ROCPROF_SPM_TIMEOUT_MS", 0); + double spm_frequency_ghz = get_env("ROCPROF_SPM_FREQUENCY", 0.5); + std::string kernel_filter_include = get_env("ROCPROF_KERNEL_FILTER_INCLUDE_REGEX", ".*"); std::string kernel_filter_exclude = get_env("ROCPROF_KERNEL_FILTER_EXCLUDE_REGEX", ""); std::string pc_sampling_method = get_env("ROCPROF_PC_SAMPLING_METHOD", "none"); @@ -169,6 +174,7 @@ struct config : output_config std::unordered_set kernel_filter_range = {}; std::vector> counters = {}; + std::set spm_counters = {}; std::vector att_param_perfcounters = {}; std::queue collection_periods = {}; @@ -283,6 +289,7 @@ config::save(ArchiveT& ar) const CFG_SERIALIZE_MEMBER(mpi_size); CFG_SERIALIZE_MEMBER(collection_periods); CFG_SERIALIZE_MEMBER(counters); + CFG_SERIALIZE_MEMBER(spm_counters); CFG_SERIALIZE_MEMBER(extra_counters_contents); CFG_SERIALIZE_MEMBER(kernel_filter_include); CFG_SERIALIZE_MEMBER(kernel_filter_exclude); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp index 62fec9b57df..b128df33e77 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk-tool/tool.cpp @@ -68,6 +68,7 @@ #include #include #include +#include #include #include #include @@ -98,6 +99,7 @@ #include #include #include +#include #include #include #include @@ -1488,6 +1490,10 @@ get_config_perf_counters() { tool_pmc_counters.emplace(att_counter.counter_name); } + for(const auto& spm_counter : rocprofiler::tool::get_config().spm_counters) + { + tool_pmc_counters.emplace(spm_counter); + } return tool_pmc_counters; } @@ -1783,6 +1789,98 @@ counter_record_callback(rocprofiler_dispatch_counting_service_data_t dispatch_da } } +std::optional +get_spm_config(rocprofiler_agent_id_t agent_id) +{ + static const auto gpu_agents_counter_info = get_agent_counter_info(); + static auto agent_configs = + std::unordered_map{}; + + auto itr = agent_configs.find(agent_id); + if(itr != agent_configs.end()) return itr->second; + + auto params = rocprofiler_spm_configuration_t{}; + params.frequency = tool::get_config().spm_frequency_ghz; + params.buffer_size = tool::get_config().spm_buffer_size_kb; + params.timeout = tool::get_config().spm_timeout_ms; + + auto expected_counters = std::vector{}; + + for(const auto& citr : gpu_agents_counter_info.at(agent_id)) + { + for(const auto& desired_counter : rocprofiler::tool::get_config().spm_counters) + { + if(citr.spm_support && std::string_view{desired_counter} == std::string_view{citr.name}) + expected_counters.emplace_back(citr.id); + } + } + auto config = rocprofiler_counter_config_id_t{}; + ROCPROFILER_CALL( + rocprofiler_spm_create_counter_config( + agent_id, expected_counters.data(), expected_counters.size(), ¶ms, &config), + "SPM could not be configured"); + agent_configs.emplace(agent_id, config); + return config; +} + +void +spm_dispatch_callback(const rocprofiler_spm_dispatch_counting_service_data_t* dispatch_data, + rocprofiler_counter_config_id_t* config, + rocprofiler_user_data_t* user_data, + void* /*callback_data_args*/) +{ + static auto kernel_iteration = common::Synchronized{}; + + if(!is_targeted_kernel(dispatch_data->dispatch_info.kernel_id, kernel_iteration)) + { + return; + } + else if(auto profile = get_spm_config(dispatch_data->dispatch_info.agent_id)) + { + *config = *profile; + user_data->value = common::get_tid(); + } +} + +void +spm_data_callback(const rocprofiler_spm_dispatch_counting_service_data_t* dispatch_data, + const rocprofiler_spm_counter_record_t** records, + size_t record_count, + int flags, + rocprofiler_user_data_t user_data, + void* /* record_callback_args*/) +{ + if(record_count == 0) return; + + if((flags >> ROCPROFILER_SPM_RECORD_FLAG_DATA) != 0) + { + auto counter_record = tool::tool_spm_counter_record_t{}; + counter_record.dispatch_data = *dispatch_data; + counter_record.thread_id = user_data.value; + auto serialized_records = std::vector{}; + for(size_t count = 0; count < record_count; count++) + { + auto _counter_id = rocprofiler_counter_id_t{}; + + ROCPROFILER_CALL(rocprofiler_query_record_counter_id(records[count]->id, &_counter_id), + "query record counter id"); + serialized_records.emplace_back(tool::tool_spm_counter_value_t{ + _counter_id, records[count]->value, records[count]->timestamp, records[count]->id}); + } + + if(!serialized_records.empty()) + { + counter_record.write(serialized_records); + tool::write_ring_buffer(counter_record, domain_type::SPM_COUNTER_COLLECTION); + } + } + + if((flags >> ROCPROFILER_SPM_RECORD_FLAG_DATA_LOST) != 0) + { + ROCP_WARNING << fmt::format("SPM data loss in dispatch ID {}", + dispatch_data->dispatch_info.dispatch_id); + } +} rocprofiler_client_finalize_t client_finalizer = nullptr; rocprofiler_client_id_t* client_identifier = nullptr; @@ -2509,6 +2607,18 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) start_context(counter_collection_ctx, "counter collection"); } + if(tool::get_config().spm_counter_collection) + { + ROCPROFILER_CALL(rocprofiler_create_context(&counter_collection_ctx), + "failed to create counter collection context"); + ROCPROFILER_CALL( + rocprofiler_configure_callback_spm_dispatch_service( + counter_collection_ctx, spm_dispatch_callback, nullptr, spm_data_callback, nullptr), + "Could not setup SPM counting service"); + + start_context(counter_collection_ctx, "SPM counter collection"); + } + auto rename_ctx = rocprofiler_context_id_t{0}; auto marker_core_api_kinds = std::array{ ROCPROFILER_MARKER_CORE_RANGE_API_ID_roctxMarkA, @@ -2880,6 +2990,8 @@ generate_output(cleanup_mode _cleanup_mode) auto marker_output = tool::marker_buffered_output_t{tool::get_config().marker_api_trace}; auto counters_output = tool::counter_collection_buffered_output_t{tool::get_config().counter_collection}; + auto spm_counters_output = + tool::spm_counter_collection_buffered_output_t{tool::get_config().spm_counter_collection}; auto scratch_memory_output = tool::scratch_memory_buffered_output_t{tool::get_config().scratch_memory_trace}; auto rccl_output = tool::rccl_buffered_output_t{tool::get_config().rccl_api_trace}; @@ -2936,6 +3048,7 @@ generate_output(cleanup_mode _cleanup_mode) generate_output(pc_sampling_host_trap_output, outdata, contributions, cleanups); generate_output(rocjpeg_output, outdata, contributions, cleanups); generate_output(pc_sampling_stochastic_output, outdata, contributions, cleanups); + generate_output(spm_counters_output, outdata, contributions, cleanups); if(tool::get_config().advanced_thread_trace && !tool_metadata->att_filenames.empty()) { @@ -2982,7 +3095,8 @@ generate_output(cleanup_mode _cleanup_mode) rocdecode_output.get_generator(), rocjpeg_output.get_generator(), pc_sampling_host_trap_output.get_generator(), - pc_sampling_stochastic_output.get_generator()); + pc_sampling_stochastic_output.get_generator(), + spm_counters_output.get_generator()); json_ar.finish_process(); tool::close_json(json_ar); @@ -3023,7 +3137,8 @@ generate_output(cleanup_mode _cleanup_mode) kfd_output.get_generator(), rccl_output.get_generator(), rocdecode_output.get_generator(), - counters_output.get_generator()); + counters_output.get_generator(), + spm_counters_output.get_generator()); } if(tool::get_config().otf2_output && outdata.num_output > 0 && diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/CMakeLists.txt index e5ab1856eeb..bead908976c 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/CMakeLists.txt +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/CMakeLists.txt @@ -48,6 +48,7 @@ add_subdirectory(aql) add_subdirectory(pc_sampling) add_subdirectory(marker) add_subdirectory(thread_trace) +add_subdirectory(spm) add_subdirectory(tracing) add_subdirectory(kernel_dispatch) add_subdirectory(kfd) diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/aql/aql_profile_v2.h b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/aql/aql_profile_v2.h index 4771f1f204a..24db08f16ba 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/aql/aql_profile_v2.h +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/aql/aql_profile_v2.h @@ -91,6 +91,14 @@ typedef enum AQLPROFILE_ACCUMULATION_LAST, } aqlprofile_accumulation_type_t; +typedef enum +{ + AQLPROFILE_SPM_DEPTH_NONE, + AQLPROFILE_SPM_DEPTH_16_BITS, + AQLPROFILE_SPM_DEPTH_32_BITS, + AQLPROFILE_SPM_DEPTH_64_BITS +} aqlprofile_spm_depth_t; + /** * @brief Special flags indicating additional properties to a counter. E.g. Accumulation metrics */ @@ -102,6 +110,11 @@ typedef union uint32_t accum : 3; /**< One of aqlprofile_accumulation_type_t */ uint32_t _reserved : 29; } sq_flags; + struct + { + uint32_t _reserved : 28; + uint32_t depth : 4; /**< One of aqlprofile_spm_depth_t */ + } spm_flags; } aqlprofile_pmc_event_flags_t; /** @@ -446,6 +459,204 @@ aqlprofile_att_codeobj_marker(hsa_ext_amd_aql_pm4_packet_t* packet, aqlprofile_memory_dealloc_callback_t dealloc_cb, void* userdata); +/** + * @brief Struct to be returned by aqlprofile_spm_create_packets + */ +typedef struct +{ + hsa_ext_amd_aql_pm4_packet_t start_packet; + hsa_ext_amd_aql_pm4_packet_t stop_packet; +} aqlprofile_spm_aql_packets_t; + +typedef struct +{ + void* data; // Valid until delete_packets() is called. Caller must save contents otherwise. + size_t size; // Size of "data" +} aqlprofile_spm_buffer_desc_t; + +typedef enum +{ + AQLPROFILE_SPM_PARAMETER_TYPE_BUFFER_SIZE = 0, + AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_INTERVAL, + AQLPROFILE_SPM_PARAMETER_TYPE_TIMEOUT, + AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_MODE, + AQLPROFILE_SPM_PARAMETER_TYPE_LAST, +} aqlprofile_spm_parameter_type_t; + +typedef enum +{ + AQLPROFILE_SPM_PARAMETER_SAMPLE_MODE_SCLK = 0, + AQLPROFILE_SPM_PARAMETER_SAMPLE_MODE_REFCLK +} aqlprofile_spm_parameter_interval_mode_t; + +typedef struct +{ + aqlprofile_spm_parameter_type_t type; + uint64_t value; +} aqlprofile_spm_parameter_t; + +/** + * @brief AQLprofile struct containing information for SPM counter events + */ +typedef struct +{ + aqlprofile_agent_handle_t aql_agent; + hsa_agent_t hsa_agent; + const aqlprofile_pmc_event_t* events; + size_t event_count; + aqlprofile_spm_parameter_t* parameters; + size_t parameter_count; + size_t reserved; // For future use + aqlprofile_memory_alloc_callback_t alloc_cb; + aqlprofile_memory_dealloc_callback_t dealloc_cb; // Frees memory allocated by alloc_cb + aqlprofile_memory_copy_t memcpy_cb; + void* userdata; + /// @brief Memory allocation, usually a wrapper for hsa_amd_memory_pool_allocate + /// @brief Copy memory in and out of GPU memory allocated by alloc_cb + /// @brief Passed back to user in the memory callbacks +} aqlprofile_spm_profile_t; + +/** + * @brief Function to create control SPM packets + * @param[out] handle To be passed to iterate_data() + * @param[out] desc Used to decode SPM buffer contents + * @param[out] packets Start/Stop AQL packets to be inserted in the queue + * @param[in] profile Agent and events information + * @param[in] data_cb Callback to retrieve SPM data when available + * @param[in] flags Reserved. Must be zero. + * @param[in] userdata Passed back to user + * @retval HSA_STATUS_SUCCESS on success + * @retval HSA_STATUS_ERROR on generic error + * @retval HSA_STATUS_ERROR_OUT_OF_RESOURCES if memory allocation unsuccessful + * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT for invalid parameter or event + * @retval HSA_STATUS_ERROR_INVALID_AGENT for invalid agent handle + */ +hsa_status_t +aqlprofile_spm_create_packets(aqlprofile_handle_t* handle, + aqlprofile_spm_buffer_desc_t* desc, + aqlprofile_spm_aql_packets_t* packets, + aqlprofile_spm_profile_t profile, + size_t flags); + +/** + * @brief Destroys resources allocated by aqlprofile_spm_create_packets() + * Implicitly calls aqlprofile_spm_stop. The descriptor pointer is invalid after this call. + * @param[in] handle Handle + */ +void +aqlprofile_spm_delete_packets(aqlprofile_handle_t handle); + +typedef size_t aqlprofile_spm_buffer_handle_t; + +typedef enum +{ + AQLPROFILE_SPM_DATA_FLAGS_DATA_LOSS = 0, +} aqlprofile_spm_data_flags_t; + +/** + * @brief Data callback for SPM events. + * @param[in] handle Handle to be passed to aqlprofile_spm_decode_data_callback_t + * @param[in] spm_data SPM raw data. Can be decoded via aqlprofile_spm_decode() + * @param[in] size Size of "spm_data" + * @param[in] flags Bitwise combination of aqlprofile_spm_data_flags_t + * @param[in] userdata Data returned to user + */ +typedef void (*aqlprofile_spm_data_callback_t)(aqlprofile_spm_buffer_handle_t handle, + void* spm_data, + size_t size, + int flags, + void* userdata); + +/** + * @brief Starts processing of SPM buffer + * @param[in] handle Handle + * @param[in] data_cb Callback to retrieve SPM data when available + * @param[in] userdata Passed back to user + * @retval HSA_STATUS_SUCCESS on success + * @retval HSA_STATUS_ERROR generic error + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED for invalid handle + */ +hsa_status_t +aqlprofile_spm_start(aqlprofile_handle_t handle, + aqlprofile_spm_data_callback_t data_cb, + void* userdata); + +/** + * @brief Flushes remaining SPM data and stops processing of SPM buffer + * @param[in] handle Handle + * @retval HSA_STATUS_SUCCESS on success + * @retval HSA_STATUS_ERROR generic error + * @retval HSA_STATUS_ERROR_NOT_INITIALIZED for invalid handle + */ +hsa_status_t +aqlprofile_spm_stop(aqlprofile_handle_t handle); + +/** + * @brief Callback where decoded SPM data will be returned to + * @param[in] timestamp timestamp of sample + * @param[in] value counter value + * @param[in] index index into the counter list + * @param[in] shader_engine shader engine of the sample + * @param[in] userdata userdata from aqlprofile_spm_decode_stream_v1 + */ + +typedef void (*aqlprofile_spm_decode_callback_v1_t)(uint64_t timestamp, + uint64_t value, + uint64_t index, + int shader_engine, + void* userdata); + +/** + * @brief Decodes a raw buffer returned by aqlprofile_spm_data_callback_t. + * Returns results accumulated per event_id requested. + * @param[in] desc Descriptor returned in create_packets() + * @param[in] decode_cb Callback where decoded SPM data will be returned to + * @param[in] data Raw SPM data returned in aqlprofile_spm_data_callback_t + * @param[in] size Raw data size + * @param[in] userdata Passed back to user + * @retval HSA_STATUS_SUCCESS if decode successful + * @retval HSA_STATUS_ERROR for generic error + */ +hsa_status_t +aqlprofile_spm_decode_stream_v1(aqlprofile_spm_buffer_desc_t desc, + aqlprofile_spm_decode_callback_v1_t decode_cb, + void* data, + size_t size, + void* userdata); + +enum aqlprofile_spm_decode_query_t +{ + AQLPROFILE_SPM_DECODE_QUERY_SEG_SIZE = 0, + AQLPROFILE_SPM_DECODE_QUERY_NUM_XCC, + AQLPROFILE_SPM_DECODE_QUERY_EVENT_COUNT, + AQLPROFILE_SPM_DECODE_QUERY_COUNTER_MAP_BYTE_OFFSET, + AQLPROFILE_SPM_DECODE_QUERY_LAST +}; + +/** + * @brief Function to query data contained in aqlprofile_spm_buffer_desc_t + * @param[in] desc Descriptor returned in create_packets() + * @param[in] query enum of type aqlprofile_spm_decode_query_t + * @param[out] data information output + * @retval HSA_STATUS_SUCCESS if decode successful + * @retval HSA_STATUS_ERROR for generic error + */ + +hsa_status_t +aqlprofile_spm_decode_query(aqlprofile_spm_buffer_desc_t desc, + aqlprofile_spm_decode_query_t query, + uint64_t* param_out); + +/** + * @brief Function to query if an event is supported on an agent + * @param[in] agent agent on which event needs to be collected + * @param[in] event event to be collected + * @retval bool to indicate if the event can be collected on an agent + */ + +bool +aqlprofile_spm_is_event_supported(aqlprofile_agent_handle_t agent, aqlprofile_pmc_event_t event); + #ifdef __cplusplus } #endif diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/aql/packet_construct.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/aql/packet_construct.cpp index 3417bd090d8..947d030b2e8 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/aql/packet_construct.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/aql/packet_construct.cpp @@ -42,6 +42,12 @@ namespace rocprofiler { namespace aql { +struct AQLProfileMetric +{ + counters::Metric metric; + std::vector events; +}; + CounterPacketConstruct::CounterPacketConstruct(rocprofiler_agent_id_t agent, const std::vector& metrics) : _agent(agent) @@ -278,5 +284,144 @@ CounterPacketConstruct::can_collect() } return ROCPROFILER_STATUS_SUCCESS; } + +/** @brief Constructs the packet using the contained input parameters. + * Writes into ID map and spm descriptor used to decode SPM data + */ +std::unique_ptr +spm_construct_packet(const rocprofiler_agent_id_t agent_id, + const std::vector& metrics, + double sample_freq, + uint64_t buffer_size, + uint64_t timeout) +{ + auto events = std::vector{}; + auto params = std::vector{}; + auto id_map = std::vector{}; + + const auto* agent = CHECK_NOTNULL(rocprofiler::agent::get_agent(agent_id)); + const auto* aql_cache = CHECK_NOTNULL(rocprofiler::agent::get_agent_cache(agent)); + auto pool = std::make_shared( + *aql_cache, *hsa::get_amd_ext_table(), hsa::get_core_table()->hsa_memory_copy_fn); + const auto* aql_agent = rocprofiler::agent::get_aql_agent(agent->id); + + const double sclk_freq = agent->max_engine_clk_fcompute * 1E9; // GHz + const size_t sclk_period = static_cast(std::roundf(sclk_freq / ((sample_freq) *1E9))); + + params.push_back({AQLPROFILE_SPM_PARAMETER_TYPE_BUFFER_SIZE, buffer_size * 1024}); + params.push_back({AQLPROFILE_SPM_PARAMETER_TYPE_SAMPLE_INTERVAL, sclk_period}); + params.push_back({AQLPROFILE_SPM_PARAMETER_TYPE_TIMEOUT, timeout}); + + for(const auto& metric : metrics) + { + auto query_info = get_query_info(agent_id, metric); + + for(unsigned block_index = 0; block_index < query_info.instance_count; ++block_index) + { + auto event = aqlprofile_pmc_event_t{ + .block_index = block_index, + .event_id = + static_cast(std::stoul(metric.event().c_str(), nullptr) & 0xFFFFFFFF), + .flags = aqlprofile_pmc_event_flags_t{metric.flags()}, + .block_name = static_cast(query_info.id)}; + + events.push_back(event); + id_map.push_back({rocprofiler_counter_id_t{.handle = metric.id()}, block_index}); + } + } + + aqlprofile_spm_profile_t profile{.aql_agent = *aql_agent, + .hsa_agent = pool->gpu_agent, + .events = events.data(), + .event_count = events.size(), + .parameters = params.data(), + .parameter_count = params.size(), + .reserved = 0, + .alloc_cb = &(hsa::SPMMemoryPool::Alloc), + .dealloc_cb = &(hsa::SPMMemoryPool::Free), + .memcpy_cb = &(hsa::SPMMemoryPool::Copy), + .userdata = pool.get()}; + + auto pkt = std::make_unique(*aql_agent, profile); + ROCP_FATAL_IF(!pkt->valid()) << "SPM Packet creation failed"; + + pool->delete_packets_fn = pkt->sym->spm_delete_packets; + pool->handle = pkt->handle; + pkt->pool = std::move(pool); + + pkt->spm_desc.size = + sizeof(spm::spm_desc_v0_t) + id_map.size() * sizeof(id_map[0]) + pkt->aql_desc.size; + + pkt->container_desc_data = std::make_shared>(pkt->spm_desc.size); + pkt->spm_desc.data = pkt->container_desc_data->data(); + + auto* desc = static_cast(pkt->spm_desc.data); + + *desc = spm::spm_desc_v0_t{}; + desc->aql_desc_size = pkt->aql_desc.size; + desc->num_events = id_map.size(); + + std::memcpy(desc->aqlprofile_desc(), pkt->aql_desc.data, pkt->aql_desc.size); + std::memcpy(desc->events(), id_map.data(), id_map.size() * sizeof(id_map[0])); + + pkt->clear(); + return pkt; +} + +// Following the PMC check for now +// ToDO: change this to SPM +rocprofiler_status_t +spm_can_collect(const rocprofiler_agent_id_t agent_id, const std::vector& metrics) +{ + // Verify that the counters fit within harrdware limits + auto counter_count = + std::map, int64_t>{}; + auto max_allowed = + std::map, int64_t>{}; + auto _metrics = std::vector{}; + + for(const auto& metric : metrics) + { + auto query_info = get_query_info(agent_id, metric); + _metrics.emplace_back().metric = metric; + + auto event_id = + static_cast(std::stoul(metric.event().c_str(), nullptr) & 0xFFFFFFFF); + + for(unsigned block_index = 0; block_index < query_info.instance_count; ++block_index) + { + _metrics.back().events.push_back( + {.block_index = block_index, + .event_id = event_id, + .flags = aqlprofile_pmc_event_flags_t{metric.flags()}, + .block_name = static_cast(query_info.id)}); + } + } + + for(auto& metric : _metrics) + { + for(auto& instance : metric.events) + { + auto block_pair = std::make_pair(instance.block_name, instance.block_index); + auto [iter, inserted] = counter_count.emplace(block_pair, 0); + iter->second++; + if(inserted) + { + max_allowed.emplace(block_pair, get_block_counters(agent_id, instance)); + } + } + } + + // Check if the block count > max count + for(auto& [block_name, count] : counter_count) + { + if(auto* max = CHECK_NOTNULL(common::get_val(max_allowed, block_name)); count > *max) + { + return ROCPROFILER_STATUS_ERROR_EXCEEDS_HW_LIMIT; + } + } + return ROCPROFILER_STATUS_SUCCESS; +} + } // namespace aql } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/aql/packet_construct.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/aql/packet_construct.hpp index 035879293bd..1c604845af9 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/aql/packet_construct.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/aql/packet_construct.hpp @@ -26,6 +26,7 @@ #include "lib/rocprofiler-sdk/aql/helpers.hpp" #include "lib/rocprofiler-sdk/counters/metrics.hpp" #include "lib/rocprofiler-sdk/hsa/agent_cache.hpp" +#include "lib/rocprofiler-sdk/spm/decode.hpp" #include "lib/rocprofiler-sdk/thread_trace/core.hpp" #include @@ -126,5 +127,16 @@ class ThreadTraceAQLPacketFactory hsa::TraceMemoryPool tracepool; }; +std::unique_ptr +spm_construct_packet(const rocprofiler_agent_id_t agent_id, + const std::vector& metrics, + double sample_freq, + uint64_t buffer_size, + uint64_t timeout); + +rocprofiler_status_t +spm_can_collect(const rocprofiler_agent_id_t agent_id, + const std::vector& metrics); + } // namespace aql } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/context.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/context.cpp index 053845b3c8c..99247a7df84 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/context.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/context.cpp @@ -338,6 +338,7 @@ start_context(rocprofiler_context_id_t context_id) auto status = ROCPROFILER_STATUS_SUCCESS; + if(cfg->dispatch_spm) status = rocprofiler::spm::start_context(cfg); if(cfg->counter_collection) rocprofiler::counters::start_context(cfg); if(cfg->device_thread_trace) cfg->device_thread_trace->start_context(); if(cfg->dispatch_thread_trace) cfg->dispatch_thread_trace->start_context(); @@ -374,6 +375,9 @@ stop_context(rocprofiler_context_id_t idx) rocprofiler::counters::stop_context(const_cast(_expected)); } + if(_expected->dispatch_spm) + rocprofiler::spm::stop_context(const_cast(_expected)); + if(_expected->device_thread_trace) _expected->device_thread_trace->stop_context(); if(_expected->dispatch_thread_trace) _expected->dispatch_thread_trace->stop_context(); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/context.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/context.hpp index 62f3e806abc..d76a69d3e61 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/context.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/context/context.hpp @@ -30,6 +30,7 @@ #include "lib/rocprofiler-sdk/counters/device_counting.hpp" #include "lib/rocprofiler-sdk/external_correlation.hpp" #include "lib/rocprofiler-sdk/pc_sampling/types.hpp" +#include "lib/rocprofiler-sdk/spm/core.hpp" #include "lib/rocprofiler-sdk/thread_trace/core.hpp" #include @@ -87,6 +88,18 @@ struct dispatch_counter_collection_service common::Synchronized enabled{false}; }; +struct spm_dispatch_counter_collection_service +{ + // Contains a SPM collection instance associated with this context. + // Contains callback information along with other data needed to collect/process + // SPM counters. + std::vector> callbacks{}; + // A flag to state wether or not the counter set is currently enabled. This is primarily + // to protect against multithreaded calls to enable a context (and enabling already enabled + // counters). + common::Synchronized enabled{false}; +}; + struct device_counting_service { std::unordered_set conf_agents; @@ -131,6 +144,8 @@ struct context std::unique_ptr dispatch_thread_trace = {}; std::unique_ptr device_thread_trace = {}; + std::unique_ptr dispatch_spm = {}; + template bool is_tracing(KindT _kind) const; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters.cpp index 73870464afd..38755f9274a 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters.cpp @@ -291,6 +291,11 @@ rocprofiler_query_counter_info(rocprofiler_counter_id_t counter_id, return true; }; + auto spm_info = [&](auto& out_struct) { + if(const auto* metric_ptr = common::get_val(id_map, static_cast(base_metric_id))) + out_struct.spm_support = isSupportSpm(*metric_ptr); + return false; + }; switch(version) { case ROCPROFILER_COUNTER_INFO_VERSION_0: @@ -313,6 +318,7 @@ rocprofiler_query_counter_info(rocprofiler_counter_id_t counter_id, if(!dim_info(_out_struct, agent_id)) return ROCPROFILER_STATUS_ERROR_DIM_NOT_FOUND; if(!dim_permutations(_out_struct)) return ROCPROFILER_STATUS_ERROR_DIM_NOT_FOUND; + spm_info(_out_struct); return ROCPROFILER_STATUS_SUCCESS; } diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/metrics.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/metrics.cpp index 8adcf0f991f..1c206a87923 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/metrics.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/metrics.cpp @@ -29,6 +29,8 @@ #include "lib/common/synchronized.hpp" #include "lib/common/utility.hpp" #include "lib/rocprofiler-sdk/agent.hpp" +#include "lib/rocprofiler-sdk/aql/helpers.hpp" +#include "lib/rocprofiler-sdk/spm/interface.hpp" #include #include @@ -423,7 +425,7 @@ operator==(Metric const& lhs, Metric const& rhs) }; return get_tie(lhs) == get_tie(rhs); } -Metric::Metric(const std::string&, // Get rid of this... +Metric::Metric(std::string arch, std::string name, std::string block, std::string event, @@ -431,7 +433,8 @@ Metric::Metric(const std::string&, // Get rid of this... std::string expr, std::string constant, uint64_t id) -: name_(std::move(name)) +: arch_(std::move(arch)) +, name_(std::move(name)) , block_(std::move(block)) , event_(std::move(event)) , description_(std::move(dsc)) @@ -453,5 +456,26 @@ Metric::Metric(const std::string&, // Get rid of this... } } } + +bool +isSupportSpm(const Metric& metric) +{ + auto agents = rocprofiler::agent::get_agents(); + + const auto itr = std::find_if(agents.begin(), agents.end(), [&](const auto* agent) { + return std::string_view(agent->name) == std::string_view(metric.arch()); + }); + if(itr == agents.end()) return false; + if(metric.event().empty()) return false; + auto sym = rocprofiler::spm::construct_spm_interface(); + if(!sym.has_value()) return false; + auto aql_agent = *CHECK_NOTNULL(rocprofiler::agent::get_aql_agent((*itr)->id)); + auto query_info = rocprofiler::aql::get_query_info((*itr)->id, metric); + auto pmc_event = aqlprofile_pmc_event_t{}; + pmc_event.block_name = static_cast(query_info.id); + pmc_event.event_id = static_cast(std::stoul(metric.event().c_str(), nullptr)); + return sym->spm_is_event_supported(aql_agent, pmc_event); +} + } // namespace counters } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/metrics.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/metrics.hpp index caa6c3b84b2..bdb0888667b 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/metrics.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/metrics.hpp @@ -46,7 +46,7 @@ class Metric { public: Metric() = default; - Metric(const std::string&, // Get rid of this... + Metric(std::string arch, std::string name, std::string block, std::string event, @@ -54,7 +54,8 @@ class Metric std::string expr, std::string constant, uint64_t id); - + + const std::string& arch() const { return arch_; } const std::string& name() const { return name_; } const std::string& block() const { return block_; } const std::string& event() const { return event_; } @@ -71,7 +72,8 @@ class Metric friend bool operator<(Metric const& lhs, Metric const& rhs); friend bool operator==(Metric const& lhs, Metric const& rhs); -private: +private: + std::string arch_ = {}; std::string name_ = {}; std::string block_ = {}; std::string event_ = {}; @@ -129,6 +131,9 @@ checkValidMetric(const std::string& agent, const Metric& metric); */ rocprofiler_status_t setCustomCounterDefinition(const CustomCounterDefinition& def); + +bool +isSupportSpm(const Metric& metric); } // namespace counters } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/metrics_test.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/metrics_test.cpp index 704723aaf83..45d51edff05 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/metrics_test.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/counters/tests/metrics_test.cpp @@ -242,6 +242,8 @@ TEST(metrics, check_public_api_query) EXPECT_EQ(std::string(info.description ? info.description : ""), metric.description()); // Dimensions are now verified through the API call above + EXPECT_EQ(info.spm_support, isSupportSpm(metric)); + for(size_t i = 0; i < info.dimensions_count; i++) { EXPECT_GT(info.dimensions[i]->instance_size, 0u); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/aql_packet.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/aql_packet.cpp index f1c0c3a56ec..df7e2f2ed54 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/aql_packet.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/aql_packet.cpp @@ -21,6 +21,8 @@ // THE SOFTWARE. #include "lib/rocprofiler-sdk/hsa/aql_packet.hpp" +#include "lib/rocprofiler-sdk/hsa/agent_cache.hpp" + #include #include #include @@ -242,5 +244,136 @@ CodeobjMarkerAQLPacket::CodeobjMarkerAQLPacket(const TraceMemoryPool& _tracepool clear(); } +SPMMemoryPool::SPMMemoryPool(const AgentCache& agent, const AmdExtTable& ext, copy_fn_t copy_fn) +{ + allocate_fn = ext.hsa_amd_memory_pool_allocate_fn; + allow_access_fn = ext.hsa_amd_agents_allow_access_fn; + free_fn = ext.hsa_amd_memory_pool_free_fn; + fill_fn = ext.hsa_amd_memory_fill_fn; + api_copy_fn = copy_fn; + + gpu_agent = agent.get_hsa_agent(); + cpu_pool_ = agent.cpu_pool(); + gpu_pool_ = agent.gpu_pool(); + kernarg_pool_ = agent.kernarg_pool(); +} + +void +SPMMemoryPool::Free(void* ptr, void* data) +{ + if(ptr == nullptr) return; + auto* pool = reinterpret_cast(data); + + ROCP_FATAL_IF(!pool || !pool->free_fn) << "Unable to deallocate from HSA memory pool"; + pool->free_fn(ptr); +} + +hsa_status_t +SPMMemoryPool::Copy(void* dst, const void* src, size_t size, void* data) +{ + if(size == 0) return HSA_STATUS_SUCCESS; + auto* pool = reinterpret_cast(data); + ROCP_FATAL_IF(!pool || !pool->api_copy_fn) << "Unable to copy HSA memory"; + + return pool->api_copy_fn(dst, src, size); +} + +hsa_status_t +SPMMemoryPool::Alloc(void** ptr, size_t size, aqlprofile_buffer_desc_flags_t flags, void* data) +{ + hsa_status_t status = HSA_STATUS_ERROR; + + if(size == 0) + { + if(ptr != nullptr) *ptr = nullptr; + return HSA_STATUS_SUCCESS; + } + if(!data) return HSA_STATUS_ERROR; + + auto& pool = *reinterpret_cast(data); + if(!pool.allocate_fn || !pool.free_fn || !pool.allow_access_fn) return HSA_STATUS_ERROR; + + if(flags.host_access) + status = pool.allocate_fn(pool.cpu_pool_, size, hsa_amd_memory_pool_executable_flag, ptr); + else + status = + pool.allocate_fn(pool.kernarg_pool_, size, hsa_amd_memory_pool_executable_flag, ptr); + + if(status == HSA_STATUS_SUCCESS) + status = pool.allow_access_fn(1, &pool.gpu_agent, nullptr, *ptr); + if(status == HSA_STATUS_SUCCESS) status = pool.fill_fn(*ptr, 0u, size / sizeof(uint32_t)); + + return status; +} + +SPMPacket::SPMPacket(aqlprofile_agent_handle_t aql_agent, aqlprofile_spm_profile_t profile) +: agent(aql_agent) +{ + sym = rocprofiler::spm::construct_spm_interface(); + if(!sym.has_value()) return; + auto status = sym->spm_create_packets(&handle, &aql_desc, &packets, profile, 0); + + if(status == HSA_STATUS_ERROR_INVALID_AGENT) return; + + packets.start_packet.header = VENDOR_BIT | BARRIER_BIT; + packets.stop_packet.header = VENDOR_BIT | BARRIER_BIT; + packets.start_packet.completion_signal = hsa_signal_t{.handle = 0}; + packets.stop_packet.completion_signal = hsa_signal_t{.handle = 0}; + + status = sym->spm_decode_query(aql_desc, AQLPROFILE_SPM_DECODE_QUERY_SEG_SIZE, &spm_desc.seg_size); + if(status != HSA_STATUS_SUCCESS) return; + status = sym->spm_decode_query(aql_desc, AQLPROFILE_SPM_DECODE_QUERY_NUM_XCC, &spm_desc.buffer_num); + if(status != HSA_STATUS_SUCCESS) return; + + is_valid = true; + empty = false; +} + +void +SPMPacket::populate_before() +{ + hsa_barrier_and_packet_t barrier{}; + barrier.header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; + barrier.header |= BARRIER_BIT; + + before_krn_barrier_pkt.push_back(barrier); + before_krn_barrier_pkt.push_back(barrier); + before_krn_pkt.push_back(packets.start_packet); +}; + +void +SPMPacket::populate_after() +{ + after_krn_pkt.push_back(packets.stop_packet); +}; + +void +SPMPacket::kfd_start() +{ + ROCP_FATAL_IF(!handle.handle) << "Attempt at starting SPM with unitialized packet!"; + + if(running.exchange(true)) + { + ROCP_ERROR << "Double call to KFD start!"; + return; + } + + auto status = sym->spm_start(this->handle, spm::aql_data_callback, this); + ROCP_FATAL_IF(status != HSA_STATUS_SUCCESS) << "Unable to acquire KFD thread"; +} + +void +SPMPacket::kfd_stop() +{ + if(running.exchange(false)) + sym->spm_stop(this->handle); + else + ROCP_WARNING << "Double call to KFD stop!"; +} + +SPMPacket::~SPMPacket() +{ + if(running.exchange(false) && sym.has_value()) sym->spm_stop(this->handle); +} } // namespace hsa } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/aql_packet.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/aql_packet.hpp index 733dcfae67c..29fc84f80e5 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/aql_packet.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/aql_packet.hpp @@ -24,18 +24,26 @@ #include "lib/common/container/small_vector.hpp" #include "lib/rocprofiler-sdk/aql/aql_profile_v2.h" +#include "lib/rocprofiler-sdk/spm/decode.hpp" +#include "lib/rocprofiler-sdk/spm/interface.hpp" +#include #include +#include #include #include +#include +#include + namespace rocprofiler { namespace aql { class CounterPacketConstruct; class ThreadTraceAQLPacketFactory; +class SPMPacketConstruct; } // namespace aql namespace hsa @@ -76,6 +84,7 @@ class AQLPacket { before_krn_pkt.clear(); after_krn_pkt.clear(); + before_krn_barrier_pkt.clear(); } bool isEmpty() const { return empty; } @@ -86,8 +95,9 @@ class AQLPacket aqlprofile_handle_t handle = {.handle = 0}; bool empty = {true}; - common::container::small_vector before_krn_pkt = {}; - common::container::small_vector after_krn_pkt = {}; + common::container::small_vector before_krn_pkt = {}; + common::container::small_vector after_krn_pkt = {}; + common::container::small_vector before_krn_barrier_pkt = {}; }; class EmptyAQLPacket : public AQLPacket @@ -235,5 +245,82 @@ class TraceControlAQLPacket : public AQLPacket std::unordered_map> loaded_codeobj; }; +struct SPMMemoryPool +{ + using desc_t = aqlprofile_buffer_desc_flags_t; + using copy_fn_t = decltype(hsa_memory_copy); + hsa_agent_t gpu_agent = {.handle = 0}; + hsa_amd_memory_pool_t cpu_pool_ = {.handle = 0}; + hsa_amd_memory_pool_t gpu_pool_ = {.handle = 0}; + hsa_amd_memory_pool_t kernarg_pool_ = {.handle = 0}; + decltype(hsa_amd_memory_pool_allocate)* allocate_fn = nullptr; + decltype(hsa_amd_agents_allow_access)* allow_access_fn = nullptr; + decltype(hsa_amd_memory_pool_free)* free_fn = nullptr; + decltype(hsa_memory_copy)* api_copy_fn = nullptr; + decltype(hsa_amd_memory_fill)* fill_fn = nullptr; + + SPMMemoryPool(const class AgentCache& agent, const class AmdExtTable& ext, copy_fn_t copy_fn); + ~SPMMemoryPool() + { + if(delete_packets_fn != nullptr && handle.handle != 0) delete_packets_fn(handle); + }; + explicit SPMMemoryPool() = default; + static hsa_status_t Alloc(void** ptr, + size_t size, + aqlprofile_buffer_desc_flags_t flags, + void* data); + static void Free(void* ptr, void* data); + static hsa_status_t Copy(void* dst, const void* src, size_t size, void* data); + spm::spm_interface::spm_delete_packets_fn_t* delete_packets_fn{nullptr}; + aqlprofile_handle_t handle{}; +}; + +class SPMPacket : public AQLPacket +{ + friend class rocprofiler::aql::SPMPacketConstruct; + +public: + SPMPacket(aqlprofile_agent_handle_t aql_agent, aqlprofile_spm_profile_t profile); + ~SPMPacket() override; + + explicit SPMPacket(const SPMPacket& other) + : agent(other.agent) + , sym(other.sym) + { + packets = other.packets; + is_valid = other.is_valid; + handle = other.handle; + empty = other.empty; + pool = other.pool; + aql_desc = other.aql_desc; + spm_desc = other.spm_desc; + container_desc_data = other.container_desc_data; + } + + void kfd_start(); + void kfd_stop(); + hsa_agent_t GetAgent() const { return pool ? pool->gpu_agent : hsa_agent_t{}; } + std::optional buffer; + aqlprofile_agent_handle_t agent; + rocprofiler_user_data_t user_data; + void* record_callback_args{}; + aqlprofile_spm_buffer_desc_t aql_desc{}; + rocprofiler::spm::spm_descriptor_t spm_desc{}; + rocprofiler_spm_dispatch_counting_record_cb_t record_cb{}; + rocprofiler_spm_dispatch_counting_service_data_t dispatch_data{}; + std::shared_ptr> container_desc_data{}; + aqlprofile_spm_aql_packets_t packets{}; + std::shared_ptr pool{}; + void populate_before() override; + void populate_after() override; + bool valid() const { return is_valid; } + + std::optional sym{}; + +private: + std::atomic running{false}; + bool is_valid{false}; +}; + } // namespace hsa } // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.cpp index 0536b481404..f8dff0db51e 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue.cpp @@ -410,6 +410,13 @@ WriteInterceptor(const void* packets, } for(const auto& pkt_injection : inst_pkt) { + if(!pkt_injection.first->before_krn_barrier_pkt.empty()) + { + for(const auto& pkt : pkt_injection.first->before_krn_barrier_pkt) + { + transformed_packets.emplace_back(pkt); + } + } for(const auto& pkt : pkt_injection.first->before_krn_pkt) { inserted_before = true; @@ -546,7 +553,7 @@ Queue::Queue(const AgentCache& agent, if(!context::get_registered_contexts([](const context::context* ctx) { return (ctx->counter_collection || ctx->device_counter_collection || - ctx->dispatch_thread_trace || ctx->device_thread_trace); + ctx->dispatch_spm || ctx->dispatch_thread_trace || ctx->device_thread_trace); }).empty()) { CHECK(_agent.cpu_pool().handle != 0); diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp index 67d4ab2f5bb..23ae1b8cca1 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hsa/queue_controller.cpp @@ -505,7 +505,7 @@ enable_queue_intercept() { for(const auto& itr : context::get_registered_contexts()) { - constexpr auto expected_context_size = 216UL; + constexpr auto expected_context_size = 224UL; static_assert( sizeof(context::context) == expected_context_size, "If you added a new field to context struct, make sure there is a check here if it " @@ -517,7 +517,7 @@ enable_queue_intercept() bool has_scratch_reporting = itr->is_tracing(ROCPROFILER_CALLBACK_TRACING_SCRATCH_MEMORY) || itr->is_tracing(ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY); - if(itr->counter_collection || itr->pc_sampler || has_kernel_tracing || + if(itr->counter_collection || itr->pc_sampler || has_kernel_tracing || itr->dispatch_spm || has_scratch_reporting || itr->device_counter_collection || itr->device_thread_trace || itr->dispatch_thread_trace) return true; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rocprofiler-avail/rocprofv3_avail.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rocprofiler-avail/rocprofv3_avail.cpp index 396b8a3c046..369178e8115 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rocprofiler-avail/rocprofv3_avail.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rocprofiler-avail/rocprofv3_avail.cpp @@ -169,7 +169,8 @@ counter_info(uint64_t counter_handle, const char** counter_name, const char** counter_description, uint8_t* is_derived, - uint8_t* is_hw_constant) + uint8_t* is_hw_constant, + uint8_t* is_spm) { const auto* counter_info = get_counter_info(rocprofiler_counter_id_t{counter_handle}); @@ -177,6 +178,7 @@ counter_info(uint64_t counter_handle, *counter_description = counter_info->description; *is_derived = counter_info->is_derived; *is_hw_constant = counter_info->is_constant; + *is_spm = counter_info->spm_support; } void diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rocprofiler-avail/rocprofv3_avail.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rocprofiler-avail/rocprofv3_avail.hpp index f599201345a..8a0ad55bfb7 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rocprofiler-avail/rocprofv3_avail.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rocprofiler-avail/rocprofv3_avail.hpp @@ -41,7 +41,8 @@ counter_info(uint64_t counter_handle, const char** counter_name, const char** counter_description, uint8_t* is_derived, - uint8_t* is_hw_constant) ROCPROFILER_EXPORT; + uint8_t* is_hw_constant, + uint8_t* is_spm) ROCPROFILER_EXPORT; void counter_block(uint64_t counter_handle, const char** counter_block) ROCPROFILER_EXPORT; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rocprofiler.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rocprofiler.cpp index ad172169867..09ea90e7c70 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rocprofiler.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/rocprofiler.cpp @@ -93,7 +93,7 @@ ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_KERNEL, "A service depends on a newer version of KFD (amdgpu kernel driver)") ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES, "The given resources are insufficient to complete operation") -ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_PROFILE_NOT_FOUND, +ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_CONFIG_NOT_FOUND, "Could not find counter profile") ROCPROFILER_STATUS_STRING(ROCPROFILER_STATUS_ERROR_AGENT_DISPATCH_CONFLICT, "Cannot have both an agent counter collection and a dispatch counter " diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/CMakeLists.txt new file mode 100644 index 00000000000..89f9d5107c8 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/CMakeLists.txt @@ -0,0 +1,7 @@ +set(ROCPROFILER_LIB_SPM_SOURCES core.cpp service.cpp decode.cpp interface.cpp dispatch_handlers.cpp) +set(ROCPROFILER_LIB_SPM_HEADERS core.hpp interface.hpp decode.hpp dispatch_handlers.hpp) +target_sources(rocprofiler-sdk-object-library PRIVATE ${ROCPROFILER_LIB_SPM_SOURCES} + ${ROCPROFILER_LIB_SPM_HEADERS}) +if(ROCPROFILER_BUILD_TESTS) + add_subdirectory(tests) +endif() diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/core.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/core.cpp new file mode 100644 index 00000000000..f4fe6485c25 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/core.cpp @@ -0,0 +1,351 @@ +// MIT License +// +// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "lib/rocprofiler-sdk/spm/core.hpp" +#include "lib/common/container/stable_vector.hpp" +#include "lib/common/utility.hpp" +#include "lib/rocprofiler-sdk/buffer.hpp" +#include "lib/rocprofiler-sdk/context/context.hpp" +#include "lib/rocprofiler-sdk/counters/metrics.hpp" +#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp" +#include "lib/rocprofiler-sdk/internal_threading.hpp" +#include "lib/rocprofiler-sdk/registration.hpp" +#include "lib/rocprofiler-sdk/spm/dispatch_handlers.hpp" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define CHECK_HSA(fn, message) \ + { \ + auto _status = (fn); \ + if(_status != HSA_STATUS_SUCCESS) \ + { \ + ROCP_ERROR << "HSA Err: " << _status << '\n'; \ + throw std::runtime_error(message); \ + } \ + } + +namespace rocprofiler +{ +namespace spm +{ +/** + *This is a singleton class with lazy initialization + */ +class SpmCounterController +{ +public: + SpmCounterController() = default; + // Adds a counter collection profile to our global cache. + // Note: these profiles can be used across multiple contexts + // and are independent of the context. + void spm_add_profile(std::shared_ptr&& config); + + rocprofiler_status_t spm_destroy_profile(uint64_t id); + // Setup the SPM counter collection service. spm_counter_callback_info is created here + + std::shared_ptr get_profile_cfg(rocprofiler_counter_config_id_t id); + +private: + // Cache to contain the map of config id handle to spm counter config + common::Synchronized>> + _configs; +}; + +SpmCounterController& +spm_get_controller(); + +/** + * @brief The functions checks if the `ROCPROFILER_SPM_BETA_ENABLED` is set. + * If so, it will enable SPM service. Otherwise, the API is reported + * as not implemented. + * + * The SPM is in experimental phase . + By enabling the `ROCPROFILER_SPM_BETA_ENABLED`, + * user accepts all consequences of using early implementation of SPM API. + */ +bool +is_spm_explicitly_enabled() +{ + auto spm_sampling_enabled = rocprofiler::common::get_env("ROCPROFILER_SPM_BETA_ENABLED", false); + + if(!spm_sampling_enabled) + ROCP_INFO << " SPM unavailable. The feature is implicitly disabled. " + << "To use it on a supported architecture, " + << "set ROCPROFILER_SPM_BETA_ENABLED=ON in the environment"; + + return spm_sampling_enabled; +} + +/** + * Adds a counter collection profile to our global cache. + * Note: these profiles can be used across multiple contexts and are independent of the context. + * Note: these profiles are per agent + * Assigns the config id and increments the monotonic counter. + */ +void +SpmCounterController::spm_add_profile(std::shared_ptr&& config) +{ + static std::atomic profile_val = 1; + _configs.wlock([&](auto& data) { + config->id = rocprofiler_counter_config_id_t{.handle = profile_val}; + data.emplace(profile_val, std::move(config)); + profile_val++; + }); +} + +/** + * @brief Removes the profile entry from the global cache + */ +rocprofiler_status_t +SpmCounterController::spm_destroy_profile(uint64_t id) +{ + return _configs.wlock([&](auto& data) { + auto itr = data.find(id); + if(itr == data.end()) return ROCPROFILER_STATUS_ERROR_PROFILE_NOT_FOUND; + if(data.erase(id) != 1) return ROCPROFILER_STATUS_ERROR; + return ROCPROFILER_STATUS_SUCCESS; + }); +} + +/** + * @brief Queries the global cache for the config using config id + */ +std::shared_ptr +SpmCounterController::get_profile_cfg(rocprofiler_counter_config_id_t id) +{ + std::shared_ptr cfg; + _configs.rlock([&](const auto& map) { cfg = map.at(id.handle); }); + return cfg; +} + +rocprofiler_status_t +destroy_spm_counter_profile(uint64_t id) +{ + return spm_get_controller().spm_destroy_profile(id); +} + +SpmCounterController& +spm_get_controller() +{ + static auto* controller = rocprofiler::common::static_object::construct(); + return *CHECK_NOTNULL(controller); +} + +/** + * @brief looks into the config's packet cache to re-use the packet + * If not, constructs the packet using packet generator + * updates packet_return map + */ +rocprofiler_status_t +get_spm_packet(const std::shared_ptr& info, + std::unique_ptr& ret_pkt, + std::shared_ptr& profile) +{ + profile->packets.wlock([&](auto& pkt_vector) { + if(!pkt_vector.empty()) + { + ret_pkt = std::move(pkt_vector.back()); + pkt_vector.pop_back(); + } + }); + + if(!ret_pkt) + { + // If we do not have a packet in the cache, create one. + ret_pkt = rocprofiler::aql::spm_construct_packet( + profile->agent->id, + std::vector{profile->metrics.begin(), profile->metrics.end()}, + profile->sample_freq, + profile->buffer_size, + profile->timeout); + }; + + ret_pkt->clear(); + info->packet_return_map.wlock([&](auto& data) { data.emplace(ret_pkt.get(), profile); }); + + return ROCPROFILER_STATUS_SUCCESS; +} + +/** @brief Creates spm the counter config + * Checks if the input counters does not exceed hardware limit + * Adds the config to configs cache + */ +rocprofiler_status_t +create_spm_counter_profile(std::shared_ptr config) +{ + auto status = ROCPROFILER_STATUS_SUCCESS; + if(status = rocprofiler::aql::spm_can_collect(config->agent->id, config->metrics); + status != ROCPROFILER_STATUS_SUCCESS) + { + return status; + } + + spm_get_controller().spm_add_profile(std::move(config)); + + return status; +} + +std::shared_ptr +get_spm_counter_config(rocprofiler_counter_config_id_t id) +{ + try + { + return spm_get_controller().get_profile_cfg(id); + } catch(std::out_of_range&) + { + return nullptr; + } +} + +/** @brief Configures SPM dispatch for the context + * Checks for conflicting services + * Instantiates spm_dispatch_counter_collection_service + */ + +rocprofiler_status_t +configure_callback_spm_dispatch(rocprofiler_context_id_t context_id, + rocprofiler_spm_dispatch_counting_service_cb_t callback, + void* callback_args, + rocprofiler_spm_dispatch_counting_record_cb_t record_callback, + void* record_callback_args) +{ + auto* ctx_p = rocprofiler::context::get_mutable_registered_context(context_id); + if(!ctx_p) return ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID; + + auto& ctx = *ctx_p; + + // FIXME: Due to the clock gating issue, counter collection and PC sampling service + // cannot coexist in the same context for now. + if(ctx.pc_sampler) return ROCPROFILER_STATUS_ERROR_CONTEXT_CONFLICT; + if(ctx.counter_collection) return ROCPROFILER_STATUS_ERROR_CONTEXT_CONFLICT; + if(ctx.device_counter_collection) return ROCPROFILER_STATUS_ERROR_AGENT_DISPATCH_CONFLICT; + if(!ctx.dispatch_spm) + ctx.dispatch_spm = + std::make_unique(); + auto& cb = *ctx.dispatch_spm->callbacks.emplace_back( + std::make_shared()); + + cb.user_cb = callback; + cb.callback_args = callback_args; + cb.context = context_id; + cb.record_callback = record_callback; + cb.record_callback_args = record_callback_args; + cb.internal_context = ctx_p; + + return ROCPROFILER_STATUS_SUCCESS; +} + +/** @brief start SPM dispatch context + * Enables serialization + * Returns if callback has already been added by checking the queue id + * Adds a pre kernel and a post kernel callback + * Enabled flag is used to check if context has already been enabled + */ + +rocprofiler_status_t +start_context(const context::context* ctx) +{ + if(!ctx || !ctx->dispatch_spm) return ROCPROFILER_STATUS_ERROR; + + auto* controller = hsa::get_queue_controller(); + + bool already_enabled = true; + CHECK_NOTNULL(controller)->enable_serialization(); + ctx->dispatch_spm->enabled.wlock([&](auto& enabled) { + if(enabled) return; + already_enabled = false; + enabled = true; + }); + + if(!already_enabled) + { + // Insert our callbacks into HSA Interceptor. This + // turns on counter instrumentation. + for(auto& cb : ctx->dispatch_spm->callbacks) + { + if(cb->queue_id != rocprofiler::hsa::ClientID{-1}) continue; + cb->queue_id = controller->add_callback( + std::nullopt, + [=](const hsa::Queue& q, + const hsa::rocprofiler_packet& kern_pkt, + rocprofiler_kernel_id_t kernel_id, + rocprofiler_dispatch_id_t dispatch_id, + rocprofiler_user_data_t* user_data, + const hsa::Queue::queue_info_session_t::external_corr_id_map_t& extern_corr_ids, + const context::correlation_id* correlation_id) { + return pre_kernel_call(ctx, + cb, + q, + kern_pkt, + kernel_id, + dispatch_id, + user_data, + extern_corr_ids, + correlation_id); + }, + // Completion CB + [=](const hsa::Queue& /* q */, + hsa::rocprofiler_packet /* kern_pkt */, + std::shared_ptr& session, + inst_pkt_t& aql, + kernel_dispatch::profiling_time dispatch_time) { + post_kernel_call(ctx, cb, session, aql, dispatch_time); + }); + } + } + + return ROCPROFILER_STATUS_SUCCESS; +} + +/** @brief stop SPM dispatch context + * Disables serialization + * Sets Enabled flag to false + */ + +void +stop_context(const context::context* ctx) +{ + if(!ctx || !ctx->dispatch_spm) return; + + auto* controller = hsa::get_queue_controller(); + + ctx->dispatch_spm->enabled.wlock([&](auto& enabled) { + if(!enabled) return; + enabled = false; + }); + + if(controller) controller->disable_serialization(); +} + +} // namespace spm + +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/core.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/core.hpp new file mode 100644 index 00000000000..761ec9960d0 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/core.hpp @@ -0,0 +1,143 @@ +// MIT License +// +// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include "lib/rocprofiler-sdk/aql/packet_construct.hpp" +#include "lib/rocprofiler-sdk/context/correlation_id.hpp" +#include "lib/rocprofiler-sdk/hsa/agent_cache.hpp" +#include "lib/rocprofiler-sdk/hsa/aql_packet.hpp" +#include "lib/rocprofiler-sdk/hsa/queue.hpp" +#include "lib/rocprofiler-sdk/hsa/queue_info_session.hpp" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace rocprofiler +{ +namespace hsa +{ +class AQLPacket; +}; +namespace spm +{ +/** + * @brief SPM counter config contains SPM parameters and counters + * SPM config is per agent + * Pkt generator is used to construct the packet, pkt generator can be created before HSA init + * Has a packet cache to store the AQLpackets for SPM, it is constructed using the pkt generator. + * Its valid function checks if config has parameters and metrics initialized + */ +struct spm_counter_config +{ + const rocprofiler_agent_t* agent = nullptr; + std::vector metrics{}; + + double sample_freq = 0.5; + uint64_t buffer_size = 32768; + uint64_t timeout = 0; + + rocprofiler_counter_config_id_t id{.handle = 0}; + // A packet cache of AQL packets. This allows reuse of AQL packets (preventing costly + // allocation of new packets/destruction). + common::Synchronized>> packets; +}; + +/** + * @brief spm_counter_callback_info has the callbacks and user data associated with a context + * It has a cache of AQLPackets associated with configs which is used in post kernel callback + * to retrieve the config information for the given AQLPacket + * + */ +struct spm_counter_callback_info +{ + rocprofiler_spm_dispatch_counting_service_cb_t user_cb{nullptr}; + void* callback_args{nullptr}; + // Link to the context this is associated with + rocprofiler_context_id_t context{.handle = 0}; + // HSA Queue ClientID. This is an ID we get when we insert a callback into the + // HSA queue interceptor. This ID can be used to disable the callback. + rocprofiler::hsa::ClientID queue_id{-1}; + // Buffer to use for storing counter data. Used if callback is not set. + std::optional buffer; + // Link to the internal context this is associated with + // Internal context is used as a key to obtain external correlation id in pre kernel call + const context::context* internal_context; + rocprofiler_spm_dispatch_counting_record_cb_t record_callback; + void* record_callback_args{nullptr}; + common::Synchronized< + std::unordered_map>> + packet_return_map{}; +}; + +rocprofiler_status_t +get_spm_packet(const std::shared_ptr& info, + std::unique_ptr&, + std::shared_ptr&); + +rocprofiler_status_t +create_spm_counter_profile(std::shared_ptr config); + +rocprofiler_status_t +destroy_spm_counter_profile(uint64_t id); + +std::shared_ptr +get_spm_counter_config(rocprofiler_counter_config_id_t id); + +rocprofiler_status_t +configure_callback_spm_dispatch(rocprofiler_context_id_t context_id, + rocprofiler_spm_dispatch_counting_service_cb_t callback, + void* callback_data_args, + rocprofiler_spm_dispatch_counting_record_cb_t record_callback, + void* record_callback_args); + +bool +is_spm_explicitly_enabled(); + +/* + * start dispatch SPM context + */ +rocprofiler_status_t +start_context(const context::context*); + +/* + * stop dispatch SPM context + */ +void +stop_context(const context::context*); + +} // namespace spm +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/decode.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/decode.cpp new file mode 100644 index 00000000000..d79b5df5a90 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/decode.cpp @@ -0,0 +1,206 @@ +// MIT License +// +// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include +#include +#include + +#include "lib/common/static_object.hpp" +#include "lib/rocprofiler-sdk/agent.hpp" +#include "lib/rocprofiler-sdk/aql/aql_profile_v2.h" +#include "lib/rocprofiler-sdk/buffer.hpp" +#include "lib/rocprofiler-sdk/counters/id_decode.hpp" +#include "lib/rocprofiler-sdk/hsa/aql_packet.hpp" +#include "lib/rocprofiler-sdk/spm/decode.hpp" +#include "lib/rocprofiler-sdk/spm/interface.hpp" + +#include +#include +#include +#include +#include + +#define CHECK_HSA(fn, message) \ + { \ + auto _status = (fn); \ + if(_status != HSA_STATUS_SUCCESS) \ + { \ + ROCP_ERROR << "HSA Err: " << _status << '\n'; \ + throw std::runtime_error(message); \ + } \ + } + +namespace rocprofiler +{ +namespace spm +{ +std::mutex& +get_buffer_mut() +{ + static auto*& mut = common::static_object::construct(); + return *CHECK_NOTNULL(mut); +} + +/** @brief Calback for every sample in SPM data buffer + * [In] timestamp - timestamp of the sample + * [In] value - value of the sample + * [In] index - used to index the event map and retrieve the counter id of the sample + * [In] shader_engine - -1 for global counters or the shader engine number + */ +void +decode_cb(uint64_t timestamp, uint64_t value, uint64_t index, int shader_engine, void* userdata) +{ + auto& counters = *reinterpret_cast(userdata); + + if(shader_engine < 0) + { + counters.at(index).is_global = true; + shader_engine = 0; + } + + if(counters.at(index).shaders.size() <= static_cast(shader_engine)) + counters.at(index).shaders.resize(shader_engine + 1); + + auto& instance = counters.at(index).shaders.at(shader_engine); + instance.timestamps.push_back(timestamp); + instance.values.push_back(value); +} + +/** @brief Callback for aqlprofile to return SPM data + * buffer id - XCC of the data + * flags - Indicates if there was a data loss + */ +void +aql_data_callback(size_t buffer_id, void* data, size_t data_size, int flags, void* userdata) +{ + spm::counter_vec counters{}; + auto* spm_packet = static_cast(userdata); + if(data_size == 0) + { + return; + } + + auto& desc_v0 = *static_cast(spm_packet->spm_desc.data); + if(!desc_v0.valid()) return; + + { + uint64_t count = 0; + + auto status = spm_packet->sym->spm_decode_query( + spm_packet->aql_desc, AQLPROFILE_SPM_DECODE_QUERY_EVENT_COUNT, &count); + if(status != HSA_STATUS_SUCCESS) return; + if(count != desc_v0.num_events) return; + counters.resize(count); + } + + // Intially size to 4 shaders + for(auto& v : counters) + v.shaders.resize(4); + + // Decode SPM data and return vector of instances_t in counters list. + auto status = + spm_packet->sym->spm_decode_stream_v1(spm_packet->aql_desc, decode_cb, data, data_size, &counters); + + if(status != HSA_STATUS_SUCCESS) return; + + auto records = std::vector{}; + auto buf_records = std::vector{}; + + rocprofiler::buffer::instance* buf = nullptr; + buf = buffer::get_buffer(spm_packet->buffer->handle); + + for(size_t i = 0; i < counters.size(); i++) + { + auto& event = desc_v0.events()[i]; + for(size_t se = 0; se < counters.at(i).shaders.size(); se++) + { + const auto& times = counters.at(i).shaders.at(se).timestamps; + const auto& values = counters.at(i).shaders.at(se).values; + + size_t size = std::min(times.size(), values.size()); + if(size == 0u) continue; + // Construct instance_id + auto instance_id = rocprofiler_counter_instance_id_t{}; + counters::set_dim_in_rec( + instance_id, rocprofiler::counters::ROCPROFILER_DIMENSION_XCC, buffer_id); + counters::set_dim_in_rec( + instance_id, rocprofiler::counters::ROCPROFILER_DIMENSION_INSTANCE, event.instance); + counters::set_counter_in_rec(instance_id, event.id); + if(!counters.at(i).is_global) + counters::set_dim_in_rec( + instance_id, rocprofiler::counters::ROCPROFILER_DIMENSION_SHADER_ENGINE, se); + for(size_t it = 0; it < size; it++) + { + if(buf) + { + buf_records.emplace_back(rocprofiler_spm_counter_record_t{ + sizeof(rocprofiler_spm_counter_record_t), + spm_packet->dispatch_data.dispatch_info.dispatch_id, + instance_id, + (rocprofiler::agent::get_rocprofiler_agent(spm_packet->GetAgent()))->id, + times[it], + values[it]}); + } + else + + // Construct SPM record and add it to the buffer + records.emplace_back(new rocprofiler_spm_counter_record_t{ + .size = sizeof(rocprofiler_spm_counter_record_t), + .dispatch_id = spm_packet->dispatch_data.dispatch_info.dispatch_id, + .id = instance_id, + .agent_id = + (rocprofiler::agent::get_rocprofiler_agent(spm_packet->GetAgent()))->id, + .timestamp = times[it], + .value = values[it]}); + } + } + } + if(buf) + { + auto _lk = std::unique_lock{get_buffer_mut()}; // Buffer records need to be in order + + buf->emplace(ROCPROFILER_BUFFER_CATEGORY_COUNTERS, + ROCPROFILER_COUNTER_RECORD_PROFILE_COUNTING_DISPATCH_HEADER, + spm_packet->dispatch_data); + for(auto itr : buf_records) + { + buf->emplace( + ROCPROFILER_BUFFER_CATEGORY_COUNTERS, ROCPROFILER_COUNTER_RECORD_VALUE, itr); + } + } + else + { + // Return the buffer of SPM records to the tool + spm_packet->record_cb(&(spm_packet->dispatch_data), + records.data(), + records.size(), + 1 << ROCPROFILER_SPM_RECORD_FLAG_DATA | flags << ROCPROFILER_SPM_RECORD_FLAG_DATA_LOST, + spm_packet->user_data, + spm_packet->record_callback_args); + for(const auto* itr : records) + delete(itr); + records.clear(); + } +} + +} // namespace spm +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/decode.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/decode.hpp new file mode 100644 index 00000000000..e46a81bc0ad --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/decode.hpp @@ -0,0 +1,108 @@ +// MIT License +// +// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include + +#include +#include +#include + +namespace rocprofiler +{ +namespace spm +{ +typedef struct values_vec_t +{ + std::vector timestamps; + std::vector values; +} values_vec_t; + +/** @brief SPM values for a counter + * 2D vector - values per shader + * Flag to indicate if the counter is global + */ +typedef struct instances_t +{ + std::vector shaders; + bool is_global = false; +} instances_t; + +/** + * @brief hsa::SPMPacket contains spm_descriptor_t + * aqlprofile_spm_buffer_desc_t is returned from aqlprofile + * aqlprofile_spm_buffer_desc_t has the metadata needed to decode the packet + * Data contains spm_desc_v0_t + event_map + data in aqlprofile_spm_buffer_desc_t + * size = sizeof(spm_desc_v0_t) + sizeof(spm_counter_instance_t)*num_events + aql_desc.size + * seg_size = output segment size + * buffer_num = number of XCCs + */ + +typedef struct spm_descriptor_t +{ + void* data; + size_t size; + size_t seg_size; + size_t buffer_num; +} spm_descriptor_t; + +typedef std::vector counter_vec; + +typedef struct spm_counter_instance_t +{ + rocprofiler_counter_id_t id; + uint64_t instance; +} spm_counter_instance_t; + +/** @brief defines the layout of data buffer from spm_descriptor_t + * Event map is the list of spm_counter_instance_t + */ +typedef struct spm_desc_v0_t +{ + uint64_t version{0}; + size_t struct_size{sizeof(spm_desc_v0_t)}; + uint64_t aql_desc_size{0}; + uint64_t num_events{0}; + size_t event_elem_size{sizeof(spm_counter_instance_t)}; + uint64_t reserved{0}; + + bool valid() const + { + return version == 0 && aql_desc_size != 0 && struct_size == sizeof(spm_desc_v0_t) && + event_elem_size == sizeof(spm_counter_instance_t); + } + spm_counter_instance_t* events() { return reinterpret_cast(this + 1); } + void* aqlprofile_desc() { return events() + num_events; } +} spm_desc_v0_t; + +static_assert((sizeof(spm_desc_v0_t) % sizeof(spm_counter_instance_t)) == 0, + "invalid descriptor and counter combination"); + +void +decode_cb(uint64_t timestamp, uint64_t value, uint64_t index, int shader_engine, void* userdata); + +void +aql_data_callback(size_t len, void* data, size_t data_len, int flags, void* userdata); + +} // namespace spm +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/dispatch_handlers.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/dispatch_handlers.cpp new file mode 100644 index 00000000000..3021d7660f2 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/dispatch_handlers.cpp @@ -0,0 +1,237 @@ +// MIT License +// +// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "lib/rocprofiler-sdk/spm/dispatch_handlers.hpp" +#include "lib/common/container/small_vector.hpp" +#include "lib/common/synchronized.hpp" +#include "lib/common/utility.hpp" +#include "lib/rocprofiler-sdk/buffer.hpp" +#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp" +#include "lib/rocprofiler-sdk/kernel_dispatch/profiling_time.hpp" + +#include +#include + +namespace rocprofiler +{ +namespace spm +{ +/** + * @brief Async Handler for barrier=packet1 completion signal + * Destroys the barrier-packet1 completion signal + * Sets the dependency signal of barrier packet-2 to 0, so that barrier-packet2 can complete + * This guarantees that SPM has been started before the dispatch + **/ +bool +AsyncSignalHandler(hsa_signal_value_t /*signal_v*/, void* data) +{ + auto* pkt = CHECK_NOTNULL(static_cast(data)); + auto* packet = CHECK_NOTNULL(dynamic_cast(pkt)); + + packet->kfd_start(); + + CHECK_NOTNULL(hsa::get_queue_controller()) + ->get_core_table() + .hsa_signal_destroy_fn(packet->before_krn_barrier_pkt.at(0).completion_signal); + CHECK_NOTNULL(hsa::get_queue_controller()) + ->get_core_table() + .hsa_signal_store_screlease_fn(packet->before_krn_barrier_pkt.at(1).dep_signal[0], 0); + return false; +} + +/** + * @brief Callback we get from HSA interceptor when a kernel packet is being enqueued. + * We return an AQLPacket containing the start/stop . + * Barrier_packet1-barrier_packet2-SPM Start Packet - kernel packets- SPM stop packet + * AsyncSignalHandler - barrier-packet1 completion signal handler + * barrier-packet2 - dependency signal- initialized to value -1 + * Update callback, user data, and dispatch data in the SPM packet + * handshake protocol with aqlprofile SPM start kfd - SPM start packet- kernel dispatch - SPM stop + * packet - SPM stop KFD + */ +hsa::Queue::pkt_and_serialize_t +pre_kernel_call(const context::context* ctx, + const std::shared_ptr& info, + const hsa::Queue& queue, + const hsa::rocprofiler_packet& pkt, + uint64_t kernel_id, + rocprofiler_dispatch_id_t dispatch_id, + rocprofiler_user_data_t* user_data, + const hsa::Queue::queue_info_session_t::external_corr_id_map_t& extern_corr_ids, + const context::correlation_id* correlation_id) +{ + CHECK(info && ctx); + auto no_instrumentation = [&]() { + auto ret_pkt = std::make_unique(); + info->packet_return_map.wlock([&](auto& data) { data.emplace(ret_pkt.get(), nullptr); }); + // If we have a SPM counter collection context but it is not enabled, we still might need + // to add barrier packets to transition from serialized -> unserialized execution. This + // transition is coordinated by the serializer. + return ret_pkt; + }; + + if(!ctx || !ctx->dispatch_spm) return {nullptr, false}; + + bool is_enabled = false; + ctx->dispatch_spm->enabled.rlock([&](const auto& collect_ctx) { is_enabled = collect_ctx; }); + + if(!is_enabled || !info->user_cb) return {no_instrumentation(), true}; + + auto _corr_id_v = + rocprofiler_async_correlation_id_t{.internal = 0, .external = context::null_user_data}; + if(const auto* _corr_id = correlation_id) + { + _corr_id_v.internal = _corr_id->internal; + if(const auto* external = + rocprofiler::common::get_val(extern_corr_ids, info->internal_context)) + { + _corr_id_v.external = *external; + } + } + + auto req_profile = rocprofiler_counter_config_id_t{.handle = 0}; + auto dispatch_data = + common::init_public_api_struct(rocprofiler_spm_dispatch_counting_service_data_t{}); + + dispatch_data.correlation_id = _corr_id_v; + { + auto dispatch_info = common::init_public_api_struct(rocprofiler_kernel_dispatch_info_t{}); + dispatch_info.kernel_id = kernel_id; + dispatch_info.dispatch_id = dispatch_id; + dispatch_info.agent_id = CHECK_NOTNULL(queue.get_agent().get_rocp_agent())->id; + dispatch_info.queue_id = queue.get_id(); + dispatch_info.private_segment_size = pkt.kernel_dispatch.private_segment_size; + dispatch_info.group_segment_size = pkt.kernel_dispatch.group_segment_size; + dispatch_info.workgroup_size = {pkt.kernel_dispatch.workgroup_size_x, + pkt.kernel_dispatch.workgroup_size_y, + pkt.kernel_dispatch.workgroup_size_z}; + dispatch_info.grid_size = {pkt.kernel_dispatch.grid_size_x, + pkt.kernel_dispatch.grid_size_y, + pkt.kernel_dispatch.grid_size_z}; + dispatch_data.dispatch_info = dispatch_info; + } + + info->user_cb(&dispatch_data, &req_profile, user_data, info->callback_args); + + if(req_profile.handle == 0) return {no_instrumentation(), true}; + + auto prof_config = get_spm_counter_config(req_profile); + CHECK(prof_config); + + std::unique_ptr ret_pkt = nullptr; + auto ret_status = get_spm_packet(info, ret_pkt, prof_config); + + CHECK_EQ(ret_status, ROCPROFILER_STATUS_SUCCESS) << rocprofiler_get_status_string(ret_status); + + if(!ret_pkt->empty) + { + auto* spm_pkt = dynamic_cast(ret_pkt.get()); + // ROCP_FATAL_IF(pkt == nullptr) << "NULL Packet returned from get spm packet: "; + spm_pkt->clear(); + spm_pkt->populate_before(); + spm_pkt->populate_after(); + + spm_pkt->dispatch_data = dispatch_data; + spm_pkt->user_data = *user_data; + if(info->buffer) + spm_pkt->buffer = info->buffer; + else + { + spm_pkt->record_cb = info->record_callback; + spm_pkt->record_callback_args = info->record_callback_args; + } + + auto& signal_to_start_kfd = spm_pkt->before_krn_barrier_pkt.at(0).completion_signal; + auto& signal_kfd_has_started = spm_pkt->before_krn_barrier_pkt.at(1).dep_signal[0]; + + CHECK_NOTNULL(hsa::get_queue_controller()) + ->get_ext_table() + .hsa_amd_signal_create_fn(1, 0, nullptr, 0, &signal_to_start_kfd); + CHECK_NOTNULL(hsa::get_queue_controller()) + ->get_ext_table() + .hsa_amd_signal_create_fn(1, 0, nullptr, 0, &signal_kfd_has_started); + + CHECK_NOTNULL(hsa::get_queue_controller()) + ->get_core_table() + .hsa_signal_store_screlease_fn(signal_kfd_has_started, -1); + CHECK_NOTNULL(hsa::get_queue_controller()) + ->get_core_table() + .hsa_signal_store_screlease_fn(signal_to_start_kfd, 0); + + auto status = CHECK_NOTNULL(hsa::get_queue_controller()) + ->get_ext_table() + .hsa_amd_signal_async_handler_fn(signal_to_start_kfd, + HSA_SIGNAL_CONDITION_EQ, + -1, + rocprofiler::spm::AsyncSignalHandler, + ret_pkt.get()); + ROCP_FATAL_IF(status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) + << "Error: hsa_amd_signal_async_handler failed with error code " << status + << " :: " << hsa::get_hsa_status_string(status); + } + return {std::move(ret_pkt), true}; +} + +/** + * @brief Callback called by HSA interceptor when the kernel has completed processing. + * Destroys the depedency signal of barrier packet2 + * Invokes KFD SPM stop + * Removes entry in packet_return_map + * Puts the aql packet into config's packets cache for re-use + */ +void +post_kernel_call(const context::context* ctx, + const std::shared_ptr& info, + std::shared_ptr& /*ptr_session*/, + inst_pkt_t& pkts, + kernel_dispatch::profiling_time /*dispatch_time*/) +{ + CHECK(info && ctx); + + std::shared_ptr prof_config; + // Get the Profile Config + info->packet_return_map.wlock([&](auto& data) { + for(auto& [aql_pkt, _] : pkts) + { + const auto& profile = rocprofiler::common::get_val(data, aql_pkt.get()); + if(profile) + { + prof_config = *profile; + data.erase(aql_pkt.get()); + + auto* pkt = dynamic_cast(aql_pkt.get()); + if(!pkt) continue; + + CHECK_NOTNULL(hsa::get_queue_controller()) + ->get_core_table() + .hsa_signal_destroy_fn(pkt->before_krn_barrier_pkt.at(1).dep_signal[0]); + pkt->kfd_stop(); + auto rel_pkt = std::move(aql_pkt); + prof_config->packets.wlock( + [&](auto& pkt_vector) { pkt_vector.emplace_back(std::move(rel_pkt)); }); + return; + } + } + }); +} +} // namespace spm +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/dispatch_handlers.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/dispatch_handlers.hpp new file mode 100644 index 00000000000..93d2cdff28e --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/dispatch_handlers.hpp @@ -0,0 +1,56 @@ +// MIT License +// +// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include "lib/rocprofiler-sdk/context/context.hpp" +#include "lib/rocprofiler-sdk/hsa/aql_packet.hpp" +#include "lib/rocprofiler-sdk/kernel_dispatch/profiling_time.hpp" +#include "lib/rocprofiler-sdk/spm/core.hpp" + +namespace rocprofiler +{ +namespace spm +{ +using ClientID = int64_t; +using inst_pkt_t = common::container:: + small_vector, ClientID>, 4>; + +hsa::Queue::pkt_and_serialize_t +pre_kernel_call(const context::context* ctx, + const std::shared_ptr& info, + const hsa::Queue& queue, + const hsa::rocprofiler_packet& pkt, + uint64_t kernel_id, + rocprofiler_dispatch_id_t dispatch_id, + rocprofiler_user_data_t* user_data, + const hsa::Queue::queue_info_session_t::external_corr_id_map_t& extern_corr_ids, + const context::correlation_id* correlation_id); + +void +post_kernel_call(const context::context* ctx, + const std::shared_ptr& info, + std::shared_ptr& session, + inst_pkt_t& aql, + kernel_dispatch::profiling_time dispatch_time); +} // namespace spm +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/interface.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/interface.cpp new file mode 100644 index 00000000000..fbadacb4650 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/interface.cpp @@ -0,0 +1,63 @@ +// MIT License +// +// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "lib/rocprofiler-sdk/spm/interface.hpp" +#include "lib/common/logging.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +namespace rocprofiler +{ +namespace spm +{ + +std::optional +construct_spm_interface(void* handle) +{ + + if(!handle) handle = dlopen("libhsa-amd-aqlprofile64.so.1", RTLD_NOLOAD | RTLD_LAZY); + + if(!handle) + { + ROCP_CI_LOG(WARNING) << fmt::format("aqlprofile cannot be opened"); + return std::nullopt; + } + auto interface = spm_interface(); + interface.spm_create_packets = (spm_interface::spm_create_packets_fn_t*) dlsym(handle, "aqlprofile_spm_create_packets"); + interface.spm_delete_packets = (spm_interface::spm_delete_packets_fn_t*) dlsym(handle, "aqlprofile_spm_delete_packets"); + interface.spm_start = (spm_interface::spm_start_fn_t*) dlsym(handle, "aqlprofile_spm_start"); + interface.spm_stop = (spm_interface::spm_stop_fn_t*) dlsym(handle, "aqlprofile_spm_stop"); + interface.spm_decode_stream_v1 = (spm_interface::spm_decode_stream_v1_fn_t*) dlsym(handle, "aqlprofile_spm_decode_stream_v1"); + interface.spm_decode_query = (spm_interface::spm_decode_query_fn_t*) dlsym(handle, "aqlprofile_spm_decode_query"); + interface.spm_is_event_supported = (spm_interface::spm_is_event_supported_fn_t*) dlsym(handle, "aqlprofile_spm_is_event_supported"); + return interface; +} + +} // namespace spm +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/interface.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/interface.hpp new file mode 100644 index 00000000000..913cb6afd06 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/interface.hpp @@ -0,0 +1,61 @@ +// MIT License +// +// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include "lib/rocprofiler-sdk/aql/aql_profile_v2.h" + +#include + +namespace rocprofiler +{ +namespace spm +{ +/** @brief Wrapper to aqlprofile functions for SPM + */ +class spm_interface +{ +public: + + using spm_create_packets_fn_t = decltype(aqlprofile_spm_create_packets); + using spm_delete_packets_fn_t = decltype(aqlprofile_spm_delete_packets); + using spm_start_fn_t = decltype(aqlprofile_spm_start); + using spm_stop_fn_t = decltype(aqlprofile_spm_stop); + using spm_decode_stream_v1_fn_t = decltype(aqlprofile_spm_decode_stream_v1); + using spm_decode_query_fn_t = decltype(aqlprofile_spm_decode_query); + using spm_is_event_supported_fn_t = decltype(aqlprofile_spm_is_event_supported); + + + spm_create_packets_fn_t* spm_create_packets = nullptr; + spm_delete_packets_fn_t* spm_delete_packets = nullptr; + spm_start_fn_t* spm_start = nullptr; + spm_stop_fn_t* spm_stop = nullptr; + spm_decode_stream_v1_fn_t* spm_decode_stream_v1 = nullptr; + spm_decode_query_fn_t* spm_decode_query = nullptr; + spm_is_event_supported_fn_t * spm_is_event_supported = nullptr; +}; + +std::optional +construct_spm_interface(void* handle = nullptr); + +} // namespace spm +} // namespace rocprofiler diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/service.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/service.cpp new file mode 100644 index 00000000000..31460f69e4d --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/service.cpp @@ -0,0 +1,249 @@ +// MIT License +// +// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "lib/common/utility.hpp" +#include "lib/rocprofiler-sdk/aql/helpers.hpp" +#include "lib/rocprofiler-sdk/context/context.hpp" +#include "lib/rocprofiler-sdk/counters/id_decode.hpp" +#include "lib/rocprofiler-sdk/counters/metrics.hpp" +#include "lib/rocprofiler-sdk/hsa/agent_cache.hpp" +#include "lib/rocprofiler-sdk/hsa/aql_packet.hpp" +#include "lib/rocprofiler-sdk/registration.hpp" +#include "lib/rocprofiler-sdk/spm/interface.hpp" + +#include +#include +#include + +#include +#include +#include + +extern "C" { + +/** + * @brief Create Profile Configuration. + * + * @param [in] agent Agent identifier + * @param [in] counters_list List of GPU counters + * @param [in] counters_count Size of counters list + * @param [in/out] config_id Identifier for GPU counters group. If an existing + profile is supplied, that profiles counters will be copied + over to a new profile (returned via this id). + * @return ::rocprofiler_status_t + */ +rocprofiler_status_t +rocprofiler_spm_create_counter_config(rocprofiler_agent_id_t agent_id, + rocprofiler_counter_id_t* counters_list, + size_t counters_count, + rocprofiler_spm_configuration_t* parameters, + rocprofiler_counter_config_id_t* config_id) +{ + auto sym = rocprofiler::spm::construct_spm_interface(); + if(!sym.has_value()) return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_ABI; + + if(!rocprofiler::spm::is_spm_explicitly_enabled()) + return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; + + std::unordered_set already_added; + const auto* agent = ::rocprofiler::agent::get_agent(agent_id); + if(!agent) return ROCPROFILER_STATUS_ERROR_AGENT_NOT_FOUND; + + std::shared_ptr config = + std::make_shared(); + + auto metrics_map = rocprofiler::counters::loadMetrics(); + const auto& id_map = metrics_map->id_to_metric; + if(config_id->handle == 0 && counters_count == 0) + return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT; + for(size_t i = 0; i < counters_count; i++) + { + auto& counter_id = counters_list[i]; + auto base_metric_id = rocprofiler::counters::get_base_metric_from_counter_id(counter_id); + const auto* metric_ptr = rocprofiler::common::get_val(id_map, base_metric_id); + + if(!metric_ptr) return ROCPROFILER_STATUS_ERROR_COUNTER_NOT_FOUND; + // Don't add duplicates + if(!already_added.emplace(metric_ptr->id()).second) continue; + + if(!rocprofiler::counters::checkValidMetric(std::string(agent->name), *metric_ptr) || + !rocprofiler::counters::isSupportSpm(*metric_ptr)) + { + return ROCPROFILER_STATUS_ERROR_METRIC_NOT_VALID_FOR_AGENT; + } + config->metrics.push_back(*metric_ptr); + } + + if(parameters) + { + config->timeout = parameters->timeout; + config->buffer_size = parameters->buffer_size; + config->sample_freq = parameters->frequency; + } + + if(config_id->handle != 0) + { + // Copy existing counters from previous config + if(auto existing = rocprofiler::spm::get_spm_counter_config(*config_id)) + { + for(const auto& metric : existing->metrics) + { + if(!already_added.emplace(metric.id()).second) continue; + config->metrics.push_back(metric); + } + if(existing->sample_freq != config->sample_freq) + config->sample_freq = existing->sample_freq; + if(existing->buffer_size != config->buffer_size) + config->buffer_size = existing->buffer_size; + if(existing->timeout != config->timeout) config->timeout = existing->timeout; + } + } + + config->agent = agent; + if(auto status = rocprofiler::spm::create_spm_counter_profile(config); + status != ROCPROFILER_STATUS_SUCCESS) + { + return ROCPROFILER_STATUS_ERROR_EXCEEDS_HW_LIMIT; + } + *config_id = config->id; + + return ROCPROFILER_STATUS_SUCCESS; +} + +rocprofiler_status_t +rocprofiler_spm_destroy_counter_config(rocprofiler_counter_config_id_t config_id) +{ + rocprofiler::spm::destroy_spm_counter_profile(config_id.handle); + return ROCPROFILER_STATUS_SUCCESS; +} + +rocprofiler_status_t +rocprofiler_configure_callback_spm_dispatch_service( + rocprofiler_context_id_t context_id, + rocprofiler_spm_dispatch_counting_service_cb_t dispatch_callback, + void* dispatch_callback_args, + rocprofiler_spm_dispatch_counting_record_cb_t record_callback, + void* record_callback_args) +{ + auto sym = rocprofiler::spm::construct_spm_interface(); + if(!sym.has_value()) return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_ABI; + + if(!rocprofiler::spm::is_spm_explicitly_enabled()) + return ROCPROFILER_STATUS_ERROR_NOT_IMPLEMENTED; + + if(rocprofiler::registration::get_init_status() > -1) + return ROCPROFILER_STATUS_ERROR_CONFIGURATION_LOCKED; + + auto* ctx = rocprofiler::context::get_mutable_registered_context(context_id); + if(!ctx) return ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID; + + return rocprofiler::spm::configure_callback_spm_dispatch(context_id, + dispatch_callback, + dispatch_callback_args, + record_callback, + record_callback_args); +} + +/** + * @brief Query Agent Counters Availability. + * + * @param [in] agent + * @param [out] counters_list + * @param [out] counters_count + * @return ::rocprofiler_status_t + */ +rocprofiler_status_t +rocprofiler_iterate_spm_supported_counters(rocprofiler_agent_id_t agent_id, + rocprofiler_available_counters_cb_t cb, + void* user_data) +{ + const auto* agent = rocprofiler::agent::get_agent(agent_id); + if(!agent) return ROCPROFILER_STATUS_ERROR_AGENT_NOT_FOUND; + + auto metrics = rocprofiler::counters::getMetricsForAgent(agent); + + auto ids = std::vector{}; + + for(const auto& m : metrics) + { + if(rocprofiler::counters::isSupportSpm(m)) + { + // Create agent-encoded counter ID using the agent's logical_node_id + rocprofiler_counter_id_t counter_id{.handle = 0}; + rocprofiler::counters::set_base_metric_in_counter_id(counter_id, m.id()); + ids.push_back(counter_id); + } + } + if(ids.empty()) return ROCPROFILER_STATUS_ERROR_AGENT_ARCH_NOT_SUPPORTED; + + return cb(agent_id, ids.data(), ids.size(), user_data); +} + +/** + * @brief Configure buffered dispatch profile Counting Service. + * Collects the counters in dispatch packets and stores them + * in buffer_id. The buffer may contain packets from more than + * one dispatch (denoted by correlation id). Will trigger the + * callback based on the parameters setup in buffer_id_t. + * + * @param [in] context_id context id + * @param [in] buffer_id id of the buffer to use for the counting service + * @param [in] profile profile config to use for dispatch + * @return ::rocprofiler_status_t + */ +rocprofiler_status_t +rocprofiler_configure_buffer_spm_dispatch_service( + rocprofiler_context_id_t context_id, + rocprofiler_buffer_id_t buffer_id, + rocprofiler_spm_dispatch_counting_service_cb_t callback, + void* callback_data_args) +{ + auto* ctx_p = rocprofiler::context::get_mutable_registered_context(context_id); + if(!ctx_p) return ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID; + + // checking if the buffer is registered + auto const* buff = rocprofiler::buffer::get_buffer(buffer_id); + if(!buff) return ROCPROFILER_STATUS_ERROR_BUFFER_NOT_FOUND; + + auto& ctx = *ctx_p; + + if(ctx.pc_sampler) return ROCPROFILER_STATUS_ERROR_CONTEXT_CONFLICT; + if(ctx.counter_collection) return ROCPROFILER_STATUS_ERROR_CONTEXT_CONFLICT; + if(ctx.device_counter_collection) return ROCPROFILER_STATUS_ERROR_AGENT_DISPATCH_CONFLICT; + if(!ctx.dispatch_spm) + ctx.dispatch_spm = + std::make_unique(); + auto& cb = *ctx.dispatch_spm->callbacks.emplace_back( + std::make_shared()); + + cb.user_cb = callback; + cb.callback_args = callback_data_args; + cb.context = context_id; + if(buffer_id.handle != 0) + { + cb.buffer = buffer_id; + } + cb.internal_context = ctx_p; + + return ROCPROFILER_STATUS_SUCCESS; +} +} diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/tests/CMakeLists.txt b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/tests/CMakeLists.txt new file mode 100644 index 00000000000..e18a4c0ee41 --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/tests/CMakeLists.txt @@ -0,0 +1,42 @@ +rocprofiler_deactivate_clang_tidy() + +project(rocprofiler-sdk-unit-tests-spm-counters LANGUAGES C CXX) + +include(GoogleTest) + +find_program( + amdclangpp_EXECUTABLE REQUIRED + NAMES amdclang++ + HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATH_SUFFIXES bin llvm/bin NO_CACHE) + +set(_ROCPROFILER_SHARE_DIR + "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_DATAROOTDIR}/${PACKAGE_NAME}") + +set(ROCPROFILER_LIB_SPM_COUNTER_TEST_SOURCES core.cpp) + +add_executable(spm-counter-test) + +target_sources(spm-counter-test PRIVATE ${ROCPROFILER_LIB_SPM_COUNTER_TEST_SOURCES}) + +add_dependencies(spm-counter-test agent_hsaco_targets) + +target_link_libraries( + spm-counter-test + PRIVATE rocprofiler-sdk::rocprofiler-sdk-hsa-runtime + rocprofiler-sdk::rocprofiler-sdk-hip + rocprofiler-sdk::rocprofiler-sdk-common-library + rocprofiler-sdk::rocprofiler-sdk-static-library + GTest::gtest + GTest::gtest_main) + +rocprofiler_add_unit_test( + TARGET spm-counter-test + SOURCES ${ROCPROFILER_LIB_SPM_COUNTER_TEST_SOURCES} + TIMEOUT 45 + ENVIRONMENT + "ROCPROFILER_METRICS_PATH=${_ROCPROFILER_SHARE_DIR}" + "ROCPROFILER_SPM_BETA_ENABLED=True" + "LD_LIBRARY_PATH=${CMAKE_INSTALL_PREFIX}/lib:${CMAKE_INSTALL_PREFIX}/llvm/lib:/opt/rocm/lib:/opt/rocm/llvm/lib:$ENV{LD_LIBRARY_PATH}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}") diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/tests/core.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/tests/core.cpp new file mode 100644 index 00000000000..fc491833fbd --- /dev/null +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/spm/tests/core.cpp @@ -0,0 +1,746 @@ +// MIT License +// +// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "lib/rocprofiler-sdk/spm/core.hpp" +#include "lib/common/utility.hpp" +#include "lib/rocprofiler-sdk/agent.hpp" +#include "lib/rocprofiler-sdk/context/context.hpp" +#include "lib/rocprofiler-sdk/hsa/agent_cache.hpp" +#include "lib/rocprofiler-sdk/hsa/aql_packet.hpp" +#include "lib/rocprofiler-sdk/hsa/hsa.hpp" +#include "lib/rocprofiler-sdk/hsa/queue.hpp" +#include "lib/rocprofiler-sdk/hsa/queue_controller.hpp" +#include "lib/rocprofiler-sdk/kernel_dispatch/profiling_time.hpp" +#include "lib/rocprofiler-sdk/registration.hpp" +#include "lib/rocprofiler-sdk/spm/dispatch_handlers.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace rocprofiler::counters; +using namespace rocprofiler; + +AmdExtTable& +get_ext_table() +{ + static auto _v = []() { + auto val = AmdExtTable{}; + val.version.major_id = HSA_AMD_EXT_API_TABLE_MAJOR_VERSION; + val.version.minor_id = sizeof(AmdExtTable); + val.version.step_id = HSA_AMD_EXT_API_TABLE_STEP_VERSION; + val.hsa_amd_memory_pool_get_info_fn = hsa_amd_memory_pool_get_info; + val.hsa_amd_agent_iterate_memory_pools_fn = hsa_amd_agent_iterate_memory_pools; + val.hsa_amd_memory_pool_allocate_fn = hsa_amd_memory_pool_allocate; + val.hsa_amd_memory_pool_free_fn = hsa_amd_memory_pool_free; + val.hsa_amd_agent_memory_pool_get_info_fn = hsa_amd_agent_memory_pool_get_info; + val.hsa_amd_agents_allow_access_fn = hsa_amd_agents_allow_access; + val.hsa_amd_memory_fill_fn = hsa_amd_memory_fill; + val.hsa_amd_signal_create_fn = hsa_amd_signal_create; + val.hsa_amd_spm_acquire_fn = hsa_amd_spm_acquire; + val.hsa_amd_spm_release_fn = hsa_amd_spm_release; + val.hsa_amd_signal_async_handler_fn = hsa_amd_signal_async_handler; + return val; + }(); + return _v; +} + +CoreApiTable& +get_api_table() +{ + static auto _v = []() { + auto val = CoreApiTable{}; + val.version.major_id = HSA_CORE_API_TABLE_MAJOR_VERSION; + val.version.minor_id = sizeof(CoreApiTable); + val.version.step_id = HSA_CORE_API_TABLE_STEP_VERSION; + val.hsa_iterate_agents_fn = hsa_iterate_agents; + val.hsa_agent_get_info_fn = hsa_agent_get_info; + val.hsa_queue_create_fn = hsa_queue_create; + val.hsa_queue_destroy_fn = hsa_queue_destroy; + val.hsa_signal_wait_relaxed_fn = hsa_signal_wait_relaxed; + val.hsa_memory_copy_fn = hsa_memory_copy; + val.hsa_signal_create_fn = hsa_signal_create; + val.hsa_signal_destroy_fn = hsa_signal_destroy; + val.hsa_signal_store_relaxed_fn = hsa_signal_store_relaxed; + val.hsa_signal_store_screlease_fn = hsa_signal_store_screlease; + return val; + }(); + return _v; +} + +#define ROCPROFILER_CALL(result, msg) \ + { \ + rocprofiler_status_t CHECKSTATUS = result; \ + if(CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) \ + { \ + std::string status_msg = rocprofiler_get_status_string(CHECKSTATUS); \ + std::cerr << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg \ + << " failed with error code " << CHECKSTATUS << ": " << status_msg \ + << std::endl; \ + std::stringstream errmsg{}; \ + errmsg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg " failure (" \ + << status_msg << ")"; \ + ASSERT_EQ(CHECKSTATUS, ROCPROFILER_STATUS_SUCCESS) << errmsg.str(); \ + } \ + } + +namespace +{ +auto +findSPMDeviceMetrics(const hsa::AgentCache& agent, const std::unordered_set& metrics) +{ + std::vector ret; + auto mets = counters::loadMetrics(); + const auto& all_counters = mets->arch_to_metric; + + ROCP_INFO << "Looking up counters for " << std::string(agent.name()); + const auto* gfx_metrics = common::get_val(all_counters, std::string(agent.name())); + if(!gfx_metrics) + { + ROCP_ERROR << "No counters found for " << std::string(agent.name()); + return ret; + } + + for(const auto& counter : *gfx_metrics) + { + if((metrics.count(counter.name()) > 0 || metrics.empty()) && rocprofiler::counters::isSupportSpm(counter)) + { + ret.push_back(counter); + } + } + return ret; +} + +void +test_init() +{ + HsaApiTable table; + table.amd_ext_ = &get_ext_table(); + table.core_ = &get_api_table(); + rocprofiler::hsa::copy_table(table.core_, 0); + rocprofiler::hsa::copy_table(table.amd_ext_, 0); + agent::construct_agent_cache(&table); + ASSERT_TRUE(hsa::get_queue_controller() != nullptr); + + hsa::get_queue_controller()->init(get_api_table(), get_ext_table()); +} + +} // namespace + +namespace +{ +rocprofiler_context_id_t& +get_client_ctx() +{ + static rocprofiler_context_id_t ctx{0}; + return ctx; +} + +void +set_client_ctx(rocprofiler_context_id_t& ctx) +{ + ctx = rocprofiler_context_id_t{0}; +} + +void +null_dispatch_callback(const rocprofiler_spm_dispatch_counting_service_data_t*, + rocprofiler_counter_config_id_t*, + rocprofiler_user_data_t*, + void*) +{} + +void +null_record_callback(const rocprofiler_spm_dispatch_counting_service_data_t*, + const rocprofiler_spm_counter_record_t**, + size_t, + int, + rocprofiler_user_data_t, + void*) +{} + +void +null_buffered_callback(rocprofiler_context_id_t, + rocprofiler_buffer_id_t, + rocprofiler_record_header_t**, + size_t, + void*, + uint64_t) +{} +} // namespace + +TEST(spm_core, check_packet_generation) +{ + ASSERT_EQ(hsa_init(), HSA_STATUS_SUCCESS); + test_init(); + ASSERT_TRUE(hsa::get_queue_controller() != nullptr); + auto agents = hsa::get_queue_controller()->get_supported_agents(); + ASSERT_GT(agents.size(), 0); + for(const auto& [_, agent] : agents) + { + auto metrics = findSPMDeviceMetrics(agent, {}); + ASSERT_FALSE(metrics.empty()); + ASSERT_TRUE(agent.get_rocp_agent()); + for(auto& metric : metrics) + { + /** + * Check profile construction + */ + rocprofiler_counter_config_id_t cfg_id = {.handle = 0}; + rocprofiler_counter_id_t id = {.handle = metric.id()}; + ROCP_ERROR << fmt::format("Generating packet for {}", metric); + + auto params = rocprofiler_spm_configuration_t{}; + params.frequency = 1.0; + params.buffer_size = 327; + params.timeout = 30; + ROCPROFILER_CALL(rocprofiler_spm_create_counter_config( + agent.get_rocp_agent()->id, &id, 1, ¶ms, &cfg_id), + "Unable to create profile"); + auto profile = spm::get_spm_counter_config(cfg_id); + ASSERT_TRUE(profile); + + /** + * Check that a packet generator was created + */ + + /** + * Check packet generation + */ + auto cb_info = std::make_shared(); + std::unique_ptr pkt = nullptr; + EXPECT_EQ(get_spm_packet(cb_info, pkt, profile), ROCPROFILER_STATUS_SUCCESS) + << "Unable to generate packet"; + EXPECT_TRUE(pkt) << "Expected a packet to be generated"; + cb_info->packet_return_map.wlock([&](const auto& data) { + EXPECT_EQ(data.size(), 1) << "Incorrect packet size"; + const auto* ptr = common::get_val(data, pkt.get()); + EXPECT_TRUE(ptr) << "Could not find pkt"; + }); + } + } +} + +namespace rocprofiler +{ +namespace hsa +{ +class FakeQueue : public Queue +{ +public: + FakeQueue(const AgentCache& a, rocprofiler_queue_id_t id) + : Queue(a, get_api_table()) + , _agent(a) + , _id(id) + {} + const AgentCache& get_agent() const final { return _agent; }; + rocprofiler_queue_id_t get_id() const final { return _id; }; + + ~FakeQueue() override = default; + +private: + const AgentCache& _agent; + rocprofiler_queue_id_t _id = {}; +}; + +} // namespace hsa +} // namespace rocprofiler + +namespace +{ +struct expected_dispatch +{ + // To pass back + rocprofiler_counter_config_id_t id = {.handle = 0}; + rocprofiler_queue_id_t queue_id = {.handle = 0}; + rocprofiler_agent_id_t agent_id = {.handle = 0}; + uint64_t kernel_id = 0; + uint64_t dispatch_id = 0; + rocprofiler_async_correlation_id_t correlation_id = {.internal = 0, .external = {.value = 0}}; + rocprofiler_dim3_t workgroup_size = {0, 0, 0}; + rocprofiler_dim3_t grid_size = {0, 0, 0}; + rocprofiler_counter_config_id_t* config = nullptr; +}; + +void +user_dispatch_cb(const rocprofiler_spm_dispatch_counting_service_data_t* dispatch_data, + rocprofiler_counter_config_id_t* config, + rocprofiler_user_data_t* user_data, + void* callback_data_args) +{ + expected_dispatch& expected = *static_cast(callback_data_args); + + auto agent_id = dispatch_data->dispatch_info.agent_id; + auto queue_id = dispatch_data->dispatch_info.queue_id; + auto correlation_id = dispatch_data->correlation_id; + auto kernel_id = dispatch_data->dispatch_info.kernel_id; + auto dispatch_id = dispatch_data->dispatch_info.dispatch_id; + + EXPECT_EQ(sizeof(rocprofiler_spm_dispatch_counting_service_data_t), dispatch_data->size); + EXPECT_EQ(expected.kernel_id, kernel_id); + EXPECT_EQ(expected.dispatch_id, dispatch_id); + EXPECT_EQ(expected.agent_id, agent_id); + EXPECT_EQ(expected.queue_id.handle, queue_id.handle); + EXPECT_EQ(expected.correlation_id.internal, correlation_id.internal); + EXPECT_EQ(expected.correlation_id.external.ptr, correlation_id.external.ptr); + EXPECT_EQ(expected.correlation_id.external.value, correlation_id.external.value); + EXPECT_EQ(expected.workgroup_size, dispatch_data->dispatch_info.workgroup_size); + EXPECT_EQ(expected.grid_size, dispatch_data->dispatch_info.grid_size); + + ASSERT_NE(config, nullptr); + config->handle = expected.id.handle; + + (void) user_data; +} + +} // namespace + +namespace rocprofiler +{ +namespace buffer +{ +uint64_t +get_buffer_offset(); +} +} // namespace rocprofiler + +TEST(spm_core, check_callbacks) +{ + int64_t count = 0; + ASSERT_EQ(hsa_init(), HSA_STATUS_SUCCESS); + test_init(); + + registration::init_logging(); + registration::set_init_status(-1); + context::push_client(1); + ROCPROFILER_CALL(rocprofiler_create_context(&get_client_ctx()), "context creation failed"); + + context::context ctx; + ctx.dispatch_spm = + std::make_unique(); + ctx.dispatch_spm->enabled.wlock([](auto& data) { data = true; }); + + ASSERT_TRUE(hsa::get_queue_controller() != nullptr); + auto agents = hsa::get_queue_controller()->get_supported_agents(); + ASSERT_GT(agents.size(), 0); + hsa::get_queue_controller()->disable_serialization(); + + for(const auto& [_, agent] : agents) + { + /** + * Setup + */ + rocprofiler_queue_id_t qid = {.handle = static_cast(count++)}; + hsa::FakeQueue fq(agent, qid); + auto metrics = findSPMDeviceMetrics(agent, {}); + ASSERT_FALSE(metrics.empty()); + ASSERT_TRUE(agent.get_rocp_agent()); + for(auto& metric : metrics) + { + if(!metric.expression().empty()) continue; + + /** + * Setup + */ + expected_dispatch expected = {}; + rocprofiler_counter_id_t id = {.handle = metric.id()}; + auto params = rocprofiler_spm_configuration_t{}; + params.frequency = 0.5; + params.buffer_size = 32768; + params.timeout = 30; + + ROCPROFILER_CALL(rocprofiler_spm_create_counter_config( + agent.get_rocp_agent()->id, &id, 1, ¶ms, &expected.id), + "Unable to create profile"); + auto profile = spm::get_spm_counter_config(expected.id); + ASSERT_TRUE(profile); + + std::shared_ptr cb_info = + std::make_shared(); + cb_info->user_cb = user_dispatch_cb; + cb_info->callback_args = static_cast(&expected); + cb_info->record_callback = null_record_callback; + cb_info->record_callback_args = nullptr; + context::correlation_id corr_id; + corr_id.internal = count++; + + hsa::rocprofiler_packet pkt; + pkt.ext_amd_aql_pm4.header = count++; + + expected.correlation_id = {.internal = corr_id.internal, + .external = context::null_user_data}; + expected.workgroup_size = {pkt.kernel_dispatch.workgroup_size_x, + pkt.kernel_dispatch.workgroup_size_y, + pkt.kernel_dispatch.workgroup_size_z}; + expected.grid_size = {pkt.kernel_dispatch.grid_size_x, + pkt.kernel_dispatch.grid_size_y, + pkt.kernel_dispatch.grid_size_z}; + expected.kernel_id = count++; + expected.dispatch_id = count++; + expected.queue_id = qid; + expected.agent_id = fq.get_agent().get_rocp_agent()->id; + + hsa::Queue::queue_info_session_t::external_corr_id_map_t extern_ids = {}; + + auto user_data = rocprofiler_user_data_t{.value = corr_id.internal}; + auto ret_pkt = spm::pre_kernel_call(&ctx, + cb_info, + fq, + pkt, + expected.kernel_id, + expected.dispatch_id, + &user_data, + extern_ids, + &corr_id); + auto _sess = hsa::Queue::queue_info_session_t{.queue = fq}; + _sess.correlation_id = &corr_id; + + auto sess = std::make_shared(std::move(_sess)); + ASSERT_TRUE(ret_pkt.pkt) + << fmt::format("Expected a packet to be generated for - {}", metric.name()); + auto data = std::vector(10, 1); + auto* spm_pkt = dynamic_cast(ret_pkt.pkt.get()); + rocprofiler::spm::aql_data_callback(0, &(data[0]), data.size(), 0, spm_pkt); + spm::inst_pkt_t pkts; + pkts.emplace_back( + std::make_pair(std::move(ret_pkt.pkt), static_cast(0))); + post_kernel_call(&ctx, cb_info, sess, pkts, kernel_dispatch::profiling_time{}); + } + } + + registration::set_init_status(1); + + registration::finalize(); + context::pop_client(1); + set_client_ctx(get_client_ctx()); +} + +TEST(spm_core, destroy_counter_profile) +{ + ASSERT_EQ(hsa_init(), HSA_STATUS_SUCCESS); + test_init(); + + registration::init_logging(); + registration::set_init_status(-1); + context::push_client(1); + + ROCPROFILER_CALL(rocprofiler_create_context(&get_client_ctx()), "context creation failed"); + + auto agents = hsa::get_queue_controller()->get_supported_agents(); + ASSERT_GT(agents.size(), 0); + for(const auto& [_, agent] : agents) + { + auto metrics = findSPMDeviceMetrics(agent, {}); + ASSERT_FALSE(metrics.empty()); + ASSERT_TRUE(agent.get_rocp_agent()); + for(auto& metric : metrics) + { + expected_dispatch expected = {}; + rocprofiler_counter_id_t id = {.handle = metric.id()}; + auto params = rocprofiler_spm_configuration_t{}; + params.frequency = 0.5; + params.buffer_size = 32768; + params.timeout = 30; + + ROCPROFILER_CALL(rocprofiler_spm_create_counter_config( + agent.get_rocp_agent()->id, &id, 1, ¶ms, &expected.id), + "Unable to create profile"); + ROCPROFILER_CALL(rocprofiler_spm_destroy_counter_config(expected.id), + "Could not delete profile id"); + /** + * Check the profile was actually destroyed + */ + auto profile = spm::get_spm_counter_config(expected.id); + EXPECT_FALSE(profile); + } + } + registration::set_init_status(1); + + registration::finalize(); + context::pop_client(1); + set_client_ctx(get_client_ctx()); +} + +TEST(spm_core, start_stop_callback_ctx) +{ + rocprofiler::common::set_env("ROCPROFILER_SPM_BETA_ENABLED", true); + ASSERT_EQ(hsa_init(), HSA_STATUS_SUCCESS); + test_init(); + + registration::init_logging(); + registration::set_init_status(-1); + context::push_client(1); + registration::set_fini_status(0); + + ROCPROFILER_CALL(rocprofiler_create_context(&get_client_ctx()), "context creation failed"); + + ROCPROFILER_CALL(rocprofiler_configure_callback_spm_dispatch_service(get_client_ctx(), + null_dispatch_callback, + (void*) 0x12345, + null_record_callback, + (void*) 0x54321), + "Could not setup counting service"); + ROCPROFILER_CALL(rocprofiler_start_context(get_client_ctx()), "start context"); + + /** + * Check that the context was actually started + */ + auto* ctx_p = context::get_mutable_registered_context(get_client_ctx()); + ASSERT_TRUE(ctx_p); + auto& ctx = *ctx_p; + + ASSERT_TRUE(ctx.dispatch_spm); + ASSERT_EQ(ctx.dispatch_spm->callbacks.size(), 1); + EXPECT_EQ(ctx.dispatch_spm->callbacks.at(0)->user_cb, null_dispatch_callback); + EXPECT_EQ(ctx.dispatch_spm->callbacks.at(0)->callback_args, (void*) 0x12345); + EXPECT_EQ(ctx.dispatch_spm->callbacks.at(0)->record_callback, null_record_callback); + EXPECT_EQ(ctx.dispatch_spm->callbacks.at(0)->record_callback_args, (void*) 0x54321); + EXPECT_EQ(ctx.dispatch_spm->callbacks.at(0)->context.handle, get_client_ctx().handle); + + bool found = false; + ctx.dispatch_spm->enabled.rlock([&](const auto& data) { found = data; }); + EXPECT_TRUE(found); + + found = false; + hsa::get_queue_controller()->iterate_callbacks([&](auto cid, const auto&) { + if(cid == ctx.dispatch_spm->callbacks.at(0)->queue_id) + { + found = true; + } + }); + EXPECT_TRUE(found); + + /** + * Check if context can be disabled correctly + */ + ROCPROFILER_CALL(rocprofiler_stop_context(get_client_ctx()), "stop context"); + + found = false; + ctx.dispatch_spm->enabled.rlock([&](const auto& data) { found = data; }); + EXPECT_FALSE(found); + + registration::set_init_status(1); + registration::finalize(); + context::pop_client(1); + set_client_ctx(get_client_ctx()); +} + +TEST(spm_core, start_stop_buffered_ctx) +{ + ASSERT_EQ(hsa_init(), HSA_STATUS_SUCCESS); + test_init(); + + registration::init_logging(); + registration::set_init_status(-1); + context::push_client(1); + ROCPROFILER_CALL(rocprofiler_create_context(&get_client_ctx()), "context creation failed"); + + rocprofiler_buffer_id_t opt_buff_id = {.handle = 0}; + ROCPROFILER_CALL(rocprofiler_create_buffer(get_client_ctx(), + 500 * sizeof(size_t), + 500 * sizeof(size_t), + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + null_buffered_callback, + nullptr, + &opt_buff_id), + "Could not create buffer"); + + ROCPROFILER_CALL(rocprofiler_configure_buffer_spm_dispatch_service( + get_client_ctx(), opt_buff_id, null_dispatch_callback, (void*) 0x12345), + "Could not setup buffered service"); + ROCPROFILER_CALL(rocprofiler_start_context(get_client_ctx()), "start context"); + + /** + * Check that the context was actually started + */ + auto* ctx_p = context::get_mutable_registered_context(get_client_ctx()); + ASSERT_TRUE(ctx_p); + auto& ctx = *ctx_p; + + ASSERT_TRUE(ctx.dispatch_spm); + ASSERT_EQ(ctx.dispatch_spm->callbacks.size(), 1); + EXPECT_EQ(ctx.dispatch_spm->callbacks.at(0)->user_cb, null_dispatch_callback); + EXPECT_EQ(ctx.dispatch_spm->callbacks.at(0)->callback_args, (void*) 0x12345); + EXPECT_EQ(ctx.dispatch_spm->callbacks.at(0)->context.handle, get_client_ctx().handle); + ASSERT_TRUE(ctx.dispatch_spm->callbacks.at(0)->buffer); + EXPECT_EQ(ctx.dispatch_spm->callbacks.at(0)->buffer->handle, opt_buff_id.handle); + + bool found = false; + ctx.dispatch_spm->enabled.rlock([&](const auto& data) { found = data; }); + EXPECT_TRUE(found); + + found = false; + hsa::get_queue_controller()->iterate_callbacks([&](auto cid, const auto&) { + if(cid == ctx.dispatch_spm->callbacks.at(0)->queue_id) + { + found = true; + } + }); + EXPECT_TRUE(found); + + /** + * Check if context can be disabled correctly + */ + ROCPROFILER_CALL(rocprofiler_stop_context(get_client_ctx()), "stop context"); + + found = false; + ctx.dispatch_spm->enabled.rlock([&](const auto& data) { found = data; }); + EXPECT_FALSE(found); + + rocprofiler_flush_buffer(opt_buff_id); + rocprofiler_destroy_buffer(opt_buff_id); + + registration::set_init_status(1); + + registration::finalize(); +} + +TEST(spm_core, test_profile_incremental) +{ + ASSERT_EQ(hsa_init(), HSA_STATUS_SUCCESS); + test_init(); + ASSERT_TRUE(hsa::get_queue_controller() != nullptr); + auto agents = hsa::get_queue_controller()->get_supported_agents(); + ASSERT_GT(agents.size(), 0); + for(const auto& [_, agent] : agents) + { + auto metrics = findSPMDeviceMetrics(agent, {}); + ASSERT_FALSE(metrics.empty()); + ASSERT_TRUE(agent.get_rocp_agent()); + + std::map> metric_blocks; + for(const auto& metric : metrics) + { + if(!metric.block().empty()) + { + metric_blocks[metric.block()].push_back(metric); + } + } + + rocprofiler_counter_config_id_t cfg_id = {}; + + // Add one counter from each block to incrementally to make sure we can + // add them incrementally + for(const auto& [block_name, block_metrics] : metric_blocks) + { + rocprofiler_counter_config_id_t old_id = cfg_id; + rocprofiler_counter_id_t id = {.handle = block_metrics.front().id()}; + auto params = rocprofiler_spm_configuration_t{}; + params.frequency = 0.5; + params.buffer_size = 32768; + params.timeout = 30; + ROCPROFILER_CALL(rocprofiler_spm_create_counter_config( + agent.get_rocp_agent()->id, &id, 1, ¶ms, &cfg_id), + "Unable to create profile incrementally when we should be able to"); + EXPECT_NE(old_id.handle, cfg_id.handle) + << "We expect that the handle changes this is due to the existing profile being " + "unmodifiable after creation: " + << block_name; + } + + // Check that we encounter an error of exceeds hardware limits eventually + auto status = ROCPROFILER_STATUS_SUCCESS; + for(const auto& metric : metrics) + { + /** + * Check profile construction + */ + rocprofiler_counter_id_t id = {.handle = metric.id()}; + if(status = rocprofiler_spm_create_counter_config( + agent.get_rocp_agent()->id, &id, 1, nullptr, &cfg_id); + status != ROCPROFILER_STATUS_SUCCESS) + { + break; + } + } + EXPECT_EQ(status, ROCPROFILER_STATUS_ERROR_EXCEEDS_HW_LIMIT); + } + + set_client_ctx(get_client_ctx()); +} + +TEST(spm_core, public_api_iterate_agents) +{ + ASSERT_EQ(hsa_init(), HSA_STATUS_SUCCESS); + test_init(); + + registration::init_logging(); + registration::set_init_status(-1); + context::push_client(1); + auto agents = hsa::get_queue_controller()->get_supported_agents(); + for(const auto& [_, agent] : agents) + { + std::set from_api{}; + + // Iterate through the agents and get the counters available on that agent + ROCPROFILER_CALL(rocprofiler_iterate_spm_supported_counters( + agent.get_rocp_agent()->id, + [](rocprofiler_agent_id_t, + rocprofiler_counter_id_t* counters, + size_t num_counters, + void* user_data) { + std::set* vec = + static_cast*>(user_data); + for(size_t i = 0; i < num_counters; i++) + { + vec->insert(counters[i].handle); + } + return ROCPROFILER_STATUS_SUCCESS; + }, + static_cast(&from_api)), + "Could not fetch supported counters"); + + auto expected = findSPMDeviceMetrics(agent, {}); + for(const auto& x : expected) + { + bool found = false; + for(auto it = from_api.begin(); it != from_api.end(); ++it) + { + rocprofiler_counter_id_t counter_id = {.handle = *it}; + if(counters::get_base_metric_from_counter_id(counter_id) == x.id()) + { + from_api.erase(it); + found = true; + break; + } + } + ASSERT_TRUE(found) << "Expected counter ID " << x.id() << " not found in API results"; + } + + EXPECT_TRUE(from_api.empty()); + } + registration::set_init_status(1); + registration::finalize(); + context::pop_client(1); +} diff --git a/projects/rocprofiler-sdk/source/share/rocprofiler-sdk-rocpd/data_views.sql b/projects/rocprofiler-sdk/source/share/rocprofiler-sdk-rocpd/data_views.sql index 0cb976688ce..7e7404a7355 100644 --- a/projects/rocprofiler-sdk/source/share/rocprofiler-sdk-rocpd/data_views.sql +++ b/projects/rocprofiler-sdk/source/share/rocprofiler-sdk-rocpd/data_views.sql @@ -397,7 +397,8 @@ SELECT K.end, (K.end - K.start) AS duration, PMC_I.name AS counter_name, - PMC_E.value AS counter_value + PMC_E.value AS counter_value, + PMC_E.timestamp AS timestamp FROM `rocpd_pmc_event` PMC_E INNER JOIN `rocpd_info_pmc` PMC_I ON PMC_I.id = PMC_E.pmc_id @@ -686,6 +687,7 @@ SELECT PMC_I.value_type, PMC_I.id AS counter_id, SUM(PMC_E.value) AS value, + PMC_E.timestamp AS timestamp, K.start, K.end, PMC_I.is_constant, @@ -723,4 +725,5 @@ GROUP BY PMC_E.guid, K.dispatch_id, PMC_I.name, - K.agent_id; + K.agent_id, + PMC_E.timestamp; diff --git a/projects/rocprofiler-sdk/source/share/rocprofiler-sdk-rocpd/rocpd_tables.sql b/projects/rocprofiler-sdk/source/share/rocprofiler-sdk-rocpd/rocpd_tables.sql index 65f3c630839..f31cf73c370 100644 --- a/projects/rocprofiler-sdk/source/share/rocprofiler-sdk-rocpd/rocpd_tables.sql +++ b/projects/rocprofiler-sdk/source/share/rocprofiler-sdk-rocpd/rocpd_tables.sql @@ -131,6 +131,7 @@ CREATE TABLE IF NOT EXISTS "expression" TEXT, "is_constant" INTEGER, "is_derived" INTEGER, + "spm_support" INTEGER, "extdata" JSONB DEFAULT "{}" NOT NULL, FOREIGN KEY (nid) REFERENCES `rocpd_info_node{{uuid}}` (id) ON UPDATE CASCADE, FOREIGN KEY (pid) REFERENCES `rocpd_info_process{{uuid}}` (id) ON UPDATE CASCADE, @@ -231,6 +232,7 @@ CREATE TABLE IF NOT EXISTS "event_id" INTEGER, "pmc_id" INTEGER NOT NULL, "value" REAL DEFAULT 0.0, + "timestamp" INTEGER, "extdata" JSONB DEFAULT "{}", FOREIGN KEY (pmc_id) REFERENCES `rocpd_info_pmc{{uuid}}` (id) ON UPDATE CASCADE, FOREIGN KEY (event_id) REFERENCES `rocpd_event{{uuid}}` (id) ON UPDATE CASCADE diff --git a/projects/rocprofiler-sdk/tests/CMakeLists.txt b/projects/rocprofiler-sdk/tests/CMakeLists.txt index 5fade56bfcd..f837c7a2279 100644 --- a/projects/rocprofiler-sdk/tests/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/CMakeLists.txt @@ -88,6 +88,7 @@ add_subdirectory(rocdecode) add_subdirectory(rocjpeg) add_subdirectory(hip-host-tracing) add_subdirectory(code-object-multi-threaded) +add_subdirectory(spm) # rocpd validation tests add_subdirectory(rocpd) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3-avail/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3-avail/CMakeLists.txt index a2b825f230d..8f838de1b60 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3-avail/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/rocprofv3-avail/CMakeLists.txt @@ -23,6 +23,11 @@ if("${pc-sampling-gpu-0-gfx-info}" MATCHES "^gfx(10|11|12)[0-9][0-9]$" set(IS_DISABLED ON) endif() +rocprofiler_sdk_spm_disabled(IS_SPM_DISABLED) +if(${IS_DISABLED}) + set(IS_SPM_DISABLED on) +endif() + rocprofiler_add_integration_execute_test( rocprofv3-avail-test-hw-counters COMMAND ${Python3_EXECUTABLE} $ info @@ -35,6 +40,18 @@ rocprofiler_add_integration_execute_test( "GPU:[0-9]*\\n*;Name:\\t[a-zA-Z_]*\\n;Counter_Name:\\t[a-zA-Z_]*\\n;Description:\\t(.*)\\n*;Expression:\\t(.)*\\n*;Block:\\t[a-zA-Z]*\\n*;Dimensions:\\t([A-Z_]*)\\t([[0-9]*:[0-9]*\\])*\\n*" DISABLED "${IS_DISABLED}") +rocprofiler_add_integration_execute_test( + rocprofv3-avail-spm-test-hw-counters + COMMAND ${Python3_EXECUTABLE} $ info + --spm + DEPENDS rocprofiler-sdk::rocprofv3-avail + TIMEOUT 45 + LABELS "integration-tests" + PRELOAD "${PRELOAD_ENV}" + PASS_REGULAR_EXPRESSION + "GPU:[0-9]*\\n*;Name:\\t[a-zA-Z_]*\\n;Counter_Name:\\t[a-zA-Z_]*\\n;Description:\\t(.*)\\n*;Expression:\\t(.)*\\n*;Block:\\t[a-zA-Z]*\\n*;Dimensions:\\t([A-Z_]*)\\t([[0-9]*:[0-9]*\\])*\\n*" + DISABLED "${IS_SPM_DISABLED}") + rocprofiler_add_integration_execute_test( rocprofv3-avail-test-pc-sample-config COMMAND ${Python3_EXECUTABLE} $ info @@ -68,6 +85,26 @@ rocprofiler_add_integration_execute_test( PRELOAD "${PRELOAD_ENV}" DISABLED "${IS_DISABLED}") +rocprofiler_add_integration_execute_test( + rocprofv3-avail-test-info + COMMAND ${Python3_EXECUTABLE} $ info + DEPENDS rocprofiler-sdk::rocprofv3-avail + TIMEOUT 45 + LABELS "integration-tests" + PRELOAD "${PRELOAD_ENV}" + PASS_REGULAR_EXPRESSION "GPU:[0-9]*\\n*"; + DISABLED "${IS_DISABLED}") + +rocprofiler_add_integration_execute_test( + rocprofv3-avail-test-list + COMMAND ${Python3_EXECUTABLE} $ list + DEPENDS rocprofiler-sdk::rocprofv3-avail + TIMEOUT 45 + LABELS "integration-tests" + PRELOAD "${PRELOAD_ENV}" + PASS_REGULAR_EXPRESSION "GPU"[ \t\r]":[0-9]*\\n*"; + DISABLED "${IS_DISABLED}") + rocprofiler_add_integration_validate_test( rocprofv3-avail-test TEST_PATHS validate.py diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/CMakeLists.txt index 0a79001897f..7202635a06a 100644 --- a/projects/rocprofiler-sdk/tests/rocprofv3/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/rocprofv3/CMakeLists.txt @@ -27,6 +27,7 @@ add_subdirectory(tracing) add_subdirectory(tracing-plus-counter-collection) add_subdirectory(tracing-hip-in-libraries) add_subdirectory(counter-collection) +add_subdirectory(spm) add_subdirectory(hsa-queue-dependency) add_subdirectory(kernel-rename) add_subdirectory(memory-allocation) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/spm/CMakeLists.txt b/projects/rocprofiler-sdk/tests/rocprofv3/spm/CMakeLists.txt new file mode 100644 index 00000000000..8714630dc68 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/spm/CMakeLists.txt @@ -0,0 +1,120 @@ +# +# rocprofv3 tool test +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +project( + rocprofiler-tests-spm + LANGUAGES CXX + VERSION 0.0.0) + +find_package(rocprofiler-sdk REQUIRED) +find_package(Python3 REQUIRED) + +set(COUNTER_LIST + SQ_CYCLES + SQ_WAVES + SQ_INSTS_VALU + SQ_INSTS_SALU + TCC_HIT + TCC_MISS + TCC_REQ + CPC_CPC_STAT_BUSY + CPC_CPC_STAT_IDLE) + +if(NOT ROCPROFILER_MEMCHECK STREQUAL "") + set(SANITIZER True) +else() + set(SANITIZER False) +endif() + +rocprofiler_sdk_spm_disabled(IS_SPM_DISABLED) +if(${SANITIZER}) + set(IS_SPM_DISABLED True) +endif() +set(IS_DISABLED False) + +set(rocprofv3-spm-rocpd-env + "PYTHONPATH=${rocprofiler-sdk_LIB_DIR}/python${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR}/site-packages" + ) + +# SPM +rocprofiler_add_integration_execute_test( + rocprofv3-test-spm-execute + COMMAND + $ --spm-beta-enabled=on --spm + ${COUNTER_LIST} --spm-frequency 1.0 --spm-buffer-size 32768 --spm-timeout 40 + --kernel-trace -d ${CMAKE_CURRENT_BINARY_DIR}/out_spm --output-format json rocpd + -o spm -- $ + DEPENDS simple-transpose + TIMEOUT 60 + LABELS "integration-tests" + ENVIRONMENT "${PRELOAD_ENV}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + DISABLED ${IS_DISABLED}) + +# PMC +rocprofiler_add_integration_execute_test( + rocprofv3-test-spm-compare-to-pmc + COMMAND + $ --pmc ${COUNTER_LIST} -d + ${CMAKE_CURRENT_BINARY_DIR}/out_spm --output-format json -o pmc -- + $ + DEPENDS simple-transpose + TIMEOUT 45 + LABELS "integration-tests" + ENVIRONMENT "${PRELOAD_ENV}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + DISABLED ${IS_DISABLED}) + +# rocpd csv generation from spm database +rocprofiler_add_integration_execute_test( + rocprofv3-test-spm-rocpd-csv-generation + COMMAND + ${Python3_EXECUTABLE} -m rocpd convert -f csv -d + ${CMAKE_CURRENT_BINARY_DIR}/out_spm/rocpd_csv -i + ${CMAKE_CURRENT_BINARY_DIR}/out_spm/spm_results.db + DEPENDS rocprofiler-sdk::rocprofv3 + TIMEOUT 60 + LABELS "integration-tests" + ENVIRONMENT "${rocprofv3-spm-rocpd-env}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + FIXTURES_SETUP rocprofv3-test-spm-rocpd-csv-generation + FIXTURES_REQUIRED rocprofv3-test-spm-execute + DISABLED ${IS_DISABLED}) + +string(REPLACE "LD_PRELOAD=" "ROCPROF_PRELOAD=" PRELOAD_ENV + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}") + +rocprofiler_add_integration_validate_test( + rocprofv3-test-spm-validate + TEST_PATHS validate.py + ARGS --pmc-json + ${CMAKE_CURRENT_BINARY_DIR}/out_spm/pmc_results.json + --spm-json + ${CMAKE_CURRENT_BINARY_DIR}/out_spm/spm_results.json + --rocpd-input + ${CMAKE_CURRENT_BINARY_DIR}/out_spm/spm_results.db + --counter-csv + ${CMAKE_CURRENT_BINARY_DIR}/out_spm/rocpd_csv/out_counter_collection_trace.csv + CONFIG pytest.ini + COPY spm_input.json conftest.py + LABELS "integration-tests" + FIXTURES_REQUIRED + "rocprofv3-test-spm-execute;rocprofv3-test-spm-compare-to-pmc;rocprofv3-test-spm-rocpd-csv-generation" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + DISABLED ${IS_DISABLED}) + +# Multi-gpu + multiqueue +rocprofiler_add_integration_execute_test( + rocprofv3-test-spm-multigpu + COMMAND + $ -i spm_input.json -d + ${CMAKE_CURRENT_BINARY_DIR}/out_multi --output-format json rocpd -o multi -- + $ + DEPENDS vector-ops + TIMEOUT 60 + LABELS "integration-tests" + ENVIRONMENT "${PRELOAD_ENV}" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + DISABLED ${IS_DISABLED}) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/spm/conftest.py b/projects/rocprofiler-sdk/tests/rocprofv3/spm/conftest.py new file mode 100644 index 00000000000..750a5236417 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/spm/conftest.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 + +import json +import pandas as pd +import pytest + +from rocprofiler_sdk.pytest_utils.dotdict import dotdict +from rocprofiler_sdk.pytest_utils import collapse_dict_list +from rocprofiler_sdk.pytest_utils.rocpd_reader import RocpdReader + + +def pytest_addoption(parser): + parser.addoption("--pmc-json", action="store", help="Path to PMC JSON file.") + parser.addoption("--spm-json", action="store", help="Path to SPM JSON file.") + parser.addoption("--rocpd-input", action="store", help="Path to rocpd DB file.") + parser.addoption( + "--counter-csv", action="store", help="Path to rocpd counter CSV file." + ) + + +@pytest.fixture +def pmc_json_data(request): + filename = request.config.getoption("--pmc-json") + with open(filename, "r") as inp: + return dotdict(collapse_dict_list(json.load(inp))) + + +@pytest.fixture +def spm_json_data(request): + filename = request.config.getoption("--spm-json") + with open(filename, "r") as inp: + return dotdict(collapse_dict_list(json.load(inp))) + + +@pytest.fixture +def rocpd_data(request): + filename = request.config.getoption("--rocpd-input") + return RocpdReader(filename).read()[0] + + +@pytest.fixture +def counter_csv(request): + filename = request.config.getoption("--counter-csv") + with open(filename, "r") as inp: + return pd.read_csv(inp) diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/spm/pytest.ini b/projects/rocprofiler-sdk/tests/rocprofv3/spm/pytest.ini new file mode 100644 index 00000000000..8bf72b0989b --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/spm/pytest.ini @@ -0,0 +1,5 @@ + +[pytest] +addopts = --durations=20 -rA -s +testpaths = validate.py +pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/spm/spm_input.json b/projects/rocprofiler-sdk/tests/rocprofv3/spm/spm_input.json new file mode 100644 index 00000000000..6404d666531 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/spm/spm_input.json @@ -0,0 +1,15 @@ +{ + "jobs": [ + { + "kernel_include_regex": "subtract_kernel", + "kernel_iteration_range": "[1]", + "kernel_exclude_regex": "", + "truncate_kernels": true, + "spm_beta_enabled": true, + "spm_frequency": 1.0, + "spm_timeout": 50, + "spm_buffer_size": 32768, + "spm": ["SQ_WAVES"] + } + ] +} diff --git a/projects/rocprofiler-sdk/tests/rocprofv3/spm/validate.py b/projects/rocprofiler-sdk/tests/rocprofv3/spm/validate.py new file mode 100644 index 00000000000..5ec2fa6fafa --- /dev/null +++ b/projects/rocprofiler-sdk/tests/rocprofv3/spm/validate.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python3 + +import sys +import pytest +import pandas as pd +import re + + +# JSON size will become large with several counters. +def test_validate_spm_json(spm_json_data): + + def get_agent(agent_id): + for agent in data["agents"]: + if agent["id"]["handle"] == agent_id["handle"]: + return agent + return None + + def get_counter(counter_id): + for counter in data["counters"]: + if counter["id"]["handle"] == counter_id["handle"]: + return counter + return None + + pattern = re.compile("^gfx9[0-9]+$") + data = spm_json_data["rocprofiler-sdk-tool"] + spm_data = data["callback_records"]["SPM"] + + for spm_record in spm_data: + + dispatch_data = spm_record["dispatch_data"] + dispatch_info = dispatch_data["dispatch_info"] + + assert dispatch_info["agent_id"]["handle"] > 0 + assert dispatch_info["queue_id"]["handle"] > 0 + assert dispatch_info["dispatch_id"] > 0 + + for record in spm_record["records"]: + sq_waves_values = [] + agent = get_agent(dispatch_info["agent_id"]) + counter = get_counter(record["counter_id"]) + assert counter is not None, f"record:\n\t{record}" + if ( + counter["name"] == "SQ_WAVES" + and re.match(pattern, agent["name"]) is not None + ): + sq_waves_values.append(record["value"]) + if len(sq_waves_values) > 0: + assert sum(sq_waves_values) > 0, "SQ_WAVES value is not > 0" + + +def test_validate_spm(pmc_json_data, spm_json_data): + + TOLERANCE = 0.2 + within_tolerance = lambda x, y: abs(x - y) < TOLERANCE * max(x, y) + + def _collect_counter_totals(json_data, record_kind, kernel_filter): + data = json_data["rocprofiler-sdk-tool"] + + counters = {itr["id"]["handle"]: itr for itr in data.get("counters", [])} + kernel_symbols = data.get("kernel_symbols", {}) + + values = {} + for entry in data["callback_records"][record_kind]: + dispatch_info = entry["dispatch_data"]["dispatch_info"] + kernel_id = dispatch_info.get("kernel_id") + if isinstance(kernel_id, dict): + kernel_id = kernel_id.get("handle") + kernel_name = kernel_symbols[kernel_id]["formatted_kernel_name"] + if kernel_filter not in kernel_name: + continue + + for record in entry["records"]: + counter_id = record["counter_id"]["handle"] + counter = counters[counter_id] + counter_name = counter["name"] + values[counter_name] = values.get(counter_name, 0) + record["value"] + + return values + + pmc_values = _collect_counter_totals( + pmc_json_data, "counter_collection", "matrixTranspose" + ) + spm_values = _collect_counter_totals(spm_json_data, "SPM", "matrixTranspose") + + assert pmc_values and spm_values + + is_cycle = lambda x: x[:2] == "CP" or x == "SQ_CYCLES" + is_deterministic = lambda x: x[:3] == "SQ_" and x != "SQ_CYCLES" + + # Deterministic and nearly deterministic counters + for counter_name, pmc_value in pmc_values.items(): + if counter_name not in spm_values: + continue + spm_value = spm_values[counter_name] + if is_deterministic(counter_name): + assert pmc_value == spm_value + elif not is_cycle(counter_name): + assert within_tolerance(pmc_value, spm_value) + + +def test_validate_spm_rocpd_csv(counter_csv: pd.DataFrame, spm_json_data): + assert not counter_csv.empty + + TOLERANCE = 0.2 + within_tolerance = lambda x, y: abs(x - y) < TOLERANCE * max(x, y) + + kernel_column = "kernel_name" if "kernel_name" in counter_csv else "Kernel_Name" + counter_column = "counter_name" if "counter_name" in counter_csv else "Counter_Name" + value_column = "Counter_Value" + + filtered = counter_csv[counter_csv[kernel_column].str.contains("matrixTranspose")] + + csv_values = ( + filtered.groupby(counter_column)[value_column].sum().to_dict() + if not filtered.empty + else {} + ) + + assert csv_values + + def _collect_spm_totals(json_data, kernel_filter): + data = json_data["rocprofiler-sdk-tool"] + counters = {itr["id"]["handle"]: itr for itr in data.get("counters", [])} + kernel_symbols = data.get("kernel_symbols", {}) + + values = {} + for entry in data["callback_records"]["SPM"]: + dispatch_info = entry["dispatch_data"]["dispatch_info"] + kernel_id = dispatch_info.get("kernel_id") + if isinstance(kernel_id, dict): + kernel_id = kernel_id.get("handle") + kernel_name = kernel_symbols[kernel_id]["formatted_kernel_name"] + if kernel_filter not in kernel_name: + continue + + for record in entry["records"]: + counter_id = record["counter_id"]["handle"] + counter = counters[counter_id] + counter_name = counter["name"] + values[counter_name] = values.get(counter_name, 0) + record["value"] + + return values + + spm_values = _collect_spm_totals(spm_json_data, "matrixTranspose") + + assert spm_values + + is_cycle = lambda x: x[:2] == "CP" or x == "SQ_CYCLES" + is_deterministic = lambda x: x[:3] == "SQ_" and x != "SQ_CYCLES" + + for counter_name, csv_value in csv_values.items(): + if counter_name not in spm_values: + continue + spm_value = spm_values[counter_name] + if is_deterministic(counter_name): + assert csv_value == spm_value + elif not is_cycle(counter_name): + assert within_tolerance(csv_value, spm_value) + + +def test_validate_spm_rocpd(spm_json_data, rocpd_data): + data = spm_json_data["rocprofiler-sdk-tool"] + spm_data = data["callback_records"]["SPM"] + + def _find_table_or_view(conn, base_name): + for typ in ("view", "table"): + row = conn.execute( + "SELECT name FROM sqlite_master WHERE type = ? AND name LIKE ?", + (typ, f"{base_name}%"), + ).fetchone() + if row: + return row[0] + return None + + pmc_table = _find_table_or_view(rocpd_data, "rocpd_info_pmc") + pmc_event_table = _find_table_or_view(rocpd_data, "rocpd_pmc_event") + + assert pmc_table is not None + assert pmc_event_table is not None + + counters = {itr["id"]["handle"]: itr["name"] for itr in data.get("counters", [])} + + spm_counter_names = set() + for entry in spm_data: + for record in entry["records"]: + spm_counter_names.add(counters[record["counter_id"]["handle"]]) + + assert len(spm_counter_names) > 0 + + placeholders = ",".join(["?"] * len(spm_counter_names)) + pmc_name_list = sorted(spm_counter_names) + + rocpd_pmc_names = rocpd_data.execute( + f"SELECT name FROM {pmc_table} WHERE name IN ({placeholders})", + pmc_name_list, + ).fetchall() + + assert len(rocpd_pmc_names) > 0 + + rocpd_spm_count = rocpd_data.execute( + f"SELECT COUNT(*) FROM {pmc_event_table} e " + f"JOIN {pmc_table} p ON e.pmc_id = p.id " + f"WHERE p.name IN ({placeholders})", + pmc_name_list, + ).fetchone()[0] + + assert rocpd_spm_count > 0 + + +if __name__ == "__main__": + exit_code = pytest.main(["-x", __file__] + sys.argv[1:]) + sys.exit(exit_code) diff --git a/projects/rocprofiler-sdk/tests/spm/CMakeLists.txt b/projects/rocprofiler-sdk/tests/spm/CMakeLists.txt new file mode 100644 index 00000000000..7c35d4a7290 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/spm/CMakeLists.txt @@ -0,0 +1,70 @@ +# +# +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +project( + rocprofiler-sdk-tests-spm-counter-collection + LANGUAGES CXX + VERSION 0.0.0) + +find_package(rocprofiler-sdk REQUIRED) + +if(ROCPROFILER_MEMCHECK_PRELOAD_ENV) + set(PRELOAD_ENV + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV}:$") +else() + set(PRELOAD_ENV "LD_PRELOAD=$") +endif() + +if(NOT ROCPROFILER_MEMCHECK STREQUAL "") + set(SANITIZER True) +else() + set(SANITIZER False) +endif() + +rocprofiler_sdk_spm_disabled(IS_SPM_DISABLED) +if(${SANITIZER}) + set(IS_SPM_DISABLED True) +endif() +set(IS_DISABLED False) + +rocprofiler_add_integration_execute_test( + test-spm-dispatch-collection-execute + TARGET vector-ops + TIMEOUT 60 + LABELS "integration-tests" + PRELOAD "$" + ENVIRONMENT + "ROCPROFILER_TOOL_OUTPUT_FILE=spm-dispatch-collection-test.json" + "ROCPROFILER_TOOL_CONTEXTS=SPM_DISPATCH_COLLECTION" + "ROCPROFILER_SPM_BETA_ENABLED=True" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + FIXTURES_SETUP spm-dispatch + DISABLED ${IS_DISABLED}) + +rocprofiler_add_integration_execute_test( + test-spm-dispatch-buffer-collection-execute + TARGET vector-ops + TIMEOUT 45 + LABELS "integration-tests" + PRELOAD "$" + ENVIRONMENT + "ROCPROFILER_TOOL_OUTPUT_FILE=spm-dispatch-collection-test.json" + "ROCPROFILER_TOOL_CONTEXTS=SPM_BUFFER_DISPATCH_COLLECTION" + "ROCPROFILER_SPM_BETA_ENABLED=True" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + FIXTURES_SETUP spm-buffer + DISABLED ${IS_DISABLED}) + +rocprofiler_add_integration_validate_test( + test-spm-dispatch-collection-validate + TEST_PATHS validate.py + ARGS --input ${CMAKE_CURRENT_BINARY_DIR}/spm-dispatch-collection-test.json + COPY conftest.py + CONFIG pytest.ini + TIMEOUT 160 + LABELS "integration-tests" + FAIL_REGULAR_EXPRESSION "${ROCPROFILER_DEFAULT_FAIL_REGEX}" + FIXTURES_REQUIRED spm-dispatch + DISABLED ${IS_DISABLED}) diff --git a/projects/rocprofiler-sdk/tests/spm/conftest.py b/projects/rocprofiler-sdk/tests/spm/conftest.py new file mode 100644 index 00000000000..20fb83ed256 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/spm/conftest.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 + +# MIT License +# +# Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +import json +import pytest + + +def pytest_addoption(parser): + parser.addoption( + "--input", + action="store", + default="spm-dispatch-collection-test.json", + help="Input JSON", + ) + + +@pytest.fixture +def input_data(request): + filename = request.config.getoption("--input") + with open(filename, "r") as inp: + return json.load(inp) diff --git a/projects/rocprofiler-sdk/tests/spm/pytest.ini b/projects/rocprofiler-sdk/tests/spm/pytest.ini new file mode 100644 index 00000000000..8bf72b0989b --- /dev/null +++ b/projects/rocprofiler-sdk/tests/spm/pytest.ini @@ -0,0 +1,5 @@ + +[pytest] +addopts = --durations=20 -rA -s +testpaths = validate.py +pythonpath = @ROCPROFILER_SDK_TESTS_BINARY_DIR@/pytest-packages diff --git a/projects/rocprofiler-sdk/tests/spm/validate.py b/projects/rocprofiler-sdk/tests/spm/validate.py new file mode 100644 index 00000000000..bc4b7f94642 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/spm/validate.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 + +# MIT License +# +# Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +import sys +import pytest +from collections import defaultdict + + +# helper function +def node_exists(name, data, min_len=1): + assert name in data + assert data[name] is not None + assert len(data[name]) >= min_len + + +def test_data_structure(input_data): + """verify minimum amount of expected data is present""" + node_exists("rocprofiler-sdk-json-tool", input_data) + rocp_data = input_data + node_exists("names", rocp_data["rocprofiler-sdk-json-tool"]["callback_records"]) + node_exists("spm_records", rocp_data["rocprofiler-sdk-json-tool"]["callback_records"]) + + +def test_spm_counter_values(input_data): + data = input_data["rocprofiler-sdk-json-tool"] + agent_data = data["agents"] + counter_info = data["counter_info"] + counter_data = data["callback_records"]["spm_records"] + agent_counter_map = defaultdict(list) + + def get_counter_value(counters, name): + for itr in counters: + if itr["name"] == name: + return itr["value"] + + def get_name(counter_id): + for itr in counter_info: + if itr["id"]["handle"] == counter_id: + return itr["name"] + + def add_entry(record): + agent_counter_map[record["agent_id"]["handle"]].append( + { + "name": get_name(record["counter_id"]["handle"]), + "value": record["value"], + } + ) + + for record in counter_data: + # If the agent is found in the agent map + # Search for counter name, update it if present + # If not counter name or agent not present add a new entry + if record["agent_id"]["handle"] in agent_counter_map: + + found = 0 + for i in range(0, len(agent_counter_map[record["agent_id"]["handle"]])): + if agent_counter_map[record["agent_id"]["handle"]][i]["name"] == get_name( + record["counter_id"]["handle"] + ): + agent_counter_map[record["agent_id"]["handle"]][i]["value"] += record[ + "value" + ] + found = 1 + if not found: + add_entry(record) + + else: + add_entry(record) + + # some samples can have 0 value, so aggreegate for validation + for agent, counters in agent_counter_map.items(): + + assert float(get_counter_value(counters, "TA_TA_BUSY")) > get_counter_value( + counters, "TA_TOTAL_WAVEFRONTS" + ) + + assert ( + 100 + * get_counter_value(counters, "SQC_ICACHE_MISSES") + / get_counter_value(counters, "SQC_ICACHE_REQ") + ) < 100 + assert ( + 100 + * get_counter_value(counters, "SQC_ICACHE_HITS") + / get_counter_value(counters, "SQC_ICACHE_REQ") + ) < 100 + + +if __name__ == "__main__": + exit_code = pytest.main(["-x", __file__] + sys.argv[1:]) + sys.exit(exit_code) diff --git a/projects/rocprofiler-sdk/tests/tools/json-tool.cpp b/projects/rocprofiler-sdk/tests/tools/json-tool.cpp index 95ea4dba18a..8a914df8adf 100644 --- a/projects/rocprofiler-sdk/tests/tools/json-tool.cpp +++ b/projects/rocprofiler-sdk/tests/tools/json-tool.cpp @@ -49,8 +49,6 @@ #include #include #include -#include - #include #include #include @@ -66,6 +64,7 @@ #include #include #include +#include #include #include #include @@ -575,6 +574,60 @@ struct profile_counting_record bool operator!=(rocprofiler_record_counter_t rhs) const { return !(*this == rhs); } }; +struct spm_profile_counting_record +{ + spm_profile_counting_record(rocprofiler_spm_dispatch_counting_service_data_t hdr) + : header{hdr} + {} + + rocprofiler_spm_dispatch_counting_service_data_t header = {}; + std::vector data = {}; + + template + void save(ArchiveT& ar) const + { + cereal::save(ar, header); + ar(cereal::make_nvp("records", data)); + } + void emplace_back(rocprofiler_spm_counter_record_t val) + { + if(*this != val) + { + throw std::runtime_error{"invalid profile_counting_record::emplace_back(...)"}; + } + data.emplace_back(val); + } + + bool operator==(rocprofiler_spm_counter_record_t rhs) const + { + return (header.dispatch_info.dispatch_id == rhs.dispatch_id); + } + + bool operator!=(rocprofiler_spm_counter_record_t rhs) const { return !(*this == rhs); } +}; + +struct spm_counting_record_t +{ + rocprofiler_counter_id_t counter_id = {}; + rocprofiler_agent_id_t agent_id = {}; + // XCC, shader, counter id, block instance Id + rocprofiler_counter_instance_id_t id = {}; + rocprofiler_dispatch_id_t dispatch_id = {}; + rocprofiler_timestamp_t timestamp = {}; + uint64_t value = 0; + + template + void save(ArchiveT& ar) const + { + ar(cereal::make_nvp("counter_id", counter_id)); + ar(cereal::make_nvp("agent_id", agent_id)); + ar(cereal::make_nvp("id", id)); + ar(cereal::make_nvp("timestamp", timestamp)); + ar(cereal::make_nvp("dispatch_id", dispatch_id)); + ar(cereal::make_nvp("value", value)); + } +}; + auto counter_info = std::deque{}; auto runtime_init_cb_records = std::deque{}; auto code_object_records = std::deque{}; @@ -592,6 +645,8 @@ auto rccl_api_cb_records = std::deque{}; auto rocdecode_api_cb_records = std::deque{}; auto rocjpeg_api_cb_records = std::deque{}; auto ompt_cb_records = std::deque{}; +auto spm_cb_records = std::deque{}; +auto spm_bf_records = std::deque{}; int set_external_correlation_id(rocprofiler_thread_id_t thr_id, @@ -608,6 +663,136 @@ set_external_correlation_id(rocprofiler_thread_id_t t return 0; } +void +spm_dispatch_callback(const rocprofiler_spm_dispatch_counting_service_data_t* dispatch_data, + rocprofiler_counter_config_id_t* config, + rocprofiler_user_data_t* /* user_data*/, + void* /*callback_data_args*/) +{ + // Iterate through the agents and get the counters available on that agent + static std::shared_mutex m_mutex = {}; + static std::unordered_map + profile_cache = {}; + + auto search_cache = [&]() { + if(auto pos = profile_cache.find(dispatch_data->dispatch_info.agent_id); + pos != profile_cache.end()) + { + *config = pos->second; + return true; + } + return false; + }; + + { + auto rlock = std::shared_lock{m_mutex}; + if(search_cache()) return; + } + + auto wlock = std::unique_lock{m_mutex}; + if(search_cache()) return; + + std::set counters_to_collect = {"TA_TOTAL_WAVEFRONTS", + "TA_TA_BUSY", + "SQC_ICACHE_REQ", + "SQC_ICACHE_HITS", + "SQC_ICACHE_MISSES"}; + auto gpu_counters = std::vector{}; + ROCPROFILER_CALL(rocprofiler_iterate_spm_supported_counters( + dispatch_data->dispatch_info.agent_id, + []([[maybe_unused]] rocprofiler_agent_id_t id, + rocprofiler_counter_id_t* counters, + size_t num_counters, + void* user_data) { + std::vector* vec = + static_cast*>(user_data); + for(size_t i = 0; i < num_counters; i++) + { + vec->push_back(counters[i]); + } + return ROCPROFILER_STATUS_SUCCESS; + }, + static_cast(&gpu_counters)), + "Could not fetch supported counters"); + + for(auto& counter : gpu_counters) + { + auto info = rocprofiler_counter_info_v0_t{}; + + ROCPROFILER_CALL( + rocprofiler_query_counter_info( + counter, ROCPROFILER_COUNTER_INFO_VERSION_0, static_cast(&info)), + "Could not query counter_id"); + + counter_info.emplace_back(info); + } + + std::vector collect_counters; + // Look for the counters contained in counters_to_collect in gpu_counters + for(auto& counter : gpu_counters) + { + rocprofiler_counter_info_v0_t info; + + ROCPROFILER_CALL( + rocprofiler_query_counter_info( + counter, ROCPROFILER_COUNTER_INFO_VERSION_0, static_cast(&info)), + "Could not query counter_id"); + + if(counters_to_collect.count(std::string(info.name)) > 0) + { + collect_counters.push_back(counter); + } + } + + auto params = rocprofiler_spm_configuration_t{}; + params.timeout = 0; + params.buffer_size = 32768; + params.frequency = 0.1; + // Look for the counters contained in counters_to_collect in gpu_counters + // Create a colleciton profile for the counters + rocprofiler_counter_config_id_t profile = {.handle = 0}; + ROCPROFILER_CALL(rocprofiler_spm_create_counter_config(dispatch_data->dispatch_info.agent_id, + collect_counters.data(), + collect_counters.size(), + ¶ms, + &profile), + "Could not construct profile cfg"); + + profile_cache.emplace(dispatch_data->dispatch_info.agent_id, profile); + // Return the profile to collect those counters for this dispatch + *config = profile; +} + +void +spm_data_callback(const rocprofiler_spm_dispatch_counting_service_data_t* dispatch_data, + const rocprofiler_spm_counter_record_t** records, + size_t record_count, + int flags, + rocprofiler_user_data_t /* user_data*/, + void* /* record_callback_args*/) +{ + static std::shared_mutex m_mutex = {}; + auto lk = std::shared_lock{m_mutex}; + if(record_count == 0) return; + + if(flags >> ROCPROFILER_SPM_RECORD_FLAG_DATA) + { + for(size_t count = 0; count < record_count; count++) + { + auto counter_id = rocprofiler_counter_id_t{}; + ROCPROFILER_CALL(rocprofiler_query_record_counter_id(records[count]->id, &counter_id), + "query record counter id"); + spm_cb_records.emplace_back( + spm_counting_record_t{counter_id, + records[count]->agent_id, + records[count]->id, + dispatch_data->dispatch_info.dispatch_id, + records[count]->timestamp, + records[count]->value}); + } + } +} + void dispatch_callback(rocprofiler_dispatch_counting_service_data_t dispatch_data, rocprofiler_counter_config_id_t* config, @@ -1155,6 +1340,62 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/, } } +void +spm_buffered(rocprofiler_context_id_t /*context*/, + rocprofiler_buffer_id_t /*buffer_id*/, + rocprofiler_record_header_t** headers, + size_t num_headers, + void* user_data, + uint64_t drop_count) +{ + assert(user_data != nullptr); + assert(drop_count == 0 && "drop count should be zero for lossless policy"); + + if(num_headers == 0) + throw std::runtime_error{ + "rocprofiler invoked a buffer callback with no headers. this should never happen"}; + else if(headers == nullptr) + throw std::runtime_error{"rocprofiler invoked a buffer callback with a null pointer to the " + "array of headers. this should never happen"}; + + for(size_t i = 0; i < num_headers; ++i) + { + auto* header = headers[i]; + + if(header == nullptr) + { + throw std::runtime_error{ + "rocprofiler provided a null pointer to header. this should never happen"}; + } + else if(header->hash != + rocprofiler_record_header_compute_hash(header->category, header->kind)) + { + throw std::runtime_error{"rocprofiler_record_header_t (category | kind) != hash"}; + } + else if(header->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS && + header->kind == ROCPROFILER_COUNTER_RECORD_PROFILE_COUNTING_DISPATCH_HEADER) + { + auto* profiler_record = + static_cast(header->payload); + spm_bf_records.emplace_back(*profiler_record); + } + else if(header->category == ROCPROFILER_BUFFER_CATEGORY_COUNTERS && + header->kind == ROCPROFILER_COUNTER_RECORD_VALUE) + { + auto* profiler_record = static_cast(header->payload); + + if(spm_bf_records.empty()) + throw std::runtime_error{ + "missing rocprofiler_dispatch_spm_counting_service_record_t (header)"}; + spm_bf_records.back().emplace_back(*profiler_record); + } + else + { + throw std::runtime_error{"unexpected rocprofiler_record_header_t category + kind"}; + } + } +} + void thread_precreate(rocprofiler_runtime_library_t lib, void* tool_data) { @@ -1201,41 +1442,43 @@ void pop_external_correlation(); // contexts -rocprofiler_context_id_t hsa_api_callback_ctx = {0}; -rocprofiler_context_id_t hip_api_callback_ctx = {0}; -rocprofiler_context_id_t marker_api_callback_ctx = {0}; -rocprofiler_context_id_t code_object_ctx = {0}; -rocprofiler_context_id_t rccl_api_callback_ctx = {0}; -rocprofiler_context_id_t ompt_callback_ctx = {0}; -rocprofiler_context_id_t hsa_api_buffered_ctx = {0}; -rocprofiler_context_id_t hip_api_buffered_ctx = {0}; -rocprofiler_context_id_t marker_api_buffered_ctx = {0}; -rocprofiler_context_id_t memory_copy_callback_ctx = {0}; -rocprofiler_context_id_t memory_copy_buffered_ctx = {0}; -rocprofiler_context_id_t memory_allocation_callback_ctx = {0}; -rocprofiler_context_id_t memory_allocation_buffered_ctx = {0}; -rocprofiler_context_id_t rccl_api_buffered_ctx = {0}; -rocprofiler_context_id_t ompt_buffered_ctx = {0}; -rocprofiler_context_id_t counter_collection_ctx = {0}; -rocprofiler_context_id_t scratch_memory_ctx = {0}; -rocprofiler_context_id_t corr_id_retire_ctx = {0}; -rocprofiler_context_id_t kernel_dispatch_callback_ctx = {0}; -rocprofiler_context_id_t kernel_dispatch_buffered_ctx = {0}; -rocprofiler_context_id_t runtime_init_callback_ctx = {}; -rocprofiler_context_id_t runtime_init_buffered_ctx = {}; -rocprofiler_context_id_t rocdecode_api_callback_ctx = {0}; -rocprofiler_context_id_t rocdecode_api_buffered_ctx = {0}; -rocprofiler_context_id_t rocdecode_api_ext_buffered_ctx = {0}; -rocprofiler_context_id_t rocjpeg_api_callback_ctx = {0}; -rocprofiler_context_id_t rocjpeg_api_buffered_ctx = {0}; -rocprofiler_context_id_t page_migrate_event_ctx = {0}; -rocprofiler_context_id_t kfd_page_fault_event_ctx = {0}; -rocprofiler_context_id_t kfd_queue_event_ctx = {0}; -rocprofiler_context_id_t kfd_unmap_from_gpu_event_ctx = {0}; -rocprofiler_context_id_t kfd_droped_events_event_ctx = {0}; -rocprofiler_context_id_t kfd_page_migrate_records_ctx = {0}; -rocprofiler_context_id_t kfd_page_fault_records_ctx = {0}; -rocprofiler_context_id_t kfd_queue_records_ctx = {0}; +rocprofiler_context_id_t hsa_api_callback_ctx = {0}; +rocprofiler_context_id_t hip_api_callback_ctx = {0}; +rocprofiler_context_id_t marker_api_callback_ctx = {0}; +rocprofiler_context_id_t code_object_ctx = {0}; +rocprofiler_context_id_t rccl_api_callback_ctx = {0}; +rocprofiler_context_id_t ompt_callback_ctx = {0}; +rocprofiler_context_id_t hsa_api_buffered_ctx = {0}; +rocprofiler_context_id_t hip_api_buffered_ctx = {0}; +rocprofiler_context_id_t marker_api_buffered_ctx = {0}; +rocprofiler_context_id_t memory_copy_callback_ctx = {0}; +rocprofiler_context_id_t memory_copy_buffered_ctx = {0}; +rocprofiler_context_id_t memory_allocation_callback_ctx = {0}; +rocprofiler_context_id_t memory_allocation_buffered_ctx = {0}; +rocprofiler_context_id_t rccl_api_buffered_ctx = {0}; +rocprofiler_context_id_t ompt_buffered_ctx = {0}; +rocprofiler_context_id_t counter_collection_ctx = {0}; +rocprofiler_context_id_t spm_dispatch_collection_ctx = {0}; +rocprofiler_context_id_t scratch_memory_ctx = {0}; +rocprofiler_context_id_t corr_id_retire_ctx = {0}; +rocprofiler_context_id_t kernel_dispatch_callback_ctx = {0}; +rocprofiler_context_id_t kernel_dispatch_buffered_ctx = {0}; +rocprofiler_context_id_t runtime_init_callback_ctx = {}; +rocprofiler_context_id_t runtime_init_buffered_ctx = {}; +rocprofiler_context_id_t rocdecode_api_callback_ctx = {0}; +rocprofiler_context_id_t rocdecode_api_buffered_ctx = {0}; +rocprofiler_context_id_t rocdecode_api_ext_buffered_ctx = {0}; +rocprofiler_context_id_t rocjpeg_api_callback_ctx = {0}; +rocprofiler_context_id_t rocjpeg_api_buffered_ctx = {0}; +rocprofiler_context_id_t page_migrate_event_ctx = {0}; +rocprofiler_context_id_t kfd_page_fault_event_ctx = {0}; +rocprofiler_context_id_t kfd_queue_event_ctx = {0}; +rocprofiler_context_id_t kfd_unmap_from_gpu_event_ctx = {0}; +rocprofiler_context_id_t kfd_droped_events_event_ctx = {0}; +rocprofiler_context_id_t kfd_page_migrate_records_ctx = {0}; +rocprofiler_context_id_t kfd_page_fault_records_ctx = {0}; +rocprofiler_context_id_t kfd_queue_records_ctx = {0}; +rocprofiler_context_id_t spm_buffer_dispatch_collection_ctx = {0}; // buffers rocprofiler_buffer_id_t runtime_init_buffered_buffer = {}; @@ -1261,6 +1504,7 @@ rocprofiler_buffer_id_t kfd_droped_events_event_buffer = {}; rocprofiler_buffer_id_t kfd_page_migrate_records_buffer = {}; rocprofiler_buffer_id_t kfd_page_fault_records_buffer = {}; rocprofiler_buffer_id_t kfd_queue_records_buffer = {}; +rocprofiler_buffer_id_t spm_counter_collection_buffer = {}; auto contexts = std::unordered_map{ {"RUNTIME_INIT_CALLBACK", &runtime_init_callback_ctx}, @@ -1298,9 +1542,10 @@ auto contexts = std::unordered_map{ {"KFD_PAGE_MIGRATE", &kfd_page_migrate_records_ctx}, {"KFD_PAGE_FAULT", &kfd_page_fault_records_ctx}, {"KFD_QUEUE", &kfd_queue_records_ctx}, -}; + {"SPM_DISPATCH_COLLECTION", &spm_dispatch_collection_ctx}, + {"SPM_BUFFER_DISPATCH_COLLECTION", &spm_buffer_dispatch_collection_ctx}}; -auto buffers = std::array{&runtime_init_buffered_buffer, +auto buffers = std::array{&runtime_init_buffered_buffer, &hsa_api_buffered_buffer, &hip_api_buffered_buffer, &marker_api_buffered_buffer, @@ -1321,7 +1566,8 @@ auto buffers = std::array{&runtime_init_buffered_b &kfd_droped_events_event_buffer, &kfd_page_migrate_records_buffer, &kfd_page_fault_records_buffer, - &kfd_queue_records_buffer}; + &kfd_queue_records_buffer, + &spm_counter_collection_buffer}; auto agents = std::vector{}; auto agents_map = std::unordered_map{}; @@ -1591,7 +1837,7 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) ROCPROFILER_CALL(rocprofiler_create_buffer(memory_allocation_buffered_ctx, buffer_size, - watermark, + buffer_size, ROCPROFILER_BUFFER_POLICY_LOSSLESS, tool_tracing_buffered, tool_data, @@ -1625,6 +1871,15 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) &counter_collection_buffer), "buffer creation"); + ROCPROFILER_CALL(rocprofiler_create_buffer(spm_buffer_dispatch_collection_ctx, + buffer_size, + buffer_size, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + spm_buffered, + tool_data, + &spm_counter_collection_buffer), + "buffer creation"); + ROCPROFILER_CALL(rocprofiler_create_buffer(rccl_api_buffered_ctx, buffer_size, watermark, @@ -2075,6 +2330,27 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) else { std::cerr << "Enabling context: " << itr.first << std::endl; + + if(itr.first == "SPM_DISPATCH_COLLECTION") + { + ROCPROFILER_CALL(rocprofiler_configure_callback_spm_dispatch_service( + spm_dispatch_collection_ctx, + spm_dispatch_callback, + nullptr, + spm_data_callback, + nullptr), + "Could not setup SPM counting service"); + } + + else if(itr.first == "SPM_BUFFER_DISPATCH_COLLECTION") + { + ROCPROFILER_CALL(rocprofiler_configure_buffer_spm_dispatch_service( + spm_buffer_dispatch_collection_ctx, + spm_counter_collection_buffer, + spm_dispatch_callback, + nullptr), + "setup SPM buffered service"); + } context_settings.erase(pos, itr.first.length()); } } @@ -2195,7 +2471,9 @@ tool_fini(void* tool_data) << ", rocdecode_api_bf_records=" << rocdecode_api_bf_records.size() << ", rocdecode_api_ext_bf_records=" << rocdecode_api_ext_bf_records.size() << ", rocjpeg_api_callback_records=" << rocjpeg_api_cb_records.size() - << ", rocjpeg_api_bf_records=" << rocjpeg_api_bf_records.size() << "...\n" + << ", rocjpeg_api_bf_records=" << rocjpeg_api_bf_records.size() + << ", spm_cb_records=" << spm_cb_records.size() + << ", spm_bf_records=" << spm_bf_records.size() << "...\n" << std::flush; auto* _call_stack = static_cast(tool_data); @@ -2293,6 +2571,7 @@ write_json(call_stack_t* _call_stack) json_ar(cereal::make_nvp("memory_allocations", memory_allocation_cb_records)); json_ar(cereal::make_nvp("rocdecode_api_traces", rocdecode_api_cb_records)); json_ar(cereal::make_nvp("rocjpeg_api_traces", rocjpeg_api_cb_records)); + json_ar(cereal::make_nvp("spm_records", spm_cb_records)); } catch(std::exception& e) { std::cerr << "[" << getpid() << "][" << __FUNCTION__ @@ -2330,6 +2609,7 @@ write_json(call_stack_t* _call_stack) json_ar(cereal::make_nvp("rocdecode_api_traces", rocdecode_api_bf_records)); json_ar(cereal::make_nvp("rocdecode_api_ext_traces", rocdecode_api_ext_bf_records)); json_ar(cereal::make_nvp("rocjpeg_api_traces", rocjpeg_api_bf_records)); + json_ar(cereal::make_nvp("spm_counter_collection", spm_bf_records)); } catch(std::exception& e) { std::cerr << "[" << getpid() << "][" << __FUNCTION__