diff --git a/projects/rocprofiler-systems/cmake/Packages.cmake b/projects/rocprofiler-systems/cmake/Packages.cmake
index 6084be0b6ff..67d7fe958cc 100644
--- a/projects/rocprofiler-systems/cmake/Packages.cmake
+++ b/projects/rocprofiler-systems/cmake/Packages.cmake
@@ -270,6 +270,57 @@ endif()
 
 target_link_libraries(rocprofiler-systems-rocm INTERFACE amd_smi)
 
+# Detect the AMD SMI library version from its public header. The version only counts
+# as known when both the MAJOR and MINOR macros are found, so the AINIC check below
+# never evaluates (or prints) empty or stale values.
+set(_AMDSMI_VERSION_FOUND FALSE)
+set(_AMDSMI_HEADER "${ROCM_PATH}/include/amd_smi/amdsmi.h")
+if(EXISTS "${_AMDSMI_HEADER}")
+    file(READ "${_AMDSMI_HEADER}" _AMDSMI_HEADER_CONTENTS)
+
+    # [ \t]+ tolerates any run of spaces/tabs between the macro name and its value.
+    if(_AMDSMI_HEADER_CONTENTS MATCHES "#define[ \t]+AMDSMI_LIB_VERSION_MAJOR[ \t]+([0-9]+)")
+        set(ROCPROFSYS_AMDSMI_VERSION_MAJOR "${CMAKE_MATCH_1}")
+
+        # CMAKE_MATCH_1 is overwritten by the next match, so MAJOR is stored first.
+        if(_AMDSMI_HEADER_CONTENTS MATCHES "#define[ \t]+AMDSMI_LIB_VERSION_MINOR[ \t]+([0-9]+)")
+            set(ROCPROFSYS_AMDSMI_VERSION_MINOR "${CMAKE_MATCH_1}")
+            set(_AMDSMI_VERSION_FOUND TRUE)
+        endif()
+    endif()
+endif()
+
+if(_AMDSMI_VERSION_FOUND)
+    message(
+        STATUS
+        "AMD SMI version detected: ${ROCPROFSYS_AMDSMI_VERSION_MAJOR}.${ROCPROFSYS_AMDSMI_VERSION_MINOR}"
+    )
+else()
+    message(STATUS "AMD SMI version not detected (checked ${_AMDSMI_HEADER})")
+endif()
+
+# AINIC requires AMD SMI >= 26.3 AND ROCPROFSYS_USE_AINIC option
+set(ROCPROFSYS_BUILD_AINIC OFF CACHE INTERNAL "Build AINIC support")
+if(NOT ROCPROFSYS_USE_AINIC)
+    message(STATUS "AINIC disabled: ROCPROFSYS_USE_AINIC is OFF")
+elseif(NOT _AMDSMI_VERSION_FOUND)
+    message(STATUS "AINIC disabled: AMD SMI version could not be determined")
+elseif(
+    ROCPROFSYS_AMDSMI_VERSION_MAJOR GREATER 26
+    OR (
+        ROCPROFSYS_AMDSMI_VERSION_MAJOR EQUAL 26
+        AND ROCPROFSYS_AMDSMI_VERSION_MINOR GREATER 2
+    )
+)
+    set(ROCPROFSYS_BUILD_AINIC ON CACHE INTERNAL "Build AINIC support" FORCE)
+    message(STATUS "AINIC support enabled (AMD SMI >= 26.3)")
+else()
+    message(
+        STATUS
+        "AINIC disabled: AMD SMI ${ROCPROFSYS_AMDSMI_VERSION_MAJOR}.${ROCPROFSYS_AMDSMI_VERSION_MINOR} < 26.3"
+    )
+endif()
+
 # ----------------------------------------------------------------------------------------#
 #
 # ROCpd
diff --git a/projects/rocprofiler-systems/source/lib/common/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/common/CMakeLists.txt
index 19f843b2460..966fd75c6d3 100644
--- 
a/projects/rocprofiler-systems/source/lib/common/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/lib/common/CMakeLists.txt @@ -26,6 +26,7 @@ target_sources( ${CMAKE_CURRENT_SOURCE_DIR}/invoke.hpp ${CMAKE_CURRENT_SOURCE_DIR}/join.hpp ${CMAKE_CURRENT_SOURCE_DIR}/setup.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/span.hpp ${CMAKE_CURRENT_SOURCE_DIR}/traits.hpp ${CMAKE_CURRENT_SOURCE_DIR}/md5sum.hpp ${CMAKE_CURRENT_SOURCE_DIR}/static_object.hpp diff --git a/projects/rocprofiler-systems/source/lib/core/perfetto.hpp b/projects/rocprofiler-systems/source/lib/core/perfetto.hpp index d2565a0712a..cb2fd9bb5ff 100644 --- a/projects/rocprofiler-systems/source/lib/core/perfetto.hpp +++ b/projects/rocprofiler-systems/source/lib/core/perfetto.hpp @@ -53,9 +53,10 @@ std::unique_ptr<::perfetto::TracingSession>& get_perfetto_session( template struct perfetto_counter_track { - using track_map_t = std::map>; - using name_map_t = std::map>>; - using data_t = std::pair; + using category_type = Tp; + using track_map_t = std::map>; + using name_map_t = std::map>>; + using data_t = std::pair; static auto init() { (void) get_data(); } static auto exists(size_t _idx, int64_t _n = -1); diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/cache_manager.hpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/cache_manager.hpp index deba7b8d85a..47201279fab 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/cache_manager.hpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/cache_manager.hpp @@ -26,9 +26,9 @@ #include "core/trace_cache/metadata_registry.hpp" #include "core/trace_cache/sample_type.hpp" #include "core/trace_cache/storage_parser.hpp" - +#include "library/pmc/collectors/gpu/sample.hpp" +#include "library/pmc/collectors/nic/sample.hpp" #include "library/runtime.hpp" - #include #include @@ -40,8 +40,9 @@ namespace trace_cache using storage_parser_t = storage_parser; + pmc_event_with_sample, pmc::collectors::gpu::sample, + 
pmc::collectors::nic::sample, cpu_freq_sample, backtrace_region_sample, + scratch_memory_sample>; using buffer_storage_t = buffer_storage; diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/cache_type_traits.hpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/cache_type_traits.hpp index 537cabf4015..361cece6ed6 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/cache_type_traits.hpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/cache_type_traits.hpp @@ -22,6 +22,7 @@ #pragma once #include "common/span.hpp" +#include #include #include #include @@ -104,6 +105,17 @@ struct is_vector> : std::true_type template inline constexpr bool is_vector_v = is_vector::value; +template +struct is_array : std::false_type +{}; + +template +struct is_array> : std::true_type +{}; + +template +inline constexpr bool is_array_v = is_array::value; + template static constexpr bool is_string_view_v = std::is_same_v, std::string_view>; @@ -122,7 +134,7 @@ inline constexpr bool is_optional_v = is_optional::value; template inline constexpr bool is_supported_type_v = is_span_v || std::is_integral_v || std::is_floating_point_v || - is_string_view_v || is_vector_v || is_optional_v; + is_string_view_v || is_vector_v || is_optional_v || is_array_v; template struct is_enum_class diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/metadata_registry.hpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/metadata_registry.hpp index e3c5bae4f99..7cf4325c2bf 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/metadata_registry.hpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/metadata_registry.hpp @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include #include @@ -62,18 +62,6 @@ struct process uint32_t end; }; -template -inline std::string -annotate_category(std::optional first_section = std::nullopt, - std::optional second_section = std::nullopt) -{ - 
std::stringstream ss; - ss << std::string(tim::trait::name::value); - if(first_section) ss << "_" << std::to_string(*first_section); - if(second_section) ss << "_" << std::to_string(*second_section); - return ss.str(); -} - struct pmc { agent_type type; @@ -131,16 +119,12 @@ struct thread template inline std::string -annotate_with_device_id(uint32_t device_id, - std::optional first_section = std::nullopt, - std::optional second_section = std::nullopt) +format_track_name(std::optional first_section = std::nullopt, + std::optional second_section = std::nullopt) { - std::stringstream ss; - ss << std::string(tim::trait::name::value) + " [" + - std::to_string(device_id) + "]"; - if(first_section) ss << "_" << std::to_string(*first_section); - if(second_section) ss << "_" << std::to_string(*second_section); - return ss.str(); + return fmt::format("{}{}{}", tim::trait::name::value, + first_section ? fmt::format("_{}", *first_section) : "", + second_section ? fmt::format("_{}", *second_section) : ""); } template diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.cpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.cpp index a733067311e..91e5cbaa817 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.cpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.cpp @@ -126,37 +126,119 @@ using amd_smi_nic_rx_ucast_pkts_track = using amd_smi_nic_tx_ucast_pkts_track = perfetto_counter_track; -void -setup_amd_smi_tracks(const uint32_t _device_id, bool is_busy_enabled, - bool is_temp_enabled, bool is_power_enabled, - bool is_mem_usage_enabled) +template +bool +ensure_gpu_track(uint32_t device_id, bool enabled, const char* track_suffix, + const char* units) { - if(amd_smi_gfx_track::exists(_device_id)) return; + if(!enabled) return false; + if(!Track::exists(device_id)) + Track::emplace(device_id, fmt::format("GPU [{}] {} (S)", device_id, track_suffix), 
+ units); + return true; +} - auto make_track_name = [&](const char* metric) { - return fmt::format("GPU [{}] {} (S)", _device_id, metric); - }; +template +void +emit_gpu_scalar(uint32_t device_id, size_t ts, bool enabled, const char* track_suffix, + const char* units, ValueT value) +{ + if(ensure_gpu_track(device_id, enabled, track_suffix, units)) + TRACE_COUNTER(trait::name::value, + Track::at(device_id, 0), ts, static_cast(value)); +} - if(is_busy_enabled) - { - amd_smi_gfx_track::emplace(_device_id, make_track_name("GFX Busy"), "%"); - amd_smi_umc_track::emplace(_device_id, make_track_name("UMC Busy"), "%"); - amd_smi_mm_track::emplace(_device_id, make_track_name("MM Busy"), "%"); - } - if(is_temp_enabled) - { - amd_smi_temp_track::emplace(_device_id, make_track_name("Temperature"), "deg C"); - } - if(is_power_enabled) +template +void +emit_xcp_array_metrics(uint32_t device_id, size_t ts, const char* metric_name, + const Array& data, std::optional xcp_idx, const Fn& emit) +{ + for(size_t i = 0; i < data.size(); ++i) { - amd_smi_power_track::emplace(_device_id, make_track_name("Power"), "W"); + const auto value = data[i]; + if(value == std::numeric_limits::max()) continue; + + std::string track_name; + if(xcp_idx.has_value()) + { + track_name = fmt::format("GPU [{}] {} XCP_{}: [{:02}] (S)", device_id, + metric_name, xcp_idx.value(), i); + } + else + { + track_name = + fmt::format("GPU [{}] {} [{:02}] (S)", device_id, metric_name, i); + } + + auto unique_key = (static_cast(device_id) << 16) | + (static_cast(xcp_idx.value_or(0)) << 8) | + static_cast(i); + + if(!Track::exists(unique_key)) + { + Track::emplace(unique_key, track_name, "%"); + } + emit(unique_key, ts, static_cast(value)); } - if(is_mem_usage_enabled) +} + +void +emit_xgmi_metrics(uint32_t device_id, size_t ts, const pmc::collectors::gpu::metrics& m) +{ + emit_gpu_scalar(device_id, ts, true, "XGMI Link Width", + "lanes", m.xgmi.link.width); + emit_gpu_scalar(device_id, ts, true, "XGMI Link Speed", + 
"Mbps", m.xgmi.link.speed); + + for(size_t link = 0; link < m.xgmi.data_acc.read.size(); ++link) { - amd_smi_mem_track::emplace(_device_id, make_track_name("Memory Usage"), "MB"); + const auto read_val = m.xgmi.data_acc.read[link]; + if(read_val != std::numeric_limits::max()) + { + auto unique_key = (device_id << 8) | link; + if(!amd_smi_xgmi_read_track::exists(unique_key)) + { + amd_smi_xgmi_read_track::emplace( + unique_key, + fmt::format("GPU [{}] XGMI Read Data [{:02}] (S)", device_id, link), + "KB"); + } + TRACE_COUNTER("device_xgmi_read_data", + amd_smi_xgmi_read_track::at(unique_key, 0), ts, + static_cast(read_val)); + } + + const auto write_val = m.xgmi.data_acc.write[link]; + if(write_val != std::numeric_limits::max()) + { + auto unique_key = (device_id << 8) | link; + if(!amd_smi_xgmi_write_track::exists(unique_key)) + { + amd_smi_xgmi_write_track::emplace( + unique_key, + fmt::format("GPU [{}] XGMI Write Data [{:02}] (S)", device_id, link), + "KB"); + } + TRACE_COUNTER("device_xgmi_write_data", + amd_smi_xgmi_write_track::at(unique_key, 0), ts, + static_cast(write_val)); + } } } +void +emit_pcie_metrics(uint32_t device_id, size_t ts, const pmc::collectors::gpu::metrics& m) +{ + emit_gpu_scalar(device_id, ts, true, "PCIe Link Width", + "lanes", m.pcie.link.width); + emit_gpu_scalar(device_id, ts, true, "PCIe Link Speed", + "MT/s", m.pcie.link.speed); + emit_gpu_scalar( + device_id, ts, true, "PCIe Bandwidth Acc", "bytes", m.pcie.bandwidth.acc); + emit_gpu_scalar( + device_id, ts, true, "PCIe Bandwidth Inst", "bytes/s", m.pcie.bandwidth.inst); +} + template void write_sampling_track_data(const struct backtrace_region_sample& _sample, @@ -1025,287 +1107,162 @@ perfetto_processor_t::handle([[maybe_unused]] const pmc_event_with_sample& _pmc) } void -perfetto_processor_t::handle([[maybe_unused]] const amd_smi_sample& _amd_smi) +perfetto_processor_t::handle([[maybe_unused]] const gpu_pmc_sample& _gpu_pmc) { - // Use the shared gpu_metrics_t from 
core/gpu_metrics.hpp - using gpu_metrics_t = gpu::gpu_metrics_t; - - using pos = trace_cache::amd_smi_sample::settings_positions; - std::bitset<16> settings_bits(_amd_smi.settings); - bool is_busy_enabled = settings_bits.test(static_cast(pos::busy)); - bool is_temp_enabled = settings_bits.test(static_cast(pos::temp)); - bool is_power_enabled = settings_bits.test(static_cast(pos::power)); - bool is_mem_usage_enabled = settings_bits.test(static_cast(pos::mem_usage)); - bool is_vcn_enabled = settings_bits.test(static_cast(pos::vcn_activity)); - bool is_jpeg_enabled = settings_bits.test(static_cast(pos::jpeg_activity)); - bool is_xgmi_enabled = settings_bits.test(static_cast(pos::xgmi)); - bool is_pcie_enabled = settings_bits.test(static_cast(pos::pcie)); - bool is_sdma_enabled = settings_bits.test(static_cast(pos::sdma_usage)); - - auto _ts = _amd_smi.timestamp; - auto _device_id = _amd_smi.device_id; - - setup_amd_smi_tracks(_device_id, is_busy_enabled, is_temp_enabled, is_power_enabled, - is_mem_usage_enabled); - - if(is_busy_enabled) + const auto _ts = _gpu_pmc.timestamp; + const auto _device_id = _gpu_pmc.device_id; + const auto& _em = _gpu_pmc.enabled_metric; + const auto& _m = _gpu_pmc.metric_values; + + // Scalar metrics + emit_gpu_scalar(_device_id, _ts, _em.bits.gfx_activity, "GFX Busy", + "%", _m.gfx_activity); + emit_gpu_scalar(_device_id, _ts, _em.bits.umc_activity, "UMC Busy", + "%", _m.umc_activity); + emit_gpu_scalar(_device_id, _ts, _em.bits.mm_activity, "MM Busy", + "%", _m.mm_activity); + + emit_gpu_scalar( + _device_id, _ts, _em.bits.hotspot_temperature || _em.bits.edge_temperature, + "Temperature", "deg C", + _em.bits.hotspot_temperature ? _m.hotspot_temperature : _m.edge_temperature); + + emit_gpu_scalar( + _device_id, _ts, _em.bits.current_socket_power || _em.bits.average_socket_power, + "Current Power", "watts", + _em.bits.average_socket_power ? 
_m.average_socket_power + : _m.current_socket_power); + + emit_gpu_scalar( + _device_id, _ts, _em.bits.memory_usage, "Memory Usage", "megabytes", + _m.memory_usage / static_cast(units::megabyte)); + + emit_gpu_scalar(_device_id, _ts, _em.bits.sdma_usage, + "SDMA Usage", "%", _m.sdma_usage); + + // Per-XCP VCN busy metrics (MI300) + if(_em.bits.vcn_busy) { - TRACE_COUNTER("device_busy_gfx", amd_smi_gfx_track::at(_device_id, 0), _ts, - _amd_smi.gfx_activity); - TRACE_COUNTER("device_busy_umc", amd_smi_umc_track::at(_device_id, 0), _ts, - _amd_smi.umc_activity); - TRACE_COUNTER("device_busy_mm", amd_smi_mm_track::at(_device_id, 0), _ts, - _amd_smi.mm_activity); - } - if(is_temp_enabled) - { - TRACE_COUNTER("device_temp", amd_smi_temp_track::at(_device_id, 0), _ts, - _amd_smi.temperature); - } - if(is_power_enabled) - { - TRACE_COUNTER("device_power", amd_smi_power_track::at(_device_id, 0), _ts, - _amd_smi.power); + for(size_t xcp = 0; xcp < _m.xcp_stats.size(); ++xcp) + { + emit_xcp_array_metrics( + _device_id, _ts, "VCN Busy", _m.xcp_stats[xcp].vcn_busy, xcp, + [](size_t key, size_t t, double v) { + TRACE_COUNTER("device_vcn_activity", amd_smi_vcn_track::at(key, 0), t, + v); + }); + } } - if(is_mem_usage_enabled) + + // Device-level VCN activity (Radeon) + if(_em.bits.vcn_activity) { - double mem_mb = _amd_smi.mem_usage / static_cast(units::megabyte); - TRACE_COUNTER("device_memory_usage", amd_smi_mem_track::at(_device_id, 0), _ts, - mem_mb); + emit_xcp_array_metrics( + _device_id, _ts, "VCN Activity", _m.vcn_activity, std::nullopt, + [](size_t key, size_t t, double v) { + TRACE_COUNTER("device_vcn_activity", amd_smi_vcn_track::at(key, 0), t, v); + }); } - if(!is_vcn_enabled && !is_jpeg_enabled && !is_xgmi_enabled && !is_pcie_enabled && - !is_sdma_enabled) - return; - - gpu_metrics_t gpu_metrics; - gpu::gpu_metrics_capabilities_t capabilities; - gpu::deserialize_gpu_metrics(_amd_smi.gpu_activity, gpu_metrics, is_vcn_enabled, - is_jpeg_enabled, is_xgmi_enabled, 
is_pcie_enabled, - capabilities); - - // Helper lambda to insert VCN/JPEG activity metrics - auto insert_decode_vector_metrics = [&](auto category, bool _is_enabled, - const std::vector& data, - std::optional _idx = std::nullopt) { - if(!_is_enabled) return; - - using Category = std::decay_t; - - const char* metric_name = nullptr; - if constexpr(std::is_same_v) - metric_name = "VCN Activity"; - else if constexpr(std::is_same_v) - metric_name = "JPEG Activity"; - else - metric_name = trait::name::value; - - for(size_t i = 0; i < data.size(); ++i) + // Per-XCP JPEG busy metrics (MI300) + if(_em.bits.jpeg_busy) + { + for(size_t xcp = 0; xcp < _m.xcp_stats.size(); ++xcp) { - const auto value = data[i]; - if(value == std::numeric_limits::max()) continue; - - std::string track_name; - if(_idx.has_value()) - { - // Per-XCP format - track_name = fmt::format("GPU [{}] {} XCP_{}: [{:02}] (S)", _device_id, - metric_name, _idx.value(), i); - } - else - { - // Device-level format - track_name = - fmt::format("GPU [{}] {} [{:02}] (S)", _device_id, metric_name, i); - } - - auto generate_track_key = [](uint32_t _dev_idx, size_t _xcp_idx, - size_t _clk_idx) { - return (static_cast(_dev_idx) << 16) | - (static_cast(_xcp_idx) << 8) | - static_cast(_clk_idx); - }; - - auto unique_key = generate_track_key(_device_id, _idx.value_or(0), i); - - if constexpr(std::is_same_v) - { - if(!amd_smi_vcn_track::exists(unique_key)) - { - amd_smi_vcn_track::emplace(unique_key, track_name, "%"); - } - TRACE_COUNTER("device_vcn_activity", amd_smi_vcn_track::at(unique_key, 0), - _ts, static_cast(value)); - } - else if constexpr(std::is_same_v) - { - if(!amd_smi_jpeg_track::exists(unique_key)) - { - amd_smi_jpeg_track::emplace(unique_key, track_name, "%"); - } - TRACE_COUNTER("device_jpeg_activity", - amd_smi_jpeg_track::at(unique_key, 0), _ts, - static_cast(value)); - } + emit_xcp_array_metrics( + _device_id, _ts, "JPEG Busy", _m.xcp_stats[xcp].jpeg_busy, xcp, + [](size_t key, size_t t, double v) { + 
TRACE_COUNTER("device_jpeg_activity", amd_smi_jpeg_track::at(key, 0), + t, v); + }); } - }; - - auto insert_xgmi_vector_metrics = [&](auto category, bool _is_enabled, - const std::vector& data) { - if(!_is_enabled) return; - - using Category = std::decay_t; + } - for(size_t i = 0; i < data.size(); ++i) - { - const auto value = data[i]; - if(value == std::numeric_limits::max()) continue; + // Device-level JPEG activity (Radeon) + if(_em.bits.jpeg_activity) + { + emit_xcp_array_metrics( + _device_id, _ts, "JPEG Activity", _m.jpeg_activity, std::nullopt, + [](size_t key, size_t t, double v) { + TRACE_COUNTER("device_jpeg_activity", amd_smi_jpeg_track::at(key, 0), t, + v); + }); + } - std::string track_name = fmt::format("GPU [{}] {} [{:02}] (S)", _device_id, - trait::name::value, i); + // Grouped interconnect metrics + if(_em.bits.xgmi) emit_xgmi_metrics(_device_id, _ts, _m); + if(_em.bits.pcie) emit_pcie_metrics(_device_id, _ts, _m); +} - auto unique_key = (_device_id << 8) | i; +void +perfetto_processor_t::handle([[maybe_unused]] const ainic_pmc_sample& _nic_sample) +{ + auto _ts = _nic_sample.timestamp; + auto _device_id = _nic_sample.device_id; - if constexpr(std::is_same_v) - { - if(!amd_smi_xgmi_read_track::exists(unique_key)) - { - amd_smi_xgmi_read_track::emplace(unique_key, track_name, "bytes"); - } - TRACE_COUNTER("device_xgmi_read_data", - amd_smi_xgmi_read_track::at(unique_key, 0), _ts, - static_cast(value)); - } - else if constexpr(std::is_same_v) - { - if(!amd_smi_xgmi_write_track::exists(unique_key)) - { - amd_smi_xgmi_write_track::emplace(unique_key, track_name, "bytes"); - } - TRACE_COUNTER("device_xgmi_write_data", - amd_smi_xgmi_write_track::at(unique_key, 0), _ts, - static_cast(value)); - } - } + // Helper to create track names + auto make_track_name = [&](const char* metric) { + return fmt::format("NIC [{}] {} (S)", _device_id, metric); }; - // Insert VCN activity metrics - if(capabilities.flags.vcn_is_device_level_only) - { - 
insert_decode_vector_metrics(category::amd_smi_vcn_activity{}, is_vcn_enabled, - gpu_metrics.vcn_activity, std::nullopt); - } - else + if(_nic_sample.enabled_metric.bits.rx_rdma_ucast_bytes) { - for(size_t xcp = 0; xcp < gpu_metrics.vcn_busy.size(); ++xcp) - { - insert_decode_vector_metrics(category::amd_smi_vcn_activity{}, is_vcn_enabled, - gpu_metrics.vcn_busy[xcp], xcp); - } + if(!amd_smi_nic_rx_ucast_bytes_track::exists(_device_id)) + amd_smi_nic_rx_ucast_bytes_track::emplace( + _device_id, make_track_name("RX RDMA Bytes"), "bytes"); + TRACE_COUNTER(trait::name::value, + amd_smi_nic_rx_ucast_bytes_track::at(_device_id, 0), _ts, + static_cast(_nic_sample.metric_values.rx_rdma_ucast_bytes)); } - // Insert JPEG activity metrics - if(capabilities.flags.jpeg_is_device_level_only) + if(_nic_sample.enabled_metric.bits.tx_rdma_ucast_bytes) { - insert_decode_vector_metrics(category::amd_smi_jpeg_activity{}, is_jpeg_enabled, - gpu_metrics.jpeg_activity, std::nullopt); + if(!amd_smi_nic_tx_ucast_bytes_track::exists(_device_id)) + amd_smi_nic_tx_ucast_bytes_track::emplace( + _device_id, make_track_name("TX RDMA Bytes"), "bytes"); + TRACE_COUNTER(trait::name::value, + amd_smi_nic_tx_ucast_bytes_track::at(_device_id, 0), _ts, + static_cast(_nic_sample.metric_values.tx_rdma_ucast_bytes)); } - else + + if(_nic_sample.enabled_metric.bits.rx_rdma_ucast_pkts) { - for(size_t xcp = 0; xcp < gpu_metrics.jpeg_busy.size(); ++xcp) - { - insert_decode_vector_metrics(category::amd_smi_jpeg_activity{}, - is_jpeg_enabled, gpu_metrics.jpeg_busy[xcp], - xcp); - } + if(!amd_smi_nic_rx_ucast_pkts_track::exists(_device_id)) + amd_smi_nic_rx_ucast_pkts_track::emplace( + _device_id, make_track_name("RX RDMA Packets"), "packets"); + TRACE_COUNTER(trait::name::value, + amd_smi_nic_rx_ucast_pkts_track::at(_device_id, 0), _ts, + static_cast(_nic_sample.metric_values.rx_rdma_ucast_pkts)); } - // Insert XGMI metrics - if(is_xgmi_enabled) + if(_nic_sample.enabled_metric.bits.tx_rdma_ucast_pkts) { - auto 
make_track_name = [&](const char* metric) { - return fmt::format("GPU [{}] {} (S)", _device_id, metric); - }; - - if(!amd_smi_xgmi_link_width_track::exists(_device_id)) - { - amd_smi_xgmi_link_width_track::emplace( - _device_id, make_track_name("XGMI Link Width"), ""); - } - TRACE_COUNTER("device_xgmi_link_width", - amd_smi_xgmi_link_width_track::at(_device_id, 0), _ts, - static_cast(gpu_metrics.xgmi_link_width)); - - if(!amd_smi_xgmi_link_speed_track::exists(_device_id)) - { - amd_smi_xgmi_link_speed_track::emplace( - _device_id, make_track_name("XGMI Link Speed"), "MT/s"); - } - TRACE_COUNTER("device_xgmi_link_speed", - amd_smi_xgmi_link_speed_track::at(_device_id, 0), _ts, - static_cast(gpu_metrics.xgmi_link_speed)); - - insert_xgmi_vector_metrics(category::amd_smi_xgmi_read_data{}, is_xgmi_enabled, - gpu_metrics.xgmi_read_data_acc); - - insert_xgmi_vector_metrics(category::amd_smi_xgmi_write_data{}, is_xgmi_enabled, - gpu_metrics.xgmi_write_data_acc); + if(!amd_smi_nic_tx_ucast_pkts_track::exists(_device_id)) + amd_smi_nic_tx_ucast_pkts_track::emplace( + _device_id, make_track_name("TX RDMA Packets"), "packets"); + TRACE_COUNTER(trait::name::value, + amd_smi_nic_tx_ucast_pkts_track::at(_device_id, 0), _ts, + static_cast(_nic_sample.metric_values.tx_rdma_ucast_pkts)); } - // Insert PCIe metrics - if(is_pcie_enabled) + if(_nic_sample.enabled_metric.bits.rx_rdma_cnp_pkts) { - auto make_track_name = [&](const char* metric) { - return fmt::format("GPU [{}] {} (S)", _device_id, metric); - }; - - if(!amd_smi_pcie_link_width_track::exists(_device_id)) - { - amd_smi_pcie_link_width_track::emplace( - _device_id, make_track_name("PCIe Link Width"), ""); - } - TRACE_COUNTER("device_pcie_link_width", - amd_smi_pcie_link_width_track::at(_device_id, 0), _ts, - static_cast(gpu_metrics.pcie_link_width)); - - if(!amd_smi_pcie_link_speed_track::exists(_device_id)) - { - amd_smi_pcie_link_speed_track::emplace( - _device_id, make_track_name("PCIe Link Speed"), "MT/s"); - } - 
TRACE_COUNTER("device_pcie_link_speed", - amd_smi_pcie_link_speed_track::at(_device_id, 0), _ts, - static_cast(gpu_metrics.pcie_link_speed)); - - if(!amd_smi_pcie_bandwidth_acc_track::exists(_device_id)) - { - amd_smi_pcie_bandwidth_acc_track::emplace( - _device_id, make_track_name("PCIe Bandwidth Acc"), "bytes"); - } - TRACE_COUNTER("device_pcie_bandwidth_acc", - amd_smi_pcie_bandwidth_acc_track::at(_device_id, 0), _ts, - static_cast(gpu_metrics.pcie_bandwidth_acc)); - - if(!amd_smi_pcie_bandwidth_inst_track::exists(_device_id)) - { - amd_smi_pcie_bandwidth_inst_track::emplace( - _device_id, make_track_name("PCIe Bandwidth Inst"), "bytes"); - } - TRACE_COUNTER("device_pcie_bandwidth_inst", - amd_smi_pcie_bandwidth_inst_track::at(_device_id, 0), _ts, - static_cast(gpu_metrics.pcie_bandwidth_inst)); + if(!amd_smi_nic_rx_cnp_pkts_track::exists(_device_id)) + amd_smi_nic_rx_cnp_pkts_track::emplace( + _device_id, make_track_name("RX CNP Packets"), "packets"); + TRACE_COUNTER(trait::name::value, + amd_smi_nic_rx_cnp_pkts_track::at(_device_id, 0), _ts, + static_cast(_nic_sample.metric_values.rx_rdma_cnp_pkts)); } - // Output SDMA usage - if(is_sdma_enabled) + if(_nic_sample.enabled_metric.bits.tx_rdma_cnp_pkts) { - if(!amd_smi_sdma_track::exists(_device_id)) - { - auto track_name = fmt::format("GPU [{}] SDMA Usage (S)", _device_id); - amd_smi_sdma_track::emplace(_device_id, track_name, "%"); - } - TRACE_COUNTER("device_sdma_usage", amd_smi_sdma_track::at(_device_id, 0), _ts, - static_cast(_amd_smi.sdma_usage)); + if(!amd_smi_nic_tx_cnp_pkts_track::exists(_device_id)) + amd_smi_nic_tx_cnp_pkts_track::emplace( + _device_id, make_track_name("TX CNP Packets"), "packets"); + TRACE_COUNTER(trait::name::value, + amd_smi_nic_tx_cnp_pkts_track::at(_device_id, 0), _ts, + static_cast(_nic_sample.metric_values.tx_rdma_cnp_pkts)); } } @@ -1321,68 +1278,5 @@ perfetto_processor_t::handle([[maybe_unused]] const in_time_sample& _sample) } } -void -perfetto_processor_t::handle(const 
ainic_sample& _ainic) -{ - auto _ts = _ainic.timestamp; - auto _nic_index = _ainic.nic_index; - - const auto& nic_agent = m_agent_manager.get_agent_by_id(_nic_index, agent_type::NIC); - const auto* nic_name = nic_agent.name.c_str(); - - if(!amd_smi_nic_rx_cnp_pkts_track::exists(_nic_index)) - { - amd_smi_nic_rx_cnp_pkts_track::emplace( - _nic_index, - info::annotate_with_nic(nic_name, - _nic_index), - "packets"); - amd_smi_nic_tx_cnp_pkts_track::emplace( - _nic_index, - info::annotate_with_nic(nic_name, - _nic_index), - "packets"); - amd_smi_nic_rx_ucast_bytes_track::emplace( - _nic_index, - info::annotate_with_nic(nic_name, - _nic_index), - "bytes"); - amd_smi_nic_tx_ucast_bytes_track::emplace( - _nic_index, - info::annotate_with_nic(nic_name, - _nic_index), - "bytes"); - amd_smi_nic_rx_ucast_pkts_track::emplace( - _nic_index, - info::annotate_with_nic(nic_name, - _nic_index), - "packets"); - amd_smi_nic_tx_ucast_pkts_track::emplace( - _nic_index, - info::annotate_with_nic(nic_name, - _nic_index), - "packets"); - } - - TRACE_COUNTER(trait::name::value, - amd_smi_nic_rx_cnp_pkts_track::at(_nic_index, 0), _ts, - static_cast(_ainic.rx_rdma_cnp_pkts)); - TRACE_COUNTER(trait::name::value, - amd_smi_nic_tx_cnp_pkts_track::at(_nic_index, 0), _ts, - static_cast(_ainic.tx_rdma_cnp_pkts)); - TRACE_COUNTER(trait::name::value, - amd_smi_nic_rx_ucast_bytes_track::at(_nic_index, 0), _ts, - static_cast(_ainic.rx_ucast_bytes)); - TRACE_COUNTER(trait::name::value, - amd_smi_nic_tx_ucast_bytes_track::at(_nic_index, 0), _ts, - static_cast(_ainic.tx_ucast_bytes)); - TRACE_COUNTER(trait::name::value, - amd_smi_nic_rx_ucast_pkts_track::at(_nic_index, 0), _ts, - static_cast(_ainic.rx_ucast_pkts)); - TRACE_COUNTER(trait::name::value, - amd_smi_nic_tx_ucast_pkts_track::at(_nic_index, 0), _ts, - static_cast(_ainic.tx_ucast_pkts)); -} - } // namespace trace_cache } // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.hpp 
b/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.hpp index 81cf7e030eb..7bf7956b813 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.hpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.hpp @@ -62,10 +62,10 @@ class perfetto_processor_t : public processor_t void handle(const region_sample& sample); void handle(const in_time_sample& sample); void handle(const pmc_event_with_sample& sample); - void handle(const amd_smi_sample& sample); + void handle(const gpu_pmc_sample& sample); + void handle(const ainic_pmc_sample& sample); void handle(const cpu_freq_sample& sample); void handle(const backtrace_region_sample& sample); - void handle(const ainic_sample& sample); private: void initialize_perfetto(); diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.cpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.cpp index 94da4b3b22a..9d02bbde34a 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.cpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.cpp @@ -320,216 +320,195 @@ rocpd_processor_t::handle(const pmc_event_with_sample& _pmc) } void -rocpd_processor_t::handle(const amd_smi_sample& _amd_smi) +rocpd_processor_t::handle([[maybe_unused]] const gpu_pmc_sample& _gpu_pmc) { const auto* _name = trait::name::value; auto name_primary_key = m_data_processor->insert_string(_name); auto event_id = m_data_processor->insert_event(name_primary_key, 0, 0, 0); auto base_id = - m_agent_manager->get_agent_by_type_index(_amd_smi.device_id, agent_type::GPU) + m_agent_manager->get_agent_by_type_index(_gpu_pmc.device_id, agent_type::GPU) .base_id; - auto insert_event_and_sample = [&](bool enabled, const char* pmc_name, - const char* track_name, double value) { + auto insert_metric = [&](bool enabled, const char* pmc_name, const char* track_name, + double value) { 
if(!enabled) return; m_data_processor->insert_pmc_event(event_id, base_id, pmc_name, value); - m_data_processor->insert_sample(track_name, _amd_smi.timestamp, event_id); + m_data_processor->insert_sample(track_name, _gpu_pmc.timestamp, event_id); }; - using pos = trace_cache::amd_smi_sample::settings_positions; - std::bitset<16> settings_bits(_amd_smi.settings); - bool is_busy_enabled = settings_bits.test(static_cast(pos::busy)); - bool is_temp_enabled = settings_bits.test(static_cast(pos::temp)); - bool is_power_enabled = settings_bits.test(static_cast(pos::power)); - bool is_mem_usage_enabled = settings_bits.test(static_cast(pos::mem_usage)); - - bool is_vcn_enabled = settings_bits.test(static_cast(pos::vcn_activity)); - bool is_jpeg_enabled = settings_bits.test(static_cast(pos::jpeg_activity)); - bool is_xgmi_enabled = settings_bits.test(static_cast(pos::xgmi)); - bool is_pcie_enabled = settings_bits.test(static_cast(pos::pcie)); - bool is_sdma_enabled = settings_bits.test(static_cast(pos::sdma_usage)); - - insert_event_and_sample( - is_busy_enabled, trait::name::value, - info::annotate_with_device_id(_amd_smi.device_id) - .c_str(), - _amd_smi.gfx_activity); - insert_event_and_sample( - is_busy_enabled, trait::name::value, - info::annotate_with_device_id(_amd_smi.device_id) - .c_str(), - _amd_smi.umc_activity); - insert_event_and_sample( - is_busy_enabled, trait::name::value, - info::annotate_with_device_id(_amd_smi.device_id) - .c_str(), - _amd_smi.mm_activity); - insert_event_and_sample( - is_temp_enabled, trait::name::value, - info::annotate_with_device_id(_amd_smi.device_id).c_str(), - _amd_smi.temperature); - - insert_event_and_sample( - is_power_enabled, trait::name::value, - info::annotate_with_device_id(_amd_smi.device_id) - .c_str(), - _amd_smi.power); - - auto mem_usage_mb = _amd_smi.mem_usage / static_cast(units::megabyte); - insert_event_and_sample( - is_mem_usage_enabled, trait::name::value, - info::annotate_with_device_id(_amd_smi.device_id) - 
.c_str(), - mem_usage_mb); - - // Insert SDMA usage metric (doesn't require gpu_metrics deserialization) - insert_event_and_sample( - is_sdma_enabled, trait::name::value, - info::annotate_with_device_id(_amd_smi.device_id) - .c_str(), - static_cast(_amd_smi.sdma_usage)); - - if(!is_vcn_enabled && !is_jpeg_enabled && !is_xgmi_enabled && !is_pcie_enabled) - return; - - gpu::gpu_metrics_t gpu_metrics; - gpu::gpu_metrics_capabilities_t capabilities; - gpu::deserialize_gpu_metrics(_amd_smi.gpu_activity, gpu_metrics, is_vcn_enabled, - is_jpeg_enabled, is_xgmi_enabled, is_pcie_enabled, - capabilities); - - // Insert VCN and JPEG activity metrics - auto insert_decode_vector_metrics = [&](auto category, bool _is_enabled, - const std::vector& data, - std::optional _idx = std::nullopt) { - if(!_is_enabled) return; + const auto& m = _gpu_pmc.metric_values; + const auto& enabled = _gpu_pmc.enabled_metric; - using Category = std::decay_t; + auto insert_scalar = [&](const char* name, const std::string& track, bool is_enabled, + double value) { + insert_metric(is_enabled, name, track.c_str(), value); + }; - for(size_t i = 0; i < data.size(); ++i) + insert_scalar(trait::name::value, + info::format_track_name(), + enabled.bits.gfx_activity, m.gfx_activity); + insert_scalar(trait::name::value, + info::format_track_name(), + enabled.bits.umc_activity, m.umc_activity); + insert_scalar(trait::name::value, + info::format_track_name(), + enabled.bits.mm_activity, m.mm_activity); + insert_scalar(trait::name::value, + info::format_track_name(), + enabled.bits.hotspot_temperature, m.hotspot_temperature); + insert_scalar(trait::name::value, + info::format_track_name(), + enabled.bits.current_socket_power || enabled.bits.average_socket_power, + enabled.bits.current_socket_power ? 
m.current_socket_power + : m.average_socket_power); + insert_scalar(trait::name::value, + info::format_track_name(), + enabled.bits.memory_usage, m.memory_usage / units::megabyte); + insert_scalar(trait::name::value, + info::format_track_name(), + enabled.bits.sdma_usage, m.sdma_usage); + + auto insert_xcp_metrics = [&](const char* base_name, const std::string& base_track, + bool is_enabled, const auto& get_array) { + if(!is_enabled) return; + for(size_t xcp = 0; xcp < m.xcp_stats.size(); ++xcp) { - const auto value = data[i]; - if(value == std::numeric_limits::max()) continue; - - auto pmc_name = info::annotate_category(_idx, i); - auto track_name = - info::annotate_with_device_id(_amd_smi.device_id, _idx, i); - - insert_event_and_sample(_is_enabled, pmc_name.c_str(), track_name.c_str(), - static_cast(value)); + const auto& arr = get_array(m.xcp_stats[xcp]); + for(size_t i = 0; i < arr.size(); ++i) + { + auto suffix = + "_xcp" + std::to_string(xcp) + "[" + std::to_string(i) + "]"; + auto pmc_name = std::string(base_name) + suffix; + auto track_name = base_track + suffix; + insert_metric(true, pmc_name.c_str(), track_name.c_str(), arr[i]); + } } }; - // Insert XGMI read/write data metrics - auto insert_xgmi_vector_metrics = [&](auto category, bool _is_enabled, - const std::vector& data, - std::optional _idx = std::nullopt) { - if(!_is_enabled) return; - - using Category = std::decay_t; - - for(size_t i = 0; i < data.size(); ++i) + insert_xcp_metrics(trait::name::value, + info::format_track_name(), + enabled.bits.vcn_busy, + [](const auto& xcp) -> const auto& { return xcp.vcn_busy; }); + insert_xcp_metrics(trait::name::value, + info::format_track_name(), + enabled.bits.jpeg_busy, + [](const auto& xcp) -> const auto& { return xcp.jpeg_busy; }); + + auto insert_device_level_metrics = [&](const std::string_view base_name, + bool is_enabled, const auto& arr) { + if(!is_enabled) return; + for(size_t i = 0; i < arr.size(); ++i) { - const auto value = data[i]; - if(value 
== std::numeric_limits::max()) continue; - - auto pmc_name = info::annotate_category(_idx, i); - auto track_name = - info::annotate_with_device_id(_amd_smi.device_id, _idx, i); + auto suffix = "_" + std::to_string(i); + auto pmc_name = std::string(base_name) + suffix; + auto track_name = pmc_name; - insert_event_and_sample(_is_enabled, pmc_name.c_str(), track_name.c_str(), - static_cast(value)); + LOG_TRACE("Inserting metric: pmc_name: {}, track_name: {}, value: {}", + pmc_name, track_name, arr[i]); + insert_metric(true, pmc_name.c_str(), track_name.c_str(), arr[i]); } }; - // Insert VCN activity metrics - if(capabilities.flags.vcn_is_device_level_only) - { - // Device-level: use vcn_activity vector - insert_decode_vector_metrics(category::amd_smi_vcn_activity{}, is_vcn_enabled, - gpu_metrics.vcn_activity, std::nullopt); - } - else - { - // Per-XCP: iterate through actual XCPs in vcn_busy - for(size_t xcp = 0; xcp < gpu_metrics.vcn_busy.size(); ++xcp) + insert_device_level_metrics(info::format_track_name(), + enabled.bits.vcn_activity, m.vcn_activity); + + insert_device_level_metrics( + info::format_track_name(), + enabled.bits.jpeg_activity, m.jpeg_activity); + + insert_scalar(trait::name::value, + info::format_track_name(), + enabled.bits.pcie, m.pcie.link.width); + insert_scalar(trait::name::value, + info::format_track_name(), + enabled.bits.pcie, m.pcie.link.speed); + insert_scalar(trait::name::value, + info::format_track_name(), + enabled.bits.pcie, m.pcie.bandwidth.acc); + insert_scalar(trait::name::value, + info::format_track_name(), + enabled.bits.pcie, m.pcie.bandwidth.inst); + + // XGMI metrics + insert_scalar(trait::name::value, + info::format_track_name(), + enabled.bits.xgmi, m.xgmi.link.width); + insert_scalar(trait::name::value, + info::format_track_name(), + enabled.bits.xgmi, m.xgmi.link.speed); + + // XGMI data accumulators (per-link arrays) + auto insert_xgmi_link_metrics = [&](const std::string& base_track_name, + bool is_enabled, const auto& 
arr) { + if(!is_enabled) return; + for(size_t i = 0; i < arr.size(); ++i) { - insert_decode_vector_metrics(category::amd_smi_vcn_activity{}, is_vcn_enabled, - gpu_metrics.vcn_busy[xcp], xcp); - } - } + if(arr[i] == pmc::collectors::gpu::METRIC_VALUE_NOT_SUPPORTED_64) continue; - // Insert JPEG activity metrics - if(capabilities.flags.jpeg_is_device_level_only) - { - // Device-level: use jpeg_activity vector - insert_decode_vector_metrics(category::amd_smi_jpeg_activity{}, is_jpeg_enabled, - gpu_metrics.jpeg_activity, std::nullopt); - } - else - { - // Per-XCP: iterate through actual XCPs in jpeg_busy - for(size_t xcp = 0; xcp < gpu_metrics.jpeg_busy.size(); ++xcp) - { - insert_decode_vector_metrics(category::amd_smi_jpeg_activity{}, - is_jpeg_enabled, gpu_metrics.jpeg_busy[xcp], - xcp); + std::string pmc_name = base_track_name + "_link" + std::to_string(i); + std::string track_name = + base_track_name + " [Link " + std::to_string(i) + "]"; + insert_metric(true, pmc_name.c_str(), track_name.c_str(), arr[i]); } - } + }; + + insert_xgmi_link_metrics(trait::name::value, + enabled.bits.xgmi, m.xgmi.data_acc.read); + insert_xgmi_link_metrics(trait::name::value, + enabled.bits.xgmi, m.xgmi.data_acc.write); +} + +void +rocpd_processor_t::handle([[maybe_unused]] const ainic_pmc_sample& _nic_sample) +{ + // Insert NIC RDMA metrics into rocpd database + const auto* _name = "ainic"; + auto name_primary_key = m_data_processor->insert_string(_name); + auto event_id = m_data_processor->insert_event(name_primary_key, 0, 0, 0); + + // We should create a cache for this in the future + auto base_id = + m_agent_manager->get_agent_by_type_index(_nic_sample.device_id, agent_type::NIC) + .base_id; + + auto insert_metric = [&](bool enabled, const char* pmc_name, const char* track_name, + uint64_t value) { + if(!enabled) return; + + LOG_TRACE("Inserting metric: pmc_name: {}, track_name: {}, value: {}", pmc_name, + track_name, value); + + m_data_processor->insert_pmc_event(event_id, 
base_id, pmc_name, + static_cast(value)); + m_data_processor->insert_sample(track_name, _nic_sample.timestamp, event_id); + }; - // Insert XGMI metrics (scalar values) - insert_event_and_sample( - is_xgmi_enabled, trait::name::value, - info::annotate_with_device_id( - _amd_smi.device_id) - .c_str(), - gpu_metrics.xgmi_link_width); - - insert_event_and_sample( - is_xgmi_enabled, trait::name::value, - info::annotate_with_device_id( - _amd_smi.device_id) - .c_str(), - gpu_metrics.xgmi_link_speed); - - insert_xgmi_vector_metrics(category::amd_smi_xgmi_read_data{}, is_xgmi_enabled, - gpu_metrics.xgmi_read_data_acc, std::nullopt); - - insert_xgmi_vector_metrics(category::amd_smi_xgmi_write_data{}, is_xgmi_enabled, - gpu_metrics.xgmi_write_data_acc, std::nullopt); - - insert_event_and_sample( - is_pcie_enabled, trait::name::value, - info::annotate_with_device_id( - _amd_smi.device_id) - .c_str(), - gpu_metrics.pcie_link_width); - - insert_event_and_sample( - is_pcie_enabled, trait::name::value, - info::annotate_with_device_id( - _amd_smi.device_id) - .c_str(), - gpu_metrics.pcie_link_speed); - - insert_event_and_sample( - is_pcie_enabled, trait::name::value, - info::annotate_with_device_id( - _amd_smi.device_id) - .c_str(), - static_cast(gpu_metrics.pcie_bandwidth_acc)); - - insert_event_and_sample( - is_pcie_enabled, trait::name::value, - info::annotate_with_device_id( - _amd_smi.device_id) - .c_str(), - static_cast(gpu_metrics.pcie_bandwidth_inst)); + const auto& m = _nic_sample.metric_values; + const auto& enabled = _nic_sample.enabled_metric; + + insert_metric(enabled.bits.rx_rdma_ucast_bytes, + trait::name::value, + "ainic_rx_rdma_ucast_bytes", m.rx_rdma_ucast_bytes); + insert_metric(enabled.bits.tx_rdma_ucast_bytes, + trait::name::value, + "ainic_tx_rdma_ucast_bytes", m.tx_rdma_ucast_bytes); + insert_metric(enabled.bits.rx_rdma_ucast_pkts, + trait::name::value, + "ainic_rx_rdma_ucast_pkts", m.rx_rdma_ucast_pkts); + insert_metric(enabled.bits.tx_rdma_ucast_pkts, + 
trait::name::value, + "ainic_tx_rdma_ucast_pkts", m.tx_rdma_ucast_pkts); + insert_metric(enabled.bits.rx_rdma_cnp_pkts, + trait::name::value, + "ainic_rx_rdma_cnp_pkts", m.rx_rdma_cnp_pkts); + insert_metric(enabled.bits.tx_rdma_cnp_pkts, + trait::name::value, + "ainic_tx_rdma_cnp_pkts", m.tx_rdma_cnp_pkts); } void -rocpd_processor_t::handle(const cpu_freq_sample& _cpu_freq_sample) +rocpd_processor_t::handle([[maybe_unused]] const cpu_freq_sample& _cpu_freq_sample) { struct core_freq_sample { @@ -599,62 +578,6 @@ rocpd_processor_t::handle(const cpu_freq_sample& _cpu_freq_sample) } } -void -rocpd_processor_t::handle(const ainic_sample& _ainic) -{ - const auto* _category_name = trait::name::value; - auto name_primary_key = m_data_processor->insert_string(_category_name); - auto event_id = m_data_processor->insert_event(name_primary_key, 0, 0, 0); - - const auto& nic_agent = - m_agent_manager->get_agent_by_id(_ainic.nic_index, agent_type::NIC); - - const auto base_id = nic_agent.base_id; - const char* nic_name = nic_agent.name.c_str(); - - auto insert_event_and_sample = [&](const char* pmc_descriptor, const char* track_name, - double value) { - m_data_processor->insert_pmc_event(event_id, base_id, pmc_descriptor, value); - m_data_processor->insert_sample(track_name, _ainic.timestamp, event_id); - }; - - insert_event_and_sample(trait::name::value, - info::annotate_with_nic( - nic_name, _ainic.nic_index) - .c_str(), - _ainic.rx_rdma_cnp_pkts); - - insert_event_and_sample(trait::name::value, - info::annotate_with_nic( - nic_name, _ainic.nic_index) - .c_str(), - _ainic.tx_rdma_cnp_pkts); - - insert_event_and_sample(trait::name::value, - info::annotate_with_nic( - nic_name, _ainic.nic_index) - .c_str(), - _ainic.rx_ucast_bytes); - - insert_event_and_sample(trait::name::value, - info::annotate_with_nic( - nic_name, _ainic.nic_index) - .c_str(), - _ainic.tx_ucast_bytes); - - insert_event_and_sample(trait::name::value, - info::annotate_with_nic( - nic_name, _ainic.nic_index) - 
.c_str(), - _ainic.rx_ucast_pkts); - - insert_event_and_sample(trait::name::value, - info::annotate_with_nic( - nic_name, _ainic.nic_index) - .c_str(), - _ainic.tx_ucast_pkts); -} - rocpd_processor_t::rocpd_processor_t(const std::shared_ptr& md, const std::shared_ptr& agent_mngr, int pid, int ppid) @@ -858,6 +781,9 @@ rocpd_processor_t::post_process_metadata() target_arch = nullptr; } + LOG_TRACE("Inserting PMC description: agent_primary_key: {}, pmc_info: {}", + agent_primary_key, pmc_info.name); + m_data_processor->insert_pmc_description( n_info.id, process_info.pid, agent_primary_key, target_arch, pmc_info.event_code, pmc_info.instance_id, pmc_info.name.c_str(), diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.hpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.hpp index c12dbce25ce..501fc10d122 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.hpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.hpp @@ -51,10 +51,10 @@ class rocpd_processor_t : public processor_t void handle(const region_sample& sample); void handle(const in_time_sample& sample); void handle(const pmc_event_with_sample& sample); - void handle(const amd_smi_sample& sample); + void handle(const gpu_pmc_sample& sample); + void handle(const ainic_pmc_sample& sample); void handle(const cpu_freq_sample& sample); void handle(const backtrace_region_sample& sample); - void handle(const ainic_sample& sample); private: using primary_key = size_t; diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_processor.hpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_processor.hpp index c30052bfeb2..844cb73a22d 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_processor.hpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_processor.hpp @@ -21,9 +21,13 @@ // SOFTWARE. 
#pragma once +#include "common/defines.h" #include "core/trace_cache/cacheable.hpp" #include "core/trace_cache/sample_type.hpp" +#include "library/pmc/collectors/gpu/sample.hpp" +#include "library/pmc/collectors/nic/sample.hpp" + #include #include @@ -67,7 +71,9 @@ struct processor_t static_cast(this)->handle(sample); } - void handle(const amd_smi_sample& sample) { static_cast(this)->handle(sample); } + void handle(const gpu_pmc_sample& sample) { static_cast(this)->handle(sample); } + + void handle(const ainic_pmc_sample& sample) { static_cast(this)->handle(sample); } void handle(const cpu_freq_sample& sample) { static_cast(this)->handle(sample); } @@ -76,8 +82,6 @@ struct processor_t static_cast(this)->handle(sample); } - void handle(const ainic_sample& sample) { static_cast(this)->handle(sample); } - void prepare_for_processing() { static_cast(this)->prepare_for_processing(); } void finalize_processing() { static_cast(this)->finalize_processing(); } @@ -97,11 +101,11 @@ struct processor_view_t using region_fn_t = void (*)(void*, const region_sample&) noexcept; using in_time_sample_fn_t = void (*)(void*, const in_time_sample&) noexcept; using pmc_event_fn_t = void (*)(void*, const pmc_event_with_sample&) noexcept; - using amd_smi_sample_fn_t = void (*)(void*, const amd_smi_sample&) noexcept; + using gpu_pmc_sample_fn_t = void (*)(void*, const gpu_pmc_sample&) noexcept; + using ainic_pmc_sample_fn_t = void (*)(void*, const ainic_pmc_sample&) noexcept; using cpu_freq_sample_fn_t = void (*)(void*, const cpu_freq_sample&) noexcept; using backtrace_region_fn_t = void (*)(void*, const backtrace_region_sample&) noexcept; - using ainic_sample_fn_t = void (*)(void*, const ainic_sample&) noexcept; using prepare_for_processing_fn_t = void (*)(void*) noexcept; using finalize_processing_fn_t = void (*)(void*) noexcept; @@ -116,10 +120,10 @@ struct processor_view_t region_fn_t handle_region; in_time_sample_fn_t handle_in_time_sample; pmc_event_fn_t handle_pmc_event; - 
amd_smi_sample_fn_t handle_amd_smi_sample; + gpu_pmc_sample_fn_t handle_gpu_pmc_sample; + ainic_pmc_sample_fn_t handle_ainic_pmc_sample; cpu_freq_sample_fn_t handle_cpu_freq_sample; backtrace_region_fn_t handle_backtrace_region; - ainic_sample_fn_t handle_ainic_sample; prepare_for_processing_fn_t prepare_for_processing; finalize_processing_fn_t finalize_processing; }; @@ -174,10 +178,14 @@ struct processor_view_t { m_vtable->handle_pmc_event(m_object, sample); } + ROCPROFSYS_INLINE void handle(const gpu_pmc_sample& sample) const noexcept + { + m_vtable->handle_gpu_pmc_sample(m_object, sample); + } - ROCPROFSYS_INLINE void handle(const amd_smi_sample& sample) const noexcept + ROCPROFSYS_INLINE void handle(const ainic_pmc_sample& sample) const noexcept { - m_vtable->handle_amd_smi_sample(m_object, sample); + m_vtable->handle_ainic_pmc_sample(m_object, sample); } ROCPROFSYS_INLINE void handle(const cpu_freq_sample& sample) const noexcept @@ -190,11 +198,6 @@ struct processor_view_t m_vtable->handle_backtrace_region(m_object, sample); } - ROCPROFSYS_INLINE void handle(const ainic_sample& sample) const noexcept - { - m_vtable->handle_ainic_sample(m_object, sample); - } - ROCPROFSYS_INLINE void prepare_for_processing() const noexcept { m_vtable->prepare_for_processing(m_object); @@ -233,16 +236,16 @@ struct processor_view_t +[](void* obj, const pmc_event_with_sample& sample) noexcept { static_cast(obj)->handle(sample); }, - +[](void* obj, const amd_smi_sample& sample) noexcept { + +[](void* obj, const gpu_pmc_sample& sample) noexcept { static_cast(obj)->handle(sample); }, - +[](void* obj, const cpu_freq_sample& sample) noexcept { + +[](void* obj, const ainic_pmc_sample& sample) noexcept { static_cast(obj)->handle(sample); }, - +[](void* obj, const backtrace_region_sample& sample) noexcept { + +[](void* obj, const cpu_freq_sample& sample) noexcept { static_cast(obj)->handle(sample); }, - +[](void* obj, const ainic_sample& sample) noexcept { + +[](void* obj, const 
backtrace_region_sample& sample) noexcept { static_cast(obj)->handle(sample); }, +[](void* obj) noexcept { static_cast(obj)->prepare_for_processing(); }, @@ -317,8 +320,11 @@ struct sample_processor_t case type_identifier_t::pmc_event_with_sample: handle_sample(static_cast(sample)); break; - case type_identifier_t::amd_smi_sample: - handle_sample(static_cast(sample)); + case type_identifier_t::gpu_pmc_sample: + handle_sample(static_cast(sample)); + break; + case type_identifier_t::ainic_pmc_sample: + handle_sample(static_cast(sample)); break; case type_identifier_t::cpu_freq_sample: handle_sample(static_cast(sample)); @@ -326,9 +332,6 @@ struct sample_processor_t case type_identifier_t::backtrace_region_sample: handle_sample(static_cast(sample)); break; - case type_identifier_t::ainic_sample: - handle_sample(static_cast(sample)); - break; default: throw std::runtime_error("Unsupported sample type"); } } diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_type.hpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_type.hpp index 5c4d1100239..3987e07c866 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_type.hpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_type.hpp @@ -43,11 +43,11 @@ enum class type_identifier_t : uint32_t kernel_dispatch = 0x0003, memory_copy = 0x0004, memory_alloc = 0x0005, - amd_smi_sample = 0x0006, + gpu_pmc_sample = 0x0006, cpu_freq_sample = 0x0007, backtrace_region_sample = 0x0008, scratch_memory = 0x0009, - ainic_sample = 0x000A, + ainic_pmc_sample = 0x000A, fragmented_space = 0xFFFF }; @@ -613,154 +613,6 @@ get_size(const pmc_event_with_sample& item) std::string_view(item.pmc_info_name), item.value, item.system_tid); } -struct amd_smi_sample : cacheable_t -{ - static constexpr type_identifier_t type_identifier = - type_identifier_t::amd_smi_sample; - - amd_smi_sample() = default; - amd_smi_sample(uint64_t _settings, uint32_t _device_id, size_t 
_timestamp, - uint32_t _gfx_activity, uint32_t _umc_activity, uint32_t _mm_activity, - uint32_t _power, int64_t _temperature, size_t _mem_usage, - std::vector _gpu_activity, uint32_t _sdma_usage = 0) - : settings(_settings) - , device_id(_device_id) - , timestamp(_timestamp) - , gfx_activity(_gfx_activity) - , umc_activity(_umc_activity) - , mm_activity(_mm_activity) - , power(_power) - , temperature(_temperature) - , mem_usage(_mem_usage) - , gpu_activity(std::move(_gpu_activity)) - , sdma_usage(_sdma_usage) - {} - - enum class settings_positions : uint8_t - { - busy = 0, - temp, - power, - mem_usage, - vcn_activity, - jpeg_activity, - xgmi, - pcie, - sdma_usage - }; - - uint64_t settings; // bitfield - uint32_t device_id; - size_t timestamp; - uint32_t gfx_activity; - uint32_t umc_activity; - uint32_t mm_activity; - uint32_t power; - int64_t temperature; - size_t mem_usage; - std::vector gpu_activity; - uint32_t sdma_usage = 0; // SDMA utilization percentage (0-100) -}; - -template <> -inline void -serialize(uint8_t* buffer, const amd_smi_sample& item) -{ - utility::store_value(buffer, item.settings, item.device_id, - static_cast(item.timestamp), item.gfx_activity, - item.umc_activity, item.mm_activity, item.power, - item.temperature, static_cast(item.mem_usage), - item.gpu_activity, item.sdma_usage); -} - -template <> -inline amd_smi_sample -deserialize(uint8_t*& buffer) -{ - amd_smi_sample item; - uint64_t timestamp, mem_usage; - utility::parse_value(buffer, item.settings, item.device_id, timestamp, - item.gfx_activity, item.umc_activity, item.mm_activity, - item.power, item.temperature, mem_usage, item.gpu_activity, - item.sdma_usage); - item.timestamp = timestamp; - item.mem_usage = mem_usage; - return item; -} - -template <> -inline size_t -get_size(const amd_smi_sample& item) -{ - return utility::get_size(item.settings, item.device_id, - static_cast(item.timestamp), item.gfx_activity, - item.umc_activity, item.mm_activity, item.power, - item.temperature, 
static_cast(item.mem_usage), - item.gpu_activity, item.sdma_usage); -} - -struct ainic_sample : cacheable_t -{ - static constexpr type_identifier_t type_identifier = type_identifier_t::ainic_sample; - - ainic_sample() = default; - ainic_sample(size_t _timestamp, uint32_t _nic_index, uint64_t _rx_rdma_cnp_pkts, - uint64_t _tx_rdma_cnp_pkts, uint64_t _rx_ucast_bytes, - uint64_t _tx_ucast_bytes, uint64_t _rx_ucast_pkts, - uint64_t _tx_ucast_pkts) - : timestamp(_timestamp) - , nic_index(_nic_index) - , rx_rdma_cnp_pkts(_rx_rdma_cnp_pkts) - , tx_rdma_cnp_pkts(_tx_rdma_cnp_pkts) - , rx_ucast_bytes(_rx_ucast_bytes) - , tx_ucast_bytes(_tx_ucast_bytes) - , rx_ucast_pkts(_rx_ucast_pkts) - , tx_ucast_pkts(_tx_ucast_pkts) - {} - - size_t timestamp; - uint32_t nic_index; - uint64_t rx_rdma_cnp_pkts; - uint64_t tx_rdma_cnp_pkts; - uint64_t rx_ucast_bytes; - uint64_t tx_ucast_bytes; - uint64_t rx_ucast_pkts; - uint64_t tx_ucast_pkts; -}; - -template <> -inline void -serialize(uint8_t* buffer, const ainic_sample& item) -{ - utility::store_value(buffer, static_cast(item.timestamp), item.nic_index, - item.rx_rdma_cnp_pkts, item.tx_rdma_cnp_pkts, - item.rx_ucast_bytes, item.tx_ucast_bytes, item.rx_ucast_pkts, - item.tx_ucast_pkts); -} - -template <> -inline ainic_sample -deserialize(uint8_t*& buffer) -{ - ainic_sample item; - uint64_t timestamp; - utility::parse_value(buffer, timestamp, item.nic_index, item.rx_rdma_cnp_pkts, - item.tx_rdma_cnp_pkts, item.rx_ucast_bytes, item.tx_ucast_bytes, - item.rx_ucast_pkts, item.tx_ucast_pkts); - item.timestamp = timestamp; - return item; -} - -template <> -inline size_t -get_size(const ainic_sample& item) -{ - return utility::get_size(static_cast(item.timestamp), item.nic_index, - item.rx_rdma_cnp_pkts, item.tx_rdma_cnp_pkts, - item.rx_ucast_bytes, item.tx_ucast_bytes, item.rx_ucast_pkts, - item.tx_ucast_pkts); -} - struct cpu_freq_sample : cacheable_t { static constexpr type_identifier_t type_identifier = diff --git 
a/projects/rocprofiler-systems/source/lib/core/trace_cache/tests/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/core/trace_cache/tests/CMakeLists.txt index ce20bd03dfa..ce9056be348 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/tests/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/tests/CMakeLists.txt @@ -28,6 +28,7 @@ set(trace_cache_tests_sources test_flush_worker.cpp test_cache_integration.cpp test_sample_type.cpp + test_rocpd_xcp_output.cpp ) add_library(trace-cache-tests OBJECT ${trace_cache_tests_sources}) diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/tests/test_rocpd_xcp_output.cpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/tests/test_rocpd_xcp_output.cpp new file mode 100644 index 00000000000..95539c07494 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/tests/test_rocpd_xcp_output.cpp @@ -0,0 +1,396 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +// +// Unit tests for XCP stats output naming and gating logic. +// +// These tests replicate the XCP naming/gating logic from perfetto_processor.cpp +// and rocpd_processor.cpp using mock types, verifying correctness without heavy +// Perfetto/database/AMD-SMI dependencies. +// + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace +{ + +// ────────────────────────────────────────────────────────────────── +// Mock types mirroring the production types in +// library/pmc/collectors/gpu/types.hpp +// These are self-contained to avoid AMD SMI SDK dependency. 
+// ────────────────────────────────────────────────────────────────── + +constexpr size_t MAX_NUM_VCN = 4; +constexpr size_t MAX_NUM_JPEG = 32; +constexpr size_t MAX_NUM_XCP = 8; + +union mock_enabled_metrics +{ + struct + { + uint32_t current_socket_power : 1; // Bit 0 + uint32_t average_socket_power : 1; // Bit 1 + uint32_t memory_usage : 1; // Bit 2 + uint32_t hotspot_temperature : 1; // Bit 3 + uint32_t edge_temperature : 1; // Bit 4 + uint32_t gfx_activity : 1; // Bit 5 + uint32_t umc_activity : 1; // Bit 6 + uint32_t mm_activity : 1; // Bit 7 + uint32_t vcn_activity : 1; // Bit 8 - Device-level VCN (Radeon) + uint32_t jpeg_activity : 1; // Bit 9 - Device-level JPEG (Radeon) + uint32_t vcn_busy : 1; // Bit 10 - Per-XCP VCN (MI300) + uint32_t jpeg_busy : 1; // Bit 11 - Per-XCP JPEG (MI300) + uint32_t xgmi : 1; // Bit 12 + uint32_t pcie : 1; // Bit 13 + uint32_t sdma_usage : 1; // Bit 14 + } bits; + uint32_t value = 0; +}; + +struct mock_xcp_metrics +{ + std::array jpeg_busy; + std::array vcn_busy; +}; + +struct mock_metrics +{ + std::array xcp_stats; + std::array vcn_activity = {}; + std::array jpeg_activity = {}; +}; + +// ────────────────────────────────────────────────────────────────── +// Logic under test — replicated from production code +// ────────────────────────────────────────────────────────────────── + +struct rocpd_xcp_entry +{ + std::string pmc_name; + std::string track_name; + double value; +}; + +// Mirrors insert_xcp_metrics lambda in rocpd_processor.cpp (lines 372-387) +template +std::vector +generate_xcp_metrics(const char* base_name, const std::string& base_track, + bool is_enabled, const mock_metrics& m, GetArrayFn&& get_array) +{ + std::vector entries; + if(!is_enabled) return entries; + for(size_t xcp = 0; xcp < m.xcp_stats.size(); ++xcp) + { + const auto& arr = get_array(m.xcp_stats[xcp]); + for(size_t i = 0; i < arr.size(); ++i) + { + auto suffix = "_xcp" + std::to_string(xcp) + "[" + std::to_string(i) + "]"; + auto pmc_name = 
std::string(base_name) + suffix; + auto track_name = base_track + suffix; + entries.push_back({ pmc_name, track_name, static_cast(arr[i]) }); + } + } + return entries; +} + +// Mirrors insert_device_level_metrics lambda in rocpd_processor.cpp (lines 398-411) +template +std::vector +generate_device_level_metrics(const std::string& base_name, bool is_enabled, + const ArrayT& arr) +{ + std::vector entries; + if(!is_enabled) return entries; + for(size_t i = 0; i < arr.size(); ++i) + { + auto suffix = "_" + std::to_string(i); + auto pmc_name = base_name + suffix; + auto track_name = pmc_name; + entries.push_back({ pmc_name, track_name, static_cast(arr[i]) }); + } + return entries; +} + +// Mirrors addendum_blk lambda in perfetto_policy.hpp (lines 173-181) +std::string +format_perfetto_xcp_track(uint32_t device_id, const char* metric_name, size_t xcp_idx, + size_t engine_idx) +{ + return fmt::format("GPU [{}] {} XCP_{}: [{:02}] (S)", device_id, metric_name, xcp_idx, + engine_idx); +} + +std::string +format_perfetto_device_track(uint32_t device_id, const char* metric_name, + size_t engine_idx) +{ + return fmt::format("GPU [{}] {} [{:02}] (S)", device_id, metric_name, engine_idx); +} + +// Mirrors unique_key computation in emit_xcp_array_metrics +// (perfetto_processor.cpp:173-175) +uint64_t +compute_track_key(uint32_t device_id, std::optional xcp_idx, size_t engine_idx) +{ + return (static_cast(device_id) << 16) | + (static_cast(xcp_idx.value_or(0)) << 8) | + static_cast(engine_idx); +} + +mock_metrics +make_sentinel_metrics() +{ + mock_metrics m{}; + for(auto& xcp : m.xcp_stats) + { + xcp.vcn_busy.fill(std::numeric_limits::max()); + xcp.jpeg_busy.fill(std::numeric_limits::max()); + } + m.vcn_activity.fill(std::numeric_limits::max()); + m.jpeg_activity.fill(std::numeric_limits::max()); + return m; +} + +} // namespace + +class xcp_output_test : public ::testing::Test +{ +protected: + void SetUp() override { m = make_sentinel_metrics(); } + + mock_metrics m; +}; + +// 
vcn_busy (bit 10) and vcn_activity (bit 8) are independent, mutually exclusive in +// practice +TEST_F(xcp_output_test, EnabledMetricsBitfieldSemantics) +{ + mock_enabled_metrics em{}; + + em.bits.vcn_busy = 1; + EXPECT_EQ(em.value & (1u << 10), (1u << 10)); + EXPECT_EQ(em.bits.vcn_activity, 0u); + + em.value = 0; + em.bits.vcn_activity = 1; + EXPECT_EQ(em.value & (1u << 8), (1u << 8)); + EXPECT_EQ(em.bits.vcn_busy, 0u); + + em.bits.jpeg_busy = 1; + EXPECT_EQ(em.value & (1u << 11), (1u << 11)); + EXPECT_EQ(em.bits.jpeg_activity, 0u); + + em.value = 0; + em.bits.jpeg_activity = 1; + EXPECT_EQ(em.value & (1u << 9), (1u << 9)); + EXPECT_EQ(em.bits.jpeg_busy, 0u); +} + +// With vcn_busy=1, names follow format "device_vcn_activity_xcp{N}[{IDX}]" +TEST_F(xcp_output_test, VcnBusyXcpMetricNaming) +{ + m.xcp_stats[0].vcn_busy[0] = 50; + m.xcp_stats[0].vcn_busy[1] = 60; + m.xcp_stats[2].vcn_busy[3] = 80; + + auto entries = generate_xcp_metrics( + "device_vcn_activity", "device_vcn_activity", true, m, + [](const mock_xcp_metrics& xcp) -> const auto& { return xcp.vcn_busy; }); + + ASSERT_FALSE(entries.empty()); + EXPECT_EQ(entries.size(), MAX_NUM_XCP * MAX_NUM_VCN); + + EXPECT_EQ(entries[0].pmc_name, "device_vcn_activity_xcp0[0]"); + EXPECT_DOUBLE_EQ(entries[0].value, 50.0); + + EXPECT_EQ(entries[1].pmc_name, "device_vcn_activity_xcp0[1]"); + EXPECT_DOUBLE_EQ(entries[1].value, 60.0); + + // xcp2, engine 3 → index 2*4+3 = 11 + EXPECT_EQ(entries[11].pmc_name, "device_vcn_activity_xcp2[3]"); + EXPECT_DOUBLE_EQ(entries[11].value, 80.0); +} + +// With jpeg_busy=1, names follow format "device_jpeg_activity_xcp{N}[{IDX}]" +TEST_F(xcp_output_test, JpegBusyXcpMetricNaming) +{ + m.xcp_stats[1].jpeg_busy[0] = 42; + + auto entries = generate_xcp_metrics( + "device_jpeg_activity", "device_jpeg_activity", true, m, + [](const mock_xcp_metrics& xcp) -> const auto& { return xcp.jpeg_busy; }); + + ASSERT_FALSE(entries.empty()); + EXPECT_EQ(entries.size(), MAX_NUM_XCP * MAX_NUM_JPEG); + + // 
xcp1, engine 0 → index 1 * JPEG_COUNT + 0 + size_t idx = 1 * MAX_NUM_JPEG; + EXPECT_EQ(entries[idx].pmc_name, "device_jpeg_activity_xcp1[0]"); + EXPECT_DOUBLE_EQ(entries[idx].value, 42.0); +} + +// vcn_busy=0 → no per-XCP VCN metrics generated +TEST_F(xcp_output_test, DisabledVcnBusyProducesNoOutput) +{ + m.xcp_stats[0].vcn_busy[0] = 50; + + auto entries = generate_xcp_metrics( + "device_vcn_activity", "device_vcn_activity", false, m, + [](const mock_xcp_metrics& xcp) -> const auto& { return xcp.vcn_busy; }); + + EXPECT_TRUE(entries.empty()); +} + +// jpeg_busy=0 → no per-XCP JPEG metrics generated +TEST_F(xcp_output_test, DisabledJpegBusyProducesNoOutput) +{ + m.xcp_stats[0].jpeg_busy[0] = 50; + + auto entries = generate_xcp_metrics( + "device_jpeg_activity", "device_jpeg_activity", false, m, + [](const mock_xcp_metrics& xcp) -> const auto& { return xcp.jpeg_busy; }); + + EXPECT_TRUE(entries.empty()); +} + +// Valid VCN values across all 8 XCPs → 8*4=32 entries +TEST_F(xcp_output_test, AllXcpPartitionsWritten) +{ + for(size_t xcp = 0; xcp < MAX_NUM_XCP; ++xcp) + { + for(size_t eng = 0; eng < MAX_NUM_VCN; ++eng) + { + m.xcp_stats[xcp].vcn_busy[eng] = static_cast(xcp * 10 + eng); + } + } + + auto entries = generate_xcp_metrics( + "device_vcn_activity", "device_vcn_activity", true, m, + [](const mock_xcp_metrics& xcp) -> const auto& { return xcp.vcn_busy; }); + + EXPECT_EQ(entries.size(), MAX_NUM_XCP * MAX_NUM_VCN); + + for(size_t xcp = 0; xcp < MAX_NUM_XCP; ++xcp) + { + for(size_t eng = 0; eng < MAX_NUM_VCN; ++eng) + { + size_t idx = xcp * MAX_NUM_VCN + eng; + auto expected_name = "device_vcn_activity_xcp" + std::to_string(xcp) + "[" + + std::to_string(eng) + "]"; + EXPECT_EQ(entries[idx].pmc_name, expected_name) + << "Mismatch at xcp=" << xcp << " eng=" << eng; + EXPECT_DOUBLE_EQ(entries[idx].value, static_cast(xcp * 10 + eng)); + } + } +} + +// vcn_activity=1 uses device-level array, no _xcp prefix +TEST_F(xcp_output_test, DeviceLevelVcnActivitySeparateFromXcp) +{ 
+ m.vcn_activity[0] = 75; + m.vcn_activity[1] = 85; + + auto entries = + generate_device_level_metrics("device_vcn_activity", true, m.vcn_activity); + + EXPECT_EQ(entries.size(), m.vcn_activity.size()); + + EXPECT_EQ(entries[0].pmc_name, "device_vcn_activity_0"); + EXPECT_DOUBLE_EQ(entries[0].value, 75.0); + EXPECT_EQ(entries[1].pmc_name, "device_vcn_activity_1"); + EXPECT_DOUBLE_EQ(entries[1].value, 85.0); + + for(const auto& entry : entries) + { + EXPECT_EQ(entry.pmc_name.find("_xcp"), std::string::npos) + << "Device-level name should not contain _xcp: " << entry.pmc_name; + } +} + +// Track names follow "GPU [{id}] VCN Busy XCP_{xcp}: [{eng:02}] (S)" +TEST_F(xcp_output_test, PerfettoXcpTrackNameFormat) +{ + uint32_t device_id = 0; + + auto vcn_name = format_perfetto_xcp_track(device_id, "VCN Busy", 3, 2); + EXPECT_EQ(vcn_name, "GPU [0] VCN Busy XCP_3: [02] (S)"); + + auto jpeg_name = format_perfetto_xcp_track(device_id, "JPEG Busy", 7, 0); + EXPECT_EQ(jpeg_name, "GPU [0] JPEG Busy XCP_7: [00] (S)"); + + auto dev_vcn = format_perfetto_device_track(device_id, "VCN Activity", 1); + EXPECT_EQ(dev_vcn, "GPU [0] VCN Activity [01] (S)"); + + auto dev_jpeg = format_perfetto_device_track(device_id, "JPEG Activity", 3); + EXPECT_EQ(dev_jpeg, "GPU [0] JPEG Activity [03] (S)"); + + auto multi_dev = format_perfetto_xcp_track(5, "VCN Busy", 0, 0); + EXPECT_EQ(multi_dev, "GPU [5] VCN Busy XCP_0: [00] (S)"); +} + +// Track key uniqueness for emit_xcp_array_metrics +TEST_F(xcp_output_test, PerfettoTrackKeyUniqueness) +{ + auto key_0_0_0 = compute_track_key(0, 0, 0); + auto key_0_0_1 = compute_track_key(0, 0, 1); + auto key_0_1_0 = compute_track_key(0, 1, 0); + auto key_1_0_0 = compute_track_key(1, 0, 0); + + EXPECT_NE(key_0_0_0, key_0_0_1); + EXPECT_NE(key_0_0_0, key_0_1_0); + EXPECT_NE(key_0_0_0, key_1_0_0); + + // Device-level (nullopt→0) and per-XCP with xcp=0 produce the same key — + // separation is handled by using different Track type instantiations + auto device_key = 
compute_track_key(0, std::nullopt, 0); + auto xcp0_key = compute_track_key(0, 0, 0); + EXPECT_EQ(device_key, xcp0_key); +} + +// Sentinel values (0xFFFF) are skipped in Perfetto output +TEST_F(xcp_output_test, SentinelValuesSkipped) +{ + std::vector> emitted; + uint32_t device_id = 0; + + // All sentinel by default from make_sentinel_metrics() + for(size_t xcp = 0; xcp < m.xcp_stats.size(); ++xcp) + { + for(size_t i = 0; i < m.xcp_stats[xcp].vcn_busy.size(); ++i) + { + auto value = m.xcp_stats[xcp].vcn_busy[i]; + if(value == std::numeric_limits::max()) continue; + emitted.emplace_back(format_perfetto_xcp_track(device_id, "VCN Busy", xcp, i), + static_cast(value)); + } + } + EXPECT_TRUE(emitted.empty()) << "All sentinel values should be skipped"; + + // Set one valid value + m.xcp_stats[0].vcn_busy[0] = 42; + emitted.clear(); + + for(size_t xcp = 0; xcp < m.xcp_stats.size(); ++xcp) + { + for(size_t i = 0; i < m.xcp_stats[xcp].vcn_busy.size(); ++i) + { + auto value = m.xcp_stats[xcp].vcn_busy[i]; + if(value == std::numeric_limits::max()) continue; + emitted.emplace_back(format_perfetto_xcp_track(device_id, "VCN Busy", xcp, i), + static_cast(value)); + } + } + + ASSERT_EQ(emitted.size(), 1u); + EXPECT_EQ(emitted[0].first, "GPU [0] VCN Busy XCP_0: [00] (S)"); + EXPECT_DOUBLE_EQ(emitted[0].second, 42.0); +} diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/tests/test_sample_type.cpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/tests/test_sample_type.cpp index 3027e452887..72dceaef088 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/tests/test_sample_type.cpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/tests/test_sample_type.cpp @@ -421,71 +421,6 @@ TEST_F(sample_type_test, pmc_event_with_sample_type_identifier) type_identifier_t::pmc_event_with_sample); } -TEST_F(sample_type_test, amd_smi_sample_serialize_deserialize) -{ - std::vector gpu_activity_data = { 10, 20, 30, 40, 50 }; - amd_smi_sample 
original(0xFF, 2, 70000, 80, 60, 40, 250, 75, 1024 * 1024 * 512, - gpu_activity_data, 35); - - serialize(buffer.data(), original); - - uint8_t* buffer_ptr = buffer.data(); - auto deserialized = deserialize(buffer_ptr); - - EXPECT_EQ(deserialized.settings, original.settings); - EXPECT_EQ(deserialized.device_id, original.device_id); - EXPECT_EQ(deserialized.timestamp, original.timestamp); - EXPECT_EQ(deserialized.gfx_activity, original.gfx_activity); - EXPECT_EQ(deserialized.umc_activity, original.umc_activity); - EXPECT_EQ(deserialized.mm_activity, original.mm_activity); - EXPECT_EQ(deserialized.power, original.power); - EXPECT_EQ(deserialized.temperature, original.temperature); - EXPECT_EQ(deserialized.mem_usage, original.mem_usage); - EXPECT_EQ(deserialized.gpu_activity.size(), original.gpu_activity.size()); - EXPECT_EQ(deserialized.gpu_activity, original.gpu_activity); - EXPECT_EQ(deserialized.sdma_usage, original.sdma_usage); -} - -TEST_F(sample_type_test, amd_smi_sample_get_size) -{ - std::vector gpu_activity_data = { 10, 20, 30, 40, 50 }; - amd_smi_sample sample(0xFF, 2, 70000, 80, 60, 40, 250, 75, 1024 * 1024 * 512, - gpu_activity_data); - - size_t expected_size = sizeof(uint64_t) // settings - + sizeof(uint32_t) // device_id - + sizeof(size_t) // timestamp - + sizeof(uint32_t) // gfx_activity - + sizeof(uint32_t) // umc_activity - + sizeof(uint32_t) // mm_activity - + sizeof(uint32_t) // power - + sizeof(int64_t) // temperature - + sizeof(uint64_t) // mem_usage - + sizeof(size_t) + - gpu_activity_data.size() // gpu_activity (header + data) - + sizeof(uint32_t); // sdma_usage - - EXPECT_EQ(get_size(sample), expected_size); -} - -TEST_F(sample_type_test, amd_smi_sample_type_identifier) -{ - EXPECT_EQ(amd_smi_sample::type_identifier, type_identifier_t::amd_smi_sample); -} - -TEST_F(sample_type_test, amd_smi_sample_empty_gpu_activity) -{ - std::vector empty_activity; - amd_smi_sample original(0, 0, 0, 0, 0, 0, 0, 0, 0, empty_activity); - - 
serialize(buffer.data(), original); - - uint8_t* buffer_ptr = buffer.data(); - auto deserialized = deserialize(buffer_ptr); - - EXPECT_TRUE(deserialized.gpu_activity.empty()); -} - TEST_F(sample_type_test, cpu_freq_sample_serialize_deserialize) { std::vector freqs_data = { 100, 150, 200, 180, 190, 195, 185, 170 }; @@ -604,7 +539,7 @@ TEST_F(sample_type_test, type_identifier_enum_values) EXPECT_EQ(static_cast(type_identifier_t::kernel_dispatch), 0x0003); EXPECT_EQ(static_cast(type_identifier_t::memory_copy), 0x0004); EXPECT_EQ(static_cast(type_identifier_t::memory_alloc), 0x0005); - EXPECT_EQ(static_cast(type_identifier_t::amd_smi_sample), 0x0006); + EXPECT_EQ(static_cast(type_identifier_t::gpu_pmc_sample), 0x0006); EXPECT_EQ(static_cast(type_identifier_t::cpu_freq_sample), 0x0007); EXPECT_EQ(static_cast(type_identifier_t::backtrace_region_sample), 0x0008); EXPECT_EQ(static_cast(type_identifier_t::fragmented_space), 0xFFFF); @@ -646,12 +581,6 @@ TEST_F(sample_type_test, pmc_event_with_sample_default_constructor) EXPECT_EQ(sample.type_identifier, type_identifier_t::pmc_event_with_sample); } -TEST_F(sample_type_test, amd_smi_sample_default_constructor) -{ - amd_smi_sample sample; - EXPECT_EQ(sample.type_identifier, type_identifier_t::amd_smi_sample); -} - TEST_F(sample_type_test, cpu_freq_sample_default_constructor) { cpu_freq_sample sample; @@ -682,22 +611,3 @@ TEST_F(sample_type_test, kernel_dispatch_sample_large_values) EXPECT_EQ(deserialized.private_segment_size, UINT32_MAX); EXPECT_EQ(deserialized.grid_size_z, UINT32_MAX); } - -TEST_F(sample_type_test, amd_smi_sample_large_gpu_activity) -{ - std::vector large_activity(256); - for(size_t i = 0; i < large_activity.size(); ++i) - { - large_activity[i] = static_cast(i); - } - - amd_smi_sample original(0xFF, 0, 0, 0, 0, 0, 0, 0, 0, large_activity); - - serialize(buffer.data(), original); - - uint8_t* buffer_ptr = buffer.data(); - auto deserialized = deserialize(buffer_ptr); - - 
EXPECT_EQ(deserialized.gpu_activity.size(), 256); - EXPECT_EQ(deserialized.gpu_activity, large_activity); -} diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys/CMakeLists.txt index 248ba100558..989734f520f 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/CMakeLists.txt @@ -39,6 +39,12 @@ add_library( STATIC $ ) + +target_sources( + rocprofiler-systems-static-library + PRIVATE $ +) + add_library( rocprofiler-systems::librocprofiler-systems-static ALIAS rocprofiler-systems-static-library @@ -68,6 +74,12 @@ add_library( SHARED $ ) + +target_sources( + rocprofiler-systems-shared-library + PRIVATE $ +) + add_library( rocprofiler-systems::librocprofiler-systems-shared ALIAS rocprofiler-systems-shared-library diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/CMakeLists.txt index 1ac6122afef..31e4b408ff0 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/CMakeLists.txt @@ -11,7 +11,6 @@ set(library_sources ${CMAKE_CURRENT_LIST_DIR}/thread_deleter.cpp ${CMAKE_CURRENT_LIST_DIR}/thread_info.cpp ${CMAKE_CURRENT_LIST_DIR}/tracing.cpp - ${CMAKE_CURRENT_LIST_DIR}/ainic_stats.cpp ) set(library_headers @@ -21,8 +20,6 @@ set(library_headers ${CMAKE_CURRENT_LIST_DIR}/perf.hpp ${CMAKE_CURRENT_LIST_DIR}/ptl.hpp ${CMAKE_CURRENT_LIST_DIR}/rocm.hpp - ${CMAKE_CURRENT_LIST_DIR}/amd_smi.hpp - ${CMAKE_CURRENT_LIST_DIR}/amd_smi_ainic.hpp ${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.hpp ${CMAKE_CURRENT_LIST_DIR}/runtime.hpp ${CMAKE_CURRENT_LIST_DIR}/sampling.hpp @@ -30,7 +27,6 @@ set(library_headers ${CMAKE_CURRENT_LIST_DIR}/thread_deleter.hpp ${CMAKE_CURRENT_LIST_DIR}/thread_info.hpp ${CMAKE_CURRENT_LIST_DIR}/tracing.hpp - 
${CMAKE_CURRENT_LIST_DIR}/ainic_stats.hpp ) target_sources( @@ -43,11 +39,10 @@ target_sources( PRIVATE ${CMAKE_CURRENT_LIST_DIR}/rocm.cpp ${CMAKE_CURRENT_LIST_DIR}/rocprofiler-sdk.cpp - ${CMAKE_CURRENT_LIST_DIR}/amd_smi.cpp - ${CMAKE_CURRENT_LIST_DIR}/amd_smi_ainic.cpp ) add_subdirectory(rocprofiler-sdk) +add_subdirectory(pmc) add_subdirectory(causal) add_subdirectory(components) add_subdirectory(coverage) @@ -57,8 +52,6 @@ set(ndebug_sources ${CMAKE_CURRENT_LIST_DIR}/components/mpi_gotcha.cpp ${CMAKE_CURRENT_LIST_DIR}/components/backtrace_metrics.cpp ${CMAKE_CURRENT_LIST_DIR}/kokkosp.cpp - ${CMAKE_CURRENT_LIST_DIR}/amd_smi.cpp - ${CMAKE_CURRENT_LIST_DIR}/amd_smi_ainic.cpp ) set_source_files_properties( diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/ainic_stats.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/ainic_stats.cpp deleted file mode 100644 index e51cc8ed987..00000000000 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/ainic_stats.cpp +++ /dev/null @@ -1,298 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc. 
-// SPDX-License-Identifier: MIT - -#include "ainic_stats.hpp" - -#include - -#include "logger/debug.hpp" - -#include - -std::string -nic_stats::to_string() const -{ - return fmt::format("[_name={}, _netdev={}, _rx_rdma_ucast_bytes={}, " - "_rx_rdma_ucast_pkts={}, _tx_rdma_ucast_bytes={}, " - "_tx_rdma_ucast_pkts={}, _rx_rdma_cnp_pkts={}, " - "_tx_rdma_cnp_pkts={}]", - _name, _netdev, _rx_rdma_ucast_bytes, _rx_rdma_ucast_pkts, - _tx_rdma_ucast_bytes, _tx_rdma_ucast_pkts, _rx_rdma_cnp_pkts, - _tx_rdma_cnp_pkts); -} - -ai_nic_stats_collector::ai_nic_stats_collector() = default; - -bool -ai_nic_stats_collector::find_nic(const std::string& nic, nic_stats& data) const -{ - auto pair = _nic_params.find(nic); - if(pair == _nic_params.end()) - { - return false; - } - data = pair->second; - return true; -} - -bool -ai_nic_stats_collector::is_nic_valid(const std::string& nic) const -{ - return (_nic_params.find(nic) != _nic_params.end()); -} - -void -ai_nic_stats_collector::update_stats() -{ -#ifdef AINIC_SUPPORTED - uint32_t soc_count{}; - std::unique_ptr sockets; - // Call amdsmi_get_socket_handles with second parameter (socket_handles) - // nullptr to get the number of socket handles. - amdsmi_status_t status = amdsmi_get_socket_handles(&soc_count, nullptr); - if(status != AMDSMI_STATUS_SUCCESS) - { - LOG_ERROR("amdsmi_get_socket_handles failed with status {}", (int) status); - return; - } - - if(soc_count == 0) // Nothing to do. - return; - - // Allocate a buffer for soc_count socket handles. - sockets = std::make_unique(soc_count); - // Get the socket handles. - status = amdsmi_get_socket_handles(&soc_count, sockets.get()); - if(status != AMDSMI_STATUS_SUCCESS) - { - LOG_ERROR("amdsmi_get_socket_handles failed with status {}", (int) status); - return; - } - - // Iterate through all socket handles to find all AI NIC processor - // handles and update the statistics for each of them. 
- for(uint32_t index = 0; index < soc_count; index++) - { - uint32_t processor_count = 0; - status = amdsmi_get_processor_handles_by_type( - sockets[index], AMDSMI_PROCESSOR_TYPE_AMD_NIC, nullptr, &processor_count); - if(status != AMDSMI_STATUS_SUCCESS) - { - LOG_ERROR("amdsmi_get_processor_handles_by_type failed with status {}", - (int) status); - return; - } - std::vector processor_handles(processor_count); - status = amdsmi_get_processor_handles_by_type( - sockets[index], AMDSMI_PROCESSOR_TYPE_AMD_NIC, processor_handles.data(), - &processor_count); - if(status != AMDSMI_STATUS_SUCCESS) - { - LOG_ERROR("amdsmi_get_processor_handles_by_type failed with status {}", - (int) status); - return; - } - for(uint32_t idx = 0; idx < processor_count; ++idx) - { - amdsmi_status_t status; - amdsmi_nic_rdma_devices_info_t info; - status = amdsmi_get_nic_rdma_dev_info(processor_handles[idx], &info); - if(status != AMDSMI_STATUS_SUCCESS) continue; - - // Update info and stats. - update_data_for_one_handle(processor_handles[idx], info); - } - } -#endif // AINIC_SUPPORTED -} - -size_t -ai_nic_stats_collector::get_nic_count() -{ -#ifdef AINIC_SUPPORTED - uint32_t soc_count{}; - std::unique_ptr sockets; - // Call amdsmi_get_socket_handles with second parameter (socket_handles) - // nullptr to get the number of socket handles. - amdsmi_status_t status = amdsmi_get_socket_handles(&soc_count, nullptr); - if(status != AMDSMI_STATUS_SUCCESS) - { - LOG_ERROR("amdsmi_get_socket_handles failed with status {}", (int) status); - return 0; - } - - if(soc_count == 0) // Nothing to do. - return 0; - - // Allocate a buffer for soc_count socket handles. - sockets = std::make_unique(soc_count); - // Get the socket handles. - status = amdsmi_get_socket_handles(&soc_count, sockets.get()); - if(status != AMDSMI_STATUS_SUCCESS) - { - LOG_ERROR("amdsmi_get_socket_handles failed with status {}", (int) status); - return 0; - } - - // For all sockets, find all NIC processor handles. 
- size_t nic_count{}; - for(uint32_t index = 0; index < soc_count; index++) - { - uint32_t processor_count = 0; - status = amdsmi_get_processor_handles_by_type( - sockets[index], AMDSMI_PROCESSOR_TYPE_AMD_NIC, nullptr, &processor_count); - if(status != AMDSMI_STATUS_SUCCESS) - { - continue; - } - nic_count += processor_count; - } - return nic_count; -#else - return 0; -#endif // AINIC_SUPPORTED -} - -#ifdef AINIC_SUPPORTED -void -ai_nic_stats_collector::update_data_for_one_handle( - amdsmi_processor_handle processor_handle, amdsmi_nic_rdma_devices_info_t& info) -{ - for(uint8_t rdma_dev_idx = 0; rdma_dev_idx < info.num_rdma_dev; ++rdma_dev_idx) - { - amdsmi_nic_rdma_dev_info_t dev_info = info.rdma_dev_info[rdma_dev_idx]; - for(uint8_t rdma_port_idx = 0; rdma_port_idx < dev_info.num_rdma_ports; - ++rdma_port_idx) - { - amdsmi_nic_rdma_port_info_t port_info = - dev_info.rdma_port_info[rdma_port_idx]; - nic_stats data; - data._name = dev_info.rdma_dev; - data._netdev = port_info.netdev; - - std::unique_ptr stats; - - // Call *_statistics the first time to get the number of statistics. - uint32_t num_stats{}; - amdsmi_status_t status; - - status = amdsmi_get_nic_rdma_port_statistics(processor_handle, rdma_port_idx, - &num_stats, nullptr); - if(status != AMDSMI_STATUS_SUCCESS) continue; - - // Allocate stats. - stats = std::make_unique(num_stats); - - // Call *_statistics the second time to get the statistics. 
- status = amdsmi_get_nic_rdma_port_statistics(processor_handle, rdma_port_idx, - &num_stats, stats.get()); - if(status != AMDSMI_STATUS_SUCCESS) continue; - - const std::unordered_map> - stat_handlers = { - { nic_stats::RX_RDMA_UCAST_BYTES, - [](nic_stats& d, uint64_t v) { d._rx_rdma_ucast_bytes = v; } }, - { nic_stats::RX_RDMA_UCAST_PKTS, - [](nic_stats& d, uint64_t v) { d._rx_rdma_ucast_pkts = v; } }, - { nic_stats::TX_RDMA_UCAST_BYTES, - [](nic_stats& d, uint64_t v) { d._tx_rdma_ucast_bytes = v; } }, - { nic_stats::TX_RDMA_UCAST_PKTS, - [](nic_stats& d, uint64_t v) { d._tx_rdma_ucast_pkts = v; } }, - { nic_stats::RX_RDMA_CNP_PKTS, - [](nic_stats& d, uint64_t v) { d._rx_rdma_cnp_pkts = v; } }, - { nic_stats::TX_RDMA_CNP_PKTS, - [](nic_stats& d, uint64_t v) { d._tx_rdma_cnp_pkts = v; } }, - }; - - // Retrieve relevant stats. - for(uint32_t stat_idx{}; stat_idx < num_stats; ++stat_idx) - { - if(auto it = stat_handlers.find(stats[stat_idx].name); - it != stat_handlers.end()) - { - it->second(data, stats[stat_idx].value); - } - } - - // We have filled in the fields of data. Now update _nic_params and - // _nic_delta_params. 
- auto it = _nic_params.find(data._netdev); - if(it == _nic_params.end()) // not found - { - nic_stats new_delta; - new_delta._name = data._name; - new_delta._netdev = data._netdev; - - new_delta._rx_rdma_ucast_bytes = 0; - new_delta._tx_rdma_ucast_bytes = 0; - new_delta._rx_rdma_ucast_pkts = 0; - new_delta._tx_rdma_ucast_pkts = 0; - - new_delta._rx_rdma_cnp_pkts = 0; - new_delta._tx_rdma_cnp_pkts = 0; - _nic_params[data._netdev] = data; - _nic_delta_params[data._netdev] = new_delta; - } - else - { - nic_stats new_delta; - nic_stats& old_data = it->second; - - new_delta._name = data._name; - new_delta._netdev = data._netdev; - - new_delta._rx_rdma_ucast_bytes = - data._rx_rdma_ucast_bytes - old_data._rx_rdma_ucast_bytes; - new_delta._tx_rdma_ucast_bytes = - data._tx_rdma_ucast_bytes - old_data._tx_rdma_ucast_bytes; - new_delta._rx_rdma_ucast_pkts = - data._rx_rdma_ucast_pkts - old_data._rx_rdma_ucast_pkts; - new_delta._tx_rdma_ucast_pkts = - data._tx_rdma_ucast_pkts - old_data._tx_rdma_ucast_pkts; - - new_delta._rx_rdma_cnp_pkts = - data._rx_rdma_cnp_pkts - old_data._rx_rdma_cnp_pkts; - new_delta._tx_rdma_cnp_pkts = - data._tx_rdma_cnp_pkts - old_data._tx_rdma_cnp_pkts; - - _nic_params[data._netdev] = data; - _nic_delta_params[data._netdev] = new_delta; - } - } - } -} -#endif // AINIC_SUPPORTED - -void -ai_nic_stats_collector::get_data(const std::string& nic, nic_stats& data) const -{ - auto it = _nic_delta_params.find(nic); - if(it == _nic_delta_params.end()) // not found - { - data._netdev = nic; - data._name = ""; - data._rx_rdma_ucast_bytes = 0; - data._tx_rdma_ucast_bytes = 0; - data._rx_rdma_ucast_pkts = 0; - data._tx_rdma_ucast_pkts = 0; - - data._rx_rdma_cnp_pkts = 0; - data._tx_rdma_cnp_pkts = 0; - } - else - { - data = it->second; - } -} - -std::vector -ai_nic_stats_collector::get_nic_list() const -{ - std::vector nic_list = {}; - for(auto& it : _nic_params) - { - nic_list.push_back(it.first); - } - return nic_list; -} diff --git 
a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/ainic_stats.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/ainic_stats.hpp deleted file mode 100644 index 9ba2dfbf6c8..00000000000 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/ainic_stats.hpp +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc. -// SPDX-License-Identifier: MIT - -#pragma once - -#include -#include -#include -#include - -#include "core/amd_smi.hpp" - -#ifdef AINIC_SUPPORTED -# include -#endif - -struct nic_stats -{ - std::string _name; // RDMA device name - std::string _netdev; // NIC name - - uint64_t _rx_rdma_ucast_bytes{}; // unicast received bytes - uint64_t _rx_rdma_ucast_pkts{}; // unicast received packets - uint64_t _tx_rdma_ucast_bytes{}; // unicast transmitted bytes - uint64_t _tx_rdma_ucast_pkts{}; // unicast transmitted packets - - uint64_t _rx_rdma_cnp_pkts{}; // received CNP packets - uint64_t _tx_rdma_cnp_pkts{}; // transmitted CNP packets - - std::string to_string() const; - - static constexpr const char* RX_RDMA_UCAST_BYTES = "rx_rdma_ucast_bytes"; - static constexpr const char* RX_RDMA_UCAST_PKTS = "rx_rdma_ucast_pkts"; - static constexpr const char* TX_RDMA_UCAST_BYTES = "tx_rdma_ucast_bytes"; - static constexpr const char* TX_RDMA_UCAST_PKTS = "tx_rdma_ucast_pkts"; - static constexpr const char* RX_RDMA_CNP_PKTS = "rx_rdma_cnp_pkts"; - static constexpr const char* TX_RDMA_CNP_PKTS = "tx_rdma_cnp_pkts"; -}; - -class ai_nic_stats_collector -{ -public: - using nic_params_t = std::unordered_map; - -private: - // _nic_params and _nic_delta_params both hold network stats. _nic_params holds the - // total values as read on sysfs via amd-smi. _nic_delta_params hold the differences - // between the latest read and the read before that. - // e.g. field rx_rdma_cnp_pkts in one instance of nic_stats contains 1100000 and the - // previous one was 1000000. 
That means the total number of CNP packets received in - // the time interval between the two reads was 100000, so the equivalent field - // rx_rdma_cnp_pkts in the instance of nic_stats pointed to in _nic_delta_params will - // get the value 100000. The total value are read from amd-smi, but the sampling code - // in rocprof-sys needs to get the differences between two reads. - nic_params_t _nic_params; // Mapping NIC name -> NIC statistics - nic_params_t _nic_delta_params; - -public: - // Get data associated with the specified NIC in _nic_delta_params. - // If the data for nic don't exist, set all measure values to 0 (as a protection - // in case the caller is requesting stats for a nonexistent NIC). - void get_data(const std::string& nic, nic_stats& data) const; - - // get_nic_list returns the list of NICs on the system. - [[nodiscard]] std::vector get_nic_list() const; - - ai_nic_stats_collector(); - - // Update the statistics for all NICs. - void update_stats(); - - // Find nic and fill in the data. - // If the nic is not found, return false. - [[nodiscard]] bool find_nic(const std::string& nic, nic_stats& data) const; - - [[nodiscard]] bool is_nic_valid(const std::string& nic) const; - -private: - size_t get_nic_count(); - -#ifdef AINIC_SUPPORTED - void update_data_for_one_handle(amdsmi_processor_handle processor_handle, - amdsmi_nic_rdma_devices_info_t& info); -#endif -}; diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.cpp deleted file mode 100644 index ea0f4c442ff..00000000000 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.cpp +++ /dev/null @@ -1,1486 +0,0 @@ -// Copyright (c) 2018-2025 Advanced Micro Devices, Inc. All Rights Reserved. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// with the Software without restriction, including without limitation the -// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -// sell copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// * Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimers. -// -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimers in the -// documentation and/or other materials provided with the distribution. -// -// * Neither the names of Advanced Micro Devices, Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this Software without specific prior written permission. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH -// THE SOFTWARE. 
- -#include "core/agent.hpp" -#include "core/trace_cache/cache_manager.hpp" -#include "core/trace_cache/cacheable.hpp" -#include "core/trace_cache/sample_type.hpp" -#include -#include -#if defined(NDEBUG) -# undef NDEBUG -#endif - -#include "core/common.hpp" -#include "core/components/fwd.hpp" -#include "core/config.hpp" -#include "core/gpu.hpp" -#include "core/gpu_metrics.hpp" -#include "core/node_info.hpp" -#include "core/perfetto.hpp" -#include "core/state.hpp" -#include "core/trace_cache/metadata_registry.hpp" -#include "library/amd_smi.hpp" -#include "library/runtime.hpp" -#include "library/thread_info.hpp" - -#include -#include -#include -#include -#include -#include - -#include "logger/debug.hpp" - -#include -#include -#include -#include -#include -#include - -#define ROCPROFSYS_AMD_SMI_CALL(...) \ - ::rocprofsys::amd_smi::check_error(__FILE__, __LINE__, __VA_ARGS__) - -namespace rocprofsys -{ -namespace amd_smi -{ -using bundle_t = std::deque; -using sampler_instances = thread_data; - -std::atomic& -get_state() -{ - static std::atomic _v{ State::PreInit }; - return _v; -} - -#ifndef AMDSMI_MAX_NUM_JPEG_ENG_V1 -# define AMDSMI_MAX_NUM_JPEG_ENG_V1 AMDSMI_MAX_NUM_JPEG -#endif - -namespace -{ -// Static storage for SDMA usage delta computation -std::unordered_map prev_sdma_cumulative; -std::unordered_map prev_sdma_timestamp; - -void -metadata_initialize_category() -{ - trace_cache::get_metadata_registry().add_string( - trait::name::value); - trace_cache::get_metadata_registry().add_string( - trait::name::value); -} - -void -metadata_initialize_smi_tracks(size_t gpu_id) -{ - const auto thread_id = std::nullopt; - - trace_cache::get_metadata_registry().add_track( - { trace_cache::info::annotate_with_device_id(gpu_id), - thread_id, "{}" }); - trace_cache::get_metadata_registry().add_track( - { trace_cache::info::annotate_with_device_id(gpu_id), - thread_id, "{}" }); - trace_cache::get_metadata_registry().add_track( - { 
trace_cache::info::annotate_with_device_id(gpu_id), - thread_id, "{}" }); - trace_cache::get_metadata_registry().add_track( - { trace_cache::info::annotate_with_device_id(gpu_id), - thread_id, "{}" }); - trace_cache::get_metadata_registry().add_track( - { trace_cache::info::annotate_with_device_id(gpu_id), - thread_id, "{}" }); - trace_cache::get_metadata_registry().add_track( - { trace_cache::info::annotate_with_device_id( - gpu_id), - thread_id, "{}" }); - - auto add_vcn_track = [&](std::optional xcp_idx) { - for(auto clk = 0; clk < AMDSMI_MAX_NUM_VCN; ++clk) - { - auto name = trace_cache::info::annotate_with_device_id< - category::amd_smi_vcn_activity>(gpu_id, xcp_idx, clk); - trace_cache::get_metadata_registry().add_track( - { name.c_str(), thread_id, "{}" }); - } - }; - - auto add_jpeg_track = [&](std::optional xcp_idx) { - for(auto clk = 0; clk < AMDSMI_MAX_NUM_JPEG_ENG_V1; ++clk) - { - auto name = trace_cache::info::annotate_with_device_id< - category::amd_smi_jpeg_activity>(gpu_id, xcp_idx, clk); - trace_cache::get_metadata_registry().add_track( - { name.c_str(), thread_id, "{}" }); - } - }; - - if(gpu::vcn_is_device_level_only(gpu_id)) - { - add_vcn_track(std::nullopt); - } - else - { - for(int xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp) - { - add_vcn_track(xcp); - } - } - - if(gpu::jpeg_is_device_level_only(gpu_id)) - { - add_jpeg_track(std::nullopt); - } - else - { - for(auto xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp) - { - add_jpeg_track(xcp); - } - } - - // Add XGMI tracks using specific categories for each metric type - trace_cache::get_metadata_registry().add_track( - { trace_cache::info::annotate_with_device_id( - gpu_id), - thread_id, "{}" }); - trace_cache::get_metadata_registry().add_track( - { trace_cache::info::annotate_with_device_id( - gpu_id), - thread_id, "{}" }); - - for(size_t i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) - { - auto read_name = - trace_cache::info::annotate_with_device_id( - gpu_id, std::nullopt, i); - 
trace_cache::get_metadata_registry().add_track( - { read_name.c_str(), thread_id, "{}" }); - - auto write_name = - trace_cache::info::annotate_with_device_id( - gpu_id, std::nullopt, i); - trace_cache::get_metadata_registry().add_track( - { write_name.c_str(), thread_id, "{}" }); - } - - // Add PCIe tracks using specific categories for each metric - trace_cache::get_metadata_registry().add_track( - { trace_cache::info::annotate_with_device_id( - gpu_id), - thread_id, "{}" }); - trace_cache::get_metadata_registry().add_track( - { trace_cache::info::annotate_with_device_id( - gpu_id), - thread_id, "{}" }); - trace_cache::get_metadata_registry().add_track( - { trace_cache::info::annotate_with_device_id< - category::amd_smi_pcie_bandwidth_acc>(gpu_id), - thread_id, "{}" }); - trace_cache::get_metadata_registry().add_track( - { trace_cache::info::annotate_with_device_id< - category::amd_smi_pcie_bandwidth_inst>(gpu_id), - thread_id, "{}" }); - - // Add SDMA usage track - trace_cache::get_metadata_registry().add_track( - { trace_cache::info::annotate_with_device_id( - gpu_id), - thread_id, "{}" }); -} - -void -metadata_initialize_smi_pmc(size_t gpu_id) -{ - // TODO: Find the proper values for a following definitions - size_t EVENT_CODE = 0; - size_t INSTANCE_ID = 0; - const char* LONG_DESCRIPTION = ""; - const char* COMPONENT = ""; - const char* BLOCK = ""; - const char* EXPRESSION = ""; - const char* CELSIUS_DEGREES = "\u00B0C"; - auto ni = node_info::get_instance(); - const char* TARGET_ARCH = "GPU"; - - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - trait::name::value, "GFX Busy", - trait::name::description, LONG_DESCRIPTION, - COMPONENT, trace_cache::PERCENTAGE, rocprofsys::trace_cache::ABSOLUTE, BLOCK, - EXPRESSION, 0, 0, "{}" }); - - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - trait::name::value, "UMC Busy", - 
trait::name::description, LONG_DESCRIPTION, - COMPONENT, trace_cache::PERCENTAGE, rocprofsys::trace_cache::ABSOLUTE, BLOCK, - EXPRESSION, 0, 0, "{}" }); - - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - trait::name::value, "MM Busy", - trait::name::description, LONG_DESCRIPTION, - COMPONENT, trace_cache::PERCENTAGE, rocprofsys::trace_cache::ABSOLUTE, BLOCK, - EXPRESSION, 0, 0, "{}" }); - - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - trait::name::value, "Temp", - trait::name::description, LONG_DESCRIPTION, COMPONENT, - CELSIUS_DEGREES, rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0, 0 }); - - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - trait::name::value, "Pow", - trait::name::description, LONG_DESCRIPTION, COMPONENT, - "W", rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0, 0 }); - - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - trait::name::value, "MemUsg", - trait::name::description, LONG_DESCRIPTION, - COMPONENT, tim::units::mem_repr(tim::units::megabyte), - rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0, 0 }); - - auto add_vcn_pmc = [&](std::optional xcp_idx) { - for(int clk = 0; clk < AMDSMI_MAX_NUM_VCN; ++clk) - { - std::stringstream name_ss; - name_ss << trait::name::value; - if(xcp_idx) name_ss << "_" << *xcp_idx; - name_ss << "_" << clk; - - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - name_ss.str(), "VcnAct", - trait::name::description, - LONG_DESCRIPTION, COMPONENT, trace_cache::PERCENTAGE, - rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0, 0 }); - } - }; - - auto add_jpeg_pmc = [&](std::optional xcp_idx) { - for(auto clk = 0; clk < 
AMDSMI_MAX_NUM_JPEG_ENG_V1; ++clk) - { - std::stringstream name_ss; - name_ss << trait::name::value; - if(xcp_idx) name_ss << "_" << *xcp_idx; - name_ss << "_" << std::to_string(clk); - - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - name_ss.str(), "JpegAct", - trait::name::description, - LONG_DESCRIPTION, COMPONENT, trace_cache::PERCENTAGE, - rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0, 0 }); - } - }; - - if(gpu::vcn_is_device_level_only(gpu_id)) - { - add_vcn_pmc(std::nullopt); - } - else - { - for(int xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp) - { - add_vcn_pmc(xcp); - } - } - - if(gpu::jpeg_is_device_level_only(gpu_id)) - { - add_jpeg_pmc(std::nullopt); - } - else - { - for(auto xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp) - { - add_jpeg_pmc(xcp); - } - } - - // Add XGMI PMC info using specific categories for each metric type - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - trait::name::value, "XgmiLinkWidth", - trait::name::description, LONG_DESCRIPTION, - COMPONENT, "bits", rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0, - 0 }); - - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - trait::name::value, "XgmiLinkSpeed", - trait::name::description, LONG_DESCRIPTION, - COMPONENT, "GT/s", rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0, - 0 }); - - for(size_t i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) - { - std::stringstream read_name_ss, read_symbol_ss; - read_name_ss << trait::name::value << "_" << i; - - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - read_name_ss.str(), "XgmiRead", - trait::name::description, - LONG_DESCRIPTION, COMPONENT, "KB", rocprofsys::trace_cache::ABSOLUTE, BLOCK, - EXPRESSION, 0, 0 }); - - std::stringstream write_name_ss, 
write_symbol_ss; - write_name_ss << trait::name::value << "_" - << i; - - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - write_name_ss.str(), "XgmiWrite", - trait::name::description, - LONG_DESCRIPTION, COMPONENT, "KB", rocprofsys::trace_cache::ABSOLUTE, BLOCK, - EXPRESSION, 0, 0 }); - } - - // Add PCIe PMC info using specific categories for each metric - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - trait::name::value, "PcieLinkWidth", - trait::name::description, LONG_DESCRIPTION, - COMPONENT, "", rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0, 0 }); - - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - trait::name::value, "PcieLinkSpeed", - trait::name::description, LONG_DESCRIPTION, - COMPONENT, "GT/s", rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0, - 0 }); - - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - trait::name::value, "PcieBwAcc", - trait::name::description, - LONG_DESCRIPTION, COMPONENT, "MB", rocprofsys::trace_cache::ABSOLUTE, BLOCK, - EXPRESSION, 0, 0 }); - - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - trait::name::value, "PcieBwInst", - trait::name::description, - LONG_DESCRIPTION, COMPONENT, "MB/s", rocprofsys::trace_cache::ABSOLUTE, BLOCK, - EXPRESSION, 0, 0 }); - - // Add SDMA usage PMC info - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - trait::name::value, "SdmaUsage", - trait::name::description, LONG_DESCRIPTION, - COMPONENT, trace_cache::PERCENTAGE, rocprofsys::trace_cache::ABSOLUTE, BLOCK, - EXPRESSION, 0, 0, "{}" }); -} - -auto& -get_settings(uint32_t _dev_id) -{ - static auto _v = 
std::unordered_map{}; - return _v[_dev_id]; -} - -bool& -is_initialized() -{ - static bool _v = false; - return _v; -} - -amdsmi_version_t& -get_version() -{ - static amdsmi_version_t _v = {}; - - if(_v.major == 0 && _v.minor == 0) - { - auto _err = amdsmi_get_lib_version(&_v); - if(_err != AMDSMI_STATUS_SUCCESS) - { - throw std::runtime_error( - "amdsmi_get_version failed. No version information available."); - } - } - - return _v; -} - -void -check_error(const char* _file, int _line, amdsmi_status_t _code, bool* _option = nullptr) -{ - if(_code == AMDSMI_STATUS_SUCCESS) - return; - else if(_code == AMDSMI_STATUS_NOT_SUPPORTED && _option) - { - *_option = false; - return; - } - - constexpr const char* _unknown_error_message = - "amdsmi_status_code_to_string failed. No error message available."; - - const char* _msg = nullptr; - auto _error_code_is_known = - amdsmi_status_code_to_string(_code, &_msg) == AMDSMI_STATUS_SUCCESS; - - throw std::runtime_error( - fmt::format("[{}:{}] Error code {} :: {}", _file, _line, static_cast(_code), - _error_code_is_known ? 
_msg : _unknown_error_message)); -} - -std::vector -serialize_gpu_metrics(uint32_t device_id, const data::gpu_metrics_t& metrics, - const gpu::gpu_metrics_capabilities_t& capabilities) -{ - // Get settings for this device - auto settings = get_settings(device_id); - - // Convert amd_smi::settings to gpu::gpu_metrics_settings_t - gpu::gpu_metrics_settings_t gpu_settings; - gpu_settings.vcn_activity = settings.vcn_activity; - gpu_settings.jpeg_activity = settings.jpeg_activity; - gpu_settings.xgmi = settings.xgmi; - gpu_settings.pcie = settings.pcie; - - // Use the shared serialization function - return gpu::serialize_gpu_metrics(metrics, capabilities, gpu_settings); -} - -size_t -serialize_settings(uint32_t _device_id) -{ - auto settings = get_settings(_device_id); - std::bitset<16> settings_bits; - settings_bits.reset(); - settings_bits.set( - static_cast(trace_cache::amd_smi_sample::settings_positions::busy), - settings.busy); - settings_bits.set( - static_cast(trace_cache::amd_smi_sample::settings_positions::temp), - settings.temp); - settings_bits.set( - static_cast(trace_cache::amd_smi_sample::settings_positions::power), - settings.power); - settings_bits.set( - static_cast(trace_cache::amd_smi_sample::settings_positions::mem_usage), - settings.mem_usage); - settings_bits.set( - static_cast(trace_cache::amd_smi_sample::settings_positions::vcn_activity), - settings.vcn_activity); - settings_bits.set( - static_cast(trace_cache::amd_smi_sample::settings_positions::jpeg_activity), - settings.jpeg_activity); - settings_bits.set( - static_cast(trace_cache::amd_smi_sample::settings_positions::xgmi), - settings.xgmi); - settings_bits.set( - static_cast(trace_cache::amd_smi_sample::settings_positions::pcie), - settings.pcie); - settings_bits.set( - static_cast(trace_cache::amd_smi_sample::settings_positions::sdma_usage), - settings.sdma_usage); - return settings_bits.to_ulong(); -} - -} // namespace - 
-//--------------------------------------------------------------------------------------// - -size_t data::device_count = 0; -std::set data::device_list = {}; -std::unique_ptr data::polling_finished = {}; - -data::data(uint32_t _dev_id) { sample(_dev_id); } - -void -data::sample(uint32_t _device_id) -{ - if(is_child_process()) return; - - auto _timestamp = tim::get_clock_real_now(); - assert(_timestamp < std::numeric_limits::max()); - amdsmi_gpu_metrics_t _gpu_metrics; - bool _gpu_metrics_needed = false; - - auto _state = get_state().load(); - - if(_state != State::Active) return; - - m_dev_id = _device_id; - m_ts = _timestamp; - -#define ROCPROFSYS_AMDSMI_GET(OPTION, FUNCTION, ...) \ - if(OPTION) \ - { \ - try \ - { \ - ROCPROFSYS_AMD_SMI_CALL(FUNCTION(__VA_ARGS__), &OPTION); \ - } catch(std::runtime_error & _e) \ - { \ - LOG_ERROR("Exception: {}. Disabling future samples from amd-smi...", \ - _e.what()); \ - get_state().store(State::Disabled); \ - } \ - } - - amdsmi_processor_handle sample_handle = gpu::get_handle_from_id(_device_id); - ROCPROFSYS_AMDSMI_GET(get_settings(m_dev_id).busy, amdsmi_get_gpu_activity, - sample_handle, &m_busy_perc); - ROCPROFSYS_AMDSMI_GET(get_settings(m_dev_id).temp, amdsmi_get_temp_metric, - sample_handle, AMDSMI_TEMPERATURE_TYPE_JUNCTION, - AMDSMI_TEMP_CURRENT, &m_temp); -#if(AMDSMI_LIB_VERSION_MAJOR == 2 && AMDSMI_LIB_VERSION_MINOR == 0) || \ - (AMDSMI_LIB_VERSION_MAJOR == 25 && AMDSMI_LIB_VERSION_MINOR == 2) - // This was a transient change in the AMD SMI API. It was never officially released. 
- ROCPROFSYS_AMDSMI_GET(get_settings(m_dev_id).power, amdsmi_get_power_info, - sample_handle, 0, &m_power) -#else - ROCPROFSYS_AMDSMI_GET(get_settings(m_dev_id).power, amdsmi_get_power_info, - sample_handle, &m_power) -#endif - ROCPROFSYS_AMDSMI_GET(get_settings(m_dev_id).mem_usage, amdsmi_get_gpu_memory_usage, - sample_handle, AMDSMI_MEM_TYPE_VRAM, &m_mem_usage); - - // Check if GPU metrics are needed for VCN, JPEG, XGMI, or PCIe - _gpu_metrics_needed = get_settings(m_dev_id).vcn_activity || - get_settings(m_dev_id).jpeg_activity || - get_settings(m_dev_id).xgmi || get_settings(m_dev_id).pcie; - - ROCPROFSYS_AMDSMI_GET(_gpu_metrics_needed, amdsmi_get_gpu_metrics_info, sample_handle, - &_gpu_metrics); - - // Determine if basic metrics are enabled - bool _basic_metrics_enabled = - get_settings(m_dev_id).busy || get_settings(m_dev_id).temp || - get_settings(m_dev_id).power || get_settings(m_dev_id).mem_usage; - - // Process GPU metrics if needed (also include SDMA) - if(_gpu_metrics_needed || _basic_metrics_enabled || get_settings(m_dev_id).sdma_usage) - { - gpu_metrics_t metrics; - bool has_data = false; - gpu::gpu_metrics_capabilities_t capabilities; - - if(_gpu_metrics_needed) - { - capabilities.flags.vcn_is_device_level_only = - gpu::vcn_is_device_level_only(m_dev_id); - capabilities.flags.jpeg_is_device_level_only = - gpu::jpeg_is_device_level_only(m_dev_id); - - // Helper lambda to filter max uint values (unsupported) - returns 0 if max, - // otherwise the value - auto filter_max_uint_value = [](const auto& value) { - using ValueType = std::decay_t; - return (value == std::numeric_limits::max()) ? 
ValueType{ 0 } - : value; - }; - - auto fill_gpu_metrics = [](auto& dest, const auto& src, auto max_val) { - for(const auto& val : src) - { - if(val != max_val) dest.push_back(val); - } - }; - - if(get_settings(m_dev_id).vcn_activity) - { - if(capabilities.flags.vcn_is_device_level_only) - { - fill_gpu_metrics(metrics.vcn_activity, _gpu_metrics.vcn_activity, - UINT16_MAX); - if(!metrics.vcn_activity.empty()) has_data = true; - } - else - { - for(const auto& xcp : _gpu_metrics.xcp_stats) - { - std::vector xcp_vcn_data; - fill_gpu_metrics(xcp_vcn_data, xcp.vcn_busy, UINT16_MAX); - if(!xcp_vcn_data.empty()) - { - metrics.vcn_busy.push_back(std::move(xcp_vcn_data)); - has_data = true; - } - } - } - } - - if(get_settings(m_dev_id).jpeg_activity) - { - if(capabilities.flags.jpeg_is_device_level_only) - { - fill_gpu_metrics(metrics.jpeg_activity, _gpu_metrics.jpeg_activity, - UINT16_MAX); - if(!metrics.jpeg_activity.empty()) has_data = true; - } - else - { - for(const auto& xcp : _gpu_metrics.xcp_stats) - { - std::vector xcp_jpeg_data; - fill_gpu_metrics(xcp_jpeg_data, xcp.jpeg_busy, UINT16_MAX); - if(!xcp_jpeg_data.empty()) - { - metrics.jpeg_busy.push_back(std::move(xcp_jpeg_data)); - has_data = true; - } - } - } - } - - // Process XGMI metrics if enabled - if(get_settings(m_dev_id).xgmi) - { - // Filter scalar values - returns 0 if unsupported (max value) - metrics.xgmi_link_width = - filter_max_uint_value(_gpu_metrics.xgmi_link_width); - metrics.xgmi_link_speed = - filter_max_uint_value(_gpu_metrics.xgmi_link_speed); - - // Vector values filtered by fill_gpu_metrics - fill_gpu_metrics(metrics.xgmi_read_data_acc, - _gpu_metrics.xgmi_read_data_acc, UINT64_MAX); - fill_gpu_metrics(metrics.xgmi_write_data_acc, - _gpu_metrics.xgmi_write_data_acc, UINT64_MAX); - - if(metrics.xgmi_link_width != 0 || metrics.xgmi_link_speed != 0 || - !metrics.xgmi_read_data_acc.empty() || - !metrics.xgmi_write_data_acc.empty()) - { - has_data = true; - } - } - - // Process PCIe metrics if 
enabled - if(get_settings(m_dev_id).pcie) - { - // Filter scalar values - returns 0 if unsupported (max value) - metrics.pcie_link_width = - filter_max_uint_value(_gpu_metrics.pcie_link_width); - metrics.pcie_link_speed = - filter_max_uint_value(_gpu_metrics.pcie_link_speed); - metrics.pcie_bandwidth_acc = - filter_max_uint_value(_gpu_metrics.pcie_bandwidth_acc); - metrics.pcie_bandwidth_inst = - filter_max_uint_value(_gpu_metrics.pcie_bandwidth_inst); - - if(metrics.pcie_link_width != 0 || metrics.pcie_link_speed != 0 || - metrics.pcie_bandwidth_acc != 0 || metrics.pcie_bandwidth_inst != 0) - { - has_data = true; - } - } - } - - // Collect SDMA usage if enabled - uint32_t sdma_usage_percent = 0; -#if AMD_SMI_SDMA_SUPPORTED == 1 - if(get_settings(m_dev_id).sdma_usage) - { - uint64_t current_cumulative = 0; - uint32_t num_processes = 0; - - // First call to get count - auto status = - amdsmi_get_gpu_process_list(sample_handle, &num_processes, nullptr); - - LOG_TRACE("[SDMA] Device {}: process_list status={}, num_processes={}", - m_dev_id, static_cast(status), num_processes); - - if(status == AMDSMI_STATUS_SUCCESS && num_processes > 0) - { - std::vector proc_list(num_processes); - status = amdsmi_get_gpu_process_list(sample_handle, &num_processes, - proc_list.data()); - - LOG_TRACE("[SDMA] Device {}: proc_list status={}, num_processes={}", - m_dev_id, static_cast(status), num_processes); - - if(status == AMDSMI_STATUS_SUCCESS) - { - for(const auto& proc : proc_list) - { - LOG_TRACE("[SDMA] Device {}: PID={}, sdma_usage={} us", m_dev_id, - proc.pid, proc.sdma_usage); - current_cumulative += proc.sdma_usage; // microseconds - } - } - } - - // Compute percentage from delta - if(prev_sdma_cumulative.count(m_dev_id) > 0) - { - uint64_t delta_usage = - current_cumulative - prev_sdma_cumulative[m_dev_id]; - uint64_t delta_time = _timestamp - prev_sdma_timestamp[m_dev_id]; // ns - - if(delta_time > 0) - { - // Convert: delta_usage is in μs, delta_time is in ns - // 
percentage = (delta_usage * 1000 / delta_time) * 100 - // = (delta_usage * 100000) / delta_time - sdma_usage_percent = - static_cast((delta_usage * 100000ULL) / delta_time); - - // Clamp to 100% max - if(sdma_usage_percent > 100) sdma_usage_percent = 100; - } - } - - LOG_TRACE("[SDMA] Device {}: cumulative={} us, percent={}", m_dev_id, - current_cumulative, sdma_usage_percent); - - prev_sdma_cumulative[m_dev_id] = current_cumulative; - prev_sdma_timestamp[m_dev_id] = _timestamp; - - // Store in member for legacy path output - m_sdma_usage = sdma_usage_percent; - } -#endif // AMD_SMI_SDMA_SUPPORTED == 1 - - // Store samples if basic metrics are enabled OR if there's advanced metric data - if(_basic_metrics_enabled || has_data || get_settings(m_dev_id).sdma_usage) - { - trace_cache::get_buffer_storage().store(trace_cache::amd_smi_sample{ - serialize_settings(m_dev_id), _device_id, _timestamp, - m_busy_perc.gfx_activity, m_busy_perc.umc_activity, - m_busy_perc.mm_activity, m_power.current_socket_power, m_temp, - m_mem_usage, serialize_gpu_metrics(m_dev_id, metrics, capabilities), - sdma_usage_percent }); - - if(has_data) m_gpu_metrics.push_back(metrics); - } - } -#undef ROCPROFSYS_AMDSMI_GET -} - -void -data::print(std::ostream& _os) const -{ - std::stringstream _ss{}; - - _ss << "device: " << m_dev_id << ", gpu busy: = " << m_busy_perc.gfx_activity - << "%, mm busy: = " << m_busy_perc.mm_activity - << "%, umc busy: = " << m_busy_perc.umc_activity << "%, temp = " << m_temp - << ", current power = " << m_power.current_socket_power - << ", memory usage = " << m_mem_usage; - _os << _ss.str(); -} - -namespace -{ -std::vector*> _bundle_data{}; -} // namespace - -void -config() -{ - _bundle_data.resize(data::device_count, nullptr); - for(size_t i = 0; i < data::device_count; ++i) - { - if(data::device_list.count(i) > 0) - { - _bundle_data.at(i) = &sampler_instances::get()->at(i); - if(!*_bundle_data.at(i)) - *_bundle_data.at(i) = unique_ptr_t{ new bundle_t{} }; - } - } - 
data::get_initial().resize(data::device_count); - for(auto itr : data::device_list) - data::get_initial().at(itr).sample(itr); - - metadata_initialize_category(); - - for(const auto& _dev_id : data::device_list) - { - metadata_initialize_smi_tracks(_dev_id); - metadata_initialize_smi_pmc(_dev_id); - } - -#ifdef AINIC_SUPPORTED - nic_config(); -#endif - - amd_smi::set_state(State::Active); -} - -void -sample() -{ - auto_lock_t _lk{ type_mutex() }; - - // TODO: Reorganize amd_smi::data and sampling mechanism not to store same data in - // bundle_data and in trace_cache - - for(auto itr : data::device_list) - { - if(amd_smi::get_state() != State::Active) continue; - LOG_TRACE("Polling amd-smi for device {}", itr); - auto& _data = *_bundle_data.at(itr); - if(!_data) continue; - _data->emplace_back(data{ itr }); - } - -#ifdef AINIC_SUPPORTED - nic_sample(); -#endif -} - -void -set_state(State _v) -{ - amd_smi::get_state().store(_v); -} - -std::vector& -data::get_initial() -{ - static std::vector _v{}; - return _v; -} - -bool -data::setup() -{ - perfetto_counter_track::init(); - amd_smi::set_state(State::PreInit); - return true; -} - -bool -data::shutdown() -{ - amd_smi::set_state(State::Finalized); - return true; -} - -#define GPU_METRIC(COMPONENT, ...) \ - if constexpr(tim::trait::is_available::value) \ - { \ - auto* _val = _v.get(); \ - if(_val) \ - { \ - _val->set_value(itr.__VA_ARGS__); \ - _val->set_accum(itr.__VA_ARGS__); \ - } \ - } - -void -data::post_process(uint32_t _dev_id) -{ - using component::sampling_gpu_busy_gfx; - using component::sampling_gpu_busy_mm; - using component::sampling_gpu_busy_umc; - using component::sampling_gpu_jpeg; - using component::sampling_gpu_memory; - using component::sampling_gpu_power; - using component::sampling_gpu_temp; - using component::sampling_gpu_vcn; - - if(device_count < _dev_id) return; - - auto& _amd_smi_v = sampler_instances::get()->at(_dev_id); - auto _amd_smi = (_amd_smi_v) ? 
*_amd_smi_v : std::deque{}; - const auto& _thread_info = thread_info::get(0, InternalTID); - - LOG_DEBUG("Post-processing {} amd-smi samples from device {}", _amd_smi.size(), - _dev_id); - - if(get_is_continuous_integration() && !_thread_info) - { - throw std::runtime_error("Missing thread info for thread 0"); - return; - } - - if(!_thread_info) return; - - auto _settings = get_settings(_dev_id); - - auto use_perfetto = get_use_perfetto(); - - for(auto& itr : _amd_smi) - { - using counter_track = perfetto_counter_track; - if(itr.m_dev_id != _dev_id) continue; - - uint64_t _ts = itr.m_ts; - if(!_thread_info->is_valid_time(_ts)) continue; - - double _gfxbusy = itr.m_busy_perc.gfx_activity; - double _umcbusy = itr.m_busy_perc.umc_activity; - double _mmbusy = itr.m_busy_perc.mm_activity; - double _temp = itr.m_temp; - double _power = itr.m_power.current_socket_power; - double _usage = itr.m_mem_usage / static_cast(units::megabyte); - - auto setup_perfetto_counter_tracks = [&]() { - if(counter_track::exists(_dev_id)) return; - - auto addendum = [&](const char* _v) { - return fmt::format("GPU {} [{}] (S)", _v, _dev_id); - }; - - auto addendum_blk = [&](std::size_t _i, const char* _metric, - std::size_t xcp_idx = SIZE_MAX) { - if(xcp_idx != SIZE_MAX) - { - return fmt::format("GPU [{}] {} XCP_{}: [{}] (S)", _dev_id, _metric, - xcp_idx, (_i < 10 ? "0" : ""), _i); - } - else - { - return fmt::format("GPU [{}] {} [{}] (S)", _dev_id, _metric, - (_i < 10 ? 
"0" : ""), _i); - } - }; - - if(_settings.busy) - { - counter_track::emplace(_dev_id, addendum("GFX Busy"), "%"); - counter_track::emplace(_dev_id, addendum("UMC Busy"), "%"); - counter_track::emplace(_dev_id, addendum("MM Busy"), "%"); - } - if(_settings.temp) - { - counter_track::emplace(_dev_id, addendum("Temperature"), "deg C"); - } - if(_settings.power) - { - counter_track::emplace(_dev_id, addendum("Current Power"), "watts"); - } - if(_settings.mem_usage) - { - counter_track::emplace(_dev_id, addendum("Memory Usage"), "megabytes"); - } - if(_settings.vcn_activity) - { - if(itr.m_gpu_metrics.empty()) - { - LOG_DEBUG("No VCN activity data collected from device {}", _dev_id); - } - else if(gpu::vcn_is_device_level_only(_dev_id)) - { - // For VCN activity supported: use vcn_activity vector - for(std::size_t i = 0; - i < std::size(itr.m_gpu_metrics[0].vcn_activity); ++i) - counter_track::emplace(_dev_id, addendum_blk(i, "VCN Activity"), - "%"); - } - else - { - // For VCN activity NOT supported: use vcn_busy vector with per-XCP - // organization - for(size_t xcp = 0; xcp < itr.m_gpu_metrics[0].vcn_busy.size(); ++xcp) - { - // Loop through each XCP's VCN busy values - for(size_t i = 0; i < itr.m_gpu_metrics[0].vcn_busy[xcp].size(); - ++i) - { - counter_track::emplace( - _dev_id, addendum_blk(i, "VCN Activity", xcp), "%"); - } - } - } - } - if(_settings.jpeg_activity) - { - if(itr.m_gpu_metrics.empty()) - { - LOG_DEBUG("No JPEG activity data collected from device {}", _dev_id); - } - else if(gpu::jpeg_is_device_level_only(_dev_id)) - { - // For JPEG activity supported: use jpeg_activity vector - for(std::size_t i = 0; - i < std::size(itr.m_gpu_metrics[0].jpeg_activity); ++i) - counter_track::emplace(_dev_id, addendum_blk(i, "JPEG Activity"), - "%"); - } - else - { - // For JPEG activity NOT supported: use jpeg_busy vector with per-XCP - // organization - for(size_t xcp = 0; xcp < itr.m_gpu_metrics[0].jpeg_busy.size(); - ++xcp) - { - // Loop through each XCP's JPEG 
busy values - for(size_t i = 0; i < itr.m_gpu_metrics[0].jpeg_busy[xcp].size(); - ++i) - { - counter_track::emplace( - _dev_id, addendum_blk(i, "JPEG Activity", xcp), "%"); - } - } - } - } - if(_settings.xgmi) - { - if(itr.m_gpu_metrics.empty()) - { - LOG_DEBUG("No XGMI activity data collected from device {}", _dev_id); - } - else - { - counter_track::emplace(_dev_id, addendum("XGMI Link Width"), "bits"); - counter_track::emplace(_dev_id, addendum("XGMI Link Speed"), "GT/s"); - for(std::size_t i = 0; - i < std::size(itr.m_gpu_metrics[0].xgmi_read_data_acc); ++i) - counter_track::emplace(_dev_id, addendum_blk(i, "XGMI Read Data"), - "KB"); - for(std::size_t i = 0; - i < std::size(itr.m_gpu_metrics[0].xgmi_write_data_acc); ++i) - counter_track::emplace(_dev_id, - addendum_blk(i, "XGMI Write Data"), "KB"); - } - } - if(_settings.pcie) - { - if(itr.m_gpu_metrics.empty()) - { - LOG_DEBUG("No PCIe activity data collected from device {}", _dev_id); - } - else - { - counter_track::emplace(_dev_id, addendum("PCIe Link Width"), ""); - counter_track::emplace(_dev_id, addendum("PCIe Link Speed"), "GT/s"); - counter_track::emplace(_dev_id, addendum("PCIe Bandwidth Acc"), "MB"); - counter_track::emplace(_dev_id, addendum("PCIe Bandwidth Inst"), - "MB/s"); - } - } - if(_settings.sdma_usage) - { - counter_track::emplace(_dev_id, addendum("SDMA Usage"), "%"); - } - }; - - auto write_perfetto_metrics = [&]() { - size_t track_index = 0; - - if(_settings.busy) - { - TRACE_COUNTER("device_busy_gfx", - counter_track::at(_dev_id, track_index++), _ts, _gfxbusy); - TRACE_COUNTER("device_busy_umc", - counter_track::at(_dev_id, track_index++), _ts, _umcbusy); - TRACE_COUNTER("device_busy_mm", counter_track::at(_dev_id, track_index++), - _ts, _mmbusy); - } - if(_settings.temp) - { - TRACE_COUNTER("device_temp", counter_track::at(_dev_id, track_index++), - _ts, _temp); - } - if(_settings.power) - { - TRACE_COUNTER("device_power", counter_track::at(_dev_id, track_index++), - _ts, _power); - } - 
if(_settings.mem_usage) - { - TRACE_COUNTER("device_memory_usage", - counter_track::at(_dev_id, track_index++), _ts, _usage); - } - - if(_settings.vcn_activity && !itr.m_gpu_metrics.empty()) - { - if(gpu::vcn_is_device_level_only(_dev_id)) - { - // Device-level VCN activity - for(const auto& vcn_val : itr.m_gpu_metrics[0].vcn_activity) - { - TRACE_COUNTER("device_vcn_activity", - counter_track::at(_dev_id, track_index++), _ts, - vcn_val); - } - } - else - { - // XCP-level VCN busy (per-XCP organization) - for(const auto& xcp_data : itr.m_gpu_metrics[0].vcn_busy) - { - for(const auto& vcn_val : xcp_data) - { - TRACE_COUNTER("device_vcn_activity", - counter_track::at(_dev_id, track_index++), _ts, - vcn_val); - } - } - } - } - - if(_settings.jpeg_activity && !itr.m_gpu_metrics.empty()) - { - if(gpu::jpeg_is_device_level_only(_dev_id)) - { - // Device-level JPEG activity - for(const auto& jpeg_val : itr.m_gpu_metrics[0].jpeg_activity) - { - TRACE_COUNTER("device_jpeg_activity", - counter_track::at(_dev_id, track_index++), _ts, - jpeg_val); - } - } - else - { - // XCP-level JPEG busy (per-XCP organization) - for(const auto& xcp_data : itr.m_gpu_metrics[0].jpeg_busy) - { - for(const auto& jpeg_val : xcp_data) - { - TRACE_COUNTER("device_jpeg_activity", - counter_track::at(_dev_id, track_index++), _ts, - jpeg_val); - } - } - } - } - - if(_settings.xgmi && !itr.m_gpu_metrics.empty()) - { - TRACE_COUNTER("device_xgmi_link_width", - counter_track::at(_dev_id, track_index++), _ts, - itr.m_gpu_metrics[0].xgmi_link_width); - TRACE_COUNTER("device_xgmi_link_speed", - counter_track::at(_dev_id, track_index++), _ts, - itr.m_gpu_metrics[0].xgmi_link_speed); - for(const auto& read_val : itr.m_gpu_metrics[0].xgmi_read_data_acc) - { - TRACE_COUNTER("device_xgmi_read_data", - counter_track::at(_dev_id, track_index++), _ts, - read_val); - } - - for(const auto& write_val : itr.m_gpu_metrics[0].xgmi_write_data_acc) - { - TRACE_COUNTER("device_xgmi_write_data", - counter_track::at(_dev_id, 
track_index++), _ts, - write_val); - } - } - - if(_settings.pcie && !itr.m_gpu_metrics.empty()) - { - TRACE_COUNTER("device_pcie_link_width", - counter_track::at(_dev_id, track_index++), _ts, - itr.m_gpu_metrics[0].pcie_link_width); - TRACE_COUNTER("device_pcie_link_speed", - counter_track::at(_dev_id, track_index++), _ts, - itr.m_gpu_metrics[0].pcie_link_speed); - TRACE_COUNTER("device_pcie_bandwidth_acc", - counter_track::at(_dev_id, track_index++), _ts, - itr.m_gpu_metrics[0].pcie_bandwidth_acc); - TRACE_COUNTER("device_pcie_bandwidth_inst", - counter_track::at(_dev_id, track_index++), _ts, - itr.m_gpu_metrics[0].pcie_bandwidth_inst); - } - - if(_settings.sdma_usage) - { - TRACE_COUNTER("device_sdma_usage", - counter_track::at(_dev_id, track_index++), _ts, - static_cast(itr.m_sdma_usage)); - } - }; - - if(use_perfetto) - { - setup_perfetto_counter_tracks(); - write_perfetto_metrics(); - } - } -} - -//--------------------------------------------------------------------------------------// - -void -setup() -{ - auto_lock_t _lk{ type_mutex() }; - - if(is_initialized() || !get_use_amd_smi()) return; - - ROCPROFSYS_SCOPED_SAMPLING_ON_CHILD_THREADS(false); - - if(!gpu::initialize_amdsmi()) - { - LOG_WARNING("AMD SMI is not available. 
Disabling AMD SMI sampling..."); - return; - } - - amdsmi_version_t _version = get_version(); - LOG_INFO("AMD SMI version: {} - str: {}.", _version.major, _version.minor, - _version.release, _version.build); - - data::device_count = gpu::device_count(); - - auto _devices_v = get_sampling_gpus(); - for(auto& itr : _devices_v) - itr = tolower(itr); - if(_devices_v == "off") - _devices_v = "none"; - else if(_devices_v == "on") - _devices_v = "all"; - bool _all_devices = _devices_v.find("all") != std::string::npos || _devices_v.empty(); - bool _no_devices = _devices_v.find("none") != std::string::npos; - - std::set _devices = {}; - auto _emplace = [&_devices](auto idx) { - if(idx < data::device_count) _devices.emplace(idx); - }; - - if(_all_devices) - { - for(uint32_t i = 0; i < data::device_count; ++i) - _emplace(i); - } - else if(!_no_devices) - { - auto _enabled = tim::delimit(_devices_v, ",; \t"); - for(auto&& itr : _enabled) - { - if(itr.find_first_not_of("0123456789-") != std::string::npos) - { - throw std::runtime_error( - fmt::format("Invalid GPU specification: '{}'. Only numerical values " - "(e.g., 0) or ranges (e.g., 0-7) are permitted.", - itr)); - } - - if(itr.find('-') != std::string::npos) - { - auto _v = tim::delimit(itr, "-"); - if(_v.size() != 2) - { - throw std::runtime_error( - fmt::format("Invalid GPU range specification: '{}'. " - "Required format N-M, e.g. 
0-4", - itr)); - } - for(auto i = std::stoul(_v.at(0)); i < std::stoul(_v.at(1)); ++i) - _emplace(i); - } - else - { - _emplace(std::stoul(itr)); - } - } - } - - data::device_list = _devices; - - auto _metrics = get_setting_value("ROCPROFSYS_AMD_SMI_METRICS"); - - try - { - for(auto itr : _devices) - { - // Enable selected metrics only - if((_metrics && !_metrics->empty()) && (*_metrics != "all")) - { - using key_pair_t = std::pair; - const auto supported = std::unordered_map{ - key_pair_t{ "busy", get_settings(itr).busy }, - key_pair_t{ "temp", get_settings(itr).temp }, - key_pair_t{ "power", get_settings(itr).power }, - key_pair_t{ "mem_usage", get_settings(itr).mem_usage }, - key_pair_t{ "vcn_activity", get_settings(itr).vcn_activity }, - key_pair_t{ "jpeg_activity", get_settings(itr).jpeg_activity }, - key_pair_t{ "xgmi", get_settings(itr).xgmi }, - key_pair_t{ "pcie", get_settings(itr).pcie }, - key_pair_t{ "sdma_usage", get_settings(itr).sdma_usage }, - }; - - // Initialize all metrics to false - for(auto& it : supported) - it.second = false; - - // Parse list of metrics enabled by the user - if(*_metrics != "none") - { - for(const auto& metric : tim::delimit(*_metrics, ",;:\t\n ")) - { - auto iitr = supported.find(metric); - if(iitr == supported.end()) - { - LOG_CRITICAL("Unsupported amd-smi metric: {}", metric); - ::rocprofsys::set_state(::rocprofsys ::State ::Finalized); - std::exit(1); - } - LOG_DEBUG("Enabling amd-smi metric '{}' on device [{}]", metric, - itr); - iitr->second = true; - } - } - } - } - - // Log final settings for each device - for(auto itr : _devices) - { - auto& s = get_settings(itr); - LOG_INFO("[AMD-SMI] Device {} settings: busy={}, temp={}, power={}, " - "mem_usage={}, vcn_activity={}, jpeg_activity={}, xgmi={}, " - "pcie={}, sdma_usage={}", - itr, s.busy, s.temp, s.power, s.mem_usage, s.vcn_activity, - s.jpeg_activity, s.xgmi, s.pcie, s.sdma_usage); - } -#ifdef AINIC_SUPPORTED - nic_setup(); -#endif - - is_initialized() = true; - 
data::setup(); - - } catch(std::runtime_error& _e) - { - LOG_WARNING("Exception thrown when initializing amd-smi: {}", _e.what()); - data::device_list = {}; - } -} - -void -shutdown() -{ - auto_lock_t _lk{ type_mutex() }; - - if(!is_initialized()) return; - LOG_DEBUG("Shutting down amd-smi..."); - - try - { - if(data::shutdown()) - { - ROCPROFSYS_AMD_SMI_CALL(amdsmi_shut_down()); - } - } catch(std::runtime_error& _e) - { - LOG_WARNING("Exception thrown when shutting down amd-smi: {}", _e.what()); - } - - is_initialized() = false; -} - -void -post_process() -{ - for(auto itr : data::device_list) - { - LOG_DEBUG("Post-processing amd-smi data for device: {}", itr); - data::post_process(itr); - } - -#ifdef AINIC_SUPPORTED - for(size_t i = 0; i < nic_data::nic_vec.size(); ++i) - { - auto& nic = nic_data::nic_vec.at(i); - LOG_DEBUG("Post-processing ainic data for NIC: {}", nic); - nic_data::post_process(i); - } -#endif -} - -uint32_t -device_count() -{ - return gpu::device_count(); -} - -void -postfork_child_cleanup() -{ - // In child process, disable AMD SMI to prevent shutdown errors - LOG_DEBUG("Disabling AMD SMI in child process after fork..."); - - // Set to Finalized to prevent any sampling attempts (though is_child_process() check - // in sample() already handles this) - get_state().store(State::Finalized); - - // Mark as not initialized so shutdown won't try to cleanup AMD SMI library - is_initialized() = false; - - // Clear device list to prevent any GPU operations - data::device_list.clear(); -} - -void -postfork_parent_reinit() -{ - // In parent process, AMD SMI device handles may be corrupted after fork - // Reinitialize AMD SMI to get fresh handles - LOG_DEBUG("Reinitializing AMD SMI in parent process after fork..."); - - // Shutdown and reinitialize to get fresh device handles - shutdown(); - setup(); -} -} // namespace amd_smi -} // namespace rocprofsys - -ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), - true, double) - 
-ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), - true, double) - -ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), - true, double) - -ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), true, - double) - -ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), true, - double) - -ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), true, - double) - -ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), true, - double) - -ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), true, - double) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.hpp deleted file mode 100644 index da4a149ba4b..00000000000 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.hpp +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright (c) 2018-2025 Advanced Micro Devices, Inc. All Rights Reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// with the Software without restriction, including without limitation the -// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -// sell copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// * Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimers. -// -// * Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimers in the -// documentation and/or other materials provided with the distribution. -// -// * Neither the names of Advanced Micro Devices, Inc. 
nor the names of its -// contributors may be used to endorse or promote products derived from -// this Software without specific prior written permission. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH -// THE SOFTWARE. - -#pragma once - -#include "core/common.hpp" -#include "core/components/fwd.hpp" -#include "core/defines.hpp" -#include "core/gpu_metrics.hpp" -#include "core/state.hpp" -#include "library/amd_smi_ainic.hpp" -#include "library/thread_data.hpp" - -#include "core/amd_smi.hpp" -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ainic_stats.hpp" - -namespace rocprofsys -{ -namespace amd_smi -{ - -std::atomic& -get_state(); - -void -setup(); - -void -config(); - -void -sample(); - -void -shutdown(); - -void -post_process(); - -void set_state(State); - -// Fork handling - cleanup AMD SMI state in child process -void -postfork_child_cleanup(); - -// Fork handling - reinitialize AMD SMI state in parent process -void -postfork_parent_reinit(); - -struct settings -{ - bool busy = true; - bool temp = true; - bool power = true; - bool mem_usage = true; - bool vcn_activity = true; - bool jpeg_activity = true; - bool xgmi = true; - bool pcie = true; - bool sdma_usage = true; -}; - -struct data -{ - using msec_t = std::chrono::milliseconds; - using usec_t = std::chrono::microseconds; - using nsec_t = std::chrono::nanoseconds; - using promise_t = std::promise; - - using timestamp_t = int64_t; - using power_t = uint32_t; - using busy_perc_t = uint32_t; - using 
mem_usage_t = uint64_t; - using temp_t = int64_t; - - // Use the shared gpu_metrics_t from core/gpu_metrics.hpp - using gpu_metrics_t = rocprofsys::gpu::gpu_metrics_t; - - ROCPROFSYS_DEFAULT_OBJECT(data) - - explicit data(uint32_t _dev_id); - - void sample(uint32_t _dev_id); - void print(std::ostream& _os) const; - - static void post_process(uint32_t _dev_id); - - uint32_t m_dev_id = std::numeric_limits::max(); - timestamp_t m_ts = 0; - temp_t m_temp = 0; - mem_usage_t m_mem_usage = 0; - uint32_t m_sdma_usage = 0; // SDMA utilization percentage (0-100) - std::vector m_gpu_metrics = {}; - amdsmi_engine_usage_t m_busy_perc = {}; - amdsmi_power_info_t m_power = {}; - - friend std::ostream& operator<<(std::ostream& _os, const data& _v) - { - _v.print(_os); - return _os; - } - -private: - friend void rocprofsys::amd_smi::setup(); - friend void rocprofsys::amd_smi::config(); - friend void rocprofsys::amd_smi::sample(); - friend void rocprofsys::amd_smi::shutdown(); - friend void rocprofsys::amd_smi::post_process(); - friend void rocprofsys::amd_smi::postfork_child_cleanup(); - - static size_t device_count; - static std::set device_list; - static std::unique_ptr polling_finished; - static std::vector& get_initial(); - static std::unique_ptr& get_thread(); - static bool setup(); - static bool shutdown(); -}; - -} // namespace amd_smi -} // namespace rocprofsys - -#if !defined(ROCPROFSYS_EXTERN_COMPONENTS) || \ - (defined(ROCPROFSYS_EXTERN_COMPONENTS) && ROCPROFSYS_EXTERN_COMPONENTS > 0) - -# include -# include -# include - -ROCPROFSYS_DECLARE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), - true, double) - -ROCPROFSYS_DECLARE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), - true, double) - -ROCPROFSYS_DECLARE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), - true, double) - -ROCPROFSYS_DECLARE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), true, - double) - -ROCPROFSYS_DECLARE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), true, - double) - 
-ROCPROFSYS_DECLARE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), true, - double) - -ROCPROFSYS_DECLARE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), true, - double) - -ROCPROFSYS_DECLARE_EXTERN_COMPONENT( - TIMEMORY_ESC(data_tracker), true, - double) - -#endif diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi_ainic.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi_ainic.cpp deleted file mode 100644 index 81356c0f0b4..00000000000 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi_ainic.cpp +++ /dev/null @@ -1,332 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc. -// SPDX-License-Identifier: MIT - -#include "library/amd_smi_ainic.hpp" - -#include "core/agent_manager.hpp" -#include "core/trace_cache/cache_manager.hpp" -#include "core/trace_cache/cacheable.hpp" -#include "core/trace_cache/sample_type.hpp" -#include -#if defined(NDEBUG) -# undef NDEBUG -#endif - -#include "core/common.hpp" -#include "core/components/fwd.hpp" -#include "core/config.hpp" -#include "core/gpu.hpp" -#include "core/gpu_metrics.hpp" -#include "core/node_info.hpp" -#include "core/perfetto.hpp" -#include "core/state.hpp" -#include "core/trace_cache/metadata_registry.hpp" -#include "library/amd_smi.hpp" -#include "library/runtime.hpp" -#include "library/thread_info.hpp" - -#include -#include -#include -#include -#include -#include - -#include "logger/debug.hpp" - -#include -#include -#include -#include -#include -#include -#include - -namespace rocprofsys -{ -namespace amd_smi -{ - -using nic_bundle_t = std::deque; -using nic_sampler_instances = thread_data; -static std::vector nic_sampler_vec = {}; -std::vector nic_data::nic_vec = {}; -ai_nic_stats_collector nic_data::nic_stats_collector; - -namespace -{ -std::vector*> _nic_bundle_data{}; -} // namespace - -nic_data::nic_data(uint32_t nic_index, const std::string& nic) -: _nic(nic) -, _nic_index(nic_index) -{ - sample(); -} - -void -nic_data::sample() -{ - 
nic_stats stats; - nic_data::nic_stats_collector.get_data(_nic, stats); - _rx_rdma_cnp_pkts = stats._rx_rdma_cnp_pkts; - _tx_rdma_cnp_pkts = stats._tx_rdma_cnp_pkts; - _rx_ucast_bytes = stats._rx_rdma_ucast_bytes; - _tx_ucast_bytes = stats._tx_rdma_ucast_bytes; - _rx_ucast_pkts = stats._rx_rdma_ucast_pkts; - _tx_ucast_pkts = stats._tx_rdma_ucast_pkts; - - auto _timestamp = tim::get_clock_real_now(); - assert(_timestamp < std::numeric_limits::max()); - m_ts = _timestamp; - - trace_cache::get_buffer_storage().store(trace_cache::ainic_sample{ - _timestamp, _nic_index, _rx_rdma_cnp_pkts, _tx_rdma_cnp_pkts, _rx_ucast_bytes, - _tx_ucast_bytes, _rx_ucast_pkts, _tx_ucast_pkts }); -} - -bool -nic_data::setup() -{ - perfetto_counter_track::init(); - return true; -} - -void -metadata_initialize_ainic_smi_tracks(uint32_t nic_index) -{ - const auto thread_id = std::nullopt; - std::string& nic = nic_data::nic_vec[nic_index]; - - trace_cache::get_metadata_registry().add_track( - { trace_cache::info::annotate_with_nic( - nic, nic_index), - thread_id, "{}" }); - trace_cache::get_metadata_registry().add_track( - { trace_cache::info::annotate_with_nic( - nic, nic_index), - thread_id, "{}" }); - trace_cache::get_metadata_registry().add_track( - { trace_cache::info::annotate_with_nic( - nic, nic_index), - thread_id, "{}" }); - trace_cache::get_metadata_registry().add_track( - { trace_cache::info::annotate_with_nic( - nic, nic_index), - thread_id, "{}" }); - trace_cache::get_metadata_registry().add_track( - { trace_cache::info::annotate_with_nic( - nic, nic_index), - thread_id, "{}" }); - trace_cache::get_metadata_registry().add_track( - { trace_cache::info::annotate_with_nic( - nic, nic_index), - thread_id, "{}" }); -} - -void -metadata_initialize_ainic_smi_pmc(uint32_t nic_index) -{ - size_t EVENT_CODE = 0; - size_t INSTANCE_ID = 0; - const char* LONG_DESCRIPTION = ""; - const char* COMPONENT = ""; - const char* BLOCK = ""; - const char* EXPRESSION = ""; - auto ni = 
node_info::get_instance(); - const char* TARGET_ARCH = ""; - - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::NIC, nic_index, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - trait::name::value, "NIC RX CNP PKTS", - trait::name::description, LONG_DESCRIPTION, - COMPONENT, trace_cache::ABSOLUTE, rocprofsys::trace_cache::ABSOLUTE, BLOCK, - EXPRESSION, 0, 0, "{}" }); - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::NIC, nic_index, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - trait::name::value, "NIC TX CNP PKTS", - trait::name::description, LONG_DESCRIPTION, - COMPONENT, trace_cache::ABSOLUTE, rocprofsys::trace_cache::ABSOLUTE, BLOCK, - EXPRESSION, 0, 0, "{}" }); - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::NIC, nic_index, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - trait::name::value, - "AI NIC RX UCAST BYTES", - trait::name::description, - LONG_DESCRIPTION, COMPONENT, trace_cache::ABSOLUTE, - rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0, 0, "{}" }); - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::NIC, nic_index, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - trait::name::value, - "AI NIC TX UCAST BYTES", - trait::name::description, - LONG_DESCRIPTION, COMPONENT, trace_cache::ABSOLUTE, - rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0, 0, "{}" }); - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::NIC, nic_index, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - trait::name::value, "AI NIC RX UCAST PKTS", - trait::name::description, LONG_DESCRIPTION, - COMPONENT, trace_cache::ABSOLUTE, rocprofsys::trace_cache::ABSOLUTE, BLOCK, - EXPRESSION, 0, 0, "{}" }); - trace_cache::get_metadata_registry().add_pmc_info( - { agent_type::NIC, nic_index, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - trait::name::value, "AI NIC TX UCAST PKTS", - trait::name::description, LONG_DESCRIPTION, - COMPONENT, trace_cache::ABSOLUTE, rocprofsys::trace_cache::ABSOLUTE, BLOCK, - EXPRESSION, 0, 0, "{}" }); -} - -void 
-nic_config() -{ - for(uint32_t nic_index = 0; nic_index < nic_data::nic_vec.size(); ++nic_index) - { - auto nic_bundle = std::deque{}; - nic_sampler_vec.push_back(nic_bundle); - metadata_initialize_ainic_smi_tracks(nic_index); - metadata_initialize_ainic_smi_pmc(nic_index); - } -} -void -nic_sample() -{ - if(amd_smi::get_state() != State::Active) return; - - // Get AI NIC data for all NICs at once. - nic_data::nic_stats_collector.update_stats(); - - for(uint32_t nic_index = 0; nic_index < nic_data::nic_vec.size(); ++nic_index) - { - std::string& nic = nic_data::nic_vec[nic_index]; - auto data = nic_data{ nic_index, nic }; - nic_sampler_vec[nic_index].push_back(data); - } -} - -void -nic_data::post_process(size_t nic_index) -{ - using counter_track = perfetto_counter_track; - std::string& nic = nic_data::nic_vec[nic_index]; - - const auto& _thread_info = thread_info::get(0, InternalTID); - if(!_thread_info) - { - if(get_is_continuous_integration()) - { - throw std::runtime_error("Missing thread info for thread 0"); - } - LOG_ERROR("Missing thread info for thread 0"); - return; - } - - auto addendum = [&](const char* _v) { - return fmt::format("{} {} [ {} ] (S)", nic, _v, nic_index); - }; - - for(auto& itr : nic_sampler_vec[nic_index]) - { - uint64_t _ts = itr.m_ts; - if(!_thread_info->is_valid_time(_ts)) continue; - - uint64_t _rx_rdma_cnp_pkts = itr._rx_rdma_cnp_pkts; - uint64_t _tx_rdma_cnp_pkts = itr._tx_rdma_cnp_pkts; - uint64_t _rx_ucast_bytes = itr._rx_ucast_bytes; - uint64_t _tx_ucast_bytes = itr._tx_ucast_bytes; - uint64_t _rx_ucast_pkts = itr._rx_ucast_pkts; - uint64_t _tx_ucast_pkts = itr._tx_ucast_pkts; - - counter_track::emplace(nic_index, addendum("RX CNP PKTS"), "packets"); - counter_track::emplace(nic_index, addendum("TX CNP PKTS"), "packets"); - counter_track::emplace(nic_index, addendum("RX UCAST BYTES"), "bytes"); - counter_track::emplace(nic_index, addendum("TX UCAST BYTES"), "bytes"); - counter_track::emplace(nic_index, addendum("RX UCAST 
PKTS"), "packets"); - counter_track::emplace(nic_index, addendum("TX UCAST PKTS"), "packets"); - - size_t track_index = 0; - - TRACE_COUNTER("nic_rx_cnp_pkts", counter_track::at(nic_index, track_index++), _ts, - _rx_rdma_cnp_pkts); - TRACE_COUNTER("nic_tx_cnp_pkts", counter_track::at(nic_index, track_index++), _ts, - _tx_rdma_cnp_pkts); - TRACE_COUNTER("nic_rx_ucast_bytes", counter_track::at(nic_index, track_index++), - _ts, _rx_ucast_bytes); - TRACE_COUNTER("nic_tx_ucast_bytes", counter_track::at(nic_index, track_index++), - _ts, _tx_ucast_bytes); - TRACE_COUNTER("nic_rx_ucast_pkts", counter_track::at(nic_index, track_index++), - _ts, _rx_ucast_pkts); - TRACE_COUNTER("nic_tx_ucast_pkts", counter_track::at(nic_index, track_index++), - _ts, _tx_ucast_pkts); - } -} - -void -nic_setup() -{ - // Run update_stats() the first time, to get the names of all existing NICs. - nic_data::nic_stats_collector.update_stats(); - - auto ainic_devices = get_sampling_ainics(); - - std::string devices_lowercase = ainic_devices; - for(auto& itr : devices_lowercase) - itr = std::tolower(itr); - - if(devices_lowercase == "all") - { - // Set nic_vec to all devices. - nic_data::nic_vec = nic_data::nic_stats_collector.get_nic_list(); - } - else if(devices_lowercase == "none") - { - // Set nic_vec to an empty vector. - nic_data::nic_vec = {}; - } - else - { - // Get list of devices from the command line and add those that are - // valid to nic_vec. 
- nic_data::nic_vec = {}; - auto nic_list = tim::delimit(ainic_devices, ","); - std::unordered_set nic_set{}; // For detecting duplicates - for(const auto& nic : nic_list) - { - if(!nic_data::nic_stats_collector.is_nic_valid(nic)) - { - LOG_WARNING("Invalid NIC: {}", nic); - } - else if(nic_set.find(nic) != nic_set.end()) - { - LOG_WARNING("Repeated NIC: {}", nic); - } - else - { - nic_data::nic_vec.push_back(nic); - nic_set.insert(nic); - } - } - } - - for(auto nic_index{ 0u }; nic_index < nic_data::nic_vec.size(); ++nic_index) - { - std::string& nic = nic_data::nic_vec[nic_index]; - auto cur_agent = agent{ agent_type::NIC, - 0, - nic_index, - nic_index, - static_cast(nic_index), - static_cast(nic_index), - nic, - nic, - "AI NIC", - "AI NIC" }; - get_agent_manager_instance().insert_agent(cur_agent); - } - - nic_data::setup(); -} - -} // namespace amd_smi - -} // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi_ainic.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi_ainic.hpp deleted file mode 100644 index b9a45ed6586..00000000000 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi_ainic.hpp +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc. 
-// SPDX-License-Identifier: MIT - -#pragma once - -#include "core/common.hpp" -#include "core/components/fwd.hpp" -#include "core/defines.hpp" -#include "core/gpu_metrics.hpp" -#include "core/state.hpp" -#include "library/thread_data.hpp" - -#include "core/amd_smi.hpp" -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "library/ainic_stats.hpp" - -namespace rocprofsys -{ -namespace amd_smi -{ - -void -nic_setup(); - -void -nic_config(); - -void -nic_sample(); - -struct nic_data -{ - using timestamp_t = int64_t; - - explicit nic_data(uint32_t nic_index, const std::string& nic); - - static std::vector& get_initial(); - static std::vector nic_vec; - const std::string& get_nic() const; - static bool setup(); - - void sample(); - - static void post_process(size_t nic_index); - - static ai_nic_stats_collector nic_stats_collector; - - timestamp_t m_ts = 0; - -private: - std::string _nic; - uint32_t _nic_index = 0; - uint64_t _rx_rdma_cnp_pkts = 0; - uint64_t _tx_rdma_cnp_pkts = 0; - uint64_t _rx_ucast_bytes = 0; - uint64_t _tx_ucast_bytes = 0; - uint64_t _rx_ucast_pkts = 0; - uint64_t _tx_ucast_pkts = 0; -}; - -} // namespace amd_smi - -} // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/cpu_freq.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/cpu_freq.cpp index 4c590cdcdcd..7a98eb4efed 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/cpu_freq.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/cpu_freq.cpp @@ -94,8 +94,7 @@ metadata_initialize_cpu_freq_tracks() { do_for_enabled_cpus([&](size_t cpu_id) { trace_cache::get_metadata_registry().add_track( - { trace_cache::info::annotate_with_device_id(cpu_id) - .c_str(), + { trace_cache::info::format_track_name(cpu_id).c_str(), std::nullopt, "{}" }); }); } @@ -137,8 +136,7 @@ metadata_initialize_cpu_freq_pmc(size_t dev_id) 
do_for_enabled_cpus([&](size_t cpu_id) { trace_cache::get_metadata_registry().add_pmc_info( { agent_type::CPU, dev_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, - trace_cache::info::annotate_with_device_id(cpu_id) - .c_str(), + trace_cache::info::format_track_name(cpu_id).c_str(), "Frequency", trait::name::description, LONG_DESCRIPTION, COMPONENT, component::cpu_freq::display_unit().c_str(), rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0, 0 }); diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/CMakeLists.txt new file mode 100644 index 00000000000..3c05ec7decf --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/CMakeLists.txt @@ -0,0 +1,43 @@ +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + +# PMC (Performance Metric Counter) Sampler Library + +# Add subdirectories +add_subdirectory(common) +add_subdirectory(device_providers) +add_subdirectory(collectors) + +# Sampler API sources +set(pmc_sampler_sources + ${CMAKE_CURRENT_LIST_DIR}/sampler.hpp + ${CMAKE_CURRENT_LIST_DIR}/sampler.cpp +) + +# Collect all PMC sources +set(pmc_sources + ${pmc_sampler_sources} + ${pmc_common_types_sources} + ${pmc_device_provider_sources} + ${pmc_collectors_sources} +) + +# Create PMC library object +add_library(rocprofiler-systems-pmc-library OBJECT ${pmc_sources}) + +# Add AINIC support definition if available +# PUBLIC so that tests linking to this library also get the definition +if(ROCPROFSYS_BUILD_AINIC) + target_compile_definitions( + rocprofiler-systems-pmc-library + PUBLIC ROCPROFSYS_BUILD_AINIC=1 + ) +endif() + +target_link_libraries( + rocprofiler-systems-pmc-library + PRIVATE + rocprofiler-systems-core-library + rocprofiler-systems::rocprofiler-systems-interface-library + rocprofiler-systems::rocprofiler-systems-rocm +) diff --git 
a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/CMakeLists.txt new file mode 100644 index 00000000000..2aff3ebfe9d --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/CMakeLists.txt @@ -0,0 +1,30 @@ +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + +# PMC Collectors + +# Add subdirectories +add_subdirectory(common) +add_subdirectory(base) +add_subdirectory(gpu) + +# Conditionally compile NIC collector based on AINIC support +# ROCPROFSYS_BUILD_AINIC is set in cmake/Packages.cmake based on: +# - ROCPROFSYS_USE_AINIC option being ON +# - AMD SMI version >= 26.3 +if(ROCPROFSYS_BUILD_AINIC) + add_subdirectory(nic) +else() + set(pmc_nic_sources "") +endif() + +# Collect all sources +set(pmc_collectors_sources + ${pmc_common_sources} + ${pmc_base_sources} + ${pmc_gpu_sources} + ${pmc_nic_sources} +) + +# Add to parent variable +set(pmc_collectors_sources ${pmc_collectors_sources} PARENT_SCOPE) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/base/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/base/CMakeLists.txt new file mode 100644 index 00000000000..c304b7c2215 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/base/CMakeLists.txt @@ -0,0 +1,11 @@ +# Copyright (c) Advanced Micro Devices, Inc. 
+# SPDX-License-Identifier: MIT + +# PMC Base Collector Sources +set(pmc_base_sources + ${CMAKE_CURRENT_LIST_DIR}/collector.hpp + ${CMAKE_CURRENT_LIST_DIR}/traits_check.hpp +) + +# Add to parent variable (will be used by collectors CMakeLists.txt) +set(pmc_base_sources ${pmc_base_sources} PARENT_SCOPE) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/base/collector.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/base/collector.hpp new file mode 100644 index 00000000000..382917cf448 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/base/collector.hpp @@ -0,0 +1,223 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "library/pmc/collectors/base/traits_check.hpp" +#include "logger/debug.hpp" + +#include +#include +#include +#include +#include + +namespace rocprofsys::pmc::collectors::base +{ + +/** + * @brief Generic collector template for device performance monitoring. + * + * This collector provides a unified implementation for GPU, NIC, and CPU metrics + * collection. Device-specific behavior is configured via the Traits template parameter. 
+ * + * @tparam Traits Device-specific traits defining types and customization points + * @tparam DeviceProvider Type providing device enumeration and management + * @tparam Config Configuration policy providing settings and output policies + */ +template +struct collector +{ + // Validate traits at compile time + static_assert(has_required_types_v, + "Invalid traits: missing required type aliases"); + static_assert(has_device_name_v, "Traits must define: device_name"); + static_assert(has_enumerate_devices_v, + "Traits must define: enumerate_devices() and device_entry type"); + + // Type aliases from traits + using traits_t = Traits; + using metrics_t = typename Traits::metrics_t; + using enabled_metrics_t = typename Traits::enabled_metrics_t; + using device_t = typename Traits::device_t; + using device_ptr_t = typename Traits::device_ptr_t; + using container_t = typename Traits::container_t; + using driver_t = typename Traits::driver_t; + + // Type aliases from config + using device_provider = DeviceProvider; + using SettingsApi = typename Config::SettingsApi; + using PerfettoApi = typename Config::PerfettoApi; + using CacheApi = typename Config::CacheApi; + + // Device entry type from traits (contains device + cached supported metrics) + using device_entry = typename Traits::device_entry; + using device_entries_t = std::vector; + + /** + * @brief Construct a collector with an injected device provider. + * + * @param provider Shared pointer to the device provider instance + */ + explicit collector(std::shared_ptr provider) + : m_device_provider(std::move(provider)) + {} + + collector() = delete; + + /** + * @brief Initialize the collector and enumerate devices. + * + * Retrieves version information (for GPU), enumerates devices based on filter + * settings, and initializes Perfetto storage if legacy metrics are enabled. + * + * @throws std::runtime_error If device provider is not set. 
+ */ + void setup() + { + if(!m_device_provider) + { + throw std::runtime_error( + "Device provider not set. Use constructor or set_device_provider()."); + } + + m_device_entries = + Traits::template enumerate_devices(m_device_provider); + m_enabled_metrics = Traits::template get_enabled_metrics(); + + LOG_INFO("Enabled {} {} devices for PMC sampling", m_device_entries.size(), + Traits::device_name); + + if(SettingsApi::get_use_perfetto_legacy_metrics()) + { + Traits::template init_perfetto_storage(m_device_entries); + } + } + + /** + * @brief Configure metrics tracking and initialize metadata. + * + * Sets up category metadata, Perfetto counter tracks, and PMC tracks/metadata + * for all enabled devices. + */ + void config() + { + CacheApi::initialize_category_metadata(); + CacheApi::initialize_tracks_metadata(); + + for(const auto& entry : m_device_entries) + { + if(SettingsApi::get_use_perfetto_legacy_metrics()) + { + Traits::template setup_counter_tracks(entry.device, + m_enabled_metrics); + } + Traits::template init_pmc_metadata(entry.device); + } + } + + /** + * @brief Sample metrics from all enabled devices. + * + * Iterates through all devices, retrieves current metrics, and stores them + * via the cache API and optionally Perfetto. Devices that fail to read metrics + * are automatically disabled and removed from the device list. + * + * @param timestamp Current timestamp in nanoseconds for the sample. 
+ */ + void sample(int64_t timestamp) + { + auto new_end = std::remove_if( + m_device_entries.begin(), m_device_entries.end(), + [this, timestamp](const device_entry& entry) { + auto _timestamp = static_cast(timestamp); + + try + { + auto _metrics = + Traits::get_metrics(entry.device, m_enabled_metrics, _timestamp); + auto _device_id = entry.device->get_index(); + auto _device_name = entry.device->get_name(); + + CacheApi::store_sample(_device_id, _device_name, m_enabled_metrics, + entry.supported_metrics, _metrics, _timestamp); + + if(SettingsApi::get_use_perfetto_legacy_metrics()) + { + PerfettoApi::store_sample(_device_id, _metrics, _timestamp); + } + return false; // Keep device + } catch(const std::runtime_error& e) + { + LOG_ERROR("Reading metrics failed for {} device {}. Error: {}. " + "Disabling device!", + Traits::device_name, entry.device->get_index(), e.what()); + return true; // Remove device + } + }); + m_device_entries.erase(new_end, m_device_entries.end()); + } + + /** + * @brief Perform post-processing of collected metrics. + * + * Triggers Perfetto post-processing if legacy metrics mode is enabled. + */ + void post_process() + { + if(SettingsApi::get_use_perfetto_legacy_metrics()) + { + Traits::template post_process_perfetto(m_device_entries, + m_enabled_metrics); + } + } + + /** + * @brief Get the device entries (devices with cached supported metrics). + * @return Const reference to the vector of device entries. + */ + const device_entries_t& get_device_entries() const noexcept + { + return m_device_entries; + } + + /** + * @brief Get the number of enabled devices. + * @return Number of devices currently enabled for sampling. + */ + size_t get_device_count() const noexcept { return m_device_entries.size(); } + + /** + * @brief Set the device provider (for backward compatibility). 
+ * + * @param provider Shared pointer to the device provider instance + */ + void set_device_provider(std::shared_ptr provider) + { + m_device_provider = std::move(provider); + } + + /** + * @brief Shutdown the device provider and release resources. + * + * @note This method does NOT clear m_device_entries because post_process() + * may be called after shutdown() and needs access to device information. + * The device entries are cleared in pmc::post_process() after post_process + * is called on all collectors. + */ + void shutdown() + { + if(m_device_provider) + { + m_device_provider->shutdown(); + m_device_provider.reset(); + } + } + +private: + device_entries_t m_device_entries; ///< Devices with cached supported metrics + std::shared_ptr m_device_provider; ///< Device provider instance + enabled_metrics_t m_enabled_metrics; ///< Enabled metrics +}; + +} // namespace rocprofsys::pmc::collectors::base diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/base/traits_check.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/base/traits_check.hpp new file mode 100644 index 00000000000..a1eed009f99 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/base/traits_check.hpp @@ -0,0 +1,87 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include + +namespace rocprofsys::pmc::collectors::base +{ + +/** + * @brief Type trait to check if Traits defines all required type aliases. 
+ * + * Required types: + * - metrics_t: The metrics data structure for this device type + * - enabled_metrics_t: Bitset/struct indicating which metrics are enabled + * - device_t: The device class type + * - device_ptr_t: Smart pointer type for device (typically shared_ptr) + * - container_t: Container type for storing devices (vector or set) + */ +template +struct has_required_types : std::false_type +{}; + +template +struct has_required_types< + Traits, std::void_t> : std::true_type +{}; + +template +inline constexpr bool has_required_types_v = has_required_types::value; + +/** + * @brief Type trait to check if Traits defines the device_name constant. + */ +template +struct has_device_name : std::false_type +{}; + +template +struct has_device_name> +: std::true_type +{}; + +template +inline constexpr bool has_device_name_v = has_device_name::value; + +/** + * @brief Type trait to check if Traits defines enumerate_devices(). + * + * Expected signature: + * template + * static std::vector enumerate_devices(std::shared_ptr) + * + * Since enumerate_devices is a template function, we cannot use + * &Traits::enumerate_devices to detect it (the compiler cannot resolve which + * instantiation to take the address of). Instead, we use SFINAE with dummy types to check + * if a valid instantiation exists. 
// Copyright (c) Advanced Micro Devices, Inc.
// SPDX-License-Identifier: MIT

#pragma once

#include <cstdint>
#include <type_traits>

namespace rocprofsys::pmc::collectors
{

/**
 * @brief Type-erased collector slice - non-owning view of any collector type.
 *
 * This class provides a lightweight type erasure mechanism for PMC collectors.
 * It allows storing heterogeneous collector types (GPU, NIC, CPU) in a single
 * container without requiring virtual inheritance or a common base class.
 *
 * The collector_slice is a non-owning view (like std::string_view or
 * std::span). The actual collector object must outlive the slice.
 *
 * Any type T can be wrapped in a collector_slice as long as it provides the
 * required interface methods: setup(), config(), sample(timestamp),
 * post_process(), shutdown()
 *
 * Copying a slice copies the view: the copy refers to the same collector as
 * the source slice, never to the source slice itself.
 *
 * Example usage:
 * @code
 *   pmc::collectors::gpu::collector gpu_collector(device_mgr);
 *   pmc::collectors::nic::collector nic_collector(device_mgr);
 *
 *   std::vector<collector_slice> slices;
 *   slices.emplace_back(gpu_collector);  // Creates slice to gpu_collector
 *   slices.emplace_back(nic_collector);  // Creates slice to nic_collector
 *
 *   auto timestamp = get_clock_now();
 *   for (auto& slice : slices) {
 *       slice.setup();            // Calls appropriate collector's setup()
 *       slice.sample(timestamp);  // Calls appropriate collector's sample()
 *   }
 * @endcode
 */
class collector_slice
{
public:
    /**
     * @brief Construct a collector_slice from any collector type.
     *
     * The constructor is disabled when T is collector_slice itself. Without
     * that constraint this template is a better match than the implicit copy
     * constructor for a non-const lvalue slice, so `collector_slice b(a)` (or
     * `emplace_back(a)`) would wrap the slice `a` instead of copying it and
     * `b` would dangle as soon as `a` is destroyed or re-assigned.
     * Const collectors are rejected as well because every forwarded method is
     * invoked through a non-const pointer.
     *
     * @tparam T Collector type (must have setup, config, sample, post_process,
     * shutdown methods)
     * @param obj Reference to the collector object (must outlive the slice)
     */
    template <typename T,
              typename = std::enable_if_t<
                  !std::is_const_v<T> &&
                  !std::is_same_v<std::remove_cv_t<T>, collector_slice>>>
    explicit collector_slice(T& obj)
    : m_object{ &obj }
    , m_setup_impl{ [](void* ptr) { static_cast<T*>(ptr)->setup(); } }
    , m_config_impl{ [](void* ptr) { static_cast<T*>(ptr)->config(); } }
    , m_sample_impl{ [](void* ptr, int64_t timestamp) {
        static_cast<T*>(ptr)->sample(timestamp);
    } }
    , m_post_process_impl{ [](void* ptr) { static_cast<T*>(ptr)->post_process(); } }
    , m_shutdown_impl{ [](void* ptr) { static_cast<T*>(ptr)->shutdown(); } }
    {}

    /**
     * @brief Setup the collector.
     *
     * Calls the underlying collector's setup() method.
     */
    void setup() { m_setup_impl(m_object); }

    /**
     * @brief Configure the collector.
     *
     * Calls the underlying collector's config() method.
     */
    void config() { m_config_impl(m_object); }

    /**
     * @brief Sample metrics from the collector.
     *
     * @param timestamp Current timestamp in nanoseconds.
     * Calls the underlying collector's sample() method.
     */
    void sample(int64_t timestamp) { m_sample_impl(m_object, timestamp); }

    /**
     * @brief Post-process collected metrics.
     *
     * Calls the underlying collector's post_process() method.
     */
    void post_process() { m_post_process_impl(m_object); }

    /**
     * @brief Shutdown the collector.
     *
     * Calls the underlying collector's shutdown() method.
     */
    void shutdown() { m_shutdown_impl(m_object); }

private:
    using setup_fn_t        = void (*)(void*);
    using config_fn_t       = void (*)(void*);
    using sample_fn_t       = void (*)(void*, int64_t);
    using post_process_fn_t = void (*)(void*);
    using shutdown_fn_t     = void (*)(void*);

    void*             m_object;            /**< Non-owning pointer to collector */
    setup_fn_t        m_setup_impl;        /**< Type-erased setup function */
    config_fn_t       m_config_impl;       /**< Type-erased config function */
    sample_fn_t       m_sample_impl;       /**< Type-erased sample function */
    post_process_fn_t m_post_process_impl; /**< Type-erased post_process function */
    shutdown_fn_t     m_shutdown_impl;     /**< Type-erased shutdown function */
};

}  // namespace rocprofsys::pmc::collectors
+// SPDX-License-Identifier: MIT + +#pragma once + +#include "core/config.hpp" +#include "library/pmc/collectors/gpu/types.hpp" +#include "library/pmc/collectors/nic/types.hpp" +#include "logger/debug.hpp" + +#include +#include +#include +#include +#include +#include + +namespace rocprofsys::pmc::collectors +{ + +// Import GPU types into collectors namespace +namespace gpu +{ +using ::rocprofsys::pmc::device_filter; +using ::rocprofsys::pmc::device_selection_mode; +using ::rocprofsys::pmc::collectors::gpu::enabled_metrics; +} // namespace gpu + +// Import NIC types into collectors namespace +namespace nic +{ +using ::rocprofsys::pmc::device_selection_mode; +using ::rocprofsys::pmc::nic_device_filter; +using ::rocprofsys::pmc::collectors::nic::enabled_metrics; +} // namespace nic + +namespace +{ +// Bitfield values for enabling/disabling all metrics at once +// 0x7FFF sets GPU metric bits 0-14 to 1 (all 15 GPU metrics enabled) +// 0x0000 sets all bits to 0 (all disabled) +constexpr uint32_t ENABLE_ALL_METRICS = 0x7FFF; +constexpr uint32_t DISABLE_ALL_METRICS = 0x0000; +} // namespace + +struct settings_policy +{ + static gpu::device_filter get_device_filter() noexcept + { + auto filter = rocprofsys::get_sampling_gpus(); + if(filter == "all" || filter == "on" || filter.empty()) + { + gpu::device_filter result; + result.mode = gpu::device_selection_mode::ALL; + return result; + } + + if(filter == "none" || filter == "off") + { + gpu::device_filter result; + result.mode = gpu::device_selection_mode::NONE; + return result; + } + + auto enabled_devices = parse_numeric_range(filter); + gpu::device_filter result; + result.mode = gpu::device_selection_mode::SPECIFIC; + result.indices = enabled_devices; + return result; + } + + static gpu::enabled_metrics get_enabled_metrics() noexcept + { + static auto _enabled_metrics = []() { + auto setting = get_setting_value("ROCPROFSYS_AMD_SMI_METRICS"); + auto value_str = setting.has_value() ? 
setting.value() : "all"; + auto result = parse_enabled_metrics(value_str); + return result; + }(); + return _enabled_metrics; + } + + static bool get_use_perfetto_legacy_metrics() { return get_use_perfetto(); } + + /** + * @brief Get NIC device filter based on ROCPROFSYS_SAMPLING_AINICS setting. + * + * Parses comma-separated list of NIC device names (e.g., "enp226s0,eth0"). + * Special values: "all" enables all NICs, "none" disables NIC sampling. + */ + static nic::nic_device_filter get_nic_device_filter() noexcept + { + auto filter = get_setting_value("ROCPROFSYS_SAMPLING_AINICS"); + if(!filter.has_value()) + { + // NIC sampling disabled by default + nic::nic_device_filter result; + result.mode = nic::device_selection_mode::NONE; + return result; + } + + auto filter_str = filter.value(); + if(filter_str == "all" || filter_str == "on") + { + nic::nic_device_filter result; + result.mode = nic::device_selection_mode::ALL; + return result; + } + + if(filter_str == "none" || filter_str == "off" || filter_str.empty()) + { + nic::nic_device_filter result; + result.mode = nic::device_selection_mode::NONE; + return result; + } + + // Parse comma-separated names + nic::nic_device_filter result; + result.mode = nic::device_selection_mode::SPECIFIC; + result.names = parse_name_list(filter_str); + return result; + } + + /** + * @brief Get NIC enabled metrics. + * + * For NIC, all 6 RDMA metrics are enabled when NIC sampling is active. 
+ */ + static nic::enabled_metrics get_nic_enabled_metrics() noexcept + { + nic::enabled_metrics result; + result.value = nic::ALL_NIC_METRICS; + return result; + } + +private: + static gpu::enabled_metrics parse_enabled_metrics(const std::string& input) + { + std::string settings_trimmed; + settings_trimmed.reserve(input.size()); + std::for_each(input.begin(), input.end(), [&settings_trimmed](char ch) { + if(ch != '\t' && ch != ' ') + { + settings_trimmed.push_back(static_cast(std::tolower(ch))); + } + }); + + if(settings_trimmed.empty() || settings_trimmed == "all") + { + gpu::enabled_metrics result; + result.value = ENABLE_ALL_METRICS; + return result; + } + + if(settings_trimmed == "none") + { + gpu::enabled_metrics result; + result.value = DISABLE_ALL_METRICS; + return result; + } + + std::regex validator{ + R"(^(?:temp|power|busy|mem_usage|vcn_activity|jpeg_activity|xgmi|pcie|sdma_usage)" + R"()(?:[,;](?:temp|power|busy|mem_usage|vcn_activity|jpeg_activity|xgmi|pcie|sdma_usage))*$)" + }; + + if(!std::regex_match(settings_trimmed, validator)) + { + LOG_INFO("Invalid metrics settings '{}'. 
Enabling all metrics.", input); + gpu::enabled_metrics result; + result.value = ENABLE_ALL_METRICS; + return result; + } + + auto make_metric = [](std::initializer_list bit_positions) { + uint32_t value = 0; + for(auto bit : bit_positions) + { + value |= (1u << bit); + } + gpu::enabled_metrics result; + result.value = value; + return result.value; + }; + + // See enabled_metrics definition in common.hpp for bit position documentation + const std::unordered_map mapper{ + { "temp", make_metric({ 3, 4 }) }, // hotspot, edge + { "power", make_metric({ 0, 1 }) }, // current, average + { "busy", make_metric({ 5, 6, 7 }) }, // gfx, umc, mm + { "mem_usage", make_metric({ 2 }) }, // memory_usage + { "vcn_activity", make_metric({ 8 }) }, // vcn_activity + { "jpeg_activity", make_metric({ 9 }) }, // jpeg_activity + { "xgmi", make_metric({ 12 }) }, // xgmi + { "pcie", make_metric({ 13 }) }, // pcie + { "sdma_usage", make_metric({ 14 }) }, // sdma_usage + }; + + gpu::enabled_metrics metrics; + metrics.value = DISABLE_ALL_METRICS; + std::regex tokenizer{ R"(\w+)" }; + std::sregex_iterator it(settings_trimmed.begin(), settings_trimmed.end(), + tokenizer); + std::sregex_iterator end; + + for(; it != end; ++it) + { + auto found = mapper.find(it->str()); + if(found != mapper.end()) + { + metrics.value |= found->second; + } + } + + return metrics; + } + + static std::set parse_numeric_range(const std::string& input_range) + { + std::set result; + + const std::regex validator{ R"(^\d+(?:-\d+)?(?:[;,]\d+(?:[-:]\d+)?)*$)" }; + + if(!std::regex_match(input_range, validator)) + { + LOG_ERROR("Failed to parse gpu input list: {}", input_range); + return result; + } + + std::regex tokenizer{ R"(\d+(?:[-:]\d+)*)" }; + std::sregex_iterator it(input_range.begin(), input_range.end(), tokenizer); + std::sregex_iterator end; + + for(; it != end; ++it) + { + auto token = it->str(); + auto delimiter_position = std::find_if( + token.begin(), token.end(), [](char c) { return c == ':' || c == '-'; }); 
+ + if(delimiter_position != token.end()) + { + size_t begin = + std::stoul(std::string{ token.begin(), delimiter_position }); + size_t range_end = + std::stoul(std::string{ delimiter_position + 1, token.end() }); + + if(begin > range_end) + { + std::swap(begin, range_end); + } + + for(auto i = begin; i <= range_end; ++i) + { + result.insert(i); + } + } + else + { + result.insert(std::stoul(token)); + } + } + + return result; + } + + /** + * @brief Parse comma or semicolon-separated list of names. + */ + static std::set parse_name_list(const std::string& input) + { + std::set result; + std::stringstream ss(input); + std::string token; + + while(std::getline(ss, token, ',')) + { + // Also handle semicolons + std::stringstream ss2(token); + std::string subtoken; + while(std::getline(ss2, subtoken, ';')) + { + // Trim whitespace + auto start = subtoken.find_first_not_of(" \t"); + auto end = subtoken.find_last_not_of(" \t"); + if(start != std::string::npos && end != std::string::npos) + { + result.insert(subtoken.substr(start, end - start + 1)); + } + } + } + return result; + } +}; + +} // namespace rocprofsys::pmc::collectors diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/common/tests/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/common/tests/CMakeLists.txt new file mode 100644 index 00000000000..c1c2c652d3d --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/common/tests/CMakeLists.txt @@ -0,0 +1,21 @@ +# Copyright (c) Advanced Micro Devices, Inc. 
+# SPDX-License-Identifier: MIT + +set(pmc_common_tests_sources test_collector_slice.cpp) + +add_library(pmc-common-tests OBJECT ${pmc_common_tests_sources}) + +target_link_libraries( + pmc-common-tests + PRIVATE + rocprofiler-systems-googletest-library + rocprofiler-systems-core-library + rocprofiler-systems-interface-library +) + +target_include_directories( + pmc-common-tests + PRIVATE + ${PROJECT_SOURCE_DIR}/source/lib/core + ${PROJECT_SOURCE_DIR}/source/lib/rocprof-sys +) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/common/tests/test_collector_slice.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/common/tests/test_collector_slice.cpp new file mode 100644 index 00000000000..a647b1b69a4 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/common/tests/test_collector_slice.cpp @@ -0,0 +1,185 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#include "library/pmc/collectors/common/collector_slice.hpp" + +#include +#include + +#include +#include + +namespace rocprofsys::pmc::collectors::testing +{ + +// Mock collector implementations for testing type erasure +class mock_gpu_collector +{ +public: + void setup() { setup_called = true; } + void config() { config_called = true; } + void sample(int64_t ts) + { + sample_called = true; + last_timestamp = ts; + } + void post_process() { post_process_called = true; } + void shutdown() { shutdown_called = true; } + + bool setup_called = false; + bool config_called = false; + bool sample_called = false; + int64_t last_timestamp = 0; + bool post_process_called = false; + bool shutdown_called = false; +}; + +class mock_nic_collector +{ +public: + void setup() { setup_called = true; } + void config() { config_called = true; } + void sample(int64_t ts) + { + sample_count++; + last_timestamp = ts; + } + void post_process() { post_process_called = true; } + void shutdown() { 
shutdown_called = true; } + + bool setup_called = false; + bool config_called = false; + int sample_count = 0; + int64_t last_timestamp = 0; + bool post_process_called = false; + bool shutdown_called = false; +}; + +class collector_slice_test : public ::testing::Test +{ +protected: + void SetUp() override + { + gpu_collector = std::make_unique(); + nic_collector = std::make_unique(); + } + + void TearDown() override + { + gpu_collector.reset(); + nic_collector.reset(); + } + + std::unique_ptr gpu_collector; + std::unique_ptr nic_collector; +}; + +TEST_F(collector_slice_test, single_collector_calls_all_methods) +{ + collector_slice slice(*gpu_collector); + + EXPECT_FALSE(gpu_collector->setup_called); + slice.setup(); + EXPECT_TRUE(gpu_collector->setup_called); + + EXPECT_FALSE(gpu_collector->config_called); + slice.config(); + EXPECT_TRUE(gpu_collector->config_called); + + EXPECT_FALSE(gpu_collector->sample_called); + slice.sample(12345); + EXPECT_TRUE(gpu_collector->sample_called); + EXPECT_EQ(gpu_collector->last_timestamp, 12345); + + EXPECT_FALSE(gpu_collector->post_process_called); + slice.post_process(); + EXPECT_TRUE(gpu_collector->post_process_called); + + EXPECT_FALSE(gpu_collector->shutdown_called); + slice.shutdown(); + EXPECT_TRUE(gpu_collector->shutdown_called); +} + +TEST_F(collector_slice_test, heterogeneous_collectors_in_vector) +{ + std::vector slices; + slices.emplace_back(*gpu_collector); + slices.emplace_back(*nic_collector); + + EXPECT_EQ(slices.size(), 2u); + + for(auto& slice : slices) + { + slice.setup(); + } + EXPECT_TRUE(gpu_collector->setup_called); + EXPECT_TRUE(nic_collector->setup_called); + + for(auto& slice : slices) + { + slice.config(); + } + EXPECT_TRUE(gpu_collector->config_called); + EXPECT_TRUE(nic_collector->config_called); + + for(auto& slice : slices) + { + slice.sample(1000); + } + EXPECT_TRUE(gpu_collector->sample_called); + EXPECT_EQ(nic_collector->sample_count, 1); + + for(auto& slice : slices) + { + slice.sample(2000); + 
} + EXPECT_EQ(nic_collector->sample_count, 2); +} + +TEST_F(collector_slice_test, collector_slice_is_non_owning) +{ + collector_slice slice(*gpu_collector); + + // Modify the original collector + gpu_collector->setup_called = true; + + // The slice should reflect the change (non-owning view) + slice.sample(5000); + EXPECT_TRUE(gpu_collector->sample_called); + EXPECT_TRUE(gpu_collector->setup_called); // Still true from manual set +} + +TEST_F(collector_slice_test, multiple_slices_to_same_collector) +{ + collector_slice slice1(*gpu_collector); + collector_slice slice2(*gpu_collector); + + // Both slices reference the same underlying object + slice1.setup(); + EXPECT_TRUE(gpu_collector->setup_called); + + // Calling on slice2 should see the already-called setup + slice2.config(); + EXPECT_TRUE(gpu_collector->setup_called); + EXPECT_TRUE(gpu_collector->config_called); +} + +TEST_F(collector_slice_test, collectors_can_be_different_types) +{ + std::vector slices; + slices.emplace_back(*gpu_collector); + slices.emplace_back(*nic_collector); + + for(auto& slice : slices) + { + slice.sample(100); + slice.sample(200); + slice.sample(300); + } + + // Each collector maintains its own state + EXPECT_TRUE(gpu_collector->sample_called); // bool, so just true + EXPECT_EQ(nic_collector->sample_count, 3); // int counter +} + +} // namespace rocprofsys::pmc::collectors::testing diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/gpu/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/gpu/CMakeLists.txt new file mode 100644 index 00000000000..344ffd4f59d --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/gpu/CMakeLists.txt @@ -0,0 +1,19 @@ +# Copyright (c) Advanced Micro Devices, Inc. 
/**
 * @brief Output policy for writing GPU PMC samples to the trace cache.
 *
 * This policy handles serialization of AMD SMI GPU metric samples into the
 * rocprofiler-systems trace cache for later analysis and visualization.
 * It manages category metadata initialization and per-device PMC metadata
 * registration.
 *
 * All members are static; the struct itself holds no state.
 *
 * @see perfetto_policy for direct Perfetto trace output
 */
struct cache_policy
{
    /**
     * @brief Initialize trace cache category metadata for AMD SMI metrics.
     *
     * Registers one category name string in the trace cache metadata
     * registry. NOTE(review): the original comment says this is called once
     * during initialization - the caller is not visible here, confirm.
     */
    static void initialize_category_metadata()
    {
        trace_cache::get_metadata_registry().add_string(
            trait::name::value);
    }

    /**
     * @brief Register the track entries used by the AMD SMI metrics.
     *
     * Every track is added with no thread id (std::nullopt) and "{}" as the
     * third field. Besides the individually named tracks, one track is added
     * per (XCP, VCN engine) pair, per (XCP, JPEG engine) pair, per VCN engine
     * without an XCP index, and two per XGMI link (read and write).
     */
    static void initialize_tracks_metadata()
    {
        // Tracks registered here are not bound to a thread.
        const auto thread_id = std::nullopt;

        trace_cache::get_metadata_registry().add_track(
            { trace_cache::info::format_track_name(),
              thread_id, "{}" });
        trace_cache::get_metadata_registry().add_track(
            { trace_cache::info::format_track_name(),
              thread_id, "{}" });
        trace_cache::get_metadata_registry().add_track(
            { trace_cache::info::format_track_name(),
              thread_id, "{}" });
        trace_cache::get_metadata_registry().add_track(
            { trace_cache::info::format_track_name(), thread_id,
              "{}" });
        trace_cache::get_metadata_registry().add_track(
            { trace_cache::info::format_track_name(), thread_id,
              "{}" });
        trace_cache::get_metadata_registry().add_track(
            { trace_cache::info::format_track_name(),
              thread_id, "{}" });

        // One track per VCN engine for the given XCP index.
        auto add_vcn_track = [&](std::optional xcp_idx) {
            for(int clk = 0; clk < AMDSMI_MAX_NUM_VCN; ++clk)
            {
                auto name =
                    trace_cache::info::format_track_name(
                        xcp_idx, clk);
                trace_cache::get_metadata_registry().add_track(
                    { name.c_str(), thread_id, "{}" });
            }
        };

        // One track per JPEG engine for the given XCP index.
        auto add_jpeg_track = [&](std::optional xcp_idx) {
            for(size_t clk = 0; clk < ROCPROFSYS_AMDSMI_JPEG_ENGINE_COUNT; ++clk)
            {
                auto name =
                    trace_cache::info::format_track_name(
                        xcp_idx, clk);
                trace_cache::get_metadata_registry().add_track(
                    { name.c_str(), thread_id, "{}" });
            }
        };

        for(int xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp)
        {
            add_vcn_track(xcp);
            add_jpeg_track(xcp);
        }

        trace_cache::get_metadata_registry().add_track(
            { trace_cache::info::format_track_name(),
              thread_id, "{}" });
        trace_cache::get_metadata_registry().add_track(
            { trace_cache::info::format_track_name(),
              thread_id, "{}" });

        // VCN tracks without an XCP index.
        for(int vcn = 0; vcn < AMDSMI_MAX_NUM_VCN; ++vcn)
        {
            auto vcn_name =
                trace_cache::info::format_track_name(
                    std::nullopt, vcn);
            trace_cache::get_metadata_registry().add_track(
                { vcn_name.c_str(), thread_id, "{}" });
        }

        // A read track and a write track per XGMI link.
        for(int link = 0; link < AMDSMI_MAX_NUM_XGMI_LINKS; ++link)
        {
            auto read_name =
                trace_cache::info::format_track_name(
                    std::nullopt, link);
            trace_cache::get_metadata_registry().add_track(
                { read_name.c_str(), thread_id, "{}" });

            auto write_name =
                trace_cache::info::format_track_name(
                    std::nullopt, link);
            trace_cache::get_metadata_registry().add_track(
                { write_name.c_str(), thread_id, "{}" });
        }

        trace_cache::get_metadata_registry().add_track(
            { trace_cache::info::format_track_name(),
              thread_id, "{}" });

        trace_cache::get_metadata_registry().add_track(
            { trace_cache::info::format_track_name(),
              thread_id, "{}" });
        trace_cache::get_metadata_registry().add_track(
            { trace_cache::info::format_track_name(),
              thread_id, "{}" });
        trace_cache::get_metadata_registry().add_track(
            { trace_cache::info::format_track_name<
                  category::amd_smi_pcie_bandwidth_acc>(),
              thread_id, "{}" });
        trace_cache::get_metadata_registry().add_track(
            { trace_cache::info::format_track_name<
                  category::amd_smi_pcie_bandwidth_inst>(),
              thread_id, "{}" });
    }

    /**
     * @brief Initialize per-device PMC metadata for AMD SMI metrics.
     *
     * Registers PMC metadata (name, description, units, etc.) for each metric type
     * that can be collected from the specified GPU device. Every entry uses
     * agent type GPU, target arch "GPU", event code 0, instance id 0 and empty
     * long description / component / block / expression strings.
     *
     * @param gpu_id GPU device identifier for which to register metadata
     */
    static void initialize_pmc_metadata(size_t gpu_id)
    {
        // Metadata field constants for PMC info registration
        constexpr size_t      EVENT_CODE       = 0;
        constexpr size_t      INSTANCE_ID      = 0;
        constexpr const char* LONG_DESCRIPTION = "";
        constexpr const char* COMPONENT        = "";
        constexpr const char* BLOCK            = "";
        constexpr const char* EXPRESSION       = "";
        constexpr const char* CELSIUS_DEGREES  = "\u00B0C";
        constexpr const char* TARGET_ARCH      = "GPU";

        // Activity ("busy") metrics, reported as percentages.
        trace_cache::get_metadata_registry().add_pmc_info(
            { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
              trait::name::value, "GFX Busy",
              trait::name::description, LONG_DESCRIPTION,
              COMPONENT, trace_cache::PERCENTAGE, rocprofsys::trace_cache::ABSOLUTE,
              BLOCK, EXPRESSION, 0, 0, "{}" });

        trace_cache::get_metadata_registry().add_pmc_info(
            { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
              trait::name::value, "UMC Busy",
              trait::name::description, LONG_DESCRIPTION,
              COMPONENT, trace_cache::PERCENTAGE, rocprofsys::trace_cache::ABSOLUTE,
              BLOCK, EXPRESSION, 0, 0, "{}" });

        trace_cache::get_metadata_registry().add_pmc_info(
            { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
              trait::name::value, "MM Busy",
              trait::name::description, LONG_DESCRIPTION,
              COMPONENT, trace_cache::PERCENTAGE, rocprofsys::trace_cache::ABSOLUTE,
              BLOCK, EXPRESSION, 0, 0, "{}" });

        // Temperature, power and memory usage.
        trace_cache::get_metadata_registry().add_pmc_info(
            { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
              trait::name::value, "Temp",
              trait::name::description, LONG_DESCRIPTION,
              COMPONENT, CELSIUS_DEGREES, rocprofsys::trace_cache::ABSOLUTE, BLOCK,
              EXPRESSION, 0, 0 });

        trace_cache::get_metadata_registry().add_pmc_info(
            { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
              trait::name::value, "Pow",
              trait::name::description, LONG_DESCRIPTION,
              COMPONENT, "W", rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0,
              0 });

        trace_cache::get_metadata_registry().add_pmc_info(
            { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
              trait::name::value, "MemUsg",
              trait::name::description, LONG_DESCRIPTION,
              COMPONENT, tim::units::mem_repr(tim::units::megabyte),
              rocprofsys::trace_cache::ABSOLUTE, BLOCK, EXPRESSION, 0, 0 });

        // VCN activity: one entry per engine, named after its track.
        for(int vcn = 0; vcn < AMDSMI_MAX_NUM_VCN; ++vcn)
        {
            auto vcn_name =
                trace_cache::info::format_track_name(vcn);

            trace_cache::get_metadata_registry().add_pmc_info(
                { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
                  vcn_name.c_str(), vcn_name.c_str(),
                  "VCN (Video Decode) Engine Activity", LONG_DESCRIPTION, COMPONENT,
                  trace_cache::PERCENTAGE, rocprofsys::trace_cache::ABSOLUTE, BLOCK,
                  EXPRESSION, 0, 0 });
        }

        // VCN activity again, one entry per (XCP, engine) pair.
        for(int xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp)
        {
            for(int vcn = 0; vcn < AMDSMI_MAX_NUM_VCN; ++vcn)
            {
                auto vcn_name =
                    trace_cache::info::format_track_name(
                        xcp, vcn);

                trace_cache::get_metadata_registry().add_pmc_info(
                    { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
                      vcn_name.c_str(), vcn_name.c_str(),
                      "VCN (Video Decode) Engine Activity", LONG_DESCRIPTION, COMPONENT,
                      trace_cache::PERCENTAGE, rocprofsys::trace_cache::ABSOLUTE, BLOCK,
                      EXPRESSION, 0, 0 });
            }
        }

        // JPEG activity, one entry per (XCP, engine) pair.
        for(size_t xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp)
        {
            for(size_t jpeg = 0; jpeg < ROCPROFSYS_AMDSMI_JPEG_ENGINE_COUNT; ++jpeg)
            {
                auto jpeg_name =
                    trace_cache::info::format_track_name(
                        xcp, jpeg);
                trace_cache::get_metadata_registry().add_pmc_info(
                    { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
                      jpeg_name.c_str(), jpeg_name.c_str(),
                      "JPEG (Image Decode) Engine Activity", LONG_DESCRIPTION, COMPONENT,
                      trace_cache::PERCENTAGE, rocprofsys::trace_cache::ABSOLUTE, BLOCK,
                      EXPRESSION, 0, 0 });
            }
        }

        trace_cache::get_metadata_registry().add_pmc_info(
            { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
              trait::name::value, "SDMA Usage",
              trait::name::description, LONG_DESCRIPTION,
              COMPONENT, trace_cache::PERCENTAGE, rocprofsys::trace_cache::ABSOLUTE,
              BLOCK, EXPRESSION, 0, 0, "{}" });

        // XGMI link metrics.
        trace_cache::get_metadata_registry().add_pmc_info(
            { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
              trait::name::value, "XGMI Width",
              trait::name::description,
              LONG_DESCRIPTION, COMPONENT, "lanes", rocprofsys::trace_cache::ABSOLUTE,
              BLOCK, EXPRESSION, 0, 0 });

        trace_cache::get_metadata_registry().add_pmc_info(
            { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
              trait::name::value, "XGMI Speed",
              trait::name::description,
              LONG_DESCRIPTION, COMPONENT, "Mbps", rocprofsys::trace_cache::ABSOLUTE,
              BLOCK, EXPRESSION, 0, 0 });

        trace_cache::get_metadata_registry().add_pmc_info(
            { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
              trait::name::value, "XGMI Read",
              trait::name::description,
              LONG_DESCRIPTION, COMPONENT, "KB", rocprofsys::trace_cache::ABSOLUTE, BLOCK,
              EXPRESSION, 0, 0 });

        trace_cache::get_metadata_registry().add_pmc_info(
            { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
              trait::name::value, "XGMI Write",
              trait::name::description,
              LONG_DESCRIPTION, COMPONENT, "KB", rocprofsys::trace_cache::ABSOLUTE, BLOCK,
              EXPRESSION, 0, 0 });

        // PCIe link metrics.
        trace_cache::get_metadata_registry().add_pmc_info(
            { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
              trait::name::value, "PCIe Width",
              trait::name::description,
              LONG_DESCRIPTION, COMPONENT, "lanes", rocprofsys::trace_cache::ABSOLUTE,
              BLOCK, EXPRESSION, 0, 0 });

        trace_cache::get_metadata_registry().add_pmc_info(
            { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
              trait::name::value, "PCIe Speed",
              trait::name::description,
              LONG_DESCRIPTION, COMPONENT, "MT/s", rocprofsys::trace_cache::ABSOLUTE,
              BLOCK, EXPRESSION, 0, 0 });

        trace_cache::get_metadata_registry().add_pmc_info(
            { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
              trait::name::value, "PCIe BW Acc",
              trait::name::description,
              LONG_DESCRIPTION, COMPONENT, "bytes", rocprofsys::trace_cache::ABSOLUTE,
              BLOCK, EXPRESSION, 0, 0 });

        trace_cache::get_metadata_registry().add_pmc_info(
            { agent_type::GPU, gpu_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID,
              trait::name::value, "PCIe BW Inst",
              trait::name::description,
              LONG_DESCRIPTION, COMPONENT, "bytes/s", rocprofsys::trace_cache::ABSOLUTE,
              BLOCK, EXPRESSION, 0, 0 });
    }

    /**
     * @brief Store a PMC sample to the trace cache.
     *
     * Only metrics that are both requested and supported are flagged in the
     * stored sample (bitwise AND of the two masks).
     *
     * @param device_id GPU device identifier
     * @param device_name Device name (unused for GPU, kept for API consistency)
     * @param enabled_metrics_cfg Metrics requested by user configuration
     * @param supported_metrics Metrics supported by this device
     * @param metric_values Collected metric values
     * @param timestamp Sample timestamp in nanoseconds
     */
    static void store_sample(size_t device_id, const std::string& /*device_name*/,
                             const enabled_metrics& enabled_metrics_cfg,
                             const enabled_metrics& supported_metrics,
                             const metrics& metric_values, uint64_t timestamp)
    {
        enabled_metrics _enabled_metrics;
        _enabled_metrics.value = enabled_metrics_cfg.value & supported_metrics.value;

        trace_cache::get_buffer_storage().store(trace_cache::gpu_pmc_sample{
            _enabled_metrics, static_cast(device_id), timestamp,
            metric_values });
    }
};
// SPDX-License-Identifier: MIT

#pragma once

#include "library/pmc/collectors/base/collector.hpp"
#include "library/pmc/collectors/gpu/gpu_traits.hpp"

namespace rocprofsys::pmc::collectors::gpu
{

/**
 * @brief GPU metrics collector for performance monitoring.
 *
 * This is an alias, not a new class: it names the base::collector template
 * instantiated for GPU devices using AMD SMI. All GPU-specific behavior is
 * defined in gpu_traits.
 *
 * SDMA delta computation is handled internally by the device class to maintain
 * state across samples while keeping traits stateless.
 *
 * @tparam DeviceProvider Type providing GPU device enumeration and management
 * @tparam Config Configuration policy providing settings and output policies
 */
template
using collector = base::collector, DeviceProvider, Config>;

}  // namespace rocprofsys::pmc::collectors::gpu
+// SPDX-License-Identifier: MIT + +#pragma once + +#include "core/amd_smi.hpp" +#include "library/pmc/collectors/gpu/types.hpp" +#include "logger/debug.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace rocprofsys::pmc::collectors::gpu +{ + +template +class device +{ +public: + device(std::shared_ptr driver, amdsmi_processor_handle handle, + processor_type_t /*processor_type*/, size_t logical_index) + : m_driver_api{ std::move(driver) } + , m_device_handle{ handle } + , m_index{ logical_index } + { + initialize_device_info(); + m_is_supported = initialize_supported_metrics(); + } + + [[nodiscard]] bool is_supported() const noexcept { return m_is_supported; } + + [[nodiscard]] enabled_metrics get_supported_metrics() const noexcept + { + return m_supported_metrics; + } + + [[nodiscard]] size_t get_index() const noexcept { return m_index; } + + [[nodiscard]] const std::string& get_name() const noexcept { return m_device_name; } + + [[nodiscard]] const std::string& get_product_name() const noexcept + { + return m_product_name; + } + + [[nodiscard]] const std::string& get_vendor_name() const noexcept + { + return m_vendor_name; + } + + [[nodiscard]] metrics get_gpu_metrics( + [[maybe_unused]] const enabled_metrics& enabled_cfg, + [[maybe_unused]] uint64_t timestamp) + { + metrics metrics{}; + + amdsmi_gpu_metrics_t amd_smi_metrics{}; + if(m_driver_api->get_metrics_info(m_device_handle, &amd_smi_metrics) != + AMDSMI_STATUS_SUCCESS) + { + return metrics; + } + + collect_power_metrics(amd_smi_metrics, metrics); + collect_temperature_metrics(amd_smi_metrics, metrics); + collect_activity_metrics(amd_smi_metrics, metrics); + collect_memory_metrics(metrics); + collect_xcp_metrics(amd_smi_metrics, metrics); + collect_xgmi_metrics(amd_smi_metrics, metrics); + collect_pcie_metrics(amd_smi_metrics, metrics); + +#if defined(AMD_SMI_SDMA_SUPPORTED) && AMD_SMI_SDMA_SUPPORTED == 1 + if(enabled_cfg.bits.sdma_usage && 
m_supported_metrics.bits.sdma_usage) + { + uint64_t current_cumulative = get_raw_sdma_usage(); + + if(m_sdma_state.has_prev && timestamp > m_sdma_state.prev_timestamp) + { + uint64_t delta_usage = current_cumulative - m_sdma_state.prev_cumulative; + uint64_t delta_time = timestamp - m_sdma_state.prev_timestamp; + uint32_t pct = + static_cast((delta_usage * 100000ULL) / delta_time); + metrics.sdma_usage = (pct > 100) ? 100 : pct; + } + + m_sdma_state.prev_cumulative = current_cumulative; + m_sdma_state.prev_timestamp = timestamp; + m_sdma_state.has_prev = true; + } +#endif + + return metrics; + } + + /** + * @brief Get raw cumulative SDMA usage from all processes on this GPU. + * + * Queries the process list and sums sdma_usage (in microseconds) across + * all processes. Returns 0 if the query fails or SDMA is not supported. + * The caller (collector) is responsible for delta computation. + * + * @return Cumulative SDMA usage in microseconds. + */ +#if defined(AMD_SMI_SDMA_SUPPORTED) && AMD_SMI_SDMA_SUPPORTED == 1 + [[nodiscard]] uint64_t get_raw_sdma_usage() const + { + uint32_t num_processes = 0; + auto status = + m_driver_api->get_gpu_process_list(m_device_handle, &num_processes, nullptr); + if(status != AMDSMI_STATUS_SUCCESS || num_processes == 0) + { + return 0; + } + + std::vector proc_list(num_processes); + status = m_driver_api->get_gpu_process_list(m_device_handle, &num_processes, + proc_list.data()); + if(status != AMDSMI_STATUS_SUCCESS) + { + return 0; + } + + uint64_t cumulative = 0; + for(const auto& proc : proc_list) + { + cumulative += proc.sdma_usage; + } + return cumulative; + } +#endif + +private: + /** + * @brief Initialize device info (name, product_name, vendor_name). + * + * Queries GPU ASIC information from AMD SMI to populate device identification. 
+ */ + void initialize_device_info() + { + // Generate a simple device name based on index + m_device_name = "GPU" + std::to_string(m_index); + + // Get ASIC info for vendor and product names + amdsmi_asic_info_t asic_info{}; + if(m_driver_api->get_gpu_asic_info(m_device_handle, &asic_info) == + AMDSMI_STATUS_SUCCESS) + { + m_product_name = asic_info.market_name; + m_vendor_name = asic_info.vendor_name; + } + else + { + m_product_name = "Unknown GPU"; + m_vendor_name = "AMD"; + } + } + + void collect_power_metrics(const amdsmi_gpu_metrics_t& gpu_metrics, + metrics& out) const + { + if(m_supported_metrics.bits.current_socket_power) + { + out.current_socket_power = gpu_metrics.current_socket_power; + } + if(m_supported_metrics.bits.average_socket_power) + { + out.average_socket_power = gpu_metrics.average_socket_power; + } + } + + void collect_temperature_metrics(const amdsmi_gpu_metrics_t& gpu_metrics, + metrics& out) const + { + if(m_supported_metrics.bits.hotspot_temperature) + { + out.hotspot_temperature = gpu_metrics.temperature_hotspot; + } + if(m_supported_metrics.bits.edge_temperature) + { + out.edge_temperature = gpu_metrics.temperature_edge; + } + } + + void collect_activity_metrics(const amdsmi_gpu_metrics_t& gpu_metrics, + metrics& out) const + { + if(m_supported_metrics.bits.gfx_activity) + { + out.gfx_activity = gpu_metrics.average_gfx_activity; + } + if(m_supported_metrics.bits.umc_activity) + { + out.umc_activity = gpu_metrics.average_umc_activity; + } + if(m_supported_metrics.bits.mm_activity) + { + out.mm_activity = gpu_metrics.average_mm_activity; + } + } + + void collect_memory_metrics(metrics& out) const + { + if(!m_supported_metrics.bits.memory_usage) + { + return; + } + + uint64_t mem_usage = 0; + if(m_driver_api->get_memory_usage(m_device_handle, AMDSMI_MEM_TYPE_VRAM, + &mem_usage) == AMDSMI_STATUS_SUCCESS) + { + out.memory_usage = mem_usage; + } + } + + void collect_xcp_metrics(const amdsmi_gpu_metrics_t& gpu_metrics, metrics& out) const + { 
+ // Per-XCP VCN busy metrics (MI300) + if(m_supported_metrics.bits.vcn_busy) + { + for(size_t xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp) + { + std::copy(std::begin(gpu_metrics.xcp_stats[xcp].vcn_busy), + std::end(gpu_metrics.xcp_stats[xcp].vcn_busy), + out.xcp_stats[xcp].vcn_busy.begin()); + } + } + + // Device-level VCN activity (Radeon) + if(m_supported_metrics.bits.vcn_activity) + { + std::copy(std::begin(gpu_metrics.vcn_activity), + std::end(gpu_metrics.vcn_activity), out.vcn_activity.begin()); + } + + // Per-XCP JPEG busy metrics (MI300) + if(m_supported_metrics.bits.jpeg_busy) + { + for(size_t xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp) + { + std::copy(std::begin(gpu_metrics.xcp_stats[xcp].jpeg_busy), + std::end(gpu_metrics.xcp_stats[xcp].jpeg_busy), + out.xcp_stats[xcp].jpeg_busy.begin()); + } + } + + // Device-level JPEG activity (Radeon) + if(m_supported_metrics.bits.jpeg_activity) + { + std::copy(std::begin(gpu_metrics.jpeg_activity), + std::end(gpu_metrics.jpeg_activity), out.jpeg_activity.begin()); + } + } + + void collect_xgmi_metrics(const amdsmi_gpu_metrics_t& gpu_metrics, metrics& out) const + { + if(!m_supported_metrics.bits.xgmi) + { + return; + } + + populate_if_supported(out.xgmi.link.width, gpu_metrics.xgmi_link_width); + populate_if_supported(out.xgmi.link.speed, gpu_metrics.xgmi_link_speed); + + for(size_t i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) + { + populate_if_supported(out.xgmi.data_acc.read[i], + gpu_metrics.xgmi_read_data_acc[i]); + populate_if_supported(out.xgmi.data_acc.write[i], + gpu_metrics.xgmi_write_data_acc[i]); + } + } + + void collect_pcie_metrics(const amdsmi_gpu_metrics_t& gpu_metrics, metrics& out) const + { + if(!m_supported_metrics.bits.pcie) + { + return; + } + + populate_if_supported(out.pcie.link.width, gpu_metrics.pcie_link_width); + populate_if_supported(out.pcie.link.speed, gpu_metrics.pcie_link_speed); + populate_if_supported(out.pcie.bandwidth.acc, gpu_metrics.pcie_bandwidth_acc); + 
populate_if_supported(out.pcie.bandwidth.inst, gpu_metrics.pcie_bandwidth_inst); + } + + bool initialize_supported_metrics() + { + uint64_t mem_usage = 0; + m_supported_metrics.bits.memory_usage = + m_driver_api->get_memory_usage(m_device_handle, AMDSMI_MEM_TYPE_VRAM, + &mem_usage) == AMDSMI_STATUS_SUCCESS && + is_metric_supported(mem_usage); + + amdsmi_gpu_metrics_t gpu_metrics{}; + if(m_driver_api->get_metrics_info(m_device_handle, &gpu_metrics) != + AMDSMI_STATUS_SUCCESS) + { + return m_supported_metrics.value != 0; + } + + m_supported_metrics.bits.current_socket_power = + is_metric_supported(gpu_metrics.current_socket_power); + m_supported_metrics.bits.average_socket_power = + is_metric_supported(gpu_metrics.average_socket_power); + + m_supported_metrics.bits.hotspot_temperature = + is_metric_supported(gpu_metrics.temperature_hotspot); + m_supported_metrics.bits.edge_temperature = + is_metric_supported(gpu_metrics.temperature_edge); + + m_supported_metrics.bits.gfx_activity = + is_metric_supported(gpu_metrics.average_gfx_activity); + m_supported_metrics.bits.umc_activity = + is_metric_supported(gpu_metrics.average_umc_activity); + m_supported_metrics.bits.mm_activity = + is_metric_supported(gpu_metrics.average_mm_activity); + + // Check per-XCP VCN/JPEG busy metrics (MI300) + m_supported_metrics.bits.vcn_busy = std::any_of( + std::begin(gpu_metrics.xcp_stats), std::end(gpu_metrics.xcp_stats), + [](const amdsmi_gpu_xcp_metrics_t& xcp_stats) { + return std::any_of(std::begin(xcp_stats.vcn_busy), + std::end(xcp_stats.vcn_busy), + [](uint16_t v) { return is_metric_supported(v); }); + }); + + m_supported_metrics.bits.jpeg_busy = std::any_of( + std::begin(gpu_metrics.xcp_stats), std::end(gpu_metrics.xcp_stats), + [](const amdsmi_gpu_xcp_metrics_t& xcp_stats) { + return std::any_of(std::begin(xcp_stats.jpeg_busy), + std::end(xcp_stats.jpeg_busy), + [](uint16_t v) { return is_metric_supported(v); }); + }); + + // Check device-level VCN/JPEG activity metrics (Radeon) + 
// Only enable device-level if per-XCP is not available (priority to per-XCP) + m_supported_metrics.bits.vcn_activity = + !m_supported_metrics.bits.vcn_busy && + std::any_of(std::begin(gpu_metrics.vcn_activity), + std::end(gpu_metrics.vcn_activity), + [](uint16_t v) { return is_metric_supported(v); }); + + m_supported_metrics.bits.jpeg_activity = + !m_supported_metrics.bits.jpeg_busy && + std::any_of(std::begin(gpu_metrics.jpeg_activity), + std::end(gpu_metrics.jpeg_activity), + [](uint16_t v) { return is_metric_supported(v); }); + + m_supported_metrics.bits.xgmi = + is_metric_supported(gpu_metrics.xgmi_link_width) || + is_metric_supported(gpu_metrics.xgmi_link_speed) || + std::any_of(std::begin(gpu_metrics.xgmi_read_data_acc), + std::end(gpu_metrics.xgmi_read_data_acc), + [](uint64_t v) { return is_metric_supported(v); }); + + m_supported_metrics.bits.pcie = + is_metric_supported(gpu_metrics.pcie_link_width) || + is_metric_supported(gpu_metrics.pcie_link_speed) || + is_metric_supported(gpu_metrics.pcie_bandwidth_acc) || + is_metric_supported(gpu_metrics.pcie_bandwidth_inst); + +#if defined(AMD_SMI_SDMA_SUPPORTED) && AMD_SMI_SDMA_SUPPORTED == 1 + { + uint32_t num_processes = 0; + m_supported_metrics.bits.sdma_usage = + m_driver_api->get_gpu_process_list(m_device_handle, &num_processes, + nullptr) == AMDSMI_STATUS_SUCCESS; + } +#endif + + LOG_DEBUG("Device [{}] supported metrics: {}", m_index, + format_supported_metrics(m_supported_metrics)); + + return m_supported_metrics.value != 0; + } + + static std::string format_supported_metrics(const enabled_metrics& metrics) + { + const auto bool_string = [](bool value) { return value ? 
"true" : "false"; }; + + return fmt::format( + "Current power: {}, Average power: {}, Memory usage: {}, Hotspot temp: {}, " + "Edge temp: {}, GFX activity: {}, UMC activity: {}, MM activity: {}, " + "VCN activity: {}, JPEG activity: {}, XGMI: {}, PCIe: {}, SDMA: {}", + bool_string(metrics.bits.current_socket_power), + bool_string(metrics.bits.average_socket_power), + bool_string(metrics.bits.memory_usage), + bool_string(metrics.bits.hotspot_temperature), + bool_string(metrics.bits.edge_temperature), + bool_string(metrics.bits.gfx_activity), + bool_string(metrics.bits.umc_activity), bool_string(metrics.bits.mm_activity), + bool_string(metrics.bits.vcn_activity), + bool_string(metrics.bits.jpeg_activity), bool_string(metrics.bits.xgmi), + bool_string(metrics.bits.pcie), bool_string(metrics.bits.sdma_usage)); + } + + template + static bool is_metric_supported(T value, + T invalid_sentinel = std::numeric_limits::max()) + { + return value != invalid_sentinel; + } + + template + static bool populate_if_supported(T& dest, T src, + T invalid_sentinel = std::numeric_limits::max()) + { + const bool valid = is_metric_supported(src, invalid_sentinel); + dest = valid ? 
src : T{ 0 }; + return valid; + } + + struct sdma_state + { + uint64_t prev_cumulative = 0; + uint64_t prev_timestamp = 0; + bool has_prev = false; + }; + + std::shared_ptr m_driver_api; + amdsmi_processor_handle m_device_handle; + enabled_metrics m_supported_metrics; + size_t m_index; + std::string m_device_name; + std::string m_product_name; + std::string m_vendor_name; + bool m_is_supported = false; + sdma_state m_sdma_state; +}; + +} // namespace rocprofsys::pmc::collectors::gpu diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/gpu/gpu_traits.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/gpu/gpu_traits.hpp new file mode 100644 index 00000000000..c00a7fdf8e4 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/gpu/gpu_traits.hpp @@ -0,0 +1,200 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "library/pmc/collectors/gpu/device.hpp" +#include "library/pmc/collectors/gpu/types.hpp" +#include "library/pmc/common/types.hpp" +#include "logger/debug.hpp" + +#include +#include +#include +#include + +namespace rocprofsys::pmc::collectors::gpu +{ + +using ::rocprofsys::pmc::device_filter; +using ::rocprofsys::pmc::device_selection_mode; +using ::rocprofsys::pmc::device_type; + +/** + * @brief Traits type for GPU collector configuration. + * + * Defines types, constants, and customization points for the base collector template + * to work with GPU devices via AMD SMI. 
+ * + * @tparam Driver The AMD SMI driver type (real or mock for testing) + */ +template +struct gpu_traits +{ + // Required type aliases for base::collector + using metrics_t = pmc::collectors::gpu::metrics; + using enabled_metrics_t = pmc::collectors::gpu::enabled_metrics; + using device_t = device; + using device_ptr_t = std::shared_ptr; + using container_t = std::vector; + using driver_t = typename DriverProvider::driver_t; + + // Required constants + static constexpr const char* device_name = "GPU"; + // Settings customization points + + /** + * @brief Get the device filter from settings. + */ + template + [[nodiscard]] static device_filter get_device_filter() + { + return Settings::get_device_filter(); + } + + /** + * @brief Get enabled metrics from settings. + */ + template + [[nodiscard]] static enabled_metrics_t get_enabled_metrics() + { + return Settings::get_enabled_metrics(); + } + + // Cache API customization points + + /** + * @brief Initialize PMC metadata for a specific device. + */ + template + static void init_pmc_metadata(const device_ptr_t& device) + { + Cache::initialize_pmc_metadata(device->get_index()); + } + + /** + * @brief Initialize Perfetto storage for devices. + */ + template + static void init_perfetto_storage(const DeviceVector& devices) + { + Perfetto::init_storage(devices); + } + + /** + * @brief Setup Perfetto counter tracks for a device. + */ + template + static void setup_counter_tracks(const device_ptr_t& device, + const enabled_metrics_t& enabled) + { + Perfetto::setup_counter_tracks(device->get_index(), enabled); + } + + /** + * @brief Post-process Perfetto data. + */ + template + static void post_process_perfetto(const DeviceEntries& /*device_entries*/, + const enabled_metrics_t& enabled) + { + Perfetto::post_process(enabled); + } + + /** + * @brief Get metrics from a device. 
+ */ + [[nodiscard]] static metrics_t get_metrics(const device_ptr_t& device, + const enabled_metrics_t& enabled, + uint64_t timestamp) + { + return device->get_gpu_metrics(enabled, timestamp); + } + + // Device enumeration + + /** + * @brief Entry holding a device and its cached supported metrics. + * + * This type is returned by enumerate_devices for the base collector to store. + */ + struct device_entry + { + device_ptr_t device; + enabled_metrics_t supported_metrics; + }; + + /** + * @brief Enumerate GPU devices using AMD SMI socket/processor iteration. + * + * This function implements GPU-specific enumeration: + * - Gets device filter from settings + * - Iterates through sockets and processors + * - Filters by processor type (AMD GPU) + * - Applies device filter (ALL, NONE, SPECIFIC indices) + * - Creates device objects and queries supported metrics + * + * @tparam Settings Settings API type for device filter configuration + * @tparam Provider Device provider type + * @param provider Shared pointer to the device provider + * @return Vector of device entries with cached supported metrics + */ + template + [[nodiscard]] static std::vector enumerate_devices( + std::shared_ptr provider) + { + std::vector entries; + auto filter = get_device_filter(); + + if(filter.mode == device_selection_mode::NONE) + { + LOG_DEBUG("{} sampling disabled via configuration", device_name); + return entries; + } + + auto devices = provider->template get_devices(device_type::GPU); + + for(auto& device : devices) + { + auto index = device->get_index(); + + bool should_include = (filter.mode == device_selection_mode::ALL) || + (filter.mode == device_selection_mode::SPECIFIC && + filter.indices.count(index) > 0); + + if(should_include && device->is_supported()) + { + auto supported = device->get_supported_metrics(); + entries.push_back(device_entry{ std::move(device), supported }); + } + } + + warn_invalid_indices(filter, devices.size()); + return entries; + } + + /** + * @brief Warn 
about invalid device indices specified by the user. + * + * @param filter Device filter with requested indices + * @param max_index Maximum valid device index + 1 + */ + static void warn_invalid_indices(const device_filter& filter, size_t max_index) + { + if(filter.mode != device_selection_mode::SPECIFIC) + { + return; + } + for(auto requested_index : filter.indices) + { + if(requested_index >= max_index) + { + LOG_WARNING("Requested GPU device index {} does not exist. " + "Available devices: 0-{}", + requested_index, max_index > 0 ? max_index - 1 : 0); + } + } + } +}; + +} // namespace rocprofsys::pmc::collectors::gpu diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/gpu/perfetto_policy.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/gpu/perfetto_policy.hpp new file mode 100644 index 00000000000..41d682081d8 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/gpu/perfetto_policy.hpp @@ -0,0 +1,641 @@ +// Copyright (c) Advanced Micro Devices, Inc. 
+// SPDX-License-Identifier: MIT + +#pragma once + +#include "core/perfetto.hpp" +#include "library/pmc/collectors/gpu/types.hpp" +#include "library/thread_info.hpp" + +#include +#include +#include +#include + +#include + +namespace rocprofsys::pmc::collectors::gpu +{ + +namespace detail +{ + +struct track_description +{ + const char* track_name; + const char* units; + std::vector track_indexes; +}; + +// Helper function to create enabled_metrics value from bit positions +// See enabled_metrics definition in pmc/collectors/gpu/types.hpp for bit position +// documentation +inline constexpr uint32_t +make_metric_value(std::initializer_list bit_positions) +{ + uint32_t value = 0; + for(auto bit : bit_positions) + { + value |= (1u << bit); + } + return value; +} + +const auto GFX_BUSY_VALUE = make_metric_value({ 5 }); // gfx_activity +const auto UMC_BUSY_VALUE = make_metric_value({ 6 }); // umc_activity +const auto MM_BUSY_VALUE = make_metric_value({ 7 }); // mm_activity +const auto TEMPERATURE_VALUE = make_metric_value({ 3, 4 }); // hotspot, edge +const auto CURRENT_POWER_VALUE = make_metric_value({ 0, 1 }); // current, average +const auto MEMORY_USAGE_VALUE = make_metric_value({ 2 }); // memory_usage +const auto VCN_ACTIVITY_VALUE = make_metric_value({ 8 }); // vcn_activity +const auto JPEG_ACTIVITY_VALUE = make_metric_value({ 9 }); // jpeg_activity +const auto VCN_BUSY_VALUE = make_metric_value({ 10 }); // vcn_busy (MI300) +const auto JPEG_BUSY_VALUE = make_metric_value({ 11 }); // jpeg_busy (MI300) +const auto XGMI_VALUE = make_metric_value({ 12 }); // xgmi +const auto PCIE_VALUE = make_metric_value({ 13 }); // pcie +const auto SDMA_USAGE_VALUE = make_metric_value({ 14 }); // sdma_usage + +inline std::unordered_map +make_default_tracks() +{ + return { + { GFX_BUSY_VALUE, { "GFX Busy", "%", {} } }, + { UMC_BUSY_VALUE, { "UMC Busy", "%", {} } }, + { MM_BUSY_VALUE, { "MM Busy", "%", {} } }, + { TEMPERATURE_VALUE, { "Temperature", "deg C", {} } }, + { 
CURRENT_POWER_VALUE, { "Current Power", "watts", {} } }, + { MEMORY_USAGE_VALUE, { "Memory Usage", "megabytes", {} } }, + { VCN_ACTIVITY_VALUE, { "VCN Activity", "%", {} } }, + { JPEG_ACTIVITY_VALUE, { "JPEG Activity", "%", {} } }, + { VCN_BUSY_VALUE, { "VCN Busy", "%", {} } }, + { JPEG_BUSY_VALUE, { "JPEG Busy", "%", {} } }, + { XGMI_VALUE, { "XGMI", "", {} } }, + { PCIE_VALUE, { "PCIe", "", {} } }, + { SDMA_USAGE_VALUE, { "SDMA Usage", "%", {} } }, + }; +} + +struct xgmi_track_set +{ + std::vector link_width; + std::vector link_speed; + std::vector read_data; + std::vector write_data; +}; + +struct pcie_track_set +{ + std::vector link_width; + std::vector link_speed; + std::vector bandwidth_acc; + std::vector bandwidth_inst; +}; + +struct perfetto_amd_smi_sample +{ + uint64_t timestamp; + pmc::collectors::gpu::metrics metrics; +}; + +struct perfetto_device_data +{ + std::unique_ptr> samples; + enabled_metrics supported_metrics; + std::unordered_map tracks; + xgmi_track_set xgmi_tracks; + pcie_track_set pcie_tracks; +}; + +inline std::map& +get_perfetto_data() +{ + static std::map data; + return data; +} + +} // namespace detail + +/** + * @brief Output policy for writing PMC samples directly to Perfetto traces. + * + * This policy handles real-time serialization of AMD SMI metric samples into + * Perfetto trace format, creating counter tracks for each metric type. + * Supports both device-level metrics (Radeon) and per-XCP metrics (MI300 series). + * + * @see cache_policy for writing to trace cache instead + */ +struct perfetto_policy +{ + using counter_track = perfetto_counter_track; + + /** + * @brief Initialize Perfetto storage for the given device entries. + * + * Allocates storage buffers for Perfetto samples for each GPU device + * and caches supported metrics for post-processing. 
+ * + * @tparam DeviceEntryVector Container type holding device entries (device + + * supported_metrics) + * @param device_entries Vector of device entries to initialize storage for + */ + template + static void init_storage(const DeviceEntryVector& device_entries) + { + for(const auto& entry : device_entries) + { + auto idx = entry.device->get_index(); + detail::get_perfetto_data()[idx] = { + std::make_unique>(), + entry.supported_metrics, + detail::make_default_tracks(), + {}, + {} + }; + } + } + + /** + * @brief Set up Perfetto counter tracks for the specified device metrics. + * + * Creates named counter tracks in the Perfetto trace for each enabled metric, + * handling both simple metrics and array metrics (VCN, JPEG, XGMI, PCIe). + * + * @param device_index GPU device index + * @param enabled_metric_config Bitfield of metrics to create tracks for + */ + static void setup_counter_tracks(size_t device_index, + const enabled_metrics& enabled_metric_config) + { + auto addendum = [&](const char* name) { + return fmt::format("GPU {} [{}] (S)", name, device_index); + }; + + auto addendum_blk = [&](std::size_t i, const char* metric, + std::size_t xcp_idx = SIZE_MAX) { + if(xcp_idx != SIZE_MAX) + { + return fmt::format("GPU [{}] {} XCP_{}: [{:02d}] (S)", device_index, + metric, xcp_idx, i); + } + return fmt::format("GPU [{}] {} [{:02d}] (S)", device_index, metric, i); + }; + + auto& device_data = detail::get_perfetto_data()[device_index]; + auto& tracks = device_data.tracks; + + // Clear track indexes from previous setup calls to prevent + // stale track IDs when metric configuration changes between runs + for(auto& [_, description] : tracks) + { + description.track_indexes.clear(); + } + device_data.xgmi_tracks = {}; + device_data.pcie_tracks = {}; + + LOG_DEBUG("[GPU perfetto_policy] Setting up counter tracks for device {}, " + "enabled_metrics=0x{:x}", + device_index, enabled_metric_config.value); + + for(auto& [num, description] : tracks) + { + auto enabled_metric 
= num & enabled_metric_config.value; + if(enabled_metric == 0) + { + continue; + } + + const auto process_xcp_array = [&](detail::track_description& desc, + size_t array_size, size_t xcp_id) { + for(std::size_t i = 0; i < array_size; ++i) + { + const auto track_id = counter_track::emplace( + device_index, addendum_blk(i, desc.track_name, xcp_id), + desc.units); + desc.track_indexes.emplace_back(track_id); + } + }; + + if(enabled_metric == detail::VCN_BUSY_VALUE || + enabled_metric == detail::JPEG_BUSY_VALUE) + { + // Per-XCP metrics (MI300): create tracks for each XCP partition + auto array_size = (enabled_metric == detail::VCN_BUSY_VALUE) + ? AMDSMI_MAX_NUM_VCN + : ROCPROFSYS_AMDSMI_JPEG_ENGINE_COUNT; + for(std::size_t xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp) + { + process_xcp_array(description, array_size, xcp); + } + } + else if(enabled_metric == detail::VCN_ACTIVITY_VALUE || + enabled_metric == detail::JPEG_ACTIVITY_VALUE) + { + // Device-level metrics (Radeon): flat array, no XCP dimension + auto array_size = (enabled_metric == detail::VCN_ACTIVITY_VALUE) + ? 
AMDSMI_MAX_NUM_VCN + : ROCPROFSYS_AMDSMI_JPEG_ENGINE_COUNT; + for(std::size_t i = 0; i < array_size; ++i) + { + description.track_indexes.emplace_back(counter_track::emplace( + device_index, addendum_blk(i, description.track_name), + description.units)); + } + } + else + { + description.track_indexes.emplace_back(counter_track::emplace( + device_index, addendum(description.track_name), description.units)); + } + } + + if(enabled_metric_config.bits.xgmi) + { + auto& xgmi_tracks = device_data.xgmi_tracks; + + xgmi_tracks.link_width.emplace_back(counter_track::emplace( + device_index, addendum("XGMI Link Width"), "lanes")); + xgmi_tracks.link_speed.emplace_back(counter_track::emplace( + device_index, addendum("XGMI Link Speed"), "Mbps")); + + for(std::size_t link = 0; link < AMDSMI_MAX_NUM_XGMI_LINKS; ++link) + { + xgmi_tracks.read_data.emplace_back(counter_track::emplace( + device_index, addendum_blk(link, "XGMI Read Data"), "KB")); + xgmi_tracks.write_data.emplace_back(counter_track::emplace( + device_index, addendum_blk(link, "XGMI Write Data"), "KB")); + } + } + + if(enabled_metric_config.bits.pcie) + { + auto& pcie_tracks = device_data.pcie_tracks; + + pcie_tracks.link_width.emplace_back(counter_track::emplace( + device_index, addendum("PCIe Link Width"), "lanes")); + pcie_tracks.link_speed.emplace_back(counter_track::emplace( + device_index, addendum("PCIe Link Speed"), "MT/s")); + pcie_tracks.bandwidth_acc.emplace_back(counter_track::emplace( + device_index, addendum("PCIe Bandwidth Acc"), "bytes")); + pcie_tracks.bandwidth_inst.emplace_back(counter_track::emplace( + device_index, addendum("PCIe Bandwidth Inst"), "bytes/s")); + } + } + + /** + * @brief Store a PMC sample for later Perfetto serialization. + * + * Buffers the metric sample for batch processing during post_process(). 
+ * + * @param device_index GPU device index + * @param metric_values Collected metric values + * @param timestamp Sample timestamp in nanoseconds + */ + static void store_sample(size_t device_index, const metrics& metric_values, + uint64_t timestamp) + { + detail::get_perfetto_data()[device_index].samples->emplace_back( + detail::perfetto_amd_smi_sample{ timestamp, metric_values }); + } + + /** + * @brief Post-process buffered samples and write to Perfetto trace. + * + * Serializes all buffered PMC samples to Perfetto counter tracks. + * This is called at the end of profiling to flush all samples. + * Supported metrics are retrieved from the cached device data. + * + * @param enabled_metrics Metrics that were enabled during collection + */ + static void post_process(pmc::collectors::gpu::enabled_metrics enabled_metrics_cfg) + { + for(const auto& [device_index, data] : detail::get_perfetto_data()) + { + post_process_device(device_index, enabled_metrics_cfg, + data.supported_metrics); + } + } + +private: + static void post_process_device( + size_t device_index, pmc::collectors::gpu::enabled_metrics enabled_metrics_cfg, + pmc::collectors::gpu::enabled_metrics supported_metrics) + { + auto& samples = *detail::get_perfetto_data()[device_index].samples; + + LOG_DEBUG("[GPU perfetto_policy] Post-processing {} PMC samples for device [{}], " + "enabled=0x{:x}, supported=0x{:x}", + samples.size(), device_index, enabled_metrics_cfg.value, + supported_metrics.value); + + const auto& thread_info = thread_info::get(0, InternalTID); + if(!thread_info) + { + return; + } + + pmc::collectors::gpu::enabled_metrics effective_metrics; + effective_metrics.value = + static_cast(enabled_metrics_cfg.value & supported_metrics.value); + + if(effective_metrics.value == 0) + { + LOG_DEBUG("No enabled PMC metrics for device [{}]", device_index); + return; + } + + auto& device_data = detail::get_perfetto_data()[device_index]; + auto& tracks = device_data.tracks; + + for(const auto& sample : 
samples) + { + const auto ts = sample.timestamp; + + if(!thread_info->is_valid_time(ts)) + { + LOG_WARNING("Invalid timestamp {} for PMC sample", ts); + continue; + } + + process_basic_metrics(device_index, ts, sample.metrics, effective_metrics, + tracks); + process_xcp_activity(device_index, ts, sample.metrics, effective_metrics, + tracks); + process_xgmi_metrics(device_index, ts, sample.metrics, effective_metrics, + device_data.xgmi_tracks); + process_pcie_metrics(device_index, ts, sample.metrics, effective_metrics, + device_data.pcie_tracks); + } + } + +private: + static void process_basic_metrics( + size_t device_index, uint64_t ts, const metrics& metric_values, + const enabled_metrics& effective_metrics, + std::unordered_map& tracks) + { + auto gfx_it = tracks.find(detail::GFX_BUSY_VALUE); + if(effective_metrics.bits.gfx_activity && gfx_it != tracks.end() && + !gfx_it->second.track_indexes.empty()) + { + TRACE_COUNTER( + "device_busy_gfx", + counter_track::at(device_index, gfx_it->second.track_indexes[0]), ts, + static_cast(metric_values.gfx_activity)); + } + + auto umc_it = tracks.find(detail::UMC_BUSY_VALUE); + if(effective_metrics.bits.umc_activity && umc_it != tracks.end() && + !umc_it->second.track_indexes.empty()) + { + TRACE_COUNTER( + "device_busy_umc", + counter_track::at(device_index, umc_it->second.track_indexes[0]), ts, + static_cast(metric_values.umc_activity)); + } + + auto mm_it = tracks.find(detail::MM_BUSY_VALUE); + if(effective_metrics.bits.mm_activity && mm_it != tracks.end() && + !mm_it->second.track_indexes.empty()) + { + TRACE_COUNTER("device_busy_mm", + counter_track::at(device_index, mm_it->second.track_indexes[0]), + ts, static_cast(metric_values.mm_activity)); + } + + auto temp_it = tracks.find(detail::TEMPERATURE_VALUE); + if((effective_metrics.bits.edge_temperature || + effective_metrics.bits.hotspot_temperature) && + temp_it != tracks.end() && !temp_it->second.track_indexes.empty()) + { + const double temp = 
effective_metrics.bits.hotspot_temperature + ? metric_values.hotspot_temperature + : metric_values.edge_temperature; + TRACE_COUNTER( + "device_temp", + counter_track::at(device_index, temp_it->second.track_indexes[0]), ts, + temp); + } + + auto power_it = tracks.find(detail::CURRENT_POWER_VALUE); + if((effective_metrics.bits.average_socket_power || + effective_metrics.bits.current_socket_power) && + power_it != tracks.end() && !power_it->second.track_indexes.empty()) + { + const double power = effective_metrics.bits.average_socket_power + ? metric_values.average_socket_power + : metric_values.current_socket_power; + TRACE_COUNTER( + "device_power", + counter_track::at(device_index, power_it->second.track_indexes[0]), ts, + power); + } + + auto memory_it = tracks.find(detail::MEMORY_USAGE_VALUE); + if(effective_metrics.bits.memory_usage && memory_it != tracks.end() && + !memory_it->second.track_indexes.empty()) + { + const double usage = + metric_values.memory_usage / static_cast(tim::units::megabyte); + TRACE_COUNTER( + "device_memory_usage", + counter_track::at(device_index, memory_it->second.track_indexes[0]), ts, + usage); + } + + auto sdma_it = tracks.find(detail::SDMA_USAGE_VALUE); + if(effective_metrics.bits.sdma_usage && sdma_it != tracks.end() && + !sdma_it->second.track_indexes.empty()) + { + TRACE_COUNTER( + "device_sdma_usage", + counter_track::at(device_index, sdma_it->second.track_indexes[0]), ts, + static_cast(metric_values.sdma_usage)); + } + } + + static void process_xcp_activity( + size_t device_index, uint64_t ts, + const pmc::collectors::gpu::metrics& metric_values, + const pmc::collectors::gpu::enabled_metrics& effective_metrics, + std::unordered_map& tracks) + { + // Per-XCP VCN busy metrics (MI300) + auto vcn_busy_it = tracks.find(detail::VCN_BUSY_VALUE); + if(effective_metrics.bits.vcn_busy && vcn_busy_it != tracks.end() && + !vcn_busy_it->second.track_indexes.empty()) + { + size_t engine_id = 0; + for(const auto& xcp_stats : 
metric_values.xcp_stats) + { + for(const auto& vcn_val : xcp_stats.vcn_busy) + { + if(vcn_val != std::numeric_limits::max() && + engine_id < vcn_busy_it->second.track_indexes.size()) + { + TRACE_COUNTER("device_vcn_activity", + counter_track::at( + device_index, + vcn_busy_it->second.track_indexes[engine_id++]), + ts, vcn_val); + } + } + } + } + + // Device-level VCN activity (Radeon) + auto vcn_it = tracks.find(detail::VCN_ACTIVITY_VALUE); + if(effective_metrics.bits.vcn_activity && vcn_it != tracks.end() && + !vcn_it->second.track_indexes.empty()) + { + size_t engine_id = 0; + for(const auto& vcn_val : metric_values.vcn_activity) + { + if(vcn_val != std::numeric_limits::max() && + engine_id < vcn_it->second.track_indexes.size()) + { + TRACE_COUNTER( + "device_vcn_activity", + counter_track::at(device_index, + vcn_it->second.track_indexes[engine_id++]), + ts, vcn_val); + } + } + } + + // Per-XCP JPEG busy metrics (MI300) + auto jpeg_busy_it = tracks.find(detail::JPEG_BUSY_VALUE); + if(effective_metrics.bits.jpeg_busy && jpeg_busy_it != tracks.end() && + !jpeg_busy_it->second.track_indexes.empty()) + { + size_t engine_id = 0; + for(const auto& xcp_stats : metric_values.xcp_stats) + { + for(const auto& jpeg_val : xcp_stats.jpeg_busy) + { + if(jpeg_val != std::numeric_limits::max() && + engine_id < jpeg_busy_it->second.track_indexes.size()) + { + TRACE_COUNTER( + "device_jpeg_activity", + counter_track::at( + device_index, + jpeg_busy_it->second.track_indexes[engine_id++]), + ts, jpeg_val); + } + } + } + } + + // Device-level JPEG activity (Radeon) + auto jpeg_it = tracks.find(detail::JPEG_ACTIVITY_VALUE); + if(effective_metrics.bits.jpeg_activity && jpeg_it != tracks.end() && + !jpeg_it->second.track_indexes.empty()) + { + size_t engine_id = 0; + for(const auto& jpeg_val : metric_values.jpeg_activity) + { + if(jpeg_val != std::numeric_limits::max() && + engine_id < jpeg_it->second.track_indexes.size()) + { + TRACE_COUNTER( + "device_jpeg_activity", + 
counter_track::at(device_index, + jpeg_it->second.track_indexes[engine_id++]), + ts, jpeg_val); + } + } + } + } + + static void process_xgmi_metrics(size_t device_index, uint64_t ts, + const metrics& metric_values, + const enabled_metrics& effective_metrics, + const detail::xgmi_track_set& xgmi_tracks) + { + if(!effective_metrics.bits.xgmi) + { + return; + } + + if(!xgmi_tracks.link_width.empty() && metric_values.xgmi.link.width != 0) + { + TRACE_COUNTER("device_xgmi_link_width", + counter_track::at(device_index, xgmi_tracks.link_width[0]), ts, + static_cast(metric_values.xgmi.link.width)); + } + + if(!xgmi_tracks.link_speed.empty() && metric_values.xgmi.link.speed != 0) + { + TRACE_COUNTER("device_xgmi_link_speed", + counter_track::at(device_index, xgmi_tracks.link_speed[0]), ts, + static_cast(metric_values.xgmi.link.speed)); + } + + for(size_t link = 0; + link < AMDSMI_MAX_NUM_XGMI_LINKS && link < xgmi_tracks.read_data.size(); + ++link) + { + if(metric_values.xgmi.data_acc.read[link] != 0) + { + TRACE_COUNTER( + "device_xgmi_read_data", + counter_track::at(device_index, xgmi_tracks.read_data[link]), ts, + static_cast(metric_values.xgmi.data_acc.read[link])); + } + } + + for(size_t link = 0; + link < AMDSMI_MAX_NUM_XGMI_LINKS && link < xgmi_tracks.write_data.size(); + ++link) + { + if(metric_values.xgmi.data_acc.write[link] != 0) + { + TRACE_COUNTER( + "device_xgmi_write_data", + counter_track::at(device_index, xgmi_tracks.write_data[link]), ts, + static_cast(metric_values.xgmi.data_acc.write[link])); + } + } + } + + static void process_pcie_metrics(size_t device_index, uint64_t ts, + const metrics& metric_values, + const enabled_metrics& effective_metrics, + const detail::pcie_track_set& pcie_tracks) + { + if(!effective_metrics.bits.pcie) + { + return; + } + + if(!pcie_tracks.link_width.empty() && metric_values.pcie.link.width != 0) + { + TRACE_COUNTER("device_pcie_link_width", + counter_track::at(device_index, pcie_tracks.link_width[0]), ts, + 
static_cast(metric_values.pcie.link.width)); + } + + if(!pcie_tracks.link_speed.empty() && metric_values.pcie.link.speed != 0) + { + TRACE_COUNTER("device_pcie_link_speed", + counter_track::at(device_index, pcie_tracks.link_speed[0]), ts, + static_cast(metric_values.pcie.link.speed)); + } + + if(!pcie_tracks.bandwidth_acc.empty() && metric_values.pcie.bandwidth.acc != 0) + { + TRACE_COUNTER("device_pcie_bandwidth_acc", + counter_track::at(device_index, pcie_tracks.bandwidth_acc[0]), + ts, static_cast(metric_values.pcie.bandwidth.acc)); + } + + if(!pcie_tracks.bandwidth_inst.empty() && metric_values.pcie.bandwidth.inst != 0) + { + TRACE_COUNTER("device_pcie_bandwidth_inst", + counter_track::at(device_index, pcie_tracks.bandwidth_inst[0]), + ts, static_cast(metric_values.pcie.bandwidth.inst)); + } + } +}; + +} // namespace rocprofsys::pmc::collectors::gpu diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/gpu/sample.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/gpu/sample.hpp new file mode 100644 index 00000000000..327077bf470 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/gpu/sample.hpp @@ -0,0 +1,105 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "core/trace_cache/sample_type.hpp" +#include "library/pmc/collectors/gpu/types.hpp" + +#include + +namespace rocprofsys::pmc::collectors::gpu +{ + +/** + * @brief GPU PMC sample type. + * + * This struct represents a sample of GPU performance metrics collected by the PMC. 
+ */ +struct sample : trace_cache::cacheable_t +{ + static constexpr trace_cache::type_identifier_t type_identifier{ + trace_cache::type_identifier_t::gpu_pmc_sample + }; + + sample() = default; + sample(enabled_metrics _settings, uint32_t _device_id, size_t _timestamp, + metrics _metric_values) + : enabled_metric(_settings) + , device_id(_device_id) + , timestamp(_timestamp) + , metric_values(_metric_values) + {} + + enabled_metrics enabled_metric{}; + uint32_t device_id = 0; + uint64_t timestamp = 0; + metrics metric_values{}; +}; + +} // namespace rocprofsys::pmc::collectors::gpu + +namespace rocprofsys::trace_cache +{ + +template <> +inline void +serialize(uint8_t* buffer, const pmc::collectors::gpu::sample& item) +{ + utility::store_value( + buffer, static_cast(item.enabled_metric.value), item.device_id, + item.timestamp, item.metric_values.average_socket_power, + item.metric_values.current_socket_power, item.metric_values.memory_usage, + item.metric_values.hotspot_temperature, item.metric_values.edge_temperature, + item.metric_values.gfx_activity, item.metric_values.umc_activity, + item.metric_values.mm_activity, item.metric_values.xcp_stats, + item.metric_values.vcn_activity, item.metric_values.jpeg_activity, + item.metric_values.xgmi.link.width, item.metric_values.xgmi.link.speed, + item.metric_values.xgmi.data_acc.read, item.metric_values.xgmi.data_acc.write, + item.metric_values.pcie.link.width, item.metric_values.pcie.link.speed, + item.metric_values.pcie.bandwidth.acc, item.metric_values.pcie.bandwidth.inst, + item.metric_values.sdma_usage); +} + +template <> +inline pmc::collectors::gpu::sample +deserialize(uint8_t*& buffer) +{ + pmc::collectors::gpu::sample item; + utility::parse_value( + buffer, item.enabled_metric.value, item.device_id, item.timestamp, + item.metric_values.average_socket_power, item.metric_values.current_socket_power, + item.metric_values.memory_usage, item.metric_values.hotspot_temperature, + item.metric_values.edge_temperature, 
item.metric_values.gfx_activity, + item.metric_values.umc_activity, item.metric_values.mm_activity, + item.metric_values.xcp_stats, item.metric_values.vcn_activity, + item.metric_values.jpeg_activity, item.metric_values.xgmi.link.width, + item.metric_values.xgmi.link.speed, item.metric_values.xgmi.data_acc.read, + item.metric_values.xgmi.data_acc.write, item.metric_values.pcie.link.width, + item.metric_values.pcie.link.speed, item.metric_values.pcie.bandwidth.acc, + item.metric_values.pcie.bandwidth.inst, item.metric_values.sdma_usage); + return item; +} + +template <> +inline size_t +get_size(const pmc::collectors::gpu::sample& item) +{ + return utility::get_size( + item.enabled_metric.value, item.device_id, item.timestamp, + item.metric_values.average_socket_power, item.metric_values.current_socket_power, + item.metric_values.memory_usage, item.metric_values.hotspot_temperature, + item.metric_values.edge_temperature, item.metric_values.gfx_activity, + item.metric_values.umc_activity, item.metric_values.mm_activity, + item.metric_values.xcp_stats, item.metric_values.vcn_activity, + item.metric_values.jpeg_activity, item.metric_values.xgmi.link.width, + item.metric_values.xgmi.link.speed, item.metric_values.xgmi.data_acc.read, + item.metric_values.xgmi.data_acc.write, item.metric_values.pcie.link.width, + item.metric_values.pcie.link.speed, item.metric_values.pcie.bandwidth.acc, + item.metric_values.pcie.bandwidth.inst, item.metric_values.sdma_usage); +} + +/// @brief GPU PMC sample type alias +using gpu_pmc_sample = pmc::collectors::gpu::sample; + +} // namespace rocprofsys::trace_cache diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/gpu/tests/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/gpu/tests/CMakeLists.txt new file mode 100644 index 00000000000..7ef6eae1a21 --- /dev/null +++ 
b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/gpu/tests/CMakeLists.txt @@ -0,0 +1,15 @@ +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + +add_library(pmc-gpu-collector-tests OBJECT test_device.cpp) + +target_link_libraries( + pmc-gpu-collector-tests + PUBLIC + rocprofiler-systems-pmc-library + rocprofiler-systems-googletest-library + rocprofiler-systems-logger + rocprofiler-systems::rocprofiler-systems-headers + rocprofiler-systems::rocprofiler-systems-rocm + rocprofiler-systems::rocprofiler-systems-timemory +) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/gpu/tests/test_device.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/gpu/tests/test_device.cpp new file mode 100644 index 00000000000..64551dd9be0 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/gpu/tests/test_device.cpp @@ -0,0 +1,2527 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +// Include amd_smi.hpp first to get proper AMD_SMI_SDMA_SUPPORTED detection +// based on the actual AMD SMI library version +#include "core/amd_smi.hpp" + +#include "library/pmc/collectors/gpu/device.hpp" +#include "library/pmc/device_providers/amd_smi/drivers/tests/mock_driver.hpp" + +#include +#include + +#include + +using namespace rocprofsys::pmc::collectors::gpu; +using ::testing::_; +using ::testing::AnyNumber; +using ::testing::AtLeast; +using ::testing::DoAll; +using ::testing::Return; +using ::testing::SetArgPointee; +using ::testing::StrictMock; + +using MockDriver = + ::testing::StrictMock; + +namespace rocprofsys::pmc::collectors::gpu::testing +{ + +/** + * @brief Test fixture for GPU device tests. + * + * Provides common setup for device tests including mock driver and + * helper methods for configuring mock behavior. 
+ */ +class DeviceTest : public ::testing::Test +{ +protected: + std::shared_ptr mock_driver; + amdsmi_processor_handle test_handle; + processor_type_t test_processor_type; + size_t test_index; + + void SetUp() override + { + mock_driver = std::make_shared(); + test_handle = reinterpret_cast(0x1234); + test_processor_type = AMDSMI_PROCESSOR_TYPE_AMD_GPU; + test_index = 0; + + // Device info is always called during device initialization + EXPECT_CALL(*mock_driver, get_gpu_asic_info(test_handle, _)) + .Times(AnyNumber()) + .WillRepeatedly(Return(AMDSMI_STATUS_SUCCESS)); + } + + /** + * @brief Setup SDMA mock expectations for any device mock. + * Call this for any mock that will have devices constructed with it. + * No-op when SDMA is not supported. + */ + template + static void SetupSDMAExpectations([[maybe_unused]] MockPtr& mock, + [[maybe_unused]] amdsmi_processor_handle handle) + { +#if defined(AMD_SMI_SDMA_SUPPORTED) && AMD_SMI_SDMA_SUPPORTED == 1 + EXPECT_CALL(*mock, get_gpu_process_list(handle, _, _)) + .Times(AnyNumber()) + .WillRepeatedly(DoAll(SetArgPointee<1>(1), Return(AMDSMI_STATUS_SUCCESS))); +#endif + } + + /** + * @brief Configure mock to return GPU metrics with all valid values. 
+ */ + void SetupAllMetricsSupported() + { + amdsmi_gpu_metrics_t metrics = CreateValidMetrics(); + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t mem_usage = 8589934592ULL; // 8 GB + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(mem_usage), Return(AMDSMI_STATUS_SUCCESS))); + + // SDMA support - allow any number of calls (happens during construction and + // metrics collection) +#if defined(AMD_SMI_SDMA_SUPPORTED) && AMD_SMI_SDMA_SUPPORTED == 1 + EXPECT_CALL(*mock_driver, get_gpu_process_list(test_handle, _, _)) + .Times(AnyNumber()) + .WillRepeatedly(DoAll(SetArgPointee<1>(1), Return(AMDSMI_STATUS_SUCCESS))); +#endif + } + + /** + * @brief Configure mock to return GPU metrics with all sentinel values. + */ + void SetupNoMetricsSupported() + { + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + // SDMA support +#if defined(AMD_SMI_SDMA_SUPPORTED) && AMD_SMI_SDMA_SUPPORTED == 1 + EXPECT_CALL(*mock_driver, get_gpu_process_list(test_handle, _, _)) + .Times(AnyNumber()) + .WillRepeatedly(Return(AMDSMI_STATUS_NOT_SUPPORTED)); +#endif + } + + /** + * @brief Configure mock to return GPU metrics with partial support. + * + * Returns valid values for: + * - current_socket_power + * - temperature_hotspot + * - average_gfx_activity + * + * All other metrics return sentinel values. 
+ */ + void SetupPartialMetricsSupported() + { + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + // Set specific metrics to valid values + metrics.current_socket_power = 150; // Valid power (watts) + metrics.temperature_hotspot = 75; // Valid temp (degrees Celsius) + metrics.average_gfx_activity = 85; // Valid activity (percent) + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + constexpr uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + // SDMA support +#if defined(AMD_SMI_SDMA_SUPPORTED) && AMD_SMI_SDMA_SUPPORTED == 1 + EXPECT_CALL(*mock_driver, get_gpu_process_list(test_handle, _, _)) + .Times(AnyNumber()) + .WillRepeatedly(DoAll(SetArgPointee<1>(1), Return(AMDSMI_STATUS_SUCCESS))); +#endif + } + + /** + * @brief Create amdsmi_gpu_metrics_t with all valid (non-sentinel) values. 
+ */ + static amdsmi_gpu_metrics_t CreateValidMetrics() + { + amdsmi_gpu_metrics_t metrics{}; + + // Power metrics + metrics.current_socket_power = 150; + metrics.average_socket_power = 140; + + // Temperature metrics (in millidegrees Celsius) + metrics.temperature_hotspot = 75; + metrics.temperature_edge = 70; + + // Activity metrics (percentage) + metrics.average_gfx_activity = 85; + metrics.average_umc_activity = 60; + metrics.average_mm_activity = 40; + + // XCP stats - VCN and JPEG activity + for(size_t xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp) + { + for(size_t i = 0; i < AMDSMI_MAX_NUM_VCN; ++i) + { + metrics.xcp_stats[xcp].vcn_busy[i] = static_cast(50 + i); + } + for(size_t i = 0; i < ROCPROFSYS_AMDSMI_JPEG_ENGINE_COUNT; ++i) + { + metrics.xcp_stats[xcp].jpeg_busy[i] = static_cast(30 + i); + } + } + + // XGMI metrics + metrics.xgmi_link_width = 16; + metrics.xgmi_link_speed = 25000; + for(size_t i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) + { + metrics.xgmi_read_data_acc[i] = 1000000ULL + i; + metrics.xgmi_write_data_acc[i] = 2000000ULL + i; + } + + // PCIe metrics + metrics.pcie_link_width = 16; + metrics.pcie_link_speed = 16000; // Gen4 + metrics.pcie_bandwidth_acc = 500000000ULL; + metrics.pcie_bandwidth_inst = 10000000ULL; + + return metrics; + } + + /** + * @brief Create amdsmi_gpu_metrics_t with all sentinel values. 
+ */ + static amdsmi_gpu_metrics_t CreateSentinelMetrics() + { + amdsmi_gpu_metrics_t metrics{}; + + // uint16_t sentinel values + metrics.current_socket_power = 0xFFFF; + metrics.average_socket_power = 0xFFFF; + metrics.average_gfx_activity = 0xFFFF; + metrics.average_umc_activity = 0xFFFF; + metrics.average_mm_activity = 0xFFFF; + + // Temperature sentinel values (uint16_t fields) + metrics.temperature_hotspot = 0xFFFF; + metrics.temperature_edge = 0xFFFF; + + // 16-bit sentinel for XCP stats + for(size_t xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp) + { + for(size_t i = 0; i < AMDSMI_MAX_NUM_VCN; ++i) + { + metrics.xcp_stats[xcp].vcn_busy[i] = 0xFFFF; + } + for(size_t i = 0; i < ROCPROFSYS_AMDSMI_JPEG_ENGINE_COUNT; ++i) + { + metrics.xcp_stats[xcp].jpeg_busy[i] = 0xFFFF; + } + } + + // 16-bit sentinel for device-level VCN/JPEG activity arrays + for(size_t i = 0; i < AMDSMI_MAX_NUM_VCN; ++i) + { + metrics.vcn_activity[i] = 0xFFFF; + } + for(size_t i = 0; i < AMDSMI_MAX_NUM_JPEG; ++i) + { + metrics.jpeg_activity[i] = 0xFFFF; + } + + // 16-bit sentinel for XGMI link info + metrics.xgmi_link_width = 0xFFFF; + metrics.xgmi_link_speed = 0xFFFF; + + // 64-bit sentinel for XGMI data + for(size_t i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) + { + metrics.xgmi_read_data_acc[i] = 0xFFFFFFFFFFFFFFFFULL; + metrics.xgmi_write_data_acc[i] = 0xFFFFFFFFFFFFFFFFULL; + } + + // 16-bit sentinel for PCIe link info + metrics.pcie_link_width = 0xFFFF; + metrics.pcie_link_speed = 0xFFFF; + + // 64-bit sentinel for PCIe bandwidth + metrics.pcie_bandwidth_acc = 0xFFFFFFFFFFFFFFFFULL; + metrics.pcie_bandwidth_inst = 0xFFFFFFFFFFFFFFFFULL; + + return metrics; + } +}; + +// ============================================================================ +// Category 1: Constructor and Initialization Tests +// ============================================================================ + +/** + * TC1.1: Valid Device Construction with Full Metric Support + * + * Objective: Verify device initializes 
correctly when all metrics are supported. + */ +TEST_F(DeviceTest, valid_device_construction_full_support) +{ + // Setup: All metrics return valid values + SetupAllMetricsSupported(); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify device is supported + EXPECT_TRUE(dev.is_supported()); + + // Verify all metric bits are set + auto supported = dev.get_supported_metrics(); + EXPECT_NE(supported.value, 0U); + + // Verify basic properties + EXPECT_EQ(dev.get_index(), test_index); +} + +/** + * TC1.2: Device Construction with No Supported Metrics + * + * Objective: Verify device handles hardware with no supported metrics. + */ +TEST_F(DeviceTest, device_construction_no_support) +{ + // Setup: All metrics return sentinel values + SetupNoMetricsSupported(); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify device is not supported + EXPECT_FALSE(dev.is_supported()); + + // Verify no metric bits are set + auto supported = dev.get_supported_metrics(); + EXPECT_EQ(supported.value, 0U); + + // Verify get_gpu_metrics returns all zeros + auto metrics = dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + EXPECT_EQ(metrics.current_socket_power, 0U); + EXPECT_EQ(metrics.average_socket_power, 0U); + EXPECT_EQ(metrics.memory_usage, 0ULL); +} + +/** + * TC1.3: Device Construction with Partial Metric Support + * + * Objective: Verify selective metric initialization. 
+ */ +TEST_F(DeviceTest, device_construction_partial_support) +{ + // Setup: Only specific metrics supported + SetupPartialMetricsSupported(); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify device is supported (at least one metric available) + EXPECT_TRUE(dev.is_supported()); + + // Verify only expected metrics are marked as supported + auto supported = dev.get_supported_metrics(); + + EXPECT_TRUE(supported.bits.current_socket_power); + EXPECT_TRUE(supported.bits.hotspot_temperature); + EXPECT_TRUE(supported.bits.gfx_activity); + + // Verify unsupported metrics are not set + EXPECT_FALSE(supported.bits.average_socket_power); + EXPECT_FALSE(supported.bits.edge_temperature); + EXPECT_FALSE(supported.bits.umc_activity); + EXPECT_FALSE(supported.bits.mm_activity); + EXPECT_FALSE(supported.bits.memory_usage); + EXPECT_FALSE(supported.bits.vcn_activity); + EXPECT_FALSE(supported.bits.jpeg_activity); + EXPECT_FALSE(supported.bits.vcn_busy); + EXPECT_FALSE(supported.bits.jpeg_busy); + EXPECT_FALSE(supported.bits.xgmi); + EXPECT_FALSE(supported.bits.pcie); +} + +/** + * TC1.4: Device Construction with Different Indices + * + * Objective: Verify device index is correctly stored for different device instances. 
+ */ +TEST_F(DeviceTest, device_construction_different_indices) +{ + SetupAllMetricsSupported(); + + // Test with different indices + { + device dev(mock_driver, test_handle, AMDSMI_PROCESSOR_TYPE_AMD_GPU, + 0); + EXPECT_EQ(dev.get_index(), 0U); + } + + { + device dev(mock_driver, test_handle, AMDSMI_PROCESSOR_TYPE_AMD_GPU, + 1); + EXPECT_EQ(dev.get_index(), 1U); + } + + { + device dev(mock_driver, test_handle, AMDSMI_PROCESSOR_TYPE_AMD_GPU, + 2); + EXPECT_EQ(dev.get_index(), 2U); + } +} + +// ============================================================================ +// Category 2: Power Metrics Collection Tests +// ============================================================================ + +/** + * TC2.1: Current Socket Power Collection + * + * Objective: Verify current power is collected when supported. + */ +TEST_F(DeviceTest, current_socket_power_collection) +{ + // Setup: Mock returns specific current power value + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + metrics.current_socket_power = 150; // 150 watts + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device (initializes supported metrics) + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify current_socket_power is marked as supported + EXPECT_TRUE(dev.get_supported_metrics().bits.current_socket_power); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify current power value was collected + EXPECT_EQ(collected_metrics.current_socket_power, 
150U); +} + +/** + * TC2.2: Average Socket Power Collection + * + * Objective: Verify average power is collected when supported. + */ +TEST_F(DeviceTest, average_socket_power_collection) +{ + // Setup: Mock returns specific average power value + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + metrics.average_socket_power = 140; // 140 watts + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify average_socket_power is marked as supported + EXPECT_TRUE(dev.get_supported_metrics().bits.average_socket_power); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify average power value was collected + EXPECT_EQ(collected_metrics.average_socket_power, 140U); +} + +/** + * TC2.3: Power Metrics Not Collected When Unsupported + * + * Objective: Verify power metrics remain zero when not supported. 
+ */ +TEST_F(DeviceTest, power_metrics_not_collected_when_unsupported) +{ + // Setup: All metrics are sentinel values (unsupported) + SetupNoMetricsSupported(); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify power metrics are not marked as supported + auto supported = dev.get_supported_metrics(); + EXPECT_FALSE(supported.bits.current_socket_power); + EXPECT_FALSE(supported.bits.average_socket_power); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify power values remain zero + EXPECT_EQ(collected_metrics.current_socket_power, 0U); + EXPECT_EQ(collected_metrics.average_socket_power, 0U); +} + +// ============================================================================ +// Category 3: Temperature Metrics Collection Tests +// ============================================================================ + +/** + * TC2.4: Hotspot Temperature Collection + * + * Objective: Verify hotspot temperature collection. 
+ */ +TEST_F(DeviceTest, hotspot_temperature_collection) +{ + // Setup: Mock returns specific hotspot temperature value + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + metrics.temperature_hotspot = 75; // 75°C + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify hotspot_temperature is marked as supported + EXPECT_TRUE(dev.get_supported_metrics().bits.hotspot_temperature); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify hotspot temperature value was collected + EXPECT_EQ(collected_metrics.hotspot_temperature, 75); +} + +/** + * TC2.5: Edge Temperature Collection + * + * Objective: Verify edge temperature collection. 
+ */ +TEST_F(DeviceTest, edge_temperature_collection) +{ + // Setup: Mock returns specific edge temperature value + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + metrics.temperature_edge = 70; // 70°C in degrees Celsius + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + constexpr uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify edge_temperature is marked as supported + EXPECT_TRUE(dev.get_supported_metrics().bits.edge_temperature); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify edge temperature value was collected (raw value from AMD SMI) + EXPECT_EQ(collected_metrics.edge_temperature, 70); +} + +/** + * TC2.6: Temperature Metrics Not Collected When Unsupported + * + * Objective: Verify temperature skipped when not supported. 
+ */ +TEST_F(DeviceTest, temperature_metrics_not_collected_when_unsupported) +{ + // Setup: All metrics are sentinel values (unsupported) + SetupNoMetricsSupported(); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify temperature metrics are not marked as supported + auto supported = dev.get_supported_metrics(); + EXPECT_FALSE(supported.bits.hotspot_temperature); + EXPECT_FALSE(supported.bits.edge_temperature); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify temperature values remain zero + EXPECT_EQ(collected_metrics.hotspot_temperature, 0); + EXPECT_EQ(collected_metrics.edge_temperature, 0); +} + +// ============================================================================ +// Category 4: Activity Metrics Collection Tests +// ============================================================================ + +/** + * GFX Activity Collection + * + * Objective: Verify graphics engine activity collection. 
+ */ +TEST_F(DeviceTest, gfx_activity_collection) +{ + // Setup: Mock returns specific GFX activity value + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + metrics.average_gfx_activity = 85; // 85% activity + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify gfx_activity is marked as supported + EXPECT_TRUE(dev.get_supported_metrics().bits.gfx_activity); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify GFX activity value was collected + EXPECT_EQ(collected_metrics.gfx_activity, 85U); +} + +/** + * UMC Activity Collection + * + * Objective: Verify memory controller activity collection. 
+ */ +TEST_F(DeviceTest, umc_activity_collection) +{ + // Setup: Mock returns specific UMC activity value + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + metrics.average_umc_activity = 60; // 60% activity + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify umc_activity is marked as supported + EXPECT_TRUE(dev.get_supported_metrics().bits.umc_activity); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify UMC activity value was collected + EXPECT_EQ(collected_metrics.umc_activity, 60U); +} + +/** + * MM Activity Collection + * + * Objective: Verify multimedia activity collection. 
+ */ +TEST_F(DeviceTest, mm_activity_collection) +{ + // Setup: Mock returns specific MM activity value + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + metrics.average_mm_activity = 40; // 40% activity + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify mm_activity is marked as supported + EXPECT_TRUE(dev.get_supported_metrics().bits.mm_activity); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify MM activity value was collected + EXPECT_EQ(collected_metrics.mm_activity, 40U); +} + +/** + * All Activity Metrics Collection + * + * Objective: Verify all three activity metrics collected together. 
+ */ +TEST_F(DeviceTest, all_activity_metrics_collection) +{ + // Setup: Mock returns all three activity values + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + metrics.average_gfx_activity = 85; + metrics.average_umc_activity = 60; + metrics.average_mm_activity = 40; + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify all activity metrics are marked as supported + auto supported = dev.get_supported_metrics(); + EXPECT_TRUE(supported.bits.gfx_activity); + EXPECT_TRUE(supported.bits.umc_activity); + EXPECT_TRUE(supported.bits.mm_activity); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify all three activity values were collected correctly + EXPECT_EQ(collected_metrics.gfx_activity, 85U); + EXPECT_EQ(collected_metrics.umc_activity, 60U); + EXPECT_EQ(collected_metrics.mm_activity, 40U); +} + +// ============================================================================ +// Category 5: Memory Usage Collection Tests +// ============================================================================ + +/** + * VRAM Memory Usage Collection Success + * + * Objective: Verify VRAM usage collected when API succeeds. 
+ */ +TEST_F(DeviceTest, vram_memory_usage_collection_success) +{ + // Setup: Mock returns sentinel for all GPU metrics + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + // Mock returns valid memory usage (8 GB) + uint64_t mem_usage = 8589934592ULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(mem_usage), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify memory_usage is marked as supported + EXPECT_TRUE(dev.get_supported_metrics().bits.memory_usage); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify memory usage value was collected + EXPECT_EQ(collected_metrics.memory_usage, 8589934592ULL); +} + +/** + * Memory Usage Collection Failure + * + * Objective: Verify memory usage remains zero on API failure. 
+ */ +TEST_F(DeviceTest, memory_usage_collection_failure) +{ + // Setup: Mock returns sentinel for all GPU metrics + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + // Mock returns failure for memory usage + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly(Return(AMDSMI_STATUS_NOT_SUPPORTED)); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify memory_usage is NOT marked as supported + EXPECT_FALSE(dev.get_supported_metrics().bits.memory_usage); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify memory usage remains zero + EXPECT_EQ(collected_metrics.memory_usage, 0ULL); +} + +/** + * Memory Usage Not Collected When Unsupported + * + * Objective: Verify early return when memory not supported. 
+ */ +TEST_F(DeviceTest, memory_usage_not_collected_when_unsupported) +{ + // Setup: All metrics are sentinel values (unsupported) + SetupNoMetricsSupported(); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify memory_usage is NOT marked as supported + EXPECT_FALSE(dev.get_supported_metrics().bits.memory_usage); + + // Mock should NOT be called for memory usage during collection + // (because supported bit is false, early return happens) + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(0); // Should not be called during get_gpu_metrics() + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify memory usage remains zero + EXPECT_EQ(collected_metrics.memory_usage, 0ULL); +} + +// ============================================================================ +// Category 6: XCP Metrics Collection Tests +// ============================================================================ + +/** + * VCN Busy Collection - All XCPs (MI300) + * + * Objective: Verify per-XCP VCN busy stats copied for all XCP instances. 
+ */ +TEST_F(DeviceTest, vcn_busy_collection_all_xcps) +{ + // Setup: Mock returns valid VCN busy values for all XCPs + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + // Set VCN busy values for all XCP instances + for(size_t xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp) + { + for(size_t vcn = 0; vcn < AMDSMI_MAX_NUM_VCN; ++vcn) + { + metrics.xcp_stats[xcp].vcn_busy[vcn] = static_cast(50 + xcp + vcn); + } + } + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify vcn_busy is marked as supported (per-XCP metrics) + EXPECT_TRUE(dev.get_supported_metrics().bits.vcn_busy); + // Device-level vcn_activity should NOT be set when per-XCP is available + EXPECT_FALSE(dev.get_supported_metrics().bits.vcn_activity); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify all XCP VCN arrays were copied correctly + for(size_t xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp) + { + for(size_t vcn = 0; vcn < AMDSMI_MAX_NUM_VCN; ++vcn) + { + EXPECT_EQ(collected_metrics.xcp_stats[xcp].vcn_busy[vcn], + static_cast(50 + xcp + vcn)); + } + } +} + +/** + * JPEG Activity Collection - All XCPs + * + * Objective: Verify JPEG busy stats copied for all XCP instances. 
+ */ +TEST_F(DeviceTest, jpeg_activity_collection_all_xcps) +{ + // Setup: Mock returns valid JPEG busy values for all XCPs + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + // Set JPEG activity values for all XCP instances + for(size_t xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp) + { + for(size_t jpeg = 0; jpeg < ROCPROFSYS_AMDSMI_JPEG_ENGINE_COUNT; ++jpeg) + { + metrics.xcp_stats[xcp].jpeg_busy[jpeg] = + static_cast(30 + xcp + jpeg); + } + } + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify jpeg_busy (per-XCP) is marked as supported + EXPECT_TRUE(dev.get_supported_metrics().bits.jpeg_busy); + // Device-level jpeg_activity should NOT be set when per-XCP is available + EXPECT_FALSE(dev.get_supported_metrics().bits.jpeg_activity); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify all XCP JPEG arrays were copied correctly + for(size_t xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp) + { + for(size_t jpeg = 0; jpeg < ROCPROFSYS_AMDSMI_JPEG_ENGINE_COUNT; ++jpeg) + { + EXPECT_EQ(collected_metrics.xcp_stats[xcp].jpeg_busy[jpeg], + static_cast(30 + xcp + jpeg)); + } + } +} + +/** + * XCP Metrics Not Collected When Unsupported + * + * Objective: Verify XCP metrics skipped when not supported. 
+ */ +TEST_F(DeviceTest, xcp_metrics_not_collected_when_unsupported) +{ + // Setup: All metrics are sentinel values (unsupported) + SetupNoMetricsSupported(); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify XCP metrics are NOT marked as supported + auto supported = dev.get_supported_metrics(); + EXPECT_FALSE(supported.bits.vcn_busy); + EXPECT_FALSE(supported.bits.jpeg_busy); + EXPECT_FALSE(supported.bits.vcn_activity); + EXPECT_FALSE(supported.bits.jpeg_activity); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify XCP arrays remain default-initialized (all zeros) + for(size_t xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp) + { + for(size_t vcn = 0; vcn < AMDSMI_MAX_NUM_VCN; ++vcn) + { + EXPECT_EQ(collected_metrics.xcp_stats[xcp].vcn_busy[vcn], 0); + } + for(size_t jpeg = 0; jpeg < ROCPROFSYS_AMDSMI_JPEG_ENGINE_COUNT; ++jpeg) + { + EXPECT_EQ(collected_metrics.xcp_stats[xcp].jpeg_busy[jpeg], 0); + } + } +} + +/** + * Mixed VCN/JPEG Support + * + * Objective: Verify VCN collected but not JPEG when only VCN supported. 
+ */
+TEST_F(DeviceTest, mixed_vcn_jpeg_support)
+{
+    // Setup: Only VCN is supported, JPEG is not
+    amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics();
+
+    // Set valid VCN values for all XCPs. The cast names the 16-bit element
+    // type explicitly (the JPEG entries keep the 0xFFFF sentinel).
+    for(size_t xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp)
+    {
+        for(size_t vcn = 0; vcn < AMDSMI_MAX_NUM_VCN; ++vcn)
+        {
+            metrics.xcp_stats[xcp].vcn_busy[vcn] = static_cast<uint16_t>(50 + vcn);
+        }
+        // JPEG remains sentinel (0xFFFF)
+    }
+
+    EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _))
+        .Times(AtLeast(1))
+        .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS)));
+
+    // The VRAM query succeeds but hands back the all-ones sentinel value.
+    uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL;
+    EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _))
+        .Times(AtLeast(1))
+        .WillRepeatedly(
+            DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS)));
+
+    SetupSDMAExpectations(mock_driver, test_handle);
+
+    // Create device
+    device dev(mock_driver, test_handle, test_processor_type, test_index);
+
+    // Verify only VCN busy (per-XCP) is supported
+    auto supported = dev.get_supported_metrics();
+    EXPECT_TRUE(supported.bits.vcn_busy);
+    EXPECT_FALSE(supported.bits.jpeg_busy);
+    // Device-level vcn_activity/jpeg_activity should NOT be set when per-XCP is available
+    EXPECT_FALSE(supported.bits.vcn_activity);
+    EXPECT_FALSE(supported.bits.jpeg_activity);
+
+    // Collect metrics with every metric enabled
+    auto collected_metrics =
+        dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL);
+
+    // Verify VCN arrays are populated
+    for(size_t xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp)
+    {
+        for(size_t vcn = 0; vcn < AMDSMI_MAX_NUM_VCN; ++vcn)
+        {
+            EXPECT_EQ(collected_metrics.xcp_stats[xcp].vcn_busy[vcn],
+                      static_cast<uint16_t>(50 + vcn));
+        }
+    }
+
+    // Verify JPEG arrays remain default-initialized (zeros)
+    for(size_t xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp)
+    {
+        for(size_t jpeg = 0; jpeg < ROCPROFSYS_AMDSMI_JPEG_ENGINE_COUNT; ++jpeg)
+        {
+            EXPECT_EQ(collected_metrics.xcp_stats[xcp].jpeg_busy[jpeg], 0);
+        }
+    }
+}
+ +// ============================================================================ +// Category 7: XGMI Metrics Collection Tests +// ============================================================================ + +/** + * XGMI Link Width Collection + * + * Objective: Verify XGMI link width is populated when supported. + */ +TEST_F(DeviceTest, xgmi_link_width_collection) +{ + // Setup: Mock returns specific XGMI link width value + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + metrics.xgmi_link_width = 16; // 16-bit link width + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify XGMI is marked as supported + EXPECT_TRUE(dev.get_supported_metrics().bits.xgmi); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify XGMI link width was collected + EXPECT_EQ(collected_metrics.xgmi.link.width, 16U); +} + +/** + * XGMI Link Speed Collection + * + * Objective: Verify XGMI link speed is populated when supported. 
+ */ +TEST_F(DeviceTest, xgmi_link_speed_collection) +{ + // Setup: Mock returns specific XGMI link speed value + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + metrics.xgmi_link_speed = 25; // 25 GT/s + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify XGMI is marked as supported + EXPECT_TRUE(dev.get_supported_metrics().bits.xgmi); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify XGMI link speed was collected + EXPECT_EQ(collected_metrics.xgmi.link.speed, 25U); +} + +/** + * XGMI Read/Write Data Collection for All Links + * + * Objective: Verify data accumulation for all XGMI links. 
+ */ +TEST_F(DeviceTest, xgmi_read_write_data_collection_all_links) +{ + // Setup: Mock returns valid read/write data for all XGMI links + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + // Populate read and write data for all XGMI links + for(size_t i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) + { + metrics.xgmi_read_data_acc[i] = 1000000 + i * 1000; // Read data in bytes + metrics.xgmi_write_data_acc[i] = 2000000 + i * 1000; // Write data in bytes + } + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify XGMI is marked as supported + EXPECT_TRUE(dev.get_supported_metrics().bits.xgmi); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify all XGMI link read/write data was collected + for(size_t i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) + { + EXPECT_EQ(collected_metrics.xgmi.data_acc.read[i], 1000000 + i * 1000); + EXPECT_EQ(collected_metrics.xgmi.data_acc.write[i], 2000000 + i * 1000); + } +} + +/** + * XGMI Sentinel Value Handling + * + * Objective: Verify sentinel values are zeroed out properly. 
+ */ +TEST_F(DeviceTest, xgmi_sentinel_value_handling) +{ + // Setup: Mix of valid and sentinel XGMI values + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + // Set valid link width, but sentinel link speed + metrics.xgmi_link_width = 16; + // xgmi_link_speed remains 0xFFFF (sentinel) + + // Set some valid and some sentinel read/write data + metrics.xgmi_read_data_acc[0] = 1000000; // Valid + metrics.xgmi_read_data_acc[1] = 0xFFFFFFFFFFFFFFFFULL; // Sentinel + metrics.xgmi_write_data_acc[0] = 2000000; // Valid + metrics.xgmi_write_data_acc[1] = 0xFFFFFFFFFFFFFFFFULL; // Sentinel + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify XGMI is marked as supported (at least one metric is valid) + EXPECT_TRUE(dev.get_supported_metrics().bits.xgmi); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify valid values are collected and sentinels are zeroed + EXPECT_EQ(collected_metrics.xgmi.link.width, 16U); + EXPECT_EQ(collected_metrics.xgmi.link.speed, 0U); // Sentinel converted to 0 + EXPECT_EQ(collected_metrics.xgmi.data_acc.read[0], 1000000U); + EXPECT_EQ(collected_metrics.xgmi.data_acc.read[1], 0U); // Sentinel converted to 0 + EXPECT_EQ(collected_metrics.xgmi.data_acc.write[0], 2000000U); + EXPECT_EQ(collected_metrics.xgmi.data_acc.write[1], 0U); // Sentinel converted to 0 +} + +/** + * XGMI Not Collected When Unsupported + * + * Objective: Verify early return 
when XGMI metrics are not supported. + */ +TEST_F(DeviceTest, xgmi_not_collected_when_unsupported) +{ + // Setup: All XGMI metrics are sentinel values (unsupported) + SetupNoMetricsSupported(); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify XGMI is NOT marked as supported + EXPECT_FALSE(dev.get_supported_metrics().bits.xgmi); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify all XGMI metrics remain default-initialized (zeros) + EXPECT_EQ(collected_metrics.xgmi.link.width, 0U); + EXPECT_EQ(collected_metrics.xgmi.link.speed, 0U); + + for(size_t i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) + { + EXPECT_EQ(collected_metrics.xgmi.data_acc.read[i], 0U); + EXPECT_EQ(collected_metrics.xgmi.data_acc.write[i], 0U); + } +} + +// ============================================================================ +// Category 8: PCIe Metrics Collection Tests +// ============================================================================ + +/** + * PCIe Link Width Collection + * + * Objective: Verify PCIe link width is populated when supported. 
+ */ +TEST_F(DeviceTest, pcie_link_width_collection) +{ + // Setup: Mock returns specific PCIe link width value + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + metrics.pcie_link_width = 16; // x16 PCIe lanes + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify PCIe is marked as supported + EXPECT_TRUE(dev.get_supported_metrics().bits.pcie); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify PCIe link width was collected + EXPECT_EQ(collected_metrics.pcie.link.width, 16U); +} + +/** + * PCIe Link Speed Collection + * + * Objective: Verify PCIe link speed is populated when supported. 
+ */ +TEST_F(DeviceTest, pcie_link_speed_collection) +{ + // Setup: Mock returns specific PCIe link speed value + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + metrics.pcie_link_speed = 16000; // 16 GT/s (Gen4) + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify PCIe is marked as supported + EXPECT_TRUE(dev.get_supported_metrics().bits.pcie); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify PCIe link speed was collected + EXPECT_EQ(collected_metrics.pcie.link.speed, 16000U); +} + +/** + * PCIe Bandwidth Accumulator Collection + * + * Objective: Verify bandwidth accumulator is populated when supported. 
+ */ +TEST_F(DeviceTest, pcie_bandwidth_accumulator_collection) +{ + // Setup: Mock returns specific PCIe bandwidth accumulator value + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + metrics.pcie_bandwidth_acc = 500000000; // 500MB accumulated + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify PCIe is marked as supported + EXPECT_TRUE(dev.get_supported_metrics().bits.pcie); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify PCIe bandwidth accumulator was collected + EXPECT_EQ(collected_metrics.pcie.bandwidth.acc, 500000000U); +} + +/** + * PCIe Bandwidth Instantaneous Collection + * + * Objective: Verify instantaneous bandwidth is populated when supported. 
+ */ +TEST_F(DeviceTest, pcie_bandwidth_instantaneous_collection) +{ + // Setup: Mock returns specific PCIe instantaneous bandwidth value + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + metrics.pcie_bandwidth_inst = 10000000; // 10 MB/s + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify PCIe is marked as supported + EXPECT_TRUE(dev.get_supported_metrics().bits.pcie); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify PCIe instantaneous bandwidth was collected + EXPECT_EQ(collected_metrics.pcie.bandwidth.inst, 10000000U); +} + +/** + * PCIe Sentinel Value Handling + * + * Objective: Verify sentinel values are zeroed out properly. 
+ */ +TEST_F(DeviceTest, pcie_sentinel_value_handling) +{ + // Setup: Mix of valid and sentinel PCIe values + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + // Set valid link width and bandwidth acc, but sentinel link speed and bandwidth inst + metrics.pcie_link_width = 16; + metrics.pcie_bandwidth_acc = 500000000; + // pcie_link_speed and pcie_bandwidth_inst remain sentinel values + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify PCIe is marked as supported (at least one metric is valid) + EXPECT_TRUE(dev.get_supported_metrics().bits.pcie); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify valid values are collected and sentinels are zeroed + EXPECT_EQ(collected_metrics.pcie.link.width, 16U); + EXPECT_EQ(collected_metrics.pcie.link.speed, 0U); // Sentinel converted to 0 + EXPECT_EQ(collected_metrics.pcie.bandwidth.acc, 500000000U); + EXPECT_EQ(collected_metrics.pcie.bandwidth.inst, 0U); // Sentinel converted to 0 +} + +/** + * PCIe Not Collected When Unsupported + * + * Objective: Verify early return when PCIe metrics are not supported. 
+ */ +TEST_F(DeviceTest, pcie_not_collected_when_unsupported) +{ + // Setup: All PCIe metrics are sentinel values (unsupported) + SetupNoMetricsSupported(); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify PCIe is NOT marked as supported + EXPECT_FALSE(dev.get_supported_metrics().bits.pcie); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify all PCIe metrics remain default-initialized (zeros) + EXPECT_EQ(collected_metrics.pcie.link.width, 0U); + EXPECT_EQ(collected_metrics.pcie.link.speed, 0U); + EXPECT_EQ(collected_metrics.pcie.bandwidth.acc, 0U); + EXPECT_EQ(collected_metrics.pcie.bandwidth.inst, 0U); +} + +// ============================================================================ +// Category 9: Supported Metrics Detection Tests +// ============================================================================ + +/** + * All Metrics Supported Detection + * + * Objective: Verify all supported bits are set when all metrics are valid. 
+ */ +TEST_F(DeviceTest, all_metrics_supported_detection) +{ + // Setup: All metrics have valid values (non-sentinel) + SetupAllMetricsSupported(); + + // Create device (this triggers initialize_supported_metrics()) + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify all metric support bits are set + auto supported = dev.get_supported_metrics(); + EXPECT_TRUE(supported.bits.current_socket_power); + EXPECT_TRUE(supported.bits.average_socket_power); + EXPECT_TRUE(supported.bits.memory_usage); + EXPECT_TRUE(supported.bits.hotspot_temperature); + EXPECT_TRUE(supported.bits.edge_temperature); + EXPECT_TRUE(supported.bits.gfx_activity); + EXPECT_TRUE(supported.bits.umc_activity); + EXPECT_TRUE(supported.bits.mm_activity); + // CreateValidMetrics sets per-XCP VCN/JPEG busy, so vcn_busy/jpeg_busy should be set + // Device-level vcn_activity/jpeg_activity should NOT be set when per-XCP is available + EXPECT_TRUE(supported.bits.vcn_busy); + EXPECT_TRUE(supported.bits.jpeg_busy); + EXPECT_FALSE(supported.bits.vcn_activity); + EXPECT_FALSE(supported.bits.jpeg_activity); + EXPECT_TRUE(supported.bits.xgmi); + EXPECT_TRUE(supported.bits.pcie); +} + +/** + * VCN Activity Support Detection - Any XCP + * + * Objective: Verify VCN marked supported if any XCP has valid values. 
+ */ +TEST_F(DeviceTest, vcn_activity_support_detection_any_xcp) +{ + // Setup: Only XCP 7 has valid VCN values, all others are sentinels + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + // Set valid VCN value only in XCP 7 + metrics.xcp_stats[7].vcn_busy[0] = 50; // Valid value + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify VCN busy (per-XCP) is marked as supported + EXPECT_TRUE(dev.get_supported_metrics().bits.vcn_busy); + // Device-level vcn_activity should NOT be set when per-XCP is available + EXPECT_FALSE(dev.get_supported_metrics().bits.vcn_activity); +} + +/** + * VCN Activity Unsupported - All Sentinels + * + * Objective: Verify VCN not supported when all XCPs have sentinel values. + */ +TEST_F(DeviceTest, vcn_activity_unsupported_all_sentinels) +{ + // Setup: All VCN values in all XCPs are sentinels + SetupNoMetricsSupported(); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify VCN activity is NOT supported + EXPECT_FALSE(dev.get_supported_metrics().bits.vcn_activity); +} + +/** + * JPEG Activity Support Detection - Any XCP + * + * Objective: Verify JPEG marked supported if any XCP has valid values. 
+ */ +TEST_F(DeviceTest, jpeg_activity_support_detection_any_xcp) +{ + // Setup: Only XCP 5 has valid JPEG values, all others are sentinels + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + // Set valid JPEG value only in XCP 5 + metrics.xcp_stats[5].jpeg_busy[0] = 75; // Valid value + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify JPEG busy (per-XCP) is marked as supported + EXPECT_TRUE(dev.get_supported_metrics().bits.jpeg_busy); + // Device-level jpeg_activity should NOT be set when per-XCP is available + EXPECT_FALSE(dev.get_supported_metrics().bits.jpeg_activity); +} + +/** + * XGMI Support Detection - Link Width Only + * + * Objective: Verify XGMI supported if only link width is valid. 
+ */ +TEST_F(DeviceTest, xgmi_support_detection_link_width_only) +{ + // Setup: Only XGMI link width is valid, everything else is sentinel + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + metrics.xgmi_link_width = 16; // Valid link width + // xgmi_link_speed and all xgmi_read_data_acc remain sentinel + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify XGMI is marked as supported (OR logic) + EXPECT_TRUE(dev.get_supported_metrics().bits.xgmi); +} + +/** + * XGMI Support Detection - Any Read Data Valid + * + * Objective: Verify XGMI supported if any read data is valid. 
+ */ +TEST_F(DeviceTest, xgmi_support_detection_any_read_data_valid) +{ + // Setup: Only one XGMI read data value is valid + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + metrics.xgmi_read_data_acc[2] = 1000; // Valid read data at index 2 + // link width, link speed, and all other read data remain sentinel + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify XGMI is marked as supported (std::any_of logic) + EXPECT_TRUE(dev.get_supported_metrics().bits.xgmi); +} + +/** + * PCIe Support Detection - Bandwidth Only + * + * Objective: Verify PCIe supported if only bandwidth accumulator is valid. 
+ */ +TEST_F(DeviceTest, pcie_support_detection_bandwidth_only) +{ + // Setup: Only PCIe bandwidth accumulator is valid + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + metrics.pcie_bandwidth_acc = 1000000; // Valid bandwidth accumulator + // pcie_link_width, pcie_link_speed, pcie_bandwidth_inst remain sentinel + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify PCIe is marked as supported (OR logic) + EXPECT_TRUE(dev.get_supported_metrics().bits.pcie); +} + +/** + * Memory Usage Support Detection + * + * Objective: Verify memory usage support based on API success with valid value. 
+ */ +TEST_F(DeviceTest, memory_usage_support_detection) +{ + // Setup: Memory API returns success with valid value + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t valid_mem_usage = 4096000000; // 4GB + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(valid_mem_usage), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify memory usage is marked as supported + EXPECT_TRUE(dev.get_supported_metrics().bits.memory_usage); +} + +/** + * Memory Usage Unsupported - API Failure + * + * Objective: Verify memory not supported when API fails. + */ +TEST_F(DeviceTest, memory_usage_unsupported_api_failure) +{ + // Setup: Memory API returns failure + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly(Return(AMDSMI_STATUS_NOT_SUPPORTED)); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify memory usage is NOT marked as supported + EXPECT_FALSE(dev.get_supported_metrics().bits.memory_usage); +} + +/** + * Memory Usage Unsupported - Sentinel Value + * + * Objective: Verify memory not supported when value is sentinel. 
+ */ +TEST_F(DeviceTest, memory_usage_unsupported_sentinel_value) +{ + // Setup: Memory API returns success but with sentinel value + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; // Sentinel value + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Verify memory usage is NOT marked as supported + EXPECT_FALSE(dev.get_supported_metrics().bits.memory_usage); +} + +// ============================================================================ +// Category 10: VCN Activity Dual Source Tests +// ============================================================================ + +/** + * VCN Activity in Top-Level Field Only + * + * Objective: Verify VCN activity is detected when present in top-level vcn_activity[] + * field but NOT in xcp_stats[].vcn_busy[] arrays. + * + * Note: This tests a gap in the current implementation - the device class currently + * only checks xcp_stats[].vcn_busy[] but should also check the top-level vcn_activity[]. + */ +TEST_F(DeviceTest, vcn_activity_top_level_field_only) +{ + // Setup: VCN activity present in top-level field, XCP stats have sentinels + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + // Set valid VCN activity in top-level field + // Note: This field exists in amdsmi_gpu_metrics_t but is not currently checked! 
+ // metrics.vcn_activity[0] = 75; // 75% VCN utilization + // metrics.vcn_activity[1] = 50; // 50% VCN utilization + + // All XCP VCN busy values remain sentinel (0xFFFF) + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // EXPECTED BEHAVIOR: VCN activity should be marked as supported + // CURRENT BEHAVIOR: Will NOT be supported because implementation only checks XCP + // stats This test documents the gap and will fail until implementation is fixed + + // When implementation is fixed, uncomment: + // EXPECT_TRUE(dev.get_supported_metrics().bits.vcn_activity); + + // Current behavior (documents the bug): + EXPECT_FALSE(dev.get_supported_metrics().bits.vcn_activity) + << "BUG: Implementation does not check top-level vcn_activity[] field"; +} + +/** + * VCN Activity in Both Top-Level and XCP Fields + * + * Objective: Verify VCN activity when present in BOTH vcn_activity[] and + * xcp_stats[].vcn_busy[]. 
+ */ +TEST_F(DeviceTest, vcn_activity_in_both_fields) +{ + // Setup: VCN activity in both top-level and XCP fields + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + // Set valid VCN activity in XCP stats (currently checked) + metrics.xcp_stats[0].vcn_busy[0] = 80; // 80% in XCP 0, VCN 0 + + // Also set in top-level field (not currently checked) + // metrics.vcn_activity[0] = 75; // Different value in top-level field + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Per-XCP vcn_busy should be supported (XCP stats are valid) + EXPECT_TRUE(dev.get_supported_metrics().bits.vcn_busy); + // Device-level vcn_activity should NOT be set when per-XCP is available + EXPECT_FALSE(dev.get_supported_metrics().bits.vcn_activity); + + // Collect metrics + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify XCP stats were collected + EXPECT_EQ(collected_metrics.xcp_stats[0].vcn_busy[0], 80U); +} + +/** + * VCN Activity Detection with Top-Level Field Support + * + * Objective: Document expected behavior when both VCN sources are checked. 
+ * + * This test describes how the initialize_supported_metrics() should work: + * - Check top-level vcn_activity[] array (currently missing) + * - Check xcp_stats[].vcn_busy[] arrays (currently implemented) + * - Mark vcn_activity as supported if EITHER source has valid data + */ +TEST_F(DeviceTest, vcn_activity_detection_should_check_both_sources) +{ + // Setup: Only top-level vcn_activity has valid data + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + // Top-level has valid data (not checked by current implementation) + // metrics.vcn_activity[0] = 60; + + // XCP stats have sentinels (checked by current implementation) + // All xcp_stats[].vcn_busy[] remain 0xFFFF + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // EXPECTED (when fixed): vcn_activity should be supported + // CURRENT: Will be false because top-level field is not checked + EXPECT_FALSE(dev.get_supported_metrics().bits.vcn_activity) + << "Implementation gap: initialize_supported_metrics() should check both " + "vcn_activity[] AND xcp_stats[].vcn_busy[]"; +} + +/** + * VCN Activity Collection Priority + * + * Objective: Document which VCN source should take priority when collecting. + * + * When both sources are available, the implementation should decide: + * - Use top-level vcn_activity[] for overall VCN utilization? + * - Use xcp_stats[].vcn_busy[] for per-partition granularity? + * - Collect from both? 
+ */ +TEST_F(DeviceTest, vcn_activity_collection_priority) +{ + // Setup: Different values in both sources + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + // XCP stats (per-partition detail) + metrics.xcp_stats[0].vcn_busy[0] = 80; + metrics.xcp_stats[0].vcn_busy[1] = 70; + + // Top-level (overall average?) + // metrics.vcn_activity[0] = 75; // Average of 80 and 70? + // metrics.vcn_activity[1] = 0; // Or different semantic meaning? + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + device dev(mock_driver, test_handle, test_processor_type, test_index); + + auto collected = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Current implementation collects from XCP stats only + EXPECT_EQ(collected.xcp_stats[0].vcn_busy[0], 80U); + EXPECT_EQ(collected.xcp_stats[0].vcn_busy[1], 70U); + + // Future enhancement: Also collect top-level vcn_activity[]? + // This would require extending the metrics structure to include both fields +} + +/** + * VCN Activity XCP Stats Empty But Top-Level Valid + * + * Objective: Test scenario where hardware reports VCN activity at top-level + * but XCP partitioning is disabled or not reporting VCN stats. 
+ */ +TEST_F(DeviceTest, vcn_activity_xcp_disabled_top_level_valid) +{ + // Setup: XCP stats all sentinels (XCP disabled or not supported) + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + // Top-level VCN activity still valid + // metrics.vcn_activity[0] = 65; // VCN 0 at 65% + // metrics.vcn_activity[1] = 55; // VCN 1 at 55% + + // All XCP stats remain sentinel + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // CURRENT: VCN not supported (implementation only checks XCP stats) + EXPECT_FALSE(dev.get_supported_metrics().bits.vcn_activity); + + // EXPECTED (when fixed): Should be supported via top-level field + // This represents real hardware scenario where XCP partitioning is disabled + // but VCN engines are still active and reporting utilization +} + +// ============================================================================ +// Category 11: Error Handling and Edge Cases +// ============================================================================ + +/** + * get_metrics_info() Failure + * + * Objective: Verify graceful handling when metrics info unavailable. 
+ */ +TEST_F(DeviceTest, get_metrics_info_failure) +{ + // Setup: get_metrics_info() returns failure + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(Return(AMDSMI_STATUS_NOT_SUPPORTED)); + + uint64_t valid_mem_usage = 4096000000; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(valid_mem_usage), Return(AMDSMI_STATUS_SUCCESS))); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Call get_gpu_metrics() - should not throw + auto metrics = dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify returns default-initialized metrics (all zeros) + EXPECT_EQ(metrics.current_socket_power, 0U); + EXPECT_EQ(metrics.average_socket_power, 0U); + EXPECT_EQ(metrics.hotspot_temperature, 0); + EXPECT_EQ(metrics.edge_temperature, 0); + EXPECT_EQ(metrics.gfx_activity, 0U); +} + +/** + * get_metrics_info() Failure During Initialization + * + * Objective: Verify initialization handles metrics info failure. 
+ */ +TEST_F(DeviceTest, get_metrics_info_failure_during_init) +{ + // Setup: get_metrics_info() returns failure during construction + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(Return(AMDSMI_STATUS_NOT_SUPPORTED)); + + uint64_t valid_mem_usage = 4096000000; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(valid_mem_usage), Return(AMDSMI_STATUS_SUCCESS))); + + // Create device - should not crash + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // is_supported() should reflect whether ANY metric was supported (memory in this + // case) + EXPECT_TRUE(dev.is_supported()); + + // Verify memory is supported but GPU metrics are not + auto supported = dev.get_supported_metrics(); + EXPECT_TRUE(supported.bits.memory_usage); + EXPECT_FALSE(supported.bits.current_socket_power); +} + +/** + * Multiple Metric Collections + * + * Objective: Verify device can collect metrics multiple times. + */ +TEST_F(DeviceTest, multiple_metric_collections) +{ + // Setup: Mock returns varying values across collections + SetupAllMetricsSupported(); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Collect metrics 10 times in a row + for(int i = 0; i < 10; ++i) + { + auto metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + // Each collection should succeed + EXPECT_GT(metrics.current_socket_power, 0U); + } +} + +/** + * Large Array Indices - XGMI + * + * Objective: Verify no buffer overflow with maximum XGMI links. 
+ */ +TEST_F(DeviceTest, large_array_indices_xgmi) +{ + // Setup: Set all AMDSMI_MAX_NUM_XGMI_LINKS entries + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + for(size_t i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) + { + metrics.xgmi_read_data_acc[i] = 1000 + i; + metrics.xgmi_write_data_acc[i] = 2000 + i; + } + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Collect metrics - should not crash or cause buffer overflow + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify all links were processed correctly + for(size_t i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) + { + EXPECT_EQ(collected_metrics.xgmi.data_acc.read[i], 1000 + i); + EXPECT_EQ(collected_metrics.xgmi.data_acc.write[i], 2000 + i); + } +} + +/** + * Large Array Indices - XCP + * + * Objective: Verify no buffer overflow with maximum XCPs. 
+ */ +TEST_F(DeviceTest, large_array_indices_xcp) +{ + // Setup: Set all AMDSMI_MAX_NUM_XCP entries + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + for(size_t xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp) + { + for(size_t vcn = 0; vcn < AMDSMI_MAX_NUM_VCN; ++vcn) + { + metrics.xcp_stats[xcp].vcn_busy[vcn] = static_cast(xcp * 10 + vcn); + } + } + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Collect metrics - should not crash + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify all XCP stats were processed correctly + for(size_t xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp) + { + for(size_t vcn = 0; vcn < AMDSMI_MAX_NUM_VCN; ++vcn) + { + EXPECT_EQ(collected_metrics.xcp_stats[xcp].vcn_busy[vcn], + static_cast(xcp * 10 + vcn)); + } + } +} + +/** + * Large Array Indices - JPEG Engines + * + * Objective: Verify no buffer overflow with maximum JPEG engines. 
+ */ +TEST_F(DeviceTest, large_array_indices_jpeg) +{ + // Setup: Set all ROCPROFSYS_AMDSMI_JPEG_ENGINE_COUNT entries + amdsmi_gpu_metrics_t metrics = CreateSentinelMetrics(); + + for(size_t xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp) + { + for(size_t jpeg = 0; jpeg < ROCPROFSYS_AMDSMI_JPEG_ENGINE_COUNT; ++jpeg) + { + metrics.xcp_stats[xcp].jpeg_busy[jpeg] = + static_cast(xcp * 100 + jpeg); + } + } + + EXPECT_CALL(*mock_driver, get_metrics_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(DoAll(SetArgPointee<1>(metrics), Return(AMDSMI_STATUS_SUCCESS))); + + uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL; + EXPECT_CALL(*mock_driver, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS))); + + SetupSDMAExpectations(mock_driver, test_handle); + + // Create device + device dev(mock_driver, test_handle, test_processor_type, test_index); + + // Collect metrics - should not crash + auto collected_metrics = + dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL); + + // Verify all JPEG engines were processed correctly + for(size_t xcp = 0; xcp < AMDSMI_MAX_NUM_XCP; ++xcp) + { + for(size_t jpeg = 0; jpeg < ROCPROFSYS_AMDSMI_JPEG_ENGINE_COUNT; ++jpeg) + { + EXPECT_EQ(collected_metrics.xcp_stats[xcp].jpeg_busy[jpeg], + static_cast(xcp * 100 + jpeg)); + } + } +} + +/** + * Concurrent Device Objects + * + * Objective: Verify multiple device objects don't interfere. 
+ */
+TEST_F(DeviceTest, concurrent_device_objects)
+{
+    // Setup: Create mocks for two different devices.
+    // Each device gets its own mock driver and its own opaque handle value, so a
+    // mix-up between the two objects would surface as a wrong power reading.
+    // NOTE(review): the template arguments of make_shared / reinterpret_cast are
+    // not present in this text - confirm the mock type and the handle cast target.
+    auto mock_driver1 = std::make_shared();
+    auto mock_driver2 = std::make_shared();
+
+    amdsmi_processor_handle handle1 = reinterpret_cast(0x1111);
+    amdsmi_processor_handle handle2 = reinterpret_cast(0x2222);
+
+    // Device 1 returns power = 100W
+    amdsmi_gpu_metrics_t metrics1 = CreateSentinelMetrics();
+    metrics1.current_socket_power = 100;
+
+    EXPECT_CALL(*mock_driver1, get_metrics_info(handle1, _))
+        .Times(AtLeast(1))
+        .WillRepeatedly(DoAll(SetArgPointee<1>(metrics1), Return(AMDSMI_STATUS_SUCCESS)));
+
+    // The same sentinel memory value is shared by both mock drivers below.
+    uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL;
+    EXPECT_CALL(*mock_driver1, get_memory_usage(handle1, AMDSMI_MEM_TYPE_VRAM, _))
+        .Times(AtLeast(1))
+        .WillRepeatedly(
+            DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS)));
+
+    SetupSDMAExpectations(mock_driver1, handle1);
+
+    // ASIC info may or may not be queried; any number of calls is accepted.
+    EXPECT_CALL(*mock_driver1, get_gpu_asic_info(handle1, _))
+        .Times(AnyNumber())
+        .WillRepeatedly(Return(AMDSMI_STATUS_SUCCESS));
+
+    // Device 2 returns power = 200W
+    amdsmi_gpu_metrics_t metrics2 = CreateSentinelMetrics();
+    metrics2.current_socket_power = 200;
+
+    EXPECT_CALL(*mock_driver2, get_metrics_info(handle2, _))
+        .Times(AtLeast(1))
+        .WillRepeatedly(DoAll(SetArgPointee<1>(metrics2), Return(AMDSMI_STATUS_SUCCESS)));
+
+    EXPECT_CALL(*mock_driver2, get_memory_usage(handle2, AMDSMI_MEM_TYPE_VRAM, _))
+        .Times(AtLeast(1))
+        .WillRepeatedly(
+            DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS)));
+
+    SetupSDMAExpectations(mock_driver2, handle2);
+
+    // ASIC info may or may not be queried; any number of calls is accepted.
+    EXPECT_CALL(*mock_driver2, get_gpu_asic_info(handle2, _))
+        .Times(AnyNumber())
+        .WillRepeatedly(Return(AMDSMI_STATUS_SUCCESS));
+
+    // Create two device objects (indices 0 and 1)
+    device dev1(mock_driver1, handle1, test_processor_type, 0);
+    device dev2(mock_driver2, handle2, test_processor_type, 1);
+
+    // Note: despite the test name, the collections below run one after another
+    // in this function; no threads are started here.
+
+    // Collect from device 1
+    auto result1 =
+        dev1.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL);
+    EXPECT_EQ(result1.current_socket_power, 100U);
+
+    // Collect from device 2
+    auto result2 =
+        dev2.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL);
+    EXPECT_EQ(result2.current_socket_power, 200U);
+
+    // Collect from device 1 again - should still return 100W
+    result1 = dev1.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL);
+    EXPECT_EQ(result1.current_socket_power, 100U);
+
+    // Verify devices maintain independent state (here: distinct indices)
+    EXPECT_NE(dev1.get_index(), dev2.get_index());
+}
+
+/**
+ * Device with Index 0
+ *
+ * Objective: Verify device index 0 works (boundary value).
+ */
+TEST_F(DeviceTest, device_with_index_zero)
+{
+    // Setup: let the fixture helper configure the mock
+    SetupAllMetricsSupported();
+
+    // Create device with index 0 (literal instead of the fixture's test_index)
+    device dev(mock_driver, test_handle, test_processor_type, 0);
+
+    // Verify index is correctly stored
+    EXPECT_EQ(dev.get_index(), 0U);
+}
+
+/**
+ * Device with High Index
+ *
+ * Objective: Verify device with high index works (multi-GPU scenario).
+ */
+TEST_F(DeviceTest, device_with_high_index)
+{
+    // Setup: let the fixture helper configure the mock
+    SetupAllMetricsSupported();
+
+    // Create device with high index (simulating 16-GPU system)
+    device dev(mock_driver, test_handle, test_processor_type, 15);
+
+    // Verify index is correctly stored
+    EXPECT_EQ(dev.get_index(), 15U);
+}
+
+// ============================================================================
+// Category 12: Integration Tests
+// ============================================================================
+
+/**
+ * Full Lifecycle with Real-ish Data
+ *
+ * Objective: Simulate realistic GPU monitoring session with evolving metrics.
+ */
+TEST_F(DeviceTest, full_lifecycle_with_realistic_data)
+{
+    // Setup: Mock will return different values across collections.
+    // A test-local mock is used instead of the fixture's mock_driver.
+    // NOTE(review): the template argument of make_shared is not present in this
+    // text - confirm the mock type.
+    auto mock = std::make_shared();
+
+    // Initialization metrics (used during device construction)
+    amdsmi_gpu_metrics_t init_metrics  = CreateSentinelMetrics();
+    init_metrics.current_socket_power  = 150;  // 150W
+    init_metrics.temperature_hotspot   = 70;   // 70°C
+    init_metrics.average_gfx_activity  = 50;   // 50% activity
+
+    // Collection 1: Idle GPU (same values as the initialization metrics)
+    amdsmi_gpu_metrics_t metrics1  = CreateSentinelMetrics();
+    metrics1.current_socket_power  = 150;  // 150W
+    metrics1.temperature_hotspot   = 70;   // 70°C
+    metrics1.average_gfx_activity  = 50;   // 50% activity
+
+    // Collection 2: Heavy workload
+    amdsmi_gpu_metrics_t metrics2  = CreateSentinelMetrics();
+    metrics2.current_socket_power  = 180;  // 180W
+    metrics2.temperature_hotspot   = 75;   // 75°C
+    metrics2.average_gfx_activity  = 90;   // 90% activity
+
+    // Collection 3: Returning to moderate
+    amdsmi_gpu_metrics_t metrics3  = CreateSentinelMetrics();
+    metrics3.current_socket_power  = 160;  // 160W
+    metrics3.temperature_hotspot   = 73;   // 73°C
+    metrics3.average_gfx_activity  = 60;   // 60% activity
+
+    // Setup mock to return different values on each call
+    // First call is during device construction (initialize_supported_metrics)
+    // Subsequent calls are from get_gpu_metrics()
+    // With four WillOnce clauses and no Times(), gMock expects exactly four calls,
+    // so this test also pins one get_metrics_info() call per step.
+    EXPECT_CALL(*mock, get_metrics_info(test_handle, _))
+        .WillOnce(DoAll(SetArgPointee<1>(init_metrics), Return(AMDSMI_STATUS_SUCCESS)))
+        .WillOnce(DoAll(SetArgPointee<1>(metrics1), Return(AMDSMI_STATUS_SUCCESS)))
+        .WillOnce(DoAll(SetArgPointee<1>(metrics2), Return(AMDSMI_STATUS_SUCCESS)))
+        .WillOnce(DoAll(SetArgPointee<1>(metrics3), Return(AMDSMI_STATUS_SUCCESS)));
+
+    uint64_t sentinel_mem = 0xFFFFFFFFFFFFFFFFULL;
+    EXPECT_CALL(*mock, get_memory_usage(test_handle, AMDSMI_MEM_TYPE_VRAM, _))
+        .Times(AtLeast(1))
+        .WillRepeatedly(
+            DoAll(SetArgPointee<2>(sentinel_mem), Return(AMDSMI_STATUS_SUCCESS)));
+
+    SetupSDMAExpectations(mock, test_handle);
+
+    // ASIC info may or may not be queried; any number of calls is accepted.
+    EXPECT_CALL(*mock, get_gpu_asic_info(test_handle, _))
+        .Times(AnyNumber())
+        .WillRepeatedly(Return(AMDSMI_STATUS_SUCCESS));
+
+    // Construct device
+    device dev(mock, test_handle, test_processor_type, test_index);
+
+    // Collection 1: Idle
+    auto result1 = dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL);
+    EXPECT_EQ(result1.current_socket_power, 150U);
+    EXPECT_EQ(result1.hotspot_temperature, 70);
+    EXPECT_EQ(result1.gfx_activity, 50U);
+
+    // Collection 2: Heavy
+    auto result2 = dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL);
+    EXPECT_EQ(result2.current_socket_power, 180U);
+    EXPECT_EQ(result2.hotspot_temperature, 75);
+    EXPECT_EQ(result2.gfx_activity, 90U);
+
+    // Collection 3: Moderate
+    auto result3 = dev.get_gpu_metrics(enabled_metrics{ .value = 0xFFFF }, 1000000000ULL);
+    EXPECT_EQ(result3.current_socket_power, 160U);
+    EXPECT_EQ(result3.hotspot_temperature, 73);
+    EXPECT_EQ(result3.gfx_activity, 60U);
+}
+
+/**
+ * TC12.1: SDMA Delta Computation
+ *
+ * Objective: Verify SDMA usage percentage is computed correctly from deltas.
+ *
+ * NOTE: This test is only compiled when AMD_SMI_SDMA_SUPPORTED is defined
+ * and equal to 1.
+ */
+#if defined(AMD_SMI_SDMA_SUPPORTED) && AMD_SMI_SDMA_SUPPORTED == 1
+TEST_F(DeviceTest, sdma_delta_computation)
+{
+    // Setup: Mock SDMA process data
+    SetupAllMetricsSupported();
+
+    // Expect calls to get_gpu_process_list:
+    // 1. During device construction (initialize_supported_metrics)
+    // 2. First get_gpu_metrics() call
+    // 3. Second get_gpu_metrics() call
+    // The nullptr variant only reports the number of processes (1).
+    EXPECT_CALL(*mock_driver, get_gpu_process_list(test_handle, _, nullptr))
+        .Times(AtLeast(3))
+        .WillRepeatedly(DoAll(SetArgPointee<1>(1), Return(AMDSMI_STATUS_SUCCESS)));
+
+    // The non-null variant fills the process array; it is expected exactly twice
+    // and returns a growing cumulative sdma_usage counter.
+    EXPECT_CALL(*mock_driver, get_gpu_process_list(test_handle, _, ::testing::NotNull()))
+        .Times(2)
+        .WillOnce(
+            [](amdsmi_processor_handle, uint32_t* num_items, amdsmi_proc_info_t* procs) {
+                *num_items          = 1;
+                procs[0].sdma_usage = 5000000;  // First sample: 5s cumulative
+                return AMDSMI_STATUS_SUCCESS;
+            })
+        .WillOnce(
+            [](amdsmi_processor_handle, uint32_t* num_items, amdsmi_proc_info_t* procs) {
+                *num_items          = 1;
+                procs[0].sdma_usage = 15000000;  // Second sample: 15s cumulative
+                return AMDSMI_STATUS_SUCCESS;
+            });
+
+    // Create device
+    device dev(mock_driver, test_handle, test_processor_type, test_index);
+    // ASSERT_* aborts the test early if the precondition does not hold.
+    ASSERT_TRUE(dev.is_supported());
+    ASSERT_TRUE(dev.get_supported_metrics().bits.sdma_usage);
+
+    // Enable only the SDMA metric for the collections below.
+    enabled_metrics enabled;
+    enabled.bits.sdma_usage = 1;
+
+    // First sample - no previous data, should return 0
+    auto metrics1 = dev.get_gpu_metrics(enabled, 1000000000ULL);  // t = 1s
+    EXPECT_EQ(metrics1.sdma_usage, 0U);
+
+    // Second sample - compute delta
+    // Delta usage = 15000000 - 5000000 = 10,000,000 microseconds
+    // Delta time = 2000000000 - 1000000000 = 1,000,000,000 nanoseconds
+    // Percentage = (10,000,000 * 100,000) / 1,000,000,000 = 1000%
+    // Clamped to 100%
+    // NOTE(review): sdma_usage is declared as uint32_t in types.hpp, so the
+    // EXPECT_GE against 0U below can never fail; only the upper bound is checked.
+    // The arithmetic above suggests asserting exactly 100 - confirm against the
+    // device implementation before tightening.
+    auto metrics2 = dev.get_gpu_metrics(enabled, 2000000000ULL);  // t = 2s
+    EXPECT_GE(metrics2.sdma_usage, 0U);
+    EXPECT_LE(metrics2.sdma_usage, 100U);
+}
+#endif
+
+}  // namespace rocprofsys::pmc::collectors::gpu::testing
diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/gpu/types.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/gpu/types.hpp
new file mode 100644
index 00000000000..363b1682ecc
--- /dev/null
+++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/gpu/types.hpp
@@ -0,0 +1,144
@@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "library/pmc/common/types.hpp" + +#include +#include + +#include + +namespace rocprofsys +{ +namespace pmc +{ +namespace collectors +{ +namespace gpu +{ + +// Sentinel value used by AMD SMI to indicate unsupported/unavailable 64-bit metrics +constexpr uint64_t METRIC_VALUE_NOT_SUPPORTED_64 = 0xffffffffffffffff; + +/** + * @brief Bitfield union for selecting which AMD SMI metrics to collect. + * + * Bit positions (for value access): + * - current_socket_power = 0 + * - average_socket_power = 1 + * - memory_usage = 2 + * - hotspot_temperature = 3 + * - edge_temperature = 4 + * - gfx_activity = 5 + * - umc_activity = 6 + * - mm_activity = 7 + * - vcn_activity = 8 (Device-level, Radeon GPUs) + * - jpeg_activity = 9 (Device-level, Radeon GPUs) + * - vcn_busy = 10 (Per-XCP, MI300 series) + * - jpeg_busy = 11 (Per-XCP, MI300 series) + * - xgmi = 12 + * - pcie = 13 + * - sdma_usage = 14 + */ +union enabled_metrics +{ + struct + { + uint32_t current_socket_power : 1; + uint32_t average_socket_power : 1; + uint32_t memory_usage : 1; + uint32_t hotspot_temperature : 1; + uint32_t edge_temperature : 1; + uint32_t gfx_activity : 1; + uint32_t umc_activity : 1; + uint32_t mm_activity : 1; + uint32_t vcn_activity : 1; // Device-level VCN activity + uint32_t jpeg_activity : 1; // Device-level JPEG activity + uint32_t vcn_busy : 1; // Per-XCP VCN busy + uint32_t jpeg_busy : 1; // Per-XCP JPEG busy + uint32_t xgmi : 1; + uint32_t pcie : 1; + uint32_t sdma_usage : 1; + } bits; + uint32_t value = 0; +}; + +// Get the actual JPEG engine count from the AMD SMI structure at compile time. +// This ensures compatibility across ROCm versions where the jpeg_busy array size +// may differ (32 in ROCm 6.x vs 40 in ROCm 7.x). 
+constexpr size_t ROCPROFSYS_AMDSMI_JPEG_ENGINE_COUNT = + sizeof(amdsmi_gpu_xcp_metrics_t::jpeg_busy) / sizeof(uint16_t); + +#ifndef AMDSMI_MAX_NUM_VCN +# define AMDSMI_MAX_NUM_VCN 4 +#endif + +#ifndef AMDSMI_MAX_NUM_JPEG +# define AMDSMI_MAX_NUM_JPEG 32 +#endif + +#ifndef AMDSMI_MAX_NUM_XCP +# define AMDSMI_MAX_NUM_XCP 8 +#endif + +struct metrics +{ + struct xcp_metrics + { + std::array jpeg_busy = {}; + std::array vcn_busy = {}; + }; + + uint32_t current_socket_power = 0; + uint32_t average_socket_power = 0; + uint64_t memory_usage = 0; + int64_t hotspot_temperature = 0; + int64_t edge_temperature = 0; + uint32_t gfx_activity = 0; + uint32_t umc_activity = 0; + uint32_t mm_activity = 0; + std::array xcp_stats; + + // Device-level VCN/JPEG activity (Radeon GPUs) + std::array vcn_activity = {}; + std::array jpeg_activity = {}; + + struct + { + struct + { + uint16_t width = 0; + uint16_t speed = 0; + } link; + + struct + { + std::array read = {}; + std::array write = {}; + } data_acc; + } xgmi; + + struct + { + struct + { + uint16_t width = 0; + uint16_t speed = 0; + } link; + + struct + { + uint64_t acc = 0; + uint64_t inst = 0; + } bandwidth; + } pcie; + + uint32_t sdma_usage = 0; // SDMA utilization percentage (0-100) +}; + +} // namespace gpu +} // namespace collectors +} // namespace pmc +} // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/CMakeLists.txt new file mode 100644 index 00000000000..3d207680801 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/CMakeLists.txt @@ -0,0 +1,19 @@ +# Copyright (c) Advanced Micro Devices, Inc. 
+# SPDX-License-Identifier: MIT + +# Add tests subdirectory +add_subdirectory(tests) + +# PMC NIC Collector Sources +set(pmc_nic_sources + ${CMAKE_CURRENT_LIST_DIR}/types.hpp + ${CMAKE_CURRENT_LIST_DIR}/sample.hpp + ${CMAKE_CURRENT_LIST_DIR}/device.hpp + ${CMAKE_CURRENT_LIST_DIR}/collector.hpp + ${CMAKE_CURRENT_LIST_DIR}/nic_traits.hpp + ${CMAKE_CURRENT_LIST_DIR}/perfetto_policy.hpp + ${CMAKE_CURRENT_LIST_DIR}/cache_policy.hpp +) + +# Add to parent variable (will be used by collectors CMakeLists.txt) +set(pmc_nic_sources ${pmc_nic_sources} PARENT_SCOPE) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/cache_policy.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/cache_policy.hpp new file mode 100644 index 00000000000..cf0b0216875 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/cache_policy.hpp @@ -0,0 +1,148 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "core/config.hpp" +#include "core/trace_cache/cache_manager.hpp" +#include "core/trace_cache/metadata_registry.hpp" +#include "library/pmc/collectors/nic/sample.hpp" +#include "library/pmc/collectors/nic/types.hpp" +#include "logger/debug.hpp" + +#include +#include +#include +#include + +namespace rocprofsys::pmc::collectors::nic +{ + +/** + * @brief Output policy for writing NIC RDMA samples to the trace cache. + * + * This policy handles serialization of NIC RDMA metric samples into the + * rocprofiler-systems trace cache for later analysis and visualization. + * + * @see perfetto_policy for direct Perfetto trace output + */ +struct cache_policy +{ + /** + * @brief Initialize trace cache category metadata for NIC RDMA metrics. + */ + static void initialize_category_metadata() + { + trace_cache::get_metadata_registry().add_string("ainic"); + } + + /** + * @brief Initialize NIC track metadata. 
+ */ + static void initialize_tracks_metadata() + { + const auto thread_id = std::nullopt; + + trace_cache::get_metadata_registry().add_track( + { "ainic_rx_rdma_ucast_bytes", thread_id, "{}" }); + trace_cache::get_metadata_registry().add_track( + { "ainic_tx_rdma_ucast_bytes", thread_id, "{}" }); + trace_cache::get_metadata_registry().add_track( + { "ainic_rx_rdma_ucast_pkts", thread_id, "{}" }); + trace_cache::get_metadata_registry().add_track( + { "ainic_tx_rdma_ucast_pkts", thread_id, "{}" }); + trace_cache::get_metadata_registry().add_track( + { "ainic_rx_rdma_cnp_pkts", thread_id, "{}" }); + trace_cache::get_metadata_registry().add_track( + { "ainic_tx_rdma_cnp_pkts", thread_id, "{}" }); + } + + /** + * @brief Initialize per-device PMC metadata for NIC RDMA metrics. + * + * @param nic_id NIC device identifier + * @param nic_name NIC device name (e.g., "enp226s0") + */ + static void initialize_pmc_metadata(size_t nic_id, + [[maybe_unused]] const std::string& nic_name) + { + constexpr size_t EVENT_CODE = 0; + constexpr size_t INSTANCE_ID = 0; + constexpr const char* LONG_DESCRIPTION = ""; + constexpr const char* COMPONENT = ""; + constexpr const char* BLOCK = ""; + constexpr const char* EXPRESSION = ""; + constexpr const char* TARGET_ARCH = ""; + + trace_cache::get_metadata_registry().add_pmc_info( + { agent_type::NIC, nic_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, + trait::name::value, + "NIC RX UCast PKTS", + trait::name::description, + LONG_DESCRIPTION, COMPONENT, "packets", rocprofsys::trace_cache::ABSOLUTE, + BLOCK, EXPRESSION, 0, 0 }); + + trace_cache::get_metadata_registry().add_pmc_info( + { agent_type::NIC, nic_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, + trait::name::value, + "NIC TX UCast PKTS", + trait::name::description, + LONG_DESCRIPTION, COMPONENT, "packets", rocprofsys::trace_cache::ABSOLUTE, + BLOCK, EXPRESSION, 0, 0 }); + + trace_cache::get_metadata_registry().add_pmc_info( + { agent_type::NIC, nic_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, + 
trait::name::value, "NIC RX CNP PKTS", + trait::name::description, + LONG_DESCRIPTION, COMPONENT, "packets", rocprofsys::trace_cache::ABSOLUTE, + BLOCK, EXPRESSION, 0, 0 }); + + trace_cache::get_metadata_registry().add_pmc_info( + { agent_type::NIC, nic_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, + trait::name::value, "NIC TX CNP PKTS", + trait::name::description, + LONG_DESCRIPTION, COMPONENT, "packets", rocprofsys::trace_cache::ABSOLUTE, + BLOCK, EXPRESSION, 0, 0 }); + + trace_cache::get_metadata_registry().add_pmc_info( + { agent_type::NIC, nic_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, + trait::name::value, + "NIC RX UCast Bytes", + trait::name::description, + LONG_DESCRIPTION, COMPONENT, "bytes", rocprofsys::trace_cache::ABSOLUTE, + BLOCK, EXPRESSION, 0, 0 }); + + trace_cache::get_metadata_registry().add_pmc_info( + { agent_type::NIC, nic_id, TARGET_ARCH, EVENT_CODE, INSTANCE_ID, + trait::name::value, + "NIC TX UCast Bytes", + trait::name::description, + LONG_DESCRIPTION, COMPONENT, "bytes", rocprofsys::trace_cache::ABSOLUTE, + BLOCK, EXPRESSION, 0, 0 }); + } + + /** + * @brief Store a NIC sample to the trace cache. 
+ * + * @param device_id NIC device identifier + * @param device_name NIC device name + * @param enabled_metrics_cfg Metrics enabled by configuration + * @param supported_metrics Metrics supported by this device + * @param metric_values Collected metric values + * @param timestamp Sample timestamp in nanoseconds + */ + static void store_sample(size_t device_id, const std::string& device_name, + const enabled_metrics& enabled_metrics_cfg, + const enabled_metrics& supported_metrics, + const metrics& metric_values, uint64_t timestamp) + { + enabled_metrics _enabled_metrics; + _enabled_metrics.value = enabled_metrics_cfg.value & supported_metrics.value; + + trace_cache::get_buffer_storage().store(trace_cache::ainic_pmc_sample{ + _enabled_metrics, static_cast(device_id), device_name, timestamp, + metric_values }); + } +}; + +} // namespace rocprofsys::pmc::collectors::nic diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/collector.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/collector.hpp new file mode 100644 index 00000000000..6f3f997aaaa --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/collector.hpp @@ -0,0 +1,24 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "library/pmc/collectors/base/collector.hpp" +#include "library/pmc/collectors/nic/nic_traits.hpp" + +namespace rocprofsys::pmc::collectors::nic +{ + +/** + * @brief NIC RDMA metrics collector for performance monitoring. + * + * This collector specializes the base::collector template for NIC devices + * using AMD SMI. All NIC-specific behavior is defined in nic_traits. 
+ * + * @tparam DeviceProvider Type providing NIC device enumeration and management + * @tparam Config Configuration policy providing settings and output policies + */ +template +using collector = base::collector, DeviceProvider, Config>; + +} // namespace rocprofsys::pmc::collectors::nic diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/device.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/device.hpp new file mode 100644 index 00000000000..192a4c47e44 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/device.hpp @@ -0,0 +1,224 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "library/pmc/collectors/nic/types.hpp" +#include "logger/debug.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace rocprofsys::pmc::collectors::nic +{ + +/** + * @brief NIC device wrapper for collecting RDMA statistics. + * + * Wraps an AMD SMI NIC processor handle and provides methods to + * query RDMA port statistics including bytes, packets, and CNP metrics. + * + * @tparam Driver The AMD SMI driver type (allows mock injection for testing) + */ +template +class device +{ +public: + /** + * @brief Construct a NIC device wrapper. 
+ * + * @param driver Shared pointer to the driver instance + * @param handle AMD SMI processor handle for this NIC + * @param processor_type Type of processor (should be AMD_NIC) + * @param logical_index Device index for identification + */ + device(std::shared_ptr driver, amdsmi_processor_handle handle, + processor_type_t /*processor_type*/, size_t logical_index) + : m_driver_api{ std::move(driver) } + , m_device_handle{ handle } + , m_index{ logical_index } + { + m_is_supported = initialize_device_info(); + } + + [[nodiscard]] bool is_supported() const noexcept { return m_is_supported; } + + [[nodiscard]] enabled_metrics get_supported_metrics() const noexcept + { + return m_supported_metrics; + } + + [[nodiscard]] size_t get_index() const noexcept { return m_index; } + + [[nodiscard]] const std::string& get_name() const noexcept { return m_device_name; } + + [[nodiscard]] const std::string& get_product_name() const noexcept + { + return m_product_name; + } + + [[nodiscard]] const std::string& get_vendor_name() const noexcept + { + return m_vendor_name; + } + + /** + * @brief Collect current NIC RDMA metrics. + * + * Queries the first RDMA port for statistics and extracts the + * 6 RDMA metrics: rx/tx bytes, rx/tx packets, and rx/tx CNP packets. 
+ * + * @return Collected metrics (zeros if query fails) + */ + [[nodiscard]] metrics get_nic_metrics() const + { + metrics nic_metrics{}; + + if(m_rdma_port_count == 0) + { + return nic_metrics; + } + + // Query statistics for the first RDMA port + uint32_t num_stats = 0; + if(m_driver_api->get_nic_rdma_port_statistics(m_device_handle, 0, &num_stats, + nullptr) != AMDSMI_STATUS_SUCCESS) + { + return nic_metrics; + } + + if(num_stats == 0) + { + return nic_metrics; + } + + std::vector stats(num_stats); + if(m_driver_api->get_nic_rdma_port_statistics( + m_device_handle, 0, &num_stats, stats.data()) != AMDSMI_STATUS_SUCCESS) + { + return nic_metrics; + } + + static const std::unordered_map + METRIC_MAP = { { "rx_rdma_ucast_bytes", &metrics::rx_rdma_ucast_bytes }, + { "tx_rdma_ucast_bytes", &metrics::tx_rdma_ucast_bytes }, + { "rx_rdma_ucast_pkts", &metrics::rx_rdma_ucast_pkts }, + { "tx_rdma_ucast_pkts", &metrics::tx_rdma_ucast_pkts }, + { "rx_rdma_cnp_pkts", &metrics::rx_rdma_cnp_pkts }, + { "tx_rdma_cnp_pkts", &metrics::tx_rdma_cnp_pkts } }; + + for(const auto& stat : stats) + { + auto it = METRIC_MAP.find(std::string_view(stat.name)); + if(it != METRIC_MAP.end()) + { + nic_metrics.*(it->second) = stat.value; + } + } + + return nic_metrics; + } + +private: + /** + * @brief Initialize device info and determine supported metrics. + * + * Queries port and RDMA device information to determine what + * statistics are available from this NIC. 
+ * + * @return true if the device supports at least some metrics + */ + bool initialize_device_info() + { + // Get ASIC info for vendor and product names + amdsmi_nic_asic_info_t asic_info{}; + if(m_driver_api->get_nic_asic_info(m_device_handle, &asic_info) == + AMDSMI_STATUS_SUCCESS) + { + m_product_name = asic_info.product_name; + m_vendor_name = asic_info.vendor_name; + } + + // Get port info to determine the device name + amdsmi_nic_port_info_t port_info{}; + if(m_driver_api->get_nic_port_info(m_device_handle, &port_info) == + AMDSMI_STATUS_SUCCESS) + { + if(port_info.num_ports > 0) + { + m_device_name = port_info.ports[0].netdev; + } + } + + // Get RDMA device info + amdsmi_nic_rdma_devices_info_t rdma_info{}; + if(m_driver_api->get_nic_rdma_dev_info(m_device_handle, &rdma_info) != + AMDSMI_STATUS_SUCCESS) + { + LOG_DEBUG("NIC device [{}] does not support RDMA queries", m_index); + return false; + } + + if(rdma_info.num_rdma_dev == 0) + { + LOG_DEBUG("NIC device [{}] has no RDMA devices", m_index); + return false; + } + + // Use the first RDMA device's first port + m_rdma_port_count = rdma_info.rdma_dev_info[0].num_rdma_ports; + if(m_rdma_port_count == 0) + { + LOG_DEBUG("NIC device [{}] has no RDMA ports", m_index); + return false; + } + + // Try to get statistics to verify support + uint32_t num_stats = 0; + if(m_driver_api->get_nic_rdma_port_statistics(m_device_handle, 0, &num_stats, + nullptr) != AMDSMI_STATUS_SUCCESS) + { + LOG_DEBUG("NIC device [{}] failed to query statistics count", m_index); + return false; + } + + if(num_stats == 0) + { + LOG_DEBUG("NIC device [{}] has no RDMA statistics available", m_index); + return false; + } + + // All 6 metrics are assumed supported if we can query stats + m_supported_metrics.bits.rx_rdma_ucast_bytes = 1; + m_supported_metrics.bits.tx_rdma_ucast_bytes = 1; + m_supported_metrics.bits.rx_rdma_ucast_pkts = 1; + m_supported_metrics.bits.tx_rdma_ucast_pkts = 1; + m_supported_metrics.bits.rx_rdma_cnp_pkts = 1; + 
m_supported_metrics.bits.tx_rdma_cnp_pkts = 1; + + LOG_DEBUG("NIC device [{}] ({}) initialized with {} RDMA port(s)", m_index, + m_device_name, m_rdma_port_count); + + return m_supported_metrics.value != 0; + } + + std::shared_ptr m_driver_api; + amdsmi_processor_handle m_device_handle; + enabled_metrics m_supported_metrics; + size_t m_index; + std::string m_device_name; + std::string m_product_name; + std::string m_vendor_name; + uint8_t m_rdma_port_count = 0; + bool m_is_supported = false; +}; + +} // namespace rocprofsys::pmc::collectors::nic diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/nic_traits.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/nic_traits.hpp new file mode 100644 index 00000000000..57ebd22e4f0 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/nic_traits.hpp @@ -0,0 +1,173 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "core/agent_manager.hpp" +#include "library/pmc/collectors/nic/device.hpp" +#include "library/pmc/collectors/nic/types.hpp" +#include "library/pmc/common/types.hpp" +#include "logger/debug.hpp" + +#include +#include +#include + +namespace rocprofsys::pmc::collectors::nic +{ + +using ::rocprofsys::pmc::device_selection_mode; +using ::rocprofsys::pmc::device_type; +using ::rocprofsys::pmc::nic_device_filter; + +/** + * @brief Traits type for NIC collector configuration. + * + * Defines types, constants, and customization points for the base collector template + * to work with NIC devices via AMD SMI. 
+ * + * @note This traits class bridges the NIC-specific requirements to the base::collector: + * - Name-based device filtering (vs GPU's index-based filtering) + * - Device context storage for NIC-specific API signatures (device_name, product_name) + * - Agent registration during device enumeration + * + * @tparam DriverProvider Provider type exposing the AMD SMI driver type (real or mock for testing) as driver_t + */ +template +struct nic_traits +{ + using metrics_t = pmc::collectors::nic::metrics; + using enabled_metrics_t = pmc::collectors::nic::enabled_metrics; + using device_t = device; + using device_ptr_t = std::shared_ptr; + using container_t = std::vector; + using driver_t = typename DriverProvider::driver_t; + + static constexpr const char* device_name = "NIC"; + struct device_entry + { + device_ptr_t device; + enabled_metrics_t supported_metrics; + }; + + template + [[nodiscard]] static nic_device_filter get_device_filter() + { + return Settings::get_nic_device_filter(); + } + + template + [[nodiscard]] static enabled_metrics_t get_enabled_metrics() + { + return Settings::get_nic_enabled_metrics(); + } + + template + static void init_pmc_metadata(const device_ptr_t& device) + { + Cache::initialize_pmc_metadata(device->get_index(), device->get_product_name()); + } + + template + static void init_perfetto_storage(const DeviceEntries& device_entries) + { + container_t devices; + devices.reserve(device_entries.size()); + for(const auto& entry : device_entries) + { + devices.push_back(entry.device); + } + Perfetto::init_storage(devices); + } + + template + static void setup_counter_tracks(const device_ptr_t& device, + const enabled_metrics_t& enabled) + { + Perfetto::setup_counter_tracks(device->get_index(), device->get_name(), enabled); + } + + template + static void post_process_perfetto(const DeviceEntries& device_entries, + const enabled_metrics_t& enabled) + { + container_t devices; + devices.reserve(device_entries.size()); + for(const auto& entry : device_entries) + { + 
devices.push_back(entry.device); + } + Perfetto::post_process(devices, enabled); + } + + [[nodiscard]] static metrics_t get_metrics(const device_ptr_t& device, + const enabled_metrics_t& /*enabled*/, + uint64_t /*timestamp*/) + { + return device->get_nic_metrics(); + } + + template + [[nodiscard]] static std::vector enumerate_devices( + std::shared_ptr provider) + { + std::vector entries; + auto filter = get_device_filter(); + + if(filter.mode == device_selection_mode::NONE) + { + LOG_DEBUG("{} sampling disabled via configuration", device_name); + return entries; + } + + auto devices = provider->template get_devices(device_type::NIC); + + for(auto& device : devices) + { + if(!device->is_supported()) continue; + + bool should_include = false; + switch(filter.mode) + { + case device_selection_mode::ALL: should_include = true; break; + case device_selection_mode::NONE: should_include = false; break; + case device_selection_mode::SPECIFIC: + should_include = filter.names.count(device->get_name()) > 0; + break; + } + + if(should_include) + { + auto supported = device->get_supported_metrics(); + entries.push_back(device_entry{ std::move(device), supported }); + } + } + + register_nic_agents(entries); + + return entries; + } + + static void register_nic_agents(const std::vector& entries) + { + size_t nic_index = 0; + for(const auto& entry : entries) + { + agent cur_agent{ agent_type::NIC, + 0, + nic_index, + static_cast(nic_index), + static_cast(nic_index), + static_cast(nic_index), + entry.device->get_product_name().c_str(), + entry.device->get_vendor_name().c_str(), + "AI NIC", + "AI NIC" }; + + get_agent_manager_instance().insert_agent(cur_agent); + nic_index++; + } + } +}; + +} // namespace rocprofsys::pmc::collectors::nic diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/perfetto_policy.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/perfetto_policy.hpp new file mode 100644 index 
00000000000..0c52a33c58d --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/perfetto_policy.hpp @@ -0,0 +1,334 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "core/perfetto.hpp" +#include "library/pmc/collectors/nic/types.hpp" +#include "library/thread_info.hpp" +#include "logger/debug.hpp" + +#include +#include + +#include +#include +#include +#include + +namespace rocprofsys::pmc::collectors::nic +{ + +inline namespace perfetto_policy_detail // named (not anonymous): these types are used by inline static members of perfetto_policy and must be the same entity in every translation unit +{ + +struct nic_track_description +{ + const char* track_name; + const char* units; + size_t track_index = 0; +}; + +// Helper function to create enabled_metrics value from bit positions +// See enabled_metrics definition in pmc/collectors/nic/types.hpp for bit position +// documentation +inline constexpr uint32_t +make_nic_metric_value(std::initializer_list bit_positions) +{ + uint32_t value = 0; + for(auto bit : bit_positions) + { + value |= (1u << bit); + } + return value; +} + +inline constexpr auto RX_RDMA_UCAST_BYTES_VALUE = make_nic_metric_value({ 0 }); +inline constexpr auto TX_RDMA_UCAST_BYTES_VALUE = make_nic_metric_value({ 1 }); +inline constexpr auto RX_RDMA_UCAST_PKTS_VALUE = make_nic_metric_value({ 2 }); +inline constexpr auto TX_RDMA_UCAST_PKTS_VALUE = make_nic_metric_value({ 3 }); +inline constexpr auto RX_RDMA_CNP_PKTS_VALUE = make_nic_metric_value({ 4 }); +inline constexpr auto TX_RDMA_CNP_PKTS_VALUE = make_nic_metric_value({ 5 }); + +struct nic_perfetto_sample +{ + uint64_t timestamp; // same width as the uint64_t timestamp handed to store_sample() + metrics metric_values; +}; + +} // namespace perfetto_policy_detail + +/** + * @brief Output policy for writing NIC RDMA samples directly to Perfetto traces. + * + * This policy handles real-time serialization of NIC RDMA metric samples into + * Perfetto trace format, creating counter tracks for each metric type.
+ * + * @see cache_policy for writing to trace cache instead + */ +struct perfetto_policy +{ + using counter_track = perfetto_counter_track; + + // Static storage for Perfetto tracks and sample buffering (C++17 inline static) + static inline std::map> tracks{}; + static inline std::map>> + bundle{}; + + /** + * @brief Initialize Perfetto storage for the given NIC devices. + * + * Allocates storage buffers for Perfetto samples for each NIC device. + * + * @tparam DeviceVector Container type holding NIC device handles + * @param devices Vector of NIC devices to initialize storage for + */ + template + static void init_storage(const DeviceVector& devices) + { + for(const auto& device : devices) + { + perfetto_policy::bundle.insert( + { device->get_index(), + std::make_unique>() }); + } + } + + /** + * @brief Set up Perfetto counter tracks for the specified NIC device metrics. + * + * Creates named counter tracks in the Perfetto trace for each enabled metric. + * + * @param device_index NIC device index + * @param device_name NIC device name (e.g., "enp226s0") + * @param enabled_metric_config Bitfield of metrics to create tracks for + */ + static void setup_counter_tracks(size_t device_index, const std::string& device_name, + const enabled_metrics& enabled_metric_config) + { + auto addendum = [&](const char* metric_name) { + return fmt::format("NIC {} {} [{}] (S)", device_name, metric_name, + device_index); + }; + + auto& tracks = perfetto_policy::tracks[device_index]; + + if(enabled_metric_config.bits.rx_rdma_ucast_bytes) + { + tracks[RX_RDMA_UCAST_BYTES_VALUE] = { + "RX RDMA Bytes", "bytes", + counter_track::emplace(device_index, addendum("RX RDMA Bytes"), "bytes") + }; + } + + if(enabled_metric_config.bits.tx_rdma_ucast_bytes) + { + tracks[TX_RDMA_UCAST_BYTES_VALUE] = { + "TX RDMA Bytes", "bytes", + counter_track::emplace(device_index, addendum("TX RDMA Bytes"), "bytes") + }; + } + + if(enabled_metric_config.bits.rx_rdma_ucast_pkts) + { + 
tracks[RX_RDMA_UCAST_PKTS_VALUE] = { + "RX RDMA Pkts", "packets", + counter_track::emplace(device_index, addendum("RX RDMA Pkts"), "packets") + }; + } + + if(enabled_metric_config.bits.tx_rdma_ucast_pkts) + { + tracks[TX_RDMA_UCAST_PKTS_VALUE] = { + "TX RDMA Pkts", "packets", + counter_track::emplace(device_index, addendum("TX RDMA Pkts"), "packets") + }; + } + + if(enabled_metric_config.bits.rx_rdma_cnp_pkts) + { + tracks[RX_RDMA_CNP_PKTS_VALUE] = { + "RX CNP Pkts", "packets", + counter_track::emplace(device_index, addendum("RX CNP Pkts"), "packets") + }; + } + + if(enabled_metric_config.bits.tx_rdma_cnp_pkts) + { + tracks[TX_RDMA_CNP_PKTS_VALUE] = { + "TX CNP Pkts", "packets", + counter_track::emplace(device_index, addendum("TX CNP Pkts"), "packets") + }; + } + } + + /** + * @brief Store a NIC sample for later Perfetto serialization. + * + * Buffers the metric sample for batch processing during post_process(). + * + * @param device_index NIC device index + * @param metric_values Collected metric values + * @param timestamp Sample timestamp in nanoseconds + */ + static void store_sample(size_t device_index, const metrics& metric_values, + uint64_t timestamp) + { + auto it = perfetto_policy::bundle.find(device_index); + if(it != perfetto_policy::bundle.end()) + { + it->second->emplace_back(nic_perfetto_sample{ timestamp, metric_values }); + } + } + + /** + * @brief Post-process buffered samples and write to Perfetto trace. + * + * Serializes all buffered NIC samples to Perfetto counter tracks. + * This is called at the end of profiling to flush all samples. 
+ * + * @tparam DeviceVector Container type holding NIC device handles + * @param devices Vector of NIC devices + * @param enabled_metrics Metrics that were enabled during collection + */ + template + static void post_process( + const DeviceVector& devices, + ::rocprofsys::pmc::collectors::nic::enabled_metrics enabled_metrics) + { + for(const auto& device : devices) + { + post_process_device(device->get_index(), enabled_metrics, + device->get_supported_metrics()); + } + } + + static void post_process_device( + size_t device_index, + ::rocprofsys::pmc::collectors::nic::enabled_metrics enabled_metrics, + ::rocprofsys::pmc::collectors::nic::enabled_metrics supported_metrics) + { + auto bundle_it = perfetto_policy::bundle.find(device_index); + if(bundle_it == perfetto_policy::bundle.end() || !bundle_it->second) + { + return; + } + + auto& samples = *bundle_it->second; + + const auto& thread_info = thread_info::get(0, InternalTID); + if(!thread_info) + { + return; + } + + ::rocprofsys::pmc::collectors::nic::enabled_metrics effective_metrics = { + .value = + static_cast(enabled_metrics.value & supported_metrics.value) + }; + + if(effective_metrics.value == 0) + { + return; + } + + auto tracks_it = perfetto_policy::tracks.find(device_index); + if(tracks_it == perfetto_policy::tracks.end()) + { + return; + } + + auto& tracks = tracks_it->second; + + for(const auto& sample : samples) + { + const auto ts = sample.timestamp; + + if(!thread_info->is_valid_time(ts)) + { + LOG_WARNING("Invalid timestamp {} for NIC sample", ts); + continue; + } + + // RX RDMA unicast bytes + if(effective_metrics.bits.rx_rdma_ucast_bytes) + { + auto it = tracks.find(RX_RDMA_UCAST_BYTES_VALUE); + if(it != tracks.end()) + { + TRACE_COUNTER( + "nic_rx_ucast_bytes", + counter_track::at(device_index, it->second.track_index), ts, + static_cast(sample.metric_values.rx_rdma_ucast_bytes)); + } + } + + // TX RDMA unicast bytes + if(effective_metrics.bits.tx_rdma_ucast_bytes) + { + auto it = 
tracks.find(TX_RDMA_UCAST_BYTES_VALUE); + if(it != tracks.end()) + { + TRACE_COUNTER( + "nic_tx_ucast_bytes", + counter_track::at(device_index, it->second.track_index), ts, + static_cast(sample.metric_values.tx_rdma_ucast_bytes)); + } + } + + // RX RDMA unicast packets + if(effective_metrics.bits.rx_rdma_ucast_pkts) + { + auto it = tracks.find(RX_RDMA_UCAST_PKTS_VALUE); + if(it != tracks.end()) + { + TRACE_COUNTER( + "nic_rx_ucast_pkts", + counter_track::at(device_index, it->second.track_index), ts, + static_cast(sample.metric_values.rx_rdma_ucast_pkts)); + } + } + + // TX RDMA unicast packets + if(effective_metrics.bits.tx_rdma_ucast_pkts) + { + auto it = tracks.find(TX_RDMA_UCAST_PKTS_VALUE); + if(it != tracks.end()) + { + TRACE_COUNTER( + "nic_tx_ucast_pkts", + counter_track::at(device_index, it->second.track_index), ts, + static_cast(sample.metric_values.tx_rdma_ucast_pkts)); + } + } + + // RX RDMA CNP packets + if(effective_metrics.bits.rx_rdma_cnp_pkts) + { + auto it = tracks.find(RX_RDMA_CNP_PKTS_VALUE); + if(it != tracks.end()) + { + TRACE_COUNTER( + "nic_rx_cnp_pkts", + counter_track::at(device_index, it->second.track_index), ts, + static_cast(sample.metric_values.rx_rdma_cnp_pkts)); + } + } + + // TX RDMA CNP packets + if(effective_metrics.bits.tx_rdma_cnp_pkts) + { + auto it = tracks.find(TX_RDMA_CNP_PKTS_VALUE); + if(it != tracks.end()) + { + TRACE_COUNTER( + "nic_tx_cnp_pkts", + counter_track::at(device_index, it->second.track_index), ts, + static_cast(sample.metric_values.tx_rdma_cnp_pkts)); + } + } + } + } +}; + +} // namespace rocprofsys::pmc::collectors::nic diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/sample.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/sample.hpp new file mode 100644 index 00000000000..3a000f003ec --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/sample.hpp @@ -0,0 +1,96 @@ +// Copyright (c) 
Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "core/trace_cache/sample_type.hpp" +#include "library/pmc/collectors/nic/types.hpp" + +#include +#include +#include + +namespace rocprofsys::pmc::collectors::nic +{ + +/** + * @brief NIC RDMA sample type for trace cache. + * + * This struct represents a sample of NIC RDMA metrics collected by the PMC. + * It inherits from cacheable_t to support serialization to the trace cache. + */ +struct sample : trace_cache::cacheable_t +{ + static constexpr trace_cache::type_identifier_t type_identifier{ + trace_cache::type_identifier_t::ainic_pmc_sample + }; + + sample() = default; // device_id and timestamp start at 0 via their member initialisers + sample(enabled_metrics _settings, uint32_t _device_id, std::string _device_name, + uint64_t _timestamp, metrics _metric_values) + : enabled_metric(_settings) + , device_id(_device_id) + , device_name(std::move(_device_name)) + , timestamp(_timestamp) + , metric_values(_metric_values) + {} + + enabled_metrics enabled_metric; + uint32_t device_id = 0; + std::string device_name; + uint64_t timestamp = 0; + metrics metric_values; +}; + +} // namespace rocprofsys::pmc::collectors::nic + +namespace rocprofsys +{ +namespace trace_cache +{ + +/// @brief AINIC PMC sample type alias +using ainic_pmc_sample = pmc::collectors::nic::sample; + +template <> +inline void +serialize(uint8_t* buffer, const pmc::collectors::nic::sample& item) +{ + utility::store_value( + buffer, static_cast(item.enabled_metric.value), item.device_id, + std::string_view(item.device_name), item.timestamp, + item.metric_values.rx_rdma_ucast_bytes, item.metric_values.tx_rdma_ucast_bytes, + item.metric_values.rx_rdma_ucast_pkts, item.metric_values.tx_rdma_ucast_pkts, + item.metric_values.rx_rdma_cnp_pkts, item.metric_values.tx_rdma_cnp_pkts); +} + +template <> +inline pmc::collectors::nic::sample +deserialize(uint8_t*& buffer) +{ + pmc::collectors::nic::sample item; + std::string_view device_name_view; + utility::parse_value( + buffer, 
item.enabled_metric.value, item.device_id, device_name_view, + item.timestamp, item.metric_values.rx_rdma_ucast_bytes, + item.metric_values.tx_rdma_ucast_bytes, item.metric_values.rx_rdma_ucast_pkts, + item.metric_values.tx_rdma_ucast_pkts, item.metric_values.rx_rdma_cnp_pkts, + item.metric_values.tx_rdma_cnp_pkts); + item.device_name = std::string(device_name_view); + return item; +} + +template <> +inline size_t +get_size(const pmc::collectors::nic::sample& item) +{ + return utility::get_size( + item.enabled_metric.value, item.device_id, std::string_view(item.device_name), + item.timestamp, item.metric_values.rx_rdma_ucast_bytes, + item.metric_values.tx_rdma_ucast_bytes, item.metric_values.rx_rdma_ucast_pkts, + item.metric_values.tx_rdma_ucast_pkts, item.metric_values.rx_rdma_cnp_pkts, + item.metric_values.tx_rdma_cnp_pkts); +} + +} // namespace trace_cache +} // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/tests/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/tests/CMakeLists.txt new file mode 100644 index 00000000000..943c376e701 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/tests/CMakeLists.txt @@ -0,0 +1,21 @@ +# Copyright (c) Advanced Micro Devices, Inc. 
+# SPDX-License-Identifier: MIT + +add_library(pmc-nic-collector-tests OBJECT test_nic_device.cpp) + +target_link_libraries( + pmc-nic-collector-tests + PUBLIC + rocprofiler-systems-pmc-library + rocprofiler-systems-googletest-library + rocprofiler-systems-logger + rocprofiler-systems::rocprofiler-systems-rocm +) + +target_include_directories( + pmc-nic-collector-tests + PRIVATE + ${PROJECT_SOURCE_DIR}/source/lib + ${PROJECT_SOURCE_DIR}/source/lib/rocprof-sys + ${PROJECT_BINARY_DIR}/source/lib +) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/tests/test_nic_device.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/tests/test_nic_device.cpp new file mode 100644 index 00000000000..313e1c871af --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/tests/test_nic_device.cpp @@ -0,0 +1,267 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#include "library/pmc/collectors/nic/device.hpp" +#include "library/pmc/device_providers/amd_smi/drivers/tests/mock_driver.hpp" + +#include +#include + +#include + +using namespace rocprofsys::pmc::collectors::nic; +using ::testing::_; +using ::testing::AtLeast; +using ::testing::DoAll; +using ::testing::Return; +using ::testing::SetArgPointee; +using ::testing::StrictMock; + +using MockDriver = + ::testing::StrictMock; + +namespace rocprofsys::pmc::collectors::nic::testing +{ + +/** + * @brief Test fixture for NIC device tests. + * + * Provides common setup for device tests including mock driver and + * helper methods for configuring mock behavior. 
+ */ +class NicDeviceTest : public ::testing::Test +{ +protected: + std::shared_ptr mock_driver; + amdsmi_processor_handle test_handle; + processor_type_t test_processor_type; + size_t test_index; + + void SetUp() override + { + mock_driver = std::make_shared(); + test_handle = reinterpret_cast(0x5678); + test_processor_type = AMDSMI_PROCESSOR_TYPE_AMD_NIC; + test_index = 0; + } + + /** + * @brief Configure mock to return NIC with full RDMA support. + */ + void SetupFullRdmaSupport() + { + // Setup ASIC info for vendor and product names + amdsmi_nic_asic_info_t asic_info{}; + std::strncpy(asic_info.product_name, "AMD AINIC Test", + sizeof(asic_info.product_name) - 1); + std::strncpy(asic_info.vendor_name, "AMD", sizeof(asic_info.vendor_name) - 1); + + EXPECT_CALL(*mock_driver, get_nic_asic_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<1>(asic_info), Return(AMDSMI_STATUS_SUCCESS))); + + // Setup port info + amdsmi_nic_port_info_t port_info{}; + port_info.num_ports = 1; + std::strncpy(port_info.ports[0].netdev, "enp226s0", + sizeof(port_info.ports[0].netdev) - 1); + + EXPECT_CALL(*mock_driver, get_nic_port_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<1>(port_info), Return(AMDSMI_STATUS_SUCCESS))); + + // Setup RDMA device info + amdsmi_nic_rdma_devices_info_t rdma_info{}; + rdma_info.num_rdma_dev = 1; + rdma_info.rdma_dev_info[0].num_rdma_ports = 1; + std::strncpy(rdma_info.rdma_dev_info[0].rdma_dev, "rdma0", + sizeof(rdma_info.rdma_dev_info[0].rdma_dev) - 1); + + EXPECT_CALL(*mock_driver, get_nic_rdma_dev_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<1>(rdma_info), Return(AMDSMI_STATUS_SUCCESS))); + + // Setup statistics count query + EXPECT_CALL(*mock_driver, + get_nic_rdma_port_statistics(test_handle, 0, _, nullptr)) + .Times(AtLeast(1)) + .WillRepeatedly([](amdsmi_processor_handle, uint8_t, uint32_t* count, + amdsmi_nic_stat_t*) { + *count = 6; + 
return AMDSMI_STATUS_SUCCESS; + }); + } + + /** + * @brief Configure mock to return full statistics. + */ + void SetupStatisticsData() + { + SetupFullRdmaSupport(); + + EXPECT_CALL(*mock_driver, + get_nic_rdma_port_statistics(test_handle, 0, _, ::testing::NotNull())) + .Times(AtLeast(1)) + .WillRepeatedly([](amdsmi_processor_handle, uint8_t, uint32_t* count, + amdsmi_nic_stat_t* stats) { + *count = 6; + + std::strncpy(stats[0].name, "rx_rdma_ucast_bytes", + sizeof(stats[0].name) - 1); + stats[0].value = 1000000; + + std::strncpy(stats[1].name, "tx_rdma_ucast_bytes", + sizeof(stats[1].name) - 1); + stats[1].value = 2000000; + + std::strncpy(stats[2].name, "rx_rdma_ucast_pkts", + sizeof(stats[2].name) - 1); + stats[2].value = 5000; + + std::strncpy(stats[3].name, "tx_rdma_ucast_pkts", + sizeof(stats[3].name) - 1); + stats[3].value = 6000; + + std::strncpy(stats[4].name, "rx_rdma_cnp_pkts", + sizeof(stats[4].name) - 1); + stats[4].value = 100; + + std::strncpy(stats[5].name, "tx_rdma_cnp_pkts", + sizeof(stats[5].name) - 1); + stats[5].value = 200; + + return AMDSMI_STATUS_SUCCESS; + }); + } + + /** + * @brief Configure mock to return no RDMA support. 
+ */ + void SetupNoRdmaSupport() + { + // Setup ASIC info + amdsmi_nic_asic_info_t asic_info{}; + std::strncpy(asic_info.product_name, "Generic NIC", + sizeof(asic_info.product_name) - 1); + std::strncpy(asic_info.vendor_name, "Unknown", sizeof(asic_info.vendor_name) - 1); + + EXPECT_CALL(*mock_driver, get_nic_asic_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<1>(asic_info), Return(AMDSMI_STATUS_SUCCESS))); + + // Setup port info (NIC exists but no RDMA) + amdsmi_nic_port_info_t port_info{}; + port_info.num_ports = 1; + std::strncpy(port_info.ports[0].netdev, "eth0", + sizeof(port_info.ports[0].netdev) - 1); + + EXPECT_CALL(*mock_driver, get_nic_port_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<1>(port_info), Return(AMDSMI_STATUS_SUCCESS))); + + // RDMA query fails + EXPECT_CALL(*mock_driver, get_nic_rdma_dev_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly(Return(AMDSMI_STATUS_NOT_SUPPORTED)); + } +}; + +TEST_F(NicDeviceTest, DeviceIsSupported_WhenRdmaAvailable) +{ + SetupFullRdmaSupport(); + + device dev(mock_driver, test_handle, test_processor_type, test_index); + + EXPECT_TRUE(dev.is_supported()); + EXPECT_EQ(dev.get_index(), test_index); + EXPECT_EQ(dev.get_name(), "enp226s0"); + EXPECT_EQ(dev.get_product_name(), "AMD AINIC Test"); + EXPECT_EQ(dev.get_vendor_name(), "AMD"); +} + +TEST_F(NicDeviceTest, DeviceIsNotSupported_WhenNoRdma) +{ + SetupNoRdmaSupport(); + + device dev(mock_driver, test_handle, test_processor_type, test_index); + + EXPECT_FALSE(dev.is_supported()); +} + +TEST_F(NicDeviceTest, GetSupportedMetrics_AllEnabled) +{ + SetupFullRdmaSupport(); + + device dev(mock_driver, test_handle, test_processor_type, test_index); + auto supported = dev.get_supported_metrics(); + + EXPECT_TRUE(supported.bits.rx_rdma_ucast_bytes); + EXPECT_TRUE(supported.bits.tx_rdma_ucast_bytes); + EXPECT_TRUE(supported.bits.rx_rdma_ucast_pkts); + 
EXPECT_TRUE(supported.bits.tx_rdma_ucast_pkts); + EXPECT_TRUE(supported.bits.rx_rdma_cnp_pkts); + EXPECT_TRUE(supported.bits.tx_rdma_cnp_pkts); +} + +TEST_F(NicDeviceTest, GetNicMetrics_ReturnsCorrectValues) +{ + SetupStatisticsData(); + + device dev(mock_driver, test_handle, test_processor_type, test_index); + auto m = dev.get_nic_metrics(); + + EXPECT_EQ(m.rx_rdma_ucast_bytes, 1000000ULL); + EXPECT_EQ(m.tx_rdma_ucast_bytes, 2000000ULL); + EXPECT_EQ(m.rx_rdma_ucast_pkts, 5000ULL); + EXPECT_EQ(m.tx_rdma_ucast_pkts, 6000ULL); + EXPECT_EQ(m.rx_rdma_cnp_pkts, 100ULL); + EXPECT_EQ(m.tx_rdma_cnp_pkts, 200ULL); +} + +TEST_F(NicDeviceTest, GetNicMetrics_ReturnsZeros_WhenNoRdmaPorts) +{ + // Setup ASIC info + amdsmi_nic_asic_info_t asic_info{}; + std::strncpy(asic_info.product_name, "Test NIC", sizeof(asic_info.product_name) - 1); + std::strncpy(asic_info.vendor_name, "Test Vendor", sizeof(asic_info.vendor_name) - 1); + + EXPECT_CALL(*mock_driver, get_nic_asic_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<1>(asic_info), Return(AMDSMI_STATUS_SUCCESS))); + + // Setup with RDMA device but no ports + amdsmi_nic_port_info_t port_info{}; + port_info.num_ports = 1; + std::strncpy(port_info.ports[0].netdev, "enp226s0", + sizeof(port_info.ports[0].netdev) - 1); + + EXPECT_CALL(*mock_driver, get_nic_port_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<1>(port_info), Return(AMDSMI_STATUS_SUCCESS))); + + amdsmi_nic_rdma_devices_info_t rdma_info{}; + rdma_info.num_rdma_dev = 1; + rdma_info.rdma_dev_info[0].num_rdma_ports = 0; // No ports + + EXPECT_CALL(*mock_driver, get_nic_rdma_dev_info(test_handle, _)) + .Times(AtLeast(1)) + .WillRepeatedly( + DoAll(SetArgPointee<1>(rdma_info), Return(AMDSMI_STATUS_SUCCESS))); + + device dev(mock_driver, test_handle, test_processor_type, test_index); + + EXPECT_FALSE(dev.is_supported()); + + auto m = dev.get_nic_metrics(); + EXPECT_EQ(m.rx_rdma_ucast_bytes, 0ULL); + 
EXPECT_EQ(m.tx_rdma_ucast_bytes, 0ULL); +} + +} // namespace rocprofsys::pmc::collectors::nic::testing diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/types.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/types.hpp new file mode 100644 index 00000000000..72d3af50a61 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/collectors/nic/types.hpp @@ -0,0 +1,64 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +namespace rocprofsys +{ +namespace pmc +{ +namespace collectors +{ +namespace nic +{ + +/** + * @brief Bitfield union for selecting which NIC RDMA metrics to collect. + * + * Bit positions (for value access): + * - rx_rdma_ucast_bytes = 0 Received unicast bytes + * - tx_rdma_ucast_bytes = 1 Transmitted unicast bytes + * - rx_rdma_ucast_pkts = 2 Received unicast packets + * - tx_rdma_ucast_pkts = 3 Transmitted unicast packets + * - rx_rdma_cnp_pkts = 4 Received CNP (congestion) packets + * - tx_rdma_cnp_pkts = 5 Transmitted CNP packets + */ +union enabled_metrics +{ + struct + { + uint32_t rx_rdma_ucast_bytes : 1; + uint32_t tx_rdma_ucast_bytes : 1; + uint32_t rx_rdma_ucast_pkts : 1; + uint32_t tx_rdma_ucast_pkts : 1; + uint32_t rx_rdma_cnp_pkts : 1; + uint32_t tx_rdma_cnp_pkts : 1; + } bits; + uint32_t value = 0; +}; + +/// All 6 NIC RDMA metrics enabled (bits 0-5) +static constexpr uint32_t ALL_NIC_METRICS = 0x3F; + +/** + * @brief Container for NIC RDMA metrics collected from AMD SMI. + * + * These metrics are collected per-port from the NIC and represent + * cumulative counters for RDMA traffic statistics. 
+ */ +struct metrics +{ + uint64_t rx_rdma_ucast_bytes = 0; // Received unicast bytes + uint64_t tx_rdma_ucast_bytes = 0; // Transmitted unicast bytes + uint64_t rx_rdma_ucast_pkts = 0; // Received unicast packets + uint64_t tx_rdma_ucast_pkts = 0; // Transmitted unicast packets + uint64_t rx_rdma_cnp_pkts = 0; // Received CNP (congestion) packets + uint64_t tx_rdma_cnp_pkts = 0; // Transmitted CNP packets +}; + +} // namespace nic +} // namespace collectors +} // namespace pmc +} // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/common/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/common/CMakeLists.txt new file mode 100644 index 00000000000..b46a7662872 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/common/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + +# PMC Common Types +set(pmc_common_types_sources ${CMAKE_CURRENT_LIST_DIR}/types.hpp) + +# Add to parent variable (will be used by pmc CMakeLists.txt) +set(pmc_common_types_sources ${pmc_common_types_sources} PARENT_SCOPE) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/common/device_slice.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/common/device_slice.hpp new file mode 100644 index 00000000000..26b38bef258 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/common/device_slice.hpp @@ -0,0 +1,138 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "library/pmc/common/types.hpp" + +#include +#include +#include +#include +#include + +namespace rocprofsys::pmc +{ + +/** + * @brief Type-erased device slice - owning wrapper for any device type. + * + * This class provides a lightweight type erasure mechanism for PMC devices. 
+ * It allows storing heterogeneous device types (GPU, NIC) in a single + * container without requiring virtual inheritance or a common base class. + * + * Any type T can be wrapped in a device_slice as long as it provides the + * required interface methods: get_index(), get_name(), get_product_name(), + * get_vendor_name(), is_supported() + * + * Example usage: + * @code + * auto gpu_dev = std::make_shared>(...); + * auto nic_dev = std::make_shared>(...); + * + * std::vector devices; + * devices.emplace_back(gpu_dev, device_type::GPU); + * devices.emplace_back(nic_dev, device_type::NIC); + * + * for (auto& dev : devices) { + * std::cout << dev.get_name() << ": " << dev.get_product_name() << "\n"; + * } + * + * // Type recovery when needed. NOTE(review): this class declares no as<T>() member - confirm + * if (auto gpu = devices[0].as>()) { + * auto metrics = gpu->get_gpu_metrics(enabled, timestamp); + * } + * @endcode + */ +class device_slice +{ +public: + /** + * @brief Construct a device_slice from any device type. + * + * @tparam T Device type (must have get_index, get_name, get_product_name, + * get_vendor_name, is_supported methods) + * @param dev Shared pointer to the device object + * @param dev_type The device type (GPU, NIC) + */ + template + device_slice(std::shared_ptr dev, device_type dev_type) + : m_device{ std::move(dev) } + , m_type{ dev_type } + , m_get_index_impl{ [](void* ptr) -> size_t { + return static_cast(ptr)->get_index(); + } } + , m_get_name_impl{ [](void* ptr) -> const std::string& { + return static_cast(ptr)->get_name(); + } } + , m_get_product_name_impl{ [](void* ptr) -> const std::string& { + return static_cast(ptr)->get_product_name(); + } } + , m_get_vendor_name_impl{ [](void* ptr) -> const std::string& { + return static_cast(ptr)->get_vendor_name(); + } } + , m_is_supported_impl{ [](void* ptr) -> bool { + return static_cast(ptr)->is_supported(); + } } + {} + + /** + * @brief Get the device index. + * @return Device index.
+ */ + [[nodiscard]] size_t get_index() const { return m_get_index_impl(m_device.get()); } + + /** + * @brief Get the device name (e.g., "GPU0", "nic0"). + * @return Const reference to the device name. + */ + [[nodiscard]] const std::string& get_name() const + { + return m_get_name_impl(m_device.get()); + } + + /** + * @brief Get the product name (e.g., "AMD Instinct MI300X"). + * @return Const reference to the product name. + */ + [[nodiscard]] const std::string& get_product_name() const + { + return m_get_product_name_impl(m_device.get()); + } + + /** + * @brief Get the vendor name (e.g., "AMD"). + * @return Const reference to the vendor name. + */ + [[nodiscard]] const std::string& get_vendor_name() const + { + return m_get_vendor_name_impl(m_device.get()); + } + + /** + * @brief Check if the device is supported. + * @return True if the device supports metrics collection. + */ + [[nodiscard]] bool is_supported() const + { + return m_is_supported_impl(m_device.get()); + } + + /** + * @brief Get the device type. + * @return The device type (GPU, NIC). + */ + [[nodiscard]] device_type type() const noexcept { return m_type; } + +private: + std::shared_ptr m_device; /**< Owning pointer to device (type-erased) */ + device_type m_type; /**< Device type (GPU, NIC) */ + + std::function m_get_index_impl; + std::function m_get_name_impl; + std::function m_get_product_name_impl; + std::function m_get_vendor_name_impl; + std::function m_is_supported_impl; +}; + +} // namespace rocprofsys::pmc diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/common/types.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/common/types.hpp new file mode 100644 index 00000000000..ca68d55885c --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/common/types.hpp @@ -0,0 +1,70 @@ +// Copyright (c) Advanced Micro Devices, Inc. 
+// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include + +namespace rocprofsys +{ +namespace pmc +{ + +/** + * @brief Version information for device providers and drivers. + */ +struct version +{ + struct + { + uint32_t major = 0; + uint32_t minor = 0; + uint32_t release = 0; + } numeric_representation; + std::string string_representation; +}; + +/** + * @brief Device type for provider enumeration. + */ +enum class device_type : uint8_t +{ + GPU, ///< GPU device + NIC ///< Network interface device +}; + +/** + * @brief Device selection mode for filtering devices. + */ +enum class device_selection_mode : uint8_t +{ + ALL, ///< Include all devices + NONE, ///< Exclude all devices + SPECIFIC ///< Include only specific devices by index +}; + +/** + * @brief Device filter configuration (index-based, for GPUs). + */ +struct device_filter +{ + device_selection_mode mode = device_selection_mode::ALL; + std::set indices; ///< Device indices when mode is SPECIFIC +}; + +/** + * @brief NIC device filter configuration (name-based). + * + * NICs are filtered by network device name (e.g., "enp226s0", "eth0") + * rather than index, since NIC indices are not as stable or meaningful. + */ +struct nic_device_filter +{ + device_selection_mode mode = device_selection_mode::ALL; + std::set names; ///< Device names when mode is SPECIFIC +}; + +} // namespace pmc +} // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/device_providers/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/device_providers/CMakeLists.txt new file mode 100644 index 00000000000..933093225ac --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/device_providers/CMakeLists.txt @@ -0,0 +1,11 @@ +# Copyright (c) Advanced Micro Devices, Inc. 
+# SPDX-License-Identifier: MIT + +# PMC Device Providers +add_subdirectory(amd_smi) + +# Collect all device provider sources +set(pmc_device_provider_sources ${pmc_amd_smi_provider_sources}) + +# Add to parent variable (will be used by pmc CMakeLists.txt) +set(pmc_device_provider_sources ${pmc_device_provider_sources} PARENT_SCOPE) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/device_providers/amd_smi/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/device_providers/amd_smi/CMakeLists.txt new file mode 100644 index 00000000000..e5abffe414f --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/device_providers/amd_smi/CMakeLists.txt @@ -0,0 +1,14 @@ +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + +# Add subdirectories +add_subdirectory(drivers) + +# AMD SMI Device Provider Sources +set(pmc_amd_smi_provider_sources + ${CMAKE_CURRENT_LIST_DIR}/provider.hpp + ${pmc_amd_smi_driver_sources} +) + +# Add to parent variable (will be used by device_providers CMakeLists.txt) +set(pmc_amd_smi_provider_sources ${pmc_amd_smi_provider_sources} PARENT_SCOPE) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/device_providers/amd_smi/drivers/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/device_providers/amd_smi/drivers/CMakeLists.txt new file mode 100644 index 00000000000..ce52a31c861 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/device_providers/amd_smi/drivers/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright (c) Advanced Micro Devices, Inc. 
+# SPDX-License-Identifier: MIT + +# AMD SMI Driver Sources +set(pmc_amd_smi_driver_sources ${CMAKE_CURRENT_LIST_DIR}/driver.hpp) + +# Add to parent variable (will be used by amd_smi provider CMakeLists.txt) +set(pmc_amd_smi_driver_sources ${pmc_amd_smi_driver_sources} PARENT_SCOPE) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/device_providers/amd_smi/drivers/driver.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/device_providers/amd_smi/drivers/driver.hpp new file mode 100644 index 00000000000..02b5d72150e --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/device_providers/amd_smi/drivers/driver.hpp @@ -0,0 +1,251 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include + +#include + +namespace rocprofsys::pmc::drivers::amd_smi +{ + +/** + * @brief Thin wrapper around AMD SMI C API for dependency injection and testing. + * + * This struct provides static methods that directly forward to the AMD SMI library. + * It serves as an abstraction layer that can be mocked in tests through the + * driver_factory. + */ +struct driver +{ + /** + * @brief Initialize the AMD SMI library. + * @param init_flags Initialization flags (default: AMDSMI_INIT_AMD_GPUS). + * @return AMD SMI status code indicating success or failure. + */ + static amdsmi_status_t init(uint64_t init_flags = AMDSMI_INIT_AMD_GPUS) + { + return amdsmi_init(init_flags); + } + + /** + * @brief Shutdown the AMD SMI library. + * @return AMD SMI status code indicating success or failure. + */ + static amdsmi_status_t shutdown() { return amdsmi_shut_down(); } + + /** + * @brief Get AMD SMI library version information. + * @param version Pointer to structure to receive version information. + * @return AMD SMI status code indicating success or failure. 
+ */ + static amdsmi_status_t get_version(amdsmi_version_t* version) + { + return amdsmi_get_lib_version(version); + } + + /** + * @brief Get all socket handles in the system. + * @param socket_count Pointer to receive the number of sockets (input/output). + * @param socket_handles Pointer to array to receive socket handles (can be nullptr + * for count query). + * @return AMD SMI status code indicating success or failure. + */ + static amdsmi_status_t get_socket_handles(uint32_t* socket_count, + amdsmi_socket_handle* socket_handles) + { + return amdsmi_get_socket_handles(socket_count, socket_handles); + } + + /** + * @brief Get processor handles for a specific socket. + * @param socket_handle Socket to query. + * @param processor_count Pointer to receive the number of processors (input/output). + * @param processor_handles Pointer to array to receive processor handles (can be + * nullptr for count query). + * @return AMD SMI status code indicating success or failure. + * + * @note This function only returns GPU processors. For NICs, use + * get_processor_handles_by_type() with AMDSMI_PROCESSOR_TYPE_AMD_NIC. + */ + static amdsmi_status_t get_processor_handles( + amdsmi_socket_handle socket_handle, uint32_t* processor_count, + amdsmi_processor_handle* processor_handles) + { + return amdsmi_get_processor_handles(socket_handle, processor_count, + processor_handles); + } + +#if defined(ROCPROFSYS_BUILD_AINIC) && ROCPROFSYS_BUILD_AINIC == 1 + /** + * @brief Get processor handles of a specific type for a socket. + * @param socket_handle Socket to query. + * @param processor_type Type of processor to enumerate (GPU, NIC, CPU). + * @param processor_handles Pointer to array to receive processor handles (can be + * nullptr for count query). + * @param processor_count Pointer to receive the number of processors (input/output). + * @return AMD SMI status code indicating success or failure. + * + * @note This is required for enumerating NICs. 
amdsmi_get_processor_handles() + * only returns GPUs. Requires AMD SMI >= 26.3. + */ + static amdsmi_status_t get_processor_handles_by_type( + amdsmi_socket_handle socket_handle, processor_type_t processor_type, + amdsmi_processor_handle* processor_handles, uint32_t* processor_count) + { + return amdsmi_get_processor_handles_by_type(socket_handle, processor_type, + processor_handles, processor_count); + } +#endif + + /** + * @brief Get the type of a processor (GPU, NIC, etc.). + * @param processor_handle Processor to query. + * @param processor_type Pointer to receive the processor type. + * @return AMD SMI status code indicating success or failure. + */ + static amdsmi_status_t get_processor_type(amdsmi_processor_handle processor_handle, + processor_type_t* processor_type) + { + return amdsmi_get_processor_type(processor_handle, processor_type); + } + + /** + * @brief Get GPU memory usage for a specific memory type. + * @param processor_handle GPU processor to query. + * @param type Memory type (e.g., VRAM, GTT). + * @param usage Pointer to receive memory usage in bytes. + * @return AMD SMI status code indicating success or failure. + */ + static amdsmi_status_t get_memory_usage(amdsmi_processor_handle processor_handle, + amdsmi_memory_type_t type, uint64_t* usage) + { + return amdsmi_get_gpu_memory_usage(processor_handle, type, usage); + } + + /** + * @brief Get GPU metrics information (temperature, power, clocks, etc.). + * @param processor_handle GPU processor to query. + * @param metrics Pointer to structure to receive GPU metrics. + * @return AMD SMI status code indicating success or failure. + */ + static amdsmi_status_t get_metrics_info(amdsmi_processor_handle processor_handle, + amdsmi_gpu_metrics_t* metrics) + { + return amdsmi_get_gpu_metrics_info(processor_handle, metrics); + } + + /** + * @brief Get GPU ASIC information including vendor and product names. + * @param processor_handle GPU processor to query. 
+ * @param asic_info Pointer to structure to receive ASIC information. + * @return AMD SMI status code indicating success or failure. + */ + static amdsmi_status_t get_gpu_asic_info(amdsmi_processor_handle processor_handle, + amdsmi_asic_info_t* asic_info) + { + return amdsmi_get_gpu_asic_info(processor_handle, asic_info); + } + + /** + * @brief Get GPU process list with per-process SDMA usage. + * + * Returns the list of processes using the GPU, including cumulative SDMA + * usage in microseconds per process. Used for computing SDMA utilization. + * + * @param processor_handle GPU processor to query. + * @param max_processes Pointer to max process count (input/output). + * @param list Pointer to array to receive process info (can be nullptr for count + * query). + * @return AMD SMI status code indicating success or failure. + * + * @note Requires AMD SMI >= 26.3. Guarded by AMD_SMI_SDMA_SUPPORTED. + */ +#if defined(AMD_SMI_SDMA_SUPPORTED) && AMD_SMI_SDMA_SUPPORTED == 1 + static amdsmi_status_t get_gpu_process_list(amdsmi_processor_handle processor_handle, + uint32_t* max_processes, + amdsmi_proc_info_t* list) + { + return amdsmi_get_gpu_process_list(processor_handle, max_processes, list); + } +#endif + +#if defined(ROCPROFSYS_BUILD_AINIC) && ROCPROFSYS_BUILD_AINIC == 1 + /** + * @brief Get NIC ASIC information including vendor and product names. + * @param processor_handle NIC processor to query. + * @param asic_info Pointer to structure to receive ASIC information. + * @return AMD SMI status code indicating success or failure. + */ + static amdsmi_status_t get_nic_asic_info(amdsmi_processor_handle processor_handle, + amdsmi_nic_asic_info_t* asic_info) + { + return amdsmi_get_nic_asic_info(processor_handle, asic_info); + } + + /** + * @brief Get NIC port information. + * @param processor_handle NIC processor to query. + * @param port_info Pointer to structure to receive port information. + * @return AMD SMI status code indicating success or failure. 
+ */ + static amdsmi_status_t get_nic_port_info(amdsmi_processor_handle processor_handle, + amdsmi_nic_port_info_t* port_info) + { + return amdsmi_get_nic_port_info(processor_handle, port_info); + } + + /** + * @brief Get NIC RDMA device information. + * @param processor_handle NIC processor to query. + * @param rdma_info Pointer to structure to receive RDMA device info. + * @return AMD SMI status code indicating success or failure. + */ + static amdsmi_status_t get_nic_rdma_dev_info( + amdsmi_processor_handle processor_handle, + amdsmi_nic_rdma_devices_info_t* rdma_info) + { + return amdsmi_get_nic_rdma_dev_info(processor_handle, rdma_info); + } + + /** + * @brief Get NIC RDMA port statistics. + * @param processor_handle NIC processor to query. + * @param rdma_port_idx RDMA port index. + * @param num_stats Pointer to number of stats (input/output). + * @param stats Pointer to array to receive statistics (can be nullptr for count). + * @return AMD SMI status code indicating success or failure. + */ + static amdsmi_status_t get_nic_rdma_port_statistics( + amdsmi_processor_handle processor_handle, uint8_t rdma_port_idx, + uint32_t* num_stats, amdsmi_nic_stat_t* stats) + { + return amdsmi_get_nic_rdma_port_statistics(processor_handle, rdma_port_idx, + num_stats, stats); + } +#endif +}; + +/** + * @brief Factory for creating driver instances. + * + * Provides a factory method for creating driver instances. This enables + * dependency injection and allows for substituting mock drivers in tests. + */ +struct driver_factory +{ + using driver_t = driver; + + /** + * @brief Create a new driver instance. + * @return Shared pointer to the driver instance. 
+ */ + static std::shared_ptr create_driver() + { + return std::make_shared(); + } +}; + +} // namespace rocprofsys::pmc::drivers::amd_smi diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/device_providers/amd_smi/drivers/tests/mock_driver.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/device_providers/amd_smi/drivers/tests/mock_driver.hpp new file mode 100644 index 00000000000..a55de4abe77 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/device_providers/amd_smi/drivers/tests/mock_driver.hpp @@ -0,0 +1,135 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +#include + +#include + +namespace rocprofsys::pmc::drivers::amd_smi::testing +{ + +/** + * @brief Mock implementation of AMD SMI driver for unit testing. + * + * This is the unified mock driver used across all PMC tests. It provides a complete + * Google Mock implementation of the driver interface with factory pattern support + * and default behaviors via set_up_defaults(). + * + * Used by both provider-level tests and device collector tests (aliased as MockDriver + * in test_device.cpp for compatibility). 
+ */ +class mock_driver +{ +public: + MOCK_METHOD(amdsmi_status_t, init, ()); + MOCK_METHOD(amdsmi_status_t, init, (uint64_t init_flags)); + MOCK_METHOD(amdsmi_status_t, shutdown, ()); + MOCK_METHOD(amdsmi_status_t, get_version, (amdsmi_version_t * version)); + MOCK_METHOD(amdsmi_status_t, get_socket_handles, + (uint32_t * socket_count, amdsmi_socket_handle* socket_handles)); + MOCK_METHOD(amdsmi_status_t, get_processor_handles, + (amdsmi_socket_handle socket_handle, uint32_t* processor_count, + amdsmi_processor_handle* processor_handles)); +#if defined(ROCPROFSYS_BUILD_AINIC) && ROCPROFSYS_BUILD_AINIC == 1 + MOCK_METHOD(amdsmi_status_t, get_processor_handles_by_type, + (amdsmi_socket_handle socket_handle, processor_type_t processor_type, + amdsmi_processor_handle* processor_handles, uint32_t* processor_count)); +#endif + MOCK_METHOD(amdsmi_status_t, get_processor_type, + (amdsmi_processor_handle processor_handle, + processor_type_t* processor_type)); + MOCK_METHOD(amdsmi_status_t, get_memory_usage, + (amdsmi_processor_handle processor_handle, amdsmi_memory_type_t type, + uint64_t* usage)); + MOCK_METHOD(amdsmi_status_t, get_metrics_info, + (amdsmi_processor_handle processor_handle, + amdsmi_gpu_metrics_t* metrics)); + MOCK_METHOD(amdsmi_status_t, get_gpu_asic_info, + (amdsmi_processor_handle processor_handle, + amdsmi_asic_info_t* asic_info)); + + // SDMA-specific methods (requires AMD SMI >= 26.3) +#if defined(AMD_SMI_SDMA_SUPPORTED) && AMD_SMI_SDMA_SUPPORTED == 1 + MOCK_METHOD(amdsmi_status_t, get_gpu_process_list, + (amdsmi_processor_handle processor_handle, uint32_t* max_processes, + amdsmi_proc_info_t* list)); +#endif + + // NIC-specific methods (requires ROCPROFSYS_BUILD_AINIC) +#if defined(ROCPROFSYS_BUILD_AINIC) && ROCPROFSYS_BUILD_AINIC == 1 + MOCK_METHOD(amdsmi_status_t, get_nic_asic_info, + (amdsmi_processor_handle processor_handle, + amdsmi_nic_asic_info_t* asic_info)); + MOCK_METHOD(amdsmi_status_t, get_nic_port_info, + (amdsmi_processor_handle 
processor_handle, + amdsmi_nic_port_info_t* port_info)); + MOCK_METHOD(amdsmi_status_t, get_nic_rdma_dev_info, + (amdsmi_processor_handle processor_handle, + amdsmi_nic_rdma_devices_info_t* rdma_info)); + MOCK_METHOD(amdsmi_status_t, get_nic_rdma_port_statistics, + (amdsmi_processor_handle processor_handle, uint8_t rdma_port_idx, + uint32_t* num_stats, amdsmi_nic_stat_t* stats)); +#endif + + /** + * @brief Set up default mock behaviors for common operations. + * + * Configures the mock to return AMDSMI_STATUS_SUCCESS for init, shutdown, + * get_memory_usage, and get_metrics_info by default. Tests can override + * these defaults with specific expectations. + */ + void set_up_defaults() + { + using ::testing::_; + using ::testing::DoAll; + using ::testing::Return; + using ::testing::SetArgPointee; + + ON_CALL(*this, init(_)).WillByDefault(Return(AMDSMI_STATUS_SUCCESS)); + ON_CALL(*this, shutdown()).WillByDefault(Return(AMDSMI_STATUS_SUCCESS)); + ON_CALL(*this, get_memory_usage(_, _, _)) + .WillByDefault(Return(AMDSMI_STATUS_SUCCESS)); + ON_CALL(*this, get_metrics_info(_, _)) + .WillByDefault(Return(AMDSMI_STATUS_SUCCESS)); + ON_CALL(*this, get_gpu_asic_info(_, _)) + .WillByDefault(Return(AMDSMI_STATUS_SUCCESS)); + } +}; + +/** + * @brief Factory for creating and injecting mock driver instances in tests. + * + * This factory allows tests to inject a mock_driver instance that will be + * used by the code under test. The mock can be configured with expectations + * and behaviors before being injected via set_mock_driver(). + */ +struct mock_driver_factory +{ + using driver_t = mock_driver; + + static std::shared_ptr s_mock_driver; + + /** + * @brief Create (retrieve) the mock driver instance. + * @return Shared pointer to the currently set mock driver. + */ + static std::shared_ptr create_driver() { return s_mock_driver; } + + /** + * @brief Set the mock driver instance to be used by tests. + * @param driver Mock driver instance to inject. 
+ */ + static void set_mock_driver(std::shared_ptr driver) + { + s_mock_driver = std::move(driver); + } +}; + +/// Global mock driver instance shared across tests +inline std::shared_ptr mock_driver_factory::s_mock_driver = nullptr; + +} // namespace rocprofsys::pmc::drivers::amd_smi::testing diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/device_providers/amd_smi/features.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/device_providers/amd_smi/features.hpp new file mode 100644 index 00000000000..0f9bc415e20 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/device_providers/amd_smi/features.hpp @@ -0,0 +1,31 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#pragma once + +/** + * @file features.hpp + * @brief AMD SMI feature detection for version compatibility. + * + * AINIC (AI NIC) support is controlled by the CMake variable ROCPROFSYS_BUILD_AINIC, + * which is set based on: + * - ROCPROFSYS_USE_AINIC option being ON + * - AMD SMI library version >= 26.3 + */ + +namespace rocprofsys::pmc::device_providers::amd_smi +{ + +/** + * @brief Check if AMD SMI NIC support is available. + * + * NIC support (AINIC) was added in ROCm 7.0 (AMD SMI lib version 26.3+). + * This is controlled by ROCPROFSYS_BUILD_AINIC defined via CMake. 
+ */ +#if defined(ROCPROFSYS_BUILD_AINIC) && ROCPROFSYS_BUILD_AINIC == 1 +constexpr bool has_nic_support = true; +#else +constexpr bool has_nic_support = false; +#endif + +} // namespace rocprofsys::pmc::device_providers::amd_smi diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/device_providers/amd_smi/provider.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/device_providers/amd_smi/provider.hpp new file mode 100644 index 00000000000..21eccc7ad74 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/device_providers/amd_smi/provider.hpp @@ -0,0 +1,330 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "library/pmc/common/types.hpp" +#include "library/pmc/device_providers/amd_smi/drivers/driver.hpp" + +#include +#include +#include +#include +#include +#include + +#include + +namespace rocprofsys::pmc::device_providers::amd_smi +{ + +/** + * @brief AMD SMI device provider for initialization and device enumeration. + * + * This class manages the AMD SMI driver initialization/shutdown and provides + * access to raw device handles. It is designed to be shared by collectors + * (GPU and NIC). Device object creation and filtering is the responsibility + * of the collector. + * + * @tparam DriverFactory Factory for creating AMD SMI driver instances. + */ +template +class provider +{ +private: + /** + * @brief Check AMD SMI status and throw on error. + * @param status AMD SMI status code. + * @param error_message Error message to include in exception. + */ + static void check_amd_smi_status(amdsmi_status_t status, const char* error_message) + { + if(status != AMDSMI_STATUS_SUCCESS) + { + std::stringstream ss; + ss << error_message << " AMD SMI Error code: " << status; + throw std::runtime_error(ss.str()); + } + } + + /** + * @brief Get all socket handles. + * + * Queries the AMD SMI driver for all available socket handles in the system. 
+ * + * @return Vector of socket handles. + * @throws std::runtime_error If querying socket handles fails. + */ + [[nodiscard]] std::vector get_socket_handles() + { + uint32_t count = 0; + check_amd_smi_status(m_driver_api->get_socket_handles(&count, nullptr), + "Failed to get socket count!"); + + std::vector handles(count); + if(count > 0) + { + check_amd_smi_status(m_driver_api->get_socket_handles(&count, handles.data()), + "Failed to get socket handles!"); + } + + return handles; + } + + /** + * @brief Get GPU processor handles for a socket. + * + * Uses the standard amdsmi_get_processor_handles() which is available on all + * ROCm versions and returns only GPU processors. + * + * @param socket_handle Socket to query. + * @return Vector of GPU processor handles (empty if none found). + */ + [[nodiscard]] std::vector get_gpu_handles_for_socket( + amdsmi_socket_handle socket_handle) + { + uint32_t count = 0; + auto status = m_driver_api->get_processor_handles(socket_handle, &count, nullptr); + + if(status != AMDSMI_STATUS_SUCCESS || count == 0) + { + return {}; + } + + std::vector handles(count); + status = + m_driver_api->get_processor_handles(socket_handle, &count, handles.data()); + + if(status != AMDSMI_STATUS_SUCCESS) + { + return {}; + } + + return handles; + } + +#if defined(ROCPROFSYS_BUILD_AINIC) && ROCPROFSYS_BUILD_AINIC == 1 + /** + * @brief Get NIC processor handles for a socket. + * + * Uses amdsmi_get_processor_handles_by_type() which is only available on + * AMD SMI >= 26.3 (ROCm 7.0+). + * + * @param socket_handle Socket to query. + * @return Vector of NIC processor handles (empty if none found). 
+ */ + [[nodiscard]] std::vector get_nic_handles_for_socket( + amdsmi_socket_handle socket_handle) + { + uint32_t count = 0; + auto status = m_driver_api->get_processor_handles_by_type( + socket_handle, AMDSMI_PROCESSOR_TYPE_AMD_NIC, nullptr, &count); + + if(status != AMDSMI_STATUS_SUCCESS || count == 0) + { + return {}; + } + + std::vector handles(count); + status = m_driver_api->get_processor_handles_by_type( + socket_handle, AMDSMI_PROCESSOR_TYPE_AMD_NIC, handles.data(), &count); + + if(status != AMDSMI_STATUS_SUCCESS) + { + return {}; + } + + return handles; + } +#endif + + /** + * @brief Enumerate GPU devices across all sockets. + * + * @tparam Device The device type to create. + * @return Vector of shared pointers to GPU device objects. + */ + template + [[nodiscard]] std::vector> enumerate_gpu_devices() + { + std::vector> devices; + + auto socket_handles = get_socket_handles(); + size_t index = 0; + + for(auto& socket_handle : socket_handles) + { + auto handles = get_gpu_handles_for_socket(socket_handle); + for(auto& handle : handles) + { + devices.push_back(std::make_shared( + m_driver_api, handle, AMDSMI_PROCESSOR_TYPE_AMD_GPU, index)); + index++; + } + } + + return devices; + } + +#if defined(ROCPROFSYS_BUILD_AINIC) && ROCPROFSYS_BUILD_AINIC == 1 + /** + * @brief Enumerate NIC devices across all sockets. + * + * @tparam Device The device type to create. + * @return Vector of shared pointers to NIC device objects. 
+ */ + template + [[nodiscard]] std::vector> enumerate_nic_devices() + { + std::vector> devices; + + auto socket_handles = get_socket_handles(); + size_t index = 0; + + for(auto& socket_handle : socket_handles) + { + auto handles = get_nic_handles_for_socket(socket_handle); + for(auto& handle : handles) + { + devices.push_back(std::make_shared( + m_driver_api, handle, AMDSMI_PROCESSOR_TYPE_AMD_NIC, index)); + index++; + } + } + + return devices; + } +#endif + + std::shared_ptr + m_driver_api; ///< Driver API instance + version m_version{}; ///< AMD SMI library version + +public: + using driver_t = typename DriverFactory::driver_t; + + /** + * @brief Construct and initialize the AMD SMI device provider. + * + * Creates the driver instance, initializes the AMD SMI driver, and retrieves version + * information. + * + * @throws std::runtime_error If AMD SMI initialization fails or version retrieval + * fails. + */ + provider() + : m_driver_api(DriverFactory::create_driver()) + { + // Initialize AMD SMI driver + check_amd_smi_status(m_driver_api->init(), + "Failed to initialize AMD SMI driver!"); + + // Get and store version information + amdsmi_version_t ver; + check_amd_smi_status(m_driver_api->get_version(&ver), + "Failed to get AMD SMI driver version!"); + + m_version.numeric_representation.major = ver.major; + m_version.numeric_representation.minor = ver.minor; + m_version.numeric_representation.release = ver.release; + m_version.string_representation = ver.build; + } + + ~provider() noexcept + { + if(m_driver_api) + { + m_driver_api->shutdown(); + } + } + + // Non-copyable, but movable + provider(const provider&) = delete; + provider& operator=(const provider&) = delete; + + provider(provider&& other) noexcept + : m_driver_api(std::move(other.m_driver_api)) + , m_version(std::move(other.m_version)) + { + other.m_driver_api.reset(); // Prevent double-shutdown + } + + provider& operator=(provider&& other) noexcept + { + if(this != &other) + { + if(m_driver_api) + { 
+ m_driver_api->shutdown(); + } + m_driver_api = std::move(other.m_driver_api); + m_version = std::move(other.m_version); + other.m_driver_api.reset(); + } + return *this; + } + + /** + * @brief Get AMD SMI library version. + * @return Const reference to the version information. + */ + [[nodiscard]] const version& get_version() const noexcept { return m_version; } + + /** + * @brief Shutdown the AMD SMI driver. + * + * Releases the driver API and cleans up resources. Safe to call multiple times. + */ + void shutdown() + { + if(m_driver_api) + { + m_driver_api->shutdown(); + m_driver_api.reset(); + } + } + + /** + * @brief Get all devices of a specific type. + * + * Enumerates all devices of the specified type across all sockets. + * + * @tparam Device The device type to create. + * @param type The device type to enumerate (GPU or NIC). + * @return Vector of shared pointers to device objects. + */ + template + [[nodiscard]] std::vector> get_devices(device_type type) + { + if(type == device_type::GPU) + { + return enumerate_gpu_devices(); + } +#if defined(ROCPROFSYS_BUILD_AINIC) && ROCPROFSYS_BUILD_AINIC == 1 + if(type == device_type::NIC) + { + return enumerate_nic_devices(); + } +#endif + return {}; // Unsupported device type + } +}; + +/** + * @brief Factory for creating AMD SMI provider instances. + * + * @tparam DriverFactory Factory type for creating AMD SMI driver instances. + */ +template +struct provider_factory +{ + using provider_t = provider; + + /** + * @brief Create a new provider instance. + * @return Shared pointer to a newly created provider. 
+ */ + static std::shared_ptr create() { return std::make_shared(); } +}; + +} // namespace rocprofsys::pmc::device_providers::amd_smi diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/sampler.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/sampler.cpp new file mode 100644 index 00000000000..83a59d8ca99 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/sampler.cpp @@ -0,0 +1,267 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#include "library/pmc/collectors/common/collector_slice.hpp" +#include "library/pmc/collectors/common/settings.hpp" +#include "library/pmc/collectors/gpu/cache_policy.hpp" +#include "library/pmc/collectors/gpu/collector.hpp" +#include "library/pmc/collectors/gpu/perfetto_policy.hpp" +#include "library/pmc/device_providers/amd_smi/provider.hpp" + +#if defined(ROCPROFSYS_BUILD_AINIC) +# include "library/pmc/collectors/nic/cache_policy.hpp" +# include "library/pmc/collectors/nic/collector.hpp" +# include "library/pmc/collectors/nic/perfetto_policy.hpp" +#endif + +#include "core/common.hpp" +#include "core/components/fwd.hpp" +#include "core/state.hpp" +#include "library/pmc/device_providers/amd_smi/drivers/driver.hpp" +#include "library/runtime.hpp" + +#include "library/pmc/sampler.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace rocprofsys +{ +namespace pmc +{ + +std::atomic& +get_state() +{ + static std::atomic _v{ State::PreInit }; + return _v; +} + +namespace +{ + +bool& +is_initialized() +{ + static bool _v = false; + return _v; +} + +struct gpu_production_config +{ + using SettingsApi = collectors::settings_policy; + using PerfettoApi = collectors::gpu::perfetto_policy; + using CacheApi = collectors::gpu::cache_policy; +}; + +#if defined(ROCPROFSYS_BUILD_AINIC) +struct nic_production_config +{ + using SettingsApi = 
collectors::settings_policy; + using PerfettoApi = collectors::nic::perfetto_policy; + using CacheApi = collectors::nic::cache_policy; +}; +#endif + +using provider_factory_t = + device_providers::amd_smi::provider_factory; +using provider_t = provider_factory_t::provider_t; +using gpu_collector_t = collectors::gpu::collector; +#if defined(ROCPROFSYS_BUILD_AINIC) +using nic_collector_t = collectors::nic::collector; +#endif + +std::shared_ptr g_device_provider; + +std::unique_ptr g_gpu_collector; +#if defined(ROCPROFSYS_BUILD_AINIC) +std::unique_ptr g_nic_collector; +#endif + +std::vector g_collector_slices; + +} // namespace + +void +set_state(State _v) +{ + pmc::get_state().store(_v); +} + +void +config() +{ + for(auto& slice : g_collector_slices) + { + slice.config(); + } + LOG_DEBUG("Setting PMC sampler state to active..."); + pmc::set_state(State::Active); +} + +void +sample() +{ + auto_lock_t _lk{ type_mutex() }; + + if(pmc::get_state() != State::Active) + { + return; + } + + auto timestamp = static_cast(tim::get_clock_real_now()); + + for(auto& slice : g_collector_slices) + { + slice.sample(timestamp); + } +} + +void +setup() +{ + auto_lock_t _lk{ type_mutex() }; + + if(is_initialized()) + { + return; + } + + ROCPROFSYS_SCOPED_SAMPLING_ON_CHILD_THREADS(false); + + try + { + // Create and inject device provider (shared between GPU and NIC collectors) + g_device_provider = provider_factory_t::create(); + + g_gpu_collector = std::make_unique(g_device_provider); +#if defined(ROCPROFSYS_BUILD_AINIC) + g_nic_collector = std::make_unique(g_device_provider); +#endif + + g_collector_slices.clear(); + g_collector_slices.emplace_back(*g_gpu_collector); +#if defined(ROCPROFSYS_BUILD_AINIC) + g_collector_slices.emplace_back(*g_nic_collector); +#endif + + for(auto& slice : g_collector_slices) + { + slice.setup(); + } + + is_initialized() = true; + } catch(const std::runtime_error& _e) + { + LOG_ERROR("Exception thrown when initializing PMC sampler: {}", _e.what()); + } +} 
+ +void +shutdown() +{ + auto_lock_t _lk{ type_mutex() }; + + if(!is_initialized()) + { + return; + } + + LOG_DEBUG("Shutting down PMC sampler."); + + try + { + for(auto& slice : g_collector_slices) + { + slice.shutdown(); + } + } catch(const std::runtime_error& _e) + { + LOG_ERROR("Exception thrown when shutting down PMC sampler: {}", _e.what()); + } + + is_initialized() = false; +} + +void +post_process() +{ + LOG_DEBUG("Post-processing PMC samples ({} slices).", g_collector_slices.size()); + for(auto& slice : g_collector_slices) + { + slice.post_process(); + } + g_collector_slices.clear(); + g_device_provider.reset(); +} + +void +postfork_child_cleanup() +{ + LOG_DEBUG("Disabling PMC sampling in child process after fork."); + pmc::get_state().store(State::Finalized); + for(auto& slice : g_collector_slices) + { + slice.shutdown(); + } + g_collector_slices.clear(); + g_gpu_collector.reset(); +#if defined(ROCPROFSYS_BUILD_AINIC) + g_nic_collector.reset(); +#endif + g_device_provider.reset(); + is_initialized() = false; +} + +void +postfork_parent_reinit() +{ + LOG_DEBUG("Reinitializing PMC sampling in parent process after fork."); + shutdown(); + setup(); +} + +} // namespace pmc +} // namespace rocprofsys + +ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), + true, double) + +ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), + true, double) + +ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), + true, double) + +ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), true, + double) + +ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), true, + double) + +ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), true, + double) + +ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), true, + double) + +ROCPROFSYS_INSTANTIATE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), true, + double) diff --git 
a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/sampler.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/sampler.hpp new file mode 100644 index 00000000000..0f34988197b --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/pmc/sampler.hpp @@ -0,0 +1,84 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "core/components/fwd.hpp" +#include "core/state.hpp" + +#include + +namespace rocprofsys +{ +namespace pmc +{ + +std::atomic& +get_state(); + +void +setup(); + +void +config(); + +void +sample(); + +void +shutdown(); + +void +post_process(); + +void set_state(State); + +void +postfork_child_cleanup(); + +void +postfork_parent_reinit(); + +} // namespace pmc +} // namespace rocprofsys + +#if !defined(ROCPROFSYS_EXTERN_COMPONENTS) || \ + (defined(ROCPROFSYS_EXTERN_COMPONENTS) && ROCPROFSYS_EXTERN_COMPONENTS > 0) + +# include +# include +# include + +ROCPROFSYS_DECLARE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), + true, double) + +ROCPROFSYS_DECLARE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), + true, double) + +ROCPROFSYS_DECLARE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), + true, double) + +ROCPROFSYS_DECLARE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), true, + double) + +ROCPROFSYS_DECLARE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), true, + double) + +ROCPROFSYS_DECLARE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), true, + double) + +ROCPROFSYS_DECLARE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), true, + double) + +ROCPROFSYS_DECLARE_EXTERN_COMPONENT( + TIMEMORY_ESC(data_tracker), true, + double) + +#endif diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/process_sampler.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/process_sampler.cpp index cc08b9c1745..26f6e134768 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/process_sampler.cpp +++ 
b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/process_sampler.cpp @@ -22,8 +22,8 @@ #include "library/process_sampler.hpp" #include "core/config.hpp" -#include "library/amd_smi.hpp" #include "library/cpu_freq.hpp" +#include "library/pmc/sampler.hpp" #include "library/runtime.hpp" #include "logger/debug.hpp" @@ -140,12 +140,13 @@ sampler::setup() if(get_use_amd_smi()) { - auto& _amd_smi = instances.emplace_back(std::make_unique()); - _amd_smi->setup = []() { amd_smi::setup(); }; - _amd_smi->shutdown = []() { amd_smi::shutdown(); }; - _amd_smi->post_process = []() { amd_smi::post_process(); }; - _amd_smi->config = []() { amd_smi::config(); }; - _amd_smi->sample = []() { amd_smi::sample(); }; + LOG_DEBUG("Setting up PMC sampling."); + auto& _pmc = instances.emplace_back(std::make_unique()); + _pmc->setup = []() { pmc::setup(); }; + _pmc->shutdown = []() { pmc::shutdown(); }; + _pmc->post_process = []() { pmc::post_process(); }; + _pmc->config = []() { pmc::config(); }; + _pmc->sample = []() { pmc::sample(); }; } if(get_cpu_freq_enabled()) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm.cpp index d3595d4fedd..402d1ee669d 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocm.cpp @@ -21,27 +21,15 @@ // SOFTWARE. 
#include "library/rocm.hpp" -#include "core/config.hpp" -#include "core/dynamic_library.hpp" -#include "core/gpu.hpp" -#include "library/amd_smi.hpp" #include "library/rocprofiler-sdk.hpp" -#include "library/runtime.hpp" -#include "library/thread_data.hpp" -#include "library/tracing.hpp" #include #include #include -#include -#include -#include #include -#include -#include - #include +#include namespace rocprofsys { diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp index 8f7a3979396..e31d17932a4 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp @@ -3,6 +3,7 @@ #include "core/rocprofiler-sdk.hpp" #include "api.hpp" +#include "binary/analysis.hpp" #include "common/synchronized.hpp" #include "core/common.hpp" #include "core/common_types.hpp" @@ -12,12 +13,10 @@ #include "core/gpu.hpp" #include "core/perfetto.hpp" #include "core/state.hpp" -#include "core/trace_cache/buffer_storage.hpp" #include "core/trace_cache/cache_manager.hpp" #include "core/trace_cache/metadata_registry.hpp" #include "core/trace_cache/sample_type.hpp" -#include "library/amd_smi.hpp" -#include "library/components/category_region.hpp" +#include "library/pmc/sampler.hpp" #include "library/rocprofiler-sdk.hpp" #include "library/rocprofiler-sdk/counters.hpp" #include "library/rocprofiler-sdk/fwd.hpp" @@ -64,7 +63,6 @@ #include #include #include -#include #include #include #include @@ -2649,8 +2647,8 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* user_data) if(config::get_use_process_sampling() && config::get_use_amd_smi()) { - LOG_DEBUG("Setting amd_smi state to active..."); - amd_smi::set_state(State::Active); + LOG_DEBUG("Setting PMC sampler state to active..."); + pmc::set_state(State::Active); } start(); @@ -2672,8 +2670,7 @@ 
tool_fini(void* callback_data) flush(); stop(); - if(config::get_use_process_sampling() && config::get_use_amd_smi()) - amd_smi::shutdown(); + if(config::get_use_process_sampling() && config::get_use_amd_smi()) pmc::shutdown(); if(get_counter_storage()) { diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/sampling.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/sampling.cpp index 2bb509960d4..917ff392041 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/sampling.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/sampling.cpp @@ -32,12 +32,12 @@ #include "core/state.hpp" #include "core/trace_cache/cache_manager.hpp" #include "core/utility.hpp" -#include "library/amd_smi.hpp" #include "library/components/backtrace.hpp" #include "library/components/backtrace_metrics.hpp" #include "library/components/backtrace_timestamp.hpp" #include "library/components/callchain.hpp" #include "library/perf.hpp" +#include "library/pmc/sampler.hpp" #include "library/runtime.hpp" #include "library/thread_data.hpp" #include "library/thread_info.hpp" @@ -1904,14 +1904,14 @@ void postfork_parent_reinit() { if(config::get_use_process_sampling() && config::get_use_amd_smi()) - amd_smi::postfork_parent_reinit(); + pmc::postfork_parent_reinit(); } void postfork_child_cleanup() { if(config::get_use_process_sampling() && config::get_use_amd_smi()) - amd_smi::postfork_child_cleanup(); + pmc::postfork_child_cleanup(); } } // namespace sampling } // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/tests/CMakeLists.txt b/projects/rocprofiler-systems/source/tests/CMakeLists.txt index 004bbb76cc3..2f46805b06d 100644 --- a/projects/rocprofiler-systems/source/tests/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/tests/CMakeLists.txt @@ -1,8 +1,7 @@ # Copyright (c) Advanced Micro Devices, Inc. 
# SPDX-License-Identifier: MIT -add_executable( - rocprof-sys-unit-tests +set(UNIT_TEST_OBJECTS $ $ $ @@ -11,6 +10,14 @@ add_executable( $<$:$> ) +list(APPEND UNIT_TEST_OBJECTS $) +list(APPEND UNIT_TEST_OBJECTS $) +if(ROCPROFSYS_BUILD_AINIC) + list(APPEND UNIT_TEST_OBJECTS $) +endif() + +add_executable(rocprof-sys-unit-tests ${UNIT_TEST_OBJECTS}) + target_link_libraries( rocprof-sys-unit-tests PRIVATE @@ -18,5 +25,6 @@ target_link_libraries( rocprofiler-systems-common-library rocprofiler-systems-core-library rocprofiler-systems-object-library + rocprofiler-systems-pmc-library rocprofiler-systems::rocprofiler-systems-binary ) diff --git a/projects/rocprofiler-systems/tests/pytest/conftest.py b/projects/rocprofiler-systems/tests/pytest/conftest.py index ec86d21c98e..cdf0966a55c 100644 --- a/projects/rocprofiler-systems/tests/pytest/conftest.py +++ b/projects/rocprofiler-systems/tests/pytest/conftest.py @@ -272,6 +272,7 @@ def pytest_configure(config: pytest.Config) -> None: "rocprof_binary", "rocprof_config", "xgmi", + "sdma", "group_by_queue", "group_by_stream", "openmp",