diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..4060841 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,21 @@ +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +version: 2 + +sphinx: + configuration: docs/conf.py + +# RTD by default builds html only +# Additional formats available for extra build time: htmlzip, pdf, epub +formats: [] + +python: + install: + - requirements: docs/sphinx/requirements.txt + +# Defines build environment +build: + os: ubuntu-22.04 + tools: + python: "3.10" diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..9dc0654 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,61 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +import re + +''' +html_theme is usually unchanged (rocm_docs_theme). +flavor defines the site header display, select the flavor for the corresponding portals +flavor options: rocm, rocm-docs-home, rocm-blogs, rocm-ds, instinct, ai-developer-hub, local, generic +''' +html_theme = "rocm_docs_theme" +html_theme_options = {"flavor": "rocm-docs-home"} + + +# This section turns on/off article info +setting_all_article_info = True +all_article_info_os = ["linux"] +all_article_info_author = "" + +# Dynamically extract component version +with open('../CMakeLists.txt', encoding='utf-8') as f: + pattern = r'.*\brocm_setup_version\(VERSION\s+([0-9.]+)[^0-9.]+' # Update according to each component's CMakeLists.txt + match = re.search(pattern, + f.read()) + if not match: + raise ValueError("VERSION not found!") +version_number = "1.0" + +# for PDF output on Read the Docs +project = "AQLprofile" +author = "Advanced Micro Devices, Inc." +copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved." 
+version = version_number +release = version_number + +external_toc_path = "./sphinx/_toc.yml" # Defines Table of Content structure definition path + +''' +Doxygen Settings +Ensure Doxyfile is located at docs/doxygen. +If the component does not need doxygen, delete this section for optimal build time +''' +#doxygen_root = "doxygen" +#doxysphinx_enabled = False +# doxygen_project = { +# "name": "doxygen", +# "path": "doxygen/xml", +#} + +# Add additional packages accordingly +extensions = [ + "rocm_docs", +# "rocm_docs.doxygen", +] + +html_title = f"{project} {version_number} documentation" + +external_projects_current_project = "AQLprofile" \ No newline at end of file diff --git a/docs/examples/pmc-workflow.rst b/docs/examples/pmc-workflow.rst new file mode 100644 index 0000000..19cb4e7 --- /dev/null +++ b/docs/examples/pmc-workflow.rst @@ -0,0 +1,109 @@ +.. meta:: + :description: A typical workflow for collecting PMC data + :keywords: AQLprofile, ROCm, API, how-to, PMC + +****************************************** +Performance Monitor Control (PMC) workflow +****************************************** + +This page describes a typical workflow for collecting PMC data using AQLprofile (as integrated in ``rocprofiler-sdk``). +This workflow relies on creating a profile object, generating command packets, and iterating over output buffers: + +1. **Intercept kernel dispatch**: The SDK intercepts kernel dispatch packets submitted to the GPU queue. +2. **Create a profile object**: A profile/session object is created, specifying the agent (GPU), events (counters), and output buffers. +3. **Generate command packets**: Start, stop, and read command packets are generated and injected into the queue around the kernel dispatch. +4. **Submit packets and run the kernel**: The kernel and profiling packets are submitted to the GPU queue for execution. +5. **Collect the output buffer**: After execution, the output buffer is read back from the GPU. +6. 
**Iterate and extract the results**: The SDK iterates over the output buffer to extract and report counter results. + +The SDK abstracts queue interception and packet management so tool developers can focus on results. + +Key API code snippets +===================== + +These API snippets use the legacy interfaces from ``hsa_ven_amd_aqlprofile.h``. These are provided for understanding purposes only. +For new development, refer to the updated APIs in ``aql_profile_v2.h``. + +.. note:: + + The ``rocprofiler-sdk`` is migrating to these newer interfaces; the old APIs may be deprecated in future releases. + +Define the events and profile +----------------------------- + +.. code:: cpp + + // Select events (counters) to collect + hsa_ven_amd_aqlprofile_event_t events[] = { + { HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ, 0, 2 }, // Example: SQ block, instance 0, counter 2 + { HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ, 0, 3 } + }; + + // Create profile object + hsa_ven_amd_aqlprofile_profile_t profile = { + .agent = agent, // hsa_agent_t + .type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC, + .events = events, + .event_count = sizeof(events)/sizeof(events[0]), + .parameters = nullptr, + .parameter_count = 0, + .output_buffer = {output_ptr, output_size}, + .command_buffer = {cmd_ptr, cmd_size} + }; + + +Validate events +--------------- + +.. code:: cpp + + bool valid = false; + hsa_ven_amd_aqlprofile_validate_event(agent, &events[0], &valid); + if (!valid) { + // Handle invalid event + } + + +Generate command packets +------------------------- + +.. code:: cpp + + hsa_ext_amd_aql_pm4_packet_t start_pkt, stop_pkt, read_pkt; + hsa_ven_amd_aqlprofile_start(&profile, &start_pkt); + hsa_ven_amd_aqlprofile_stop(&profile, &stop_pkt); + hsa_ven_amd_aqlprofile_read(&profile, &read_pkt); + + +Submit packets and run the kernel +--------------------------------- + +.. 
code:: cpp + + // Pseudocode: inject packets into HSA queue + queue->Submit(&start_pkt); + queue->Submit(&kernel_pkt); + queue->Submit(&stop_pkt); + queue->Submit(&read_pkt); + + +Iterate and extract results +---------------------------- + +.. code:: cpp + + hsa_ven_amd_aqlprofile_iterate_data( + &profile, + [](hsa_ven_amd_aqlprofile_info_type_t info_type, + hsa_ven_amd_aqlprofile_info_data_t* info_data, + void* user_data) -> hsa_status_t { + if (info_type == HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA) { + printf("Event: block %d, id %d, value: %llu\n", + info_data->pmc_data.event.block_name, + info_data->pmc_data.event.counter_id, + info_data->pmc_data.result); + } + return HSA_STATUS_SUCCESS; + }, + nullptr + ); diff --git a/docs/examples/sqtt-workflow.rst b/docs/examples/sqtt-workflow.rst new file mode 100644 index 0000000..6aff8ff --- /dev/null +++ b/docs/examples/sqtt-workflow.rst @@ -0,0 +1,93 @@ +.. meta:: + :description: A typical workflow for collecting detailed instruction-level traces + :keywords: AQLprofile, ROCm, API, how-to, SQTT + +******************************* +SQ Thread Trace (SQTT) workflow +******************************* + +The SQ Thread Trace workflow focuses on collecting detailed instruction-level traces. +This workflow relies on creating a profile object, generating command packets, and iterating over output buffers: + +1. **Intercept the kernel dispatch**: The SDK intercepts the kernel dispatch. +2. **Create a SQTT profile object**: A profile object is created for SQTT, specifying trace parameters and output buffers. +3. **Generate SQTT command packets**: Start, stop, and read packets for SQTT are generated and injected into the queue. +4. **Submit packets and run the kernel**: The kernel and SQTT packets are submitted for execution. +5. **Collect the trace buffer**: The trace output buffer is collected after execution. +6. **Iterate and decode trace data**: The SDK iterates over the trace buffer and decodes the SQTT data for analysis. 
+ +The SDK abstracts queue interception and packet management so tool developers can focus on results. + +Key API code snippets +===================== + +These API snippets use the legacy interfaces from ``hsa_ven_amd_aqlprofile.h``. These are provided for understanding purposes only. +For new development, refer to the updated APIs in ``aql_profile_v2.h``. + +In the ``rocprofiler-sdk`` codebase, these APIs are wrapped and orchestrated in the ``aql``, ``hsa``, and ``thread_trace`` folders for queue interception, packet construction, and result iteration. + +.. note:: + + The ``rocprofiler-sdk`` is migrating to these newer interfaces; the old APIs may be deprecated in future releases. + +Define parameters and profile +------------------------------ + +.. code:: cpp + + hsa_ven_amd_aqlprofile_parameter_t params[] = { + { HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE, 16 } // 16 MB buffer + }; + + hsa_ven_amd_aqlprofile_profile_t profile = { + .agent = agent, + .type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE, + .events = nullptr, + .event_count = 0, + .parameters = params, + .parameter_count = sizeof(params)/sizeof(params[0]), + .output_buffer = {trace_ptr, trace_size}, + .command_buffer = {cmd_ptr, cmd_size} + }; + + +Generate SQTT start/stop packets +--------------------------------- + +.. code:: cpp + + hsa_ext_amd_aql_pm4_packet_t sqtt_start_pkt, sqtt_stop_pkt; + hsa_ven_amd_aqlprofile_start(&profile, &sqtt_start_pkt); + hsa_ven_amd_aqlprofile_stop(&profile, &sqtt_stop_pkt); + + +Submit packets and run the kernel +--------------------------------- + +.. code:: cpp + + queue->Submit(&sqtt_start_pkt); + queue->Submit(&kernel_pkt); + queue->Submit(&sqtt_stop_pkt); + + +Iterate and decode trace data +----------------------------- + +.. 
code:: cpp + + hsa_ven_amd_aqlprofile_iterate_data( + &profile, + [](hsa_ven_amd_aqlprofile_info_type_t info_type, + hsa_ven_amd_aqlprofile_info_data_t* info_data, + void* user_data) -> hsa_status_t { + if (info_type == HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA) { + // info_data->trace_data.ptr, info_data->trace_data.size + decode_trace(info_data->trace_data.ptr, info_data->trace_data.size); + } + return HSA_STATUS_SUCCESS; + }, + nullptr + ); + + diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..f7d694d --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,40 @@ +.. meta:: + :description: AQLprofile is an open source library that enables advanced GPU profiling and tracing on AMD platforms. + :keywords: AQLprofile, ROCm, tool, Instinct, accelerator, AMD + +.. _index: + +******************************** +AQLprofile documentation +******************************** + +This documentation provides a comprehensive overview of the AQLprofile library. This documentation explains the ideas motivating the design +behind the tool and its components. + +If you're new to AQLprofile, see :doc:`What is AQLprofile? <what-is-aqlprofile>`. + +AQLprofile is open source and hosted at <https://github.com/ROCm/aqlprofile>. + +.. grid:: 2 + :gutter: 3 + + .. grid-item-card:: Install + + * :doc:`Install AQLprofile <install/aqlprofile-install>` + + .. grid-item-card:: Examples + + * :doc:`Performance Monitor Control (PMC) workflow <examples/pmc-workflow>` + * :doc:`SQ Thread Trace (SQTT) workflow <examples/sqtt-workflow>` + + .. grid-item-card:: Reference + + * :doc:`Terms <reference/terms>` + * :doc:`APIs <reference/api-list>` + + +To contribute to the documentation, refer to +`Contributing to ROCm <https://rocm.docs.amd.com/en/latest/contribute/contributing.html>`_. + +You can find licensing information on the +`Licensing <https://rocm.docs.amd.com/en/latest/about/license.html>`_ page. diff --git a/docs/install/aqlprofile-install.rst b/docs/install/aqlprofile-install.rst new file mode 100644 index 0000000..4efa4ee --- /dev/null +++ b/docs/install/aqlprofile-install.rst @@ -0,0 +1,77 @@ +.. 
meta:: + :description: AQLprofile installation process + :keywords: AQLprofile, ROCm, install + +****************** +Install AQLprofile +****************** + +Learn how to build AQLprofile with a script or with CMake, then install the library with a command. + +Prerequisites +============= + +Before you begin, ensure these tools and dependencies are installed: + +* ROCm stack +* ``rocm-llvm-dev`` (required to build tests) + + +Build AQLprofile +================ + +You can build AQLprofile using either the provided build script (recommended for most users) or by manually invoking CMake for custom builds. + + +Option 1: Use the build script (Recommended) +-------------------------------------------- + +This configures and builds the project with the default settings: + +.. code:: bash + + ./build.sh + + +Option 2: Use CMake for custom builds +------------------------------------- + +For more control over the build process, you can set the CMake options manually: + +.. code:: bash + + # Set the CMAKE_PREFIX_PATH to point to hsa-runtime includes path and hsa-runtime library path + export CMAKE_PREFIX_PATH=<path-to-hsa-runtime-includes>:<path-to-hsa-runtime-library> + # For example, if ROCm is installed at /opt/rocm: + # export CMAKE_PREFIX_PATH=/opt/rocm/lib:/opt/rocm/include/hsa + + export CMAKE_BUILD_TYPE=<debug|release> # release by default + + cd /path/to/aqlprofile + mkdir build + cd build + cmake .. + make -j + + +(Optional) Enable debug tracing +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To enable debug tracing, set this environment variable before running CMake: + +.. code:: bash + + export CMAKE_DEBUG_TRACE=1 + +This enables verbose debug output of the command packets while this library executes. + + +Install the AQLprofile libraries +================================ + +Once your build is successful, install the AQLprofile libraries with: + +.. 
code:: bash + + cd build + sudo make install diff --git a/docs/license.rst b/docs/license.rst new file mode 100644 index 0000000..2c0594e --- /dev/null +++ b/docs/license.rst @@ -0,0 +1,25 @@ +******************************** +License +******************************** + +MIT License + +Copyright (c) 2017-2025 Advanced Micro Devices, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. \ No newline at end of file diff --git a/docs/reference/api-list.rst b/docs/reference/api-list.rst new file mode 100644 index 0000000..a454a96 --- /dev/null +++ b/docs/reference/api-list.rst @@ -0,0 +1,154 @@ +AQLprofile APIs +=============== + +Learn about the typical APIs used in AQLprofile. + +The APIs in ``hsa_ven_amd_aqlprofile.h`` are used by legacy tools such +as ``rocprof`` and ``rocprofv2``. These APIs may be deprecated in the +future as development focus shifts to the new ``aql_profile_v2.h`` APIs. 
+ +The APIs in ``aql_profile_v2.h`` are designed for use with +``rocprofiler-sdk``, and are actively maintained and recommended for all +new development. + +From header ``aql_profile_v2.h`` +-------------------------------- + ++--------------------+-------------------------------------------------+ +| API Name | Purpose | ++====================+=================================================+ +| ``aqlprofil | Registers an agent for profiling using basic | +| e_register_agent`` | agent info. | ++--------------------+-------------------------------------------------+ +| ``aqlprofile_reg | Registers an agent for profiling using extended | +| ister_agent_info`` | agent info and versioning. | ++--------------------+-------------------------------------------------+ +| ``aqlprof | Retrieves information about PMC profiles (e.g., | +| ile_get_pmc_info`` | buffer sizes, counter data). | ++--------------------+-------------------------------------------------+ +| ``aqlprofile_va | Checks if a given PMC event is valid for the | +| lidate_pmc_event`` | specified agent. | ++--------------------+-------------------------------------------------+ +| ``aqlprofile_pm | Creates AQL packets (start, stop, read) for PMC | +| c_create_packets`` | profiling and returns a handle. | ++--------------------+-------------------------------------------------+ +| ``aqlprofile_pm | Deletes PMC profiling packets and releases | +| c_delete_packets`` | associated resources. | ++--------------------+-------------------------------------------------+ +| ``aqlprofile_ | Iterates over PMC profiling results using a | +| pmc_iterate_data`` | callback. | ++--------------------+-------------------------------------------------+ +| ``aqlprofile_at | Creates AQL packets (start, stop) for Advanced | +| t_create_packets`` | Thread Trace (SQTT) and returns a handle. 
| ++--------------------+-------------------------------------------------+ +| ``aqlprofile_at | Deletes ATT profiling packets and releases | +| t_delete_packets`` | associated resources. | ++--------------------+-------------------------------------------------+ +| ``aqlprofile_ | Iterates over thread trace (SQTT) results using | +| att_iterate_data`` | a callback. | ++--------------------+-------------------------------------------------+ +| ``aqlprofile_i | Iterates over all possible event coordinate IDs | +| terate_event_ids`` | and names using a callback. | ++--------------------+-------------------------------------------------+ +| ``aqlprofile_ite | Iterates over all event coordinates for a given | +| rate_event_coord`` | agent and event using a callback. | ++--------------------+-------------------------------------------------+ +| ``aqlprofile_at | Creates a marker packet for code object events | +| t_codeobj_marker`` | in thread trace workflows. | ++--------------------+-------------------------------------------------+ + +Callback Typedefs +~~~~~~~~~~~~~~~~~ + ++---------------------+------------------------------------------------+ +| Callback Typedef | Purpose | +| Name | | ++=====================+================================================+ +| ``aqlprofile_memory | Callback for allocating memory buffers for | +| _alloc_callback_t`` | profiles (PMC/ATT). | ++---------------------+------------------------------------------------+ +| `` | Callback for deallocating memory buffers | +| aqlprofile_memory_d | allocated for profiles. | +| ealloc_callback_t`` | | ++---------------------+------------------------------------------------+ +| ``aqlprof | Callback for copying memory (used internally | +| ile_memory_copy_t`` | by the profiler). | ++---------------------+------------------------------------------------+ +| ``aqlprofile_pm | Used with ``aqlprofile_pmc_iterate_data`` to | +| c_data_callback_t`` | process each PMC profiling result. 
| ++---------------------+------------------------------------------------+ +| ``aqlprofile_at | Used with ``aqlprofile_att_iterate_data`` to | +| t_data_callback_t`` | process each thread trace (SQTT) result. | ++---------------------+------------------------------------------------+ +| ``aqlprofile_eve | Used with ``aqlprofile_iterate_event_ids`` to | +| ntname_callback_t`` | process event coordinate IDs and names. | ++---------------------+------------------------------------------------+ +| ``aqlprofile_coor | Used with ``aqlprofile_iterate_event_coord`` | +| dinate_callback_t`` | to process event coordinate information. | ++---------------------+------------------------------------------------+ + +From header ``hsa_ven_amd_aqlprofile.h`` +---------------------------------------- + ++----------------------+-----------------------------------------------+ +| API Name | Purpose | ++======================+===============================================+ +| ` | Checks if a given event (counter) is valid | +| `hsa_ven_amd_aqlprof | for the specified GPU agent. | +| ile_validate_event`` | | ++----------------------+-----------------------------------------------+ +| ``hsa_ven_am | Populates an AQL packet with commands to | +| d_aqlprofile_start`` | start profiling (PMC or SQTT). | ++----------------------+-----------------------------------------------+ +| ``hsa_ven_a | Populates an AQL packet with commands to stop | +| md_aqlprofile_stop`` | profiling. | ++----------------------+-----------------------------------------------+ +| ``hsa_ven_a | Populates an AQL packet with commands to read | +| md_aqlprofile_read`` | profiling results from the GPU. | ++----------------------+-----------------------------------------------+ +| ` | Converts an AQL packet to a PM4 packet blob | +| `hsa_ven_amd_aqlprof | (for legacy devices). 
| +| ile_legacy_get_pm4`` | | ++----------------------+-----------------------------------------------+ +| ``hsa_ven_amd_aql | Inserts a marker (correlation ID) into the | +| profile_att_marker`` | ATT (thread trace) buffer. | ++----------------------+-----------------------------------------------+ +| ``hsa_ven_amd_a | Retrieves various profile information, such | +| qlprofile_get_info`` | as buffer sizes or collected data. | ++----------------------+-----------------------------------------------+ +| ``hsa_ven_amd_aqlpr | Iterates over the profiling output data (PMC | +| ofile_iterate_data`` | results or SQTT trace) using a callback. | ++----------------------+-----------------------------------------------+ +| ``hsa_ven_amd_aqlpr | Returns a human-readable error string for the | +| ofile_error_string`` | last error. | ++----------------------+-----------------------------------------------+ +| ``hs | Iterates over all possible event IDs and | +| a_ven_amd_aqlprofile | names for the agent. | +| _iterate_event_ids`` | | ++----------------------+-----------------------------------------------+ +| ``hsa_ | Iterates over all event coordinates for a | +| ven_amd_aqlprofile_i | given agent and event. | +| terate_event_coord`` | | ++----------------------+-----------------------------------------------+ + +.. _callback-typedefs-1: + +Callback Typedefs +~~~~~~~~~~~~~~~~~ + ++-----------------------+----------------------------------------------+ +| Callback Typedef Name | Purpose | ++=======================+==============================================+ +| ``hsa_ven_amd_aqlprof | Used with | +| ile_data_callback_t`` | ``hsa_ven_amd_aqlprofile_iterate_data`` to | +| | process each profiling result (PMC/SQTT). | ++-----------------------+----------------------------------------------+ +| ``hsa | Used with | +| _ven_amd_aqlprofile_e | ``hsa_ven_amd_aqlprofile_iterate_event_ids`` | +| ventname_callback_t`` | to process event IDs and names. 
| ++-----------------------+----------------------------------------------+ +| ``hsa_ | Used with | +| ven_amd_aqlprofile_co | `` | +| ordinate_callback_t`` | hsa_ven_amd_aqlprofile_iterate_event_coord`` | +| | to process event coordinate info. | ++-----------------------+----------------------------------------------+ diff --git a/docs/reference/terms.rst b/docs/reference/terms.rst new file mode 100644 index 0000000..51a7072 --- /dev/null +++ b/docs/reference/terms.rst @@ -0,0 +1,102 @@ +Terms +===== + +Agents +------ + +Agents represent computational devices (CPUs, GPUs) in the Heterogeneous +System Architecture (HSA) runtime. In AQLprofile, agents are discovered +via HSA APIs and encapsulated in the ``AgentInfo`` structure. Each agent +contains metadata such as device type, name, compute unit count, and +memory pools. + +Agents are enumerated using HSA API ``hsa_iterate_agents``, and their +properties are queried via another HSA API, ``hsa_agent_get_info``. +Agents are used to target specific GPUs for profiling, and to allocate +resources such as command buffers and memory pools. + +Counters and events +------------------- + +Performance counters are special circuits on the hardware that count +specific GPU events (e.g., cycles, instructions, cache hits). Events +specify which counters to collect, identified by block name, block +index, and counter ID. + +- Events are described using ``hsa_ven_amd_aqlprofile_event_t`` + structures. +- Events are grouped into profiles and collected during profiling + sessions. + +.. code:: cpp + + const hsa_ven_amd_aqlprofile_event_t events_arr1[] = { + {HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ, 0, 2 /*CYCLES*/}, + {HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ, 0, 3 /*BUSY_CYCLES*/}, + // ... + }; + +Counter blocks +-------------- + +Counter blocks correspond to hardware units on the GPU (e.g., SQ, TCC, +TCP). Each block exposes a set of counters/events. 
+ +- Block names (e.g., ``HSA_VEN_AMD_AQLPROFILE_BLOCK_NAME_SQ``) map to + specific hardware blocks. +- Events specify both the block and the counter within that block. + +Command packets +--------------- + +Command packets are AQL or PM4 packets that encode profiling commands +for the GPU. They're constructed and written into command buffers. + +They're built using AQLprofile APIs or helper functions and submitted to +the GPU via HSA queues. + +.. code:: cpp + + bool Queue::Submit(hsa_ext_amd_aql_pm4_packet_t* packet) { + // Write packet to queue and signal doorbell + } + +Profile object +-------------- + +The profile object encapsulates all information required to perform a +profiling session. It's represented by the +``hsa_ven_amd_aqlprofile_profile_t`` struct, which includes the agent, +event type, list of events, command buffer, and additional parameters. + +Profile objects are constructed by specifying the agent, event type +(PMC, SQTT), events to collect, and associated buffers. They're passed +to AQLprofile APIs to start, stop, and read profiling data. + +.. code:: cpp + + hsa_ven_amd_aqlprofile_profile_t *profile = + new hsa_ven_amd_aqlprofile_profile_t{ + agent_info->dev_id, + HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC, + events, + num_events, + NULL, + 0, + 0, + 0}; + +Command buffers +--------------- + +Command buffers are memory regions that store AQL packets and PM4 +commands which control GPU profiling operations. They're allocated per +agent, and must meet alignment and size requirements dictated by the +hardware. + +Output buffer +------------- + +Output buffers are memory regions that store outputs such as counter +values and thread trace tokens. They're allocated using HSA memory pools +associated with the agent. 
diff --git a/docs/sphinx/_toc.yml b/docs/sphinx/_toc.yml new file mode 100644 index 0000000..63a393e --- /dev/null +++ b/docs/sphinx/_toc.yml @@ -0,0 +1,35 @@ +defaults: + numbered: False + maxdepth: 6 +root: index +subtrees: + + - entries: + - file: what-is-aqlprofile.rst + + - caption: Install + entries: + - file: install/aqlprofile-install.rst + title: Install AQLprofile + + - caption: Examples + entries: + - file: examples/pmc-workflow.rst + title: Performance Monitor Control workflow + - file: examples/sqtt-workflow.rst + title: SQ Thread Trace workflow + + - caption: Reference + entries: + - file: reference/terms.rst + title: Terms + - file: reference/api-list.rst + title: APIs + + - caption: License + entries: + - file: license.rst + title: License + + + diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in new file mode 100644 index 0000000..63a393e --- /dev/null +++ b/docs/sphinx/_toc.yml.in @@ -0,0 +1,35 @@ +defaults: + numbered: False + maxdepth: 6 +root: index +subtrees: + + - entries: + - file: what-is-aqlprofile.rst + + - caption: Install + entries: + - file: install/aqlprofile-install.rst + title: Install AQLprofile + + - caption: Examples + entries: + - file: examples/pmc-workflow.rst + title: Performance Monitor Control workflow + - file: examples/sqtt-workflow.rst + title: SQ Thread Trace workflow + + - caption: Reference + entries: + - file: reference/terms.rst + title: Terms + - file: reference/api-list.rst + title: APIs + + - caption: License + entries: + - file: license.rst + title: License + + + diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in new file mode 100644 index 0000000..48a6f66 --- /dev/null +++ b/docs/sphinx/requirements.in @@ -0,0 +1 @@ +rocm-docs-core==1.20.0 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt new file mode 100644 index 0000000..41c0a5a --- /dev/null +++ b/docs/sphinx/requirements.txt @@ -0,0 +1,279 @@ +# +# This file is autogenerated by pip-compile with Python 
3.10 +# by the following command: +# +# pip-compile requirements.in +# +accessible-pygments==0.0.5 + # via pydata-sphinx-theme +alabaster==1.0.0 + # via sphinx +asttokens==3.0.0 + # via stack-data +attrs==25.3.0 + # via + # jsonschema + # jupyter-cache + # referencing +babel==2.17.0 + # via + # pydata-sphinx-theme + # sphinx +beautifulsoup4==4.13.4 + # via pydata-sphinx-theme +breathe==4.36.0 + # via rocm-docs-core +certifi==2025.4.26 + # via requests +cffi==1.17.1 + # via + # cryptography + # pynacl +charset-normalizer==3.4.2 + # via requests +click==8.2.1 + # via + # jupyter-cache + # sphinx-external-toc +comm==0.2.2 + # via ipykernel +cryptography==45.0.3 + # via pyjwt +debugpy==1.8.14 + # via ipykernel +decorator==5.2.1 + # via ipython +deprecated==1.2.18 + # via pygithub +docutils==0.21.2 + # via + # myst-parser + # pydata-sphinx-theme + # sphinx +exceptiongroup==1.3.0 + # via ipython +executing==2.2.0 + # via stack-data +fastjsonschema==2.21.1 + # via + # nbformat + # rocm-docs-core +gitdb==4.0.12 + # via gitpython +gitpython==3.1.44 + # via rocm-docs-core +greenlet==3.2.2 + # via sqlalchemy +idna==3.10 + # via requests +imagesize==1.4.1 + # via sphinx +importlib-metadata==8.7.0 + # via + # jupyter-cache + # myst-nb +ipykernel==6.29.5 + # via myst-nb +ipython==8.36.0 + # via + # ipykernel + # myst-nb +jedi==0.19.2 + # via ipython +jinja2==3.1.6 + # via + # myst-parser + # sphinx +jsonschema==4.24.0 + # via nbformat +jsonschema-specifications==2025.4.1 + # via jsonschema +jupyter-cache==1.0.1 + # via myst-nb +jupyter-client==8.6.3 + # via + # ipykernel + # nbclient +jupyter-core==5.8.1 + # via + # ipykernel + # jupyter-client + # nbclient + # nbformat +markdown-it-py==3.0.0 + # via + # mdit-py-plugins + # myst-parser +markupsafe==3.0.2 + # via jinja2 +matplotlib-inline==0.1.7 + # via + # ipykernel + # ipython +mdit-py-plugins==0.4.2 + # via myst-parser +mdurl==0.1.2 + # via markdown-it-py +myst-nb==1.2.0 + # via rocm-docs-core +myst-parser==4.0.1 + # via 
myst-nb +nbclient==0.10.2 + # via + # jupyter-cache + # myst-nb +nbformat==5.10.4 + # via + # jupyter-cache + # myst-nb + # nbclient +nest-asyncio==1.6.0 + # via ipykernel +packaging==25.0 + # via + # ipykernel + # pydata-sphinx-theme + # sphinx +parso==0.8.4 + # via jedi +pexpect==4.9.0 + # via ipython +platformdirs==4.3.8 + # via jupyter-core +prompt-toolkit==3.0.51 + # via ipython +psutil==7.0.0 + # via ipykernel +ptyprocess==0.7.0 + # via pexpect +pure-eval==0.2.3 + # via stack-data +pycparser==2.22 + # via cffi +pydata-sphinx-theme==0.15.4 + # via + # rocm-docs-core + # sphinx-book-theme +pygithub==2.6.1 + # via rocm-docs-core +pygments==2.19.1 + # via + # accessible-pygments + # ipython + # pydata-sphinx-theme + # sphinx +pyjwt[crypto]==2.10.1 + # via pygithub +pynacl==1.5.0 + # via pygithub +python-dateutil==2.9.0.post0 + # via jupyter-client +pyyaml==6.0.2 + # via + # jupyter-cache + # myst-nb + # myst-parser + # rocm-docs-core + # sphinx-external-toc +pyzmq==26.4.0 + # via + # ipykernel + # jupyter-client +referencing==0.36.2 + # via + # jsonschema + # jsonschema-specifications +requests==2.32.3 + # via + # pygithub + # sphinx +rocm-docs-core==1.20.0 + # via -r requirements.in +rpds-py==0.25.1 + # via + # jsonschema + # referencing +six==1.17.0 + # via python-dateutil +smmap==5.0.2 + # via gitdb +snowballstemmer==3.0.1 + # via sphinx +soupsieve==2.7 + # via beautifulsoup4 +sphinx==8.1.3 + # via + # breathe + # myst-nb + # myst-parser + # pydata-sphinx-theme + # rocm-docs-core + # sphinx-book-theme + # sphinx-copybutton + # sphinx-design + # sphinx-external-toc + # sphinx-notfound-page +sphinx-book-theme==1.1.4 + # via rocm-docs-core +sphinx-copybutton==0.5.2 + # via rocm-docs-core +sphinx-design==0.6.1 + # via rocm-docs-core +sphinx-external-toc==1.0.1 + # via rocm-docs-core +sphinx-notfound-page==1.1.0 + # via rocm-docs-core +sphinxcontrib-applehelp==2.0.0 + # via sphinx +sphinxcontrib-devhelp==2.0.0 + # via sphinx +sphinxcontrib-htmlhelp==2.1.0 + # via 
sphinx +sphinxcontrib-jsmath==1.0.1 + # via sphinx +sphinxcontrib-qthelp==2.0.0 + # via sphinx +sphinxcontrib-serializinghtml==2.0.0 + # via sphinx +sqlalchemy==2.0.41 + # via jupyter-cache +stack-data==0.6.3 + # via ipython +tabulate==0.9.0 + # via jupyter-cache +tomli==2.2.1 + # via sphinx +tornado==6.5.1 + # via + # ipykernel + # jupyter-client +traitlets==5.14.3 + # via + # comm + # ipykernel + # ipython + # jupyter-client + # jupyter-core + # matplotlib-inline + # nbclient + # nbformat +typing-extensions==4.13.2 + # via + # beautifulsoup4 + # exceptiongroup + # ipython + # myst-nb + # pydata-sphinx-theme + # pygithub + # referencing + # sqlalchemy +urllib3==2.4.0 + # via + # pygithub + # requests +wcwidth==0.2.13 + # via prompt-toolkit +wrapt==1.17.2 + # via deprecated +zipp==3.22.0 + # via importlib-metadata diff --git a/docs/what-is-aqlprofile.rst b/docs/what-is-aqlprofile.rst new file mode 100644 index 0000000..a95511a --- /dev/null +++ b/docs/what-is-aqlprofile.rst @@ -0,0 +1,134 @@ +What is AQLprofile? +=================== + +The Architected Queuing Language Profiling Library (AQLprofile) is an +open source library that enables advanced GPU profiling and tracing on +AMD platforms. It works in conjunction with +`rocprofiler-sdk `__ to +support profiling methods such as `performance counters +(PMC) `__ and `SQ thread trace +(SQTT) `__. AQLprofile provides the +foundational mechanisms for constructing AQL packets and managing +profiling operations across multiple AMD GPU architecture families. The +development of AQLprofile is aligned with ``rocprofiler-sdk``, ensuring +compatibility and feature support for new GPU architectures and +profiling requirements. + +AQLprofile builds on concepts from the Heterogeneous System Architecture +(HSA) and the AQL, which define the foundations for GPU command +processing and profiling on AMD platforms. 
For further reading, see: + +- `HSA Platform System Architecture + Specification `__ +- `HSA Runtime Programmer's Reference + Specification `__ + +Features +-------- + +- Profiling AQL packets for GPU workloads. +- Performance counters and SQ thread traces. +- Support for GFX9, GFX10, GFX11, and GFX12 architecture families. +- Verbose tracing and error logging capabilities. +- Thread trace binary data generated by AQLprofile can be decoded using + `rocprof-trace-decoder `__. + +Who should use this library? +---------------------------- + +- **End users**: If you want to profile AMD GPUs, use + `rocprofiler-sdk `__ or + tools that depend on it. You do *not* need to use AQLprofile + directly. +- **Developers/integrators**: If you're building profiling tools, + custom workflows, or need to extend profiling capabilities, you may + use AQLprofile directly as a backend. + +How does AQLprofile fit into the ROCm profiling stack? +------------------------------------------------------ + +Here's the typical workflow: + +**Application** → **rocprofiler-sdk** → **AQLprofile** → **AMD GPU +hardware** + +- AQLprofile provides the mechanisms for constructing profiling + commands, managing command buffers, and interacting with hardware + counters. +- The ``rocprofiler-sdk`` provides a higher-level API and user-facing + tools, using AQLprofile internally. + +Supported architectures and counter blocks +------------------------------------------ + +The AQLprofile library supports profiling and tracing GPU workloads +across multiple architectures. 
Here's a summary of the counter blocks +supported for each architecture: + ++-------------+------+--------+--------+--------+-------+-------+-------+ +| Counter | GFX9 | GFX908 | GFX90A | GFX942 | GFX10 | GFX11 | GFX12 | +| Block Name | | | | | | | | ++=============+======+========+========+========+=======+=======+=======+ +| ATC | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| ATC_L2 | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| CHA | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| CHC | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| CPC | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| CPF | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| CPG | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| GCEA | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| GCR | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| GDS | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| GL1A | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| GL1C | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| GL2A | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| GL2C | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| GRBM | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| GRBMH | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | 
++-------------+------+--------+--------+--------+-------+-------+-------+ +| GRBM_SE | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| GUS | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| MC_VM_L2 | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| RPB | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| SDMA | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| SPI | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| SQ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| SQ_CS | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| SX | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| TA | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| TCA | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| TCC | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| TCP | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ++-------------+------+--------+--------+--------+-------+-------+-------+ +| TD | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ++-------------+------+--------+--------+--------+-------+-------+-------+ + +Refer to the SDK documentation for the most up-to-date list of supported +architectures.