# Layer: GPU Profile - Developer Documentation

This layer is used to profile Arm GPUs, providing API correlated performance
data. This page provides documentation for developers working on creating and
maintaining the layer.

## Measuring performance

Arm GPUs can run multiple workloads in parallel, if the application pipeline
barriers allow it. This is good for overall frame performance, but it makes
profiling data messy due to cross-talk between unrelated workloads.

For profiling we therefore inject serialization points between workloads to
ensure that data corresponds to a single workload. Note that we can only
serialize within our own application process, so data could still be perturbed
by other processes using the GPU.

### Sampling performance counters

This layer will sample performance counters between each workload but, because
sampling is a CPU-side operation, it must trap back to the CPU to make the
counter sample. The correct way to implement this in Vulkan is to split the
application command buffer into multiple command buffers, each containing a
single workload. However, rewriting the command stream like this is expensive
in terms of CPU overhead caused by the state tracking.

Instead we rely on an undocumented extension supported by Arm GPUs which
allows the CPU to set/wait on events in a submitted but not complete command
buffer. The layer injects a `vkCmdSetEvent(A)` and `vkCmdWaitEvents(B)` pair
between each workload, and then has the reverse `vkWaitEvent(A)` and
`vkSetEvent(B)` pair on the CPU side. The counter sample can be inserted
in between the two CPU-side operations. Note that there is no blocking wait on
an event for the CPU, so `vkWaitEvent()` is really a polling loop around
`vkGetEventStatus()`.

```mermaid
sequenceDiagram
    actor CPU
    actor GPU
    CPU->>CPU: vkGetEventStatus(A)
    Note over GPU: Run workload
    GPU->>CPU: vkCmdSetEvent(A)
    GPU->>GPU: vkCmdWaitEvents(B)
    Note over CPU: Take sample
    CPU->>GPU: vkSetEvent(B)
    Note over GPU: Start next workload
```

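As a sketch of how this handover maps onto the standard Vulkan event API
(the helper names, stage masks, and backoff policy here are assumptions, not
the layer's actual code; the host-side set/wait on a submitted command buffer
relies on the undocumented Arm behavior described above):

```c
#include <stddef.h>
#include <vulkan/vulkan.h>

// Hypothetical: the layer's counter sampling routine.
extern void sample_counters(void);

// GPU side: recorded into the command buffer between two workloads.
static void inject_gpu_handover(VkCommandBuffer cmd, VkEvent eventA, VkEvent eventB)
{
    // Tell the CPU that the preceding workload has finished.
    vkCmdSetEvent(cmd, eventA, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);

    // Park the GPU until the CPU signals B; a host-signaled event must be
    // waited on with the host pseudo-stage as its source.
    vkCmdWaitEvents(cmd, 1, &eventB,
                    VK_PIPELINE_STAGE_HOST_BIT,
                    VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
                    0, NULL, 0, NULL, 0, NULL);
}

// CPU side: runs on the layer's worker thread at each workload boundary.
static void cpu_handover(VkDevice device, VkEvent eventA, VkEvent eventB)
{
    // No blocking host-side wait exists, so poll until the GPU-side
    // vkCmdSetEvent(A) has executed.
    while (vkGetEventStatus(device, eventA) != VK_EVENT_SET)
    {
        // Yield or back off here to avoid burning a CPU core.
    }

    // Reset A ready for the next workload boundary.
    vkResetEvent(device, eventA);

    // Sample the performance counters while the GPU is parked on B.
    sample_counters();

    // Release the GPU-side vkCmdWaitEvents(B).
    vkSetEvent(device, eventB);
}
```
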
### Performance implications

Serializing workloads usually means that individual workloads will run with
lower completion latency, because they are no longer contending for resources.
However, loss of overlap means that overall frame latency will increase.

In addition, serializing workloads and then trapping back to the CPU to
sample performance counters will cause the GPU to go idle waiting for the CPU
to complete the counter sample. This makes the GPU appear underutilized to the
system DVFS governor, which may subsequently decide to reduce the GPU clock
frequency. On pre-production devices we recommend locking CPU, GPU and memory
clock frequencies to avoid this problem.

```mermaid
---
displayMode: compact
---
gantt
    dateFormat x
    axisFormat %Lms
    section CPU
        Sample : a1, 0, 2ms
        Sample : a2, after w1, 2ms
    section GPU
        Workload 1 : w1, after a1, 10ms
        Workload 2 : w2, after a2, 10ms
```

## Software architecture

The basic architecture for this layer is an extension of the timeline layer,
using a layer command stream (LCS) recorded alongside each command buffer to
define the software operations that the layer needs to perform.

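The real LCS encoding is defined by the timeline layer implementation; purely
as an illustration of the idea, each entry might pair an operation with the
workload it applies to (all names here are hypothetical):

```c
#include <stdint.h>

// Illustration only: the kind of software operation an LCS entry might
// record for the layer to replay when processing the command buffer.
typedef enum lcs_opcode
{
    LCS_OP_WORKLOAD_BEGIN,  // A render pass, dispatch, or transfer begins
    LCS_OP_WORKLOAD_END,    // ... and ends; a counter sample goes here
    LCS_OP_LABEL_PUSH,      // A debug label was pushed
    LCS_OP_LABEL_POP        // A debug label was popped
} lcs_opcode;

typedef struct lcs_entry
{
    lcs_opcode op;        // What the layer must do at this point
    uint64_t workloadId;  // Which workload the operation belongs to
} lcs_entry;
```
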
Unlike the timeline layer, which only performs operations synchronously at
submit time, this layer also needs to perform asynchronous sampling operations
associated with each workload after a command buffer has been submitted. To
support this approach the layer tracks the number of workloads submitted
in each command buffer and their debug labels, and hands this over to an
async handler to process as the workloads complete.

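For example, the per-command-buffer state handed to the async handler could be
as simple as the following (all names are hypothetical):

```c
#include <stdint.h>

// Hypothetical per-command-buffer tracking state. One record is appended
// per workload as it is recorded, and the whole list is handed to the
// async handler when the command buffer is submitted.
typedef struct workload_record
{
    uint64_t workloadId;     // Monotonic ID, stable from submit to sample
    const char* debugLabel;  // Innermost debug label at record time
} workload_record;

typedef struct command_buffer_state
{
    uint32_t workloadCount;      // Number of workloads recorded
    workload_record* workloads;  // One record per workload, in order
} command_buffer_state;
```
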
To ensure that the async worker gets a predictable workload stream to
instrument, all Vulkan queue submits are serialized on the GPU. As with the
support layer, queue serialization may cause an application to hang if the
application submits command buffers that rely on out-of-order execution to
unblock commands in a submitted command stream. This is only possible if
applications are using timeline semaphores, which allow earlier submits to
depend on a later submit to make forward progress.

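A minimal sketch of the serialization, assuming a single layer-owned timeline
semaphore whose value advances once per submit (the wrapper is hypothetical,
and merging the application's own semaphores is elided):

```c
#include <stddef.h>
#include <vulkan/vulkan.h>

// Layer-owned state: one timeline semaphore shared by every queue, whose
// value advances by one for each submit.
static VkSemaphore g_serializeTimeline;
static uint64_t g_submitCounter;

// Hypothetical wrapper: make submit N wait on the GPU for submit N-1.
// Merging the application's own wait/signal semaphores and pNext chain is
// elided for brevity; a real layer must preserve both.
static VkResult serialized_queue_submit(
    VkQueue queue, const VkSubmitInfo* appSubmit, VkFence fence)
{
    uint64_t waitValue = g_submitCounter;      // Previous submit's value
    uint64_t signalValue = ++g_submitCounter;  // This submit's value

    VkTimelineSemaphoreSubmitInfo timelineInfo = {
        .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO,
        .waitSemaphoreValueCount = 1,
        .pWaitSemaphoreValues = &waitValue,
        .signalSemaphoreValueCount = 1,
        .pSignalSemaphoreValues = &signalValue,
    };

    VkPipelineStageFlags waitStage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;

    VkSubmitInfo submit = *appSubmit;
    submit.pNext = &timelineInfo;
    submit.waitSemaphoreCount = 1;
    submit.pWaitSemaphores = &g_serializeTimeline;
    submit.pWaitDstStageMask = &waitStage;
    submit.signalSemaphoreCount = 1;
    submit.pSignalSemaphores = &g_serializeTimeline;

    // A real layer would call down the dispatch chain here.
    return vkQueueSubmit(queue, 1, &submit, fence);
}
```

The first submit waits on timeline value 0, which is already satisfied, so
the chain starts without any extra bootstrapping.
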
## Event handling

To implement this functionality, the layer allocates three additional sync
primitives:

* A timeline semaphore is allocated to implement queue serialization.
* Two events are allocated to support the CPU<->GPU handover for counter
  sampling. These events are reset and reused for all counter samples to avoid
  managing many different events.

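Creating these primitives is standard Vulkan; a sketch, with error handling
elided:

```c
#include <stddef.h>
#include <vulkan/vulkan.h>

// Create the layer's sync primitives: a timeline semaphore for queue
// serialization, and two events for the CPU<->GPU handover.
static void create_layer_sync_primitives(
    VkDevice device, VkSemaphore* timeline, VkEvent* eventA, VkEvent* eventB)
{
    VkSemaphoreTypeCreateInfo typeInfo = {
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
        .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE,
        .initialValue = 0,
    };

    VkSemaphoreCreateInfo semaphoreInfo = {
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
        .pNext = &typeInfo,
    };

    vkCreateSemaphore(device, &semaphoreInfo, NULL, timeline);

    VkEventCreateInfo eventInfo = {
        .sType = VK_STRUCTURE_TYPE_EVENT_CREATE_INFO,
    };

    vkCreateEvent(device, &eventInfo, NULL, eventA);
    vkCreateEvent(device, &eventInfo, NULL, eventB);
}
```

The handover between the CPU and the GPU command stream then proceeds as
follows:
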
```c
CPU                       GPU
===                       ===
                          // Workload 1
                          vkCmdSetEvent(A)
// Spin test until set
vkGetEventStatus(A)
vkResetEvent(A)

// Sample counters

vkSetEvent(B)
                          // Block until set
                          vkCmdWaitEvents(B)
                          vkCmdResetEvent(B)

                          // Workload 2
```

Due to a buggy interaction between the counter sampling and power management
in some kernel driver versions, Valhall+CSF GPUs with a driver prior to r54p0
need a sleep after successfully waiting on event A and before sampling any
counters. Initial investigations suggest that the shortest reliable sleep is
3ms, so this is quite a high overhead for applications with many workloads,
and it should therefore be enabled conditionally, only for CSF GPUs with a
driver older than r54p0.

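A sketch of how the workaround might be gated, assuming the driver version has
already been queried and parsed at device creation time (the types and names
here are hypothetical):

```c
#include <stdbool.h>
#include <unistd.h>

// Hypothetical driver version record, parsed once at device creation.
typedef struct driver_version
{
    bool isCSF;   // GPU uses the CSF front-end (Valhall onwards)
    int release;  // The "r" number, e.g. 53 for an r53p0 driver
} driver_version;

// Apply the power-management workaround only where it is needed: after
// event A has been observed as set, and before sampling any counters.
static void pre_sample_workaround(const driver_version* version)
{
    if (version->isCSF && (version->release < 54))
    {
        // 3ms was the shortest reliable sleep in initial testing.
        usleep(3000);
    }
}
```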
|
- - -
_Copyright © 2025, Arm Limited and contributors._