Skip to content

Commit dea2448

Browse files
committed
Profile layer: Implement skeleton
1 parent ec86a80 commit dea2448

22 files changed

+3779
-0
lines changed

layer_gpu_profile/CMakeLists.txt

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# SPDX-License-Identifier: MIT
2+
# -----------------------------------------------------------------------------
3+
# Copyright (c) 2024-2025 Arm Limited
4+
#
5+
# Permission is hereby granted, free of charge, to any person obtaining a copy
6+
# of this software and associated documentation files (the "Software"), to
7+
# deal in the Software without restriction, including without limitation the
8+
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
9+
# sell copies of the Software, and to permit persons to whom the Software is
10+
# furnished to do so, subject to the following conditions:
11+
#
12+
# The above copyright notice and this permission notice shall be included in
13+
# all copies or substantial portions of the Software.
14+
#
15+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
# SOFTWARE.
22+
# -----------------------------------------------------------------------------
23+
24+
cmake_minimum_required(VERSION 3.19)

# Build as C++20. CMAKE_CXX_STANDARD_REQUIRED makes configuration fail if the
# compiler cannot provide that standard, instead of silently decaying to an
# older one.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

project(VkLayerGPUProfile VERSION 1.0.0)

# Common configuration
set(LGL_LOG_TAG "VkLayerGPUProfile")
set(LGL_CONFIG_TRACE 0)
set(LGL_CONFIG_LOG 1)

# Shared helper modules
# NOTE(review): presumably these define lgl_set_build_options() and
# add_clang_tools(), which source/CMakeLists.txt calls - confirm
include(../source_common/compiler_helper.cmake)
include(../cmake/clang-tools.cmake)

# Build steps
# The common libraries live outside of this source tree, so each one is given
# an explicit binary directory inside this project's build tree
add_subdirectory(../source_common/comms source_common/comms)
add_subdirectory(../source_common/framework source_common/framework)
add_subdirectory(../source_common/trackers source_common/trackers)
add_subdirectory(source)

layer_gpu_profile/README_LAYER.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Layer: GPU Profile
2+
3+
This layer is used to capture API correlated profiling data from an Arm GPU.
4+
5+
- - -
6+
7+
_Copyright © 2025, Arm Limited and contributors._

layer_gpu_profile/android_build.sh

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
#!/usr/bin/env bash
2+
# SPDX-License-Identifier: MIT
3+
# ----------------------------------------------------------------------------
4+
# Copyright (c) 2024-2025 Arm Limited
5+
#
6+
# Permission is hereby granted, free of charge, to any person obtaining a copy
7+
# of this software and associated documentation files (the "Software"), to
8+
# deal in the Software without restriction, including without limitation the
9+
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10+
# sell copies of the Software, and to permit persons to whom the Software is
11+
# furnished to do so, subject to the following conditions:
12+
#
13+
# The above copyright notice and this permission notice shall be included in
14+
# all copies or substantial portions of the Software.
15+
#
16+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21+
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22+
# IN THE SOFTWARE.
23+
# ----------------------------------------------------------------------------
24+
25+
# ----------------------------------------------------------------------------
26+
# Configuration
27+
28+
# Exit immediately if any component command errors
set -e

BUILD_DIR_64=build_arm64
BUILD_DIR_PACK=build_package

# ----------------------------------------------------------------------------
# Process command line options

# $1: CMake build type, defaulting to Release when omitted or empty
BUILD_TYPE="${1:-Release}"

# $2: Integer flag, values greater than zero also assemble the release package
PACKAGE="${2:-0}"

# Reject non-integer values up front, as the -gt tests below cannot compare
# them and would otherwise silently fall through to the non-packaging path
if ! [[ "${PACKAGE}" =~ ^-?[0-9]+$ ]]; then
    echo "ERROR: Package option must be an integer, got '${PACKAGE}'" >&2
    exit 1
fi

# The toolchain file is located via ANDROID_NDK_HOME, so fail early with a
# clear message rather than passing a broken path to CMake
if [ -z "${ANDROID_NDK_HOME}" ]; then
    echo "ERROR: ANDROID_NDK_HOME is not set" >&2
    exit 1
fi

if [ "${PACKAGE}" -gt "0" ]; then
    echo "Building a ${BUILD_TYPE} build with packaging"
else
    echo "Building a ${BUILD_TYPE} build without packaging"
fi

# ----------------------------------------------------------------------------
# Build the 64-bit layer
mkdir -p "${BUILD_DIR_64}"
pushd "${BUILD_DIR_64}"

cmake \
    -DCMAKE_SYSTEM_NAME=Android \
    -DANDROID_PLATFORM=29 \
    -DANDROID_ABI=arm64-v8a \
    -DANDROID_TOOLCHAIN=clang \
    -DANDROID_STL=c++_static \
    -DCMAKE_BUILD_TYPE="${BUILD_TYPE}" \
    -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK_HOME}/build/cmake/android.toolchain.cmake" \
    -DCMAKE_WARN_DEPRECATED=OFF \
    ..

# NOTE(review): the build is serialized with -j1; confirm whether this is
# intentional or left over from debugging
make -j1

popd

# ----------------------------------------------------------------------------
# Build the release package
if [ "${PACKAGE}" -gt "0" ]; then
    # Setup the package directories
    mkdir -p "${BUILD_DIR_PACK}/bin/android/arm64"

    # Install the 64-bit layer; the glob is left unquoted so that it expands
    cp "${BUILD_DIR_64}"/source/*.so "${BUILD_DIR_PACK}/bin/android/arm64"
fi
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
# Layer: GPU Profile - Developer Documentation
2+
3+
This layer is used to profile Arm GPUs, providing API correlated performance
4+
data. This page provides documentation for developers working on creating and
5+
maintaining the layer.
6+
7+
## Measuring performance

Arm GPUs can run multiple workloads in parallel, if the application pipeline
barriers allow it. This is good for overall frame performance, but it makes
profiling data messy due to cross-talk between unrelated workloads.
18+
19+
For profiling we therefore inject serialization points between workloads to
20+
ensure that data corresponds to a single workload. Note that we can only
21+
serialize within our own application process, so data could still be perturbed
22+
by other processes using the GPU.
23+
24+
### Sampling performance counters
25+
26+
This layer will sample performance counters between each workload but, because
27+
sampling is a CPU-side operation, it must trap back to the CPU to make the
28+
counter sample. The correct way to implement this in Vulkan is to split the
29+
application command buffer into multiple command buffers, each containing a
30+
single workload. However, rewriting the command stream like this is expensive
31+
in terms of CPU overhead caused by the state tracking.
32+
33+
Instead we rely on an undocumented extension supported by Arm GPUs which
34+
allows the CPU to set/wait on events in a submitted but not complete command
35+
buffer. The layer injects a `vkCmdSetEvent(A)` and `vkCmdWaitEvent(B)` pair
36+
between each workload, and then has the reverse `vkWaitEvent(A)` and
37+
`vkSetEvent(B)` pair on the CPU side. The counter sample can be inserted
38+
in between the two CPU-side operations. Note that there is no blocking wait on
39+
an event for the CPU, so `vkWaitEvent()` is really a polling loop around
40+
`vkGetEventStatus()`.
41+
42+
```mermaid
43+
sequenceDiagram
44+
actor CPU
45+
actor GPU
46+
CPU->>CPU: vkGetEventStatus(A)
47+
Note over GPU: Run workload
48+
GPU->>CPU: vkCmdSetEvent(A)
49+
GPU->>GPU: vkCmdWaitEvent(B)
50+
Note over CPU: Take sample
51+
CPU->>GPU: vkSetEvent(B)
52+
Note over GPU: Start next workload
53+
```
54+
55+
### Performance implications
56+
57+
Serializing workloads usually means that individual workloads will run with
58+
lower completion latency, because they are no longer contending for resources.
59+
However, loss of overlap means that overall frame latency will increase.
60+
61+
In addition, serializing workloads and then trapping back to the CPU to
62+
sample performance counters will cause the GPU to go idle waiting for the CPU
63+
to complete the counter sample. This makes the GPU appear underutilized to the
64+
system DVFS governor, which may subsequently decide to reduce the GPU clock
65+
frequency. On pre-production devices we recommend locking CPU, GPU and memory
66+
clock frequencies to avoid this problem.
67+
68+
```mermaid
69+
---
70+
displayMode: compact
71+
---
72+
gantt
73+
dateFormat x
74+
axisFormat %Lms
75+
section CPU
76+
Sample: a1, 0, 2ms
77+
Sample: a2, after w1, 2ms
78+
section GPU
79+
Workload 1:w1, after a1, 10ms
80+
Workload 2:w2, after a2, 10ms
81+
```
82+
83+
## Software architecture
84+
85+
The basic architecture for this layer is an extension of the timeline layer,
86+
using a layer command stream (LCS) recorded alongside each command buffer to
87+
define the software operations that the layer needs to perform.
88+
89+
Unlike the timeline layer, which only performs operations synchronously at
90+
submit time, this layer also needs to perform asynchronous sampling operations
91+
associated with each workload after a command buffer has been submitted. To
92+
support this approach the layer tracks the number of workloads submitted
93+
in each command buffer and their debug labels, and hands this over to an
94+
async handler to process as the workloads complete.
95+
96+
To ensure that the async worker gets a predictable workload stream to
97+
instrument, all Vulkan queue submits are serialized on the GPU. As with the
98+
support layer, queue serialization may cause an application to hang if the
99+
application submits command buffers that rely on out-of-order execution to unblock
100+
commands in a submitted command stream. This is only possible if applications
101+
are using timeline semaphores, which allow earlier submits to depend on a later
102+
submit to make forward progress.
103+
104+
## Event handling
105+
106+
To implement this functionality, the layer allocates three additional sync
107+
primitives.
108+
109+
* A timeline semaphore is allocated to implement queue serialization.
110+
* Two events are allocated to support the CPU<->GPU handover for counter
111+
sampling. These events are reset and reused for all counter samples to avoid
112+
managing many different events.
113+
114+
```c
115+
CPU GPU
116+
=== ===
117+
// Workload 1
118+
vkCmdSetEvent(A)
119+
// Spin test until set
120+
vkGetEventStatus(A)
121+
vkResetEvent(A)
122+
123+
// Sample counters
124+
125+
vkSetEvent(B)
126+
// Block until set
127+
vkCmdWaitEvent(B)
128+
vkCmdResetEvent(B)
129+
130+
// Workload 2
131+
```
132+
133+
Due to buggy interaction between the counter sampling and power management in
134+
some kernel driver versions, Valhall+CSF GPUs prior to r54p0 need a sleep after
135+
successfully waiting on event A and before sampling any counters. Initial
136+
investigations seem to show that the shortest reliable sleep is 3ms, so this is
137+
quite a high overhead for applications with many workloads and therefore should
138+
be enabled conditionally only for CSF GPUs with a driver older than r54p0.
139+
140+
- - -
141+
_Copyright © 2025, Arm Limited and contributors._

layer_gpu_profile/manifest.json

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{
2+
"file_format_version": "1.0.0",
3+
"layer": {
4+
"name": "VK_LAYER_LGL_gpu_profile",
5+
"type": "INSTANCE",
6+
"library_path": "libVkLayerGPUProfile.so",
7+
"api_version": "1.0.0",
8+
"implementation_version": "1",
9+
"description": "Layer for generating Arm GPU profiling data"
10+
}
11+
}
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# SPDX-License-Identifier: MIT
2+
# -----------------------------------------------------------------------------
3+
# Copyright (c) 2024-2025 Arm Limited
4+
#
5+
# Permission is hereby granted, free of charge, to any person obtaining a copy
6+
# of this software and associated documentation files (the "Software"), to
7+
# deal in the Software without restriction, including without limitation the
8+
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
9+
# sell copies of the Software, and to permit persons to whom the Software is
10+
# furnished to do so, subject to the following conditions:
11+
#
12+
# The above copyright notice and this permission notice shall be included in
13+
# all copies or substantial portions of the Software.
14+
#
15+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
# SOFTWARE.
22+
# -----------------------------------------------------------------------------
23+
24+
# Choose the library target name. Release builds use a "_sym" suffixed target
# and additionally name the stripped library that is derived from it.
set(VK_LAYER VkLayerGPUProfile)
if(CMAKE_BUILD_TYPE STREQUAL "Release")
    set(VK_LAYER VkLayerGPUProfile_sym)
    set(VK_LAYER_STRIP libVkLayerGPUProfile.so)
endif()

# Values substituted into version.hpp.in by configure_file()
set(LGL_LAYER_NAME_STR "VK_LAYER_LGL_gpu_profile")
set(LGL_LAYER_DESC_STR "VkLayerGPUProfile by LGL")

# Generate version.hpp in the build tree; only @VAR@ references are expanded
configure_file(version.hpp.in version.hpp ESCAPE_QUOTES @ONLY)

# The layer shared library: the common framework entry point plus the
# layer-specific sources in this directory
add_library(
    "${VK_LAYER}" SHARED
        ../../source_common/framework/entry.cpp
        device.cpp
        instance.cpp
        layer_device_functions_command_buffer.cpp
        layer_device_functions_command_pool.cpp
        layer_device_functions_debug.cpp
        layer_device_functions_dispatch.cpp
        layer_device_functions_draw_call.cpp
        layer_device_functions_queue.cpp
        layer_device_functions_render_pass.cpp
        layer_device_functions_trace_rays.cpp
        layer_device_functions_transfer.cpp)

# Project headers; the binary directory is included to find the generated
# version.hpp
target_include_directories(
    "${VK_LAYER}" PRIVATE
        ./
        ../../source_common/
        ${CMAKE_CURRENT_BINARY_DIR})

# Third-party headers are added as SYSTEM include directories
target_include_directories(
    "${VK_LAYER}" SYSTEM PRIVATE
        ../../source_third_party/
        ../../source_third_party/khronos/vulkan/include/
        ../../source_third_party/khronos/vulkan-utilities/include/
        ../../source_third_party/protopuf/include/)

lgl_set_build_options("${VK_LAYER}")

# Link the common layer libraries, plus the "log" library on Android only
# NOTE(review): this uses the keyword-less signature; switch to PRIVATE once
# confirmed that lgl_set_build_options() does not use the plain signature on
# this target, as the two forms cannot be mixed
target_link_libraries(
    "${VK_LAYER}"
        lib_layer_comms
        lib_layer_framework
        lib_layer_trackers
        $<$<PLATFORM_ID:Android>:log>)
78+
79+
# Release builds also produce a stripped copy of the layer library, written
# to VK_LAYER_STRIP in this directory's build tree after the target is linked
if(CMAKE_BUILD_TYPE STREQUAL "Release")
    add_custom_command(
        TARGET "${VK_LAYER}" POST_BUILD
        COMMAND "${CMAKE_STRIP}"
        ARGS --strip-all -o ${VK_LAYER_STRIP} $<TARGET_FILE:${VK_LAYER}>
        # Declare the output so that generators can track and clean it
        BYPRODUCTS ${VK_LAYER_STRIP}
        COMMENT "Stripped lib${VK_LAYER}.so to ${VK_LAYER_STRIP}"
        # Escape arguments identically on every platform
        VERBATIM)
endif()

add_clang_tools()

0 commit comments

Comments
 (0)