diff --git a/.gitmodules b/.gitmodules index 4a30702..ecd99a7 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,6 @@ [submodule "source_third_party/khronos/vulkan-utilities"] path = source_third_party/khronos/vulkan-utilities url = https://github.com/KhronosGroup/Vulkan-Utility-Libraries/ +[submodule "source_third_party/libGPUCounters"] + path = source_third_party/libGPUCounters + url = https://github.com/ARM-software/libGPUCounters.git diff --git a/layer_gpu_profile/CMakeLists.txt b/layer_gpu_profile/CMakeLists.txt new file mode 100644 index 0000000..e2d2bed --- /dev/null +++ b/layer_gpu_profile/CMakeLists.txt @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: MIT +# ----------------------------------------------------------------------------- +# Copyright (c) 2024-2025 Arm Limited +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# ----------------------------------------------------------------------------- + +cmake_minimum_required(VERSION 3.19) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +project(VkLayerGPUProfile VERSION 1.0.0) + +# Common configuration +set(LGL_LOG_TAG "VkLayerGPUProfile") +set(LGL_CONFIG_TRACE 0) +set(LGL_CONFIG_LOG 1) + +include(../source_common/compiler_helper.cmake) +include(../cmake/clang-tools.cmake) + +# Build steps +add_subdirectory(../source_third_party/libGPUCounters source_third_party/libGPUCounters) + +add_subdirectory(../source_common/comms source_common/comms) +add_subdirectory(../source_common/framework source_common/framework) +add_subdirectory(../source_common/trackers source_common/trackers) + +add_subdirectory(source) diff --git a/layer_gpu_profile/README_LAYER.md b/layer_gpu_profile/README_LAYER.md new file mode 100644 index 0000000..b78f6cc --- /dev/null +++ b/layer_gpu_profile/README_LAYER.md @@ -0,0 +1,134 @@ +# Layer: GPU Profile + +This layer is a frame profiler that can capture per workload performance +counters for selected frames running on an Arm GPU. + +## What devices are supported? + +This layer requires Vulkan 1.0 and an Arm GPU because it uses an Arm-specific +counter sampling library. + +## What data can be collected? + +The layer serializes workloads for instrumented frames and injects counter +samples between them, allowing the layer to measure the hardware cost of +render passes, compute dispatches, transfers, etc. + +The serialization is very invasive to wall-clock performance, due to removal +of pipeline overlap between workloads and additional GPU idle time waiting for +the layer to perform each performance counter sampling operation. This will +have an impact on the counter data being captured! 
+ +Derived counters that show queue and functional unit utilization as a +percentage of the overall "active" time of their parent block will report low +because of time spent refilling and then draining the GPU pipeline between +workloads. The overall _GPU Active Cycles_ counter is known to be unreliable, +because the serialization means that command stream setup and teardown costs +are not hidden in the shadow of surrounding work. We recommend using the +individual queue active cycles counters as the main measure of performance. + +Note that any counter that measures direct work, such as architectural issue +cycles, or workload nouns, such as primitives or threads, should be unaffected +by the loss of pipelining. + +Arm GPUs provide a wide range of performance counters covering many different +aspects of hardware performance. The layer will collect a standard set of +counters by default but, with source modification, can collect any of the +hardware counters and derived expressions supported by the +[libGPUCounters][LGC] library that Arm provides on GitHub. + +[LGC]: https://github.com/ARM-software/libGPUCounters + +### GPU clock frequency impact + +The GPU idle time waiting for the CPU to take a counter sample can cause the +system DVFS power governor to decide that the GPU is not busy. In production +devices we commonly see that the GPU will be down-clocked during the +instrumented frame, which may have an impact on a subset of the available +performance counters. + +When running on a pre-production device we recommend pinning CPU, GPU, and bus +clock speeds to avoid the performance instability. + +## How do I use the layer? + +### Prerequisites + +Device setup steps: + +* Ensure your Android device is in developer mode, with `adb` support enabled + in developer settings. +* Ensure the Android device is connected to your development workstation, and + visible to `adb` with an authorized debug connection. 
+ +Application setup steps: + +* Build a debuggable build of your application and install it on the Android + device. + +Tooling setup steps: + +* Install the Android platform tools and ensure `adb` is on your `PATH` + environment variable. +* Install the Android NDK and set the `ANDROID_NDK_HOME` environment variable + to its installation path. + +### Layer build + +Build the Profile layer for Android using the provided build script, or using +equivalent manual commands, from the `layer_gpu_profile` directory. For full +instructions see the _Build an Android layer_ and _Build a Linux layer_ +sections in the [Build documentation](../docs/building.md). + +### Running using the layer + +You can configure a device to run a profile by using the Android helper utility +found in the root directory to configure the layer and manage the application. +You must enable the profile layer, and provide a configuration file to +parameterize it. + +```sh +python3 lgl_android_install.py --layer layer_gpu_profile --config --profile +``` + +The [`layer_config.json`](layer_config.json) file in this directory is a +template configuration file you can start from. It defaults to periodic +sampling every 600 frames, but you can modify this to suit your needs. + +The `--profile` option specifies an output directory on the host to contain +the CSV files written by the tool. One CSV is written for each frame, each CSV +containing a table with one row per workload profiled in the frame, listed +in API submit order. + +The Android helper utility contains many other options for configuring the +application under test and the capture process. For full instructions see the +[Running on Android documentation](../docs/running_android.md). + +## Layer configuration + +The current layer supports two `sample_mode` values: + +* `periodic_frame`: Sample every N frames. +* `frame_list`: Sample specific frames. 
+ +When `sample_mode` is `periodic_frame` the integer value of the `periodic_frame` key +defines the frame sampling period. The integer value of the +`periodic_min_frame` key defines the first possible frame that could be +profiled, allowing profiles to skip over any loading frames. By default frame 0 +is ignored. + +When `sample_mode` is `frame_list` the value of the `frame_list` key defines a list +of integers giving the specific frames to capture. + +## Layer counters + +The current layer uses a hard-coded set of performance counters defined in the +`Device` class constructor. If you wish to collect different counters you must +edit the [Device source](./source/device.cpp) and rebuild the layer. + +Any counters that are specified but that are not available on the current GPU +will be ignored. + +- - - + +_Copyright © 2025, Arm Limited and contributors._ diff --git a/layer_gpu_profile/android_build.sh b/layer_gpu_profile/android_build.sh new file mode 100644 index 0000000..6ec8c85 --- /dev/null +++ b/layer_gpu_profile/android_build.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: MIT +# ---------------------------------------------------------------------------- +# Copyright (c) 2024-2025 Arm Limited +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. +# ---------------------------------------------------------------------------- + +# ---------------------------------------------------------------------------- +# Configuration + +# Exit immediately if any component command errors +set -e + +BUILD_DIR_64=build_arm64 +BUILD_DIR_PACK=build_package + +# ---------------------------------------------------------------------------- +# Process command line options +if [ "$#" -lt 1 ]; then + BUILD_TYPE=Release +else + BUILD_TYPE=$1 +fi + +# Process command line options +if [ "$#" -lt 2 ]; then + PACKAGE=0 +else + PACKAGE=$2 +fi + +if [ "${PACKAGE}" -gt "0" ]; then + echo "Building a ${BUILD_TYPE} build with packaging" +else + echo "Building a ${BUILD_TYPE} build without packaging" +fi + +# ---------------------------------------------------------------------------- +# Build the 64-bit layer +mkdir -p ${BUILD_DIR_64} +pushd ${BUILD_DIR_64} + +cmake \ + -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_PLATFORM=29 \ + -DANDROID_ABI=arm64-v8a \ + -DANDROID_TOOLCHAIN=clang \ + -DANDROID_STL=c++_static \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK_HOME}/build/cmake/android.toolchain.cmake" \ + -DCMAKE_WARN_DEPRECATED=OFF \ + .. 
+ +make -j16 + +popd + +# ---------------------------------------------------------------------------- +# Build the release package +if [ "${PACKAGE}" -gt "0" ]; then + # Setup the package directories + mkdir -p ${BUILD_DIR_PACK}/bin/android/arm64 + + # Install the 64-bit layer + cp ${BUILD_DIR_64}/source/*.so ${BUILD_DIR_PACK}/bin/android/arm64 +fi diff --git a/layer_gpu_profile/docs/developer-docs.md b/layer_gpu_profile/docs/developer-docs.md new file mode 100644 index 0000000..84bce10 --- /dev/null +++ b/layer_gpu_profile/docs/developer-docs.md @@ -0,0 +1,127 @@ +# Layer: GPU Profile - Developer Documentation + +This layer is used to profile Arm GPUs, providing API correlated performance +data. This page provides documentation for developers working on creating and +maintaining the layer. + +## Measuring performance + +Arm GPUs can run multiple workloads in parallel, if the application pipeline +barriers allow it. This is good for overall frame performance, but it makes +profiling data messy due to cross-talk between unrelated workloads. + +For profiling we therefore inject serialization points between workloads to +ensure that data corresponds to a single workload. Note that we can only +serialize within the current application process, so data could still be +perturbed by other processes using the GPU. + +### Sampling performance counters + +This layer will sample performance counters between each workload but, because +sampling is a CPU-side operation, it must trap back to the CPU to make the +counter sample. The correct way to implement this in Vulkan is to split the +application command buffer into multiple command buffers, each containing a +single workload. However, rewriting the command stream like this is expensive +in terms of CPU overhead caused by the state tracking. + +Instead we rely on an undocumented extension supported by Arm GPUs which +allows the CPU to set/wait on events in a submitted but not complete command +buffer. 
The layer injects a `vkCmdSetEvent(A)` and `vkCmdWaitEvent(B)` pair +between each workload in the command buffer, and then has the reverse +`vkWaitEvent(A)` and `vkSetEvent(B)` pair on the CPU side. The counter sample +can be inserted in between the two CPU-side operations. Note that there is no +blocking CPU-side wait for an event so `vkWaitEvent()` is really a polling loop +around `vkGetEventStatus()`. + +```mermaid +sequenceDiagram + actor CPU + actor GPU + CPU->>CPU: vkGetEventStatus(A) + Note over GPU: Run workload + GPU->>CPU: vkCmdSetEvent(A) + GPU->>GPU: vkCmdWaitEvent(B) + Note over CPU: Take sample + CPU->>GPU: vkSetEvent(B) + Note over GPU: Start next workload +``` + +### Performance implications + +Serializing workloads usually means that individual workloads will run with +lower completion latency, because they are no longer contending for resources. +However, loss of pipelining and overlap means that overall frame latency will +increase compared to a well overlapped scenario. + +In addition, serializing workloads and then trapping back to the CPU to +sample performance counters will cause the GPU to go idle waiting for the CPU +to complete the counter sample. This makes the GPU appear underutilized to the +system DVFS governor, which may subsequently decide to reduce the GPU clock +frequency. On pre-production devices we recommend locking CPU, GPU and memory +clock frequencies to avoid this problem. + +```mermaid +--- +displayMode: compact +--- +gantt + dateFormat x + axisFormat %Lms + section CPU + Sample: a1, 0, 2ms + Sample: a2, after w1, 2ms + section GPU + Workload 1:w1, after a1, 10ms + Workload 2:w2, after a2, 10ms +``` + +## Software architecture + +The basic architecture for this layer is an extension of the timeline layer, +using a layer command stream (LCS) recorded alongside each command buffer to +define the software operations that the layer needs to perform at submit time. 
+ +Because counter sampling is handled synchronously on the CPU when a frame is +being profiled, the layer handles each `vkQueueSubmit` and its associated +counter samples synchronously at submit time before returning to the +application. When sampling the layer retains the layer lock when calling into +the driver, ensuring that only one thread at a time can process a submit that +makes counter samples. + +## Event handling + +To implement this functionality, the layer allocates two `VkEvent` objects to +support the CPU<->GPU handover for counter sampling. These events are reset and +reused for all counter samples to avoid managing many different events. + +```c +CPU GPU +=== === + // Workload 1 + vkCmdSetEvent(A) +// Spin test until set +vkGetEventStatus(A) +vkResetEvent(A) + +// Sample counters + +vkSetEvent(B) + // Block until set + vkCmdWaitEvent(B) + vkCmdResetEvent(B) + + // Workload 2 +``` + +Due to an erratum in the interaction between the counter sampling and power +management in some older kernel driver versions, Arm GPUs with the CSF frontend +and a driver older than r54p0 need a sleep after successfully waiting on +event A and before sampling any counters. + +Initial investigations seem to show that the shortest reliable sleep is 3ms, so +this is quite a high overhead for applications with many workloads and +therefore should be enabled conditionally only for CSF GPUs with a driver older +than r54p0. 
+ +- - - +_Copyright © 2025, Arm Limited and contributors._ diff --git a/layer_gpu_profile/layer_config.json b/layer_gpu_profile/layer_config.json new file mode 100644 index 0000000..c24a31a --- /dev/null +++ b/layer_gpu_profile/layer_config.json @@ -0,0 +1,7 @@ +{ + "layer": "VK_LAYER_LGL_gpu_profile", + "sample_mode": "periodic_frame", + "periodic_min_frame": 1, + "periodic_frame": 600, + "frame_list": [] +} diff --git a/layer_gpu_profile/manifest.json b/layer_gpu_profile/manifest.json new file mode 100644 index 0000000..411a4ae --- /dev/null +++ b/layer_gpu_profile/manifest.json @@ -0,0 +1,11 @@ +{ + "file_format_version": "1.0.0", + "layer": { + "name": "VK_LAYER_LGL_gpu_profile", + "type": "INSTANCE", + "library_path": "libVkLayerGPUProfile.so", + "api_version": "1.0.0", + "implementation_version": "1", + "description": "Layer for generating Arm GPU profiling data" + } +} diff --git a/layer_gpu_profile/source/CMakeLists.txt b/layer_gpu_profile/source/CMakeLists.txt new file mode 100644 index 0000000..c0cb5f8 --- /dev/null +++ b/layer_gpu_profile/source/CMakeLists.txt @@ -0,0 +1,99 @@ +# SPDX-License-Identifier: MIT +# ----------------------------------------------------------------------------- +# Copyright (c) 2024-2025 Arm Limited +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ----------------------------------------------------------------------------- + +# Set output file names +if (CMAKE_BUILD_TYPE STREQUAL "Release") + set(VK_LAYER VkLayerGPUProfile_sym) + set(VK_LAYER_STRIP libVkLayerGPUProfile.so) +else() + set(VK_LAYER VkLayerGPUProfile) +endif() + +# Set strings used by configure +set(LGL_LAYER_NAME_STR "VK_LAYER_LGL_gpu_profile") +set(LGL_LAYER_DESC_STR "VkLayerGPUProfile by LGL") + +# Vulkan layer library +configure_file( + version.hpp.in + version.hpp + ESCAPE_QUOTES @ONLY) + +add_library( + ${VK_LAYER} SHARED + ../../source_common/framework/entry.cpp + device.cpp + instance.cpp + layer_comms.cpp + layer_config.cpp + layer_device_functions_command_buffer.cpp + layer_device_functions_command_pool.cpp + layer_device_functions_debug.cpp + layer_device_functions_dispatch.cpp + layer_device_functions_queue.cpp + layer_device_functions_render_pass.cpp + layer_device_functions_trace_rays.cpp + layer_device_functions_transfer.cpp + submit_visitor.cpp) + +target_include_directories( + ${VK_LAYER} PRIVATE + ./ + ../../source_common/ + ${CMAKE_CURRENT_BINARY_DIR}) + +target_include_directories( + ${VK_LAYER} SYSTEM PRIVATE + ../../source_third_party/ + ../../source_third_party/khronos/vulkan/include/ + ../../source_third_party/khronos/vulkan-utilities/include/ + ../../source_third_party/libGPUCounters/backend/device/include/ + ../../source_third_party/libGPUCounters/hwcpipe/include/ + ../../source_third_party/protopuf/include/) + +# We use libGPUCounters in 
the device structure, so add to framework includes +target_include_directories( + lib_layer_framework SYSTEM PRIVATE + ../../source_third_party/libGPUCounters/backend/device/include/ + ../../source_third_party/libGPUCounters/hwcpipe/include/) + +lgl_set_build_options(${VK_LAYER}) + +target_link_libraries( + ${VK_LAYER} + lib_layer_comms + lib_layer_framework + lib_layer_trackers + device + hwcpipe + $<$:log>) + +if (CMAKE_BUILD_TYPE STREQUAL "Release") + add_custom_command( + TARGET "${VK_LAYER}" POST_BUILD + COMMAND ${CMAKE_STRIP} + ARGS --strip-all -o ${VK_LAYER_STRIP} $ + COMMENT "Stripped lib${VK_LAYER}.so to ${VK_LAYER_STRIP}") +endif() + +add_clang_tools() diff --git a/layer_gpu_profile/source/device.cpp b/layer_gpu_profile/source/device.cpp new file mode 100644 index 0000000..b040669 --- /dev/null +++ b/layer_gpu_profile/source/device.cpp @@ -0,0 +1,226 @@ +/* + * SPDX-License-Identifier: MIT + * ---------------------------------------------------------------------------- + * Copyright (c) 2024-2025 Arm Limited + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * ---------------------------------------------------------------------------- + */ +#include +#include +#include + +#include + +#include "comms/comms_module.hpp" +#include "device.hpp" +#include "framework/manual_functions.hpp" +#include "framework/utils.hpp" +#include "instance.hpp" + +using json = nlohmann::json; + +/** + * @brief The dispatch lookup for all of the created Vulkan devices. + */ +static std::unordered_map> g_devices; + +/* See header for documentation. */ +const std::vector Device::createInfoPatches {}; + +/* See header for documentation. */ +std::unique_ptr Device::commsModule; + +/* See header for documentation. */ +std::unique_ptr Device::commsWrapper; + +/* See header for documentation. */ +void Device::store(VkDevice handle, std::unique_ptr device) +{ + void* key = getDispatchKey(handle); + g_devices.insert({key, std::move(device)}); +} + +/* See header for documentation. */ +Device* Device::retrieve(VkDevice handle) +{ + void* key = getDispatchKey(handle); + assert(isInMap(key, g_devices)); + return g_devices.at(key).get(); +} + +/* See header for documentation. */ +Device* Device::retrieve(VkQueue handle) +{ + void* key = getDispatchKey(handle); + assert(isInMap(key, g_devices)); + return g_devices.at(key).get(); +} + +/* See header for documentation. */ +Device* Device::retrieve(VkCommandBuffer handle) +{ + void* key = getDispatchKey(handle); + assert(isInMap(key, g_devices)); + return g_devices.at(key).get(); +} + +/* See header for documentation. */ +void Device::destroy(Device* device) +{ + g_devices.erase(getDispatchKey(device)); +} + +/* See header for documentation. 
*/ +Device::Device(Instance* _instance, + VkPhysicalDevice _physicalDevice, + VkDevice _device, + PFN_vkGetDeviceProcAddr nlayerGetProcAddress, + const VkDeviceCreateInfo& createInfo) + : instance(_instance), + physicalDevice(_physicalDevice), + device(_device) +{ + UNUSED(createInfo); + + initDriverDeviceDispatchTable(device, nlayerGetProcAddress, driver); + + // Emit a log if debug utils entry points did not load. In this scenario + // the layer will still be loaded and send metadata packets to the server + // socket, but the Perfetto data will not contain any tag labels. We will + // therefore be unable to cross-reference the two data streams to produce a + // usable visualization. + if (!driver.vkCmdBeginDebugUtilsLabelEXT) + { + LAYER_LOG(" - ERROR: Device does not expose VK_EXT_debug_utils"); + LAYER_LOG(" Profiling will not contain debug labels"); + } + + // Init the shared comms module for the first device built + if (!commsModule) + { + commsModule = std::make_unique("lglcomms"); + commsWrapper = std::make_unique(*commsModule); + } + + // Create events for CPU<>GPU synchronization + VkEventCreateInfo eventCreateInfo { + .sType = VK_STRUCTURE_TYPE_EVENT_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + }; + + auto result = driver.vkCreateEvent(device, &eventCreateInfo, nullptr, &gpuToCpuEvent); + if (result != VK_SUCCESS) + { + LAYER_ERR("Failed vkCreateEvent() for gpu->cpu synchronization"); + } + + result = driver.vkCreateEvent(device, &eventCreateInfo, nullptr, &cpuToGpuEvent); + if (result != VK_SUCCESS) + { + LAYER_ERR("Failed vkCreateEvent() for cpu->gpu synchronization"); + } + + // Create the counter context + lgcGpu = std::make_unique(0); + if (!lgcGpu->valid()) + { + LAYER_ERR("Failed libGPUCounters GPU context creation"); + return; + } + + // Create the counter sampler config + auto config = hwcpipe::sampler_config(*lgcGpu.get()); + + LAYER_LOG("Configuring libGPUCounters:"); + + // Queue cycles, not all of these are available on all GPUs - the 
ones that + // are not available will be transparently dropped + addCounter(config, MaliCompQueueActiveCy, "Compute queue active cycles"); + addCounter(config, MaliVertQueueActiveCy, "Vertex queue active cycles"); + addCounter(config, MaliBinningQueueActiveCy, "Binning phase queue active cycles"); + addCounter(config, MaliNonFragQueueActiveCy, "Non-fragment queue active cycles"); + addCounter(config, MaliFragQueueActiveCy, "Fragment queue active cycles"); + addCounter(config, MaliMainQueueActiveCy, "Main phase queue active cycles"); + + // External bandwidth + addCounter(config, MaliExtBusRdBy, "External read bytes"); + addCounter(config, MaliExtBusWrBy, "External write bytes"); + + // Primitive counts + addCounter(config, MaliGeomTotalPrim, "Input primitives"); + addCounter(config, MaliGeomVisiblePrim, "Visible primitives"); + + // Thread counts + addCounter(config, MaliNonFragThread, "Non-fragment threads"); + addCounter(config, MaliFragThread, "Fragment threads"); + + // Functional unit counters + // TODO HIVE-1307: Currently libGPUCounters doesn't expose a MaliALUIssueCy + // counter, so we use instruction counts as a measure of relative + // arithmetic complexity across workloads, but note that it is not directly + // comparable with the other "* unit cycles" counters. 
+ addCounter(config, MaliEngInstr, "Arithmetic unit instructions"); + addCounter(config, MaliEngFMAInstr, "Arithmetic unit FMA instructions"); + addCounter(config, MaliEngCVTInstr, "Arithmetic unit CVT instructions"); + addCounter(config, MaliEngSFUInstr, "Arithmetic unit SFU instructions"); + addCounter(config, MaliVarIssueCy, "Varying unit cycles"); + addCounter(config, MaliTexIssueCy, "Texture unit cycles"); + addCounter(config, MaliLSIssueCy, "Load/store unit cycles"); + + // Create the counter sampler and set it running + lgcSampler = std::make_unique>(config); + auto ec = lgcSampler->start_sampling(); + if (ec) + { + LAYER_ERR("Failed libGPUCounters GPU sampler creation"); + } + + // Configure frame selection here so we can profile frame zero + isFrameOfInterest = instance->config.isFrameOfInterest(0); + + // Start the next frame if it is "of interest" + if (isFrameOfInterest) + { + json startFrameMessage { + { "type", "start_frame" }, + { "frame", 0 }, + }; + + txMessage(startFrameMessage.dump()); + } +} + +void Device::addCounter( + hwcpipe::sampler_config& config, + hwcpipe_counter counterID, + const char* counterName +) { + auto ec = config.add_counter(counterID); + if (ec) + { + LAYER_LOG(" - %s not available", counterName); + } + else + { + LAYER_LOG(" + %s selected", counterName); + lgcActiveCounters.emplace_back(counterID, counterName); + } +} diff --git a/layer_gpu_profile/source/device.hpp b/layer_gpu_profile/source/device.hpp new file mode 100644 index 0000000..b52a47a --- /dev/null +++ b/layer_gpu_profile/source/device.hpp @@ -0,0 +1,249 @@ +/* + * SPDX-License-Identifier: MIT + * ---------------------------------------------------------------------------- + * Copyright (c) 2024-2025 Arm Limited + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to 
use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * ---------------------------------------------------------------------------- + */ + +/** + * @file Declares the root class for layer management of VkDevice objects. + * + * Role summary + * ============ + * + * Devices represent the core context used by the application to connect to the + * underlying graphics driver. A device object is the dispatch root for the + * Vulkan driver, so device commands all take some form of dispatchable handle + * that can be resolved into a unique per-device key. For the driver this key + * would simply be a pointer directly to the driver-internal device object, but + * for our layer we use a device dispatch key as an index in to the map to find + * the layer's driver object. + * + * Key properties + * ============== + * + * Vulkan devices are designed to be used concurrently by multiple application + * threads. An application can have multiple concurrent devices, and use each + * device from multiple threads. + * + * Access to the layer driver structures must therefore be kept thread-safe. 
+ * For sake of simplicity, we generally implement this by: + * - Holding a global lock whenever any thread is inside layer code. + * - Releasing the global lock whenever the layer calls a driver function. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +#include +#include + +#include "layer_comms.hpp" +#include "comms/comms_module.hpp" +#include "framework/device_dispatch_table.hpp" +#include "instance.hpp" +#include "trackers/device.hpp" + +/** + * @brief Function pointer type for patching VkDeviceCreateInfo. + */ +using DeviceCreatePatchPtr = void (*)(Instance& instance, + VkPhysicalDevice physicalDevice, + vku::safe_VkDeviceCreateInfo& createInfo, + std::vector& supported); + +/** + * @brief This class implements the layer state tracker for a single device. + */ +class Device +{ +public: + /** + * @brief Store a new device into the global store of dispatchable devices. + * + * @param handle The dispatchable device handle to use as an indirect key. + * @param device The @c Device object to store. + */ + static void store(VkDevice handle, std::unique_ptr device); + + /** + * @brief Fetch a device from the global store of dispatchable devices. + * + * @param handle The dispatchable device handle to use as an indirect lookup. + * + * @return The layer device context. + */ + static Device* retrieve(VkDevice handle); + + /** + * @brief Fetch a device from the global store of dispatchable devices. + * + * @param handle The dispatchable queue handle to use as an indirect lookup. + * + * @return The layer device context. + */ + static Device* retrieve(VkQueue handle); + + /** + * @brief Fetch a device from the global store of dispatchable devices. + * + * @param handle The dispatchable command buffer handle to use as an indirect lookup. + * + * @return The layer device context. + */ + static Device* retrieve(VkCommandBuffer handle); + + /** + * @brief Drop a device from the global store of dispatchable devices. 
+ * + * @param device The device to drop. + */ + static void destroy(Device* device); + + /** + * @brief Create a new layer device object. + * + * Create info is transient, so the constructor must copy what it needs. + * + * @param instance The layer instance object this device is created with. + * @param physicalDevice The physical device this logical device is for. + * @param device The device handle this device is created with. + * @param nlayerGetProcAddress The vkGetDeviceProcAddress function for the driver. + * @param createInfo The create info used to create the device. + */ + Device(Instance* instance, + VkPhysicalDevice physicalDevice, + VkDevice device, + PFN_vkGetDeviceProcAddr nlayerGetProcAddress, + const VkDeviceCreateInfo& createInfo); + + /** + * @brief Destroy this layer device object. + */ + ~Device() = default; + + /** + * @brief Callback for sending some message for the device. + * + * @param message The message to send. + */ + void txMessage(const std::string& message) + { + commsWrapper->txMessage(message); + } + + /** + * @brief Get the cumulative stats for this device. + */ + Tracker::Device& getStateTracker() { return stateTracker; } + +private: + + /** + * @brief Add a counter to the list of counters, if available. + * + * @param samplerConfig The sampler configuration to query. + * @param counterID The counter to add to the configuration. + * @param counterName The human-readable counter name. + */ + void addCounter( + hwcpipe::sampler_config& samplerConfig, + hwcpipe_counter counterID, + const char* counterName); + +public: + /** + * @brief The instance this device is created with. + */ + const Instance* instance; + + /** + * @brief The physical device this device is created with. + */ + const VkPhysicalDevice physicalDevice; + + /** + * @brief The device handle this device is created with. + */ + const VkDevice device; + + /** + * @brief The driver function dispatch table. 
+ */ + DeviceDispatchTable driver {}; + + /** + * @brief The set of VkCreateDeviceInfo patches needed by this layer. + */ + static const std::vector createInfoPatches; + + /** + * @brief Is this frame being profiled? + */ + bool isFrameOfInterest {false}; + + /** + * @brief The event needed to sync execution from GPU to CPU. + */ + VkEvent gpuToCpuEvent; + + /** + * @brief The event needed to sync execution from CPU back to GPU. + */ + VkEvent cpuToGpuEvent; + + /** + * @brief The GPU connection for counter sampling. + */ + std::unique_ptr lgcGpu; + + /** + * @brief The GPU sampler for counter sampling. + */ + std::unique_ptr> lgcSampler; + + /** + * @brief The active GPU counters for sampling. + */ + std::vector> lgcActiveCounters; + +private: + /** + * @brief State tracker for this device. + */ + Tracker::Device stateTracker; + + /** + * @brief Shared network communications module. + */ + static std::unique_ptr commsModule; + + /** + * @brief Shared network communications message encoder. 
+ */ + static std::unique_ptr commsWrapper; +}; diff --git a/layer_gpu_profile/source/device_utils.hpp b/layer_gpu_profile/source/device_utils.hpp new file mode 100644 index 0000000..f0b3415 --- /dev/null +++ b/layer_gpu_profile/source/device_utils.hpp @@ -0,0 +1,149 @@ +/* + * SPDX-License-Identifier: MIT + * ---------------------------------------------------------------------------- + * Copyright (c) 2025 Arm Limited + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * ---------------------------------------------------------------------------- + */ + +#pragma once + +#include "device.hpp" + +#include +#include + +#include + +/** + * @brief Emit workaround sleep if needed. + */ +[[maybe_unused]] static void workaroundDelay() +{ + // We could make this conditional (enable if GPU is CSF and DDK < r54p0). + // However the profile is always going to be invasive, and it's quite a bit + // of added complexity to handle. 
+ std::this_thread::sleep_for(std::chrono::milliseconds(3)); +} + +/** + * @brief Emit the GPU-side trigger/wait for a CPU-side trap. + * + * Note: this relies on an undocumented extension supported by Arm GPUs, which + * allows the CPU to set/wait/reset events in a command buffer after it has + * been submitted to a queue. + * + * @param layer The layer context for the device. + * @param commandBuffer The command buffer we are recording. + */ +[[maybe_unused]] static void emitCPUTrap( + Device& layer, + VkCommandBuffer commandBuffer +) { + // Don't instrument outside of active frame of interest + if(!layer.isFrameOfInterest) + { + return; + } + + // Signal the gpuToCpu to wake the CPU to perform its operation + layer.driver.vkCmdSetEvent( + commandBuffer, + layer.gpuToCpuEvent, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT); + + // Wait for cpuToGpu to wake the GPU after CPU has finished + layer.driver.vkCmdWaitEvents( + commandBuffer, + 1, + &layer.cpuToGpuEvent, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + 0, nullptr, + 0, nullptr, + 0, nullptr); + + // Reset cpuToGpu so it's ready to use again + layer.driver.vkCmdResetEvent( + commandBuffer, + layer.cpuToGpuEvent, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT); +} + +/** + * @brief Perform the CPU-side wait for a CPU-side trap. + * + * Note: this relies on an undocumented extension supported by Arm GPUs, which + * allows the CPU to set/wait/reset events in a command buffer after it has + * been submitted to a queue. + * + * @param layer The layer context for the device. 
+ */ +[[maybe_unused]] static void waitForGPU( + Device& layer +) { + // Wait for gpuToCpu to wake the CPU after GPU has finished + while(true) + { + auto res = layer.driver.vkGetEventStatus(layer.device, layer.gpuToCpuEvent); + if (res == VK_EVENT_SET) + { + break; + } + + if (res != VK_EVENT_RESET) + { + LAYER_LOG("Failed to wait for gpuToCpuEvent"); + } + + // Sleep before polling again + std::this_thread::sleep_for(std::chrono::microseconds(100)); + } + + // Reset gpuToCpu so it's ready to use again + auto res = layer.driver.vkResetEvent(layer.device, layer.gpuToCpuEvent); + if (res != VK_SUCCESS) + { + LAYER_LOG("Failed to reset gpuToCpuEvent"); + } + + // Sleep after event set to workaround counter sync errata on older drivers + workaroundDelay(); +} + +/** + * @brief Perform the CPU-side notify of the GPU after a CPU-side trap. + * + * Note: this relies on an undocumented extension supported by Arm GPUs, which + * allows the CPU to set/wait/reset events in a command buffer after it has + * been submitted to a queue. + * + * @param layer The layer context for the device. 
+ */ +[[maybe_unused]] static void notifyGPU( + Device& layer +) { + // Signal cpuToGpu to wake the GPU to keep processing the command stream + auto res = layer.driver.vkSetEvent(layer.device, layer.cpuToGpuEvent); + if (res != VK_SUCCESS) + { + LAYER_LOG("Failed to notify cpuToGpuEvent"); + } +} diff --git a/layer_gpu_profile/source/instance.cpp b/layer_gpu_profile/source/instance.cpp new file mode 100644 index 0000000..d567bbb --- /dev/null +++ b/layer_gpu_profile/source/instance.cpp @@ -0,0 +1,80 @@ +/* + * SPDX-License-Identifier: MIT + * ---------------------------------------------------------------------------- + * Copyright (c) 2024-2025 Arm Limited + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * ---------------------------------------------------------------------------- + */ + +#include "instance.hpp" + +#include "framework/utils.hpp" + +#include + +/** + * @brief The dispatch lookup for all of the created Vulkan instances. 
+ */ +static std::unordered_map> g_instances; + +/* See header for documentation. */ +const APIVersion Instance::minAPIVersion {1, 1}; + +/* See header for documentation. */ +const std::vector Instance::extraExtensions { + VK_EXT_DEBUG_UTILS_EXTENSION_NAME, +}; + +/* See header for documentation. */ +void Instance::store(VkInstance handle, std::unique_ptr& instance) +{ + void* key = getDispatchKey(handle); + g_instances.insert({key, std::move(instance)}); +} + +/* See header for documentation. */ +Instance* Instance::retrieve(VkInstance handle) +{ + void* key = getDispatchKey(handle); + assert(isInMap(key, g_instances)); + return g_instances.at(key).get(); +} + +/* See header for documentation. */ +Instance* Instance::retrieve(VkPhysicalDevice handle) +{ + void* key = getDispatchKey(handle); + assert(isInMap(key, g_instances)); + return g_instances.at(key).get(); +} + +/* See header for documentation. */ +void Instance::destroy(Instance* instance) +{ + g_instances.erase(getDispatchKey(instance->instance)); +} + +/* See header for documentation. 
*/ +Instance::Instance(VkInstance _instance, PFN_vkGetInstanceProcAddr _nlayerGetProcAddress) + : instance(_instance), + nlayerGetProcAddress(_nlayerGetProcAddress) +{ + initDriverInstanceDispatchTable(instance, nlayerGetProcAddress, driver); +} diff --git a/layer_gpu_profile/source/instance.hpp b/layer_gpu_profile/source/instance.hpp new file mode 100644 index 0000000..878c84c --- /dev/null +++ b/layer_gpu_profile/source/instance.hpp @@ -0,0 +1,143 @@ +/* + * SPDX-License-Identifier: MIT + * ---------------------------------------------------------------------------- + * Copyright (c) 2024-2025 Arm Limited + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * ---------------------------------------------------------------------------- + */ + +/** + * @file + * Declares the root class for layer management of VkInstance objects. 
 + * + * Role summary + * ============ + * + * Instances represent the core context used by the application to connect to + * the OS graphics subsystem prior to connection to a specific device instance. + * An instance object is the dispatch root for the Vulkan subsystem, so + * instance commands all take some form of dispatchable handle that can be + * resolved into a unique per-instance key. For the driver this key would + * simply be a pointer directly to the driver-internal instance object, but for + * our layer we use an instance dispatch key as an index into the map to find + * the layer's instance object. + * + * Key properties + * ============== + * + * Vulkan instances are designed to be used concurrently by multiple + * application threads. An application can have multiple concurrent instances, + * and use each instance from multiple threads. + * + * Access to the layer driver structures must therefore be kept thread-safe. + * For sake of simplicity, we generally implement this by: + * - Holding a global lock whenever any thread is inside layer code. + * - Releasing the global lock whenever the layer calls a driver function. + */ + +#pragma once + +#include "framework/instance_dispatch_table.hpp" +#include "layer_config.hpp" + +#include +#include + +#include +#include + +/** + * @brief This class implements the layer state tracker for a single instance. + */ +class Instance +{ +public: + /** + * @brief Store a new instance into the global store of dispatchable instances. + * + * @param handle The dispatchable instance handle to use as an indirect key. + * @param instance The @c Instance object to store. + */ + static void store(VkInstance handle, std::unique_ptr& instance); + + /** + * @brief Fetch an instance from the global store of dispatchable instances. + * + * @param handle The dispatchable instance handle to use as an indirect lookup. + * + * @return The layer instance context. 
+ */ + static Instance* retrieve(VkInstance handle); + + /** + * @brief Fetch an instance from the global store of dispatchable instances. + * + * @param handle The dispatchable physical device handle to use as an indirect lookup. + * + * @return The layer instance context. + */ + static Instance* retrieve(VkPhysicalDevice handle); + + /** + * @brief Drop an instance from the global store of dispatchable instances. + * + * @param instance The instance to drop. + */ + static void destroy(Instance* instance); + + /** + * @brief Create a new layer instance object. + * + * @param instance The instance handle this instance is created with. + * @param nlayerGetProcAddress The vkGetProcAddress function in the driver/next layer down. + */ + Instance(VkInstance instance, PFN_vkGetInstanceProcAddr nlayerGetProcAddress); + +public: + /** + * @brief The instance handle this instance is created with. + */ + VkInstance instance; + + /** + * @brief The next layer's \c vkGetInstanceProcAddr() function pointer. + */ + PFN_vkGetInstanceProcAddr nlayerGetProcAddress; + + /** + * @brief The driver function dispatch table. + */ + InstanceDispatchTable driver {}; + + /** + * @brief The layer configuration. + */ + const LayerConfig config; + + /** + * @brief The minimum API version needed by this layer. + */ + static const APIVersion minAPIVersion; + + /** + * @brief The minimum set of instance extensions needed by this layer. 
+ */ + static const std::vector extraExtensions; +}; diff --git a/layer_gpu_profile/source/layer_comms.cpp b/layer_gpu_profile/source/layer_comms.cpp new file mode 100644 index 0000000..98740ac --- /dev/null +++ b/layer_gpu_profile/source/layer_comms.cpp @@ -0,0 +1,52 @@ +/* + * SPDX-License-Identifier: MIT + * ---------------------------------------------------------------------------- + * Copyright (c) 2024-2025 Arm Limited + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * ---------------------------------------------------------------------------- + */ + +#include + +#include "layer_comms.hpp" + + +/* See header for documentation. */ +ProfileComms::ProfileComms(Comms::CommsInterface& _comms) + : comms(_comms) +{ + if (comms.isConnected()) + { + endpoint = comms.getEndpointID("GPUProfile"); + } +} + +/* See header for documentation. 
*/ +void ProfileComms::txMessage(const std::string& message) +{ + // Message endpoint is not available + if (endpoint == 0) + { + return; + } + + auto data = std::make_unique(message.begin(), message.end()); + comms.txAsync(endpoint, std::move(data)); +} diff --git a/layer_gpu_profile/source/layer_comms.hpp b/layer_gpu_profile/source/layer_comms.hpp new file mode 100644 index 0000000..1d79d7d --- /dev/null +++ b/layer_gpu_profile/source/layer_comms.hpp @@ -0,0 +1,64 @@ +/* + * SPDX-License-Identifier: MIT + * ---------------------------------------------------------------------------- + * Copyright (c) 2024 Arm Limited + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * ---------------------------------------------------------------------------- + */ + +/** + * @file Declares a simple comms encoder for the profile layer. 
+ */ + +#pragma once + +#include "comms/comms_interface.hpp" + +/** + * @brief A simple message encoder for the profile layer comms endpoint. + */ +class ProfileComms +{ +public: + /** + * @brief Construct a new encoder. + * + * @param comms The common comms module used by all services. + */ + ProfileComms(Comms::CommsInterface& comms); + + /** + * @brief Send a message to the GPU profile endpoint service. + * + * @param message The message to send. + */ + void txMessage(const std::string& message); + +private: + /** + * @brief The endpoint ID of the service, or 0 if not found. + */ + Comms::EndpointID endpoint {0}; + + /** + * @brief The common module for network messaging. + */ + Comms::CommsInterface& comms; +}; diff --git a/layer_gpu_profile/source/layer_config.cpp b/layer_gpu_profile/source/layer_config.cpp new file mode 100644 index 0000000..1154401 --- /dev/null +++ b/layer_gpu_profile/source/layer_config.cpp @@ -0,0 +1,148 @@ +/* + * SPDX-License-Identifier: MIT + * ---------------------------------------------------------------------------- + * Copyright (c) 2025 Arm Limited + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * ---------------------------------------------------------------------------- + */ + +/** + * @file + * Defines a config file to parameterize the layer. + */ + +#include "layer_config.hpp" + +#include "framework/utils.hpp" +#include "utils/misc.hpp" +#include "version.hpp" + +#include +#include +#include + +#include + +/* See header for documentation. */ +void LayerConfig::parseSamplingOptions(const json& config) +{ + // Decode top level options + std::string rawMode = config.at("sample_mode"); + + if (rawMode == "disabled") + { + mode = MODE_DISABLED; + } + else if (rawMode == "periodic_frame") + { + mode = MODE_PERIODIC_FRAME; + periodicFrame = config.at("periodic_frame"); + periodicMinFrame = config.at("periodic_min_frame"); + } + else if (rawMode == "frame_list") + { + mode = MODE_FRAME_LIST; + specificFrames = config.at("frame_list").get>(); + } + else + { + LAYER_ERR("Unknown counter sample_mode: %s", rawMode.c_str()); + rawMode = "disabled"; + } + + LAYER_LOG("Layer sampling configuration"); + LAYER_LOG("============================"); + LAYER_LOG(" - Sample mode: %s", rawMode.c_str()); + + if (mode == MODE_PERIODIC_FRAME) + { + LAYER_LOG(" - Frame period: %" PRIu64, periodicFrame); + LAYER_LOG(" - Minimum frame: %" PRIu64, periodicMinFrame); + } + else if (mode == MODE_FRAME_LIST) + { + std::stringstream result; + std::copy(specificFrames.begin(), specificFrames.end(), std::ostream_iterator(result, " ")); + LAYER_LOG(" - Frames: %s", result.str().c_str()); + } +} + +/* See header for documentation. 
*/ +LayerConfig::LayerConfig() +{ +#ifdef __ANDROID__ + std::string fileName("/data/local/tmp/"); + fileName.append(LGL_LAYER_CONFIG); +#else + std::string fileName(LGL_LAYER_CONFIG); +#endif + + LAYER_LOG("Trying to read config: %s", fileName.c_str()); + + std::ifstream stream(fileName); + if (!stream) + { + LAYER_LOG("Failed to open layer config, using defaults"); + return; + } + + json data; + + try + { + data = json::parse(stream); + } + catch (const json::parse_error& e) + { + LAYER_ERR("Failed to load layer config, using defaults"); + LAYER_ERR("Error: %s", e.what()); + return; + } + + try + { + parseSamplingOptions(data); + } + catch (const json::out_of_range& e) + { + LAYER_ERR("Failed to read feature config, using defaults"); + LAYER_ERR("Error: %s", e.what()); + } +} + +/* See header for documentation. */ +bool LayerConfig::isFrameOfInterest( + uint64_t frameID +) const { + switch(mode) + { + case MODE_DISABLED: + return false; + case MODE_PERIODIC_FRAME: + return (frameID >= periodicMinFrame) && + ((frameID % periodicFrame) == 0); + case MODE_FRAME_LIST: + return isIn(frameID, specificFrames); + } + + // Should never reach here + return false; +} + diff --git a/layer_gpu_profile/source/layer_config.hpp b/layer_gpu_profile/source/layer_config.hpp new file mode 100644 index 0000000..2c54cb3 --- /dev/null +++ b/layer_gpu_profile/source/layer_config.hpp @@ -0,0 +1,100 @@ +/* + * SPDX-License-Identifier: MIT + * ---------------------------------------------------------------------------- + * Copyright (c) 2025 Arm Limited + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the 
following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * ---------------------------------------------------------------------------- + */ + +/** + * @file + * Declares a config file to parameterize the layer. + */ + +#pragma once + +#include +using json = nlohmann::json; + +/** + * @brief This class implements a config interface for this layer. + * + * The layer contains a default config, but users can provide a JSON config + * file on the file system which is loaded at init time. + * - On Android the file is loaded from /data/local/tmp. + * - On Linux the file is loaded from the current working directory. + */ +class LayerConfig +{ +public: + /** + * @brief Create a new layer config. + */ + LayerConfig(); + + /** + * @brief Test if next frame should be profiled. + * + * @param frameID The index of the next frame. + * + * @return True if profiling should be enabled, False otherwise. + */ + bool isFrameOfInterest(uint64_t frameID) const; + +private: + /** + * @brief Supported sampling modes. + */ + enum SamplingMode + { + MODE_DISABLED, + MODE_FRAME_LIST, + MODE_PERIODIC_FRAME + }; + + /** + * @brief Parse the configuration options for the sampling module. + * + * @param config The JSON configuration. + * + * @throws json::out_of_range if required fields are missing. + */ + void parseSamplingOptions(const json& config); + + /** + * @brief The sampling mode. 
+ */ + SamplingMode mode {MODE_DISABLED}; + + /** + * @brief The sampling period in frames, or 0 if disabled. + */ + uint64_t periodicFrame {0}; + + /** + * @brief The minimum frame to sample (inclusive). + */ + uint64_t periodicMinFrame {0}; + + /** + * @brief The sampling frame list, or empty if disabled. + */ + std::vector specificFrames; +}; diff --git a/layer_gpu_profile/source/layer_device_functions.hpp b/layer_gpu_profile/source/layer_device_functions.hpp new file mode 100644 index 0000000..ff9781d --- /dev/null +++ b/layer_gpu_profile/source/layer_device_functions.hpp @@ -0,0 +1,413 @@ +/* + * SPDX-License-Identifier: MIT + * ---------------------------------------------------------------------------- + * Copyright (c) 2024-2025 Arm Limited + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * ---------------------------------------------------------------------------- + */ + +#pragma once + +#include + +// Functions for command pools + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR VkResult VKAPI_CALL layer_vkCreateCommandPool(VkDevice device, + const VkCommandPoolCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkCommandPool* pCommandPool); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR VkResult VKAPI_CALL layer_vkResetCommandPool(VkDevice device, + VkCommandPool commandPool, + VkCommandPoolResetFlags flags); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkDestroyCommandPool(VkDevice device, + VkCommandPool commandPool, + const VkAllocationCallbacks* pAllocator); + +// Functions for command buffers + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR VkResult VKAPI_CALL + layer_vkAllocateCommandBuffers(VkDevice device, + const VkCommandBufferAllocateInfo* pAllocateInfo, + VkCommandBuffer* pCommandBuffers); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR VkResult layer_vkBeginCommandBuffer(VkCommandBuffer commandBuffer, + const VkCommandBufferBeginInfo* pBeginInfo); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdExecuteCommands(VkCommandBuffer commandBuffer, + uint32_t commandBufferCount, + const VkCommandBuffer* pCommandBuffers); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR VkResult VKAPI_CALL layer_vkResetCommandBuffer(VkCommandBuffer commandBuffer, + VkCommandBufferResetFlags flags); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkFreeCommandBuffers(VkDevice device, + VkCommandPool commandPool, + uint32_t commandBufferCount, + const VkCommandBuffer* pCommandBuffers); + +// Functions for render passes + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR VkResult VKAPI_CALL layer_vkCreateRenderPass(VkDevice device, + const VkRenderPassCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkRenderPass* pRenderPass); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR VkResult VKAPI_CALL layer_vkCreateRenderPass2(VkDevice device, + const VkRenderPassCreateInfo2* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkRenderPass* pRenderPass); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR VkResult VKAPI_CALL layer_vkCreateRenderPass2KHR(VkDevice device, + const VkRenderPassCreateInfo2* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkRenderPass* pRenderPass); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkDestroyRenderPass(VkDevice device, + VkRenderPass renderPass, + const VkAllocationCallbacks* pAllocator); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdBeginRenderPass(VkCommandBuffer commandBuffer, + const VkRenderPassBeginInfo* pRenderPassBegin, + VkSubpassContents contents); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdBeginRenderPass2(VkCommandBuffer commandBuffer, + const VkRenderPassBeginInfo* pRenderPassBegin, + const VkSubpassBeginInfo* pSubpassBeginInfo); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdBeginRenderPass2KHR(VkCommandBuffer commandBuffer, + const VkRenderPassBeginInfo* pRenderPassBegin, + const VkSubpassBeginInfo* pSubpassBeginInfo); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdBeginRendering(VkCommandBuffer commandBuffer, + const VkRenderingInfo* pRenderingInfo); + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdBeginRenderingKHR(VkCommandBuffer commandBuffer, + const VkRenderingInfo* pRenderingInfo); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdEndRenderPass(VkCommandBuffer commandBuffer); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdEndRenderPass2(VkCommandBuffer commandBuffer, + const VkSubpassEndInfo* pSubpassEndInfo); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdEndRenderPass2KHR(VkCommandBuffer commandBuffer, + const VkSubpassEndInfo* pSubpassEndInfo); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdEndRendering(VkCommandBuffer commandBuffer); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdEndRenderingKHR(VkCommandBuffer commandBuffer); + +// Functions for compute dispatches + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdDispatch(VkCommandBuffer commandBuffer, + uint32_t groupCountX, + uint32_t groupCountY, + uint32_t groupCountZ); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdDispatchBase(VkCommandBuffer commandBuffer, + uint32_t baseGroupX, + uint32_t baseGroupY, + uint32_t baseGroupZ, + uint32_t groupCountX, + uint32_t groupCountY, + uint32_t groupCountZ); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdDispatchBaseKHR(VkCommandBuffer commandBuffer, + uint32_t baseGroupX, + uint32_t baseGroupY, + uint32_t baseGroupZ, + uint32_t groupCountX, + uint32_t groupCountY, + uint32_t groupCountZ); + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdDispatchIndirect(VkCommandBuffer commandBuffer, + VkBuffer buffer, + VkDeviceSize offset); + +// Commands for trace rays + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdTraceRaysIndirect2KHR(VkCommandBuffer commandBuffer, + VkDeviceAddress indirectDeviceAddress); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL + layer_vkCmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer, + const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable, + VkDeviceAddress indirectDeviceAddress); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL + layer_vkCmdTraceRaysKHR(VkCommandBuffer commandBuffer, + const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable, + uint32_t width, + uint32_t height, + uint32_t depth); + +// Commands for acceleration structure builds + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdBuildAccelerationStructuresIndirectKHR( + VkCommandBuffer commandBuffer, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkDeviceAddress* pIndirectDeviceAddresses, + const uint32_t* pIndirectStrides, + const uint32_t* const* ppMaxPrimitiveCounts); + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdBuildAccelerationStructuresKHR( + VkCommandBuffer commandBuffer, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos); + +// Commands for transfers + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdFillBuffer(VkCommandBuffer commandBuffer, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + VkDeviceSize size, + uint32_t data); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdClearColorImage(VkCommandBuffer commandBuffer, + VkImage image, + VkImageLayout imageLayout, + const VkClearColorValue* pColor, + uint32_t rangeCount, + const VkImageSubresourceRange* pRanges); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdClearDepthStencilImage(VkCommandBuffer commandBuffer, + VkImage image, + VkImageLayout imageLayout, + const VkClearDepthStencilValue* pDepthStencil, + uint32_t rangeCount, + const VkImageSubresourceRange* pRanges); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdCopyBuffer(VkCommandBuffer commandBuffer, + VkBuffer srcBuffer, + VkBuffer dstBuffer, + uint32_t regionCount, + const VkBufferCopy* pRegions); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdCopyBuffer2(VkCommandBuffer commandBuffer, + const VkCopyBufferInfo2* pCopyBufferInfo); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdCopyBuffer2KHR(VkCommandBuffer commandBuffer, + const VkCopyBufferInfo2* pCopyBufferInfo); + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdCopyBufferToImage(VkCommandBuffer commandBuffer, + VkBuffer srcBuffer, + VkImage dstImage, + VkImageLayout dstImageLayout, + uint32_t regionCount, + const VkBufferImageCopy* pRegions); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL + layer_vkCmdCopyBufferToImage2(VkCommandBuffer commandBuffer, + const VkCopyBufferToImageInfo2* pCopyBufferToImageInfo); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL + layer_vkCmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, + const VkCopyBufferToImageInfo2* pCopyBufferToImageInfo); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdCopyImage(VkCommandBuffer commandBuffer, + VkImage srcImage, + VkImageLayout srcImageLayout, + VkImage dstImage, + VkImageLayout dstImageLayout, + uint32_t regionCount, + const VkImageCopy* pRegions); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdCopyImage2(VkCommandBuffer commandBuffer, + const VkCopyImageInfo2* pCopyImageInfo); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdCopyImage2KHR(VkCommandBuffer commandBuffer, + const VkCopyImageInfo2* pCopyImageInfo); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdCopyImageToBuffer(VkCommandBuffer commandBuffer, + VkImage srcImage, + VkImageLayout srcImageLayout, + VkBuffer dstBuffer, + uint32_t regionCount, + const VkBufferImageCopy* pRegions); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL + layer_vkCmdCopyImageToBuffer2(VkCommandBuffer commandBuffer, + const VkCopyImageToBufferInfo2* pCopyImageToBufferInfo); + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL + layer_vkCmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer, + const VkCopyImageToBufferInfo2* pCopyImageToBufferInfo); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL + layer_vkCmdCopyAccelerationStructureKHR(VkCommandBuffer commandBuffer, + const VkCopyAccelerationStructureInfoKHR* pInfo); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL + layer_vkCmdCopyAccelerationStructureToMemoryKHR(VkCommandBuffer commandBuffer, + const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL + layer_vkCmdCopyMemoryToAccelerationStructureKHR(VkCommandBuffer commandBuffer, + const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo); + +// Functions for debug + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdDebugMarkerBeginEXT(VkCommandBuffer commandBuffer, + const VkDebugMarkerMarkerInfoEXT* pMarkerInfo); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdDebugMarkerEndEXT(VkCommandBuffer commandBuffer); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdBeginDebugUtilsLabelEXT(VkCommandBuffer commandBuffer, + const VkDebugUtilsLabelEXT* pLabelInfo); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdEndDebugUtilsLabelEXT(VkCommandBuffer commandBuffer); + +// Functions for queues + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR VkResult VKAPI_CALL layer_vkQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* pPresentInfo); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR VkResult VKAPI_CALL + layer_vkQueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo* pSubmits, VkFence fence); + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR VkResult VKAPI_CALL + layer_vkQueueSubmit2(VkQueue queue, uint32_t submitCount, const VkSubmitInfo2* pSubmits, VkFence fence); + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR VkResult VKAPI_CALL layer_vkQueueSubmit2KHR(VkQueue queue, + uint32_t submitCount, + const VkSubmitInfo2* pSubmits, + VkFence fence); diff --git a/layer_gpu_profile/source/layer_device_functions_command_buffer.cpp b/layer_gpu_profile/source/layer_device_functions_command_buffer.cpp new file mode 100644 index 0000000..de0f4f4 --- /dev/null +++ b/layer_gpu_profile/source/layer_device_functions_command_buffer.cpp @@ -0,0 +1,152 @@ +/* + * SPDX-License-Identifier: MIT + * ---------------------------------------------------------------------------- + * Copyright (c) 2024 Arm Limited + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * ----------------------------------------------------------------------------
+ */
+
+#include "device.hpp"
+#include "framework/device_dispatch_table.hpp"
+
+#include <mutex>
+
+extern std::mutex g_vulkanLock;
+
+/* See Vulkan API for documentation. */
+template<>
+VKAPI_ATTR VkResult VKAPI_CALL
+ layer_vkAllocateCommandBuffers(VkDevice device,
+ const VkCommandBufferAllocateInfo* pAllocateInfo,
+ VkCommandBuffer* pCommandBuffers)
+{
+ LAYER_TRACE(__func__);
+
+ // Hold the lock to access layer-wide global store
+ std::unique_lock lock {g_vulkanLock};
+ auto* layer = Device::retrieve(device);
+
+ // Release the lock to call into the driver
+ lock.unlock();
+ VkResult result = layer->driver.vkAllocateCommandBuffers(device, pAllocateInfo, pCommandBuffers);
+ if (result != VK_SUCCESS)
+ {
+ return result;
+ }
+
+ // Retake the lock to access layer-wide global store
+ lock.lock();
+ auto& tracker = layer->getStateTracker();
+ for (uint32_t i = 0; i < pAllocateInfo->commandBufferCount; i++)
+ {
+ tracker.allocateCommandBuffer(pAllocateInfo->commandPool, pCommandBuffers[i]);
+ }
+
+ return result;
+}
+
+/* See Vulkan API for documentation. */
+template<>
+VKAPI_ATTR VkResult layer_vkBeginCommandBuffer(VkCommandBuffer commandBuffer,
+ const VkCommandBufferBeginInfo* pBeginInfo)
+{
+ // Hold the lock to access layer-wide global store
+ std::unique_lock lock {g_vulkanLock};
+ auto* layer = Device::retrieve(commandBuffer);
+
+ auto& tracker = layer->getStateTracker();
+ auto& cmdBuffer = tracker.getCommandBuffer(commandBuffer);
+ cmdBuffer.reset();
+ cmdBuffer.begin(pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT);
+
+ // Release the lock to call into the driver
+ lock.unlock();
+ return layer->driver.vkBeginCommandBuffer(commandBuffer, pBeginInfo);
+}
+
+/* See Vulkan API for documentation.
*/ +template<> +VKAPI_ATTR VkResult VKAPI_CALL layer_vkResetCommandBuffer(VkCommandBuffer commandBuffer, + VkCommandBufferResetFlags flags) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + auto& tracker = layer->getStateTracker(); + auto& cmdBuffer = tracker.getCommandBuffer(commandBuffer); + cmdBuffer.reset(); + + // Release the lock to call into the driver + lock.unlock(); + return layer->driver.vkResetCommandBuffer(commandBuffer, flags); +} + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkFreeCommandBuffers(VkDevice device, + VkCommandPool commandPool, + uint32_t commandBufferCount, + const VkCommandBuffer* pCommandBuffers) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(device); + + auto& tracker = layer->getStateTracker(); + for (uint32_t i = 0; i < commandBufferCount; i++) + { + tracker.freeCommandBuffer(commandPool, pCommandBuffers[i]); + } + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkFreeCommandBuffers(device, commandPool, commandBufferCount, pCommandBuffers); +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdExecuteCommands(VkCommandBuffer commandBuffer, + uint32_t commandBufferCount, + const VkCommandBuffer* pCommandBuffers) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store and device-wide data + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + auto& tracker = layer->getStateTracker(); + auto& primary = tracker.getCommandBuffer(commandBuffer); + + for (uint32_t i = 0; i < commandBufferCount; i++) + { + auto& secondary = tracker.getCommandBuffer(pCommandBuffers[i]); + primary.executeCommands(secondary); + } + + // Release the lock to call into the main driver + lock.unlock(); + layer->driver.vkCmdExecuteCommands(commandBuffer, commandBufferCount, pCommandBuffers); +} diff --git a/layer_gpu_profile/source/layer_device_functions_command_pool.cpp b/layer_gpu_profile/source/layer_device_functions_command_pool.cpp new file mode 100644 index 0000000..4beb9c7 --- /dev/null +++ b/layer_gpu_profile/source/layer_device_functions_command_pool.cpp @@ -0,0 +1,99 @@ +/* + * SPDX-License-Identifier: MIT + * ---------------------------------------------------------------------------- + * Copyright (c) 2024 Arm Limited + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ * ----------------------------------------------------------------------------
+ */
+
+#include "device.hpp"
+#include "framework/device_dispatch_table.hpp"
+
+#include <mutex>
+
+extern std::mutex g_vulkanLock;
+
+/* See Vulkan API for documentation. */
+template<>
+VKAPI_ATTR VkResult VKAPI_CALL layer_vkCreateCommandPool(VkDevice device,
+ const VkCommandPoolCreateInfo* pCreateInfo,
+ const VkAllocationCallbacks* pAllocator,
+ VkCommandPool* pCommandPool)
+{
+ LAYER_TRACE(__func__);
+
+ // Hold the lock to access layer-wide global store
+ std::unique_lock lock {g_vulkanLock};
+ auto* layer = Device::retrieve(device);
+
+ // Release the lock to call into the driver
+ lock.unlock();
+ VkResult result = layer->driver.vkCreateCommandPool(device, pCreateInfo, pAllocator, pCommandPool);
+ if (result != VK_SUCCESS)
+ {
+ return result;
+ }
+
+ // Retake the lock to access layer-wide global store
+ lock.lock();
+ auto& tracker = layer->getStateTracker();
+ tracker.createCommandPool(*pCommandPool);
+ return result;
+}
+
+/* See Vulkan API for documentation.
*/ +template<> +VKAPI_ATTR VkResult VKAPI_CALL layer_vkResetCommandPool(VkDevice device, + VkCommandPool commandPool, + VkCommandPoolResetFlags flags) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(device); + + auto& tracker = layer->getStateTracker(); + tracker.getCommandPool(commandPool).reset(); + + // Release the lock to call into the driver + lock.unlock(); + return layer->driver.vkResetCommandPool(device, commandPool, flags); +} + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkDestroyCommandPool(VkDevice device, + VkCommandPool commandPool, + const VkAllocationCallbacks* pAllocator) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(device); + + auto& tracker = layer->getStateTracker(); + tracker.destroyCommandPool(commandPool); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkDestroyCommandPool(device, commandPool, pAllocator); +} diff --git a/layer_gpu_profile/source/layer_device_functions_debug.cpp b/layer_gpu_profile/source/layer_device_functions_debug.cpp new file mode 100644 index 0000000..f975b38 --- /dev/null +++ b/layer_gpu_profile/source/layer_device_functions_debug.cpp @@ -0,0 +1,133 @@ +/* + * SPDX-License-Identifier: MIT + * ---------------------------------------------------------------------------- + * Copyright (c) 2024-2025 Arm Limited + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, 
 subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ * ----------------------------------------------------------------------------
+ */
+
+#include "device.hpp"
+#include "framework/device_dispatch_table.hpp"
+
+#include <mutex>
+
+extern std::mutex g_vulkanLock;
+
+/* See Vulkan API for documentation. */
+template<>
+VKAPI_ATTR void VKAPI_CALL layer_vkCmdDebugMarkerBeginEXT(VkCommandBuffer commandBuffer,
+ const VkDebugMarkerMarkerInfoEXT* pMarkerInfo)
+{
+ LAYER_TRACE(__func__);
+
+ // Hold the lock to access layer-wide global store
+ std::unique_lock lock {g_vulkanLock};
+ auto* layer = Device::retrieve(commandBuffer);
+
+ // Only instrument inside active frame of interest
+ if(layer->isFrameOfInterest)
+ {
+ auto& tracker = layer->getStateTracker();
+ auto& cb = tracker.getCommandBuffer(commandBuffer);
+
+ // Push the label scope to the tracker
+ cb.debugMarkerBegin(pMarkerInfo->pMarkerName);
+ }
+
+ // ... and forward to the driver
+ lock.unlock();
+ layer->driver.vkCmdDebugMarkerBeginEXT(commandBuffer, pMarkerInfo);
+}
+
+/* See Vulkan API for documentation.
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdDebugMarkerEndEXT(VkCommandBuffer commandBuffer) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + // Only instrument inside active frame of interest + if(layer->isFrameOfInterest) + { + auto& tracker = layer->getStateTracker(); + auto& cb = tracker.getCommandBuffer(commandBuffer); + + // Pop the label scope in the tracker + cb.debugMarkerEnd(); + } + + // ... and forward to the driver + lock.unlock(); + layer->driver.vkCmdDebugMarkerEndEXT(commandBuffer); +} + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdBeginDebugUtilsLabelEXT(VkCommandBuffer commandBuffer, + const VkDebugUtilsLabelEXT* pLabelInfo) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + // Only instrument inside active frame of interest + if(layer->isFrameOfInterest) + { + auto& tracker = layer->getStateTracker(); + auto& cb = tracker.getCommandBuffer(commandBuffer); + + // Push the label scope to the tracker + cb.debugMarkerBegin(pLabelInfo->pLabelName); + } + + // ... and forward to the driver + lock.unlock(); + layer->driver.vkCmdBeginDebugUtilsLabelEXT(commandBuffer, pLabelInfo); +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdEndDebugUtilsLabelEXT(VkCommandBuffer commandBuffer) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + // Only instrument inside active frame of interest + if(layer->isFrameOfInterest) + { + auto& tracker = layer->getStateTracker(); + auto& cb = tracker.getCommandBuffer(commandBuffer); + + // Pop the label scope in the tracker + cb.debugMarkerEnd(); + } + + // ... and forward to the driver + lock.unlock(); + layer->driver.vkCmdEndDebugUtilsLabelEXT(commandBuffer); +} diff --git a/layer_gpu_profile/source/layer_device_functions_dispatch.cpp b/layer_gpu_profile/source/layer_device_functions_dispatch.cpp new file mode 100644 index 0000000..70c59f5 --- /dev/null +++ b/layer_gpu_profile/source/layer_device_functions_dispatch.cpp @@ -0,0 +1,158 @@ +/* + * SPDX-License-Identifier: MIT + * ---------------------------------------------------------------------------- + * Copyright (c) 2024-2025 Arm Limited + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
 IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ * ----------------------------------------------------------------------------
+ */
+
+#include "device.hpp"
+#include "device_utils.hpp"
+#include "framework/device_dispatch_table.hpp"
+
+#include <mutex>
+
+extern std::mutex g_vulkanLock;
+
+/**
+ * @brief Register a compute dispatch with the tracker.
+ *
+ * @param layer The layer context for the device.
+ * @param commandBuffer The command buffer we are recording.
+ * @param groupX The X size of the dispatch in groups.
+ * @param groupY The Y size of the dispatch in groups.
+ * @param groupZ The Z size of the dispatch in groups.
+ */
+static void registerDispatch(Device* layer,
+ VkCommandBuffer commandBuffer,
+ int64_t groupX,
+ int64_t groupY,
+ int64_t groupZ)
+{
+ if (!layer->isFrameOfInterest)
+ {
+ return;
+ }
+
+ auto& tracker = layer->getStateTracker();
+ auto& cb = tracker.getCommandBuffer(commandBuffer);
+ cb.dispatch(groupX, groupY, groupZ);
+}
+
+/* See Vulkan API for documentation. */
+template<>
+VKAPI_ATTR void VKAPI_CALL layer_vkCmdDispatch(VkCommandBuffer commandBuffer,
+ uint32_t groupCountX,
+ uint32_t groupCountY,
+ uint32_t groupCountZ)
+{
+ LAYER_TRACE(__func__);
+
+ // Hold the lock to access layer-wide global store
+ std::unique_lock lock {g_vulkanLock};
+ auto* layer = Device::retrieve(commandBuffer);
+
+ registerDispatch(layer,
+ commandBuffer,
+ static_cast<int64_t>(groupCountX),
+ static_cast<int64_t>(groupCountY),
+ static_cast<int64_t>(groupCountZ));
+
+ // Release the lock to call into the driver
+ lock.unlock();
+ layer->driver.vkCmdDispatch(commandBuffer, groupCountX, groupCountY, groupCountZ);
+ emitCPUTrap(*layer, commandBuffer);
+}
+
+/* See Vulkan API for documentation.
 */
+template<>
+VKAPI_ATTR void VKAPI_CALL layer_vkCmdDispatchBase(VkCommandBuffer commandBuffer,
+ uint32_t baseGroupX,
+ uint32_t baseGroupY,
+ uint32_t baseGroupZ,
+ uint32_t groupCountX,
+ uint32_t groupCountY,
+ uint32_t groupCountZ)
+{
+ LAYER_TRACE(__func__);
+
+ // Hold the lock to access layer-wide global store
+ std::unique_lock lock {g_vulkanLock};
+ auto* layer = Device::retrieve(commandBuffer);
+
+ registerDispatch(layer,
+ commandBuffer,
+ static_cast<int64_t>(groupCountX),
+ static_cast<int64_t>(groupCountY),
+ static_cast<int64_t>(groupCountZ));
+
+ // Release the lock to call into the driver
+ lock.unlock();
+ layer->driver.vkCmdDispatchBase(commandBuffer, baseGroupX, baseGroupY, baseGroupZ, groupCountX, groupCountY, groupCountZ);
+ emitCPUTrap(*layer, commandBuffer);
+}
+
+/* See Vulkan API for documentation. */
+template<>
+VKAPI_ATTR void VKAPI_CALL layer_vkCmdDispatchBaseKHR(VkCommandBuffer commandBuffer,
+ uint32_t baseGroupX,
+ uint32_t baseGroupY,
+ uint32_t baseGroupZ,
+ uint32_t groupCountX,
+ uint32_t groupCountY,
+ uint32_t groupCountZ)
+{
+ LAYER_TRACE(__func__);
+
+ // Hold the lock to access layer-wide global store
+ std::unique_lock lock {g_vulkanLock};
+ auto* layer = Device::retrieve(commandBuffer);
+
+ registerDispatch(layer,
+ commandBuffer,
+ static_cast<int64_t>(groupCountX),
+ static_cast<int64_t>(groupCountY),
+ static_cast<int64_t>(groupCountZ));
+
+ // Release the lock to call into the driver
+ lock.unlock();
+ layer->driver.vkCmdDispatchBaseKHR(commandBuffer, baseGroupX, baseGroupY, baseGroupZ, groupCountX, groupCountY, groupCountZ);
+ emitCPUTrap(*layer, commandBuffer);
+}
+
+/* See Vulkan API for documentation.
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdDispatchIndirect(VkCommandBuffer commandBuffer, + VkBuffer buffer, + VkDeviceSize offset) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + registerDispatch(layer, commandBuffer, -1, -1, -1); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdDispatchIndirect(commandBuffer, buffer, offset); + emitCPUTrap(*layer, commandBuffer); +} diff --git a/layer_gpu_profile/source/layer_device_functions_queue.cpp b/layer_gpu_profile/source/layer_device_functions_queue.cpp new file mode 100644 index 0000000..57c722d --- /dev/null +++ b/layer_gpu_profile/source/layer_device_functions_queue.cpp @@ -0,0 +1,255 @@ +/* + * SPDX-License-Identifier: MIT + * ---------------------------------------------------------------------------- + * Copyright (c) 2024-2025 Arm Limited + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
 IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ * ----------------------------------------------------------------------------
+ */
+
+#include "device.hpp"
+#include "device_utils.hpp"
+#include "submit_visitor.hpp"
+
+#include "framework/device_dispatch_table.hpp"
+#include "trackers/queue.hpp"
+
+#include <nlohmann/json.hpp>
+
+#include <mutex>
+#include <string>
+
+using json = nlohmann::json;
+
+extern std::mutex g_vulkanLock;
+
+/**
+ * @brief Process the command buffer stream for events.
+ *
+ * @param layer The layer context.
+ * @param queue The queue being submitted to.
+ * @param commandBuffer The command buffer being submitted.
+ */
+static void processLayerCommandStream(Device& layer,
+ VkQueue queue,
+ VkCommandBuffer commandBuffer)
+{
+ // Fetch layer proxies for this workload
+ auto& tracker = layer.getStateTracker();
+ auto& trackQueue = tracker.getQueue(queue);
+ auto& trackCB = tracker.getCommandBuffer(commandBuffer);
+
+ // Play the layer command stream
+ ProfileSubmitVisitor workloadVisitor(layer);
+
+ const auto& cbLCS = trackCB.getSubmitCommandStream();
+ trackQueue.runSubmitCommandStream(cbLCS, workloadVisitor);
+}
+
+/* See Vulkan API for documentation.
*/ +template<> +VKAPI_ATTR VkResult VKAPI_CALL layer_vkQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* pPresentInfo) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(queue); + + auto& tracker = layer->getStateTracker(); + tracker.queuePresent(); + + // End the previous frame if it was "of interest" + if (layer->isFrameOfInterest) + { + json endFrameMessage { + { "type", "end_frame" } + }; + + layer->txMessage(endFrameMessage.dump()); + } + + uint64_t frameID = tracker.totalStats.getFrameCount(); + layer->isFrameOfInterest = layer->instance->config.isFrameOfInterest(frameID); + + // Start the next frame if it is "of interest" + if (layer->isFrameOfInterest) + { + json startFrameMessage { + { "type", "start_frame" }, + { "frame", frameID }, + }; + + layer->txMessage(startFrameMessage.dump()); + } + + // If a "normal" frame then release the lock before calling in to the + // driver, otherwise keep the lock to stop other threads using Vulkan + // while we sync and reset the counter stream + if (!layer->isFrameOfInterest) + { + lock.unlock(); + } + + auto ret = layer->driver.vkQueuePresentKHR(queue, pPresentInfo); + + // If we are measuring performance ensure the previous frame has finished + // and then take an initial sample to reset the counters + if (layer->isFrameOfInterest) + { + layer->driver.vkDeviceWaitIdle(layer->device); + workaroundDelay(); + auto ec = layer->lgcSampler->sample_now(); + if (ec) + { + LAYER_ERR("Failed to make libGPUCounters GPU counter sample"); + } + } + + return ret; +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR VkResult VKAPI_CALL + layer_vkQueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo* pSubmits, VkFence fence) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(queue); + + // If a "normal" frame then release the lock before calling in to the + // driver, otherwise keep the lock to stop other threads using Vulkan + // while we sync and reset the counter stream + if (!layer->isFrameOfInterest) + { + lock.unlock(); + } + + auto res = layer->driver.vkQueueSubmit(queue, submitCount, pSubmits, fence); + if (res != VK_SUCCESS) + { + return res; + } + + // If we are measuring performance then run the layer command stream with + // the lock held to stop other submits perturbing the counter data + if (layer->isFrameOfInterest) + { + for (uint32_t i = 0; i < submitCount; i++) + { + const auto& submit = pSubmits[i]; + for (uint32_t j = 0; j < submit.commandBufferCount; j++) + { + VkCommandBuffer commandBuffer = submit.pCommandBuffers[j]; + processLayerCommandStream(*layer, queue, commandBuffer); + } + } + } + + return res; +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR VkResult VKAPI_CALL + layer_vkQueueSubmit2(VkQueue queue, uint32_t submitCount, const VkSubmitInfo2* pSubmits, VkFence fence) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(queue); + + // If a "normal" frame then release the lock before calling in to the + // driver, otherwise keep the lock to stop other threads using Vulkan + // while we sync and reset the counter stream + if (!layer->isFrameOfInterest) + { + lock.unlock(); + } + + auto res = layer->driver.vkQueueSubmit2(queue, submitCount, pSubmits, fence); + if (res != VK_SUCCESS) + { + return res; + } + + // If we are measuring performance then run the layer command stream with + // the lock held to stop other submits perturbing the counter data + if (layer->isFrameOfInterest) + { + for (uint32_t i = 0; i < submitCount; i++) + { + const auto& submit = pSubmits[i]; + for (uint32_t j = 0; j < submit.commandBufferInfoCount; j++) + { + VkCommandBuffer commandBuffer = submit.pCommandBufferInfos[j].commandBuffer; + processLayerCommandStream(*layer, queue, commandBuffer); + } + } + } + + return res; +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR VkResult VKAPI_CALL + layer_vkQueueSubmit2KHR(VkQueue queue, uint32_t submitCount, const VkSubmitInfo2* pSubmits, VkFence fence) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(queue); + + // If a "normal" frame then release the lock before calling in to the + // driver, otherwise keep the lock to stop other threads using Vulkan + // while we sync and reset the counter stream + if (!layer->isFrameOfInterest) + { + lock.unlock(); + } + + auto res = layer->driver.vkQueueSubmit2KHR(queue, submitCount, pSubmits, fence); + if (res != VK_SUCCESS || !layer->isFrameOfInterest) + { + return res; + } + + // If we are measuring performance then run the layer command stream with + // the lock held to stop other submits perturbing the counter data + if (layer->isFrameOfInterest) + { + for (uint32_t i = 0; i < submitCount; i++) + { + const auto& submit = pSubmits[i]; + for (uint32_t j = 0; j < submit.commandBufferInfoCount; j++) + { + VkCommandBuffer commandBuffer = submit.pCommandBufferInfos[j].commandBuffer; + processLayerCommandStream(*layer, queue, commandBuffer); + } + } + } + + return res; +} diff --git a/layer_gpu_profile/source/layer_device_functions_render_pass.cpp b/layer_gpu_profile/source/layer_device_functions_render_pass.cpp new file mode 100644 index 0000000..ff9b87a --- /dev/null +++ b/layer_gpu_profile/source/layer_device_functions_render_pass.cpp @@ -0,0 +1,424 @@ +/* + * SPDX-License-Identifier: MIT + * ---------------------------------------------------------------------------- + * Copyright (c) 2024-2025 Arm Limited + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, 
and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * ---------------------------------------------------------------------------- + */ + +#include "device.hpp" +#include "device_utils.hpp" +#include "framework/device_dispatch_table.hpp" +#include "framework/utils.hpp" +#include "trackers/render_pass.hpp" + +#include + +extern std::mutex g_vulkanLock; + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR VkResult VKAPI_CALL layer_vkCreateRenderPass(VkDevice device, + const VkRenderPassCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkRenderPass* pRenderPass) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(device); + + // Release the lock to call into the driver + lock.unlock(); + VkResult ret = layer->driver.vkCreateRenderPass(device, pCreateInfo, pAllocator, pRenderPass); + if (ret != VK_SUCCESS) + { + return ret; + } + + // Retake the lock to access layer-wide global store + lock.lock(); + auto& tracker = layer->getStateTracker(); + tracker.createRenderPass(*pRenderPass, *pCreateInfo); + return VK_SUCCESS; +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR VkResult VKAPI_CALL layer_vkCreateRenderPass2(VkDevice device, + const VkRenderPassCreateInfo2* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkRenderPass* pRenderPass) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(device); + + // Release the lock to call into the driver + lock.unlock(); + VkResult ret = layer->driver.vkCreateRenderPass2(device, pCreateInfo, pAllocator, pRenderPass); + if (ret != VK_SUCCESS) + { + return ret; + } + + // Retake the lock to access layer-wide global store + lock.lock(); + auto& tracker = layer->getStateTracker(); + tracker.createRenderPass(*pRenderPass, *pCreateInfo); + return VK_SUCCESS; +} + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR VkResult VKAPI_CALL layer_vkCreateRenderPass2KHR(VkDevice device, + const VkRenderPassCreateInfo2* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkRenderPass* pRenderPass) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(device); + + // Release the lock to call into the driver + lock.unlock(); + VkResult ret = layer->driver.vkCreateRenderPass2KHR(device, pCreateInfo, pAllocator, pRenderPass); + if (ret != VK_SUCCESS) + { + return ret; + } + + // Retake the lock to access layer-wide global store + lock.lock(); + auto& tracker = layer->getStateTracker(); + tracker.createRenderPass(*pRenderPass, *pCreateInfo); + return VK_SUCCESS; +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkDestroyRenderPass(VkDevice device, + VkRenderPass renderPass, + const VkAllocationCallbacks* pAllocator) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(device); + + auto& tracker = layer->getStateTracker(); + tracker.destroyRenderPass(renderPass); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkDestroyRenderPass(device, renderPass, pAllocator); +} + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdBeginRenderPass(VkCommandBuffer commandBuffer, + const VkRenderPassBeginInfo* pRenderPassBegin, + VkSubpassContents contents) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + if (layer->isFrameOfInterest) + { + auto& tracker = layer->getStateTracker(); + auto& cb = tracker.getCommandBuffer(commandBuffer); + + auto& rp = tracker.getRenderPass(pRenderPassBegin->renderPass); + uint32_t width = pRenderPassBegin->renderArea.extent.width; + uint32_t height = pRenderPassBegin->renderArea.extent.height; + + // Notify the command buffer we are starting a new render pass + cb.renderPassBegin(rp, width, height); + } + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdBeginRenderPass(commandBuffer, pRenderPassBegin, contents); +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdBeginRenderPass2(VkCommandBuffer commandBuffer, + const VkRenderPassBeginInfo* pRenderPassBegin, + const VkSubpassBeginInfo* pSubpassBeginInfo) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + if (layer->isFrameOfInterest) + { + auto& tracker = layer->getStateTracker(); + auto& cb = tracker.getCommandBuffer(commandBuffer); + + auto& rp = tracker.getRenderPass(pRenderPassBegin->renderPass); + uint32_t width = pRenderPassBegin->renderArea.extent.width; + uint32_t height = pRenderPassBegin->renderArea.extent.height; + + // Notify the command buffer we are starting a new render pass + cb.renderPassBegin(rp, width, height); + } + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdBeginRenderPass2(commandBuffer, pRenderPassBegin, pSubpassBeginInfo); +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdBeginRenderPass2KHR(VkCommandBuffer commandBuffer, + const VkRenderPassBeginInfo* pRenderPassBegin, + const VkSubpassBeginInfo* pSubpassBeginInfo) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + if (layer->isFrameOfInterest) + { + auto& tracker = layer->getStateTracker(); + auto& cb = tracker.getCommandBuffer(commandBuffer); + + auto& rp = tracker.getRenderPass(pRenderPassBegin->renderPass); + uint32_t width = pRenderPassBegin->renderArea.extent.width; + uint32_t height = pRenderPassBegin->renderArea.extent.height; + + // Notify the command buffer we are starting a new render pass + cb.renderPassBegin(rp, width, height); + } + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdBeginRenderPass2KHR(commandBuffer, pRenderPassBegin, pSubpassBeginInfo); +} + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdBeginRendering(VkCommandBuffer commandBuffer, + const VkRenderingInfo* pRenderingInfo) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + if (layer->isFrameOfInterest) + { + auto& tracker = layer->getStateTracker(); + auto& cb = tracker.getCommandBuffer(commandBuffer); + + bool resuming = pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT; + bool suspending = pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT; + + // Extract metadata for later use ... 
+ Tracker::RenderPass rp(*pRenderingInfo); + uint32_t width = pRenderingInfo->renderArea.extent.width; + uint32_t height = pRenderingInfo->renderArea.extent.height; + + // Notify the command buffer we are starting a new render pass + cb.renderPassBegin(rp, width, height, resuming, suspending); + } + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdBeginRendering(commandBuffer, pRenderingInfo); +} + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdBeginRenderingKHR(VkCommandBuffer commandBuffer, + const VkRenderingInfo* pRenderingInfo) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + if (layer->isFrameOfInterest) + { + auto& tracker = layer->getStateTracker(); + auto& cb = tracker.getCommandBuffer(commandBuffer); + + bool resuming = pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT; + bool suspending = pRenderingInfo->flags & VK_RENDERING_SUSPENDING_BIT; + + // Extract metadata for later use ... + Tracker::RenderPass rp(*pRenderingInfo); + uint32_t width = pRenderingInfo->renderArea.extent.width; + uint32_t height = pRenderingInfo->renderArea.extent.height; + + // Notify the command buffer we are starting a new render pass + cb.renderPassBegin(rp, width, height, resuming, suspending); + } + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdBeginRenderingKHR(commandBuffer, pRenderingInfo); +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdEndRenderPass(VkCommandBuffer commandBuffer) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + if (layer->isFrameOfInterest) + { + // Update the layer command stream in the tracker + auto& tracker = layer->getStateTracker(); + auto& cb = tracker.getCommandBuffer(commandBuffer); + cb.renderPassEnd(); + } + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdEndRenderPass(commandBuffer); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdEndRenderPass2(VkCommandBuffer commandBuffer, + const VkSubpassEndInfo* pSubpassEndInfo) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + if (layer->isFrameOfInterest) + { + // Update the layer command stream in the tracker + auto& tracker = layer->getStateTracker(); + auto& cb = tracker.getCommandBuffer(commandBuffer); + cb.renderPassEnd(); + } + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdEndRenderPass2(commandBuffer, pSubpassEndInfo); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdEndRenderPass2KHR(VkCommandBuffer commandBuffer, + const VkSubpassEndInfo* pSubpassEndInfo) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + if (layer->isFrameOfInterest) + { + // Update the layer command stream in the tracker + auto& tracker = layer->getStateTracker(); + auto& cb = tracker.getCommandBuffer(commandBuffer); + cb.renderPassEnd(); + } + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdEndRenderPass2KHR(commandBuffer, pSubpassEndInfo); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdEndRendering(VkCommandBuffer commandBuffer) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + bool suspending {false}; + if (layer->isFrameOfInterest) + { + // Update the layer command stream in the tracker + auto& tracker = layer->getStateTracker(); + auto& cb = tracker.getCommandBuffer(commandBuffer); + suspending = cb.renderPassEnd(); + } + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdEndRendering(commandBuffer); + if (!suspending) + { + emitCPUTrap(*layer, commandBuffer); + } +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdEndRenderingKHR(VkCommandBuffer commandBuffer) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + bool suspending {false}; + if (layer->isFrameOfInterest) + { + // Update the layer command stream in the tracker + auto& tracker = layer->getStateTracker(); + auto& cb = tracker.getCommandBuffer(commandBuffer); + suspending = cb.renderPassEnd(); + } + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdEndRenderingKHR(commandBuffer); + if (!suspending) + { + emitCPUTrap(*layer, commandBuffer); + } +} diff --git a/layer_gpu_profile/source/layer_device_functions_trace_rays.cpp b/layer_gpu_profile/source/layer_device_functions_trace_rays.cpp new file mode 100644 index 0000000..bacd68b --- /dev/null +++ b/layer_gpu_profile/source/layer_device_functions_trace_rays.cpp @@ -0,0 +1,220 @@ +/* + * SPDX-License-Identifier: MIT + * ---------------------------------------------------------------------------- + * Copyright (c) 2024-2025 Arm Limited + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * ---------------------------------------------------------------------------- + */ + +#include "device.hpp" +#include "device_utils.hpp" +#include "framework/device_dispatch_table.hpp" + +#include +#include +#include + +extern std::mutex g_vulkanLock; + +/** + * @brief Register an acceleration structure build with the tracker. + * + * @param layer The layer context for the device. + * @param commandBuffer The command buffer we are recording. + * @param buildType The build type. + * @param primitiveCount The number of primitives in the build. + */ +static void registerAccelerationStructureBuild(Device* layer, + VkCommandBuffer commandBuffer, + Tracker::LCSAccelerationStructureBuild::Type buildType, + int64_t primitiveCount) +{ + if (!layer->isFrameOfInterest) + { + return; + } + + auto& tracker = layer->getStateTracker(); + auto& cb = tracker.getCommandBuffer(commandBuffer); + cb.accelerationStructureBuild(buildType, primitiveCount); +} + +/** + * @brief Register a trace rays dispatch with the tracker. + * + * @param layer The layer context for the device. + * @param commandBuffer The command buffer we are recording. + * @param itemsX The X size of the dispatch in work items. + * @param itemsY The Y size of the dispatch in work items. + * @param itemsZ The Z size of the dispatch in work items. + */ +static void registerTraceRays(Device* layer, + VkCommandBuffer commandBuffer, + int64_t itemsX, + int64_t itemsY, + int64_t itemsZ) +{ + if (!layer->isFrameOfInterest) + { + return; + } + + auto& tracker = layer->getStateTracker(); + auto& cb = tracker.getCommandBuffer(commandBuffer); + cb.traceRays(itemsX, itemsY, itemsZ); +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdBuildAccelerationStructuresIndirectKHR( + VkCommandBuffer commandBuffer, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkDeviceAddress* pIndirectDeviceAddresses, + const uint32_t* pIndirectStrides, + const uint32_t* const* ppMaxPrimitiveCounts) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + registerAccelerationStructureBuild(layer, + commandBuffer, + Tracker::LCSAccelerationStructureBuild::Type::unknown, + -1); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdBuildAccelerationStructuresIndirectKHR(commandBuffer, + infoCount, + pInfos, + pIndirectDeviceAddresses, + pIndirectStrides, + ppMaxPrimitiveCounts); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdBuildAccelerationStructuresKHR( + VkCommandBuffer commandBuffer, + uint32_t infoCount, + const VkAccelerationStructureBuildGeometryInfoKHR* pInfos, + const VkAccelerationStructureBuildRangeInfoKHR* const* ppBuildRangeInfos) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + registerAccelerationStructureBuild(layer, + commandBuffer, + Tracker::LCSAccelerationStructureBuild::Type::unknown, + -1); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdBuildAccelerationStructuresKHR(commandBuffer, infoCount, pInfos, ppBuildRangeInfos); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdTraceRaysIndirect2KHR(VkCommandBuffer commandBuffer, + VkDeviceAddress indirectDeviceAddress) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + registerTraceRays(layer, commandBuffer, -1, -1, -1); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdTraceRaysIndirect2KHR(commandBuffer, indirectDeviceAddress); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL + layer_vkCmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer, + const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable, + VkDeviceAddress indirectDeviceAddress) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + registerTraceRays(layer, commandBuffer, -1, -1, -1); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdTraceRaysIndirectKHR(commandBuffer, + pRaygenShaderBindingTable, + pMissShaderBindingTable, + pHitShaderBindingTable, + pCallableShaderBindingTable, + indirectDeviceAddress); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL + layer_vkCmdTraceRaysKHR(VkCommandBuffer commandBuffer, + const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable, + const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable, + uint32_t width, + uint32_t height, + uint32_t depth) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + registerTraceRays(layer, commandBuffer, width, height, depth); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdTraceRaysKHR(commandBuffer, + pRaygenShaderBindingTable, + pMissShaderBindingTable, + pHitShaderBindingTable, + pCallableShaderBindingTable, + width, + height, + depth); + emitCPUTrap(*layer, commandBuffer); +} diff --git a/layer_gpu_profile/source/layer_device_functions_transfer.cpp b/layer_gpu_profile/source/layer_device_functions_transfer.cpp new file mode 100644 index 0000000..540a883 --- /dev/null +++ b/layer_gpu_profile/source/layer_device_functions_transfer.cpp @@ -0,0 +1,648 @@ +/* + * SPDX-License-Identifier: MIT + * ---------------------------------------------------------------------------- + * Copyright (c) 2024-2025 Arm Limited + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * ---------------------------------------------------------------------------- + */ + +#include "device.hpp" +#include "device_utils.hpp" +#include "framework/device_dispatch_table.hpp" +#include "trackers/layer_command_stream.hpp" + +#include +#include +#include + +extern std::mutex g_vulkanLock; + +/** + * @brief Register a transfer to a buffer with the tracker. + * + * @param layer The layer context for the device. + * @param commandBuffer The command buffer we are recording. + * @param transferType The type of transfer being performed. + * @param byteCount The number of bytes transferred. + */ +static void registerBufferTransfer(Device* layer, + VkCommandBuffer commandBuffer, + Tracker::LCSBufferTransfer::Type transferType, + int64_t byteCount) +{ + if (!layer->isFrameOfInterest) + { + return; + } + + auto& tracker = layer->getStateTracker(); + auto& cb = tracker.getCommandBuffer(commandBuffer); + cb.bufferTransfer(transferType, byteCount); +} + +/** + * @brief Register a transfer to an image with the tracker. + * + * @param layer The layer context for the device. + * @param commandBuffer The command buffer we are recording. + * @param transferType The type of transfer being performed. + * @param pixelCount The number of pixels transferred. 
+ */ +static void registerImageTransfer(Device* layer, + VkCommandBuffer commandBuffer, + Tracker::LCSImageTransfer::Type transferType, + int64_t pixelCount) +{ + if (!layer->isFrameOfInterest) + { + return; + } + + auto& tracker = layer->getStateTracker(); + auto& cb = tracker.getCommandBuffer(commandBuffer); + cb.imageTransfer(transferType, pixelCount); +} + +/** + * @brief Register a transfer to an image with the tracker. + * + * @param layer The layer context for the device. + * @param commandBuffer The command buffer we are recording. + * @param transferType The type of transfer being performed. + * @param byteCount The number of bytes transferred. + */ +static void registerAccelerationStructureTransfer(Device* layer, + VkCommandBuffer commandBuffer, + Tracker::LCSAccelerationStructureTransfer::Type transferType, + int64_t byteCount) +{ + if (!layer->isFrameOfInterest) + { + return; + } + + auto& tracker = layer->getStateTracker(); + auto& cb = tracker.getCommandBuffer(commandBuffer); + cb.accelerationStructureTransfer(transferType, byteCount); +} + +// Commands for transfers + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdFillBuffer(VkCommandBuffer commandBuffer, + VkBuffer dstBuffer, + VkDeviceSize dstOffset, + VkDeviceSize size, + uint32_t data) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + // Compute the size of the transfer + // TODO: Add buffer tracking so we can turn VK_WHOLE_SIZE into bytes + int64_t byteCount = static_cast(size); + if (size == VK_WHOLE_SIZE) + { + byteCount = -2; + } + + registerBufferTransfer(layer, commandBuffer, Tracker::LCSBufferTransfer::Type::fill_buffer, byteCount); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdFillBuffer(commandBuffer, dstBuffer, dstOffset, size, data); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdClearColorImage(VkCommandBuffer commandBuffer, + VkImage image, + VkImageLayout imageLayout, + const VkClearColorValue* pColor, + uint32_t rangeCount, + const VkImageSubresourceRange* pRanges) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + // Compute the size of the transfer + // TODO: Add image tracking so we can turn image and pRanges into pixels + int64_t pixelCount = -1; + + registerImageTransfer(layer, commandBuffer, Tracker::LCSImageTransfer::Type::clear_image, pixelCount); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdClearColorImage(commandBuffer, image, imageLayout, pColor, rangeCount, pRanges); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdClearDepthStencilImage(VkCommandBuffer commandBuffer, + VkImage image, + VkImageLayout imageLayout, + const VkClearDepthStencilValue* pDepthStencil, + uint32_t rangeCount, + const VkImageSubresourceRange* pRanges) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + // Compute the size of the transfer + // TODO: Add image tracking so we can turn image and pRanges into pixels + int64_t pixelCount = -1; + + registerImageTransfer(layer, commandBuffer, Tracker::LCSImageTransfer::Type::clear_image, pixelCount); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdClearDepthStencilImage(commandBuffer, image, imageLayout, pDepthStencil, rangeCount, pRanges); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdCopyBuffer(VkCommandBuffer commandBuffer, + VkBuffer srcBuffer, + VkBuffer dstBuffer, + uint32_t regionCount, + const VkBufferCopy* pRegions) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + // Compute the size of the transfer + int64_t byteCount = 0; + for (uint32_t i = 0; i < regionCount; i++) + { + byteCount += static_cast(pRegions[i].size); + } + + registerBufferTransfer(layer, commandBuffer, Tracker::LCSBufferTransfer::Type::copy_buffer, byteCount); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdCopyBuffer(commandBuffer, srcBuffer, dstBuffer, regionCount, pRegions); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdCopyBuffer2(VkCommandBuffer commandBuffer, + const VkCopyBufferInfo2* pCopyBufferInfo) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + // Compute the size of the transfer + int64_t byteCount = 0; + for (uint32_t i = 0; i < pCopyBufferInfo->regionCount; i++) + { + byteCount += static_cast(pCopyBufferInfo->pRegions[i].size); + } + + registerBufferTransfer(layer, commandBuffer, Tracker::LCSBufferTransfer::Type::copy_buffer, byteCount); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdCopyBuffer2(commandBuffer, pCopyBufferInfo); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdCopyBuffer2KHR(VkCommandBuffer commandBuffer, + const VkCopyBufferInfo2* pCopyBufferInfo) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + // Compute the size of the transfer + int64_t byteCount = 0; + for (uint32_t i = 0; i < pCopyBufferInfo->regionCount; i++) + { + byteCount += static_cast(pCopyBufferInfo->pRegions[i].size); + } + + registerBufferTransfer(layer, commandBuffer, Tracker::LCSBufferTransfer::Type::copy_buffer, byteCount); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdCopyBuffer2KHR(commandBuffer, pCopyBufferInfo); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdCopyBufferToImage(VkCommandBuffer commandBuffer, + VkBuffer srcBuffer, + VkImage dstImage, + VkImageLayout dstImageLayout, + uint32_t regionCount, + const VkBufferImageCopy* pRegions) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + // Compute the size of the transfer + int64_t pixelCount = 0; + for (uint32_t i = 0; i < regionCount; i++) + { + int64_t rPixelCount = static_cast(pRegions[i].imageExtent.width) + * static_cast(pRegions[i].imageExtent.height) + * static_cast(pRegions[i].imageExtent.depth); + pixelCount += rPixelCount; + } + + registerImageTransfer(layer, commandBuffer, Tracker::LCSImageTransfer::Type::copy_buffer_to_image, pixelCount); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdCopyBufferToImage(commandBuffer, srcBuffer, dstImage, dstImageLayout, regionCount, pRegions); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL + layer_vkCmdCopyBufferToImage2(VkCommandBuffer commandBuffer, + const VkCopyBufferToImageInfo2* pCopyBufferToImageInfo) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + // Compute the size of the transfer + int64_t pixelCount = 0; + for (uint32_t i = 0; i < pCopyBufferToImageInfo->regionCount; i++) + { + int64_t rPixelCount = static_cast(pCopyBufferToImageInfo->pRegions[i].imageExtent.width) + * static_cast(pCopyBufferToImageInfo->pRegions[i].imageExtent.height) + * static_cast(pCopyBufferToImageInfo->pRegions[i].imageExtent.depth); + pixelCount += rPixelCount; + } + + registerImageTransfer(layer, commandBuffer, Tracker::LCSImageTransfer::Type::copy_buffer_to_image, pixelCount); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdCopyBufferToImage2(commandBuffer, pCopyBufferToImageInfo); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL + layer_vkCmdCopyBufferToImage2KHR(VkCommandBuffer commandBuffer, + const VkCopyBufferToImageInfo2* pCopyBufferToImageInfo) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + // Compute the size of the transfer + int64_t pixelCount = 0; + for (uint32_t i = 0; i < pCopyBufferToImageInfo->regionCount; i++) + { + int64_t rPixelCount = static_cast(pCopyBufferToImageInfo->pRegions[i].imageExtent.width) + * static_cast(pCopyBufferToImageInfo->pRegions[i].imageExtent.height) + * static_cast(pCopyBufferToImageInfo->pRegions[i].imageExtent.depth); + pixelCount += rPixelCount; + } + + registerImageTransfer(layer, commandBuffer, Tracker::LCSImageTransfer::Type::copy_buffer_to_image, pixelCount); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdCopyBufferToImage2KHR(commandBuffer, pCopyBufferToImageInfo); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdCopyImage(VkCommandBuffer commandBuffer, + VkImage srcImage, + VkImageLayout srcImageLayout, + VkImage dstImage, + VkImageLayout dstImageLayout, + uint32_t regionCount, + const VkImageCopy* pRegions) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + // Compute the size of the transfer + int64_t pixelCount = 0; + for (uint32_t i = 0; i < regionCount; i++) + { + int64_t rPixelCount = static_cast(pRegions[i].extent.width) + * static_cast(pRegions[i].extent.height) + * static_cast(pRegions[i].extent.depth); + pixelCount += rPixelCount; + } + + registerImageTransfer(layer, commandBuffer, Tracker::LCSImageTransfer::Type::copy_image, pixelCount); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdCopyImage(commandBuffer, srcImage, srcImageLayout, dstImage, dstImageLayout, regionCount, pRegions); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdCopyImage2(VkCommandBuffer commandBuffer, + const VkCopyImageInfo2* pCopyImageInfo) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + // Compute the size of the transfer + int64_t pixelCount = 0; + for (uint32_t i = 0; i < pCopyImageInfo->regionCount; i++) + { + int64_t rPixelCount = static_cast(pCopyImageInfo->pRegions[i].extent.width) + * static_cast(pCopyImageInfo->pRegions[i].extent.height) + * static_cast(pCopyImageInfo->pRegions[i].extent.depth); + pixelCount += rPixelCount; + } + + registerImageTransfer(layer, commandBuffer, Tracker::LCSImageTransfer::Type::copy_image, pixelCount); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdCopyImage2(commandBuffer, pCopyImageInfo); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdCopyImage2KHR(VkCommandBuffer commandBuffer, + const VkCopyImageInfo2* pCopyImageInfo) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + // Compute the size of the transfer + int64_t pixelCount = 0; + for (uint32_t i = 0; i < pCopyImageInfo->regionCount; i++) + { + int64_t rPixelCount = static_cast(pCopyImageInfo->pRegions[i].extent.width) + * static_cast(pCopyImageInfo->pRegions[i].extent.height) + * static_cast(pCopyImageInfo->pRegions[i].extent.depth); + pixelCount += rPixelCount; + } + + registerImageTransfer(layer, commandBuffer, Tracker::LCSImageTransfer::Type::copy_image, pixelCount); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdCopyImage2KHR(commandBuffer, pCopyImageInfo); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL layer_vkCmdCopyImageToBuffer(VkCommandBuffer commandBuffer, + VkImage srcImage, + VkImageLayout srcImageLayout, + VkBuffer dstBuffer, + uint32_t regionCount, + const VkBufferImageCopy* pRegions) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + // Compute the size of the transfer + int64_t pixelCount = 0; + for (uint32_t i = 0; i < regionCount; i++) + { + int64_t rPixelCount = static_cast(pRegions[i].imageExtent.width) + * static_cast(pRegions[i].imageExtent.height) + * static_cast(pRegions[i].imageExtent.depth); + pixelCount += rPixelCount; + } + + // TODO: Our usual convention is to mark the transfer using the destination + // type, which means this should be a bufferTransfer reporting size in + // bytes. Without image tracking we only have pixels, so for now we report + // as "Copy image" and report size in pixels. + registerImageTransfer(layer, commandBuffer, Tracker::LCSImageTransfer::Type::copy_image_to_buffer, pixelCount); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdCopyImageToBuffer(commandBuffer, srcImage, srcImageLayout, dstBuffer, regionCount, pRegions); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL + layer_vkCmdCopyImageToBuffer2(VkCommandBuffer commandBuffer, + const VkCopyImageToBufferInfo2* pCopyImageToBufferInfo) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + // Compute the size of the transfer + int64_t pixelCount = 0; + for (uint32_t i = 0; i < pCopyImageToBufferInfo->regionCount; i++) + { + int64_t rPixelCount = static_cast(pCopyImageToBufferInfo->pRegions[i].imageExtent.width) + * static_cast(pCopyImageToBufferInfo->pRegions[i].imageExtent.height) + * static_cast(pCopyImageToBufferInfo->pRegions[i].imageExtent.depth); + pixelCount += rPixelCount; + } + + // TODO: Our usual convention is to mark the transfer using the destination + // type, which means this should be a bufferTransfer reporting size in + // bytes. Without image tracking we only have pixels, so for now we report + // as "Copy image" and report size in pixels. + registerImageTransfer(layer, commandBuffer, Tracker::LCSImageTransfer::Type::copy_image_to_buffer, pixelCount); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdCopyImageToBuffer2(commandBuffer, pCopyImageToBufferInfo); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL + layer_vkCmdCopyImageToBuffer2KHR(VkCommandBuffer commandBuffer, + const VkCopyImageToBufferInfo2* pCopyImageToBufferInfo) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + // Compute the size of the transfer + int64_t pixelCount = 0; + for (uint32_t i = 0; i < pCopyImageToBufferInfo->regionCount; i++) + { + int64_t rPixelCount = static_cast(pCopyImageToBufferInfo->pRegions[i].imageExtent.width) + * static_cast(pCopyImageToBufferInfo->pRegions[i].imageExtent.height) + * static_cast(pCopyImageToBufferInfo->pRegions[i].imageExtent.depth); + pixelCount += rPixelCount; + } + + // TODO: Our usual convention is to mark the transfer using the destination + // type, which means this should be a bufferTransfer reporting size in + // bytes. Without image tracking we only have pixels, so for now we report + // as "Copy image" and report size in pixels. + registerImageTransfer(layer, commandBuffer, Tracker::LCSImageTransfer::Type::copy_image_to_buffer, pixelCount); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdCopyImageToBuffer2KHR(commandBuffer, pCopyImageToBufferInfo); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL + layer_vkCmdCopyAccelerationStructureKHR(VkCommandBuffer commandBuffer, + const VkCopyAccelerationStructureInfoKHR* pInfo) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + // TODO: We ideally want to track sizes of the transfers, but this requires + // dispatching vkCmdWriteAccelerationStructuresPropertiesKHR() queries and + // capturing the result "later" which we don't support yet. 
+ // We can approximate the size using vkGetAccelerationStructureBuildSizesKHR(), + // but this returns the build size which may be larger than the size of the + // AS itself which can be smaller (especially if later compacted). + registerAccelerationStructureTransfer(layer, + commandBuffer, + Tracker::LCSAccelerationStructureTransfer::Type::struct_to_struct, + -1); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdCopyAccelerationStructureKHR(commandBuffer, pInfo); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. */ +template<> +VKAPI_ATTR void VKAPI_CALL + layer_vkCmdCopyAccelerationStructureToMemoryKHR(VkCommandBuffer commandBuffer, + const VkCopyAccelerationStructureToMemoryInfoKHR* pInfo) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + // TODO: We ideally want to track sizes of the transfers, but this requires + // dispatching vkCmdWriteAccelerationStructuresPropertiesKHR() queries and + // capturing the result "later" which we don't support yet. + // We can approximate the size using vkGetAccelerationStructureBuildSizesKHR(), + // but this returns the build size which may be larger than the size of the + // AS itself which can be smaller (especially if later compacted). + registerAccelerationStructureTransfer(layer, + commandBuffer, + Tracker::LCSAccelerationStructureTransfer::Type::struct_to_mem, + -1); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdCopyAccelerationStructureToMemoryKHR(commandBuffer, pInfo); + emitCPUTrap(*layer, commandBuffer); +} + +/* See Vulkan API for documentation. 
*/ +template<> +VKAPI_ATTR void VKAPI_CALL + layer_vkCmdCopyMemoryToAccelerationStructureKHR(VkCommandBuffer commandBuffer, + const VkCopyMemoryToAccelerationStructureInfoKHR* pInfo) +{ + LAYER_TRACE(__func__); + + // Hold the lock to access layer-wide global store + std::unique_lock lock {g_vulkanLock}; + auto* layer = Device::retrieve(commandBuffer); + + // TODO: We ideally want to track sizes of the transfers, but this requires + // dispatching vkCmdWriteAccelerationStructuresPropertiesKHR() queries and + // capturing the result "later" which we don't support yet. + // We can approximate the size using vkGetAccelerationStructureBuildSizesKHR(), + // but this returns the build size which may be larger than the size of the + // AS itself which can be smaller (especially if later compacted). + registerAccelerationStructureTransfer(layer, + commandBuffer, + Tracker::LCSAccelerationStructureTransfer::Type::mem_to_struct, + -1); + + // Release the lock to call into the driver + lock.unlock(); + layer->driver.vkCmdCopyMemoryToAccelerationStructureKHR(commandBuffer, pInfo); + emitCPUTrap(*layer, commandBuffer); +} diff --git a/layer_gpu_profile/source/submit_visitor.cpp b/layer_gpu_profile/source/submit_visitor.cpp new file mode 100644 index 0000000..56d7b9c --- /dev/null +++ b/layer_gpu_profile/source/submit_visitor.cpp @@ -0,0 +1,174 @@ +/* + * SPDX-License-Identifier: MIT + * ---------------------------------------------------------------------------- + * Copyright (c) 2024-2025 Arm Limited + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above 
copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * ---------------------------------------------------------------------------- + */ +#include "device_utils.hpp" +#include "submit_visitor.hpp" +#include "framework/utils.hpp" + +#include + +#include + +using json = nlohmann::json; + +/* See header for documentation */ +void ProfileSubmitVisitor::handleCPUTrap( + const std::string& workloadType, + const std::vector& debugStack +) { + json message { + { "type", workloadType }, + { "labels", debugStack }, + { "counters", json::array() } + }; + + waitForGPU(device); + + auto ec = device.lgcSampler->sample_now(); + + notifyGPU(device); + + if (ec) + { + LAYER_ERR("Failed to make libGPUCounters GPU counter sample"); + } + else + { + for (const auto& pair : device.lgcActiveCounters) + { + hwcpipe::counter_sample sample; + ec = device.lgcSampler->get_counter_value(pair.first, sample); + if (ec) + { + LAYER_ERR("Failed to get libGPUCounters GPU counter value"); + continue; + } + + if (sample.type == hwcpipe::counter_sample::type::uint64) + { + json counter { + { pair.second, sample.value.uint64 }, + }; + + message["counters"].push_back(counter); + + } + else + { + json counter { + { pair.second, sample.value.float64 }, + }; + + message["counters"].push_back(counter); + + } + } + } + + device.txMessage(message.dump()); + +} + +/* See header for documentation */ +void ProfileSubmitVisitor::operator()( + const 
Tracker::LCSRenderPass& renderPass, + const std::vector& debugStack +) { + UNUSED(renderPass); + + handleCPUTrap("renderpass", debugStack); +} + +/* See header for documentation */ +void ProfileSubmitVisitor::operator()( + const Tracker::LCSRenderPassContinuation& continuation, + const std::vector& debugStack, + uint64_t renderPassTagID +) { + UNUSED(continuation); + UNUSED(debugStack); + UNUSED(renderPassTagID); + + // Ignore continuations because we only trigger one trap per render pass +} + +/* See header for documentation */ +void ProfileSubmitVisitor::operator()( + const Tracker::LCSDispatch& dispatch, + const std::vector& debugStack +) { + UNUSED(dispatch); + + handleCPUTrap("compute", debugStack); +} + +/* See header for documentation */ +void ProfileSubmitVisitor::operator()( + const Tracker::LCSTraceRays& traceRays, + const std::vector& debugStack +) { + UNUSED(traceRays); + + handleCPUTrap("tracerays", debugStack); +} + +/* See header for documentation */ +void ProfileSubmitVisitor::operator()( + const Tracker::LCSImageTransfer& imageTransfer, + const std::vector& debugStack +) { + UNUSED(imageTransfer); + + handleCPUTrap("image_transfer", debugStack); +} + +/* See header for documentation */ +void ProfileSubmitVisitor::operator()( + const Tracker::LCSBufferTransfer& bufferTransfer, + const std::vector& debugStack +) { + UNUSED(bufferTransfer); + + handleCPUTrap("buffer_transfer", debugStack); +} + +/* See header for documentation */ +void ProfileSubmitVisitor::operator()( + const Tracker::LCSAccelerationStructureBuild& asBuild, + const std::vector& debugStack +) { + UNUSED(asBuild); + + handleCPUTrap("as_build", debugStack); +} + +/* See header for documentation */ +void ProfileSubmitVisitor::operator()( + const Tracker::LCSAccelerationStructureTransfer& asTransfer, + const std::vector& debugStack +) { + UNUSED(asTransfer); + + handleCPUTrap("as_transfer", debugStack); +} diff --git a/layer_gpu_profile/source/submit_visitor.hpp 
b/layer_gpu_profile/source/submit_visitor.hpp new file mode 100644 index 0000000..90ac752 --- /dev/null +++ b/layer_gpu_profile/source/submit_visitor.hpp @@ -0,0 +1,105 @@ +/* + * SPDX-License-Identifier: MIT + * ---------------------------------------------------------------------------- + * Copyright (c) 2024-2025 Arm Limited + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * ---------------------------------------------------------------------------- + */ + +#pragma once + +#include "device.hpp" +#include "trackers/layer_command_stream.hpp" +#include "trackers/queue.hpp" + +#include +#include + +/** + * Handles setting up async state ... + */ +class ProfileSubmitVisitor : public Tracker::SubmitCommandWorkloadVisitor +{ +public: + /** + * Construct a profile workload visitor for a layer command stream. + * + * @param _device The device object for the command stream. 
+ */ + ProfileSubmitVisitor(Device& _device) + : device(_device) + { + } + + // Visitor should not be copied or moved from + ProfileSubmitVisitor(const ProfileSubmitVisitor&) = delete; + ProfileSubmitVisitor(ProfileSubmitVisitor&&) noexcept = delete; + ProfileSubmitVisitor& operator=(const ProfileSubmitVisitor&) = delete; + ProfileSubmitVisitor& operator=(ProfileSubmitVisitor&&) noexcept = delete; + + // Methods from the visitor interface + void operator()( + const Tracker::LCSRenderPass& renderPass, + const std::vector& debugStack) override; + + void operator()( + const Tracker::LCSRenderPassContinuation& continuation, + const std::vector& debugStack, + uint64_t renderPassTagID) override; + + void operator()( + const Tracker::LCSDispatch& dispatch, + const std::vector& debugStack) override; + + void operator()( + const Tracker::LCSTraceRays& traceRays, + const std::vector& debugStack) override; + + void operator()( + const Tracker::LCSImageTransfer& imageTransfer, + const std::vector& debugStack) override; + + void operator()( + const Tracker::LCSBufferTransfer& bufferTransfer, + const std::vector& debugStack) override; + + void operator()( + const Tracker::LCSAccelerationStructureBuild& asBuild, + const std::vector& debugStack) override; + + void operator()( + const Tracker::LCSAccelerationStructureTransfer& asTransfer, + const std::vector& debugStack) override; + +private: + /** + * @brief Handle the CPU-side of the counter sampling sequence. + * + * @param workloadType The coarse type of the workload. + * @param debugStack The user debug label stack. 
+ */ + void handleCPUTrap( + const std::string& workloadType, + const std::vector& debugStack); + +private: + Device& device; +}; + diff --git a/layer_gpu_profile/source/version.hpp.in b/layer_gpu_profile/source/version.hpp.in new file mode 100644 index 0000000..ff3777f --- /dev/null +++ b/layer_gpu_profile/source/version.hpp.in @@ -0,0 +1,38 @@ +/* + * SPDX-License-Identifier: MIT + * ---------------------------------------------------------------------------- + * Copyright (c) 2024-2025 Arm Limited + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * ---------------------------------------------------------------------------- + */ + +/** + * @file Placeholder templates that are populated by CMake during configure. 
+ */ + +#pragma once + +#define LGL_VER_MAJOR @PROJECT_VERSION_MAJOR@ +#define LGL_VER_MINOR @PROJECT_VERSION_MINOR@ +#define LGL_VER_PATCH @PROJECT_VERSION_PATCH@ +#define LGL_LAYER_NAME "@LGL_LAYER_NAME_STR@" +#define LGL_LAYER_DESC "@LGL_LAYER_DESC_STR@" + +#define LGL_LAYER_CONFIG "@LGL_LAYER_NAME_STR@.json" diff --git a/layer_gpu_support/README_LAYER.md b/layer_gpu_support/README_LAYER.md index ccf22ca..4d60ead 100644 --- a/layer_gpu_support/README_LAYER.md +++ b/layer_gpu_support/README_LAYER.md @@ -43,10 +43,10 @@ sections in the [Build documentation](../docs/building.md). ### Running using the layer -You can perform support experiments by using the Android helper utility found -in the root directory to configure the layer and manage the application. You -must enable the support layer, and provide a configuration file to parameterize -it. +You can configure a device to run support experiments by using the Android +helper utility found in the root directory to configure the layer and manage +the application. You must enable the support layer, and provide a configuration +file to parameterize it. 
```sh python3 lgl_android_install.py --layer layer_gpu_support --config diff --git a/layer_gpu_support/source/layer_config.cpp b/layer_gpu_support/source/layer_config.cpp index b150d02..a66c1c3 100644 --- a/layer_gpu_support/source/layer_config.cpp +++ b/layer_gpu_support/source/layer_config.cpp @@ -279,7 +279,6 @@ LayerConfig::LayerConfig() LAYER_ERR("Error: %s", e.what()); } - try { parse_serialization_options(data); diff --git a/layer_gpu_timeline/source/layer_device_functions_debug.cpp b/layer_gpu_timeline/source/layer_device_functions_debug.cpp index 7795598..37b67e2 100644 --- a/layer_gpu_timeline/source/layer_device_functions_debug.cpp +++ b/layer_gpu_timeline/source/layer_device_functions_debug.cpp @@ -1,7 +1,7 @@ /* * SPDX-License-Identifier: MIT * ---------------------------------------------------------------------------- - * Copyright (c) 2024 Arm Limited + * Copyright (c) 2024-2025 Arm Limited * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to @@ -44,7 +44,7 @@ VKAPI_ATTR void VKAPI_CALL layer_vkCmdDebugMarkerBeginEXT(VkCommandBuf auto& tracker = layer->getStateTracker(); auto& cb = tracker.getCommandBuffer(commandBuffer); - // Increment the render pass counter in the tracker + // Push the label scope to the tracker cb.debugMarkerBegin(pMarkerInfo->pMarkerName); // Note that we do not call the driver for user labels - they are @@ -65,7 +65,7 @@ VKAPI_ATTR void VKAPI_CALL layer_vkCmdDebugMarkerEndEXT(VkCommandBuffe auto& tracker = layer->getStateTracker(); auto& cb = tracker.getCommandBuffer(commandBuffer); - // Increment the render pass counter in the tracker + // Pop the label scope in the tracker cb.debugMarkerEnd(); // Note that we do not call the driver for user labels - they are @@ -87,7 +87,7 @@ VKAPI_ATTR void VKAPI_CALL layer_vkCmdBeginDebugUtilsLabelEXT(VkComman auto& tracker = layer->getStateTracker(); auto& cb = 
tracker.getCommandBuffer(commandBuffer); - // Increment the render pass counter in the tracker + // Push the label scope to the tracker cb.debugMarkerBegin(pLabelInfo->pLabelName); // Note that we do not call the driver for user labels - they are @@ -108,7 +108,7 @@ VKAPI_ATTR void VKAPI_CALL layer_vkCmdEndDebugUtilsLabelEXT(VkCommandB auto& tracker = layer->getStateTracker(); auto& cb = tracker.getCommandBuffer(commandBuffer); - // Increment the render pass counter in the tracker + // Pop the label scope in the tracker cb.debugMarkerEnd(); // Note that we do not call the driver for user labels - they are diff --git a/layer_gpu_timeline/source/timeline_comms.hpp b/layer_gpu_timeline/source/timeline_comms.hpp index 15bc518..fecd9eb 100644 --- a/layer_gpu_timeline/source/timeline_comms.hpp +++ b/layer_gpu_timeline/source/timeline_comms.hpp @@ -1,7 +1,7 @@ /* * SPDX-License-Identifier: MIT * ---------------------------------------------------------------------------- - * Copyright (c) 2024 Arm Limited + * Copyright (c) 2024-2025 Arm Limited * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to @@ -33,11 +33,6 @@ /** * @brief A simple message encoder for the timeline comms endpoint. - * - * TODO: This is currently a very simple implementation because we are simply - * passing JSON strings around. This is not the most efficient way of doing - * this and in future this module will be used to implement binary encoders - * for each specific message type that needs sending. 
*/ class TimelineComms { diff --git a/layer_gpu_timeline/source/timeline_protobuf_encoder.cpp b/layer_gpu_timeline/source/timeline_protobuf_encoder.cpp index 2d5bdfd..bd8bf76 100644 --- a/layer_gpu_timeline/source/timeline_protobuf_encoder.cpp +++ b/layer_gpu_timeline/source/timeline_protobuf_encoder.cpp @@ -87,7 +87,7 @@ using Submit = pp::message< /* The VkQueue the frame belongs to */ pp::uint64_field<"queue", 3>>; -/* Enumerates the possible attachment types a renderpass can have */ +/* Enumerates the possible attachment types a render pass can have */ enum class RenderpassAttachmentType { undefined = 0, @@ -96,7 +96,7 @@ enum class RenderpassAttachmentType stencil = 3, }; -/* Describe an attachment to a renderpass */ +/* Describe an attachment to a render pass */ using RenderpassAttachment = pp::message< /* The attachment type */ pp::enum_field<"type", 1, RenderpassAttachmentType>, @@ -112,29 +112,29 @@ using RenderpassAttachment = pp::message< things are not resolved, so this saves a field in the data) */ pp::bool_field<"resolved", 5>>; -/* Start a new renderpass */ +/* Start a new render pass */ using BeginRenderpass = pp::message< - /* The unique identifier for this new renderpass */ + /* The unique identifier for this new render pass */ pp::uint64_field<"tag_id", 1>, - /* The dimensions of the renderpass' attachments */ + /* The dimensions of the render pass' attachments */ pp::uint32_field<"width", 2>, pp::uint32_field<"height", 3>, - /* The number of drawcalls in the renderpass */ + /* The number of drawcalls in the render pass */ pp::uint32_field<"draw_call_count", 4>, /* The subpass count */ pp::uint32_field<"subpass_count", 5>, - /* Any user defined debug labels associated with the renderpass */ + /* Any user defined debug labels associated with the render pass */ pp::string_field<"debug_label", 6, pp::repeated>, - /* Any attachments associated with the renderpass */ + /* Any attachments associated with the render pass */ 
pp::message_field<"attachments", 7, RenderpassAttachment, pp::repeated>>; -/* Continue a split renderpass */ +/* Continue a split render pass */ using ContinueRenderpass = pp::message< - /* The unique identifier for the renderpass that is being continued */ + /* The unique identifier for the render pass that is being continued */ pp::uint64_field<"tag_id", 1>, - /* The number of drawcalls to add to the total in the renderpass */ + /* The number of drawcalls to add to the total in the render pass */ pp::uint32_field<"draw_call_count", 2>, - /* Any user defined debug labels to add to the renderpass */ + /* Any user defined debug labels to add to the render pass */ pp::string_field<"debug_label", 3, pp::repeated>>; /* A dispatch object submission */ @@ -295,7 +295,7 @@ Comms::MessageData packBuffer(pp::constant c, T&& f) * @return A pair, where the first value is the corresponding attachment type, and the second value is * the corresponding attachment index (or nullopt in the case the index is not relevant). */ -constexpr std::pair> mapRenderpassAttachmentName( +constexpr std::pair> mapRenderPassAttachmentName( Tracker::RenderPassAttachName name) { switch (name) @@ -443,10 +443,10 @@ constexpr AccelerationStructureTransferType mapASTransferType(Tracker::LCSAccele /** * @brief Serialize the metadata for this render pass workload. * - * @param renderpass The render pass to serialize + * @param renderPass The render pass to serialize * @param debugLabel The debug label stack of the VkQueue at submit time. 
*/ -Comms::MessageData serialize(const Tracker::LCSRenderPass& renderpass, const std::vector& debugLabel) +Comms::MessageData serialize(const Tracker::LCSRenderPass& renderPass, const std::vector& debugLabel) { using namespace pp; @@ -454,18 +454,18 @@ Comms::MessageData serialize(const Tracker::LCSRenderPass& renderpass, const std // associated with a single tagID if restartable across command buffer // boundaries because different command buffer submit combinations can // result in different draw counts for the same starting tagID. - const auto drawCount = (!renderpass.isOneTimeSubmit() && renderpass.isSuspending() + const auto drawCount = (!renderPass.isOneTimeSubmit() && renderPass.isSuspending() ? -1 - : static_cast(renderpass.getDrawCallCount())); + : static_cast(renderPass.getDrawCallCount())); // Make the attachments array - const auto& attachments = renderpass.getAttachments(); + const auto& attachments = renderPass.getAttachments(); std::vector attachmentsMsg {}; attachmentsMsg.reserve(attachments.size()); for (const auto& attachment : attachments) { - const auto [type, index] = mapRenderpassAttachmentName(attachment.getAttachmentName()); + const auto [type, index] = mapRenderPassAttachmentName(attachment.getAttachmentName()); attachmentsMsg.emplace_back(type, index, @@ -479,11 +479,11 @@ Comms::MessageData serialize(const Tracker::LCSRenderPass& renderpass, const std return packBuffer("renderpass"_f, BeginRenderpass { - renderpass.getTagID(), - renderpass.getWidth(), - renderpass.getHeight(), + renderPass.getTagID(), + renderPass.getWidth(), + renderPass.getHeight(), drawCount, - renderpass.getSubpassCount(), + renderPass.getSubpassCount(), debugLabel, std::move(attachmentsMsg), }); @@ -492,7 +492,7 @@ Comms::MessageData serialize(const Tracker::LCSRenderPass& renderpass, const std /** * @brief Serialize the metadata for this render pass continuation workload. 
* - * @param continuation The renderpass continuation to serialize + * @param continuation The render pass continuation to serialize * @param tagIDContinuation The ID of the workload if this is a continuation of it. */ Comms::MessageData serialize(const Tracker::LCSRenderPassContinuation& continuation, uint64_t tagIDContinuation) @@ -681,19 +681,19 @@ void TimelineProtobufEncoder::emitSubmit(VkQueue queue, uint64_t timestamp) })); } -void TimelineProtobufEncoder::operator()(const Tracker::LCSRenderPass& renderpass, +void TimelineProtobufEncoder::operator()(const Tracker::LCSRenderPass& renderPass, const std::vector& debugStack) { - device.txMessage(serialize(renderpass, debugStack)); + device.txMessage(serialize(renderPass, debugStack)); } void TimelineProtobufEncoder::operator()(const Tracker::LCSRenderPassContinuation& continuation, const std::vector& debugStack, - uint64_t renderpassTagID) + uint64_t renderPassTagID) { UNUSED(debugStack); - device.txMessage(serialize(continuation, renderpassTagID)); + device.txMessage(serialize(continuation, renderPassTagID)); } void TimelineProtobufEncoder::operator()(const Tracker::LCSDispatch& dispatch, diff --git a/layer_gpu_timeline/source/timeline_protobuf_encoder.hpp b/layer_gpu_timeline/source/timeline_protobuf_encoder.hpp index 34e6e4b..9b5b3c3 100644 --- a/layer_gpu_timeline/source/timeline_protobuf_encoder.hpp +++ b/layer_gpu_timeline/source/timeline_protobuf_encoder.hpp @@ -90,7 +90,7 @@ class TimelineProtobufEncoder : public Tracker::SubmitCommandWorkloadVisitor static void emitFrame(Device& device, uint64_t frameNumber, uint64_t timestamp); /** - * Construct a new workload metadata emitter that will output paylaods for the provided device + * Construct a new workload metadata emitter that will output payloads for the provided device * * @param _device The device object that the payloads are produced for, and to which they are passed for * transmission @@ -100,17 +100,17 @@ class TimelineProtobufEncoder : public 
Tracker::SubmitCommandWorkloadVisitor { } - // visitor should not be copied or moved from + // Visitor should not be copied or moved from TimelineProtobufEncoder(const TimelineProtobufEncoder&) = delete; TimelineProtobufEncoder(TimelineProtobufEncoder&&) noexcept = delete; TimelineProtobufEncoder& operator=(const TimelineProtobufEncoder&) = delete; TimelineProtobufEncoder& operator=(TimelineProtobufEncoder&&) noexcept = delete; - // methods from the visitor interface - void operator()(const Tracker::LCSRenderPass& renderpass, const std::vector& debugStack) override; + // Methods from the visitor interface + void operator()(const Tracker::LCSRenderPass& renderPass, const std::vector& debugStack) override; void operator()(const Tracker::LCSRenderPassContinuation& continuation, const std::vector& debugStack, - uint64_t renderpassTagID) override; + uint64_t renderPassTagID) override; void operator()(const Tracker::LCSDispatch& dispatch, const std::vector& debugStack) override; void operator()(const Tracker::LCSTraceRays& traceRays, const std::vector& debugStack) override; void operator()(const Tracker::LCSImageTransfer& imageTransfer, diff --git a/lgl_android_install.py b/lgl_android_install.py index c4c9738..8e6cc9c 100755 --- a/lgl_android_install.py +++ b/lgl_android_install.py @@ -135,6 +135,7 @@ from lglpy.android.filesystem import AndroidFilesystem from lglpy.comms import server from lglpy.comms import service_gpu_timeline +from lglpy.comms import service_gpu_profile from lglpy.ui import console # Android 9 is the minimum version supported for our method of enabling layers @@ -592,7 +593,9 @@ def cleanup(child_process): print('WARNING: Cannot enable logcat recording') -def configure_server(conn: ADBConnect, output_path: str) -> None: +def configure_server(conn: ADBConnect, + timeline_file: Optional[str], + profile_dir: Optional[str]) -> None: ''' Configure the remote server to collect data. 
@@ -601,13 +604,20 @@ def configure_server(conn: ADBConnect, output_path: str) -> None: ''' Configure the remote server to collect data. Args: conn: The adb connection. - output_path: The desired output file path. + timeline_file: The desired output file path for timeline. + profile_dir: The desired output directory path for profile data. Existing + files in the directory may be overwritten. ''' # Create a server instance instance = server.CommsServer(0) - service = service_gpu_timeline.GPUTimelineService(output_path) - instance.register_endpoint(service) + if timeline_file: + serviceTL = service_gpu_timeline.GPUTimelineService(timeline_file) + instance.register_endpoint(serviceTL) + + if profile_dir: + serviceProf = service_gpu_profile.GPUProfileService(profile_dir) + instance.register_endpoint(serviceProf) # Start it running thread = threading.Thread(target=instance.run, daemon=True) @@ -784,6 +794,10 @@ def parse_cli() -> Optional[argparse.Namespace]: '--timeline-perfetto', type=str, default=None, help='save Timeline Perfetto trace to this file') + parser.add_argument( + '--profile', type=str, default=None, + help='save Profile data to this directory') + args = parser.parse_args() # Validate arguments @@ -886,9 +900,11 @@ def main() -> int: print(f' - {layer.name}') print() - # Enable Timeline - if args.timeline_metadata: - configure_server(conn, args.timeline_metadata) + # Enable communications server + if args.timeline_metadata or args.profile: + configure_server(conn, + args.timeline_metadata, + args.profile) # Enable logcat if args.logcat: @@ -918,7 +934,7 @@ def main() -> int: print(f'{message:<45}') else: - input('Press any key when finished to uninstall all layers') + input('Press any key when finished to uninstall all layers\n\n') print('\nUninstalling all layers') diff --git a/lglpy/comms/service_gpu_profile.py b/lglpy/comms/service_gpu_profile.py new file mode 100644 index 0000000..7c30e5e --- /dev/null +++ b/lglpy/comms/service_gpu_profile.py @@ -0,0 +1,197 @@ +# SPDX-License-Identifier: MIT +# 
----------------------------------------------------------------------------- +# Copyright (c) 2024-2025 Arm Limited +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the 'Software'), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ----------------------------------------------------------------------------- + +''' +This module implements the server-side communications module service that +handles record preprocessing and serializing the resulting data from the +GPU Profile layer. +''' + +import csv +import json +import os +from typing import Any, Optional, TypedDict, Union + +from lglpy.comms.server import Message + + +class StartFrameMessage(TypedDict): + ''' + Type information for a start_frame JSON message. + ''' + type: str + frame: int + + +class EndFrameMessage(TypedDict): + ''' + Type information for an end_frame JSON message. + ''' + type: str + + +class WorkloadMessage(TypedDict): + ''' + Type information for any workload JSON message. 
+ ''' + type: str + counters: list[dict[str, Union[int, float]]] + labels: list[str] + + +class GPUProfileService: + ''' + A service for handling network comms from the layer_gpu_profile layer. + ''' + + def __init__(self, dir_path: str, verbose: bool = False): + ''' + Initialize the profile service. + + Args: + dir_path: Directory to write on the filesystem + verbose: Should this use verbose logging? + ''' + self.base_dir = dir_path + + self.frame_id: Optional[int] = None + self.frame_header: Optional[list[str]] = None + self.frame_data: Optional[list[list[str]]] = None + + os.makedirs(dir_path, exist_ok=True) + + def get_service_name(self) -> str: + ''' + Get the service endpoint name. + + Returns: + The endpoint name. + ''' + return 'GPUProfile' + + def handle_start_frame(self, message: StartFrameMessage): + ''' + Handle a start_frame message. + + Args: + message: The decoded JSON. + ''' + self.frame_id = message["frame"] + self.frame_header = None + self.frame_data = [] + + def handle_end_frame(self, message: EndFrameMessage): + ''' + Handle an end_frame message. + + Args: + message: The decoded JSON. + ''' + # Message contains nothing we need + del message + + assert self.frame_id is not None + assert self.frame_header is not None + assert self.frame_data is not None + + # Emit the CSV file + print(f'Generating CSV for frame {self.frame_id}') + path = os.path.join(self.base_dir, f'frame_{self.frame_id:05d}.csv') + with open(path, 'w', newline='') as handle: + writer = csv.writer(handle) + writer.writerow(self.frame_header) + writer.writerows(self.frame_data) + + # Reset the state + self.frame_id = None + self.frame_header = None + self.frame_data = None + + def create_workload_header(self, message: WorkloadMessage): + ''' + Create a table header row from a workload. + + Args: + message: The decoded JSON. 
+ ''' + columns = [] + + columns.append('Index') + columns.append('Workload type') + for counter in message['counters']: + key = list(counter.keys())[0] + columns.append(key) + columns.append('Label') + + self.frame_header = columns + + def create_workload_data(self, message: WorkloadMessage): + ''' + Create a table data row from a workload. + + Args: + message: The decoded JSON. + ''' + assert self.frame_id is not None + assert self.frame_header is not None + assert self.frame_data is not None + + columns: list[str] = [] + + columns.append(str(len(self.frame_data))) + columns.append(message['type']) + + for counter in message['counters']: + value = list(counter.values())[0] + columns.append(f'{value:0.2f}') + columns.append('|'.join(message['labels'])) + + self.frame_data.append(columns) + + def handle_workload(self, message: WorkloadMessage): + ''' + Handle a workload message. + + Args: + message: The decoded JSON. + ''' + if not self.frame_header: + self.create_workload_header(message) + + self.create_workload_data(message) + + def handle_message(self, message: Message) -> None: + ''' + Handle a service request from a layer. + + Note that this service only expects pushed TX or TX_ASYNC messages, so + never provides a response. 
+ ''' + encoded_payload = message.payload.decode('utf-8') + payload = json.loads(encoded_payload) + + if payload['type'] == 'start_frame': + self.handle_start_frame(payload) + elif payload['type'] == 'end_frame': + self.handle_end_frame(payload) + else: + self.handle_workload(payload) diff --git a/source_common/comms/comms_interface.hpp b/source_common/comms/comms_interface.hpp index 20a237b..d21776e 100644 --- a/source_common/comms/comms_interface.hpp +++ b/source_common/comms/comms_interface.hpp @@ -1,7 +1,7 @@ /* * SPDX-License-Identifier: MIT * ---------------------------------------------------------------------------- - * Copyright (c) 2024 Arm Limited + * Copyright (c) 2024-2025 Arm Limited * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to @@ -63,7 +63,7 @@ static const EndpointID NO_ENDPOINT {0}; class CommsInterface { public: - virtual ~CommsInterface() { } + virtual ~CommsInterface() = default; /** * @brief Is this comms module connected to a host server? 
diff --git a/source_common/trackers/queue.cpp b/source_common/trackers/queue.cpp index 022eac9..763dded 100644 --- a/source_common/trackers/queue.cpp +++ b/source_common/trackers/queue.cpp @@ -80,7 +80,7 @@ namespace } /** - * @brief Visit a renderpass workload instruction + * @brief Visit a render pass workload instruction * * @param instruction The workload instruction */ @@ -99,7 +99,7 @@ namespace } /** - * @brief Visit a renderpass continuation workload instruction + * @brief Visit a render pass continuation workload instruction * * @param instruction The workload instruction */ diff --git a/source_common/trackers/queue.hpp b/source_common/trackers/queue.hpp index 2116964..6d8bed3 100644 --- a/source_common/trackers/queue.hpp +++ b/source_common/trackers/queue.hpp @@ -62,23 +62,23 @@ class SubmitCommandWorkloadVisitor virtual ~SubmitCommandWorkloadVisitor() noexcept = default; /** - * @brief Visit a renderpass workload object + * @brief Visit a render pass workload object * - * @param renderpass The renderpass - * @param debugStack The stack of debug labels that are associated with this renderpass + * @param renderPass The render pass + * @param debugStack The stack of debug labels that are associated with this render pass */ - virtual void operator()(const LCSRenderPass& renderpass, const std::vector& debugStack) = 0; + virtual void operator()(const LCSRenderPass& renderPass, const std::vector& debugStack) = 0; /** - * @brief Visit a renderpass continuation workload object + * @brief Visit a render pass continuation workload object * - * @param continuation The renderpass continuation - * @param debugStack The stack of debug labels that are associated with this renderpass - * @param renderpassTagID The renderpass tag that the continuation was associated with + * @param continuation The render pass continuation + * @param debugStack The stack of debug labels that are associated with this render pass + * @param renderPassTagID The render pass tag that the 
continuation was associated with */ virtual void operator()(const LCSRenderPassContinuation& continuation, const std::vector& debugStack, - uint64_t renderpassTagID) = 0; + uint64_t renderPassTagID) = 0; /** * @brief Visit a dispatch workload object diff --git a/source_common/utils/queue.hpp b/source_common/utils/queue.hpp index 4434837..373f68d 100644 --- a/source_common/utils/queue.hpp +++ b/source_common/utils/queue.hpp @@ -1,7 +1,7 @@ /* * SPDX-License-Identifier: MIT * ---------------------------------------------------------------------------- - * Copyright (c) 2024 Arm Limited + * Copyright (c) 2024-2025 Arm Limited * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to @@ -44,12 +44,13 @@ class Task /** * @brief Destroy the task. */ - virtual ~Task() { } + virtual ~Task() = default; /** * @brief Wait for the task to be complete. */ - void wait() { + void wait() + { std::unique_lock lock(condition_lock); complete_condition.wait(lock, [this]{ return complete.load(); }); } @@ -57,7 +58,8 @@ class Task /** * @brief Notify that the task is complete. */ - void notify() { + void notify() + { std::unique_lock lock(condition_lock); complete = true; lock.unlock(); diff --git a/source_third_party/libGPUCounters b/source_third_party/libGPUCounters new file mode 160000 index 0000000..f60cfa8 --- /dev/null +++ b/source_third_party/libGPUCounters @@ -0,0 +1 @@ +Subproject commit f60cfa830c85ffff09c70318e573a4672ab590c9