From 8aaaf4bcf8ddff4954629605ee9a4c666e1ff1f4 Mon Sep 17 00:00:00 2001
From: Rob Elliott
Date: Wed, 25 Jun 2025 13:35:48 +0000
Subject: [PATCH 1/2] Arm backend: Introduce support for a VGF runtime backend.

This is a first version of a VGF runtime with support for simple VGF
files containing inputs and outputs (no weights). It prepares the
appropriate Vulkan structures and dispatches the workload through the
normal backend delegate interfaces.

It is intended to be extended to take advantage of the existing Vulkan
delegate by replacing the basic object creation, and by re-using the
VgfRepr in the appropriate way either in a "direct" Arm backend for
testing and simple deployment, or integrated with the Vulkan backend to
get good memory, sync and performance interop with existing Vulkan
delegate operators.

It re-uses the build setup (headers, volk, etc.) and the
vulkan_executor_runner, and has been tested on Linux only, with the
simple S32 add kernel from the aot_arm_compiler and both a quantized
and a non-quantized mv2.

It depends on a number of components which are not yet released, and
the script for these is not included, as our third-party dependencies
are still evolving.

Details:
* Minor build fix for the Vulkan runtime.
* Bump Vulkan and volk headers to get tensor and graph extensions
* First version of VGFBackend, dispatching on a Vulkan layer driver
* Will process the examples/models mv2 model and constants

Signed-off-by: Rob Elliott
Change-Id: I1f278cb98872ae8c0675c72995f0249c038d07d8
---
 CMakeLists.txt                              |   4 +
 backends/arm/CMakeLists.txt                 |  52 +-
 backends/arm/runtime/VGFBackend.cpp         | 361 ++++++++
 backends/arm/runtime/VGFSetup.cpp           | 780 ++++++++++++++++++
 backends/arm/runtime/VGFSetup.h             | 119 +++
 backends/vulkan/CMakeLists.txt              |   8 +
 .../vulkan/runtime/graph/containers/Types.h |   2 +
 backends/vulkan/third-party/Vulkan-Headers  |   2 +-
 backends/vulkan/third-party/volk            |   2 +-
 tools/cmake/preset/default.cmake            |   3 +
 10 files changed, 1329 insertions(+), 4 deletions(-)
 create mode 100644 backends/arm/runtime/VGFBackend.cpp
 create mode 100644 backends/arm/runtime/VGFSetup.cpp
 create mode 100644 backends/arm/runtime/VGFSetup.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 95e33fb109e..e83d8ea11b5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -817,6 +817,10 @@ endif()
 if(EXECUTORCH_BUILD_VULKAN)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/vulkan)
 endif()
+if(EXECUTORCH_BUILD_VGF)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/arm)
+endif()
+
 if(EXECUTORCH_BUILD_ANDROID_JNI)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/android)
 
diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt
index b5e76e778a5..bf313232b66 100644
--- a/backends/arm/CMakeLists.txt
+++ b/backends/arm/CMakeLists.txt
@@ -12,13 +12,17 @@ if(NOT EXECUTORCH_ROOT)
   set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
 endif()
 
-add_compile_options("-Wall" "-Werror")
-
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 
 set(_common_include_directories ${EXECUTORCH_ROOT}/..
                                 ${EXECUTORCH_ROOT}/runtime/core/portable_type/c10)
 add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS)
+
+# bare metal backend builds
+if(EXECUTORCH_BUILD_ARM_BAREMETAL)
+
+add_compile_options("-Wall" "-Werror")
+
 # Third-party folder and Ethos-U driver inclued
 set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party")
 set(DRIVER_ETHOSU_INCLUDE_DIR "${THIRD_PARTY_ROOT}/ethos-u-core-driver/include")
@@ -36,3 +40,47 @@ target_include_directories(
 target_include_directories(
   executorch_delegate_ethos_u PUBLIC ${DRIVER_ETHOSU_INCLUDE_DIR}
 )
+
+# end config for bare metal builds
+endif()
+
+
+# VGF backend builds
+if(EXECUTORCH_BUILD_VGF)
+
+# include libvgf
+set(LIBVGF_PATH "${EXECUTORCH_ROOT}/examples/arm/ethos-u-scratch/ml-sdk-for-vulkan-manifest/sw/vgf-lib/")
+
+set(VULKAN_THIRD_PARTY_PATH ${EXECUTORCH_ROOT}/backends/vulkan/third-party)
+set(VULKAN_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/Vulkan-Headers/include)
+set(VOLK_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/volk)
+
+set(LIBVGF_STATIC "${LIBVGF_PATH}/build/src/libvgf.a")
+set(LIBVGF_INCLUDE "${LIBVGF_PATH}/include/")
+
+add_library(vgf STATIC IMPORTED)
+set_property(TARGET vgf PROPERTY IMPORTED_LOCATION "${LIBVGF_STATIC}")
+target_include_directories(vgf INTERFACE "${LIBVGF_INCLUDE}")
+
+# Add backend delegate for VGF
+set(_vgf_backend_sources backends/arm/runtime/VGFBackend.cpp
+                         backends/arm/runtime/VGFSetup.cpp
+)
+
+# vgf backend
+list(TRANSFORM _vgf_backend_sources PREPEND "${EXECUTORCH_ROOT}/")
+add_library(vgf_backend ${_vgf_backend_sources})
+target_include_directories(
+  vgf_backend PUBLIC
+  ${_common_include_directories}
+  ${VULKAN_HEADERS_PATH}
+  ${VOLK_HEADERS_PATH}
+)
+target_compile_options(vgf_backend PRIVATE -DUSE_VULKAN_WRAPPER -DUSE_VULKAN_VOLK)
+
+
+target_link_libraries(vgf_backend PRIVATE executorch_core)
+target_link_libraries(vgf_backend PRIVATE vgf)
+target_link_options_shared_lib(vgf_backend)
+
+# end config for VGF builds
+endif()
diff --git a/backends/arm/runtime/VGFBackend.cpp b/backends/arm/runtime/VGFBackend.cpp
new file mode 100644
index 00000000000..ea4f4286eb9
--- /dev/null
+++ b/backends/arm/runtime/VGFBackend.cpp
@@ -0,0 +1,361 @@
+/*
+ * Copyright 2025 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cstring>
+#include <numeric>
+using namespace std;
+
+#include <executorch/runtime/backend/interface.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/evalue.h>
+
+using executorch::aten::Tensor;
+using executorch::runtime::ArrayRef;
+using executorch::runtime::Backend;
+using executorch::runtime::BackendExecutionContext;
+using executorch::runtime::BackendInitContext;
+using executorch::runtime::CompileSpec;
+using executorch::runtime::DelegateHandle;
+using executorch::runtime::Error;
+using executorch::runtime::EValue;
+using executorch::runtime::FreeableBuffer;
+using executorch::runtime::MemoryAllocator;
+using executorch::runtime::Result;
+
+// We use the platform and runtime environment provided by the Vulkan delegate
+#include <executorch/backends/vulkan/runtime/vk_api/vk_api.h>
+
+// Dependencies for processing VGF files into Vulkan calls
+#include <vgf/decoder.hpp>
+#include <vgf/vulkan_helpers.generated.hpp>
+
+#include <executorch/backends/arm/runtime/VGFSetup.h>
+
+namespace executorch {
+namespace backends {
+namespace vgf {
+
+/*
+ * Simple function to populate function pointers for the relevant Tensor
+ * and DataGraph extension APIs.
+ */
+VkResult vkml_load_extensions(VkDevice const* device) {
+  // Note: we no longer fetch entry points manually via, e.g.,
+  // (PFN_vkCreateTensorARM)vkGetDeviceProcAddr(*device, "vkCreateTensorARM");
+  // we just verify that the function pointers were populated by the loader.
+  if (vkCreateTensorARM && vkDestroyTensorARM && vkCreateTensorViewARM &&
+      vkDestroyTensorViewARM && vkGetTensorMemoryRequirementsARM &&
+      vkBindTensorMemoryARM && vkCreateDataGraphPipelinesARM &&
+      vkCmdDispatchDataGraphARM && vkCreateDataGraphPipelineSessionARM) {
+    ET_LOG(Info, "VKML Extensions loaded");
+    return VK_SUCCESS;
+  }
+  ET_LOG(Error, "Failed to load VKML extensions");
+  return VK_ERROR_UNKNOWN;
+}
+
+/*
+ * Fetch vulkan basic objects - intended to be replaced with a shared
+ * device setup with the Vulkan backend.
+ */
+VkResult vkml_allocate_basics(
+    VkInstance* instance,
+    VkPhysicalDevice* physical_device,
+    VkDevice* device,
+    VkQueue* queue,
+    VkCommandPool* command_pool);
+
+void vkml_free_basics(
+    VkInstance* instance,
+    VkDevice* device,
+    VkCommandPool* command_pool) {
+  vkDestroyCommandPool(*device, *command_pool, nullptr);
+  // Note: These primitives are used by the emulation layer for vulkan
+  // object allocation; the vulkan objects are freed in library
+  // shutdown, so we can't yet destroy these here without causing
+  // a crash there.
+  // vkDestroyDevice(*device, nullptr);
+  // vkDestroyInstance(*instance, nullptr);
+}
+
+class VGFBackend final : public ::executorch::runtime::BackendInterface {
+ public:
+  VGFBackend() {
+    VkResult result;
+
+    // Fetch basic vulkan objects once
+    result = vkml_allocate_basics(
+        &vk_instance,
+        &vk_physical_device,
+        &vk_device,
+        &vk_queue,
+        &vk_command_pool);
+    if (result != VK_SUCCESS) {
+      ET_LOG(
+          Error, "Failed to initialize the Vulkan device, error 0x%08X", result);
+      return;
+    }
+
+    // Query the device to ensure it has needed extensions
+    result = vkml_load_extensions(&vk_device);
+    if (result != VK_SUCCESS) {
+      ET_LOG(
+          Error,
+          "Failed to verify VKML extensions needed, error 0x%08X",
+          result);
+      return;
+    }
+  }
+  ~VGFBackend() {
+    vkml_free_basics(&vk_instance, &vk_device, &vk_command_pool);
+  }
+
+  bool is_available() const override {
+    VkResult result;
+
+    ET_LOG(Info, "Checking VGFBackend is available");
+    // Query the device prepared in constructor for needed extensions
+    result = vkml_load_extensions(&vk_device);
+    if (result != VK_SUCCESS)
+      return false;
+
+    return true;
+  }
+
+  Result<DelegateHandle*> init(
+      BackendInitContext& context,
+      FreeableBuffer* processed,
+      ArrayRef<CompileSpec> compile_specs) const override {
+    ET_LOG(Info, "Entered VGF init");
+
+    const char* vgf_data = reinterpret_cast<const char*>(processed->data());
+
+    MemoryAllocator* allocator = context.get_runtime_allocator();
+    VgfRepr* repr = allocator->allocateInstance<VgfRepr>();
+    new (repr) VgfRepr(
+        vk_instance, vk_physical_device, vk_device, vk_queue, vk_command_pool);
+
+    auto valid_vgf = repr->process_vgf(vgf_data, compile_specs);
+    if (!valid_vgf) {
+      ET_LOG(Error, "Failed to process VGF blob.");
+      return Error::Internal;
+    }
+
+    return repr;
+  }
+
+  Error execute(
+      ET_UNUSED BackendExecutionContext& context,
+      DelegateHandle* handle,
+      EValue** args) const override {
+    VgfRepr* repr = static_cast<VgfRepr*>(handle);
+
+    // Copy all inputs from EValue to VkDeviceMemory
+    for (int i = 0; i < repr->IOs.size(); i++) {
+      if (!args[i]->isTensor()) {
+        ET_LOG(
+            Error,
+            "Expected EValue %d to be tensor, got %d",
+            i,
+            static_cast<int>(args[i]->tag));
+        return Error::InvalidArgument;
+      }
+
+      Tensor* tensor = &args[i]->toTensor();
+      IO* io = &repr->IOs[i];
+
+      // skip non-inputs
+      if (!io->is_input)
+        continue;
+
+      size_t io_size = accumulate(
+          io->size.begin(), io->size.end(), io->elt_size, std::multiplies<>());
+
+      void* data;
+      if (!repr->map_io(io, &data)) {
+        ET_LOG(Error, "Failed to map Vulkan IO memory");
+        return Error::Internal;
+      }
+      memcpy(data, tensor->mutable_data_ptr(), io_size);
+      repr->unmap_io(io);
+    }
+
+    // Execute the workload
+    if (!repr->execute_vgf()) {
+      ET_LOG(Error, "Failed to execute the VGF representation");
+      return Error::Internal;
+    }
+
+    // Copy all outputs from VkDeviceMemory to EValue
+    for (int i = 0; i < repr->IOs.size(); i++) {
+      if (!args[i]->isTensor()) {
+        ET_LOG(
+            Error,
+            "Expected EValue %d to be tensor, got %d",
+            i,
+            static_cast<int>(args[i]->tag));
+        return Error::InvalidArgument;
+      }
+      Tensor* tensor = &args[i]->toTensor();
+      IO* io = &repr->IOs[i];
+
+      // skip non-outputs
+      if (io->is_input)
+        continue;
+
+      size_t io_size = accumulate(
+          io->size.begin(), io->size.end(), io->elt_size, std::multiplies<>());
+
+      void* data;
+      if (!repr->map_io(io, &data)) {
+        ET_LOG(Error, "Failed to map Vulkan IO memory");
+        return Error::Internal;
+      }
+      memcpy(tensor->mutable_data_ptr(), data, io_size);
+      repr->unmap_io(io);
+    }
+
+    return Error::Ok;
+  }
+
+  void destroy(DelegateHandle* handle) const override {
+    VgfRepr* repr = static_cast<VgfRepr*>(handle);
+    repr->~VgfRepr();
+  }
+
+ private:
+  VkInstance vk_instance;
+  VkPhysicalDevice vk_physical_device;
+  VkDevice vk_device;
+  VkQueue vk_queue;
+  VkCommandPool vk_command_pool;
+};
+
+namespace {
+auto cls = VGFBackend();
+Backend backend{"VgfBackend", &cls};
+static auto success_with_compiler = register_backend(backend);
+} // namespace
+
+VkResult vkml_allocate_basics(
+    VkInstance* instance,
+    VkPhysicalDevice* physical_device,
+    VkDevice* device,
+    VkQueue* queue,
+    VkCommandPool* command_pool) {
+  const char* dev_exts[] = {"VK_ARM_tensors", "VK_ARM_data_graph"};
+  VkResult result;
+
+  if (VK_SUCCESS != volkInitialize()) {
+    ET_LOG(Error, "Volk failed to initialize");
+  }
+
+  VkApplicationInfo app_info{
+      .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
+      .pNext = nullptr,
+      .pApplicationName = "VGF",
+      .applicationVersion = 0,
+      .pEngineName = "executorch",
+      .engineVersion = 0,
+      .apiVersion = VK_API_VERSION_1_3,
+  };
+  VkInstanceCreateInfo instance_info{
+      .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
+      .pNext = nullptr,
+      .flags = 0,
+      .pApplicationInfo = &app_info,
+      .enabledLayerCount = 0,
+      .ppEnabledLayerNames = nullptr,
+      .enabledExtensionCount = 0,
+      .ppEnabledExtensionNames = nullptr};
+  result = vkCreateInstance(&instance_info, nullptr, instance);
+  if (result != VK_SUCCESS) {
+    ET_LOG(Error, "Failed to create VkInstance");
+    return result;
+  }
+  volkLoadInstance(*instance);
+
+  // Pick first GPU
+  uint32_t gpu_count = 0;
+  vkEnumeratePhysicalDevices(*instance, &gpu_count, nullptr);
+  if (gpu_count == 0) {
+    ET_LOG(Error, "Found no suitable devices");
+    return VK_ERROR_UNKNOWN;
+  }
+  vector<VkPhysicalDevice> gpus(gpu_count);
+  result = vkEnumeratePhysicalDevices(*instance, &gpu_count, gpus.data());
+  *physical_device = gpus[0];
+  if (result != VK_SUCCESS) {
+    ET_LOG(Error, "Failed to select physical device");
+    return result;
+  }
+
+  // Find suitable queue family
+  uint32_t qf_count;
+  vkGetPhysicalDeviceQueueFamilyProperties(
+      *physical_device, &qf_count, nullptr);
+  vector<VkQueueFamilyProperties> qps(qf_count);
+  vkGetPhysicalDeviceQueueFamilyProperties(
+      *physical_device, &qf_count, qps.data());
+  uint32_t qf = UINT32_MAX;
+  for (uint32_t i = 0; i < qf_count; ++i) {
+    if (qps[i].queueFlags &
+        (VK_QUEUE_COMPUTE_BIT | VK_QUEUE_DATA_GRAPH_BIT_ARM)) {
+      qf = i;
+      break;
+    }
+  }
+  if (qf == UINT32_MAX) {
+    ET_LOG(Error, "Failed to find suitable queue");
+    return VK_ERROR_UNKNOWN;
+  }
+
+  // Device with ML tensor extension
+  float qp = 1.0f;
+  VkDeviceQueueCreateInfo queue_info{
+      .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
+      .pNext = nullptr,
+      .flags = 0,
+      .queueFamilyIndex = qf,
+      .queueCount = 1,
+      .pQueuePriorities = &qp,
+  };
+
+  VkDeviceCreateInfo dci{VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, nullptr};
+  dci.queueCreateInfoCount = 1;
+  dci.pQueueCreateInfos = &queue_info;
+  dci.enabledExtensionCount = 2;
+  dci.ppEnabledExtensionNames = dev_exts;
+  result = vkCreateDevice(*physical_device, &dci, nullptr, device);
+  if (result != VK_SUCCESS) {
+    ET_LOG(Error, "Failed to create VkDevice");
+    return result;
+  }
+  // Load the device with volk and populate function pointers
+  volkLoadDevice(*device);
+
+  vkGetDeviceQueue(*device, qf, 0, queue);
+
+  VkCommandPoolCreateInfo poolInfo{
+      .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+      .pNext = nullptr,
+      .flags = 0,
+      .queueFamilyIndex = qf,
+  };
+  result = vkCreateCommandPool(*device, &poolInfo, nullptr, command_pool);
+  if (result != VK_SUCCESS) {
+    ET_LOG(Error, "Failed to create VkCommandPool");
+    return result;
+  }
+
+  return result;
+}
+
+} // namespace vgf
+} // namespace backends
+} // namespace executorch
diff --git a/backends/arm/runtime/VGFSetup.cpp b/backends/arm/runtime/VGFSetup.cpp
new file mode 100644
index 00000000000..18c9dbc9727
--- /dev/null
+++ b/backends/arm/runtime/VGFSetup.cpp
@@ -0,0 +1,780 @@
+/*
+ * Copyright 2025 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/*
+ * VGF functions which prepare a graph for execution by allocating the
+ * appropriate vulkan structures.
+ */
+
+#include <executorch/backends/arm/runtime/VGFSetup.h>
+
+#include <vgf/decoder.hpp>
+#include <vgf/vulkan_helpers.generated.hpp>
+
+using namespace mlsdk;
+
+namespace executorch {
+namespace backends {
+namespace vgf {
+
+/* static function to map format to byte count */
+static uint32_t get_format_size(VkFormat format);
+
+// Debug function to inspect memory properties
+static string memory_flags_to_string(VkMemoryPropertyFlags flags) {
+  if (flags == 0)
+    return "0";
+
+  vector<string> parts;
+#define TRY_FLAG(f)         \
+  if (flags & (f)) {        \
+    parts.emplace_back(#f); \
+    flags &= ~(f);          \
+  }
+
+  TRY_FLAG(VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT)
+  TRY_FLAG(VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)
+  TRY_FLAG(VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)
+  TRY_FLAG(VK_MEMORY_PROPERTY_HOST_CACHED_BIT)
+  TRY_FLAG(VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT)
+#ifdef VK_MEMORY_PROPERTY_PROTECTED_BIT
+  TRY_FLAG(VK_MEMORY_PROPERTY_PROTECTED_BIT)
+#endif
+#undef TRY_FLAG
+
+  if (flags) {
+    // any leftover bits we didn’t name
+    ostringstream hex;
+    hex << "0x" << std::hex << flags;
+    parts.emplace_back(hex.str());
+  }
+
+  ostringstream joined;
+  for (size_t i = 0; i < parts.size(); ++i) {
+    if (i)
+      joined << " | ";
+    joined << parts[i];
+  }
+  return joined.str();
+}
+
+/**
+ * Tensor free helper function
+ */
+void free_tensor(
+    VkDevice device,
+    VkTensorViewARM tensor_view,
+    VkTensorARM tensor,
+    VkDeviceMemory memory) {
+  vkDestroyTensorViewARM(device, tensor_view, nullptr);
+  vkDestroyTensorARM(device, tensor, nullptr);
+  vkFreeMemory(device, memory, nullptr);
+}
+
+/**
+ * Tensor allocation helper function
+ */
+VkResult allocate_tensor(
+    VkPhysicalDevice physical,
+    VkDevice device,
+    VkFormat format,
+    uint32_t shape_size,
+    const int64_t* shape,
+    uint32_t stride_size,
+    const int64_t* stride,
+    VkTensorDescriptionARM* description,
+    VkTensorViewARM* tensor_view,
+    VkTensorARM* tensor,
+    VkDeviceMemory* memory) {
+  VkResult result;
+
+  *description = VkTensorDescriptionARM{
+      .sType = VK_STRUCTURE_TYPE_TENSOR_DESCRIPTION_ARM,
+      .pNext = nullptr,
+      .tiling = VK_TENSOR_TILING_LINEAR_ARM,
+      .format = format,
+      .dimensionCount = shape_size,
+      .pDimensions = shape,
+      // Note: stride_data of 0's causes size==0, null means stride==size
+      .pStrides = (0 == stride_size ? nullptr : stride),
+      .usage = VK_TENSOR_USAGE_SHADER_BIT_ARM |
+          VK_TENSOR_USAGE_TRANSFER_SRC_BIT_ARM |
+          VK_TENSOR_USAGE_TRANSFER_DST_BIT_ARM |
+          VK_TENSOR_USAGE_DATA_GRAPH_BIT_ARM,
+  };
+  const VkTensorCreateInfoARM create_info = {
+      .sType = VK_STRUCTURE_TYPE_TENSOR_CREATE_INFO_ARM,
+      .pNext = nullptr,
+      .flags = 0,
+      .pDescription = description,
+      .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+      .queueFamilyIndexCount = 0,
+      .pQueueFamilyIndices = nullptr,
+  };
+
+  result = vkCreateTensorARM(device, &create_info, nullptr, tensor);
+  if (result != VK_SUCCESS) {
+    ET_LOG(Error, "Failed to CreateTensor, error %d", result);
+    return result;
+  }
+
+  // Get backing memory requirements
+  const VkTensorMemoryRequirementsInfoARM memory_requirements_info = {
+      .sType = VK_STRUCTURE_TYPE_TENSOR_MEMORY_REQUIREMENTS_INFO_ARM,
+      .pNext = nullptr,
+      .tensor = *tensor,
+  };
+  VkMemoryRequirements2 memory_requirements = {
+      .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
+      .pNext = nullptr,
+  };
+  vkGetTensorMemoryRequirementsARM(
+      device, &memory_requirements_info, &memory_requirements);
+
+  VkPhysicalDeviceMemoryProperties memProps;
+  vkGetPhysicalDeviceMemoryProperties(physical, &memProps);
+
+  // Allocate memory
+  uint32_t memory_type = 0;
+  for (size_t j = 0; j < 31; ++j) {
+    if (memory_requirements.memoryRequirements.memoryTypeBits & (0x1 << j)) {
+      memory_type = j;
+      uint32_t aims = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
+          VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+          VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+      if ((memProps.memoryTypes[j].propertyFlags & aims) == aims)
+        break;
+    }
+  }
+  const VkMemoryAllocateInfo allocate_info = {
+      .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+      .pNext = nullptr,
+      .allocationSize = memory_requirements.memoryRequirements.size,
+      .memoryTypeIndex = memory_type};
+
+  vkAllocateMemory(device, &allocate_info, nullptr, memory);
+
+  // Bind tensor to memory
+  const VkBindTensorMemoryInfoARM bind_info = {
+      .sType = VK_STRUCTURE_TYPE_BIND_TENSOR_MEMORY_INFO_ARM,
+      .pNext = nullptr,
+      .tensor = *tensor,
+      .memory = *memory,
+      .memoryOffset = 0,
+  };
+  vkBindTensorMemoryARM(device, 1, &bind_info);
+
+  VkTensorViewCreateInfoARM tensor_view_info = {
+      .sType = VK_STRUCTURE_TYPE_TENSOR_VIEW_CREATE_INFO_ARM,
+      .pNext = nullptr,
+      .flags = 0,
+      .tensor = *tensor,
+      .format = format,
+  };
+  VkResult res_tv =
+      vkCreateTensorViewARM(device, &tensor_view_info, nullptr, tensor_view);
+  ET_LOG(Info, " tensor view (success %d)", res_tv == VK_SUCCESS);
+
+  return res_tv;
+}
+
+static void debug_print_sequence(
+    unique_ptr<vgflib::ModelSequenceTableDecoder>& sequence_decoder) {
+  ET_LOG(Info, "VGF Sequences:");
+  for (int i = 0; i < sequence_decoder->modelSequenceTableSize(); i++) {
+    ET_LOG(
+        Info,
+        "  Sequence(%d) '%s':",
+        i,
+        string(sequence_decoder->getSegmentName(i)).c_str());
+    auto dispatch_shape = sequence_decoder->getSegmentDispatchShape(i);
+    ET_LOG(
+        Info,
+        "    dispatch shape %d %d %d",
+        dispatch_shape[0],
+        dispatch_shape[1],
+        dispatch_shape[2]);
+    ET_LOG(
+        Info,
+        "    is graph? %d",
+        vgflib::ModuleType::GRAPH == sequence_decoder->getSegmentType(i));
+    ET_LOG(
+        Info,
+        "    module index %d",
+        sequence_decoder->getSegmentModuleIndex(i));
+    auto input_names = sequence_decoder->getModelSequenceInputNamesHandle();
+    ET_LOG(
+        Info, "    names (%ld):", sequence_decoder->getNamesSize(input_names));
+    for (int j = 0; j < sequence_decoder->getNamesSize(input_names); j++) {
+      ET_LOG(
+          Info,
+          "      %d: %s",
+          j,
+          string(sequence_decoder->getName(input_names, j)).c_str());
+    }
+  }
+}
+
+static void debug_print_resources(
+    unique_ptr<vgflib::ModelResourceTableDecoder>& resource_decoder) {
+  ET_LOG(Info, "Resources:");
+  for (int i = 0; i < resource_decoder->size(); i++) {
+    ET_LOG(Info, "  MRT entry %d", i);
+    if (!resource_decoder->getDescriptorType(i).has_value()) {
+      ET_LOG(Info, "    DescriptorType NONE");
+    } else {
+      ET_LOG(
+          Info,
+          "    DescriptorType %u, is tensor? %d",
+          resource_decoder->getDescriptorType(i).value(),
+          resource_decoder->getDescriptorType(i).value() ==
+              VK_DESCRIPTOR_TYPE_TENSOR_ARM);
+    }
+    ET_LOG(
+        Info,
+        "    VkFormat %u from vgf format %u",
+        vgflib::ToVkFormat(resource_decoder->getVkFormat(i)),
+        resource_decoder->getVkFormat(i));
+    switch (resource_decoder->getCategory(i)) {
+      case vgflib::ResourceCategory::INPUT:
+      case vgflib::ResourceCategory::OUTPUT: {
+        ET_LOG(Info, "    Category INPUT/OUTPUT");
+        // Get tensor shape and strides
+        auto shape = resource_decoder->getTensorShape(i);
+        const vector<int64_t> the_shape(shape.begin(), shape.end());
+        auto stride = resource_decoder->getTensorStride(i);
+        const vector<int64_t> the_stride(stride.begin(), stride.end());
+        ET_LOG(
+            Info,
+            "    rank %ld, stride rank %ld",
+            the_shape.size(),
+            the_stride.size());
+        for (int j = 0; j < the_shape.size(); j++) {
+          ET_LOG(Info, "      %d: dim %ld", j, the_shape[j]);
+        }
+        // Allocate a tensor with bound memory
+        break;
+      }
+      case vgflib::ResourceCategory::INTERMEDIATE:
+        ET_LOG(Info, "    Category INTERMEDIATE");
+        break;
+      case vgflib::ResourceCategory::CONSTANT:
+        ET_LOG(Info, "    Category CONSTANT");
+        break;
+      default:
+        ET_LOG(Info, "    Category UNKNOWN");
+        break;
+    }
+  }
+}
+
+static void debug_print_modules(
+    unique_ptr<vgflib::ModuleTableDecoder>& module_decoder) {
+  ET_LOG(Info, "VGF Modules:");
+  for (int i = 0; i < module_decoder->size(); i++) {
+    auto name = string(module_decoder->getModuleName(i));
+    auto entrypoint = string(module_decoder->getModuleEntryPoint(i));
+    auto type = module_decoder->getModuleType(i);
+    auto spirv = module_decoder->getModuleCode(i);
+    ET_LOG(Info, "  Module(%d) '%s':", i, name.c_str());
+    ET_LOG(
+        Info,
+        "    is graph? %d",
+        vgflib::ModuleType::GRAPH == module_decoder->getModuleType(i));
+    ET_LOG(Info, "    entrypoint '%s'", entrypoint.c_str());
+    ET_LOG(Info, "    has spirv %d", module_decoder->hasSPIRV(i));
+    ET_LOG(
+        Info, "    code size %lu", spirv.size()); // read the .begin() to .end()
+  }
+}
+
+bool VgfRepr::process_vgf(const char* vgf_data, ArrayRef<CompileSpec> specs) {
+  ET_LOG(Info, "Preparing VGF as Vulkan objects");
+
+  VkResult result;
+
+  // Prepare temporary decoders
+  unique_ptr<vgflib::HeaderDecoder> header_decoder =
+      vgflib::CreateHeaderDecoder(vgf_data);
+  unique_ptr<vgflib::ModelSequenceTableDecoder> sequence_decoder =
+      vgflib::CreateModelSequenceTableDecoder(
+          vgf_data + header_decoder->GetModelSequenceTableOffset());
+  unique_ptr<vgflib::ModuleTableDecoder> module_decoder =
+      vgflib::CreateModuleTableDecoder(
+          vgf_data + header_decoder->GetModuleTableOffset());
+  unique_ptr<vgflib::ModelResourceTableDecoder> resource_decoder =
+      vgflib::CreateModelResourceTableDecoder(
+          vgf_data + header_decoder->GetModelResourceTableOffset());
+  unique_ptr<vgflib::ConstantDecoder> constant_decoder =
+      vgflib::CreateConstantDecoder(
+          vgf_data + header_decoder->GetConstantsOffset());
+  // Check the VGF decoders
+  if (not(header_decoder && module_decoder && sequence_decoder &&
+          resource_decoder && constant_decoder && header_decoder->IsValid() &&
+          header_decoder->CheckVersion())) {
+    ET_LOG(Error, "Failed to process VGF file internals");
+    return false;
+  }
+
+  // Parse the sequences in the VGF (while there can be multiple sequences of
+  // COMPUTE and GRAPH segments in the sequence, we currently expect a single
+  // GRAPH segment to be present)
+  debug_print_sequence(sequence_decoder);
+  if (sequence_decoder->modelSequenceTableSize() != 1) {
+    ET_LOG(Error, "Expected sequence length 1");
+    return false;
+  }
+  if (sequence_decoder->getSegmentType(0) != vgflib::ModuleType::GRAPH) {
+    ET_LOG(Error, "Expected segment to be of type GRAPH");
+    return false;
+  }
+
+  // Extract first segment and its associated module
+  debug_print_modules(module_decoder);
+  auto segment_name = string(sequence_decoder->getSegmentName(0));
+  auto segment_module = sequence_decoder->getSegmentModuleIndex(0);
+
+  auto segment_m_name = string(module_decoder->getModuleName(segment_module));
+  auto segment_m_entrypoint =
+      string(module_decoder->getModuleEntryPoint(segment_module));
+  auto segment_m_spirv = module_decoder->getModuleCode(segment_module);
+
+  // Build a shader from the module
+  VkShaderModuleCreateInfo smci{
+      .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
+      .pNext = nullptr,
+      .flags = 0,
+      .codeSize = segment_m_spirv.size() * sizeof(uint32_t),
+      .pCode = segment_m_spirv.begin(),
+  };
+  result = vkCreateShaderModule(vk_device, &smci, nullptr, &vk_shader);
+  if (result != VK_SUCCESS) {
+    ET_LOG(Error, "Failed to load shader from segment %d", segment_module);
+    return false;
+  }
+
+  // Record our shader and entrypoint string
+  vector<tuple<VkShaderModule, string>> shader_modules;
+  shader_modules.push_back({vk_shader, segment_m_entrypoint});
+
+  // Load our resources (tensors, constants) into their appropriate Vk objects
+  vector<VkTensorDescriptionARM> descriptors;
+  vector<tuple<VkTensorARM, VkTensorViewARM>> resources;
+  vector<VkDataGraphPipelineConstantARM> constants;
+
+  int IO_count = resource_decoder->size();
+  for (int i = 0; i < IO_count; i++) {
+    auto resource_type = resource_decoder->getDescriptorType(i).value_or(0);
+    auto resource_format = vgflib::ToVkFormat(resource_decoder->getVkFormat(i));
+
+    // Get tensor shape and strides
+    auto shape = resource_decoder->getTensorShape(i);
+    auto stride = resource_decoder->getTensorStride(i);
+
+    switch (resource_decoder->getCategory(i)) {
+      case vgflib::ResourceCategory::INPUT:
+      case vgflib::ResourceCategory::OUTPUT: {
+        // Expect IO to be a tensor type
+        if (resource_type != VK_DESCRIPTOR_TYPE_TENSOR_ARM) {
+          ET_LOG(
+              Error,
+              "Expected tensor type descriptor %u got %u",
+              VK_DESCRIPTOR_TYPE_TENSOR_ARM,
+              resource_type);
+          return false;
+        }
+
+        // Allocate a tensor with backing memory
+        VkTensorARM tensor;
+        VkTensorViewARM tensor_view;
+        VkDeviceMemory tensor_memory;
+        VkTensorDescriptionARM tensor_description;
+        result = allocate_tensor(
+            vk_physical,
+            vk_device,
+            vgflib::ToVkFormat(resource_decoder->getVkFormat(i)),
+            static_cast<uint32_t>(shape.size()),
+            shape.begin(),
+            static_cast<uint32_t>(stride.size()),
+            stride.begin(),
+            &tensor_description,
+            &tensor_view,
+            &tensor,
+            &tensor_memory);
+        if (result != VK_SUCCESS) {
+          ET_LOG(Error, "Failed to allocate tensor for VGF resource %d", i);
+          return false;
+        }
+        size_t e_size = get_format_size(
+            vgflib::ToVkFormat(resource_decoder->getVkFormat(i)));
+        if (0 == e_size) {
+          ET_LOG(Error, "failed to get element size of VkFormat");
+          return false;
+        }
+
+        bool is_in =
+            resource_decoder->getCategory(i) == vgflib::ResourceCategory::INPUT;
+        IOs.push_back(
+            IO{vector<int64_t>(shape.begin(), shape.end()),
+               vector<int64_t>(stride.begin(), stride.end()),
+               e_size,
+               tensor,
+               tensor_view,
+               tensor_memory,
+               is_in});
+        resources.push_back({tensor, tensor_view});
+        descriptors.push_back(tensor_description);
+        break;
+      }
+      case vgflib::ResourceCategory::CONSTANT:
+        // Constants just need a descriptor
+        descriptors.push_back(VkTensorDescriptionARM{
+            .sType = VK_STRUCTURE_TYPE_TENSOR_DESCRIPTION_ARM,
+            .pNext = nullptr,
+            .tiling = VK_TENSOR_TILING_LINEAR_ARM,
+            .format = vgflib::ToVkFormat(resource_decoder->getVkFormat(i)),
+            .dimensionCount = static_cast<uint32_t>(shape.size()),
+            .pDimensions = shape.begin(),
+            // Note: stride_data of 0's causes size==0, null means stride==size
+            .pStrides = (0 == stride.size() ? nullptr : stride.begin()),
+            .usage = VK_TENSOR_USAGE_DATA_GRAPH_BIT_ARM,
+        });
+        break;
+      case vgflib::ResourceCategory::INTERMEDIATE:
+        ET_LOG(Error, "Unsupported resource category INTERMEDIATE");
+        return false;
+      default:
+        ET_LOG(Error, "Unsupported resource category UNKNOWN");
+        return false;
+    }
+  }
+
+  // Constants table - mapping of shader bindings to MRTs and their descriptors
+  for (int i = 0; i < constant_decoder->size(); i++) {
+    auto mrt_i = constant_decoder->getConstantMrtIndex(i);
+    auto constant_data = constant_decoder->getConstant(i);
+    constants.push_back(VkDataGraphPipelineConstantARM{
+        .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_CONSTANT_ARM,
+        .pNext = &descriptors[mrt_i],
+        .id = mrt_i,
+        .pConstantData = constant_data.begin(),
+    });
+  }
+
+  // Prepare our layout bindings from the segment's information
+  vector<VkDescriptorSetLayoutBinding> layout_bindings;
+  vector<VkDataGraphPipelineResourceInfoARM> data_graph_resources;
+
+  auto set_count = sequence_decoder->getSegmentDescriptorSetInfosSize(0);
+  for (uint32_t d_idx = 0; d_idx < set_count; d_idx++) {
+    auto handle = sequence_decoder->getDescriptorBindingSlotsHandle(0, d_idx);
+    auto binding_count = sequence_decoder->getBindingsSize(handle);
+    for (int binding = 0; binding < binding_count; binding++) {
+      auto binding_index =
+          sequence_decoder->getBindingSlotBinding(handle, binding);
+      auto MRT_index =
+          sequence_decoder->getBindingSlotMrtIndex(handle, binding);
+      auto MRT_type = resource_decoder->getDescriptorType(MRT_index).value();
+
+      const VkDescriptorSetLayoutBinding layout_binding{
+          .binding = binding_index,
+          .descriptorType = vgflib::ToVkDescriptorType(MRT_type),
+          .descriptorCount = 1,
+          .stageFlags = VK_SHADER_STAGE_ALL,
+          .pImmutableSamplers = nullptr,
+      };
+      layout_bindings.push_back(layout_binding);
+
+      const VkDataGraphPipelineResourceInfoARM resource{
+          .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_RESOURCE_INFO_ARM,
+          // Note: we populate the resource_descriptors 1:1 with the MRT table,
+          // so can directly use that index into the resource_descriptors
+          .pNext = &descriptors[MRT_index],
+          .descriptorSet = d_idx,
+          .binding = binding_index,
+          .arrayElement = 0,
+      };
+      data_graph_resources.push_back(resource);
+    }
+  }
+
+  // create fixed layout for this module
+  const VkDescriptorSetLayoutCreateInfo layout_info = {
+      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+      .pNext = nullptr,
+      .flags = 0,
+      .bindingCount = static_cast<uint32_t>(layout_bindings.size()),
+      .pBindings = layout_bindings.data(),
+  };
+  result =
+      vkCreateDescriptorSetLayout(vk_device, &layout_info, nullptr, &vk_layout);
+  if (result != VK_SUCCESS) {
+    ET_LOG(Error, "Failed to create descriptor layout");
+    return false;
+  }
+
+  // Create descriptor pool and descriptors for pipeline
+  const VkDescriptorPoolCreateInfo descriptor_pool_info = {
+      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
+      .pNext = nullptr,
+      .flags = 0,
+      .maxSets = static_cast<uint32_t>(set_count),
+      .poolSizeCount = 0,
+      .pPoolSizes = nullptr,
+  };
+  result = vkCreateDescriptorPool(
+      vk_device, &descriptor_pool_info, nullptr, &vk_descriptor_pool);
+  if (result != VK_SUCCESS) {
+    ET_LOG(Error, "Failed to create descriptor pool");
+    return false;
+  }
+
+  const VkDescriptorSetAllocateInfo descriptor_set_info = {
+      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
+      .pNext = nullptr,
+      .descriptorPool = vk_descriptor_pool,
+      .descriptorSetCount = static_cast<uint32_t>(set_count),
+      .pSetLayouts = &vk_layout,
+  };
+
+  // Alloc descriptor sets
+  // currently, as we require modelSequenceTableSize to == 1
+  // we can only get one descriptor set.
+  vector<VkDescriptorSet> descriptor_sets;
+  descriptor_sets.resize(1);
+  result = vkAllocateDescriptorSets(
+      vk_device, &descriptor_set_info, descriptor_sets.data());
+  if (result != VK_SUCCESS) {
+    ET_LOG(Error, "Failed to allocate descriptor sets");
+    return false;
+  }
+
+  // write descriptor updates for every input
+  auto input_slots = sequence_decoder->getSegmentInputBindingSlotsHandle(0);
+  auto input_size = sequence_decoder->getBindingsSize(input_slots);
+  for (uint32_t i = 0; i < input_size; i++) {
+    auto binding = sequence_decoder->getBindingSlotBinding(input_slots, i);
+    auto mrt_i = sequence_decoder->getBindingSlotMrtIndex(input_slots, i);
+
+    VkWriteDescriptorSetTensorARM write_desc = {
+        .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_TENSOR_ARM,
+        .pNext = nullptr,
+        .tensorViewCount = 1,
+        .pTensorViews = &get<1>(resources[i]),
+    };
+    VkWriteDescriptorSet desc_set = {
+        .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+        .pNext = &write_desc,
+        .dstSet = descriptor_sets[0],
+        .dstBinding = binding,
+        .dstArrayElement = 0,
+        .descriptorCount = 1,
+        .descriptorType = VK_DESCRIPTOR_TYPE_TENSOR_ARM,
+        .pImageInfo = nullptr,
+        .pBufferInfo = nullptr,
+        .pTexelBufferView = nullptr,
+    };
+    vkUpdateDescriptorSets(vk_device, 1, &desc_set, 0, nullptr);
+  }
+
+  // write descriptor updates for every output
+  auto output_slots = sequence_decoder->getSegmentOutputBindingSlotsHandle(0);
+  auto output_size = sequence_decoder->getBindingsSize(output_slots);
+  for (uint32_t i = 0; i < output_size; i++) {
+    auto binding = sequence_decoder->getBindingSlotBinding(output_slots, i);
+    auto mrt_i = sequence_decoder->getBindingSlotMrtIndex(output_slots, i);
+
+    VkWriteDescriptorSetTensorARM write_desc = {
+        .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET_TENSOR_ARM,
+        .pNext = nullptr,
+        .tensorViewCount = 1,
+        .pTensorViews = &get<1>(resources[i + input_size]),
+    };
+    VkWriteDescriptorSet desc_set = {
+        .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+        .pNext = &write_desc,
+        .dstSet = descriptor_sets[0],
+        .dstBinding = binding,
+        .dstArrayElement = 0,
+        .descriptorCount = 1,
+        .descriptorType = VK_DESCRIPTOR_TYPE_TENSOR_ARM,
+        .pImageInfo = nullptr,
+        .pBufferInfo = nullptr,
+        .pTexelBufferView = nullptr,
+    };
+    vkUpdateDescriptorSets(vk_device, 1, &desc_set, 0, nullptr);
+  }
+
+  // create our pipeline
+  VkPipelineLayoutCreateInfo pipeline_layout_info = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+      .pNext = nullptr,
+      .flags = 0,
+      .setLayoutCount = 1,
+      .pSetLayouts = &vk_layout,
+      .pushConstantRangeCount = 0,
+      .pPushConstantRanges = nullptr,
+  };
+  result = vkCreatePipelineLayout(
+      vk_device, &pipeline_layout_info, nullptr, &vk_pipeline_layout);
+  if (result != VK_SUCCESS) {
+    ET_LOG(Error, "Failed to create pipeline layout");
+    return false;
+  }
+
+  // Shader Module Create
+  VkDataGraphPipelineShaderModuleCreateInfoARM shader_info{
+      .sType =
+          VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SHADER_MODULE_CREATE_INFO_ARM,
+      .pNext = nullptr,
+      .module = get<0>(shader_modules[0]),
+      .pName = get<1>(shader_modules[0]).c_str(),
+      .pSpecializationInfo = nullptr,
+      .constantCount = static_cast<uint32_t>(constants.size()),
+      .pConstants = constants.data(),
+  };
+
+  // Prepare Graph Pipeline
+  VkDataGraphPipelineCreateInfoARM graph_pipeline_info{
+      .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_CREATE_INFO_ARM,
+      .pNext = &shader_info,
+      .flags = VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT |
+          VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR,
+      .layout = vk_pipeline_layout,
+      .resourceInfoCount = static_cast<uint32_t>(data_graph_resources.size()),
+      .pResourceInfos = data_graph_resources.data(),
+  };
+
+  result = vkCreateDataGraphPipelinesARM(
+      vk_device, // device
+      VK_NULL_HANDLE, // deferredOperation
+      VK_NULL_HANDLE, // VkPipelineCache
+      1, // createInfoCount
+      &graph_pipeline_info, // pCreateInfos
+      nullptr, // pAllocator
+      &vk_pipeline // pPipelines (VkPipeline*)
+  );
+  if (result != VK_SUCCESS) {
+    ET_LOG(Error, "Failed to create DataGraphPipeline");
+    return false;
+  }
+
+  // prepare the graph pipeline session
+  VkDataGraphPipelineSessionCreateInfoARM pipeline_session_info{
+      .sType = VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_CREATE_INFO_ARM,
+      .pNext = nullptr,
+      .flags = 0,
+      .dataGraphPipeline = vk_pipeline,
+  };
+  result = vkCreateDataGraphPipelineSessionARM(
+      vk_device, &pipeline_session_info, nullptr, &vk_session);
+  if (result != VK_SUCCESS) {
+    ET_LOG(Error, "Failed to create DataGraphPipelineSession");
+    return false;
+  }
+
+  // Allocate command buffer
+  VkCommandBufferAllocateInfo allocate_info{
+      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+      .pNext = nullptr,
+      .commandPool = vk_command_pool,
+      .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+      .commandBufferCount = 1};
+  result = vkAllocateCommandBuffers(vk_device, &allocate_info, &vk_execute_cmd);
+  if (result != VK_SUCCESS) {
+    ET_LOG(Error, "Failed to allocate command buffers");
+    return false;
+  }
+
+  // Populate command once with our dispatch information
+  VkCommandBufferBeginInfo beginInfo{
+      VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO};
+  vkBeginCommandBuffer(vk_execute_cmd, &beginInfo);
+
+  // bind pipeline + descriptor set
+  vkCmdBindPipeline(
+      vk_execute_cmd, VK_PIPELINE_BIND_POINT_DATA_GRAPH_ARM, vk_pipeline);
+
+  vkCmdBindDescriptorSets(
+      vk_execute_cmd,
+      VK_PIPELINE_BIND_POINT_DATA_GRAPH_ARM,
+      vk_pipeline_layout,
+      0, // first set
+      1,
+      descriptor_sets.data(), // descriptor set count + pointer
+      0,
+      nullptr // no dynamic offsets
+  );
+
+  // Dispatch the graph command
+  vkCmdDispatchDataGraphARM(vk_execute_cmd, vk_session, nullptr);
+
+  // end the command buffer
+  vkEndCommandBuffer(vk_execute_cmd);
+
+  return true;
+}
+
+bool VgfRepr::execute_vgf() {
+  ET_LOG(Info, "Executing vgf");
+
+  // Submit & wait for idle
+  VkSubmitInfo submit{VK_STRUCTURE_TYPE_SUBMIT_INFO};
+  submit.commandBufferCount = 1;
+  submit.pCommandBuffers = &vk_execute_cmd;
+  VkResult result = vkQueueSubmit(vk_queue, 1, &submit, VK_NULL_HANDLE);
+  if (result != VK_SUCCESS) {
+    ET_LOG(Error, "VGF/VkCommandBuffer command submission failed");
+    return false;
+  }
+  vkQueueWaitIdle(vk_queue);
+
+  return true;
+}
+
+void VgfRepr::free_vgf() {
+  vkFreeCommandBuffers(vk_device, vk_command_pool, 1, &vk_execute_cmd);
+  vkDestroyDataGraphPipelineSessionARM(vk_device, vk_session, nullptr);
+  vkDestroyPipeline(vk_device, vk_pipeline, nullptr);
+  vkDestroyPipelineLayout(vk_device, vk_pipeline_layout, nullptr);
+  vkDestroyDescriptorPool(vk_device, vk_descriptor_pool, nullptr);
+  vkDestroyDescriptorSetLayout(vk_device, vk_layout, nullptr);
+  vkDestroyShaderModule(vk_device, vk_shader, nullptr);
+  for (int i = 0; i < IOs.size(); i++) {
+    free_tensor(
+        vk_device, IOs[i].tensor_view, IOs[i].tensor, IOs[i].tensor_memory);
+  }
+}
+
+static uint32_t get_format_size(VkFormat format) {
+  // Note: While this is a small subset of VkFormat, this supports all base
+  // types for tensors coming from the compiler flow. Tensor formats only
+  // specify a single element type.
+  switch (format) {
+    case VK_FORMAT_R8_BOOL_ARM:
+    case VK_FORMAT_R8_UINT:
+    case VK_FORMAT_R8_SINT:
+      return 1;
+    case VK_FORMAT_R16_UINT:
+    case VK_FORMAT_R16_SINT:
+    case VK_FORMAT_R16_SFLOAT:
+      return 2;
+    case VK_FORMAT_R32_UINT:
+    case VK_FORMAT_R32_SINT:
+    case VK_FORMAT_R32_SFLOAT:
+      return 4;
+    case VK_FORMAT_R64_SINT:
+      return 8;
+    default:
+      ET_LOG(Error, "Unknown tensor format");
+      return 0;
+  }
+}
+
+} // namespace vgf
+} // namespace backends
+} // namespace executorch
diff --git a/backends/arm/runtime/VGFSetup.h b/backends/arm/runtime/VGFSetup.h
new file mode 100644
index 00000000000..29fc287865e
--- /dev/null
+++ b/backends/arm/runtime/VGFSetup.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2025 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <string>
+#include <tuple>
+#include <vector>
+using namespace std;
+
+#include <executorch/runtime/backend/interface.h>
+
+using executorch::runtime::ArrayRef;
+using executorch::runtime::CompileSpec;
+
+// We use the platform and runtime environment provided by the Vulkan delegate
+#include <executorch/backends/vulkan/runtime/vk_api/vk_api.h>
+
+namespace executorch {
+namespace backends {
+namespace vgf {
+
+class VgfRepr;
+
+/*
+ * Info about IOs used during execution
+ */
+typedef struct IO {
+  vector<int64_t> size;
+  vector<int64_t> stride;
+  size_t elt_size;
+  VkTensorARM tensor;
+  VkTensorViewARM tensor_view;
+  VkDeviceMemory tensor_memory;
+  bool is_input;
+} IO;
+
+/*
+ * In memory, and in-vulkan-object representation of the loaded
+ * VGF graph - ready to be dispatched based on provided inputs.
+ */
+class VgfRepr {
+ public:
+  VgfRepr(
+      VkInstance inst,
+      VkPhysicalDevice phys,
+      VkDevice dev,
+      VkQueue queue,
+      VkCommandPool pool)
+      : vk_instance(inst),
+        vk_physical(phys),
+        vk_device(dev),
+        vk_queue(queue),
+        vk_command_pool(pool) {}
+
+  /*
+   * Process a VGF ready for execution, allocate necessary Vulkan objects.
+   */
+  bool process_vgf(const char* vgf_data, ArrayRef<CompileSpec> specs);
+
+  /*
+   * Execute the VGF we've previously processed.
+   */
+  bool execute_vgf();
+
+  /*
+   * Free any allocations made in process_vgf.
+   */
+  void free_vgf();
+
+  /*
+   * inputs and outputs from the VGF - these are memory mapped and populated
+   * with the EValues coming from the backend execute call
+   */
+  vector<IO> IOs;
+
+  bool map_io(IO* io, void** handle) {
+    VkResult result =
+        vkMapMemory(vk_device, io->tensor_memory, 0, VK_WHOLE_SIZE, 0, handle);
+    if (result != VK_SUCCESS) {
+      ET_LOG(Error, "Failed to map Vulkan IO memory");
+      return false;
+    }
+    return true;
+  }
+
+  void unmap_io(IO* io) {
+    vkUnmapMemory(vk_device, io->tensor_memory);
+  }
+
+  ~VgfRepr() {
+    free_vgf();
+  }
+
+ private:
+  // Basic Vulkan objects passed to us and re-used
+  VkInstance vk_instance;
+  VkPhysicalDevice vk_physical;
+  VkDevice vk_device;
+  VkQueue vk_queue;
+  VkCommandPool vk_command_pool;
+
+  // per-VgfRepr-instance objects allocated in process_vgf, used (can be more
+  // than once) in execute_vgf
+  VkCommandBuffer vk_execute_cmd = VK_NULL_HANDLE;
+  VkDataGraphPipelineSessionARM vk_session = VK_NULL_HANDLE;
+  VkPipeline vk_pipeline = VK_NULL_HANDLE;
+  VkPipelineLayout vk_pipeline_layout = VK_NULL_HANDLE;
+  VkDescriptorPool vk_descriptor_pool = VK_NULL_HANDLE;
+  VkDescriptorSetLayout vk_layout = VK_NULL_HANDLE;
+  VkShaderModule vk_shader = VK_NULL_HANDLE;
+  // Note: the vector of tensor memory is stored in IOs above
+};
+
+} // namespace vgf
+} // namespace backends
+} // namespace executorch
diff --git a/backends/vulkan/CMakeLists.txt b/backends/vulkan/CMakeLists.txt
index cb1b8a06afd..0b805aef5f4 100644
--- a/backends/vulkan/CMakeLists.txt
+++ b/backends/vulkan/CMakeLists.txt
@@ -1,5 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -127,11 +128,18 @@ if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$")
   set(VULKAN_RUNNER_SRCS ${_executor_runner__srcs})
   list(TRANSFORM VULKAN_RUNNER_SRCS PREPEND "${EXECUTORCH_ROOT}/")
 
+  set(VGF_BACKEND )
+  if(EXECUTORCH_BUILD_VGF)
+    set(VGF_BACKEND vgf_backend)
+  endif()
+
   add_executable(vulkan_executor_runner ${VULKAN_RUNNER_SRCS})
   target_link_libraries(
     vulkan_executor_runner ${_executor_runner_libs} vulkan_schema
     vulkan_backend
+    ${VGF_BACKEND}
   )
+  target_compile_options(vulkan_executor_runner PUBLIC ${VULKAN_CXX_FLAGS})
 endif()
 
diff --git a/backends/vulkan/runtime/graph/containers/Types.h b/backends/vulkan/runtime/graph/containers/Types.h
index 5840d1695ee..48232179e06 100644
--- a/backends/vulkan/runtime/graph/containers/Types.h
+++ b/backends/vulkan/runtime/graph/containers/Types.h
@@ -1,6 +1,7 @@
 /*
  * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
+ * Copyright 2025 Arm Limited and/or its affiliates.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
@@ -8,6 +9,7 @@
 
 #pragma once
 
+#include 
 #include 
 
 namespace vkcompute {
diff --git a/backends/vulkan/third-party/Vulkan-Headers b/backends/vulkan/third-party/Vulkan-Headers
index 0c5928795a6..10739e8e00a 160000
--- a/backends/vulkan/third-party/Vulkan-Headers
+++ b/backends/vulkan/third-party/Vulkan-Headers
@@ -1 +1 @@
-Subproject commit 0c5928795a66e93f65e5e68a36d8daa79a209dc2
+Subproject commit 10739e8e00a7b6f74d22dd0a547f1406ff1f5eb9
diff --git a/backends/vulkan/third-party/volk b/backends/vulkan/third-party/volk
index b3bc21e584f..49ba6858c13 160000
--- a/backends/vulkan/third-party/volk
+++ b/backends/vulkan/third-party/volk
@@ -1 +1 @@
-Subproject commit b3bc21e584f97400b6884cb2a541a56c6a5ddba3
+Subproject commit 49ba6858c13516019d699d94c31d5814025dd005
diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake
index 551c69bc93e..06558b85460 100644
--- a/tools/cmake/preset/default.cmake
+++ b/tools/cmake/preset/default.cmake
@@ -145,6 +145,9 @@ define_overridable_option(
 define_overridable_option(
   EXECUTORCH_BUILD_CORTEX_M "Build the Cortex-M backend" BOOL OFF
 )
+define_overridable_option(
+  EXECUTORCH_BUILD_VGF "Build the Arm VGF backend" BOOL OFF
+)
 define_overridable_option(
   EXECUTORCH_COREML_BUILD_EXECUTOR_RUNNER "Build CoreML executor runner." BOOL OFF

From 043be0ba202c1404103a198b381f469a9da7dec4 Mon Sep 17 00:00:00 2001
From: Rob Elliott
Date: Mon, 28 Jul 2025 10:13:55 +0000
Subject: [PATCH 2/2] minor fix for cmake changes

Signed-off-by: Rob Elliott
---
 backends/arm/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt
index bf313232b66..11f61c0dfee 100644
--- a/backends/arm/CMakeLists.txt
+++ b/backends/arm/CMakeLists.txt
@@ -80,7 +80,7 @@ target_compile_options(vgf_backend PRIVATE -DUSE_VULKAN_WRAPPER -DUSE_VULKAN_VOL
 
 target_link_libraries(vgf_backend PRIVATE executorch_core)
 target_link_libraries(vgf_backend PRIVATE vgf)
-target_link_options_shared_lib(vgf_backend)
+executorch_target_link_options_shared_lib(vgf_backend)
 
 # end config for VGF builds
 endif()
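
--
Build note (a sketch, not part of the patches above): the series does not
include a configure example, so the following is an assumption based only on
the options and targets the patches define. Assuming the
ml-sdk-for-vulkan-manifest checkout sits under examples/arm/ethos-u-scratch/
with libvgf.a prebuilt at the LIBVGF_PATH wired into backends/arm/CMakeLists.txt,
enabling the VGF backend alongside the Vulkan backend should look roughly like:

  # cmake-out is only the conventional ExecuTorch build directory name
  cmake -S . -B cmake-out \
    -DEXECUTORCH_BUILD_VULKAN=ON \
    -DEXECUTORCH_BUILD_VGF=ON
  cmake --build cmake-out --target vulkan_executor_runner

EXECUTORCH_BUILD_VGF comes from tools/cmake/preset/default.cmake above, and
vulkan_executor_runner picks up vgf_backend via the VGF_BACKEND variable in
backends/vulkan/CMakeLists.txt.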