diff --git a/backends/vulkan/CMakeLists.txt b/backends/vulkan/CMakeLists.txt deleted file mode 100644 index 29ff90e7293..00000000000 --- a/backends/vulkan/CMakeLists.txt +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# Copyright 2025 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# ### Editing this file ### -# -# This file should be formatted with -# ~~~ -# cmake-format -i CMakeLists.txt -# ~~~ -# It should also be cmake-lint clean. -# -# The targets in this file will be built if EXECUTORCH_BUILD_VULKAN is ON - -cmake_minimum_required(VERSION 3.19) - -if(NOT EXECUTORCH_ROOT) - set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) -endif() - -if(NOT RUNTIME_PATH) - set(RUNTIME_PATH ${CMAKE_CURRENT_SOURCE_DIR}/runtime) -endif() - -# Include this file to access executorch_target_link_options_shared_lib This is -# required to provide access to executorch_target_link_options_shared_lib which -# allows libraries to be linked with the --whole-archive flag. This is required -# for libraries that perform dynamic registration via static initialization. -include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) - -include(cmake/ShaderLibrary.cmake) - -# Third party include paths - -set(VULKAN_THIRD_PARTY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/third-party) - -set(VULKAN_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/Vulkan-Headers) -set(VOLK_PATH ${VULKAN_THIRD_PARTY_PATH}/volk) -set(VMA_PATH ${VULKAN_THIRD_PARTY_PATH}/VulkanMemoryAllocator) - -set(COMMON_INCLUDES - $ - $ - $ $ -) - -# Compile settings - -set(VULKAN_CXX_FLAGS "-fexceptions") -list(APPEND VULKAN_CXX_FLAGS "-DUSE_VULKAN_WRAPPER") -list(APPEND VULKAN_CXX_FLAGS "-DUSE_VULKAN_VOLK") - -# vulkan API files - -file(GLOB_RECURSE vulkan_api_cpp ${RUNTIME_PATH}/api/*) -file(GLOB_RECURSE vulkan_vkapi_cpp ${RUNTIME_PATH}/vk_api/*) -list(APPEND vulkan_api_cpp ${vulkan_vkapi_cpp}) -list(APPEND vulkan_api_cpp ${VOLK_PATH}/volk.c) - -# vulkan ComputeGraph files - -file(GLOB_RECURSE vulkan_graph_cpp ${RUNTIME_PATH}/graph/*) -list(APPEND vulkan_graph_cpp ${vulkan_api_cpp}) - -# Standard GLSL shader library - -set(VULKAN_GRAPH_SHADERS_PATH ${RUNTIME_PATH}/graph/ops/glsl/) -# Generates a spv.cpp file containing compiled GLSL shaders -gen_vulkan_shader_lib_cpp(${VULKAN_GRAPH_SHADERS_PATH}) -# Save the path of the generated cpp file -set(vulkan_standard_shaders_cpp ${generated_spv_cpp}) - -# Generate Vulkan Delegate Schema Files from flatc - -set(SCHEMA_INCLUDE_DIR ${CMAKE_BINARY_DIR}/schema/include) - -set(GENERATED_HEADER - ${SCHEMA_INCLUDE_DIR}/executorch/backends/vulkan/serialization/schema_generated.h -) - -add_custom_command( - OUTPUT ${GENERATED_HEADER} - COMMAND - flatc --cpp --cpp-std c++11 --scoped-enums -o - "${SCHEMA_INCLUDE_DIR}/executorch/backends/vulkan/serialization/" - ${_vulkan_schema__srcs} - WORKING_DIRECTORY ${EXECUTORCH_ROOT} - DEPENDS flatc - COMMENT "Generating vulkan_schema headers" - VERBATIM -) - -# vulkan_schema library - -add_library(vulkan_schema INTERFACE ${GENERATED_HEADER}) -set_target_properties(vulkan_schema PROPERTIES LINKER_LANGUAGE CXX) - -target_include_directories( - vulkan_schema - INTERFACE - $ - $ -) - -# vulkan_backend - -file(GLOB vulkan_backend_cpp ${RUNTIME_PATH}/*.cpp) -list(APPEND vulkan_backend_cpp ${vulkan_graph_cpp}) -list(APPEND vulkan_backend_cpp ${vulkan_standard_shaders_cpp}) - -add_library(vulkan_backend ${vulkan_backend_cpp}) 
-target_include_directories( - vulkan_backend PRIVATE ${SCHEMA_INCLUDE_DIR} ${COMMON_INCLUDES} -) -target_link_libraries(vulkan_backend PRIVATE vulkan_schema executorch_core) -target_compile_options(vulkan_backend PRIVATE ${VULKAN_CXX_FLAGS}) -# Link this library with --whole-archive due to dynamic backend registration -executorch_target_link_options_shared_lib(vulkan_backend) - -set_property(TARGET vulkan_backend PROPERTY CXX_STANDARD 17) - -# Test targets - -install( - TARGETS vulkan_backend vulkan_schema - EXPORT ExecuTorchTargets - DESTINATION lib - INCLUDES - DESTINATION ${COMMON_INCLUDES} -) diff --git a/backends/vulkan/CMakeLists.txt b/backends/vulkan/CMakeLists.txt new file mode 120000 index 00000000000..c59c41b3538 --- /dev/null +++ b/backends/vulkan/CMakeLists.txt @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/CMakeLists.txt \ No newline at end of file diff --git a/backends/vulkan/README.md b/backends/vulkan/README.md deleted file mode 100644 index e0a953d05fe..00000000000 --- a/backends/vulkan/README.md +++ /dev/null @@ -1,205 +0,0 @@ -# Vulkan Backend - -The ExecuTorch Vulkan delegate is a native GPU delegate for ExecuTorch that is -built on top of the cross-platform Vulkan GPU API standard. It is primarily -designed to leverage the GPU to accelerate model inference on Android devices, -but can be used on any platform that supports an implementation of Vulkan: -laptops, servers, and edge devices. - -::::{note} -The Vulkan delegate is currently under active development, and its components -are subject to change. -:::: - -## What is Vulkan? - -Vulkan is a low-level GPU API specification developed as a successor to OpenGL. -It is designed to offer developers more explicit control over GPUs compared to -previous specifications in order to reduce overhead and maximize the -capabilities of the modern graphics hardware. - -Vulkan has been widely adopted among GPU vendors, and most modern GPUs (both -desktop and mobile) in the market support Vulkan. Vulkan is also included in -Android from Android 7.0 onwards. - -**Note that Vulkan is a GPU API, not a GPU Math Library**. That is to say it -provides a way to execute compute and graphics operations on a GPU, but does not -come with a built-in library of performant compute kernels. - -## The Vulkan Compute Library - -The ExecuTorch Vulkan Delegate is a wrapper around a standalone runtime known as -the **Vulkan Compute Library**. The aim of the Vulkan Compute Library is to -provide GPU implementations for PyTorch operators via GLSL compute shaders. - -The Vulkan Compute Library is a fork/iteration of the [PyTorch Vulkan Backend](https://pytorch.org/tutorials/prototype/vulkan_workflow.html). -The core components of the PyTorch Vulkan backend were forked into ExecuTorch -and adapted for an AOT graph-mode style of model inference (as opposed to -PyTorch which adopted an eager execution style of model inference). - -The components of the Vulkan Compute Library are contained in the -`executorch/backends/vulkan/runtime/` directory. The core components are listed -and described below: - -``` -runtime/ -├── api/ .................... Wrapper API around Vulkan to manage Vulkan objects -└── graph/ .................. ComputeGraph class which implements graph mode inference - └── ops/ ................ Base directory for operator implementations - ├── glsl/ ........... GLSL compute shaders - │ ├── *.glsl - │ └── conv2d.glsl - └── impl/ ........... 
C++ code to dispatch GPU compute shaders - ├── *.cpp - └── Conv2d.cpp -``` - -## Features - -The Vulkan delegate currently supports the following features: - -* **Memory Planning** - * Intermediate tensors whose lifetimes do not overlap will share memory allocations. This reduces the peak memory usage of model inference. -* **Capability Based Partitioning**: - * A graph can be partially lowered to the Vulkan delegate via a partitioner, which will identify nodes (i.e. operators) that are supported by the Vulkan delegate and lower only supported subgraphs -* **Support for upper-bound dynamic shapes**: - * Tensors can change shape between inferences as long as their current shapes are smaller than the bounds specified during lowering - -In addition to increasing operator coverage, the following features are -currently in development: - -* **Quantization Support** - * We are currently working on support for 8-bit dynamic quantization, with plans to extend to other quantization schemes in the future. -* **Memory Layout Management** - * Memory layout is an important factor in optimizing performance. We plan to introduce graph passes that insert memory layout transitions throughout a graph to optimize memory-layout sensitive operators such as Convolution and Matrix Multiplication. -* **Selective Build** - * We plan to make it possible to control build size by selecting which operators/shaders you want to build with - -## End to End Example - -To further understand the features of the Vulkan Delegate and how to use it, -consider the following end to end example with a simple single operator model. - -### Compile and lower a model to the Vulkan Delegate - -Once ExecuTorch has been set up and installed, the following script can be used -to generate a simple model and lower it to the Vulkan delegate. - -``` -# Note: this script is the same as the script from the "Setting up ExecuTorch" -# page, with one minor addition to lower to the Vulkan backend. -import torch -from torch.export import export -from executorch.exir import to_edge - -from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner - -# Start with a PyTorch model that adds two input tensors (matrices) -class Add(torch.nn.Module): - def __init__(self): - super(Add, self).__init__() - - def forward(self, x: torch.Tensor, y: torch.Tensor): - return x + y - -# 1. torch.export: Defines the program with the ATen operator set. -aten_dialect = export(Add(), (torch.ones(1), torch.ones(1))) - -# 2. to_edge: Make optimizations for Edge devices -edge_program = to_edge(aten_dialect) -# 2.1 Lower to the Vulkan backend -edge_program = edge_program.to_backend(VulkanPartitioner()) - -# 3. to_executorch: Convert the graph to an ExecuTorch program -executorch_program = edge_program.to_executorch() - -# 4. Save the compiled .pte program -with open("vk_add.pte", "wb") as file: - file.write(executorch_program.buffer) -``` - -Like other ExecuTorch delegates, a model can be lowered to the Vulkan Delegate -using the `to_backend()` API. The Vulkan Delegate implements the -`VulkanPartitioner` class which identifies nodes (i.e. operators) in the graph -that are supported by the Vulkan delegate, and separates compatible sections of -the model to be executed on the GPU. - -This means that a model can be lowered to the Vulkan delegate even if it contains -some unsupported operators. -This will just mean that only parts of the graph -will be executed on the GPU.
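
To check which parts of the graph were actually delegated, one option (a minimal sketch that reuses the `edge_program` variable from the script above) is to print the lowered graph and look for `executorch_call_delegate` nodes:

```
# Sketch: inspect the graph after to_backend(VulkanPartitioner()) has run.
# Subgraphs taken by the Vulkan delegate appear as lowered modules invoked via
# executorch_call_delegate nodes; remaining nodes run on portable CPU kernels.
print(edge_program.exported_program().graph_module)
```

In this example the single add operator should end up inside the delegated subgraph, since binary arithmetic ops are supported by the delegate.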
::::{note} -The [supported ops list](https://github.com/pytorch/executorch/blob/main/backends/vulkan/op_registry.py#L194) -in the Vulkan partitioner code can be inspected to examine which ops are currently -implemented in the Vulkan delegate. -:::: - -### Build Vulkan Delegate libraries - -The easiest way to build and test the Vulkan Delegate is to build for Android -and test on a local Android device. Android devices have built-in support for -Vulkan, and the Android NDK ships with a GLSL compiler, which is needed to -compile the Vulkan Compute Library's GLSL compute shaders. - -The Vulkan Delegate libraries can be built by setting `-DEXECUTORCH_BUILD_VULKAN=ON` -when building with CMake. - -First, make sure that you have the Android NDK installed; any NDK version past -NDK r19c should work. Note that the examples in this doc have been validated with -NDK r27b. The Android SDK should also be installed so that you have access to `adb`. - -The instructions on this page assume that the following environment variables -are set. - -```shell -export ANDROID_NDK= -# Select the appropriate Android ABI for your device -export ANDROID_ABI=arm64-v8a -# All subsequent commands should be performed from ExecuTorch repo root -cd -# Make sure adb works -adb --version -``` - -To build and install ExecuTorch libraries (for Android) with the Vulkan -Delegate: - -```shell -# From executorch root directory -(rm -rf cmake-android-out && \ - cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=$ANDROID_ABI \ - -DEXECUTORCH_BUILD_VULKAN=ON \ - -DPYTHON_EXECUTABLE=python \ - -Bcmake-android-out && \ - cmake --build cmake-android-out -j16 --target install) -``` - -### Run the Vulkan model on device - -::::{note} -Since operator support is currently limited, only binary arithmetic operators -will run on the GPU. Expect inference to be slow as the majority of operators -are being executed via Portable operators. -:::: - -Now, the partially delegated model can be executed on your device's -GPU!
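
Before running, it can be useful to confirm that the device actually reports Vulkan support. A quick, optional check (assuming `adb` is configured as above; the exact feature names reported may vary by device):

```shell
# Devices with a Vulkan driver typically report android.hardware.vulkan.* features
adb shell pm list features | grep -i vulkan
```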
- -```shell -# Build a model runner binary linked with the Vulkan delegate libs -cmake --build cmake-android-out --target executor_runner -j32 - -# Push model to device -adb push vk_add.pte /data/local/tmp/vk_add.pte -# Push binary to device -adb push cmake-android-out/executor_runner /data/local/tmp/runner_bin - -# Run the model -adb shell /data/local/tmp/runner_bin --model_path /data/local/tmp/vk_add.pte -``` diff --git a/backends/vulkan/README.md b/backends/vulkan/README.md new file mode 120000 index 00000000000..4017cdc2caa --- /dev/null +++ b/backends/vulkan/README.md @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/README.md \ No newline at end of file diff --git a/backends/vulkan/TARGETS b/backends/vulkan/TARGETS deleted file mode 100644 index 41893d29274..00000000000 --- a/backends/vulkan/TARGETS +++ /dev/null @@ -1,4 +0,0 @@ -load(":targets.bzl", "define_common_targets") -oncall("executorch") - -define_common_targets(is_fbcode = True) diff --git a/backends/vulkan/TARGETS b/backends/vulkan/TARGETS new file mode 120000 index 00000000000..2be80e569d1 --- /dev/null +++ b/backends/vulkan/TARGETS @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/TARGETS \ No newline at end of file diff --git a/backends/vulkan/__init__.py b/backends/vulkan/__init__.py deleted file mode 100644 index 6c25e56115b..00000000000 --- a/backends/vulkan/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from .partitioner.vulkan_partitioner import VulkanPartitioner - -from .vulkan_preprocess import VulkanBackend - -__all__ = [ - "VulkanPartitioner", - "VulkanBackend", -] diff --git a/backends/vulkan/__init__.py b/backends/vulkan/__init__.py new file mode 120000 index 00000000000..bf978851c6e --- /dev/null +++ b/backends/vulkan/__init__.py @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/__init__.py \ No newline at end of file diff --git a/backends/vulkan/_passes b/backends/vulkan/_passes new file mode 120000 index 00000000000..e3c7d6c74fe --- /dev/null +++ b/backends/vulkan/_passes @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/_passes \ No newline at end of file diff --git a/backends/vulkan/_passes/TARGETS b/backends/vulkan/_passes/TARGETS deleted file mode 100644 index 8558a2eea93..00000000000 --- a/backends/vulkan/_passes/TARGETS +++ /dev/null @@ -1,171 +0,0 @@ -load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -oncall("executorch") - -runtime.python_library( - name = "fuse_quantized_ops", - srcs = ["fuse_quantized_ops.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/backends/transforms:utils", - "//executorch/backends/vulkan:custom_ops_lib", - "//executorch/backends/vulkan:utils_lib", - "//executorch/exir:pass_base", - "//executorch/exir:sym_util", - "//executorch/exir/dialects:lib", - ], -) - -runtime.python_library( - name = "insert_prepack_nodes", - srcs = ["insert_prepack_nodes.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/exir:pass_base", - "//executorch/backends/vulkan:utils_lib", - "//executorch/backends/vulkan:op_registry", - ], -) - -runtime.python_library( - name = "int4_weight_only_quantizer", - srcs = 
[ - "int4_weight_only_quantizer.py", - ], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//executorch/backends/vulkan:custom_ops_lib", - "//pytorch/ao:torchao", - ] -) - -runtime.python_library( - name = "squeeze_unsqueeze_inputs", - srcs = [ - "squeeze_unsqueeze_inputs.py", - ], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/backends/vulkan:custom_ops_lib", - "//executorch/exir:pass_base", - "//executorch/exir/dialects:lib", - ] -) - -runtime.python_library( - name = "remove_asserts", - srcs = ["remove_asserts.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/exir:pass_base", - "//executorch/exir/dialects:lib", - ], -) - -runtime.python_library( - name = "remove_local_scalar_dense", - srcs = ["remove_local_scalar_dense_ops.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/exir:pass_base", - "//executorch/exir/dialects:lib", - ], -) - -runtime.python_library( - name = "remove_redundant_ops", - srcs = ["remove_redundant_ops.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/exir:pass_base", - "//executorch/exir/dialects:lib", - ], -) - -runtime.python_library( - name = "tag_memory_meta_pass", - srcs = ["tag_memory_meta_pass.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/exir:pass_base", - "//executorch/exir/dialects:lib", - "//executorch/backends/vulkan:utils_lib", - "//executorch/backends/vulkan/serialization:lib", - ], -) - -runtime.python_library( - name = "fold_qdq", - srcs = ["fold_qdq.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/backends/vulkan:utils_lib", - "//executorch/exir:pass_base", - ], -) - -runtime.python_library( - name = "fuse_patterns", - srcs = ["fuse_patterns.py"], - visibility = [ - "//executorch/backends/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/backends/vulkan/patterns:vulkan_patterns", - "//executorch/exir:lib", - "//executorch/exir:pass_base", - "//executorch/exir/dialects:lib", - ], - typing = True, -) - -runtime.python_library( - name = "vulkan_passes", - srcs = [ - "__init__.py", - ], - visibility = [ - "//executorch/backends/...", - "//executorch/examples/...", - ], - deps = [ - ":fold_qdq", - ":fuse_patterns", - ":fuse_quantized_ops", - ":insert_prepack_nodes", - ":int4_weight_only_quantizer", - ":remove_asserts", - ":remove_local_scalar_dense", - ":remove_redundant_ops", - ":squeeze_unsqueeze_inputs", - ":tag_memory_meta_pass", - ] -) diff --git a/backends/vulkan/_passes/__init__.py b/backends/vulkan/_passes/__init__.py deleted file mode 100644 index 2c4588ac43d..00000000000 --- a/backends/vulkan/_passes/__init__.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-strict - -from executorch.backends.vulkan._passes.fold_qdq import FoldQDQPass -from executorch.backends.vulkan._passes.fuse_patterns import FusePatternsPass -from executorch.backends.vulkan._passes.fuse_quantized_ops import ( - FuseQuantizedOpsTransform, -) -from executorch.backends.vulkan._passes.insert_prepack_nodes import insert_prepack_nodes -from executorch.backends.vulkan._passes.int4_weight_only_quantizer import ( - VkInt4WeightOnlyQuantizer, -) -from executorch.backends.vulkan._passes.remove_asserts import ( - remove_asserts, - RemoveAssertsTransform, -) -from executorch.backends.vulkan._passes.remove_local_scalar_dense_ops import ( - RemoveLocalScalarDenseOpsTransform, -) -from executorch.backends.vulkan._passes.remove_redundant_ops import ( - RemoveRedundantOpsTransform, -) -from executorch.backends.vulkan._passes.squeeze_unsqueeze_inputs import ( - SqueezeUnsqueezeInputs, -) -from executorch.backends.vulkan._passes.tag_memory_meta_pass import TagMemoryMetaPass - -__all__ = [ - "FoldQDQPass", - "FusePatternsPass", - "FuseQuantizedOpsTransform", - "insert_prepack_nodes", - "VkInt4WeightOnlyQuantizer", - "remove_asserts", - "RemoveAssertsTransform", - "RemoveLocalScalarDenseOpsTransform", - "RemoveRedundantOpsTransform", - "SqueezeUnsqueezeInputs", - "TagMemoryMetaPass", -] diff --git a/backends/vulkan/_passes/fold_qdq.py b/backends/vulkan/_passes/fold_qdq.py deleted file mode 100644 index 3beccc2205c..00000000000 --- a/backends/vulkan/_passes/fold_qdq.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -import executorch.backends.vulkan.utils as utils -import torch - -from executorch.exir.pass_base import ExportPass, PassResult -from executorch.exir.passes import dead_code_elimination_pass - - -class FoldQDQPass(ExportPass): - """ - Erase Q/DQ chain introduced by PT2E quantization workflow. It is assumed that all - valid quant op patterns have already been fused before this pass. - """ - - def __init__(self, edge_program: torch.export.ExportedProgram): - super(FoldQDQPass, self).__init__() - self.edge_program = edge_program - - def call(self, graph_module: torch.fx.GraphModule): - for node in graph_module.graph.nodes: - if utils.is_quant_node(node): - original_node = node.args[0] - assert isinstance(original_node, torch.fx.Node) - # For each direct user that is a dequant node, connect the original - # node to the users of the dequant node. - for user in node.users: - if utils.is_dequant_node(user): - dq_node = user - dq_node.replace_all_uses_with(original_node) - - graph_module.recompile() - dead_code_elimination_pass(graph_module) - # Re-trace to validate everything is ok - graph_module = super().call(graph_module).graph_module - - return PassResult(graph_module, True) diff --git a/backends/vulkan/_passes/fuse_patterns.py b/backends/vulkan/_passes/fuse_patterns.py deleted file mode 100644 index 6ced1f32a7c..00000000000 --- a/backends/vulkan/_passes/fuse_patterns.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import executorch.backends.vulkan.patterns as vk_patterns - -import torch - -from executorch.exir import ExportedProgram -from executorch.exir.pass_base import ExportPass, PassResult - - -class FusePatternsPass(ExportPass): - def __init__(self, exported_program: ExportedProgram) -> None: - super().__init__() - self.program = exported_program - - def call(self, graph_module: torch.fx.GraphModule): - total_replaced = vk_patterns.replace_all_fusable_subgraphs( - self.program, graph_module - ) - - if total_replaced > 0: - graph_module.recompile() - # Re-trace the graph - graph_module = super().call(graph_module).graph_module - - return PassResult(graph_module, total_replaced > 0) diff --git a/backends/vulkan/_passes/fuse_quantized_ops.py b/backends/vulkan/_passes/fuse_quantized_ops.py deleted file mode 100644 index 3d3214bb4ee..00000000000 --- a/backends/vulkan/_passes/fuse_quantized_ops.py +++ /dev/null @@ -1,515 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-strict - -from typing import Optional, Tuple - -import executorch.backends.vulkan.utils as utils -import torch - -import torch.nn.functional as F - -from executorch.backends.transforms.utils import get_param_tensor, is_param_node -from executorch.exir import ExportedProgram -from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import ExportPass, PassResult -from executorch.exir.passes import dead_code_elimination_pass - -################# -## linear_qcnw ## -################# - - -def matches_linear_qcnw_pattern( # noqa: C901 - program: ExportedProgram, node: torch.fx.Node -) -> Optional[Tuple[torch.qscheme, int]]: - """ - Checks if the nodes surrounding a linear node matches the pattern for weight only - quantized linear, where the weight is quantized channelswise to n bits. - - If the graph pattern matches, then return a tuple of (quantization_method, nbits) - describing the type of quantization used for the weights. Otherwise, return None. - """ - if not utils.is_linear_node(node): - return None - - input_node = node.args[0] - weight_node = node.args[1] - - # Type checking - if not isinstance(weight_node, torch.fx.Node): - return None - if not isinstance(input_node, torch.fx.Node): - return None - - # The input arg should not be a dequant node; if it is, then it is indicative that - # dynamically quantized linear should be used instead - if utils.is_dequant_node(input_node): - return None - - # The weight arg should be a dequant node dequantizing the quantized weight - # Furthermore, the op expects per channel quantization of the weight - if not utils.is_dequant_per_channel_node(weight_node): - return None - - orig_weight = weight_node.args[0] - zeros = weight_node.args[2] - - # Type checking - if not isinstance(orig_weight, torch.fx.Node): - return None - if not is_param_node(program, orig_weight): - return None - if not isinstance(zeros, torch.fx.Node): - return None - if not is_param_node(program, zeros): - return None - - zeros_tensor = get_param_tensor(program, zeros) - if not isinstance(zeros_tensor, torch.Tensor): - return None - - quant_method = torch.per_channel_affine - # Check for symmetric quantization, where the zeros used for dequantization will - # actually be all zeros. 
- if torch.all(zeros_tensor == 0): - quant_method = torch.per_channel_symmetric - - orig_weight_tensor = get_param_tensor(program, orig_weight) - if not isinstance(orig_weight_tensor, torch.Tensor): - return None - # Sanity check the dtype of the quantized weight - if orig_weight_tensor.dtype != torch.int8: - return None - - quant_min = orig_weight_tensor.min().item() - quant_max = orig_weight_tensor.max().item() - # Determine the number of bits the weight has been quantized to - if quant_min >= -8 and quant_max <= 7: - return quant_method, 4 - elif quant_min >= -128 and quant_max <= 127: - return quant_method, 8 - - return None - - -def pack_4bit_weight_tensor(inp: torch.Tensor) -> torch.Tensor: - """ - Given a 8-bit weight tensor containing values quantized to 4 bits, create a packed - weight tensor by packing 2 4-bit values in one unsigned 8-bit value. - - An input weight tensor of shape (M, K) will produce a packed weight tensor of shape - (M, K / 2). - """ - - # Assert we got a properly quantized tensor. - min, max = inp.min().item(), inp.max().item() - assert ( - max <= 7 and min >= -8 - ), f"convert_to_qc4w: [min,max] out of [-8, 7] range, got [{min}, {max}]" - - # Assuming we have a 2d tensor - if inp.ndim != 2: - inp = inp.squeeze() - assert ( - inp.ndim == 2 - ), f"convert_to_qc4w: expecting input tensor to be 2d, got {inp.ndim}" - - # pad ic - if inp.shape[-1] % 2 != 0: - inp = F.pad(input=inp, pad=(0, 1, 0, 0), mode="constant", value=0) - - # Shape after padding - oc, ic = inp.shape - assert ic % 2 == 0, "convert_to_qc4w: expecting ic to be even" - - # Adjust inp tensor for zp - inp = inp.to(dtype=torch.uint8) + 8 - - # Prepare the Result tensor - inp = inp.contiguous().view(-1) - return (inp[::2] << 4 | inp[1::2]).view(oc, int(ic / 2)) - - -def fuse_into_linear_qcnw_node( - program: ExportedProgram, - graph_module: torch.fx.GraphModule, - linear_node: torch.fx.Node, - quant_method: torch.qscheme, - nbits: int, -) -> None: - """ - The weight_int8pack_mm operator represents a weight only quantized linear operator, - where the weight tensor has been quantized channelswise to nbits bits. - - After the PT2E quantization flow, the expected graph pattern is - - dq_weight = dequantize(weight, scales) - out = linear(activation, dq_weight, bias?) 
- - The goal of this function is to condense that sequence into - - out = quantized_linear(activation, dq_weight, scales) - out = out + bias - """ - activation = linear_node.args[0] - dq_weight_node = linear_node.args[1] - assert isinstance(activation, torch.fx.Node) - assert isinstance(dq_weight_node, torch.fx.Node) - - bias = None - if len(linear_node.args) > 2: - bias = linear_node.args[2] - assert isinstance(bias, torch.fx.Node) - - orig_weight = dq_weight_node.args[0] - scale = dq_weight_node.args[1] - - # For 4 bit quantization, pack the weight tensor - if nbits == 4: - assert isinstance(orig_weight, torch.fx.Node) - orig_weight_tensor = get_param_tensor(program, orig_weight) - assert isinstance(orig_weight_tensor, torch.Tensor) - packed_weight_tensor = pack_4bit_weight_tensor(orig_weight_tensor) - utils.update_program_state_dict( - program, - orig_weight.name, - packed_weight_tensor, - ) - orig_weight.meta["val"] = orig_weight.meta["val"][:, ::2].to(torch.uint8) - - if nbits == 8 and quant_method == torch.per_channel_symmetric: - op_target = exir_ops.edge.aten._weight_int8pack_mm.default - elif nbits == 4 and quant_method == torch.per_channel_symmetric: - op_target = exir_ops.edge.et_vk.linear_qcs4w.default - else: - raise NotImplementedError( - "only 4 and 8 bits per channel symmetric quant supported for linear_qcnw" - ) - - with graph_module.graph.inserting_before(linear_node): - weight_int8pack_mm_node = graph_module.graph.create_node( - "call_function", - op_target, - (activation, orig_weight, scale), - ) - if bias: - add_node = graph_module.graph.create_node( - "call_function", - exir_ops.edge.aten.add.Tensor, - (weight_int8pack_mm_node, bias), - ) - linear_node.replace_all_uses_with(add_node) - else: - linear_node.replace_all_uses_with(weight_int8pack_mm_node) - graph_module.graph.erase_node(linear_node) - graph_module.graph.erase_node(dq_weight_node) - - -######################### -## linear_qta8a_qga4w ## -######################### - - -def _is_dequantize_affine_node(node: torch.fx.Node) -> bool: - """Check if a node is a dequantize_affine operation.""" - return ( - node.op == "call_function" - and node.target is not None - and hasattr(node.target, "__name__") - and "dequantize_affine" in getattr(node.target, "__name__", "") - ) - - -def _is_view_copy_node(node: torch.fx.Node) -> bool: - """Check if a node is a view_copy operation.""" - return ( - node.op == "call_function" - and node.target is not None - and hasattr(node.target, "__name__") - and "view_copy" in getattr(node.target, "__name__", "") - ) - - -def _validate_qta8a_qga4w_nodes( - input_node: torch.fx.node.Argument, weight_node: torch.fx.node.Argument -) -> Optional[torch.fx.Node]: - """ - Validate input and weight nodes for QTA8A_QGA4W pattern. - Returns the actual input node (after handling view operations) or None if invalid. 
- """ - # Type checking - ensure we have torch.fx.Node objects - if not isinstance(weight_node, torch.fx.Node) or not isinstance( - input_node, torch.fx.Node - ): - return None - - # Input may be preprocessed with a view node - actual_input_node = input_node - if _is_view_copy_node(input_node): - actual_input_node = input_node.args[0] - if not isinstance(actual_input_node, torch.fx.Node): - return None - - # Check if input is dequantized with dequantize_affine (from dynamic quantization) - if not _is_dequantize_affine_node(actual_input_node): - return None - - # Check if weight is dequantized with dequantize_affine - if not _is_dequantize_affine_node(weight_node): - return None - - return actual_input_node - - -def _extract_weight_params( - program: ExportedProgram, weight_node: torch.fx.Node -) -> Optional[Tuple[torch.fx.Node, torch.fx.Node, torch.fx.Node]]: - """Extract and validate weight parameters from dequantize_affine node.""" - # Get the original quantized weight and quantization parameters - if len(weight_node.args) < 4: - return None - - orig_weight = weight_node.args[0] - weight_scales = weight_node.args[2] - weight_zeros = weight_node.args[3] - - # Type checking - if not isinstance(orig_weight, torch.fx.Node) or not is_param_node( - program, orig_weight - ): - return None - if not isinstance(weight_scales, torch.fx.Node) or not is_param_node( - program, weight_scales - ): - return None - if not isinstance(weight_zeros, torch.fx.Node) or not is_param_node( - program, weight_zeros - ): - return None - - return orig_weight, weight_scales, weight_zeros - - -def _validate_4bit_quantization(weight_tensor: torch.Tensor) -> bool: - """Check if weight tensor is quantized to 4 bits (values in [-8, 7] range).""" - quant_min = weight_tensor.min().item() - quant_max = weight_tensor.max().item() - return quant_min >= -8 and quant_max <= 7 - - -def _calculate_group_size( - orig_weight_tensor: torch.Tensor, weight_scales_tensor: torch.Tensor -) -> Optional[int]: - """Calculate and validate group size from weight and scales tensors.""" - out_features, in_features = orig_weight_tensor.shape - - if len(weight_scales_tensor.shape) != 2: - return None - - scales_out_features, num_groups = weight_scales_tensor.shape - - if scales_out_features != out_features: - return None - - group_size = in_features // num_groups - if in_features % group_size != 0: - return None - - return group_size - - -def matches_linear_qta8a_qga4w_pattern( - program: ExportedProgram, node: torch.fx.Node -) -> Optional[Tuple[int, int]]: - """ - Checks if the nodes surrounding a linear node matches the pattern for dynamic - activation + grouped weight quantized linear (QTA8A_QGA4W). - - This pattern involves: - 1. Dynamic quantization of input activations (8-bit) - 2. Grouped quantization of weights (4-bit with group size) - - The expected pattern from Int8DynActInt4WeightQuantizer is: - scale, zero_point = choose_qparams_affine(input) - quantized_input = quantize_affine(input, scale, zero_point) - dequantized_input = dequantize_affine(quantized_input, ...) - dequantized_weight = dequantize_affine(weight, weight_scales, weight_zeros) - output = linear(dequantized_input, dequantized_weight) - - If the pattern matches, return (group_size, weight_bits), otherwise None. 
- """ - if not utils.is_linear_node(node): - return None - - input_node = node.args[0] - weight_node = node.args[1] - - # Validate nodes and get actual input node - actual_input_node = _validate_qta8a_qga4w_nodes(input_node, weight_node) - if actual_input_node is None: - return None - - # Extract weight parameters - if not isinstance(weight_node, torch.fx.Node): - return None - weight_params = _extract_weight_params(program, weight_node) - if weight_params is None: - return None - - orig_weight, weight_scales, weight_zeros = weight_params - - # Get tensors to analyze the quantization scheme - orig_weight_tensor = get_param_tensor(program, orig_weight) - weight_scales_tensor = get_param_tensor(program, weight_scales) - weight_zeros_tensor = get_param_tensor(program, weight_zeros) - - if not isinstance(orig_weight_tensor, torch.Tensor): - return None - if not isinstance(weight_scales_tensor, torch.Tensor): - return None - if not isinstance(weight_zeros_tensor, torch.Tensor): - return None - - # Check if weight is quantized to 4 bits - if not _validate_4bit_quantization(orig_weight_tensor): - return None - - # Calculate group size - group_size = _calculate_group_size(orig_weight_tensor, weight_scales_tensor) - if group_size is None: - return None - - # Verify this is 4-bit grouped quantization - weight_bits = 4 - - return group_size, weight_bits - - -def fuse_into_linear_qta8a_qga4w_node( - program: ExportedProgram, - graph_module: torch.fx.GraphModule, - linear_node: torch.fx.Node, - group_size: int, - weight_bits: int, -) -> None: - """ - Fuse the dynamic activation + grouped weight quantized linear pattern into - a single linear_qta8a_qga4w operator. - - The pattern: - dequantized_input = dequantize_affine(quantized_input, block_size, scale, zero_point, ...) - dequantized_weight = dequantize_affine(weight, block_size, weight_scales, weight_zeros, ...) 
- output = linear(dequantized_input, dequantized_weight) - - Becomes: - output = linear_qta8a_qga4w(quantized_input, input_scale, input_zero_point, - weight, group_size, weight_scales, weight_zeros) - """ - dq_input_node = linear_node.args[0] - dq_weight_node = linear_node.args[1] - - assert isinstance(dq_input_node, torch.fx.Node) - - input_view_node = None - # Input may be preprocessed with a view node - if ( - dq_input_node.op == "call_function" - and dq_input_node.target is not None - and hasattr(dq_input_node.target, "__name__") - and "view_copy" in getattr(dq_input_node.target, "__name__", "") - ): - input_view_node = dq_input_node - dq_input_node = dq_input_node.args[0] - assert isinstance(dq_input_node, torch.fx.Node) - - assert isinstance(dq_input_node, torch.fx.Node) - assert isinstance(dq_weight_node, torch.fx.Node) - - # Get the quantized input and quantization parameters from the input dequantize_affine node - # Args: (input, block_size, scale, zero_point, input_dtype, quant_min, quant_max, output_dtype) - quantized_input = dq_input_node.args[0] - input_scale = dq_input_node.args[2] # scale is the 3rd argument - input_zero_point = dq_input_node.args[3] if len(dq_input_node.args) > 3 else None - - # Get the weight and its quantization parameters from dequantize_affine - # Args: (weight, block_size, weight_scales, weight_zeros, input_dtype, quant_min, quant_max, output_dtype) - orig_weight = dq_weight_node.args[0] - weight_scales = dq_weight_node.args[2] - weight_zeros = dq_weight_node.args[3] - - # Pack the 4-bit weight tensor for efficient storage - assert isinstance(orig_weight, torch.fx.Node) - orig_weight_tensor = get_param_tensor(program, orig_weight) - assert isinstance(orig_weight_tensor, torch.Tensor) - packed_weight_tensor = pack_4bit_weight_tensor(orig_weight_tensor) - utils.update_program_state_dict( - program, - orig_weight.name, - packed_weight_tensor, - ) - # Update the metadata to reflect the new packed shape - orig_weight.meta["val"] = orig_weight.meta["val"][:, ::2].to(torch.uint8) - - # Create the linear_qta8a_qga4w node - with graph_module.graph.inserting_before(linear_node): - linear_qta8a_qga4w_node = graph_module.graph.create_node( - "call_function", - exir_ops.edge.et_vk.linear_qta8a_qga4w.default, - ( - quantized_input, # quantized input (int8) - input_scale, # mat1_scale - input_zero_point, # mat1_zero_point - orig_weight, # mat2_data (packed 4-bit weights) - group_size, # group_size (int) - weight_scales, # weight_scales - weight_zeros, # weight_zeros - ), - ) - - # Replace the linear node with the new fused node - linear_node.replace_all_uses_with(linear_qta8a_qga4w_node) - - # Erase nodes in the correct order (users first, then dependencies) - graph_module.graph.erase_node(linear_node) - if input_view_node is not None: - graph_module.graph.erase_node(input_view_node) - graph_module.graph.erase_node(dq_weight_node) - graph_module.graph.erase_node(dq_input_node) - - -class FuseQuantizedOpsTransform(ExportPass): - def __init__(self, exported_program: ExportedProgram) -> None: - super().__init__() - self.program = exported_program - - def call(self, graph_module: torch.fx.GraphModule) -> PassResult: - for node in graph_module.graph.nodes: - # Check for linear_qcnw pattern (weight-only quantization) - qcnw_details = matches_linear_qcnw_pattern(self.program, node) - if qcnw_details is not None: - qcnw_method, qcnw_nbits = qcnw_details - fuse_into_linear_qcnw_node( - self.program, graph_module, node, qcnw_method, qcnw_nbits - ) - continue - - # Check for 
linear_qta8a_qga4w pattern (dynamic activation + grouped weight quantization) - qta8a_qga4w_details = matches_linear_qta8a_qga4w_pattern(self.program, node) - if qta8a_qga4w_details is not None: - group_size, weight_bits = qta8a_qga4w_details - fuse_into_linear_qta8a_qga4w_node( - self.program, graph_module, node, group_size, weight_bits - ) - continue - - graph_module.recompile() - dead_code_elimination_pass(graph_module) - - # Re-trace the graph since new nodes were (potentially) inserted - graph_module = super().call(graph_module).graph_module - return PassResult(graph_module, True) diff --git a/backends/vulkan/_passes/insert_prepack_nodes.py b/backends/vulkan/_passes/insert_prepack_nodes.py deleted file mode 100644 index c45ed4ea25d..00000000000 --- a/backends/vulkan/_passes/insert_prepack_nodes.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-strict - -from copy import deepcopy - -from executorch.backends.vulkan.op_registry import handles_own_prepacking -from executorch.backends.vulkan.utils import is_param_node - -from executorch.exir.dialects._ops import ops as exir_ops - -from torch.export import ExportedProgram - - -def insert_prepack_nodes(program: ExportedProgram) -> ExportedProgram: - """ - Insert `et_vk.prepack` nodes for constant tensors in the graph. The prepack operator - is responsible for transferring the tensor data, which is serialized with the model, - to a GPU tensor object during the prepacking stage of model execution. - - Some operators are performance sensitive and will prefer to handle prepacking within - the operator. For these ops, the constant tensor data will be passed directly as an - argument into the operator implementation. - """ - - for node in program.graph_module.graph.nodes: - # Prepacking is only needed for constant tensors. Only nodes corresponding to - # constant tensors will proceed beyond this point. - if not is_param_node(program, node): - continue - - # Mark that this node is going to be represented as a TensorRef type in the - # Vulkan compute graph. This annotation is used in later graph passes. - node.meta["etvk_tensorref"] = True - - # Get the list of node users that do not handle their own prepacking - nodes_to_replace_input = [] - for user in node.users: - if user.op == "call_function" and not handles_own_prepacking(user.target): - nodes_to_replace_input.append(user) - - if len(nodes_to_replace_input) == 0: - continue - - replace_all_uses = len(nodes_to_replace_input) == len(node.users) - - with program.graph_module.graph.inserting_after(node): - prepack_node = program.graph_module.graph.create_node( - "call_function", - exir_ops.edge.et_vk.prepack.default, - (node,), - ) - # This pass assumes that the SpecPropPass() has already been applied - assert "spec" in node.meta - # Mutable buffers will not be marked as constant, but it might as well be - # for the purposes of memory planning. Mark it as a constant tensor so that - # it is handled correctly by the memory planning pass. - if not node.meta["spec"].const: - assert is_param_node(program, node) - node.meta["spec"].const = True - # Validate that the original node is marked as a constant. Constant tensors - # do not participate in memory planning.
- assert node.meta["spec"].const - prepack_node.meta["val"] = node.meta["val"] - prepack_node.meta["spec"] = deepcopy(node.meta["spec"]) - # Set the mem_obj_id to -1 to indicate that this node requires a dedicated - # memory object. - prepack_node.meta["spec"].mem_obj_id = -1 - if replace_all_uses: - node.replace_all_uses_with( - prepack_node, - lambda x, y=prepack_node: (x != y and x.op != "output"), - ) - else: - for user_node in nodes_to_replace_input: - user_node.replace_input_with(node, prepack_node) - - program.graph.eliminate_dead_code() - return program diff --git a/backends/vulkan/_passes/int4_weight_only_quantizer.py b/backends/vulkan/_passes/int4_weight_only_quantizer.py deleted file mode 100644 index 34ff5937822..00000000000 --- a/backends/vulkan/_passes/int4_weight_only_quantizer.py +++ /dev/null @@ -1,283 +0,0 @@ -# pyre-unsafe -import logging -from typing import Any, Callable, Dict, Optional, Type - -import executorch.backends.vulkan.custom_ops_lib # noqa - -import torch -import torch.nn.functional as F - -from torchao.quantization.unified import Quantizer -from torchao.quantization.utils import groupwise_affine_quantize_tensor - - -# TODO: import from from torchao.quantization.GPTQ.GPTQ import _check_linear_int4_k -# Once diff train catches up -def _check_linear_int4_k(k, group_size=1, inner_k_tiles=None): - """ - Check if the dimensions are compatible with int4 quantization. - - Args: - k: The dimension size to check - group_size: The group size for quantization - inner_k_tiles: The inner k tiles size - - Returns: - bool: Whether the dimensions are compatible - """ - k_divisible_by_group_size = k % group_size == 0 - if inner_k_tiles is not None: - k_divisible_by_16_times_inner_k_tiles = k % (inner_k_tiles * 16) == 0 - return k_divisible_by_group_size and k_divisible_by_16_times_inner_k_tiles - return k_divisible_by_group_size - - -# This module is copied from torchao.quantization.GPTQ.WeightOnlyInt4Linear with -# changes at the annotated lines. -class VkWeightOnlyInt4Linear(torch.nn.Module): - __constants__ = ["in_features", "out_features"] - in_features: int - out_features: int - weight: torch.Tensor - - def __init__( - self, - in_features: int, - out_features: int, - # TODO: remove dtype field, not used - bias=False, - device=None, - dtype=None, - groupsize: int = 128, - inner_k_tiles: int = 8, - precision: torch.dtype = torch.bfloat16, - scales_precision: torch.dtype = torch.bfloat16, - ) -> None: - super().__init__() - self.padding = not _check_linear_int4_k(in_features, groupsize, inner_k_tiles) - if self.padding: - from torchao.utils import find_multiple - - self.origin_in_features = in_features - # pyre-ignore[6]: Incompatible parameter type - in_features = find_multiple(in_features, 1024) - - self.use_bias = bias - self.in_features = in_features - self.out_features = out_features - self.device = device - self.groupsize = groupsize - self.inner_k_tiles = inner_k_tiles - self.precision = precision - self.scales_precision = scales_precision - - if dtype is not None: - raise ValueError("Please specify 'precision' instead of 'dtype'") - - assert out_features % 8 == 0, "require out_features % 8 == 0" - assert ( - in_features % (inner_k_tiles * 16) == 0 - ), "require in_features % (innerKTiles * 16) == 0" - # In the original implementation, the weight buffer is registered with the packed - # sizes, i.e. the result of calling the _convert_weight_to_int4pack operator. 
- # However, the Vulkan implementation does not expect the weights to be packed - # therefore the weight tensor is registered with the unpacked sizes instead. - # Note that in_features is divided by 2 because each `uint8` tensor element - # contains 2 4-bit packed values. - self.register_buffer( - "weight", - torch.empty( - (out_features, in_features // 2), - dtype=torch.uint8, - device=device, - ), - ) - self.dtype = dtype - self.register_buffer( - "scales_and_zeros", - torch.empty( - (in_features // groupsize, out_features, 2), - dtype=self.scales_precision, - device=device, - ), - ) - if bias: - self.register_buffer( - "bias", - torch.empty((out_features,), dtype=torch.float32, device=device), - ) - - def forward(self, input: torch.Tensor) -> torch.Tensor: - if self.padding: - input = F.pad(input, pad=(0, self.in_features - self.origin_in_features)) - # The forward method is replaced. In the original implementation, the forward - # method is torchao.quantization.GPTQ.linear_forward_int4; here a Vulkan custom - # operator is called instead. - r = torch.ops.et_vk.linear_weight_int4( - input, - self.weight, - self.groupsize, - self.scales_and_zeros, - self.inner_k_tiles, - ) - if self.use_bias: - return r + self.bias - return r - - -# This function is coped from torchao.quantization.GPTQ._replace_linear_int4 -# with small changes at the annotated locations. -def _vk_replace_linear_int4( - module: torch.nn.Module, - groupsize: int, - inner_k_tiles: Optional[int], - padding_allowed: bool, - skip_layer_func: Optional[Callable] = None, - precision: torch.dtype = torch.bfloat16, - scales_precision: torch.dtype = torch.bfloat16, - # Use custom vulkan linear layer as default - linear_class: Type[torch.nn.Module] = VkWeightOnlyInt4Linear, - copy_weights: bool = False, -): - for name, child in module.named_children(): - if isinstance(child, torch.nn.Linear) and ( - skip_layer_func is None or not skip_layer_func(child.weight) - ): - # Add an additional condition that the out/in features must not exceed the - # `feature_limit` argument. - if ( - _check_linear_int4_k(child.in_features, groupsize, inner_k_tiles) - or padding_allowed - ): - new_linear = linear_class( - child.in_features, - child.out_features, - bias=child.bias is not None, - device=child.weight.device, - groupsize=groupsize, - inner_k_tiles=inner_k_tiles, - precision=precision, - scales_precision=scales_precision, - ) - if copy_weights and child.weight.device != torch.device("meta"): - # pyre-fixme[16]: `Module` has no attribute `weight`. - new_linear.weight = child.weight - if child.bias is not None: - # pyre-fixme[16]: `Module` has no attribute `bias`. - new_linear.bias = child.bias - setattr(module, name, new_linear) - else: - _vk_replace_linear_int4( - child, - groupsize, - inner_k_tiles, - padding_allowed, - skip_layer_func, - precision, - scales_precision, - linear_class, - copy_weights, - ) - - -# This module is copied from torchao.quantization.GPTQ.Int4WeightOnlyQuantizer -# with some changes at the annotated lines. 
-class VkInt4WeightOnlyQuantizer(Quantizer): - def __init__( - self, - groupsize: int = 256, - padding_allowed: bool = True, - inner_k_tiles: Optional[int] = 8, - device: torch.device = torch.device("cpu"), # noqa - precision: torch.dtype = torch.float32, - ) -> None: - super().__init__() - assert inner_k_tiles in [2, 4, 8] - assert groupsize in [32, 64, 128, 256] - - self.inner_k_tiles = inner_k_tiles - self.groupsize: int = groupsize - self.padding_allowed: bool = padding_allowed - self.device: torch.device = device - self.precision: torch.dtype = precision - - @torch.no_grad() - def _create_quantized_state_dict( - self, model: torch.nn.Module - ) -> Dict[str, torch.Tensor]: - cur_state_dict = model.state_dict() - for fqn, mod in model.named_modules(): - # Add additional check to make sure features do not exceed feature limit - if isinstance(mod, torch.nn.Linear): - out_features = mod.out_features - in_features = mod.in_features - logging.info(f"linear: {fqn}, in={in_features}, out={out_features}") - - assert ( - in_features % self.groupsize == 0 - ), f"require in_features:{in_features} % self.groupsize:{self.groupsize} == 0" - - weight = mod.weight.data - if not _check_linear_int4_k( - in_features, self.groupsize, self.inner_k_tiles - ): - if self.padding_allowed: - import torch.nn.functional as F - - from torchao.utils import find_multiple - - logging.warn( - f"warning: {fqn} is padded to satisfy in_features % 1024 == 0" - ) - # pyre-ignore[6]: Incompatible parameter type - padded_in_features = find_multiple(in_features, 1024) - weight = F.pad( - weight, pad=(0, padded_in_features - in_features) - ) - else: - logging.warn( - f"warning: {fqn} is skipped, int4 requires that in_features is 32, 64, or is divisible by 1024, " - + "and that groupsize and inner_k_tiles*16 evenly divide into it" - ) - continue - (w_int4x8, scales_and_zeros) = groupwise_affine_quantize_tensor( - weight, - 4, # n_bit - self.groupsize, - self.precision, # dtype for scales_and_zeros - ) - # If the packing of 2 4-bit values into a single 8-bit value was not - # performed in the previous function call, then do it manually now. - if w_int4x8.shape == weight.shape: - w_int4x8 = (w_int4x8[::, ::2] << 4 | w_int4x8[::, 1::2]).to( - torch.uint8 - ) - # In the original implementation, w_int4x8 is packed via calling the - # _convert_weight_to_int4pack operator before storing the weight. However - # the Vulkan implementation does not expect the weights to be packed, so - # the w_int4x8 tensor is stored as the weight instead. - cur_state_dict[f"{fqn}.weight"] = w_int4x8.to(self.device) - cur_state_dict[f"{fqn}.scales_and_zeros"] = scales_and_zeros.to( - self.device - ) - return cur_state_dict - - def _convert_for_runtime(self, model: torch.nn.Module) -> torch.nn.Module: - _vk_replace_linear_int4( - model, - self.groupsize, - self.inner_k_tiles, - self.padding_allowed, - skip_layer_func=None, - precision=self.precision, - scales_precision=self.precision, - ) - return model - - def quantize( - self, model: torch.nn.Module, *args: Any, **kwargs: Any - ) -> torch.nn.Module: - state_dict = self._create_quantized_state_dict(model) - model = self._convert_for_runtime(model) - model.load_state_dict(state_dict, strict=False) - return model diff --git a/backends/vulkan/_passes/remove_asserts.py b/backends/vulkan/_passes/remove_asserts.py deleted file mode 100644 index 835f2ec1415..00000000000 --- a/backends/vulkan/_passes/remove_asserts.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. 
-# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-strict - -from typing import Set, Union - -import torch - -from executorch.exir.dialects.edge._ops import EdgeOpOverload -from executorch.exir.pass_base import ExportPass, PassResult -from executorch.exir.program._program import _get_updated_graph_signature - -from torch.export.exported_program import ExportedProgram - -OpType = Union[str, torch._ops.OpOverload, EdgeOpOverload] - - -class RemoveAssertsTransform(ExportPass): - """ - Remove operators which perform assertions. These are not possible to execute in - Vulkan since GLSL shaders cannot abort execution at runtime. Therefore, remove these - operators. - """ - - assert_ops: Set[OpType] = { - torch.ops.aten._assert_scalar.default, - torch.ops.aten.sym_constrain_range_for_size.default, - } - - def call(self, graph_module: torch.fx.GraphModule) -> PassResult: - for node in graph_module.graph.nodes: - if node.target in self.assert_ops: - graph_module.graph.erase_node(node) - - graph_module.graph.eliminate_dead_code() - graph_module.recompile() - return PassResult(graph_module, True) - - -def remove_asserts(edge_program: ExportedProgram) -> ExportedProgram: - graph_module = edge_program.graph_module - RemoveAssertsTransform()(graph_module) - - edge_program._graph_signature = _get_updated_graph_signature( - edge_program.graph_signature, graph_module - ) - edge_program._validate() - return edge_program diff --git a/backends/vulkan/_passes/remove_local_scalar_dense_ops.py b/backends/vulkan/_passes/remove_local_scalar_dense_ops.py deleted file mode 100644 index 6ce3572ec0c..00000000000 --- a/backends/vulkan/_passes/remove_local_scalar_dense_ops.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-strict - -import torch -from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.pass_base import ExportPass, PassResult - -from torch._subclasses.fake_tensor import FakeTensor - - -def node_is_local_scalar_dense_chain(node: torch.fx.Node) -> bool: - """ - Converting a tensor to a scalar via tensor[0].item() creates a index_select + - local_scalar_dense pattern in the graph. Check if a node is the start of this pattern. - """ - if ( - node.op == "call_function" - and node.target == exir_ops.edge.aten.select_copy.int - and len(node.users) == 1 - ): - user = list(node.users.keys())[0] - return user.target == torch.ops.aten._local_scalar_dense.default - - return False - - -def tag_node_if_scalar_tensor(node: torch.fx.Node) -> None: - """ - A scalar tensor in the Vulkan backend is a tensor that can be represented as a scalar - value instead of a Tensor object. The criteria for identifying a tensor as a scalar - tensor are as follows: - - 1. The tensor has only 1 element - 2. One of the node's uses is converting it to a scalar via `tensor[0].item()`, which - creates a index_select + local_scalar_dense pattern in the graph - - If any of these criteria are fulfilled, then tag the node for the tensor to mark it - so that it is added as a scalar value during serialization. 
- """ - tensor_val = node.meta["val"] - if not isinstance(tensor_val, FakeTensor): - return - - # Scalar tensors must have only one element - if tensor_val.numel() != 1: - return - - for user in node.users: - if node_is_local_scalar_dense_chain(user): - node.meta["etvk_is_scalar_tensor"] = True - - -def remove_local_scalar_dense_chain(graph: torch.fx.Graph, node: torch.fx.Node) -> None: - """ - Remove the index_select + local_scalar_dense pattern in the graph in favor of passing - the original scalar tensor directly. - """ - replace_node = node.args[0] - assert isinstance(replace_node, torch.fx.Node) - # If the argument to the local_scalar_dense op is a select op with only - # one user, and the argument to the select op is a tensor with only one - # element (i.e. a scalar tensor), then replace the entire pattern with the - # scalar tensor. - if ( - replace_node.op == "call_function" - and replace_node.target == exir_ops.edge.aten.select_copy.int - ): - # pyre-ignore - if replace_node.args[0].meta["val"].numel() == 1: - replace_node = replace_node.args[0] - assert isinstance(replace_node, torch.fx.Node) - assert replace_node.meta.get("etvk_is_scalar_tensor", True) - - with graph.inserting_after(node): - node.replace_all_uses_with(replace_node) - - -def remove_local_scalar_dense_ops(graph: torch.fx.Graph) -> torch.fx.Graph: - """ - The purpose of this pass is twofold: - 1. Tag scalar tensors (see `tag_node_if_scalar_tensor()` for the criteria) - 2. Remove the index_select + local_scalar_dense pattern in the graph in favor of - passing the original scalar tensor directly (see `remove_local_scalar_dense_chain()`) - - This makes it easier to deal with scalar tensors in the Vulkan backend. In particular, - it allows serializing scalar tensors as SymInt objects instead of Tensor objects. - Because scalar tensors are often used to inform tensor shapes, their values need to - be easily accessed by the CPU during resizing logic, while also being able to reflect - updates to their value in any GPU shaders that reference them. - """ - target_op = torch.ops.aten._local_scalar_dense.default - for node in graph.nodes: - tag_node_if_scalar_tensor(node) - - if node.op == "call_function" and node.target == target_op: - remove_local_scalar_dense_chain(graph, node) - - graph.eliminate_dead_code() - return graph - - -class RemoveLocalScalarDenseOpsTransform(ExportPass): - def call(self, graph_module: torch.fx.GraphModule) -> PassResult: - graph_module.graph = remove_local_scalar_dense_ops(graph_module.graph) - return PassResult(graph_module, True) diff --git a/backends/vulkan/_passes/remove_redundant_ops.py b/backends/vulkan/_passes/remove_redundant_ops.py deleted file mode 100644 index 530505f7003..00000000000 --- a/backends/vulkan/_passes/remove_redundant_ops.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-strict - -from typing import Set, Union - -import torch -from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.dialects.edge._ops import EdgeOpOverload -from executorch.exir.pass_base import ExportPass, PassResult -from executorch.exir.passes import dead_code_elimination_pass - -OpType = Union[str, torch._ops.OpOverload, EdgeOpOverload] - - -class RemoveRedundantOpsTransform(ExportPass): - """ - Trim certain operators to reduce unnecessary overhead. 
- """ - - redundant_ops: Set[OpType] = { - torch.clone, - torch.ops.aten.clone.default, - exir_ops.edge.aten.clone.default, - torch.ops.aten.alias.default, - exir_ops.edge.aten.alias.default, - exir_ops.edge.aten.lift_fresh_copy.default, - exir_ops.edge.dim_order_ops._to_dim_order_copy.default, - } - - def __init__(self) -> None: - super(RemoveRedundantOpsTransform, self).__init__() - - def _should_remove(self, node: torch.fx.Node) -> bool: - if node.target in self.redundant_ops: - return True - - # Only remove to_copy if dtype does not change. Otherwise, memory format changes - # will be handled internally by the backend. - if ( - node.target == exir_ops.edge.aten._to_copy.default - or node.target == torch.ops.aten._to_copy.default - ): - src_dtype = node.meta["val"].dtype - # pyre-ignore - dst_dtype = node.args[0].meta["val"].dtype - return src_dtype == dst_dtype - - return False - - def _remove(self, graph_module: torch.fx.GraphModule) -> None: - for node in graph_module.graph.nodes: - if not self._should_remove(node): - continue - - with graph_module.graph.inserting_after(node): - node.replace_all_uses_with(node.args[0]) - - graph_module.graph.eliminate_dead_code() - - def call(self, graph_module: torch.fx.GraphModule) -> PassResult: - self._remove(graph_module) - graph_module.recompile() - dead_code_elimination_pass(graph_module) - return PassResult(graph_module, True) diff --git a/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py b/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py deleted file mode 100644 index c415249383e..00000000000 --- a/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-strict - -from typing import Dict, List, Set, Tuple, Union - -import executorch.backends.vulkan.custom_ops_lib # noqa: needed to access vk op -from executorch.exir.dialects._ops import ops as exir_ops -from executorch.exir.dialects.edge._ops import EdgeOpOverload -from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue - -from torch._ops import OpOverload - -from torch.fx.node import Argument - -OpType = Union[str, OpOverload, EdgeOpOverload] - - -class SqueezeUnsqueezeInputs(ExportPass): - _squeezable_ops: Set[OpType] = { - exir_ops.edge.et_vk.linear_weight_int4.default, - exir_ops.edge.aten.relu.default, - exir_ops.edge.aten.gelu.default, - } - - def should_squeeze(self, op, shape: List[int]) -> bool: # pyre-ignore - if len(shape) == 3: - return shape[1] == 1 and shape[0] > 1 - if len(shape) == 4: - # No need to squeeze if all dims are 1 except the width dim - if shape[0] == shape[1] == shape[2] == 1: - return False - # No need to squeeze if batch and channel dims are 1 and height and width are > 1 - if shape[0] == shape[1] == 1 and shape[2] > 1 and shape[3] > 1: - return False - # No need to squeeze if batch dim is 1 and channel, height and width are > 1 - if shape[0] == 1 and shape[1] > 1 and shape[2] > 1 and shape[3] > 1: - return False - # Otherwise, check for squeezable dim - return 1 in shape[:-1] - - # Prefer not to introduce additional orchestration ops by default - return False - - def call_operator( - self, - op, # pyre-ignore - args: Tuple[Argument, ...], - kwargs: Dict[str, Argument], - meta: NodeMetadata, - ) -> ProxyValue: - if op not in self._squeezable_ops: - return super().call_operator(op, args, kwargs, meta) - # pyre-ignore[16]: `None` has no attribute `node` - input_shape = args[0].node.meta["val"].shape - output_shape = meta["val"].shape - - if not self.should_squeeze(op, input_shape): - return super().call_operator(op, args, kwargs, meta) - - def _squeezable(shape: List[int]) -> bool: - return len(shape) > 2 and 1 in shape - - # squeeze input tensor - squeeze_shape = list(input_shape) - while _squeezable(squeeze_shape): - squeeze_shape.remove(1) - - squeeze_out = super().call_operator( - exir_ops.edge.aten.view_copy.default, - (args[0], squeeze_shape), - kwargs, - meta, - ) - # call linear on squeezed output - new_args = (squeeze_out, *args[1:]) - linear_out = super().call_operator( - op, - new_args, - kwargs, - meta, - ) - # unsqueeze output - unsqueeze_shape = list(output_shape) - return super().call_operator( - exir_ops.edge.aten.view_copy.default, - (linear_out, unsqueeze_shape), - kwargs, - meta, - ) diff --git a/backends/vulkan/_passes/tag_memory_meta_pass.py b/backends/vulkan/_passes/tag_memory_meta_pass.py deleted file mode 100644 index db53cc666a8..00000000000 --- a/backends/vulkan/_passes/tag_memory_meta_pass.py +++ /dev/null @@ -1,466 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
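The SqueezeUnsqueezeInputs pass above only inserts `view_copy` nodes around the selected op; numerically it is equivalent to flattening the squeezable unit dimension, running the op, and restoring the original output shape. A small sketch of that equivalence in plain PyTorch follows, with the shapes (a [4, 1, 8] input and a 16-feature linear layer) chosen purely for illustration.

```python
import torch

x = torch.randn(4, 1, 8)        # batch of 4 with a singleton middle dim
weight = torch.randn(16, 8)

# What the rewritten graph computes: squeeze, run the op, unsqueeze back.
squeezed = x.view(4, 8)                       # view_copy to the squeezed shape
out_squeezed = torch.nn.functional.linear(squeezed, weight)
out = out_squeezed.view(4, 1, 16)             # view_copy back to the output shape

# Identical to running the op directly on the original rank-3 input.
assert torch.allclose(out, torch.nn.functional.linear(x, weight))
```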
- -import logging -import operator - -from typing import Any - -import executorch.backends.vulkan.utils as utils - -import torch - -from executorch.backends.vulkan.op_registry import get_op_features, has_impl, OpFeatures - -from executorch.backends.vulkan.serialization.vulkan_graph_schema import ( - VkMemoryLayout, - VkStorageType, -) - -from executorch.exir.dialects._ops import ops as exir_ops - -from executorch.exir.pass_base import ExportPass, PassResult -from executorch.exir.tensor import TensorSpec - -logger: logging.Logger = logging.getLogger("") -logger.setLevel(logging.INFO) - - -def insert_transition_node( - graph_module: torch.fx.GraphModule, - node: torch.fx.Node, - arg: torch.fx.Node, - arg_node_repr: utils.TensorRepr, -) -> None: - """ - Insert a clone node to transition the tensor associated with `arg` to a tensor with - the requested representation `arg_node_repr`, and use the cloned node as an argument - to `node` instead of `arg`. - """ - with graph_module.graph.inserting_before(node): - clone_node = graph_module.graph.create_node( - "call_function", - exir_ops.edge.aten.clone.default, - (arg,), - ) - clone_node.meta["val"] = arg.meta["val"] - clone_node.meta["spec"] = TensorSpec.from_tensor(clone_node.meta["val"]) - clone_node.meta["spec"].const = False - utils.set_node_repr(clone_node, arg_node_repr) - arg.replace_all_uses_with(clone_node, lambda x, y=node: x == y) - - -def set_arg_node_repr_or_transition( - graph_module: torch.fx.GraphModule, - op_node: torch.fx.Node, - arg_i: int, - arg_node_repr: utils.TensorRepr, - dirty: bool, -) -> bool: - """ - Does one of following: - 1. Sets the `node_repr` of the argument at `arg_i` of `op_node` if the argument node - does not currently have a `node_repr` - 2. No-op if the current `node_repr` is already the same as the requested represetnation. - 3. Insert a transition node to create a copy of the argument with the desired `node_repr` - if the current `node_repr` is different than what is needed. - """ - arg_node = op_node.args[arg_i] - - def single_node_impl(node: torch.fx.Node) -> bool: - # Case where the arg node has not been touched yet; in this case, simply set it and - # return. - if not utils.has_node_repr(node): - utils.set_node_repr(node, arg_node_repr) - return False - - # Case where the current node representation is the same as the new one. - cur_node_repr = utils.get_node_repr(node) - assert isinstance(cur_node_repr, utils.TensorRepr) - - if cur_node_repr == arg_node_repr: - return False - - if not dirty: - logger.info( - f"[Vulkan Delegate] Inserting transition(s) for {op_node.format_node()}:" - ) - - # Existing node representation is different; insert a transition node - # Currently, the transition node insertion logic can only handle single tensor nodes - assert utils.is_single_tensor_node(node) - insert_transition_node(graph_module, op_node, node, arg_node_repr) - - logger.info(f" arg {arg_i} ({node}): ({cur_node_repr}) -> ({arg_node_repr})") - - return True - - if isinstance(arg_node, torch.fx.Node): - return single_node_impl(arg_node) - elif isinstance(arg_node, (list, tuple)): - ret: bool = False - for n in arg_node: - assert isinstance(n, torch.fx.Node) - assert utils.is_single_tensor_node(n) - ret = single_node_impl(n) or ret - - return ret - - raise NotImplementedError(f"Unhandled node type {arg_node}") - - -class TagMemoryMetaPass(ExportPass): - """ - Operator implementations in the Vulkan delegate may require that input and output - tensors use a specific representation. 
Representation in this case refers to a - combination of storage type (buffer or texture) and memory layout (width, height, or - channels packed). - - The tag memory metadata pass is responsible for marking each tensor in the graph - with the appropriate representation to use. It is also responsible for inserting - operators to transition argument tensors to a required/compatible representation if - a mismatch has been detected. - """ - - def __init__( - self, - texture_limits: utils.ImageExtents, - default_storage_type: VkStorageType = VkStorageType.TEXTURE_3D, - default_memory_layout: VkMemoryLayout = VkMemoryLayout.TENSOR_WIDTH_PACKED, - ): - super().__init__() - self.default_storage: VkStorageType = default_storage_type - self.default_layout: VkMemoryLayout = default_memory_layout - self.texture_limits = texture_limits - - # Magic number to limit "lookahead" when tracing through users of an operator - # to constrain the representation of its arguments/outputs. - self.max_trace_search_depth = 20 - - def is_valid_op_node(self, node: Any) -> bool: - """ - Fails the check for: - * nodes that are not associated with a tensor - * nodes that are associated with a constant tensor - * nodes that are not associated with a supported operator - """ - if not isinstance(node, torch.fx.Node) or not utils.is_tensor_node(node): - return False - if node.meta.get("etvk_tensorref", False): - return False - if not has_impl(node.target): - return False - - return True - - def is_non_constant_tensor_node(self, node: Any) -> bool: - """ - Fails the check for: - * Nodes that are not associated with tensor values - * Nodes associated with constant tensors - * - """ - if isinstance(node, torch.fx.Node): - if not utils.is_tensor_node(node): - return False - if node.meta.get("etvk_tensorref", False): - return False - return True - - if isinstance(node, (tuple, list)): - for n in node: - if not isinstance(n, torch.fx.Node): - return False - if not self.is_non_constant_tensor_node(n): - return False - - return True - - # Return false by default - return False - - def get_node_cached_repsets(self, op_node: torch.fx.Node) -> utils.OpRepSets: - """ - Implements a cache layer for getting the OpRepSets for a given operator node. - """ - assert self.is_valid_op_node(op_node) - - if "etvk_node_repsets" in op_node.meta: - op_repsets = op_node.meta["etvk_node_repsets"] - assert isinstance(op_repsets, utils.OpRepSets) - return op_repsets - else: - # Special case for getitem - set the input and output to the repset of the - # tensor value being extracted - if op_node.target == operator.getitem: - src_node = op_node.args[0] - assert isinstance(src_node, torch.fx.Node) - idx = op_node.args[1] - assert isinstance(idx, int) - - arg_node_repsets = self.get_node_cached_repsets(src_node) - out_tensor_repset = arg_node_repsets.get_out_repset(idx) - - op_repsets = utils.OpRepSets( - utils.TensorRepSetList(out_tensor_repset), - utils.TensorRepSetList(out_tensor_repset), - op_node, - self.texture_limits, - ) - else: - features: OpFeatures = get_op_features(op_node.target) # noqa - op_repsets = features.make_op_repsets(op_node, self.texture_limits) - - op_node.meta["etvk_node_repsets"] = op_repsets - return op_repsets - - def get_arg_tensor_source_repset( - self, op_node: torch.fx.Node, arg_i: int - ) -> utils.TensorRepSet: - """ - Get the "source RepSet" for the tensor argument at index `arg_i` of `op_node`. - The source repset is obtained in one of two ways: - - 1. 
If the tensor argument already has a representation determined for it, return - a repset that contains that representation. - 2. Otherwise, return the output repset of the operator that produces the tensor - """ - arg_node = op_node.args[arg_i] - - # Special case for cat - use the first tensor in the list as representative - if isinstance(arg_node, list): - arg_node = arg_node[0] - - if utils.has_node_repr(arg_node): - arg_node_repr = utils.get_node_repr(arg_node) - assert isinstance(arg_node_repr, utils.TensorRepr) - return utils.make_tensor_repset(arg_node_repr) - elif self.is_valid_op_node(arg_node): - # Special case for getitem - propagate the node representation of the original node - if op_node.target == operator.getitem: - src_node = op_node.args[0] - assert isinstance(src_node, torch.fx.Node) - idx = op_node.args[1] - assert isinstance(idx, int) - - src_node_repsets = self.get_node_cached_repsets(src_node) - return src_node_repsets.get_out_repset(idx) - - src_node_repsets = self.get_node_cached_repsets(arg_node) - return src_node_repsets.get_out_repset(0) - - # default return - return utils.ANY_STORAGE - - def constrain_repset_with_user( - self, - current_node: torch.fx.Node, - arg_i: int, - arg_repset: utils.TensorRepSet, - search_depth: int = 0, - ) -> utils.TensorRepSet: - """ - Attempts to constrain `arg_repset` based on the required repset of the argument - at index `arg_i` of `current_node`. This tries to find a representation for the - argument that can be used for as long as possible without needing a transition. - """ - # The repset is already constrained; return it - if arg_repset.is_constrained(): - return arg_repset - - # The current node is not a valid op node, so no OpRepSets object can be created - # for it. - if not self.is_valid_op_node(current_node): - return arg_repset - - cur_node_repsets = self.get_node_cached_repsets(current_node) - - # Intersect with the repset required by the current operator; otherwise, return - # since a transition will be required anyways - req_arg_repset = cur_node_repsets.get_arg_repset(arg_i) - if req_arg_repset.any_in_common(arg_repset): - arg_repset = arg_repset.make_intersect(req_arg_repset) - else: - return arg_repset - - # Check if the argument at `arg_i` will influence the output representation of - # the current operator. - repset_propagates_to_output = cur_node_repsets.sync_primary_io_repr and ( - cur_node_repsets.sync_args_repr or arg_i == cur_node_repsets.primary_arg_idx - ) - - # If not, then no point in continuing to trace the users of the current node - if not repset_propagates_to_output: - return arg_repset - - return self.trace_node_users_to_constrain_repset( - current_node, arg_repset, search_depth - ) - - def trace_node_users_to_constrain_repset( - self, - origin_node: torch.fx.Node, - repset: utils.TensorRepSet, - search_depth: int = 0, - ) -> utils.TensorRepSet: - """ - For an ambiguous repset, try to constrain the repset by tracing the required - repsets of the users of `origin_node`. The idea is to try to find a representation - that can be used the longest without needing user nodes to insert a transition - for its arguments. 
- """ - # Optionally limit the search depth to improve export time - if self.max_trace_search_depth is not None: - if search_depth > self.max_trace_search_depth: - return repset - - users_to_trace = origin_node.users - - sync_outs_repr = True - if self.is_valid_op_node(origin_node): - sync_outs_repr = self.get_node_cached_repsets(origin_node).sync_outs_repr - - if utils.num_tensors_in_node(origin_node) > 1 and not sync_outs_repr: - users_to_trace = [] - for usage_node in origin_node.users: - if usage_node.target == operator.getitem and usage_node.args[1] == 1: - users_to_trace.append(usage_node) - - for usage_node in users_to_trace: - arg_i_in_user = None - for i in range(len(usage_node.args)): - if origin_node == usage_node.args[i]: - arg_i_in_user = i - break - - if arg_i_in_user is not None: - repset = self.constrain_repset_with_user( - usage_node, arg_i_in_user, repset, search_depth + 1 - ) - - if repset.is_constrained(): - return repset - - return repset - - def constrain_op_arg_repset(self, arg_i: int, op_repsets: utils.OpRepSets) -> None: - """ - Attempts to constrain the repset of the argument at index `arg_i` of the op - associated with `op_repsets`. Does this with two stages: - - 1. First, account for any existing representation that has already been determined - for the argument. If no existing representation has been determined, then use - the output repset of the operator that produces the argument. - 2. Then, try to trace through the users of the argument to find a representation - that can be used for as long as possible without needing a transition. - """ - arg_source_repset = self.get_arg_tensor_source_repset(op_repsets.op_node, arg_i) - op_repsets.try_constrain_with_arg_repset(arg_i, arg_source_repset) - - arg_repset = op_repsets.get_arg_repset(arg_i) - if arg_repset.is_constrained(): - return arg_repset - - arg_node = op_repsets.op_node.args[arg_i] - - if isinstance(arg_node, list): - arg_node = arg_node[0] - - arg_repset = self.trace_node_users_to_constrain_repset(arg_node, arg_repset) - op_repsets.try_constrain_with_arg_repset(arg_i, arg_repset) - - def constrain_op_repsets(self, op_repsets: utils.OpRepSets) -> None: - # For most ops, constraining the argument repsets will also contrain the output - # repset due to OpRepSets maintaining synchronization rules. - for i in range(len(op_repsets.op_node.args)): - if utils.is_tensor_arg_node(op_repsets.op_node.args[i]): - self.constrain_op_arg_repset(i, op_repsets) - - # TODO(ssjia): For most ops, inputs and outputs must be synchronized, so there - # is no need to constrain output repsets explicitly. Currently, the exceptions - # (i.e. choose qparams) already define constrined repsets for the output, so - # there is again no need to explicitly constrain the outputs. If an operator - # appears later on that does not sync input and output representations, and - # defines ambiguous repsets for the output tensor(s), then we will need to add - # additional logic to this function to constrain the output repsets separately - # from the input repsets. - - def set_op_node_tensor_reprs( - self, graph_module: torch.fx.GraphModule, op_node: torch.fx.Node - ) -> None: - """ - For an operator representated by `op_node`, get the OpRepSets associated with - the operation and try to constrain the repsets by accounting for existing - representations and tracing through the users of the operator. - - Then, determine a tensor representation for all tensors participating in the - operation and mark it in the node metadata. 
If the requested representation is - different than an already determined representation, then insert a transition - node to create a copy of the tensor with the desired representation. - """ - if not self.is_valid_op_node(op_node): - return - - # Special case for getitem - propagate the node representation of the original node - if op_node.target == operator.getitem: - src_node = op_node.args[0] - assert isinstance(src_node, torch.fx.Node) - idx = op_node.args[1] - assert isinstance(idx, int) - - arg_node_repr = utils.get_node_repr(src_node) - assert isinstance(arg_node_repr, list) - utils.set_node_repr(op_node, arg_node_repr[idx]) - return - - # Get a "fresh" OpRepSets object instead of using the cache. Do this because this - # class instance will go through the constraining process which may modify it. - features: OpFeatures = get_op_features(op_node.target) - op_repsets = features.make_op_repsets(op_node, self.texture_limits) - - self.constrain_op_repsets(op_repsets) - - args_repr_list, outs_repr_list = op_repsets.pick_representations() - - if len(outs_repr_list) == 1: - utils.set_node_repr(op_node, outs_repr_list[0]) - else: - utils.set_node_repr(op_node, outs_repr_list) - - transitions_inserted = False - for i, arg_node in enumerate(op_node.args): - if not self.is_non_constant_tensor_node(arg_node): - continue - - arg_node_repr = args_repr_list[i] - - if isinstance(arg_node, torch.fx.Node): - transitions_inserted = ( - set_arg_node_repr_or_transition( - graph_module, op_node, i, arg_node_repr, transitions_inserted - ) - or transitions_inserted - ) - elif isinstance(arg_node, (list, tuple)): - for n in arg_node: - assert isinstance(n, torch.fx.Node) - assert utils.is_single_tensor_node(n) - transitions_inserted = ( - set_arg_node_repr_or_transition( - graph_module, - op_node, - i, - arg_node_repr, - transitions_inserted, - ) - or transitions_inserted - ) - - def call(self, graph_module: torch.fx.GraphModule) -> PassResult: - for node in graph_module.graph.nodes: - self.set_op_node_tensor_reprs(graph_module, node) - - return PassResult(graph_module, True) diff --git a/backends/vulkan/cmake b/backends/vulkan/cmake new file mode 120000 index 00000000000..21498ceec01 --- /dev/null +++ b/backends/vulkan/cmake @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/cmake \ No newline at end of file diff --git a/backends/vulkan/cmake/ShaderLibrary.cmake b/backends/vulkan/cmake/ShaderLibrary.cmake deleted file mode 100644 index 1b6838c4dfd..00000000000 --- a/backends/vulkan/cmake/ShaderLibrary.cmake +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# ### Editing this file ### -# -# This file should be formatted with -# ~~~ -# cmake-format -i ATenVulkan.cmake -# ~~~ -# It should also be cmake-lint clean. -# -# The targets in this file will be built if EXECUTORCH_BUILD_VULKAN is ON - -if(NOT PYTHON_EXECUTABLE) - message( - "WARNING: PYTHON_EXECUTABLE is not set! A failure is likely imminent." - ) -endif() - -if(NOT EXECUTORCH_ROOT) - message("WARNING: EXECUTORCH_ROOT is not set! 
A failure is likely imminent.") -endif() - -if(ANDROID) - if(NOT ANDROID_NDK) - message(FATAL_ERROR "ANDROID_NDK not set") - endif() - - if(NOT GLSLC_PATH) - set(GLSLC_PATH - "${ANDROID_NDK}/shader-tools/${ANDROID_NDK_HOST_SYSTEM_NAME}/glslc" - ) - endif() -else() - find_program(GLSLC_PATH glslc PATHS $ENV{PATH}) - - if(NOT GLSLC_PATH) - message(FATAL_ERROR "USE_VULKAN glslc not found") - endif() -endif() - -# Required to enable linking with --whole-archive -include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) - -function(gen_vulkan_shader_lib_cpp shaders_path) - set(VULKAN_SHADERGEN_ENV "") - set(VULKAN_SHADERGEN_OUT_PATH ${CMAKE_BINARY_DIR}/vulkan_compute_shaders) - - set(GEN_SPV_ARGS "--optimize") - if(DEFINED ENV{ETVK_USING_SWIFTSHADER}) - if("$ENV{ETVK_USING_SWIFTSHADER}" STREQUAL "1" - OR "$ENV{ETVK_USING_SWIFTSHADER}" STREQUAL "True" - ) - list(APPEND GEN_SPV_ARGS "--replace-u16vecn") - endif() - endif() - - add_custom_command( - COMMENT "Generating Vulkan Compute Shaders" - OUTPUT ${VULKAN_SHADERGEN_OUT_PATH}/spv.cpp - COMMAND - "${PYTHON_EXECUTABLE}" - ${EXECUTORCH_ROOT}/backends/vulkan/runtime/gen_vulkan_spv.py --glsl-path - ${shaders_path} --output-path ${VULKAN_SHADERGEN_OUT_PATH} - --glslc-path=${GLSLC_PATH} - --tmp-dir-path=${VULKAN_SHADERGEN_OUT_PATH}/shader_cache/ --env - ${VULKAN_GEN_ARG_ENV} ${GEN_SPV_ARGS} - DEPENDS ${shaders_path}/* - ${EXECUTORCH_ROOT}/backends/vulkan/runtime/gen_vulkan_spv.py - ) - - set(generated_spv_cpp - ${VULKAN_SHADERGEN_OUT_PATH}/spv.cpp - PARENT_SCOPE - ) -endfunction() - -function(vulkan_shader_lib library_name generated_spv_cpp) - add_library(${library_name} STATIC ${generated_spv_cpp}) - target_include_directories( - ${library_name} - PRIVATE - ${EXECUTORCH_ROOT}/.. - ${EXECUTORCH_ROOT}/backends/vulkan/third-party/Vulkan-Headers/include - ${EXECUTORCH_ROOT}/backends/vulkan/third-party/volk - ) - target_link_libraries(${library_name} vulkan_backend) - target_compile_options(${library_name} PRIVATE ${VULKAN_CXX_FLAGS}) - # Link this library with --whole-archive due to dynamic shader registrations - executorch_target_link_options_shared_lib(${library_name}) -endfunction() - -# Convenience macro to generate a SPIR-V shader library target. Given the path -# to the shaders to compile and the name of the library, it will create a static -# library containing the generated SPIR-V shaders. The generated_spv_cpp -# variable can be used to reference the generated CPP file outside the macro. -macro(vulkan_shader_library shaders_path library_name) - set(VULKAN_SHADERGEN_ENV "") - set(VULKAN_SHADERGEN_OUT_PATH ${CMAKE_BINARY_DIR}/${library_name}) - - set(generated_spv_cpp ${VULKAN_SHADERGEN_OUT_PATH}/spv.cpp) - - add_library(${library_name} STATIC ${generated_spv_cpp}) - target_include_directories( - ${library_name} - PRIVATE - ${EXECUTORCH_ROOT}/.. 
- ${EXECUTORCH_ROOT}/backends/vulkan/third-party/Vulkan-Headers/include - ${EXECUTORCH_ROOT}/backends/vulkan/third-party/volk - ) - target_link_libraries(${library_name} vulkan_backend) - target_compile_options(${library_name} PRIVATE ${VULKAN_CXX_FLAGS}) - # Link this library with --whole-archive due to dynamic shader registrations - executorch_target_link_options_shared_lib(${library_name}) - - unset(VULKAN_SHADERGEN_ENV) - unset(VULKAN_SHADERGEN_OUT_PATH) -endmacro() diff --git a/backends/vulkan/custom_ops_lib.py b/backends/vulkan/custom_ops_lib.py deleted file mode 100644 index 4312971f5f1..00000000000 --- a/backends/vulkan/custom_ops_lib.py +++ /dev/null @@ -1,545 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Optional - -import executorch.backends.vulkan.patterns as vk_patterns -import torch.library - -namespace = "et_vk" -lib = torch.library.Library(namespace, "DEF") - -############# -## prepack ## -############# - - -def prepack_impl(x: torch.Tensor): - return x - - -name = "prepack" -lib.define(f"{name}(Tensor x) -> Tensor") -lib.impl(name, prepack_impl, "CompositeExplicitAutograd") -prepack_op = getattr(getattr(torch.ops, namespace), name) - -##################### -## conv_with_clamp ## -##################### - - -def conv_with_clamp_impl( - input, - weight, - bias=None, - stride=1, - padding=0, - dilation=1, - transposed=False, - output_padding=0, - groups=1, - output_min=-float("inf"), - output_max=float("inf"), -): - return torch.clamp( - torch.convolution( - input, - weight, - bias, - stride, - padding, - dilation, - transposed, - output_padding, - groups, - ), - output_min, - output_max, - ) - - -name = "conv_with_clamp" -lib.define( - f"{name}(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, Scalar? output_min, Scalar? output_max) -> Tensor" -) -lib.impl(name, conv_with_clamp_impl, "CompositeExplicitAutograd") -conv_with_clamp_op = getattr(getattr(torch.ops, namespace), name) - -######################### -## conv_with_clamp.out ## -######################### - - -def conv_with_clamp_out_impl( - input, - weight, - bias=None, - stride=1, - padding=0, - dilation=1, - transposed=False, - output_padding=0, - groups=1, - output_min=-float("inf"), - output_max=float("inf"), - out=None, -): - out = conv_with_clamp_impl( - input, - weight, - bias, - stride, - padding, - dilation, - transposed, - output_padding, - groups, - output_min, - output_max, - ) - return out - - -name = "conv_with_clamp.out" -lib.define( - f"{name}(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, Scalar? output_min, Scalar? output_max, *, Tensor(a!) 
out) -> Tensor(a!)" -) -lib.impl(name, conv_with_clamp_out_impl, "CompositeExplicitAutograd") - -################# -## grid_priors ## -################# - - -# The dimension of x should be larger than 1 -def grid_priors_impl( - x, - stride, - offset, -): - height, width = x.shape[-2:] - # Need to specify device of torch.arange to avoid executorch exporting error - shift_x = (torch.arange(0, width, device=x.device) + offset) * stride - shift_y = (torch.arange(0, height, device=x.device) + offset) * stride - # Need to specify indexing parameter ('ij' is the default value) to avoid executorch exporting error - shift_xx, shift_yy = torch.meshgrid([shift_y, shift_x], indexing="ij") - shift_xx = shift_xx.reshape(-1) - shift_yy = shift_yy.reshape(-1) - shifts = torch.stack((shift_yy, shift_xx), dim=-1) - return shifts - - -name = "grid_priors" -lib.define(f"{name}(Tensor self, int stride, float offset) -> Tensor") -lib.impl(name, grid_priors_impl, "CompositeExplicitAutograd") -grid_priors_op = getattr(getattr(torch.ops, namespace), name) - - -# When lowering to executorch, ops are converted from default to out variant. Hence, custom ops define both variants. -def grid_priors_out_impl( - x, - stride, - offset, - out, -): - out = grid_priors_impl(x, stride, offset) - return out - - -name = "grid_priors_out" -lib.define( - f"{name}(Tensor self, int stride, float offset, *, Tensor(a!) out) -> Tensor(a!)" -) -lib.impl(name, grid_priors_out_impl, "CompositeExplicitAutograd") - -######################## -## linear_weight_int4 ## -######################## - - -def linear_weight_int4_impl( - x: torch.Tensor, - weights_4x8: torch.Tensor, - groupsize: int, - scales_and_zeros: torch.Tensor, - inner_k_tiles: int, -): - original_x_size = x.size() - out_features = weights_4x8.size(0) - x = x.reshape(-1, original_x_size[-1]) - weight_int4pack = torch.ops.aten._convert_weight_to_int4pack( - weights_4x8, inner_k_tiles - ) - out = torch.ops.aten._weight_int4pack_mm( - x, weight_int4pack, groupsize, scales_and_zeros - ) - out_shape = original_x_size[:-1] + (out_features,) - return out.reshape(out_shape) - - -name = "linear_weight_int4" -lib.define( - f"{name}(Tensor self, Tensor mat2, int qGroupSize, Tensor qScaleAndZeros, int inner_k_tiles) -> Tensor" -) -lib.impl(name, linear_weight_int4_impl, "CompositeExplicitAutograd") -linear_weight_int4_op = getattr(getattr(torch.ops, namespace), name) - -################## -## linear_qcs4w ## -################## - - -def linear_qcs4w( - x: torch.Tensor, - weights_4x2: torch.Tensor, - scales: torch.Tensor, -): - original_x_shape = x.shape - x = x.reshape(-1, original_x_shape[-1]) - - unpacked_weights_shape = weights_4x2.shape - out_features = unpacked_weights_shape[0] - in_features = unpacked_weights_shape[1] - - weights_unpacked = torch.empty( - (out_features, in_features * 2), dtype=torch.int8, device=weights_4x2.device - ) - - weights_unpacked[:, ::2] = weights_4x2 >> 4 - weights_unpacked[:, 1::2] = weights_4x2 & 0x0F - - n_bit = 8 - quant_min = -(2 ** (n_bit - 1)) - quant_max = 2 ** (n_bit - 1) - 1 - dq_weights = torch.ops.quantized_decomposed.dequantize_per_channel( - weights_unpacked, - scales, - None, - 0, - quant_min, - quant_max, - torch.int8, - ) - - out = torch.nn.functional.linear(x, dq_weights) - out_shape = original_x_shape[:-1] + (out_features,) - return out.reshape(out_shape) - - -name = "linear_qcs4w" -lib.define(f"{name}(Tensor self, Tensor weight, Tensor scales) -> Tensor") -lib.impl(name, linear_qcs4w, "CompositeExplicitAutograd") -linear_qc4w_op = 
getattr(getattr(torch.ops, namespace), name) - -################## -## linear_q4gsw ## -################## - - -def unpack_4bit_weight_tensor( - packed_weight_tensor: torch.Tensor, x: torch.Tensor -) -> torch.Tensor: - """ - Reverses the packing performed in quantized_linear.pack_4bit_weight_tensor - """ - # Each packed byte contains two 4-bit values: high nibble and low nibble - K, N_half = packed_weight_tensor.shape - N = N_half * 2 - - # Unpack high and low nibbles - high_nibble = (packed_weight_tensor >> 4) & 0x0F - low_nibble = packed_weight_tensor & 0x0F - - # Stack to shape (K, N) - unpacked = torch.empty( - (K, N), dtype=torch.uint8, device=packed_weight_tensor.device - ) - unpacked[:, ::2] = low_nibble - unpacked[:, 1::2] = high_nibble - - # Undo the +8 offset and convert to signed 4-bit range [-8, 7] - unpacked = unpacked.to(torch.int8) - 8 - - in_channels = x.shape[-1] - # Undo any padding that may have been added to input channels - if in_channels != unpacked.shape[-1]: - return unpacked[:, :in_channels] - - return unpacked - - -def linear_q4gsw( - x: torch.Tensor, - weights: torch.Tensor, - weight_scales: torch.Tensor, - group_size: int, - bias: Optional[torch.Tensor] = None, -): - # Unpack the packed weights - weights = unpack_4bit_weight_tensor(weights, x) - - # Un-transpose the weight scales - weight_scales = weight_scales.transpose(0, 1) - weight_zeros = torch.zeros_like(weight_scales, dtype=torch.int32) - - weights = torch.ops.torchao.dequantize_affine( - weights, [1, group_size], weight_scales, weight_zeros, torch.int8, -8, 7 - ) - - out = torch.nn.functional.linear(x, weights) - return out - - -name = "linear_q4gsw" -lib.define( - f""" - {name}( - Tensor self, - Tensor weights, - Tensor weight_scales, - int group_size, - Tensor? bias = None) -> Tensor - """ -) -lib.impl(name, linear_q4gsw, "CompositeExplicitAutograd") -linear_qc4w_op = getattr(getattr(torch.ops, namespace), name) - -######################## -## linear_qta8a_qga4w ## -######################## - - -def linear_qta8a_qga4w( - x_quantized: torch.Tensor, - input_scale: torch.Tensor, - input_zero_point: torch.Tensor, - weights_4bit: torch.Tensor, - group_size: int, - weight_scales: torch.Tensor, - weight_zeros: torch.Tensor, -): - """ - Dynamic activation + grouped weight quantized linear (QTA8A_QGA4W). 
- - Args: - x_quantized: Already quantized input tensor (int8, per-token quantized) - input_scale: Scale for per-token quantization of input (shape: [batch_size]) - input_zero_point: Zero point for per-token quantization of input (shape: [batch_size]) - weights_4bit: Packed 4-bit quantized weights - group_size: Group size for weight quantization (int) - weight_scales: Per-group scales for weights - weight_zeros: Per-group zero points for weights - """ - original_x_shape = x_quantized.shape - feature_dim = original_x_shape[-1] - - # Reshape for processing - x_quantized_2d = x_quantized.reshape(-1, feature_dim) - - # Unpack 4-bit weights - unpacked_weights_shape = weights_4bit.shape - out_features = unpacked_weights_shape[0] - in_features = unpacked_weights_shape[1] - - weights_unpacked = torch.empty( - (out_features, in_features * 2), dtype=torch.int8, device=weights_4bit.device - ) - - weights_unpacked[:, ::2] = weights_4bit >> 4 - weights_unpacked[:, 1::2] = weights_4bit & 0x0F - - # Convert to signed 4-bit range [-8, 7] - weights_unpacked = torch.where( - weights_unpacked > 7, weights_unpacked - 16, weights_unpacked - ) - - # Dequantize weights using grouped quantization - actual_in_features = in_features * 2 - num_groups = actual_in_features // group_size - - # Reshape weights for grouped dequantization - weights_grouped = weights_unpacked.view(out_features, num_groups, group_size) - - # Expand scales and zeros to match grouped weights - scales_expanded = weight_scales.unsqueeze(-1).expand(-1, -1, group_size) - zeros_expanded = weight_zeros.unsqueeze(-1).expand(-1, -1, group_size) - - # Dequantize: (quantized - zero_point) * scale - dq_weights_grouped = (weights_grouped.float() - zeros_expanded) * scales_expanded - dq_weights = dq_weights_grouped.view(out_features, actual_in_features) - - # Dequantize input (per-token) - # For per-token quantization, each token (row) has its own scale and zero_point - x_dequantized = torch.ops.quantized_decomposed.dequantize_per_token( - x_quantized_2d, - input_scale, - input_zero_point, - -128, - 127, - torch.int8, - torch.float32, - ) - - # Perform linear operation - out = torch.nn.functional.linear(x_dequantized, dq_weights) - out_shape = original_x_shape[:-1] + (out_features,) - return out.reshape(out_shape) - - -name = "linear_qta8a_qga4w" -lib.define( - f"{name}(Tensor self, Tensor input_scale, Tensor input_zero_point, Tensor weight, int group_size, Tensor weight_scales, Tensor weight_zeros) -> Tensor" -) -lib.impl(name, linear_qta8a_qga4w, "CompositeExplicitAutograd") -linear_qta8a_qga4w_op = getattr(getattr(torch.ops, namespace), name) - -################# -## qaqw_linear ## -################# - - -def linear_q8ta_q8csw( - x: torch.Tensor, - input_scale: float, - input_zero_point: int, - weights: torch.Tensor, - weight_sums: torch.Tensor, - weight_scales: torch.Tensor, - bias: Optional[torch.Tensor] = None, -): - weight_zeros = torch.zeros_like(weight_scales, dtype=torch.int32) - weights = torch.ops.quantized_decomposed.dequantize_per_channel( - weights, - weight_scales, - weight_zeros, - 0, - -127, - 127, - torch.int8, - ) - - # Perform linear operation - out = torch.nn.functional.linear(x, weights) - if bias is not None: - out = out + bias - - return out - - -name = "linear_q8ta_q8csw" -lib.define( - f""" - {name}( - Tensor x, - float input_scale, - int input_zero_point, - Tensor weights, - Tensor weight_sums, - Tensor weight_scales, - Tensor? 
bias = None) -> Tensor - """ -) -lib.impl(name, linear_q8ta_q8csw, "CompositeExplicitAutograd") -qa_q8csw_linear = getattr(getattr(torch.ops, namespace), name) - -################## -## conv2d_q8ta_q8csw ## -################## - - -def conv2d_q8ta_q8csw( - x: torch.Tensor, - input_scale: float, - input_zero_point: int, - weights: torch.Tensor, - weight_sums: torch.Tensor, - weight_scales: torch.Tensor, - bias: Optional[torch.Tensor], - kernel_size: list, - stride: list, - padding: list, - dilation: list, - groups: int, -): - IC = x.shape[1] - K_h, K_w = kernel_size[0], kernel_size[1] - - canonical_weight_K_dim = K_h * K_w * IC - # Remove any padding added to output channels dim to align to a multiple of 4 - if weights.shape[-1] != canonical_weight_K_dim: - weights = weights[:, :canonical_weight_K_dim] - weight_scales = weight_scales[:canonical_weight_K_dim] - if bias is not None: - bias = bias[:canonical_weight_K_dim] - - weight_zeros = torch.zeros_like(weight_scales, dtype=torch.int32) - - # Calculate dimensions - OC = weights.shape[0] - in_features = weights.shape[1] - IC = in_features // (K_h * K_w) - - # Reshape to original 4D format (OC, IC, H, W) - weights = weights.view(OC, IC, K_h, K_w) - - # Dequantize weights - weights = torch.ops.quantized_decomposed.dequantize_per_channel( - weights, - weight_scales, - weight_zeros, - 0, # axis=0 for output channel quantization - -127, - 127, - torch.int8, - ) - - # Perform convolution - out = torch.nn.functional.conv2d( - x, weights, bias, stride, padding, dilation, groups - ) - - return out - - -name = "conv2d_q8ta_q8csw" -lib.define( - f""" - {name}( - Tensor x, - float input_scale, - int input_zero_point, - Tensor weights, - Tensor weight_sums, - Tensor weight_scales, - Tensor? bias, - SymInt[] kernel_size, - SymInt[] stride, - SymInt[] padding, - SymInt[] dilation, - SymInt groups) -> Tensor - """ -) -lib.impl(name, conv2d_q8ta_q8csw, "CompositeExplicitAutograd") -conv2d_q8ta_q8csw_op = getattr(getattr(torch.ops, namespace), name) - -###################### -## apply_rotary_emb ## -###################### - - -def apply_rotary_emb_impl( - xq: torch.Tensor, xk: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor -): - pattern = vk_patterns.RotaryEmbeddingPattern() - return pattern.forward(xq, xk, freqs_cos, freqs_sin) - - -name = "apply_rotary_emb" -lib.define( - f"{name}(Tensor xq, Tensor xk, Tensor freqs_cos, Tensor freqs_sin) -> (Tensor, Tensor)" -) -lib.impl(name, apply_rotary_emb_impl, "CompositeExplicitAutograd") -apply_rotary_emb_op = getattr(getattr(torch.ops, namespace), name) diff --git a/backends/vulkan/custom_ops_lib.py b/backends/vulkan/custom_ops_lib.py new file mode 120000 index 00000000000..d0c384c9d97 --- /dev/null +++ b/backends/vulkan/custom_ops_lib.py @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/custom_ops_lib.py \ No newline at end of file diff --git a/backends/vulkan/docs b/backends/vulkan/docs new file mode 120000 index 00000000000..e833ee9a5ac --- /dev/null +++ b/backends/vulkan/docs @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/docs \ No newline at end of file diff --git a/backends/vulkan/docs/android_demo.md b/backends/vulkan/docs/android_demo.md deleted file mode 100644 index ff84938b06f..00000000000 --- a/backends/vulkan/docs/android_demo.md +++ /dev/null @@ -1,128 +0,0 @@ -# Building and Running ExecuTorch with the Vulkan Backend - -The [ExecuTorch Vulkan Delegate](../../../docs/source/native-delegates-executorch-vulkan-delegate.md) -is a native GPU 
delegate for ExecuTorch. - - -::::{grid} 2 -:::{grid-item-card} What you will learn in this tutorial: -:class-card: card-content -* How to export the Llama3.2-1B parameter model with partial GPU delegation -* How to execute the partially delegated model on Android -::: -:::{grid-item-card} Prerequisites: -:class-card: card-prerequisites -* Follow [**Setting up ExecuTorch**](../../../docs/source/getting-started-setup.rst) -* It is also recommended that you read through [**ExecuTorch Vulkan Delegate**](../../../docs/source/native-delegates-executorch-vulkan-delegate.md) and follow the example in that page -::: -:::: - -## Prerequisites - -Note that all the steps below should be performed from the ExecuTorch repository -root directory, and assumes that you have gone through the steps of setting up -ExecuTorch. - -It is also assumed that the Android NDK and Android SDK is installed, and the -following environment examples are set. - -```shell -export ANDROID_NDK= -# Select an appropriate Android ABI for your device -export ANDROID_ABI=arm64-v8a -# All subsequent commands should be performed from ExecuTorch repo root -cd -# Make sure adb works -adb --version -``` - -## Lowering the Llama3.2-1B model to Vulkan - -::::{note} -The resultant model will only be partially delegated to the Vulkan backend. In -particular, only binary arithmetic operators (`aten.add`, `aten.sub`, -`aten.mul`, `aten.div`), matrix multiplication operators (`aten.mm`, `aten.bmm`), -and linear layers (`aten.linear`) will be executed on the GPU via the Vulkan -delegate. The rest of the model will be executed using Portable operators. - -Operator support for LLaMA models is currently in active development; please -check out the `main` branch of the ExecuTorch repo for the latest capabilities. -:::: - -First, obtain the `consolidated.00.pth`, `params.json` and `tokenizer.model` -files for the `Llama3.2-1B` model from the [Llama website](https://www.llama.com/llama-downloads/). - -Once the files have been downloaded, the `export_llama` script can be used to -partially lower the Llama model to Vulkan. - -```shell -# The files will usually be downloaded to ~/.llama -python -m examples.models.llama.export_llama \ - --disable_dynamic_shape --vulkan -kv --use_sdpa_with_kv_cache -d fp32 \ - --model "llama3_2" \ - -c ~/.llama/checkpoints/Llama3.2-1B/consolidated.00.pth \ - -p ~/.llama/checkpoints/Llama3.2-1B/params.json \ - --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' -``` - -A `vulkan_llama2.pte` file should have been created as a result of running the -script. - -Push the tokenizer binary and `vulkan_llama2.pte` onto your Android device: - -```shell -adb push ~/.llama/tokenizer.model /data/local/tmp/ -adb push vulkan_llama2.pte /data/local/tmp/ -``` - -## Build and Run the LLaMA runner binary on Android - -First, build and install ExecuTorch libraries, then build the LLaMA runner -binary using the Android NDK toolchain. - -```shell -./install_executorch.sh --clean -(mkdir cmake-android-out && \ - cmake . 
-DCMAKE_INSTALL_PREFIX=cmake-android-out \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=$ANDROID_ABI \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_VULKAN=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_LLM=ON \ - -DPYTHON_EXECUTABLE=python \ - -Bcmake-android-out && \ - cmake --build cmake-android-out -j16 --target install) - -# Build LLaMA Runner library -(rm -rf cmake-android-out/examples/models/llama && \ - cmake examples/models/llama \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=$ANDROID_ABI \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_LLM=ON \ - -DCMAKE_INSTALL_PREFIX=cmake-android-out \ - -DPYTHON_EXECUTABLE=python \ - -Bcmake-android-out/examples/models/llama && \ - cmake --build cmake-android-out/examples/models/llama -j16) -``` - -Finally, push and run the llama runner binary on your Android device. Note that -your device must have sufficient GPU memory to execute the model. - -```shell -adb push cmake-android-out/examples/models/llama/llama_main /data/local/tmp/llama_main - -adb shell /data/local/tmp/llama_main \ - --model_path=/data/local/tmp/vulkan_llama2.pte \ - --tokenizer_path=/data/local/tmp/tokenizer.model \ - --prompt "Hello" -``` - -Note that currently model inference will be very slow due to the high amount of -delegate blobs in the lowered graph, which requires a transfer to and from the -GPU for each sub graph. Performance is expected to improve drastically as more -of the model can be lowered to the Vulkan delegate, and techniques such as -quantization are supported. diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py deleted file mode 100644 index 1b74ef1ac65..00000000000 --- a/backends/vulkan/op_registry.py +++ /dev/null @@ -1,720 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-unsafe - -import operator - -from typing import Any, Callable, Dict, List, Optional, Union - -import executorch.backends.vulkan.custom_ops_lib # noqa - -import executorch.backends.vulkan.utils as utils - -import torch - -from executorch.backends.vulkan.serialization.vulkan_graph_schema import VkMemoryLayout - -from executorch.exir.dialects._ops import ops as exir_ops - -from executorch.exir.dialects.edge._ops import EdgeOpOverload -from torch._subclasses.fake_tensor import FakeTensor - -###################### -## OpFeatures class ## -###################### - - -def allow_node(node: torch.fx.Node) -> bool: - return True - - -class OpFeatures: - __slots__ = [ - # Sets of possible (storage types, memory layouts) to use for the input tensor(s) - "inputs_storage", - # Sets of possible (storage types, memory layouts) to use for the output tensor(s) - "outputs_storage", - # bool indicating if the operator has a resize function, which allows it to - # support models with dynamic shape - "supports_resize", - # bool indicating if the operator handles its own prepacking. If this is True, - # then the insert_prepack_nodes pass will not insert prepack nodes for the args - # of the op. 
- "supports_prepacking", - # Optional check function used during partitioning to determine if a node's - # inputs are supported by the operator implementation. - "are_node_inputs_supported_fn", - ] - - def __init__( - self, - inputs_storage: Optional[ - Union[utils.TensorRepSet, List[utils.TensorRepSet]] - ] = None, - outputs_storage: Optional[ - Union[utils.TensorRepSet, List[utils.TensorRepSet]] - ] = None, - supports_resize: bool = False, - supports_prepacking: bool = False, - are_node_inputs_supported_fn: Optional[Callable] = allow_node, - ): - self.inputs_storage: utils.TensorRepSetList = utils.TensorRepSetList( - inputs_storage if inputs_storage is not None else [] - ) - self.outputs_storage: utils.TensorRepSetList = utils.TensorRepSetList( - outputs_storage if outputs_storage is not None else [] - ) - - # If output storage is not set, assume that it is derived from the first input - if self.outputs_storage.any_is_empty(): - self.outputs_storage = utils.TensorRepSetList(self.inputs_storage[0]) - - self.supports_resize = supports_resize - self.supports_prepacking = supports_prepacking - - self.are_node_inputs_supported_fn = are_node_inputs_supported_fn - - def make_op_repsets( - self, - op_node: torch.fx.Node, - texture_limits: utils.ImageExtents = utils.DEFAULT_TEXTURE_LIMITS, - ) -> utils.OpRepSets: - return utils.OpRepSets( - self.inputs_storage, self.outputs_storage, op_node, texture_limits - ) - - -####################### -## Operator Registry ## -####################### - -OpKey = Union[str, torch._ops.OpOverload, EdgeOpOverload] - -vulkan_supported_ops: Dict[OpKey, OpFeatures] = {} - - -def update_features(aten_op): - def features_decorator(fn: Callable): - def update_features_impl(op: OpKey): - if op in vulkan_supported_ops: - raise RuntimeError(f"[Vulkan delegate] duplicate registration of {op}!") - vulkan_supported_ops[op] = fn() - - if isinstance(aten_op, list): - for op in aten_op: - update_features_impl(op) - else: - update_features_impl(aten_op) - - return fn - - return features_decorator - - -@update_features( - [ - operator.getitem, - # Symbolic integer ops - torch.ops.aten.sym_size.int, - operator.add, - operator.lt, - operator.gt, - operator.ge, - operator.le, - operator.eq, - # Guard and assert ops - torch.ops.aten._assert_scalar.default, - torch.ops.aten.sym_constrain_range_for_size.default, - ] -) -def register_ephemeral_op(): - return OpFeatures( - inputs_storage=utils.ANY_STORAGE, - supports_resize=True, - ) - - -@update_features( - [ - exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, - exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor, - exir_ops.edge.quantized_decomposed.quantize_per_channel.default, - exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, - exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor, - exir_ops.edge.quantized_decomposed.dequantize_per_channel.default, - exir_ops.edge.quantized_decomposed.quantize_per_token.default, - exir_ops.edge.quantized_decomposed.dequantize_per_token.default, - ] -) -def register_quantization_op(): - return OpFeatures( - inputs_storage=utils.CONTIGUOUS_BUFFER, - supports_resize=True, - ) - - -@update_features( - [ - exir_ops.edge.torchao.quantize_affine.default, - exir_ops.edge.torchao.dequantize_affine.default, - ] -) -def register_affine_quantization_op(): - return OpFeatures( - inputs_storage=utils.CONTIGUOUS_BUFFER, - supports_resize=True, - ) - - -@update_features( - [ - exir_ops.edge.quantized_decomposed.choose_qparams.tensor, - 
exir_ops.edge.quantized_decomposed.choose_qparams_per_token_asymmetric.default, - ] -) -def register_torchao_quantization_op(): - return OpFeatures( - inputs_storage=utils.CONTIGUOUS_BUFFER, - supports_resize=True, - ) - - -@update_features( - exir_ops.edge.torchao.choose_qparams_affine.default, -) -def register_torchao_choose_qparams_affine(): - return OpFeatures( - inputs_storage=utils.CONTIGUOUS_ANY, - outputs_storage=[ - utils.CONTIGUOUS_BUFFER, # scales - utils.CONTIGUOUS_BUFFER, # zero_points - ], - supports_resize=True, - ) - - -@update_features( - [ - exir_ops.edge.aten.add.Tensor, - exir_ops.edge.aten.sub.Tensor, - exir_ops.edge.aten.minimum.default, - exir_ops.edge.aten.mul.Tensor, - exir_ops.edge.aten.div.Tensor, - exir_ops.edge.aten.div.Tensor_mode, - exir_ops.edge.aten.pow.Tensor_Tensor, - exir_ops.edge.aten.eq.Tensor, - exir_ops.edge.aten.lt.Tensor, - exir_ops.edge.aten.le.Tensor, - exir_ops.edge.aten.gt.Tensor, - exir_ops.edge.aten.ge.Tensor, - ] -) -def register_binary_op(): - return OpFeatures( - inputs_storage=utils.ANY_STORAGE, - supports_resize=True, - ) - - -@update_features( - [ - exir_ops.edge.aten.abs.default, - exir_ops.edge.aten.clamp.default, - exir_ops.edge.aten.cos.default, - exir_ops.edge.aten.exp.default, - exir_ops.edge.aten.gelu.default, - exir_ops.edge.aten.hardshrink.default, - exir_ops.edge.aten.hardtanh.default, - exir_ops.edge.aten.neg.default, - exir_ops.edge.aten.relu.default, - exir_ops.edge.aten.sigmoid.default, - exir_ops.edge.aten.sin.default, - exir_ops.edge.aten.sqrt.default, - exir_ops.edge.aten.rsqrt.default, - exir_ops.edge.aten.tanh.default, - exir_ops.edge.aten.round.default, - exir_ops.edge.aten.leaky_relu.default, - ] -) -def register_unary_op(): - return OpFeatures( - inputs_storage=utils.ANY_STORAGE, - supports_resize=True, - ) - - -@update_features(exir_ops.edge.aten._to_copy.default) -def register_to_copy_op(): - def check_to_copy_node(node: torch.fx.Node) -> bool: - float_dtypes = [torch.float16, torch.float32] - - if len(node.args) != 1: - return False - - in_arg = node.args[0] - if not isinstance(in_arg, torch.fx.Node): - return False - - in_tensor = in_arg.meta.get("val", None) - out_tensor = node.meta.get("val", None) - - if isinstance(in_tensor, FakeTensor) and isinstance(out_tensor, FakeTensor): - if out_tensor.dtype in float_dtypes and in_tensor.dtype in float_dtypes: - return True - - return False - - return OpFeatures( - inputs_storage=utils.ANY_STORAGE, - supports_resize=True, - are_node_inputs_supported_fn=check_to_copy_node, - ) - - -@update_features(exir_ops.edge.dim_order_ops._to_dim_order_copy.default) -def register_to_copy_dim_order_op(): - # Currently there is no "real" implementation for to_dim_order_copy, but it can be - # removed as long as the operator is not changing the dtype, i.e. the operator call - # is modifying the dim order only. Therefore, check that the input and output dtypes - # are the same, if so the operator is safe to remove. 
- def check_dim_order_copy_node(node: torch.fx.Node) -> bool: - in_arg = node.args[0] - if not isinstance(in_arg, torch.fx.Node): - return False - - in_tensor = in_arg.meta.get("val", None) - out_tensor = node.meta.get("val", None) - - if in_tensor.dtype != out_tensor.dtype: - return False - - return True - - return OpFeatures( - inputs_storage=utils.ANY_STORAGE, - supports_resize=True, - are_node_inputs_supported_fn=check_dim_order_copy_node, - ) - - -@update_features( - [ - exir_ops.edge.aten.bmm.default, - exir_ops.edge.aten.mm.default, - exir_ops.edge.aten.addmm.default, - exir_ops.edge.aten.linear.default, - ] -) -def register_mm_op(): - return OpFeatures( - inputs_storage=utils.CONTIGUOUS_ANY, - supports_resize=True, - supports_prepacking=True, - ) - - -@update_features( - [ - exir_ops.edge.aten._weight_int8pack_mm.default, - exir_ops.edge.et_vk.linear_qcs4w.default, - ] -) -def register_int8_mm_op(): - return OpFeatures( - inputs_storage=utils.CONTIGUOUS_ANY, - supports_resize=True, - supports_prepacking=True, - ) - - -@update_features( - [ - exir_ops.edge.et_vk.linear_q8ta_q8csw.default, - exir_ops.edge.et_vk.linear_q4gsw.default, - ] -) -def register_quantized_linear_ops(): - return OpFeatures( - inputs_storage=utils.CONTIGUOUS_ANY, - supports_prepacking=True, - supports_resize=False, - ) - - -@update_features( - [ - exir_ops.edge.et_vk.linear_weight_int4.default, - ] -) -def register_int4_mm_op(): - return OpFeatures( - inputs_storage=utils.CONTIGUOUS_ANY, - supports_resize=True, - supports_prepacking=True, - ) - - -@update_features( - [ - exir_ops.edge.et_vk.linear_qta8a_qga4w.default, - ] -) -def register_dqlinear_op(): - return OpFeatures( - inputs_storage=[ - utils.CONTIGUOUS_ANY, # input - utils.CONTIGUOUS_BUFFER, # mat1 scales - utils.CONTIGUOUS_BUFFER, # mat1 zeros - utils.NO_STORAGE, # weight (prepacked) - utils.NO_STORAGE, # group size (non tensor) - utils.CONTIGUOUS_BUFFER, # mat2 scales - utils.CONTIGUOUS_BUFFER, # mat2 zeros - ], - supports_resize=True, - supports_prepacking=True, - ) - - -@update_features( - [ - exir_ops.edge.aten._log_softmax.default, - exir_ops.edge.aten._softmax.default, - ] -) -def register_softmax_op(): - return OpFeatures( - inputs_storage=utils.ANY_TEXTURE, - supports_resize=True, - ) - - -@update_features( - [ - exir_ops.edge.aten.mean.dim, - exir_ops.edge.aten.sum.dim_IntList, - exir_ops.edge.aten.amax.default, - exir_ops.edge.aten.amin.default, - ] -) -def register_reduce_op(): - def check_reduce_node(node: torch.fx.Node) -> bool: - dim_list = node.args[1] - if isinstance(dim_list, list) and len(dim_list) > 2: - return False - - if isinstance(dim_list, list) and len(dim_list) == 2: - # Try to get the memory layout for this node - try: - memory_layout = utils.get_node_memory_layout(node) - - # If we have memory layout information, check if any dimension in dim_list corresponds to a packed dimension - if ( - memory_layout is not None - and memory_layout != VkMemoryLayout.DEFAULT_LAYOUT - ): - # For now only default layout is supported for 2D reduction. - # Because we can't determine if the input is NCHW or NHWC here, - # assume the reduction dimension is packed so we cannot support it. 
- return False - except (AssertionError, KeyError, AttributeError): - # If we can't get memory layout information, we'll assume the dims aren't packed - pass - - def try_find_keepdim_arg(node: torch.fx.Node) -> bool: - for arg in node.args: - if isinstance(arg, bool): - return arg - - # Assume false by default - return False - - keepdim = try_find_keepdim_arg(node) - if isinstance(keepdim, bool) and not keepdim: - return False - - return True - - return OpFeatures( - inputs_storage=utils.ANY_TEXTURE, - supports_resize=True, - are_node_inputs_supported_fn=check_reduce_node, - ) - - -@update_features( - [ - exir_ops.edge.aten.avg_pool2d.default, - exir_ops.edge.aten.max_pool2d.default, - exir_ops.edge.aten.max_pool2d_with_indices.default, - ] -) -def register_2d_pool_op(): - return OpFeatures( - inputs_storage=utils.CHANNELS_PACKED_TEXTURE, - supports_resize=True, - ) - - -@update_features( - [ - exir_ops.edge.aten.convolution.default, - exir_ops.edge.et_vk.conv_with_clamp.default, - ] -) -def register_convolution_op(): - return OpFeatures( - inputs_storage=[ - utils.CHANNELS_PACKED_TEXTURE, # input - utils.NO_STORAGE, # weight (prepacked) - utils.NO_STORAGE, # bias (prepacked) - utils.NO_STORAGE, # stride (non tensor) - utils.NO_STORAGE, # padding (non tensor) - utils.NO_STORAGE, # dilation (non tensor) - utils.NO_STORAGE, # transposed (non tensor) - utils.NO_STORAGE, # output_padding (non tensor) - utils.NO_STORAGE, # groups (non tensor) - utils.NO_STORAGE, # output_min (non tensor) - utils.NO_STORAGE, # output_max (non tensor) - ], - supports_resize=True, - supports_prepacking=True, - ) - - -@update_features( - [ - exir_ops.edge.et_vk.conv2d_q8ta_q8csw.default, - ] -) -def register_quantized_conv_op(): - return OpFeatures( - inputs_storage=[ - utils.CHANNELS_PACKED_TEXTURE, # input - utils.NO_STORAGE, # input_scale (non tensor) - utils.NO_STORAGE, # input_zero_point (non tensor) - utils.NO_STORAGE, # weight (prepacked) - utils.NO_STORAGE, # weight_sums (prepacked) - utils.NO_STORAGE, # weight_scales (prepacked) - utils.NO_STORAGE, # bias (prepacked) - utils.NO_STORAGE, # kernel_size (non tensor) - utils.NO_STORAGE, # stride (non tensor) - utils.NO_STORAGE, # padding (non tensor) - utils.NO_STORAGE, # dilation (non tensor) - utils.NO_STORAGE, # groups (non tensor) - utils.NO_STORAGE, # original OC count (non tensor) - ], - supports_resize=False, - supports_prepacking=True, - ) - - -@update_features("llama::sdpa_with_kv_cache") -def register_sdpa_with_kv_cache_op(): - return OpFeatures( - inputs_storage=utils.WIDTH_PACKED_TEXTURE, - supports_resize=True, - supports_prepacking=True, - ) - - -@update_features( - [ - "llama::update_cache", - "llama::custom_sdpa", - ] -) -def register_sdpa_ops(): - return OpFeatures( - inputs_storage=utils.WIDTH_PACKED_TEXTURE, - supports_resize=True, - ) - - -@update_features(exir_ops.edge.et_vk.apply_rotary_emb.default) -def register_rotary_emb_op(): - return OpFeatures( - inputs_storage=utils.WIDTH_PACKED_TEXTURE, - supports_resize=True, - ) - - -@update_features( - [ - exir_ops.edge.aten.permute.default, - ] -) -def register_view_ops(): - return OpFeatures( - inputs_storage=utils.ANY_TEXTURE, - supports_resize=True, - ) - - -@update_features( - [ - exir_ops.edge.aten.view_copy.default, - exir_ops.edge.aten.squeeze_copy.dims, - exir_ops.edge.aten.unsqueeze_copy.default, - exir_ops.edge.aten.clone.default, - exir_ops.edge.aten.permute_copy.default, - ] -) -def register_view_ops_with_buffer_meta(): - return OpFeatures( - inputs_storage=utils.ANY_STORAGE, - 
supports_resize=True, - ) - - -@update_features(exir_ops.edge.aten.expand_copy.default) -def register_expand(): - return OpFeatures(inputs_storage=utils.ANY_BUFFER, supports_resize=False) - - -# Fully featured transfer operators (i.e. operators that copy data from the input -# tensor(s) to the output tensor(s)), which have memory layout agnostic implementations -# for both texture and buffer storage types. -@update_features(exir_ops.edge.aten.cat.default) -def register_cat_op(): - return OpFeatures( - inputs_storage=utils.ANY_STORAGE, - supports_resize=True, - ) - - -# Fully featured transfer operators (i.e. operators that copy data from the input -# tensor(s) to the output tensor(s)), which have memory layout agnostic implementations -# for both texture and buffer storage types. -@update_features( - [ - exir_ops.edge.aten.select_copy.int, - exir_ops.edge.aten.slice_copy.Tensor, - ] -) -def register_transfer_ops(): - return OpFeatures( - inputs_storage=utils.ANY_STORAGE, - supports_resize=True, - ) - - -# Ops ported from PyTorch Vulkan backend. These ops commonly support channels -# packed tensors only and do not have a resize function. -@update_features( - [ - # Shape Manipulation - exir_ops.edge.aten.t_copy.default, - # Indexing and lookup - exir_ops.edge.aten.flip.default, - exir_ops.edge.aten.index_select.default, - # Tensor creation - exir_ops.edge.aten.arange.start_step, - exir_ops.edge.aten.constant_pad_nd.default, - exir_ops.edge.aten.full.default, - exir_ops.edge.aten.full_like.default, - exir_ops.edge.aten.ones.default, - exir_ops.edge.aten.ones_like.default, - exir_ops.edge.aten.scalar_tensor.default, - exir_ops.edge.aten.upsample_nearest2d.vec, - exir_ops.edge.aten.upsample_bilinear2d.vec, - exir_ops.edge.aten.zeros.default, - exir_ops.edge.aten.zeros_like.default, - exir_ops.edge.et_vk.grid_priors.default, - ] -) -def register_ported_op(): - return OpFeatures( - inputs_storage=utils.CHANNELS_PACKED_TEXTURE, - ) - - -# Ops ported from PyTorch Vulkan backend. These ops are in a separate registry because they support all packed dimensions -@update_features( - [ - # Tensor combination - exir_ops.edge.aten.repeat.default, - exir_ops.edge.aten.split_with_sizes_copy.default, - exir_ops.edge.aten.split.Tensor, - ] -) -def register_ported_op_all_packed_dims(): - return OpFeatures( - inputs_storage=utils.ANY_TEXTURE, - ) - - -# Ported ops that support their own prepacking. -@update_features( - [ - exir_ops.edge.aten.embedding.default, - exir_ops.edge.aten._native_batch_norm_legit_no_training.default, - ] -) -def register_ported_ops_with_prepacking(): - return OpFeatures( - inputs_storage=utils.CHANNELS_PACKED_TEXTURE, - supports_prepacking=True, - ) - - -@update_features( - [ - exir_ops.edge.aten.native_group_norm.default, - ] -) -def register_native_group_norm(): - return OpFeatures( - inputs_storage=utils.CHANNELS_PACKED_TEXTURE, - outputs_storage=[ - utils.CHANNELS_PACKED_TEXTURE, - utils.CONTIGUOUS_BUFFER, - utils.CONTIGUOUS_BUFFER, - ], - supports_prepacking=True, - ) - - -# Ported ops that support their own prepacking. 
-@update_features( - [ - exir_ops.edge.aten.native_layer_norm.default, - ] -) -def register_ported_ops_with_prepacking_all_dims(): - return OpFeatures( - inputs_storage=utils.ANY_TEXTURE, - supports_prepacking=True, - ) - - -####################### -## Utility functions ## -####################### - - -def has_impl(target: Any) -> bool: - if not isinstance(target, str): - if target not in vulkan_supported_ops: - return target.name() in vulkan_supported_ops - return target in vulkan_supported_ops - else: - return target in vulkan_supported_ops - - -def get_op_features(target: Any) -> OpFeatures: - if not isinstance(target, str): - if target not in vulkan_supported_ops: - # Try the op's name - return vulkan_supported_ops[target.name()] - - return vulkan_supported_ops[target] - else: - return vulkan_supported_ops[target] - - -def handles_own_prepacking(target: OpKey) -> bool: - return get_op_features(target).supports_prepacking diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py new file mode 120000 index 00000000000..f34d32d3a0b --- /dev/null +++ b/backends/vulkan/op_registry.py @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/op_registry.py \ No newline at end of file diff --git a/backends/vulkan/partitioner b/backends/vulkan/partitioner new file mode 120000 index 00000000000..a4f40e523fa --- /dev/null +++ b/backends/vulkan/partitioner @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/partitioner \ No newline at end of file diff --git a/backends/vulkan/partitioner/TARGETS b/backends/vulkan/partitioner/TARGETS deleted file mode 100644 index 986d872f730..00000000000 --- a/backends/vulkan/partitioner/TARGETS +++ /dev/null @@ -1,26 +0,0 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -oncall("executorch") - -runtime.python_library( - name = "vulkan_partitioner", - srcs = [ - "vulkan_partitioner.py", - ], - visibility = [ - "//executorch/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//executorch/backends/vulkan:op_registry", - "//executorch/backends/vulkan:utils_lib", - "//executorch/backends/vulkan:vulkan_preprocess", - "//executorch/backends/vulkan/patterns:vulkan_patterns", - "//executorch/exir:delegate", - "//executorch/exir:lib", - "//executorch/exir/backend:partitioner", - "//executorch/exir/backend:utils", - "//executorch/exir/backend/canonical_partitioners:canonical_partitioner_lib", - ], - typing = True, -) diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py deleted file mode 100644 index e5b2d0f7864..00000000000 --- a/backends/vulkan/partitioner/vulkan_partitioner.py +++ /dev/null @@ -1,425 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-strict - -import logging -from typing import Any, Callable, Dict, final, List, Mapping, Optional, Set, Tuple - -import executorch.backends.vulkan.patterns as vk_patterns -import executorch.backends.vulkan.utils as utils - -import torch - -from executorch.backends.vulkan.op_registry import ( - get_op_features, - has_impl, - OpFeatures, - OpKey, - vulkan_supported_ops, -) - -from executorch.backends.vulkan.patterns import PatternMatch - -from executorch.backends.vulkan.serialization.vulkan_graph_schema import ( - VkMemoryLayout, - VkStorageType, -) -from executorch.backends.vulkan.vulkan_preprocess import VulkanBackend - -from executorch.exir.backend.compile_spec_schema import CompileSpec -from executorch.exir.backend.partitioner import ( - DelegationSpec, - Partitioner, - PartitionResult, -) -from executorch.exir.backend.utils import tag_constant_data -from executorch.exir.dialects._ops import ops as exir_ops - -from torch.export.exported_program import ExportedProgram - -from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner -from torch.fx.passes.operator_support import OperatorSupportBase - -# pyre-ignore -ops_not_to_decompose = [ - torch.ops.aten.upsample_nearest2d.vec, -] - -logger: logging.Logger = logging.getLogger("") -logger.setLevel(logging.INFO) - - -class VulkanSupportedOperators(OperatorSupportBase): - def __init__( - self, - texture_limits: utils.ImageExtents, - buffer_limit: int, - require_dynamic_shape: bool = False, - operator_blocklist: Optional[Set[OpKey]] = None, - operator_allowlist: Optional[Set[OpKey]] = None, - fusable_subgraphs: Optional[List[PatternMatch]] = None, - nn_module_blocklist: Optional[Set[str]] = None, - nn_module_allowlist: Optional[Set[str]] = None, - ) -> None: - super().__init__() - self.texture_limits: utils.ImageExtents = texture_limits - self.buffer_limit = buffer_limit - self.require_dynamic_shapes = require_dynamic_shape - self.operator_blocklist: Set[OpKey] = ( - operator_blocklist if operator_blocklist is not None else set() - ) - self.operator_allowlist = operator_allowlist - self.fusable_subgraphs: List[PatternMatch] = ( - fusable_subgraphs if fusable_subgraphs is not None else [] - ) - # Create a set of all nodes that are part of fusable subgraphs for quick lookup - self.fusable_nodes: Set[torch.fx.Node] = set() - for match in self.fusable_subgraphs: - self.fusable_nodes.update(match.all_nodes) - - self.nn_module_blocklist = nn_module_blocklist - self.nn_module_allowlist = nn_module_allowlist - - def op_node_is_compatible( # noqa: C901: Function is too complex - self, node: torch.fx.Node, features: Optional[OpFeatures] = None - ) -> Tuple[bool, str]: - """ - Check if a given node is compatible with the Vulkan delegate's implementation - of the operator called by the node. Each tensor argument participating in the - operator call must be able to be represented with a (storage type, memory layout) - combination that is supported by the operator implementation. 
- """ - target = node.target - # Account for custom operators - if node.target == torch.ops.higher_order.auto_functionalized: - first_arg = node.args[0] - assert isinstance(first_arg, torch._ops.OpOverload) - target = first_arg.name() - - # Operator allow list is only used for torch ops - if ( - utils.is_torch_op_node(node) - and (self.operator_allowlist is not None) - and (target not in self.operator_allowlist) - ): - return False, "op is not in allowlist" - - if target in self.operator_blocklist: - return False, "op is in blocklist" - - # Extract the features for the node's operator, if no override was provided - if features is None: - if not has_impl(target): - return False, "no operator implementation" - features = get_op_features(target) - - # Get the possible tensor representations for each tensor participating in the - # this operator. Then check that all tensors are representable as either a - # buffer or texture. - op_repsets: utils.OpRepSets = features.make_op_repsets( - node, self.texture_limits - ) - - if op_repsets.any_is_empty(): - return ( - False, - f"no valid representations for op {utils.node_io_str(node)}", - ) - - return True, "Op is compatible" - - def node_is_compatible( - self, node: torch.fx.Node, features: Optional[OpFeatures] = None - ) -> Tuple[bool, str]: - if utils.is_tensor_node(node): - return self.op_node_is_compatible(node, features=features) - # For non-tensor nodes, just check if the op is registered - elif hasattr(node, "target"): - return node.target in vulkan_supported_ops, "Op is compatible" - - return False, f"Unsupported node type: {node.format_node()}" - - def is_linear_permute(self, node: torch.fx.Node) -> Tuple[bool, bool]: - """ - Detect if a node is a permute/transpose that precedes a call to a `mm` or - `addmm` operator. This node can be fused with the `mm` or `addmm` to produce a - `linear` operator. - - This function returns two bool values: - 1. The first indicates if this node can be fused into a linear node - 2. The second indicates if the overall linear op can be executed with Vulkan - - The node will be partitioned only if both are true. - """ - if node.target not in [ - exir_ops.edge.aten.t_copy.default, - exir_ops.edge.aten.permute_copy.default, - ]: - return False, False - - if len(node.users) != 1: - return False, False - - first_user = list(node.users.keys())[0] - if first_user.target in [ - exir_ops.edge.aten.mm.default, - exir_ops.edge.aten.addmm.default, - ]: - # Only mark this node if the target linear op is valid - if self.node_is_compatible(first_user)[0]: - return True, True - else: - return True, False - - return False, False - - def is_in_local_scalar_dense_chain(self, node: torch.fx.Node) -> Tuple[bool, bool]: - """ - Scalar tensors are usually converted to scalar values in the graph via` - scalar_tensor[0].item()` in Python, which translates to a chain of - `local_scalar_dense(torch.select.int(scalar_tensor, 0, 0))` in the graph. - This function marks the entire chain as supported by the Vulkan delegate. - - Later, within vulkan_preprocess there will be a graph transform which replaces - the chain with passing in the scalar tensor directly. - - Similar to the `is_linear_permute` function, this function has 2 return values. 
- """ - if node.target == exir_ops.edge.aten.select_copy.int: - if len(node.users) != 1: - return False, False - # pyre-ignore - if node.args[0].meta["val"].numel() != 1: - return False, False - - local_scalar_dense = list(node.users.keys())[0] - if local_scalar_dense.target != torch.ops.aten._local_scalar_dense.default: - return False, False - - return self.is_in_local_scalar_dense_chain(local_scalar_dense) - - if node.target == torch.ops.aten._local_scalar_dense.default: - return True, all(self.node_is_compatible(user)[0] for user in node.users) - - return False, False - - def log_skip(self, node: torch.fx.Node, reason: str) -> None: - if node.op == "call_function": - logger.info( - f"[Vulkan Partitioner] Due to [{reason}], skipping {utils.node_io_str(node)}" - ) - - def is_node_supported( - self, submodules: Mapping[str, torch.nn.Module], node: torch.fx.Node - ) -> bool: - r = self._is_node_supported(node) - return r - - def _is_node_supported(self, node: torch.fx.Node) -> bool: # noqa: C901 - if node.op == "call_function": - # Apply nn module allowlist and blocklist - if self.nn_module_allowlist is not None: - if not utils.node_comes_from_any_nn_module_in_set( - node, self.nn_module_allowlist - ): - self.log_skip(node, "source nn.Module is not in allowlist") - return False - - if self.nn_module_blocklist is not None: - if utils.node_comes_from_any_nn_module_in_set( - node, self.nn_module_blocklist - ): - self.log_skip(node, "source nn.Module is in blocklist") - return False - - # Check if this node is part of a fusable subgraph - if node in self.fusable_nodes: - return True - - target = node.target - if node.target == torch.ops.higher_order.auto_functionalized: - first_arg = node.args[0] - assert isinstance(first_arg, torch._ops.OpOverload) - target = first_arg.name() - - is_linear_permute, target_linear_is_compatible = self.is_linear_permute(node) - if is_linear_permute and target_linear_is_compatible: - return True - elif is_linear_permute: - # Skip so that the permute can be fused into a linear by another backend - self.log_skip(node, "permute node of non compatible linear node") - return False - - is_in_local_scalar_dense_chain, dst_node_is_compatible = ( - self.is_in_local_scalar_dense_chain(node) - ) - if is_in_local_scalar_dense_chain and dst_node_is_compatible: - return True - elif is_in_local_scalar_dense_chain: - self.log_skip(node, "local scalar dense of incompatible op node") - return False - - features = None - if target not in vulkan_supported_ops: - # For some ops, i.e. custom ops the name is registered instead of the - # OpOverload object. 
- if hasattr(target, "name") and target.name() in vulkan_supported_ops: - features = vulkan_supported_ops[target.name()] - else: - self.log_skip(node, "no operator implementation") - return False - else: - features = vulkan_supported_ops[target] - - assert features is not None - - if not features.are_node_inputs_supported_fn(node): - self.log_skip(node, "op args not supported") - return False - - if self.require_dynamic_shapes and not features.supports_resize: - self.log_skip(node, "no dynamic shape support") - return False - - is_compatible, reason = self.node_is_compatible(node, features=features) - if not is_compatible: - self.log_skip(node, reason) - - return is_compatible - - -def parse_compile_options(compile_options: Dict[str, Any]) -> List[CompileSpec]: - compile_specs = [] - - for key, value in compile_options.items(): - if isinstance(value, (VkStorageType, VkMemoryLayout)): - value_bytes = int(value).to_bytes(4, byteorder="little") - compile_specs.append(CompileSpec(key, value_bytes)) - - if isinstance(value, bool): - value_bytes = value.to_bytes(1, byteorder="little") - compile_specs.append(CompileSpec(key, value_bytes)) - - if key == "texture_limits": - compile_specs.append( - CompileSpec( - "texture_limits_x", int(value[0]).to_bytes(4, byteorder="little") - ) - ) - compile_specs.append( - CompileSpec( - "texture_limits_y", int(value[1]).to_bytes(4, byteorder="little") - ) - ) - compile_specs.append( - CompileSpec( - "texture_limits_z", int(value[2]).to_bytes(4, byteorder="little") - ) - ) - - # Unhandled options are ignored - - return compile_specs - - -@final -class VulkanPartitioner(Partitioner): - def __init__( - self, - compile_options: Optional[Dict[str, Any]] = None, - operator_blocklist: Optional[List[OpKey]] = None, - operator_allowlist: Optional[List[OpKey]] = None, - nn_module_blocklist: Optional[List[str]] = None, - nn_module_allowlist: Optional[List[str]] = None, - ) -> None: - self.options: Dict[str, Any] = {} - if compile_options is not None: - self.options = compile_options - - compile_spec = parse_compile_options(self.options) - self.delegation_spec = DelegationSpec(VulkanBackend.__name__, compile_spec) - - self.operator_blocklist: Set[OpKey] = set() - if operator_blocklist is not None: - for entry in operator_blocklist or []: - self.operator_blocklist.add(entry) - - self.operator_allowlist: Optional[Set[OpKey]] = None - if operator_allowlist is not None: - self.operator_allowlist = set() - for entry in operator_allowlist: - assert self.operator_allowlist is not None - self.operator_allowlist.add(entry) - - self.nn_module_blocklist: Optional[Set[str]] = None - if nn_module_blocklist is not None: - self.nn_module_blocklist = set() - for entry in nn_module_blocklist or []: - assert self.nn_module_blocklist is not None - self.nn_module_blocklist.add(entry) - - self.nn_module_allowlist: Optional[Set[str]] = None - if nn_module_allowlist is not None: - self.nn_module_allowlist = set() - for entry in nn_module_allowlist: - assert self.nn_module_allowlist is not None - self.nn_module_allowlist.add(entry) - - def ops_to_not_decompose( - self, ep: ExportedProgram - ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]: - def filter_fn(node: torch.fx.Node) -> bool: - return True - - return (ops_not_to_decompose, filter_fn) - - def partition(self, exported_program: ExportedProgram) -> PartitionResult: - # Run the CapabilityBasedPartitioner to return the largest possible - # subgraphs containing the nodes with the tags - partition_tags = {} - - 
# Get all fusable subgraphs from fuse_patterns - fusable_subgraphs = vk_patterns.get_all_fusable_subgraphs( - exported_program.graph_module - ) - - texture_limits: utils.ImageExtents = self.options.get( - "texture_limits", utils.DEFAULT_TEXTURE_LIMITS - ) - buffer_limit: int = self.options.get("buffer_limit", utils.DEFAULT_BUFFER_LIMIT) - capability_partitioner = CapabilityBasedPartitioner( - exported_program.graph_module, - VulkanSupportedOperators( - texture_limits, - buffer_limit, - require_dynamic_shape=self.options.get("require_dynamic_shapes", False), - operator_blocklist=self.operator_blocklist, - operator_allowlist=self.operator_allowlist, - fusable_subgraphs=fusable_subgraphs, - nn_module_blocklist=self.nn_module_blocklist, - nn_module_allowlist=self.nn_module_allowlist, - ), - allows_single_node_partition=True, - ) - partition_list = capability_partitioner.propose_partitions() - for partition in partition_list: - for node in partition.nodes: - tag = f"tag{partition.id}" - node.meta["delegation_tag"] = tag - partition_tags[tag] = self.delegation_spec - - pl = len(partition_list) - if pl == 0: - logger.warning("No Vulkan subgraphs can be partitioned!") - else: - logger.info(f"Found {pl} Vulkan subgraphs to be partitioned.") - - tag_constant_data(exported_program) - - return PartitionResult( - tagged_exported_program=exported_program, partition_tags=partition_tags - ) diff --git a/backends/vulkan/patterns b/backends/vulkan/patterns new file mode 120000 index 00000000000..8abcaf07403 --- /dev/null +++ b/backends/vulkan/patterns @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/patterns \ No newline at end of file diff --git a/backends/vulkan/patterns/TARGETS b/backends/vulkan/patterns/TARGETS deleted file mode 100644 index 791edf58984..00000000000 --- a/backends/vulkan/patterns/TARGETS +++ /dev/null @@ -1,26 +0,0 @@ -load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -oncall("executorch") - -runtime.python_library( - name = "vulkan_patterns", - srcs = [ - "__init__.py", - "pattern_registry.py", - "rope.py", - "quantized_linear.py", - "quantized_convolution.py", - ], - visibility = [ - "//executorch/backends/...", - "//executorch/examples/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/exir:lib", - "//executorch/backends/transforms:utils", - "//executorch/backends/vulkan:utils_lib", - ], - typing = True, -) diff --git a/backends/vulkan/patterns/__init__.py b/backends/vulkan/patterns/__init__.py deleted file mode 100644 index 8ffad98b3c3..00000000000 --- a/backends/vulkan/patterns/__init__.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -from typing import List - -import executorch.backends.vulkan.patterns.quantized_convolution # noqa - -import executorch.backends.vulkan.patterns.quantized_linear # noqa - -import executorch.backends.vulkan.patterns.rope # noqa - -import torch - -from executorch.backends.vulkan.patterns.pattern_registry import ( - create_pattern_match_from_internal_match, - CreateReplacementFn, - DetectorFn, - fusable_patterns, - GetGraphFn, - PatternMatch, - register_pattern_detector, - register_pattern_graph, - register_pattern_replacement, -) - -from executorch.backends.vulkan.patterns.rope import RotaryEmbeddingPattern - -from executorch.exir import ExportedProgram - -from torch.fx.passes.utils.matcher_utils import SubgraphMatcher - - -__all__ = [ - "PatternMatch", - "GetGraphFn", - "DetectorFn", - "CreateReplacementFn", - "RotaryEmbeddingPattern", - "fusable_patterns", - "register_pattern_graph", - "register_pattern_detector", - "register_pattern_replacement", -] - - -def all_fusable_graph_patterns() -> List[torch.fx.GraphModule]: - all_patterns = [] - for entry in fusable_patterns.values(): - if entry.get_graphs_fn is not None: - all_patterns.extend(entry.get_graphs_fn()) - - return all_patterns - - -def get_all_fusable_subgraphs( - graph_module: torch.fx.GraphModule, -) -> List[PatternMatch]: - fusable_subgraphs = [] - - fuse_patterns = all_fusable_graph_patterns() - for pattern in fuse_patterns: - sm = SubgraphMatcher(pattern.graph, ignore_literals=True) - matches = list(sm.match(graph_module.graph)) - for match in matches: - fusable_subgraphs.append(create_pattern_match_from_internal_match(match)) - - for node in graph_module.graph.nodes: - for entry in fusable_patterns.values(): - if entry.detector_fn is not None: - maybe_match = entry.detector_fn(node) - if maybe_match is not None: - fusable_subgraphs.append(maybe_match) - - return fusable_subgraphs - - -def create_replacement_for_pattern( - ep: ExportedProgram, - graph_module: torch.fx.GraphModule, - patterns: List[torch.fx.GraphModule], - create_replacement_func: CreateReplacementFn, -) -> int: - total_replaced = 0 - - for pattern in patterns: - sm = SubgraphMatcher(pattern.graph, ignore_literals=True) - matches = list(sm.match(graph_module.graph)) - - for partition_to_replace in matches: - pattern = create_pattern_match_from_internal_match(partition_to_replace) - create_replacement_func(ep, graph_module, pattern) - total_replaced += 1 - # Remove dead code so they won't be matched again - graph_module.graph.eliminate_dead_code() - - return total_replaced - - -def replace_all_fusable_subgraphs( - ep: ExportedProgram, - graph_module: torch.fx.GraphModule, -) -> int: - total_replaced = 0 - - # Handle patterns identified with SubgraphMatcher - for entry in fusable_patterns.values(): - if entry.get_graphs_fn is not None and entry.create_replacement_fn is not None: - total_replaced += create_replacement_for_pattern( - ep, - graph_module, - entry.get_graphs_fn(), - # pyre-ignore[6] - entry.create_replacement_fn, - ) - - # Handle patterns identified with custom detector function - for node in graph_module.graph.nodes: - for entry in fusable_patterns.values(): - if ( - entry.detector_fn is not None - and entry.create_replacement_fn is not None - ): - maybe_match = entry.detector_fn(node) - if maybe_match is not None: - assert entry.create_replacement_fn is not None - entry.create_replacement_fn(ep, graph_module, maybe_match) - total_replaced += 1 - - graph_module.graph.eliminate_dead_code() - return total_replaced diff --git 
a/backends/vulkan/patterns/pattern_registry.py b/backends/vulkan/patterns/pattern_registry.py deleted file mode 100644 index 9a906cd8770..00000000000 --- a/backends/vulkan/patterns/pattern_registry.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Callable, Dict, List, Optional - -import torch - -from executorch.exir import ExportedProgram - -from torch.fx.passes.utils.matcher_utils import InternalMatch - -GetGraphFn = Callable[[], List[torch.fx.GraphModule]] - - -class PatternMatch: - __slots__ = ("input_nodes", "output_nodes", "all_nodes", "anchor_node") - """ - The design of this class is based on InternalMatch from - torch.fx.passes.utils.matcher_utils. It represents nodes in a graph that - match a particular pattern. - - The reason to not use InternalMatch directly is to enable more (i.e. custom) - methods to detect and represent matches other than through SubgraphMatcher. - """ - - def __init__( - self, - input_nodes: List[torch.fx.Node], - output_nodes: List[torch.fx.Node], - all_nodes: List[torch.fx.Node], - anchor_node: Optional[torch.fx.Node] = None, - ): - self.input_nodes = input_nodes - self.output_nodes = output_nodes - self.all_nodes = all_nodes - self.anchor_node = anchor_node - - -def create_pattern_match_from_internal_match( - internal_match: InternalMatch, -) -> PatternMatch: - return PatternMatch( - internal_match.placeholder_nodes, - internal_match.returning_nodes, - list(internal_match.nodes_map.values()), - ) - - -CreateReplacementFn = Callable[ - [ExportedProgram, torch.fx.GraphModule, PatternMatch], None -] - - -DetectorFn = Callable[[torch.fx.Node], Optional[PatternMatch]] - - -class PatternEntry: - def __init__( - self, - get_graphs_fn: Optional[GetGraphFn] = None, - detector_fn: Optional[DetectorFn] = None, - create_replacement_fn: Optional[CreateReplacementFn] = None, - ): - self.get_graphs_fn = get_graphs_fn - self.detector_fn = detector_fn - self.create_replacement_fn = create_replacement_fn - - def is_valid(self): - return ( - self.get_graphs_fn is not None or self.detector_fn is not None - ) and self.create_replacement_fn is not None - - -fusable_patterns: Dict[str, PatternEntry] = {} - - -def register_pattern_graph(pattern_name: str): - def decorator(fn: GetGraphFn): - if pattern_name not in fusable_patterns: - fusable_patterns[pattern_name] = PatternEntry() - - # Cannot define both get_graphs_fn and detector_fn - assert fusable_patterns[pattern_name].detector_fn is None - fusable_patterns[pattern_name].get_graphs_fn = fn - - return fn - - return decorator - - -def register_pattern_detector(pattern_name: str): - def decorator(fn: DetectorFn): - if pattern_name not in fusable_patterns: - fusable_patterns[pattern_name] = PatternEntry() - - # Cannot define both get_graphs_fn and detector_fn - assert fusable_patterns[pattern_name].get_graphs_fn is None - fusable_patterns[pattern_name].detector_fn = fn - - return fn - - return decorator - - -def register_pattern_replacement(pattern_name: str): - def decorator(fn: CreateReplacementFn): - if pattern_name not in fusable_patterns: - fusable_patterns[pattern_name] = PatternEntry() - - fusable_patterns[pattern_name].create_replacement_fn = fn - return fn - - return decorator diff --git a/backends/vulkan/patterns/quantized_convolution.py b/backends/vulkan/patterns/quantized_convolution.py deleted file 
mode 100644 index 65b51b5e103..00000000000 --- a/backends/vulkan/patterns/quantized_convolution.py +++ /dev/null @@ -1,224 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Optional - -import executorch.backends.vulkan.utils as utils - -import torch - -from executorch.backends.transforms.utils import ( - create_constant_placeholder, - get_param_tensor, -) - -from executorch.backends.vulkan.patterns.pattern_registry import ( - PatternMatch, - register_pattern_detector, - register_pattern_replacement, -) - -from executorch.exir import ExportedProgram -from executorch.exir.dialects._ops import ops as exir_ops - -from torch.export.graph_signature import InputKind - - -class QuantizedConvolutionMatch(PatternMatch): - def __init__(self, conv_node: torch.fx.Node) -> None: - self.anchor_node = conv_node - self.match_found = False - self.all_nodes = [self.anchor_node] - - # Extract convolution parameters - self.stride = conv_node.args[3] if len(conv_node.args) > 3 else [1, 1] - self.padding = conv_node.args[4] if len(conv_node.args) > 4 else [0, 0] - self.dilation = conv_node.args[5] if len(conv_node.args) > 5 else [1, 1] - self.groups = conv_node.args[8] if len(conv_node.args) > 8 else 1 - - const_node, arg_chain = utils.trace_args_until_placeholder( - self.anchor_node.args[1] - ) - - # weight is not a constant tensor - no match - if const_node is None: - return - - dequantize_weight_node = None - # Search for a dequantize node in the arg chain of weight - for node in arg_chain: - if isinstance(node, torch.fx.Node) and utils.is_dequant_node(node): - dequantize_weight_node = node - # weight is not quantized - no match - if dequantize_weight_node is None: - return - - self.weight_node = const_node - self.dequantize_weight_node = dequantize_weight_node - self.all_nodes.extend(arg_chain) - - # Identify weight quantization parameter nodes - self.weight_scales_node, arg_chain = utils.trace_args_until_placeholder( - self.dequantize_weight_node.args[1] - ) - assert self.weight_scales_node is not None - self.all_nodes.extend(arg_chain) - - self.weight_zeros_node, arg_chain = utils.trace_args_until_placeholder( - self.dequantize_weight_node.args[2] - ) - assert self.weight_zeros_node is not None - self.all_nodes.extend(arg_chain) - - # Identify output node - self.output_node = self.anchor_node - - out_channels = self.output_node.meta["val"].shape[-1] - # The implementation requires that for grouped convolutions, a group does not - # cross any texel boundary. The output channels per group must be a multiple of - # 4. If this is not true, then don't match the pattern. 
- if self.groups > 1 and (out_channels / self.groups) % 4 != 0: - return - - # Identify bias node, if applicable - self.bias_node = None - if len(self.anchor_node.args) > 2 and self.anchor_node.args[2] is not None: - self.bias_node, arg_chain = utils.trace_args_until_placeholder( - self.anchor_node.args[2] - ) - if self.bias_node is not None: - self.all_nodes.extend(arg_chain) - - # Identify input node - self.fp_input_node, self.quantize_input_node, dq_node = ( - utils.maybe_skip_q_dq_arg_chain(self.anchor_node.args[0]) - ) - assert self.fp_input_node is not None - self.all_nodes.append(self.fp_input_node) - assert self.quantize_input_node is not None - assert dq_node is not None - - self.input_scales_node = self.quantize_input_node.args[1] - self.input_zeros_node = self.quantize_input_node.args[2] - - self.all_nodes.extend( - [ - self.quantize_input_node, - dq_node, - ] - ) - - self.match_found = True - - -convolution_anchor_nodes = { - exir_ops.edge.aten.conv2d.default, - exir_ops.edge.aten.convolution.default, -} - - -@register_pattern_detector("quantized_convolution") -def find_quantized_convolution_patterns( - node: torch.fx.Node, -) -> Optional[QuantizedConvolutionMatch]: - if node.target not in convolution_anchor_nodes: - return None - - matched_pattern = QuantizedConvolutionMatch(node) - if matched_pattern.match_found: - return matched_pattern - - return None - - -## -## Pattern Replacement -## - - -@register_pattern_replacement("quantized_convolution") -def make_conv2d_q8ta_q8csw_custom_op( - ep: ExportedProgram, - graph_module: torch.fx.GraphModule, - match: QuantizedConvolutionMatch, -): - weight_tensor = get_param_tensor(ep, match.weight_node) - assert weight_tensor is not None - - assert match.weight_scales_node is not None - weight_scales_tensor = get_param_tensor(ep, match.weight_scales_node) - assert weight_scales_tensor is not None - - assert match.weight_zeros_node is not None - weight_zeros_tensor = get_param_tensor(ep, match.weight_zeros_node) - assert weight_zeros_tensor is not None - - bias_tensor = None - if match.bias_node is not None: - bias_tensor = get_param_tensor(ep, match.bias_node) - assert bias_tensor is not None - - OC, IC, H, W = weight_tensor.shape - - # Reshape weight tensor from (OC, IC, H, W) to (OC, H * W * IC) (i.e. matrix format) - # This prepares the weights for Im2Col-based convolution - weight_tensor = ( - weight_tensor.permute(0, 2, 3, 1).contiguous().view(OC, H * W * IC).contiguous() - ) - - # Need to make sure that OC dim is a multiple of 4 so that data load/stores are well - # aligned with texel boundaries. Add padding to align to the next multiple of 4 if - # needed. - utils.align_width_and_update_state_dict( - ep, match.weight_node, weight_tensor, force_update=True - ) - utils.align_width_and_update_state_dict( - ep, match.weight_scales_node, weight_scales_tensor - ) - if bias_tensor is not None: - utils.align_width_and_update_state_dict(ep, match.bias_node, bias_tensor) - - first_graph_node = list(graph_module.graph.nodes)[0] - with graph_module.graph.inserting_before(first_graph_node): - qweight_tensor_name = utils.get_tensor_name(ep, match.weight_node) - # Pre-compute the weight sums which are needed to apply activation zero point - # when using integer accumulation.
For the reshaped 2D weight matrix (OC, H * W * IC), - sum over dimension 1 to get sums per output channel - sum_per_output_channel = weight_tensor.sum(dim=1).to(torch.int32).contiguous() - sums_name = qweight_tensor_name + "_sums" - # Sanitize the name - sums_name = sums_name.replace(".", "_") - - weight_sums_node = create_constant_placeholder( - exp_program=ep, - graph=graph_module.graph, - kind=InputKind.CONSTANT_TENSOR, - name=sums_name, - data=sum_per_output_channel, - ) - - with graph_module.graph.inserting_before(match.output_node): - qconv_node = graph_module.graph.create_node( - "call_function", - exir_ops.edge.et_vk.conv2d_q8ta_q8csw.default, - args=( - match.fp_input_node, - match.input_scales_node, - match.input_zeros_node, - match.weight_node, - weight_sums_node, - match.weight_scales_node, - match.bias_node, # Add bias after weight_scales - [H, W], # Pass kernel size information before stride - match.stride, - match.padding, - match.dilation, - match.groups, - ), - ) - - qconv_node.meta["val"] = match.output_node.meta["val"] - match.output_node.replace_all_uses_with(qconv_node) diff --git a/backends/vulkan/patterns/quantized_linear.py b/backends/vulkan/patterns/quantized_linear.py deleted file mode 100644 index ee1c7ee2d2a..00000000000 --- a/backends/vulkan/patterns/quantized_linear.py +++ /dev/null @@ -1,363 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Optional - -import executorch.backends.vulkan.utils as utils - -import torch -import torch.nn.functional as F - -from executorch.backends.transforms.utils import ( - create_constant_placeholder, - get_param_tensor, -) - -from executorch.backends.vulkan.patterns.pattern_registry import ( - PatternMatch, - register_pattern_detector, - register_pattern_replacement, -) - -from executorch.exir import ExportedProgram -from executorch.exir.dialects._ops import ops as exir_ops - -from torch.export.graph_signature import InputKind - - -class QuantizedLinearMatch(PatternMatch): - def __init__(self, mm_node: torch.fx.Node) -> None: - self.anchor_node = mm_node - self.match_found = False - self.all_nodes = [self.anchor_node] - - const_node, arg_chain = utils.trace_args_until_placeholder( - self.anchor_node.args[1] - ) - - # mat2 is not a constant tensor - no match - if const_node is None: - return - - dequantize_weight_node = None - # Search for a dequantize node in the arg chain of weight - for node in arg_chain: - if isinstance(node, torch.fx.Node) and utils.is_dequant_node(node): - dequantize_weight_node = node - # weight is not quantized - no match - if dequantize_weight_node is None: - return - - self.weight_node = const_node - self.dequantize_weight_node = dequantize_weight_node - self.all_nodes.extend(arg_chain) - - # By default, assume dequant node is from quantized_decomposed namespace - scales_arg_idx = 1 - zeros_arg_idx = 2 - # torchao dequantize has a different function schema than quantized_decomposed - if ( - self.dequantize_weight_node.target - == exir_ops.edge.torchao.dequantize_affine.default - ): - scales_arg_idx = 2 - zeros_arg_idx = 3 - - # Identify weight quantization parameter nodes - self.weight_scales_node, arg_chain = utils.trace_args_until_placeholder( - self.dequantize_weight_node.args[scales_arg_idx] - ) - assert self.weight_scales_node is not None - self.all_nodes.extend(arg_chain) - - self.weight_zeros_node,
arg_chain = utils.trace_args_until_placeholder( - self.dequantize_weight_node.args[zeros_arg_idx] - ) - assert self.weight_zeros_node is not None - self.all_nodes.extend(arg_chain) - - # Identify output node - self.output_node = self.anchor_node - - # The implementation has a limitation that output channels must be a - # multiple of 4. This is to ensure that data loads are aligned well with - # texel boundaries. If this is not true, then don't match the pattern. - out_channels = self.output_node.meta["val"].shape[-1] - if out_channels % 4 != 0: - return - - # Identify input node - self.fp_input_node, self.quantize_input_node, dq_node = ( - utils.maybe_skip_q_dq_arg_chain(self.anchor_node.args[0]) - ) - assert self.fp_input_node is not None - self.all_nodes.append(self.fp_input_node) - - # The implementation has a limitation that input channels must be a - # multiple of 4. This is to ensure that data loads are aligned well with - # texel boundaries. If this is not true, then don't match the pattern. - in_channels = self.fp_input_node.meta["val"].shape[-1] - if in_channels % 4 != 0: - return - - # Identify bias node, if applicable - self.bias_node = None - if self.anchor_node.target == exir_ops.edge.aten.addmm.default: - self.bias_node, arg_chain = utils.trace_args_until_placeholder( - self.anchor_node.args[2] - ) - assert self.bias_node is not None - self.all_nodes.extend(arg_chain) - - # If input is not quantized, then we are done - if self.quantize_input_node is None: - self.match_found = True - return - - self.input_scales_node = self.quantize_input_node.args[1] - self.input_zeros_node = self.quantize_input_node.args[2] - - assert dq_node is not None - self.all_nodes.extend( - [ - self.quantize_input_node, - dq_node, - ] - ) - - self.match_found = True - - def is_weight_only_quantized(self) -> bool: - return self.quantize_input_node is None - - def is_weight_pergroup_quantized(self) -> bool: - weight_shape = self.weight_node.meta["val"].shape - scales_shape = self.weight_scales_node.meta["val"].shape - if len(scales_shape) != 2: - return False - - # Check that: - # height dim of scales is same as height dim of weight (N / output channels dim) - # width dim of weight (K / in channels dim) is divisible by width dim of scales - # (number of quantization groups) - return scales_shape[-2] == weight_shape[-2] and ( - weight_shape[-1] % scales_shape[-1] == 0 - ) - - def is_weight_perchannel_quantized(self) -> bool: - weight_shape = self.weight_node.meta["val"].shape - scales_shape = self.weight_scales_node.meta["val"].shape - if len(scales_shape) != 1: - return False - - # scales should have same size as weight's output channels dim - return scales_shape[0] == weight_shape[-2] - - def is_input_static_per_tensor_quantized(self) -> bool: - if self.quantize_input_node is None: - return False - - # For static quantization per tensor quantization, the scales and zeros - # are scalars. 
- return isinstance(self.input_scales_node, float) - - -linear_anchor_nodes = { - exir_ops.edge.aten.linear.default, - exir_ops.edge.aten.mm.default, - exir_ops.edge.aten.addmm.default, -} - - -@register_pattern_detector("quantized_linear") -def find_quantized_linear_patterns( - node: torch.fx.Node, -) -> Optional[QuantizedLinearMatch]: - if node.target not in linear_anchor_nodes: - return None - - matched_pattern = QuantizedLinearMatch(node) - if matched_pattern.match_found: - return matched_pattern - - return None - - -## -## Constant tensor manipulation -## - - -def pack_4bit_weight_tensor(weight_tensor: torch.Tensor) -> torch.Tensor: - """ - Given a 8-bit weight tensor containing values quantized to 4 bits, create a packed - weight tensor by transposing the weight tensor, then packing 2 4-bit values in one - 8-bit value. - - An input weight tensor of shape (N, K) will produce a packed weight tensor of shape - (K, N / 2). - """ - - # Assert we got a properly quantized tensor. - min_val, max_val = weight_tensor.min().item(), weight_tensor.max().item() - assert ( - max_val <= 7 and min_val >= -8 - ), f"pack_4bit_weight_tensor: [min_val,max_val] out of [-8, 7] range, got [{min_val}, {max_val}]" - - # Assuming we have a 2d tensor - if weight_tensor.ndim != 2: - weight_tensor = weight_tensor.squeeze() - assert ( - weight_tensor.ndim == 2 - ), f"pack_4bit_weight_tensor: expecting input tensor to be 2d, got {weight_tensor.ndim}" - - # Need to pad innermost dim to be a multiple of 8, since the minimum load granularity - # is int32 (4 bytes), which contains 8 4-bit values. - if weight_tensor.shape[-1] % 8 != 0: - num_pad = 8 - (weight_tensor.shape[-1] % 8) - weight_tensor = F.pad(input=weight_tensor, pad=(0, num_pad)) - - # Shape after padding - _, in_channels = weight_tensor.shape - assert in_channels % 8 == 0, "convert_to_qc4w: expecting ic to be divisible by 8" - - # Adjust weight_tensor tensor for zp - weight_tensor = weight_tensor.to(dtype=torch.uint8) + 8 - # Pack each 4-bit value into a single 8-bit value - return weight_tensor[::, 1::2] << 4 | weight_tensor[::, ::2] - - -## -## Pattern Replacement -## - - -def make_linear_q4gsw_op( - ep: ExportedProgram, - graph_module: torch.fx.GraphModule, - match: QuantizedLinearMatch, - weight_tensor: torch.Tensor, - weight_scales_tensor: torch.Tensor, -): - num_groups = weight_scales_tensor.shape[-1] - in_channels = weight_tensor.shape[-1] - group_size = in_channels // num_groups - - weight_tensor = pack_4bit_weight_tensor(weight_tensor) - # Use this function for convenience to update the state dict with the packed - # weight tensor. Alignment will already have been done in the above function. - weight_tensor = utils.align_width_and_update_state_dict( - ep, match.weight_node, weight_tensor, align_to=1, force_update=True - ) - - # Also transpose the weight scales tensor to shape [num_groups, N] - weight_scales_tensor = weight_scales_tensor.transpose(0, 1).contiguous() - # Align to multiple of 8 to ensure that data loads from the weight scales - # tensor do not go out of bounds. Each thread computes 8 output channels. 
- utils.align_width_and_update_state_dict( - ep, - match.weight_scales_node, - weight_scales_tensor, - align_to=8, - force_update=True, - ) - - with graph_module.graph.inserting_before(match.output_node): - linear_q4gsw_node = graph_module.graph.create_node( - "call_function", - exir_ops.edge.et_vk.linear_q4gsw.default, - args=( - match.fp_input_node, - match.weight_node, - match.weight_scales_node, - group_size, - ), - ) - - linear_q4gsw_node.meta["val"] = match.output_node.meta["val"] - match.output_node.replace_all_uses_with(linear_q4gsw_node) - - -def make_linear_q8ta_q8csw_custom_op( - ep: ExportedProgram, - graph_module: torch.fx.GraphModule, - match: QuantizedLinearMatch, - weight_tensor: torch.Tensor, -): - first_graph_node = list(graph_module.graph.nodes)[0] - with graph_module.graph.inserting_before(first_graph_node): - weight_tensor_name = utils.get_tensor_name(ep, match.weight_node) - # Pre-compute the weight sums which are needed to apply activation zero point - # when using integer accumulation. - sum_per_output_channel = weight_tensor.sum(dim=1).to(torch.int32).contiguous() - sums_name = weight_tensor_name + "_sums" - # Sanitize the name - sums_name = sums_name.replace(".", "_") - - weight_sums_node = create_constant_placeholder( - exp_program=ep, - graph=graph_module.graph, - kind=InputKind.CONSTANT_TENSOR, - name=sums_name, - data=sum_per_output_channel, - ) - - with graph_module.graph.inserting_before(match.output_node): - qlinear_node = graph_module.graph.create_node( - "call_function", - exir_ops.edge.et_vk.linear_q8ta_q8csw.default, - args=( - match.fp_input_node, - match.input_scales_node, - match.input_zeros_node, - match.weight_node, - weight_sums_node, - match.weight_scales_node, - ), - ) - - qlinear_node.meta["val"] = match.output_node.meta["val"] - match.output_node.replace_all_uses_with(qlinear_node) - - -@register_pattern_replacement("quantized_linear") -def replace_quantized_linear_patterns( - ep: ExportedProgram, - graph_module: torch.fx.GraphModule, - match: QuantizedLinearMatch, -): - # Extract relevant tensors - weight_tensor = get_param_tensor(ep, match.weight_node) - assert weight_tensor is not None - - assert match.weight_scales_node is not None - weight_scales_tensor = get_param_tensor(ep, match.weight_scales_node) - assert weight_scales_tensor is not None - - assert match.weight_zeros_node is not None - weight_zeros_tensor = get_param_tensor(ep, match.weight_zeros_node) - assert weight_zeros_tensor is not None - - # Biases not supported at the moment - if match.bias_node is not None: - return - - # Route to appropriate custom op - if ( - match.is_weight_only_quantized() - and match.is_weight_pergroup_quantized() - and utils.is_in_4bit_range(weight_tensor) - ): - make_linear_q4gsw_op( - ep, graph_module, match, weight_tensor, weight_scales_tensor - ) - elif ( - match.is_input_static_per_tensor_quantized() - and match.is_weight_perchannel_quantized() - ): - make_linear_q8ta_q8csw_custom_op(ep, graph_module, match, weight_tensor) - - # No-op for unsupported quant patterns diff --git a/backends/vulkan/patterns/rope.py b/backends/vulkan/patterns/rope.py deleted file mode 100644 index b174224ab78..00000000000 --- a/backends/vulkan/patterns/rope.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import operator - -from functools import lru_cache -from typing import List, Optional - -import torch - -from executorch.backends.vulkan.patterns.pattern_registry import ( - PatternMatch, - register_pattern_graph, - register_pattern_replacement, -) - -from executorch.exir import EdgeCompileConfig, ExportedProgram, to_edge -from executorch.exir.dialects._ops import ops as exir_ops - -from torch.export import export - - -class RotaryEmbeddingPattern(torch.nn.Module): - """ - Implementation of rotary embedding pattern that matches the one - in examples/model/llama/rope.py - """ - - def __init__(self): - super().__init__() - - def forward( - self, - xq: torch.Tensor, - xk: torch.Tensor, - freqs_cos: torch.Tensor, - freqs_sin: torch.Tensor, - ): - # This implementation matches the apply_rotary_emb function in rope.py - # Split into real and imaginary parts - xq_r, xq_i = xq.float().reshape(xq.shape[:-1] + (-1, 2)).unbind(-1) - xk_r, xk_i = xk.float().reshape(xk.shape[:-1] + (-1, 2)).unbind(-1) - - # Reshape frequencies for broadcasting - freqs_cos = self._reshape_for_broadcast(freqs_cos, xq_r) - freqs_sin = self._reshape_for_broadcast(freqs_sin, xq_r) - - # Apply rotary embedding - xq_out_r = xq_r * freqs_cos - xq_i * freqs_sin - xq_out_i = xq_r * freqs_sin + xq_i * freqs_cos - xk_out_r = xk_r * freqs_cos - xk_i * freqs_sin - xk_out_i = xk_r * freqs_sin + xk_i * freqs_cos - - # Recombine real and imaginary parts - xq_out = torch.stack([xq_out_r, xq_out_i], dim=-1).flatten(3) - xk_out = torch.stack([xk_out_r, xk_out_i], dim=-1).flatten(3) - - return xq_out.type_as(xq), xk_out.type_as(xk) - - def _reshape_for_broadcast(self, freqs_cis: torch.Tensor, x: torch.Tensor): - ndim = x.ndim - freqs_cis_ndim = freqs_cis.ndim - if freqs_cis_ndim == 3: - # freqs_cis: (seq_len, n_heads, head_dim // 2) - assert freqs_cis.shape == (x.shape[-3], x.shape[-2], x.shape[-1]) - shape = [ - d if (i == ndim - 3 or i == ndim - 2 or i == ndim - 1) else 1 - for i, d in enumerate(x.shape) - ] - else: - # freqs_cis: (seq_len, head_dim // 2) - assert freqs_cis.shape == (x.shape[1], x.shape[-1]) - shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] - return freqs_cis.view(shape) - - -@lru_cache(maxsize=2) -@register_pattern_graph("export_llama_rope") -def get_rope_graphs() -> List[torch.fx.GraphModule]: - batch_size = 1 - seq_len = 1 - n_heads = 4 - n_kv_heads = 2 - head_dim = 32 - - graphs = [] - dtype = torch.float32 - - xq = torch.randn(batch_size, seq_len, n_heads, head_dim, dtype=dtype) - xk = torch.randn(batch_size, seq_len, n_kv_heads, head_dim, dtype=dtype) - freqs_cos = torch.randn(seq_len, head_dim // 2, dtype=dtype) - freqs_sin = torch.randn(seq_len, head_dim // 2, dtype=dtype) - - edge = to_edge( - export( - RotaryEmbeddingPattern(), - (xq, xk, freqs_cos, freqs_sin), - strict=True, - ), - compile_config=EdgeCompileConfig(_check_ir_validity=False), - ) - gm = edge.exported_program().graph_module - graphs.append(gm) - - return graphs - - -def identify_rotary_emb_io_nodes( - ep: ExportedProgram, - graph_module: torch.fx.GraphModule, - match: PatternMatch, -) -> Optional[List[torch.fx.Node]]: - # Get the input inputs (xq, xk, freqs_cos, freqs_sin) - input_nodes = match.input_nodes - if len(input_nodes) != 4: - return None - - xq, xk, freqs_cos, freqs_sin = input_nodes - - output_nodes = match.output_nodes - if len(output_nodes) != 2: - return None - - xq_out, xk_out = output_nodes - - return [xq, xk, freqs_cos, freqs_sin, xq_out, xk_out] - - 
-@register_pattern_replacement("export_llama_rope") -def create_rotary_emb_custom_op( - ep: ExportedProgram, - graph_module: torch.fx.GraphModule, - match: PatternMatch, -): - io_nodes = identify_rotary_emb_io_nodes(ep, graph_module, match) - if io_nodes is None: - return - - assert len(io_nodes) == 6 - xq, xk, freqs_cos, freqs_sin, xq_out, xk_out = io_nodes - - # Create the custom op node - with graph_module.graph.inserting_before(xq_out): - rotary_emb_node = graph_module.graph.create_node( - "call_function", - exir_ops.edge.et_vk.apply_rotary_emb.default, - args=(xq, xk, freqs_cos, freqs_sin), - ) - - # The custom op returns a tuple (xq_out, xk_out) - # We need to extract the individual outputs - with graph_module.graph.inserting_after(rotary_emb_node): - getitem_0 = graph_module.graph.create_node( - "call_function", - operator.getitem, - args=(rotary_emb_node, 0), - ) - getitem_1 = graph_module.graph.create_node( - "call_function", - operator.getitem, - args=(rotary_emb_node, 1), - ) - - if hasattr(xq_out, "meta") and "val" in xq_out.meta: - getitem_0.meta["val"] = xq_out.meta["val"] - if hasattr(xk_out, "meta") and "val" in xk_out.meta: - getitem_1.meta["val"] = xk_out.meta["val"] - - xq_out.replace_all_uses_with(getitem_0) - xk_out.replace_all_uses_with(getitem_1) diff --git a/backends/vulkan/quantizer b/backends/vulkan/quantizer new file mode 120000 index 00000000000..46087273095 --- /dev/null +++ b/backends/vulkan/quantizer @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/quantizer \ No newline at end of file diff --git a/backends/vulkan/quantizer/TARGETS b/backends/vulkan/quantizer/TARGETS deleted file mode 100644 index 2c3ae37923a..00000000000 --- a/backends/vulkan/quantizer/TARGETS +++ /dev/null @@ -1,20 +0,0 @@ -load("@fbcode_macros//build_defs:python_library.bzl", "python_library") - -oncall("executorch") - -python_library( - name = "vulkan_quantizer", - srcs = ["vulkan_quantizer.py"], - deps = [ - ":vulkan_quantizer_utils", - "//caffe2:torch", - ], -) - -python_library( - name = "vulkan_quantizer_utils", - srcs = ["vulkan_quantizer_utils.py"], - deps = [ - "//caffe2:torch", - ], -) diff --git a/backends/vulkan/quantizer/vulkan_quantizer.py b/backends/vulkan/quantizer/vulkan_quantizer.py deleted file mode 100644 index 40212c35c27..00000000000 --- a/backends/vulkan/quantizer/vulkan_quantizer.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-strict - -from __future__ import annotations - -import functools -from typing import Callable, Optional - -import torch -from executorch.backends.vulkan.quantizer.vulkan_quantizer_utils import ( - _convert_scalars_to_attrs, - bits_to_range, - OP_TO_ANNOTATOR, - propagate_annotation, -) -from torch.fx import Node -from torchao.quantization.pt2e import PerChannelMinMaxObserver, PlaceholderObserver -from torchao.quantization.pt2e.quantizer import ( - QuantizationConfig, - QuantizationSpec, - Quantizer, -) - - -__all__ = [ - "VulkanQuantizer", - "get_symmetric_quantization_config", -] - - -@functools.lru_cache -def get_symmetric_quantization_config( - is_dynamic: bool = False, - weight_bits: int = 8, - act_bits: int = 8, - act_qmin: Optional[int] = None, - act_qmax: Optional[int] = None, - weight_qmin: Optional[int] = None, - weight_qmax: Optional[int] = None, -) -> QuantizationConfig: - """ - Return a QuantizationConfig for Vulkan quantizer. - - Args: - is_dynamic: If False, weight-only quantization. If True, dynamic quantization (activation + weight) - weight_bits: Number of bits for weight quantization (4 or 8) - act_bits: Number of bits for activation quantization (8) - act_qmin: Minimum quantization value for activations (auto-calculated if None) - act_qmax: Maximum quantization value for activations (auto-calculated if None) - weight_qmin: Minimum quantization value for weights (auto-calculated if None) - weight_qmax: Maximum quantization value for weights (auto-calculated if None) - """ - assert weight_bits in { - 8, - 4, - }, f"Unsupported weight quantization bits: {weight_bits}" - - assert act_bits in { - 8, - }, f"Unsupported activation quantization bits: {act_bits}" - - # Auto-calculate weight ranges if not provided - if weight_qmin is None or weight_qmax is None: - weight_range = bits_to_range(weight_bits) - weight_qmin = weight_qmin if weight_qmin is not None else weight_range[0] - weight_qmax = weight_qmax if weight_qmax is not None else weight_range[1] - - # Weight quantization: per-channel symmetric for Vulkan - weight_quantization_spec = QuantizationSpec( - dtype=torch.int8, - quant_min=weight_qmin, - quant_max=weight_qmax, - qscheme=torch.per_channel_symmetric, - ch_axis=0, - is_dynamic=False, - observer_or_fake_quant_ctr=PerChannelMinMaxObserver, - ) - - # Configure activation quantization based on is_dynamic - if not is_dynamic: - # Weight-only quantization: no activation quantization - act_quantization_spec = None - output_activation_spec = None - else: - # Dynamic quantization: per-token input quantization, no output quantization - # Auto-calculate activation ranges if not provided - if act_qmin is None or act_qmax is None: - act_range = bits_to_range(act_bits) - act_qmin = act_qmin if act_qmin is not None else act_range[0] - act_qmax = act_qmax if act_qmax is not None else act_range[1] - - act_observer_or_fake_quant_ctr = PlaceholderObserver - act_quantization_spec = QuantizationSpec( - dtype=torch.int8, - quant_min=act_qmin, - quant_max=act_qmax, - qscheme=torch.per_tensor_affine, - is_dynamic=True, - observer_or_fake_quant_ctr=act_observer_or_fake_quant_ctr, - ) - output_activation_spec = None - - return QuantizationConfig( - input_activation=act_quantization_spec, - output_activation=output_activation_spec, - weight=weight_quantization_spec, - bias=None, - is_qat=False, - ) - - -_SUPPORTED_OPS = [ - "linear", -] - - -class VulkanQuantizer(Quantizer): - - def __init__(self) -> None: - super().__init__() - self.global_config: Optional[QuantizationConfig] = 
None - - def set_global(self, quantization_config: QuantizationConfig) -> VulkanQuantizer: - self.global_config = quantization_config - return self - - def transform_for_annotation( - self, model: torch.fx.GraphModule - ) -> torch.fx.GraphModule: - """Transforms scalar values to tensor attributes""" - return _convert_scalars_to_attrs(model) - - def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: - model = self._annotate_for_quantization_config(model) - propagate_annotation(model) - return model - - def _annotate_all_patterns( - self, - model: torch.fx.GraphModule, - quantization_config: Optional[QuantizationConfig], - filter_fn: Optional[Callable[[Node], bool]] = None, - ) -> torch.fx.GraphModule: - if quantization_config is None: - return model - - for op in _SUPPORTED_OPS: - OP_TO_ANNOTATOR[op](model, quantization_config, filter_fn) - return model - - def _annotate_for_quantization_config( - self, model: torch.fx.GraphModule - ) -> torch.fx.GraphModule: - self._annotate_all_patterns( - model, - self.global_config, - ) - return model - - def validate(self, model: torch.fx.GraphModule) -> None: - pass diff --git a/backends/vulkan/quantizer/vulkan_quantizer_utils.py b/backends/vulkan/quantizer/vulkan_quantizer_utils.py deleted file mode 100644 index c0b6ab39e84..00000000000 --- a/backends/vulkan/quantizer/vulkan_quantizer_utils.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-strict - -from typing import Callable, Optional, Tuple - -import torch -from torch.fx import Node -from torchao.quantization.pt2e.quantizer import ( - annotate_input_qspec_map, - annotate_output_qspec, - get_bias_qspec, - get_input_act_qspec, - get_output_act_qspec, - get_weight_qspec, - QuantizationAnnotation, - QuantizationConfig, - SharedQuantizationSpec, -) -from torchao.quantization.pt2e.utils import get_new_attr_name_with_prefix - -__all__ = [ - "OP_TO_ANNOTATOR", - "propagate_annotation", - "_convert_scalars_to_attrs", - "bits_to_range", -] - - -def bits_to_range(bits: int) -> Tuple[int, int]: - """ - Calculate quantization range for given number of bits. 
- - Args: - bits: Number of quantization bits - - Returns: - Tuple of (qmin, qmax) for the given bit width - """ - return ( - -(2 ** (bits - 1)), - (2 ** (bits - 1) - 1), - ) - - -AnnotatorType = Callable[ - [ - torch.fx.GraphModule, - Optional[QuantizationConfig], - Optional[Callable[[Node], bool]], - ], - Optional[list[list[Node]]], -] -OP_TO_ANNOTATOR: dict[str, AnnotatorType] = {} - - -def register_annotator(op: str) -> Callable[[AnnotatorType], None]: - def decorator(annotator: AnnotatorType) -> None: - OP_TO_ANNOTATOR[op] = annotator - - return decorator - - -def _is_annotated(nodes: list[Node]) -> bool: - """ - Given a list of nodes (that represents an operator pattern), - check if any of the node is annotated, return True if any of the node - is annotated, otherwise return False - """ - annotated = False - for node in nodes: - annotated = annotated or ( - "quantization_annotation" in node.meta - and node.meta["quantization_annotation"]._annotated - ) - return annotated - - -def _mark_nodes_as_annotated(nodes: list[Node]) -> None: - for node in nodes: - if node is not None: - if "quantization_annotation" not in node.meta: - node.meta["quantization_annotation"] = QuantizationAnnotation() - node.meta["quantization_annotation"]._annotated = True - - -@register_annotator("linear") -def _annotate_linear( - gm: torch.fx.GraphModule, - quantization_config: Optional[QuantizationConfig], - filter_fn: Optional[Callable[[Node], bool]] = None, -) -> Optional[list[list[Node]]]: - annotated_partitions = [] - input_act_qspec = get_input_act_qspec(quantization_config) - output_act_qspec = get_output_act_qspec(quantization_config) - weight_qspec = get_weight_qspec(quantization_config) - bias_qspec = get_bias_qspec(quantization_config) - for node in gm.graph.nodes: - if node.op != "call_function" or node.target != torch.ops.aten.linear.default: - continue - if filter_fn and not filter_fn(node): - continue - act_node = node.args[0] - weight_node = node.args[1] - bias_node = None - if len(node.args) > 2: - bias_node = node.args[2] - - if _is_annotated([node]) is False: # type: ignore[list-item] - annotate_input_qspec_map( - node, - act_node, - input_act_qspec, - ) - annotate_input_qspec_map( - node, - weight_node, - weight_qspec, - ) - nodes_to_mark_annotated = [node, weight_node] - if bias_node: - annotate_input_qspec_map( - node, - bias_node, - bias_qspec, - ) - nodes_to_mark_annotated.append(bias_node) - annotate_output_qspec(node, output_act_qspec) - _mark_nodes_as_annotated(nodes_to_mark_annotated) - annotated_partitions.append(nodes_to_mark_annotated) - - return annotated_partitions - - -def _is_share_obs_or_fq_op(op: Callable[..., torch.Tensor]) -> bool: - return op in [ - torch.ops.aten.relu.default, - torch.ops.aten.hardtanh.default, - torch.ops.aten.hardtanh_.default, - torch.ops.aten.max_pool2d.default, - torch.ops.aten.mean.default, - torch.ops.aten.mean.dim, - torch.ops.aten.permute.default, - torch.ops.aten.permute_copy.default, - torch.ops.aten.squeeze.dim, - torch.ops.aten.squeeze_copy.dim, - torch.ops.aten.adaptive_avg_pool2d.default, - torch.ops.aten.view_copy.default, - torch.ops.aten.view.default, - torch.ops.aten.slice_copy.Tensor, - torch.ops.aten.flatten.using_ints, - ] - - -def propagate_annotation(model: torch.fx.GraphModule) -> None: - for n in model.graph.nodes: - if n.op != "call_function" or not _is_share_obs_or_fq_op(n.target): - continue - - prev_node = n.args[0] - if not isinstance(prev_node, Node): - continue - - quantization_annotation = 
prev_node.meta.get("quantization_annotation", None) - if not quantization_annotation: - continue - - output_qspec = quantization_annotation.output_qspec - if not output_qspec: - continue - - # make sure current node is not annotated - if ( - "quantization_annotation" in n.meta - and n.meta["quantization_annotation"]._annotated - ): - continue - - shared_qspec = SharedQuantizationSpec(prev_node) - # propagate the previous output_qspec to the current node - n.meta["quantization_annotation"] = QuantizationAnnotation( - input_qspec_map={ - prev_node: shared_qspec, - }, - output_qspec=shared_qspec, - _annotated=True, - ) - - -def _convert_scalars_to_attrs(model: torch.fx.GraphModule) -> torch.fx.GraphModule: - for n in model.graph.nodes: - if n.op != "call_function" or n.target not in [ - torch.ops.aten.add.Tensor, - torch.ops.aten.mul.Tensor, - ]: - continue - args = list(n.args) - new_args = [] - for i in range(len(args)): - if isinstance(args[i], torch.fx.Node): - new_args.append(args[i]) - continue - prefix = "_tensor_constant_" - get_new_attr_name = get_new_attr_name_with_prefix(prefix) - tensor_constant_name = get_new_attr_name(model) - float_tensor = torch.tensor(float(args[i])) - model.register_buffer(tensor_constant_name, float_tensor) - fake_mode = n.meta["val"].fake_mode - with model.graph.inserting_before(n): - get_attr_node = model.graph.create_node( - "get_attr", tensor_constant_name, (), {} - ) - get_attr_node.meta["val"] = fake_mode.from_tensor( - float_tensor, static_shapes=True - ) - new_args.append(get_attr_node) - n.args = tuple(new_args) - model.recompile() - return model diff --git a/backends/vulkan/runtime b/backends/vulkan/runtime new file mode 120000 index 00000000000..a33641e0cc0 --- /dev/null +++ b/backends/vulkan/runtime @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/runtime \ No newline at end of file diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp deleted file mode 100644 index 7b138072d50..00000000000 --- a/backends/vulkan/runtime/VulkanBackend.cpp +++ /dev/null @@ -1,680 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
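As a usage note for the quantizer package above: the intended flow is to build a config with `get_symmetric_quantization_config`, install it via `set_global`, and then run the standard PT2E prepare/convert steps. The sketch below is an illustration only, not code from this repository; the `prepare_pt2e`/`convert_pt2e` import path and the `torch.export.export(...).module()` step are assumptions that vary across PyTorch/torchao versions.

```python
import torch
from executorch.backends.vulkan.quantizer.vulkan_quantizer import (
    VulkanQuantizer,
    get_symmetric_quantization_config,
)
# Assumed import path; prepare_pt2e/convert_pt2e have moved between
# torch.ao.quantization.quantize_pt2e and torchao across releases.
from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e


class TinyLinear(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc = torch.nn.Linear(16, 16)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc(x)


model = TinyLinear().eval()
example_inputs = (torch.randn(1, 16),)

# Weight-only 4-bit symmetric quantization, per get_symmetric_quantization_config.
quantizer = VulkanQuantizer().set_global(
    get_symmetric_quantization_config(is_dynamic=False, weight_bits=4)
)

exported = torch.export.export(model, example_inputs).module()
prepared = prepare_pt2e(exported, quantizer)
prepared(*example_inputs)  # calibration pass
quantized = convert_pt2e(prepared)
```

Only `linear` appears in `_SUPPORTED_OPS`, so other ops in the graph are left in floating point by this quantizer.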
- */ - -#include -#include - -#include - -#include - -#include - -#include -#include -#include -#ifdef ET_EVENT_TRACER_ENABLED -#include -#endif // ET_EVENT_TRACER_ENABLED -#include -#include -#include -#include - -#include -#include /* strtol */ -#include -#include -#include -#include - -namespace executorch { -namespace backends { -namespace vulkan { -namespace { - -using executorch::runtime::ArrayRef; -using executorch::runtime::Backend; -using executorch::runtime::BackendExecutionContext; -using executorch::runtime::BackendInitContext; -using executorch::runtime::CompileSpec; -using executorch::runtime::DelegateHandle; -using executorch::runtime::Error; -using executorch::runtime::EValue; -using executorch::runtime::FreeableBuffer; -using executorch::runtime::kTensorDimensionLimit; -using executorch::runtime::NamedDataMap; -using executorch::runtime::Result; -using executorch::runtime::Span; - -using namespace vkcompute; - -// Flatbuffer types -using VkGraphPtr = const vkgraph::VkGraph*; -using OpCallPtr = const vkgraph::OperatorCall*; -using VkValuePtr = const vkgraph::VkValue*; -using VkTensorPtr = const vkgraph::VkTensor*; -using VkBytesPtr = const vkgraph::VkBytes*; - -// Flatbuffer vector types -using VkValuesVector = - const flatbuffers::Vector>*; -using BytesVector = - const flatbuffers::Vector>*; -using UIntVector = const flatbuffers::Vector*; - -vkapi::ScalarType get_scalar_type(const vkgraph::VkDataType& vk_datatype) { - switch (vk_datatype) { - case vkgraph::VkDataType::BOOL: - return vkapi::kBool; - case vkgraph::VkDataType::UINT8: - return vkapi::kByte; - case vkgraph::VkDataType::INT8: - return vkapi::kChar; - case vkgraph::VkDataType::INT32: - return vkapi::kInt; - case vkgraph::VkDataType::INT64: - return vkapi::kLong; - case vkgraph::VkDataType::FLOAT16: - return vkapi::kHalf; - case vkgraph::VkDataType::FLOAT32: - return vkapi::kFloat; - case vkgraph::VkDataType::FLOAT64: - return vkapi::kDouble; - } -} - -utils::StorageType get_storage_type( - const vkgraph::VkStorageType& vk_storage_type) { - switch (vk_storage_type) { - case vkgraph::VkStorageType::BUFFER: - return utils::kBuffer; - case vkgraph::VkStorageType::TEXTURE_3D: - return utils::kTexture3D; - case vkgraph::VkStorageType::TEXTURE_2D: - return utils::kTexture2D; - default: - break; - } - VK_THROW("Invalid storage type encountered!"); -} - -utils::GPUMemoryLayout get_memory_layout( - const vkgraph::VkMemoryLayout& vk_memory_layout) { - switch (vk_memory_layout) { - case vkgraph::VkMemoryLayout::TENSOR_WIDTH_PACKED: - return utils::kWidthPacked; - case vkgraph::VkMemoryLayout::TENSOR_HEIGHT_PACKED: - return utils::kHeightPacked; - case vkgraph::VkMemoryLayout::TENSOR_CHANNELS_PACKED: - return utils::kChannelsPacked; - default: - break; - } - VK_THROW("Invalid memory layout encountered!"); -} - -GraphConfig get_graph_config(ArrayRef& compile_specs) { - GraphConfig config = GraphConfig(); - - for (const CompileSpec& spec : compile_specs) { - const uint8_t* value_data = (const uint8_t*)spec.value.buffer; - const size_t value_size = spec.value.nbytes; - if (strcmp(spec.key, "storage_type_override") == 0) { - ET_CHECK_MSG(value_size == sizeof(int32_t), "Unexpected value size!"); - int value_as_int = static_cast(getUInt32LE(value_data)); - utils::StorageType storage_type = - static_cast(value_as_int); - - config.set_storage_type_override(storage_type); - } - if (strcmp(spec.key, "memory_layout_override") == 0) { - ET_CHECK_MSG(value_size == sizeof(uint32_t), "Unexpected value size!"); - uint32_t value_as_int = 
getUInt32LE(value_data); - utils::GPUMemoryLayout memory_layout = - static_cast(value_as_int); - - config.set_memory_layout_override(memory_layout); - } - if (strcmp(spec.key, "require_dynamic_shapes") == 0) { - ET_CHECK_MSG(value_size == sizeof(uint8_t), "Unexpected value size!"); - bool value = getBool(value_data); - - if (value) { - config.expect_dynamic_shapes = true; - } - } - } -#ifdef ET_EVENT_TRACER_ENABLED - config.enable_querypool = true; -#endif // ET_EVENT_TRACER_ENABLED - return config; -} - -class GraphBuilder { - ComputeGraph* compute_graph_; - VkGraphPtr flatbuffer_; - const uint8_t* constant_data_; - const NamedDataMap* named_data_map_; - std::vector loaded_buffers_from_map_; - - std::vector ref_mapping_; - - public: - explicit GraphBuilder( - ComputeGraph* compute_graph, - VkGraphPtr flatbuffer, - const uint8_t* constant_data, - const NamedDataMap* named_data_map) - : compute_graph_(compute_graph), - flatbuffer_(flatbuffer), - constant_data_(constant_data), - named_data_map_(named_data_map), - loaded_buffers_from_map_(), - ref_mapping_() {} - - void resize(uint32_t size) { - ref_mapping_.resize(size, INT32_MAX); - } - - bool fb_id_exists(const uint32_t fb_id) { - return fb_id < ref_mapping_.size() && ref_mapping_[fb_id] != INT32_MAX; - } - - ValueRef get_fb_id_valueref(const uint32_t fb_id) { - ET_CHECK_MSG( - fb_id_exists(fb_id), - "Trying to extract a value that hasn't yet been added to the graph."); - - return ref_mapping_[fb_id]; - } - - void add_tensor_to_graph(const uint32_t fb_id, VkTensorPtr tensor_fb) { - const vkapi::ScalarType& dtype = get_scalar_type(tensor_fb->datatype()); - utils::StorageType storage_type = - tensor_fb->storage_type() == vkgraph::VkStorageType::DEFAULT_STORAGE - ? compute_graph_->suggested_storage_type() - : get_storage_type(tensor_fb->storage_type()); - - UIntVector dims_fb = tensor_fb->dims(); - const std::vector dims_vector(dims_fb->cbegin(), dims_fb->cend()); - - utils::GPUMemoryLayout memory_layout = - tensor_fb->memory_layout() == vkgraph::VkMemoryLayout::DEFAULT_LAYOUT - ? compute_graph_->suggested_memory_layout(dims_vector) - : get_memory_layout(tensor_fb->memory_layout()); - - ValueRef ref; - if (tensor_fb->constant_id() >= 0) { - VkBytesPtr constant_bytes = - flatbuffer_->constants()->Get(tensor_fb->constant_id()); - - if (constant_bytes->named_key() != nullptr && - constant_bytes->offset() == UINT64_MAX && - named_data_map_ != nullptr) { - const std::string& data_name = constant_bytes->named_key()->str(); - Result buffer = - named_data_map_->get_data(data_name.c_str()); - - VK_CHECK_COND( - buffer.ok(), - "Failed to get constant data for key %s from named_data_map. 
Error code: %u", - data_name.c_str(), - static_cast(buffer.error())); - ref = compute_graph_->add_tensorref( - dims_vector, dtype, std::move(buffer.get())); - } else { - const uint8_t* tensor_data = constant_data_ + constant_bytes->offset(); - ref = compute_graph_->add_tensorref(dims_vector, dtype, tensor_data); - } - } else { - ref = compute_graph_->add_tensor( - dims_vector, - dtype, - storage_type, - memory_layout, - tensor_fb->mem_obj_id()); - } - - ref_mapping_[fb_id] = ref; - } - - void add_none_to_graph(const uint32_t fb_id) { - ValueRef ref = compute_graph_->add_none(); - ref_mapping_[fb_id] = ref; - } - - template - typename std::enable_if::value, void>::type - add_scalar_to_graph(const uint32_t fb_id, T value) { - ValueRef ref = compute_graph_->add_scalar(value); - ref_mapping_[fb_id] = ref; - } - - template - typename std::enable_if::value, void>::type - add_scalar_list_to_graph(const uint32_t fb_id, std::vector&& value) { - ValueRef ref = compute_graph_->add_scalar_list(std::move(value)); - ref_mapping_[fb_id] = ref; - } - - void add_value_list_to_graph( - const uint32_t fb_id, - std::vector&& value) { - ValueRef ref = compute_graph_->add_value_list(std::move(value)); - ref_mapping_[fb_id] = ref; - } - - void add_string_to_graph(const uint32_t fb_id, VkValuePtr value) { - const auto fb_str = value->value_as_String()->string_val(); - std::string string(fb_str->cbegin(), fb_str->cend()); - ValueRef ref = compute_graph_->add_string(std::move(string)); - ref_mapping_[fb_id] = ref; - } - - void add_symint_to_graph(const uint32_t fb_id, VkValuePtr value) { - const int32_t fb_symint = value->value_as_SymInt()->value(); - ValueRef ref = compute_graph_->add_symint(fb_symint); - ref_mapping_[fb_id] = ref; - } - - void add_value_to_graph(const uint32_t fb_id, VkValuePtr value) { - ET_CHECK_MSG( - !fb_id_exists(fb_id), - "Trying to add a value that has already been added to the graph."); - - switch (value->value_type()) { - case vkgraph::GraphTypes::Null: - add_none_to_graph(fb_id); - break; - case vkgraph::GraphTypes::Int: - add_scalar_to_graph(fb_id, value->value_as_Int()->int_val()); - break; - case vkgraph::GraphTypes::Double: - add_scalar_to_graph(fb_id, value->value_as_Double()->double_val()); - break; - case vkgraph::GraphTypes::Bool: - add_scalar_to_graph(fb_id, value->value_as_Bool()->bool_val()); - break; - case vkgraph::GraphTypes::VkTensor: - add_tensor_to_graph(fb_id, value->value_as_VkTensor()); - break; - case vkgraph::GraphTypes::IntList: - add_scalar_list_to_graph( - fb_id, - std::vector( - value->value_as_IntList()->items()->cbegin(), - value->value_as_IntList()->items()->cend())); - break; - case vkgraph::GraphTypes::DoubleList: - add_scalar_list_to_graph( - fb_id, - std::vector( - value->value_as_DoubleList()->items()->cbegin(), - value->value_as_DoubleList()->items()->cend())); - break; - case vkgraph::GraphTypes::BoolList: - add_scalar_list_to_graph( - fb_id, - std::vector( - value->value_as_BoolList()->items()->cbegin(), - value->value_as_BoolList()->items()->cend())); - break; - case vkgraph::GraphTypes::ValueList: - add_value_list_to_graph( - fb_id, - std::vector( - value->value_as_ValueList()->items()->cbegin(), - value->value_as_ValueList()->items()->cend())); - break; - case vkgraph::GraphTypes::String: - add_string_to_graph(fb_id, value); - break; - case vkgraph::GraphTypes::SymInt: - add_symint_to_graph(fb_id, value); - break; - default: - ET_CHECK_MSG(false, "Unsupported value type."); - } - } - - void build_graph() { - // Resize the mapping to the number of 
values in the flatbuffer - resize(flatbuffer_->values()->size()); - - // First, add all values to the graph - for (uint32_t fb_id = 0; fb_id < flatbuffer_->values()->size(); ++fb_id) { - VkValuePtr value = flatbuffer_->values()->Get(fb_id); - add_value_to_graph(fb_id, value); - } - - // Parse the inputs, which will be tensors most of the time but can also be - // symints and tensorrefs (which will be the case if the original graph had) - // mutable buffers. - for (const uint32_t fb_id : *flatbuffer_->input_ids()) { - const ValueRef ref = get_fb_id_valueref(fb_id); - if (compute_graph_->val_is_tensor(ref)) { - compute_graph_->set_input_tensor(ref); - } else { - compute_graph_->set_val_as_input(ref); - } - } - - // Parse the operators - for (OpCallPtr op_call : *(flatbuffer_->chain())) { - std::string op_name = op_call->name()->str(); - ET_CHECK_MSG(VK_HAS_OP(op_name), "Missing operator: %s", op_name.c_str()); - - std::vector args; - args.reserve(op_call->args()->size()); - for (const auto arg_fb_id : *op_call->args()) { - args.push_back(get_fb_id_valueref(static_cast(arg_fb_id))); - } - - auto vkFn = VK_GET_OP_FN(op_name); - vkFn(*compute_graph_, args); - } - - // Parse the outputs, which will be mostly tensors but may contain tensorref - // values as well if the source graph returns parameter nodes. - for (const uint32_t fb_id : *flatbuffer_->output_ids()) { - const ValueRef ref = get_fb_id_valueref(fb_id); - compute_graph_->set_output_value(ref); - } - - if (compute_graph_->graphconfig().enable_querypool) { - for (uint32_t i = 0; i < compute_graph_->prepack_nodes().size(); ++i) { - compute_graph_->prepack_nodes()[i]->set_node_id(i); - } - for (uint32_t i = 0; i < compute_graph_->execute_nodes().size(); ++i) { - compute_graph_->execute_nodes()[i]->set_node_id(i); - } - } - } -}; - -// -// Execution tools -// - -bool maybe_resize_input( - ComputeGraph* graph, - const size_t input_i, - executorch::aten::Tensor& et_tensor) { - ValueRef in_tensor_ref = graph->inputs()[input_i].value; - - const std::vector in_tensor_vk_sizes = - graph->sizes_of(in_tensor_ref); - - ET_CHECK_MSG( - et_tensor.dim() == in_tensor_vk_sizes.size(), - "Cannot resize input tensor: old ndim %zu does not match new ndim %zu", - static_cast(in_tensor_vk_sizes.size()), - static_cast(et_tensor.dim())); - - bool should_resize = false; - std::vector new_sizes(et_tensor.dim()); - for (size_t i = 0; i < et_tensor.dim(); i++) { - if (in_tensor_vk_sizes[i] != et_tensor.sizes()[i]) { - should_resize = true; - } - new_sizes.at(i) = et_tensor.sizes()[i]; - } - - if (should_resize) { - graph->resize_input(input_i, new_sizes); - } - - const size_t in_tensor_vk_numel = graph->numel_of(in_tensor_ref); - ET_CHECK_MSG( - in_tensor_vk_numel == et_tensor.numel(), - "Vulkan tensor numel %zu does not match ET tensor numel %zu", - static_cast(in_tensor_vk_numel), - static_cast(et_tensor.numel())); - - return should_resize; -} - -bool maybe_update_scalar_tensor( - ComputeGraph* graph, - const ValueRef ref, - executorch::aten::Tensor& scalar_tensor_src) { - const int32_t cur_val = graph->read_symint(ref); - int32_t scalar_tensor_val = 0; - executorch::aten::ScalarType dtype = scalar_tensor_src.scalar_type(); - if (dtype == executorch::aten::ScalarType::Int) { - scalar_tensor_val = *scalar_tensor_src.const_data_ptr(); - } else if (dtype == executorch::aten::ScalarType::Long) { - scalar_tensor_val = int32_t(*scalar_tensor_src.const_data_ptr()); - } - bool was_updated = false; - if (scalar_tensor_val != cur_val) { - graph->set_symint(ref, 
scalar_tensor_val); - was_updated = true; - } - return was_updated; -} - -void maybe_resize_output( - ComputeGraph* graph, - const size_t output_i, - executorch::aten::Tensor& et_tensor) { - ValueRef out_tensor_ref = graph->outputs()[output_i].value; - - const std::vector out_tensor_vk_sizes = - graph->sizes_of(out_tensor_ref); - - executorch::aten::SizesType new_output_size[kTensorDimensionLimit]; - size_t ndim = out_tensor_vk_sizes.size(); - for (int i = 0; i < ndim; ++i) { - new_output_size[i] = out_tensor_vk_sizes[i]; - } - - executorch::aten::ArrayRef output_size{ - new_output_size, ndim}; - Error err = resize_tensor(et_tensor, output_size); - - ET_CHECK_MSG(err == Error::Ok, "Failed to resize output tensor."); -} - -// -// VulkanBackend class -// - -class VulkanBackend final : public ::executorch::runtime::BackendInterface { - public: - ~VulkanBackend() override = default; - - bool is_available() const override { - // TODO(ssjia): replace with an actual Vulkan runtime availability check - return true; - } - - ET_NODISCARD Error compileModel( - const void* buffer_pointer, - ComputeGraph* compute_graph, - const NamedDataMap* named_data_map) const { - Result header = - VulkanDelegateHeader::parse(buffer_pointer); - - const uint8_t* flatbuffer_data = nullptr; - const uint8_t* constant_data = nullptr; - - if (header.ok()) { - const uint8_t* buffer_start = - reinterpret_cast(buffer_pointer); - flatbuffer_data = buffer_start + header->flatbuffer_offset; - constant_data = buffer_start + header->bytes_offset; - } else { - ET_LOG(Error, "VulkanDelegateHeader may be corrupt"); - return header.error(); - } - - ET_CHECK_OR_RETURN_ERROR( - vkgraph::VkGraphBufferHasIdentifier(flatbuffer_data), - DelegateInvalidCompatibility, - "Vulkan Delegate Serialization Format version identifier '%.4s' != expected '%.4s'", - flatbuffers::GetBufferIdentifier(flatbuffer_data), - vkgraph::VkGraphIdentifier()); - - VkGraphPtr flatbuffer_graph = vkgraph::GetVkGraph(flatbuffer_data); - - GraphBuilder builder( - compute_graph, flatbuffer_graph, constant_data, named_data_map); - - builder.build_graph(); - - compute_graph->prepare(); - compute_graph->prepare_pipelines(); - - compute_graph->prepack(); - - return Error::Ok; - } - - Result init( - BackendInitContext& context, - FreeableBuffer* processed, - ArrayRef compile_specs) const override { - ComputeGraph* compute_graph = - context.get_runtime_allocator()->allocateInstance(); - if (compute_graph == nullptr) { - return Error::MemoryAllocationFailed; - } - - GraphConfig graph_config = get_graph_config(compile_specs); - graph_config.external_adapter = vkapi::set_and_get_external_adapter(); - new (compute_graph) ComputeGraph(graph_config); - - const NamedDataMap* named_data_map = context.get_named_data_map(); - Error err = compileModel(processed->data(), compute_graph, named_data_map); - - // This backend does not need its processed data after compiling the - // model. 
- processed->Free(); - - if (err != Error::Ok) { - return err; - } - - return compute_graph; - } - - Error execute( - ET_UNUSED BackendExecutionContext& context, - DelegateHandle* handle, - Span args) const override { - EXECUTORCH_SCOPE_PROF("VulkanBackend::execute"); - - ComputeGraph* compute_graph = static_cast(handle); - - const size_t num_inputs = compute_graph->inputs().size(); - bool should_propagate_resize = false; - for (size_t i = 0; i < num_inputs; i++) { - const ValueRef iref = compute_graph->inputs()[i].value; - if (compute_graph->val_is_tensor(iref)) { - VK_CHECK_COND(args[i]->isTensor()); - bool was_resized = - maybe_resize_input(compute_graph, i, args[i]->toTensor()); - should_propagate_resize = should_propagate_resize || was_resized; - compute_graph->copy_into_staging( - compute_graph->inputs()[i].staging, - args[i]->toTensor().const_data_ptr(), - args[i]->toTensor().numel()); - } else if (compute_graph->val_is_symint(iref)) { - VK_CHECK_COND( - args[i]->isTensor(), - "Cannot handle symint arg to graph that is not derived from a " - "scalar tensor at the moment."); - bool was_updated = maybe_update_scalar_tensor( - compute_graph, iref, args[i]->toTensor()); - // Since symint inputs may impact tensor's sizes, trigger a resize if - // any symbolic integer shapes are updated. - should_propagate_resize = should_propagate_resize || was_updated; - } else { - VK_THROW( - "Could not handle input with type ", - compute_graph->get_val_type(iref)); - } - } - - if (should_propagate_resize) { - compute_graph->propagate_resize(); - } - - compute_graph->execute(); - - for (size_t i = 0; i < compute_graph->outputs().size(); i++) { - const size_t o = i + num_inputs; - const ValueRef oref = compute_graph->outputs()[i].value; - if (compute_graph->val_is_tensor(oref)) { - VK_CHECK_COND(args[o]->isTensor()); - maybe_resize_output(compute_graph, i, args[o]->toTensor()); - // args holds inputs directly followed by outputs, so the i'th output - // for compute_graph corresponds to the o'th arg - compute_graph->copy_from_staging( - compute_graph->outputs()[i].staging, - args[o]->toTensor().mutable_data_ptr(), - args[o]->toTensor().numel()); - } - // TensorRef values represent constant tensors which will not have been - // modified by the graph execution. Therefore, if a constant tensor is - // returned as an output, no action is required. - else if (compute_graph->val_is_tref(oref)) { - continue; - } else { - VK_THROW( - "Could not handle output with type ", - compute_graph->get_val_type(oref)); - } - } - -#ifdef ET_EVENT_TRACER_ENABLED - runtime::EventTracer* event_tracer = context.event_tracer(); - compute_graph->context()->querypool().extract_results(); - for (const auto& r : - compute_graph->context()->querypool().get_shader_timestamp_data()) { - std::string event_name = - r.kernel_name + "_" + std::to_string(r.dispatch_id); - event_tracer_log_profiling_delegate( - event_tracer, - event_name.c_str(), - /* delegate_debug_id = */ -1, - r.start_time_ns, - r.end_time_ns, - (void*)(&r.metadata), - sizeof(r.metadata)); - } -#endif // ET_EVENT_TRACER_ENABLED - - return Error::Ok; - } - - void destroy(DelegateHandle* handle) const override { - if (handle != nullptr) { - ComputeGraph* compute_graph = static_cast(handle); - compute_graph->context() - ->adapter_ptr() - ->compute_pipeline_cache() - .save_cache(); - // ComputeGraph is not trivially destructible. Since - // this was constructed manually in init(), we must destroy it manually - // here. 
- compute_graph->~ComputeGraph(); - } - } -}; - -auto cls = VulkanBackend(); -Backend backend{"VulkanBackend", &cls}; -static auto success_with_compiler = register_backend(backend); - -} // namespace -} // namespace vulkan -} // namespace backends -} // namespace executorch diff --git a/backends/vulkan/runtime/VulkanDelegateHeader.cpp b/backends/vulkan/runtime/VulkanDelegateHeader.cpp deleted file mode 100644 index 2a235144342..00000000000 --- a/backends/vulkan/runtime/VulkanDelegateHeader.cpp +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include - -#pragma clang diagnostic ignored "-Wdeprecated" - -namespace executorch { -namespace backends { -namespace vulkan { - -using executorch::runtime::Error; -using executorch::runtime::Result; - -namespace { - -struct ByteSlice { - size_t offset; - size_t size; -}; - -constexpr size_t kExpectedSize = 30; -constexpr char kExpectedMagic[4] = {'V', 'H', '0', '0'}; - -constexpr ByteSlice kMagic = {4, 4}; -constexpr ByteSlice kHeaderSize = {8, 2}; -constexpr ByteSlice kFlatbufferOffset = {10, 4}; -constexpr ByteSlice kFlatbufferSize = {14, 4}; -constexpr ByteSlice kBytesOffset = {18, 4}; -constexpr ByteSlice kBytesSize = {22, 8}; - -} // namespace - -/// Interprets the 8 bytes at `data` as a little-endian uint64_t. -uint64_t getUInt64LE(const uint8_t* data) { - return (uint64_t)data[0] | ((uint64_t)data[1] << 8) | - ((uint64_t)data[2] << 16) | ((uint64_t)data[3] << 24) | - ((uint64_t)data[4] << 32) | ((uint64_t)data[5] << 40) | - ((uint64_t)data[6] << 48) | ((uint64_t)data[7] << 56); -} - -/// Interprets the 4 bytes at `data` as a little-endian uint32_t. -uint32_t getUInt32LE(const uint8_t* data) { - return (uint32_t)data[0] | ((uint32_t)data[1] << 8) | - ((uint32_t)data[2] << 16) | ((uint32_t)data[3] << 24); -} - -/// Interprets the 2 bytes at `data` as a little-endian uint32_t. 
-uint32_t getUInt16LE(const uint8_t* data) { - return (uint32_t)data[0] | ((uint32_t)data[1] << 8); -} - -bool getBool(const uint8_t* data) { - return data[0] != 0; -} - -bool VulkanDelegateHeader::is_valid() const { - if (header_size < kExpectedSize) { - return false; - } - if (flatbuffer_offset < header_size) { - return false; - } - if (flatbuffer_size == 0) { - return false; - } - if (bytes_offset < flatbuffer_offset + flatbuffer_size) { - return false; - } - if (bytes_size < 0) { - return false; - } - - return true; -} - -Result VulkanDelegateHeader::parse(const void* data) { - const uint8_t* header_data = (const uint8_t*)data; - - const uint8_t* magic_start = header_data + kMagic.offset; - if (std::memcmp(magic_start, kExpectedMagic, kMagic.size) != 0) { - return Error::NotFound; - } - - VulkanDelegateHeader header = VulkanDelegateHeader{ - getUInt16LE(header_data + kHeaderSize.offset), - getUInt32LE(header_data + kFlatbufferOffset.offset), - getUInt32LE(header_data + kFlatbufferSize.offset), - getUInt32LE(header_data + kBytesOffset.offset), - getUInt64LE(header_data + kBytesSize.offset), - }; - - if (!header.is_valid()) { - return Error::InvalidArgument; - } - - return header; -} - -} // namespace vulkan -} // namespace backends -} // namespace executorch diff --git a/backends/vulkan/runtime/VulkanDelegateHeader.h b/backends/vulkan/runtime/VulkanDelegateHeader.h deleted file mode 100644 index 722f01cbb75..00000000000 --- a/backends/vulkan/runtime/VulkanDelegateHeader.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace executorch { -namespace backends { -namespace vulkan { - -// Byte decoding utilities -uint64_t getUInt64LE(const uint8_t* data); -uint32_t getUInt32LE(const uint8_t* data); -uint32_t getUInt16LE(const uint8_t* data); - -// Bool is serialized as a single byte -bool getBool(const uint8_t* data); - -struct VulkanDelegateHeader { - bool is_valid() const; - - static executorch::runtime::Result parse( - const void* data); - - uint32_t header_size; - uint32_t flatbuffer_offset; - uint32_t flatbuffer_size; - uint32_t bytes_offset; - uint64_t bytes_size; -}; - -} // namespace vulkan -} // namespace backends -} // namespace executorch diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp deleted file mode 100644 index 8599cbfffb6..00000000000 --- a/backends/vulkan/runtime/api/Context.cpp +++ /dev/null @@ -1,498 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
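To make the delegate header layout above easier to inspect: every field is little-endian and sits at a fixed offset (the `'VH00'` magic at byte 4, a 2-byte header size at 8, 4-byte flatbuffer offset and size at 10 and 14, a 4-byte constant-data offset at 18, and an 8-byte constant-data size at 22, giving the expected 30-byte header). Below is a small Python sketch that decodes the same layout from a delegate blob; it is offered only as an illustration of the offsets, not as an API that exists in this repository.

```python
import struct


def parse_vulkan_delegate_header(blob: bytes) -> dict:
    # Offsets mirror the ByteSlice constants in VulkanDelegateHeader.cpp.
    if blob[4:8] != b"VH00":
        raise ValueError("missing 'VH00' magic at offset 4")
    (header_size,) = struct.unpack_from("<H", blob, 8)
    flatbuffer_offset, flatbuffer_size = struct.unpack_from("<II", blob, 10)
    (bytes_offset,) = struct.unpack_from("<I", blob, 18)
    (bytes_size,) = struct.unpack_from("<Q", blob, 22)
    return {
        "header_size": header_size,  # is_valid() requires >= 30
        "flatbuffer_offset": flatbuffer_offset,
        "flatbuffer_size": flatbuffer_size,
        "bytes_offset": bytes_offset,
        "bytes_size": bytes_size,
    }
```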
- */ - -#include - -#ifdef VULKAN_DEBUG -#include -#include -#endif // VULKAN_DEBUG - -#ifndef VULKAN_DESCRIPTOR_POOL_SIZE -#define VULKAN_DESCRIPTOR_POOL_SIZE 1024u -#endif - -#ifndef VULKAN_QUERY_POOL_SIZE -#define VULKAN_QUERY_POOL_SIZE 4096u -#endif - -namespace vkcompute { -namespace api { - -Context::Context(vkapi::Adapter* adapter, const ContextConfig& config) - : config_(config), - // Important handles - adapter_p_(adapter), - device_(adapter_p_->device_handle()), - queue_(adapter_p_->request_queue()), - // Resource pools - command_pool_(device_, queue_.family_index, config_.cmd_pool_config), - descriptor_pool_(device_, config_.descriptor_pool_config), - fences_(device_), - // Profiling - querypool_(config_.query_pool_config, nullptr), - // Command buffer submission - cmd_mutex_{}, - cmd_(VK_NULL_HANDLE, 0u), - submit_count_{0u}, - // Memory Management - buffer_clearlist_mutex_{}, - buffers_to_clear_{}, - image_clearlist_mutex_{}, - images_to_clear_{}, - preferred_image_tiling_{VK_IMAGE_TILING_OPTIMAL} { - if (adapter_p_->linear_tiling_3d_enabled()) { - preferred_image_tiling_ = VK_IMAGE_TILING_LINEAR; - } -} - -Context::~Context() { - try { - flush(); - // Let the device know the context is done with the queue - adapter_p_->return_queue(queue_); - } catch (...) { - } -} - -void Context::initialize_querypool() { - querypool_.initialize(adapter_p_); -} - -void Context::cmd_reset_querypool() { - if (querypool_) { - set_cmd(); - querypool_.reset_querypool(cmd_); - } -} - -void Context::report_shader_dispatch_start( - const std::string& shader_name, - const utils::uvec3& global_wg_size, - const utils::WorkgroupSize& local_wg_size, - const uint32_t dispatch_id) { - if (querypool_) { - querypool_.shader_profile_begin( - cmd_, - dispatch_id, - shader_name, - vkapi::create_extent3d(global_wg_size), - vkapi::create_extent3d((utils::uvec3)local_wg_size)); - } -} - -void Context::report_shader_dispatch_end() { - if (querypool_) { - querypool_.shader_profile_end(cmd_); - } -} - -void Context::check_device_capabilities(const vkapi::ShaderInfo& shader) { - if (shader.requires_shader_int16) { - if (!adapter_p_->supports_int16_shader_types()) { - throw vkapi::ShaderNotSupportedError( - shader.kernel_name, vkapi::VulkanExtension::SHADER_INT16); - } - } - if (shader.requires_16bit_storage) { - if (!adapter_p_->supports_16bit_storage_buffers()) { - throw vkapi::ShaderNotSupportedError( - shader.kernel_name, vkapi::VulkanExtension::INT16_STORAGE); - } - } - if (shader.requires_8bit_storage) { - if (!adapter_p_->supports_8bit_storage_buffers()) { - throw vkapi::ShaderNotSupportedError( - shader.kernel_name, vkapi::VulkanExtension::INT8_STORAGE); - } - } - if (shader.requires_integer_dot_product) { - if (!adapter_p_->supports_int8_dot_product()) { - throw vkapi::ShaderNotSupportedError( - shader.kernel_name, vkapi::VulkanExtension::INTEGER_DOT_PRODUCT); - } - } -} - -vkapi::DescriptorSet Context::get_descriptor_set( - const vkapi::ShaderInfo& shader_descriptor, - const utils::WorkgroupSize& local_workgroup_size, - const vkapi::SpecVarList& additional_constants, - const uint32_t push_constants_size) { - VkDescriptorSetLayout shader_layout = - shader_layout_cache().retrieve(shader_descriptor.kernel_layout); - - VkPipelineLayout pipeline_layout = - pipeline_layout_cache().retrieve(shader_layout, push_constants_size); - - vkapi::SpecVarList spec_constants = { - SV(local_workgroup_size[0u]), - SV(local_workgroup_size[1u]), - SV(local_workgroup_size[2u])}; - - spec_constants.append(additional_constants); - - 
VkPipeline pipeline = pipeline_cache().retrieve( - {pipeline_layout_cache().retrieve(shader_layout, push_constants_size), - shader_cache().retrieve(shader_descriptor), - spec_constants}); - - cmd_.bind_pipeline(pipeline, pipeline_layout, local_workgroup_size); - - return descriptor_pool().get_descriptor_set( - shader_layout, shader_descriptor.kernel_layout); -} - -void Context::register_shader_dispatch( - const vkapi::DescriptorSet& descriptors, - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::ShaderInfo& shader_descriptor, - const utils::uvec3& global_workgroup_size, - const void* push_constants_data, - const uint32_t push_constants_size) { - // Adjust the global workgroup size based on the output tile size - uint32_t global_wg_w = utils::div_up( - global_workgroup_size[0u], shader_descriptor.out_tile_size[0u]); - uint32_t global_wg_h = utils::div_up( - global_workgroup_size[1u], shader_descriptor.out_tile_size[1u]); - uint32_t global_wg_d = utils::div_up( - global_workgroup_size[2u], shader_descriptor.out_tile_size[2u]); - - // Submitting a global work group size of 0 is undefined behaviour. If this is - // detected then submit a single workgroup instead. - if (global_wg_w == 0u || global_wg_h == 0u || global_wg_d == 0u) { - global_wg_w = 1u; - global_wg_h = 1u; - global_wg_d = 1u; - } - - const utils::uvec3 effective_global_wg = { - global_wg_w, - global_wg_h, - global_wg_d, - }; - - cmd_.bind_descriptors(descriptors.get_bind_handle()); - cmd_.insert_barrier(pipeline_barrier); - - if (push_constants_size > 0 && push_constants_data != nullptr) { - const VkDescriptorSetLayout shader_layout = - shader_layout_cache().retrieve(shader_descriptor.kernel_layout); - const VkPipelineLayout pipeline_layout = - pipeline_layout_cache().retrieve(shader_layout, push_constants_size); - cmd_.set_push_constants( - pipeline_layout, push_constants_data, push_constants_size); - } - - cmd_.dispatch(effective_global_wg); -} - -void Context::register_blit( - vkapi::PipelineBarrier& pipeline_barrier, - vkapi::VulkanImage& src, - vkapi::VulkanImage& dst) { - cmd_.insert_barrier(pipeline_barrier); - cmd_.blit(src, dst); -} - -void Context::submit_cmd_to_gpu(VkFence fence_handle, const bool final_use) { - if (cmd_) { - cmd_.end(); - adapter_p_->submit_cmd( - queue_, - cmd_.get_submit_handle(final_use), - fence_handle, - VK_NULL_HANDLE, - VK_NULL_HANDLE); - - submit_count_ = 0u; - } -} - -void Context::flush() { - VK_CHECK(vkQueueWaitIdle(queue().handle)); - - command_pool_.flush(); - descriptor_pool_.flush(); - - // If there is an existing command buffer, invalidate it - if (cmd_) { - cmd_.invalidate(); - } - - std::lock_guard bufferlist_lock(buffer_clearlist_mutex_); - std::lock_guard imagelist_lock(image_clearlist_mutex_); - buffers_to_clear_.clear(); - images_to_clear_.clear(); -} - -bool available() { - return context(); -} - -Context* context() { - static const std::unique_ptr context([]() -> Context* { - try { - const uint32_t cmd_submit_frequency = 16u; - - const vkapi::CommandPoolConfig cmd_config{ - 32u, // cmdPoolInitialSize - 8u, // cmdPoolBatchSize - }; - - const vkapi::DescriptorPoolConfig descriptor_pool_config{ - VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorPoolMaxSets - VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorUniformBufferCount - VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorStorageBufferCount - VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorCombinedSamplerCount - VULKAN_DESCRIPTOR_POOL_SIZE, // descriptorStorageImageCount - 32u, // descriptorPileSizes - }; - - const vkapi::QueryPoolConfig 
query_pool_config{ - VULKAN_QUERY_POOL_SIZE, // maxQueryCount - 256u, // initialReserveSize - }; - - const ContextConfig config{ - cmd_submit_frequency, - cmd_config, - descriptor_pool_config, - query_pool_config, - }; - - return new Context(vkapi::runtime()->get_adapter_p(), config); - } catch (...) { - } - - return nullptr; - }()); - - return context.get(); -} - -#ifdef VULKAN_DEBUG - -#ifdef VK_KHR_pipeline_executable_properties - -VkPipeline Context::get_shader_pipeline( - const vkapi::ShaderInfo& shader, - const vkapi::SpecVarList& additional_constants) { - const uint32_t push_constants_size = 128u; - - VkDescriptorSetLayout shader_layout = - shader_layout_cache().retrieve(shader.kernel_layout); - VkPipelineLayout pipeline_layout = - pipeline_layout_cache().retrieve(shader_layout, push_constants_size); - - const utils::WorkgroupSize local_workgroup_size(4u, 4u, 1u); - vkapi::SpecVarList spec_constants = { - SV(local_workgroup_size[0u]), - SV(local_workgroup_size[1u]), - SV(local_workgroup_size[2u])}; - - spec_constants.append(additional_constants); - - VkPipeline pipeline = pipeline_cache().retrieve( - {pipeline_layout, shader_cache().retrieve(shader), spec_constants}); - - return pipeline; -} - -std::vector -Context::get_pipeline_executable_props(const VkPipeline pipeline) { - VkPipelineInfoKHR pipeline_info{ - VK_STRUCTURE_TYPE_PIPELINE_INFO_KHR, - nullptr, - pipeline, - }; - - uint32_t shader_props_count = 0u; - vkGetPipelineExecutablePropertiesKHR( - device(), &pipeline_info, &shader_props_count, nullptr); - - std::vector pipeline_props( - shader_props_count); - for (int i = 0; i < shader_props_count; i++) { - pipeline_props.at(i).sType = - VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_PROPERTIES_KHR; - pipeline_props.at(i).pNext = nullptr; - } - vkGetPipelineExecutablePropertiesKHR( - device(), &pipeline_info, &shader_props_count, pipeline_props.data()); - - return pipeline_props; -} - -std::tuple< - std::vector, - std::vector>> -Context::get_shader_executable_irs( - const VkPipeline pipeline, - const uint32_t pipeline_exec_idx) { - VkPipelineExecutableInfoKHR exec_info{ - VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INFO_KHR, - nullptr, - pipeline, - pipeline_exec_idx, - }; - - uint32_t ir_count; - VK_CHECK(vkGetPipelineExecutableInternalRepresentationsKHR( - device(), &exec_info, &ir_count, nullptr)); - - std::vector irs(ir_count); - for (int i = 0; i < ir_count; i++) { - irs.at(i).sType = - VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INTERNAL_REPRESENTATION_KHR; - irs.at(i).pNext = nullptr; - irs.at(i).pData = nullptr; - } - VK_CHECK(vkGetPipelineExecutableInternalRepresentationsKHR( - device(), &exec_info, &ir_count, irs.data())); - - std::vector> irs_data(ir_count); - for (int i = 0; i < ir_count; i++) { - irs_data.at(i).resize(irs.at(i).dataSize); - irs.at(i).pData = irs_data.at(i).data(); - } - VK_CHECK(vkGetPipelineExecutableInternalRepresentationsKHR( - device(), &exec_info, &ir_count, irs.data())); - - return std::make_tuple(irs, irs_data); -} - -std::vector -Context::get_shader_executable_stats( - const VkPipeline pipeline, - const uint32_t pipeline_exec_idx) { - VkPipelineExecutableInfoKHR exec_info{ - VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INFO_KHR, - nullptr, - pipeline, - pipeline_exec_idx, - }; - - uint32_t stats_count; - VK_CHECK(vkGetPipelineExecutableStatisticsKHR( - device(), &exec_info, &stats_count, NULL)); - - std::vector shader_stats(stats_count); - for (int i = 0; i < stats_count; i++) { - shader_stats.at(i).sType = - VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_STATISTIC_KHR; - 
shader_stats.at(i).pNext = nullptr; - } - vkGetPipelineExecutableStatisticsKHR( - device(), &exec_info, &stats_count, shader_stats.data()); - - return shader_stats; -} - -std::ostream& operator<<( - std::ostream& os, - const VkPipelineExecutablePropertiesKHR& props) { - os << std::left << std::setw(10) << "name: " << props.name << std::endl; - os << std::left << std::setw(10) << "descr: " << props.description - << std::endl; - os << std::left << std::setw(10) << "subgroup: " << props.subgroupSize - << std::endl; - - return os; -} - -std::ostream& operator<<( - std::ostream& os, - const VkPipelineExecutableInternalRepresentationKHR& ir) { - os << std::left << std::setw(10) << "descr: " << ir.description << std::endl; - os << std::left << std::setw(10) << "isText: " << ir.isText << std::endl; - os << std::left << std::setw(10) << "size: " << ir.dataSize << std::endl; - if (ir.isText) { - os << "text:" << std::endl; - char* str = (char*)ir.pData; - os << str << std::endl; - } - return os; -} - -std::ostream& operator<<( - std::ostream& os, - VkPipelineExecutableStatisticKHR& stat) { - os << stat.name << ": "; - switch (stat.format) { - case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_BOOL32_KHR: - os << (stat.value.b32 ? "true" : "false") << std::endl; - break; - case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_INT64_KHR: - os << stat.value.i64 << std::endl; - break; - case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR: - os << stat.value.u64 << std::endl; - break; - case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_FLOAT64_KHR: - os << stat.value.f64 << std::endl; - break; - default: - break; - } - os << " " << stat.description << std::endl; - return os; -} - -std::ostream& operator<<( - std::ostream& os, - std::vector& shader_stats) { - for (int i = 0; i < shader_stats.size(); i++) { - VkPipelineExecutableStatisticKHR& stat = shader_stats.at(i); - os << stat; - } - return os; -} - -void Context::print_shader_executable_properties( - const vkapi::ShaderInfo& shader, - const vkapi::SpecVarList& spec_constants) { - VkPipeline pipeline = get_shader_pipeline(shader, spec_constants); - - std::vector pipeline_props_list = - get_pipeline_executable_props(pipeline); - - VK_CHECK_COND(pipeline_props_list.size() == 1u); - - std::cout << pipeline_props_list.at(0) << std::endl; - - std::tuple< - std::vector, - std::vector>> - irs_and_irs_data = get_shader_executable_irs(pipeline, 0u); - - std::vector& irs = - std::get<0>(irs_and_irs_data); - - std::cout << "Found " << irs.size() << " IRs" << std::endl << std::endl; - for (int i = 0; i < irs.size(); i++) { - std::cout << "====== IR " << i << ": " << irs.at(i).name - << " ======" << std::endl; - std::cout << irs.at(i) << std::endl; - } - - std::vector shader_stats = - get_shader_executable_stats(pipeline, 0u); - std::cout << "Found " << shader_stats.size() << " Statistics" << std::endl; - if (shader_stats.size() > 0) { - std::cout << "====== Statistics: ======" << std::endl; - std::cout << shader_stats << std::endl; - } -} - -#endif // VK_KHR_pipeline_executable_properties - -#endif // VULKAN_DEBUG - -} // namespace api -} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/Context.h b/backends/vulkan/runtime/api/Context.h deleted file mode 100644 index 9c7301b9971..00000000000 --- a/backends/vulkan/runtime/api/Context.h +++ /dev/null @@ -1,404 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -namespace vkcompute { -namespace api { - -struct ContextConfig final { - uint32_t cmd_submit_frequency; - vkapi::CommandPoolConfig cmd_pool_config; - vkapi::DescriptorPoolConfig descriptor_pool_config; - vkapi::QueryPoolConfig query_pool_config; -}; - -// -// Vulkan Context holds onto all relevant Vulkan state as it pertains to our -// use of Vulkan in PyTorch. A Context is associated with one, and only one, -// Adapter as a precursor to multi-GPU support. All Vulkan tensors in PyTorch -// are associated with a Context to make tensor <-> device affinity explicit. -// The context is currently a global object, but technically it does not need -// to be if we were to make it explicit to the user. -// - -class Context final { - public: - explicit Context(vkapi::Adapter*, const ContextConfig&); - - Context(const Context&) = delete; - Context& operator=(const Context&) = delete; - - Context(Context&&) = delete; - Context& operator=(Context&&) = delete; - - ~Context(); - - private: - // Config - ContextConfig config_; - // Important handles - vkapi::Adapter* adapter_p_; - VkDevice device_; - vkapi::Adapter::Queue queue_; - // Resource Pools - vkapi::CommandPool command_pool_; - vkapi::DescriptorPool descriptor_pool_; - vkapi::FencePool fences_; - // Diagnostics - vkapi::QueryPool querypool_; - // Command buffers submission - std::mutex cmd_mutex_; - vkapi::CommandBuffer cmd_; - uint32_t submit_count_; - // Memory Management - std::mutex buffer_clearlist_mutex_; - std::vector buffers_to_clear_; - std::mutex image_clearlist_mutex_; - std::vector images_to_clear_; - // Misc - VkImageTiling preferred_image_tiling_; - - public: - // Adapter access - - inline vkapi::Adapter* adapter_ptr() { - return adapter_p_; - } - - inline VkDevice device() { - return device_; - } - - inline vkapi::Adapter::Queue& queue() { - return queue_; - } - - // Device Caches - - inline vkapi::ShaderLayoutCache& shader_layout_cache() { - return adapter_ptr()->shader_layout_cache(); - } - - inline vkapi::ShaderCache& shader_cache() { - return adapter_ptr()->shader_cache(); - } - - inline vkapi::PipelineLayoutCache& pipeline_layout_cache() { - return adapter_ptr()->pipeline_layout_cache(); - } - - inline vkapi::ComputePipelineCache& pipeline_cache() { - return adapter_ptr()->compute_pipeline_cache(); - } - - // Resource Pools - - inline vkapi::DescriptorPool& descriptor_pool() { - return descriptor_pool_; - } - - inline vkapi::FencePool& fences() { - return fences_; - } - - // Diagnostics - - inline vkapi::QueryPool& querypool() { - return querypool_; - } - - inline VkImageTiling preferred_image_tiling() { - return preferred_image_tiling_; - } - - /* - * By default, the querypool attached to a Context instance is uninitialized. - * This function triggers the querypool to be created via vkCreateQueryPool. - */ - void initialize_querypool(); - - /* - * Encodes a vkResetQueryPool command to the current command buffer, and reset - * the internal state of the querypool. If the querypool is not initialized - * this function is a no-op. - */ - void cmd_reset_querypool(); - - /* - * Encodes a vkCmdWriteTimestamp command to the current command buffer and - * record some metadata about the shader that will be dispatched. 
If the - * querypool is not initialized this function is a no-op. - */ - void report_shader_dispatch_start( - const std::string& shader_name, - const utils::uvec3& global_wg_size, - const utils::WorkgroupSize& local_wg_size, - const uint32_t dispatch_id = UINT32_MAX); - - /* - * Encodes a vkCmdWriteTimstamp command to the current command buffer to - * record when the last shader that was dispatched has completed execution. - * If the querypool is not initialized this function is a no-op. - */ - void report_shader_dispatch_end(); - - // Memory Management - - void register_buffer_cleanup(vkapi::VulkanBuffer& buffer) { - std::lock_guard bufferlist_lock(buffer_clearlist_mutex_); - buffers_to_clear_.emplace_back(std::move(buffer)); - } - - void register_image_cleanup(vkapi::VulkanImage& image) { - std::lock_guard imagelist_lock(image_clearlist_mutex_); - images_to_clear_.emplace_back(std::move(image)); - } - - // GPU RPC - - inline std::unique_lock dispatch_lock() { - return std::unique_lock(cmd_mutex_); - } - - inline void set_cmd(bool reusable = false) { - if (!cmd_) { - cmd_ = command_pool_.get_new_cmd(reusable); - cmd_.begin(); - } - } - - void check_device_capabilities(const vkapi::ShaderInfo& shader); - - vkapi::DescriptorSet get_descriptor_set( - const vkapi::ShaderInfo&, - const utils::WorkgroupSize&, - const vkapi::SpecVarList&, - const uint32_t push_constants_size); - - inline vkapi::DescriptorSet get_descriptor_set( - const vkapi::ShaderInfo& shader_descriptor, - const utils::WorkgroupSize& local_work_group_size) { - return get_descriptor_set(shader_descriptor, local_work_group_size, {}, 0u); - } - - void register_shader_dispatch( - const vkapi::DescriptorSet&, - vkapi::PipelineBarrier&, - const vkapi::ShaderInfo&, - const utils::uvec3&, - const void* = nullptr, - const uint32_t = 0); - - void register_blit( - vkapi::PipelineBarrier&, - vkapi::VulkanImage& src, - vkapi::VulkanImage& dst); - - template - bool submit_compute_job( - const vkapi::ShaderInfo&, - vkapi::PipelineBarrier&, - const utils::uvec3&, - const utils::uvec3&, - const vkapi::SpecVarList&, - VkFence fence_handle, - const uint32_t dispatch_id, - Arguments&&...); - - void submit_cmd_to_gpu( - VkFence fence_handle = VK_NULL_HANDLE, - const bool final_use = false); - - vkapi::CommandBuffer& extract_cmd() { - return cmd_; - } - - void flush(); - -#ifdef VULKAN_DEBUG - -#ifdef VK_KHR_pipeline_executable_properties - - VkPipeline get_shader_pipeline( - const vkapi::ShaderInfo& shader, - const vkapi::SpecVarList& spec_constants); - - std::vector get_pipeline_executable_props( - const VkPipeline pipeline); - - std::tuple< - std::vector, - std::vector>> - get_shader_executable_irs( - const VkPipeline pipeline, - const uint32_t pipeline_exec_idx = 0u); - - std::vector get_shader_executable_stats( - const VkPipeline pipeline, - const uint32_t pipeline_exec_idx = 0u); - - void print_shader_executable_properties( - const vkapi::ShaderInfo& shader, - const vkapi::SpecVarList& spec_constants); - -#endif // VK_KHR_pipeline_executable_properties - -#endif // VULKAN_DEBUG -}; - -bool available(); - -// The global runtime is retrieved using this function, where it is declared as -// a static local variable. 
-Context* context(); - -namespace detail { - -inline void arg_is_empty( - bool& any_is_empty, - const vkapi::VulkanBuffer& buffer) { - // bool(buffer) will evaluate to false if no memory has been allocated - any_is_empty = any_is_empty || !buffer; -} - -inline void arg_is_empty(bool& any_is_empty, const vkapi::VulkanImage& image) { - // bool(image) will evaluate to false if no memory has been allocated - any_is_empty = any_is_empty || !image; -} - -inline void arg_is_empty( - bool& any_is_empty, - const vkapi::BufferBindInfo& bind_info) { - any_is_empty = any_is_empty || (bind_info.handle == VK_NULL_HANDLE); -} - -/* - Reports if any VulkanBuffer or VulkanImage argument in a variadic argument - list does not have any memory associated with it. - */ -template -inline bool any_arg_is_empty(Arguments&&... arguments) { - bool any_is_empty = false; - VK_UNUSED const int _[]{ - 0, - (arg_is_empty(any_is_empty, std::forward(arguments)), 0)..., - }; - - return any_is_empty; -} - -template -inline void bind( - vkapi::DescriptorSet& descriptor_set, - const std::index_sequence&, - Arguments&&... arguments) { - VK_UNUSED const int _[]{ - 0, - (descriptor_set.bind(Indices, std::forward(arguments)), 0)..., - }; -} - -} // namespace detail - -/* - Records a compute shader dispatch into the current command buffer. If the - number of submit_*_job calls exceeds the configured frequency, or if a fence - is provided, then the command buffer is submitted to the GPU for execution. - Returns a bool indicating whether or not the function call resulted in a GPU - queue submission. - */ -template -inline bool Context::submit_compute_job( - const vkapi::ShaderInfo& shader, - vkapi::PipelineBarrier& pipeline_barrier, - const utils::uvec3& global_work_group, - const utils::uvec3& local_work_group_size, - const vkapi::SpecVarList& specialization_constants, - VkFence fence_handle, - const uint32_t dispatch_id, - Arguments&&... arguments) { - // If any of the provided arguments does not have memory associated with it, - // then exit early as there is no work to be done. However, if a fence has - // been passed the command buffer is not empty, then the current command - // buffer must still be submitted so that the fence can be signaled. - if (detail::any_arg_is_empty(arguments...)) { - if (fence_handle != VK_NULL_HANDLE && submit_count_ > 0) { - submit_cmd_to_gpu(fence_handle); - return true; - } - return false; - } - - // Serialize recording to the shared command buffer. Do not initialize with a - // mutex just yet, since in some cases it will be externally managed. - std::unique_lock cmd_lock; - // If a fence was passed, then assume that the host intends to sync with - // the GPU, implying there will be imminent calls to fence.wait() and flush(). - // We therefore assume the mutex is externally managed in this case, and the - // calling thread has already locked the mutex prior to calling the function, - // and will release the mutex manually after calling flush(). This will - // prevent more dispatches from being recorded until we have flushed the - // Context. - if (fence_handle == VK_NULL_HANDLE) { - cmd_lock = std::unique_lock(cmd_mutex_); - } - - set_cmd(); - - report_shader_dispatch_start( - shader.kernel_name, - global_work_group, - utils::WorkgroupSize(local_work_group_size), - dispatch_id); - - // Factor out template parameter independent code to minimize code bloat. - // Note that push constants are not exposed yet via this API, therefore the - // push constants size is assumed to be 0. 
- vkapi::DescriptorSet descriptor_set = get_descriptor_set( - shader, - utils::WorkgroupSize(local_work_group_size), - specialization_constants, - 0u); - - detail::bind( - descriptor_set, - std::index_sequence_for{}, - std::forward(arguments)...); - - // Factor out template parameter independent code to minimize code bloat. - register_shader_dispatch( - descriptor_set, pipeline_barrier, shader, global_work_group); - - report_shader_dispatch_end(); - - submit_count_++; - if (fence_handle != VK_NULL_HANDLE || - submit_count_ >= config_.cmd_submit_frequency) { - submit_cmd_to_gpu(fence_handle); - return true; - } - - return false; -} - -} // namespace api -} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/ShaderRegistry.cpp b/backends/vulkan/runtime/api/ShaderRegistry.cpp deleted file mode 100644 index f828e561a25..00000000000 --- a/backends/vulkan/runtime/api/ShaderRegistry.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace vkcompute { -namespace api { - -bool ShaderRegistry::has_shader(const std::string& shader_name) { - const ShaderListing::const_iterator it = listings_.find(shader_name); - return it != listings_.end(); -} - -bool ShaderRegistry::has_dispatch(const std::string& op_name) { - const Registry::const_iterator it = registry_.find(op_name); - return it != registry_.end(); -} - -void ShaderRegistry::register_shader(vkapi::ShaderInfo&& shader_info) { - if (has_shader(shader_info.kernel_name)) { - VK_THROW( - "Shader with name ", shader_info.kernel_name, "already registered"); - } - listings_.emplace(shader_info.kernel_name, shader_info); -} - -void ShaderRegistry::register_op_dispatch( - const std::string& op_name, - const DispatchKey key, - const std::string& shader_name) { - if (!has_dispatch(op_name)) { - registry_.emplace(op_name, Dispatcher()); - } - const Dispatcher::const_iterator it = registry_[op_name].find(key); - if (it != registry_[op_name].end()) { - registry_[op_name][key] = shader_name; - } else { - registry_[op_name].emplace(key, shader_name); - } -} - -const vkapi::ShaderInfo& ShaderRegistry::get_shader_info( - const std::string& shader_name) { - const ShaderListing::const_iterator it = listings_.find(shader_name); - - VK_CHECK_COND( - it != listings_.end(), - "Could not find ShaderInfo with name ", - shader_name); - - return it->second; -} - -ShaderRegistry& shader_registry() { - static ShaderRegistry registry; - return registry; -} - -} // namespace api -} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/ShaderRegistry.h b/backends/vulkan/runtime/api/ShaderRegistry.h deleted file mode 100644 index f40e247c1b8..00000000000 --- a/backends/vulkan/runtime/api/ShaderRegistry.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
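The op dispatch registry above amounts to a two-level map: op name to a per-device-family table keyed by `DispatchKey`, with the shader name as the value and `CATCHALL` as the fallback. A self-contained sketch of that lookup, with made-up op and shader names:

```cpp
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>

enum class DispatchKey : int8_t { CATCHALL, ADRENO, MALI, OVERRIDE };

class ToyDispatchRegistry {
  using Dispatcher = std::unordered_map<DispatchKey, std::string>;
  std::unordered_map<std::string, Dispatcher> registry_;

 public:
  void register_op_dispatch(const std::string& op,
                            DispatchKey key,
                            const std::string& shader) {
    registry_[op][key] = shader; // insert or overwrite
  }

  // Prefer a device-specific entry; fall back to CATCHALL if there is none.
  const std::string& lookup(const std::string& op, DispatchKey key) const {
    const auto op_it = registry_.find(op);
    if (op_it == registry_.end()) {
      throw std::runtime_error("no dispatch registered for op: " + op);
    }
    auto it = op_it->second.find(key);
    if (it == op_it->second.end()) {
      it = op_it->second.find(DispatchKey::CATCHALL);
    }
    if (it == op_it->second.end()) {
      throw std::runtime_error("no usable shader for op: " + op);
    }
    return it->second;
  }
};

int main() {
  ToyDispatchRegistry reg;
  reg.register_op_dispatch("aten.add", DispatchKey::CATCHALL, "add_texture3d");
  reg.register_op_dispatch("aten.add", DispatchKey::ADRENO, "add_texture3d_adreno");

  std::cout << reg.lookup("aten.add", DispatchKey::ADRENO) << "\n"; // device-specific
  std::cout << reg.lookup("aten.add", DispatchKey::MALI) << "\n";   // catch-all fallback
}
```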
- */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include -#include - -#define VK_KERNEL(shader_name) \ - ::vkcompute::api::shader_registry().get_shader_info(#shader_name) - -#define VK_KERNEL_FROM_STR(shader_name_str) \ - ::vkcompute::api::shader_registry().get_shader_info(shader_name_str) - -namespace vkcompute { -namespace api { - -enum class DispatchKey : int8_t { - CATCHALL, - ADRENO, - MALI, - OVERRIDE, -}; - -class ShaderRegistry final { - using ShaderListing = std::unordered_map; - using Dispatcher = std::unordered_map; - using Registry = std::unordered_map; - - ShaderListing listings_; - Dispatcher dispatcher_; - Registry registry_; - - public: - /* - * Check if the registry has a shader registered under the given name - */ - bool has_shader(const std::string& shader_name); - - /* - * Check if the registry has a dispatch registered under the given name - */ - bool has_dispatch(const std::string& op_name); - - /* - * Register a ShaderInfo to a given shader name - */ - void register_shader(vkapi::ShaderInfo&& shader_info); - - /* - * Register a dispatch entry to the given op name - */ - void register_op_dispatch( - const std::string& op_name, - const DispatchKey key, - const std::string& shader_name); - - /* - * Given a shader name, return the ShaderInfo which contains the SPIRV binary - */ - const vkapi::ShaderInfo& get_shader_info(const std::string& shader_name); -}; - -class ShaderRegisterInit final { - using InitFn = void(); - - public: - ShaderRegisterInit(InitFn* init_fn) { - init_fn(); - }; -}; - -// The global shader registry is retrieved using this function, where it is -// declared as a static local variable. -ShaderRegistry& shader_registry(); - -} // namespace api -} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/api.h b/backends/vulkan/runtime/api/api.h deleted file mode 100644 index b5d46b8bba4..00000000000 --- a/backends/vulkan/runtime/api/api.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include diff --git a/backends/vulkan/runtime/api/containers/ParamsBuffer.cpp b/backends/vulkan/runtime/api/containers/ParamsBuffer.cpp deleted file mode 100644 index 482a5c50be6..00000000000 --- a/backends/vulkan/runtime/api/containers/ParamsBuffer.cpp +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
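`ShaderRegisterInit` above is the standard static-registration hook: a namespace-scope object whose constructor runs an init function during static initialization, so a generated shader library populates the global registry simply by being linked in, and `VK_KERNEL` stringizes an identifier into the lookup key. A toy version of both ideas; the names below are illustrative, not the real API.

```cpp
#include <iostream>
#include <string>
#include <unordered_map>

// Toy global registry of "shaders" keyed by name, held in a function-local
// static so it is safely initialized before first use.
std::unordered_map<std::string, std::string>& toy_registry() {
  static std::unordered_map<std::string, std::string> reg;
  return reg;
}

// Runs an init function at static initialization time; a generated .cpp file
// would define one of these at namespace scope to register its shaders.
class ToyRegisterInit {
 public:
  using InitFn = void();
  explicit ToyRegisterInit(InitFn* fn) { fn(); }
};

static void register_example_shaders() {
  toy_registry().emplace("add_texture3d", "<spirv blob>");
  toy_registry().emplace("mul_texture3d", "<spirv blob>");
}

// The constructor runs before main(), so the registry is populated up front.
static const ToyRegisterInit register_example_shaders_init(register_example_shaders);

// Turn an identifier into its lookup string with the # stringizing operator,
// mirroring the VK_KERNEL pattern.
#define TOY_KERNEL(name) toy_registry().at(#name)

int main() {
  std::cout << TOY_KERNEL(add_texture3d) << "\n";
  std::cout << toy_registry().size() << " shaders registered\n";
}
```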
- */ - -#include - -#include - -namespace vkcompute { -namespace api { - -namespace { - -void memcpy_to_buffer( - const vkapi::VulkanBuffer& src, - vkapi::VulkanBuffer& dst) { - vkapi::MemoryMap dst_mapping(dst, vkapi::MemoryAccessType::WRITE); - - vkapi::MemoryMap src_mapping(src, vkapi::MemoryAccessType::READ); - src_mapping.invalidate(); - - void* dst_ptr = dst_mapping.template data(); - void* src_ptr = src_mapping.template data(); - - // @lint-ignore CLANGTIDY facebook-security-vulnerable-memcpy - memcpy(dst_ptr, src_ptr, src.mem_size()); -} - -} // namespace - -ParamsBuffer::ParamsBuffer(const ParamsBuffer& other) - : context_p_(other.context_p_), vulkan_buffer_{} { - if (other.vulkan_buffer_) { - vulkan_buffer_ = context_p_->adapter_ptr()->vma().create_uniform_buffer( - other.vulkan_buffer_.mem_size()); - - memcpy_to_buffer(other.vulkan_buffer_, vulkan_buffer_); - } -} - -ParamsBuffer& ParamsBuffer::operator=(const ParamsBuffer& other) { - if (&other != this) { - context_p_ = other.context_p_; - - // Move vulkan_buffer_ to another VulkanBuffer for cleanup - if (vulkan_buffer_) { - vkapi::VulkanBuffer temp_buffer(std::move(vulkan_buffer_)); - context_p_->register_buffer_cleanup(temp_buffer); - } - // vulkan_buffer_ should now be empty - - if (other.vulkan_buffer_) { - vulkan_buffer_ = context_p_->adapter_ptr()->vma().create_uniform_buffer( - other.vulkan_buffer_.mem_size()); - - memcpy_to_buffer(other.vulkan_buffer_, vulkan_buffer_); - } - } - - return *this; -} - -} // namespace api -} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/containers/ParamsBuffer.h b/backends/vulkan/runtime/api/containers/ParamsBuffer.h deleted file mode 100644 index ecc07892cf7..00000000000 --- a/backends/vulkan/runtime/api/containers/ParamsBuffer.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -namespace vkcompute { -namespace api { - -class ParamsBuffer final { - private: - Context* context_p_; - vkapi::VulkanBuffer vulkan_buffer_; - - public: - ParamsBuffer() : context_p_{nullptr}, vulkan_buffer_{} {} - - template - ParamsBuffer(Context* context_p, const Block& block) - : context_p_(context_p), - vulkan_buffer_( - context_p_->adapter_ptr()->vma().create_params_buffer(block)) {} - - // The last bool argument, though unused, is required to disambiguate this - // constructor from the one above. 
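The unused trailing parameter mentioned in the comment above is needed because the templated "block" constructor would otherwise accept a plain integer and size the buffer to the size of the integer type rather than to the requested number of bytes. A standalone illustration with toy types; a dedicated tag struct is a common alternative to a bare bool.

```cpp
#include <cstdint>
#include <iostream>

struct ToyParamsBuffer {
  // Templated constructor: accepts any parameter "block" and sizes the buffer
  // to hold it.
  template <typename Block>
  explicit ToyParamsBuffer(const Block& block) : nbytes_(sizeof(block)) {
    std::cout << "from block: " << nbytes_ << " bytes\n";
  }

  // Without the extra unused flag, a call passing a raw byte count would bind
  // to the templated constructor above and allocate sizeof(integer) bytes.
  ToyParamsBuffer(uint64_t nbytes, bool /*unused*/) : nbytes_(nbytes) {
    std::cout << "from byte count: " << nbytes_ << " bytes\n";
  }

  uint64_t nbytes_;
};

struct Params {
  float scale;
  int32_t offset;
};

int main() {
  Params p{1.5f, 3};
  ToyParamsBuffer from_block(p);                              // templated overload
  ToyParamsBuffer from_size(uint64_t{256}, /*unused=*/true);  // explicit raw size
}
```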
- ParamsBuffer(Context* context_p, const VkDeviceSize nbytes, const bool unused) - : context_p_(context_p), - vulkan_buffer_( - context_p_->adapter_ptr()->vma().create_uniform_buffer(nbytes)) {} - - ParamsBuffer(const ParamsBuffer&); - ParamsBuffer& operator=(const ParamsBuffer&); - - ParamsBuffer(ParamsBuffer&&) = default; - ParamsBuffer& operator=(ParamsBuffer&&) = default; - - ~ParamsBuffer() { - if (vulkan_buffer_) { - context_p_->register_buffer_cleanup(vulkan_buffer_); - } - } - - const vkapi::VulkanBuffer& buffer() const { - return vulkan_buffer_; - } - - template - void update(const Block& block, const uint32_t offset = 0) { - // Fill the uniform buffer with data in block - { - vkapi::MemoryMap mapping(vulkan_buffer_, vkapi::kWrite); - Block* data_ptr = mapping.template data(offset); - - *data_ptr = block; - } - } - - template - T read() const { - T val; - if (sizeof(val) != vulkan_buffer_.mem_size()) { - VK_THROW( - "Attempted to store value from ParamsBuffer to type of different size"); - } - // Read value from uniform buffer and store in val - { - vkapi::MemoryMap mapping(vulkan_buffer_, vkapi::kRead); - T* data_ptr = mapping.template data(); - - val = *data_ptr; - } - return val; - } -}; - -} // namespace api -} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/containers/StagingBuffer.h b/backends/vulkan/runtime/api/containers/StagingBuffer.h deleted file mode 100644 index 1e9f569fc4a..00000000000 --- a/backends/vulkan/runtime/api/containers/StagingBuffer.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include - -namespace vkcompute { -namespace api { - -class StagingBuffer final { - private: - Context* context_p_; - vkapi::ScalarType dtype_; - vkapi::VulkanBuffer vulkan_buffer_; - - void* mapped_data_; - - public: - StagingBuffer( - Context* context_p, - const vkapi::ScalarType dtype, - const size_t numel) - : context_p_(context_p), - dtype_(dtype), - vulkan_buffer_(context_p_->adapter_ptr()->vma().create_staging_buffer( - element_size(dtype_) * numel)), - mapped_data_(nullptr) {} - - StagingBuffer(const StagingBuffer&) = delete; - StagingBuffer& operator=(const StagingBuffer&) = delete; - - StagingBuffer(StagingBuffer&&) = default; - StagingBuffer& operator=(StagingBuffer&&) = default; - - ~StagingBuffer() { - context_p_->register_buffer_cleanup(vulkan_buffer_); - } - - inline vkapi::ScalarType dtype() { - return dtype_; - } - - inline vkapi::VulkanBuffer& buffer() { - return vulkan_buffer_; - } - - inline void* data() { - if (!mapped_data_) { - mapped_data_ = vulkan_buffer_.allocation_info().pMappedData; - } - return mapped_data_; - } - - inline size_t numel() { - return nbytes() / element_size(dtype_); - } - - inline size_t nbytes() { - return vulkan_buffer_.mem_size(); - } - - inline void copy_from(const void* src, const size_t nbytes) { - VK_CHECK_COND(nbytes <= this->nbytes()); - memcpy(data(), src, nbytes); - vmaFlushAllocation( - vulkan_buffer_.vma_allocator(), - vulkan_buffer_.allocation(), - 0u, - VK_WHOLE_SIZE); - } - - inline void copy_to(void* dst, const size_t nbytes) { - VK_CHECK_COND(nbytes <= this->nbytes()); - vmaInvalidateAllocation( - vulkan_buffer_.vma_allocator(), - vulkan_buffer_.allocation(), - 0u, - VK_WHOLE_SIZE); - 
memcpy(dst, data(), nbytes); - } - - inline void set_staging_zeros() { - memset(data(), 0, nbytes()); - } -}; - -} // namespace api -} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp deleted file mode 100644 index 433ae15db4e..00000000000 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ /dev/null @@ -1,1066 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include -#include - -namespace vkcompute { -namespace api { - -/* - * Used to infer the sizes of a tensor that would correspond to a given - * VulkanImage. - */ -std::vector calculate_sizes( - const vkapi::VulkanImage& image, - const utils::GPUMemoryLayout memory_layout) { - auto sizes = std::vector{ - image.extents().width, image.extents().height, image.extents().depth}; - const auto packed_dim = utils::to_packed_dim(memory_layout); - sizes.at(packed_dim) *= 4; - return sizes; -} - -std::vector calculate_dim_order( - const size_t ndim, - const int32_t packed_dim) { - // Special case for zero dim tensors - if (ndim == 0) { - return {0}; - } - std::vector dim_order(ndim); - // Explicitly convert ndim to signed to prevent underflow - int64_t last_dim = int64_t(ndim) - 1 - packed_dim; - - int64_t cur_dim = 0; - for (int d = 0; d < ndim; ++d) { - if (d == last_dim) { - cur_dim++; - } - dim_order[d] = cur_dim; - cur_dim++; - } - if (last_dim >= 0) { - dim_order[ndim - 1] = last_dim; - } - - return dim_order; -} - -std::vector calculate_strides( - const std::vector& sizes, - const std::vector& dim_order) { - // For zero dim tensors - if (sizes.size() == 0) { - return {1}; - } - - size_t ndim = sizes.size(); - std::vector strides(ndim); - - strides[dim_order[ndim - 1]] = 1; - for (int32_t i = ndim - 2; i >= 0; --i) { - if (sizes[dim_order[i + 1]] == 0) { - strides[dim_order[i]] = strides[dim_order[i + 1]]; - } else { - strides[dim_order[i]] = - strides[dim_order[i + 1]] * sizes[dim_order[i + 1]]; - } - } - - return strides; -} - -/* - * Axis mapping is somewhat analogous to strides for texture backed tensors. - * - * The axis mapping is normalized to 4 dimensions, similar to the padded sizes. - * The first 3 values of the axis mapping indicate the (X,Y,Z) image texture - * axis that corresponds to the width, height, and channels dimension of the - * tensor. Thus the axis mapping can be considered to be in WHCN dimension - * order. - * - * The last value `axis_map.at(3)` indicates the WHCN index of the tensor - * dimension along which batches will be concatenated. This dimension can be - * referred to as the "inner dimension" To determine which image texture axis is - * used for the concatenation, a double lookup will need to be performed - * (axis_map.at(axis_map.at(3))). - * - * The reason for strucuring axis mapping this way is because for the batch dim, - * two things need to be easily derived: - * - * 1. The dim idx of the inner dimension, so that the size of the inner - * dimension can be easily determined. - * 2. The texture axis used to concatenate batches - * - * By storing the dim index of the inner dimension instead of the texture axis - * it maps to, both pieces of information are readily available. - * - * The axis mapping allows for permuted views of texture-backed tensors. 
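`calculate_strides` above walks the dim order from the innermost dimension outward, treating size-0 dims as size 1 so the outer strides stay well defined. A standalone restatement with a small worked example, comparing the identity dim order against a channels-last style order:

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// dim_order lists dimension indices from the largest stride to the smallest;
// the last entry is the fastest-moving (stride 1) dimension.
std::vector<int64_t> calculate_strides(const std::vector<int64_t>& sizes,
                                       const std::vector<int64_t>& dim_order) {
  if (sizes.empty()) {
    return {1}; // zero-dim tensors get a single unit stride
  }
  const size_t ndim = sizes.size();
  std::vector<int64_t> strides(ndim);
  strides[dim_order[ndim - 1]] = 1;
  for (int32_t i = static_cast<int32_t>(ndim) - 2; i >= 0; --i) {
    const int64_t inner_size = sizes[dim_order[i + 1]];
    const int64_t inner_stride = strides[dim_order[i + 1]];
    // Size-0 dims contribute a factor of 1 so outer strides remain sensible.
    strides[dim_order[i]] = inner_stride * (inner_size == 0 ? 1 : inner_size);
  }
  return strides;
}

void print(const std::vector<int64_t>& v) {
  for (int64_t x : v) std::cout << x << " ";
  std::cout << "\n";
}

int main() {
  std::vector<int64_t> sizes = {2, 3, 4, 5}; // N, C, H, W

  // Contiguous (identity) dim order -> strides {60, 20, 5, 1}.
  print(calculate_strides(sizes, {0, 1, 2, 3}));

  // Channels-last style dim order (N, H, W, C) -> strides {60, 1, 15, 3}.
  print(calculate_strides(sizes, {0, 2, 3, 1}));
}
```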
- */ -std::vector calculate_axis_map( - const std::vector& sizes, - utils::AxisMapLayout axis_map_layout) { - if (axis_map_layout == utils::AxisMapLayout::OPTIMIZED) { - std::vector axis_map(sizes.size() + 1); - std::iota(axis_map.begin(), axis_map.end() - 1, 0); - - std::stable_sort( - axis_map.begin(), axis_map.end() - 1, [&sizes](size_t i1, size_t i2) { - return sizes[i1] < sizes[i2]; - }); - - assert(axis_map.size() > 0); - // Find the index of the channel dimension - for (size_t i = 0; i < axis_map.size() - 1; ++i) { - assert(sizes.size() > axis_map[i]); - if (sizes[axis_map[i]] == 2) { - axis_map.back() = i; - break; - } - } - - return axis_map; - } - // default - return {0, 1, 2, 2}; -} - -bool dim_order_is_valid(const std::vector& dim_order) { - int64_t sum = 0; - for (size_t i = 0; i < dim_order.size(); ++i) { - if (dim_order[i] < 0 || dim_order[i] >= dim_order.size()) { - return false; - } - sum += dim_order[i]; - } - int64_t n = static_cast(dim_order.size() - 1); - // Sanity check that the sum of the indices in the vector is equal to the sum - // of 0 + 1 + 2 + ... + (ndim - 1) - return sum == n * (n + 1) / 2; -} - -utils::ivec4 flip_and_unsqueeze_ivec4( - const std::vector& tensor_metadata, - const vTensor::Attribute metadata_type, - const size_t numel) { - VK_CHECK_COND(tensor_metadata.size() <= 4); - std::vector flipped_metadata = - flip_and_unsqueeze(tensor_metadata, metadata_type, numel); - return { - flipped_metadata.at(0), - flipped_metadata.at(1), - flipped_metadata.at(2), - flipped_metadata.at(3), - }; -} - -std::vector calculate_padded_sizes( - const std::vector& sizes, - const int32_t packed_dim) { - int64_t ndim = sizes.size(); - if (ndim == 0) { - ndim = 1; - } - - // Tensor sizes will be unsqueezed up to the next multiple of 4 - const int64_t ndim_up4 = utils::align_up_4(ndim); - std::vector padded_sizes(ndim_up4); - for (int64_t i = 0; i < ndim_up4; ++i) { - padded_sizes.at(i) = utils::val_at(i - ndim_up4, sizes); - } - - // Pad the packed dim to the next multiple of 4. - const int64_t dim_offset = packed_dim + 1; - const int64_t padded_dim_size = utils::val_at(-dim_offset, sizes); - padded_sizes.at(ndim_up4 - dim_offset) = utils::align_up_4(padded_dim_size); - - return padded_sizes; -} - -utils::uvec3 calculate_image_extents( - const std::vector& padded_sizes, - const std::vector& axis_map, - const int32_t packed_dim) { - utils::uvec3 extents({1, 1, 1}); - - // For high dimensional tensors, buffer storage must be used. No need to - // compute image extents in this case. - if (padded_sizes.size() > 4) { - return extents; - } - - // First three elements of axis_map indicate which (X,Y,Z) image axis the - // width, height, and channels dim of the tensor maps to. - for (int whcn_dim = 0; whcn_dim < 3; ++whcn_dim) { - const int64_t axis = axis_map.at(whcn_dim); - const int64_t dim = padded_sizes.size() - 1 - whcn_dim; - extents[axis] = utils::safe_downcast(padded_sizes.at(dim)); - } - - // axis_map[3] indicates the WHCN index of the dimension used for batch - // concatenation. Thus a double lookup is required to determine the image axis - // used for batch concatenation. - const int64_t concatted_whcn_dim = axis_map.at(3); - const int64_t batch_axis = axis_map.at(concatted_whcn_dim); - // Multiply the extents of the batch axis by the batch size. 
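`calculate_padded_sizes` above unsqueezes the dimensionality up to a multiple of 4 (front-filling with 1s) and rounds the packed dimension up to a multiple of 4 so it fills whole texels. The sketch below restates it standalone and, assuming the default axis map {0, 1, 2, 2}, derives the corresponding texel extents; `val_at` here is a local helper written for the example, not the library's.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int64_t align_up_4(int64_t v) { return (v + 3) & ~int64_t{3}; }

// Index from the end of `sizes` (idx is negative); out-of-range dims are
// treated as having size 1, which is what "unsqueezing" means here.
int64_t val_at(int64_t idx, const std::vector<int64_t>& sizes) {
  const int64_t ndim = static_cast<int64_t>(sizes.size());
  const int64_t i = ndim + idx;
  return (i >= 0 && i < ndim) ? sizes[i] : 1;
}

// Pad the dimensionality up to a multiple of 4 (front-filled with 1s) and pad
// the packed dimension's size up to a multiple of 4 so it fills whole texels.
// packed_dim is a WHCN index: 0 = width, 1 = height, 2 = channels.
std::vector<int64_t> calculate_padded_sizes(const std::vector<int64_t>& sizes,
                                            int32_t packed_dim) {
  const int64_t ndim = sizes.empty() ? 1 : static_cast<int64_t>(sizes.size());
  const int64_t ndim_up4 = align_up_4(ndim);
  std::vector<int64_t> padded(ndim_up4);
  for (int64_t i = 0; i < ndim_up4; ++i) {
    padded[i] = val_at(i - ndim_up4, sizes);
  }
  const int64_t dim_offset = packed_dim + 1;
  padded[ndim_up4 - dim_offset] = align_up_4(val_at(-dim_offset, sizes));
  return padded;
}

int main() {
  // A {N=2, C=3, H=5, W=7} tensor packed along the channels dim (WHCN index 2).
  const std::vector<int64_t> sizes = {2, 3, 5, 7};
  const auto padded = calculate_padded_sizes(sizes, /*packed_dim=*/2);
  for (int64_t s : padded) std::cout << s << " "; // 2 4 5 7
  std::cout << "\n";

  // With the default axis map {0,1,2,2}: X = W, Y = H, Z = C/4, then Z *= N.
  const int64_t extent_x = padded[3];
  const int64_t extent_y = padded[2];
  const int64_t extent_z = (padded[1] / 4) * padded[0];
  std::cout << extent_x << " x " << extent_y << " x " << extent_z << "\n"; // 7 x 5 x 2
}
```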
- extents[batch_axis] *= padded_sizes.at(0); - - VK_CHECK_COND(extents[axis_map.at(packed_dim)] % 4 == 0); - extents[axis_map.at(packed_dim)] /= 4; - return extents; -} - -/* - * The physical image extents describe the size of an allocated texture resource - * i.e. how many texels in the width, height and depth axis of the image. - * However, the axis map allows a tensor logical dimension to map to a different - * physical texture axis; in essence, it describes a permutation between the - * logical width, height, channels, etc. dimensions of a tensor and the width, - * height, depth axis of a texture. - * - * The "logical extents" is simply the physical image extents permuted by the - * axis mapping. The logical extents is useful for constructing global work - * group sizes, so that it is easier to convert the global thread ID to a - * tensor index. - */ -utils::uvec3 calculate_logical_limits( - const utils::uvec3& image_extents, - const std::vector& axis_map) { - return { - image_extents[axis_map.at(0)], - image_extents[axis_map.at(1)], - image_extents[axis_map.at(2)], - }; -} - -/* - * Convenience overload of the above function to calculate logical limits - * directly from tensor sizes. - */ -utils::uvec3 calculate_logical_limits( - const std::vector& sizes, - const std::vector& axis_map, - const int32_t packed_dim) { - return calculate_logical_limits( - calculate_image_extents( - calculate_padded_sizes(sizes, packed_dim), axis_map, packed_dim), - axis_map); -} - -int64_t calculate_gpu_buffer_numel( - Context* const context, - const std::vector& sizes, - const utils::uvec3 image_extents, - const utils::StorageType storage_type, - const vkapi::ScalarType dtype) { - // For texture backed tensors, simply multiply the total number of texels by 4 - if (storage_type != utils::kBuffer) { - return image_extents[0] * image_extents[1] * image_extents[2] * 4; - } - const bool is_int8 = dtype == vkapi::kChar; - const bool int8_supported = - context->adapter_ptr()->has_full_int8_buffers_support(); - const size_t numel = utils::multiply_integers(sizes); - // For int8 tensors, if the device does not support int8 buffers, then int32 - // is used instead to represent the buffer data. Therefore the number of - // elements in the buffer is aligned to the next multiple of 4. 
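As the comment above explains, the logical limits are just the physical texel extents viewed through the axis map, so index 0 always corresponds to the tensor's width dimension no matter which texture axis width was mapped to. A tiny standalone illustration:

```cpp
#include <array>
#include <cstdint>
#include <iostream>
#include <vector>

// Permute physical image extents (X, Y, Z) into WHCN-ordered logical limits
// using the first three entries of the axis map.
std::array<uint32_t, 3> logical_limits(const std::array<uint32_t, 3>& extents,
                                       const std::vector<int64_t>& axis_map) {
  return {extents[axis_map.at(0)], extents[axis_map.at(1)], extents[axis_map.at(2)]};
}

int main() {
  // Physical texture allocated as 16 x 8 x 4 texels (X, Y, Z).
  const std::array<uint32_t, 3> extents = {16, 8, 4};

  // Standard axis map: width -> X, height -> Y, channels -> Z.
  auto a = logical_limits(extents, {0, 1, 2, 2});
  std::cout << a[0] << " " << a[1] << " " << a[2] << "\n"; // 16 8 4

  // A "transposed" view where width maps to Y and height maps to X: the
  // logical limits swap even though the texture allocation is unchanged.
  auto b = logical_limits(extents, {1, 0, 2, 2});
  std::cout << b[0] << " " << b[1] << " " << b[2] << "\n"; // 8 16 4
}
```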
- if (is_int8 && int8_supported) { - return utils::align_up_4(numel); - } - return numel; -} - -template ::value>> -int32_t pack_into_int32(const std::vector& vec, const int32_t extra) { - int32_t packed = static_cast( - vec.at(0) + (vec.at(1) << 4) + (vec.at(2) << 8) + (vec.at(3) << 12) + - (extra << 16)); - return packed; -} - -int32_t create_hashed_layout( - const std::vector& dim_order, - const std::vector& axis_map, - const int32_t packed_dim, - const utils::StorageType storage_type) { - if (storage_type == utils::kBuffer) { - return pack_into_int32( - flip_and_unsqueeze(dim_order, kTensorDimOrder, 0), 0); - } - return pack_into_int32(axis_map, packed_dim); -} - -size_t calculate_max_ubo_nbytes( - const size_t min_nbytes_per_ubo, - const utils::StorageType storage_type) { - size_t ivec4_ubo_nbytes = utils::align_up(size_t(16), min_nbytes_per_ubo); - size_t uvec3_ubo_nbytes = utils::align_up(size_t(12), min_nbytes_per_ubo); - size_t int32_ubo_nbytes = utils::align_up(size_t(4), min_nbytes_per_ubo); - if (storage_type == utils::kBuffer) { - // sizes, strides, dim order, numel - return 3 * ivec4_ubo_nbytes + int32_ubo_nbytes; - } - // sizes, logical limits - return ivec4_ubo_nbytes + uvec3_ubo_nbytes; -} - -// -// vTensorStorage -// - -utils::StorageType storage_type(const vkapi::VulkanImage& image) { - const auto type = image.type(); - switch (type) { - case VK_IMAGE_TYPE_3D: - return utils::kTexture3D; - case VK_IMAGE_TYPE_2D: - return utils::kTexture2D; - default: - VK_THROW("Unsupported image type", type); - } -} - -vkapi::VulkanImage allocate_image( - Context* const context_ptr, - utils::uvec3& image_extents, - const utils::StorageType storage_type, - const VkFormat image_format, - const bool allocate_memory) { - vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr(); - - vkapi::ImageSampler::Properties sampler_props{ - VK_FILTER_NEAREST, - VK_SAMPLER_MIPMAP_MODE_NEAREST, - VK_SAMPLER_ADDRESS_MODE_REPEAT, - VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK, - }; - - VkImageType image_type = VK_IMAGE_TYPE_3D; - VkImageViewType image_view_type; - - switch (storage_type) { - case utils::kTexture3D: - image_type = VK_IMAGE_TYPE_3D; - image_view_type = VK_IMAGE_VIEW_TYPE_3D; - break; - case utils::kTexture2D: - image_type = VK_IMAGE_TYPE_2D; - image_view_type = VK_IMAGE_VIEW_TYPE_2D; - break; - default: - // Return an empty VulkanImage by default - return vkapi::VulkanImage(); - } - - // TODO(ssjia): change to always check that the image extents do not exceed - // physical limits. Adding the check now based on `maxImageDimension3D` will - // cause some existing models to break. Anecdotally, on Adreno and - // SwiftShader devices, using 3D textures that exceed `maxImageDimension3D` - // appears to be ok. So we need to figure out if is it undefined behaviour - // or if there's a better way to figure out what the limit is. For now, only - // check during debug build so that we can detect when exceeding physical - // limits could be a potential cause for model outputs to be wrong. In the - // meantime, the threshold for using texture storage can be configured at - // export time. -#ifdef VULKAN_DEBUG - uint32_t max_extent = storage_type == utils::kTexture3D - ? 
adapter_ptr->max_texture3d_dim() - : adapter_ptr->max_texture2d_dim(); - - VK_CHECK_COND( - image_extents[0] <= max_extent && image_extents[1] <= max_extent && - image_extents[2] <= max_extent); -#endif - - VkSampler sampler = adapter_ptr->sampler_cache().retrieve(sampler_props); - - return adapter_ptr->vma().create_image( - context_ptr->device(), - vkapi::create_extent3d(image_extents), - image_format, - image_type, - context_ptr->preferred_image_tiling(), - image_view_type, - sampler_props, - sampler, - /*allow_transfer = */ true, - /*allocate_memory = */ allocate_memory); -} - -vkapi::VulkanBuffer allocate_buffer( - Context* const context_ptr, - const int64_t numel, - const utils::StorageType storage_type, - const vkapi::ScalarType dtype, - const bool allocate_memory) { - vkapi::Adapter* adapter_ptr = context_ptr->adapter_ptr(); - - switch (storage_type) { - case utils::kBuffer: - break; - default: - // Return an empty VulkanBuffer if Buffer storage is not used - return vkapi::VulkanBuffer(); - } - - VK_CHECK_COND(numel <= context_ptr->adapter_ptr()->max_buffer_numel()); - - return adapter_ptr->vma().create_storage_buffer( - element_size(dtype) * numel, allocate_memory); -} - -vTensorStorage::vTensorStorage( - Context* const context, - const utils::StorageType storage_type, - const std::vector& axis_map, - const int32_t packed_dim, - const std::vector& sizes, - const vkapi::ScalarType dtype, - const bool allocate_memory) - : context_(context), - storage_type_{storage_type}, - image_extents_(calculate_image_extents( - calculate_padded_sizes(sizes, packed_dim), - axis_map, - packed_dim)), - buffer_length_{calculate_gpu_buffer_numel( - context_, - sizes, - image_extents_, - storage_type, - dtype)}, - buffer_offset_{0}, - image_(allocate_image( - context_, - image_extents_, - storage_type_, - to_vkformat(dtype), - allocate_memory)), - buffer_(allocate_buffer( - context_, - buffer_length_, - storage_type_, - dtype, - allocate_memory)), - last_access_{} {} - -vTensorStorage::vTensorStorage( - Context* const context, - const vkapi::VulkanImage& image) - : context_(context), - storage_type_{storage_type(image)}, - image_extents_( - {image.extents().width, - image.extents().height, - image.extents().depth}), - buffer_length_{0}, - buffer_offset_{0}, - image_(image), - buffer_(vkapi::VulkanBuffer()), - last_access_{} {} - -vTensorStorage::~vTensorStorage() { - flush(); -} - -void vTensorStorage::flush() { - if (image_) { - context_->register_image_cleanup(image_); - } else if (buffer_) { - context_->register_buffer_cleanup(buffer_); - } - last_access_ = {}; -} - -void vTensorStorage::transition( - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::PipelineStageFlags cur_stage, - const vkapi::MemoryAccessFlags cur_access) { - // Get last stage access - vkapi::PipelineStageFlags prev_stage = last_access_.stage; - vkapi::MemoryAccessFlags prev_access = last_access_.access; - - const bool prev_written = (prev_access & vkapi::MemoryAccessType::WRITE) != 0; - const bool cur_written = (cur_access & vkapi::MemoryAccessType::WRITE) != 0; - - VkImageLayout cur_layout = VK_IMAGE_LAYOUT_UNDEFINED; - VkImageLayout new_layout = VK_IMAGE_LAYOUT_UNDEFINED; - bool layout_changed = false; - if (image_) { - cur_layout = image_.layout(); - new_layout = vkapi::vk_layout(cur_stage, cur_access); - - layout_changed = cur_layout != new_layout; - } - - // RAW: need to make sure current read sees previous writes - // WAW: need to make sure the current write occurs after previous write so - // the final value is 
correct. - // WAR: need to make sure previous read does not read the value from the - // current write. - // RAR: no need for synchronization - if (prev_written || cur_written || layout_changed) { - VkPipelineStageFlags src_stage = vkapi::vk_stage(prev_stage); - if (0u == src_stage) { - src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; - } - VkPipelineStageFlags dst_stage = vkapi::vk_stage(cur_stage); - if (0u == dst_stage) { - dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; - } - - pipeline_barrier.stage.src |= src_stage; - pipeline_barrier.stage.dst |= dst_stage; - - if (image_) { - pipeline_barrier.images.emplace_back( - vkapi::vk_access(prev_stage, prev_access), - vkapi::vk_access(cur_stage, cur_access), - cur_layout, - new_layout, - image_); - - image_.set_layout(new_layout); - } else if (buffer_) { - pipeline_barrier.buffers.emplace_back( - vkapi::vk_access(prev_stage, prev_access), - vkapi::vk_access(cur_stage, cur_access), - buffer_); - } - } - - last_access_.stage = cur_stage; - last_access_.access = cur_access; -} - -// -// vTensor -// - -vTensor::vTensor( - Context* const context, - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout, - const bool allocate_memory, - const utils::AxisMapLayout axis_map_layout) - : dtype_(dtype), - // Calculate tensor metadata - sizes_(sizes.begin(), sizes.end()), - packed_dim_(utils::to_packed_dim(memory_layout)), - dim_order_(calculate_dim_order(sizes_.size(), packed_dim_)), - axis_map_(calculate_axis_map(sizes_, axis_map_layout)), - strides_(calculate_strides(sizes, dim_order_)), - numel_(utils::multiply_integers(sizes_)), - hashed_layout_(create_hashed_layout( - dim_order_, - axis_map_, - packed_dim_, - storage_type)), - // Related to tensor metadata UBOs - min_nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, - max_ubo_nbytes_{ - calculate_max_ubo_nbytes(min_nbytes_per_ubo_, storage_type)}, - uniforms_(), - buffer_meta_(), - // Construct Tensor storage - storage_(std::make_shared( - context, - storage_type, - axis_map_, - packed_dim_, - sizes, - dtype_, - allocate_memory)) { - // uniform_data_ only valid for low dim tensors - if (sizes.size() <= 4) { - uniform_data_ = std::make_shared(UniformData{ - numel_, - sizes_, - dim_order_, - strides_, - calculate_logical_limits(storage_->image_extents_, axis_map_)}); - } - - VK_CHECK_COND( - dim_order_is_valid(dim_order_), "computed dim order is invalid"); -} - -// NOLINTNEXTLINE -vTensor::vTensor( - Context* context, - const vkapi::VulkanImage& image, - const utils::GPUMemoryLayout memory_layout, - const utils::AxisMapLayout axis_map_layout) - : dtype_(vkapi::element_scalartype(image.format())), - // Calculate tensor metadata - sizes_(calculate_sizes(image, memory_layout)), - packed_dim_(utils::to_packed_dim(memory_layout)), - dim_order_(), - axis_map_(calculate_axis_map(sizes_, axis_map_layout)), - strides_(), - numel_(utils::multiply_integers(sizes_)), - hashed_layout_(create_hashed_layout( - dim_order_, - axis_map_, - packed_dim_, - utils::kTexture3D)), - // Related to tensor metadata UBOs - min_nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, - max_ubo_nbytes_{ - calculate_max_ubo_nbytes(min_nbytes_per_ubo_, utils::kTexture3D)}, - uniforms_(), - buffer_meta_(), - // Construct Tensor storage - storage_(std::make_shared(context, image)) { - uniform_data_ = std::make_shared(UniformData{ - numel_, - sizes_, - {0, 0, 0, 0}, - {0, 0, 0, 0}, - 
calculate_logical_limits(storage_->image_extents_, axis_map_)}); -} - -vTensor::vTensor(vTensor& other) - : dtype_(other.dtype_), - // Copy tensor size metadata - sizes_(other.sizes_.begin(), other.sizes_.end()), - packed_dim_{other.packed_dim_}, - dim_order_(other.dim_order_.begin(), other.dim_order_.end()), - axis_map_(other.axis_map_.begin(), other.axis_map_.end()), - strides_(other.strides_.begin(), other.strides_.end()), - numel_(other.numel_), - hashed_layout_(other.hashed_layout_), - min_nbytes_per_ubo_{other.min_nbytes_per_ubo_}, - max_ubo_nbytes_{other.max_ubo_nbytes_}, - uniforms_(), - buffer_meta_(), - // Copy Tensor storage - storage_(other.storage_) { - uniform_data_ = std::make_shared(*other.get_uniform_data()); -} - -vTensor::vTensor( - vTensor& other, - const std::vector& sizes, - const std::vector& dim_order) - : dtype_(other.dtype_), - // Copy tensor size metadata - sizes_(sizes.begin(), sizes.end()), - packed_dim_(other.packed_dim_), - dim_order_(dim_order.begin(), dim_order.end()), - axis_map_(calculate_axis_map(sizes_, utils::kDefaultAxisMap)), - strides_(calculate_strides(sizes_, dim_order_)), - numel_(other.numel_), - hashed_layout_(create_hashed_layout( - dim_order_, - axis_map_, - packed_dim_, - other.storage_type())), - min_nbytes_per_ubo_{other.min_nbytes_per_ubo_}, - max_ubo_nbytes_{other.max_ubo_nbytes_}, - uniforms_(), - buffer_meta_(), - // Copy Tensor storage - storage_(other.storage_) { - uniform_data_ = std::make_shared(UniformData{ - static_cast(utils::multiply_integers(sizes_)), - sizes_, - dim_order_, - strides_, - other.logical_limits()}); - - VK_CHECK_COND( - dim_order_is_valid(dim_order_), "new dim order provided is invalid"); -} - -vTensor::UniformData::UniformData( - const size_t numel_ll, - const std::vector& sizes, - const std::vector& dim_order, - const std::vector& strides, - const utils::uvec3& limits) - : numel(utils::safe_downcast(numel_ll)), - sizes_v(flip_and_unsqueeze_ivec4(sizes, kTensorSizes, numel_ll)), - dim_order_v( - flip_and_unsqueeze_ivec4(dim_order, kTensorDimOrder, numel_ll)), - strides_v(flip_and_unsqueeze_ivec4(strides, kTensorStrides, numel_ll)), - logical_limits(limits) {} - -uint32_t vTensor::UniformData::write_attribute( - void* dst, - const uint32_t dst_offset, - const uint32_t max_dst_size, - const Attribute attr) { -#define WRITE_ATTRIBUTE_CASE(enum_name, member_name) \ - case vTensor::Attribute::enum_name: { \ - VK_CHECK_COND( \ - (dst_offset + sizeof(member_name)) <= max_dst_size, \ - "Attempting to write tensor attribute outside data boundary."); \ - memcpy((uint8_t*)dst + dst_offset, &member_name, sizeof(member_name)); \ - return sizeof(member_name); \ - } - switch (attr) { - WRITE_ATTRIBUTE_CASE(NUMEL, numel); - WRITE_ATTRIBUTE_CASE(SIZES, sizes_v); - WRITE_ATTRIBUTE_CASE(WHCN_DIM_ORDER, dim_order_v); - WRITE_ATTRIBUTE_CASE(STRIDES, strides_v); - WRITE_ATTRIBUTE_CASE(LOGICAL_LIMITS, logical_limits); - default: - VK_THROW("Invalid Attribute"); - } -#undef WRITE_ATTRIBUTE_CASE - return 0; -} - -vTensor::BufferMetadata::BufferMetadata( - std::vector& src_sizes, - std::vector& src_dim_order, - std::vector& src_strides, - size_t src_numel) { - update(src_sizes, src_dim_order, src_strides, src_numel); -} - -void vTensor::BufferMetadata::update( - std::vector& src_sizes, - std::vector& src_dim_order, - std::vector& src_strides, - size_t src_numel) { - int32_t fixed_ndim = utils::safe_downcast(kTensorDimLimit); - - std::vector fu_sizes = flip_and_unsqueeze( - src_sizes, kTensorSizes, src_numel, fixed_ndim); - std::vector 
fu_dim_order = flip_and_unsqueeze( - src_dim_order, kTensorDimOrder, src_numel, fixed_ndim); - std::vector fu_strides = flip_and_unsqueeze( - src_strides, kTensorStrides, src_numel, fixed_ndim); - - for (int i = 0; i < fixed_ndim; ++i) { - sizes[i] = fu_sizes.at(i); - dim_order[i] = fu_dim_order.at(i); - strides[i] = fu_strides.at(i); - } - - ndim = utils::safe_downcast(src_sizes.size()); - numel = utils::safe_downcast(src_numel); -} - -vkapi::VulkanImage& vTensor::image( - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::PipelineStageFlags stage) & { - storage_->transition(pipeline_barrier, stage, vkapi::MemoryAccessType::READ); - return storage_->image_; -} - -vkapi::VulkanImage& vTensor::image( - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::PipelineStageFlags stage, - const vkapi::MemoryAccessFlags access) & { - storage_->transition(pipeline_barrier, stage, access); - return storage_->image_; -} - -vkapi::VulkanBuffer& vTensor::buffer( - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::PipelineStageFlags stage) & { - storage_->transition(pipeline_barrier, stage, vkapi::MemoryAccessType::READ); - return storage_->buffer_; -} - -vkapi::VulkanBuffer& vTensor::buffer( - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::PipelineStageFlags stage, - const vkapi::MemoryAccessFlags access) & { - storage_->transition(pipeline_barrier, stage, access); - return storage_->buffer_; -} - -utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { - switch (packed_dim_) { - case WHCN::kWidthDim: - return utils::kWidthPacked; - case WHCN::kHeightDim: - return utils::kHeightPacked; - case WHCN::kChannelsDim: - return utils::kChannelsPacked; - default: - VK_THROW("Invalid packed dim"); - } -} - -bool vTensor::is_contiguous() const { - if (storage_type() != utils::kBuffer) { - return false; - } - for (size_t i = 0; i < dim_order_.size(); ++i) { - if (dim_order_.at(i) != i) { - return false; - } - } - return true; -} - -size_t vTensor::get_max_ubo_nbytes(const size_t nbytes_per_ubo) const { - // For texture backed tensors, the metadata fields needed are: - // sizes, logical limits - size_t max_metadata_field_count = 2u; - if (storage_type() == utils::kBuffer) { - // sizes, strides, dim order, numel - max_metadata_field_count = 4u; - } - return max_metadata_field_count * nbytes_per_ubo; -} - -const vkapi::BufferBindInfo vTensor::sizes_ubo() { - VK_CHECK_COND(sizes_.size() <= 4); - return metadata_ubo_impl(&sizes_uniform_offset_, uniform_data_->sizes_v); -} - -const vkapi::BufferBindInfo vTensor::dim_order_ubo() { - VK_CHECK_COND(sizes_.size() <= 4); - return metadata_ubo_impl( - &dim_order_uniform_offset_, uniform_data_->dim_order_v); -} - -const vkapi::BufferBindInfo vTensor::strides_ubo() { - VK_CHECK_COND(sizes_.size() <= 4); - return metadata_ubo_impl(&strides_uniform_offset, uniform_data_->strides_v); -} - -const vkapi::BufferBindInfo vTensor::logical_limits_ubo() { - VK_CHECK_COND(sizes_.size() <= 4); - return metadata_ubo_impl( - &logical_limits_uniform_offset_, uniform_data_->logical_limits); -} - -const vkapi::BufferBindInfo vTensor::numel_ubo() { - VK_CHECK_COND(sizes_.size() <= 4); - return metadata_ubo_impl(&numel_uniform_offset_, uniform_data_->numel); -} - -const vkapi::BufferBindInfo vTensor::buffer_meta_ubo() { - size_t ubo_nbytes = sizeof(BufferMetadata); - if (!buffer_meta_.buffer()) { - BufferMetadata data(sizes_, dim_order_, strides_, numel_); - buffer_meta_ = ParamsBuffer(storage_->context_, data); - } - return 
vkapi::BufferBindInfo(buffer_meta_.buffer(), 0, ubo_nbytes); -} - -VkMemoryRequirements vTensor::get_memory_requirements() const { - switch (storage_type()) { - case utils::kBuffer: - return storage_->buffer_.get_memory_requirements(); - case utils::kTexture2D: - case utils::kTexture3D: - return storage_->image_.get_memory_requirements(); - } - return {}; -} - -bool vTensor::memory_is_bound() const { - switch (storage_type()) { - case utils::kBuffer: - return storage_->buffer_.has_memory(); - case utils::kTexture2D: - case utils::kTexture3D: - return storage_->image_.has_memory(); - } -} - -void vTensor::bind_allocation(const vkapi::Allocation& allocation) { - switch (storage_type()) { - case utils::kBuffer: - storage_->buffer_.bind_allocation(allocation); - break; - case utils::kTexture2D: - case utils::kTexture3D: - storage_->image_.bind_allocation(allocation); - break; - } -} - -void vTensor::acquire_allocation(vkapi::Allocation&& allocation) { - switch (storage_type()) { - case utils::kBuffer: - storage_->buffer_.acquire_allocation(std::move(allocation)); - break; - case utils::kTexture2D: - case utils::kTexture3D: - storage_->image_.acquire_allocation(std::move(allocation)); - break; - } -} - -void vTensor::update_metadata() { - numel_ = utils::multiply_integers(sizes_); - strides_ = calculate_strides(sizes_, dim_order_); - - // Update uniform data if it has been modified - if (sizes_.size() <= 4) { - uniform_data_->numel = utils::safe_downcast(numel_); - uniform_data_->sizes_v = - flip_and_unsqueeze_ivec4(sizes_, kTensorSizes, numel_); - uniform_data_->dim_order_v = - flip_and_unsqueeze_ivec4(dim_order_, kTensorDimOrder, numel_); - uniform_data_->strides_v = - flip_and_unsqueeze_ivec4(strides_, kTensorStrides, numel_); - uniform_data_->logical_limits.limits = - calculate_logical_limits(sizes_, axis_map_, packed_dim_); - - if (sizes_uniform_offset_ != kUniformOffsetUnset) { - uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_); - } - if (dim_order_uniform_offset_ != kUniformOffsetUnset) { - uniforms_.update(uniform_data_->dim_order_v, dim_order_uniform_offset_); - } - if (strides_uniform_offset != kUniformOffsetUnset) { - uniforms_.update(uniform_data_->strides_v, strides_uniform_offset); - } - if (numel_uniform_offset_ != kUniformOffsetUnset) { - uniforms_.update(numel_, numel_uniform_offset_); - } - if (logical_limits_uniform_offset_ != kUniformOffsetUnset) { - uniforms_.update( - uniform_data_->logical_limits.limits, logical_limits_uniform_offset_); - } - } - - if (buffer_meta_.buffer()) { - BufferMetadata data(sizes_, dim_order_, strides_, numel_); - buffer_meta_.update(data); - } -} - -void vTensor::check_sizes(const std::vector& sizes) const { - if (storage_type() != utils::kBuffer) { - // For texture storage check that the current texture is large enough for - // the new sizes of the tensor. - utils::uvec3 virtual_extents = calculate_image_extents( - calculate_padded_sizes(sizes_, packed_dim_), axis_map_, packed_dim_); - - bool valid_resize = virtual_extents[0] <= storage_->image_extents_[0]; - valid_resize = - valid_resize && virtual_extents[1] <= storage_->image_extents_[1]; - valid_resize = - valid_resize && virtual_extents[2] <= storage_->image_extents_[2]; - - VK_CHECK_COND( - valid_resize, - "tensor sizes requires a larger texture than the current one."); - } else { - // For buffer storage check that the current buffer is large enough for the - // new sizes of the tensor. 
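The size check above reduces to: a texture-backed tensor may only be virtually resized if the extents implied by the new sizes fit inside the already-allocated texture, and a buffer-backed tensor only if the new element count plus the view offset fits in the allocated buffer length. A toy version of that validation:

```cpp
#include <array>
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

// Texture case: every implied extent must fit in the allocated extent.
bool texture_resize_ok(const std::array<uint32_t, 3>& needed,
                       const std::array<uint32_t, 3>& allocated) {
  return needed[0] <= allocated[0] && needed[1] <= allocated[1] &&
         needed[2] <= allocated[2];
}

// Buffer case: the new element count plus the view's offset must fit in the
// allocated buffer length (both measured in elements).
bool buffer_resize_ok(const std::vector<int64_t>& new_sizes,
                      int64_t buffer_offset, int64_t buffer_length) {
  const int64_t numel = std::accumulate(new_sizes.begin(), new_sizes.end(),
                                        int64_t{1}, std::multiplies<int64_t>());
  return numel + buffer_offset <= buffer_length;
}

int main() {
  std::cout << std::boolalpha;
  std::cout << texture_resize_ok({8, 8, 2}, {16, 8, 4}) << "\n";  // true
  std::cout << texture_resize_ok({8, 16, 2}, {16, 8, 4}) << "\n"; // false: Y grew

  std::cout << buffer_resize_ok({2, 3, 4}, /*buffer_offset=*/0, /*buffer_length=*/32) << "\n"; // true
  std::cout << buffer_resize_ok({4, 3, 4}, /*buffer_offset=*/0, /*buffer_length=*/32) << "\n"; // false
}
```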
- int64_t numel = utils::multiply_integers(sizes); - bool valid_resize = - numel + storage_->buffer_offset_ <= storage_->buffer_length_; - VK_CHECK_COND( - valid_resize, - "tensor sizes requires a larger buffer than the current one."); - } -} - -void vTensor::virtual_reconfigure( - const std::vector& new_sizes, - const std::vector& new_dim_order) { - VK_CHECK_COND( - storage_type() == utils::kBuffer, - "virtual_reconfigure is only applicable for buffer backed tensors"); - VK_CHECK_COND(new_sizes.size() == new_dim_order.size()); - VK_CHECK_COND(dim_order_is_valid(new_dim_order)); - - check_sizes(new_sizes); - sizes_ = new_sizes; - dim_order_ = new_dim_order; - - // Update the hashed layout because dim order is updated - hashed_layout_ = - create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); - - update_metadata(); -} - -void vTensor::virtual_clone(const vTensor& other) { - VK_CHECK_COND(is_view_of(other)); - sizes_ = other.sizes_; - dim_order_ = other.dim_order_; - axis_map_ = other.axis_map_; - packed_dim_ = other.packed_dim_; - hashed_layout_ = other.hashed_layout_; - - *uniform_data_ = *other.get_uniform_data(); -} - -void vTensor::virtual_resize(const std::vector& new_sizes) { - VK_CHECK_COND( - new_sizes.size() == dim_order_.size(), - "new sizes cannot modify the dimensionality of the tensor "); - - check_sizes(new_sizes); - sizes_ = new_sizes; - update_metadata(); -} - -/* - * Transposing the dim order is a bit unintuitive. dim0 and dim1 have swapped - * their "identities", so we need to swap the values of dim0 and dim1 wherever - * they appear in the dim order vector. Compare this to just swapping the - * elements at dim0 and dim1 in the `sizes` vectors. - */ -void transpose_dim_order_inplace( - std::vector& dim_order, - const int64_t dim0, - const int64_t dim1) { - for (int i = 0; i < dim_order.size(); ++i) { - if (dim_order[i] == dim0) { - dim_order[i] = dim1; - } else if (dim_order[i] == dim1) { - dim_order[i] = dim0; - } - } -} - -void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) { - std::iter_swap(sizes_.begin() + dim0, sizes_.begin() + dim1); - - const int dim0_whcn = sizes_.size() - 1 - dim0; - const int dim1_whcn = sizes_.size() - 1 - dim1; - if (packed_dim_ == dim0_whcn) { - packed_dim_ = dim1_whcn; - } else if (packed_dim_ == dim1_whcn) { - packed_dim_ = dim0_whcn; - } - - if (storage_type() == utils::kBuffer) { - transpose_dim_order_inplace(dim_order_, dim0, dim1); - } else { - // Cannot transpose batch dimension for texture storage - VK_CHECK_COND(dim0_whcn < 3 && dim1_whcn < 3); - std::iter_swap( - axis_map_.begin() + dim0_whcn, axis_map_.begin() + dim1_whcn); - // Update the "identity" of the concatted dimension - if (axis_map_.at(3) == dim0_whcn) { - axis_map_.at(3) = dim1_whcn; - } else if (axis_map_.at(3) == dim1_whcn) { - axis_map_.at(3) = dim0_whcn; - } - } - - // Update the hashed layout because dim order / axis mpa is updated - hashed_layout_ = - create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); - - update_metadata(); -} - -} // namespace api -} // namespace vkcompute diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h deleted file mode 100644 index 66c1fd1e4da..00000000000 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ /dev/null @@ -1,759 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
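`transpose_dim_order_inplace` above swaps the identities of the two dims wherever they appear in the dim order, in contrast to the sizes vector, where the elements at positions dim0 and dim1 are swapped. A standalone restatement with a worked 4-D example:

```cpp
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Transposing swaps the *identities* of dim0 and dim1, so in the dim order we
// replace every occurrence of dim0 with dim1 and vice versa, rather than
// swapping the elements stored at positions dim0 and dim1 (which is what
// happens to the sizes vector).
void transpose_dim_order_inplace(std::vector<int64_t>& dim_order,
                                 int64_t dim0, int64_t dim1) {
  for (auto& d : dim_order) {
    if (d == dim0) {
      d = dim1;
    } else if (d == dim1) {
      d = dim0;
    }
  }
}

int main() {
  // Contiguous 4-D tensor, then transpose dims 1 (C) and 3 (W).
  std::vector<int64_t> sizes = {2, 3, 4, 5};
  std::vector<int64_t> dim_order = {0, 1, 2, 3};

  std::swap(sizes[1], sizes[3]);                // sizes become {2, 5, 4, 3}
  transpose_dim_order_inplace(dim_order, 1, 3); // dim order becomes {0, 3, 2, 1}

  for (auto s : sizes) std::cout << s << " ";
  std::cout << "\n";
  for (auto d : dim_order) std::cout << d << " ";
  std::cout << "\n";
}
```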
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include - -namespace vkcompute { -namespace api { - -static constexpr size_t kTensorDimLimit = 8; - -/* - * Given a GPUMemoryLayout value, produce a dim order vector that matches the - * given memory layout. The produced dim order vector will be in the NCHW - * dimension order - */ -std::vector calculate_dim_order( - const size_t ndim, - const int32_t packed_dim); - -/* - * Given the sizes of a tensor and the dim order of the tensor (both in NCHW) - * dimension order, calculate the strides of the tensor. - */ -std::vector calculate_strides( - const std::vector& sizes, - const std::vector& dim_order); - -/* - * When stored on the GPU, tensor data is stored using texels (i.e. a vector of - * 4 scalar values) in order to take advantage of the GPU's native vectorization - * capabilities. Furthermore, tensor metadata is passed in to shaders as ivec4 - * types. - * - * To accommodate these vectorized types, the sizes of a tensor will be modified - * for GPU storage in the following ways: - * - * 1. The dimensionality of the tensor will be padded to a multiple of 4. - * 2. The size of the packed dimension will be padded to a multiple of 4. - * - * The "packed dimension" is determined based on the utils::GPUMemoryLayout - * argument. - */ -std::vector calculate_padded_sizes( - const std::vector& sizes, - const int32_t packed_dim); - -/* - * Calculate the image extents required of a texture backed tensor. - */ -utils::uvec3 calculate_image_extents( - const std::vector& padded_sizes, - const std::vector& axis_map, - const int32_t packed_dim); - -struct LastAccess { - vkapi::PipelineStageFlags stage; - vkapi::MemoryAccessFlags access; - - LastAccess() - : stage{vkapi::PipelineStage::NO_STAGE}, - access{vkapi::MemoryAccessType::NONE} {} - - LastAccess( - vkapi::PipelineStageFlags stage_flags, - vkapi::MemoryAccessFlags access_flags) - : stage{stage_flags}, access{access_flags} {} -}; - -/* - * Calculate the number of elements that a GPU buffer would require to store the - * contents of a tensor. This will depend on the storage type and dtype of the - * tensor, as well as the features available on the device. 
- */ -int64_t calculate_gpu_buffer_numel( - Context* const context, - const std::vector& sizes, - const utils::uvec3 image_extents, - const utils::StorageType storage_type, - const vkapi::ScalarType dtype); - -class vTensorStorage final { - public: - // Do not allow empty vTensorStorage construction - vTensorStorage() = default; - - vTensorStorage( - Context* context, - const utils::StorageType storage_type, - const std::vector& axis_map, - const int32_t packed_dim, - const std::vector& sizes, - const vkapi::ScalarType dtype, - const bool allocate_memory = true); - - vTensorStorage(Context* const context, const vkapi::VulkanImage& image); - - public: - vTensorStorage(vTensorStorage& other) = delete; - vTensorStorage& operator=(const vTensorStorage& other) = delete; - - vTensorStorage(vTensorStorage&& other) = default; - vTensorStorage& operator=(vTensorStorage&& other) = default; - - ~vTensorStorage(); - - friend class vTensor; - - private: - // Context - Context* context_{}; - - utils::StorageType storage_type_; - - // Resource sizings - utils::uvec3 image_extents_{}; - int64_t buffer_length_{}; - int64_t buffer_offset_{}; - - // GPU Storage - mutable vkapi::VulkanImage image_; - mutable vkapi::VulkanBuffer buffer_; - - // Last Access - used to insert memory barriers - LastAccess last_access_; - - private: - // Registers underlying memory for cleanup - void flush(); - - // Memory barrier insertion - void transition( - vkapi::PipelineBarrier&, - const vkapi::PipelineStageFlags, - const vkapi::MemoryAccessFlags); - - // Validation - void verify() const; - - public: - inline size_t buffer_len() const { - return utils::safe_downcast(buffer_length_); - } - - inline VkFormat texture_format() { - return image_.format(); - } -}; - -class vTensor final { - struct TextureLimits { - // Alignment is required to conform with Vulkan specification; a 3 or 4 - // component vector with components of size N must have base alignment of - // 4N. - alignas(16) utils::ivec3 limits; - - TextureLimits(const utils::uvec3& ulimits) : limits{ulimits} {} - }; - - public: - explicit vTensor( - Context* context, - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type = utils::kTexture3D, - const utils::GPUMemoryLayout memory_layout = utils::kChannelsPacked, - const bool allocate_memory = true, - const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap); - - vTensor(const vTensor& other) = delete; - - explicit vTensor( - Context* context, - const vkapi::VulkanImage& image, - const utils::GPUMemoryLayout memory_layout = utils::kChannelsPacked, - const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap); - - /* - * This constructor allows for the creation of a vTensor that references the - * same buffer resource of another vTensor, with the same sizes and strides - * metadata. The created vTensor will not own the underlying resource. This is - * only applicable for buffer backed tensors at the moment. - * - * Once created, the sizes and strides of the aliased vTensor can be changed - * using the `virtual_reconfigure` member function. - */ - vTensor(vTensor& other); - - /* - * This constructor allows for the creation of a vTensor that references the - * same buffer resource of another vTensor, but with different sizes and - * strides metatdata. The created vTensor will not own the underlying - * resource. This is only applicable for buffer backed tensors at the moment. 
- * - * Note that dim order is used as the source of truth regarding the strides, - * and the new strides are computed from the new sizes and new dim order. - * Thus only the dim order is provided as an argument to this function. - * - * The offset_numel argument allows the aliased tensor's memory region to - * begin at an offset of N elements from the start of the original tensor's - * buffer. - */ - vTensor( - vTensor& other, - const std::vector& sizes, - const std::vector& dim_order); - - // To discourage making copies, the copy assignment operator is still deleted - vTensor& operator=(const vTensor& other) = delete; - - vTensor(vTensor&& other) = default; - vTensor& operator=(vTensor&& other) = default; - - ~vTensor() = default; - - enum class Attribute : uint8_t { - SIZES, - WHCN_DIM_ORDER, - STRIDES, - LOGICAL_LIMITS, - NUMEL, - }; - - class UniformData { - // Contains the number of elements in the tensor according to the canonical - // sizes. - int32_t numel; - utils::ivec4 sizes_v; - utils::ivec4 dim_order_v; - utils::ivec4 strides_v; - // See the comments documenting logical_limits() for more context. - TextureLimits logical_limits; - - friend class vTensor; - - UniformData( - const size_t numel_ll, - const std::vector& sizes, - const std::vector& dim_order, - const std::vector& strides, - const utils::uvec3& limits); - - public: - /* - * Write tensor's metadata into dst, at the given dst_offset. max_dst_size - * is the size of dst and is used to avoid out of bounds writes. - */ - uint32_t write_attribute( - void* dst, - const uint32_t dst_offset, - const uint32_t max_dst_size, - const Attribute attr); - }; - - struct BufferMetadata { - uint32_t sizes[kTensorDimLimit]; - uint32_t dim_order[kTensorDimLimit]; - uint32_t strides[kTensorDimLimit]; - uint32_t ndim; - uint32_t numel; - - BufferMetadata( - std::vector& sizes, - std::vector& dim_order, - std::vector& strides, - size_t numel); - - void update( - std::vector& sizes, - std::vector& dim_order, - std::vector& strides, - size_t numel); - }; - - private: - /* - * "Core" tensor metadata. They are the minimum amount of information required - * to construct a tensor. - */ - - // Whether the tensor has elements of type float, int, etc. - vkapi::ScalarType dtype_; - // sizes of the tensor in NCHW dimension order - std::vector sizes_; - // Describes which dimension is "tightly packed" using WHCN index (i.e. 0 for - // width, 1 for height, etc.). For texture backed tensors, this describes - // which dimension is packed along a texel. For buffer backed tensors, this - // describes which dimension has a stride of 1 (i.e. is last in the dim - // order). - int32_t packed_dim_; - - /* - * "Layout" metadata. These describe with further detail how tensor data is - * laid out in memory. However, they are considered secondary to the "core" - * metadata members above because defaults can be assumed based on a given - * memory layout. When permuting the tensor without performing a copy, these - * metadata members are the ones that will be changed. All other metadata is - * derived from a combination of sizes, memory layout, and the below members. - */ - - // dim order of the tensor; dimension indices are in NCHW dimension order - // i.e. 0 is N, 1 is C, 2 is H, 3 is W for a 4D tensor. The dims with larger - // strides precede the dims with smaller strides in the dim order. The last - // dim is always the fastest moving dim with a stride of 1. 
- std::vector dim_order_; - // Describes which axis of an image texture each dimension of the tensor maps - // to. The axis mapping allows texture based tensors to be permuted and - // transposed without modifying the underlying texture storage. For a more in - // depth explanation of axis mapping, see the `default_axis_map()` - // function. - std::vector axis_map_; - - /* - * The below can be consider "layout" metadata as well, but are derived from - * the above data members. - */ - - // strides of the tensor in NCHW dimension order - std::vector strides_; - - // number of elements based on the canonical sizes - size_t numel_; - - // For texture backed tensors, this int32 contains the axis map data packed - // into a single int32. For buffer backed tensors, this int32 contains the - // wchn dim order data packed into a single int32. - int32_t hashed_layout_; - - // Pre-compute these quantities to avoid frequent re-computation - size_t min_nbytes_per_ubo_; - size_t max_ubo_nbytes_; - - /* - * Utility GPU buffer that can be passed to shaders in order to convey tensor - * metadata. Uniform buffer will be initialized only the first time a ubo is - * requested. Buffer offsets will be initialized the first time they are - * accessed via the corresponding *_ubo() function. Uniform buffer's contents - * will be updated whenever virtual_resize() is called. - * - * Refer to the comments for the corresponding *_ubo() functions for more - * context about the data contained in each buffer. - */ - ParamsBuffer uniforms_; - - /* - * Used to store data for BufferMetadata to pass to shaders as buffer_meta_ubo - */ - ParamsBuffer buffer_meta_; - - uint32_t uniforms_size_ = 0u; - uint32_t sizes_uniform_offset_ = kUniformOffsetUnset; - uint32_t dim_order_uniform_offset_ = kUniformOffsetUnset; - uint32_t strides_uniform_offset = kUniformOffsetUnset; - uint32_t numel_uniform_offset_ = kUniformOffsetUnset; - uint32_t logical_limits_uniform_offset_ = kUniformOffsetUnset; - - // Initial value of uniform buffer offsets. 1 is selected as it is essentially - // impossible for a ubo to have an offset of 1. - constexpr static uint32_t kUniformOffsetUnset = 1; - - std::shared_ptr storage_; - - std::shared_ptr uniform_data_; - - public: - /* - Texture Access - */ - - inline vkapi::VulkanImage& image() const& { - return storage_->image_; - } - - vkapi::VulkanImage& image( - vkapi::PipelineBarrier&, - const vkapi::PipelineStageFlags) &; - - vkapi::VulkanImage& image( - vkapi::PipelineBarrier&, - const vkapi::PipelineStageFlags, - const vkapi::MemoryAccessFlags) &; - - inline vkapi::VulkanBuffer& buffer() const& { - return storage_->buffer_; - } - - vkapi::VulkanBuffer& buffer( - vkapi::PipelineBarrier&, - const vkapi::PipelineStageFlags) &; - - vkapi::VulkanBuffer& buffer( - vkapi::PipelineBarrier&, - const vkapi::PipelineStageFlags, - const vkapi::MemoryAccessFlags) &; - - /* - Metadata - */ - - inline utils::StorageType storage_type() const { - return storage_->storage_type_; - } - - inline bool has_buffer_storage() const { - return storage_->storage_type_ == utils::kBuffer; - } - - public: - /* - * The logical limits of the tensor are derived from the image extents of the - * image texture used to store the tensor, but with two key differences. - * - * First, the image extents are permuted according to the axis map. 
This - * makes it so that the first element of the logical limit is the limit of the - * texture axis corresponding to the width dimension of the tensor, the next - * element is the limit of the texture axis corresponding to the height - * dimension and the last element is the limit of the texture axis that - * corresponds to the channels dimension of the tensor. - * - * Second, the logical limits may use smaller extents than the actual image - * extents of the image texture. This is due to dynamic shape; if the tensor's - * `virtual_resize()` function is called, then the logical limits will reflect - * the extents that would be needed to support a tensor with the updated sizes - * instead of the original sizes. - */ - inline const utils::ivec3& logical_limits() const { - return uniform_data_->logical_limits.limits; - } - - /* - * Extract an `vkapi::ScalarType` from the TensorOptions member - */ - inline vkapi::ScalarType dtype() const { - return dtype_; - } - - /* - * Provide a "best guess" of a memory layout that can be used to construct a - * tensor with similar layout metadata (i.e. strides, axis_map, etc.) as this - * tensor. In some scenarios, the exact layout of the tensor may not be able - * to be replicated due to calling `virtual_*()` functions after construction; - * however, this function will provide a memory layout that will produce the - * same `packed_dim_` as this tensor. - */ - utils::GPUMemoryLayout estimate_memory_layout() const; - - inline int32_t packed_dim() const { - return packed_dim_; - } - - /* - * Returns the WHCN index of the dimension that is used to concatenate batches - * as an int32_t. - */ - inline int32_t concat_dim() const { - return utils::safe_downcast(axis_map_.at(3)); - } - - inline const std::vector& sizes() const { - return sizes_; - } - - inline const int64_t size(size_t dim) const { - return sizes().at(dim); - } - - inline const int64_t dim() const { - return sizes_.size(); - } - - inline const std::vector& dim_order() const { - return dim_order_; - } - - inline const std::vector& strides() const { - return strides_; - } - - inline size_t numel() const { - return numel_; - } - - inline size_t nbytes() const { - return element_size(dtype()) * numel(); - } - - inline const std::vector& axis_map() const { - return axis_map_; - } - - /* - * For texture backed tensors, this function return a int32_t that contains - * the axis map + packed dimension. Each element of the axis map occupies 4 - * bits of the int32. - * - * For buffer backed tensors, the int32_t contains the WHCN dim order, where - * each element of the dim order array occupies 4 bits of the int32. - * - * This int32 is typically consumed as a specialization constant in compute - * shaders where it is subsequently unpacked. The layout data of a vTensor - * instance is typically static once created, which is why this method is - * appropriate. - */ - inline int32_t hashed_layout() const { - return hashed_layout_; - } - - /* - * Return true if the tensor's axis map is {0, 1, 2, concat_dim}. This means - * that the width dim is mapped to the width axis of the texture, the height - * dim is mapped to the height axis of the texture, the channels dim is mapped - * to the depth axis of the texture. - */ - inline bool has_standard_axis_map() const { - return axis_map_.at(0) == 0 && axis_map_.at(1) == 1 && axis_map_.at(2) == 2; - } - - /* - * Return true if a buffer backed tensor's dim order matches that of a - * contiguous tensor, i.e. the dim order will be {0, 1, 2, ... }. 
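The hashed layout described above packs a handful of small values into one int32 at 4 bits per element. The exact bit ordering consumed by the shaders is not reproduced here; the sketch below only illustrates the general nibble-packing idea in Python, with made-up helper names and example values.

```python
def pack_nibbles(values):
    """Pack small non-negative values (each < 16) into one integer,
    4 bits per element; element i occupies bits [4*i, 4*i + 4)."""
    packed = 0
    for i, v in enumerate(values):
        assert 0 <= v < 16
        packed |= (v & 0xF) << (4 * i)
    return packed

def unpack_nibble(packed, i):
    """Extract element i (4 bits) from a packed integer."""
    return (packed >> (4 * i)) & 0xF

# e.g. an axis map of {0, 1, 2, 2} followed by a packed dim of 0:
layout = pack_nibbles([0, 1, 2, 2, 0])
print(hex(layout))                                    # 0x2210
print([unpack_nibble(layout, i) for i in range(5)])   # [0, 1, 2, 2, 0]
```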
- * Returns false for texture backed tensors. - */ - bool is_contiguous() const; - - private: - inline size_t nbytes_per_ubo() const { - return storage_->context_->adapter_ptr()->min_ubo_alignment(); - } - - size_t get_max_ubo_nbytes(const size_t nbytes_per_ubo) const; - - template - const vkapi::BufferBindInfo metadata_ubo_impl( - uint32_t* param_buffer_offset, - const T& data) { - if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); - } - size_t ubo_nbytes = utils::align_up(sizeof(data), min_nbytes_per_ubo_); - if (*param_buffer_offset == kUniformOffsetUnset) { - VK_CHECK_COND( - (uniforms_size_ + ubo_nbytes) <= max_ubo_nbytes_, - "Uniform data allocation has exceeded Tensor uniform buffer size"); - *param_buffer_offset = uniforms_size_; - uniforms_size_ += ubo_nbytes; - uniforms_.update(data, *param_buffer_offset); - } - return vkapi::BufferBindInfo( - uniforms_.buffer(), *param_buffer_offset, ubo_nbytes); - } - - public: - /* - * The functions below return the buffer binding info for a UBO that contains - * some metadata of the tensor, which can be used to pass in tensor metadata - * to a compute shader. The other method of passing in tensor metadata is via - * push constants. The trade-off between each is that push constants may be - * slightly more performant and memory efficient; however, to update the - * values in a push constant due to i.e. a tensor resize between inferences, - * the command buffer must be re-encoded. On the other hand, UBOs can update - * their data by writing to their mapped memory without requiring a command - * buffer re-encode. - */ - - const vkapi::BufferBindInfo sizes_ubo(); - - const vkapi::BufferBindInfo dim_order_ubo(); - - const vkapi::BufferBindInfo strides_ubo(); - - const vkapi::BufferBindInfo logical_limits_ubo(); - - const vkapi::BufferBindInfo numel_ubo(); - - const vkapi::BufferBindInfo buffer_meta_ubo(); - - public: - inline size_t staging_buffer_numel() const { - return storage_->buffer_len(); - } - - inline size_t staging_buffer_nbytes() const { - return element_size(dtype()) * staging_buffer_numel(); - } - - /* - * Return the VmaAllocationCreateInfo of the underlying resource - */ - VmaAllocationCreateInfo get_allocation_create_info() const; - - /* - * Checks if the tensor's underlying buffer or image resource is bound to a - * memory allocation. - */ - bool memory_is_bound() const; - - /* - * Return the VkMemoryRequirements of the underlying resource - */ - VkMemoryRequirements get_memory_requirements() const; - - /* - * Binds the underlying resource to the given memory allocation - */ - void bind_allocation(const vkapi::Allocation& allocation); - - /* - * Binds and acquires a rvalue memory allocation - */ - void acquire_allocation(vkapi::Allocation&& allocation); - - private: - /* - * Assuming sizes, dim order, or axis mapping was modified, recompute all - * derived metadata and update metadata UBO with new values. - */ - void update_metadata(); - - /* - * Check that tensor sizes are valid given the current storage resource's - * limits. - */ - void check_sizes(const std::vector& sizes) const; - - public: - /* - * Change how the tensor should be interpreted by compute shaders via updating - * the size and dim order of the tensor. The new sizes and dim order may have - * different dimensionality than the current dimensionality of the tensor. - * - * This function can only be used for buffer-backed tensors, since texture - * backed buffers cannot change dimensionality or memory layout. 
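For context on the uniform-buffer sub-allocation performed by `metadata_ubo_impl` above: each metadata entry is rounded up to the device's minimum UBO alignment and assigned the next free offset within one shared uniform buffer. The Python below models only that arithmetic (buffer creation and the Vulkan bindings are omitted), and the class name is invented for illustration.

```python
def align_up(n, alignment):
    """Round n up to the next multiple of alignment."""
    return ((n + alignment - 1) // alignment) * alignment

class UboSuballocator:
    """Hands out aligned offsets within a single fixed-size uniform buffer."""

    def __init__(self, max_nbytes, min_alignment):
        self.max_nbytes = max_nbytes
        self.min_alignment = min_alignment
        self.used = 0

    def allocate(self, nbytes):
        entry_nbytes = align_up(nbytes, self.min_alignment)
        if self.used + entry_nbytes > self.max_nbytes:
            raise RuntimeError("Uniform data allocation exceeded buffer size")
        offset = self.used
        self.used += entry_nbytes
        return offset, entry_nbytes

# e.g. a 256-byte buffer on a device whose minimum UBO alignment is 64 bytes:
ubos = UboSuballocator(max_nbytes=256, min_alignment=64)
print(ubos.allocate(16))   # (0, 64): a 16-byte ivec4 still occupies 64 bytes
print(ubos.allocate(48))   # (64, 64)
```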
- * - * TODO(ssjia): delete this API. prefer functions such as virtual_transpose - * instead. - */ - void virtual_reconfigure( - const std::vector& new_sizes, - const std::vector& new_dim_order); - - /* - * Set all metadata of this tensor to match the metadata of another tensor. - */ - void virtual_clone(const vTensor& other); - - /* - * Perform a virtual resize of the vTensor by modifying the size metadata that - * gets used in compute shaders. This allows the shader to treat the - * underlying resource as if it were a different size. The new sizes cannot - * modify the dimensionality of the tensor. - */ - void virtual_resize(const std::vector& new_sizes); - - /* - * Transpose the tensor in-place by updating its metadata. - */ - void virtual_transpose(const int64_t dim0, const int64_t dim1); - - /* - * Check if this vTensor instance is a view of another vTensor instance - */ - inline bool is_view_of(const vTensor& other) const { - return storage_.get() == other.storage_.get(); - } - - const std::shared_ptr& get_uniform_data() const { - VK_CHECK_COND(sizes_.size() <= 4); - return uniform_data_; - } -}; - -static constexpr vTensor::Attribute kTensorSizes = vTensor::Attribute::SIZES; -static constexpr vTensor::Attribute kTensorDimOrder = - vTensor::Attribute::WHCN_DIM_ORDER; -static constexpr vTensor::Attribute kTensorStrides = - vTensor::Attribute::STRIDES; -static constexpr vTensor::Attribute kTensorLogicalLimits = - vTensor::Attribute::LOGICAL_LIMITS; -static constexpr vTensor::Attribute kTensorNumel = vTensor::Attribute::NUMEL; - -/* - * Prepare tensor metadata vector for consumption on the GPU: - * 1. Convert NCHW dim order and indexes to WCHN dim order and indexes - * 2. Unsqueeze to the next multiple of 4 dims - * 3. Convert to requested output dtype - */ -template < - typename T, - typename std::enable_if::value, int>::type = 0> -std::vector flip_and_unsqueeze( - const std::vector& tensor_metadata, - const vTensor::Attribute metadata_type, - const size_t numel, - const int32_t fixed_ndim = -1) { - const size_t ndim = tensor_metadata.size(); - size_t ndim_up4 = - std::max(utils::align_up_4(tensor_metadata.size()), size_t(4)); - - if (fixed_ndim > 0) { - VK_CHECK_COND(fixed_ndim >= ndim); - ndim_up4 = static_cast(fixed_ndim); - } - - std::vector flipped_metadata(ndim_up4); - - for (int flipped_i = 0; flipped_i < ndim; ++flipped_i) { - T val_at_dim = - utils::safe_downcast(tensor_metadata.at(ndim - 1 - flipped_i)); - if (metadata_type == kTensorDimOrder) { - val_at_dim = utils::safe_downcast(ndim - 1 - val_at_dim); - } - flipped_metadata.at(flipped_i) = val_at_dim; - } - - switch (metadata_type) { - case kTensorStrides: - for (int unsqueezed_i = ndim; unsqueezed_i < ndim_up4; ++unsqueezed_i) { - flipped_metadata.at(unsqueezed_i) = utils::safe_downcast(numel); - } - break; - case kTensorDimOrder: - for (int unsqueezed_i = ndim; unsqueezed_i < ndim_up4; ++unsqueezed_i) { - flipped_metadata.at(unsqueezed_i) = - utils::safe_downcast(unsqueezed_i); - } - break; - // Default: unsqueeze with ones - default: - for (int unsqueezed_i = ndim; unsqueezed_i < ndim_up4; ++unsqueezed_i) { - flipped_metadata.at(unsqueezed_i) = utils::safe_downcast(1); - } - break; - } - - return flipped_metadata; -} - -/* - * Same as flip and unsqueeze, but returns the metadata as an `ivec4`. 
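Because the padding value differs per metadata type, a worked example of the flip-and-unsqueeze transformation helps. The snippet below is an illustrative Python re-expression of the template above, not the C++ implementation itself.

```python
def flip_and_unsqueeze(metadata, kind, numel, fixed_ndim=-1):
    """Flip NCHW-ordered metadata to WHCN order and pad to a multiple of 4.

    kind is one of "sizes", "dim_order", "strides"; dim-order entries are
    also re-indexed so that they refer to WHCN dimension indices.
    """
    ndim = len(metadata)
    ndim_up4 = max(-(-ndim // 4) * 4, 4)  # align ndim up to a multiple of 4
    if fixed_ndim > 0:
        assert fixed_ndim >= ndim
        ndim_up4 = fixed_ndim

    out = [0] * ndim_up4
    for i in range(ndim):
        val = metadata[ndim - 1 - i]
        if kind == "dim_order":
            val = ndim - 1 - val
        out[i] = val
    # Padding: strides pad with numel, dim order with the next index, sizes with 1
    for i in range(ndim, ndim_up4):
        out[i] = {"strides": numel, "dim_order": i}.get(kind, 1)
    return out

print(flip_and_unsqueeze([2, 3, 4], "sizes", 24))       # [4, 3, 2, 1]
print(flip_and_unsqueeze([0, 1, 2], "dim_order", 24))   # [0, 1, 2, 3]
print(flip_and_unsqueeze([12, 4, 1], "strides", 24))    # [1, 4, 12, 24]
```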
- */ -utils::ivec4 flip_and_unsqueezed_ivec4( - const std::vector& tensor_metadata, - const vTensor::Attribute metadata_type, - const size_t numel); - -} // namespace api -} // namespace vkcompute diff --git a/backends/vulkan/runtime/gen_vulkan_spv.py b/backends/vulkan/runtime/gen_vulkan_spv.py deleted file mode 100644 index 3f2d616b428..00000000000 --- a/backends/vulkan/runtime/gen_vulkan_spv.py +++ /dev/null @@ -1,1450 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-unsafe - -import argparse -import array -import codecs -import copy -import glob -import hashlib -import io -import os -import re -import shutil -import sys -from itertools import product -from multiprocessing.pool import ThreadPool - -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) -import subprocess -import textwrap -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Set, Tuple, Union - -import yaml -from yaml.constructor import ConstructorError -from yaml.nodes import MappingNode - -try: - from yaml import CLoader as Loader -except ImportError: - from yaml import Loader # type: ignore[assignment, misc] - -CPP_H_NAME = "spv.h" -CPP_SRC_NAME = "spv.cpp" - -# Basic configuration settings for shaders -DEFAULT_ENV: Dict[str, Any] = { - "PRECISION": "highp", - # B is shorthand for "binding". This is used to automatically increment the - # layout binding index when declaring layout bindings. Note that a container - # type is used because integers are immutable in Python. - "B": [0], - # C is shorthand for "constant_id". This is used to automatically increment the - # constant_id index for specialization constants. - # Note that it starts at 3, as 0-2 are reserved for local workgroup size ids. 
- "C": [3], -} - -# Establishes relationships between different tensor types and different GLSL types -TYPE_MAPPINGS: Dict[str, Any] = { - "IMAGE_T": { - 3: { - "double": "image3D", - "float": "image3D", - "half": "image3D", - # integer dtypes - "int8": "iimage3D", - "uint8": "uimage3D", - "int16": "iimage3D", - "uint16": "uimage3D", - "int32": "iimage3D", - "uint32": "uimage3D", - "int64": "iimage3D", - "uint64": "uimage3D", - # common dtype aliases - "bool": "uimage3D", - "int": "iimage3D", - "uint": "uimage3D", - }, - 2: { - "double": "image2D", - "float": "image2D", - "half": "image2D", - # integer dtypes - "int8": "iimage2D", - "uint8": "uimage2D", - "int16": "iimage2D", - "uint16": "uimage2D", - "int32": "iimage2D", - "uint32": "uimage2D", - "int64": "iimage2D", - "uint64": "uimage2D", - # common dtype aliases - "bool": "uimage2D", - "int": "iimage2D", - "uint": "uimage2D", - }, - }, - "SAMPLER_T": { - 3: { - "double": "sampler3D", - "float": "sampler3D", - "half": "sampler3D", - # integer dtypes - "int8": "isampler3D", - "uint8": "usampler3D", - "int16": "isampler3D", - "uint16": "usampler3D", - "int32": "isampler3D", - "uint32": "usampler3D", - "int64": "isampler3D", - "uint64": "usampler3D", - # common dtype aliases - "bool": "usampler3D", - "int": "isampler3D", - "uint": "usampler3D", - }, - 2: { - "double": "sampler2D", - "float": "sampler2D", - "half": "sampler2D", - # integer dtypes - "int8": "isampler2D", - "uint8": "usampler2D", - "int16": "isampler2D", - "uint16": "usampler2D", - "int32": "isampler2D", - "uint32": "usampler2D", - "int64": "isampler2D", - "uint64": "usampler2D", - # common dtype aliases - "bool": "usampler2D", - "int": "isampler2D", - "uint": "usampler2D", - }, - }, - "IMAGE_FORMAT": { - "double": "rgba32f", - "float": "rgba32f", - "half": "rgba16f", - # integer dtypes - "int8": "rgba8i", - "uint8": "rgba8ui", - "int16": "rgba16i", - "uint16": "rgba16ui", - "int32": "rgba32i", - "uint32": "rgba32ui", - "int64": "rgba32i", - "uint64": "rgba32ui", - # common dtype aliases - "bool": "rgba8ui", - "int": "rgba32i", - "uint": "rgba32ui", - }, -} - - -def define_variable(name: str) -> str: - if name in locals(): - return f"#define {name} {locals()[name]}" - elif name in globals(): - return f"#define {name} {globals()[name]}" - else: - raise RuntimeError(f"{name} is not defined") - - -def buffer_scalar_type(dtype: str) -> str: - if dtype == "half": - return "float16_t" - elif dtype == "float": - return "float" - elif dtype == "double": - return "float64_t" - # integer dtype alias conversion - elif dtype == "bool": - return "uint8_t" - # we don't want to append _t for int32 or uint32 as int is already 32bit - elif dtype == "int32" or dtype == "uint32": - return "int" if dtype == "int32" else "uint" - elif dtype[-1].isdigit(): - return dtype + "_t" - return dtype - - -def buffer_gvec_type(dtype: str, n: int) -> str: - if n == 1: - return buffer_scalar_type(dtype) - - dtype_map = { - "half": f"f16vec{n}", - "float": f"vec{n}", - "double": f"vec{n}", # No 64bit image format support in GLSL - "int8": f"i8vec{n}", - "uint8": f"u8vec{n}", - "int16": f"i16vec{n}", - "uint16": f"u16vec{n}", - "int32": f"ivec{n}", - "int": f"ivec{n}", - "uint32": f"uvec{n}", - "uint": f"uvec{n}", - "int64": f"ivec{n}", # No 64bit image format support in GLSL - "uint64": f"uvec{n}", # No 64bit image format support in GLSL - "bool": f"u8vec{n}", - } - - vector_type = dtype_map.get(dtype) - if vector_type is None: - raise AssertionError(f"Invalid dtype: {dtype}") - - return vector_type - - -def 
texel_type(dtype: str) -> str: - image_format = TYPE_MAPPINGS["IMAGE_FORMAT"][dtype] - if image_format[-1:] == "f": - return "vec4" - elif image_format[-2:] == "ui": - return "uvec4" - elif image_format[-1:] == "i": - return "ivec4" - raise AssertionError(f"Invalid image format: {image_format}") - - -def gvec_type(dtype: str, n: int) -> str: - gvec4_type = texel_type(dtype) - return gvec4_type[:-1] + str(n) - - -def texel_component_type(dtype: str) -> str: - vec4_type = texel_type(dtype) - if vec4_type[:3] == "vec": - return "float" - elif vec4_type[:4] == "ivec": - return "int" - elif vec4_type[:4] == "uvec": - return "uint" - raise AssertionError(f"Invalid vec4 type: {vec4_type}") - - -def texel_load_type(dtype: str, storage_type: str) -> str: - if storage_type.lower() == "buffer": - return buffer_gvec_type(dtype, 4) - else: - return texel_type(dtype) - - -def texel_load_component_type(dtype: str, storage_type: str) -> str: - if storage_type.lower() == "buffer": - return buffer_scalar_type(dtype) - else: - return texel_component_type(dtype) - - -def get_access_qualifier(access_type: Optional[str]) -> str: - if access_type is None: - return "" - if access_type.lower() == "r": - return "readonly" - if access_type.lower() == "w": - return "writeonly" - if access_type.lower() == "rw": - return "" - - raise AssertionError(f"Invalid access type: {access_type}") - - -def get_slot_val(slot: Union[int, List[int]]) -> int: - if isinstance(slot, list): - return slot[0] - return slot - - -def layout_declare_buffer( - slot: Union[int, List[int]], - access_type: str, - var_name: str, - dtype: str, - precision: str = "PRECISION", - is_scalar_array: bool = True, -) -> str: - array_type = buffer_gvec_type(dtype, 4) - if is_scalar_array: - array_type = buffer_scalar_type(dtype) - - out_str = f""" -layout(set = 0, binding = {get_slot_val(slot)}) buffer {precision} restrict {get_access_qualifier(access_type)} {var_name}Buffer {{ - {array_type} {var_name}[]; -}}; -""" - - if isinstance(slot, list): - slot[0] = slot[0] + 1 - return out_str - - -def layout_declare_image( - slot: Union[int, List[int]], - access_type: str, - var_name: str, - dtype: str, - precision: str = "PRECISION", - image_ndim: int = 3, -) -> str: - image_format = TYPE_MAPPINGS["IMAGE_FORMAT"][dtype] - image_type = TYPE_MAPPINGS["IMAGE_T"][image_ndim][dtype] - - ret_str = f"layout(set = 0, binding = {get_slot_val(slot)}, {image_format}) uniform {precision} restrict {get_access_qualifier(access_type)} {image_type} {var_name};" - - if isinstance(slot, list): - slot[0] = slot[0] + 1 - return ret_str - - -def layout_declare_sampler( - slot: Union[int, List[int]], - access_type: str, - var_name: str, - dtype: str, - precision: str = "PRECISION", - access_qualifier: Optional[str] = None, - image_ndim: int = 3, -) -> str: - sampler_type = TYPE_MAPPINGS["SAMPLER_T"][image_ndim][dtype] - - ret_str = f"layout(set = 0, binding = {get_slot_val(slot)}) uniform {precision} {sampler_type} {var_name};" - - if isinstance(slot, list): - slot[0] = slot[0] + 1 - return ret_str - - -def layout_declare_tensor( - slot: Union[int, List[int]], - access_type: str, - var_name: str, - dtype: str, - storage_type: str, - is_scalar_array: bool = True, - precision: str = "PRECISION", -) -> str: - assert storage_type.lower() in ["buffer", "texture3d", "texture2d"] - - image_ndim = 3 - if storage_type.lower() == "texture2d": - image_ndim = 2 - - # Create buffer binding - if storage_type.lower() == "buffer": - return layout_declare_buffer( - slot, - access_type, - var_name, - 
dtype, - precision, - is_scalar_array=is_scalar_array, - ) - - # Create image/sampler binding - if access_type.lower() == "r": - return layout_declare_sampler( - slot, access_type, var_name, dtype, precision, image_ndim=image_ndim - ) - else: - return layout_declare_image( - slot, access_type, var_name, dtype, precision, image_ndim=image_ndim - ) - - -def layout_declare_ubo( - slot: Union[int, List[int]], *args, precision: str = "PRECISION" -) -> str: - assert len(args) % 2 == 0 - - var_list = list(zip(args[::2], args[1::2])) - - ubo_name = "" - for _, var_name in var_list: - ubo_name += var_name + "_" - - out_str = f""" -layout(set = 0, binding = {get_slot_val(slot)}) uniform {precision} restrict readonly {ubo_name}UBO {{ -""" - for type_name, var_name in var_list: - out_str += f" {type_name} {var_name};\n" - out_str += "};" - - if isinstance(slot, list): - slot[0] = slot[0] + 1 - return out_str - - -def layout_declare_spec_const( - slot: Union[int, List[int]], - type_name: str, - var_name: str, - initial_val: Optional[str] = None, -) -> str: - assert type_name in ["int", "uint", "float", "bool"] - - out_str = f"layout(constant_id = {get_slot_val(slot)}) const {type_name} {var_name}" - if initial_val is not None: - out_str += f" = {initial_val}" - out_str += ";" - - if isinstance(slot, list): - slot[0] = slot[0] + 1 - return out_str - - -def define_active_storage_type(storage_type: str): - if storage_type.lower() == "buffer": - return "#define USING_BUFFER" - elif storage_type.lower() == "texture3d": - return "#define USING_TEXTURE3D" - elif storage_type.lower() == "texture2d": - return "#define USING_TEXTURE2D" - else: - raise AssertionError(f"Invalid storage type: {storage_type}") - - -def define_required_extensions(dtypes: Union[str, List[str]]): - out_str = "\n" - dtype_list = dtypes if isinstance(dtypes, list) else [dtypes] - - for dtype in dtype_list: - nbit = None - glsl_type = None - if dtype == "half": - nbit = "16bit" - glsl_type = "float16" - elif dtype == "double": - # We only need to allow float64_t type usage - glsl_type = "float64" - elif dtype in ["int8", "uint8", "bool"]: - nbit = "8bit" - glsl_type = "int8" - elif dtype in ["int16", "uint16"]: - nbit = "16bit" - glsl_type = "int16" - elif dtype in ["int64", "uint64"]: - # We only need to allow int64_t and uint64_t type usage - glsl_type = "int64" - - if nbit is not None: - out_str += f"#extension GL_EXT_shader_{nbit}_storage : require\n" - if glsl_type is not None: - out_str += f"#extension GL_EXT_shader_explicit_arithmetic_types_{glsl_type} : require\n" - - return out_str - - -UTILITY_FNS: Dict[str, Any] = { - "macro_define": define_variable, - "get_pos": { - 3: lambda pos: pos, - 2: lambda pos: f"{pos}.xy", - }, - "buffer_scalar_type": buffer_scalar_type, - "buffer_gvec_type": buffer_gvec_type, - "texel_type": texel_type, - "gvec_type": gvec_type, - "texel_component_type": texel_component_type, - "texel_load_type": texel_load_type, - "texel_load_component_type": texel_load_component_type, - "layout_declare_buffer": layout_declare_buffer, - "layout_declare_image": layout_declare_image, - "layout_declare_sampler": layout_declare_sampler, - "layout_declare_tensor": layout_declare_tensor, - "layout_declare_ubo": layout_declare_ubo, - "layout_declare_spec_const": layout_declare_spec_const, - "define_active_storage_type": define_active_storage_type, - "define_required_extensions": define_required_extensions, -} - - -def extract_filename(path: str, keep_ext: bool = True) -> Any: - if keep_ext: - return os.path.basename(path) 
- else: - return os.path.basename(path).split(".")[0] - - -def extract_extension(path: str) -> str: - return os.path.splitext(extract_filename(path))[1][1:] - - -############################ -# SPIR-V Code Generation # -############################ - - -# https://gist.github.com/pypt/94d747fe5180851196eb -class UniqueKeyLoader(Loader): - def construct_mapping(self, node, deep=False): # type: ignore[no-untyped-def] - if not isinstance(node, MappingNode): - raise ConstructorError( - None, - None, - f"expected a mapping node, but found {node.id}", - node.start_mark, - ) - mapping = {} - for key_node, value_node in node.value: - key = self.construct_object(key_node, deep=deep) # type: ignore[no-untyped-call] - try: - hash(key) - except TypeError as e: - raise ConstructorError( - "while constructing a mapping", - node.start_mark, - "found unacceptable key ", - key_node.start_mark, - ) from e - # check for duplicate keys - if key in mapping: - raise ConstructorError( - "while constructing a mapping", - node.start_mark, - "found duplicate key", - key_node.start_mark, - ) - value = self.construct_object(value_node, deep=deep) # type: ignore[no-untyped-call] - mapping[key] = value - return mapping - - -# https://github.com/google/XNNPACK/blob/master/tools/xngen.py -def extract_leading_whitespace(line: str) -> str: - match = re.match(r"\s*", line) - return match.group(0) if match else "" - - -# https://github.com/google/XNNPACK/blob/master/tools/xngen.py -def escape(line: str) -> str: - output_parts = [] - while "${" in line: - start_pos = line.index("${") - end_pos = line.index("}", start_pos + 2) - if start_pos != 0: - output_parts.append('"' + line[:start_pos].replace('"', '\\"') + '"') - output_parts.append("str(" + line[start_pos + 2 : end_pos] + ")") - line = line[end_pos + 1 :] - if line: - output_parts.append('"' + line.replace('"', '\\"') + '"') - return " + ".join(output_parts) - - -# https://github.com/google/XNNPACK/blob/master/tools/xngen.py -def preprocess( - input_text: str, variables: Dict[str, Any], input_path: str = "codegen" -) -> str: - # Workaround to handle source files using \ to extend mecros to a new line - input_text = re.sub(r"\\$", r"\\\\", input_text, flags=re.MULTILINE) - - input_lines = input_text.splitlines() - python_lines = [] - - blank_lines = 0 - - last_indent = "" - - # List of tuples (total_index, python_indent) - indent_stack = [("", "")] - - # Indicates whether this is the first line inside Python - # code block (i.e. for, while, if, elif, else) - python_block_start = True - for input_line in input_lines: - if input_line == "": - blank_lines += 1 - continue - # Skip lint markers. 
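The `${...}` interpolation performed by `escape()` above is the heart of the shader template preprocessor: literal text becomes quoted strings, each `${expr}` becomes `str(expr)`, and the resulting expression is later executed with the codegen parameters in scope. The simplified, self-contained sketch below (the `interpolate` name is made up) shows that effect end to end.

```python
import io

def interpolate(line):
    """Simplified version of escape(): turn one template line containing
    ${...} spans into a Python expression string."""
    parts = []
    while "${" in line:
        start = line.index("${")
        end = line.index("}", start + 2)
        if start != 0:
            parts.append('"' + line[:start].replace('"', '\\"') + '"')
        parts.append("str(" + line[start + 2 : end] + ")")
        line = line[end + 1 :]
    if line:
        parts.append('"' + line.replace('"', '\\"') + '"')
    return " + ".join(parts)

expr = interpolate("#define PRECISION ${PRECISION}")
print(expr)  # "#define PRECISION " + str(PRECISION)

# preprocess() wraps such expressions in print(..., file=OUT_STREAM) and
# exec()s them with the shader's codegen parameters in scope:
out = io.StringIO()
exec(f"print({expr}, file=OUT)", {"PRECISION": "highp", "OUT": out})
print(out.getvalue().strip())  # #define PRECISION highp
```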
- if "LINT" in input_line: - continue - - input_indent = extract_leading_whitespace(input_line) - if python_block_start: - assert input_indent.startswith(last_indent) - extra_python_indent = input_indent[len(last_indent) :] - python_indent = indent_stack[-1][1] + extra_python_indent - indent_stack.append((input_indent, python_indent)) - assert input_indent.startswith(indent_stack[-1][0]) - else: - while not input_indent.startswith(indent_stack[-1][0]): - del indent_stack[-1] - python_block_start = False - - python_indent = indent_stack[-1][1] - stripped_input_line = input_line.strip() - if stripped_input_line.startswith("$") and not stripped_input_line.startswith( - "${" - ): - if stripped_input_line.endswith(":"): - python_block_start = True - while blank_lines != 0: - python_lines.append(python_indent + "print(file=OUT_STREAM)") - blank_lines -= 1 - python_lines.append(python_indent + stripped_input_line.replace("$", "")) - else: - assert input_line.startswith(python_indent) - while blank_lines != 0: - python_lines.append(python_indent + "print(file=OUT_STREAM)") - blank_lines -= 1 - python_lines.append( - python_indent - + "print(%s, file=OUT_STREAM)" - % escape(input_line[len(python_indent) :]) - ) - last_indent = input_indent - - while blank_lines != 0: - python_lines.append(python_indent + "print(file=OUT_STREAM)") - blank_lines -= 1 - - exec_globals = dict(variables) - output_stream = io.StringIO() - exec_globals["OUT_STREAM"] = output_stream - - python_bytecode = compile("\n".join(python_lines), input_path, "exec") - exec(python_bytecode, exec_globals) - - return output_stream.getvalue() - - -class SPVGenerator: - def __init__( - self, - src_dir_paths: Union[str, List[str]], - env: Dict[Any, Any], - glslc_path: Optional[str], - glslc_flags: str = "", - replace_u16vecn: bool = False, - ) -> None: - if isinstance(src_dir_paths, str): - self.src_dir_paths = [src_dir_paths] - else: - self.src_dir_paths = src_dir_paths - - self.env = env - self.glslc_path = glslc_path - self.glslc_flags = glslc_flags.split() - self.glslc_flags_no_opt = self.glslc_flags.copy() - if "-O" in self.glslc_flags_no_opt: - self.glslc_flags_no_opt.remove("-O") - if "-Os" in self.glslc_flags_no_opt: - self.glslc_flags_no_opt.remove("-Os") - self.replace_u16vecn = replace_u16vecn - - self.src_files: Dict[str, str] = {} - self.template_yaml_files: List[str] = [] - - self.addSrcAndYamlFiles(self.src_dir_paths) - self.shader_template_params: Dict[Any, Any] = {} - for yaml_file in self.template_yaml_files: - self.parseTemplateYaml(yaml_file) - - self.output_file_map: Dict[str, Tuple[str, Dict[str, str]]] = {} - self.constructOutputMap() - - def addSrcAndYamlFiles(self, src_dir_paths: List[str]) -> None: - for src_path in src_dir_paths: - # Collect glsl source files - src_files_list = glob.glob( - os.path.join(src_path, "**", "*.[gh]lsl*"), recursive=True - ) + glob.glob(os.path.join(src_path, "**", "*.h"), recursive=True) - for file in src_files_list: - if len(file) > 1: - self.src_files[extract_filename(file, keep_ext=False)] = file - # Collect template yaml files - yaml_files = glob.glob( - os.path.join(src_path, "**", "*.yaml"), recursive=True - ) - for file in yaml_files: - if len(file) > 1: - self.template_yaml_files.append(file) - - def generateVariantCombinations( - self, - iterated_params: Dict[str, Any], - exclude_params: Optional[Set[str]] = None, - ) -> List[Any]: - if exclude_params is None: - exclude_params = set() - all_iterated_params = [] - for param_name, value_list in iterated_params.items(): - if 
param_name not in exclude_params: - param_values = [] - for value in value_list: - if "RANGE" in value: - value_range = value["RANGE"] - suffix = value.get("SUFFIX", "") - if isinstance(value_range, list) and len(value_range) == 2: - for i in range(value_range[0], value_range[1] + 1): - curr_suffix = ( - suffix + "_" + str(i) if suffix else str(i) - ) - param_values.append((param_name, curr_suffix, i)) - else: - raise ValueError( - f"{value['RANGE']} is not a valid range. Must be in format [start, end] (inclusive)." - ) - - elif "VALUE" in value: - suffix = value.get("SUFFIX", value["VALUE"]) - if value["VALUE"] in ["int", "uint"]: - raise ValueError( - f"Use int32 or uint32 instead of {value['VALUE']}" - ) - param_values.append((param_name, suffix, value["VALUE"])) - - else: - raise KeyError( - "Parameter must be 'VALUE: string' or 'RANGE: [a, b]'" - ) - - all_iterated_params.append(param_values) - - return list(product(*all_iterated_params)) - - def parseTemplateYaml(self, yaml_file: str) -> None: - with open(yaml_file) as f: - contents = yaml.load(f, Loader=UniqueKeyLoader) - for template_name, params_dict in contents.items(): - if template_name in self.shader_template_params: - raise KeyError(f"{template_name} params file is defined twice") - - default_params = params_dict["parameter_names_with_default_values"] - default_params["YAML_SRC_FULLPATH"] = yaml_file - params_names = set(default_params.keys()).union({"NAME"}) - - self.shader_template_params[template_name] = [] - - default_iterated_params = params_dict.get( - "generate_variant_forall", None - ) - - for variant in params_dict["shader_variants"]: - default_iterated_params_names = set( - default_iterated_params.keys() - if default_iterated_params is not None - else {} - ) - variant_params_names = set(variant.keys()) - - invalid_keys = ( - variant_params_names - - default_iterated_params_names - - params_names - - {"generate_variant_forall"} - ) - assert len(invalid_keys) == 0 - - iterated_params = variant.get( - "generate_variant_forall", default_iterated_params - ) - - if iterated_params is not None: - variant_combinations = self.generateVariantCombinations( - iterated_params, variant_params_names - ) - - for combination in variant_combinations: - default_params_copy = copy.deepcopy(default_params) - for key in variant: - if key != "generate_variant_forall": - default_params_copy[key] = variant[key] - - variant_name = variant["NAME"] - for param_value in combination: - default_params_copy[param_value[0]] = param_value[2] - if len(str(param_value[1])) > 0: - variant_name = f"{variant_name}_{param_value[1]}" - - default_params_copy["NAME"] = variant_name - default_params_copy["VARIANT_NAME"] = variant["NAME"] - - self.shader_template_params[template_name].append( - default_params_copy - ) - else: - default_params_copy = copy.deepcopy(default_params) - for key in variant: - default_params_copy[key] = variant[key] - - self.shader_template_params[template_name].append( - default_params_copy - ) - - def create_shader_params( - self, variant_params: Optional[Dict[str, Any]] = None - ) -> Dict[str, str]: - if variant_params is None: - variant_params = {} - shader_params = copy.deepcopy(self.env) - for key, value in variant_params.items(): - shader_params[key] = value - - return shader_params - - def constructOutputMap(self) -> None: - for src_name, params in self.shader_template_params.items(): - for variant in params: - src_file_fullpath = self.src_files[src_name] - - self.output_file_map[variant["NAME"]] = ( - src_file_fullpath, - 
self.create_shader_params(variant), - ) - - for src_name, src_file_fullpath in self.src_files.items(): - if src_name not in self.shader_template_params: - self.output_file_map[src_name] = ( - src_file_fullpath, - self.create_shader_params(), - ) - - def maybe_replace_u16vecn(self, input_text: str) -> str: - """ - There is a latency benefit to using u16vecn variables to store texture position - variables instead of ivecn, likely due to reduced register pressure. However, - SwiftShader does not support 16 bit integer types in shaders, so this is a crude - way to fallback to using ivecn to store texture positions so that testing with - SwiftShader is still possible. - """ - if not self.replace_u16vecn: - return input_text - if "codegen-nosub" in input_text: - return input_text - - # Remove extension requirement so that generated ShaderInfo does not mark it - input_text = input_text.replace( - "#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require", "" - ) - input_text = input_text.replace("u16vec", "ivec") - input_text = input_text.replace("uint16_t", "int") - return input_text - - def get_md5_checksum(self, file_path: str) -> str: - # Use a reasonably sized buffer for better performance with large files - BUF_SIZE = 65536 # 64kb chunks - - md5 = hashlib.md5() - - with open(file_path, "rb") as f: - while True: - data = f.read(BUF_SIZE) - if not data: - break - md5.update(data) - - # Get the hexadecimal digest and compare - file_md5 = md5.hexdigest() - return file_md5 - - def generateSPV( # noqa: C901 - self, - output_dir: str, - cache_dir: Optional[str] = None, - force_rebuild: bool = False, - ) -> Dict[str, str]: - # The key of this dictionary is the full path to a generated source file. The - # value is a tuple that contains 3 entries: - # - # 1. A bool indicationg if the file has changed since the last compilation; this - # is determined by comparing against the cached version. - # 2. List of other source files included by the generated file. - gen_file_meta: Dict[str, Tuple[bool, List[str], str]] = {} - - # Return value of the function mapping the abspath of compiled SPIR-V binaries - # to the abspath of the generated GLSL file they were compiled from. - spv_to_glsl_map: Dict[str, str] = {} - - # Convert output_dir to absolute path - assert os.path.exists(output_dir) - output_dir = os.path.abspath(output_dir) - - if cache_dir is not None: - assert os.path.exists(cache_dir) - - def get_glsl_includes(glsl_text): - """ - Parse GLSL text content and return a list of included files. 
- - Args: - glsl_text: String containing the GLSL file content to analyze - - Returns: - List of included file names (e.g., ["random.h"]) - """ - includes = [] - for line in glsl_text.splitlines(): - # Look for #include directives with quoted filenames - # Matches: #include "filename.h" or #include - include_match = re.match( - r'^\s*#include\s+[<"]([^>"]+)[>"]', line.strip() - ) - if include_match: - includes.append(include_match.group(1)) - - return includes - - def file_has_changed(gen_file_path, cached_file_path): - # If the file does not exist in the cache, then return True - if not os.path.exists(cached_file_path): - return True - current_checksum = self.get_md5_checksum(gen_file_path) - cached_checksum = self.get_md5_checksum(cached_file_path) - return current_checksum != cached_checksum - - def any_sources_changed(gen_file_path, output_dir): - """ - Given the path to a generated source file, check the gen_file_meta dict to - determine if the ANY of the source files contributing to the compilation of - this file were changed since the last successful compilation. - """ - gen_file_changed, includes_list = gen_file_meta[gen_file_path] - any_changed = gen_file_changed - for included_file in includes_list: - included_file_path = os.path.join(output_dir, included_file) - any_changed = any_changed or any_sources_changed( - included_file_path, output_dir - ) - - return any_changed - - def generate_src_file(shader_paths_pair) -> Tuple[bool, List[str]]: - """ - Given an input tuple containing the following items: - (src_file_name, (template_file_path, codegen_params)) - - This function generates src_file_name by processing - template_file_path with the Python preprocessor using the - parameters specified by codegen_params. - - Then, it returns a tuple containing: - 1. The path of the generated source file - 2. A bool indicating if the generated source file has changed since the last - compilation. - 3. A list of files included by the generated source file - """ - # name of .glsl, .glslh, or .h file to be generated - src_file_name = shader_paths_pair[0] - # path of template file used for codegen - template_file_path = shader_paths_pair[1][0] - # args to be used for codegen - codegen_params = shader_paths_pair[1][1] - - # Assume that generated files will have the same file extension as the - # source template file. 
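The include-scanning regex shown above drives the dependency tracking for the shader cache. Here is a small standalone check of what it matches; the GLSL text and file names are just examples.

```python
import re

INCLUDE_RE = re.compile(r'^\s*#include\s+[<"]([^>"]+)[>"]')

glsl_text = """\
#version 450
#include "indexing_utils.h"
  #include <common.glslh>
void main() {}
"""

includes = []
for line in glsl_text.splitlines():
    match = INCLUDE_RE.match(line.strip())
    if match:
        includes.append(match.group(1))

print(includes)  # ['indexing_utils.h', 'common.glslh']
```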
- out_file_ext = extract_extension(template_file_path) - - # Construct generated file name - gen_out_path = os.path.join(output_dir, f"{src_file_name}.{out_file_ext}") - # Construct path of cached generated file - cached_gen_out_path = os.path.join( - cache_dir, f"{src_file_name}.{out_file_ext}" - ) - - # Execute codegen to generate the output file - with codecs.open(template_file_path, "r", encoding="utf-8") as input_file: - input_text = input_file.read() - input_text = self.maybe_replace_u16vecn(input_text) - output_text = preprocess(input_text, codegen_params) - - included_files = get_glsl_includes(output_text) - - with codecs.open(gen_out_path, "w", encoding="utf-8") as output_file: - output_file.write(output_text) - - file_changed = ( - file_has_changed(gen_out_path, cached_gen_out_path) or force_rebuild - ) - - # Save the generated file to cache so it can be used for future checks - if cache_dir is not None and file_changed: - shutil.copyfile(gen_out_path, cached_gen_out_path) - - return gen_out_path, file_changed, included_files - - def compile_spirv(shader_paths_pair) -> Tuple[str, str]: - """ - Given an input tuple containing the following items: - (src_file_name, (template_file_path, codegen_params)) - - Infer the path of the GLSL source file generated by generate_src_file and - compile a SPIR-V binary from it. Returns the path of the compiled SPIR-V - binary and the path of the source file used to compile it. - - This function also utilizes a caching mechanism; if generate_src_file - reported that the source file was unchanged since the last successful - compilation, AND if the SPIR-V from the last successful compilation was - stored in the cache, then directly use the cached SPIR-V without triggering - a re-compilation. - """ - # name of generated .glsl, .glslh, or .h from generate_src_file - src_file_name = shader_paths_pair[0] - # path of template file used for codegen - template_file_path = shader_paths_pair[1][0] - # args used for codegen - codegen_params = shader_paths_pair[1][1] - - # Assume that generated files will have the same file extension as the - # source template file. - out_file_ext = extract_extension(template_file_path) - - # Infer name of generated file (created by generate_src_file) - gen_out_path = os.path.join(output_dir, f"{src_file_name}.{out_file_ext}") - - # Only proceed if GLSL -> SPIR-V compilation is required for this file - if out_file_ext != "glsl": - return (None, gen_out_path) - - # Validate that the source file actually exists - assert os.path.exists(gen_out_path) and gen_out_path in gen_file_meta - - # Construct name of SPIR-V file to be compiled - spv_out_path = os.path.join(output_dir, f"{src_file_name}.spv") - - if cache_dir is not None: - # Construct the file names of cached SPIR-V file to check if they exist - # in the cache. 
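The caching protocol sketched by `generate_src_file` and `compile_spirv` boils down to: regenerate the GLSL, compare it by checksum against the cached copy from the last build, and reuse the cached SPIR-V binary only if nothing feeding into the shader changed. The Python below is a simplified illustration with made-up names; it omits the #include graph walk done by `any_sources_changed()`.

```python
import hashlib
import os
import shutil

def md5(path, chunk=65536):
    """Checksum a file in 64 KB chunks, as get_md5_checksum does above."""
    h = hashlib.md5()
    with open(path, "rb") as f:
        while data := f.read(chunk):
            h.update(data)
    return h.hexdigest()

def can_reuse_cached_spv(gen_glsl, cached_glsl, cached_spv):
    """True if the regenerated GLSL is byte-identical to the cached copy and a
    cached SPIR-V binary exists; only the direct source is checked here."""
    if not (os.path.exists(cached_glsl) and os.path.exists(cached_spv)):
        return False
    return md5(gen_glsl) == md5(cached_glsl)

# Typical usage after codegen (paths are illustrative):
# if can_reuse_cached_spv("out/add.glsl", "cache/add.glsl", "cache/add.spv"):
#     shutil.copyfile("cache/add.spv", "out/add.spv")
# else:
#     ... invoke glslc and refresh the cache ...
```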
- cached_spv_out_path = os.path.join(cache_dir, f"{src_file_name}.spv") - - can_use_cached = not any_sources_changed(gen_out_path, output_dir) - if can_use_cached and os.path.exists(cached_spv_out_path): - shutil.copyfile(cached_spv_out_path, spv_out_path) - return (spv_out_path, gen_out_path) - - vk_version = codegen_params.get("VK_VERSION", "1.1") - # Only proceed if a GLSL compiler was specified - if self.glslc_path is not None: - cmd_base = [ - self.glslc_path, - "-fshader-stage=compute", - gen_out_path, - "-o", - spv_out_path, - "--target-env=vulkan{}".format(vk_version), - "-Werror", - "-I", - output_dir, - ] - cmd = cmd_base + self.glslc_flags - - try: - subprocess.run(cmd, check=True, capture_output=True, text=True) - except subprocess.CalledProcessError as e: - opt_fail = "compilation succeeded but failed to optimize" - err_msg_base = f"Failed to compile {os.getcwd()}/{gen_out_path}: " - if opt_fail in e.stderr or opt_fail in e.stdout: - cmd_no_opt = cmd_base + self.glslc_flags_no_opt - try: - subprocess.run(cmd_no_opt, check=True, capture_output=True) - except subprocess.CalledProcessError as e_no_opt: - # Delete any existing cached SPIR-V file if it exists - if os.path.exists(cached_spv_out_path): - os.remove(cached_spv_out_path) - - raise RuntimeError( - f"{err_msg_base} {e_no_opt.stderr}" - ) from e_no_opt - - else: - # Delete any existing cached SPIR-V file if it exists - if os.path.exists(cached_spv_out_path): - os.remove(cached_spv_out_path) - - raise RuntimeError(f"{err_msg_base} {e.stderr}") from e - - # If compilation was successful, store the compiled SPIR-V file in the - # cache for future use. - if cache_dir is not None: - shutil.copyfile(spv_out_path, cached_spv_out_path) - - return (spv_out_path, gen_out_path) - - # Run codegen serially to ensure that all .glsl, .glslh, and .h files are up to - # date before compilation - for generated_file_tuple in self.output_file_map.items(): - gen_out_path, file_changed, include_list = generate_src_file( - generated_file_tuple - ) - gen_file_meta[gen_out_path] = (file_changed, include_list) - - # Parallelize SPIR-V compilation to optimize build time - with ThreadPool(os.cpu_count()) as pool: - for spv_out_path, glsl_out_path in pool.map( - compile_spirv, self.output_file_map.items() - ): - spv_to_glsl_map[spv_out_path] = glsl_out_path - - return spv_to_glsl_map - - -############################################## -# Shader Info and Shader Registry Handling # -############################################## - - -@dataclass -class ShaderInfo: - tile_size: List[int] - layouts: List[str] - weight_storage_type: str = "" - bias_storage_type: str = "" - register_for: Optional[Tuple[str, List[str]]] = None - requires_shader_int16_ext: bool = False - requires_16bit_storage_ext: bool = False - requires_8bit_storage_ext: bool = False - requires_integer_dot_product_ext: bool = False - - -def getName(filePath: str) -> str: - return os.path.basename(filePath).replace("/", "_").replace(".", "_") - - -def isDescriptorLine(lineStr: str) -> bool: - descriptorLineId = r"^layout\(set" - return re.search(descriptorLineId, lineStr) is not None - - -def isTileSizeLine(lineStr: str) -> bool: - tile_size_id = r"^ \* TILE_SIZE = \(" - return re.search(tile_size_id, lineStr) is not None - - -def findTileSizes(lineStr: str) -> List[int]: - tile_size_id = r"^ \* TILE_SIZE = \(([0-9]+), ([0-9]+), ([0-9]+)\)" - matches = re.search(tile_size_id, lineStr) - if matches is None: - raise AssertionError("matches is None in findTileSizes") - return 
[int(matches.group(1)), int(matches.group(2)), int(matches.group(3))] - - -def isWeightStorageTypeLine(lineStr: str) -> bool: - weight_storage_id = r"^ \* WEIGHT_STORAGE = " - return re.search(weight_storage_id, lineStr) is not None - - -def getWeightStorageType(lineStr: str) -> str: - weight_storage_id = r"^ \* WEIGHT_STORAGE = ([a-zA-Z]+_\dD)" - matches = re.search(weight_storage_id, lineStr) - if matches is None: - raise AssertionError("matches is None in getWeightStorageType") - return matches.group(1) - - -def isBiasStorageTypeLine(lineStr: str) -> bool: - weight_storage_id = r"^ \* BIAS_STORAGE = " - return re.search(weight_storage_id, lineStr) is not None - - -def getBiasStorageType(lineStr: str) -> str: - weight_storage_id = r"^ \* BIAS_STORAGE = ([a-zA-Z]+_\dD)" - matches = re.search(weight_storage_id, lineStr) - if matches is None: - raise AssertionError("matches is None in getBiasStorageType") - return matches.group(1) - - -def isRegisterForLine(lineStr: str) -> bool: - # Check for Shader Name and a list of at least one Registry Key - register_for_id = ( - r"^ \* REGISTER_FOR = \('([A-Za-z0-9_]+)'\s*,\s*\['([A-Za-z0-9_]+)'.*\]\)" - ) - return re.search(register_for_id, lineStr) is not None - - -def findRegisterFor(lineStr: str) -> Tuple[str, List[str]]: - register_for_pattern = r"'([A-Za-z0-9_]+)'" - matches = re.findall(register_for_pattern, lineStr) - if matches is None: - raise AssertionError("matches is None in getBiasStorageType") - matches_list = list(matches) - return (matches_list[0], matches_list[1:]) - - -def isExtensionRequireLine(lineStr: str) -> bool: - extension_require_id = r"^#extension ([A-Za-z0-9_]+)\s*:\s*require" - return re.search(extension_require_id, lineStr) is not None - - -typeIdMapping = { - r"image[123]D\b": "VK_DESCRIPTOR_TYPE_STORAGE_IMAGE", - r"sampler[123]D\b": "VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER", - r"\bbuffer\b": "VK_DESCRIPTOR_TYPE_STORAGE_BUFFER", - r"\buniform\b": "VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER", -} - - -def determineDescriptorType(lineStr: str) -> str: - for identifier, typeNum in typeIdMapping.items(): - if re.search(identifier, lineStr): - return typeNum - raise AssertionError( - "No matching descriptor type for " + lineStr + " in determineDescriptorType" - ) - - -def getShaderInfo(srcFilePath: str) -> ShaderInfo: - shader_info = ShaderInfo([], [], "") - with open(srcFilePath) as srcFile: - for line in srcFile: - if isDescriptorLine(line): - shader_info.layouts.append(determineDescriptorType(line)) - if isTileSizeLine(line): - shader_info.tile_size = findTileSizes(line) - if isWeightStorageTypeLine(line): - shader_info.weight_storage_type = getWeightStorageType(line) - if isBiasStorageTypeLine(line): - shader_info.bias_storage_type = getBiasStorageType(line) - if isRegisterForLine(line): - shader_info.register_for = findRegisterFor(line) - if isExtensionRequireLine(line): - if "GL_EXT_shader_explicit_arithmetic_types_int16" in line: - shader_info.requires_shader_int16_ext = True - if "GL_EXT_shader_16bit_storage" in line: - shader_info.requires_16bit_storage_ext = True - if "GL_EXT_shader_8bit_storage" in line: - shader_info.requires_8bit_storage_ext = True - if "GL_EXT_integer_dot_product" in line: - shader_info.requires_integer_dot_product_ext = True - - return shader_info - - -########################## -# C++ File Generation # -######################### - -cpp_template = """ -#include -#include -#include - -using namespace vkcompute; - -namespace at {{ -namespace native {{ -namespace vulkan {{ - -namespace {{ - 
-{spv_bin_arrays} - -}} - -static void register_fn() {{ - -{register_shader_infos} - -{shader_info_registry} - -}} - -static const api::ShaderRegisterInit register_shaders(®ister_fn); - -}} -}} -}} - -""" - - -def generateSpvBinStr(spvPath: str, name: str) -> Tuple[int, str]: - with open(spvPath, "rb") as fr: - next_bin = array.array("I", fr.read()) - sizeBytes = 4 * len(next_bin) - spv_bin_str = "const uint32_t {}_bin[] = {{\n{}\n}};".format( - name, - textwrap.indent(",\n".join(str(x) for x in next_bin), " "), - ) - - return sizeBytes, spv_bin_str - - -def generateShaderInfoStr(shader_info: ShaderInfo, name: str, sizeBytes: int) -> str: - tile_size = ( - f"{{{', '.join(str(x) for x in shader_info.tile_size)}}}" - if (len(shader_info.tile_size) > 0) - else "{1, 1, 1}" - ) - - shader_info_layouts = "{{{}}}".format(",\n ".join(shader_info.layouts)) - - def to_cpp_str(val: bool): - return "true" if val else "false" - - shader_info_args = [ - f'"{name}"', - f"{name}_bin", - str(sizeBytes), - shader_info_layouts, - tile_size, - to_cpp_str(shader_info.requires_shader_int16_ext), - to_cpp_str(shader_info.requires_16bit_storage_ext), - to_cpp_str(shader_info.requires_8bit_storage_ext), - to_cpp_str(shader_info.requires_integer_dot_product_ext), - ] - - shader_info_str = textwrap.indent( - "api::shader_registry().register_shader(\n vkapi::ShaderInfo(\n{args}));\n".format( - args=textwrap.indent(",\n".join(shader_info_args), " "), - ), - " ", - ) - - return shader_info_str - - -def generateShaderDispatchStr(shader_info: ShaderInfo, name: str) -> str: - if shader_info.register_for is None: - return "" - - (op_name, registry_keys) = shader_info.register_for - shader_dispatch_str = "" - for registry_key in registry_keys: - shader_dispatch_str = textwrap.indent( - f'api::shader_registry().register_op_dispatch("{op_name}", api::DispatchKey::{registry_key.upper()}, "{name}");', - " ", - ) - - return shader_dispatch_str - - -def genCppFiles( - spv_files: Dict[str, str], cpp_header_path: str, cpp_src_file_path: str -) -> None: - spv_bin_strs = [] - register_shader_info_strs = [] - shader_registry_strs = [] - - for spvPath, srcPath in spv_files.items(): - if spvPath is None: - continue - - name = getName(spvPath).replace("_spv", "") - - sizeBytes, spv_bin_str = generateSpvBinStr(spvPath, name) - spv_bin_strs.append(spv_bin_str) - - shader_info = getShaderInfo(srcPath) - - register_shader_info_strs.append( - generateShaderInfoStr(shader_info, name, sizeBytes) - ) - - if shader_info.register_for is not None: - shader_registry_strs.append(generateShaderDispatchStr(shader_info, name)) - - spv_bin_arrays = "\n".join(spv_bin_strs) - register_shader_infos = "\n".join(register_shader_info_strs) - shader_info_registry = "\n".join(shader_registry_strs) - - cpp = cpp_template.format( - spv_bin_arrays=spv_bin_arrays, - register_shader_infos=register_shader_infos, - shader_info_registry=shader_info_registry, - ) - - with open(cpp_src_file_path, "w") as fw: - fw.write(cpp) - - -########## -# Main # -########## - - -def parse_arg_env(items: Dict[Any, Any]) -> Dict[Any, Any]: - d = {} - if items: - for item in items: - tokens = item.split("=") - key = tokens[0].strip() - value = tokens[1].strip() - d[key] = value - return d - - -def main(argv: List[str]) -> int: - parser = argparse.ArgumentParser(description="") - parser.add_argument( - "-i", - "--glsl-paths", - nargs="+", - help='List of paths to look for GLSL source files, separated by spaces. 
Ex: --glsl-paths "path1 path2 path3"', - default=["."], - ) - parser.add_argument("-c", "--glslc-path", required=True, help="") - parser.add_argument( - "-t", "--tmp-dir-path", required=True, help="/tmp/vulkan_shaders/" - ) - parser.add_argument("-o", "--output-path", required=True, help="") - parser.add_argument("-f", "--force-rebuild", action="store_true", default=False) - parser.add_argument("--replace-u16vecn", action="store_true", default=False) - parser.add_argument("--optimize_size", action="store_true", help="") - parser.add_argument("--optimize", action="store_true", help="") - parser.add_argument("--spv_debug", action="store_true", default=False) - parser.add_argument( - "--env", metavar="KEY=VALUE", nargs="*", help="Set a number of key-value pairs" - ) - options = parser.parse_args() - - env = DEFAULT_ENV - env.update(TYPE_MAPPINGS) - env.update(UTILITY_FNS) - - for key, value in parse_arg_env(options.env).items(): - env[key] = value - - if not os.path.exists(options.output_path): - os.makedirs(options.output_path) - - if not os.path.exists(options.tmp_dir_path): - os.makedirs(options.tmp_dir_path) - - glslc_flags = [] - if options.optimize_size: - glslc_flags.append("-Os") - elif options.optimize: - glslc_flags.append("-O") - - if options.spv_debug: - glslc_flags.append("-g") - - glslc_flags_str = " ".join(glslc_flags) - - shader_generator = SPVGenerator( - options.glsl_paths, - env, - options.glslc_path, - glslc_flags=glslc_flags_str, - replace_u16vecn=options.replace_u16vecn, - ) - output_spv_files = shader_generator.generateSPV( - options.output_path, options.tmp_dir_path, options.force_rebuild - ) - - genCppFiles( - output_spv_files, - f"{options.output_path}/{CPP_H_NAME}", - f"{options.output_path}/{CPP_SRC_NAME}", - ) - - return 0 - - -def invoke_main() -> None: - sys.exit(main(sys.argv)) - - -if __name__ == "__main__": - invoke_main() # pragma: no cover diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp deleted file mode 100644 index 6609298b0d8..00000000000 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ /dev/null @@ -1,1116 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
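To round out the codegen script, its final C++ embedding step (`generateSpvBinStr`) reads each compiled .spv file as 32-bit words and emits them as a C array. The standalone sketch below mirrors that idea; the helper name and the demo file are made up, and the two words written are simply the SPIR-V magic number and a version word.

```python
import array
import textwrap

def spv_to_c_array(spv_path, name):
    """Read a SPIR-V binary as 32-bit words and format a C array definition."""
    with open(spv_path, "rb") as f:
        words = array.array("I", f.read())
    body = textwrap.indent(",\n".join(str(w) for w in words), "  ")
    return 4 * len(words), f"const uint32_t {name}_bin[] = {{\n{body}\n}};"

# Write a tiny stand-in binary just to exercise the helper; real inputs are
# the .spv files produced by glslc.
with open("demo.spv", "wb") as f:
    f.write(array.array("I", [0x07230203, 0x00010300]).tobytes())

nbytes, c_src = spv_to_c_array("demo.spv", "demo")
print(nbytes)   # 8
print(c_src)
```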
- */ - -// @lint-ignore-every CLANGTIDY -// facebook-security-vulnerable-integer-sign-conversion - -#include - -#include - -#include - -namespace vkcompute { - -// -// VTensorPtr -// - -#define VALUE_PTR_CLASS_IMPL(classname, ctype, type_name) \ - classname::classname(ComputeGraph* const graph, const ValueRef idx) \ - : graph_(graph), ptr_(&(graph_->values_.at(idx).to##type_name())) { \ - graph_->values_in_use_++; \ - } \ - ctype* classname::operator->() const { \ - return ptr_; \ - } \ - ctype& classname::operator*() const { \ - return *ptr_; \ - } \ - classname::~classname() { \ - graph_->values_in_use_--; \ - } - -VALUE_PTR_CLASS_IMPL(vTensorPtr, api::vTensor, Tensor) -VALUE_PTR_CLASS_IMPL(TensorRefPtr, TensorRef, TensorRef) -VALUE_PTR_CLASS_IMPL(StagingPtr, api::StagingBuffer, Staging) -VALUE_PTR_CLASS_IMPL(IntListPtr, std::vector, IntList) -VALUE_PTR_CLASS_IMPL(DoubleListPtr, std::vector, DoubleList) -VALUE_PTR_CLASS_IMPL(BoolListPtr, std::vector, BoolList) -VALUE_PTR_CLASS_IMPL(ValueListPtr, std::vector, ValueList) -VALUE_PTR_CLASS_IMPL(SymIntPtr, SymInt, SymInt) - -#undef VALUE_PTR_CLASS_IMPL - -// -// TmpTensor -// - -TmpTensor::TmpTensor( - ComputeGraph* const graph_ptr, - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout) - : graph_p(graph_ptr), - sobj_idx(get_sobj_idx()), - vref(graph_p->add_tensor( - sizes, - dtype, - storage_type, - memory_layout, - sobj_idx)) {} - -TmpTensor::TmpTensor( - ComputeGraph* const graph_ptr, - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type) - : graph_p(graph_ptr), - sobj_idx(get_sobj_idx()), - vref(graph_p->add_tensor(sizes, dtype, storage_type, sobj_idx)) {} - -TmpTensor::TmpTensor( - ComputeGraph* const graph_ptr, - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::GPUMemoryLayout memory_layout) - : graph_p(graph_ptr), - sobj_idx(get_sobj_idx()), - vref(graph_p->add_tensor(sizes, dtype, memory_layout, sobj_idx)) {} - -TmpTensor::TmpTensor( - ComputeGraph* const graph_ptr, - const std::vector& sizes, - const vkapi::ScalarType dtype) - : graph_p(graph_ptr), - sobj_idx(get_sobj_idx()), - vref(graph_p->add_tensor(sizes, dtype, sobj_idx)) {} - -TmpTensor::~TmpTensor() { - // Lifetime of this temporary tensor is expired; return the shared object to - // the pool, as long as the sobj index is valid - if (sobj_idx >= 0) { - graph_p->tmp_shared_object_idxs_.emplace(sobj_idx); - } -} - -int64_t TmpTensor::get_sobj_idx() { - int64_t sobj_idx; - // If no available temporary shared objects, request a new one to be created - if (graph_p->tmp_shared_object_idxs_.empty()) { - sobj_idx = graph_p->shared_objects_.size(); - } else { - // Get the first available shared object idx - sobj_idx = graph_p->tmp_shared_object_idxs_.top(); - graph_p->tmp_shared_object_idxs_.pop(); - } - return sobj_idx; -} - -// -// ComputeGraph -// - -ComputeGraph::ComputeGraph(GraphConfig config) - : config_{config}, - prepack_descriptor_counts_{}, - execute_descriptor_counts_{}, - context_{new api::Context( - config.external_adapter ? 
config.external_adapter - : vkapi::runtime()->get_adapter_p(), - config_.context_config)}, - shared_objects_{}, - values_{}, - param_ubos_{}, - prepack_nodes_{}, - execute_nodes_{}, - inputs_{}, - outputs_{} { - // Ensure that descriptor counts are initialized to 0 - prepack_descriptor_counts_.descriptor_pool_max_sets = 0; - prepack_descriptor_counts_.descriptor_uniform_buffer_count = 0; - prepack_descriptor_counts_.descriptor_storage_buffer_count = 0; - prepack_descriptor_counts_.descriptor_combined_sampler_count = 0; - prepack_descriptor_counts_.descriptor_storage_image_count = 0; - - execute_descriptor_counts_.descriptor_pool_max_sets = 0; - execute_descriptor_counts_.descriptor_uniform_buffer_count = 0; - execute_descriptor_counts_.descriptor_storage_buffer_count = 0; - execute_descriptor_counts_.descriptor_combined_sampler_count = 0; - execute_descriptor_counts_.descriptor_storage_image_count = 0; - - // If certain graph config variables are not specified, then set them - // automatically. - if (config_.prepack_threshold_nbytes == 0) { - config_.prepack_threshold_nbytes = 10 * MB; - config_.prepack_initial_threshold_nbytes = 10 * MB; - } - if (config_.execute_threshold_node_count == 0) { - config_.execute_threshold_node_count = 128; - config_.execute_initial_threshold_node_count = 64; - } - - // Check if the underlying GPU can access accelerated integer dot product - // instructions - can_use_int8_dot_product_ = - context_->adapter_ptr()->supports_int8_dot_product(); -} - -ComputeGraph::~ComputeGraph() { - values_.clear(); - - prepack_nodes_.clear(); - execute_nodes_.clear(); - clear_deferred_cmds(); - - context_->flush(); -} - -std::vector ComputeGraph::extract_int_or_symint_list( - const ValueRef idx) { - const Value& val = values_.at(idx); - std::vector result; - - if (val.isIntList()) { - // If it's an IntList, return a copy of the list - return val.toConstIntList(); - } else if (val.isValueList()) { - // If it's a ValueList, extract each element as an Int or SymInt - const std::vector& value_list = val.toConstValueList(); - result.reserve(value_list.size()); - - for (const ValueRef& ref : value_list) { - const Value& element = values_.at(ref); - if (element.isInt()) { - result.push_back(element.toInt()); - } else if (element.isSymInt()) { - result.push_back(read_symint(ref)); - } else { - VK_THROW( - "ValueList element is neither Int nor SymInt, but has type ", - element.type()); - } - } - return result; - } - - VK_THROW( - "Cannot extract int or symint list from Value with type ", val.type()); -} - -utils::StorageType ComputeGraph::suggested_storage_type() { - if (config_.enable_storage_type_override) { - return config_.storage_type_override; - } - return utils::kTexture3D; -} - -bool ComputeGraph::was_value_updated(const ValueRef idx) const noexcept { - if (!is_valid_value_idx(idx)) { - return false; - } - - // Check if this ValueRef itself was updated - if (updated_values_.find(idx) != updated_values_.end()) { - return true; - } - - // If this is a ValueList, check each ValueRef in the list - if (val_is_value_list(idx)) { - const auto& value_list = values_.at(idx).toConstValueList(); - for (const auto& nested_idx : value_list) { - if (was_value_updated(nested_idx)) { - return true; - } - } - } - - return false; -} - -utils::GPUMemoryLayout ComputeGraph::suggested_memory_layout( - const std::vector& sizes) { - if (config_.enable_memory_layout_override) { - return config_.memory_layout_override; - } - if (sizes.size() < 3) { - return utils::kWidthPacked; - } - // For 3 
dimensional tensors that only have a channels dimension of 1, still - // prefer width packed. - if (utils::val_at(-3, sizes) == 1) { - return utils::kWidthPacked; - } - return utils::kChannelsPacked; -} - -bool ComputeGraph::device_name_contains(const char* substr) { - return context_->adapter_ptr()->device_name().find(substr) != - std::string::npos; -} - -void ComputeGraph::check_no_active_value_ptrs() { - VK_CHECK_COND( - values_in_use_ == 0, - "Make sure that there are no pointers stored from the return values of " - "`ComputeGraph::get_*()` functions in scope before adding Values to the " - "graph. Modifying the graph's values may cause existing pointers to be " - "invalidated."); -} - -bool ComputeGraph::is_valid_value_idx(const ValueRef idx) const noexcept { - return idx >= 0 && idx < static_cast(values_.size()); -} - -std::vector ComputeGraph::sizes_of(const ValueRef idx) const { - const Value& val = values_.at(idx); - if (val.isTensor()) { - return val.toConstTensor().sizes(); - } else if (val.isTensorRef()) { - return val.toConstTensorRef().sizes; - } - VK_THROW("Could not get sizes of value with type ", val.type()); -} - -int64_t ComputeGraph::dim_of(const ValueRef idx) const { - const Value& val = values_.at(idx); - if (val.isTensor()) { - return val.toConstTensor().dim(); - } else if (val.isTensorRef()) { - return val.toConstTensorRef().sizes.size(); - } - VK_THROW("Could not get dim of value with type ", val.type()); -} - -std::vector ComputeGraph::dim_order_of(const ValueRef idx) const { - const Value& val = values_.at(idx); - if (val.isTensor()) { - return val.toConstTensor().dim_order(); - } - VK_THROW("Could not get dim order of value with type ", val.type()); -} - -std::vector ComputeGraph::strides_of(const ValueRef idx) const { - const Value& val = values_.at(idx); - if (val.isTensor()) { - return val.toConstTensor().strides(); - } - VK_THROW("Could not get strides of value with type ", val.type()); -} - -vkapi::ScalarType ComputeGraph::dtype_of(const ValueRef idx) const { - const Value& val = values_.at(idx); - if (val.isTensor()) { - return val.toConstTensor().dtype(); - } else if (val.isTensorRef()) { - return val.toConstTensorRef().dtype; - } else if (val.isBool()) { - return vkapi::ScalarType::Bool; - } else if (val.isDouble()) { - // We downcast anyway in the shader and we want to avoid having to - // write special cases there. 
- return vkapi::ScalarType::Float; - } else if (val.isInt()) { - return vkapi::ScalarType::Int; - } - VK_THROW("Could not get dtype of value with type ", val.type()); -} - -bool ComputeGraph::is_contiguous_buffer_tensor(const ValueRef idx) const { - if (!val_is_tensor(idx)) { - return false; - } - if (!is_buffer_storage(idx)) { - return false; - } - return is_contiguous(idx); -} - -bool ComputeGraph::is_contiguous_texture_tensor(const ValueRef idx) const { - if (!val_is_tensor(idx)) { - return false; - } - if (is_buffer_storage(idx)) { - return false; - } - return has_standard_axis_map(idx) && packed_dim_of(idx) == 0; -} - -bool ComputeGraph::is_standard_channels_packed_texture_tensor( - const ValueRef idx) const { - if (!val_is_tensor(idx)) { - return false; - } - if (is_buffer_storage(idx)) { - return false; - } - return has_standard_axis_map(idx) && packed_dim_of(idx) == 2; -} - -bool ComputeGraph::is_2d_matrix(const ValueRef idx) const { - std::vector sizes = sizes_of(idx); - const size_t ndim = sizes.size(); - if (sizes.size() < 2) { - return false; - } - if (sizes.size() == 2) { - return true; - } - - // Check that outermost dims have size of 1 - for (int d = 0; d < ndim - 2; d++) { - if (sizes[d] != 1) { - return false; - } - } - - return true; -} - -bool ComputeGraph::is_vectorizable_contiguous_2d_matrix( - const ValueRef idx) const { - if (!is_2d_matrix(idx)) { - return false; - } - if (is_buffer_storage(idx)) { - return is_contiguous_buffer_tensor(idx) && - size_at(-1, idx) % 4 == 0; - } - return is_contiguous_texture_tensor(idx); -} - -bool ComputeGraph::is_vectorizable_width_packed_tensor( - const ValueRef idx) const { - // Not a tensor - return false - if (!val_is_tensor(idx)) { - return false; - } - if (is_buffer_storage(idx)) { - return is_contiguous_buffer_tensor(idx) && - size_at(-1, idx) % 4 == 0; - } - - return is_standard_channels_packed_texture_tensor(idx); -} - -ValueRef ComputeGraph::add_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout, - const int64_t shared_object_idx, - const utils::AxisMapLayout axis_map_layout) { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(api::vTensor( - context(), - sizes, - dtype, - storage_type, - memory_layout, - false, - axis_map_layout)); - - if (shared_object_idx >= 0) { - get_shared_object(shared_object_idx).add_user(this, idx); - } - return idx; -} - -ValueRef ComputeGraph::add_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type, - const int64_t shared_object_idx, - const utils::AxisMapLayout axis_map_layout) { - return add_tensor( - sizes, - dtype, - storage_type, - suggested_memory_layout(sizes), - shared_object_idx, - axis_map_layout); -} - -ValueRef ComputeGraph::add_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::GPUMemoryLayout memory_layout, - const int64_t shared_object_idx, - const utils::AxisMapLayout axis_map_layout) { - return add_tensor( - sizes, - dtype, - suggested_storage_type(), - memory_layout, - shared_object_idx, - axis_map_layout); -} - -ValueRef ComputeGraph::add_tensor_like( - const ValueRef idx, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout, - const utils::AxisMapLayout axis_map_layout) { - return add_tensor( - sizes_of(idx), - dtype_of(idx), - storage_type, - memory_layout, - -1, - axis_map_layout); -} - -ValueRef 
ComputeGraph::add_tensor_like( - const ValueRef idx, - const utils::GPUMemoryLayout memory_layout, - const utils::AxisMapLayout axis_map_layout) { - return add_tensor( - sizes_of(idx), - dtype_of(idx), - storage_type_of(idx), - memory_layout, - -1, - axis_map_layout); -} - -ValueRef ComputeGraph::add_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const int64_t shared_object_idx, - const utils::AxisMapLayout axis_map_layout) { - return add_tensor( - sizes, - dtype, - suggested_memory_layout(sizes), - shared_object_idx, - axis_map_layout); -} - -ValueRef ComputeGraph::add_tensor(const vkapi::VulkanImage& image) { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(api::vTensor(context(), image)); - return idx; -} - -ValueRef ComputeGraph::add_tensor_view(const ValueRef vref) { - const vTensorPtr t = get_tensor(vref); - ValueRef idx(static_cast(values_.size())); - values_.emplace_back(api::vTensor(*t)); - return idx; -} - -ValueRef ComputeGraph::add_tensor_view( - const ValueRef vref, - const std::vector& sizes, - const std::vector& strides) { - const vTensorPtr t = get_tensor(vref); - ValueRef idx(static_cast(values_.size())); - values_.emplace_back(api::vTensor(*t, sizes, strides)); - return idx; -} - -ValueRef ComputeGraph::add_tensorref( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const void* const data) { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(TensorRef(sizes, dtype, data)); - total_constant_nbytes_ += values_.back().toConstTensorRef().nbytes(); - return idx; -} - -ValueRef ComputeGraph::add_tensorref( - const std::vector& sizes, - const vkapi::ScalarType dtype, - executorch::runtime::FreeableBuffer&& buffer) { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(TensorRef(sizes, dtype, std::move(buffer))); - total_constant_nbytes_ += values_.back().toConstTensorRef().nbytes(); - return idx; -} - -ValueRef ComputeGraph::add_staging( - const vkapi::ScalarType dtype, - const size_t numel) { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(api::StagingBuffer(context(), dtype, numel)); - return idx; -} - -ValueRef ComputeGraph::add_none() { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(); - return idx; -} - -ValueRef ComputeGraph::add_value_list(std::vector&& value) { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(std::move(value)); - return idx; -} - -ValueRef ComputeGraph::add_string(std::string&& str) { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(std::move(str)); - return idx; -} - -ValueRef ComputeGraph::add_symint(const int32_t val) { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(SymInt(context(), val)); - return idx; -} - -ValueRef ComputeGraph::get_or_add_value_for_int(const int64_t val) { - for (int i = 0; i < values_.size(); ++i) { - if (values_.at(i).isInt() && values_.at(i).toInt() == val) { - return i; - } - } - return add_scalar(val); -} - -ValueRef ComputeGraph::set_input_tensor( - const ValueRef idx, - const bool use_staging) { - if (use_staging) { - vkapi::ScalarType dtype = get_tensor(idx)->dtype(); - // For texture storage, the buffer size needs to account for the zero - // padding applied by unused texel elements. 
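// Illustrative example (sizes assumed): roughly speaking, the packed (width)
// dimension is rounded up to a whole number of 4-wide texels, so a width
// packed {1, 3, 5, 5} texture tensor stages 1*3*5*8 = 120 elements even
// though numel() is only 75.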
- size_t buf_numel = get_tensor(idx)->staging_buffer_numel(); - ValueRef staging_idx = add_staging(dtype, buf_numel); - add_staging_to_tensor_node(*this, staging_idx, idx); - inputs_.push_back({idx, staging_idx}); - return staging_idx; - } - inputs_.push_back({idx, kDummyValueRef}); - return idx; -} - -ValueRef ComputeGraph::set_output_tensor( - const ValueRef idx, - const bool use_staging) { - if (use_staging) { - vkapi::ScalarType dtype = get_tensor(idx)->dtype(); - // For texture storage, the buffer size needs to account for the zero - // padding applied by unused texel elements. - size_t buf_numel = get_tensor(idx)->staging_buffer_numel(); - ValueRef staging_idx = add_staging(dtype, buf_numel); - // We only run this when the tensor is non-empty. When the underlying - // tensor is empty (e.g. padded_numel == 0), we do not allocate a VkImage to - // tensor, we will not be able to bind the node for execution. - if (buf_numel > 0) { - add_tensor_to_staging_node(*this, idx, staging_idx); - } - outputs_.push_back({idx, staging_idx}); - return staging_idx; - } - outputs_.push_back({idx, kDummyValueRef}); - return idx; -} - -ValueRef ComputeGraph::set_output_value(const ValueRef idx) { - if (values_.at(idx).isTensor()) { - return set_output_tensor(idx); - } - outputs_.push_back({idx, kDummyValueRef}); - return idx; -} - -vkapi::BufferBindInfo ComputeGraph::get_or_create_int_param_buffer( - const ValueRef idx) { - if (values_.at(idx).isInt()) { - const int32_t val = extract_scalar(idx); - return create_params_buffer(val); - } else if (values_.at(idx).isSymInt()) { - SymIntPtr symint = get_symint(idx); - return vkapi::BufferBindInfo(symint->gpu_buffer.buffer()); - } - VK_THROW("Cannot create a int param buffer for the given value"); -} - -vkapi::BufferBindInfo ComputeGraph::get_or_create_int_param_buffer( - const ValueRef idx, - const int32_t default_val) { - if (values_.at(idx).isNone()) { - return create_params_buffer(default_val); - } else { - return get_or_create_int_param_buffer(idx); - } -} - -void ComputeGraph::set_symint(const ValueRef idx, const int32_t val) { - int32_t cur_val = read_symint(idx); - if (cur_val != val) { - get_symint(idx)->set(val); - // Track that this ValueRef was updated - updated_values_.insert(idx); - } -} - -int32_t ComputeGraph::read_symint(const ValueRef idx) { - return get_symint(idx)->get(); -} - -SharedObject& ComputeGraph::get_shared_object(const int64_t idx) { - if (idx >= shared_objects_.size()) { - shared_objects_.resize(static_cast(idx + 1)); - } - return shared_objects_.at(idx); -} - -void ComputeGraph::create_dedicated_allocation_for(const ValueRef idx) { - vTensorPtr tensor = get_tensor(idx); - if (!tensor->memory_is_bound()) { - VmaAllocationCreateInfo alloc_create_info = - context()->adapter_ptr()->vma().gpuonly_resource_create_info(); - tensor->acquire_allocation( - context()->adapter_ptr()->vma().create_allocation( - tensor->get_memory_requirements(), alloc_create_info)); - } -} - -void ComputeGraph::update_descriptor_counts( - const vkapi::ShaderInfo& shader_info, - bool execute) { - vkapi::DescriptorPoolConfig* config = - execute ? 
&execute_descriptor_counts_ : &prepack_descriptor_counts_; - - config->descriptor_pool_max_sets += 1; - for (const VkDescriptorType arg_type : shader_info.kernel_layout) { - switch (arg_type) { - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - config->descriptor_uniform_buffer_count += 1; - break; - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - config->descriptor_storage_buffer_count += 1; - break; - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - config->descriptor_combined_sampler_count += 1; - break; - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: - config->descriptor_storage_image_count += 1; - break; - default: - VK_THROW("Unsupported descriptor type!"); - } - } -} - -void ComputeGraph::register_pipeline_to_create( - const vkapi::ShaderInfo& shader_info, - const utils::WorkgroupSize& local_workgroup_size, - const vkapi::SpecVarList& spec_vars, - const std::vector& push_constants) { - VkDescriptorSetLayout shader_layout = - context()->shader_layout_cache().retrieve(shader_info.kernel_layout); - - uint32_t pc_offset = 0; - std::array pc_data; - for (const auto& pc : push_constants) { - pc_offset += pc.write(pc_data.data(), pc_offset, kMaxPushConstantSize); - } - - vkapi::SpecVarList spec_constants = { - SV(local_workgroup_size[0u]), - SV(local_workgroup_size[1u]), - SV(local_workgroup_size[2u])}; - - spec_constants.append(spec_vars); - - const vkapi::ComputePipelineCache::Key desc = { - context()->pipeline_layout_cache().retrieve(shader_layout, pc_offset), - context()->shader_cache().retrieve(shader_info), - spec_constants}; - - if (context_->pipeline_cache().contains(desc)) { - return; - } - auto it = pipeline_descriptors_.find(desc); - if (it != pipeline_descriptors_.cend()) { - return; - } - pipeline_descriptors_.insert(desc); -} - -utils::uvec3 ComputeGraph::create_global_wg_size(const ValueRef idx) { - if (is_buffer_storage(idx)) { - return {uint32_t(numel_of(idx)), 1u, 1u}; - } - return logical_limits_of(idx); -} - -utils::uvec3 ComputeGraph::create_local_wg_size( - const utils::uvec3 global_wg_size) { - if (config_.enable_local_wg_size_override) { - return config_.local_wg_size_override; - } - - // array containing axis index and global workgroup size - std::pair global_wg_size_desc[] = { - {0u, global_wg_size[0]}, - {1u, global_wg_size[1]}, - {2u, global_wg_size[2]}}; - - // sort the global workgroup size in descending order - if (global_wg_size_desc[0].second < global_wg_size_desc[1].second) { - std::swap(global_wg_size_desc[0], global_wg_size_desc[1]); - } - if (global_wg_size_desc[1].second < global_wg_size_desc[2].second) { - std::swap(global_wg_size_desc[1], global_wg_size_desc[2]); - } - if (global_wg_size_desc[0].second < global_wg_size_desc[1].second) { - std::swap(global_wg_size_desc[0], global_wg_size_desc[1]); - } - - utils::uvec3 local_group_size = { - 8, - std::max(1u, std::min(4u, global_wg_size_desc[1].second)), - std::max(1u, std::min(2u, global_wg_size_desc[2].second))}; - - if (global_wg_size_desc[2u].second == 1) { - if (global_wg_size_desc[1u].second == 1) { - local_group_size[0u] = 64; - local_group_size[1u] = 1; - } else if (global_wg_size_desc[1u].second % 4 == 0) { - local_group_size[0u] = 16; - local_group_size[1u] = 4; - } else { - local_group_size[0u] = 32; - local_group_size[1u] = 2; - } - } - - return { - local_group_size[global_wg_size_desc[0].first], - local_group_size[global_wg_size_desc[1].first], - local_group_size[global_wg_size_desc[2].first]}; -} - -utils::uvec3 ComputeGraph::create_local_wg_size(const ValueRef idx) { - return 
create_local_wg_size(create_global_wg_size(idx)); -} - -void ComputeGraph::bind_tensor_to_descriptor_set( - const ValueRef ref, - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::MemoryAccessFlags access_type, - vkapi::DescriptorSet& descriptor_set, - const uint32_t idx) { - vTensorPtr tensor = get_tensor(ref); - if (tensor->buffer()) { - vkapi::VulkanBuffer& buffer = tensor->buffer( - pipeline_barrier, vkapi::PipelineStage::COMPUTE, access_type); - descriptor_set.bind(idx, buffer); - } else { - vkapi::VulkanImage& image = tensor->image( - pipeline_barrier, vkapi::PipelineStage::COMPUTE, access_type); - descriptor_set.bind(idx, image); - } -} - -void ComputeGraph::bind_value_to_descriptor_set( - const ValueRef ref, - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::MemoryAccessFlags access_type, - vkapi::DescriptorSet& descriptor_set, - const uint32_t idx) { - if (val_is_tensor(ref)) { - bind_tensor_to_descriptor_set( - ref, pipeline_barrier, access_type, descriptor_set, idx); - } else if (val_is_staging(ref)) { - descriptor_set.bind(idx, get_staging(ref)->buffer()); - } -} - -void ComputeGraph::copy_into_staging( - const ValueRef idx, - const void* data, - const size_t numel) { - StagingPtr staging = get_staging(idx); - size_t nbytes = numel * vkapi::element_size(staging->dtype()); - staging->copy_from(data, nbytes); -} - -void ComputeGraph::copy_from_staging( - const ValueRef idx, - void* data, - const size_t numel) { - StagingPtr staging = get_staging(idx); - size_t nbytes = numel * vkapi::element_size(staging->dtype()); - staging->copy_to(data, nbytes); -} - -void ComputeGraph::prepare() { -#define MERGE_FIELD(field) \ - static_cast(std::ceil( \ - std::max( \ - execute_descriptor_counts_.field, \ - prepack_descriptor_counts_.field) * \ - config_.descriptor_pool_safety_factor)) - - uint32_t max_sets = MERGE_FIELD(descriptor_pool_max_sets); - vkapi::DescriptorPoolConfig config{ - max_sets, - std::max(MERGE_FIELD(descriptor_uniform_buffer_count), max_sets), - std::max(MERGE_FIELD(descriptor_storage_buffer_count), max_sets), - std::max(MERGE_FIELD(descriptor_combined_sampler_count), max_sets), - std::max(MERGE_FIELD(descriptor_storage_image_count), max_sets), - 1u, - }; - - if (!context_->descriptor_pool()) { - context_->descriptor_pool().init(config); - } -#undef MERGE_FIELD - - if (config_.enable_querypool) { - context_->initialize_querypool(); - } - - // Calculate the threshold at which a new command buffer should be created - // during execute() - const size_t total_node_count = execute_nodes_.size(); - size_t init_threshold = config_.execute_initial_threshold_node_count; - size_t count_threshold = config_.execute_threshold_node_count; - - // If max command buffer count is set, we need to adjust the thresholds to - // accommodate execution within the limit, if total command buffers with - // current thresholds would exceed execute_max_cmds - if (config_.execute_max_cmds > 0) { - // Worse case scenario we have one command buffer for nodes before init - // threshold and config_.execute_max_cmds - 1 command buffers for the rest - // of dispatches - - // If command buffers created after offsetting init_threshold would exceed - // max command buffer count, we need to adjust init and count thresholds - const bool slicing_exceeds_max_cmds = (total_node_count - init_threshold) > - count_threshold * (config_.execute_max_cmds - 1); - if (total_node_count > init_threshold && slicing_exceeds_max_cmds) { - // Increase count threshold so remaining nodes after offsetting init fits - 
// in config_.execute_max_cmds - 1 - count_threshold = static_cast(ceil( - (total_node_count - init_threshold) / - double(config_.execute_max_cmds - 1))); - } - } - - execute_threshold_node_count_ = count_threshold; -} - -void ComputeGraph::prepare_pipelines() { - for (std::unique_ptr& node : prepack_nodes_) { - node->prepare_pipelines(this); - } - for (std::unique_ptr& node : execute_nodes_) { - node->prepare_pipelines(this); - } - context_->pipeline_cache().create_pipelines(pipeline_descriptors_); - - pipeline_descriptors_ = std::unordered_set< - vkapi::ComputePipelineCache::Key, - vkapi::ComputePipelineCache::Hasher>(); -} - -void ComputeGraph::submit_current_cmd(const bool final_use) { - context_->submit_cmd_to_gpu(VK_NULL_HANDLE, final_use); -} - -void ComputeGraph::submit_current_cmd_and_wait(const bool final_use) { - vkapi::VulkanFence fence = context_->fences().get_fence(); - context_->submit_cmd_to_gpu(fence.get_submit_handle(), final_use); - fence.wait(); - context_->fences().return_fence(fence); -} - -void ComputeGraph::submit_cmd(vkapi::CommandBuffer& cmd_buf, VkFence fence) { - if (cmd_buf) { - cmd_buf.end(); - context_->adapter_ptr()->submit_cmd( - context_->queue(), cmd_buf.get_submit_handle(false), fence); - } -} - -void ComputeGraph::submit_deferred_cmds_and_wait() { - vkapi::VulkanFence fence = context_->fences().get_fence(); - - for (uint32_t i = 0; i < deferred_cmd_list_.size(); i++) { - auto& cmd = deferred_cmd_list_[i]; - - submit_cmd( - cmd, - i == (deferred_cmd_list_.size() - 1) ? fence.get_submit_handle() - : VK_NULL_HANDLE); - } - fence.wait(); - context_->fences().return_fence(fence); -} - -void ComputeGraph::clear_deferred_cmds() { - for (auto& cmd : deferred_cmd_list_) { - if (cmd) { - cmd.end(); - cmd.invalidate(); - } - } - deferred_cmd_list_.clear(); -} - -void ComputeGraph::prepack() { - int i = 0; - bool submitted = false; - const bool reduce_peak_memory = total_constant_nbytes_ > 500 * MB; - // int count = 0; - context_->set_cmd(); - for (std::unique_ptr& node : prepack_nodes_) { - // Do not trigger on the first or last prepack node. - const bool not_terminal = i != 0 && i != (prepack_nodes_.size() - 1); - size_t threshold = submitted ? config_.prepack_threshold_nbytes - : config_.prepack_initial_threshold_nbytes; - if (not_terminal && staging_nbytes_in_cmd_ > threshold) { - // If reducing peak memory usage, wait for the current command buffer to - // finish executing and flush to recycle the staging memory. This will - // reduce peak memory usage, but will slightly increase load latency. - // Otherwise, just submit the current command buffer for execution and - // proceed. This results in lower load latency at the cost of higher peak - // memory usage. 
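// Illustrative numbers: when the thresholds are left unset, the constructor
// defaults both prepack_threshold_nbytes and prepack_initial_threshold_nbytes
// to 10 MB, so the first submission happens once roughly 10 MB of weight data
// has been staged and every further ~10 MB triggers another one.
// reduce_peak_memory is true only when the model's constants total more than
// 500 MB (see above).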
- if (reduce_peak_memory) { - submit_current_cmd_and_wait(); - context_->flush(); - } else { - submit_current_cmd(); - } - staging_nbytes_in_cmd_ = 0; - context_->set_cmd(); - submitted = true; - } - - node->encode(this); - i++; - } - submit_current_cmd_and_wait(/*final_use=*/true); - context_->flush(); - staging_nbytes_in_cmd_ = 0; - - // Initialize allocations for intermediate tensors - for (SharedObject& shared_object : shared_objects_) { - shared_object.allocate(this); - shared_object.bind_users(this); - } - // Make sure all remaining tensors have allocations - for (int i = 0; i < values_.size(); i++) { - if (values_.at(i).isTensor()) { - create_dedicated_allocation_for(i); - } - } -} - -void ComputeGraph::execute() { - if (deferred_cmd_list_.empty()) { - context_->flush(); - context_->set_cmd(/*reusable = */ true); - - context_->cmd_reset_querypool(); - const size_t total_node_count = execute_nodes_.size(); - uint32_t encoded_node_count = 0; - - for (std::unique_ptr& node : execute_nodes_) { - node->encode(this); - encoded_node_count++; - - // Threshold is reached when the node count reached - // execute_initial_threshold_node_count or if its a multiple of - // execute_threshold_node_count. - const bool reached_threshold = - encoded_node_count >= config_.execute_initial_threshold_node_count && - ((encoded_node_count - config_.execute_initial_threshold_node_count) % - execute_threshold_node_count_ == - 0); - - // Create a new command buffer when threashold is reached - // But avoid it if this is the last node, since last cmd buf is submitted - // after the loop - if (reached_threshold && encoded_node_count != total_node_count) { - context_->submit_cmd_to_gpu(VK_NULL_HANDLE, false); - deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd())); - context_->set_cmd(true); - } - } - - vkapi::VulkanFence fence = context_->fences().get_fence(); - context_->submit_cmd_to_gpu(fence.get_submit_handle(), false); - fence.wait(); - context_->fences().return_fence(fence); - deferred_cmd_list_.emplace_back(std::move(context_->extract_cmd())); - } else { - submit_deferred_cmds_and_wait(); - } - - execute_count_++; - - // Clear the set of updated values at the end of inference - updated_values_.clear(); - - // Reset the re-encoding flag at the end of inference - requires_reencode_ = false; -} - -void ComputeGraph::virtual_clone(const ValueRef dst, const ValueRef src) { - get_tensor(dst)->virtual_clone(*get_tensor(src)); -} - -void ComputeGraph::virtual_transpose( - const ValueRef tensor, - const int64_t dim0, - const int64_t dim1) { - get_tensor(tensor)->virtual_transpose(dim0, dim1); -} - -void ComputeGraph::resize_input( - const int64_t idx, - const std::vector& new_sizes) { - IOValueRef io_val = inputs_.at(idx); - virtual_resize(io_val.value, new_sizes); - updated_values_.insert(io_val.staging); -} - -void ComputeGraph::virtual_resize( - const ValueRef idx, - const std::vector& new_sizes) { - std::vector cur_sizes = sizes_of(idx); - if (cur_sizes != new_sizes) { - get_tensor(idx)->virtual_resize(new_sizes); - // Track that this ValueRef was updated - updated_values_.insert(idx); - } -} - -void ComputeGraph::propagate_resize() { - for (std::unique_ptr& node : execute_nodes_) { - node->trigger_resize(this); - } - // A command buffer re-encode will be needed if: - // 1. Any push constant data (used for tensor metadata) was updated - // 2. Compute shader dispatch parameters (i.e. 
compute shader, global and - // local work group sizes) were updated - if (requires_reencode_) { - clear_deferred_cmds(); - } -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h deleted file mode 100644 index 23b5517fd22..00000000000 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ /dev/null @@ -1,1099 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include -#include - -#include - -#include - -#include -#include - -#include -#include -#include -#include - -namespace vkcompute { - -// Define valid scalar types that the Value class can -// accept -template -struct is_valid_scalar_type : std::false_type {}; - -template <> -struct is_valid_scalar_type : std::true_type {}; - -template <> -struct is_valid_scalar_type : std::true_type {}; - -template <> -struct is_valid_scalar_type : std::true_type {}; - -// -// Guarded Pointer Classes -// - -class ComputeGraph; - -#define DECL_VALUE_PTR_CLASS(classname, ctype) \ - class classname final { \ - ComputeGraph* const graph_; \ - ctype* ptr_; \ - \ - public: \ - explicit classname(ComputeGraph* const graph, const ValueRef idx); \ - ctype* operator->() const; \ - ctype& operator*() const; \ - ~classname(); \ - }; - -DECL_VALUE_PTR_CLASS(vTensorPtr, api::vTensor) -DECL_VALUE_PTR_CLASS(TensorRefPtr, TensorRef) -DECL_VALUE_PTR_CLASS(StagingPtr, api::StagingBuffer) -DECL_VALUE_PTR_CLASS(IntListPtr, std::vector) -DECL_VALUE_PTR_CLASS(DoubleListPtr, std::vector) -DECL_VALUE_PTR_CLASS(BoolListPtr, std::vector) -DECL_VALUE_PTR_CLASS(ValueListPtr, std::vector) -DECL_VALUE_PTR_CLASS(SymIntPtr, SymInt); - -#undef DECL_VALUE_PTR_CLASS - -// -// TmpTensor -// - -/* - * This struct is used to recycle the memory of temporary tensors that are - * created during the execution of a node. Upon construction, this struct will - * check the `tmp_shared_object_idxs_` of the provided `ComputeGraph` instance - * if any shared objects are available; if not, then a new one is created. A - * tensor value is then added to the `ComputeGraph` instance with the requested - * specifications. Upon destruction, the shared object index of the temporary - * tensor is returned to `tmp_shared_object_idxs_`. - * - * Note that instances of this struct can be used as if they were `ValueRef` due - * to implementation of a custom casting operator. - * - * This class should only be used to create tensors whose lifetimes exist only - * in a well defined scope (i.e. within a function). 
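 *
 * A hypothetical usage sketch (the function and tensor names below are
 * illustrative assumptions, not taken from the operator library):
 *
 *   void add_example_node(ComputeGraph& graph, const ValueRef in) {
 *     // Reuses a pooled shared object index if one is available.
 *     TmpTensor scratch(&graph, graph.sizes_of(in), graph.dtype_of(in));
 *     // TmpTensor converts implicitly to ValueRef, so `scratch` can be
 *     // passed to anything expecting one. When it goes out of scope, its
 *     // shared object index is returned to the pool for reuse.
 *   }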
- */ -struct TmpTensor { - ComputeGraph* graph_p; - int64_t sobj_idx; - ValueRef vref; - - // - // Match all available overloads of `add_tensor` - // - - TmpTensor( - ComputeGraph* const graph_ptr, - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout); - - TmpTensor( - ComputeGraph* const graph_ptr, - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type); - - TmpTensor( - ComputeGraph* const graph_ptr, - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::GPUMemoryLayout memory_layout); - - TmpTensor( - ComputeGraph* const graph_ptr, - const std::vector& sizes, - const vkapi::ScalarType dtype); - - // No copy construction or assignment - TmpTensor(TmpTensor& other) = delete; - TmpTensor& operator=(TmpTensor& other) = delete; - - // No move construction or assignment - TmpTensor(TmpTensor&& other) = delete; - TmpTensor& operator=(TmpTensor&& other) = delete; - - // Custom cast to ValueRef - operator ValueRef() const { - return vref; - }; - - ~TmpTensor(); - - private: - // Helper function to get first available shared object index or request a new - // one to be created. - int64_t get_sobj_idx(); -}; - -// -// ComputeGraph -// - -/* - * This is the core data structure used to execute Vulkan models in graph mode. - * As opposed to ATen/eager mode where a command buffer is encoded every - * inference (since ops are executed with the model), in graph mode the ops that - * compose the model are intended to be parsed only once, upon which a command - * buffer will be encoded. Model inference will then execute the cached command - * buffer without needing to encode a new one. - */ -class ComputeGraph final { - public: - explicit ComputeGraph(GraphConfig config); - - ComputeGraph(ComputeGraph&&) = default; - ComputeGraph& operator=(ComputeGraph&&) = default; - - ~ComputeGraph(); - - private: - GraphConfig config_; - vkapi::DescriptorPoolConfig prepack_descriptor_counts_; - vkapi::DescriptorPoolConfig execute_descriptor_counts_; - - std::unique_ptr context_; - - std::vector shared_objects_; - // This stack is used by `TmpTensor` instances to recycle shared objects - // for temporary tensors. See the comments of `TmpTensor` for more details - std::stack tmp_shared_object_idxs_; - - std::vector values_; - std::vector param_ubos_; - - std::vector> prepack_nodes_; - std::vector> execute_nodes_; - - std::vector inputs_; - std::vector outputs_; - - std::unordered_set< - vkapi::ComputePipelineCache::Key, - vkapi::ComputePipelineCache::Hasher> - pipeline_descriptors_; - - // Utility constexpr to express byte quantities - constexpr static size_t MB = 1024 * 1024; - - // List of command buffers deferred for submission - std::vector deferred_cmd_list_; - - // Set to track which ValueRefs were updated during inference - std::unordered_set updated_values_; - - // Flag to indicate if re-encoding is required - bool requires_reencode_ = false; - - protected: - size_t values_in_use_ = 0; - size_t execute_count_ = 0; - - // Total number of bytes needed to store model weights - size_t total_constant_nbytes_ = 0; - - // Represents the amount of staging buffer data that will be copied if the - // current Context's command buffer is submitted now. - size_t staging_nbytes_in_cmd_ = 0; - - // Represents the nodes to wait before submitting commands. 
- // If command buffers created with config.execute_threshold_node_count exceeds - // config.execute_max_cmds, then execute_threshold_node_count will be - // increased to fit command buffers within the limit. Otherwise, - // execute_threshold_node_count will be set to - // config.execute_threshold_node_count. - size_t execute_threshold_node_count_ = 0; - - // Whether the underlying GPU support accelerated integer dot product - // extensions - bool can_use_int8_dot_product_ = false; - - public: - // - // Accessors - // - - inline api::Context* context() { - return context_.get(); - } - - inline std::vector& inputs() { - return inputs_; - } - - inline std::vector& outputs() { - return outputs_; - } - - inline std::vector>& prepack_nodes() { - return prepack_nodes_; - } - - inline std::vector>& execute_nodes() { - return execute_nodes_; - } - - inline GraphConfig& graphconfig() { - return config_; - } - - // Check if the ComputeGraph has a value at the specified index - bool is_valid_value_idx(const ValueRef idx) const noexcept; - - // - // Value Extraction - // - -#define GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(ptr_type, short_name, type_name) \ - inline ptr_type get_##short_name(const ValueRef idx) { \ - return ptr_type(this, idx); \ - } \ - inline bool val_is_##short_name(const ValueRef idx) const { \ - return values_.at(idx).is##type_name(); \ - } - - protected: - inline vTensorPtr get_tensor(const ValueRef idx) { - return vTensorPtr(this, idx); - } - - public: - inline bool val_is_tensor(const ValueRef idx) const { - return values_.at(idx).isTensor(); - } - - GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(TensorRefPtr, tref, TensorRef) - GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(StagingPtr, staging, Staging) - GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(IntListPtr, int_list, IntList) - GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(DoubleListPtr, double_list, DoubleList) - GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(BoolListPtr, bool_list, BoolList) - GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(ValueListPtr, value_list, ValueList) - GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(SymIntPtr, symint, SymInt); - -#undef GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS - -#define GET_AND_CHECK_VAL_AS_TYPE_FNS(ctype, short_name, type_name) \ - inline ctype get_##short_name(const ValueRef idx) { \ - return values_.at(idx).to##type_name(); \ - } \ - inline bool val_is_##short_name(const ValueRef idx) { \ - return values_.at(idx).is##type_name(); \ - } - - GET_AND_CHECK_VAL_AS_TYPE_FNS(int64_t, int, Int) - GET_AND_CHECK_VAL_AS_TYPE_FNS(double, double, Double) - GET_AND_CHECK_VAL_AS_TYPE_FNS(bool, bool, Bool) - GET_AND_CHECK_VAL_AS_TYPE_FNS(std::string, string, String) - -#undef GET_AND_CHECK_VAL_AS_TYPE_FNS - - inline bool val_is_none(const ValueRef idx) { - return idx == kDummyValueRef ? true : values_.at(idx).isNone(); - } - - inline bool val_is_not_none(const ValueRef idx) { - return !val_is_none(idx); - } - - inline TypeTag get_val_type(const ValueRef idx) { - return values_.at(idx).type(); - } - - // - // Tensor Properties Accessors - // - - std::vector sizes_of(const ValueRef idx) const; - - /* - * Returns the size of the tensor at `idx` along the specified dimension. - * Negative indexing is allowed. 
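 *
 * For example (illustrative sizes), for a tensor with sizes {1, 3, 224, 224}:
 *   size_at<int32_t>(-1, idx) == 224  // innermost (width) dim
 *   size_at<int32_t>(-3, idx) == 3    // channels dim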
- */ - template - T size_at(const int64_t dim, const ValueRef idx) const { - const Value& val = values_.at(idx); - if (val.isTensor()) { - return static_cast(utils::val_at(dim, val.toConstTensor().sizes())); - } else if (val.isTensorRef()) { - return static_cast(utils::val_at(dim, val.toConstTensorRef().sizes)); - } - VK_THROW("Could not get sizes of value with type ", val.type()); - } - - int64_t dim_of(const ValueRef idx) const; - - std::vector dim_order_of(const ValueRef idx) const; - - std::vector strides_of(const ValueRef idx) const; - - vkapi::ScalarType dtype_of(const ValueRef idx) const; - - inline const utils::ivec3& logical_limits_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().logical_limits(); - } - - inline int32_t numel_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().numel(); - } - - inline size_t staging_buffer_numel_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().staging_buffer_numel(); - } - - inline utils::StorageType storage_type_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().storage_type(); - } - - inline bool is_buffer_storage(const ValueRef idx) const { - return values_.at(idx).toConstTensor().has_buffer_storage(); - } - - inline bool is_texture_storage(const ValueRef idx) const { - return !is_buffer_storage(idx); - } - - /* - * Checks that the following is true: - * 1. The value at `idx` is a tensor - * 2. The tensor at `idx` has buffer storage - * 3. The buffer backed tensor at `idx` has a contiguous memory layout - */ - bool is_contiguous_buffer_tensor(const ValueRef idx) const; - - /* - * Checks that the following is true: - * 1. The value at `idx` is a tensor - * 2. The tensor at `idx` has texture storage - * 3. The texture backed tensor at `idx` has a standard axis mapping - * 4. The texture backed tensor at `idx` is width packed - */ - bool is_contiguous_texture_tensor(const ValueRef idx) const; - - /* - * Checks that the following is true: - * 1. The value at `idx` is a tensor - * 2. The tensor at `idx` has texture storage - * 3. The texture backed tensor at `idx` has a standard axis mapping - * 4. The texture backed tensor at `idx` is channels packed - */ - bool is_standard_channels_packed_texture_tensor(const ValueRef idx) const; - - /* - * Checks that the value at `idx` is either a 2D tensor, or if the tensor has - * more than 2 dims, the outermost dims have size of 1, i.e. can be squeezed - * to be a 2D tensor. - */ - bool is_2d_matrix(const ValueRef idx) const; - - /* - * Same as the above, but also requires that the tensor is a contiguous - * buffer with a width divisible by 4 or a standard width packed texture. - */ - bool is_vectorizable_contiguous_2d_matrix(const ValueRef idx) const; - - /* - * Checks that the following is true: - * 1. The value at `idx` is a tensor - * 2. The tensor at `idx` is width packed - * 3. 
The tensor at `idx` has a standard axis mapping or is a contiguous - * buffer - */ - bool is_vectorizable_width_packed_tensor(const ValueRef idx) const; - - inline bool val_is_view_of(const ValueRef maybe_view, const ValueRef base) - const { - return values_.at(maybe_view) - .toConstTensor() - .is_view_of(values_.at(base).toConstTensor()); - } - - inline utils::GPUMemoryLayout estimate_memory_layout_of( - const ValueRef idx) const { - return values_.at(idx).toConstTensor().estimate_memory_layout(); - } - - inline int32_t hashed_layout_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().hashed_layout(); - } - - inline int32_t packed_dim_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().packed_dim(); - } - - inline int32_t concat_dim_of(const ValueRef idx) const { - return values_.at(idx).toConstTensor().concat_dim(); - } - - inline vkapi::BufferBindInfo sizes_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().sizes_ubo(); - } - - inline vkapi::BufferBindInfo buffer_meta_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().buffer_meta_ubo(); - } - - inline vkapi::BufferBindInfo strides_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().strides_ubo(); - } - - inline vkapi::BufferBindInfo dim_order_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().dim_order_ubo(); - } - - inline vkapi::BufferBindInfo numel_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().numel_ubo(); - } - - inline bool has_standard_axis_map(const ValueRef idx) const { - return values_.at(idx).toTensor().has_standard_axis_map(); - } - - inline bool is_contiguous(const ValueRef idx) const { - return values_.at(idx).toTensor().is_contiguous(); - } - - inline vkapi::BufferBindInfo logical_limits_ubo(const ValueRef idx) { - return values_.at(idx).toTensor().logical_limits_ubo(); - } - - inline PushConstantDataInfo sizes_pc_of(const ValueRef idx) const { - PushConstantDataInfo pc_data = PushConstantDataInfo( - values_.at(idx).toConstTensor().get_uniform_data(), api::kTensorSizes); - pc_data.set_value(idx); - return pc_data; - } - - inline PushConstantDataInfo dim_order_pc_of(const ValueRef idx) const { - PushConstantDataInfo pc_data = PushConstantDataInfo( - values_.at(idx).toConstTensor().get_uniform_data(), - api::kTensorDimOrder); - pc_data.set_value(idx); - return pc_data; - } - - inline PushConstantDataInfo strides_pc_of(const ValueRef idx) const { - PushConstantDataInfo pc_data = PushConstantDataInfo( - values_.at(idx).toConstTensor().get_uniform_data(), - api::kTensorStrides); - pc_data.set_value(idx); - return pc_data; - } - - inline PushConstantDataInfo logical_limits_pc_of(const ValueRef idx) const { - PushConstantDataInfo pc_data = PushConstantDataInfo( - values_.at(idx).toConstTensor().get_uniform_data(), - api::kTensorLogicalLimits); - pc_data.set_value(idx); - return pc_data; - } - - inline PushConstantDataInfo numel_pc_of(const ValueRef idx) const { - PushConstantDataInfo pc_data = PushConstantDataInfo( - values_.at(idx).toConstTensor().get_uniform_data(), api::kTensorNumel); - pc_data.set_value(idx); - return pc_data; - } - - // - // Scalar Value Extraction - // - - bool is_scalar_or_none(const ValueRef idx) const { - const Value& value = values_.at(idx); - return value.isInt() || value.isDouble() || value.isBool() || - value.isNone(); - } - - template - T extract_scalar(const ValueRef idx) { - Value& value = values_.at(idx); - if (value.isInt()) { - return static_cast(value.toInt()); - } - if (value.isDouble()) { - 
return static_cast(value.toDouble()); - } - if (value.isBool()) { - return static_cast(value.toBool()); - } - VK_THROW("Cannot extract scalar from Value with type ", value.type()); - } - - template - T extract_scalar_or(const ValueRef idx, const T default_value) { - Value& value = values_.at(idx); - if (value.isNone()) { - return default_value; - } - return extract_scalar(idx); - } - - template - std::optional extract_optional_scalar(const ValueRef idx) { - if (val_is_none(idx)) { - return ::std::nullopt; - } else if (val_is_symint(idx)) { - return utils::safe_downcast(read_symint(idx)); - } else { - return extract_scalar(idx); - } - } - - template - T extract_optional_scalar(const ValueRef idx, const T default_val) { - if (val_is_none(idx)) { - return default_val; - } else if (val_is_symint(idx)) { - return utils::safe_downcast(read_symint(idx)); - } else { - return extract_scalar(idx); - } - } - - std::string extract_string(const ValueRef idx) { - return values_.at(idx).toString(); - } - - /* - * Utility function to extract a list of integers from a ValueRef. - * If the ValueRef is an IntList, returns a copy of the list. - * If the ValueRef is a ValueList, extracts each element as an Int or SymInt - * and returns the resulting list. - * Throws an error if the ValueRef is neither an IntList nor a ValueList. - */ - std::vector extract_int_or_symint_list(const ValueRef idx); - - template < - typename T, - typename std::enable_if< - std::is_integral::value && std::is_signed::value, - int>::type = 0> - T extract_whcn_dim(const ValueRef idx, const int64_t ndim) { - T dim = extract_scalar(idx); - // Normalize dim to account for negative indexing - dim = (dim % ndim + ndim) % ndim; - // Assume original value is NCHW ordering, obtain the WHCN ordering - return ndim - 1 - dim; - } - - // - // Utility functions - // - - /* - * Returns a suggested storage type (i.e. buffer or texture) that can be used - * to construct `api::vTensor`s. The storage type is typically determined by - * the GPU reported by the Vulkan context, unless a storage type override is - * defined in the graph configuration. Some GPU architectures work better with - * buffer storage, and others with texture storage. Current only texture - * storage is supported. - */ - utils::StorageType suggested_storage_type(); - - /* - * Returns a suggested memory layout (i.e. channels, width, or height packed) - * that can be used to construct `api::vTensor`s. The memory layout impacts - * which dimension will be treated as the vectorized dimension. For texture - * storage, elements along the vectorized dimension are packed into texels. - * The suggested memory layout is determined based on the sizes of the tensor, - * unless a memory layout override is defined in the graph configuration. - */ - utils::GPUMemoryLayout suggested_memory_layout( - const std::vector& sizes); - - inline bool device_is_adreno() { - return context_->adapter_ptr()->device_type() == vkapi::DeviceType::ADRENO; - } - const std::string& device_name() { - return context()->adapter_ptr()->device_name(); - } - - bool device_name_contains(const char* substr); - - // - // Graph Building - // - - private: - void check_no_active_value_ptrs(); - - public: - /* - * Add a `api::vTensor` value to the graph with the specified properties. - * There are various convenience overloads of this function that may be used - * instead. 
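 *
 * e.g. (hypothetical sizes and settings):
 *   ValueRef t = graph.add_tensor(
 *       {1, 64, 32, 32}, vkapi::ScalarType::Float,
 *       utils::kTexture3D, utils::kChannelsPacked);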
- */ - ValueRef add_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout, - const int64_t shared_object_idx = -1, - const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap); - - /* - * Add a `api::vTensor` value to the graph with the specified properties. The - * suggested memory layout will be used to construct the `api::vTensor`. - */ - ValueRef add_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type, - const int64_t shared_object_idx = -1, - const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap); - - /* - * Add a `api::vTensor` value to the graph with the specified properties. The - * suggested storage type will be used to construct the `api::vTensor`. - */ - ValueRef add_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::GPUMemoryLayout memory_layout, - const int64_t shared_object_idx = -1, - const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap); - - /* - * Add a `api::vTensor` value to the graph with the specified properties. The - * suggested storage type and memory layout will be used to construct the - * `api::vTensor`. - */ - ValueRef add_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const int64_t shared_object_idx = -1, - const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap); - - /* - * Add a `api::vTensor` value to the graph with the specified image. - */ - ValueRef add_tensor(const vkapi::VulkanImage& image); - - /* - * Add a `api::vTensor` value to the graph with the properties of `vref`. - */ - ValueRef add_tensor_like( - const ValueRef vref, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout, - const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap); - - /* - * Add a `api::vTensor` value to the graph with the properties of `vref`. The - * suggested storage type will be used to construct the `api::vTensor`. - */ - ValueRef add_tensor_like( - const ValueRef vref, - const utils::GPUMemoryLayout memory_layout, - const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap); - - /* - * Use the copy constructor of `api::vTensor` to create a "view" of the - * `vTensor` value at `vref`. See the copy constructor of `api::vTensor` for - * more details. - */ - ValueRef add_tensor_view(const ValueRef vref); - - /* - * Use the copy constructor of `api::vTensor` to create a "view" of the - * `vTensor` value at `vref` with different sizes and dim order. See the copy - * constructor of `api::vTensor` for more details. - */ - ValueRef add_tensor_view( - const ValueRef vref, - const std::vector& sizes, - const std::vector& dim_order); - - /* - * Add a `TensorRef` value to the graph with the specific properties. A - * `TensorRef` is a reference to a `api::vTensor` whose data is stored in an - * external CPU buffer. - */ - ValueRef add_tensorref( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const void* const data); - - /* - * Add a `TensorRef` value to the graph with the specific properties. A - * `TensorRef` is a reference to a `api::vTensor` whose data is stored in a - * FreeableBuffer. The TensorRef will take ownership of the FreeableBuffer. - */ - ValueRef add_tensorref( - const std::vector& sizes, - const vkapi::ScalarType dtype, - executorch::runtime::FreeableBuffer&& buffer); - - /* - * Add a staging buffer to the graph. 
Staging buffers are data buffers that - * use memory that is visible to both the CPU and GPU, and therefore is used - * as a intermediary when transferring data between the CPU and GPU. - */ - ValueRef add_staging(const vkapi::ScalarType dtype, const size_t numel); - - ValueRef add_none(); - - template - typename std::enable_if::value, ValueRef>::type - add_scalar(T value); - - template - typename std::enable_if::value, ValueRef>::type - add_scalar_list(std::vector&& value); - - ValueRef add_value_list(std::vector&& value); - - ValueRef add_string(std::string&& str); - - ValueRef add_symint(const int32_t val); - - /* - * Searches the graph's value list for a Int value with the specified value. - * If one is found, returns the index of the value. Otherwise, add a new value - * and return the index of the new value. - */ - ValueRef get_or_add_value_for_int(const int64_t val); - - ValueRef set_input_tensor(const ValueRef idx, const bool use_staging = true); - ValueRef set_output_tensor(const ValueRef idx, const bool use_staging = true); - - ValueRef set_output_value(const ValueRef idx); - - template - vkapi::BufferBindInfo create_params_buffer(const Block& data) { - param_ubos_.emplace_back(api::ParamsBuffer(context_.get(), data)); - return vkapi::BufferBindInfo(param_ubos_.back().buffer()); - } - - /* - * Given a ValueRef, do the following depending on the type of the Value: - * - If it is a SymInt, return the BufferBindInfo of the ParamsBuffer object - * backing the SymInt. - * - If it is a regular Int, create a new ParamsBuffer using the integer value - * and return the BufferBindInfo of the created ParamsBuffer. - */ - vkapi::BufferBindInfo get_or_create_int_param_buffer(const ValueRef idx); - - vkapi::BufferBindInfo get_or_create_int_param_buffer( - const ValueRef idx, - const int32_t default_value); - - void set_symint(const ValueRef idx, const int32_t val); - - int32_t read_symint(const ValueRef idx); - - inline void set_val_as_input(const ValueRef idx) { - inputs_.push_back({idx, kDummyValueRef}); - } - - inline void set_val_as_output(const ValueRef idx) { - outputs_.push_back({idx, kDummyValueRef}); - } - - /* - * Convenience function to add an input tensor along with its staging buffer - */ - inline IOValueRef add_input_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const int64_t shared_object_idx = -1) { - ValueRef t = add_tensor(sizes, dtype, shared_object_idx); - ValueRef staging = set_input_tensor(t); - return {t, staging}; - } - - /* - * Convenience function to add an input tensor with a specific memory layout - * along with its staging buffer - */ - inline IOValueRef add_input_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::GPUMemoryLayout memory_layout, - const int64_t shared_object_idx = -1) { - ValueRef t = add_tensor(sizes, dtype, memory_layout, shared_object_idx); - ValueRef staging = set_input_tensor(t); - return {t, staging}; - } - - /* - * Convenience function to add an input tensor with a specific storage type - * along with its staging buffer - */ - inline IOValueRef add_input_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type, - const int64_t shared_object_idx = -1) { - ValueRef t = add_tensor(sizes, dtype, storage_type, shared_object_idx); - ValueRef staging = set_input_tensor(t); - return {t, staging}; - } - - /* - * Add an input tensor with the specified properties along with its staging - * buffer. 
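 *
 * A hypothetical usage sketch (the sizes and dtype are assumed values):
 *
 *   IOValueRef in = graph.add_input_tensor(
 *       {1, 3, 224, 224}, vkapi::ScalarType::Float,
 *       utils::kTexture3D, utils::kWidthPacked);
 *   // in.value is the GPU tensor; in.staging is the CPU-visible staging
 *   // buffer that copy_into_staging() fills before execute().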
- */ - inline IOValueRef add_input_tensor( - const std::vector& sizes, - const vkapi::ScalarType dtype, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout, - const int64_t shared_object_idx = -1) { - ValueRef t = add_tensor( - sizes, dtype, storage_type, memory_layout, shared_object_idx); - ValueRef staging = set_input_tensor(t); - return {t, staging}; - } - - SharedObject& get_shared_object(const int64_t idx); - - /* - * Creates a dedicated memory allocation for a vTensor value, and have the - * tensor acquire the allocation object. If the tensor is already bound to a - * memory allocation, this function will be a no-op. - */ - void create_dedicated_allocation_for(const ValueRef idx); - - // - // Graph Preparation - // - - void update_descriptor_counts( - const vkapi::ShaderInfo& shader_info, - bool execute); - - void register_pipeline_to_create( - const vkapi::ShaderInfo& shader_info, - const utils::WorkgroupSize& local_workgroup_size, - const vkapi::SpecVarList& spec_vars, - const std::vector& push_constants); - - void prepare(); - - void prepare_pipelines(); - - // - // Dispatch Utilities - // - - /* - * Create a global workgroup size for a given `api::vTensor` value assuming - * that every shader invocation calculates one texel element of the output - * tensor. - * - * For tensors that use texture storage, the image extents of the - * `api::vTensor` will be used to set the global workgroup size. - * - * For tensor that use buffer storage, the number of texels in the texel - * buffer will be used to set the x component of the global workgroup size. - * All other components will be set to 1 (i.e. {ntexels, 1, 1} will be - * returned). - */ - utils::uvec3 create_global_wg_size(const ValueRef idx); - - /* - * Suggest a local workgroup size for a given global workgroup size. - * - * The local workgroup size will be formed to try and minimize the number of - * inactive invocations. - * - * Currently, the local workgroup size is hard-coded to contain a total of 64 - * shader invocations. In the future, this value can be configured. - */ - utils::uvec3 create_local_wg_size(const utils::uvec3 global_wg_size); - - /* - * Convenience function to suggest a local workgroup size for a given - * `api::vTensor` value, assuming that every shader invocation calculates one - * texel element of the output tensor. - */ - utils::uvec3 create_local_wg_size(const ValueRef idx); - - void bind_tensor_to_descriptor_set( - const ValueRef ref, - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::MemoryAccessFlags accessType, - vkapi::DescriptorSet& descriptor_set, - const uint32_t idx); - - void bind_value_to_descriptor_set( - const ValueRef ref, - vkapi::PipelineBarrier& pipeline_barrier, - const vkapi::MemoryAccessFlags access_type, - vkapi::DescriptorSet& descriptor_set, - const uint32_t idx); - - // - // Input/Output - // - - void - copy_into_staging(const ValueRef idx, const void* data, const size_t numel); - void copy_from_staging(const ValueRef idx, void* data, const size_t numel); - - protected: - // Command Buffer Management - - /* - * Submits the current command buffer in the Context to the GPU for execution. - */ - void submit_current_cmd(const bool final_use = false); - - /* - * Submits the current command buffer in the Context to the GPU for execution, - * and wait for it to complete before returning. - */ - void submit_current_cmd_and_wait(const bool final_use = false); - - /* - * Submit one command buffer to the GPU. 
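 * Passing VK_NULL_HANDLE as `fence` submits without anything to wait on;
 * submit_deferred_cmds_and_wait() attaches a fence only to the last deferred
 * command buffer for this reason.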
- */ - void submit_cmd(vkapi::CommandBuffer& cmd_buf, VkFence fence); - - /* - * Submits all the commands gathered in deferred_cmd_bufs_ to the GPU. - */ - void submit_deferred_cmds_and_wait(); - - /* - * Ends and invalidates all deferred commands. - */ - void clear_deferred_cmds(); - - public: - // - // Graph Prepacking - // - - inline void update_staging_nbytes_in_cmd(const size_t staging_bytes) { - staging_nbytes_in_cmd_ += staging_bytes; - } - - /* - * Executes prepacking operations to transfer model weight data from the CPU - * to GPU. - */ - void prepack(); - - // - // Graph Execution - // - - void execute(); - - // - // Tensor View - // - - void virtual_clone(const ValueRef dst, const ValueRef src); - - void virtual_transpose( - const ValueRef tensor, - const int64_t dim0, - const int64_t dim1); - - // - // Dynamic Shape support - // - - void resize_input(const int64_t idx, const std::vector& new_sizes); - - void virtual_resize( - const ValueRef idx, - const std::vector& new_sizes); - - void propagate_resize(); - - // Check if a specific ValueRef (or ValueList) was updated, with recursive - // handling - bool was_value_updated(const ValueRef idx) const noexcept; - - // Set the flag to indicate that re-encoding is required - inline void set_requires_reencode() noexcept { - requires_reencode_ = true; - } - - // - // Miscellaneous Utilities - // - - inline bool int16_shader_types_enabled() const { - return context_->adapter_ptr()->supports_int16_shader_types(); - } - - inline size_t execute_count() const { - return execute_count_; - } - - inline bool can_use_int8_dot_product() const { - return can_use_int8_dot_product_; - } - - /* - * Check whether the GPU supports 8 bit buffers. - */ - inline bool int8_buffers_enabled() const { - return context_->adapter_ptr()->has_full_int8_buffers_support(); - } - - // - // Debug support (implemented in Logging.cpp) - // - - void print_readable(); - - // - // Friend classes - // - - friend class vTensorPtr; - friend class TensorRefPtr; - friend class StagingPtr; - friend class IntListPtr; - friend class DoubleListPtr; - friend class BoolListPtr; - friend class ValueListPtr; - friend class SymIntPtr; - - friend struct TmpTensor; - friend struct SharedObject; - friend class BlitNode; -}; - -template -inline typename std::enable_if::value, ValueRef>::type -ComputeGraph::add_scalar(T value) { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(value); - return idx; -} - -template -inline typename std::enable_if::value, ValueRef>::type -ComputeGraph::add_scalar_list(std::vector&& value) { - ValueRef idx(static_cast(values_.size())); - check_no_active_value_ptrs(); - values_.emplace_back(std::move(value)); - return idx; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/GraphConfig.cpp b/backends/vulkan/runtime/graph/GraphConfig.cpp deleted file mode 100644 index da5efbf8342..00000000000 --- a/backends/vulkan/runtime/graph/GraphConfig.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -namespace vkcompute { - -GraphConfig::GraphConfig() { - // No automatic submissions - const uint32_t cmd_submit_frequency = UINT32_MAX; - - // Only one command buffer will be encoded at a time - const vkapi::CommandPoolConfig cmd_config{ - 1u, // cmd_pool_initial_size - 1u, // cmd_pool_batch_size - }; - - // Use lazy descriptor pool initialization by default; the graph runtime will - // tally up the number of descriptor sets needed while building the graph and - // trigger descriptor pool initialization with exact sizes before encoding the - // command buffer. - const vkapi::DescriptorPoolConfig descriptor_pool_config{ - 0u, // descriptor_pool_max_sets - 0u, // descriptor_uniform_buffer_count - 0u, // descriptor_storage_buffer_count - 0u, // descriptor_combined_sampler_count - 0u, // descriptor_storage_image_count - 0u, // descriptor_pile_sizes - }; - - const vkapi::QueryPoolConfig query_pool_config{}; - - context_config = { - cmd_submit_frequency, - cmd_config, - descriptor_pool_config, - query_pool_config, - }; - - // Empirically selected safety factor. If descriptor pools start running out - // of memory, increase this safety factor. - descriptor_pool_safety_factor = 1.25; - - // For now, force kTexture3D storage as we are still developing shader support - // for buffer storage type. - enable_storage_type_override = true; - storage_type_override = utils::kTexture3D; - - // For now, force kWidthPacked memory layout by default as we are still - // developing support for other memory layouts. In the future memory layout - // settings will be serialized as part of the graph. - enable_memory_layout_override = true; - memory_layout_override = utils::kWidthPacked; - - // QueryPool objects are used to measure execution times of individual shader - // dispatches. By default, this functionality is disabled. - enable_querypool = false; - - enable_local_wg_size_override = false; - local_wg_size_override = {}; - - expect_dynamic_shapes = false; - - external_adapter = nullptr; -} - -void GraphConfig::set_storage_type_override(utils::StorageType storage_type) { - enable_storage_type_override = true; - storage_type_override = storage_type; -} - -void GraphConfig::set_memory_layout_override( - utils::GPUMemoryLayout memory_layout) { - enable_memory_layout_override = true; - memory_layout_override = memory_layout; -} - -void GraphConfig::set_local_wg_size_override( - const utils::uvec3& local_wg_size) { - enable_local_wg_size_override = true; - local_wg_size_override = local_wg_size; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/GraphConfig.h b/backends/vulkan/runtime/graph/GraphConfig.h deleted file mode 100644 index aa5cd8f8c4e..00000000000 --- a/backends/vulkan/runtime/graph/GraphConfig.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace vkcompute { - -struct GraphConfig final { - api::ContextConfig context_config; - - // Creating a descriptor pool with exactly the number of descriptors tallied - // by iterating through the shader layouts of shaders used in the graph risks - // the descriptor pool running out of memory, therefore apply a safety factor - // to descriptor counts when creating the descriptor pool to mitigate this - // risk. 
- float descriptor_pool_safety_factor; - - bool enable_storage_type_override; - utils::StorageType storage_type_override; - - bool enable_memory_layout_override; - utils::GPUMemoryLayout memory_layout_override; - - bool enable_querypool; - - bool enable_local_wg_size_override; - utils::uvec3 local_wg_size_override; - - // Whether or not the ComputeGraph should expect input shapes to be dynamic - bool expect_dynamic_shapes; - - // Execution properties that determine specifics re: how command buffer - // submission is handled, etc. 0 means this field is not set. - - // During prepacking, once this threshold is reached, submit the current - // command buffer for execution. This allows the work to be distributed over - // multiple command buffer submissions, which can improve model load - // performance and prevent crashes when loading large models. - size_t prepack_threshold_nbytes = 0; - // Threshold used for the first command buffer submission during prepacking. - // This can be set to be lower than prepack_submission_threshold_nbytes to - // submit a command buffer for execution earlier which can improve performance - // by taking more advantage of parallelism between the CPU and GPU. - size_t prepack_initial_threshold_nbytes = 0; - - // During execute, once this node count is reached, submit the current - // command buffer for execution. This allows the work to be distributed over - // multiple command buffer submissions, which can improve execution - // performance. - size_t execute_threshold_node_count = 0; - // Execute node count used for the first command buffer submission during - // execute. This can be set to be lower than execute_threshold_nbytes to - // submit a command buffer for execution earlier which can improve performance - // by taking more advantage of parallelism between the CPU and GPU. - size_t execute_initial_threshold_node_count = 0; - - // If this number is greater than 0 then, during execute create at most this - // many command buffers. - size_t execute_max_cmds = 0; - - vkapi::Adapter* external_adapter; - - // Generate a default graph config with pre-configured settings - explicit GraphConfig(); - - void set_storage_type_override(utils::StorageType storage_type); - void set_memory_layout_override(utils::GPUMemoryLayout memory_layout); - void set_local_wg_size_override(const utils::uvec3& local_wg_size); -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/Logging.cpp b/backends/vulkan/runtime/graph/Logging.cpp deleted file mode 100644 index 081083e3a63..00000000000 --- a/backends/vulkan/runtime/graph/Logging.cpp +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
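As a rough sketch of how the fields and setters above fit together, the function below builds a `GraphConfig`, overrides the storage type and memory layout, and opts into split command buffer submissions during prepacking. The include path and the specific threshold values are assumptions chosen for illustration, not recommended defaults.

```cpp
#include <executorch/backends/vulkan/runtime/graph/GraphConfig.h>  // assumed path

vkcompute::GraphConfig make_config() {
  vkcompute::GraphConfig config;  // starts from the defaults set in GraphConfig.cpp

  // Force buffer storage and a specific memory layout instead of the defaults.
  config.set_storage_type_override(vkcompute::utils::kBuffer);
  config.set_memory_layout_override(vkcompute::utils::kWidthPacked);

  // Split prepacking into multiple command buffer submissions: submit after
  // roughly 64 MB of staging data, with an earlier first submission at 16 MB.
  config.prepack_threshold_nbytes = 64u * 1024u * 1024u;
  config.prepack_initial_threshold_nbytes = 16u * 1024u * 1024u;

  // Expect dynamic input shapes and enable per-dispatch timing queries.
  config.expect_dynamic_shapes = true;
  config.enable_querypool = true;

  return config;
}
```

A lower initial threshold trades a little submission overhead for earlier CPU/GPU overlap, which is the rationale stated in the field comments above.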
- */ - -#include - -#include - -#include -#include -#include -#include - -namespace vkcompute { - -void ComputeGraph::print_readable() { - std::set input_set; - for (const IOValueRef& io_val : inputs()) { - input_set.insert(io_val.value); - } - - std::set output_set; - for (const IOValueRef& io_val : outputs()) { - output_set.insert(io_val.value); - } - - std::set prepack_set; - for (const std::unique_ptr& node : prepack_nodes()) { - prepack_set.insert(node->tref_); - prepack_set.insert(node->packed_); - } - - std::map value_ref_to_shared_object_idx; - - std::cout << "====================" << std::left << std::setfill('=') - << std::setw(40) << " Shared Object List " << std::right - << std::setfill(' ') << std::endl; - - std::cout << std::setw(6) << "idx" << std::setw(20) << "sizes" - << std::setw(24) << "users" << std::endl; - - size_t so_idx = 0; - for (const SharedObject& shared_object : shared_objects_) { - std::cout << std::setw(6) << so_idx; - { - std::stringstream ss; - ss << shared_object.aggregate_memory_requirements.size; - std::cout << std::setw(20) << ss.str(); - } - - { - std::stringstream ss; - ss << shared_object.users; - std::cout << std::setw(24) << ss.str(); - } - std::cout << std::endl; - - for (const ValueRef& user : shared_object.users) { - value_ref_to_shared_object_idx[user] = so_idx; - } - - so_idx++; - } - - std::cout << "====================" << std::left << std::setfill('=') - << std::setw(40) << " Value List " << std::right - << std::setfill(' ') << std::endl; - - std::cout << std::setw(6) << "idx" << std::setw(10) << "type" << std::setw(20) - << "sizes" << std::setw(10) << "node_type" << std::setw(15) - << "storage_bytes" << std::setw(10) << "so_idx" << std::endl; - - size_t value_idx = 0; - for (Value& val : values_) { - std::cout << std::setw(6) << value_idx << std::setw(10) << val.type(); - - // sizes - std::cout << std::setw(20); - if (val.isTensor()) { - const api::vTensor& v_tensor = val.toTensor(); - std::stringstream ss; - ss << v_tensor.sizes(); - std::cout << ss.str(); - } else if (val.isTensorRef()) { - const TensorRef& tensor_ref = val.toTensorRef(); - std::stringstream ss; - ss << tensor_ref.sizes; - std::cout << ss.str(); - } else { - std::cout << ""; - } - - // Node type - std::cout << std::setw(10); - { - if (input_set.count(value_idx) > 0) { - std::cout << "INPUT"; - } else if (output_set.count(value_idx) > 0) { - std::cout << "OUTPUT"; - } else if (prepack_set.count(value_idx) > 0) { - std::cout << "PREPACK"; - } else { - std::cout << ""; - } - } - - // Actual storage bytes used - std::cout << std::setw(15); - if (val.isTensor()) { - const api::vTensor& v_tensor = val.toTensor(); - auto memory_reqs = v_tensor.get_memory_requirements(); - std::cout << memory_reqs.size; - } else { - std::cout << ""; - } - - std::cout << std::setw(10); - if (value_ref_to_shared_object_idx.count(value_idx) > 0) { - size_t shared_obj_idx = value_ref_to_shared_object_idx.at(value_idx); - std::cout << shared_obj_idx; - } else { - std::cout << ""; - } - - std::cout << std::endl; - value_idx++; - } - - std::cout << "====================" << std::left << std::setfill('=') - << std::setw(40) << " Prepack Node List " << std::right - << std::setfill(' ') << std::endl; - std::cout << std::setw(6) << "idx" << std::setw(32) << "shader_name" - << std::setw(8) << "tref" << std::setw(8) << "packed" << std::endl; - - size_t prepack_node_idx = 0; - for (const std::unique_ptr& node : prepack_nodes()) { - std::cout << std::setw(6) << prepack_node_idx << std::setw(32) - << 
node->shader_.kernel_name << std::setw(8) << node->tref_ - << std::setw(8) << node->packed_ << std::endl; - - prepack_node_idx++; - } - - std::cout << "====================" << std::left << std::setfill('=') - << std::setw(40) << " Execute Node List " << std::right - << std::setfill(' ') << std::endl; - - std::cout << std::setw(6) << "idx" << std::setw(32) << "shader_name" - << std::setw(24) << "READ_arg" << std::setw(24) << "WRITE_arg" - << std::endl; - - size_t node_idx = 0; - for (const std::unique_ptr& node : execute_nodes()) { - std::cout << std::setw(6) << node_idx; - std::cout << std::setw(32) << node->name(); - - std::stringstream read_s; - for (const ArgGroup& arg_group : node->args_) { - if (arg_group.access != vkapi::MemoryAccessType::READ) { - continue; - } - read_s << arg_group.refs; - } - std::cout << std::setw(24) << read_s.str(); - - std::stringstream write_s; - for (const ArgGroup& arg_group : node->args_) { - if (arg_group.access != vkapi::MemoryAccessType::WRITE) { - continue; - } - write_s << arg_group.refs; - } - std::cout << std::setw(24) << write_s.str(); - - std::cout << std::endl; - - node_idx++; - } -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/Logging.h b/backends/vulkan/runtime/graph/Logging.h deleted file mode 100644 index fb2f66e2d6f..00000000000 --- a/backends/vulkan/runtime/graph/Logging.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include -#include -#include - -namespace vkcompute { - -template -inline std::ostream& operator<<(std::ostream& os, const std::vector& vec) { - os << '['; - for (const auto& elem : vec) { - os << elem << ','; - } - os << ']'; - return os; // Return the ostream to allow chaining -} - -inline std::ostream& operator<<(std::ostream& os, const utils::uvec3& v) { - return utils::operator<<(os, v); -} - -inline std::ostream& operator<<(std::ostream& os, const utils::uvec4& v) { - return utils::operator<<(os, v); -} - -inline std::ostream& operator<<(std::ostream& os, const utils::ivec3& v) { - return utils::operator<<(os, v); -} - -inline std::ostream& operator<<(std::ostream& os, const utils::ivec4& v) { - return utils::operator<<(os, v); -} - -template -inline std::ostream& operator<<(std::ostream& os, const std::optional& opt) { - os << "["; - if (opt) { - os << opt.value(); - } - os << "]"; - return os; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/Constant.cpp b/backends/vulkan/runtime/graph/containers/Constant.cpp deleted file mode 100644 index 4dc2cdda8f5..00000000000 --- a/backends/vulkan/runtime/graph/containers/Constant.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -namespace vkcompute { - -TensorRef::TensorRef( - const std::vector& t_sizes, - vkapi::ScalarType t_dtype, - const void* const t_data) - : sizes{}, dtype{t_dtype}, data{t_data}, buffer{} { - size_t ndim = t_sizes.size(); - sizes.resize(ndim); - for (int i = 0; i < ndim; ++i) { - sizes[i] = t_sizes.at(i); - } -} - -TensorRef::TensorRef( - const std::vector& t_sizes, - vkapi::ScalarType t_dtype, - executorch::runtime::FreeableBuffer&& t_buffer) - : sizes{}, - dtype{t_dtype}, - data{t_buffer.data()}, - buffer{std::move(t_buffer)} { - size_t ndim = t_sizes.size(); - sizes.resize(ndim); - for (int i = 0; i < ndim; ++i) { - sizes[i] = t_sizes.at(i); - } -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/Constant.h b/backends/vulkan/runtime/graph/containers/Constant.h deleted file mode 100644 index a18c284a219..00000000000 --- a/backends/vulkan/runtime/graph/containers/Constant.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -namespace vkcompute { - -/* - * Represents a reference to a tensor that has been - * serialized with the model, such as a serialized weight - * tensor. It contains some metadata as well as a raw - * pointer to the data of the tensor, which is assumed to - * be contiguous. - */ -struct TensorRef final { - std::vector sizes; - vkapi::ScalarType dtype; - const void* data; - - // Optional FreeableBuffer for managing memory lifecycle - // This will be empty (default constructed) for the raw pointer constructor - executorch::runtime::FreeableBuffer buffer; - - explicit TensorRef( - const std::vector& t_sizes, - vkapi::ScalarType t_dtype, - const void* const t_data); - - // Constructor that takes ownership of a FreeableBuffer - explicit TensorRef( - const std::vector& t_sizes, - vkapi::ScalarType t_dtype, - executorch::runtime::FreeableBuffer&& t_buffer); - - inline size_t nbytes() const { - return utils::multiply_integers(sizes) * vkapi::element_size(dtype); - } - - // Manually free the buffer if needed (though it will be freed automatically - // on destruction) - void free_buffer() { - buffer.Free(); - } -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/PushConstantData.cpp b/backends/vulkan/runtime/graph/containers/PushConstantData.cpp deleted file mode 100644 index 7999118443b..00000000000 --- a/backends/vulkan/runtime/graph/containers/PushConstantData.cpp +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
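A small usage sketch for the `TensorRef` container above, showing the raw-pointer constructor and the `nbytes()` arithmetic (product of sizes times element size). The include path and the `vkapi::kFloat` dtype constant name are assumptions here.

```cpp
#include <executorch/backends/vulkan/runtime/graph/containers/Constant.h>  // assumed path
#include <cassert>
#include <cstdint>
#include <vector>

void tensor_ref_example() {
  // A 2 x 3 x 4 float tensor serialized with the model.
  std::vector<int64_t> sizes = {2, 3, 4};
  std::vector<float> weights(2 * 3 * 4, 0.5f);

  vkcompute::TensorRef tref(sizes, vkcompute::vkapi::kFloat, weights.data());

  // nbytes() = product of sizes * element size = 24 * 4 = 96 bytes.
  assert(tref.nbytes() == 96);
}
```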
- */ - -#include - -namespace vkcompute { - -uint32_t PushConstantDataInfo::write( - void* dst, - const uint32_t dst_offset, - const uint32_t max_dst_size) const { - if (tensorUniformData != nullptr) { - return tensorUniformData->write_attribute( - dst, dst_offset, max_dst_size, payload_.attr); - } - - VK_CHECK_COND( - (dst_offset + payload_.dataSize) <= max_dst_size, - "Attempting to write push constant data outside data boundary."); - memcpy((uint8_t*)dst + dst_offset, payload_.data, payload_.dataSize); - return payload_.dataSize; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/PushConstantData.h b/backends/vulkan/runtime/graph/containers/PushConstantData.h deleted file mode 100644 index c86232983ea..00000000000 --- a/backends/vulkan/runtime/graph/containers/PushConstantData.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -namespace vkcompute { - -class ComputeGraph; - -constexpr uint32_t kMaxPushConstantSize = 128; -/* - * Represents a push constant data entry - * Which is either shared pointer to a tensor's uniform data with an attribute - * Or data with a maximum size of 16 bytes - */ -class PushConstantDataInfo { - std::shared_ptr tensorUniformData; - union Payload { - struct { - api::vTensor::Attribute attr; - }; - struct { - uint8_t data[16]; - uint32_t dataSize; - }; - }; - - Payload payload_; - // The value in a compute graph that this push constant data is associated - // with, if any. - ValueRef value_ = kDummyValueRef; - - public: - explicit PushConstantDataInfo( - const std::shared_ptr& tensorUniformData, - api::vTensor::Attribute attr) - : tensorUniformData(tensorUniformData) { - payload_.attr = attr; - } - - explicit PushConstantDataInfo( - const void* data, - uint32_t dataLen, - uint32_t pushConstantLen = 0) - : tensorUniformData(nullptr) { - VK_CHECK_COND( - dataLen <= 16, "Single push constant data size must be <= 16 bytes"); - payload_.dataSize = pushConstantLen ? pushConstantLen : dataLen; - memcpy(payload_.data, data, dataLen); - } - - /* - * Function writes push constant data to the destination buffer - */ - uint32_t write( - void* dst, - const uint32_t dst_offset, - const uint32_t max_dst_size) const; - - inline bool is_tensor_metadata() const noexcept { - return tensorUniformData != nullptr; - } - - inline void set_value(ValueRef value) noexcept { - value_ = value; - } - - inline ValueRef value() const noexcept { - return value_; - } -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/SharedObject.cpp b/backends/vulkan/runtime/graph/containers/SharedObject.cpp deleted file mode 100644 index 10ddd6f2ca3..00000000000 --- a/backends/vulkan/runtime/graph/containers/SharedObject.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
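The sketch below shows how `PushConstantDataInfo` entries carrying raw data are packed back-to-back into a fixed 128-byte block, mirroring the loop that `DispatchNode::write_push_constant_data` runs further down in this diff. The include path is assumed.

```cpp
#include <executorch/backends/vulkan/runtime/graph/containers/PushConstantData.h>  // assumed path
#include <array>
#include <cstdint>
#include <vector>

void pack_push_constants() {
  using vkcompute::kMaxPushConstantSize;
  using vkcompute::PushConstantDataInfo;

  const int32_t some_flag = 1;
  const float scale = 0.125f;

  // Each entry may carry at most 16 bytes of raw data.
  std::vector<PushConstantDataInfo> push_constants = {
      PushConstantDataInfo(&some_flag, sizeof(some_flag)),
      PushConstantDataInfo(&scale, sizeof(scale)),
  };

  // Fixed-size block later supplied to the shader dispatch; 128 bytes at most.
  std::array<uint8_t, kMaxPushConstantSize> data{};
  uint32_t offset = 0;
  for (const auto& pc : push_constants) {
    offset += pc.write(data.data(), offset, kMaxPushConstantSize);
  }
  // offset is now 8: two 4-byte entries written back to back.
}
```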
- */ - -#include - -#include - -namespace vkcompute { - -bool SharedObject::has_user(const ValueRef idx) const { - return std::find(users.begin(), users.end(), idx) != users.end(); -} - -void SharedObject::add_user(ComputeGraph* const graph, const ValueRef idx) { - vTensorPtr t = graph->get_tensor(idx); - - // Aggregate Memory Requirements - const VkMemoryRequirements mem_reqs = t->get_memory_requirements(); - aggregate_memory_requirements.size = - std::max(mem_reqs.size, aggregate_memory_requirements.size); - aggregate_memory_requirements.alignment = - std::max(mem_reqs.alignment, aggregate_memory_requirements.alignment); - aggregate_memory_requirements.memoryTypeBits |= mem_reqs.memoryTypeBits; - - users.emplace_back(idx); -} - -void SharedObject::allocate(ComputeGraph* const graph) { - if (aggregate_memory_requirements.size == 0) { - return; - } - - VmaAllocationCreateInfo alloc_create_info = - graph->context()->adapter_ptr()->vma().gpuonly_resource_create_info(); - - allocation = graph->context()->adapter_ptr()->vma().create_allocation( - aggregate_memory_requirements, alloc_create_info); -} - -void SharedObject::bind_users(ComputeGraph* const graph) { - if (users.empty()) { - return; - } - for (const ValueRef idx : users) { - graph->get_tensor(idx)->bind_allocation(allocation); - } -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/SharedObject.h b/backends/vulkan/runtime/graph/containers/SharedObject.h deleted file mode 100644 index f9b16e6c202..00000000000 --- a/backends/vulkan/runtime/graph/containers/SharedObject.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include - -#include - -namespace vkcompute { - -class ComputeGraph; - -struct SharedObject { - friend class ComputeGraph; - - explicit SharedObject() = default; - - VkMemoryRequirements aggregate_memory_requirements; - std::vector users; - vkapi::Allocation allocation; - - bool has_user(const ValueRef idx) const; - void add_user(ComputeGraph* const graph, const ValueRef idx); - void allocate(ComputeGraph* const graph); - void bind_users(ComputeGraph* const graph); -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/SymInt.cpp b/backends/vulkan/runtime/graph/containers/SymInt.cpp deleted file mode 100644 index a59a2d40141..00000000000 --- a/backends/vulkan/runtime/graph/containers/SymInt.cpp +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
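To make the aggregation rule in `SharedObject::add_user` concrete, the standalone snippet below applies the same max/OR arithmetic to two hypothetical sets of memory requirements. It uses a plain struct rather than `VkMemoryRequirements` so it compiles without the Vulkan headers; it is an illustration, not part of the runtime.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

struct MemReqs {  // mirrors the VkMemoryRequirements fields used above
  uint64_t size;
  uint64_t alignment;
  uint32_t memoryTypeBits;
};

int main() {
  MemReqs aggregate{0, 0, 0};
  const MemReqs users[] = {
      {4096, 256, 0b0011},   // tensor A
      {10240, 1024, 0b0110}, // tensor B
  };
  // Same rule as add_user: max the size and alignment, OR the type bits.
  for (const MemReqs& r : users) {
    aggregate.size = std::max(aggregate.size, r.size);
    aggregate.alignment = std::max(aggregate.alignment, r.alignment);
    aggregate.memoryTypeBits |= r.memoryTypeBits;
  }
  // -> size 10240, alignment 1024, memoryTypeBits 0b0111
  std::cout << aggregate.size << " " << aggregate.alignment << " "
            << aggregate.memoryTypeBits << "\n";
  return 0;
}
```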
- */ - -#include - -namespace vkcompute { - -SymInt::SymInt(api::Context* context_p, const int32_t val) - : gpu_buffer(context_p, val){}; - -void SymInt::set(const int32_t val) { - gpu_buffer.update(val); -} - -int32_t SymInt::get() { - return gpu_buffer.read(); -} - -void SymInt::operator=(const int32_t val) { - gpu_buffer.update(val); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/SymInt.h b/backends/vulkan/runtime/graph/containers/SymInt.h deleted file mode 100644 index bd361aabe5a..00000000000 --- a/backends/vulkan/runtime/graph/containers/SymInt.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -namespace vkcompute { - -/* - * Represents a symbolic integer whose value can be variable. It is implemented - * as a thin wrapper around a `ParamsBuffer` object that holds the value of the - * integer. The `ParamsBuffer` object allows the value of the symbolic integer - * to be changed from the CPU and have those changes be visible to all shaders - * that use the symbolic integer; it also allows the value of the symbolic - * integer to be the result of a compute shader. - * - * Regular scalar types represented by `TypeTag::INT` cannot be used for - * symbolic integers because their value is assumed to be constant; therefore - * the `Value` instance holding the value of the scalar does not contain - * any reference to the GPU buffers used to pass its value into compute shaders. - * Therefore, updating the value of the scalar does not impact the value seen - * by compute shaders. - */ -struct SymInt final { - api::ParamsBuffer gpu_buffer; - - explicit SymInt(api::Context* context_p, const int32_t val); - - void set(const int32_t val); - - int32_t get(); - - void operator=(const int32_t val); -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/Types.cpp b/backends/vulkan/runtime/graph/containers/Types.cpp deleted file mode 100644 index e7a8951a552..00000000000 --- a/backends/vulkan/runtime/graph/containers/Types.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace vkcompute { - -#define PRINT_CASE(name) \ - case TypeTag::name: \ - out << #name; \ - break; - -std::ostream& operator<<(std::ostream& out, const TypeTag& tag) { - switch (tag) { - PRINT_CASE(NONE) - PRINT_CASE(INT) - PRINT_CASE(DOUBLE) - PRINT_CASE(BOOL) - PRINT_CASE(TENSOR) - PRINT_CASE(STAGING) - PRINT_CASE(TENSORREF) - PRINT_CASE(INTLIST) - PRINT_CASE(DOUBLELIST) - PRINT_CASE(BOOLLIST) - PRINT_CASE(VALUELIST) - PRINT_CASE(STRING) - PRINT_CASE(SYMINT) - } - return out; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/Types.h b/backends/vulkan/runtime/graph/containers/Types.h deleted file mode 100644 index 48232179e06..00000000000 --- a/backends/vulkan/runtime/graph/containers/Types.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * Copyright 2025 Arm Limited and/or its affiliates. 
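A short usage sketch for `SymInt` as declared above: because the value lives in a GPU `ParamsBuffer`, updating it from the CPU is visible to shaders that bind that buffer, which is exactly what a plain `TypeTag::INT` scalar cannot do. The include path is assumed.

```cpp
#include <executorch/backends/vulkan/runtime/graph/containers/SymInt.h>  // assumed path
#include <cstdint>

void symint_example(vkcompute::api::Context* context) {
  vkcompute::SymInt seq_len(context, /*val=*/128);

  // ... encode shaders that bind seq_len's ParamsBuffer ...

  // Later, e.g. when the input sequence length changes:
  seq_len.set(256);                  // update the GPU buffer from the CPU
  int32_t current = seq_len.get();   // read it back (256)
  (void)current;
}
```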
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -namespace vkcompute { - -/* - * This class is modelled after c10::IValue; however, it - * is simplified and does not support as many types. - * However, the core design is the same; it is a tagged - * union over the types supported by the Vulkan Graph - * type. - */ -enum class TypeTag : uint32_t { - NONE, - // Scalar types - INT, - DOUBLE, - BOOL, - // Tensor and tensor adjacent types - TENSOR, - STAGING, - TENSORREF, - // Scalar lists - INTLIST, - DOUBLELIST, - BOOLLIST, - // Special Type - VALUELIST, - STRING, - SYMINT, -}; - -std::ostream& operator<<(std::ostream& out, const TypeTag& tag); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/containers/Value.h b/backends/vulkan/runtime/graph/containers/Value.h deleted file mode 100644 index b73684307b2..00000000000 --- a/backends/vulkan/runtime/graph/containers/Value.h +++ /dev/null @@ -1,350 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include -#include -#include - -namespace vkcompute { - -using ValueRef = int32_t; - -constexpr ValueRef kDummyValueRef = -1; - -inline bool is_valid(ValueRef value_ref) { - return value_ref >= 0; -} - -struct IOValueRef { - ValueRef value; - ValueRef staging; - - // Custom cast to ValueRef - operator ValueRef() const { - return value; - }; -}; - -/* - * This class is modelled after c10::IValue; however, it is simplified and does - * not support as many types. However, the core design is the same; it is a - * tagged union over the types supported by the Vulkan Graph type. - */ -struct Value final { - private: - /* - * The union type which is used to store the value of the Value. - */ - union Payload { - /* - * Similar to IValue::Payload, trivially copyable types are nested in their - * own union. - */ - union TriviallyCopyablePayload { - TriviallyCopyablePayload() : as_int(0) {} - int64_t as_int; - double as_double; - bool as_bool; - } u; - - std::unique_ptr as_tensor; - std::unique_ptr as_staging; - TensorRef as_tensorref; - - std::vector as_int_list; - std::vector as_double_list; - std::vector as_bool_list; - - // The below is a special type that is used to represent a list of other - // values stored in the graph. One application of the type is to represent - // a list of tensors or a list of optional tensors. - std::vector as_value_list; - - std::string as_string; - - std::unique_ptr as_symint; - - Payload() : u() {} - // NOLINTNEXTLINE - ~Payload(){}; - }; - - public: - // - // Copy constructor and assignment (disabled) - // - - Value(const Value& rhs) = delete; - Value& operator=(const Value&) = delete; - - // - // Move constructor and assignment; Move assignment is disabled but - // construction is implemented to allow for use in container types. 
- // - - Value& operator=(Value&&) = delete; - -#define CASE_MOVE_TRIVIALLY_COPYABLE_TYPE(type_tag, member_name) \ - case type_tag: \ - payload.u.member_name = rhs.payload.u.member_name; \ - break; - -#define CASE_MOVE_MOVEABLE_TYPE(type_tag, type, member_name, dtor_name) \ - case type_tag: \ - new (&payload.member_name) type(std::move(rhs.payload.member_name)); \ - rhs.payload.member_name.~dtor_name(); \ - break; - -#define CASE_MOVE_UNIQUE_PTR_TYPE(type_tag, member_name) \ - case type_tag: \ - payload.member_name = std::move(rhs.payload.member_name); \ - break; - - Value(Value&& rhs) noexcept : tag(rhs.tag) { - switch (tag) { - // Scalar types - CASE_MOVE_TRIVIALLY_COPYABLE_TYPE(TypeTag::INT, as_int); - CASE_MOVE_TRIVIALLY_COPYABLE_TYPE(TypeTag::DOUBLE, as_double); - CASE_MOVE_TRIVIALLY_COPYABLE_TYPE(TypeTag::BOOL, as_bool); - // Tensor adjacent type - CASE_MOVE_MOVEABLE_TYPE( - TypeTag::TENSORREF, TensorRef, as_tensorref, TensorRef); - // Scalar lists - CASE_MOVE_MOVEABLE_TYPE( - TypeTag::INTLIST, std::vector, as_int_list, vector); - CASE_MOVE_MOVEABLE_TYPE( - TypeTag::DOUBLELIST, std::vector, as_double_list, vector); - CASE_MOVE_MOVEABLE_TYPE( - TypeTag::BOOLLIST, std::vector, as_bool_list, vector); - // Special types - CASE_MOVE_MOVEABLE_TYPE( - TypeTag::VALUELIST, std::vector, as_value_list, vector); - CASE_MOVE_MOVEABLE_TYPE( - TypeTag::STRING, std::string, as_string, basic_string); - // Tensor type - CASE_MOVE_UNIQUE_PTR_TYPE(TypeTag::TENSOR, as_tensor); - // Small tensor adjacent types - CASE_MOVE_UNIQUE_PTR_TYPE(TypeTag::STAGING, as_staging); - // Large tensor adjacent types - CASE_MOVE_UNIQUE_PTR_TYPE(TypeTag::SYMINT, as_symint); - - case TypeTag::NONE: - clearToNone(); - break; - } - rhs.clearToNone(); - } - -#undef CASE_MOVE_TRIVIALLY_COPYABLE_TYPE -#undef CASE_MOVE_MOVEABLE_TYPE -#undef CASE_MOVE_UNIQUE_PTR_TYPE - - // - // Accessors - // - - inline TypeTag type() const { - return tag; - } - - // - // Destructor - // - - ~Value() { - switch (tag) { - case TypeTag::TENSORREF: - payload.as_tensorref.~TensorRef(); - break; - case TypeTag::INTLIST: - payload.as_int_list.~vector(); - break; - case TypeTag::DOUBLELIST: - payload.as_double_list.~vector(); - break; - case TypeTag::BOOLLIST: - payload.as_bool_list.~vector(); - break; - case TypeTag::VALUELIST: - payload.as_value_list.~vector(); - break; - case TypeTag::STRING: - payload.as_string.~basic_string(); - break; - case TypeTag::STAGING: - payload.as_staging.reset(); - break; - case TypeTag::SYMINT: - payload.as_symint.reset(); - break; - case TypeTag::TENSOR: - payload.as_tensor.reset(); - break; - // Manually list out the types so that if a type here is added later and - // not handled the compiler can catch it. 
- case TypeTag::NONE: - case TypeTag::INT: - case TypeTag::DOUBLE: - case TypeTag::BOOL: - break; - } - } - - // - // Constructors, isType(), toType() - // - - Value() : tag(TypeTag::NONE) {} - - inline bool isNone() const { - return tag == TypeTag::NONE; - } - -#define SUPPORT_TRIVIALLY_COPYABLE_TYPE( \ - type, type_name, type_tag, member_name) \ - explicit Value(type t) : tag(type_tag) { \ - payload.u.member_name = t; \ - } \ - inline bool is##type_name() const { \ - return tag == type_tag; \ - } \ - inline const type& to##type_name() const { \ - VK_CHECK_COND( \ - is##type_name(), \ - "Expected value to have type " #type_name ", got ", \ - tag, \ - " instead."); \ - return payload.u.member_name; \ - } - - SUPPORT_TRIVIALLY_COPYABLE_TYPE(int64_t, Int, TypeTag::INT, as_int); - SUPPORT_TRIVIALLY_COPYABLE_TYPE(double, Double, TypeTag::DOUBLE, as_double); - SUPPORT_TRIVIALLY_COPYABLE_TYPE(bool, Bool, TypeTag::BOOL, as_bool); - -#undef SUPPORT_TRIVIALLY_COPYABLE_TYPE - -#define SUPPORT_TRIVIALLY_MOVEABLE_TYPE( \ - type, type_name, type_tag, member_name) \ - explicit Value(type&& t) : tag(type_tag) { \ - new (&payload.member_name) type(std::move(t)); \ - } \ - inline bool is##type_name() const { \ - return tag == type_tag; \ - } \ - inline type& to##type_name() { \ - VK_CHECK_COND( \ - is##type_name(), \ - "Expected value to have type " #type_name ", got ", \ - tag, \ - " instead."); \ - return payload.member_name; \ - } \ - inline const type& toConst##type_name() const { \ - VK_CHECK_COND( \ - is##type_name(), \ - "Expected value to have type " #type_name ", got ", \ - tag, \ - " instead."); \ - return payload.member_name; \ - } - - SUPPORT_TRIVIALLY_MOVEABLE_TYPE( - TensorRef, - TensorRef, - TypeTag::TENSORREF, - as_tensorref); - - SUPPORT_TRIVIALLY_MOVEABLE_TYPE( - std::vector, - IntList, - TypeTag::INTLIST, - as_int_list); - - SUPPORT_TRIVIALLY_MOVEABLE_TYPE( - std::vector, - DoubleList, - TypeTag::DOUBLELIST, - as_double_list); - - SUPPORT_TRIVIALLY_MOVEABLE_TYPE( - std::vector, - BoolList, - TypeTag::BOOLLIST, - as_bool_list); - - SUPPORT_TRIVIALLY_MOVEABLE_TYPE( - std::vector, - ValueList, - TypeTag::VALUELIST, - as_value_list); - - SUPPORT_TRIVIALLY_MOVEABLE_TYPE( - std::string, - String, - TypeTag::STRING, - as_string); - -#undef SUPPORT_TRIVIALLY_MOVEABLE_TYPE - -#define SUPPORT_UNIQUE_PTR_TYPE(type, type_name, type_tag, member_name) \ - explicit Value(type t) : tag(type_tag) { \ - payload.member_name = std::make_unique(std::move(t)); \ - } \ - inline bool is##type_name() const { \ - return tag == type_tag; \ - } \ - inline type& to##type_name() const { \ - VK_CHECK_COND( \ - is##type_name(), \ - "Expected value to have type " #type_name ", got ", \ - tag, \ - " instead."); \ - return *payload.member_name; \ - } \ - inline const type& toConst##type_name() const { \ - VK_CHECK_COND( \ - is##type_name(), \ - "Expected value to have type " #type_name ", got ", \ - tag, \ - " instead."); \ - return *payload.member_name; \ - } - - SUPPORT_UNIQUE_PTR_TYPE(api::vTensor, Tensor, TypeTag::TENSOR, as_tensor); - - SUPPORT_UNIQUE_PTR_TYPE( - api::StagingBuffer, - Staging, - TypeTag::STAGING, - as_staging); - - SUPPORT_UNIQUE_PTR_TYPE(SymInt, SymInt, TypeTag::SYMINT, as_symint); - -#undef SUPPORT_UNIQUE_PTR_TYPE - - private: - Payload payload; - TypeTag tag; - - // - // Utility Functions - // - - inline void clearToNone() noexcept { - payload.u.as_int = -1; - tag = TypeTag::NONE; - } -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/BlitNode.cpp 
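A usage sketch for the `Value` tagged union defined just above (before the BlitNode diff that follows). It exercises the scalar and list constructors generated by the macros, plus the move behaviour: after a move the source is reset to `NONE`. The include path is assumed.

```cpp
#include <executorch/backends/vulkan/runtime/graph/containers/Value.h>  // assumed path
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

void value_example() {
  using vkcompute::TypeTag;
  using vkcompute::Value;

  // Scalars live inline in the trivially copyable payload.
  Value scalar(static_cast<int64_t>(42));
  assert(scalar.isInt() && scalar.toInt() == 42);

  // Lists are move-constructed into the union.
  Value list(std::vector<int64_t>{1, 2, 3});
  assert(list.isIntList() && list.toIntList().size() == 3);

  // Value is move-only; the moved-from Value is cleared to NONE.
  Value moved(std::move(list));
  assert(moved.isIntList());
  assert(list.type() == TypeTag::NONE);
}
```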
b/backends/vulkan/runtime/graph/ops/BlitNode.cpp deleted file mode 100644 index de1ad596069..00000000000 --- a/backends/vulkan/runtime/graph/ops/BlitNode.cpp +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -namespace vkcompute { - -BlitNode::BlitNode( - ComputeGraph& graph, - ValueRef src, - ValueRef dst, - // const vkapi::ScalarType& dtype, - const ResizeFunction& resize_fn, - const std::vector& resize_args) - : ExecuteNode(resize_fn, resize_args, {}, "Blit Node"), - src_(src), - dst_(dst) { - (void)graph; -} - -void BlitNode::encode(ComputeGraph* graph) { - VK_CHECK_COND( - graph->storage_type_of(src_) != utils::kBuffer && - graph->storage_type_of(dst_) != utils::kBuffer, - "BlitNode: Only texture backed tensors are supported."); - - api::Context* const context = graph->context(); - vkapi::PipelineBarrier pipeline_barrier{}; - - std::unique_lock cmd_lock = context->dispatch_lock(); - - // Hack to get timing data for non shader op - std::string kernel_name("Blit_"); - kernel_name.reserve(32); - kernel_name += vkapi::to_string(graph->dtype_of(src_)); - kernel_name += "_to_"; - kernel_name += vkapi::to_string(graph->dtype_of(dst_)); - - context->report_shader_dispatch_start( - kernel_name, utils::uvec3(), utils::WorkgroupSize(), node_id_); - - context->register_blit( - pipeline_barrier, - graph->get_tensor(src_)->image( - pipeline_barrier, vkapi::PipelineStage::TRANSFER, vkapi::kRead), - graph->get_tensor(dst_)->image( - pipeline_barrier, vkapi::PipelineStage::TRANSFER, vkapi::kWrite)); - - context->report_shader_dispatch_end(); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/BlitNode.h b/backends/vulkan/runtime/graph/ops/BlitNode.h deleted file mode 100644 index 98d187b166a..00000000000 --- a/backends/vulkan/runtime/graph/ops/BlitNode.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -#include - -#include - -namespace vkcompute { - -/* - * Represents a tensor blit execution op in a ML model. - */ -class BlitNode final : public ExecuteNode { - friend class ComputeGraph; - - public: - explicit BlitNode( - ComputeGraph& graph, - ValueRef src, - ValueRef dst, - /*const vkapi::ScalarType& dtype,*/ - const ResizeFunction& resize_fn = nullptr, - const std::vector& resize_args = {}); - - ~BlitNode() override = default; - - void encode(ComputeGraph* graph) override; - - protected: - ValueRef src_; - ValueRef dst_; - // const vkapi::ScalarType &dtype_; -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.cpp b/backends/vulkan/runtime/graph/ops/DispatchNode.cpp deleted file mode 100644 index 898a3415b7e..00000000000 --- a/backends/vulkan/runtime/graph/ops/DispatchNode.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include - -#include - -namespace vkcompute { - -DispatchNode::DispatchNode( - ComputeGraph& graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const utils::uvec3& local_workgroup_size, - const std::vector& args, - const vkapi::ParamsBindList& params, - const std::vector& push_constants, - const vkapi::SpecVarList& spec_vars, - const std::vector& resize_args, - const ResizeFunction& resize_fn) - : ExecuteNode(resize_fn, resize_args, args, shader.kernel_name), - shader_(shader), - global_workgroup_size_(global_workgroup_size), - local_workgroup_size_(local_workgroup_size), - params_(params), - spec_vars_(spec_vars), - push_constants_(push_constants) { - graph.update_descriptor_counts(shader, /*execute = */ true); -} - -void DispatchNode::prepare_pipelines(ComputeGraph* graph) { - graph->register_pipeline_to_create( - shader_, local_workgroup_size_, spec_vars_, push_constants_); -} - -void DispatchNode::encode(ComputeGraph* graph) { - if (!shader_) { - return; - } - api::Context* const context = graph->context(); - vkapi::PipelineBarrier pipeline_barrier{}; - - context->check_device_capabilities(shader_); - - std::unique_lock cmd_lock = context->dispatch_lock(); - - write_push_constant_data(); - - context->report_shader_dispatch_start( - shader_.kernel_name, - global_workgroup_size_, - local_workgroup_size_, - node_id_); - - vkapi::DescriptorSet descriptor_set = context->get_descriptor_set( - shader_, local_workgroup_size_, spec_vars_, push_constants_offset_); - - uint32_t idx = 0; - idx = bind_values_to_descriptor_set( - graph, args_, pipeline_barrier, descriptor_set, idx); - - bind_params_to_descriptor_set(params_, descriptor_set, idx); - - context->register_shader_dispatch( - descriptor_set, - pipeline_barrier, - shader_, - global_workgroup_size_, - push_constants_data_.data(), - push_constants_offset_); - - context->report_shader_dispatch_end(); -} - -void DispatchNode::write_push_constant_data() { - push_constants_offset_ = 0; - for (const auto& push_constant : push_constants_) { - push_constants_offset_ += push_constant.write( - push_constants_data_.data(), - push_constants_offset_, - kMaxPushConstantSize); - } -} - -bool DispatchNode::trigger_resize(ComputeGraph* graph) { - const bool any_arg_updated = ExecuteNode::trigger_resize(graph); - - if (any_arg_updated) { - // If this shader uses push constants, and the tensor metadata associated - // with the push constants has changed, then the command buffer needs to be - // re-encoded since push constants cannot be updated. - for (const auto& push_constant : push_constants_) { - if (push_constant.is_tensor_metadata() && - graph->was_value_updated(push_constant.value())) { - graph->set_requires_reencode(); - } - } - } - return any_arg_updated; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.h b/backends/vulkan/runtime/graph/ops/DispatchNode.h deleted file mode 100644 index 89d24a77d6e..00000000000 --- a/backends/vulkan/runtime/graph/ops/DispatchNode.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include -#include - -#include - -namespace vkcompute { - -class ComputeGraph; - -/* - * Represents a single shader execution op in a ML model. 
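As an illustration of how the `DispatchNode` constructor above is typically fed, the sketch below assembles one for a hypothetical elementwise shader: the output tensor drives the global workgroup size, `create_local_wg_size` picks the local size, and the args list pairs each tensor with its access flag. The kernel name, the relevant includes, and the empty `params` list are assumptions, and the helper that appends the node to the graph's execute list is not shown in this diff.

```cpp
void make_unary_dispatch(
    vkcompute::ComputeGraph& graph,
    const vkcompute::ValueRef in,
    const vkcompute::ValueRef out) {
  using namespace vkcompute;

  // Pick a shader variant based on the output dtype (hypothetical kernel).
  std::string kernel_name = "unary_op";
  add_dtype_suffix(kernel_name, graph.dtype_of(out));

  // One invocation per output texel; 64-invocation local workgroup.
  const utils::uvec3 global_wg = graph.create_global_wg_size(out);
  const utils::uvec3 local_wg = graph.create_local_wg_size(global_wg);

  DispatchNode node(
      graph,
      VK_KERNEL_FROM_STR(kernel_name),
      global_wg,
      local_wg,
      // Shader arguments grouped by access: output is written, input is read.
      {{out, vkapi::kWrite}, {in, vkapi::kRead}},
      /*params=*/{});
  (void)node;
}
```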
- */ -class DispatchNode : public ExecuteNode { - friend class ComputeGraph; - - public: - explicit DispatchNode( - ComputeGraph& graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const utils::uvec3& local_workgroup_size, - const std::vector& args, - const vkapi::ParamsBindList& params, - const std::vector& push_constants = {}, - const vkapi::SpecVarList& spec_vars = {}, - const std::vector& resize_args = {}, - const ResizeFunction& resize_fn = nullptr); - - ~DispatchNode() override = default; - - void prepare_pipelines(ComputeGraph* graph) override; - - void encode(ComputeGraph* graph) override; - - bool trigger_resize(ComputeGraph* graph) override; - - protected: - vkapi::ShaderInfo shader_; - utils::uvec3 global_workgroup_size_; - utils::WorkgroupSize local_workgroup_size_; - const vkapi::ParamsBindList params_; - const vkapi::SpecVarList spec_vars_; - const std::vector push_constants_; - - // For push constants - std::array push_constants_data_{}; - uint32_t push_constants_offset_ = 0; - - void write_push_constant_data(); - - public: - operator bool() const { - return shader_; - } -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp b/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp deleted file mode 100644 index 5a88bba88c9..00000000000 --- a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -namespace vkcompute { - -DynamicDispatchNode::DynamicDispatchNode( - ComputeGraph& graph, - const PickShaderFn& pick_shader_fn, - const PickGlobalFn& pick_global_wg_fn, - const PickLocalFn& pick_local_wg_fn, - const std::vector& args, - const vkapi::ParamsBindList& params, - const std::vector& push_constants, - const vkapi::SpecVarList& spec_vars, - const std::vector& resize_args, - const ResizeFunction& resize_fn) - : DispatchNode( - graph, - pick_shader_fn(&graph, args, resize_args), - {1u, 1u, 1u}, - {8u, 8u, 1u}, - args, - params, - push_constants, - spec_vars, - resize_args, - resize_fn), - pick_shader_fn_(pick_shader_fn), - pick_global_wg_fn_(pick_global_wg_fn), - pick_local_wg_fn_(pick_local_wg_fn) { - global_workgroup_size_ = - pick_global_wg_fn(&graph, shader_, args, resize_args); - local_workgroup_size_ = utils::WorkgroupSize(pick_local_wg_fn( - &graph, shader_, global_workgroup_size_, args, resize_args)); - - // Calculate dispatch grid similar to Context.cpp register_shader_dispatch - wg_dispatch_grid_ = { - utils::div_up(global_workgroup_size_[0], local_workgroup_size_[0]), - utils::div_up(global_workgroup_size_[1], local_workgroup_size_[1]), - utils::div_up(global_workgroup_size_[2], local_workgroup_size_[2])}; -} - -DynamicDispatchNode::DynamicDispatchNode( - ComputeGraph& graph, - const vkapi::ShaderInfo& shader, - const PickGlobalFn& pick_global_wg_fn, - const PickLocalFn& pick_local_wg_fn, - const std::vector& args, - const vkapi::ParamsBindList& params, - const std::vector& push_constants, - const vkapi::SpecVarList& spec_vars, - const std::vector& resize_args, - const ResizeFunction& resize_fn) - : DispatchNode( - graph, - shader, - {1u, 1u, 1u}, - {8u, 8u, 1u}, - args, - params, - push_constants, - spec_vars, - resize_args, - resize_fn), - pick_shader_fn_{nullptr}, - 
pick_global_wg_fn_(pick_global_wg_fn), - pick_local_wg_fn_(pick_local_wg_fn) { - global_workgroup_size_ = - pick_global_wg_fn(&graph, shader_, args, resize_args); - local_workgroup_size_ = utils::WorkgroupSize(pick_local_wg_fn( - &graph, shader_, global_workgroup_size_, args, resize_args)); - // Calculate the work group grid that will be dispatched - wg_dispatch_grid_ = { - utils::div_up(global_workgroup_size_[0], local_workgroup_size_[0]), - utils::div_up(global_workgroup_size_[1], local_workgroup_size_[1]), - utils::div_up(global_workgroup_size_[2], local_workgroup_size_[2])}; -} - -bool DynamicDispatchNode::trigger_resize(ComputeGraph* graph) { - // DispatchNode::trigger_resize() will return true if any of the values - // participating in this operation were updated. - const bool any_arg_updated = DispatchNode::trigger_resize(graph); - // Only re-compute the shader, global workgroup size, and local workgroup size - // if any of the values participating in this operation were updated. - // Otherwise, assume that these will not have changed. - if (!any_arg_updated) { - return false; - } - - // Indicates if the shader dispatch should be changed since the last time the - // command buffer was encoded. - bool dispatch_params_changed = false; - - if (pick_shader_fn_) { - vkapi::ShaderInfo new_shader = pick_shader_fn_(graph, args_, resize_args_); - // Compare shader kernel names as a proxy for shader equality - if (shader_.kernel_name != new_shader.kernel_name) { - shader_ = new_shader; - dispatch_params_changed = true; - } - } - if (pick_global_wg_fn_) { - // Note that if global workgroup size changes, then the dispatch params - // may not actually be different. The actual value to check is the - // work group grid size that will be dispatched, which is calculated - // below. - global_workgroup_size_ = - pick_global_wg_fn_(graph, shader_, args_, resize_args_); - } - if (pick_local_wg_fn_) { - utils::uvec3 new_local_wg_uvec3 = pick_local_wg_fn_( - graph, shader_, global_workgroup_size_, args_, resize_args_); - utils::WorkgroupSize new_local_wg = - utils::WorkgroupSize(new_local_wg_uvec3); - if (local_workgroup_size_ != new_local_wg) { - local_workgroup_size_ = new_local_wg; - dispatch_params_changed = true; - } - } - - // Always recompute the new dispatch grid and check if it's different - utils::uvec3 new_wg_dispatch_grid = { - utils::div_up(global_workgroup_size_[0], local_workgroup_size_[0]), - utils::div_up(global_workgroup_size_[1], local_workgroup_size_[1]), - utils::div_up(global_workgroup_size_[2], local_workgroup_size_[2])}; - - // Check if the new dispatch grid is different from the old one - if (wg_dispatch_grid_ != new_wg_dispatch_grid) { - dispatch_params_changed = true; - } - wg_dispatch_grid_ = new_wg_dispatch_grid; - - // If any of the dispatch params have changed, then the command buffer must - // be re-encoded. - if (dispatch_params_changed) { - graph->set_requires_reencode(); - } - - return true; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h b/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h deleted file mode 100644 index d3b82968eb2..00000000000 --- a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
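The `pick_*` callbacks accepted by the constructors above are usually small free functions or lambdas supplied by the op implementation. The pair below is a sketch with parameter types reconstructed from the call sites (the typedefs lost their template arguments in this diff), reusing the `create_global_wg_size`/`create_local_wg_size` helpers declared on `ComputeGraph`. Treating `args[0]` as the written output is an assumption about argument ordering, and the necessary headers are assumed to be included.

```cpp
namespace vkcompute {

utils::uvec3 pick_global_wg(
    ComputeGraph* graph,
    const vkapi::ShaderInfo& shader,
    const std::vector<ArgGroup>& args,
    const std::vector<ValueRef>& resize_args) {
  (void)shader;
  (void)resize_args;
  // One invocation per texel of the output tensor (the WRITE arg).
  return graph->create_global_wg_size(args.at(0).refs.at(0));
}

utils::uvec3 pick_local_wg(
    ComputeGraph* graph,
    const vkapi::ShaderInfo& shader,
    const utils::uvec3& global_wg,
    const std::vector<ArgGroup>& args,
    const std::vector<ValueRef>& resize_args) {
  (void)shader;
  (void)args;
  (void)resize_args;
  // Derive a 64-invocation local size from the global size.
  return graph->create_local_wg_size(global_wg);
}

} // namespace vkcompute
```

These would be passed to the first constructor alongside a `PickShaderFn`, or to the second constructor when the shader is fixed and only the dispatch sizes need to be re-derived after a resize.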
- */ - -#pragma once - -#include - -#include -#include - -#include - -namespace vkcompute { - -class ComputeGraph; - -/* - * Represents a single shader execution op in a ML model. - */ -class DynamicDispatchNode final : public DispatchNode { - friend class ComputeGraph; - - public: - using PickShaderFn = const std::function&, - const std::vector&)>; - using PickGlobalFn = const std::function&, - const std::vector&)>; - using PickLocalFn = const std::function&, - const std::vector&)>; - - explicit DynamicDispatchNode( - ComputeGraph& graph, - const PickShaderFn& pick_shader_fn, - const PickGlobalFn& pick_global_wg_fn, - const PickLocalFn& pick_local_wg_fn, - const std::vector& args, - const vkapi::ParamsBindList& params, - const std::vector& push_constants, - const vkapi::SpecVarList& spec_vars, - const std::vector& resize_args, - const ResizeFunction& resize_fn = nullptr); - - explicit DynamicDispatchNode( - ComputeGraph& graph, - const vkapi::ShaderInfo& shader, - const PickGlobalFn& pick_global_wg_fn, - const PickLocalFn& pick_local_wg_fn, - const std::vector& args, - const vkapi::ParamsBindList& params, - const std::vector& push_constants, - const vkapi::SpecVarList& spec_vars, - const std::vector& resize_args, - const ResizeFunction& resize_fn = nullptr); - - ~DynamicDispatchNode() override = default; - - bool trigger_resize(ComputeGraph* graph) override; - - protected: - const PickShaderFn pick_shader_fn_; - const PickGlobalFn pick_global_wg_fn_; - const PickLocalFn pick_local_wg_fn_; - - utils::uvec3 wg_dispatch_grid_{1u, 1u, 1u}; - - public: - operator bool() const { - return shader_; - } -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp b/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp deleted file mode 100644 index 953f15e7b4d..00000000000 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.cpp +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include - -namespace vkcompute { -ExecuteNode::ExecuteNode( - const ResizeFunction& resize_fn, - const std::vector& resize_args, - const std::vector& args, - const std::string& name) - : resize_fn_(resize_fn), - resize_args_(resize_args), - args_(args), - name_(name) {} - -bool ExecuteNode::trigger_resize(ComputeGraph* graph) { - const bool any_arg_updated = was_any_arg_updated(graph); - if (resize_fn_ && any_arg_updated) { - resize_fn_(graph, args_, resize_args_); - } - return any_arg_updated; -} - -bool ExecuteNode::was_any_arg_updated(const ComputeGraph* const graph) const { - // Check all ValueRefs in ArgGroups - for (const auto& arg_group : args_) { - for (const auto& value_ref : arg_group.refs) { - if (graph->was_value_updated(value_ref)) { - return true; - } - } - } - - // Check all ValueRefs in resize_args - for (const auto& value_ref : resize_args_) { - if (graph->was_value_updated(value_ref)) { - return true; - } - } - - return false; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h deleted file mode 100644 index 323036cef90..00000000000 --- a/backends/vulkan/runtime/graph/ops/ExecuteNode.h +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -namespace vkcompute { - -class ComputeGraph; - -/* - * Represents a group of shader arguments (images and/or buffers), with a common - * access permission. - */ -struct ArgGroup { - ArgGroup(const ValueRef ref, const vkapi::MemoryAccessFlags access) - : refs{ref}, access(access) {} - - ArgGroup( - const std::vector& refs, - const vkapi::MemoryAccessFlags access) - : refs(refs), access(access) {} - - const std::vector refs; - const vkapi::MemoryAccessFlags access; -}; - -/* - * Represents a single execution op in a ML model. In graph mode, ops will be - * implemented in a derived class that implements encode, which will implement - * encoding of the shader corresponding to the op into the command buffer of a - * ComputeGraph. - */ -class ExecuteNode { - friend class ComputeGraph; - - public: - using ResizeFunction = std::function&, - const std::vector&)>; - - /* - * This overload of the DispatchNode constructor is used to register ops which - * update a tensor view. No shader is dispatched, but the node still needs to - * update the view's sizes and strides after a resize. - */ - explicit ExecuteNode( - const ResizeFunction& resize_fn = nullptr, - const std::vector& resize_args = {}, - const std::vector& args = {}, - const std::string& name = "Graph Node"); - - virtual ~ExecuteNode() = default; - - virtual void prepare_pipelines(ComputeGraph* graph) { - (void)graph; - } - - virtual void encode(ComputeGraph* graph) { - (void)graph; - } - - virtual bool trigger_resize(ComputeGraph* graph); - - bool was_any_arg_updated(const ComputeGraph* const graph) const; - - inline void set_node_id(uint32_t node_id) { - node_id_ = node_id; - } - - inline const std::string& name() const { - return name_; - } - - protected: - uint32_t node_id_; - const ResizeFunction resize_fn_; - const std::vector resize_args_; - const std::vector args_; - const std::string name_; -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/OperatorRegistry.cpp b/backends/vulkan/runtime/graph/ops/OperatorRegistry.cpp deleted file mode 100644 index 4d1f749830c..00000000000 --- a/backends/vulkan/runtime/graph/ops/OperatorRegistry.cpp +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace vkcompute { - -bool OperatorRegistry::has_op(const std::string& name) { - return table_.count(name) > 0; -} - -OperatorRegistry::OpFunction& OperatorRegistry::get_op_fn( - const std::string& name) { - const auto it = table_.find(name); - VK_CHECK_COND(it != table_.end(), "Could not find operator with name ", name); - return it->second; -} - -void OperatorRegistry::register_op(const std::string& name, OpFunction& fn) { - table_.insert(std::make_pair(name, fn)); -} - -OperatorRegistry& operator_registry() { - static OperatorRegistry registry; - return registry; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/OperatorRegistry.h b/backends/vulkan/runtime/graph/ops/OperatorRegistry.h deleted file mode 100644 index 9d41d48afb9..00000000000 --- a/backends/vulkan/runtime/graph/ops/OperatorRegistry.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. 
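A sketch of the kind of `ResizeFunction` an op might register through the `ExecuteNode` constructor above, so that `trigger_resize` can propagate new input shapes to the output. Parameter types are reconstructed from the call site `resize_fn_(graph, args_, resize_args_)`, the ordering (WRITE arg first, READ arg second) is an assumption, and the necessary headers are assumed to be included.

```cpp
namespace vkcompute {

void resize_unary_op_out(
    ComputeGraph* graph,
    const std::vector<ArgGroup>& args,
    const std::vector<ValueRef>& resize_args) {
  (void)resize_args;
  const ValueRef out = args.at(0).refs.at(0);  // WRITE arg (assumed ordering)
  const ValueRef in = args.at(1).refs.at(0);   // READ arg
  // Make the output tensor's view match the (possibly resized) input sizes.
  graph->virtual_resize(out, graph->sizes_of(in));
}

} // namespace vkcompute
```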
- * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -#include -#include - -#define VK_HAS_OP(name) ::vkcompute::operator_registry().has_op(name) - -#define VK_GET_OP_FN(name) ::vkcompute::operator_registry().get_op_fn(name) - -#define VK_REGISTER_OP(name, function) \ - ::vkcompute::operator_registry().register_op( \ - #name, \ - std::bind(&function, std::placeholders::_1, std::placeholders::_2)) - -#define REGISTER_OPERATORS \ - static void register_ops(); \ - static const OperatorRegisterInit reg(®ister_ops); \ - static void register_ops() - -namespace vkcompute { - -/* - * The Vulkan operator registry maps ATen operator names - * to their Vulkan delegate function implementation. It is - * a simplified version of - * executorch/runtime/kernel/operator_registry.h that uses - * the C++ Standard Library. - */ -class OperatorRegistry final { - using OpFunction = - const std::function&)>; - using OpTable = std::unordered_map; - - OpTable table_; - - public: - /* - * Check if the registry has an operator registered under the given name - */ - bool has_op(const std::string& name); - - /* - * Given an operator name, return the Vulkan delegate function - */ - OpFunction& get_op_fn(const std::string& name); - - /* - * Register a function to a given operator name - */ - void register_op(const std::string& name, OpFunction& fn); -}; - -class OperatorRegisterInit final { - using InitFn = void(); - - public: - explicit OperatorRegisterInit(InitFn* init_fn) { - init_fn(); - } -}; - -// The Vulkan operator registry is global. It is retrieved using this function, -// where it is declared as a static local variable. -OperatorRegistry& operator_registry(); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp deleted file mode 100644 index 62e1dc86f43..00000000000 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
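The macros above are how op implementation files register themselves with the global registry via static initialization. The sketch below shows the typical pattern; the operator name is hypothetical, and the `OpFunction` signature used here (a `ComputeGraph&` plus the op's argument `ValueRef`s) is inferred from the two `std::bind` placeholders rather than spelled out in this diff.

```cpp
namespace vkcompute {

// Builds the DispatchNode(s) implementing the op and appends them to the graph.
void add_my_custom_op_node(
    ComputeGraph& graph,
    const std::vector<ValueRef>& args) {
  (void)graph;
  (void)args;
  // ... create tensors, pick shaders, construct DispatchNodes ...
}

REGISTER_OPERATORS {
  // VK_REGISTER_OP stringifies its first argument, so it is passed unquoted.
  VK_REGISTER_OP(aten.my_custom_op.default, add_my_custom_op_node);
}

} // namespace vkcompute

// At graph build time, the delegate can then look the op up by name:
//   if (VK_HAS_OP("aten.my_custom_op.default")) {
//     VK_GET_OP_FN("aten.my_custom_op.default")(graph, args);
//   }
```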
- */ - -#include - -#include - -#include -#include -#include - -namespace vkcompute { - -vkapi::ShaderInfo get_noop_shader(ComputeGraph& graph, const ValueRef packed) { - std::string noop_shader_name("no_op"); - add_dtype_suffix(noop_shader_name, graph.dtype_of(packed)); - add_storage_type_suffix(noop_shader_name, graph.storage_type_of(packed)); - return VK_KERNEL_FROM_STR(noop_shader_name); -} - -PrepackNode::PrepackNode( - ComputeGraph& graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const utils::uvec3& local_workgroup_size, - const ValueRef tref, - const ValueRef packed, - const vkapi::ParamsBindList& params, - const vkapi::SpecVarList& spec_vars, - const std::vector& push_constants) - : shader_(shader), - noop_shader_(get_noop_shader(graph, packed)), - global_workgroup_size_(global_workgroup_size), - local_workgroup_size_(local_workgroup_size), - tref_(tref), - packed_(packed), - params_(params), - spec_vars_(spec_vars), - push_constants_(push_constants) { - graph.update_descriptor_counts(shader, /*execute = */ false); - graph.update_descriptor_counts(noop_shader_, /*execute = */ false); -} - -api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { - // If no TensorRef is provided, create a staging buffer of zeros based on the - // Tensor metadata. - if (graph->val_is_none(tref_)) { - const std::vector packed_sizes = graph->sizes_of(packed_); - size_t numel = utils::multiply_integers(packed_sizes); - api::StagingBuffer staging( - graph->context(), graph->dtype_of(packed_), numel); - staging.set_staging_zeros(); - return staging; - } - - TensorRefPtr tref = graph->get_tref(tref_); - size_t numel = utils::multiply_integers(tref->sizes); - api::StagingBuffer staging(graph->context(), tref->dtype, numel); - graph->update_staging_nbytes_in_cmd(staging.buffer().mem_size_as_size_t()); - size_t nbytes = numel * vkapi::element_size(tref->dtype); - staging.copy_from(tref->data, nbytes); - // Once the staging buffer is copied, if the TensorRef owns a FreeableBuffer, - // it can be freed. - tref->free_buffer(); - return staging; -} - -void PrepackNode::prepare_pipelines(ComputeGraph* graph) { - graph->register_pipeline_to_create( - shader_, local_workgroup_size_, spec_vars_, push_constants_); - graph->register_pipeline_to_create( - noop_shader_, utils::WorkgroupSize(1, 1, 1), {}, {}); -} - -void PrepackNode::encode(ComputeGraph* graph) { - api::Context* const context = graph->context(); - - context->check_device_capabilities(shader_); - - api::StagingBuffer staging = create_staging_buffer(graph); - - std::unique_lock cmd_lock = context->dispatch_lock(); - - std::array push_constants_data; - uint32_t push_constants_offset = 0; - - for (const auto& push_constant : push_constants_) { - push_constants_offset += push_constant.write( - push_constants_data.data(), - push_constants_offset, - kMaxPushConstantSize); - } - - { - // If the vTensor is not yet bound to a memory allocation, create a new one - // and aquire it. 
- graph->create_dedicated_allocation_for(packed_); - - vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::DescriptorSet descriptor_set = context->get_descriptor_set( - shader_, local_workgroup_size_, spec_vars_, push_constants_offset); - - uint32_t idx = 0; - graph->bind_tensor_to_descriptor_set( - packed_, - pipeline_barrier, - vkapi::MemoryAccessType::WRITE, - descriptor_set, - idx++); - bind_staging_to_descriptor_set(staging, descriptor_set, idx++); - bind_params_to_descriptor_set(params_, descriptor_set, idx); - - context->register_shader_dispatch( - descriptor_set, - pipeline_barrier, - shader_, - global_workgroup_size_, - push_constants_data.data(), - push_constants_offset); - } - - // Submit a compute shader that performs a no-op with the packed tensor in - // order to trigger an image layout transition from GENERAL to - // READ_ONLY_OPTIMAL. This ensures that future uses of the tensor will be - // bound with the correct image layout. - { - vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::DescriptorSet descriptor_set = context->get_descriptor_set( - noop_shader_, utils::WorkgroupSize(1, 1, 1)); - - graph->bind_tensor_to_descriptor_set( - packed_, - pipeline_barrier, - vkapi::MemoryAccessType::READ, - descriptor_set, - 0); - - context->register_shader_dispatch( - descriptor_set, pipeline_barrier, noop_shader_, {1, 1, 1}); - } -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.h b/backends/vulkan/runtime/graph/ops/PrepackNode.h deleted file mode 100644 index 8ce8ac9f773..00000000000 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include -#include - -namespace vkcompute { - -class ComputeGraph; - -/* - * Represents a single prepacking op in a ML model. In graph mode, ops will be - * implemented in a derived class that implements encode, which will implement - * encoding of shaders transferring necessary data (such as weights and biases) - * to the GPU. 
- */ -class PrepackNode final { - friend class ComputeGraph; - - public: - PrepackNode( - ComputeGraph& graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const utils::uvec3& local_workgroup_size, - const ValueRef tref, - const ValueRef packed, - const vkapi::ParamsBindList& params, - const vkapi::SpecVarList& spec_vars = {}, - const std::vector& push_constants = {}); - - ~PrepackNode() = default; - - void prepare_pipelines(ComputeGraph* graph); - - void encode(ComputeGraph* graph); - - inline void set_node_id(uint32_t node_id) { - node_id_ = node_id; - } - - protected: - uint32_t node_id_; - const vkapi::ShaderInfo shader_; - vkapi::ShaderInfo noop_shader_; - const utils::uvec3 global_workgroup_size_; - const utils::WorkgroupSize local_workgroup_size_; - const ValueRef tref_; - const ValueRef packed_; - const vkapi::ParamsBindList params_; - const vkapi::SpecVarList spec_vars_; - const std::vector push_constants_; - - private: - api::StagingBuffer create_staging_buffer(ComputeGraph* graph); -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/glsl/activations.h b/backends/vulkan/runtime/graph/ops/glsl/activations.h deleted file mode 100644 index 2ba0ccc467d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/activations.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -float hardswish(float x) { - if (x <= -3) { - return 0; - } else if (x >= 3) { - return x; - } else { - return x * (x + 3) / 6; - } -} - -vec4 hardswish(vec4 tex) { - return vec4( - hardswish(tex.x), hardswish(tex.y), hardswish(tex.z), hardswish(tex.w)); -} - -float hardshrink(float x, float lambda, float neg_lambda) { - return x * (float(x > lambda) + float(x < neg_lambda)); -} - -vec4 hardshrink(vec4 tex, float lambda, float neg_lambda) { - return tex * - (vec4(greaterThan(tex, vec4(lambda))) + - vec4(lessThan(tex, vec4(neg_lambda)))); -} - -float hardsigmoid(float x) { - return mix(float(x >= 0.0), x / 6 + 0.5, float(abs(x) <= 3.0)); -} - -vec4 hardsigmoid(vec4 tex) { - return vec4( - hardsigmoid(tex.x), - hardsigmoid(tex.y), - hardsigmoid(tex.z), - hardsigmoid(tex.w)); -} - -float leaky_relu(float x, float negative_slope) { - return x * (float(x > 0.0) + negative_slope * float(x <= 0.0)); -} - -vec4 leaky_relu(vec4 tex, float negative_slope) { - return vec4( - leaky_relu(tex.x, negative_slope), - leaky_relu(tex.y, negative_slope), - leaky_relu(tex.z, negative_slope), - leaky_relu(tex.w, negative_slope)); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_buffer.glsl deleted file mode 100644 index 1f3061ea100..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_buffer.glsl +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
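Reviewer note: the GLSL helpers in `activations.h` above apply scalar activation math per texel component. A CPU-side C++ check of the same formulas, useful when validating shader output; this is an illustrative reference, not part of the deleted sources.

```cpp
// CPU-side reference for the scalar formulas in activations.h.
#include <algorithm>
#include <cassert>

float hardswish(float x) {
  if (x <= -3.f) return 0.f;
  if (x >= 3.f) return x;
  return x * (x + 3.f) / 6.f;
}

float hardsigmoid(float x) {
  // Equivalent to the GLSL mix()-based form: clamp(x / 6 + 0.5, 0, 1).
  return std::min(1.f, std::max(0.f, x / 6.f + 0.5f));
}

float leaky_relu(float x, float negative_slope) {
  return x > 0.f ? x : negative_slope * x;
}

int main() {
  assert(hardswish(4.f) == 4.f);
  assert(hardswish(-4.f) == 0.f);
  assert(hardsigmoid(0.f) == 0.5f);
  assert(leaky_relu(-2.f, 0.1f) == 0.1f * -2.f);
}
```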
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if HAS_BIAS: - #define HAS_BIAS - -#define T ${buffer_scalar_type(DTYPE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_mat1", DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_mat2", DTYPE, "buffer")} -$if HAS_BIAS: - ${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer")} -${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec4", "out_strides")} -${layout_declare_ubo(B, "ivec4", "mat1_sizes")} -${layout_declare_ubo(B, "ivec4", "mat1_strides")} -${layout_declare_ubo(B, "ivec4", "mat2_sizes")} -${layout_declare_ubo(B, "ivec4", "mat2_strides")} -${layout_declare_ubo(B, "int", "out_numel")} -$if HAS_BIAS: - ${layout_declare_ubo(B, "float", "alpha", "float", "beta")} - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "mat2_is_transposed", "0")} - -void main() { - const ivec4 out_tidx = ivec4( - gl_GlobalInvocationID.x, - gl_GlobalInvocationID.y, - gl_GlobalInvocationID.z % out_sizes.z, - gl_GlobalInvocationID.z / out_sizes.z); - - if (any(greaterThanEqual(out_tidx, out_sizes))) { - return; - } - - int mat1_bufi = tidx_to_bufi( - ivec4(0, out_tidx.y, out_tidx.z, out_tidx.w), mat1_strides); - int mat2_bufi; - if (mat2_is_transposed > 0) { - mat2_bufi = tidx_to_bufi( - ivec4(0, out_tidx.x, 0, 0), mat2_strides); - } else { - mat2_bufi = tidx_to_bufi( - ivec4(out_tidx.x, 0, out_tidx.z, out_tidx.w), mat2_strides); - } - - int mat2_stride; - if (mat2_is_transposed > 0) { - mat2_stride = mat2_strides.x; - } else { - mat2_stride = mat2_strides.y; - } - - T sum = T(0.0); - for (int i = 0; i < mat1_sizes.x; ++i) { - sum += t_mat1[mat1_bufi] * t_mat2[mat2_bufi]; - - mat1_bufi += mat1_strides.x; - mat2_bufi += mat2_stride; - } - - const int out_bufi = tidx_to_bufi(out_tidx, out_strides); -#ifdef HAS_BIAS - t_out[out_bufi] = T(alpha) * T(sum) + T(beta) * t_bias[out_tidx.x]; -#else - t_out[out_bufi] = T(sum); -#endif // HAS_BIAS -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_buffer.yaml deleted file mode 100644 index b093d0c80b2..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_buffer.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -addmm_naive_buffer: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - HAS_BIAS: false - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: matmul_naive_buffer - - NAME: addmm_naive_buffer - HAS_BIAS: true diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl deleted file mode 100644 index a4ed494fe6d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.glsl +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
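Reviewer note: for the naive addmm shaders above, the per-element math is `out = beta * bias + alpha * (mat1 @ mat2)`. A minimal host-side reference of that computation (2-D, row-major, no batching and no transposed mat2) is sketched below; it is illustrative only and intended for checking shader results on small inputs.

```cpp
// Illustrative CPU reference for the per-element math of the naive addmm shaders:
// out[m][n] = beta * bias[n] + alpha * sum_k mat1[m][k] * mat2[k][n].
#include <vector>

std::vector<float> addmm_reference(
    const std::vector<float>& bias,  // size N
    const std::vector<float>& mat1,  // M x K, row-major
    const std::vector<float>& mat2,  // K x N, row-major
    int M, int K, int N,
    float alpha, float beta) {
  std::vector<float> out(static_cast<size_t>(M) * N, 0.f);
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float sum = 0.f;
      for (int k = 0; k < K; ++k) {
        sum += mat1[m * K + k] * mat2[k * N + n];
      }
      out[m * N + n] = beta * bias[n] + alpha * sum;
    }
  }
  return out;
}
```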
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if MAT2_IS_TRANSPOSED: - #define MAT2_IS_TRANSPOSED - -$if HAS_BIAS: - #define HAS_BIAS - -${layout_declare_tensor(B, "w", "out_tensor", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "mat1_tensor", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "mat2_tensor", DTYPE, "texture3d")} -$if HAS_BIAS: - ${layout_declare_tensor(B, "r", "bias_tensor", DTYPE, "texture3d")} -${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec3", "out_limits")} -${layout_declare_ubo(B, "ivec4", "mat1_sizes")} -${layout_declare_ubo(B, "ivec4", "mat2_sizes")} -$if HAS_BIAS: - ${layout_declare_ubo(B, "ivec4", "bias_sizes")} - ${layout_declare_ubo(B, "float", "alpha", "float", "beta")} - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int out_packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "mat1_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 mat1_axis_map = unhash_axis_map(mat1_layout); -const lowp int mat1_packed_dim = unhash_packed_dim(mat1_layout); - -${layout_declare_spec_const(C, "int", "mat2_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 mat2_axis_map = unhash_axis_map(mat2_layout); -const lowp int mat2_packed_dim = unhash_packed_dim(mat2_layout); - -$if HAS_BIAS: - ${layout_declare_spec_const(C, "int", "bias_layout", "DEFAULT_LAYOUT")} - const lowp ivec4 bias_axis_map = unhash_axis_map(bias_layout); - const lowp int bias_packed_dim = unhash_packed_dim(bias_layout); - -#ifdef HAS_BIAS -vec4 get_bias_texel_W_packed(ivec3 logical_pos) { - ivec3 bias_pos = ivec3(0); - if (bias_sizes.y == 1) { - bias_pos[bias_axis_map.y] = 0; - } else { - bias_pos[bias_axis_map.y] = logical_pos.y; - } - if (bias_sizes.x == 1) { - bias_pos[bias_axis_map.x] = 0; - vec4 bias_texel = texelFetch(bias_tensor, bias_pos, 0); - // Only the first value is valid, the rest is 0 padding - return vec4(bias_texel.x); - } else { - bias_pos[bias_axis_map.x] = logical_pos.x; - } - - return texelFetch(bias_tensor, bias_pos, 0); -} -#endif // HAS_BIAS - -vec4 matmul_naive_k_dim_packed(const ivec3 out_lpos) { - ivec3 mat1_pos; - mat1_pos[mat1_axis_map.x] = 0; - mat1_pos[mat1_axis_map.y] = out_lpos.y; - mat1_pos[mat1_axis_map.z] = out_lpos.z; -#ifdef MAT2_IS_TRANSPOSED - const int mat2_k_axis = mat2_axis_map.x; - const int mat2_row_axis = mat2_axis_map.y; -#else - const int mat2_k_axis = mat2_axis_map.y; - const int mat2_row_axis = mat2_axis_map.x; -#endif // MAT2_IS_TRANSPOSED - - vec4 texel = vec4(0); - const int K = divup4(mat1_sizes.x); - - for (int i = 0; i < K; ++i) { - const vec4 mat1_tex = texelFetch(mat1_tensor, mat1_pos, 0); - - vec4 sums; - for (int r = 0; r < 4; ++r) { - // On-demand construction of mat2_pos appears to provide the lowest - // latency. Surprisingly, this doesn't translate to mat1_pos. 
- ivec3 mat2_pos = ivec3(0); - mat2_pos[mat2_k_axis] = i; - mat2_pos[mat2_row_axis] = out_lpos.x * 4 + r; -#ifndef MAT2_IS_TRANSPOSED - mat2_pos[mat2_axis_map.z] = out_lpos.z; -#endif // MAT2_IS_TRANSPOSED - sums[r] = dot(mat1_tex, texelFetch(mat2_tensor, mat2_pos, 0)); - } - - texel += sums; - - mat1_pos[mat1_axis_map.x]++; - } - - return texel; -} - -vec4 matmul_naive_k_dim_packed_row_dim_packed(const ivec3 out_lpos) { - ivec3 mat1_pos; - mat1_pos[mat1_axis_map.x] = 0; - mat1_pos[mat1_axis_map.y] = out_lpos.y; - mat1_pos[mat1_axis_map.z] = out_lpos.z; - - ivec3 mat2_pos; - mat2_pos[mat2_axis_map.x] = out_lpos.x; - mat2_pos[mat2_axis_map.y] = 0; - mat2_pos[mat2_axis_map.z] = out_lpos.z; - - ivec3 mat2_pos_offset = ivec3(0); - mat2_pos_offset[mat2_axis_map.y] = 1; - - const int mat2_y_axis = mat2_axis_map.y; - - vec4 texel = vec4(0); - const int K = divup4(mat1_sizes.x); - - for (int i = 0; - i < K; - ++i, mat1_pos[mat1_axis_map.x]++, mat2_pos[mat2_axis_map.y]+=4) { - const vec4 mat1_tex = texelFetch(mat1_tensor, mat1_pos, 0); - - for (int r = 0; r < 4; ++r) { - // On-demand construction of mat2_pos appears to provide the lowest - // latency. Surprisingly, this doesn't translate to mat1_pos. - ivec3 mat2_pos = ivec3(0); - mat2_pos[mat2_axis_map.x] = out_lpos.x; - mat2_pos[mat2_axis_map.y] = 4 * i + r; - mat2_pos[mat2_axis_map.z] = out_lpos.z; - - vec4 mat1_comp_vec = vec4(mat1_tex[r]); - texel = fma(mat1_comp_vec, texelFetch(mat2_tensor, mat2_pos, 0), texel); - } - } - - return texel; -} - -void main() { - const ivec3 out_lpos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(out_lpos, out_limits))) { - return; - } - - vec4 texel = vec4(0); - -#ifdef MAT2_IS_TRANSPOSED - if (mat2_packed_dim == W_DIM) { - texel = matmul_naive_k_dim_packed(out_lpos); - } else { - texel = matmul_naive_k_dim_packed_row_dim_packed(out_lpos); - } -#else - if (mat2_packed_dim == W_DIM) { - texel = matmul_naive_k_dim_packed_row_dim_packed(out_lpos); - } else { - texel = matmul_naive_k_dim_packed(out_lpos); - } -#endif // MAT2_IS_TRANSPOSED - -#ifdef HAS_BIAS - vec4 bias_texel = get_bias_texel_W_packed(out_lpos); - texel = beta * bias_texel + alpha * texel; -#endif // HAS_BIAS - - write_texel_lpos(out_tensor, out_lpos, texel, out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.yaml deleted file mode 100644 index 33b617eed13..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_naive_texture3d.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -addmm_naive_texture3d: - parameter_names_with_default_values: - DTYPE: float - MAT2_IS_TRANSPOSED: false - HAS_BIAS: true - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: addmm_naive_texture3d - - NAME: matmul_naive_texture3d - HAS_BIAS: false - - NAME: linear_naive_texture3d - MAT2_IS_TRANSPOSED: true - - NAME: matmul_transposed_naive_texture3d - MAT2_IS_TRANSPOSED: true - HAS_BIAS: false diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl deleted file mode 100644 index 05c227f302c..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.glsl +++ /dev/null @@ -1,242 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if MAT2_IS_TRANSPOSED: - #define MAT2_IS_TRANSPOSED - -$if BATCH_MODE: - #define BATCH_MODE - -$if HAS_BIAS: - #define HAS_BIAS - -${layout_declare_tensor(B, "w", "out_tensor", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "mat1_tensor", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "mat2_tensor", DTYPE, "texture3d")} -$if HAS_BIAS: - ${layout_declare_tensor(B, "r", "bias_tensor", DTYPE, "texture3d")} -${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec4", "mat1_sizes")} -${layout_declare_ubo(B, "ivec4", "mat2_sizes")} -$if HAS_BIAS: - ${layout_declare_ubo(B, "ivec4", "bias_sizes")} - ${layout_declare_ubo(B, "float", "alpha", "float", "beta")} - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int out_packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "mat1_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 mat1_axis_map = unhash_axis_map(mat1_layout); - -${layout_declare_spec_const(C, "int", "mat2_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 mat2_axis_map = unhash_axis_map(mat2_layout); - -$if HAS_BIAS: - ${layout_declare_spec_const(C, "int", "bias_layout", "DEFAULT_LAYOUT")} - const lowp ivec4 bias_axis_map = unhash_axis_map(bias_layout); - -// To convince the SPIR-V compiler to unroll the loops optimally, need this -// macro -#define FOUR 4 - -#define TILE_ROWS ${TILE_ROWS} - -// we avoid mat4 and vec4 usage here as they compile to much less efficient -// SPIR-V -struct FloatMatrix_2d { - float data[TILE_ROWS][FOUR]; -}; - -struct FloatMatrix_3d { - float data[TILE_ROWS][FOUR][FOUR]; -}; - -#ifdef BATCH_MODE - #define FloatMatrix FloatMatrix_3d -#else - #define FloatMatrix FloatMatrix_2d -#endif // BATCH_MODE - -#ifdef HAS_BIAS -// get texel from self tensor (channel_packed) in addmm -vec4 get_texel_C_packed(const ivec2 idx) { - ivec3 bias_pos = ivec3(0); - if (bias_sizes.x > 1) { - bias_pos[bias_axis_map.x] = idx.x; - } - if (bias_sizes.y > 1) { - bias_pos[bias_axis_map.y] = idx.y; - } - - return texelFetch(bias_tensor, bias_pos, 0); -} -#endif // HAS_BIAS - -FloatMatrix matmul_partial(const ivec4 out_idx_tl) { - FloatMatrix results; - for (int i = 0; i < TILE_ROWS; i++) { - for (int j = 0; j < FOUR; j++) { -#ifdef BATCH_MODE - for (int k = 0; k < FOUR; k++) { - results.data[i][j][k] = 0.0f; - } -#else - 
results.data[i][j] = 0.0f; -#endif // BATCH_MODE - } - } - vec4 mat1_tensor_partial_load[TILE_ROWS]; - vec4 mat2_tensor_partial_load[FOUR]; - -#ifdef MAT2_IS_TRANSPOSED - const int mat2_k_axis = mat2_axis_map.x; - const int mat2_row_axis = mat2_axis_map.y; -#else - const int mat2_k_axis = mat2_axis_map.y; - const int mat2_row_axis = mat2_axis_map.x; -#endif // MAT2_IS_TRANSPOSED - -#ifdef BATCH_MODE - for (int batch_idx = 0; batch_idx < FOUR; batch_idx++) { - if (out_idx_tl.z + batch_idx >= out_sizes.z) { - break; - } -#endif // BATCH_MODE - for (int k = 0; k < mat1_sizes.x; k+=4) { - const int k_div4 = k >> 2; - // read and cache (4 x TILE_ROWS) tile of mat1 - for (int r = 0; r < TILE_ROWS; r++) { - ivec3 mat1_pos = ivec3(0); - mat1_pos[mat1_axis_map.x] = k_div4; - mat1_pos[mat1_axis_map.y] = out_idx_tl.y + r; -#ifdef BATCH_MODE - mat1_pos[mat1_axis_map.z] = out_idx_tl.z + batch_idx; -#endif // BATCH_MODE - - mat1_tensor_partial_load[r] = texelFetch(mat1_tensor, mat1_pos, 0); - } - - // read and cache (4 x 4) tile of mat2 - for (int r = 0; r < FOUR; ++r) { - ivec3 mat2_pos = ivec3(0); - mat2_pos[mat2_k_axis] = k_div4; - mat2_pos[mat2_row_axis] = out_idx_tl.x + r; -#if defined(BATCH_MODE) && !defined(MAT2_IS_TRANSPOSED) - mat2_pos[mat2_axis_map.z] = out_idx_tl.z + batch_idx; -#endif // BATCH_MODE - - mat2_tensor_partial_load[r] = texelFetch(mat2_tensor, mat2_pos, 0); - } - - // perform partial dot products and add partial result to results - for (int out_row = 0; out_row < TILE_ROWS; out_row++) { - for (int out_col = 0; out_col < FOUR; out_col++) { -#ifdef BATCH_MODE - results.data[out_row][out_col][batch_idx] += -#else - results.data[out_row][out_col] += -#endif // BATCH_MODE - dot(mat1_tensor_partial_load[out_row], mat2_tensor_partial_load[out_col]); - } - } - } -#ifdef BATCH_MODE - } -#endif // BATCH_MODE - - return results; -} - -// -// Write result matrix to output (3D matmul) -// - -void write_results_C_packed(const ivec4 out_idx_tl, FloatMatrix results) { - ivec3 out_pos = tidx_to_pos( - out_idx_tl, out_sizes, out_axis_map, out_packed_dim); - - for (int tile_c = 0; - tile_c < TILE_ROWS; - tile_c++, out_pos[out_axis_map.y]++) { - out_pos[out_axis_map.x] = out_idx_tl.x; - - for (int tile_r = 0; - tile_r < FOUR; - tile_r++, out_pos[out_axis_map.x]++) { - -#ifdef HAS_BIAS - ivec2 bias_idx; - bias_idx[bias_axis_map.x] = out_pos[out_axis_map.x]; - bias_idx[bias_axis_map.y] = out_pos[out_axis_map.y]; - float bias_val = get_texel_C_packed(bias_idx).x; -#ifdef BATCH_MODE - vec4 bias_texel = vec4(bias_val); -#else - vec4 bias_texel = vec4(bias_val, 0, 0, 0); -#endif // BATCH_MODE -#endif // HAS_BIAS - -#ifdef BATCH_MODE - vec4 out_texel = vec4( - results.data[tile_c][tile_r][0], - results.data[tile_c][tile_r][1], - results.data[tile_c][tile_r][2], - results.data[tile_c][tile_r][3]); -#else - vec4 out_texel = vec4( - results.data[tile_c][tile_r], - 0.0, - 0.0, - 0.0); -#endif // BATCH_MODE - -#ifdef HAS_BIAS - imageStore(out_tensor, out_pos, beta * bias_texel + alpha * out_texel); -#else - imageStore(out_tensor, out_pos, out_texel); -#endif // HAS_BIAS - } - } -} - -void main() { - // Each thread is responsible for calculating a (4 x TILE_ROWS x 1) tile of - // output elements. If the input matrices are 3D, then a (4 x TILE_ROWS x 4) - // tile of output elements will be computed. Note the sizes are written in - // (W x H x C) format. 
- const ivec3 tile_idx = ivec3(gl_GlobalInvocationID); - - // Calculate the tensor index of the top left element in the output tile - const ivec4 out_idx_topleft = ivec4( - tile_idx.x * 4, - tile_idx.y * TILE_ROWS, -#ifdef BATCH_MODE - tile_idx.z * 4, -#else - tile_idx.z, -#endif // BATCH_MODE - 0); - - // If the top left element is already out of range, then skip - if (any(greaterThanEqual(out_idx_topleft, out_sizes))) { - return; - } - - FloatMatrix results = matmul_partial(out_idx_topleft); - - write_results_C_packed(out_idx_topleft, results); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml b/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml deleted file mode 100644 index c82c2003d20..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/addmm_optimized.yaml +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -addmm_optimized: - parameter_names_with_default_values: - DTYPE: float - MAT2_IS_TRANSPOSED: false - BATCH_MODE: false - TILE_ROWS: 4 - HAS_BIAS: true - generate_variant_forall: - TILE_ROWS: - - VALUE: 4 - SUFFIX: tile_row_4 - - VALUE: 2 - SUFFIX: tile_row_2 - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: addmm_optimized - - NAME: matmul_optimized - HAS_BIAS: false - - NAME: linear_optimized - MAT2_IS_TRANSPOSED: true - - NAME: matmul_transposed_optimized - MAT2_IS_TRANSPOSED: true - HAS_BIAS: false - - NAME: batch_addmm_optimized - BATCH_MODE: true - - NAME: batch_matmul_optimized - BATCH_MODE: true - HAS_BIAS: false - - NAME: batch_linear_optimized - MAT2_IS_TRANSPOSED: true - BATCH_MODE: true - - NAME: batch_matmul_transposed_optimized - MAT2_IS_TRANSPOSED: true - BATCH_MODE: true - HAS_BIAS: false diff --git a/backends/vulkan/runtime/graph/ops/glsl/arange.glsl b/backends/vulkan/runtime/graph/ops/glsl/arange.glsl deleted file mode 100644 index 8b1841888ad..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/arange.glsl +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_ubo(1, "ivec4", "sizes")} -${layout_declare_ubo(2, "float", "start")} -${layout_declare_ubo(3, "float", "step")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim); - - if (pos_out_of_bounds(pos, sizes, packed_dim)) { - return; - } - - VEC4_T outtex = VEC4_T(start + pos.x * step, 0, 0, 0); - - imageStore(t_out, pos, outtex); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/arange.yaml b/backends/vulkan/runtime/graph/ops/glsl/arange.yaml deleted file mode 100644 index 37b2027db85..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/arange.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
-# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -arange: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: int32 - STORAGE: texture3d - PACKING: C_packed - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: arange diff --git a/backends/vulkan/runtime/graph/ops/glsl/avg_pool2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/avg_pool2d.glsl deleted file mode 100644 index 2db9f842d75..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/avg_pool2d.glsl +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(2, "ivec3", "out_limits")} -${layout_declare_ubo(3, "ivec4", "in_sizes")} -${layout_declare_ubo(4, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")} -${layout_declare_ubo(5, "int", "divisor_override", "int", "count_include_pad")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - const ivec2 ipos = pos.xy * stride - padding; - - const ivec2 start = max(ivec2(0), ipos); - const ivec2 end = min(ipos + kernel_size, ivec2(in_sizes.xy)); - - VEC4_T sum = VEC4_T(0); - for (int y = start.y; y < end.y; ++y) { - for (int x = start.x; x < end.x; ++x) { - sum += texelFetch(t_in, ivec3(x, y, pos.z), 0); - } - } - - int div; - if (divisor_override > 0) { - div = divisor_override; - } else if (count_include_pad > 0) { - ivec2 empty = max(ipos + kernel_size - padding - ivec2(in_sizes.xy), ivec2(0)); - div = (kernel_size.y - empty.y) * (kernel_size.x - empty.x); - } else { - div = (end.y - start.y) * (end.x - start.x); - } - imageStore(t_out, pos, sum / div); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/avg_pool2d.yaml b/backends/vulkan/runtime/graph/ops/glsl/avg_pool2d.yaml deleted file mode 100644 index b1e16dec8d6..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/avg_pool2d.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -avg_pool2d: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: avg_pool2d diff --git a/backends/vulkan/runtime/graph/ops/glsl/batchnorm.glsl b/backends/vulkan/runtime/graph/ops/glsl/batchnorm.glsl deleted file mode 100644 index c2fc5a56754..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/batchnorm.glsl +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
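Reviewer note: the subtlest part of `avg_pool2d.glsl` above is choosing the divisor for the window average. A scalar sketch of that selection logic follows, assuming the same meaning for the window bounds as in the shader (start/end are clamped to the input, ipos is the unclamped window origin); it is illustrative, not part of the deleted sources.

```cpp
// Sketch of the divisor selection in avg_pool2d.glsl.
#include <algorithm>

int pool_divisor(
    int divisor_override, bool count_include_pad,
    int kernel_w, int kernel_h,
    int start_x, int start_y, int end_x, int end_y,
    int ipos_x, int ipos_y, int pad_x, int pad_y,
    int in_w, int in_h) {
  if (divisor_override > 0) {
    return divisor_override;  // caller forces a fixed divisor
  }
  if (count_include_pad) {
    // Count padded positions too, except the part of the window that runs past
    // the padded input on the bottom/right ("empty" in the shader).
    const int empty_x = std::max(ipos_x + kernel_w - pad_x - in_w, 0);
    const int empty_y = std::max(ipos_y + kernel_h - pad_y - in_h, 0);
    return (kernel_h - empty_y) * (kernel_w - empty_x);
  }
  // Default: divide by the number of input elements actually summed.
  return (end_y - start_y) * (end_x - start_x);
}
```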
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "weight_in", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "bias_in", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "mean_in", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "var_in", DTYPE, STORAGE)} - -${layout_declare_ubo(B, "ivec3", "out_limits")} -${layout_declare_ubo(B, "float", "eps")} -${layout_declare_ubo(B, "int", "num_texel_per_batch")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - ivec3 pos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - VEC4_T v = VEC4_T(load_texel(t_in, pos)); - - ivec3 param_pos = ivec3(pos.z % num_texel_per_batch, 0, 0); - - VEC4_T weight = VEC4_T(load_texel(weight_in, param_pos)); - VEC4_T bias = VEC4_T(load_texel(bias_in, param_pos)); - VEC4_T mean = VEC4_T(load_texel(mean_in, param_pos)); - VEC4_T var = VEC4_T(load_texel(var_in, param_pos)); - - v = ((v - mean) / sqrt(var + eps)) * weight + bias; - - write_texel(t_out, pos, v); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/batchnorm.yaml b/backends/vulkan/runtime/graph/ops/glsl/batchnorm.yaml deleted file mode 100644 index 116773c816a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/batchnorm.yaml +++ /dev/null @@ -1,11 +0,0 @@ -batchnorm: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: batchnorm diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl deleted file mode 100644 index 6f2a93667ea..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -// Binary comparison ops require that the output is boolean and not the same as input. 
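Reviewer note: `batchnorm.glsl` above normalizes each texel with per-channel statistics. The scalar form of that update, written as a plain C++ reference for comparison against shader output:

```cpp
// Scalar reference for the normalization in batchnorm.glsl:
// v' = ((v - mean) / sqrt(var + eps)) * weight + bias
#include <cmath>

float batchnorm_ref(
    float v, float mean, float var, float weight, float bias, float eps) {
  return ((v - mean) / std::sqrt(var + eps)) * weight + bias;
}
```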
-$IS_COMPARISON_OP = (any([name in VARIANT_NAME for name in ["binary_eq", "binary_lt", "binary_le", "binary_gt", "binary_ge"]])) - -#define NAME ${VARIANT_NAME} - -#define VEC4_T ${texel_type(DTYPE)} -$if IS_COMPARISON_OP: - #define T ${buffer_scalar_type("uint8")} - #define VEC4_OUT_T ${texel_type("uint8")} -$else: - #define T ${buffer_scalar_type(DTYPE)} - #define VEC4_OUT_T VEC4_T - -#define op(X, Y, A) ${OPERATOR} - -${define_active_storage_type(STORAGE)} -${define_required_extensions(DTYPE)} - - -$if IS_COMPARISON_OP: - ${define_required_extensions("uint8")} - -layout(std430) buffer; - -#include "indexing.glslh" - -$if IS_COMPARISON_OP: - ${layout_declare_tensor(B, "w", "t_out", "uint8", STORAGE)} -$else: - ${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} - -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_other", DTYPE, STORAGE)} - -$if STORAGE == "buffer": - ${layout_declare_ubo(B, "BufferMetadata", "outp")} - ${layout_declare_ubo(B, "BufferMetadata", "inp")} - ${layout_declare_ubo(B, "BufferMetadata", "other")} - - layout(push_constant) uniform restrict Block { - float alpha; - }; -$else: - layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; - ivec4 other_sizes; - ivec2 broadcast_params; - float alpha; - }; - -#include "broadcasting_utils.h" -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")} - -$if STORAGE == "buffer": - const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); -$else: - const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); - const lowp int packed_dim = unhash_packed_dim(out_layout); - - const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - - const lowp ivec4 other_axis_map = unhash_axis_map(other_layout); - -#ifdef USING_BUFFER - -void main() { - const uint out_bufi = gl_GlobalInvocationID.x; - if (out_bufi >= numel(outp)) { - return; - } - - // Simple case; no broadcasting - if (are_equal(inp, other)) { - t_out[out_bufi] = T(op(t_in[out_bufi], t_other[out_bufi], T(alpha))); - return; - } - - TensorIndex outp_tidx; - linear_idx_to_tensor_idx(outp, out_bufi, outp_tidx); - - TensorIndex inp_tidx = outp_tidx; - clamp_tensor_idx(inp, inp_tidx); - - TensorIndex other_tidx = outp_tidx; - clamp_tensor_idx(other, other_tidx); - - uint inp_bufi = tensor_idx_to_linear_idx(inp, inp_tidx); - uint other_bufi = tensor_idx_to_linear_idx(other, other_tidx); - - t_out[out_bufi] = T(op(t_in[inp_bufi], t_other[other_bufi], T(alpha))); -} - -#else // USING_TEXTURE - -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - const ivec4 tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, packed_dim); - - if (any(greaterThanEqual(tidx, out_sizes))) { - return; - } - - // broadcast on logical sizes - ivec4 in_idx = broadcast_indices(tidx, in_sizes); - VEC4_T in_texel = VEC4_T(load_texel( - t_in, - // read axis mapped texel - tidx_to_pos(in_idx, in_sizes, in_axis_map, packed_dim))); - - // broadcast on logical sizes - ivec4 other_idx = broadcast_indices(tidx, other_sizes); - VEC4_T other_texel = VEC4_T(load_texel( - t_other, - // read axis mapped texel - tidx_to_pos(other_idx, other_sizes, other_axis_map, packed_dim))); - - // Check boolean broadcast flags; we use ivec2 instead of bvec2 for alignment. 
- if (broadcast_params.x > 0) { - in_texel = in_texel.xxxx; - } - if (broadcast_params.y > 0) { - other_texel = other_texel.xxxx; - } - - write_texel_lpos( - t_out, - lpos, - VEC4_OUT_T(op(in_texel, other_texel, alpha)), - out_axis_map); -} - -#endif diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml deleted file mode 100644 index 70793628d80..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.yaml +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -binary_op: - parameter_names_with_default_values: - OPERATOR: X + A * Y - NDIM: 3 - DTYPE: float - PACKING: C_packed - generate_variant_forall: - STORAGE: - - VALUE: texture3d - - VALUE: buffer - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: binary_add - - NAME: binary_sub - OPERATOR: X - A * Y - - NAME: binary_mul - OPERATOR: X * Y - - NAME: binary_div - OPERATOR: X / Y - - NAME: binary_pow - OPERATOR: pow(X, Y) - - NAME: binary_floor_divide - OPERATOR: floor(X / Y) - - NAME: binary_minimum - OPERATOR: min(X, Y) - - NAME: binary_eq_int32 - OPERATOR: X == Y - DTYPE: int32 - - NAME: binary_eq_buffer - OPERATOR: abs(X - Y) < 1e-5 - STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - NAME: binary_eq_texture3d - OPERATOR: all(lessThanEqual(abs(X - Y), VEC4_T(1e-5))) - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - NAME: binary_lt_buffer - OPERATOR: X < Y - STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - NAME: binary_lt_texture3d - OPERATOR: all(lessThan(X, Y)) - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - NAME: binary_le_buffer - OPERATOR: X <= Y - STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - NAME: binary_le_texture3d - OPERATOR: all(lessThanEqual(X, Y)) - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - NAME: binary_gt_buffer - OPERATOR: X > Y - STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - NAME: binary_gt_texture3d - OPERATOR: all(greaterThan(X, Y)) - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - NAME: binary_ge_buffer - OPERATOR: X >= Y - STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - NAME: binary_ge_texture3d - OPERATOR: all(greaterThanEqual(X, Y)) - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 diff --git a/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.glsl deleted file mode 100644 index ac39dd36fc3..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.glsl +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -${define_active_storage_type(STORAGE)} - -#include "indexing_utils.h" - -layout(std430) buffer; - -#extension GL_EXT_control_flow_attributes : require - -${layout_declare_buffer(B, "w", "nchw_out", "int")} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -$if USE_PUSH_CONST: - layout(push_constant) uniform restrict Block { - ivec4 tensor_sizes; - int out_numel; - }; -$else: - ${layout_declare_ubo(B, "ivec4", "tensor_sizes")} - ${layout_declare_ubo(B, "int", "out_numel")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 axis_map = unhash_axis_map(t_layout); -const lowp int packed_dim = unhash_packed_dim(t_layout); - -void main() { - const int out_buf_idx = int(gl_GlobalInvocationID.x); - // On the CPU, the number of elements is determined based on a buffer of int8 - // elements. However, on the GPU, since the int8 data type is not supported - // each group of 4 elements is interepreted as 1 int32 element. Thus each - // thread is actually writing to 4 output elements from the perspective of the - // CPU. - if (out_buf_idx * 4 >= out_numel) { - return; - } - - ivec4 values; - int in_buf_idx = 4 * out_buf_idx; - - [[unroll]] for (int i = 0; i < 4; ++i) { - const ivec4 tidx = nchwi_to_tidx(in_buf_idx, tensor_sizes); - const ivec4 texture_pos = to_texture_elem_pos( - tidx, tensor_sizes, packed_dim); - values[i] = ivec4(load_texel(t_in, texture_pos.xyz))[texture_pos.w]; - in_buf_idx++; - } - - // Manually pack 4x 8-bit integers into a 32 bit integer. Note that little - // endian is assumed, since most processors use little endian. Thus the - // "later" values are placed in most significant bytes. - int packed = ((values[3] & 0xFF) << 24) - | ((values[2] & 0xFF) << 16) - | ((values[1] & 0xFF) << 8) - | ((values[0] & 0xFF)); - - nchw_out[out_buf_idx] = packed; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.yaml deleted file mode 100644 index 0386c261203..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/bitw8_image_to_nchw_nobitw8buffer.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -bitw8_image_to_nchw_nobitw8buffer: - parameter_names_with_default_values: - STORAGE: texture3d - DTYPE: int8 - USE_PUSH_CONST: True - generate_variant_forall: - STORAGE: - - VALUE: texture2d - - VALUE: texture3d - DTYPE: - - VALUE: int8 - - VALUE: uint8 - shader_variants: - - NAME: bitw8_image_to_nchw_nobitw8buffer - - NAME: bitw8_image_to_nchw_nobitw8buffer_no_pc - USE_PUSH_CONST: False diff --git a/backends/vulkan/runtime/graph/ops/glsl/broadcasting_utils.h b/backends/vulkan/runtime/graph/ops/glsl/broadcasting_utils.h deleted file mode 100644 index 840e98a25ed..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/broadcasting_utils.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
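Reviewer note: the bitw8 shader above packs four 8-bit values into one 32-bit word because the GPU path has no int8 buffer type. A small standalone check of that little-endian packing (later elements land in the more significant bytes, so a byte-wise copy on a little-endian host restores the original order):

```cpp
// Illustrative check of the 4x int8 -> int32 packing used by
// bitw8_image_to_nchw_nobitw8buffer.glsl.
#include <cassert>
#include <cstdint>
#include <cstring>

int32_t pack4_le(const int8_t v[4]) {
  // Widen through uint32_t to avoid signed-shift pitfalls; same byte layout
  // as the shader's ((v3 & 0xFF) << 24) | ... | (v0 & 0xFF).
  const uint32_t b0 = static_cast<uint8_t>(v[0]);
  const uint32_t b1 = static_cast<uint8_t>(v[1]);
  const uint32_t b2 = static_cast<uint8_t>(v[2]);
  const uint32_t b3 = static_cast<uint8_t>(v[3]);
  return static_cast<int32_t>((b3 << 24) | (b2 << 16) | (b1 << 8) | b0);
}

int main() {
  const int8_t vals[4] = {1, 2, 3, 4};
  const int32_t packed = pack4_le(vals);
  int8_t round_trip[4];
  std::memcpy(round_trip, &packed, 4);  // little-endian host assumed
  for (int i = 0; i < 4; ++i) assert(round_trip[i] == vals[i]);
}
```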
- */ - -ivec4 broadcast_indices(const ivec4 out_idx, const ivec4 in_sizes) { - ivec4 in_idx = out_idx; - for (int i = 0; i < 4; ++i) { - if (out_idx[i] >= in_sizes[i]) { - in_idx[i] = 0; - } - } - return in_idx; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.glsl deleted file mode 100644 index 9d4b18f0d10..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.glsl +++ /dev/null @@ -1,23 +0,0 @@ -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "out_buf", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "in_buf", DTYPE, STORAGE)} -${layout_declare_ubo(2, "int", "numel")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - int tid = int(gl_GlobalInvocationID.x); - if (tid >= numel) { - return; - } - out_buf[tid] = in_buf[tid]; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.yaml deleted file mode 100644 index e8bb86dbf6a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_buffer.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -buffer_to_buffer: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - - VALUE: int8 - - VALUE: uint8 - - VALUE: int32 - shader_variants: - - NAME: buffer_to_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl deleted file mode 100644 index 6d164ae2645..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl +++ /dev/null @@ -1,36 +0,0 @@ -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing.glslh" - -${layout_declare_tensor(B, "w", "nchw_buf", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_inp", DTYPE, STORAGE)} - -${layout_declare_ubo(B, "BufferMetadata", "inp")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -// This constant is unused in this shader but is kept so that the signature is -// consistent with image_to_nchw. -${layout_declare_spec_const(C, "int", "unused", "0")} - -void main() { - uint inp_bufi = gl_GlobalInvocationID.x; - if (inp_bufi>= numel(inp)) { - return; - } - - TensorIndex inp_tidx; - linear_idx_to_tensor_idx(inp, inp_bufi, inp_tidx); - - uint nchwi = tensor_idx_to_contiguous_idx(inp, inp_tidx); - - nchw_buf[nchwi] = t_inp[inp_bufi]; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml deleted file mode 100644 index 929108cca5e..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
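Reviewer note: `broadcast_indices()` in `broadcasting_utils.h` above implements broadcasting by resetting an output coordinate to 0 wherever the input dimension has size 1. A standalone analogue for reference:

```cpp
// Standalone analogue of broadcast_indices() in broadcasting_utils.h.
#include <array>

std::array<int, 4> broadcast_indices(
    const std::array<int, 4>& out_idx, const std::array<int, 4>& in_sizes) {
  std::array<int, 4> in_idx = out_idx;
  for (int i = 0; i < 4; ++i) {
    if (out_idx[i] >= in_sizes[i]) {
      in_idx[i] = 0;  // broadcast dimension of size 1: always read element 0
    }
  }
  return in_idx;
}
```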
- -buffer_to_nchw: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - USE_PUSH_CONST: True - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - - VALUE: int8 - - VALUE: uint8 - - VALUE: int32 - shader_variants: - - NAME: buffer_to_nchw diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams.glslh b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams.glslh deleted file mode 100644 index cfe5baa9c1d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams.glslh +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef CHOOSE_QPARAMS_GLSLH -#define CHOOSE_QPARAMS_GLSLH - -// mapping_type : 0 = ASYM, 1 = SYM, 2 = SYM_NO_CLIP -void calc_scale_zp( - float lo, float hi, - int qmin, int qmax, - int mapping_type, - float eps, - out float scale, out int zp) { - // Handle case where lo and hi are +/-INF (no valid values found) - if (isinf(lo) || isinf(hi)) { - lo = 0.0; - hi = 0.0; - } - - float minv = min(lo, 0.0); - float maxv = max(hi, 0.0); - - if (mapping_type == 0) { // asymmetric - scale = (maxv - minv) / float(qmax - qmin); - - // Handle zero or very small scale - if (scale == 0.0 || isinf(1.0/scale)) { - scale = eps; - } - - if (scale < eps) { - float org_scale = scale; - scale = eps; - - // Adjust min and max based on new scale to maintain proper quantization range - if (minv == 0.0) { - maxv = eps * float(qmax - qmin); - } else if (maxv == 0.0) { - minv = -eps * float(qmax - qmin); - } else { - float amplifier = eps / org_scale; - minv *= amplifier; - maxv *= amplifier; - } - } - - // Calculate zero_point (matching reference implementation) - float initial_zero_point = float(qmin) - round(minv / scale); - zp = int(clamp(initial_zero_point, float(qmin), float(qmax))); - } else { // symmetric -- centred - float scale_sym; - if (mapping_type == 1) { // SYM - float M = max(abs(minv), abs(maxv)); - scale_sym = M / (float(qmax - qmin) * 0.5); - } else { // SYM_NO_CLIP - float smin = abs(minv) / max(abs(float(qmin)), 1.0); // Avoid division by zero - float smax = maxv / max(float(qmax), 1.0); // Avoid division by zero - scale_sym = max(smin, smax); - } - - // Handle zero or very small scale - if (scale_sym == 0.0 || isinf(1.0/scale_sym)) { - scale_sym = eps; - } - - scale = max(scale_sym, eps); - zp = int((qmax + qmin + 1) >> 1); // mid-point – always fits - } -} - -#endif // CHOOSE_QPARAMS_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_buffer.glsl deleted file mode 100644 index 7e21bcf0eba..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_buffer.glsl +++ /dev/null @@ -1,400 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
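Reviewer note: `calc_scale_zp()` in `choose_qparams.glslh` above maps an observed float range to a scale and zero point. A host-side reference covering only the asymmetric and symmetric mappings (the eps-rescaling and infinity handling of the shader are omitted) is sketched below; it reproduces the int4 worked example used later in the shader documentation, range [-3.5, 10.2] mapped to [-8, 7].

```cpp
// Host-side reference for the asymmetric/symmetric branches of calc_scale_zp().
#include <algorithm>
#include <cmath>
#include <cstdio>

void calc_scale_zp_ref(
    float lo, float hi, int qmin, int qmax, bool symmetric, float eps,
    float& scale, int& zp) {
  const float minv = std::min(lo, 0.f);
  const float maxv = std::max(hi, 0.f);
  if (!symmetric) {
    scale = std::max((maxv - minv) / float(qmax - qmin), eps);
    const float initial_zp = float(qmin) - std::round(minv / scale);
    zp = int(std::min(std::max(initial_zp, float(qmin)), float(qmax)));
  } else {
    const float max_abs = std::max(std::fabs(minv), std::fabs(maxv));
    scale = std::max(max_abs / (float(qmax - qmin) * 0.5f), eps);
    zp = (qmax + qmin + 1) >> 1;  // mid-point of the quantized range
  }
}

int main() {
  float scale; int zp;
  calc_scale_zp_ref(-3.5f, 10.2f, -8, 7, /*symmetric=*/false, 1e-5f, scale, zp);
  std::printf("asym: scale=%.4f zp=%d\n", scale, zp);  // ~0.9133, -4
  calc_scale_zp_ref(-3.5f, 10.2f, -8, 7, /*symmetric=*/true, 1e-5f, scale, zp);
  std::printf("sym:  scale=%.4f zp=%d\n", scale, zp);  // ~1.3600, 0
}
```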
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define IN_T ${buffer_scalar_type(IN_DTYPE)} -#define SCALE_OUT_T ${buffer_scalar_type(SCALE_OUT_DTYPE)} -#define ZP_OUT_T ${buffer_scalar_type(ZP_OUT_DTYPE)} - -#define ${MODE} - -${define_active_storage_type("buffer")} -${define_required_extensions(IN_DTYPE)} -${define_required_extensions(SCALE_OUT_DTYPE)} -${define_required_extensions(ZP_OUT_DTYPE)} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_scale", SCALE_OUT_DTYPE, "buffer")} -${layout_declare_tensor(B, "w", "t_zero_point", ZP_OUT_DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "buffer")} - -$if MODE == "per_tensor": - layout(push_constant) uniform restrict Block { - int quant_min; - int quant_max; - float eps; - }; -$if MODE == "per_token": - layout(push_constant) uniform restrict Block { - int num_tokens; - int quant_min; - int quant_max; - }; -$if MODE == "block_wise": - layout(push_constant) uniform BlockPC { - ivec4 blockSize; // WHCN (>=1) - ivec4 numBlocks; // #blocks along W,H,C,N - ivec4 blockStride; // {1, #W, #W * #H, #W * #H * #C} - int mapping_type; // 0=ASYM, 1=SYM, 2=SYM_NO_CLIP - int quant_min; - int quant_max; - float eps; - }; - -${layout_declare_ubo(B, "ivec4", "t_in_sizes")} -${layout_declare_ubo(B, "ivec4", "t_in_strides")} -${layout_declare_ubo(B, "ivec4", "t_scale_sizes")} -${layout_declare_ubo(B, "ivec4", "t_scale_strides")} -${layout_declare_ubo(B, "ivec4", "t_zero_point_sizes")} -${layout_declare_ubo(B, "ivec4", "t_zero_point_strides")} - -#include "indexing_utils.h" -#include "choose_qparams.glslh" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#define NWORKERS 64 - -// Shared memory for reduction - must match local work group size -shared float shared_min[NWORKERS]; -shared float shared_max[NWORKERS]; - -/* - Quantization Parameter Computation Shader (Buffer Storage) - This shader computes quantization parameters (scale and zero_point) for converting - floating-point tensors to n-bit integer representations while preserving the - original data range as much as possible. The computed parameters enable efficient - quantization by mapping the continuous floating-point range to discrete integer values. - - Important Considerations: - (+) The input tensor is assumed to be WIDTH_PACKED (i.e., contiguous in the last dimension) - - Workgroup Configuration: - - choose_qparams_per_tensor - This mode computes a single set of quantization parameters for the entire tensor. - Uses parallel reduction across all threads to find global min/max values. - - (*) global_wg_size: {1, 1, 1} (single workgroup processes entire tensor) - (*) local_wg_size: {64, 1, 1} (matches NWORKERS for shared memory) - - - choose_qparams_per_token - This mode computes separate quantization parameters for each token in the tensor. - Each workgroup processes one token independently to find token-specific min/max. - - (*) global_wg_size: {num_tokens, 1, 1} (one workgroup per token) - (*) local_wg_size: {1, 1, 1} (single thread per token) - - - choose_qparams_block_wise - This mode computes quantization parameters for each block of elements, allowing - fine-grained control over quantization granularity within the tensor. Each block - is processed independently to find its own min/max values and compute corresponding - scale and zero_point parameters. 
- - (*) global_wg_size: {nBlocks, 1u, 1u} (one workgroup per block) - (*) local_wg_size: {1, 1, 1} (single thread per block) - - Block-wise quantization supports multiple mapping types for scale/zero_point calculation: - - - mapping_type = 0 (ASYMMETRIC): - Uses asymmetric quantization where the full floating-point range [min, max] is - mapped to the quantized range [quant_min, quant_max]. This preserves the original - data distribution but may not center zero optimally. - - Calculation: - scale = (max - min) / (quant_max - quant_min) - zero_point = quant_min - round(min / scale) - - Example: For range [-3.5, 10.2] mapping to int4 [-8, 7]: - scale = (10.2 - (-3.5)) / (7 - (-8)) = 13.7 / 15 = 0.913 - zero_point = -8 - round(-3.5 / 0.913) = -8 - (-4) = -4 - - - mapping_type = 1 (SYMMETRIC): - Uses symmetric quantization where the range is centered around zero. The scale - is computed based on the maximum absolute value, ensuring zero is exactly - representable in the quantized domain. - - Calculation: - max_abs = max(abs(min), abs(max)) - scale = max_abs / ((quant_max - quant_min) / 2) - zero_point = (quant_max + quant_min + 1) / 2 // midpoint - - Example: For range [-3.5, 10.2] mapping to int4 [-8, 7]: - max_abs = max(3.5, 10.2) = 10.2 - scale = 10.2 / ((7 - (-8)) / 2) = 10.2 / 7.5 = 1.36 - zero_point = (-8 + 7 + 1) / 2 = 0 - - - mapping_type = 2 (SYMMETRIC_NO_CLIPPING_ERR): - A variant of symmetric quantization that minimizes clipping errors by computing - separate scales for positive and negative ranges, then using the maximum. This - reduces quantization error on the dominant range while ensuring no values are - clipped. - - Calculation: - smin = abs(min) / abs(quant_min) // scale for negative range - smax = max / quant_max // scale for positive range - scale = max(smin, smax) // use larger scale to avoid clipping - zero_point = (quant_max + quant_min + 1) / 2 // midpoint - - Example: For range [-3.5, 10.2] mapping to int4 [-8, 7]: - smin = 3.5 / 8 = 0.4375 - smax = 10.2 / 7 = 1.457 - scale = max(0.4375, 1.457) = 1.457 // use smax to avoid clipping positives - zero_point = (-8 + 7 + 1) / 2 = 0 - - Tree Reduction Algorithm for Min/Max Finding: - The shader uses a parallel tree reduction algorithm to efficiently find minimum and - maximum values across multiple threads. This approach reduces the number of memory - accesses and synchronization points compared to sequential scanning. - - Example with 8 threads processing values [10, 1, 8, 1, 0, 2, 3, 5]: - - Step 1 - Initial Population: - Each thread loads its assigned value into shared memory arrays. - shared_min: | 10 | 1 | 8 | 1 | 0 | 2 | 3 | 5 | - shared_max: | 10 | 1 | 8 | 1 | 0 | 2 | 3 | 5 | - Thread ID: | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | - - Step 2 - Stride 1 (Compare Adjacent Pairs): - Threads 0,2,4,6 compare with threads 1,3,5,7 respectively. - shared_min: | 1 | | 1 | | 0 | | 3 | | (min(10,1), min(8,1), min(0,2), min(3,5)) - shared_max: | 10 | | 8 | | 2 | | 5 | | (max(10,1), max(8,1), max(0,2), max(3,5)) - Active: | 0 | | 2 | | 4 | | 6 | | - - Step 3 - Stride 2 (Compare Pairs of Pairs): - Threads 0,4 compare with threads 2,6 respectively. - shared_min: | 1 | | | | 0 | | | | (min(1,1), min(0,3)) - shared_max: | 10 | | | | 5 | | | | (max(10,8), max(2,5)) - Active: | 0 | | | | 4 | | | | - - Step 4 - Stride 4 (Final Comparison): - Thread 0 compares with thread 4 to get final result. 
- shared_min: | 0 | | | | | | | | (min(1,0) = 0) - shared_max: | 10 | | | | | | | | (max(10,5) = 10) - Active: | 0 | | | | | | | | - - Final Result: global_min = 0, global_max = 10 (stored in shared_min[0], shared_max[0]) - - The tree reduction completes in log_2(N) steps where N is the number of threads, - providing O(log N) time complexity instead of O(N) for sequential reduction. - - Quantization Parameter Calculation: - Once min/max values are determined, the shader computes: - - scale = (max - min) / (quant_max - quant_min) - - zero_point = quantization offset to map floating-point zero to integer range - - Mode-Specific Behavior: - - Per-Tensor: Single workgroup with strided access across entire tensor - - Per-Token: Multiple workgroups, each processing one token independently - - Block-Wise: Each thread processes assigned blocks using nested loops over block dimensions -*/ - -#ifdef per_tensor - -void choose_qparams_per_tensor() { - uint global_id = gl_GlobalInvocationID.x; - uint local_id = gl_LocalInvocationID.x; - uint total_threads = gl_NumWorkGroups.x * gl_WorkGroupSize.x; - - uint total_elements = uint(t_in_sizes.x * t_in_sizes.y * t_in_sizes.z * t_in_sizes.w); - - // Each thread processes multiple elements with stride - float thread_min = 1.0/0.0; // +infinity - float thread_max = -1.0/0.0; // -infinity - bool found_valid = false; - - for (uint i = global_id; i < total_elements; i += total_threads) { - float val = t_in[i]; - if (!isnan(val) && !isinf(val)) { - if (!found_valid) { - thread_min = val; - thread_max = val; - found_valid = true; - } else { - thread_min = min(thread_min, val); - thread_max = max(thread_max, val); - } - } - } - - // Intra-group reduction using shared memory - shared_min[local_id] = thread_min; - shared_max[local_id] = thread_max; - barrier(); - - // Tree reduction within work group - for (uint stride = gl_WorkGroupSize.x / 2; stride > 0; stride >>= 1) { - if (local_id < stride) { - float other_min = shared_min[local_id + stride]; - float other_max = shared_max[local_id + stride]; - - if (!isinf(other_min) && (isinf(shared_min[local_id]) || other_min < shared_min[local_id])) { - shared_min[local_id] = other_min; - } - if (!isinf(other_max) && (isinf(shared_max[local_id]) || other_max > shared_max[local_id])) { - shared_max[local_id] = other_max; - } - } - barrier(); - } - - // Final result calculation (single workgroup only) - if (local_id == 0) { - float global_min = shared_min[0]; - float global_max = shared_max[0]; - - float scale_val; - int zero_point_val; - // Use default values: mapping_type=0 (ASYMMETRIC), eps from push constant - calc_scale_zp(global_min, global_max, quant_min, quant_max, 0, eps, scale_val, zero_point_val); - - t_scale[0] = SCALE_OUT_T(scale_val); - t_zero_point[0] = ZP_OUT_T(zero_point_val); - } -} - -#elif defined(per_token) - -void choose_qparams_per_token() { - uint total_elements = uint(t_in_sizes.x * t_in_sizes.y * t_in_sizes.z * t_in_sizes.w); - uint token_size = total_elements / uint(num_tokens); - - const uint TOTAL_TOKENS = uint(num_tokens); - - /* each invocation handles token-ids: id, id+STRIDE, id+2·STRIDE … */ - const uint STRIDE = gl_WorkGroupSize.x * gl_NumWorkGroups.x; - for (uint token_id = gl_GlobalInvocationID.x; token_id < TOTAL_TOKENS; token_id += STRIDE) { - // Calculate the start and end indices for this token - uint token_start = token_id * token_size; - uint token_end = token_start + token_size; - - // Each thread processes the entire token - float lo = 1.0/0.0; // +INF - float hi = -1.0/0.0; // -INF 
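  // (Editor's note, added comment:) The two divisions above are intended to
  // evaluate to +INF and -INF so that they act as identity values for the
  // running min/max; the first finite, non-NaN element encountered replaces
  // them via the found_valid flag below.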
- bool found_valid = false; - - // Process all elements in this token - for (uint i = token_start; i < token_end; i++) { - float val = t_in[i]; - if (!isnan(val) && !isinf(val)) { - if (!found_valid) { - lo = hi = val; - found_valid = true; - } else { - lo = min(lo, val); - hi = max(hi, val); - } - } - } - - if (!found_valid) { - // If no valid values were found, use default values - lo = 0.0; - hi = 0.0; - } - - // Calculate scale and zero point directly - float scale_val; - int zero_point_val; - // Use default values: mapping_type=0 (ASYMMETRIC), eps=1e-5 - calc_scale_zp(lo, hi, quant_min, quant_max, 0, 1e-5, scale_val, zero_point_val); - - // Write results - t_scale[token_id] = SCALE_OUT_T(scale_val); - t_zero_point[token_id] = ZP_OUT_T(zero_point_val); - } -} - -#elif defined(block_wise) - -ivec4 block_id_to_coord(uint bid) { - ivec4 bc; - bc.w = int(bid) / blockStride.w; - - int r = int(bid) - bc.w * blockStride.w; - bc.z = r / blockStride.z; - - r -= bc.z * blockStride.z; - bc.y = r / blockStride.y; - - r -= bc.y * blockStride.y; - bc.x = r; - return bc; -} - -void choose_qparams_block_wise() { - const uint TOTAL_BLOCKS = uint(numBlocks.x * numBlocks.y * numBlocks.z * numBlocks.w); - - // each invocation handles block-ids: id, id+STRIDE, id+2·STRIDE - const uint STRIDE = gl_WorkGroupSize.x * gl_NumWorkGroups.x; - for (uint block_id = gl_GlobalInvocationID.x; block_id < TOTAL_BLOCKS; block_id += STRIDE) { - // block -> WHCN coordinate - ivec4 bc = block_id_to_coord(block_id); - ivec4 blockStart = bc * blockSize; // first element (inclusive) - ivec4 blockEnd = blockStart + blockSize; // last element (exclusive) - - // min / max scan over the block - float lo = 1.0/0.0; // +INF - float hi = -1.0/0.0; // -INF - bool found_valid = false; - - // Calculate actual block dimensions - ivec4 actualBlockSize = blockEnd - blockStart; - int blockElements = actualBlockSize.x * actualBlockSize.y * actualBlockSize.z * actualBlockSize.w; - - // Linear iteration over block elements - for (int elemIdx = 0; elemIdx < blockElements; ++elemIdx) { - // Convert linear index to 4D coordinates within block - int remaining = elemIdx; - int dn = remaining / (actualBlockSize.x * actualBlockSize.y * actualBlockSize.z); - remaining -= dn * (actualBlockSize.x * actualBlockSize.y * actualBlockSize.z); - int dc = remaining / (actualBlockSize.x * actualBlockSize.y); - remaining -= dc * (actualBlockSize.x * actualBlockSize.y); - int dh = remaining / actualBlockSize.x; - int dw = remaining - dh * actualBlockSize.x; - - ivec4 tidx = blockStart + ivec4(dw, dh, dc, dn); - uint idx = tidx_to_bufi(tidx, t_in_strides); - float v = t_in[idx]; - - if (!isnan(v) && !isinf(v)) { - if (!found_valid) { - lo = hi = v; - found_valid = true; - } else { - lo = min(lo, v); - hi = max(hi, v); - } - } - } - - // Handle the case where no valid values were found in the block - if (!found_valid) { - lo = 0.0; - hi = 0.0; - } - - float scale_val; - int zero_point_val; - calc_scale_zp(lo, hi, quant_min, quant_max, mapping_type, eps, scale_val, zero_point_val); - - t_scale[block_id] = SCALE_OUT_T(scale_val); - t_zero_point[block_id] = ZP_OUT_T(zero_point_val); - } -} - -#endif - -void main() { - choose_qparams_${MODE}(); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_buffer.yaml deleted file mode 100644 index 8459b043baa..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_buffer.yaml +++ /dev/null @@ -1,22 +0,0 @@ 
-choose_qparams_buffer: - parameter_names_with_default_values: - IN_DTYPE: float - SCALE_OUT_DTYPE: float - ZP_OUT_DTYPE: int32 - MODE: per_tensor - generate_variant_forall: - IN_DTYPE: - - VALUE: float - SCALE_OUT_DTYPE: - - VALUE: float - ZP_OUT_DTYPE: - - VALUE: int32 - - VALUE: int8 - - VALUE: float - shader_variants: - - NAME: choose_qparams_tensor_buffer - MODE: per_tensor - - NAME: choose_qparams_per_token_asymmetric_buffer - MODE: per_token - - NAME: choose_qparams_block_wise_buffer - MODE: block_wise diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.glsl b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.glsl deleted file mode 100644 index 653b0a251c0..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.glsl +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} -#define T ${texel_load_component_type(DTYPE, STORAGE)} - -#define NUM_OUTPUTS_PER_WG ${NUM_OUTPUTS_PER_WG} -#define NUM_WORKERS_PER_OUTPUT ${NUM_WORKERS_PER_OUTPUT} - -// Maximum total threads in a work group -#define MAX_THREADS 256 - -${define_active_storage_type(STORAGE)} -${define_required_extensions("int8")} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -#include "common.glslh" - -${layout_declare_tensor(B, "w", "t_scales", "float", "buffer")} -${layout_declare_tensor(B, "w", "t_zps", "int", "buffer")} -${layout_declare_tensor(B, "r", "t_input", DTYPE, STORAGE, is_scalar_array=False)} - -${layout_declare_ubo(B, "ivec4", "input_sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(push_constant) uniform PushConstants { - int quant_min; - int quant_max; -}; - -// Shared memory for cooperative min/max finding -shared T shared_min[NUM_OUTPUTS_PER_WG][NUM_WORKERS_PER_OUTPUT]; -shared T shared_max[NUM_OUTPUTS_PER_WG][NUM_WORKERS_PER_OUTPUT]; - -const float SMALL_SCALE_THRESHOLD = 6.1e-5; - -void calculate_scale_and_zero_point( - float min_val, - float max_val, - int qmin, - int qmax, - out float scale, - out int8_t zero_point) { - - // Extend the [min, max] interval to ensure it contains 0 - min_val = min(min_val, 0.0); - max_val = max(max_val, 0.0); - - // Calculate scale - scale = (max_val - min_val) / float(qmax - qmin); - - // Handle special cases for scale - if (scale == 0.0 || isinf(1.0 / scale)) { - scale = 0.1; - } - - // Cut off small scale - if (scale < SMALL_SCALE_THRESHOLD) { - float org_scale = scale; - scale = SMALL_SCALE_THRESHOLD; - // Adjust the min and max based on the new scale - if (min_val == 0.0) { - max_val = SMALL_SCALE_THRESHOLD * float(qmax - qmin); - } else if (max_val == 0.0) { - min_val = -SMALL_SCALE_THRESHOLD * float(qmax - qmin); - } else { - float amplifier = SMALL_SCALE_THRESHOLD / org_scale; - min_val *= amplifier; - max_val *= amplifier; - } - } - - // Zero-point computation - float zero_point_from_min = float(qmin) - min_val / scale; - float zero_point_from_max = float(qmax) - max_val / scale; - float zero_point_from_min_error = abs(float(qmin)) - abs(min_val / scale); - float zero_point_from_max_error = abs(float(qmax)) - abs(max_val / scale); - - float initial_zero_point = zero_point_from_min_error < zero_point_from_max_error - ? 
zero_point_from_min - : zero_point_from_max; - - // Nudge zero point to be an integer - int nudged_zero_point; - if (initial_zero_point < float(qmin)) { - nudged_zero_point = qmin; - } else if (initial_zero_point > float(qmax)) { - nudged_zero_point = qmax; - } else { - nudged_zero_point = int(round(initial_zero_point)); - } - - zero_point = int8_t(nudged_zero_point); -} - -#ifdef USING_BUFFER - -VEC4_T load_input_x4(const int x4, const int y, const int ntexels_x) { - return t_input[(y * ntexels_x) + x4]; -} - -#else // USING_TEXTURE - -VEC4_T load_input_x4(const int x4, const int y, const int ntexels_x) { - return texelFetch(t_input, ivec3(x4, y, 0), 0); -} - -#endif // USING_BUFFER - -void main() { - const int worker_id = int(gl_LocalInvocationID.x); - const int output_id = int(gl_LocalInvocationID.y); - - const int output_y = int(gl_GlobalInvocationID.y); - - if (output_y >= input_sizes.y) { - return; - } - - // Input is 2D tensor (height x width), width-packed - // Each channel corresponds to a row in the tensor - const int X4 = div_4(input_sizes.x); - - // Initialize thread-local min/max - float local_min = 1e30; - float local_max = -1e30; - - // Each thread processes elements along their assigned output_id with stride - // NUM_WORKERS_PER_OUTPUT - for (int x4 = worker_id; x4 < X4; x4 += NUM_WORKERS_PER_OUTPUT) { - VEC4_T in_texel = load_input_x4(x4, output_y, X4); - for (int i = 0; i < 4; i++) { - local_min = min(local_min, in_texel[i]); - local_max = max(local_max, in_texel[i]); - } - } - - // Store thread-local results in shared memory - shared_min[output_id][worker_id] = local_min; - shared_max[output_id][worker_id] = local_max; - - memoryBarrierShared(); - barrier(); - - // Tree reduction to compute the overall result - for (int i = NUM_WORKERS_PER_OUTPUT / 2; i > 0; i >>= 1) { - if (worker_id < i) { - shared_min[output_id][worker_id] = min( - shared_min[output_id][worker_id], - shared_min[output_id][worker_id + i]); - shared_max[output_id][worker_id] = max( - shared_max[output_id][worker_id], - shared_max[output_id][worker_id + i]); - } - memoryBarrierShared(); - barrier(); - } - - // Only first thread will write out result - if (worker_id == 0) { - local_min = shared_min[output_id][0]; - local_max = shared_max[output_id][0]; - - float scale; - int8_t zero_point; - calculate_scale_and_zero_point( - local_min, local_max, quant_min, quant_max, scale, zero_point); - - t_scales[output_y] = scale; - t_zps[output_y] = zero_point; - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.yaml b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.yaml deleted file mode 100644 index 3608f7193bf..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_per_row.yaml +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -choose_qparams_per_row: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - NUM_OUTPUTS_PER_WG: 1 - NUM_WORKERS_PER_OUTPUT: 64 - generate_variant_forall: - STORAGE: - - VALUE: texture3d - - VALUE: buffer - DTYPE: - - VALUE: float - shader_variants: - - NAME: choose_qparams_per_row_o1w64 - - NAME: choose_qparams_per_row_o4w16 - NUM_OUTPUTS_PER_WG: 4 - NUM_WORKERS_PER_OUTPUT: 16 diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_texture.glsl deleted file mode 100644 index a17a3ae41dd..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_texture.glsl +++ /dev/null @@ -1,533 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define IN_T ${buffer_scalar_type(IN_DTYPE)} -#define FVEC4_T ${texel_load_type(IN_DTYPE, "texture3d")} -#define SCALE_OUT_T ${buffer_scalar_type(SCALE_OUT_DTYPE)} -#define ZP_OUT_T ${buffer_scalar_type(ZP_OUT_DTYPE)} - -#define ${MODE} - -${define_active_storage_type("texture3d")} -${define_required_extensions(IN_DTYPE)} -${define_required_extensions(SCALE_OUT_DTYPE)} -${define_required_extensions(ZP_OUT_DTYPE)} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -$if MODE != "block_wise": - ${layout_declare_tensor(B, "w", "t_scale", SCALE_OUT_DTYPE, "texture3d")} - ${layout_declare_tensor(B, "w", "t_zero_point", ZP_OUT_DTYPE, "texture3d")} -$else: - ${layout_declare_tensor(B, "w", "t_scale", SCALE_OUT_DTYPE, "buffer")} - ${layout_declare_tensor(B, "w", "t_zero_point", ZP_OUT_DTYPE, "buffer")} - -${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "texture3d")} - -$if MODE == "per_tensor": - layout(push_constant) uniform restrict Block { - int quant_min; - int quant_max; - float eps; - }; -$if MODE == "per_token": - layout(push_constant) uniform restrict Block { - int num_tokens; - int quant_min; - int quant_max; - }; -$if MODE == "block_wise": - layout(push_constant) uniform BlockPC { - ivec4 blockSize; // WHCN (>=1) - ivec4 numBlocks; // #blocks along W,H,C,N - ivec4 blockStride; // {1, #W, #W * #H, #W * #H * #C} - int mapping_type; // 0=ASYM, 1=SYM, 2=SYM_NO_CLIP - int quant_min; - int quant_max; - float eps; - }; - -${layout_declare_ubo(B, "ivec3", "t_in_limits")} -$if MODE != "block_wise": - ${layout_declare_ubo(B, "ivec3", "t_scale_limits")} - ${layout_declare_ubo(B, "ivec3", "t_zero_point_limits")} -$else: - ${layout_declare_ubo(B, "ivec4", "t_scale_sizes")} - ${layout_declare_ubo(B, "ivec4", "t_scale_strides")} - ${layout_declare_ubo(B, "ivec4", "t_zero_point_sizes")} - ${layout_declare_ubo(B, "ivec4", "t_zero_point_strides")} - - -#include "indexing_utils.h" -#include "choose_qparams.glslh" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#define NWORKERS 64 - -// Shared memory for reduction - must match local work group size -shared float shared_min[NWORKERS]; -shared float shared_max[NWORKERS]; - -/* - Quantization Parameter Computation Shader (Texture Storage) - This shader computes quantization parameters (scale and zero_point) for converting - floating-point tensors to n-bit integer representations while preserving the - original data range as much as possible.
The computed parameters enable efficient - quantization by mapping the continuous floating-point range to discrete integer values. - - Important Considerations: - (+) The input tensor is assumed to be WIDTH_PACKED (i.e., contiguous in the last dimension) - - Workgroup Configuration: - - choose_qparams_per_tensor - This mode computes a single set of quantization parameters for the entire tensor. - Uses parallel reduction across all threads to find global min/max values. - - (*) global_wg_size: default - (*) local_wg_size: default - - - choose_qparams_per_token - This mode computes separate quantization parameters for each token in the tensor. - Each workgroup processes one token independently to find token-specific min/max. - - (*) global_wg_size: default - (*) local_wg_size: {1, 1, 1} - - - choose_qparams_block_wise - This mode computes quantization parameters for each block of elements, allowing - fine-grained control over quantization granularity within the tensor. Each block - is processed independently to find its own min/max values and compute corresponding - scale and zero_point parameters. - - NOTE: This mode currently only supports buffer storage for the output. - - (*) global_wg_size: {nBlocks, 1u, 1u} (one workgroup per block) - (*) local_wg_size: {1, 1, 1} (single thread per block) - - Tree Reduction Algorithm for Min/Max Finding: - The shader uses a parallel tree reduction algorithm to efficiently find minimum and - maximum values across multiple threads. This approach reduces the number of memory - accesses and synchronization points compared to sequential scanning. - - Example with 8 threads processing values [10, 1, 8, 1, 0, 2, 3, 5]: - - Step 1 - Initial Population: - Each thread loads its assigned value into shared memory arrays. - shared_min: | 10 | 1 | 8 | 1 | 0 | 2 | 3 | 5 | - shared_max: | 10 | 1 | 8 | 1 | 0 | 2 | 3 | 5 | - Thread ID: | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | - - Step 2 - Stride 1 (Compare Adjacent Pairs): - Threads 0,2,4,6 compare with threads 1,3,5,7 respectively. - shared_min: | 1 | | 1 | | 0 | | 3 | | (min(10,1), min(8,1), min(0,2), min(3,5)) - shared_max: | 10 | | 8 | | 2 | | 5 | | (max(10,1), max(8,1), max(0,2), max(3,5)) - Active: | 0 | | 2 | | 4 | | 6 | | - - Step 3 - Stride 2 (Compare Pairs of Pairs): - Threads 0,4 compare with threads 2,6 respectively. - shared_min: | 1 | | | | 0 | | | | (min(1,1), min(0,3)) - shared_max: | 10 | | | | 5 | | | | (max(10,8), max(2,5)) - Active: | 0 | | | | 4 | | | | - - Step 4 - Stride 4 (Final Comparison): - Thread 0 compares with thread 4 to get final result. - shared_min: | 0 | | | | | | | | (min(1,0) = 0) - shared_max: | 10 | | | | | | | | (max(10,5) = 10) - Active: | 0 | | | | | | | | - - Final Result: global_min = 0, global_max = 10 (stored in shared_min[0], shared_max[0]) - - The tree reduction completes in log_2(N) steps where N is the number of threads, - providing O(log N) time complexity instead of O(N) for sequential reduction. 
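  (Editor's note: the following condensed sketch is not part of the original file.
  It restates the reduction loop described above for the NWORKERS = 64 local work
  group size that this shader uses; the NaN/Inf filtering performed by the real
  kernels below is omitted for brevity.)

  shared float shared_min[64];
  shared float shared_max[64];

  void reduce_min_max(const uint local_id) {
    // Six halving steps for 64 workers: strides 32, 16, 8, 4, 2, 1 (log2(64) = 6).
    for (uint stride = 32u; stride > 0u; stride >>= 1) {
      if (local_id < stride) {
        shared_min[local_id] = min(shared_min[local_id], shared_min[local_id + stride]);
        shared_max[local_id] = max(shared_max[local_id], shared_max[local_id + stride]);
      }
      barrier(); // every halving step must finish before the next one starts
    }
    // shared_min[0] / shared_max[0] now hold the work group's min and max.
  }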
- - Quantization Parameter Calculation: - Once min/max values are determined, the shader computes: - - scale = (max - min) / (quant_max - quant_min) - - zero_point = quantization offset to map floating-point zero to integer range - - Mode-Specific Behavior: - - Per-Tensor: Single workgroup with strided access across entire tensor - - Per-Token: Multiple workgroups, each processing one token independently -*/ - -#ifdef per_tensor - -void choose_qparams_per_tensor() { - uint global_id = gl_GlobalInvocationID.x; - uint local_id = gl_LocalInvocationID.x; - uint group_id = gl_WorkGroupID.x; - uint total_threads = gl_NumWorkGroups.x * gl_WorkGroupSize.x; - - uint total_texels = uint(t_in_limits.x * t_in_limits.y * t_in_limits.z); - - // Each thread processes multiple texels with stride - float thread_min = 1.0/0.0; // +infinity - float thread_max = -1.0/0.0; // -infinity - bool found_valid = false; - - // Process texels with stride across all threads - for (uint texel_idx = global_id; texel_idx < total_texels; texel_idx += total_threads) { - // Convert linear texel index to 3D coordinates - uint z = texel_idx / uint(t_in_limits.x * t_in_limits.y); - uint remainder = texel_idx % uint(t_in_limits.x * t_in_limits.y); - uint y = remainder / uint(t_in_limits.x); - uint x = remainder % uint(t_in_limits.x); - ivec3 texel_pos = ivec3(int(x), int(y), int(z)); - - FVEC4_T texel_data = load_texel(t_in, texel_pos); - - // For texture storage, we assume width-packed (packed_dim = 0) - // Calculate number of valid elements in this texel (handle padding) - int packed_dim = 0; // Width dimension is packed - ivec4 sizes = ivec4(t_in_limits, 1); // Convert limits to sizes format - ivec4 tensor_coord = to_tensor_idx(texel_pos, sizes, packed_dim); - - // Calculate total tensor elements to determine padding - int total_elements = t_in_limits.x * t_in_limits.y * t_in_limits.z * 4; - int linear_tensor_idx = tensor_coord.x + tensor_coord.y * sizes.x + - tensor_coord.z * sizes.x * sizes.y; - int remaining_elements = total_elements - (linear_tensor_idx); - int valid_elements = min(4, remaining_elements); - - // Find min/max within this texel, considering only valid elements - if (valid_elements >= 1 && !isnan(texel_data.x) && !isinf(texel_data.x)) { - if (!found_valid) { - thread_min = texel_data.x; - thread_max = texel_data.x; - found_valid = true; - } else { - thread_min = min(thread_min, texel_data.x); - thread_max = max(thread_max, texel_data.x); - } - } - - if (valid_elements >= 2 && !isnan(texel_data.y) && !isinf(texel_data.y)) { - if (!found_valid) { - thread_min = texel_data.y; - thread_max = texel_data.y; - found_valid = true; - } else { - thread_min = min(thread_min, texel_data.y); - thread_max = max(thread_max, texel_data.y); - } - } - - if (valid_elements >= 3 && !isnan(texel_data.z) && !isinf(texel_data.z)) { - if (!found_valid) { - thread_min = texel_data.z; - thread_max = texel_data.z; - found_valid = true; - } else { - thread_min = min(thread_min, texel_data.z); - thread_max = max(thread_max, texel_data.z); - } - } - - if (valid_elements >= 4 && !isnan(texel_data.w) && !isinf(texel_data.w)) { - if (!found_valid) { - thread_min = texel_data.w; - thread_max = texel_data.w; - found_valid = true; - } else { - thread_min = min(thread_min, texel_data.w); - thread_max = max(thread_max, texel_data.w); - } - } - } - - // Intra-workgroup reduction using shared memory - shared_min[local_id] = thread_min; - shared_max[local_id] = thread_max; - barrier(); - - // Tree reduction within work group - for (uint stride = 
gl_WorkGroupSize.x / 2; stride > 0; stride >>= 1) { - if (local_id < stride) { - float other_min = shared_min[local_id + stride]; - float other_max = shared_max[local_id + stride]; - - if (!isinf(other_min) && (isinf(shared_min[local_id]) || other_min < shared_min[local_id])) { - shared_min[local_id] = other_min; - } - if (!isinf(other_max) && (isinf(shared_max[local_id]) || other_max > shared_max[local_id])) { - shared_max[local_id] = other_max; - } - } - barrier(); - } - - // Final result calculation (single workgroup only for reliability) - if (local_id == 0 && group_id == 0) { - float global_min = shared_min[0]; - float global_max = shared_max[0]; - - float scale_val; - int zero_point_val; - calc_scale_zp(global_min, global_max, quant_min, quant_max, 0, eps, scale_val, zero_point_val); - - write_texel(t_scale, ivec3(0, 0, 0), vec4(SCALE_OUT_T(scale_val), 0.0, 0.0, 0.0)); - write_texel(t_zero_point, ivec3(0, 0, 0), ivec4(ZP_OUT_T(zero_point_val), 0, 0, 0)); - } -} - -#elif defined(per_token) - -void choose_qparams_per_token() { - // Each token is processed by multiple workgroups for parallel reduction - uint local_id = gl_LocalInvocationID.x; - uint group_id = gl_WorkGroupID.x; - uint total_workgroups = gl_NumWorkGroups.x; - - uint total_texels = uint(t_in_limits.x * t_in_limits.y * t_in_limits.z); - - // Calculate texels per token (assuming last dimension contains the token data) - // For per-token quantization, we assume tokens are along the last dimension - uint texels_per_token = total_texels / uint(num_tokens); - - // Calculate how many tokens each workgroup should process - uint tokens_per_workgroup = (uint(num_tokens) + total_workgroups - 1) / total_workgroups; - - // Calculate which tokens this workgroup is responsible for - uint start_token = group_id * tokens_per_workgroup; - uint end_token = min(start_token + tokens_per_workgroup, uint(num_tokens)); - - // Process each token assigned to this workgroup - for (uint token_id = start_token; token_id < end_token; token_id++) { - // Calculate the texel range for this token - uint token_start_texel = token_id * texels_per_token; - uint token_end_texel = token_start_texel + texels_per_token; - - // Each thread processes multiple texels within the token - float thread_min = 1.0/0.0; // +infinity - float thread_max = -1.0/0.0; // -infinity - bool found_valid = false; - - // Process texels within this token only - for (uint texel_idx = token_start_texel + local_id; texel_idx < token_end_texel; texel_idx += gl_WorkGroupSize.x) { - // Convert linear texel index to 3D coordinates - uint z = texel_idx / uint(t_in_limits.x * t_in_limits.y); - uint remainder = texel_idx % uint(t_in_limits.x * t_in_limits.y); - uint y = remainder / uint(t_in_limits.x); - uint x = remainder % uint(t_in_limits.x); - ivec3 texel_pos = ivec3(int(x), int(y), int(z)); - - FVEC4_T texel_data = load_texel(t_in, texel_pos); - - // For texture storage, we assume width-packed (packed_dim = 0) - // Calculate number of valid elements in this texel (handle padding) - int packed_dim = 0; // Width dimension is packed - ivec4 sizes = ivec4(t_in_limits, 1); // Convert limits to sizes format - ivec4 tensor_coord = to_tensor_idx(texel_pos, sizes, packed_dim); - - // Calculate total tensor elements to determine padding - int total_elements = t_in_limits.x * t_in_limits.y * t_in_limits.z * 4; - int linear_tensor_idx = tensor_coord.x + tensor_coord.y * sizes.x + - tensor_coord.z * sizes.x * sizes.y; - int remaining_elements = total_elements - (linear_tensor_idx); - int 
valid_elements = min(4, remaining_elements); - - // Find min/max within this texel, considering only valid elements - if (valid_elements >= 1 && !isnan(texel_data.x) && !isinf(texel_data.x)) { - if (!found_valid) { - thread_min = texel_data.x; - thread_max = texel_data.x; - found_valid = true; - } else { - thread_min = min(thread_min, texel_data.x); - thread_max = max(thread_max, texel_data.x); - } - } - - if (valid_elements >= 2 && !isnan(texel_data.y) && !isinf(texel_data.y)) { - if (!found_valid) { - thread_min = texel_data.y; - thread_max = texel_data.y; - found_valid = true; - } else { - thread_min = min(thread_min, texel_data.y); - thread_max = max(thread_max, texel_data.y); - } - } - - if (valid_elements >= 3 && !isnan(texel_data.z) && !isinf(texel_data.z)) { - if (!found_valid) { - thread_min = texel_data.z; - thread_max = texel_data.z; - found_valid = true; - } else { - thread_min = min(thread_min, texel_data.z); - thread_max = max(thread_max, texel_data.z); - } - } - - if (valid_elements >= 4 && !isnan(texel_data.w) && !isinf(texel_data.w)) { - if (!found_valid) { - thread_min = texel_data.w; - thread_max = texel_data.w; - found_valid = true; - } else { - thread_min = min(thread_min, texel_data.w); - thread_max = max(thread_max, texel_data.w); - } - } - } - - // Intra-workgroup reduction using shared memory - shared_min[local_id] = thread_min; - shared_max[local_id] = thread_max; - barrier(); - - // Tree reduction within work group - for (uint stride = gl_WorkGroupSize.x / 2; stride > 0; stride >>= 1) { - if (local_id < stride) { - float other_min = shared_min[local_id + stride]; - float other_max = shared_max[local_id + stride]; - - // Handle infinity values properly - if (!isinf(other_min) && (isinf(shared_min[local_id]) || other_min < shared_min[local_id])) { - shared_min[local_id] = other_min; - } - if (!isinf(other_max) && (isinf(shared_max[local_id]) || other_max > shared_max[local_id])) { - shared_max[local_id] = other_max; - } - } - barrier(); - } - - // Final calculation for this token - if (local_id == 0) { - float token_min = shared_min[0]; - float token_max = shared_max[0]; - - float scale_val; - int zero_point_val; - calc_scale_zp(token_min, token_max, quant_min, quant_max, 0, 1e-5, scale_val, zero_point_val); - - // Convert token_id to 3D coordinates for output texture - // Assuming output tensors have the same layout as input but with different dimensions - uint out_z = token_id / uint(t_scale_limits.x * t_scale_limits.y); - uint out_remainder = token_id % uint(t_scale_limits.x * t_scale_limits.y); - uint out_y = out_remainder / uint(t_scale_limits.x); - uint out_x = out_remainder % uint(t_scale_limits.x); - ivec3 out_pos = ivec3(int(out_x), int(out_y), int(out_z)); - - write_texel(t_scale, out_pos, vec4(SCALE_OUT_T(scale_val), 0.0, 0.0, 0.0)); - write_texel(t_zero_point, out_pos, ivec4(ZP_OUT_T(zero_point_val), 0, 0, 0)); - } - - // Synchronize before processing next token - barrier(); - } -} - -#elif defined(block_wise) - -ivec4 block_id_to_coord(uint bid) { - ivec4 bc; - bc.w = int(bid) / blockStride.w; - - int r = int(bid) - bc.w * blockStride.w; - bc.z = r / blockStride.z; - - r -= bc.z * blockStride.z; - bc.y = r / blockStride.y; - - r -= bc.y * blockStride.y; - bc.x = r; - return bc; -} - -void choose_qparams_block_wise() { - const uint T = uint(numBlocks.x * numBlocks.y * numBlocks.z * numBlocks.w); - const uint STRIDE = gl_WorkGroupSize.x * gl_NumWorkGroups.x; - - // tensor full size in WHCN order - const ivec4 tensorSz = blockSize * numBlocks; - - // 
Process blocks with stride for better parallelization - for (uint blkIdx = gl_GlobalInvocationID.x; blkIdx < T; blkIdx += STRIDE) { - // block index in WHCN - const ivec4 b4d = block_id_to_coord(blkIdx); - const ivec4 blockStart = b4d * blockSize; - const ivec4 blockEnd = blockStart + blockSize; - - // scan all elements inside the block - float vmin = 3.402823e38; // +FLT_MAX - float vmax = -3.402823e38; // -FLT_MAX - bool found_valid = false; - - // Calculate total elements in block for linear iteration - const int blockElements = blockSize.x * blockSize.y * blockSize.z * blockSize.w; - - // Linear iteration over block elements (more cache-friendly) - for (int elemIdx = 0; elemIdx < blockElements; ++elemIdx) { - // Convert linear index to 4D coordinates within block - int remaining = elemIdx; - int dn = remaining / (blockSize.x * blockSize.y * blockSize.z); - remaining -= dn * (blockSize.x * blockSize.y * blockSize.z); - int dc = remaining / (blockSize.x * blockSize.y); - remaining -= dc * (blockSize.x * blockSize.y); - int dh = remaining / blockSize.x; - int dw = remaining - dh * blockSize.x; - - ivec4 tidx = blockStart + ivec4(dw, dh, dc, dn); - - // skip padding when tensor size is not an exact multiple of block - if (any(greaterThanEqual(tidx, tensorSz))) { continue; } - - // tensor index -> (x,y,z,component) inside input texture - ivec4 posi = to_texture_elem_pos(tidx, tensorSz, 0); // 0 = W_DIM (width packed) - - // fetch texel and pick the element inside it - FVEC4_T texl = load_texel(t_in, posi.xyz); - float v; - if (posi.w == 0) v = texl.x; - else if (posi.w == 1) v = texl.y; - else if (posi.w == 2) v = texl.z; - else v = texl.w; - - if (!isnan(v) && !isinf(v)) { - if (!found_valid) { - vmin = vmax = v; - found_valid = true; - } else { - vmin = min(vmin, v); - vmax = max(vmax, v); - } - } - } - - // Handle case where no valid values were found - if (!found_valid) { - vmin = 0.0; - vmax = 0.0; - } - - // compute scale / zero‑point (same maths as buffer kernel) - float scale; - int zp; - calc_scale_zp(vmin, vmax, quant_min, quant_max, mapping_type, eps, scale, zp); - - // Write the scalar values directly to buffer using linear index - t_scale[blkIdx] = SCALE_OUT_T(scale); - t_zero_point[blkIdx] = ZP_OUT_T(zp); - } -} - -#endif - -void main() { - choose_qparams_${MODE}(); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_texture.yaml deleted file mode 100644 index 12228822d4b..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/choose_qparams_texture.yaml +++ /dev/null @@ -1,22 +0,0 @@ -choose_qparams_texture: - parameter_names_with_default_values: - IN_DTYPE: float - SCALE_OUT_DTYPE: float - ZP_OUT_DTYPE: int32 - MODE: per_tensor - generate_variant_forall: - IN_DTYPE: - - VALUE: float - SCALE_OUT_DTYPE: - - VALUE: float - ZP_OUT_DTYPE: - - VALUE: int32 - - VALUE: int8 - - VALUE: float - shader_variants: - - NAME: choose_qparams_tensor_texture3d - MODE: per_tensor - - NAME: choose_qparams_per_token_asymmetric_texture3d - MODE: per_token - - NAME: choose_qparams_block_wise_texture3d - MODE: block_wise diff --git a/backends/vulkan/runtime/graph/ops/glsl/clone.glsl b/backends/vulkan/runtime/graph/ops/glsl/clone.glsl deleted file mode 100644 index 3bd1af8bb0c..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/clone.glsl +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#include "indexing_utils.h" - -#define PRECISION ${PRECISION} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec3", "out_limits")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - ivec3 pos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - imageStore(t_out, pos, load_texel(t_in, pos)); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/clone.yaml b/backends/vulkan/runtime/graph/ops/glsl/clone.yaml deleted file mode 100644 index 1fdbf506bfd..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/clone.yaml +++ /dev/null @@ -1,11 +0,0 @@ -clone: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: clone diff --git a/backends/vulkan/runtime/graph/ops/glsl/col2im.glsl b/backends/vulkan/runtime/graph/ops/glsl/col2im.glsl deleted file mode 100644 index c105ef18719..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/col2im.glsl +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, OUTPUT_STORAGE)} -#define T ${texel_load_component_type(DTYPE, OUTPUT_STORAGE)} - -$if OUTPUT_STORAGE == "buffer": - #define OUTPUT_BUFFER -$if INPUT_STORAGE == "buffer": - #define INPUT_BUFFER - -#define TILE_M4 1 -#define TILE_N4 1 -#define TILE_K4 1 - -#define TILE_M 4 -#define TILE_N 4 -#define TILE_K 4 - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "conv2d_common.glslh" - -${layout_declare_tensor(B, "w", "t_output", DTYPE, OUTPUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)} - -// Sizes of the convolution output image -${layout_declare_ubo(B, "ivec4", "output_sizes")} -// Sizes of the convolution input image -${layout_declare_ubo(B, "ivec4", "input_sizes")} -// Sizes of the im2col matrix of the convolution output -${layout_declare_ubo(B, "ivec4", "matrix_sizes")} - -${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#include "conv2d_fp_im2col_block_store.glslh" - -#ifdef INPUT_BUFFER - -void load_matrix_tile( - out FPOutTile tile, - const int n4, - const int m_start, - const int N4) { - [[unroll]] for (int m = 0; m < TILE_M; m++) { - tile.data[m][0] = t_input[(m_start + m) * N4 + n4]; - } -} - -#else // INPUT_TEXTURE - -void load_matrix_tile( - out FPOutTile tile, - const int n4, - const int m_start, - const int N4) { - [[unroll]] for (int m = 0; m < TILE_M; m++) { - tile.data[m][0] = texelFetch( - t_input, ivec3(n4, m_start + m, 0), 0); - } -} - -#endif // INPUT_BUFFER - -void main() { - // Each thread loads and writes a 4 wide x 4 high block of the matrix - const int n4 = int(gl_GlobalInvocationID.x); - const int m4 = int(gl_GlobalInvocationID.y); - - const int n = mul_4(n4); - const int m = mul_4(m4); - - if 
(n >= matrix_sizes.x || m >= matrix_sizes.y) { - return; - } - - FPOutTile tile; - - const int N4 = div_4(matrix_sizes.x); - load_matrix_tile(tile, n4, m, N4); - write_im2col_tile_as_image(tile, n4, m); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/col2im.yaml b/backends/vulkan/runtime/graph/ops/glsl/col2im.yaml deleted file mode 100644 index b6d0972271a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/col2im.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -col2im: - parameter_names_with_default_values: - DTYPE: float - OUTPUT_STORAGE: texture3d - INPUT_STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: col2im_texture3d_buffer - - NAME: col2im_texture3d_texture3d - INPUT_STORAGE: texture3d diff --git a/backends/vulkan/runtime/graph/ops/glsl/common.glslh b/backends/vulkan/runtime/graph/ops/glsl/common.glslh deleted file mode 100644 index 732b7006c2c..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/common.glslh +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef COMMON_GLSLH -#define COMMON_GLSLH - -#define mul_2(x) ((x) << 1) -#define mul_4(x) ((x) << 2) -#define mul_8(x) ((x) << 3) - -#define div_2(x) ((x) >> 1) -#define div_4(x) ((x) >> 2) -#define div_8(x) ((x) >> 3) - -#define div_up_2(x) (((x) + 1) >> 1) -#define div_up_4(x) (((x) + 3) >> 2) -#define div_up_8(x) (((x) + 7) >> 3) - -#define align_up_2(x) ((x + 1) & -2) -#define align_up_4(x) ((x + 3) & -4) -#define align_up_8(x) ((x + 7) & -8) - -#define mod_2(x) ((x) & 1) -#define mod_4(x) ((x) & 3) -#define mod_8(x) ((x) & 7) - -struct TensorIndex4D { - ivec4 data; -}; - -#ifdef DEBUG_MODE - -#extension GL_EXT_debug_printf : require - -void printTensorIndex4D(const TensorIndex4D index) { - debugPrintfEXT( - "tensor_idx: %d, %d, %d, %d\\n", - index.data.x, - index.data.y, - index.data.z, - index.data.w); -} - -#endif // DEBUG_MODE - -#endif // COMMON_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.glsl deleted file mode 100644 index e34ecaf8309..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.glsl +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type("buffer")} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "rw", "t_out", DTYPE, "buffer")} - -$for i in range(NUM_INPUTS): - ${layout_declare_tensor(B, "r", "t_inp" + str(i), DTYPE, "buffer")} - -${layout_declare_tensor(B, "r", "t_concat_offset", "int", "buffer")} - -${layout_declare_ubo(B, "int", "concat_dim")} - -${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec4", "out_strides")} - -$for i in range(NUM_INPUTS): - ${layout_declare_ubo(B, "ivec4", "inp" + str(i) + "_sizes")} - ${layout_declare_ubo(B, "ivec4", "inp" + str(i) + "_strides")} - -${layout_declare_ubo(B, "int", "out_numel")} - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} - -const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#define NUM_INPUTS ${NUM_INPUTS} - -#include "concat_utils.glslh" - -/* - * This shader template concatenates up to NUM_INPUT input tensors to the - * output tensor along the concat_dim. Elements from the input tensor will - * be inserted along the output's concat_dim starting at concat_offset. - */ -void main() { - const int tid = ivec3(gl_GlobalInvocationID).x; - - // The 1-3 input tensors are interpreted as one concatenated tensor ("volume") - // along the concat_dim for the purposes of tensor indexing. Each thread is - // responsible for reading one item from this volume and writing it to the - // appropriate output location. - ivec4 inp_volume_sizes = out_sizes; - inp_volume_sizes[concat_dim] = total_concat_dim_numel(); - - // Account for 0 size input tensors - if (any(lessThanEqual(inp_volume_sizes, ivec4(0)))) { - return; - } - - ivec4 inp_volume_tidx = nchwi_to_tidx(tid, inp_volume_sizes); - - // bounds check - if (any(greaterThanEqual(inp_volume_tidx, inp_volume_sizes))) { - return; - } - - int concat_offset = t_concat_offset[0]; - - ivec4 out_tidx = inp_volume_tidx; - out_tidx[concat_dim] += concat_offset; - - const uint out_bufi = tidx_to_bufi(out_tidx, out_strides); - - // Go through the list of input tensors, and find which input this output - // element should be read from. 
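  // (Editor's note: the worked example below was added for clarity and is not
  //  part of the original source; the sizes are hypothetical.)
  // Suppose two inputs are concatenated along concat_dim with sizes 3 and 5.
  // An element whose volume index along concat_dim is 6 fails the first check
  // (6 >= 3), so 3 is subtracted and index 3 is read from the second input.
  // The unrolled checks generated below perform exactly this walk, subtracting
  // each input's size along concat_dim until the index falls inside one of the
  // inputs.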
- $for i in range(NUM_INPUTS): - if (inp_volume_tidx[concat_dim] < inp${i}_sizes[concat_dim]) { - int inp_bufi = tidx_to_bufi(inp_volume_tidx, inp${i}_strides); - t_out[out_bufi] = t_inp${i}[inp_bufi]; - return; - } - else { - inp_volume_tidx[concat_dim] -= inp${i}_sizes[concat_dim]; - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.yaml deleted file mode 100644 index 39f96df5e90..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.yaml +++ /dev/null @@ -1,14 +0,0 @@ -concat_buffer: - parameter_names_with_default_values: - DTYPE: float - NUM_INPUTS: 2 - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: concat_1_buffer - NUM_INPUTS: 1 - - NAME: concat_2_buffer - - NAME: concat_3_buffer - NUM_INPUTS: 3 diff --git a/backends/vulkan/runtime/graph/ops/glsl/concat_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/concat_texture.glsl deleted file mode 100644 index afab0c524d6..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/concat_texture.glsl +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} - -#define USING_TEXTURE3D - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "rw", "t_out", DTYPE, "texture3d")} - -$for i in range(NUM_INPUTS): - ${layout_declare_tensor(B, "r", "t_inp" + str(i), DTYPE, "texture3d")} - -${layout_declare_tensor(B, "r", "t_concat_offset", "int", "buffer")} - -${layout_declare_ubo(B, "int", "concat_dim")} - -$in_metadata = "" -$for i in range(NUM_INPUTS): - $in_metadata += "ivec4 inp" + str(i) + "_sizes;\n" - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ${in_metadata} -}; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int out_packed_dim = unhash_packed_dim(out_layout); - -$for i in range(NUM_INPUTS): - ${layout_declare_spec_const(C, "int", "inp" + str(i) + "_layout", "DEFAULT_LAYOUT")} - const lowp ivec4 inp${i}_axis_map = unhash_axis_map(inp${i}_layout); - const lowp int inp${i}_packed_dim = unhash_packed_dim(inp${i}_layout); - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#define NUM_INPUTS ${NUM_INPUTS} - -#include "concat_utils.glslh" - -/* - * This shader template concatenates up to NUM_INPUT input tensors to the - * output tensor along the concat_dim. Elements from the input tensor will - * be inserted along the output's concat_dim starting at concat_offset. - * - * Each thread is responsible for writing out one output texel. The data - * required for the output texel may be read from multiple input texels of one - * input tensor. - */ -void main() { - const int tid = ivec3(gl_GlobalInvocationID).x; - - // Sum of the sizes of all input tensors along the concat_dim - const int concat_numel = total_concat_dim_numel(); - - // The 1-3 input tensors are interpreted as one concatenated tensor ("volume") - // along the concat_dim for the purposes of tensor indexing. 
Each thread is - // responsible for writing out 4 elements along the packed dim of the output - // tensor by reading the source data from the input tensor(s). - ivec4 inp_volume_sizes = out_sizes; - inp_volume_sizes[concat_dim] = total_concat_dim_numel(); - - // Reconstruct inp_volume_texel_sizes from Concat.cpp - ivec4 inp_volume_texel_sizes = inp_volume_sizes; - inp_volume_texel_sizes[out_packed_dim] = DIV_UP_4( - inp_volume_texel_sizes[out_packed_dim] - ) + 1; - - // tensor index of the first element that will be read from the input volume - ivec4 inp_volume_start_tidx = nchwi_to_tidx(tid, inp_volume_texel_sizes); - inp_volume_start_tidx[out_packed_dim] = MUL_4( - inp_volume_start_tidx[out_packed_dim] - ); - - int concat_offset = t_concat_offset[0]; - - // tensor index of the first element that will be written to the output tensor - ivec4 out_write_start_tidx = inp_volume_start_tidx; - out_write_start_tidx[concat_dim] += concat_offset; - - // To write to the the desired output element, we will need to load the texel - // to which the element belongs. Calculate the tensor index of the first - // element of that texel. - ivec4 out_read_start_tidx = out_write_start_tidx; - out_read_start_tidx[out_packed_dim] = ALIGN_DOWN_4( - out_write_start_tidx[out_packed_dim]); - - // bounds check - if (any(greaterThanEqual(out_read_start_tidx, out_sizes))) { - return; - } - - ivec3 out_pos = tidx_to_pos( - out_read_start_tidx, - out_sizes, - out_axis_map, - out_packed_dim - ); - - VEC4_T out_texel = imageLoad(t_out, out_pos); - - VEC4_T test_texel = VEC4_T(-1.0); - - for (int comp = 0; comp < 4; ++comp) { - ivec4 out_tidx = out_read_start_tidx; - out_tidx[out_packed_dim] += comp; - - - // It's possible that the current texel element has been written to as part - // of the previous input batch; if so, then don't overwrite this texel - // element - if (out_tidx[concat_dim] < concat_offset) { - test_texel[comp] = -5.0; - continue; - } - - // Calculate the tidx of the input volume that corresponds to this output - // element - ivec4 inp_volume_tidx = out_tidx; - inp_volume_tidx[concat_dim] -= concat_offset; - - // go through the list of input tensors, and figure out which input this - // output element should be read from. - $for i in range(NUM_INPUTS): - if (inp_volume_tidx[concat_dim] < inp${i}_sizes[concat_dim]) { - // Special fast path case if, for the first output texel element, the - // corresponding input element is at the start of the texel it belongs - // to. In this case, the input texel can be written as-is to the output - // texel. Also require that The entire input texel is valid and does not - // contain any padding elements. 
- if (comp == 0 && - out_tidx[out_packed_dim] % 4 == 0 && - inp_volume_tidx[inp${i}_packed_dim] % 4 == 0 && - inp_volume_tidx[inp${i}_packed_dim] + 3 < inp${i}_sizes[inp${i}_packed_dim]) { - const ivec3 in_pos = tidx_to_pos( - inp_volume_tidx, - inp${i}_sizes, - inp${i}_axis_map, - inp${i}_packed_dim); - - out_texel = texelFetch(t_inp${i}, in_pos, 0); - break; - } - - // Otherwise, locate the specific input element required - const ivec4 in_posi = tidx_to_posi( - inp_volume_tidx, - inp${i}_sizes, - inp${i}_axis_map, - inp${i}_packed_dim); - - out_texel[comp] = texelFetch(t_inp${i}, in_posi.xyz, 0)[in_posi.w]; - test_texel[comp] = out_texel[comp]; - continue; - } - else { - inp_volume_tidx[concat_dim] -= inp${i}_sizes[concat_dim]; - } - } - - imageStore(t_out, out_pos, out_texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/concat_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/concat_texture.yaml deleted file mode 100644 index ed5003382a1..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/concat_texture.yaml +++ /dev/null @@ -1,14 +0,0 @@ -concat_texture: - parameter_names_with_default_values: - DTYPE: float - NUM_INPUTS: 2 - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: concat_1_texture3d - NUM_INPUTS: 1 - - NAME: concat_2_texture3d - - NAME: concat_3_texture3d - NUM_INPUTS: 3 diff --git a/backends/vulkan/runtime/graph/ops/glsl/concat_utils.glslh b/backends/vulkan/runtime/graph/ops/glsl/concat_utils.glslh deleted file mode 100644 index 000b86a7fce..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/concat_utils.glslh +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef CONCAT_UTILS_H -#define CONCAT_UTILS_H - - -/********************************** - * Concatenation utililty functions - * - */ - -/* - * Returns the total number of elements along the concatenation dim that will - * be concatenated in this input batch. - */ -$for N in range(1, 4): - #if NUM_INPUTS == ${N} - int total_concat_dim_numel() { - int total = 0; - $for i in range(N): - total += inp${i}_sizes[concat_dim]; - - return total; - } - #endif - -#endif // CONCAT_UTILS_H diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl deleted file mode 100644 index 4e3b91e6c49..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv1d.glsl +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#define op(X, A, B) ${OPERATOR} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "kernel_in", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "bias_in", DTYPE, STORAGE)} - -${layout_declare_ubo(B, "ivec3", "out_limits")} -${layout_declare_ubo(B, "ivec4", "in_sizes")} - -${layout_declare_ubo(B,"int", "kernel_size", "int", "stride", "int", "padding", "int", "dilation", "int", "in_group_size", "int", "out_group_size")} - -${layout_declare_ubo(B, "float", "out_min", "float", "out_max")} - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - -${layout_declare_spec_const(C, "int", "kernel_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 kernel_axis_map = unhash_axis_map(kernel_layout); - -${layout_declare_spec_const(C, "int", "bias_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 bias_axis_map = unhash_axis_map(bias_layout); - -// Let us define -// -// input = (N, in_C, in_L), -// output = (N, out_C, out_L), -// groups = G, -// kernel = K, -// -// which results in shapes -// -// weight = (out_C, in_C / G, K), -// bias = (out_C,). -// -// This implementation performs N x out_C x out_L shader invocations, where each invocation -// calculates the rolling kernel of the length dimension for each batch, i.e., -// computes out_L results. -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(lpos, out_limits))) { - return; - } - - // "out_c" is the output's channel index where we write our result. - // Across shader invocations, this is the only value that varies. - const int out_c = lpos.y; - - // "in_c" tracks the input's channel start index. - // We iterate over the input group that corresponds to the output group. - const int c_start = (out_c / out_group_size) * in_group_size; - const int c_end = c_start + in_group_size; - - // "out_l" tracks the output's length index where we write our result. - const int out_l = lpos.x; - - // "N" is the batch index - const int N = lpos.z; - - // "in_l" tracks the input's length start index for our input-kernel overlay - // region. - const int in_l = out_l * stride - padding; - VEC4_T sum = VEC4_T(0); - - const int out_c_packed_index = out_c >> 2; - const int out_c_packed_lane = out_c & 0x3; - - for (int in_c = c_start; in_c < c_end; ++in_c) { - // "k" tracks the kernel's index for our input-kernel computation. - // It reads out-of-bound zeros, but trying to avoid them complicates - // for-loop conditions, which results in worse performance. - - // The weight tensor is channel-packed. This may not be the obvious choice - // for performance, since it requires more data fetches. The reason is that - // for some sequence models, we found that the weight tensor - // (out_channel, in_channel / group, kernel) often has - // out_channel >> kernel, leading to non-optimal use of memory as the - // weight tensor gets very deep. As a mitigation, we use channel-packing - // for the weight tensor, yielding a 75% reduction in weight-tensor - // memory.
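  // (Editor's note: the illustration below was added for clarity; the concrete
  //  sizes are hypothetical.)
  // With channel packing, each weight texel holds 4 consecutive output
  // channels. For a weight of shape (out_C = 8, in_C / G = 4, K = 3), accesses
  // span x = K = 3, y = in_C / G = 4, z = ceil(out_C / 4) = 2, and output
  // channel out_c = 6 reads the texel at (k, in_c % in_group_size, 6 >> 2 = 1)
  // and selects lane 6 & 3 = 2, matching the w_lpos / out_c_packed_lane
  // indexing below.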
- - // It is possible to further reduce the memory footprint by swapping the - // dimensions, using x extent for out_channel, and y for kernel. - for (int k = 0; k < kernel_size; k++) { - const ivec3 w_lpos = ivec3(k, in_c % in_group_size, out_c_packed_index); - const VEC4_T weight_texel = load_texel_lpos(kernel_in, w_lpos, kernel_axis_map); - VEC4_T weight = VEC4_T(weight_texel[out_c_packed_lane]); - - const ivec3 in_pos = lpos_to_pos(ivec3(in_l + k * dilation, in_c, N), in_axis_map); - sum = fma(weight, load_texel(t_in, in_pos), sum); - } - } - - const VEC4_T bias = load_texel_lpos(bias_in, ivec3(out_c_packed_index, 0, 0), bias_axis_map); - const ivec3 out_lpos = ivec3(out_l, out_c, N); - write_texel_lpos(t_out, out_lpos, op(sum + bias[out_c_packed_lane], out_min, out_max), out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml deleted file mode 100644 index 2266649d2b9..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv1d.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv1d: - parameter_names_with_default_values: - OPERATOR: X - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv1d - - NAME: conv1d_clamp - OPERATOR: clamp(X, A, B) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl deleted file mode 100644 index 0f5dbc41273..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d.glsl +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#define op(X, A, B) ${OPERATOR} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")} -${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")} -${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")} -${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")} -${layout_declare_ubo(4, "ivec3", "out_limits")} -${layout_declare_ubo(5, "ivec4", "in_sizes")} -${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")} -${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")} -${layout_declare_ubo(8, "float", "out_min", "float", "out_max")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "ngroups", "1")} - -/* - * Computes a 2D convolution. Each shader invocation calculates the output at - * a single output location. - */ -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - // Compute the index of the top-left element of the overlay region. Negative - // indices indicate that the top-left element is in a region added by padding. - const ivec2 ipos = pos.xy * stride - padding; - - // Compute the start and end of the input indices to load. 
Padding is assumed - // to be constant 0 padding, so reads from the padding region are skipped. - ivec2 start = ipos; - if (start.x < 0) { - // number of "steps" to get to >= zero is div_up(-start, dilation) - int num_steps = ((-ipos.x) + dilation.x - 1) / dilation.x; - start.x = ipos.x + num_steps * dilation.x; - } - if (start.y < 0) { - // number of "steps" to get to >= zero is div_up(-start, dilation) - int num_steps = ((-ipos.y) + dilation.y - 1) / dilation.y; - start.y = ipos.y + num_steps * dilation.y; - } - const ivec2 end = min(ipos + overlay_region.xy, ivec2(in_sizes.xy)); - // Compute the start of the kernel based on how far we are skipping ahead when - // reading the input. Note that these are "canonical" indices. - ivec2 kstart = (start - ipos) / dilation; - // During prepacking, the weight tensor was rearranged in order to optimize - // for data access linearity in this shader. Therefore we need to adjust the - // canonical coordinates to the corresponding index in the rearranged weight - // tensor. The x-coordinate is multipled by 4 since each group of 4 channels - // is folded into the X axis. The y-coordinate is offset based on the z- - // coordinate because the 2D planes were stacked atop each other vertically. - kstart.x *= 4; - kstart.y += pos.z * kernel_size.y; - - // Perform the convolution by iterating over the overlay region. - VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0); - const int ic4 = in_group_size / 4; - - int z_start = 0; - int z_end = ic4; - if (ngroups > 1) { - const int group_size = (out_limits.z) / ngroups; - const int group_idx = pos.z / group_size; - - z_start = ic4 * group_idx; - z_end = z_start + ic4; - } - - for (int z4 = z_start; z4 < z_end; ++z4, kstart.x += kernel_size.x * 4) { - for (int y = start.y, ky = kstart.y; y < end.y; y += dilation.y, ++ky) { - for (int x = start.x, kx = kstart.x; x < end.x; x += dilation.x, kx += 4) { - const VEC4_T in_texel = texelFetch(t_in, ivec3(x, y, z4), 0); - const ivec4 kxs = kx + ivec4(0, 1, 2, 3); - - // To explain the calculation below, the contents of in_texel and the - // group of 4 texels loaded from t_kernel are shown: - // - // in_texel t_kernel - // -x-> ---x---> - // +---+ +----+----+----+----+ - // ^ | w | ^ | D0 | D1 | D2 | D3 | - // | +---+ | +----+----+----+----+ - // | | z | | | C0 | C1 | C2 | C3 | - // z +---+ z +----+----+----+----+ - // | | y | | | B0 | B1 | B2 | B3 | - // | +---+ | +----+----+----+----+ - // | x | | A0 | A1 | A2 | A3 | - // +---+ +----+----+----+----+ - // - // In the t_kernel graphic, cells sharing the same letter are from - // the same batch/output channel index, and the number denotes a unique - // channel index. To calculate the output texel, the following - // calculation is performed: - // - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | D0 | | y | | D1 | | z | | D2 | | w | | D3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | C0 | | y | | C1 | | z | | C2 | | w | | C3 | - // +---+X+----+ + +---+X+----+ + +---+X+----+ + +---+X+----+ - // | x | | B0 | | y | | B1 | | z | | B2 | | w | | B3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | A0 | | y | | A1 | | z | | A2 | | w | | A3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // - // which is expressed in the following statements. 
- - sum = fma(in_texel.xxxx, texelFetch(t_kernel, ivec2(kxs.x, ky), 0), sum); - sum = fma(in_texel.yyyy, texelFetch(t_kernel, ivec2(kxs.y, ky), 0), sum); - sum = fma(in_texel.zzzz, texelFetch(t_kernel, ivec2(kxs.z, ky), 0), sum); - sum = fma(in_texel.wwww, texelFetch(t_kernel, ivec2(kxs.w, ky), 0), sum); - } - } - } - - imageStore(t_out, pos, op(sum, out_min, out_max)); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d.yaml deleted file mode 100644 index 1a5ed58876c..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv2d: - parameter_names_with_default_values: - OPERATOR: X - NDIM: 3 - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv2d - - NAME: conv2d_clamp - OPERATOR: clamp(X, A, B) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_common.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_common.glslh deleted file mode 100644 index 41825cba867..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_common.glslh +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef CONV2D_COMMON_GLSLH -#define CONV2D_COMMON_GLSLH - -#include "common.glslh" - -struct Conv2DParams { - ivec2 kernel_size; - ivec2 stride; - ivec2 padding; - ivec2 dilation; - int groups; - int out_channels_per_group; - int in_channels_per_group; - int logical_K_per_group; - int K_per_group; - int K4_per_group; - int logical_K; - int K; - int K4; -}; - -#ifdef DEBUG_MODE - -void printConv2DParams(const Conv2DParams params) { - debugPrintfEXT("Conv2DParams: \\n"); - debugPrintfEXT( - " kernel_size: %d, %d\\n", params.kernel_size.x, params.kernel_size.y); - debugPrintfEXT(" stride: %d, %d\\n", params.stride.x, params.stride.y); - debugPrintfEXT(" padding: %d, %d\\n", params.padding.x, params.padding.y); - debugPrintfEXT(" dilation: %d, %d\\n", params.dilation.x, params.dilation.y); - debugPrintfEXT(" groups: %d\\n", params.groups); - debugPrintfEXT( - " out_channels_per_group: %d\\n", params.out_channels_per_group); - debugPrintfEXT( - " in_channels_per_group: %d\\n", params.in_channels_per_group); - debugPrintfEXT(" logical_K_per_group: %d\\n", params.logical_K_per_group); - debugPrintfEXT(" K_per_group: %d\\n", params.K_per_group); - debugPrintfEXT(" K4_per_group: %d\\n", params.K4_per_group); -} - -#endif // DEBUG_MODE - -#endif // CONV2D_COMMON_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl deleted file mode 100644 index 02fbef29b75..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.glsl +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
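The `Conv2DParams` struct above carries both "logical" and texel-aligned im2col sizes, but their relationship is not spelled out in the header itself. The sketch below shows one consistent interpretation, inferred from how the fields are used by the im2col helpers later in this diff (group lookup via `K_per_group`, bounds checks via `logical_K_per_group`); it is an assumption for illustration, not the actual host-side computation:

```
// Hypothetical sketch of how the derived Conv2DParams fields plausibly relate
// to the basic convolution attributes. The real values are filled in on the
// host; this only illustrates one consistent set of definitions.
int align_up_4(const int n) {
  return (n + 3) & ~3;
}

void fill_derived_sizes(inout Conv2DParams p) {
  // im2col columns contributed by one group (true, unpadded count)
  p.logical_K_per_group =
      p.in_channels_per_group * p.kernel_size.x * p.kernel_size.y;
  // padded so that each group starts on a texel (multiple-of-4) boundary
  p.K_per_group  = align_up_4(p.logical_K_per_group);
  p.K4_per_group = p.K_per_group / 4;
  // totals across all groups
  p.logical_K = p.logical_K_per_group * p.groups;
  p.K  = p.K_per_group * p.groups;
  p.K4 = p.K4_per_group * p.groups;
}
```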
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#define op(X, A, B) ${OPERATOR} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")} -${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")} -${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")} -${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")} -${layout_declare_ubo(4, "ivec3", "out_limits")} -${layout_declare_ubo(5, "ivec4", "in_sizes")} -${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")} -${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")} -${layout_declare_ubo(8, "float", "out_min", "float", "out_max")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "ngroups", "1")} - -/* - * Computes a depthwise convolution. Each shader invocation calculates the - * output at a single output location. - */ -void main() { - const uint div_by_x = gl_GlobalInvocationID.x / out_limits.x; - const ivec3 pos = ivec3( - gl_GlobalInvocationID.x % out_limits.x, - div_by_x % out_limits.y, - div_by_x / out_limits.y); - - if (pos.z >= out_limits.z) { - return; - } - - // Compute the index of the top-left element of the overlay region. Negative - // indices indicate that the top-left element is in a region added by padding. - const ivec2 ipos = pos.xy * stride - padding; - - // Compute the start and end of the input indices to load. Padding is assumed - // to be constant 0 padding, so reads from the padding region are skipped. - const ivec2 start = ipos; - const ivec2 end = ipos + overlay_region.xy; - - VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0); - int kx = 0; - for (int y = start.y; y < end.y; y += dilation.y) { - for (int x = start.x; x < end.x; x += dilation.x) { - // The weight kernel was rearranged such that every NxN filter is - // flattened to fit in one row. Each filter was then stacked on top of - // each other vertically. - const VEC4_T in_texel = texelFetch(t_in, ivec3(x, y, pos.z), 0); - sum = fma(in_texel, texelFetch(t_kernel, ivec2(kx, pos.z), 0), sum); - ++kx; - } - } - - imageStore(t_out, pos, op(sum, out_min, out_max)); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.yaml deleted file mode 100644 index 5202cddba76..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv2d_dw: - parameter_names_with_default_values: - OPERATOR: X - NDIM: 3 - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv2d_dw - - NAME: conv2d_dw_clamp - OPERATOR: clamp(X, A, B) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl deleted file mode 100644 index 19250419baf..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
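Two indexing conventions in conv2d_dw.glsl above are worth restating. First, the flat `gl_GlobalInvocationID.x` is unpacked into an (x, y, packed-channel) output position with div/mod against `out_limits`. Second, the depthwise weight was prepacked with every KhxKw filter flattened into a single texture row, so the column index `kx` simply counts kernel taps in row-major (ky, kx) order. Both mappings as small illustrative helpers (same semantics as the shader above, not part of the original file):

```
// Restates the index math used by conv2d_dw.glsl. Illustrative only.
ivec3 unpack_output_pos(const uint flat_id, const ivec3 out_limits) {
  const uint div_by_x = flat_id / uint(out_limits.x);
  return ivec3(
      int(flat_id % uint(out_limits.x)),   // output x
      int(div_by_x % uint(out_limits.y)),  // output y
      int(div_by_x / uint(out_limits.y))); // packed output channel (texel) index
}

// Column of the prepacked depthwise weight texture for kernel tap (kx, ky):
// each KhxKw filter occupies one texture row, flattened row-major.
int dw_kernel_column(const int kx, const int ky, const ivec2 kernel_size) {
  return ky * kernel_size.x + kx;
}
```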
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#define TILE_SIZE ${TILE_SIZE} - -#define BATCH_SIZE_X ${BATCH_SIZE_X} - -#define BATCH_SIZE_Y ${BATCH_SIZE_Y} - -#define LOCAL_WG_SIZE 64 - -#define op(X, A, B) ${OPERATOR} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")} -${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")} -${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")} -${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")} - -layout(push_constant) uniform restrict Block { - ivec4 out_limits; - ivec4 in_sizes; - ivec2 kernel_size; - ivec2 stride; - ivec2 padding; - ivec2 dilation; - ivec2 overlay_region; - int in_group_size; - int dummy_padding; - float out_min; - float out_max; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -/* - * Computes a depthwise convolution. Each shader invocation calculates the - * output at a single output location. - */ - -void main() { - // x and y are divided by batch size to determine 3d position - // since work size is calculated by x * ((y + B_Y - 1) / B_Y) * z - const ivec2 out_limits_xy_scaled = (out_limits.xy + ivec2(BATCH_SIZE_X, BATCH_SIZE_Y) - 1) / ivec2(BATCH_SIZE_X, BATCH_SIZE_Y); - - const uint div_by_x = gl_GlobalInvocationID.x / out_limits_xy_scaled.x; - ivec3 pos = ivec3( - gl_GlobalInvocationID.x % out_limits_xy_scaled.x, - div_by_x, - gl_GlobalInvocationID.y); - - // do not process if top pixel does not fit within the output range - if (pos.y >= out_limits_xy_scaled.y || pos.z >= out_limits.z) { - return; - } - - // scale pos.xy by batch sizes, because that's the top pixel to be processed - pos.x *= BATCH_SIZE_X; - pos.y *= BATCH_SIZE_Y; - - // Compute the index of the top-left element of the overlay region. Negative - // indices indicate that the top-left element is in a region added by padding. - const ivec2 ipos = pos.xy * stride - padding; - - // Compute the start and end of the input indices to load. Padding is assumed - // to be constant 0 padding, so any reads from the padding region is skipped. 
- const ivec2 start = ipos; - - // sum outputs - VEC4_T sum[BATCH_SIZE_Y * BATCH_SIZE_X]; - - for (int i = 0; i < BATCH_SIZE_Y * BATCH_SIZE_X; i++) { - sum[i] = VEC4_T(0); - } - - // array to store input texels - VEC4_T in_texels[TILE_SIZE + BATCH_SIZE_X - 1]; - - // array to store kernel data of previous y - VEC4_T prev_kernel_line[TILE_SIZE]; - - int kx = 0; - for (int y = start.y, i = 0; i < TILE_SIZE + BATCH_SIZE_Y - 1; y += dilation.y, i++) { - for (int x = start.x, j = 0; j < TILE_SIZE + BATCH_SIZE_X - 1; x += dilation.x, j++) { - in_texels[j] = texelFetch(t_in, ivec3(x, y, pos.z), 0); - } - - // from 2nd iteration onwards accumulate dot product in 2nd sum - // based on kernel line data fetched in previous iteration and input texel from this iteration - if (i > 0) { - for (int j = 0; j < TILE_SIZE; j++) { - for (int s = 0; s < BATCH_SIZE_X; s++) { - sum[BATCH_SIZE_X + s] = fma(in_texels[j + s], prev_kernel_line[j], sum[BATCH_SIZE_X + s]); - } - } - } - - // accumulate dot product in 1st sum only until tile size - if (i < TILE_SIZE) { - for (int j = 0; j < TILE_SIZE; j++, kx++) { - prev_kernel_line[j] = texelFetch(t_kernel, ivec2(kx, pos.z), 0); - for (int s = 0; s < BATCH_SIZE_X; s++) { - sum[s] = fma(in_texels[j + s], prev_kernel_line[j], sum[s]); - } - } - } - } - - const VEC4_T bias = texelFetch(t_bias, ivec2(pos.z, 0), 0); - for (int y = 0; y < BATCH_SIZE_Y; y++) { - for (int x = 0; x < BATCH_SIZE_X; x++) { - const ivec3 out_pos = ivec3(pos.x + x, pos.y + y, pos.z); - if (all(lessThan(out_pos.xy, out_limits.xy))) { - imageStore(t_out, out_pos, op(sum[y * BATCH_SIZE_X + x] + bias, out_min, out_max)); - } - } - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml deleted file mode 100644 index 9cf6c22c6ca..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv2d_dw_output_tile: - parameter_names_with_default_values: - OPERATOR: X - NDIM: 3 - DTYPE: float - TILE_SIZE: 3 - BATCH_SIZE_X: 4 - BATCH_SIZE_Y: 2 - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv2d_dw_output_tile_3x3 - - NAME: conv2d_dw_output_tile_3x3_clamp - OPERATOR: clamp(X, A, B) - - NAME: conv2d_dw_output_tile_5x5 - TILE_SIZE: 5 - - NAME: conv2d_dw_output_tile_5x5_clamp - OPERATOR: clamp(X, A, B) - TILE_SIZE: 5 diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl deleted file mode 100644 index f5361d40b66..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
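In the tiled depthwise shader above, each invocation owns a BATCH_SIZE_X x BATCH_SIZE_Y block of output pixels, so the dispatch grid is the output extent divided (rounding up) by the tile shape, and every store is bounds-checked because the last tile can hang past the edge. A compact sketch of that decomposition (illustrative only; `ceil_div` and `tile_origin` are hypothetical helpers that mirror the arithmetic above):

```
ivec2 ceil_div(const ivec2 a, const ivec2 b) {
  return (a + b - 1) / b;
}

// Top-left output pixel covered by tile (tile_x, tile_y).
ivec2 tile_origin(const ivec2 tile_idx, const ivec2 tile_size) {
  return tile_idx * tile_size;
}

// Example: a 7x5 output with 4x2 tiles needs ceil_div(ivec2(7, 5), ivec2(4, 2))
// == ivec2(2, 3) tiles per channel plane; the tile at (1, 2) starts at pixel
// (4, 4), and its out-of-range pixels are skipped by the lessThan() check
// before imageStore, exactly as in the shader above.
```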
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define BUF_T ${buffer_scalar_type(DTYPE)} -#define VEC4_T ${texel_type(DTYPE)} -#define SCALAR_T ${texel_component_type(DTYPE)} - -#include "indexing_utils.h" - -$if DTYPE == "half": - #extension GL_EXT_shader_16bit_storage : require - -layout(std430) buffer; - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[2][DTYPE]} image_out; -layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer { - BUF_T buffer_in[]; -}; - -layout(push_constant) uniform PRECISION restrict Block { - ivec4 sizes; - ivec4 original_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -/* - * Computes special prepacking for a depthwise convolution. Each shader invocation - * calculates the input buffer location to read into the desired texel. This - * packing was originally developed on CPU here: - * https://github.com/pytorch/pytorch/blob/d63e7d0aa2e0a1b1fd7518f917224774afe97bae/aten/src/ATen/native/vulkan/ops/Convolution.cpp#L58-L118 - */ -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim); - - if (any(greaterThanEqual(idx, sizes))) { - return; - } - - // Map tensor_idx to normal buffer_i - const ivec4 p0 = tidx_to_nchwi(idx, sizes, packed_dim); - - // Compute modified tensor_idx by inverting the CPU function - const int N = original_sizes.w; - const int C = original_sizes.z; - const int H = original_sizes.y; - const int W = original_sizes.x; - const int Y = sizes.y; - - const ivec4 p1 = p0 / W; - const ivec4 p2 = p1 / H; - - const ivec4 n = (p2 % Y) * 4 + (p2 / Y); - const ivec4 h = p1 % H; - const ivec4 w = p0 % W; - - // Map modified tensor_idx to modifed buffer_i - // Zero out if modified tensor idx is out of bounds - const ivec4 buf_i = n * C*H*W + h * W + w; - const bvec4 mask = bvec4(lessThan(n, ivec4(N))); - - VEC4_T texel = VEC4_T(0); - if (mask.x) { - texel.x = SCALAR_T(buffer_in[buf_i.x]); - } - if (mask.y) { - texel.y = SCALAR_T(buffer_in[buf_i.y]); - } - if (mask.z) { - texel.z = SCALAR_T(buffer_in[buf_i.z]); - } - if (mask.w) { - texel.w = SCALAR_T(buffer_in[buf_i.w]); - } - - imageStore(image_out, pos.xy, texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.yaml deleted file mode 100644 index 33342145a82..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv2d_dw_prepack_weights: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv2d_dw_prepack_weights diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl deleted file mode 100644 index f161c1ba460..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
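The prepacking shader above (and the regular conv2d prepacking shader later in this diff) builds every output texel with the same pattern: compute four candidate buffer indices, mask out the lanes whose recovered tensor index falls outside the original (unpadded) sizes, and gather element by element. That pattern, pulled out as a generic sketch (hypothetical helper; assumes the same `buffer_in`, `VEC4_T`, and `SCALAR_T` declarations as the shaders above):

```
// Generic masked gather used by the weight-prepacking shaders: read four
// scalars from the staging buffer, substituting zero for lanes that map
// outside the original tensor extents. Illustrative only.
VEC4_T masked_gather(const ivec4 buf_i, const bvec4 mask) {
  VEC4_T texel = VEC4_T(0);
  if (mask.x) texel.x = SCALAR_T(buffer_in[buf_i.x]);
  if (mask.y) texel.y = SCALAR_T(buffer_in[buf_i.y]);
  if (mask.z) texel.z = SCALAR_T(buffer_in[buf_i.z]);
  if (mask.w) texel.w = SCALAR_T(buffer_in[buf_i.w]);
  return texel;
}
```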
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#define TILE_SIZE ${TILE_SIZE} - -#define op(X, A, B) ${OPERATOR} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")} -${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")} -${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")} -${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")} - -layout(push_constant) uniform restrict Block { - ivec4 out_limits; - ivec4 in_sizes; - ivec2 kernel_size; - ivec2 stride; - ivec2 padding; - ivec2 dilation; - ivec2 overlay_region; - int in_group_size; - int dummy_padding; - float out_min; - float out_max; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -/* - * Computes a depthwise convolution. Each shader invocation calculates the - * output at a single output location. - */ - -void main() { - const uint div_by_x = gl_GlobalInvocationID.x / out_limits.x; - const ivec3 pos = ivec3( - gl_GlobalInvocationID.x % out_limits.x, - div_by_x, - gl_GlobalInvocationID.y); - - // do not process if top pixel does not fit within the output range - if (pos.y >= out_limits.y || pos.z >= out_limits.z) { - return; - } - - // Compute the index of the top-left element of the overlay region. Negative - // indices indicate that the top-left element is in a region added by padding. - const ivec2 ipos = pos.xy * stride - padding; - - // Compute the start and end of the input indices to load. Padding is assumed - // to be constant 0 padding, so any reads from the padding region is skipped. - const ivec2 start = ipos; - - VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0); - int kx = 0; - for (int y = start.y, i = 0; i < TILE_SIZE; y += dilation.y, i++) { - for (int x = start.x, j = 0; j < TILE_SIZE; x += dilation.x, j++) { - // The weight kernel was rearranged such that every NxN filter is - // flattened to fit in one row. Each filter was then stacked on top of - // each other vertically. - const vec4 in_texel = texelFetch(t_in, ivec3(x, y, pos.z), 0); - sum = fma(in_texel, texelFetch(t_kernel, ivec2(kx, pos.z), 0), sum); - kx++; - } - } - - imageStore(t_out, pos, op(sum, out_min, out_max)); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.yaml deleted file mode 100644 index f2ece8fa0f9..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -conv2d_dw_sned_output_tile: - parameter_names_with_default_values: - OPERATOR: X - NDIM: 3 - DTYPE: float - TILE_SIZE: 3 - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv2d_dw_sned_output_tile_3x3 - - NAME: conv2d_dw_sned_output_tile_3x3_clamp - OPERATOR: clamp(X, A, B) - - NAME: conv2d_dw_sned_output_tile_5x5 - TILE_SIZE: 5 - - NAME: conv2d_dw_sned_output_tile_5x5_clamp - OPERATOR: clamp(X, A, B) - TILE_SIZE: 5 diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_im2col_block.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_im2col_block.glslh deleted file mode 100644 index 7add8c4cd16..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_im2col_block.glslh +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef CONV2D_FP_IM2COL_BLOCK -#define CONV2D_FP_IM2COL_BLOCK - -/* - * Defines utilities to convert between (col, row) indices of an im2col matrix - * and 4-dimension tensor indices of image tensors. - * - * Requires: - * - output_sizes to be defined in the shader layout, corresponding to the sizes - * of the output image of the convolution op. - * - image_sizes to be defined in the shader layout, corresponding to the sizes - * of the input image of the convolution op. - * - conv2d_params to be defined in the shader layout - */ - -#extension GL_EXT_control_flow_attributes : require - -#include "common.glslh" -#include "conv2d_common.glslh" - -struct Im2ColMatrixIdx { - int row; - int col; - // Relevant for grouped convolution. This indicates the column index relative - // to the first column in the group. - int col_idx_in_group; - int group_idx; -}; - -void unwrap_m(out TensorIndex4D out_tidx_base, const int m) { - out_tidx_base.data[3] = m / (output_sizes.y * output_sizes.x); - out_tidx_base.data[1] = (m / output_sizes.x) % output_sizes.y; - out_tidx_base.data[0] = m % output_sizes.x; - - // Initialize channels to 0; assume it will be set later on - out_tidx_base.data[2] = 0; -} - -void im2col_tidx_to_output_tidx( - out TensorIndex4D output_tidx, - const Im2ColMatrixIdx im2col_tidx) { - unwrap_m(output_tidx, im2col_tidx.row); - // Set channels - output_tidx.data.z = im2col_tidx.col; -} - -/* - * Converts im2col matrix position to corresponding 4D tensor index, accounting - * for grouped convolutions. The conversion should ensure that all data within - * the same group occupy a contiguous block in memory. 
- */ -void im2col_idx_to_input_tidx( - out TensorIndex4D input_tidx, - const Im2ColMatrixIdx im2col_idx) { - TensorIndex4D output_tidx; - unwrap_m(output_tidx, im2col_idx.row); - - const int in_channels_per_group = conv2d_params.in_channels_per_group; - // Determine the corresponding position within the convolution window based - // on the col index (more specifically, the col index within the group) - const int channel_within_group = - im2col_idx.col_idx_in_group % in_channels_per_group; - const int kernel_x = (im2col_idx.col_idx_in_group / in_channels_per_group) % - conv2d_params.kernel_size.x; - const int kernel_y = im2col_idx.col_idx_in_group / - (in_channels_per_group * conv2d_params.kernel_size.x); - - // Calculate the actual input channel index - const int channel_idx = - im2col_idx.group_idx * conv2d_params.in_channels_per_group + - channel_within_group; - - // Calculate corresponding input coordinates based on output position - // associated with the row index. - const int input_y = int(output_tidx.data.y * conv2d_params.stride.y) - - int(conv2d_params.padding.y) + int(kernel_y * conv2d_params.dilation.y); - const int input_x = int(output_tidx.data.x * conv2d_params.stride.x) - - int(conv2d_params.padding.x) + int(kernel_x * conv2d_params.dilation.x); - - input_tidx.data = ivec4(input_x, input_y, channel_idx, output_tidx.data.w); -} - -// 4x4 block of the im2col matrix -struct FPIm2ColBlock { - VEC4_T data[4]; -}; - -#endif // CONV2D_FP_IM2COL_BLOCK diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_im2col_block_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_im2col_block_load.glslh deleted file mode 100644 index c02b070e17e..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_im2col_block_load.glslh +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef CONV2D_FP_IM2COL_BLOCK_LOAD -#define CONV2D_FP_IM2COL_BLOCK_LOAD - -/* - * Defines utilities to load data for a 4x4 im2col matrix block from an - * input image and store the data as a FPInputTile. - * - * Requires: - * - t_input to be defined in the shader layout, representing the texture of the - * source image - * - conv2d_params to be defined in the shader layout - */ - -#extension GL_EXT_control_flow_attributes : require - -#extension GL_EXT_debug_printf : require - -#include "common.glslh" -#include "conv2d_common.glslh" -#include "conv2d_fp_im2col_block.glslh" -#include "linear_fp_input_tile.glslh" - -VEC4_T load_input_texel(const TensorIndex4D tidx) { - // Assumes batch size is 1 and channels packing - return texelFetch( - t_input, ivec3(tidx.data.x, tidx.data.y, div_4(tidx.data.z)), 0); -} - -T load_input_texel_element(const TensorIndex4D tidx) { - const int channels_texel_idx = div_4(tidx.data.z); - const int texel_comp = mod_4(tidx.data.z); - // Assumes batch size is 1 and channels packing - return texelFetch( - t_input, - ivec3(tidx.data.x, tidx.data.y, channels_texel_idx), - 0)[texel_comp]; -} - -// k4 -> group of 4 input channels idx -// m -> flattened batch, output width, output height dim idx -/* - * Fast impl for when the input image's channels per group is a multiple of 4. - * In this case, it is guaranteed that a texel loaded from the input can be - * stored directly to the output without any additional filtering. 
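The helpers above fix the im2col coordinate system: row m enumerates (batch, out_y, out_x) with out_x varying fastest, so M = out_W * out_H * batches, and the within-group column index enumerates (kernel_y, kernel_x, in_channel within group) with the channel varying fastest. The sketch below restates that decomposition and the resulting input coordinate for a single (row, column-within-group, group) triple; it is a free-standing illustrative function that assumes the same `output_sizes` and `conv2d_params` declarations as this header:

```
// Restates the im2col coordinate math from the helpers above. Sketch only.
ivec4 im2col_to_input_coord(
    const int row,          // m: enumerates (batch, out_y, out_x), x fastest
    const int col_in_group, // enumerates (kernel_y, kernel_x, channel), channel fastest
    const int group_idx) {
  // Decompose the row into an output position
  const int out_x = row % output_sizes.x;
  const int out_y = (row / output_sizes.x) % output_sizes.y;
  const int batch = row / (output_sizes.x * output_sizes.y);

  // Decompose the column into a kernel tap and an input channel
  const int icpg     = conv2d_params.in_channels_per_group;
  const int channel  = group_idx * icpg + (col_in_group % icpg);
  const int kernel_x = (col_in_group / icpg) % conv2d_params.kernel_size.x;
  const int kernel_y = col_in_group / (icpg * conv2d_params.kernel_size.x);

  // Standard convolution input coordinate for that output position and tap
  const int in_x = out_x * conv2d_params.stride.x - conv2d_params.padding.x
      + kernel_x * conv2d_params.dilation.x;
  const int in_y = out_y * conv2d_params.stride.y - conv2d_params.padding.y
      + kernel_y * conv2d_params.dilation.y;

  // (W, H, C, N) order, matching TensorIndex4D in the helpers above
  return ivec4(in_x, in_y, channel, batch);
}
```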
- */ -void load_im2col_block_fast( - out FPIm2ColBlock block, - const int k4, - const int m4, - const int logical_K, - const int M) { - Im2ColMatrixIdx im2col_idx; - im2col_idx.col = mul_4(k4); // k - im2col_idx.row = mul_4(m4); // m - - // Due to the assumption that in_channels_per_group % 4 == 0, it is - // guaranteed that the next 4 columns (including this one) is part of the - // same group. - im2col_idx.group_idx = im2col_idx.col / conv2d_params.K_per_group; - im2col_idx.col_idx_in_group = im2col_idx.col % conv2d_params.K_per_group; - - [[unroll]] for (int m_off = 0; m_off < 4; ++m_off) { - if (im2col_idx.row >= M) { - block.data[m_off] = VEC4_T(0); - continue; - } - - TensorIndex4D input_tidx; - im2col_idx_to_input_tidx(input_tidx, im2col_idx); - - // Load the texel - block.data[m_off] = load_input_texel(input_tidx); - - im2col_idx.row++; - } -} - -/* - * If input image channels is not a multiple of 4, then it is likely that for - * some matrix texels, the source data is split between different texels of the - * source image. In this case it's better to retreive each element individually. - */ -void load_im2col_block_slow( - out FPIm2ColBlock block, - const int k4, - const int m4, - const int logical_K, - const int M) { - Im2ColMatrixIdx im2col_idx_base; - im2col_idx_base.col = mul_4(k4); - im2col_idx_base.row = mul_4(m4); - - im2col_idx_base.group_idx = im2col_idx_base.col / conv2d_params.K_per_group; - im2col_idx_base.col_idx_in_group = - im2col_idx_base.col % conv2d_params.K_per_group; - - [[unroll]] for (int m_off = 0; m_off < 4; ++m_off) { - [[unroll]] for (int k_off = 0; k_off < 4; ++k_off) { - Im2ColMatrixIdx im2col_idx = im2col_idx_base; - im2col_idx.row += m_off; - im2col_idx.col_idx_in_group += k_off; - - // bounds checking - if (im2col_idx.col_idx_in_group >= conv2d_params.logical_K_per_group || - im2col_idx.row >= M) { - block.data[m_off][k_off] = T(0); - continue; - } - - TensorIndex4D input_tidx; - im2col_idx_to_input_tidx(input_tidx, im2col_idx); - - block.data[m_off][k_off] = load_input_texel_element(input_tidx); - } - } -} - -void load_im2col_block( - out FPIm2ColBlock block, - const int k4, - const int m4, - const int logical_K, - const int M) { - if (mod_4(conv2d_params.in_channels_per_group) == 0) { - load_im2col_block_fast(block, k4, m4, logical_K, M); - } else { - load_im2col_block_slow(block, k4, m4, logical_K, M); - } -} - -void load_input_im2col_tile( - out FPInputTile tile, - const int k4_start, - const int m4_start, - const int logical_K, - const int M) { - FPIm2ColBlock block; -#if TILE_K4 == 1 - [[unroll]] for (int m4 = 0; m4 < TILE_M4; ++m4) { - load_im2col_block(block, k4_start, m4_start + m4, logical_K, M); - for (int row = 0; row < 4; ++row) { - const int m = mul_4(m4) + row; - tile.data[m][0] = block.data[row]; - } - } - -#else - [[unroll]] for (int m4 = 0; m4 < TILE_M4; ++m4) { - [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) { - load_im2col_block(block, k4_start + k4, m4_start + m4, logical_K, M); - for (int row = 0; row < 4; ++row) { - const int m = mul_4(m4) + row; - tile.data[m][k4] = block.data[row]; - } - } - } - -#endif -} - -#endif // CONV2D_FP_IM2COL_BLOCK_LOAD diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_im2col_block_store.glslh b/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_im2col_block_store.glslh deleted file mode 100644 index 2171d75c628..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_fp_im2col_block_store.glslh +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. 
and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef CONV2D_FP_IM2COL_BLOCK_STORE -#define CONV2D_FP_IM2COL_BLOCK_STORE - -/* - * Defines utilities to store data for a 4x4 im2col output matrix block computed - * from matrix multiplication to an output image. - * - * Requires: - * - t_output to be defined in the shader layout, representing the texture of - * the output image - */ - -#extension GL_EXT_control_flow_attributes : require - -#include "common.glslh" -#include "conv2d_common.glslh" -#include "conv2d_fp_im2col_block.glslh" -#include "linear_fp_output_tile.glslh" - -// TODO: implement buffer support -void write_output_texel(const VEC4_T out_texel, const TensorIndex4D tidx) { - // Assume batch size is 1 - imageStore( - t_output, ivec3(tidx.data.x, tidx.data.y, div_4(tidx.data.z)), out_texel); -} - -void write_im2col_tile_as_image( - const FPOutTile tile, - const int n4_start, - const int m_start) { - Im2ColMatrixIdx im2col_tidx; - im2col_tidx.col = mul_4(n4_start); - im2col_tidx.row = m_start; -#if TILE_K4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - TensorIndex4D output_tidx; - im2col_tidx_to_output_tidx(output_tidx, im2col_tidx); - - if (any(greaterThanEqual(output_tidx.data, output_sizes))) { - continue; - } - write_output_texel(tile.data[m][0], output_tidx); - im2col_tidx.row++; - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) { - TensorIndex4D output_tidx; - im2col_tidx_to_output_tidx(output_tidx, im2col_tidx); - - write_output_texel(tile.data[m][k4], output_tidx); - im2col_tidx.row++; - } - } - -#endif -} - -#endif // CONV2D_FP_IM2COL_BLOCK_STORE diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl deleted file mode 100644 index d2f3f615f74..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define BUF_T ${buffer_scalar_type(DTYPE)} -#define VEC4_T ${texel_type(DTYPE)} -#define SCALAR_T ${texel_component_type(DTYPE)} - -#include "indexing_utils.h" - -$if DTYPE == "half": - #extension GL_EXT_shader_16bit_storage : require - -layout(std430) buffer; - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[2][DTYPE]} image_out; -layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer { - BUF_T buffer_in[]; -}; - -layout(push_constant) uniform PRECISION restrict Block { - ivec4 sizes; - ivec4 original_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -/* - * Computes special prepacking for a 2D convolution. Each shader invocation - * calculates the input buffer locations to read into the desired texel. 
This - * packing was originally developed on CPU here: - * https://github.com/pytorch/pytorch/blob/d63e7d0aa2e0a1b1fd7518f917224774afe97bae/aten/src/ATen/native/vulkan/ops/Convolution.cpp#L120-L211 - */ -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim); - - if (any(greaterThanEqual(idx, sizes))) { - return; - } - - // Map tensor_idx to normal buffer_i - const ivec4 p0 = tidx_to_nchwi(idx, sizes, packed_dim); - - // Compute modified tensor_idx by inverting the CPU function - const int N = original_sizes.w; - const int C = original_sizes.z; - const int H = original_sizes.y; - const int W = original_sizes.x; - const int J = sizes.x / (4*W); - const int K = sizes.y / H; - - const ivec4 p1 = p0 / 4; - const ivec4 p2 = p1 / W; - const ivec4 p3 = p2 / J; - const ivec4 p4 = p3 / H; - - const ivec4 n = (p4 % K) * 4 + (p4 / K); - const ivec4 c = (p2 % J) * 4 + (p0 % 4); - const ivec4 h = p3 % H; - const ivec4 w = p1 % W; - - // Map modified tensor_idx to modified buffer_i - // Zero out if modified tensor idx is out of bounds - const ivec4 buf_i = n * C*H*W + c * H*W + h * W + w; - const bvec4 mask = bvec4(ivec4(lessThan(n, ivec4(N))) & ivec4(lessThan(c, ivec4(C)))); - - VEC4_T texel = VEC4_T(0); - if (mask.x) { - texel.x = SCALAR_T(buffer_in[buf_i.x]); - } - if (mask.y) { - texel.y = SCALAR_T(buffer_in[buf_i.y]); - } - if (mask.z) { - texel.z = SCALAR_T(buffer_in[buf_i.z]); - } - if (mask.w) { - texel.w = SCALAR_T(buffer_in[buf_i.w]); - } - - imageStore(image_out, pos.xy, texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.yaml deleted file mode 100644 index 28cf63dc163..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv2d_prepack_weights: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv2d_prepack_weights diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl deleted file mode 100644 index 4c6031152ee..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#define TILE_SIZE_X ${TILE_SIZE_X} -#define TILE_SIZE_Y ${TILE_SIZE_Y} - -#define op(X, A, B) ${OPERATOR} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")} -${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")} -${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")} -${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")} - -layout(push_constant) uniform restrict Block { - ivec4 out_limits; - ivec2 stride; - ivec2 padding; - int in_group_size; - int dummy_padding; - float out_min; - float out_max; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "ngroups", "1")} - -#extension GL_EXT_control_flow_attributes : require - -/* - * Computes a 2D pointwise convolution of an NxN output tile. Calculating an - * output tile for pointwise convolution is more efficient because the kernel - * size is only 1x1, making it easier to re-use loaded texels from t_kernel. - */ -void main() { - const int out_limits_scaled[2] = - {(out_limits.x + (TILE_SIZE_X - 1)) / TILE_SIZE_X, - (out_limits.y + (TILE_SIZE_Y - 1)) / TILE_SIZE_Y}; - - const int div_by_x = int(gl_GlobalInvocationID.x / out_limits_scaled[0]); - const int out_pos[3] = {int(gl_GlobalInvocationID.x % out_limits_scaled[0]), div_by_x, int(gl_GlobalInvocationID.y)}; - - // If the top left position is out of bounds, then this invocation will have - // no work to do. - if (out_pos[1] >= out_limits_scaled[1] || out_pos[2] >= out_limits.z) { - return; - } - - // Output position for TILE_SIZE = 2 - // +--------+--------+ - // | pos[0] | pos[1] | - // +--------+--------+ - // | pos[2] | pos[3] | - // +--------+--------+ - int pos[TILE_SIZE_X * TILE_SIZE_Y * 2]; - for (int y = 0, i = 0; y < TILE_SIZE_Y; ++y) { - for (int x = 0; x < TILE_SIZE_X; ++x) { - pos[i * 2] = out_pos[0] * TILE_SIZE_X + x; - pos[i * 2 + 1] = out_pos[1] * TILE_SIZE_Y + y; - i++; - } - } - - // Compute the index of the input texture that needs to be loaded for each - // output position. Note that negative indices can be produced indicating that - // the top-left element is in a region added by padding. - int ipos[TILE_SIZE_X * TILE_SIZE_Y * 2]; - for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - ipos[i * 2] = pos[i * 2] * stride.x - padding.x; - ipos[i * 2 + 1] = pos[i * 2 + 1] * stride.y - padding.y; - } - - // Final output array where each element is a tensor value. - // Tuple of consecutive 4 elements represents a single output texel. - float sum[TILE_SIZE_X * TILE_SIZE_Y * 4]; - - const vec4 bias = texelFetch(t_bias, ivec2(out_pos[2], 0), 0); - - // Initialize the output array with the bias value - for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y * 4; i += 4) { - sum[i] = bias.x; - sum[i + 1] = bias.y; - sum[i + 2] = bias.z; - sum[i + 3] = bias.w; - } - - int z4 = 0; - // Since the kernel is 1x1, we only have to loop over the depth dimension. - for (int z = 0; z < in_group_size; z += 4, ++z4) { - // During prepacking, the weight tensor has been permuted so that the - // channel (IC) dim is along the x-axis, and the batch (OC) dim is along - // the z-axis. 
- float kernel_values[4 * 4]; // 4 channels, 4 elements per channel - - // Load kernel values from texels to array - [[unroll]] for (int i = 0; i < 4; ++i) { - const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, out_pos[2]), 0); - kernel_values[i * 4 + 0] = k_tex.x; - kernel_values[i * 4 + 1] = k_tex.y; - kernel_values[i * 4 + 2] = k_tex.z; - kernel_values[i * 4 + 3] = k_tex.w; - } - - for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - const vec4 in_tex = texelFetch(t_in, ivec3(ipos[i * 2], ipos[i * 2 + 1], z4), 0); - // Load the input texel into an array - float tex_values[4]; - tex_values[0] = in_tex.x; - tex_values[1] = in_tex.y; - tex_values[2] = in_tex.z; - tex_values[3] = in_tex.w; - - // For 2x2 tile size algorithm works as follows. - // To explain the calculations below, the contents of one in_tex and the - // group of 4 texels loaded from t_kernel are shown: - // - // in_tex t_kernel - // -x-> ---x---> - // +---+ +----+----+----+----+ - // ^ | w | ^ | D0 | D1 | D2 | D3 | - // | +---+ | +----+----+----+----+ - // | | z | | | C0 | C1 | C2 | C3 | - // z +---+ z +----+----+----+----+ - // | | y | | | B0 | B2 | B2 | B3 | - // | +---+ | +----+----+----+----+ - // | x | | A0 | A1 | A2 | A3 | - // +---+ +----+----+----+----+ - // - // In the t_kernel graphic, cells sharing the same letter are from - // the same batch/output channel index, and the number denotes a unique - // channel index. To calculate the output texel, the following - // calculation is performed: - // - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | D0 | | y | | D1 | | z | | D2 | | w | | D3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | C0 | | y | | C1 | | z | | C2 | | w | | C3 | - // +---+X+----+ + +---+X+----+ + +---+X+----+ + +---+X+----+ - // | x | | B0 | | y | | B1 | | z | | B2 | | w | | B3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | A0 | | y | | A1 | | z | | A2 | | w | | A3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // - // which is what is expressed in the following calculations. This is done - // for each output position. - for (int j = 0; j < 4; ++j) { - sum[i * 4 + j] = tex_values[0] * kernel_values[0 + j] + sum[i * 4 + j]; - sum[i * 4 + j] = tex_values[1] * kernel_values[4 + j] + sum[i * 4 + j]; - sum[i * 4 + j] = tex_values[2] * kernel_values[8 + j] + sum[i * 4 + j]; - sum[i * 4 + j] = tex_values[3] * kernel_values[12 + j] + sum[i * 4 + j]; - } - } - } - - for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - const ivec3 pos_l = ivec3(pos[i * 2], pos[i * 2 + 1], out_pos[2]); - if (all(lessThan(pos_l, out_limits.xyz))) { - imageStore(t_out, pos_l, op(vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]), out_min, out_max)); - } - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml deleted file mode 100644 index d4cb69d7648..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
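The long diagrammed comment in conv2d_pw.glsl above describes, for each group of four input channels, a 4x4 multiply-accumulate: each component of the input texel is multiplied against the matching kernel texel and added into the four output channels of one pixel. Stripped of the tiling and array bookkeeping, the per-pixel update is just the following (illustrative only; assumes the same `t_kernel` layout as the shader, with `z` the first of the four input channels and `oc4` the output-channel texel index):

```
// One 4-input-channel step of the pointwise convolution for a single output
// pixel: a 4x4 kernel block times a 4-vector input texel. Sketch only.
vec4 pw_accumulate_step(vec4 acc, const vec4 in_texel, const int z, const int oc4) {
  acc = fma(in_texel.xxxx, texelFetch(t_kernel, ivec2(z + 0, oc4), 0), acc);
  acc = fma(in_texel.yyyy, texelFetch(t_kernel, ivec2(z + 1, oc4), 0), acc);
  acc = fma(in_texel.zzzz, texelFetch(t_kernel, ivec2(z + 2, oc4), 0), acc);
  acc = fma(in_texel.wwww, texelFetch(t_kernel, ivec2(z + 3, oc4), 0), acc);
  return acc;
}
```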
- -conv2d_pw: - parameter_names_with_default_values: - OPERATOR: X - NDIM: 3 - DTYPE: float - TILE_SIZE_X: 1 - TILE_SIZE_Y: 4 - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv2d_pw - - NAME: conv2d_pw_clamp - OPERATOR: clamp(X, A, B) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl deleted file mode 100644 index 9f84afeb1a1..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.glsl +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#define TILE_SIZE_X uint16_t(${TILE_SIZE_X}) -#define TILE_SIZE_Y uint16_t(${TILE_SIZE_Y}) - -#define op(X, A, B) ${OPERATOR} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")} -${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")} -${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")} -${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")} - -layout(push_constant) uniform restrict Block { - ivec4 out_limits; - ivec2 stride; - ivec2 padding; - int in_group_size; - int dummy_padding; - float out_min; - float out_max; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "ngroups", "1")} - -#extension GL_EXT_control_flow_attributes : require - -/* - * Computes a 2D pointwise convolution of an NxN output tile. Calculating an - * output tile for pointwise convolution is more efficient because the kernel - * size is only 1x1, making it easier to re-use loaded texels from t_kernel. - */ -void main() { - const int out_limits_scaled[2] = - {(out_limits.x + (TILE_SIZE_X - 1)) / TILE_SIZE_X, - (out_limits.y + (TILE_SIZE_Y - 1)) / TILE_SIZE_Y}; - - const uint16_t div_by_x = uint16_t(gl_GlobalInvocationID.x / out_limits_scaled[0]); - const uint16_t out_pos_xy[2] = {uint16_t(gl_GlobalInvocationID.x % out_limits_scaled[0]), div_by_x}; - const int out_pos_z = int(gl_GlobalInvocationID.y); - - // If the top left position is out of bounds, then this invocation will have - // no work to do. - if (out_pos_xy[1] >= out_limits_scaled[1] || out_pos_z >= out_limits.z) { - return; - } - - // Output position for TILE_SIZE = 2 - // +--------+--------+ - // | pos[0] | pos[1] | - // +--------+--------+ - // | pos[2] | pos[3] | - // +--------+--------+ - uint16_t pos[TILE_SIZE_X * TILE_SIZE_Y * 2]; - for (uint16_t y = uint16_t(0), i = uint16_t(0); y < TILE_SIZE_Y; ++y) { - for (uint16_t x = uint16_t(0); x < TILE_SIZE_X; ++x) { - pos[i * 2] = out_pos_xy[0] * TILE_SIZE_X + x; - pos[i * 2 + 1] = out_pos_xy[1] * TILE_SIZE_Y + y; - i++; - } - } - - // Final output array where each element is a tensor value. - // Tuple of consecutive 4 elements represents a single output texel. - float sum[TILE_SIZE_X * TILE_SIZE_Y * 4]; - - // Initialize the output array with the bias value - for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y * 4; i++) { - sum[i] = 0; - } - - int z4 = 0; - // Since the kernel is 1x1, we only have to loop over the depth dimension. 
- for (int z = 0; z < in_group_size; z += 4, ++z4) { - // During prepacking, the weight tensor has been permuted so that the - // channel (IC) dim is along the x-axis, and the batch (OC) dim is along - // the z-axis. - float kernel_values[4 * 4]; // 4 channels, 4 elements per channel - - // Load kernel values from texels to array - [[unroll]] for (int i = 0; i < 4; ++i) { - const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, out_pos_z), 0); - kernel_values[i * 4 + 0] = k_tex.x; - kernel_values[i * 4 + 1] = k_tex.y; - kernel_values[i * 4 + 2] = k_tex.z; - kernel_values[i * 4 + 3] = k_tex.w; - } - - for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - const vec4 in_tex = texelFetch(t_in, ivec3(pos[i * 2], pos[i * 2 + 1], z4), 0); - // Load the input texel into an array - float tex_values[4]; - tex_values[0] = in_tex.x; - tex_values[1] = in_tex.y; - tex_values[2] = in_tex.z; - tex_values[3] = in_tex.w; - - // For 2x2 tile size algorithm works as follows. - // To explain the calculations below, the contents of one in_tex and the - // group of 4 texels loaded from t_kernel are shown: - // - // in_tex t_kernel - // -x-> ---x---> - // +---+ +----+----+----+----+ - // ^ | w | ^ | D0 | D1 | D2 | D3 | - // | +---+ | +----+----+----+----+ - // | | z | | | C0 | C1 | C2 | C3 | - // z +---+ z +----+----+----+----+ - // | | y | | | B0 | B2 | B2 | B3 | - // | +---+ | +----+----+----+----+ - // | x | | A0 | A1 | A2 | A3 | - // +---+ +----+----+----+----+ - // - // In the t_kernel graphic, cells sharing the same letter are from - // the same batch/output channel index, and the number denotes a unique - // channel index. To calculate the output texel, the following - // calculation is performed: - // - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | D0 | | y | | D1 | | z | | D2 | | w | | D3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | C0 | | y | | C1 | | z | | C2 | | w | | C3 | - // +---+X+----+ + +---+X+----+ + +---+X+----+ + +---+X+----+ - // | x | | B0 | | y | | B1 | | z | | B2 | | w | | B3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // | x | | A0 | | y | | A1 | | z | | A2 | | w | | A3 | - // +---+ +----+ +---+ +----+ +---+ +----+ +---+ +----+ - // - // which is what is expressed in the following calculations. This is done - // for each output position. - for (int j = 0; j < 4; ++j) { - sum[i * 4 + j] = tex_values[0] * kernel_values[0 + j] + sum[i * 4 + j]; - sum[i * 4 + j] = tex_values[1] * kernel_values[4 + j] + sum[i * 4 + j]; - sum[i * 4 + j] = tex_values[2] * kernel_values[8 + j] + sum[i * 4 + j]; - sum[i * 4 + j] = tex_values[3] * kernel_values[12 + j] + sum[i * 4 + j]; - } - } - } - - const vec4 bias = texelFetch(t_bias, ivec2(out_pos_z, 0), 0); - - for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) { - const ivec3 pos_l = ivec3(pos[i * 2], pos[i * 2 + 1], out_pos_z); - if (all(lessThan(pos_l.xy, out_limits.xy))) { - const vec4 out_sum = vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]); - imageStore(t_out, pos_l, op(out_sum + bias, out_min, out_max)); - } - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml deleted file mode 100644 index ebfee11c405..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_s1p0.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. 
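The practical difference between this s1p0 variant and the general conv2d_pw.glsl earlier in the diff is that stride = 1 and padding = 0 are baked in: the input position then equals the output position, so the per-tile `ipos` array disappears and `t_in` is fetched directly at `pos` (with the bias folded in after the accumulation loop instead of before it). The specialization, spelled out as a tiny illustrative helper:

```
// With stride == (1, 1) and padding == (0, 0), the general input-position
// formula used by conv2d_pw.glsl collapses to the identity, which is why the
// s1p0 variant indexes t_in directly at the output pixel. Sketch only.
ivec2 pw_input_pos(const ivec2 out_pos, const ivec2 stride, const ivec2 padding) {
  return out_pos * stride - padding; // == out_pos when stride == 1, padding == 0
}
```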
-# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv2d_pw_s1p0: - parameter_names_with_default_values: - OPERATOR: X - NDIM: 3 - DTYPE: float - TILE_SIZE_X: 1 - TILE_SIZE_Y: 4 - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv2d_pw_s1p0 - - NAME: conv2d_pw_s1p0_clamp - OPERATOR: clamp(X, A, B) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8csw_linear_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8csw_linear_tiled.glsl deleted file mode 100644 index e2b239800a8..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8csw_linear_tiled.glsl +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, OUTPUT_STORAGE)} -#define T ${texel_load_component_type(DTYPE, OUTPUT_STORAGE)} - -$if OUTPUT_STORAGE == "buffer": - #define OUTPUT_BUFFER -$if INPUT_STORAGE == "buffer": - #define INPUT_BUFFER -$if WEIGHT_STORAGE == "buffer": - #define WEIGHT_BUFFER - -#define TILE_M4 ${TILE_M4} -#define TILE_K4 ${TILE_K4} -#define TILE_N4 ${TILE_N4} - -#define TILE_M ${TILE_M4 * 4} -#define TILE_K ${TILE_K4 * 4} -#define TILE_N ${TILE_N4 * 4} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "conv2d_common.glslh" - -${layout_declare_tensor(B, "w", "t_output", DTYPE, OUTPUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} - -${layout_declare_ubo(B, "ivec4", "output_sizes")} -${layout_declare_ubo(B, "ivec4", "input_sizes")} -${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "apply_bias", "1")} - -#include "linear_fp_input_tile_load.glslh" -#include "linear_int8_weight_tile_load.glslh" -#include "linear_fp_weight_scales_load.glslh" -#include "linear_fp_bias_load.glslh" -#include "linear_fp_output_tile_fp_int8_compute.glslh" -#include "linear_fp_output_tile_fp_compute.glslh" -#include "conv2d_fp_im2col_block_store.glslh" - -void main() { - // Each thread writes out a 4 wide x 4 high tile of output values - const int out_tile_x = int(gl_GlobalInvocationID.x); - const int out_tile_y = int(gl_GlobalInvocationID.y); - - const int n = int(out_tile_x * TILE_N); - const int m = int(out_tile_y * TILE_M); - - const int n4 = div_4(n); - const int m4 = div_4(m); - - // M = flattened output width, height, batches dims - const int M = output_sizes.x * output_sizes.y * output_sizes.w; - // N = output channels - const int N = output_sizes.z; - - if (n >= N || m >= M) { - return; - } - - const int group_idx = n / conv2d_params.out_channels_per_group; - const int input_k4_offset = conv2d_params.K4_per_group * group_idx; - - const int K4 = conv2d_params.K4; - const int N4 = div_up_4(N); - - FPOutTile out_tile; - initialize(out_tile); - - FPInputTile in_tile; - 
Int8WeightTile int8_weight_tile; - - const bool dont_check_bounds = (M - m) >= TILE_M; - - if (dont_check_bounds) { - for (int k4 = 0; k4 < conv2d_params.K4_per_group; k4++) { - load_input_tile_no_checks(in_tile, k4 + input_k4_offset, m, K4, M); - load_int8_weight_tile(int8_weight_tile, n4, k4, N4); - fp_accumulate_with_int8_weight(out_tile, in_tile, int8_weight_tile); - } - } else { - for (int k4 = 0; k4 < conv2d_params.K4_per_group; k4++) { - load_input_tile_with_checks(in_tile, k4 + input_k4_offset, m, K4, M); - load_int8_weight_tile(int8_weight_tile, n4, k4, N4); - fp_accumulate_with_int8_weight(out_tile, in_tile, int8_weight_tile); - } - } - - FPPerOutChannelParams weight_scales_tile; - load_weight_scales_tile(weight_scales_tile, n4); - - if (apply_bias > 0) { - FPPerOutChannelParams bias_tile; - load_bias_tile(bias_tile, n4); - - apply_scales_and_biases(out_tile, weight_scales_tile, bias_tile); - } - else { - apply_scales(out_tile, weight_scales_tile); - } - - write_im2col_tile_as_image(out_tile, n4, m); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8csw_linear_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8csw_linear_tiled.yaml deleted file mode 100644 index 9b3b5aa2c0a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8csw_linear_tiled.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv2d_q8csw_linear_tiled: - parameter_names_with_default_values: - DTYPE: float - OUTPUT_STORAGE: texture3d - INPUT_STORAGE: buffer - WEIGHT_STORAGE: texture2d - TILE_M4: 1 - TILE_N4: 1 - TILE_K4: 1 - generate_variant_forall: - DTYPE: - - VALUE: float - shader_variants: - - NAME: conv2d_q8csw_linear_tiled_texture3d_buffer_texture2d - - NAME: conv2d_q8csw_linear_tiled_texture3d_buffer_buffer - WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_linear_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_linear_tiled.glsl deleted file mode 100644 index f74a1311095..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_linear_tiled.glsl +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
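Both quantized conv2d shaders in this diff treat the convolution as a tiled matrix multiply over the im2col matrix: M spans the flattened (batch, out_y, out_x) dimension, N spans output channels, K spans the padded im2col columns, and grouped convolution is handled by offsetting the K range to the group that the output-channel tile belongs to. In the weight-only-quantized variant above, the int8 weights are dequantized after accumulation by applying the per-output-channel scales (and optional bias) once per tile. A sketch of the per-thread tile bookkeeping, mirroring the code above (illustrative only; assumes the same `output_sizes`, `conv2d_params`, and TILE_M / TILE_N definitions):

```
// Per-thread tile coordinates shared by the conv2d_q8* shaders above. Sketch only.
void compute_tile_coords(
    const uvec2 thread_id,
    out int m, out int n,          // top-left element of this thread's output tile
    out int group_idx,             // which convolution group the tile belongs to
    out int input_k4_offset) {     // first K4 index owned by that group
  n = int(thread_id.x) * TILE_N;   // output-channel dim
  m = int(thread_id.y) * TILE_M;   // flattened (batch, out_y, out_x) dim

  // Bounds, as in the shaders above:
  //   M = output_sizes.x * output_sizes.y * output_sizes.w  (W * H * batches)
  //   N = output_sizes.z                                    (output channels)
  group_idx = n / conv2d_params.out_channels_per_group;
  input_k4_offset = conv2d_params.K4_per_group * group_idx;
}
```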
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, OUTPUT_STORAGE)} -#define T ${texel_load_component_type(DTYPE, OUTPUT_STORAGE)} - -$if OUTPUT_STORAGE == "buffer": - #define OUTPUT_BUFFER -$if PACKED_INT8_INPUT_STORAGE == "buffer": - #define PACKED_INT8_INPUT_BUFFER -$if WEIGHT_STORAGE == "buffer": - #define WEIGHT_BUFFER - -#define TILE_M4 ${TILE_M4} -#define TILE_K4 ${TILE_K4} -#define TILE_N4 ${TILE_N4} - -#define TILE_M ${TILE_M4 * 4} -#define TILE_K ${TILE_K4 * 4} -#define TILE_N ${TILE_N4 * 4} - -${define_required_extensions(DTYPE)} - -#extension GL_EXT_integer_dot_product : require - -layout(std430) buffer; - -#include "conv2d_common.glslh" - -${layout_declare_tensor(B, "w", "t_output", DTYPE, OUTPUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", PACKED_INT8_INPUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} - -${layout_declare_ubo(B, "ivec4", "output_sizes")} -${layout_declare_ubo(B, "ivec4", "input_sizes")} -${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} - -layout(push_constant) uniform restrict Block { - float input_scale; - int input_zp; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "apply_bias", "1")} - -#include "linear_int8_input_tile_load.glslh" -#include "linear_int8_weight_tile_load.glslh" -#include "linear_int_weight_sums_load.glslh" -#include "linear_fp_weight_scales_load.glslh" -#include "linear_fp_output_tile_int8_int8_compute.glslh" -#include "linear_fp_bias_load.glslh" -#include "conv2d_fp_im2col_block_store.glslh" - -void main() { - // Each thread writes out a 4 wide x 4 high tile of output values - const int out_tile_x = int(gl_GlobalInvocationID.x); - const int out_tile_y = int(gl_GlobalInvocationID.y); - - const int n = int(out_tile_x * TILE_N); - const int m = int(out_tile_y * TILE_M); - - const int n4 = div_4(n); - const int m4 = div_4(m); - - // M = flattened output width, height, batches dims - const int M = output_sizes.x * output_sizes.y * output_sizes.w; - // N = output channels - const int N = output_sizes.z; - - if (n >= N || m >= M) { - return; - } - - const int group_idx = n / conv2d_params.out_channels_per_group; - const int input_k4_offset = conv2d_params.K4_per_group * group_idx; - - const int K4 = conv2d_params.K4; - const int N4 = div_up_4(N); - - Int32Accum out_accum; - initialize(out_accum); - - Int8InputTile int8_in_tile; - Int8WeightTile int8_weight_tile; - - for (int k4 = 0; k4 < conv2d_params.K4_per_group; k4++) { - load_int8_input_tile(int8_in_tile, k4 + input_k4_offset, m4, K4); - load_int8_weight_tile(int8_weight_tile, n4, k4, N4); - - int_accumulate_with_int8_weight(out_accum, int8_in_tile, int8_weight_tile); - } - - FPPerOutChannelParams weight_scales_tile; - load_weight_scales_tile(weight_scales_tile, n4); - - IntPerOutChannelParams weight_sums_tile; - load_weight_sums_tile(weight_sums_tile, n4); - - FPOutTile out_tile; - initialize(out_tile); - if (apply_bias > 0) { - FPPerOutChannelParams bias_tile; - load_bias_tile(bias_tile, int(n4)); - - accumulate_out_tile_with_int_accum( - out_tile, - 
out_accum, - input_scale, - input_zp, - weight_sums_tile, - weight_scales_tile, - bias_tile); - } - else { - accumulate_out_tile_with_int_accum( - out_tile, - out_accum, - input_scale, - input_zp, - weight_sums_tile, - weight_scales_tile); - } - - write_im2col_tile_as_image(out_tile, n4, m); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_linear_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_linear_tiled.yaml deleted file mode 100644 index 629001765c1..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_q8ta_q8csw_linear_tiled.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv2d_q8ta_q8csw_linear_tiled: - parameter_names_with_default_values: - DTYPE: float - OUTPUT_STORAGE: texture3d - PACKED_INT8_INPUT_STORAGE: buffer - WEIGHT_STORAGE: texture2d - TILE_M4: 1 - TILE_N4: 1 - TILE_K4: 1 - generate_variant_forall: - DTYPE: - - VALUE: float - shader_variants: - - NAME: conv2d_q8ta_q8csw_linear_tiled_texture3d_buffer_texture2d - - NAME: conv2d_q8ta_q8csw_linear_tiled_texture3d_buffer_buffer - WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl deleted file mode 100644 index 740fe10e048..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.glsl +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#define op(X, A, B) ${OPERATOR} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")} -${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")} -${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")} -${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")} -${layout_declare_ubo(4, "ivec3", "out_limits")} -${layout_declare_ubo(5, "ivec4", "in_sizes")} -${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")} -${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")} -${layout_declare_ubo(8, "float", "out_min", "float", "out_max")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -/* - * Computes a 2D transpose convolution. Each shader invocation calculates the - * output at a single output location. For details, refer to conv2d.glsl which - * uses a similar approach. 
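 * The nested loops below visit only those input positions whose strided
 * kernel footprint overlaps this output location (the start/end bounds are
 * derived from kernel_size, stride and padding), accumulating kernel-weighted
 * input texels into sum before the fused output operator is applied.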
- */ -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - ivec2 ipos = pos.xy + padding; - - const ivec2 start = max( - ivec2(0), - ivec2(ceil((vec2(ipos) - kernel_size + 1) / vec2(stride)))); - const ivec2 end = - min(ivec2(in_sizes.xy), - ivec2(floor(vec2(ipos) / vec2(stride))) + 1); - - const int ic = in_group_size; - const int kx_stride = ic * (stride.x - 1); - - int ky_start = overlay_region.y - 1 - (ipos.y - stride.y * start.y) + pos.z * kernel_size.y; - int kx_start = (overlay_region.x - 1 - (ipos.x - stride.x * start.x)) * ic; - - VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0); - for (int y = start.y, ky = ky_start; y < end.y; ++y, ky += stride.y) { - for (int x = start.x, kx = kx_start; x < end.x; ++x, kx += kx_stride) { - for (int z4 = 0; z4 < ic / 4; ++z4, kx += 4) { - const VEC4_T in_texel = texelFetch(t_in, ivec3(x, y, z4), 0); - const ivec4 kxs = kx + ivec4(0, 1, 2, 3); - - sum = fma(in_texel.xxxx, texelFetch(t_kernel, ivec2(kxs.x, ky), 0), sum); - sum = fma(in_texel.yyyy, texelFetch(t_kernel, ivec2(kxs.y, ky), 0), sum); - sum = fma(in_texel.zzzz, texelFetch(t_kernel, ivec2(kxs.z, ky), 0), sum); - sum = fma(in_texel.wwww, texelFetch(t_kernel, ivec2(kxs.w, ky), 0), sum); - } - } - } - - imageStore(t_out, pos, op(sum, out_min, out_max)); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.yaml deleted file mode 100644 index 0940444bf7d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv_transpose2d: - parameter_names_with_default_values: - OPERATOR: X - NDIM: 3 - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv_transpose2d - - NAME: conv_transpose2d_clamp - OPERATOR: clamp(X, A, B) diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl deleted file mode 100644 index 0b10683cee4..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define BUF_T ${buffer_scalar_type(DTYPE)} -#define VEC4_T ${texel_type(DTYPE)} -#define SCALAR_T ${texel_component_type(DTYPE)} - -#include "indexing_utils.h" - -$if DTYPE == "half": - #extension GL_EXT_shader_16bit_storage : require - -layout(std430) buffer; - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[2][DTYPE]} image_out; -layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer { - BUF_T buffer_in[]; -}; - -layout(push_constant) uniform PRECISION restrict Block { - ivec4 sizes; - ivec4 original_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -/* - * Computes special prepacking for a 2D transpose convolution. 
Each shader - * invocation calculates the input buffer locations to read into the desired - * texel. This packing was originally developed on CPU here: - * https://github.com/pytorch/pytorch/blob/d63e7d0aa2e0a1b1fd7518f917224774afe97bae/aten/src/ATen/native/vulkan/ops/Convolution.cpp#L120-L211 - */ -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim); - - if (any(greaterThanEqual(idx, sizes))) { - return; - } - - // Map tensor_idx to normal buffer_i - const ivec4 p0 = tidx_to_nchwi(idx, sizes, packed_dim); - - // Compute modified tensor_idx by inverting the CPU function - const int N = original_sizes.w; - const int C = original_sizes.z; - const int H = original_sizes.y; - const int W = original_sizes.x; - const int J = sizes.y / H; - const int K = sizes.x / (4*W); - - const ivec4 p1 = p0 / (4*K); - const ivec4 p2 = p1 / W; - const ivec4 p3 = p2 / H; - - const ivec4 n = p0 % (4*K); - const ivec4 c = (p3 % J) * 4 + (p3 / J); - const ivec4 h = H-1 - p2 % H; - const ivec4 w = W-1 - p1 % W; - - // Map modified tensor_idx to modifed buffer_i - // Zero out if modified tensor idx is out of bounds - const ivec4 buf_i = n * C*H*W + c * H*W + h * W + w; - const bvec4 mask = bvec4(ivec4(lessThan(n, ivec4(N))) & ivec4(lessThan(c, ivec4(C)))); - - VEC4_T texel = VEC4_T(0); - if (mask.x) { - texel.x = SCALAR_T(buffer_in[buf_i.x]); - } - if (mask.y) { - texel.y = SCALAR_T(buffer_in[buf_i.y]); - } - if (mask.z) { - texel.z = SCALAR_T(buffer_in[buf_i.z]); - } - if (mask.w) { - texel.w = SCALAR_T(buffer_in[buf_i.w]); - } - - imageStore(image_out, pos.xy, texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.yaml deleted file mode 100644 index d933cd097aa..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -conv_transpose2d_prepack_weights: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: conv_transpose2d_prepack_weights diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl deleted file mode 100644 index 39aa9b11a0d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.glsl +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "existing_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; - // Operates on (x, y, z) logical extents. - // channel_range is stored in range.w - ivec4 range; - // Analogus to range variable in copy. It defines the # of channel being - // copied. 
- // dst channel offset is stored in dst_offset.w - ivec4 dst_offset; - int src_channel_offset; -}; - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - -void main() { - // Note: Unlike other shaders, the range is often not equal to the destination - // texture extent. - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(lpos, range.xyz))) { - return; - } - - const ivec3 out_lpos = lpos + dst_offset.xyz; - - const ivec4 out_tidx = lpos_to_tidx(out_lpos, out_sizes, out_axis_map.w, packed_dim); - - // First read the existing values to make sure the boundary values stay. - VEC4_T v = load_texel_lpos(existing_out, out_lpos, out_axis_map); - - ivec4 in_tidx = out_tidx; - for (int i=0; i<4; i++) { - - in_tidx[packed_dim] = out_tidx[packed_dim] - dst_offset.w + i; - - // Handle the partial update for begining of channel in an existing tensor. - // If the source channel index is below zero or exceeds the range, we skip - // updating the element to avoid overwriting existing data. - if ((in_tidx[packed_dim] < 0) || (in_tidx[packed_dim] >= range.w)) { - continue; - } - - // Readjust for the source offset. - in_tidx[packed_dim] += src_channel_offset; - - ivec4 in_posi = tidx_to_posi(in_tidx, in_sizes, in_axis_map, packed_dim); - v[i] = load_texel(t_in, in_posi.xyz)[in_posi.w]; - } - - write_texel_lpos(t_out, out_lpos, v, out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml deleted file mode 100644 index 984d9a09d43..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_channel_offset.yaml +++ /dev/null @@ -1,12 +0,0 @@ -copy_channel_offset: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: copy_channel_offset diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl deleted file mode 100644 index 178814a90c3..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.glsl +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -${define_active_storage_type(STORAGE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -layout(push_constant) uniform restrict Block { - ivec3 range; - // xyz is source offset w is channel size - ivec4 src_offset; - // xyz is destination offset w is channel size - ivec4 dst_offset; -}; - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - -${layout_declare_spec_const(C, "int", "batch_index_function", "0")} - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, range))) { - return; - } - - ivec3 in_pos = pos + src_offset.xyz; - ivec3 out_pos = pos + dst_offset.xyz; - if (src_offset.w > 0) { - if (batch_index_function == 1) { - // batch index is calculated using source channel size - const int channel_index = pos.z % src_offset.w; - const int batch_index = pos.z / src_offset.w; - out_pos.z = channel_index + dst_offset.z + batch_index * dst_offset.w; - } else if (batch_index_function == 2) { - // batch index is calculated using destination channel size - const int channel_index = pos.z % dst_offset.w; - const int batch_index = pos.z / dst_offset.w; - in_pos.z = channel_index + src_offset.z + batch_index * src_offset.w; - } - } - - write_texel_lpos( - t_out, - out_pos, - load_texel_lpos(t_in, in_pos, in_axis_map), - out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/copy_offset.yaml deleted file mode 100644 index 09f5ca36ea4..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_offset.yaml +++ /dev/null @@ -1,17 +0,0 @@ -copy_offset: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - VALUE: int8 - - VALUE: uint8 - STORAGE: - - VALUE: texture3d - - VALUE: texture2d - shader_variants: - - NAME: copy_offset diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl deleted file mode 100644 index 3100565d08a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.glsl +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "existing_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -layout(push_constant) uniform restrict Block { - ivec4 range; - - // xyz is source offset w is channel size - ivec4 src_offset; - - // xyz is destination offset w is channel size - ivec4 dst_offset; -}; - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, range.xyz))) { - return; - } - - // Position in input tensor - ivec3 in_pos = pos + src_offset.xyz; - in_pos[packed_dim] = pos[packed_dim] + (src_offset[packed_dim] >> 2); - - // Read input value mapping to this output texel - VEC4_T in_value = load_texel_lpos(t_in, in_pos, in_axis_map); - - // Starting offset to read from a texel - const int src_lane_offset = src_offset[packed_dim] & 0x3; - const bool has_src_lane_offset = src_lane_offset != 0; - - // If input lane offset is non zero i.e packed texel is composed from multiple sources - if (has_src_lane_offset) { - // Boundary values will come from next input texel in the packed dim. - ivec3 next_in_pos = in_pos; - next_in_pos[packed_dim] = in_pos[packed_dim] + 1; - VEC4_T next_value = load_texel_lpos(t_in, next_in_pos, in_axis_map); - - // Keep input values from the end of current input pixel based on src_lane_offset - // offset 1 means the first lane of current input texel is not a part of the output texel - // offset 2 means first 2 lanes are not and so on - // Copy next texel's values towards the end of input texel, based on lane offset - // offset 1 means the first lane from next texel is part of the input texel - // offset 2 means first 2 lanes from next texel is part of the input texel and so on - if (src_lane_offset == 1) { - in_value = ivec4(in_value.yzw, next_value.x); - } else if (src_lane_offset == 2) { - in_value = ivec4(in_value.zw, next_value.xy); - } else { - in_value = ivec4(in_value.w, next_value.xyz); - } - } - - // Starting offset to write at within a texel - const int out_lane_offset = dst_offset[packed_dim] & 0x3; - const bool has_dst_lane_offset = out_lane_offset != 0; - - ivec3 out_pos = pos + dst_offset.xyz; - out_pos[packed_dim] = pos[packed_dim] + (dst_offset[packed_dim] >> 2); - - VEC4_T out_value; - - // If lane offset is non zero i.e packed texel is composed from multiple sources - if (has_dst_lane_offset) { - // When position in packed dim is > 0 - if (pos[packed_dim] > 0) { - // Boundary values will come from previous input texel in the packed dim. 
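      // As a concrete example, with dst_offset[packed_dim] == 6 the output
      // texel index advances by 6 >> 2 == 1 and out_lane_offset == 6 & 3 == 2,
      // so lanes .xy of the output texel come from the previous input texel
      // and lanes .zw come from the current one.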
- ivec3 prev_in_pos = in_pos; - prev_in_pos[packed_dim] = in_pos[packed_dim] - 1; - VEC4_T prev_value = load_texel_lpos(t_in, prev_in_pos, in_axis_map); - - // Shift values toward the beginning based on out_lane_offset - // offset 1 means the last lane from the previous texel is a part of the output texel - // offset 2 means last 2 lanes and so on - if (out_lane_offset == 1) { - out_value.x = prev_value.w; - } else if (out_lane_offset == 2) { - out_value.xy = prev_value.zw; - } else { - out_value.xyz = prev_value.yzw; - } - } else { - // When position in packed dim is == 0 - // Boundary values will be the previous texel values. - out_value = load_texel_lpos(existing_out, out_pos, out_axis_map); - } - - // Copy input values towards the end of output array, based on lane offset - // offset 1 means the first lane from previous texel is part of the output texel starting at offset - // offset 2 means first 2 lanes from the previous texel is part of the output texel and so on - if (out_lane_offset == 1) { - out_value.yzw = in_value.xyz; - } else if (out_lane_offset == 2) { - out_value.zw = in_value.xy; - } else { - out_value.w = in_value.x; - } - } else { - out_value = in_value; - } - - write_texel_lpos( - t_out, - out_pos, - out_value, - out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.yaml deleted file mode 100644 index 6e55876cb28..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/copy_packed_dim_offset.yaml +++ /dev/null @@ -1,12 +0,0 @@ -copy_packed_dim_offset: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: copy_packed_dim_offset diff --git a/backends/vulkan/runtime/graph/ops/glsl/dequantize.glslh b/backends/vulkan/runtime/graph/ops/glsl/dequantize.glslh deleted file mode 100644 index 7194bebda35..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/dequantize.glslh +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef DEQUANTIZE_GLSLH -#define DEQUANTIZE_GLSLH - -OUT_T dequantize_val(IN_T qvalue, float scale_val, int zero_point_val) { - return OUT_T(float(int(qvalue) - zero_point_val) * scale_val); -} - -#endif // DEQUANTIZE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/dequantize_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/dequantize_buffer.glsl deleted file mode 100644 index 57dc2d53fff..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/dequantize_buffer.glsl +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define IN_T ${buffer_scalar_type(IN_DTYPE)} -#define OUT_T ${buffer_scalar_type(OUT_DTYPE)} -#define SCALE_T ${buffer_scalar_type(SCALE_DTYPE)} -#define ZP_T ${buffer_scalar_type(ZP_DTYPE)} - -#define ${MODE} - -${define_active_storage_type("buffer")} -${define_required_extensions(IN_DTYPE)} -${define_required_extensions(OUT_DTYPE)} -${define_required_extensions(SCALE_DTYPE)} -${define_required_extensions(ZP_DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "w", "t_out", OUT_DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "buffer")} - -$if MODE == "per_tensor": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int quant_min; - int quant_max; - }; -$if MODE == "per_token": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int num_tokens; - int quant_min; - int quant_max; - }; -$if MODE == "per_channel": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int axis; - int num_channels; - int quant_min; - int quant_max; - }; -$if MODE == "block_wise": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - ivec4 blockSize; // bW, bH, bC, bN - ivec4 numBlocks; // tW/bW, tH/bH, tC/bC, tN/bN - ivec4 blockStride; // pre-computed linear strides for the block grid - int quant_min; - int quant_max; - }; - -${layout_declare_ubo(B, "int", "out_numel")} -${layout_declare_ubo(B, "ivec4", "t_in_sizes")} -${layout_declare_ubo(B, "ivec4", "t_in_strides")} -${layout_declare_ubo(B, "ivec4", "t_out_sizes")} -${layout_declare_ubo(B, "ivec4", "t_out_strides")} - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} - -#include "dequantize.glslh" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); -const lowp ivec4 in_dim_order = unhash_dim_order(in_layout); - -/* - Dequantization Shader (Buffer Storage) - This shader converts n-bit integer tensor values back to floating-point representations - using pre-computed quantization parameters (scale and zero_point). The dequantization - reconstructs the original floating-point values from their discrete integer representations - with minimal precision loss. - - Important Considerations: - (+) All input tensors are assumed to be WIDTH_PACKED (i.e., contiguous in the last dimension) - (+) The axis map layout is assumed to be a standard layout for scales and zero_points - (++) The scale and zero_point tensors must be implemented as buffers - - Workgroup Configuration: - - dequantize_per_tensor - This mode reverses the uniform quantization applied across the entire tensor by using the - single scale and zero_point values to convert quantized integer values back to their original - floating-point representation. 
- - (*) global_wg_size: default - (*) local_wg_size: default - - - dequantize_per_token - This mode reverses the quantization applied individually to each token (or element) in the - input by using separate scale and zero_point values for each token. For a tensor of shape - [B, S, H], it applies the inverse transformation token-wise across the B*S tokens, converting - quantized values back to their original floating-point representation for each group of H - elements independently. - - (*) global_wg_size: default - (*) local_wg_size: default - - - dequantize_per_channel - This mode reverses the quantization applied separately to each channel of the input tensor - by using distinct scale and zero_point values for each channel. For a tensor of shape - [B, C, H, W] with axis = 1, it applies the inverse transformation channel-wise across the C - channels, converting quantized values back to their original floating-point representation - independently for each channel. - - (*) global_wg_size: default - (*) local_wg_size: default - - - dequantize_block_wise - This mode reverses the block-wise quantization applied to groups of elements by using separate - scale and zero_point values for each block. Equivalent to dequantize_affine, it applies the - inverse affine transformation per block to convert quantized values back to their original - floating-point representation. For example, if the tensor shape is [6, 9, 4] and - blockSize = [3, 3, 2], the tensor is divided into 12 blocks, each containing 18 elements, - and dequantization is performed independently on each block. - - (*) global_wg_size: default - (*) local_wg_size: default - - Dequantization Formula: - value = (qvalue - zero_point) * scale -*/ - -#ifdef per_tensor - -void dequantize_per_tensor() { - const int out_bufi = int(gl_GlobalInvocationID.x); - - if (out_bufi >= out_numel) { - return; - } - - const ivec4 out_tidx = bufi_to_tidx(out_bufi, t_out_strides, out_dim_order); - const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides); - - IN_T qvalue = t_in[in_bufi]; - OUT_T value = dequantize_val(qvalue, float(t_scale[0]), int(t_zero_point[0])); - - t_out[out_bufi] = value; -} - -#elif defined(per_token) - -void dequantize_per_token() { - const int out_bufi = int(gl_GlobalInvocationID.x); - - if (out_bufi >= out_numel) { - return; - } - - const ivec4 out_tidx = bufi_to_tidx(out_bufi, t_out_strides, out_dim_order); - const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides); - - IN_T qvalue = t_in[in_bufi]; - - int token_idx = 0; - - if (t_out_sizes.w > 1) { - // 4D tensor - token_idx = out_tidx.w * (t_out_sizes.z * t_out_sizes.y) + out_tidx.z * t_out_sizes.y + out_tidx.y; - } else if (t_out_sizes.z > 1) { - // 3D tensor - token_idx = out_tidx.z * t_out_sizes.y + out_tidx.y; - } else if (t_out_sizes.y > 1) { - // 2D tensor - token_idx = out_tidx.y; - } - // For 1D tensor, token_idx remains 0 - - token_idx = min(token_idx, num_tokens - 1); - - OUT_T value = dequantize_val(qvalue, float(t_scale[token_idx]), int(t_zero_point[token_idx])); - - t_out[out_bufi] = value; -} - -#elif defined(per_channel) - -void dequantize_per_channel() { - const int out_bufi = int(gl_GlobalInvocationID.x); - - if (out_bufi >= out_numel) { - return; - } - - const ivec4 out_tidx = bufi_to_tidx(out_bufi, t_out_strides, out_dim_order); - const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides); - - IN_T qvalue = t_in[in_bufi]; - - // Calculate channel index based on the dequantization axis (already converted to WHCN) - // The axis parameter is now in WHCN coordinate 
system: - // axis 0 -> W dimension (tidx.x) - // axis 1 -> H dimension (tidx.y) - // axis 2 -> C dimension (tidx.z) - // axis 3 -> N dimension (tidx.w) - int channel_idx = 0; - - if (axis == 0) { - channel_idx = out_tidx.x; - } else if (axis == 1) { - channel_idx = out_tidx.y; - } else if (axis == 2) { - channel_idx = out_tidx.z; - } else if (axis == 3) { - channel_idx = out_tidx.w; - } - - channel_idx = min(channel_idx, num_channels - 1); - - OUT_T value = dequantize_val(qvalue, float(t_scale[channel_idx]), int(t_zero_point[channel_idx])); - - t_out[out_bufi] = value; -} - -#else // block_wise - -void dequantize_block_wise() { - const int out_bufi = int(gl_GlobalInvocationID.x); - - if (out_bufi >= out_numel) { - return; - } - - const ivec4 out_tidx = bufi_to_tidx(out_bufi, t_out_strides, out_dim_order); - const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides); - - IN_T qvalue = t_in[in_bufi]; - - const ivec4 bcoord = out_tidx / blockSize; - - const int block_id = bcoord.x * blockStride.x + bcoord.y * blockStride.y + bcoord.z * blockStride.z + bcoord.w * blockStride.w; - - const OUT_T value = dequantize_val(qvalue, float(t_scale[block_id]), int(t_zero_point[block_id])); - - t_out[out_bufi] = value; -} - -#endif - -void main() { - dequantize_${MODE}(); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/dequantize_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/dequantize_buffer.yaml deleted file mode 100644 index a4375038a75..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/dequantize_buffer.yaml +++ /dev/null @@ -1,31 +0,0 @@ -dequantize_buffer: - parameter_names_with_default_values: - IN_DTYPE: int32 - OUT_DTYPE: float - SCALE_DTYPE: float - ZP_DTYPE: int32 - MODE: per_tensor - generate_variant_forall: - IN_DTYPE: - - VALUE: uint8 - - VALUE: int8 - - VALUE: int32 - OUT_DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - SCALE_DTYPE: - - VALUE: float - ZP_DTYPE: - - VALUE: int8 - - VALUE: int32 - - VALUE: float - shader_variants: - - NAME: dequantize_per_tensor_buffer - MODE: per_tensor - - NAME: dequantize_per_token_buffer - MODE: per_token - - NAME: dequantize_per_channel_buffer - MODE: per_channel - - NAME: dequantize_block_wise_buffer - MODE: block_wise diff --git a/backends/vulkan/runtime/graph/ops/glsl/dequantize_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/dequantize_texture.glsl deleted file mode 100644 index 19276cd8f7f..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/dequantize_texture.glsl +++ /dev/null @@ -1,347 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define IN_T ${buffer_scalar_type(IN_DTYPE)} -#define IVEC4_T ${texel_load_type(IN_DTYPE, "texture3d")} - -#define OUT_T ${buffer_scalar_type(OUT_DTYPE)} -#define FVEC4_T ${texel_load_type(OUT_DTYPE, "texture3d")} -#define SCALE_T ${buffer_scalar_type(SCALE_DTYPE)} -#define ZP_T ${buffer_scalar_type(ZP_DTYPE)} - -#define ${MODE} - -${define_active_storage_type("texture3d")} -${define_required_extensions(IN_DTYPE)} -${define_required_extensions(OUT_DTYPE)} -${define_required_extensions(SCALE_DTYPE)} -${define_required_extensions(ZP_DTYPE)} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", OUT_DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "texture3d")} - -$if MODE == "per_tensor": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int quant_min; - int quant_max; - }; -$if MODE == "per_token": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int num_tokens; - int quant_min; - int quant_max; - }; -$if MODE == "per_channel": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int axis; - int num_channels; - int quant_min; - int quant_max; - }; -$if MODE == "block_wise": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - ivec4 blockSize; // bW, bH, bC, bN - ivec4 numBlocks; // tW/bW, tH/bH, tC/bC, tN/bN - ivec4 blockStride; // pre-computed linear strides for the block grid - int quant_min; - int quant_max; - }; - -${layout_declare_ubo(B, "ivec3", "t_in_limits")} -${layout_declare_ubo(B, "ivec3", "t_out_limits")} - -#include "indexing_utils.h" -#include "dequantize.glslh" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -/* - * DEQUANTIZATION SHADER (TEXTURE STORAGE) - * - * This shader converts n-bit integer tensor values back to floating-point representations - * using pre-computed quantization parameters (scale and zero_point). The dequantization - * reconstructs the original floating-point values from their discrete integer representations - * with minimal precision loss. - * - * ALGORITHM: - * 1. Load quantized integer texel (4 values) from 3D texture - * 2. Apply dequantization formula to each component: value = (qvalue - zero_point) * scale - * 3. 
Store reconstructed floating-point texel to output texture - * - * WORKGROUP CONFIGURATION: - * - Per-Tensor Mode: - * - Global WG Size: {W, H, C/4} for input size (W, H, C) with width-packing - * - Local WG Size: Default (typically {8, 8, 1} or based on global WG size) - * - Per-Token Mode: - * - Global WG Size: {W, H, C/4} for input size (W, H, C) with width-packing - * - Local WG Size: Default (typically {8, 8, 1} or based on global WG size) - * - * SUPPORTED CONFIGURATIONS: - * - Texture Storage: Uses 3D texture indexing with texel-based processing - * - Assumes width-packed layout (packed_dim = 0) for input/output textures - * - Handles texel padding for non-multiple-of-4 tensor dimensions - * - For per-token mode: scale/zero_point tensors must use buffer storage - * - Input/output textures: Must use standard axis mapping for per-token mode - * - * DEQUANTIZATION FORMULA VISUALIZATION: - * For integer range [quant_min, quant_max] mapped back to [min_val, max_val]: - * - * Integer Domain: Floating Point Domain: - * quant_min ──────────────► min_val - * │ │ - * │ scale = (max_val - min_val) / (quant_max - quant_min) - * │ zero_point = quant_min - round(min_val / scale) - * │ │ - * quant_max ──────────────► max_val - * - * Texel Dequantization Process: - * Input Texel: [-103, -128, -123, -96] (int4) - * Per-component dequantization with scale=0.1, zero_point=-128: - * Component 0: (-103 - (-128)) * 0.1 = 25 * 0.1 = 2.5 - * Component 1: (-128 - (-128)) * 0.1 = 0 * 0.1 = 0.0 - * Component 2: (-123 - (-128)) * 0.1 = 5 * 0.1 = 0.5 - * Component 3: (-96 - (-128)) * 0.1 = 32 * 0.1 = 3.2 - * Output Texel: [2.5, 0.0, 0.5, 3.2] (float4) - * - * PER-TENSOR DEQUANTIZATION: - * - Single scale and zero_point values for entire tensor - * - All texel components use same dequantization parameters - * - Parameters passed as push constants for efficiency - * - Each thread processes one texel (4 elements) independently - * - Formula: value[i] = (qvalue[i] - zero_point) * scale - * - * PER-TOKEN DEQUANTIZATION: - * - Separate scale and zero_point for each token - * - Token = all elements except last dimension (e.g., for [B,S,H]: B*S tokens of H elements) - * - Parameters stored in buffer arrays indexed by token_id - * - Each thread calculates token_id from its 3D texture position - * - Scale/zero_point buffers accessed directly (not as textures) - * - Formula: value[i] = (qvalue[i] - zero_point[token_id]) * scale[token_id] - * - * Token ID calculation for texel at position (x, y, z): - * - 3D tensor: token_id = z * texture_height + y - * - 2D tensor: token_id = y - * - 1D tensor: token_id = 0 - */ - -#ifdef per_tensor - -void dequantize_per_tensor() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - // Skip if out of bounds - if (any(greaterThanEqual(pos, t_in_limits))) { - return; - } - - IVEC4_T intex = load_texel(t_in, pos); - FVEC4_T outtex; - - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T qvalue = IN_T(intex[i]); - OUT_T value = dequantize_val(qvalue, float(t_scale[0]), int(t_zero_point[0])); - - $if OUT_DTYPE == "double": - outtex[i] = float(value); - $else: - outtex[i] = value; - } - write_texel(t_out, pos, outtex); -} - -#elif defined(per_token) - -void dequantize_per_token() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, t_in_limits))) { - return; - } - - IVEC4_T intex = load_texel(t_in, pos); - - int token_idx = 0; - ivec3 dims = t_in_limits; - - if (dims.z > 1) { - // 3D tensor - token_idx = pos.z * dims.y + pos.y; - } else if (dims.y > 1) { - // 2D 
tensor - token_idx = pos.y; - } - // For 1D tensor, token_idx remains 0 - - token_idx = min(token_idx, num_tokens - 1); - - // Scale and zero_point are prepacked as buffers, so direct access - float scale_val = float(t_scale[token_idx]); - int zero_point_val = int(t_zero_point[token_idx]); - - FVEC4_T outtex; - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T qvalue = IN_T(intex[i]); - OUT_T value = dequantize_val(qvalue, scale_val, zero_point_val); - $if OUT_DTYPE == "double": - outtex[i] = float(value); - $else: - outtex[i] = value; - } - - write_texel(t_out, pos, outtex); -} - -#elif defined(per_channel) - -void dequantize_per_channel() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, t_in_limits))) { - return; - } - - IVEC4_T intex = load_texel(t_in, pos); - FVEC4_T outtex; - - // Calculate channel index based on the dequantization axis (already converted to WHCN) - // The axis parameter is now in WHCN coordinate system: - // axis 0 -> W dimension (pos.x) - // axis 1 -> H dimension (pos.y) - // axis 2 -> C dimension (pos.z) - // axis 3 -> N dimension (batch folding in texture storage) - - if (axis == 0) { - // Width dimension - each texel component has different channel index - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T qvalue = IN_T(intex[i]); - int channel_idx = pos.x * 4 + i; - channel_idx = min(channel_idx, num_channels - 1); - - float scale_val = float(t_scale[channel_idx]); - int zero_point_val = int(t_zero_point[channel_idx]); - OUT_T value = dequantize_val(qvalue, scale_val, zero_point_val); - $if OUT_DTYPE == "double": - outtex[i] = float(value); - $else: - outtex[i] = value; - } - } else if (axis == 1) { - int channel_idx = pos.y; - channel_idx = min(channel_idx, num_channels - 1); - float scale_val = float(t_scale[channel_idx]); - int zero_point_val = int(t_zero_point[channel_idx]); - - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T qvalue = IN_T(intex[i]); - OUT_T value = dequantize_val(qvalue, scale_val, zero_point_val); - $if OUT_DTYPE == "double": - outtex[i] = float(value); - $else: - outtex[i] = value; - } - } else if (axis == 2) { - // Channel dimension - for 4D tensors, need to account for batch-channel folding - // The Z coordinate contains folded batch*channel information - // We need to extract the actual channel index from the folded dimension - int folded_idx = pos.z; - int channel_idx = folded_idx % num_channels; - - float scale_val = float(t_scale[channel_idx]); - int zero_point_val = int(t_zero_point[channel_idx]); - - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T qvalue = IN_T(intex[i]); - OUT_T value = dequantize_val(qvalue, scale_val, zero_point_val); - $if OUT_DTYPE == "double": - outtex[i] = float(value); - $else: - outtex[i] = value; - } - } else if (axis == 3) { - // Batch dimension - for 4D tensors, need to account for batch-channel folding - // The Z coordinate contains folded batch*channel information - // We need to extract the actual channel index from the folded dimension - int folded_idx = pos.z; - // In this case num_channels actually corresponds to the number of channels - // the C dimension N(C)HW - int channel_idx = folded_idx / num_channels; - - float scale_val = float(t_scale[channel_idx]); - int zero_point_val = int(t_zero_point[channel_idx]); - - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T qvalue = IN_T(intex[i]); - OUT_T value = dequantize_val(qvalue, scale_val, zero_point_val); - $if OUT_DTYPE == "double": - outtex[i] = float(value); - $else: - outtex[i] = value; - } - } - - 
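  // To make the folded Z handling above concrete: for a width-packed NCHW
  // tensor with N = 2 and num_channels = 3, pos.z enumerates the 6 folded
  // (batch, channel) slices, so axis == 2 recovers the channel as
  // pos.z % num_channels while axis == 3 recovers the batch as
  // pos.z / num_channels.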
write_texel(t_out, pos, outtex); -} - -#else // block_wise - -void dequantize_block_wise() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, t_in_limits))) - return; - - IVEC4_T intex = load_texel(t_in, pos); - FVEC4_T outtex; - - ivec4 base_tidx = ivec4(pos.x * 4, pos.y, pos.z, 0); - int foldedZ = pos.z; - - int C_total = numBlocks.z * blockSize.z; - - [[unroll]] for (int i = 0; i < 4; ++i) { - ivec4 tidx = ivec4(base_tidx.x + i, base_tidx.y, (foldedZ % C_total), (foldedZ / C_total)); - - ivec4 bcoord = tidx / blockSize; - int block_id = bcoord.x * blockStride.x + bcoord.y * blockStride.y + bcoord.z * blockStride.z + bcoord.w * blockStride.w; - - IN_T qvalue = IN_T(intex[i]); - OUT_T value = dequantize_val(qvalue, float(t_scale[block_id]), int(t_zero_point[block_id])); - $if OUT_DTYPE == "double": - outtex[i] = float(value); - $else: - outtex[i] = value; - } - - write_texel(t_out, pos, outtex); -} - -#endif - -void main() { - dequantize_${MODE}(); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/dequantize_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/dequantize_texture.yaml deleted file mode 100644 index 7a58e9410d3..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/dequantize_texture.yaml +++ /dev/null @@ -1,31 +0,0 @@ -dequantize_texture: - parameter_names_with_default_values: - IN_DTYPE: int32 - OUT_DTYPE: float - SCALE_DTYPE: float - ZP_DTYPE: int32 - MODE: per_tensor - generate_variant_forall: - IN_DTYPE: - - VALUE: uint8 - - VALUE: int8 - - VALUE: int32 - OUT_DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - SCALE_DTYPE: - - VALUE: float - ZP_DTYPE: - - VALUE: int8 - - VALUE: int32 - - VALUE: float - shader_variants: - - NAME: dequantize_per_tensor_texture3d - MODE: per_tensor - - NAME: dequantize_per_token_texture3d - MODE: per_token - - NAME: dequantize_per_channel_texture3d - MODE: per_channel - - NAME: dequantize_block_wise_texture3d - MODE: block_wise diff --git a/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl b/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl deleted file mode 100644 index 73a444cd84d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", "int", STORAGE)} -${layout_declare_tensor(B, "r", "t_weight", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec4", "sizes")} - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - -${layout_declare_spec_const(C, "int", "weight_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 weight_axis_map = unhash_axis_map(weight_layout); - -void main() { - const ivec3 out_lpos = ivec3(gl_GlobalInvocationID); - const ivec4 out_tidx = lpos_to_tidx(out_lpos, sizes, out_axis_map.w, packed_dim); - if (any(greaterThanEqual(out_tidx, sizes))) { - return; - } - VEC4_T out_texel; - - // Consider optimizing via W-packing format for t_in and t_weight. - for (int i = 0; i < 4; ++i) { - // Read input tensor for embedding index. - const ivec3 in_lpos = ivec3(out_tidx.y, out_tidx.z * 4 + i, out_tidx.w / 4); - const int in_texel_elem = load_texel_lpos(t_in, in_lpos, in_axis_map)[out_tidx.w % 4]; - - // Read weight tensor for embedding, it is height-packed. - const ivec3 weight_lpos = ivec3(out_tidx.x, in_texel_elem / 4, 0); - out_texel[i] = load_texel_lpos(t_weight, weight_lpos, weight_axis_map)[in_texel_elem % 4]; - } - - write_texel_lpos(t_out, out_lpos, out_texel, out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/embedding.yaml b/backends/vulkan/runtime/graph/ops/glsl/embedding.yaml deleted file mode 100644 index 0e7b491c433..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/embedding.yaml +++ /dev/null @@ -1,12 +0,0 @@ -embedding: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: embedding diff --git a/backends/vulkan/runtime/graph/ops/glsl/expand_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/expand_buffer.glsl deleted file mode 100644 index ce433040b66..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/expand_buffer.glsl +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing.glslh" - -${layout_declare_tensor(B, "w", "t_outp", DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_inp", DTYPE, "buffer")} - -${layout_declare_ubo(B, "BufferMetadata", "outp")} -${layout_declare_ubo(B, "BufferMetadata", "inp")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const uint outp_bufi = gl_GlobalInvocationID.x; - if (outp_bufi >= numel(outp)) { - return; - } - - TensorIndex outp_tidx; - linear_idx_to_tensor_idx(outp, outp_bufi, outp_tidx); - - // Map output tensor index to input tensor index by taking modulo - // with input tensor sizes for each dimension - TensorIndex inp_tidx = outp_tidx; - for (int d = 0; d < ndim(inp); ++d) { - uint inp_size = size_at(inp, d); - uint outp_idx = idx_at(outp_tidx, d); - inp_tidx.data[div_4(d)][mod_4(d)] = outp_idx % inp_size; - } - - const uint inp_bufi = tensor_idx_to_linear_idx(inp, inp_tidx); - // Copy data from input to output - t_outp[outp_bufi] = t_inp[inp_bufi]; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/expand_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/expand_buffer.yaml deleted file mode 100644 index 6d90e1fa8b1..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/expand_buffer.yaml +++ /dev/null @@ -1,10 +0,0 @@ -expand_buffer: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: expand_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/flash_attention_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/flash_attention_buffer.glsl deleted file mode 100644 index 8509fdf1f49..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/flash_attention_buffer.glsl +++ /dev/null @@ -1,227 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define T ${buffer_scalar_type(DTYPE)} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -// Flash Attention inputs: Query, Key, Value tensors -${layout_declare_tensor(B, "rw", "t_O", DTYPE, "buffer")} -${layout_declare_tensor(B, "rw", "t_l", "float", "buffer")} -${layout_declare_tensor(B, "rw", "t_m", "float", "buffer")} -${layout_declare_tensor(B, "r", "t_Q", DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_K", DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_V", DTYPE, "buffer")} - -${layout_declare_ubo(B, "ivec4", "Q_sizes")} // [B, H, N, D] -${layout_declare_ubo(B, "ivec4", "K_sizes")} -${layout_declare_ubo(B, "ivec4", "V_sizes")} -${layout_declare_ubo(B, "ivec4", "O_sizes")} - -${layout_declare_ubo(B, "ivec3", "l_sizes")} // [B, H, N] -${layout_declare_ubo(B, "ivec3", "m_sizes")} // [B, H, N] - -${layout_declare_ubo(B, "float", "scale")} -${layout_declare_ubo(B, "int", "block_size_r")} // Br (num rows in Q block) -${layout_declare_ubo(B, "int", "block_size_c")} // Bc (num cols in K/V block) -${layout_declare_ubo(B, "int", "input_pos")} // Starting position for causal masking -${layout_declare_ubo(B, "int", "num_heads")} // Number of query heads -${layout_declare_ubo(B, "int", "num_kv_heads")} // Number of key/value heads -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -// Maximum block sizes to prevent array overflow -#define MAX_BR 64 -#define MAX_BC 128 - -void main() { - // Each thread processes one row block - const int thread_id = int(gl_GlobalInvocationID.x); - - // Tensor dimensions: Q_sizes = [D, H, N, B] from graph.sizes_ubo() - // The UBO layout is different from the PyTorch tensor layout - const int head_dim = Q_sizes.x; // D (head dim) - const int num_heads = Q_sizes.y; // H (num heads) - const int seq_len = Q_sizes.z; // N (sequence length) - const int batch_size = Q_sizes.w; // B (batch) - - // Block sizes - const int Br = block_size_r; - const int Bc = block_size_c; - - const int Tr = (seq_len + Br - 1) / Br; // Number of row blocks - const int total_row_blocks = batch_size * num_heads * Tr; - - if (thread_id >= total_row_blocks) { - return; - } - - // Decode thread_id to (batch, head, row_block) - const int batch = thread_id / (num_heads * Tr); - const int remaining = thread_id % (num_heads * Tr); - const int head = remaining / Tr; - const int row_block = remaining % Tr; - - // Calculate row range for this block - const int row_start = row_block * Br; - const int row_end = min(row_start + Br, seq_len); - const int actual_Br = row_end - row_start; - - // Base indices for this batch - const int q_base = batch * (seq_len * num_heads * head_dim); - const int k_base = batch * (seq_len * num_heads * head_dim); - const int v_base = batch * (seq_len * num_heads * head_dim); - const int o_base = batch * (seq_len * num_heads * head_dim); - const int lm_base = batch * (seq_len * num_heads); - - // STEP 2: Initialize O = 0, l = 0, m = -inf for this row block - for (int r = 0; r < actual_Br; r++) { - const int seq_pos = row_start + r; - const int lm_idx = lm_base + head * seq_len + seq_pos; - - t_l[lm_idx] = 0.0; - t_m[lm_idx] = -1.0 / 0.0; // -infinity - - for (int dim = 0; dim < head_dim; dim++) { - const int o_idx = o_base + seq_pos * (num_heads * head_dim) + head * head_dim + dim; - t_O[o_idx] = T(0.0); - } - } - - // STEP 5: Outer loop over column blocks (For K, V tensors) - const int Tc = (seq_len + Bc - 1) / Bc; // Number of 
column blocks - for (int j = 0; j < Tc; j++) { - const int col_start = j * Bc; - const int col_end = min(col_start + Bc, seq_len); - const int actual_Bc = col_end - col_start; - - // STEP 6-8 done implicitly below - - // Load current statistics for all rows in this block - float m_i[MAX_BR]; - float l_i[MAX_BR]; - for (int r = 0; r < actual_Br; r++) { - const int seq_pos = row_start + r; - const int lm_idx = lm_base + head * seq_len + seq_pos; - m_i[r] = t_m[lm_idx]; - l_i[r] = t_l[lm_idx]; - } - - // STEP 9: Compute Sij = Qi * Kj^T - T S_block[MAX_BR][MAX_BC]; // Use MAX_BR and MAX_BC constants - float m_tilde_ij[MAX_BR]; // Row maxes (float to match l/m) - float l_tilde_ij[MAX_BR]; // Row sums (float to match l/m) - - // Initialize row statistics - for (int r = 0; r < actual_Br; r++) { - m_tilde_ij[r] = -1.0 / 0.0; // -infinity - l_tilde_ij[r] = 0.0; - } - - // Compute attention scores Sij = Qi @ Kj^T - for (int r = 0; r < actual_Br; r++) { - const int global_row = row_start + r; - for (int c = 0; c < actual_Bc; c++) { - const int global_col = col_start + c; - - // For multi-query attention: map query head to KV head - const int kv_head = (head * num_kv_heads) / num_heads; - - // Dot product: Q[seq_pos, :] · K[col_pos, :] - T score = T(0.0); - for (int dim = 0; dim < head_dim; dim++) { - const int q_idx = q_base + global_row * (num_heads * head_dim) + head * head_dim + dim; - const int k_idx = k_base + global_col * (num_kv_heads * head_dim) + kv_head * head_dim + dim; - score += t_Q[q_idx] * t_K[k_idx]; - } - score *= scale; - - - // Apply causal masking: mask if global_col > global_row + input_pos - if (global_col > global_row + input_pos) { - score = T(-1.0 / 0.0); // Set to negative infinity - } - - S_block[r][c] = score; - - // Track row maximum (after masking) - m_tilde_ij[r] = max(m_tilde_ij[r], float(score)); - } - } - - // STEP 10: Compute P'ij = exp(Sij − m'ij) and l'ij = rowsum(P'ij) - for (int r = 0; r < actual_Br; r++) { - // Handle the case where all scores are -inf (fully masked row) - if (isinf(m_tilde_ij[r]) && m_tilde_ij[r] < 0.0) { - // All scores are -inf, so all probabilities are 0 - for (int c = 0; c < actual_Bc; c++) { - S_block[r][c] = T(0.0); - } - l_tilde_ij[r] = 0.0; - } else { - // Normal case: compute softmax - for (int c = 0; c < actual_Bc; c++) { - S_block[r][c] = exp(S_block[r][c] - T(m_tilde_ij[r])); - l_tilde_ij[r] += float(S_block[r][c]); - } - } - } - - // STEP 11: Softmax update - float m_new_i[MAX_BR]; - float l_new_i[MAX_BR]; - for (int r = 0; r < actual_Br; r++) { - m_new_i[r] = max(m_i[r], m_tilde_ij[r]); - - l_new_i[r] = exp(m_i[r] - m_new_i[r]) * l_i[r] + exp(m_tilde_ij[r] - m_new_i[r]) * l_tilde_ij[r]; - } - - // STEP 12: Update Oi - for (int r = 0; r < actual_Br; r++) { - const int global_row = row_start + r; - float alpha = exp(m_i[r] - m_new_i[r]); - float beta = exp(m_tilde_ij[r] - m_new_i[r]); - - // For multi-query attention: map query head to KV head - const int kv_head = (head * num_kv_heads) / num_heads; - - for (int dim = 0; dim < head_dim; dim++) { - const int o_idx = o_base + global_row * (num_heads * head_dim) + head * head_dim + dim; - - // Compute P'ij @ Vj for this dimension - T pv_sum = T(0.0); - for (int c = 0; c < actual_Bc; c++) { - const int global_col = col_start + c; - const int v_idx = v_base + global_col * (num_kv_heads * head_dim) + kv_head * head_dim + dim; - pv_sum += S_block[r][c] * t_V[v_idx]; - } - - // Check for division by zero before updating output - if (l_new_i[r] <= 0.0) { - t_O[o_idx] = T(0.0); // Set to 
zero to avoid NaN - } else { - // Oi = (alpha * l_i * Oi + beta * P'ij @ Vj) / l_new_i - t_O[o_idx] = (T(alpha) * T(l_i[r]) * t_O[o_idx] + T(beta) * pv_sum) / T(l_new_i[r]); - } - } - } - - // STEP 13: Update li, mi - for (int r = 0; r < actual_Br; r++) { - const int seq_pos = row_start + r; - const int lm_idx = lm_base + head * seq_len + seq_pos; - t_l[lm_idx] = l_new_i[r]; - t_m[lm_idx] = m_new_i[r]; - } - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/flash_attention_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/flash_attention_buffer.yaml deleted file mode 100644 index 795ab906caa..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/flash_attention_buffer.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -flash_attention_buffer: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: float - shader_variants: - - NAME: flash_attention_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/flash_attention_texture3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/flash_attention_texture3d.glsl deleted file mode 100644 index 1f72a583410..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/flash_attention_texture3d.glsl +++ /dev/null @@ -1,332 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define T ${buffer_scalar_type(DTYPE)} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -// Flash Attention inputs: Query, Key, Value tensors using texture storage -${layout_declare_tensor(B, "rw", "t_O", DTYPE, "texture3d")} -${layout_declare_tensor(B, "rw", "t_l", "float", "texture3d")} -${layout_declare_tensor(B, "rw", "t_m", "float", "texture3d")} -${layout_declare_tensor(B, "r", "t_Q", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_K", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_V", DTYPE, "texture3d")} - -${layout_declare_ubo(B, "ivec4", "Q_sizes")} // [B, H, N, D] -${layout_declare_ubo(B, "ivec4", "K_sizes")} -${layout_declare_ubo(B, "ivec4", "V_sizes")} -${layout_declare_ubo(B, "ivec4", "O_sizes")} - -${layout_declare_ubo(B, "ivec3", "l_sizes")} // [B, H, N] -${layout_declare_ubo(B, "ivec3", "m_sizes")} // [B, H, N] - -${layout_declare_ubo(B, "float", "scale")} -${layout_declare_ubo(B, "int", "block_size_r")} // Br (num rows in Q block) -${layout_declare_ubo(B, "int", "block_size_c")} // Bc (num cols in K/V block) -${layout_declare_ubo(B, "int", "input_pos")} // Starting position for causal masking -${layout_declare_ubo(B, "int", "num_heads")} // Number of query heads -${layout_declare_ubo(B, "int", "num_kv_heads")} // Number of key/value heads - -// Axis mapping setup for proper texture indexing -${layout_declare_spec_const(C, "int", "Q_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 Q_axis_map = unhash_axis_map(Q_layout); -const lowp int Q_packed_dim = unhash_packed_dim(Q_layout); - -${layout_declare_spec_const(C, "int", "K_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 K_axis_map = unhash_axis_map(K_layout); -const lowp int K_packed_dim = unhash_packed_dim(K_layout); - -${layout_declare_spec_const(C, 
"int", "V_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 V_axis_map = unhash_axis_map(V_layout); -const lowp int V_packed_dim = unhash_packed_dim(V_layout); - -${layout_declare_spec_const(C, "int", "O_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 O_axis_map = unhash_axis_map(O_layout); -const lowp int O_packed_dim = unhash_packed_dim(O_layout); - -${layout_declare_spec_const(C, "int", "l_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 l_axis_map = unhash_axis_map(l_layout); -const lowp int l_packed_dim = unhash_packed_dim(l_layout); - -${layout_declare_spec_const(C, "int", "m_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 m_axis_map = unhash_axis_map(m_layout); -const lowp int m_packed_dim = unhash_packed_dim(m_layout); - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -// Maximum block sizes to prevent array overflow -#define MAX_BR 64 -#define MAX_BC 128 - -// Texture access helper functions using proper axis mapping -// Q_sizes, K_sizes, V_sizes, O_sizes are [D, H, N, B] (UBO layout) -// l_sizes, m_sizes are [B, H, N] (UBO layout) -T load_tensor_Q(int batch, int seq_pos, int head, int dim) { - ivec4 tidx = ivec4(dim, head, seq_pos, batch); // Match [D, H, N, B] order - ivec3 pos = tidx_to_pos(tidx, Q_sizes, Q_axis_map, Q_packed_dim); - int component = tidx[Q_packed_dim] % 4; - vec4 texel = texelFetch(t_Q, pos, 0); - return T(texel[component]); -} - -T load_tensor_K(int batch, int seq_pos, int head, int dim) { - ivec4 tidx = ivec4(dim, head, seq_pos, batch); // Match [D, H, N, B] order - ivec3 pos = tidx_to_pos(tidx, K_sizes, K_axis_map, K_packed_dim); - int component = tidx[K_packed_dim] % 4; - vec4 texel = texelFetch(t_K, pos, 0); - return T(texel[component]); -} - -T load_tensor_V(int batch, int seq_pos, int head, int dim) { - ivec4 tidx = ivec4(dim, head, seq_pos, batch); // Match [D, H, N, B] order - ivec3 pos = tidx_to_pos(tidx, V_sizes, V_axis_map, V_packed_dim); - int component = tidx[V_packed_dim] % 4; - vec4 texel = texelFetch(t_V, pos, 0); - return T(texel[component]); -} - -T load_tensor_O(int batch, int seq_pos, int head, int dim) { - ivec4 tidx = ivec4(dim, head, seq_pos, batch); // Match [D, H, N, B] order - ivec3 pos = tidx_to_pos(tidx, O_sizes, O_axis_map, O_packed_dim); - int component = tidx[O_packed_dim] % 4; - vec4 texel = imageLoad(t_O, pos); - return T(texel[component]); -} - -void store_tensor_O(int batch, int seq_pos, int head, int dim, T value) { - ivec4 tidx = ivec4(dim, head, seq_pos, batch); // Match [D, H, N, B] order - ivec3 pos = tidx_to_pos(tidx, O_sizes, O_axis_map, O_packed_dim); - int component = tidx[O_packed_dim] % 4; - vec4 texel = imageLoad(t_O, pos); - texel[component] = float(value); - imageStore(t_O, pos, texel); -} - -float load_tensor_l(int batch, int head, int seq_pos) { - ivec4 tidx = ivec4(seq_pos, head, batch, 0); // Match [N, H, B] order (with padding) - ivec3 pos = tidx_to_pos(tidx, ivec4(l_sizes, 1), l_axis_map, l_packed_dim); - int component = tidx[l_packed_dim] % 4; - vec4 texel = imageLoad(t_l, pos); - return texel[component]; -} - -void store_tensor_l(int batch, int head, int seq_pos, float value) { - ivec4 tidx = ivec4(seq_pos, head, batch, 0); // Match [N, H, B] order (with padding) - ivec3 pos = tidx_to_pos(tidx, ivec4(l_sizes, 1), l_axis_map, l_packed_dim); - int component = tidx[l_packed_dim] % 4; - vec4 texel = imageLoad(t_l, pos); - texel[component] = value; - imageStore(t_l, pos, texel); -} - -float load_tensor_m(int batch, int head, int seq_pos) { - ivec4 tidx = ivec4(seq_pos, head, batch, 0); // 
Match [N, H, B] order (with padding) - ivec3 pos = tidx_to_pos(tidx, ivec4(m_sizes, 1), m_axis_map, m_packed_dim); - int component = tidx[m_packed_dim] % 4; - vec4 texel = imageLoad(t_m, pos); - return texel[component]; -} - -void store_tensor_m(int batch, int head, int seq_pos, float value) { - ivec4 tidx = ivec4(seq_pos, head, batch, 0); // Match [N, H, B] order (with padding) - ivec3 pos = tidx_to_pos(tidx, ivec4(m_sizes, 1), m_axis_map, m_packed_dim); - int component = tidx[m_packed_dim] % 4; - vec4 texel = imageLoad(t_m, pos); - texel[component] = value; - imageStore(t_m, pos, texel); - -} - -void main() { - // Each thread processes one row block - same as buffer version - const int thread_id = int(gl_GlobalInvocationID.x); - - // Tensor dimensions: Q_sizes = [D, H, N, B] - const int head_dim = Q_sizes.x; // D (head dim) - const int num_heads_val = Q_sizes.y; // H (num heads) - const int seq_len = Q_sizes.z; // N (sequence length) - const int batch_size = Q_sizes.w; // B (batch) - - // Block sizes - const int Br = block_size_r; - const int Bc = block_size_c; - - const int Tr = (seq_len + Br - 1) / Br; // Number of row blocks - const int total_row_blocks = batch_size * num_heads_val * Tr; - - if (thread_id >= total_row_blocks) { - return; - } - - // Decode thread_id to (batch, head, row_block) - const int batch = thread_id / (num_heads_val * Tr); - const int remaining = thread_id % (num_heads_val * Tr); - const int head = remaining / Tr; - const int row_block = remaining % Tr; - - // Calculate row range for this block - const int row_start = row_block * Br; - const int row_end = min(row_start + Br, seq_len); - const int actual_Br = row_end - row_start; - - // STEP 1: Initialize only this thread's row block - // Each thread initializes its own rows to avoid cross-workgroup synchronization issues - for (int r = 0; r < actual_Br; r++) { - const int seq_pos = row_start + r; - - // Initialize l and m textures for this row block's positions - ivec4 l_tidx = ivec4(batch, head, seq_pos, 0); - ivec3 l_pos = tidx_to_pos(l_tidx, ivec4(l_sizes, 1), l_axis_map, l_packed_dim); - vec4 l_texel = vec4(0.0); - imageStore(t_l, l_pos, l_texel); - - ivec4 m_tidx = ivec4(batch, head, seq_pos, 0); - ivec3 m_pos = tidx_to_pos(m_tidx, ivec4(m_sizes, 1), m_axis_map, m_packed_dim); - vec4 m_texel = vec4(-1e10); - imageStore(t_m, m_pos, m_texel); - - // Initialize output tensor for this row block - for (int dim = 0; dim < head_dim; dim++) { - store_tensor_O(batch, seq_pos, head, dim, T(0.0)); - } - } - - // STEP 5: Outer loop over column blocks (For K, V tensors) - const int Tc = (seq_len + Bc - 1) / Bc; // Number of column blocks - for (int j = 0; j < Tc; j++) { - const int col_start = j * Bc; - const int col_end = min(col_start + Bc, seq_len); - const int actual_Bc = col_end - col_start; - - // Load current statistics for all rows in this block - float m_i[MAX_BR]; - float l_i[MAX_BR]; - for (int r = 0; r < actual_Br; r++) { - const int seq_pos = row_start + r; - m_i[r] = load_tensor_m(batch, head, seq_pos); - l_i[r] = load_tensor_l(batch, head, seq_pos); - } - - // STEP 9: Compute Sij = Qi * Kj^T - T S_block[MAX_BR][MAX_BC]; - float m_tilde_ij[MAX_BR]; // Row maxes - float l_tilde_ij[MAX_BR]; // Row sums - - // Initialize row statistics - for (int r = 0; r < actual_Br; r++) { - m_tilde_ij[r] = -1.0 / 0.0; // -infinity - l_tilde_ij[r] = 0.0; - } - - // Compute attention scores Sij = Qi @ Kj^T - for (int r = 0; r < actual_Br; r++) { - const int global_row = row_start + r; - for (int c = 0; c < actual_Bc; c++) { 
- const int global_col = col_start + c; - - // For multi-query attention: map query head to KV head - const int kv_head = (head * num_kv_heads) / num_heads_val; - - // Dot product: Q[seq_pos, :] · K[col_pos, :] - T score = T(0.0); - for (int dim = 0; dim < head_dim; dim++) { - T q_val = load_tensor_Q(batch, global_row, head, dim); - T k_val = load_tensor_K(batch, global_col, kv_head, dim); - score += q_val * k_val; - } - score *= scale; - - - // Apply causal masking: mask if global_col > global_row + input_pos - bool masked = (global_col > global_row + input_pos); - if (masked) { - score = T(-1.0 / 0.0); // Set to negative infinity - } - - S_block[r][c] = score; - - - // Track row maximum (after masking) - m_tilde_ij[r] = max(m_tilde_ij[r], float(score)); - } - } - - // STEP 10: Compute P'ij = exp(Sij − m'ij) and l'ij = rowsum(P'ij) - for (int r = 0; r < actual_Br; r++) { - // Handle the case where all scores are -inf (fully masked row) - if (isinf(m_tilde_ij[r]) && m_tilde_ij[r] < 0.0) { - // All scores are -inf, so all probabilities are 0 - for (int c = 0; c < actual_Bc; c++) { - S_block[r][c] = 0.0; - } - l_tilde_ij[r] = 0.0; - } else { - // Normal case: compute softmax - for (int c = 0; c < actual_Bc; c++) { - S_block[r][c] = exp(S_block[r][c] - T(m_tilde_ij[r])); - l_tilde_ij[r] += float(S_block[r][c]); - } - } - } - - // STEP 11: Softmax update - float m_new_i[MAX_BR]; - float l_new_i[MAX_BR]; - for (int r = 0; r < actual_Br; r++) { - m_new_i[r] = max(m_i[r], m_tilde_ij[r]); - l_new_i[r] = exp(m_i[r] - m_new_i[r]) * l_i[r] + exp(m_tilde_ij[r] - m_new_i[r]) * l_tilde_ij[r]; - - } - - // STEP 12: Update Oi - for (int r = 0; r < actual_Br; r++) { - const int global_row = row_start + r; - float alpha = exp(m_i[r] - m_new_i[r]); - float beta = exp(m_tilde_ij[r] - m_new_i[r]); - - // For multi-query attention: map query head to KV head - const int kv_head = (head * num_kv_heads) / num_heads_val; - - for (int dim = 0; dim < head_dim; dim++) { - // Compute P'ij @ Vj for this dimension - T pv_sum = T(0.0); - for (int c = 0; c < actual_Bc; c++) { - const int global_col = col_start + c; - T v_val = load_tensor_V(batch, global_col, kv_head, dim); - pv_sum += S_block[r][c] * v_val; - } - - // Check for division by zero before updating output - if (l_new_i[r] <= 0.0) { - store_tensor_O(batch, global_row, head, dim, T(0.0)); - } else { - // Oi = (alpha * l_i * Oi + beta * P'ij @ Vj) / l_new_i - T current_o = load_tensor_O(batch, global_row, head, dim); - T new_o = (T(alpha) * T(l_i[r]) * current_o + T(beta) * pv_sum) / T(l_new_i[r]); - store_tensor_O(batch, global_row, head, dim, new_o); - - } - } - } - - // STEP 13: Update li, mi - for (int r = 0; r < actual_Br; r++) { - const int seq_pos = row_start + r; - store_tensor_l(batch, head, seq_pos, l_new_i[r]); - store_tensor_m(batch, head, seq_pos, m_new_i[r]); - } - - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/flash_attention_texture3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/flash_attention_texture3d.yaml deleted file mode 100644 index 909b8bfd3a9..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/flash_attention_texture3d.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
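
The buffer and texture3d flash_attention shaders above implement the same block-wise online-softmax update; they differ only in addressing (direct buffer indices versus `tidx_to_pos`/`texelFetch` texture access). As a reading aid, here is an illustrative NumPy sketch of that per-row-block update (the STEP 9–13 comments in the shaders). It is not part of the deleted sources: it assumes a single head with no query-to-KV head mapping, and `NEG_INF` stands in for the true `-inf` plus the explicit guards the shaders use.

```python
# Hypothetical reference for the online-softmax row-block update sketched in
# the shaders above; single head, plain NumPy arrays.
import numpy as np

NEG_INF = -1.0e30  # the shaders use a true -inf plus explicit guards


def flash_attention_row_block(Q_block, K, V, Bc, scale, row_start=0, input_pos=0):
    """Q_block: [Br, D] query rows; K, V: [N, D]. Returns O_block: [Br, D]."""
    Br, D = Q_block.shape
    N = K.shape[0]
    O = np.zeros((Br, D))
    l_i = np.zeros(Br)            # running row sums  (t_l in the shaders)
    m_i = np.full(Br, NEG_INF)    # running row maxes (t_m in the shaders)

    for col_start in range(0, N, Bc):            # STEP 5: loop over K/V column blocks
        Kj = K[col_start:col_start + Bc]
        Vj = V[col_start:col_start + Bc]

        S = (Q_block @ Kj.T) * scale             # STEP 9: Sij = Qi @ Kj^T
        rows = row_start + np.arange(Br)[:, None]
        cols = col_start + np.arange(Kj.shape[0])[None, :]
        S = np.where(cols > rows + input_pos, NEG_INF, S)   # causal mask

        m_tilde = S.max(axis=1)                              # block row maxes
        fully_masked = m_tilde <= NEG_INF / 2
        P = np.where(fully_masked[:, None], 0.0,
                     np.exp(S - m_tilde[:, None]))           # STEP 10: P'ij
        l_tilde = P.sum(axis=1)

        m_new = np.maximum(m_i, m_tilde)                     # STEP 11: merge statistics
        alpha = np.exp(m_i - m_new)
        beta = np.exp(m_tilde - m_new)
        l_new = alpha * l_i + beta * l_tilde

        safe_l = np.where(l_new > 0.0, l_new, 1.0)           # STEP 12: rescale + accumulate
        O = (alpha[:, None] * l_i[:, None] * O + beta[:, None] * (P @ Vj)) / safe_l[:, None]
        O = np.where(l_new[:, None] > 0.0, O, 0.0)           # fully-masked rows stay zero

        l_i, m_i = l_new, m_new                              # STEP 13: persist l, m
    return O
```

Each GPU thread in the shaders runs the equivalent of one call to this function for one `(batch, head, row_block)` triple, with the running `l`/`m` statistics stored in the `t_l`/`t_m` tensors instead of local variables.
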
- -flash_attention_texture3d: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: float - shader_variants: - - NAME: flash_attention_texture3d diff --git a/backends/vulkan/runtime/graph/ops/glsl/flip.glsl b/backends/vulkan/runtime/graph/ops/glsl/flip.glsl deleted file mode 100644 index 2291d1b6e4f..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/flip.glsl +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec3", "out_limits")} -${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec4", "dims")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - VEC4_T out_texel = VEC4_T(0); - uint src_x = pos.x; - uint src_y = pos.y; - uint src_z = pos.z; - - int flattened_channels = int(ceil(out_sizes.z / 4.0)); - - // Width - if (dims.x == 1) { - src_x = out_sizes.x - 1 - pos.x; - } - // Height - if (dims.y == 1) { - src_y = out_sizes.y - 1 - pos.y; - } - // Batch - if (dims.w == 1) { - uint n = pos.z / flattened_channels; - uint src_n = out_sizes.w - 1 - n; - uint c4 = pos.z - n * flattened_channels; - src_z = src_n * flattened_channels + c4; - } - - uint prev_src_z = src_z; - for (int p = 0; p < 4; ++p) { - uint src_p = p; - - // Channel - if (dims.z == 1) { - uint nc = (pos.z / flattened_channels) * flattened_channels; - uint c4 = pos.z - nc; - uint c = c4 * 4 + p; - uint src_c = out_sizes.z - 1 - c; - - src_z = (dims.w == 1) - ? prev_src_z - c4 + src_c / 4 // Batch and Channel - : nc + src_c / 4; // Channel only - src_p = src_c % 4; - } - - VEC4_T in_texel = VEC4_T(texelFetch(t_in, ivec3(src_x, src_y, src_z), 0)); - out_texel[p] = in_texel[src_p]; - } - imageStore(t_out, pos, out_texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/flip.yaml b/backends/vulkan/runtime/graph/ops/glsl/flip.yaml deleted file mode 100644 index f5e7c874773..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/flip.yaml +++ /dev/null @@ -1,14 +0,0 @@ -flip: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - - VALUE: int8 - - VALUE: uint8 - - VALUE: int32 - shader_variants: - - NAME: flip diff --git a/backends/vulkan/runtime/graph/ops/glsl/full.glsl b/backends/vulkan/runtime/graph/ops/glsl/full.glsl deleted file mode 100644 index 81f1f182cdf..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/full.glsl +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#define POS ${get_pos[NDIM]("pos")} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec4", "sizes")} -${layout_declare_ubo(B, "float", "fill_value")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim); - - if (any(greaterThanEqual(idx, sizes))) { - return; - } - - VEC4_T outtex = VEC4_T(fill_value); - const int packed_dim_size = sizes[packed_dim]; - int packed_idx = idx[packed_dim]; - - if (packed_idx + 3 >= packed_dim_size) { - ivec4 packed_ind = ivec4(packed_idx) + ivec4(0, 1, 2, 3); - VEC4_T valid_idx = VEC4_T(lessThan(packed_ind, ivec4(packed_dim_size))); - outtex = outtex * valid_idx; - } - - imageStore(t_out, POS, outtex); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/full.yaml b/backends/vulkan/runtime/graph/ops/glsl/full.yaml deleted file mode 100644 index eff78a7938d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/full.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -full: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - PACKING: C_packed - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: full diff --git a/backends/vulkan/runtime/graph/ops/glsl/grid_priors.glsl b/backends/vulkan/runtime/graph/ops/glsl/grid_priors.glsl deleted file mode 100644 index 93a2c53e013..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/grid_priors.glsl +++ /dev/null @@ -1,38 +0,0 @@ -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_ubo(1, "ivec4", "in_sizes")} -${layout_declare_ubo(2, "ivec4", "out_sizes")} -${layout_declare_ubo(3, "int", "stride", "float", "offset")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim); - - if (pos_out_of_bounds(pos, out_sizes, packed_dim)) { - return; - } - int width = in_sizes.x; - VEC4_T outtex; - if (pos.x == 0) { - float value = (pos.y % width + offset) * stride; - outtex = VEC4_T(value, 0, 0, 0); - } else if (pos.x == 1) { - float value = (pos.y / width + offset) * stride; - outtex = VEC4_T(value, 0, 0, 0); - } - - imageStore(t_out, pos, outtex); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/grid_priors.yaml b/backends/vulkan/runtime/graph/ops/glsl/grid_priors.yaml deleted file mode 100644 index 654edca6108..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/grid_priors.yaml +++ /dev/null @@ -1,12 +0,0 @@ -grid_priors: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - PACKING: C_packed - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: grid_priors diff --git 
a/backends/vulkan/runtime/graph/ops/glsl/group_norm_reduce_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/group_norm_reduce_texture.glsl deleted file mode 100644 index 70fdf2bae17..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/group_norm_reduce_texture.glsl +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#include "broadcasting_utils.h" -#include "indexing_utils.h" - -#define PRECISION ${PRECISION} - -#define BUF_T ${buffer_scalar_type(DTYPE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_mean", DTYPE, "buffer")} -${layout_declare_tensor(B, "w", "t_rstd", DTYPE, "buffer")} - -${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")} - -${layout_declare_ubo(B, "ivec4", "mean_strides")} -${layout_declare_ubo(B, "int", "mean_numel")} -${layout_declare_ubo(B, "ivec3", "in_limits")} -${layout_declare_ubo(B, "ivec4", "in_sizes")} - -layout(push_constant) uniform PRECISION restrict Block { - int group; - float epsilon; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "mean_layout", "DEFAULT_DIM_ORDER")} -const lowp ivec4 mean_dim_order = unhash_dim_order(mean_layout); - -#define LOCAL_WORK_GROUP_SIZE 64 -shared float shared_sum[LOCAL_WORK_GROUP_SIZE]; -shared float shared_sum_sq[LOCAL_WORK_GROUP_SIZE]; - -/* - * Computes the mean and standard deviation of one group of channels of the - * input tensor for the group normalization operator. - * - * Given a tensor of shape [W, H, C, N] the mean and standard deviation tensors - * will have a shape of [G, N] where G = C / group. - * - * The input tensor is assumed to be a channels-packed texture tensor with the - * standard axis mapping. The output tensors are assumed to be contiguous buffer - * tensors. - * - * Algorithm: - * 1. Each shader invocation corresponds to one group in one batch - * 2. The local work group cooperatively reduces over all spatial locations (H×W) - * and all channels within the group (C/group channels) - * 3. Uses shared memory for efficient parallel reduction - * 4. Main thread (local ID 0) writes the final mean and rstd to buffer - * - * Global work group size: {N, 1, 1} - * N is the number of elements in the tensor buffer; each thread computes one - * output element. - * - * Local work group size: {1, float, 1} - * float should be a power of 2, recommended 64 or 128 threads. This allows - * efficient tree-based reduction in shared memory. Each local group will - * cooperate to compute the output element. - * - * Each shader invocation will compute the mean and standard deviation for one - * channel group in the input, and write out the corresponding result. 
- */ -void group_norm_reduce_C_packed() { - const int global_idx = int(gl_GlobalInvocationID.x); - const int local_idx = int(gl_LocalInvocationID.y); - - // Calculate group dimensions - const int D = in_sizes.z / group; // channels per group - const int HxW = in_sizes.y * in_sizes.x; // spatial size - const int group_size = D * HxW; // total elements per group - - // Convert global index to (group_idx, batch_idx) - const ivec4 mean_tidx = bufi_to_tidx(global_idx, mean_strides, mean_dim_order); - - // Initialize local sums - float local_sum = 0.0; - float local_sum_sq = 0.0; - int local_count = 0; - - // Calculate the range of channels for this group - const int group_start_channel = mean_tidx.x * D; - const int group_end_channel = group_start_channel + D; - - // Calculate the range of texels that contain channels from this group - const int start_texel_idx = group_start_channel / 4; - const int end_texel_idx = divup4(group_end_channel); - const int texels_in_group = end_texel_idx - start_texel_idx; - - // Total texels to process across all spatial locations - const int total_texels = texels_in_group * HxW; - - // Each thread processes a subset of texels - const int texels_per_thread = (total_texels + LOCAL_WORK_GROUP_SIZE - 1) / LOCAL_WORK_GROUP_SIZE; - const int start_texel = local_idx * texels_per_thread; - const int end_texel = min(start_texel + texels_per_thread, total_texels); - - // Process assigned texels - for (int texel_idx = start_texel; texel_idx < end_texel; texel_idx++) { - // Convert texel index to spatial and channel coordinates - const int spatial_idx = texel_idx / texels_in_group; - const int texel_in_group = texel_idx % texels_in_group; - - // Convert to spatial coordinates - const int w = spatial_idx % in_sizes.x; - const int h = spatial_idx / in_sizes.x; - - // Calculate the global texel index - const int global_texel_idx = start_texel_idx + texel_in_group; - - // Convert to texture position using default axis mapping - ivec3 tex_pos = ivec3(w, h, global_texel_idx); - - // Adjust for batch dimension if needed - if (in_sizes.w > 1) { - // default axis mapping means channels is the batch concat dim - tex_pos.z += mean_tidx.y * divup4(in_sizes.z); - } - - // Check bounds and load texel - if (all(lessThan(tex_pos, in_limits))) { - const vec4 texel_val = load_texel(t_in, tex_pos); - - // Process all components of the texel that belong to this group - const int texel_start_channel = global_texel_idx * 4; - for (int comp = 0; comp < 4; comp++) { - const int current_channel = texel_start_channel + comp; - - // Check if this component belongs to the current group - if (current_channel >= group_start_channel && current_channel < group_end_channel) { - const float val = texel_val[comp]; - local_sum += val; - local_sum_sq += val * val; - local_count++; - } - } - } - } - - // Store local results in shared memory - shared_sum[local_idx] = local_sum; - shared_sum_sq[local_idx] = local_sum_sq; - - // Synchronize threads - memoryBarrierShared(); - barrier(); - - // Perform tree-based reduction in shared memory - for (int stride = LOCAL_WORK_GROUP_SIZE / 2; stride > 0; stride /= 2) { - if (local_idx < stride) { - shared_sum[local_idx] += shared_sum[local_idx + stride]; - shared_sum_sq[local_idx] += shared_sum_sq[local_idx + stride]; - } - memoryBarrierShared(); - barrier(); - } - - // Main thread writes the result - if (local_idx == 0 && global_idx < mean_numel) { - const float total_sum = shared_sum[0]; - const float total_sum_sq = shared_sum_sq[0]; - const float count = 
float(group_size); - - // Calculate mean and reciprocal standard deviation - const float mean_val = total_sum / count; - const float variance = (total_sum_sq / count) - (mean_val * mean_val); - const float rstd_val = 1.0 / sqrt(variance + epsilon); - - // Write to buffer-backed tensors - t_mean[global_idx] = BUF_T(mean_val); - t_rstd[global_idx] = BUF_T(rstd_val); - } -} - -void main() { - group_norm_reduce_C_packed(); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/group_norm_reduce_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/group_norm_reduce_texture.yaml deleted file mode 100644 index 00c357a1d6e..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/group_norm_reduce_texture.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -group_norm_reduce_texture: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: group_norm_reduce_texture diff --git a/backends/vulkan/runtime/graph/ops/glsl/group_norm_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/group_norm_texture.glsl deleted file mode 100644 index 8440481963a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/group_norm_texture.glsl +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#include "broadcasting_utils.h" -#include "indexing_utils.h" - -#define PRECISION ${PRECISION} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} - -${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_weight", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_bias", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_mean", DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_rstd", DTYPE, "buffer")} - -${layout_declare_ubo(B, "ivec3", "out_limits")} -${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec3", "weight_limits")} -${layout_declare_ubo(B, "ivec4", "mean_strides")} - -layout(push_constant) uniform PRECISION restrict Block { - int group; - float epsilon; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -/* - * Applies group normalization to t_in, and write the results to t_out. The mean - * and rstd of the input tensor are precomputed and passed in as t_mean and - * t_rstd. - * - * Given an input tensor t_in of shape [N, C, H, W], the mean and rstd will have - * shape [N, C / ngroup], and the output will have the same shape as t_in. The - * weight and bias tensor will have a shape of [C]. - * - * In this implementation, the input and output tensors are assumed to be - * channels packed textures with standard axis mapping. - * - * The weight and bias tensors are assumed to be width packed textures with - * standard axis mapping. - * - * The mean and rstd tensors are assumed to be contiguous buffer-backed tensors. 
- */ -void apply_group_norm() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - // Check bounds - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - // Convert texture position to tensor coordinates using default axis mapping - // and channels packing - ivec4 out_tidx = ivec4(pos.x, pos.y, mul4(pos.z), 0); - - // Handle batch dimension if batches > 1 - if (out_sizes.w > 1) { - const int C_aligned = alignup4(out_sizes.z); - // default axis mapping means channels is the batch concatenation dim - const int batch_idx = out_tidx.z / C_aligned; - out_tidx.w = batch_idx; - out_tidx.z = out_tidx.z % C_aligned; - } - - // Load input texel (contains 4 consecutive channels) - const vec4 input_texel = load_texel(t_in, pos); - - // Load weight and bias texels, which are width-packed; each element along the - // width dim corresponds to a channel in the input tensor. - const ivec3 weight_pos = ivec3(out_tidx.z / 4, 0, 0); - const vec4 weight_texel = load_texel(t_weight, weight_pos); - const vec4 bias_texel = load_texel(t_bias, weight_pos); - - // Calculate which channels this texel represents - // For channels-packed layout: texel at position z contains channels [z, z+1, z+2, z+3] - const int base_channel = out_tidx.z; - - // Calculate buffer indices for mean/rstd lookup - // Mean/rstd tensors have shape [G, N] where G = C/group - const int batch_idx = out_tidx.w; - const int channels_per_group = out_sizes.z / group; - - vec4 bias; - // Process each element of the output texel individually, since each element - // may belong to a different channel group - for (int i = 0; i < 4; ++i) { - const int channel_idx = base_channel + i; - // Handle case where padding channels are added - if (channel_idx >= out_sizes.z) { - bias[i] = input_texel[i]; - continue; - } - - // Calculate group index for this channel - const int group_idx = channel_idx / channels_per_group; - - // Create tensor index for mean/rstd buffer access - const ivec4 mean_tidx = ivec4(group_idx, batch_idx, 0, 0); - const int mean_bufi = tidx_to_bufi(mean_tidx, mean_strides); - - // Load mean and rstd values for this channel - const float mean_val = t_mean[mean_bufi]; - const float rstd_val = t_rstd[mean_bufi]; - - // Apply group normalization with weight and bias: ((input - mean) * rstd) * weight + bias - const float normalized = (input_texel[i] - mean_val) * rstd_val; - bias[i] = normalized * weight_texel[i] + bias_texel[i]; - } - - // Write result to output texture - write_texel(t_out, pos, bias); -} - -void main() { - apply_group_norm(); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/group_norm_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/group_norm_texture.yaml deleted file mode 100644 index b50853be3b0..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/group_norm_texture.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
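
For orientation, the two group-norm shaders above split the work into a per-(batch, group) mean/rstd reduction followed by a per-channel affine normalization. The NumPy sketch below is illustrative only (not from the deleted files); it ignores the texel packing and buffer layouts and simply shows the math the pair computes, using the same E[x²] − E[x]² variance formulation as the reduction shader.

```python
# Hypothetical reference for the reduce + apply pair of group_norm shaders above.
import numpy as np


def group_norm_reference(x, weight, bias, group, epsilon):
    """x: [N, C, H, W]; weight, bias: [C]. Returns (out, mean, rstd) with mean/rstd per (batch, group)."""
    N, C, H, W = x.shape
    D = C // group                                   # channels per group
    xg = x.reshape(N, group, D * H * W)
    mean = xg.mean(axis=2)                           # [N, G], as in group_norm_reduce_texture
    var = (xg * xg).mean(axis=2) - mean * mean       # E[x^2] - E[x]^2
    rstd = 1.0 / np.sqrt(var + epsilon)              # [N, G]

    # Broadcast the per-group statistics back to per-channel values and apply
    # the affine transform, as in group_norm_texture.
    m = mean[:, :, None].repeat(D, axis=2).reshape(N, C, 1, 1)
    r = rstd[:, :, None].repeat(D, axis=2).reshape(N, C, 1, 1)
    out = (x - m) * r * weight.reshape(1, C, 1, 1) + bias.reshape(1, C, 1, 1)
    return out, mean, rstd
```

In the shaders, the reduction additionally tiles the group's texels across a local workgroup and tree-reduces the partial sums in shared memory before thread 0 writes the mean/rstd buffers consumed by the apply shader.
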
- -group_norm_texture: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: group_norm_texture diff --git a/backends/vulkan/runtime/graph/ops/glsl/im2col.glsl b/backends/vulkan/runtime/graph/ops/glsl/im2col.glsl deleted file mode 100644 index f045d4e9702..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/im2col.glsl +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#extension GL_EXT_debug_printf : enable - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, INPUT_STORAGE)} -#define T ${texel_load_component_type(DTYPE, INPUT_STORAGE)} - -$if OUTPUT_STORAGE == "buffer": - #define OUTPUT_BUFFER -$if INPUT_STORAGE == "buffer": - #define INPUT_BUFFER - -#define TILE_M4 1 -#define TILE_N4 1 -#define TILE_K4 1 - -#define TILE_M 4 -#define TILE_N 4 -#define TILE_K 4 - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "conv2d_common.glslh" - -${layout_declare_tensor(B, "w", "t_output", DTYPE, OUTPUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)} - -// Sizes of the im2col matrix of the convolution input -${layout_declare_ubo(B, "ivec4", "matrix_sizes")} -// Sizes of the input image -${layout_declare_ubo(B, "ivec4", "input_sizes")} -// Sizes of the output image -${layout_declare_ubo(B, "ivec4", "output_sizes")} - -${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#include "conv2d_fp_im2col_block_load.glslh" - -#ifdef OUTPUT_BUFFER - -void write_tile( - const FPInputTile in_tile, - const int k4, - const int m_start, - const int K4) { - [[unroll]] for (int m = 0; m < TILE_M; m++) { - t_output[(m_start + m) * K4 + k4] = in_tile.data[m][0]; - } -} - -#else // OUTPUT_TEXTURE - -void write_tile( - const FPInputTile in_tile, - const int k4, - const int m_start, - const int K4) { - [[unroll]] for (int m = 0; m < TILE_M; m++) { - imageStore(t_output, ivec3(k4, m_start + m, 0), vec4(in_tile.data[m][0])); - } -} - -#endif // OUTPUT_BUFFER - -void main() { - // Each thread writes out a 4 wide x 4 high block of the output matrix. The - // thread position corresponds to the block index. - const int k4 = int(gl_GlobalInvocationID.x); - const int m4 = int(gl_GlobalInvocationID.y); - - // Convert block idx to tensor idx - const int k = mul_4(k4); - const int m = mul_4(m4); - - const int in_channels_per_group = input_sizes.z / conv2d_params.groups; - - // Logical K dim size (unpadded) - const int logical_K = conv2d_params.logical_K; - // Physical K dim, which contains padding elements - const int K = matrix_sizes.x; - - // M dim, which represents the number of flattened output width, height, - // batches. Unlike K, there is no difference between the physical and logical - // sizes. 
- const int M = matrix_sizes.y; - - if (k >= K || m >= M) { - return; - } - - FPInputTile in_tile; - load_input_im2col_tile(in_tile, k4, m4, logical_K, M); - - // Number of texels in the x dim of the output matrix - const int K4 = div_4(K); - write_tile(in_tile, k4, m, K4); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/im2col.yaml b/backends/vulkan/runtime/graph/ops/glsl/im2col.yaml deleted file mode 100644 index dd486b0e1a6..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/im2col.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -im2col: - parameter_names_with_default_values: - DTYPE: float - OUTPUT_STORAGE: buffer - INPUT_STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: float - shader_variants: - - NAME: im2col_buffer_texture3d - - NAME: im2col_texture3d_texture3d - OUTPUT_STORAGE: texture3d diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl deleted file mode 100644 index d7bef9f0163..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define BUF_T ${buffer_scalar_type(DTYPE)} -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} - -${define_active_storage_type(STORAGE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_buffer(B, "w", "buf_out", DTYPE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -$if USE_PUSH_CONST: - layout(push_constant) uniform restrict Block { - ivec4 sizes; - $if not TO_STAGING: - ivec4 buf_strides; - }; -$else: - ${layout_declare_ubo(B, "ivec4", "sizes")} - $if not TO_STAGING: - ${layout_declare_ubo(B, "ivec4", "buf_strides")} - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 axis_map = unhash_axis_map(t_layout); -const lowp int packed_dim = unhash_packed_dim(t_layout); - -void write_out_texel(VEC4_T texel, ivec4 tidx) { - $if TO_STAGING: - const ivec4 buf_indices = tidx_to_nchwi(tidx, sizes, packed_dim); - $else: - const ivec4 buf_indices = tidx_to_4bufi(tidx, buf_strides, packed_dim); - - if (tidx[packed_dim] < sizes[packed_dim]) { - buf_out[buf_indices.x] = BUF_T(texel.x); - } - if (tidx[packed_dim] + 1 < sizes[packed_dim]) { - buf_out[buf_indices.y] = BUF_T(texel.y); - } - if (tidx[packed_dim] + 2 < sizes[packed_dim]) { - buf_out[buf_indices.z] = BUF_T(texel.z); - } - if (tidx[packed_dim] + 3 < sizes[packed_dim]) { - buf_out[buf_indices.w] = BUF_T(texel.w); - } -} - -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - const ivec4 tidx = lpos_to_tidx(lpos, sizes, axis_map.w, packed_dim); - - if (any(greaterThanEqual(tidx, sizes))) { - return; - } - - const VEC4_T intex = load_texel(t_in, lpos_to_pos(lpos, axis_map)); - write_out_texel(intex, tidx); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml deleted file mode 100644 index 
646d8f1be81..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.yaml +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -image_to_nchw: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - TO_STAGING: True - USE_PUSH_CONST: True - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - - VALUE: int8 - - VALUE: uint8 - - VALUE: int32 - shader_variants: - - NAME: image_to_nchw_texture3d - - NAME: image_to_nchw_texture2d - STORAGE: texture2d - - NAME: clone_image_to_buffer - TO_STAGING: False - - NAME: image_to_nchw_no_pc_texture3d - USE_PUSH_CONST: False - - NAME: image_to_nchw_no_pc_texture2d - STORAGE: texture2d - USE_PUSH_CONST: False - - NAME: clone_image_to_buffer_no_pc - TO_STAGING: False - USE_PUSH_CONST: False diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_select.glsl b/backends/vulkan/runtime/graph/ops/glsl/index_select.glsl deleted file mode 100644 index 4500d43b932..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/index_select.glsl +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_tensor(2, "r", "t_idx", "int", STORAGE)} -${layout_declare_ubo(3, "ivec4", "sizes")} -${layout_declare_ubo(4, "int", "gpu_dim", "int", "stride")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -void main() { - const ivec3 out_pos = ivec3(gl_GlobalInvocationID); - - if (pos_out_of_bounds(out_pos, sizes, packed_dim)) { - return; - } - - const int out_idx = out_pos[gpu_dim] / stride; - const int within_stride = out_pos[gpu_dim] % stride; - const int in_idx = texelFetch(t_idx, ivec3(out_idx, 0, 0), 0).x; - - ivec3 in_pos = out_pos; - in_pos[gpu_dim] = in_idx * stride + within_stride; - - imageStore(t_out, out_pos, texelFetch(t_in, in_pos, 0)); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_select.yaml b/backends/vulkan/runtime/graph/ops/glsl/index_select.yaml deleted file mode 100644 index abef2225cd9..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/index_select.yaml +++ /dev/null @@ -1,12 +0,0 @@ -index_select: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: index_select diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl deleted file mode 100644 index 76ec540838c..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.glsl +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_tensor(2, "r", "t_idx", "int", STORAGE)} -${layout_declare_ubo(3, "ivec4", "out_sizes")} -${layout_declare_ubo(4, "ivec4", "in_sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -void main() { - const ivec3 out_pos = ivec3(gl_GlobalInvocationID); - - if (pos_out_of_bounds(out_pos, out_sizes, packed_dim)) { - return; - } - - const ivec4 idx = to_tensor_idx(out_pos, out_sizes, packed_dim); - const ivec4 buffer_ixs = tidx_to_nchwi(idx, out_sizes, packed_dim); - - VEC4_T out_texel; - for (int i = 0; i < 4; ++i) { - const ivec4 out_tidx = nchwi_to_tidx(buffer_ixs[i], out_sizes); - int out_channel = out_tidx.z; - int in_channel = texelFetch(t_idx, ivec3(out_channel, 0, 0), 0).x; - - ivec4 in_tidx = out_tidx; - in_tidx.z = in_channel; - - ivec4 in_elem_pos = to_texture_elem_pos(in_tidx, in_sizes, packed_dim); - - VEC4_T in_texel = texelFetch(t_in, in_elem_pos.xyz, 0); - - out_texel[i] = in_texel[in_elem_pos.w]; - } - imageStore(t_out, out_pos, out_texel); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.yaml b/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.yaml deleted file mode 100644 index a306e3ce47d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/index_select_channel.yaml +++ /dev/null @@ -1,12 +0,0 @@ -index_select_channel: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: index_select_channel diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh deleted file mode 100644 index 81783422ab4..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#ifndef INDEXING_GLSLH -#define INDEXING_GLSLH - -#define DIMLIMIT 8 -#define DIMLIMIT_DIV4 2 - -#define mul_4(x) ((x) << 2) -#define div_4(x) ((x) >> 2) - -#define mod_4(x) ((x) & 3) - -// -// BufferMetadata -// - -struct BufferMetadata { - uvec4 sizes[DIMLIMIT_DIV4]; - uvec4 dim_order[DIMLIMIT_DIV4]; - uvec4 strides[DIMLIMIT_DIV4]; - uvec2 ndim_numel; -}; - -uint ndim(const BufferMetadata meta) { - return meta.ndim_numel[0]; -} - -int int_ndim(const BufferMetadata meta) { - return int(meta.ndim_numel[0]); -} - -uint numel(const BufferMetadata meta) { - return meta.ndim_numel[1]; -} - -uint dim_order_at(const BufferMetadata meta, const int dim) { - return meta.dim_order[div_4(dim)][mod_4(dim)]; -} - -uint dim_order_at(const BufferMetadata meta, const uint dim) { - return meta.dim_order[div_4(dim)][mod_4(dim)]; -} - -uint stride_at(const BufferMetadata meta, const int dim) { - return meta.strides[div_4(dim)][mod_4(dim)]; -} - -uint stride_at(const BufferMetadata meta, const uint dim) { - return meta.strides[div_4(dim)][mod_4(dim)]; -} - -uint size_at(const BufferMetadata meta, const int dim) { - return meta.sizes[div_4(dim)][mod_4(dim)]; -} - -uint size_at(const BufferMetadata meta, const uint dim) { - return meta.sizes[div_4(dim)][mod_4(dim)]; -} - -bool are_equal(const BufferMetadata meta1, const BufferMetadata meta2) { - // sizes and strides must be the same to be considered equal - if (meta1.sizes[0] != meta2.sizes[0]) { - return false; - } - if (meta1.sizes[1] != meta2.sizes[1]) { - return false; - } - if (meta1.strides[0] != meta2.strides[0]) { - return false; - } - if (meta1.strides[1] != meta2.strides[1]) { - return false; - } - return true; -} - -// -// TensorIndex -// - -struct TensorIndex { - uvec4 data[DIMLIMIT_DIV4]; -}; - -void initialize(out TensorIndex tidx) { - tidx.data[0] = uvec4(0); - tidx.data[1] = uvec4(0); -} - -uint idx_at(const TensorIndex tidx, const int dim) { - return tidx.data[div_4(dim)][mod_4(dim)]; -} - -void permute(inout TensorIndex tidx, const ivec4 permute_order[DIMLIMIT_DIV4]) { - TensorIndex new_tidx = tidx; - for (int d = 0; d < DIMLIMIT; ++d) { - int src_dim = permute_order[div_4(d)][mod_4(d)]; - new_tidx.data[div_4(d)][mod_4(d)] = idx_at(tidx, src_dim); - } - tidx = new_tidx; -} - -// -// Index Conversions -// - -void contiguous_idx_to_tensor_idx( - const BufferMetadata meta, - uint contiguous_idx, - out TensorIndex tidx) { - initialize(tidx); - int dim = int_ndim(meta); - int i = 0; - - uint contiguous_strides[DIMLIMIT]; - contiguous_strides[0] = 1; - for (int d = 1; d < DIMLIMIT; ++d) { - contiguous_strides[d] = size_at(meta, d - 1) * contiguous_strides[d - 1]; - } - - for (int d = max(dim - 1, 0); d >= 0; d--) { - uint dim_stride = contiguous_strides[d]; - - tidx.data[div_4(d)][mod_4(d)] = contiguous_idx / dim_stride; - contiguous_idx = contiguous_idx % dim_stride; - } -} - -uint tensor_idx_to_contiguous_idx( - const BufferMetadata meta, - const TensorIndex tidx) { - uint contiguous_strides[DIMLIMIT]; - contiguous_strides[0] = 1; - for (int d = 1; d < DIMLIMIT; ++d) { - contiguous_strides[d] = size_at(meta, d - 1) * contiguous_strides[d - 1]; - } - - uint contig_idx = 0; - for (int d = 0; d < ndim(meta); ++d) { - contig_idx += contiguous_strides[d] * idx_at(tidx, d); - } - return contig_idx; -} - -void linear_idx_to_tensor_idx( - const BufferMetadata meta, - uint linear_idx, - out TensorIndex tidx) { - initialize(tidx); - int dim = int_ndim(meta); - int i = 0; - for (int d = max(dim - 1, 0); d >= 0; d--) { - uint dim_idx = 
dim_order_at(meta, d); - uint dim_stride = stride_at(meta, dim_idx); - - tidx.data[div_4(dim_idx)][mod_4(dim_idx)] = linear_idx / dim_stride; - linear_idx = linear_idx % dim_stride; - } -} - -uint tensor_idx_to_linear_idx( - const BufferMetadata meta, - const TensorIndex tidx) { - uint lin_idx = 0; - for (int d = 0; d < ndim(meta); ++d) { - lin_idx += stride_at(meta, d) * idx_at(tidx, d); - } - return lin_idx; -} - -void clamp_tensor_idx(const BufferMetadata meta, inout TensorIndex tidx) { - tidx.data[0] = min(tidx.data[0], meta.sizes[0] - 1); - tidx.data[1] = min(tidx.data[1], meta.sizes[1] - 1); -} - -// -// Debug utilities -// - -#ifdef DEBUG_MODE - -void printTensorIndex(const TensorIndex tidx) { - debugPrintfEXT( - "TensorIndex: tidx=[%u %u %u %u %u %u %u %u]\\n", - tidx.data[0][0], tidx.data[0][1], tidx.data[0][2], tidx.data[0][3], - tidx.data[1][0], tidx.data[1][1], tidx.data[1][2], tidx.data[1][3] - ); -} - -void printBufferMetadata(const BufferMetadata meta) { - debugPrintfEXT( - "BufferMetadata: ndim=%u numel=%u\\n sizes=[%u %u %u %u %u %u %u %u]\\n dim_order=[%u %u %u %u %u %u %u %u]\\n strides=[%u %u %u %u %u %u %u %u]\\n", - meta.ndim_numel[0], meta.ndim_numel[1], - meta.sizes[0][0], meta.sizes[0][1], meta.sizes[0][2], meta.sizes[0][3], - meta.sizes[1][1], meta.sizes[1][1], meta.sizes[1][2], meta.sizes[1][3], - meta.dim_order[0][0], meta.dim_order[0][1], - meta.dim_order[0][2], meta.dim_order[0][3], - meta.dim_order[1][0], meta.dim_order[1][1], - meta.dim_order[1][2], meta.dim_order[1][3], - meta.strides[0][0], meta.strides[0][1], - meta.strides[0][2], meta.strides[0][3], - meta.strides[1][1], meta.strides[1][1], - meta.strides[1][2], meta.strides[1][3] - ); -} - -#endif - -#endif // INDEXING_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h deleted file mode 100644 index fdb6f514a3e..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ /dev/null @@ -1,417 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef INDEXING_UTILS_H -#define INDEXING_UTILS_H - -/* - * The functions defined in this header file use the following shorthand to - * represent tensor related data structures. - * - * tidx - ivec4 tensor indices, listed in WHCN order. - * - * pos - ivec3 texel position, used to fetch from an image texture via the - * texelFetch(image, pos, lod) GLSL function. - * posi - ivec4 texel element position. It is the same as pos, except with an - * additional component of the index of an element within the texel. - * lpos - ivec3 logical position, listed in WHC order. This is a permutation of - * texture position based on a tensor's axis_map. lpos.x is the position - * component that corresponds to the tensor's width dimension, lpos.y is - * the position component that corresponds to the tensor's height dim, - * and so on. - * - * bufi - int index into a GPU buffer that backs a tensor. - * nchwi - int index into a staging buffer for a tensor. The data in the - * staging buffer is stored in contiguous data layout, irrespective of - * the tensor's strides. 
- */ - -// Width Dim Index, assuming WHCN order -#define W_DIM 0 -// Height, assuming WHCN order -#define H_DIM 1 -// Channels, assuming WHCN order -#define C_DIM 2 - -/* - * Fast division by 4 using bit shifting - */ -#define div4(x) ((x) >> 2) - -/* - * Fast multiplication by 4 using bit shifting - */ -#define mul4(x) ((x) << 2) - -/* - * Divides input and rounds up to 4 - */ -#define divup4(x) (((x) + 3) >> 2) - -/* - * Divides input by denominator and rounds up - */ -#define divup(x, d) (((x) + (d) - 1) / (d)) - -/* - * Aligns input to the next multiple of 4 - */ -#define alignup4(x) (((x) + 3) & -4) - -/* - * Fast modulo by 4 using bit masking - */ -#define mod4(x) ((x) & 3) - -#define ALIGN_DOWN_4(x) ((x) & ~3) - -#define ALIGN_UP_4(x) (((x) + 3) & ~3) - -#define DIV_UP_8(x) (((x) + 7) >> 3) -#define DIV_UP_4(x) (((x) + 3) >> 2) - -#define DIV_4(x) ((x) >> 2) -#define DIV_2(x) ((x) >> 1) - -#define MUL_8(x) ((x) << 3) -#define MUL_4(x) ((x) << 2) -#define MUL_2(x) ((x) << 1) - -/* - * Get the staging buffer indices that contain the data of the texel that - * corresponds to the provided tensor index. Since the texel have 4 elements, - * 4 buffer indices will be retrieved. - */ -ivec4 tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes, const int packed_dim) { - ivec4 strides = - ivec4(1, sizes.x, sizes.x * sizes.y, sizes.x * sizes.y * sizes.z); - - int base_i = tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z + - tidx.w * strides.w; - - return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim]; -} - -/* - * Get the buffer indices that contain the data of the texel that corresponds to - * to the provided tensor index. Since the texel have 4 elements, 4 buffer - * indices will be retrieved. - */ -ivec4 tidx_to_4bufi( - const ivec4 tidx, - const ivec4 strides, - const int packed_dim) { - int base_i = tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z + - tidx.w * strides.w; - - return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim]; -} - -/* - * Given a buffer index to a contiguous tensor and the tensor's sizes, return - * the tensor index that corresponds to the buffer index. 
- */ -ivec4 nchwi_to_tidx(const int nchwi, const ivec4 sizes) { - const int nchwi_div_x = nchwi / sizes.x; - const int nchwi_div_y = nchwi_div_x / sizes.y; - return ivec4( - nchwi % sizes.x, - nchwi_div_x % sizes.y, - nchwi_div_y % sizes.z, - nchwi_div_y / sizes.z); -} - -int tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes) { - const int sizes_xy = sizes.x * sizes.y; - return tidx.w * sizes_xy * sizes.z + tidx.z * sizes_xy + tidx.y * sizes.x + - tidx.x; -} - -ivec4 bufi_to_tidx(int bufi, const ivec4 strides, const ivec4 dim_order) { - ivec4 idx; - for (int i = 3; i >= 0; i--) { - int dim = dim_order[i]; - idx[dim] = bufi / strides[dim]; - bufi %= strides[dim]; - } - return idx; -} - -/* - * bufi_to_tidx but assumes that the tensor is contiguous - */ -ivec4 contiguous_bufi_to_tidx(int bufi, const ivec4 strides) { - ivec4 idx; - for (int i = 3; i >= 0; i--) { - idx[i] = bufi / strides[i]; - bufi %= strides[i]; - } - return idx; -} - -int tidx_to_bufi(const ivec4 tidx, ivec4 strides) { - return tidx.x * strides.x + tidx.y * strides.y + tidx.z * strides.z + - tidx.w * strides.w; -} - -ivec4 lpos_to_tidx( - ivec3 lpos, - ivec4 sizes, - const int batch_inner_dim, - const int packed_dim) { - // Align packed dim to next multiple of 4 to account for texel padding - sizes[packed_dim] = alignup4(sizes[packed_dim]); - // Moving 1 texel along the packed dim traverses 4 tensor elements - lpos[packed_dim] *= 4; - - ivec4 tidx = ivec4(lpos, 0); - - if (sizes.w > 1) { - tidx.w = tidx[batch_inner_dim] / sizes[batch_inner_dim]; - tidx[batch_inner_dim] %= sizes[batch_inner_dim]; - } - return tidx; -} - -ivec3 tidx_to_lpos( - ivec4 tidx, - ivec4 sizes, - const int batch_inner_dim, - const int packed_dim) { - // Align packed dim to next multiple of 4 to account for texel padding - sizes[packed_dim] = alignup4(sizes[packed_dim]); - - ivec3 lpos = tidx.xyz; - - // Adjust batch inner dim by batch index if needed - if (sizes.w > 1) { - lpos[batch_inner_dim] += tidx.w * sizes[batch_inner_dim]; - } - // Fast division by 4, since moving 1 texel along the packed dim traverses 4 - // tensor elements. - lpos[packed_dim] >>= 2; - return lpos; -} - -ivec3 tidx_to_pos( - ivec4 tidx, - ivec4 sizes, - const ivec4 axis_map, - const int packed_dim) { - // Align packed dim to next multiple of 4 to account for texel padding - sizes[packed_dim] = alignup4(sizes[packed_dim]); - - ivec3 pos; - for (int dim = 0; dim < 3; ++dim) { - pos[axis_map[dim]] = tidx[dim]; - } - - // Adjust batch inner dim by batch index if needed - if (sizes.w > 1) { - pos[axis_map[axis_map.w]] += tidx.w * sizes[axis_map.w]; - } - // Fast division by 4, since moving 1 texel along the packed dim traverses 4 - // tensor elements. 
- pos[axis_map[packed_dim]] >>= 2; - return pos; -} - -ivec4 tidx_to_posi( - ivec4 tidx, - ivec4 sizes, - const ivec4 axis_map, - const int packed_dim) { - return ivec4( - tidx_to_pos(tidx, sizes, axis_map, packed_dim), tidx[packed_dim] % 4); -} - -ivec3 lpos_to_pos(const ivec3 lpos, const ivec4 axis_map) { - ivec3 pos; - pos[axis_map.x] = lpos.x; - pos[axis_map.y] = lpos.y; - pos[axis_map.z] = lpos.z; - return pos; -} - -#ifdef USING_BUFFER -#define load_texel(buf, idx) buf[idx] -#elif defined(USING_TEXTURE2D) -#define load_texel(im, pos) texelFetch(im, pos.xy, 0) -#define load_texel_lpos(im, lpos, axis_map) \ - texelFetch(im, lpos_to_pos(lpos, axis_map).xy, 0) -#else // defined(USING_TEXTURE3D) -#define load_texel(im, pos) texelFetch(im, pos, 0) -#define load_texel_lpos(im, lpos, axis_map) \ - texelFetch(im, lpos_to_pos(lpos, axis_map), 0) -#endif - -#ifdef USING_BUFFER -#define write_texel(buf, idx, texel) buf[idx] = texel -#elif defined(USING_TEXTURE2D) -#define write_texel(im, pos, texel) imageStore(im, pos.xy, texel) -#define write_texel_lpos(im, lpos, texel, axis_map) \ - imageStore(im, lpos_to_pos(lpos, axis_map).xy, texel) -#else // defined(USING_TEXTURE3D) -#define write_texel(im, pos, texel) imageStore(im, pos, texel) -#define write_texel_lpos(im, lpos, texel, axis_map) \ - imageStore(im, lpos_to_pos(lpos, axis_map), texel) -#endif - -/* - * Converts hashed layout to a ivec4 containing the axis map data and an int - * containing the packed dim respectively. Each value takes up 4 bits in the - * packed int, and values are read from least significant half byte (right-most) - * to most significant half byte (left-most). - * e.g. 0x20122, 2 -> ivec4(0, 1, 2, 2) - * e.g. 0x11021, 1 -> ivec4(1, 2, 0, 1) - */ -#define unhash_axis_map(hash) \ - (ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf))) - -/* - * - */ -#define unhash_dim_order(hash) \ - (ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf))) - -#define unhash_packed_dim(hash) int(hash >> 16 & 0xf) - -#define DEFAULT_LAYOUT 0x02210 - -#define DEFAULT_DIM_ORDER 0x03210 - -#define DEFAULT_DIM_ORDER_IVEC4 ivec4(0, 1, 2, 3) - -/************************ - * Deprecated Functions * - ************************/ - -// The below functions and macros are in the process of being deprecated in -// favor of newer indexing functions that account for axis mapping and have more -// explicit function names and more updated terminology. - -/* - * Describes which texture axis the "batches" dimension runs along in a 4D - * texture. - * - * Currently it is set to 2 since we represent batches by concatenating along - * the channels dim, which has index 2 in (W, H, C, N) order and maps to the - * depth dimension of a texture, which also corresponds to index 2 in (x, y, z) - * order. - */ -#define BATCH_AXIS 2 - -// -// (w, h, c, n) Tensor Index <-> (x, y, z) Texture Position Conversion -// - -/* - * Input: (x, y, z) texel position, (W, H, C, N) sizes of the tensor, which dim - * is packed along a texel - * Output: Whether the texel position is outside the bounds of the image texture - * given the size and packed dimension of the tensor. 
- */ -bool pos_out_of_bounds(ivec3 pos, ivec4 sizes, int packed_dim) { - // Align packed dim to next multiple of 4 to account for texel padding - sizes[packed_dim] = alignup4(sizes[packed_dim]); - - ivec3 max_pos = sizes.xyz; - max_pos[BATCH_AXIS] += sizes.w * sizes[BATCH_AXIS]; - max_pos[packed_dim] /= 4; - return (any(greaterThanEqual(pos, max_pos))); -} - -/* - * Input: (x, y, z) texel position, (W, H, C, N) sizes of the tensor, - * which dim is packed along a texel - * Returns: the (w, h, c, n) tensor index cooresponding to the first element of - * the texel at the specified position - */ -ivec4 to_tensor_idx(ivec3 pos, ivec4 sizes, int packed_dim) { - // Align packed dim to next multiple of 4 to account for texel padding - sizes[packed_dim] = alignup4(sizes[packed_dim]); - - // Packed dim contains 4 elements per texel - pos[packed_dim] *= 4; - // Construct the initial tensor index via swizzling -#if BATCH_AXIS == 2 - ivec4 tensor_idx = pos.xyzz; -#endif -#if BATCH_AXIS == 1 - ivec4 tensor_idx = pos.xyzy; -#endif -#if BATCH_AXIS == 0 - ivec4 tensor_idx = pos.xyzx; -#endif - // Adjust the axis that the batch dim runs along - tensor_idx[3] /= sizes[BATCH_AXIS]; - tensor_idx[BATCH_AXIS] %= sizes[BATCH_AXIS]; - - return tensor_idx; -} - -/* - * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim - * is packed along a texel - * Returns: the (x, y, z) texture position containing element of the tensor at - * the specified index - */ -ivec3 to_texture_pos(ivec4 idx, ivec4 sizes, int packed_dim) { - // Align packed dim to next multiple of 4 to account for texel padding - sizes[packed_dim] = alignup4(sizes[packed_dim]); - - ivec3 pos = idx.xyz; - pos[BATCH_AXIS] += idx.w * sizes[BATCH_AXIS]; - pos[packed_dim] /= 4; - return pos; -} - -/* - * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of the tensor, which dim - * is packed along a texel - * Returns: the (x, y, z, i) texture position containing the element of the - * tensor at the specified index, where i is the component within the - * texel to which the element belongs - */ -ivec4 to_texture_elem_pos(ivec4 idx, ivec4 sizes, int packed_dim) { - // Align packed dim to next multiple of 4 to account for texel padding - sizes[packed_dim] = alignup4(sizes[packed_dim]); - - // pos[4] is set to a placeholder value - ivec4 pos = idx.xyzx; - pos[BATCH_AXIS] += idx.w * sizes[BATCH_AXIS]; - pos[packed_dim] >>= 2; - pos.w = idx[packed_dim] & 0x3; - return pos; -} - -// -// Miscellaneous Utility Functions and Macros -// - -// Given a buffer(1-D) index cur, compute a new index where the corresponding -// tensor(N-D)'s adjacent dimensions are swapped. The parameters x,y and plane -// describe sizes. As an example, let's say we want to swap dimensions 0,1 for a -// tensor of shape {4,3,2,24} to obtain {3,4,2,24}. Then, x=4, y=3 and -// plane=2*24=48. -#define swap_adj_dims(cur, x, y, plane) \ - cur + \ - plane * \ - ((1 - y) * ((cur % (x * y * plane)) / (y * plane)) + \ - (x - 1) * ((cur % (y * plane)) / plane)) - -// Return the x, y, z and index value the channel-packed 3D tensor from the {n, -// c, h, w}-index. 
-ivec4 get_channel_packed_pos_from_index(ivec4 nchw, ivec4 sizes) { - int aligned_c = alignup4(sizes.y); - int c_stride = aligned_c / 4; - - return ivec4(nchw.w, nchw.z, nchw.x * c_stride + nchw.y / 4, nchw.y % 4); -} - -#endif // INDEXING_UTILS_H diff --git a/backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.glsl b/backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.glsl deleted file mode 100644 index 8028362c3e5..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.glsl +++ /dev/null @@ -1,80 +0,0 @@ -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type(STORAGE)} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "w", "cache", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "projected", DTYPE, STORAGE)} -$if STORAGE == "buffer": - ${layout_declare_ubo(B, "int", "projected_numel")} - ${layout_declare_ubo(B, "ivec4", "cache_strides")} - ${layout_declare_ubo(B, "int", "input_pos")} -$else: - ${layout_declare_ubo(B, "ivec3", "projected_limits")} - ${layout_declare_ubo(B, "int", "input_pos")} - - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -/* - * t_cache will have sizes of (max_batch_size, max_seq_len, n_heads, head_dim). - * t_projected will have sizes of (batch_size, seq_len, n_heads, head_dim). - * - * The cache update inserts the values of t_projected into t_cache at the index - * specified by input_pos at the seq_len dimension. It is equivalent to calling - - * t_cache = t_cache.slice_scatter( - * t_projected, dim=1, start=input_pos, end=input_pos+seq_len) - * - * Note that this shader is implemented assuming that max_batch_size is 1. - */ - -#ifdef USING_BUFFER - -/*************************** - ** Buffer Implementation ** - ***************************/ - -void main() { - int projected_bufi = int(gl_GlobalInvocationID.x); - // Bump cache index forward by input_pos elements along the seq_len dimension. - // cache_strides contains the strides of the cache tensor. - int cache_bufi = input_pos * cache_strides.z + projected_bufi; - if (projected_bufi >= projected_numel) { - return; - } - cache[cache_bufi] = projected[projected_bufi]; -} - -#else - -/**************************** - ** Texture Implementation ** - ****************************/ - -// Note that this shader assumes the that tensors are width packed, i.e. -// packed_dim = 0 -void main() { - const ivec3 projected_pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(projected_pos, projected_limits))) { - return; - } - - const ivec3 cache_pos = ivec3( - projected_pos.x, - projected_pos.y, - projected_pos.z + input_pos); - - write_texel(cache, cache_pos, load_texel(projected, projected_pos)); -} - -#endif // USING_BUFFER diff --git a/backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.yaml b/backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.yaml deleted file mode 100644 index e2a96234465..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/kv_cache_update.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
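
The buffer path of kv_cache_update.glsl above amounts to a slice_scatter along the seq_len dimension. A hedged C++ rendering of that copy, assuming max_batch_size == 1; names and sizes are illustrative, not taken from the deleted shader.

```cpp
#include <cstdio>
#include <vector>

// Copies a flattened (seq_len, n_heads, head_dim) projection into the cache
// starting at input_pos along seq_len, mirroring
//   cache_bufi = input_pos * cache_strides.z + projected_bufi
void kv_cache_update(std::vector<float>& cache,
                     const std::vector<float>& projected,
                     int seq_len_stride,  // elements per sequence position
                     int input_pos) {
  const int offset = input_pos * seq_len_stride;
  for (size_t i = 0; i < projected.size(); ++i) {
    cache[offset + i] = projected[i];
  }
}

int main() {
  const int max_seq_len = 8, n_heads = 2, head_dim = 4;
  const int seq_len_stride = n_heads * head_dim;
  std::vector<float> cache(max_seq_len * seq_len_stride, 0.0f);
  std::vector<float> projected(2 * seq_len_stride, 1.0f);  // seq_len = 2
  kv_cache_update(cache, projected, seq_len_stride, /*input_pos=*/3);
  std::printf("cache[pos 3, first element] = %f\n", cache[3 * seq_len_stride]);
  return 0;
}
```
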
- -kv_cache_update: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - generate_variant_forall: - STORAGE: - - VALUE: buffer - - VALUE: texture3d - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: kv_cache_update diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_common.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_common.glslh deleted file mode 100644 index da326b26e93..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_common.glslh +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines common functions and structs to be used across matrix multiplication - * operators. - */ - -#ifndef LINEAR_COMMON_GLSLH -#define LINEAR_COMMON_GLSLH - -#include "common.glslh" - -int sign_extend_8bit(const int val) { - if ((val & 0x80) != 0) { - return val | (~0xFF); - } - return val; -} - -int extract_8bit_from_packed_int_le(const int packed, const int i) { - // account for little endian - int byte = sign_extend_8bit(packed >> (8 * i) & 0xFF); - return byte; -} - -// Extract a 4-bit value from a packed int (little endian) -// It is assumed that the 4-bit value is in the range [0, 15] -int extract_4bit_from_packed_int_le(const int packed, const int col) { - // Extract the 4-bit value from the 8-bit value - int val = packed >> (4 * col) & 0xF; - return val; -} - -// Convenience overload for packed uint -int extract_4bit_from_packed_uint_le(const uint packed, const int col) { - // Extract the 4-bit value from the 8-bit value - int val = int(packed >> (4 * col)) & 0xF; - return val; -} - -#endif // LINEAR_COMMON_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_bias_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_bias_load.glslh deleted file mode 100644 index f3d32be8b3d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_bias_load.glslh +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef LINEAR_FP_BIAS_LOAD_GLSLH -#define LINEAR_FP_BIAS_LOAD_GLSLH - -#include "linear_fp_per_out_channel_params.glslh" - -VEC4_T load_bias_x4(const int n4) { - return t_bias[n4]; -} - -void load_bias_tile(out FPPerOutChannelParams bias, const int n4_start) { -#if TILE_N4 == 1 - bias.data[0] = load_bias_x4(n4_start); - -#else - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - bias.data[n4] = load_bias_x4(n4_start + n4); - } - -#endif -} - -#endif // LINEAR_FP_BIAS_LOAD_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_input_tile.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_input_tile.glslh deleted file mode 100644 index 68eee57a132..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_input_tile.glslh +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
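
A small host-side C++ check of the packed little-endian extraction helpers defined in linear_common.glslh above. This is a sketch for illustration; the test value is arbitrary and the function names only mirror the GLSL ones.

```cpp
#include <cstdint>
#include <cstdio>

// Sign-extend an 8-bit value stored in the low byte of an int.
int sign_extend_8bit(int val) {
  return (val & 0x80) != 0 ? (val | ~0xFF) : val;
}

// Byte i (0 = least significant) of a packed 32-bit word, as a signed int8.
int extract_8bit_le(uint32_t packed, int i) {
  return sign_extend_8bit(static_cast<int>((packed >> (8 * i)) & 0xFFu));
}

// Nibble col (0 = least significant) of a packed word, in [0, 15].
int extract_4bit_le(uint32_t packed, int col) {
  return static_cast<int>((packed >> (4 * col)) & 0xFu);
}

int main() {
  // 0x80FF017F packs the bytes 0x7F, 0x01, 0xFF, 0x80 (little endian),
  // i.e. the signed values 127, 1, -1, -128.
  const uint32_t packed = 0x80FF017Fu;
  for (int i = 0; i < 4; ++i) {
    std::printf("byte %d = %d\n", i, extract_8bit_le(packed, i));
  }
  // The same word holds eight 4-bit values, read from the low nibble upward.
  for (int col = 0; col < 8; ++col) {
    std::printf("nibble %d = %d\n", col, extract_4bit_le(packed, col));
  }
  return 0;
}
```
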
- */ - -#ifndef LINEAR_FP_INPUT_TILE_GLSLH -#define LINEAR_FP_INPUT_TILE_GLSLH - -/* - * Defines the FPInputTile struct, which is used to represent a tile of the - * input matrix of a matrix multiplication operation. - * - * Settings: - * - TILE_M: number of rows in the tile - * - TILE_K4: number of (groups of 4) columns in the tile - */ - -#extension GL_EXT_control_flow_attributes : require - -struct FPInputTile { - VEC4_T data[TILE_M][TILE_K4]; -}; - -#ifdef DEBUG_MODE - -void printFPInputTile(const FPInputTile in_tile) { - debugPrintfEXT("input_tile: \\n"); - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) { - debugPrintfEXT( - " %f, %f, %f, %f, \\n", - in_tile.data[m][k4].x, - in_tile.data[m][k4].y, - in_tile.data[m][k4].z, - in_tile.data[m][k4].w); - } - } -} - -#endif // DEBUG_MODE - -#endif // LINEAR_FP_INPUT_TILE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_input_tile_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_input_tile_load.glslh deleted file mode 100644 index 6697003935f..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_input_tile_load.glslh +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines functions to load a FPInputTile from input buffer/texture. - * - * Requires: - * - t_input to be declared in the shader layout (input buffer/texture) - * - * Settings: - * - INPUT_BUFFER to indicate input resource is a buffer, otherwise texture is - * assumed. - */ - -#ifndef LINEAR_FP_INPUT_TILE_LOAD_GLSLH -#define LINEAR_FP_INPUT_TILE_LOAD_GLSLH - -#extension GL_EXT_control_flow_attributes : require - -#include "linear_fp_input_tile.glslh" - -#ifdef INPUT_BUFFER - -VEC4_T load_input_x4(const int k4, const int m, const int ntexels_k) { - return t_input[(m * ntexels_k) + k4]; -} - -#else - -VEC4_T load_input_x4(const int k4, const int m, const int ntexels_k) { - return texelFetch(t_input, ivec3(k4, m, 0), 0); -} - -#endif // INPUT_BUFFER - -// To be used if (M - m_start >= TILE_M) || (K4 - k4_start >= TILE_K4) -void load_input_tile_no_checks( - out FPInputTile in_tile, - const int k4_start, - const int m_start, - const int K4, - const int M) { -#if TILE_K4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - in_tile.data[m][0] = load_input_x4(k4_start, m_start + m, K4); - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) { - in_tile.data[m][k4] = load_input_x4(k4_start + k4, m_start + m, K4); - } - } -#endif -} - -// To be used if near tensor boundaries -void load_input_tile_with_checks( - out FPInputTile in_tile, - const int k4_start, - const int m_start, - const int K4, - const int M) { -#if TILE_K4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - if (m_start + m < M) { - in_tile.data[m][0] = load_input_x4(k4_start, m_start + m, K4); - } else { - in_tile.data[m][0] = VEC4_T(0.0); - } - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) { - if (m_start + m < M && k4_start + k4 < K4) { - in_tile.data[m][k4] = load_input_x4(k4_start + k4, m_start + m, K4); - } else { - in_tile.data[m][k4] = VEC4_T(0.0); - } - } - } -#endif -} - -#endif // LINEAR_FP_INPUT_TILE_LOAD_GLSLH diff --git 
a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile.glslh deleted file mode 100644 index dd571229a9c..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile.glslh +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines the FPOutTile struct, which is used to represent a tile of the output - * matrix of a matrix multiplication operation. - * - * Settings: - * - TILE_M: number of rows in the output tile - * - TILE_N4: number of (groups of 4) columns in the output tile - */ - -#ifndef LINEAR_FP_OUTPUT_TILE_GLSLH -#define LINEAR_FP_OUTPUT_TILE_GLSLH - -#extension GL_EXT_control_flow_attributes : require - -struct FPOutTile { - VEC4_T data[TILE_M][TILE_N4]; -}; - -void initialize(out FPOutTile out_tile) { -#if TILE_N4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - out_tile.data[m][0] = VEC4_T(0); - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - out_tile.data[m][n4] = VEC4_T(0); - } - } -#endif -} - -void add(inout FPOutTile out_tile, const FPOutTile other_out_tile) { -#if TILE_M > 1 && TILE_N4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - out_tile.data[m][0] += other_out_tile.data[m][0]; - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - out_tile.data[m][n4] += other_out_tile.data[m][n4]; - } - } -#endif -} - -#ifdef DEBUG_MODE - -void printFPOutTile(const FPOutTile tile) { - debugPrintfEXT("output_tile: \\n"); - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - debugPrintfEXT( - " %f, %f, %f, %f,", - tile.data[m][n4].x, - tile.data[m][n4].y, - tile.data[m][n4].z, - tile.data[m][n4].w); - } - debugPrintfEXT("\\n"); - } -} - -#endif // DEBUG_MODE - -#endif // LINEAR_FP_OUTPUT_TILE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_compute.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_compute.glslh deleted file mode 100644 index ee50ad87f74..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_compute.glslh +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines functions to compute a FPOutTile using fp input and weight tiles. - */ - -#ifndef LINEAR_FP_OUTPUT_TILE_FP_COMPUTE_GLSLH -#define LINEAR_FP_OUTPUT_TILE_FP_COMPUTE_GLSLH - -#extension GL_EXT_control_flow_attributes : require - -#include "linear_common.glslh" -#include "linear_fp_input_tile.glslh" -#include "linear_fp_output_tile.glslh" -#include "linear_fp_per_out_channel_params.glslh" -#include "linear_fp_weight_tile.glslh" - -/* - * Accumulates floating point input tile and floating point weight tile into - * floating point output tile. 
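
A plain C++ sketch (scalars instead of vec4 texels) of the tile accumulation that fp_accumulate_with_fp_weight performs: out[m][n] += sum_k in[m][k] * w[k][n] over a small register tile. The tile sizes here are arbitrary illustration values, not the shader's configured TILE_* settings.

```cpp
#include <array>
#include <cstdio>

constexpr int TILE_M = 4;
constexpr int TILE_K = 4;
constexpr int TILE_N = 8;

using InTile = std::array<std::array<float, TILE_K>, TILE_M>;
using WTile = std::array<std::array<float, TILE_N>, TILE_K>;
using OutTile = std::array<std::array<float, TILE_N>, TILE_M>;

void fp_accumulate(OutTile& accum, const InTile& in, const WTile& w) {
  for (int m = 0; m < TILE_M; ++m) {
    for (int k = 0; k < TILE_K; ++k) {
      // One input scalar is broadcast against a row of the weight tile,
      // which is what the per-component fma(...) calls in the shader do.
      for (int n = 0; n < TILE_N; ++n) {
        accum[m][n] += in[m][k] * w[k][n];
      }
    }
  }
}

int main() {
  OutTile accum{};  // zero-initialized, like initialize()
  InTile in{};
  WTile w{};
  in[0][0] = 2.0f;
  w[0][3] = 1.5f;
  fp_accumulate(accum, in, w);
  std::printf("accum[0][3] = %f\n", accum[0][3]);  // expected 3.0
  return 0;
}
```
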
- */ -void fp_accumulate_with_fp_weight( - inout FPOutTile accum, - FPInputTile in_tile, - FPWeightTile w_tile) { -#if TILE_N4 == 1 && TILE_K4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - accum.data[m][0] = - fma(VEC4_T(in_tile.data[m][0][0]), - w_tile.data[mul_4(0)][0], - accum.data[m][0]); - - accum.data[m][0] = - fma(VEC4_T(in_tile.data[m][0][1]), - w_tile.data[mul_4(0) + 1][0], - accum.data[m][0]); - - accum.data[m][0] = - fma(VEC4_T(in_tile.data[m][0][2]), - w_tile.data[mul_4(0) + 2][0], - accum.data[m][0]); - - accum.data[m][0] = - fma(VEC4_T(in_tile.data[m][0][3]), - w_tile.data[mul_4(0) + 3][0], - accum.data[m][0]); - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - const int n = mul_4(n4); - [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) { - accum.data[m][n4] = - fma(VEC4_T(in_tile.data[m][k4][0]), - w_tile.data[mul_4(k4)][n4], - accum.data[m][n4]); - - accum.data[m][n4] = - fma(VEC4_T(in_tile.data[m][k4][1]), - w_tile.data[mul_4(k4) + 1][n4], - accum.data[m][n4]); - - accum.data[m][n4] = - fma(VEC4_T(in_tile.data[m][k4][2]), - w_tile.data[mul_4(k4) + 2][n4], - accum.data[m][n4]); - - accum.data[m][n4] = - fma(VEC4_T(in_tile.data[m][k4][3]), - w_tile.data[mul_4(k4) + 3][n4], - accum.data[m][n4]); - } - } - } - -#endif -} - -/* - * Applies per output channel weight scales to the output tile. - */ -void apply_scales(inout FPOutTile tile, const FPPerOutChannelParams scales) { -#if TILE_M > 1 && TILE_N4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - tile.data[m][0] = tile.data[m][0] * scales.data[0]; - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - tile.data[m][n4] = tile.data[m][n4] * scales.data[n4]; - } - } -#endif -} - -/* - * Applies per output channel weight scales and per output channel biases to the - * output tile. - */ -void apply_scales_and_biases( - inout FPOutTile tile, - const FPPerOutChannelParams scales, - const FPPerOutChannelParams bias) { -#if TILE_M > 1 && TILE_N4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - tile.data[m][0] = tile.data[m][0] * scales.data[0] + bias.data[0]; - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - tile.data[m][n4] = tile.data[m][n4] * scales.data[n4] + bias.data[n4]; - } - } -#endif -} - -void accumulate_out_tile_with_out_tile( - inout FPOutTile accum, - const FPOutTile other) { - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - accum.data[m][n4] = accum.data[m][n4] + other.data[m][n4]; - } - } -} - -#endif // LINEAR_FP_OUTPUT_TILE_FP_COMPUTE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_int4_compute.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_int4_compute.glslh deleted file mode 100644 index 0606759e393..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_int4_compute.glslh +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines functions to compute a FPOutTile using fp input and weight tiles. 
- */ - -#ifndef LINEAR_FP_OUTPUT_TILE_FP_INT4_COMPUTE_GLSLH -#define LINEAR_FP_OUTPUT_TILE_FP_INT4_COMPUTE_GLSLH - -#extension GL_EXT_control_flow_attributes : require - -#include "linear_common.glslh" -#include "linear_fp_input_tile.glslh" -#include "linear_fp_output_tile.glslh" -#include "linear_fp_per_out_channel_params.glslh" -#include "linear_int4_weight_tile.glslh" - -// Unpacks a int containing 4 packed 8-bit integers into a vec4 containing each -// of the 4 unpacked 8-bit integers. -VEC4_T unpack_packed_4xint4(const int int8x4, const int n4_group) { - return VEC4_T( - extract_4bit_from_packed_int_le(int8x4, n4_group + 0), - extract_4bit_from_packed_int_le(int8x4, n4_group + 2), - extract_4bit_from_packed_int_le(int8x4, n4_group + 4), - extract_4bit_from_packed_int_le(int8x4, n4_group + 6)); -} - -T extract_4bit_from_weight_block( - const ivec4 block, - const int col, - const int row) { - return T(((block[row] >> (4 * col)) & 0xF) - 8); -} - -void fp_accumulate_with_int4_weight( - inout FPOutTile accum, - FPInputTile in_tile, - Int4WeightTile w_tile, - FPPerOutChannelParams scales_tile, - FPPerOutChannelParams zeros_tile) { - // Accum tile is indexed as accum[m][n4][n4i] - // -> gives fp accumulator for output tile element at (x = n, y = m) - // Input tile is indexed as in_tile.data[m][k4] - // -> gives vec4 containing the fp inputs at index - // (k, m), (k + 1, m), (k + 2, m), (k + 3, m) - // Weight tile is indexed as w_tile.data[k4][n8][n4i] - // -> gives packed integer containing the 8x 4-bit quantized values at index - // (n, k), (n, k + 1), (n, k + 2), (n, k + 3), - // (n + 4, k), (n + 4, k + 1), (n + 4, k + 2), (n + 4, k + 3) - VEC4_T weight_texels[2]; -#if TILE_K4 == 1 && TILE_N8 == 1 - [[unroll]] for (int k = 0; k < 4; ++k) { - const int base_col_1 = mul_2(k); - const int base_col_2 = base_col_1 + 1; - weight_texels[0] = VEC4_T( - extract_4bit_from_weight_block(w_tile.data[0][0], base_col_1, 0), - extract_4bit_from_weight_block(w_tile.data[0][0], base_col_1, 1), - extract_4bit_from_weight_block(w_tile.data[0][0], base_col_1, 2), - extract_4bit_from_weight_block(w_tile.data[0][0], base_col_1, 3)); - weight_texels[1] = VEC4_T( - extract_4bit_from_weight_block(w_tile.data[0][0], base_col_2, 0), - extract_4bit_from_weight_block(w_tile.data[0][0], base_col_2, 1), - extract_4bit_from_weight_block(w_tile.data[0][0], base_col_2, 2), - extract_4bit_from_weight_block(w_tile.data[0][0], base_col_2, 3)); - - weight_texels[0] = - fma(weight_texels[0], scales_tile.data[0], zeros_tile.data[0]); - weight_texels[1] = - fma(weight_texels[1], scales_tile.data[1], zeros_tile.data[1]); - - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - accum.data[m][0] = fma( - VEC4_T(in_tile.data[m][0][k]), weight_texels[0], accum.data[m][0]); - accum.data[m][1] = fma( - VEC4_T(in_tile.data[m][0][k]), weight_texels[1], accum.data[m][1]); - } - } - -#else - // TODO(ssjia): Implement generic case - not implemented - -#endif -} - -#endif // LINEAR_FP_OUTPUT_TILE_FP_INT4_COMPUTE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_int8_compute.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_int8_compute.glslh deleted file mode 100644 index b2ab64a1573..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_fp_int8_compute.glslh +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines functions to compute a FPOutTile using fp input and weight tiles. - */ - -#ifndef LINEAR_FP_OUTPUT_TILE_FP_INT8_COMPUTE_GLSLH -#define LINEAR_FP_OUTPUT_TILE_FP_INT8_COMPUTE_GLSLH - -#extension GL_EXT_control_flow_attributes : require - -#include "linear_common.glslh" -#include "linear_fp_input_tile.glslh" -#include "linear_fp_output_tile.glslh" -#include "linear_int8_weight_tile.glslh" - -// Unpacks a int containing 4 packed 8-bit integers into a vec4 containing each -// of the 4 unpacked 8-bit integers. -VEC4_T unpack_packed_4xint8(int int8x4) { - return VEC4_T( - extract_8bit_from_packed_int_le(int8x4, 0), - extract_8bit_from_packed_int_le(int8x4, 1), - extract_8bit_from_packed_int_le(int8x4, 2), - extract_8bit_from_packed_int_le(int8x4, 3)); -} - -void fp_accumulate_with_int8_weight( - inout FPOutTile accum, - FPInputTile in_tile, - Int8WeightTile w_tile) { - // Accum tile is indexed as accum[m][n4][n4i] - // -> gives fp accumulator for output tile element at (x = n, y = m) - // Input tile is indexed as in_tile.data[m][k4] - // -> gives vec4 containing the fp inputs at index - // (k, m), (k + 1, m), (k + 2, m), (k + 3, m) - // Weight tile is indexed as w_tile.data[k4][n4][n4i] - // -> gives packed integer containing the 4x 8-bit quantized values at index - // (n, k), (n, k + 1), (n, k + 2), (n, k + 3) - VEC4_T weight_texel; -#if TILE_K4 == 1 && TILE_N4 == 1 - [[unroll]] for (int k = 0; k < 4; ++k) { - // Unpack one column of weights - weight_texel = VEC4_T( - extract_8bit_from_packed_int_le(w_tile.data[0][0][0], k), - extract_8bit_from_packed_int_le(w_tile.data[0][0][1], k), - extract_8bit_from_packed_int_le(w_tile.data[0][0][2], k), - extract_8bit_from_packed_int_le(w_tile.data[0][0][3], k)); - - for (int m = 0; m < TILE_M; ++m) { - accum.data[m][0] = - fma(VEC4_T(in_tile.data[m][0][k]), weight_texel, accum.data[m][0]); - } - } - -#else - // TODO(ssjia): implement the general case - not implemented - -#endif -} - -#endif // LINEAR_FP_OUTPUT_TILE_FP_INT8_COMPUTE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_int8_int8_compute.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_int8_int8_compute.glslh deleted file mode 100644 index b04074eba75..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_int8_int8_compute.glslh +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines functions to compute a FPOutTile using int8 input and weight tiles. - * - * Settings: - * - TILE_M: The number of rows in the output tile. - * - TILE_N4: The number of (groups of 4) columns in the output tile. - */ - -#ifndef LINEAR_FP_OUTPUT_TILE_INT8_INT8_COMPUTE_GLSLH -#define LINEAR_FP_OUTPUT_TILE_INT8_INT8_COMPUTE_GLSLH - -#extension GL_EXT_control_flow_attributes : require -#extension GL_EXT_integer_dot_product : require - -#include "linear_common.glslh" -#include "linear_fp_output_tile.glslh" -#include "linear_fp_per_out_channel_params.glslh" -#include "linear_int8_input_tile.glslh" -#include "linear_int8_weight_tile.glslh" -#include "linear_int_per_out_channel_params.glslh" - -// Stores integer accumulators for an output tile. 
-struct Int32Accum { - ivec4 data[TILE_M][TILE_N4]; -}; - -// Initialize values to 0 -void initialize(out Int32Accum out_accum) { -#if TILE_N4 == 1 - [[unroll]] for (int y = 0; y < TILE_M; ++y) { - out_accum.data[y][0] = ivec4(0); - } - -#else - [[unroll]] for (int y = 0; y < TILE_M; ++y) { - [[unroll]] for (int x4 = 0; x4 < TILE_K4; ++x4) { - out_accum.data[y][x4] = ivec4(0); - } - } -#endif -} - -// Accumulate int8 input and weight tiles into integer accumulator tile -void int_accumulate_with_int8_weight( - inout Int32Accum accum, - Int8InputTile in_tile, - Int8WeightTile w_tile) { - // Accum tile is indexed as accum[m][n4][n4i] - // -> gives integer accumulator for output tile element at (x = n, y = m) - // Input tile is indexed as in_tile.data[m4][k4][m4i] - // -> gives packed integer containing the 4x 8-bit quantized values at index - // (k, m), (k + 1, m), (k + 2, m), (k + 3, m) - // Weight tile is indexed as w_tile.data[k4][n4][n4i] - // -> gives packed integer containing the 4x 8-bit quantized values at index - // (n, k), (n, k + 1), (n, k + 2), (n, k + 3) -#if TILE_M4 == 1 && TILE_K4 == 1 && TILE_N4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - // n = 0 - accum.data[m][0][0] = dotPacked4x8AccSatEXT( - in_tile.data[0][0][m], w_tile.data[0][0][0], accum.data[m][0][0]); - // n = 1 - accum.data[m][0][1] = dotPacked4x8AccSatEXT( - in_tile.data[0][0][m], w_tile.data[0][0][1], accum.data[m][0][1]); - // n = 2 - accum.data[m][0][2] = dotPacked4x8AccSatEXT( - in_tile.data[0][0][m], w_tile.data[0][0][2], accum.data[m][0][2]); - // n = 3 - accum.data[m][0][3] = dotPacked4x8AccSatEXT( - in_tile.data[0][0][m], w_tile.data[0][0][3], accum.data[m][0][3]); - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - const int m4 = div_4(m); - const int m4i = mod_4(m); - [[unroll]] for (int n = 0; n < TILE_N; ++n) { - const int n4 = div_4(n); - const int n4i = mod_4(n); - [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) { - accum.data[m][n4][n4i] = dotPacked4x8AccSatEXT( - in_tile.data[m4][k4][m4i], - w_tile.data[k4][n4][n4i], - accum.data[m][n4][n4i]); - } - } - } - -#endif -} - -/* - * Computes final weight matrix output tile using: - * - int8 accumulator tile - * - per output channel weight sums - * - per output channel scales - */ -void accumulate_out_tile_with_int_accum( - inout FPOutTile out_tile, - const Int32Accum accum, - const float input_q_scale, - const int input_q_zp, - const IntPerOutChannelParams weight_sums, - const FPPerOutChannelParams weight_scales) { - ivec4 input_zp_vec = ivec4(-input_q_zp); -#if TILE_N4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - // Unfortunately fma doesn't work with ivec4. Prefer to preserve integer - // format for as long as possible to avoid precision loss. 
- ivec4 accum_adjusted = - input_zp_vec * weight_sums.data[0] + accum.data[m][0]; - out_tile.data[m][0] = - fma(VEC4_T(accum_adjusted), - input_q_scale * weight_scales.data[0], - out_tile.data[m][0]); - } - -#else - // TODO(ssjia): Implement the general case - not implemented - -#endif -} - -void accumulate_out_tile_with_int_accum( - inout FPOutTile out_tile, - const Int32Accum accum, - const float input_q_scale, - const int input_q_zp, - const IntPerOutChannelParams weight_sums, - const FPPerOutChannelParams weight_scales, - const FPPerOutChannelParams bias) { - ivec4 input_zp_vec = ivec4(-input_q_zp); -#if TILE_N4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - // Apply scale and zero points to the int accumulator - ivec4 accum_adjusted = - input_zp_vec * weight_sums.data[0] + accum.data[m][0]; - out_tile.data[m][0] = - fma(VEC4_T(accum_adjusted), - input_q_scale * weight_scales.data[0], - out_tile.data[m][0]); - out_tile.data[m][0] += bias.data[0]; - } - -#else - // TODO(ssjia): Implement the general case - not implemented - -#endif -} - -#ifdef DEBUG_MODE - -void printInt32Accum(const Int32Accum tile) { - debugPrintfEXT("int accum: \\n"); - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - debugPrintfEXT( - " %d, %d, %d, %d,", - tile.data[m][n4].x, - tile.data[m][n4].y, - tile.data[m][n4].z, - tile.data[m][n4].w); - } - debugPrintfEXT("\\n"); - } -} - -#endif - -#endif // LINEAR_FP_OUTPUT_TILE_INT8_INT8_COMPUTE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_store.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_store.glslh deleted file mode 100644 index a4019204cc3..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_output_tile_store.glslh +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines functions store a FpOutTile to output buffer/texture. - * - * Requires: - * - t_output to be declared in the shader layout - * - * Settings: - * - OUTPUT_BUFFER to indicate t_output is a vec4 buffer, otherwise texture - * storage is assumed. 
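
The dequantization step in accumulate_out_tile_with_int_accum above corrects the raw int32 dot product by the input zero point (via per-channel weight sums) before scaling. A hedged C++ sketch for one output channel; the numbers are illustrative only.

```cpp
#include <cstdint>
#include <cstdio>

float dequantize_accum(int32_t accum,       // sum_k q_in[k] * q_w[k]
                       int32_t weight_sum,  // sum_k q_w[k] for this channel
                       float input_scale,
                       int32_t input_zp,
                       float weight_scale,
                       float bias) {
  // (q_in - zp) . q_w == q_in . q_w - zp * sum(q_w)
  const int32_t adjusted = accum - input_zp * weight_sum;
  return static_cast<float>(adjusted) * (input_scale * weight_scale) + bias;
}

int main() {
  // Two-element toy example: real inputs {1.0, 2.0}, real weights {0.5, -0.5}.
  // With input_scale = 0.1 and input_zp = 3: q_in = {13, 23};
  // with weight_scale = 0.25: q_w = {2, -2}.
  const int32_t accum = 13 * 2 + 23 * -2;  // -20
  const int32_t weight_sum = 2 + -2;       // 0
  const float out = dequantize_accum(accum, weight_sum, 0.1f, 3, 0.25f, 0.0f);
  std::printf("dequantized = %f (reference: 1.0*0.5 + 2.0*(-0.5) = -0.5)\n", out);
  return 0;
}
```
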
- */ - -#ifndef LINEAR_FP_OUTPUT_TILE_STORE_GLSLH -#define LINEAR_FP_OUTPUT_TILE_STORE_GLSLH - -#extension GL_EXT_control_flow_attributes : require - -#include "linear_fp_output_tile.glslh" - -#ifdef OUTPUT_BUFFER - -void write_output_x4( - const VEC4_T out_texel, - const int n4, - const int m, - const int N4) { - t_output[m * N4 + n4] = out_texel; -} - -#else - -void write_output_x4( - const VEC4_T out_texel, - const int n4, - const int m, - const int N4) { - imageStore(t_output, ivec3(n4, m, 0), out_texel); -} - -#endif // OUTPUT_BUFFER - -void write_output_tile( - const FPOutTile out_tile, - const int n4_start, - const int m_start, - const int N4) { -#if TILE_K4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - write_output_x4(out_tile.data[m][0], n4_start, m_start + m, N4); - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - write_output_x4(out_tile.data[m][n4], n4_start + n4, m_start + m, N4); - } - } -#endif -} - -// To be used if M - m >= TILE_M && N4 - n4 >= TILE_N4 -void write_output_tile_no_checks( - const FPOutTile out_tile, - const int n4_start, - const int m_start, - const int N4, - const int M) { -#if TILE_N4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - write_output_x4(out_tile.data[m][0], n4_start, m_start + m, N4); - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - write_output_x4(out_tile.data[m][n4], n4_start + n4, m_start + m, N4); - } - } -#endif -} - -// To be used if close to tensor boundaries -void write_output_tile_with_checks( - const FPOutTile out_tile, - const int n4_start, - const int m_start, - const int N4, - const int M) { -#if TILE_N4 == 1 - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - if (m_start + m < M) { - write_output_x4(out_tile.data[m][0], n4_start, m_start + m, N4); - } - } - -#else - [[unroll]] for (int m = 0; m < TILE_M; ++m) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - if (m_start + m < M && n4_start + n4 < N4) { - write_output_x4(out_tile.data[m][n4], n4_start + n4, m_start + m, N4); - } - } - } -#endif -} - -#endif // LINEAR_FP_OUTPUT_TILE_STORE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_per_out_channel_params.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_per_out_channel_params.glslh deleted file mode 100644 index 72b22988414..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_per_out_channel_params.glslh +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines common functions and structs to be used across matrix multiplication - * operators. - */ - -#ifndef LINEAR_FP_PER_OUT_CHANNEL_PARAMS_GLSLH -#define LINEAR_FP_PER_OUT_CHANNEL_PARAMS_GLSLH - -#include "common.glslh" - -#extension GL_EXT_control_flow_attributes : require - -// Represents floating point parameter tensors where each element is associated -// with an output channel, such as weight scales, biases, etc. 
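
An illustrative C++ analogue of write_output_tile_with_checks above: a register tile is written back row-major, skipping elements that fall outside the output matrix when the tile overlaps the right or bottom edge. Tile and matrix sizes are made up for the example.

```cpp
#include <cstdio>
#include <vector>

constexpr int TILE_M = 4;
constexpr int TILE_N = 4;

void write_output_tile_with_checks(std::vector<float>& out,  // M x N, row-major
                                   const float tile[TILE_M][TILE_N],
                                   int m_start, int n_start, int M, int N) {
  for (int m = 0; m < TILE_M; ++m) {
    for (int n = 0; n < TILE_N; ++n) {
      if (m_start + m < M && n_start + n < N) {
        out[(m_start + m) * N + (n_start + n)] = tile[m][n];
      }
    }
  }
}

int main() {
  const int M = 6, N = 6;  // deliberately not multiples of the tile size
  std::vector<float> out(M * N, 0.0f);
  float tile[TILE_M][TILE_N];
  for (int m = 0; m < TILE_M; ++m)
    for (int n = 0; n < TILE_N; ++n)
      tile[m][n] = 1.0f;
  // Bottom-right tile: only a 2x2 corner of it lies inside the 6x6 output.
  write_output_tile_with_checks(out, tile, /*m_start=*/4, /*n_start=*/4, M, N);
  std::printf("out[5][5] = %f\n", out[5 * N + 5]);
  return 0;
}
```
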
-struct FPPerOutChannelParams { - VEC4_T data[TILE_N4]; -}; - -#ifdef DEBUG_MODE - -void printFPPerOutChannelParams(const FPPerOutChannelParams params) { - debugPrintfEXT("per_out_channel_params: \\n"); - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - debugPrintfEXT( - " %f, %f, %f, %f, \\n", - params.data[n4].x, - params.data[n4].y, - params.data[n4].z, - params.data[n4].w); - } -} - -#endif // DEBUG_MODE - -#endif // LINEAR_FP_PER_OUT_CHANNEL_PARAMS_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_weight_scales_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_weight_scales_load.glslh deleted file mode 100644 index 1286c1d082f..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_weight_scales_load.glslh +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef LINEAR_FP_WEIGHT_SCALES_LOAD_GLSLH -#define LINEAR_FP_WEIGHT_SCALES_LOAD_GLSLH - -#include "linear_fp_per_out_channel_params.glslh" - -VEC4_T load_weight_scale_x4(const int n4) { - return t_weight_scales[n4]; -} - -VEC4_T load_scale_x4(const int n4, const int quant_group_idx, const int N4) { - return t_weight_scales[quant_group_idx * N4 + n4]; -} - -void load_weight_scales_tile( - out FPPerOutChannelParams scales, - const int n4_start) { -#if TILE_N4 == 1 - scales.data[0] = load_weight_scale_x4(n4_start); - -#else - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - scales.data[n4] = load_weight_scale_x4(n4_start + n4); - } - -#endif -} - -void load_weight_scales_tile_for_group( - out FPPerOutChannelParams scales, - const int n4_start, - const int quant_group_idx, - const int N4) { -#if TILE_N4 == 1 - scales.data[0] = load_scale_x4(n4_start, quant_group_idx, N4); - -#else - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - scales.data[n4] = load_scale_x4(n4_start + n4, quant_group_idx, N4); - } - -#endif -} - -#endif // LINEAR_FP_WEIGHT_SCALES_LOAD_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_weight_tile.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_fp_weight_tile.glslh deleted file mode 100644 index f44bbbc1565..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_fp_weight_tile.glslh +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines the FPWeightTile struct, which is used to represent a fp tile of a - * weight matrix in matrix multiplication. 
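
An illustrative C++ version of the grouped scale lookup in linear_fp_weight_scales_load.glslh above: scales are stored as one row of N4 texels per quantization group, so group g, texel n4 lives at index g * N4 + n4. The struct and sizes here are stand-ins, not the shader's declarations.

```cpp
#include <cstdio>
#include <vector>

struct Vec4 { float x, y, z, w; };

Vec4 load_scale_x4(const std::vector<Vec4>& weight_scales,
                   int n4, int quant_group_idx, int N4) {
  return weight_scales[quant_group_idx * N4 + n4];
}

int main() {
  const int N4 = 4;          // N = 16 output channels -> 4 vec4 texels per row
  const int num_groups = 3;  // e.g. K = 96 with a quantization group size of 32
  std::vector<Vec4> scales(num_groups * N4);
  for (int g = 0; g < num_groups; ++g)
    for (int n4 = 0; n4 < N4; ++n4)
      scales[g * N4 + n4] = {static_cast<float>(g), static_cast<float>(n4), 0, 0};
  const Vec4 s = load_scale_x4(scales, /*n4=*/2, /*quant_group_idx=*/1, N4);
  std::printf("group %g, texel %g\n", s.x, s.y);
  return 0;
}
```
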
- * - * Settings: - * - TILE_K: number of rows in the output tile - * - TILE_N4: number of (groups of 4) columns in the output tile - */ - -#ifndef LINEAR_FP_WEIGHT_TILE_GLSLH -#define LINEAR_FP_WEIGHT_TILE_GLSLH - -#extension GL_EXT_control_flow_attributes : require - -#include "common.glslh" - -struct FPWeightTile { - VEC4_T data[TILE_K][TILE_N4]; -}; - -#ifdef LINEAR_INT8_WEIGHT_TILE_GLSLH - -int sign_extend(const int val) { - if ((val & 0x80) != 0) { - return val | (~0xFF); - } - return val; -} - -T extract_8bit_value(const Int8WeightTile w_tile, const int k, const int n) { -#if TILE_K4 == 1 && TILE_N4 == 1 - const int k4i = k; - const int n4i = n; - ivec4 block = w_tile.data[0][0]; - -#else - const int k4 = div_4(k); - const int k4i = mod_4(k); - - const int n4 = div_4(n); - const int n4i = mod_4(n); - - ivec4 block = w_tile.data[k4][n4]; -#endif - - int col = block[n4i]; - int val = (col >> (k4i * 8)) & 0xFF; - - return T(sign_extend(val)); -} - -void unpack(out FPWeightTile fp_w_tile, const Int8WeightTile w_tile) { -#if TILE_K > 1 && TILE_N4 == 1 - [[unroll]] for (int k = 0; k < TILE_K; ++k) { - fp_w_tile.data[k][0][0] = extract_8bit_value(w_tile, k, 0); - fp_w_tile.data[k][0][1] = extract_8bit_value(w_tile, k, 1); - fp_w_tile.data[k][0][2] = extract_8bit_value(w_tile, k, 2); - fp_w_tile.data[k][0][3] = extract_8bit_value(w_tile, k, 3); - } - -#else - [[unroll]] for (int k = 0; k < TILE_M; ++k) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - const int n = mul_4(n4); - fp_w_tile.data[k][n4][0] = extract_8bit_value(w_tile, k, n); - fp_w_tile.data[k][n4][1] = extract_8bit_value(w_tile, k, n + 1); - fp_w_tile.data[k][n4][2] = extract_8bit_value(w_tile, k, n + 2); - fp_w_tile.data[k][n4][3] = extract_8bit_value(w_tile, k, n + 3); - } - } -#endif -} - -#endif // LINEAR_INT8_WEIGHT_TILE_GLSLH - -#ifdef DEBUG_MODE - -void printFPWeightTile(const FPWeightTile tile) { - debugPrintfEXT("weight_tile: \\n"); - [[unroll]] for (int k = 0; k < TILE_K; ++k) { - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - debugPrintfEXT( - " %f, %f, %f, %f, ", - tile.data[k][n4].x, - tile.data[k][n4].y, - tile.data[k][n4].z, - tile.data[k][n4].w); - } - debugPrintfEXT("\\n"); - } -} - -#endif // DEBUG_MODE - -#endif // LINEAR_FP_WEIGHT_TILE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int4_weight_block.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int4_weight_block.glslh deleted file mode 100644 index d813224c3aa..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int4_weight_block.glslh +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef LINEAR_INT4_WEIGHT_BLOCK_GLSLH -#define LINEAR_INT4_WEIGHT_BLOCK_GLSLH - -/* - * This file defines utilties to perform weight prepacking of quantized int4 - * matrix multiplation weights. It also defines utilities to load source - * weight data from inputbuffer, and write out a packed weight block to output - * texture/buffer. - * - * Note: 2 4-bit values are packed into each 8-bit value in the source data. - * - * Requires: - * - t_packed_int4_weight to be defined in shader layout (output texture/buffer) - * - t_int4_weight to be defined in shader layout (input buffer) - * - * Settings: - * - USING_BUFFER to indicate if output resource is a buffer. Otherwise texture - * is assumed. 
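
A C++ sketch of the nibble packing used by the int4 weight prepacking described here (pack_8x4bit_signed_into_int, defined just below): eight signed 4-bit values go into one 32-bit word with value 0 in the least significant nibble, and can be recovered by shifting and masking. Illustration only; function names and the sample values are not from the deleted sources.

```cpp
#include <cstdint>
#include <cstdio>

uint32_t pack_8x4bit_signed(const int vals[8]) {
  uint32_t packed = 0;
  for (int i = 0; i < 8; ++i) {
    packed |= (static_cast<uint32_t>(vals[i]) & 0xFu) << (4 * i);
  }
  return packed;
}

int unpack_4bit_unsigned(uint32_t packed, int col) {
  return static_cast<int>((packed >> (4 * col)) & 0xFu);  // in [0, 15]
}

int main() {
  // Signed values in [-8, 7]; -8 maps to nibble 0x8, which is also why
  // 0x88888888 is used as padding for out-of-range columns.
  const int vals[8] = {-8, -1, 0, 1, 7, -4, 3, -2};
  const uint32_t packed = pack_8x4bit_signed(vals);
  std::printf("packed = 0x%08X\n", static_cast<unsigned>(packed));
  for (int col = 0; col < 8; ++col) {
    const int u = unpack_4bit_unsigned(packed, col);
    const int s = (u >= 8) ? u - 16 : u;  // recover the signed value
    std::printf("col %d: raw nibble %d, signed %d\n", col, u, s);
  }
  return 0;
}
```
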
- */ - -#extension GL_EXT_control_flow_attributes : require - -#include "linear_common.glslh" - -// Represents source data for 2 8Kx4N block of the weight matrix read from the -// input buffer. Each int element contains 8 packed 4-bit values along the K -// dimension. Overall the data represents 8Kx8N block. -struct Int4Weight2xBlockSourceData { - uint data[8]; -}; - -// Represents data for a packed 4Kx8N block of the weight matrix to be written -// out to output texture/buffer. An individual block was originally a 4Kx8N -// block in the original weight tensor, and then the top and bottom halves are -// concatenated along the width dim. -struct Int4WeightBlockPacked { - ivec4 data; -}; - -void load_block_source_data_no_checks( - out Int4Weight2xBlockSourceData src_data, - const int k8, - const int n_start, - const int ntexels_K, - const int N) { - [[unroll]] for (int n = 0; n < 8; ++n) { - src_data.data[n] = t_int4_weight[(n_start + n) * ntexels_K + k8]; - } -} - -// To be used if K - k_start < 4 -void load_block_source_data_with_checks( - out Int4Weight2xBlockSourceData src_data, - const int k8, - const int n_start, - const int ntexels_K, - const int N) { - [[unroll]] for (int n = 0; n < 8; ++n) { - if (n_start + n < N) { - src_data.data[n] = t_int4_weight[(n_start + n) * ntexels_K + k8]; - } else { - src_data.data[n] = 0x88888888; - } - } -} - -int pack_8x4bit_signed_into_int( - const int val0, - const int val1, - const int val2, - const int val3, - const int val4, - const int val5, - const int val6, - const int val7) { - return int( - ((val7 & 0xF) << 28) | ((val6 & 0xF) << 24) | ((val5 & 0xF) << 20) | - ((val4 & 0xF) << 16) | ((val3 & 0xF) << 12) | ((val2 & 0xF) << 8) | - ((val1 & 0xF) << 4) | ((val0 & 0xF))); -} - -void create_packed_blocks( - out Int4WeightBlockPacked block1, - out Int4WeightBlockPacked block2, - const Int4Weight2xBlockSourceData src_data) { - [[unroll]] for (int row = 0; row < 4; ++row) { - const int row_idx_1 = row; - const int row_idx_2 = row + 4; - block1.data[row] = pack_8x4bit_signed_into_int( - extract_4bit_from_packed_uint_le(src_data.data[row_idx_1], 0), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_2], 0), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_1], 1), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_2], 1), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_1], 2), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_2], 2), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_1], 3), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_2], 3)); - - block2.data[row] = pack_8x4bit_signed_into_int( - extract_4bit_from_packed_uint_le(src_data.data[row_idx_1], 4), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_2], 4), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_1], 5), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_2], 5), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_1], 6), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_2], 6), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_1], 7), - extract_4bit_from_packed_uint_le(src_data.data[row_idx_2], 7)); - } -} - -#ifdef USING_BUFFER - -void write_packed_block( - const Int4WeightBlockPacked block, - const int k4, - const int n8, - const int nblocks_K) { - t_packed_int4_weight[n8 * nblocks_K + k4] = block.data; -} - -#else // USING_TEXTURE - -void write_packed_block( - const Int4WeightBlockPacked block, - const int k4, - const int n8, - const int nblocks_K) { - imageStore(t_packed_int4_weight, 
ivec2(k4, n8), block.data); -} - -#endif // USING_BUFFER - -#ifdef DEBUG_MODE - -void printInt4Weight2xBlockSourceData( - const Int4Weight2xBlockSourceData src_data) { - debugPrintfEXT("int4_weight_block_source_data: \\n"); - [[unroll]] for (int row = 0; row < 8; ++row) { - debugPrintfEXT("row %i (raw: %u): ", row, src_data.data[row]); - // Extract and print individual 4-bit values directly from packed int - [[unroll]] for (int col = 0; col < 8; ++col) { - int val_4bit = extract_4bit_from_packed_uint_le(src_data.data[row], col); - debugPrintfEXT("[%i] ", val_4bit); - } - debugPrintfEXT("\\n"); - } -} - -void printInt4WeightBlockPacked(const Int4WeightBlockPacked block) { - debugPrintfEXT("int4_weight_block_packed: \\n"); - // Print unpacked 4-bit values for each int in block.data - [[unroll]] for (int i = 0; i < 4; ++i) { - debugPrintfEXT("block.data[%i] 4-bit values: ", i); - [[unroll]] for (int col = 0; col < 8; ++col) { - int val_4bit = extract_4bit_from_packed_int_le(block.data[i], col); - debugPrintfEXT("[%i] ", val_4bit); - } - debugPrintfEXT("\\n"); - } -} - -#endif // DEBUG_MODE - -#endif // LINEAR_INT4_WEIGHT_BLOCK_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int4_weight_tile.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int4_weight_tile.glslh deleted file mode 100644 index 559459f14a8..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int4_weight_tile.glslh +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef LINEAR_INT4_WEIGHT_TILE_GLSLH -#define LINEAR_INT4_WEIGHT_TILE_GLSLH - -#include "linear_common.glslh" -#include "linear_fp_weight_tile.glslh" - -/* - * Defines the Int4WeightTile struct, which is used to represent a tile of the - * quantized int4 weight matrix of a quantized matrix multiplication operation. 
- * - * Settings: - * - TILE_K4: number of (groups of 4) rows in the weight tile - * - TILE_N8: number of (groups of 8) columns in the weight tile - */ - -#extension GL_EXT_control_flow_attributes : require - -struct Int4WeightTile { - ivec4 data[TILE_K4][TILE_N8]; -}; - -void unpack_int4_weight_tile( - out FPWeightTile int8_tile, - const Int4WeightTile int4_tile) { -#if TILE_K4 == 1 && TILE_N8 == 1 - for (int k = 0; k < TILE_K; ++k) { - const int col_idx_1 = 2 * k; - const int col_idx_2 = 2 * k + 1; - int8_tile.data[k][0][0] = - T(extract_4bit_from_packed_int_le(int4_tile.data[0][0][0], col_idx_1)); - int8_tile.data[k][0][1] = - T(extract_4bit_from_packed_int_le(int4_tile.data[0][0][1], col_idx_1)); - int8_tile.data[k][0][2] = - T(extract_4bit_from_packed_int_le(int4_tile.data[0][0][2], col_idx_1)); - int8_tile.data[k][0][3] = - T(extract_4bit_from_packed_int_le(int4_tile.data[0][0][3], col_idx_1)); - - // n4 = 1 - int8_tile.data[k][1][0] = - T(extract_4bit_from_packed_int_le(int4_tile.data[0][0][0], col_idx_2)); - int8_tile.data[k][1][1] = - T(extract_4bit_from_packed_int_le(int4_tile.data[0][0][1], col_idx_2)); - int8_tile.data[k][1][2] = - T(extract_4bit_from_packed_int_le(int4_tile.data[0][0][2], col_idx_2)); - int8_tile.data[k][1][3] = - T(extract_4bit_from_packed_int_le(int4_tile.data[0][0][3], col_idx_2)); - } - -#else - for (int k = 0; k < TILE_K; ++k) { - const int k4 = div_4(k); - const int k4i = mod_4(k); - for (int n8 = 0; n8 < TILE_N8; ++n8) { - const int n4 = mul_2(n8); - const int col_idx_1 = 2 * k4i; - const int col_idx_2 = 2 * k4i + 1; - int8_tile.data[k][n4][0] = T(extract_4bit_from_packed_int_le( - int4_tile.data[k4][n8][0], col_idx_1)); - int8_tile.data[k][n4][1] = T(extract_4bit_from_packed_int_le( - int4_tile.data[k4][n8][1], col_idx_1)); - int8_tile.data[k][n4][2] = T(extract_4bit_from_packed_int_le( - int4_tile.data[k4][n8][2], col_idx_1)); - int8_tile.data[k][n4][3] = T(extract_4bit_from_packed_int_le( - int4_tile.data[k4][n8][3], col_idx_1)); - - int8_tile.data[k][n4 + 1][0] = T(extract_4bit_from_packed_int_le( - int4_tile.data[k4][n8][0], col_idx_2)); - int8_tile.data[k][n4 + 1][1] = T(extract_4bit_from_packed_int_le( - int4_tile.data[k4][n8][1], col_idx_2)); - int8_tile.data[k][n4 + 1][2] = T(extract_4bit_from_packed_int_le( - int4_tile.data[k4][n8][2], col_idx_2)); - int8_tile.data[k][n4 + 1][3] = T(extract_4bit_from_packed_int_le( - int4_tile.data[k4][n8][3], col_idx_2)); - } - } - -#endif -} - -#ifdef DEBUG_MODE - -void printInt4WeightTile(const Int4WeightTile block) { - debugPrintfEXT("int4_weight_tile: \\n"); - // Print unpacked 4-bit values for each int in block.data - [[unroll]] for (int i = 0; i < TILE_K; ++i) { - const int k4 = div_4(i); - const int k4i = mod_4(i); - debugPrintfEXT("block.data[%i] 4-bit values: ", i); - [[unroll]] for (int col = 0; col < TILE_N; ++col) { - int val_4bit = - extract_4bit_from_packed_int_le(block.data[k4][0][k4i], col); - debugPrintfEXT("[%i] ", val_4bit); - } - debugPrintfEXT("\\n"); - } -} - -#endif // DEBUG_MODE - -#endif // LINEAR_INT4_WEIGHT_TILE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int4_weight_tile_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int4_weight_tile_load.glslh deleted file mode 100644 index 033e0082436..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int4_weight_tile_load.glslh +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef LINEAR_INT4_WEIGHT_TILE_LOAD_GLSLH -#define LINEAR_INT4_WEIGHT_TILE_LOAD_GLSLH - -/* - * Defines functions to load a Int4WeightTile from input buffer/texture. - * - * Requires: - * - t_packed_int4_weight to be declared in the shader layout (input - * buffer/texture) - * - * Settings: - * - WEIGHT_BUFFER to indicate t_packed_int4_weight is a buffer, otherwise - * texture storage is assumed. - */ - -#extension GL_EXT_control_flow_attributes : require - -#include "linear_int4_weight_tile.glslh" - -#ifdef WEIGHT_BUFFER - -ivec4 load_int4_weight_block( - const int block_x, - const int block_y, - const int nblocks_x) { - return t_packed_int4_weight[(block_y * nblocks_x) + block_x]; -} - -#else // WEIGHT_TEXTURE - -ivec4 load_int4_weight_block( - const int block_x, - const int block_y, - const int nblocks_x) { - return texelFetch(t_packed_int4_weight, ivec2(block_x, block_y), 0); -} - -#endif // WEIGHT_BUFFER - -void load_int4_weight_tile( - out Int4WeightTile weight_tile, - const int block_x, - const int block_y, - const int nblocks_x) { -#if TILE_K4 == 1 && TILE_N8 == 1 - weight_tile.data[0][0] = load_int4_weight_block(block_x, block_y, nblocks_x); - -#elif TILE_K4 == 1 && TILE_N8 > 1 - [[unroll]] for (int x = 0; x < TILE_N8; ++x) { - weight_tile.data[0][x] = - load_int4_weight_block(block_x + x, block_y, nblocks_x); - } - -#elif TILE_K4 > 1 && TILE_N8 == 1 - [[unroll]] for (int y = 0; y < TILE_K4; ++y) { - weight_tile.data[y][0] = - load_int4_weight_block(block_x, block_y + y, nblocks_x); - } - -#else - [[unroll]] for (int y = 0; y < TILE_K4; ++y) { - [[unroll]] for (int x = 0; x < TILE_N8; ++x) { - weight_tile.data[y][x] = - load_int4_weight_block(block_x + x, block_y + y, nblocks_x); - } - } -#endif -} - -#endif // LINEAR_INT4_WEIGHT_TILE_LOAD_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_block.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_block.glslh deleted file mode 100644 index 9535de21f7b..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_block.glslh +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * This file defines utilties to perform int8 quantization and block packing of - * matrix multiplation inputs. It also defines utilities to store packed block - * data to an output buffer or texture. - * - * Requires: - * - t_packed_int8_input to be defined in shader layout (output buffer/texture) - * - * Settings: - * - OUTPUT_BUFFER to indicate if output resource is a buffer. Otherwise texture - * is assumed. 
- */ - -#ifndef LINEAR_INT8_INPUT_BLOCK_GLSLH -#define LINEAR_INT8_INPUT_BLOCK_GLSLH - -#define TILE_M 4 -#define TILE_K4 1 - -#include "linear_fp_input_tile.glslh" - -struct Int8InputBlock { - ivec4 data; -}; - -ivec4 quantize( - const VEC4_T val, - const float q_inv_scale, - const int q_zero_point) { - vec4 quantized = round(vec4(val) * q_inv_scale) + q_zero_point; - // hard-code 8 bit quantization range - return clamp(ivec4(quantized), -128, 127); -} - -int pack_into_int32(const ivec4 quant_vals) { - int packed = ((quant_vals[0] & 0xFF) << 0) | ((quant_vals[1] & 0xFF) << 8) | - ((quant_vals[2] & 0xFF) << 16) | ((quant_vals[3] & 0xFF) << 24); - - return packed; -} - -void quantize_and_pack( - out Int8InputBlock packed, - const FPInputTile in_block, - const float q_inv_scale, - const int q_zero_point) { - for (int row = 0; row < 4; ++row) { - ivec4 quantized_inputs = - quantize(in_block.data[row][0], q_inv_scale, q_zero_point); - packed.data[row] = pack_into_int32(quantized_inputs); - } -} - -#ifdef OUTPUT_BUFFER - -void write_block( - const Int8InputBlock block, - const int block_x, - const int block_y, - const int nblocks_x) { - t_packed_int8_input[block_y * nblocks_x + block_x] = block.data; -} - -#else // OUTPUT_TEXTURE - -void write_block( - const Int8InputBlock block, - const int block_x, - const int block_y, - const int nblocks_x) { - imageStore(t_packed_int8_input, ivec3(block_x, block_y, 0), block.data); -} - -#endif // OUTPUT_BUFFER - -#endif // LINEAR_INT8_INPUT_BLOCK_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_tile.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_tile.glslh deleted file mode 100644 index 89a7e1b3f89..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_tile.glslh +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines the Int8InputTile struct, which is used to represent a tile of the - * quantized int8 input matrix of a quantized matrix multiplication operation. 
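
A hedged C++ sketch of the input quantization and packing in linear_int8_input_block.glslh above: four floats are quantized to int8 with an inverse scale and zero point, then packed into one 32-bit word, low byte first. Values and names are illustrative.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int8_t quantize_one(float val, float inv_scale, int zero_point) {
  const int q = static_cast<int>(std::lround(val * inv_scale)) + zero_point;
  return static_cast<int8_t>(std::clamp(q, -128, 127));  // 8-bit range
}

uint32_t pack_4xint8(const int8_t q[4]) {
  return (static_cast<uint32_t>(static_cast<uint8_t>(q[0])) << 0) |
         (static_cast<uint32_t>(static_cast<uint8_t>(q[1])) << 8) |
         (static_cast<uint32_t>(static_cast<uint8_t>(q[2])) << 16) |
         (static_cast<uint32_t>(static_cast<uint8_t>(q[3])) << 24);
}

int main() {
  const float scale = 0.05f;
  const float inv_scale = 1.0f / scale;  // 20
  const int zero_point = 0;
  const float vals[4] = {0.1f, -0.25f, 1.0f, -7.0f};  // -7.0 saturates to -128
  int8_t q[4];
  for (int i = 0; i < 4; ++i) {
    q[i] = quantize_one(vals[i], inv_scale, zero_point);
    std::printf("q[%d] = %d\n", i, q[i]);
  }
  std::printf("packed = 0x%08X\n", static_cast<unsigned>(pack_4xint8(q)));
  return 0;
}
```
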
- * - * Settings: - * - TILE_M4: number of (groups of 4) rows in the tile - * - TILE_K4: number of (groups of 4) columns in the tile - */ - -#ifndef LINEAR_INT8_INPUT_TILE_GLSLH -#define LINEAR_INT8_INPUT_TILE_GLSLH - -#extension GL_EXT_control_flow_attributes : require - -struct Int8InputTile { - ivec4 data[TILE_M4][TILE_K4]; -}; - -#ifdef DEBUG_MODE - -#include "linear_common.glslh" - -void printInt8InputTile(const Int8InputTile tile) { - debugPrintfEXT( - "Int8InputTile [TILE_M4=%d][TILE_K4=%d]:\\n", TILE_M4, TILE_K4); - - [[unroll]] for (int m4 = 0; m4 < TILE_M4; ++m4) { - [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) { - debugPrintfEXT(" tile[%d][%d] (ivec4): ", m4, k4); - - // Each ivec4 contains 4 packed integers, each integer contains 4 8-bit - // values - [[unroll]] for (int vec_idx = 0; vec_idx < 4; ++vec_idx) { - int packed_int = tile.data[m4][k4][vec_idx]; - debugPrintfEXT("packed_int[%d]=%d -> [", vec_idx, packed_int); - - // Extract 4 8-bit values from this packed integer - [[unroll]] for (int byte_idx = 0; byte_idx < 4; ++byte_idx) { - int val = extract_8bit_from_packed_int_le(packed_int, byte_idx); - if (byte_idx < 3) { - debugPrintfEXT("%d, ", val); - } else { - debugPrintfEXT("%d] ", val); - } - } - } - debugPrintfEXT("\\n"); - } - } -} - -#endif // DEBUG_MODE - -#endif // LINEAR_INT8_INPUT_TILE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_tile_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_tile_load.glslh deleted file mode 100644 index c79badab6c6..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_input_tile_load.glslh +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines functions to load a Int8InputTile from input buffer/texture. - * - * Requires: - * - t_packed_int8_input to be declared in the shader layout - * - * Settings: - * - PACKED_INT8_INPUT_BUFFER to indicate resource is a buffer, otherwise - * texture storage is assumed. 
- */ - -#ifndef LINEAR_INT8_INPUT_TILE_LOAD_GLSLH -#define LINEAR_INT8_INPUT_TILE_LOAD_GLSLH - -#extension GL_EXT_control_flow_attributes : require - -#include "linear_int8_input_tile.glslh" - -#ifdef PACKED_INT8_INPUT_BUFFER - -ivec4 load_int8_input_block( - const int block_x, - const int block_y, - const int nblocks_x) { - return t_packed_int8_input[(block_y * nblocks_x) + block_x]; -} - -#else - -ivec4 load_int8_input_block( - const int block_x, - const int block_y, - const int nblocks_x) { - return texelFetch(t_packed_int8_input, ivec3(block_x, block_y, 0), 0); -} - -#endif // PACKED_INT8_INPUT_BUFFER - -void load_int8_input_tile( - out Int8InputTile in_tile, - const int block_x, - const int block_y, - const int nblocks_x) { -#if TILE_M4 == 1 && TILE_K4 == 1 - in_tile.data[0][0] = load_int8_input_block(block_x, block_y, nblocks_x); - -#elif TILE_M4 == 1 && TILE_K4 > 1 - [[unroll]] for (int x = 0; x < TILE_K4; ++x) { - in_tile.data[0][x] = load_int8_input_block(block_x + x, block_y, nblocks_x); - } - -#elif TILE_M4 > 1 && TILE_K4 == 1 - [[unroll]] for (int y = 0; y < TILE_M4; ++y) { - in_tile.data[y][0] = load_int8_input_block(block_x, block_y + y, nblocks_x); - } - -#else - [[unroll]] for (int y = 0; y < TILE_M4; ++y) { - [[unroll]] for (int x = 0; x < TILE_K4; ++x) { - in_tile.data[y][x] = - load_int8_input_block(block_x + x, block_y + y, nblocks_x); - } - } -#endif -} - -#endif // LINEAR_INT8_INPUT_TILE_LOAD_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_weight_block.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_weight_block.glslh deleted file mode 100644 index 6e98caea49e..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_weight_block.glslh +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef LINEAR_INT8_WEIGHT_BLOCK_GLSLH -#define LINEAR_INT8_WEIGHT_BLOCK_GLSLH - -/* - * This file defines utilties to perform weight prepacking of quantized int8 - * matrix multiplation weights. It also defines utilities to load source - * weight data from inputbuffer, and write out a packed weight block to output - * texture/buffer. - * - * Requires: - * - t_packed_int8_weight to be defined in shader layout (output texture/buffer) - * - t_int8_weight to be defined in shader layout (input buffer) - * - * Settings: - * - USING_BUFFER to indicate if output resource is a buffer. Otherwise texture - * is assumed. - */ - -#extension GL_EXT_control_flow_attributes : require - -#include "linear_common.glslh" - -// Represents data for a 4x4 block of the weight matrix read from the input -// buffer. 
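For reference, the input-packing headers above quantize floating-point values to int8 and pack four 8-bit values into one 32-bit word, and the weight-prepacking header below gathers four such packed words (one per output channel) into a 4x4 block. The following sketch mirrors `quantize()` and `pack_into_int32()` in plain Python to make the byte layout concrete; it is an illustration only, with arbitrary example values.

```python
def quantize(vals, q_inv_scale, q_zero_point):
    # Mirror of the shader's quantize(): scale, round, shift by the zero point,
    # then clamp to the signed 8-bit range.
    return [max(-128, min(127, round(v * q_inv_scale) + q_zero_point)) for v in vals]

def pack_into_int32(q):
    # Mirror of pack_into_int32(): four int8 values packed little-endian
    # (element 0 in the lowest byte) into one 32-bit word.
    packed = 0
    for i, v in enumerate(q):
        packed |= (v & 0xFF) << (8 * i)
    return packed

q = quantize([0.5, -1.0, 2.0, -3.5], q_inv_scale=4.0, q_zero_point=0)
print(q, hex(pack_into_int32(q)))  # [2, -4, 8, -14] 0xf208fc02
```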
-struct Int8WeightBlock { - ivec4 data; -}; - -void load_block_data_no_checks( - out Int8WeightBlock block, - const int k4, - const int n_start, - const int ntexels_K, - const int N) { - [[unroll]] for (int n = 0; n < 4; ++n) { - block.data[n] = t_int8_weight[(n_start + n) * ntexels_K + k4]; - } -} - -void load_block_data_with_checks( - out Int8WeightBlock block, - const int k4, - const int n_start, - const int ntexels_K, - const int N) { - [[unroll]] for (int n = 0; n < 4; ++n) { - if (n_start + n < N) { - block.data[n] = t_int8_weight[(n_start + n) * ntexels_K + k4]; - } else { - block.data[n] = 0; - } - } -} - -#ifdef USING_BUFFER - -void write_weight_block( - const Int8WeightBlock block, - const int n4, - const int k4, - const int ntexels_N) { - t_packed_int8_weight[k4 * ntexels_N + n4] = block.data; -} - -#else // USING_TEXTURE - -void write_weight_block( - const Int8WeightBlock block, - const int n4, - const int k4, - const int ntexels_N) { - imageStore(t_packed_int8_weight, ivec2(n4, k4), block.data); -} - -#endif // USING_BUFFER - -#ifdef DEBUG_MODE - -void printInt8WeightBlock(const Int8WeightBlockPacked block) { - debugPrintfEXT("int8_weight_block_packed: \\n"); - debugPrintfEXT( - "%i %i %i %i \\n", - block.data[0], - block.data[1], - block.data[2], - block.data[3]); -} - -#endif // DEBUG_MODE - -#endif // LINEAR_INT8_WEIGHT_BLOCK_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_weight_tile.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_weight_tile.glslh deleted file mode 100644 index f312db543db..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_weight_tile.glslh +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef LINEAR_INT8_WEIGHT_TILE_GLSLH -#define LINEAR_INT8_WEIGHT_TILE_GLSLH - -/* - * Defines the Int8WeightTile struct, which is used to represent a tile of the - * quantized int8 weight matrix of a quantized matrix multiplication operation. 
- * - * Settings: - * - TILE_K4: number of (groups of 4) rows in the weight tile - * - TILE_N4: number of (groups of 4) columns in the weight tile - */ - -#extension GL_EXT_control_flow_attributes : require - -struct Int8WeightTile { - ivec4 data[TILE_K4][TILE_N4]; -}; - -#ifdef DEBUG_MODE - -void printInt8WeightTile(const Int8WeightTile tile) { - debugPrintfEXT( - "Int8WeightTile [TILE_K4=%d][TILE_N4=%d]:\\n", TILE_K4, TILE_N4); - - [[unroll]] for (int m4 = 0; m4 < TILE_M4; ++m4) { - [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) { - debugPrintfEXT(" tile[%d][%d] (ivec4): ", m4, k4); - - // Each ivec4 contains 4 packed integers, each integer contains 4 8-bit - // values - [[unroll]] for (int vec_idx = 0; vec_idx < 4; ++vec_idx) { - int packed_int = tile.data[m4][k4][vec_idx]; - debugPrintfEXT("packed_int[%d]=%d -> [", vec_idx, packed_int); - - // Extract 4 8-bit values from this packed integer - [[unroll]] for (int byte_idx = 0; byte_idx < 4; ++byte_idx) { - int val = extract_8bit_from_packed_int_le(packed_int, byte_idx); - if (byte_idx < 3) { - debugPrintfEXT("%d, ", val); - } else { - debugPrintfEXT("%d] ", val); - } - } - } - debugPrintfEXT("\\n"); - } - } -} - -#endif // DEBUG_MODE - -#endif // LINEAR_INT8_WEIGHT_TILE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_weight_tile_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int8_weight_tile_load.glslh deleted file mode 100644 index fe16d3469b3..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int8_weight_tile_load.glslh +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef LINEAR_INT8_WEIGHT_TILE_LOAD_GLSLH -#define LINEAR_INT8_WEIGHT_TILE_LOAD_GLSLH - -/* - * Defines functions to load a Int8WeightTile from input buffer/texture. - * - * Requires: - * - t_packed_int8_weight to be declared in the shader layout (input - * buffer/texture) - * - * Settings: - * - WEIGHT_BUFFER to indicate t_packed_int8_weight is a buffer, otherwise - * texture storage is assumed. 
- */ - -#extension GL_EXT_control_flow_attributes : require - -#include "linear_int8_weight_tile.glslh" - -#ifdef WEIGHT_BUFFER - -ivec4 load_int8_weight_block( - const int block_x, - const int block_y, - const int nblocks_x) { - return t_packed_int8_weight[(block_y * nblocks_x) + block_x]; -} - -#else // WEIGHT_TEXTURE - -ivec4 load_int8_weight_block( - const int block_x, - const int block_y, - const int nblocks_x) { - return texelFetch(t_packed_int8_weight, ivec2(block_x, block_y), 0); -} - -#endif // WEIGHT_BUFFER - -void load_int8_weight_tile( - out Int8WeightTile weight_tile, - const int block_x, - const int block_y, - const int nblocks_x) { -#if TILE_K4 == 1 && TILE_N4 == 1 - weight_tile.data[0][0] = load_int8_weight_block(block_x, block_y, nblocks_x); - -#elif TILE_K4 == 1 && TILE_N4 > 1 - [[unroll]] for (int x = 0; x < TILE_N4; ++x) { - weight_tile.data[0][x] = - load_int8_weight_block(block_x + x, block_y, nblocks_x); - } - -#elif TILE_K4 > 1 && TILE_N4 == 1 - [[unroll]] for (int y = 0; y < TILE_M4; ++y) { - weight_tile.data[y][0] = - load_int8_weight_block(block_x, block_y + y, nblocks_x); - } - -#else - [[unroll]] for (int y = 0; y < TILE_K4; ++y) { - [[unroll]] for (int x = 0; x < TILE_N4; ++x) { - weight_tile.data[y][x] = - load_int8_weight_block(block_x + x, block_y + y, nblocks_x); - } - } -#endif -} - -#endif // LINEAR_INT8_WEIGHT_TILE_LOAD_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int_per_out_channel_params.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int_per_out_channel_params.glslh deleted file mode 100644 index ca29fd52780..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int_per_out_channel_params.glslh +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Defines common functions and structs to be used across matrix multiplication - * operators. - */ - -#ifndef LINEAR_INT_PER_OUT_CHANNEL_PARAMS_GLSLH -#define LINEAR_INT_PER_OUT_CHANNEL_PARAMS_GLSLH - -#include "common.glslh" - -#extension GL_EXT_control_flow_attributes : require - -// Represents floating point parameter tensors where each element is associated -// with an output channel, such as weight scales, biases, etc. -struct IntPerOutChannelParams { - ivec4 data[TILE_N4]; -}; - -#ifdef DEBUG_MODE - -void printIntPerOutChannelParams(const IntPerOutChannelParams params) { - debugPrintfEXT("per_out_channel_params: \\n"); - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - debugPrintfEXT( - " %d, %d, %d, %d, ", - params.data[n4].x, - params.data[n4].y, - params.data[n4].z, - params.data[n4].w); - } - debugPrintfEXT("\\n"); -} - -#endif // DEBUG_MODE - -#endif // LINEAR_INT_PER_OUT_CHANNEL_PARAMS_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_int_weight_sums_load.glslh b/backends/vulkan/runtime/graph/ops/glsl/linear_int_weight_sums_load.glslh deleted file mode 100644 index 1a17f99ea4e..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_int_weight_sums_load.glslh +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#ifndef LINEAR_FP_WEIGHT_SUMS_LOAD_GLSLH -#define LINEAR_FP_WEIGHT_SUMS_LOAD_GLSLH - -#include "linear_int_per_out_channel_params.glslh" - -ivec4 load_weight_sum_x4(const int n4) { - return ivec4(t_weight_sums[n4]); -} - -void load_weight_sums_tile( - out IntPerOutChannelParams sums, - const int n4_start) { -#if TILE_N4 == 1 - sums.data[0] = load_weight_sum_x4(n4_start); - -#else - [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) { - sums.data[n4] = load_weight_sum_x4(n4_start + n4); - } - -#endif -} - -#endif // LINEAR_FP_WEIGHT_SUMS_LOAD_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.glsl deleted file mode 100644 index 6f0d890a9c4..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.glsl +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, IO_STORAGE)} -#define T ${texel_load_component_type(DTYPE, IO_STORAGE)} - -$if IO_STORAGE == "buffer": - #define OUTPUT_BUFFER - #define INPUT_BUFFER -$if WEIGHT_STORAGE == "buffer": - #define WEIGHT_BUFFER - -#define TILE_N8 ${TILE_N8} - -#define TILE_K4 ${TILE_K4} -#define TILE_N4 ${TILE_N8 * 2} - -#define TILE_M ${TILE_M} -#define TILE_K ${TILE_K4 * 4} -#define TILE_N ${TILE_N8 * 8} - -#define WGS ${WGS} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_output", DTYPE, IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", DTYPE, IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_packed_int4_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} - -${layout_declare_ubo(B, "ivec4", "output_sizes")} -${layout_declare_ubo(B, "ivec4", "input_sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "apply_bias", "0")} -${layout_declare_spec_const(C, "int", "K4_per_group", "0")} - -#include "common.glslh" -#include "linear_fp_input_tile_load.glslh" -#include "linear_int4_weight_tile_load.glslh" -#include "linear_fp_weight_scales_load.glslh" -#include "linear_fp_output_tile_fp_int4_compute.glslh" -#include "linear_fp_output_tile_fp_compute.glslh" -#include "linear_fp_output_tile_store.glslh" -#include "linear_fp_bias_load.glslh" - -shared FPOutTile partial_sums[WGS]; - -void main() { - const int lid = int(gl_LocalInvocationID.x); - const int n8 = int(gl_GlobalInvocationID.y); - - // The output tensor will have a shape of [n, 1, 1, 1]. Each thread computes - // 8 output elements, so each thread will write to 8 elements starting at the - // tensor index (gid.x * 8, 0, 0, 0). 
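The co-op shaders split the reduction (K) dimension across the work group: each invocation accumulates every WGS-th texel of the input row, and the partial results are combined at the end. A minimal sketch of that split, ignoring the 4-wide texel granularity and the group-wise dequantization (WGS matches the shader's default of 64; the other names are illustrative):

```python
import numpy as np

WGS = 64  # work group size, matching the WGS specialization constant

def coop_dot(x, w_col, wgs=WGS):
    # Each local invocation id accumulates a strided slice of the K dimension;
    # the per-invocation partial sums are then added together.
    partials = np.array([np.dot(x[lid::wgs], w_col[lid::wgs]) for lid in range(wgs)])
    return partials.sum()

rng = np.random.default_rng(0)
x, w_col = rng.standard_normal(512), rng.standard_normal(512)
assert np.isclose(coop_dot(x, w_col), np.dot(x, w_col))
```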
- const int n = mul_8(n8); - const int n4 = mul_2(n8); - const int K4 = div_up_4(input_sizes.x); - const int N4 = div_up_4(output_sizes.x); - - const int group_size = mul_4(K4_per_group); - - if (n >= output_sizes.x) { - return; - } - - FPOutTile out_tile; - initialize(out_tile); - - FPInputTile in_tile; - Int4WeightTile int4_weight_tile; - - FPPerOutChannelParams weight_scales_tile; - FPPerOutChannelParams weight_zeros_tile; - weight_zeros_tile.data[0] = VEC4_T(0.0); - weight_zeros_tile.data[1] = VEC4_T(0.0); - - // initialize the group index to a value larger than the largest possible - int cur_group_idx = input_sizes.x; - - for (int k4 = lid; k4 < div_up_4(input_sizes.x); k4 += WGS) { - const int group_idx = k4 / K4_per_group; - - // Only update the scales/zeros if the current iteration is now working on a - // new quantization group. - if (group_idx != cur_group_idx) { - load_weight_scales_tile_for_group(weight_scales_tile, n4, group_idx, N4); - cur_group_idx = group_idx; - } - - load_input_tile_no_checks(in_tile, k4, 0, K4, 1); - load_int4_weight_tile(int4_weight_tile, k4, n8, K4); - - fp_accumulate_with_int4_weight( - out_tile, - in_tile, - int4_weight_tile, - weight_scales_tile, - weight_zeros_tile); - } - - partial_sums[lid] = out_tile; - - memoryBarrierShared(); - barrier(); - - // Tree reduction to compute the overall result. - for (int i = WGS / 2; i > 0; i /= 2) { - if (lid < i) { - accumulate_out_tile_with_out_tile( - partial_sums[lid], partial_sums[lid + i]); - } - memoryBarrierShared(); - barrier(); - } - - // Only the first thread will write out result - if (lid == 0) { - out_tile = partial_sums[0]; - write_output_tile_with_checks(out_tile, n4, 0, N4, 1); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.yaml deleted file mode 100644 index bb5f44d4086..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_coop.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -linear_q4gsw_coop: - parameter_names_with_default_values: - DTYPE: float - IO_STORAGE: texture3d - WEIGHT_STORAGE: texture2d - TILE_M: 1 - TILE_K4: 1 - TILE_N8: 1 - WGS: 64 - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: linear_q4gsw_coop_texture3d_texture2d - - NAME: linear_q4gsw_coop_texture3d_buffer - WEIGHT_STORAGE: buffer - - NAME: linear_q4gsw_coop_buffer_texture2d - IO_STORAGE: buffer - - NAME: linear_q4gsw_coop_buffer_buffer - IO_STORAGE: buffer - WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.glsl deleted file mode 100644 index 0ad91643219..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.glsl +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, IO_STORAGE)} -#define T ${texel_load_component_type(DTYPE, IO_STORAGE)} - -$if IO_STORAGE == "buffer": - #define OUTPUT_BUFFER - #define INPUT_BUFFER -$if WEIGHT_STORAGE == "buffer": - #define WEIGHT_BUFFER - -#define TILE_N8 ${TILE_N8} - -#define TILE_M4 ${TILE_M4} -#define TILE_K4 ${TILE_K4} -#define TILE_N4 ${TILE_N8 * 2} - -#define TILE_M ${TILE_M4 * 4} -#define TILE_K ${TILE_K4 * 4} -#define TILE_N ${TILE_N8 * 8} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "common.glslh" - -${layout_declare_tensor(B, "w", "t_output", DTYPE, IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", DTYPE, IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_packed_int4_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} - -${layout_declare_ubo(B, "ivec4", "output_sizes")} -${layout_declare_ubo(B, "ivec4", "input_sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "apply_bias", "0")} -${layout_declare_spec_const(C, "int", "K4_per_group", "0")} - -#include "linear_fp_input_tile_load.glslh" -#include "linear_int4_weight_tile_load.glslh" -#include "linear_fp_weight_scales_load.glslh" -#include "linear_fp_bias_load.glslh" -#include "linear_fp_output_tile_fp_int4_compute.glslh" -#include "linear_fp_output_tile_fp_compute.glslh" -#include "linear_fp_output_tile_store.glslh" - -void main() { - const int out_tile_x = int(gl_GlobalInvocationID.x); - const int out_tile_y = int(gl_GlobalInvocationID.y); - - const int n = out_tile_x * TILE_N; - const int m = out_tile_y * TILE_M; - - const int n8 = div_8(n); - const int n4 = div_4(n); - const int m4 = div_4(m); - - if (n >= output_sizes.x || m >= output_sizes.y) { - return; - } - - const int M = input_sizes.y; - const int K4 = div_up_4(input_sizes.x); - const int N4 = div_up_4(output_sizes.x); // number of texels in each row - const int N8 = div_up_8(output_sizes.x); // number of texels in each row - - bool should_print = (n8 == 0) && (m4 == 0); - should_print = false; - - // VEC4_T out_texels[4][2]; - FPOutTile out_tile; - initialize(out_tile); - - FPInputTile in_tile; - Int4WeightTile int4_weight_tile; - - FPPerOutChannelParams weight_scales_tile; - FPPerOutChannelParams weight_zeros_tile; - weight_zeros_tile.data[0] = VEC4_T(0.0); - weight_zeros_tile.data[1] = VEC4_T(0.0); - - const int num_groups = K4 / K4_per_group; - - for (int group_i = 0; group_i < num_groups; ++group_i) { - // Load quantization scales and zeros for the current group - load_weight_scales_tile_for_group(weight_scales_tile, n4, group_i, N4); - - for (int k4_inner = 0; k4_inner < K4_per_group; k4_inner++) { - const int k4 = group_i * K4_per_group + k4_inner; - - load_input_tile_no_checks(in_tile, k4, m, K4, M); - load_int4_weight_tile(int4_weight_tile, k4, n8, K4); - - fp_accumulate_with_int4_weight( - out_tile, - in_tile, - int4_weight_tile, - weight_scales_tile, - weight_zeros_tile); - } - } - - write_output_tile_with_checks(out_tile, n4, m, N4, M); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.yaml deleted file mode 100644 index 5a6bcb711bb..00000000000 --- 
a/backends/vulkan/runtime/graph/ops/glsl/linear_q4gsw_tiled.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -linear_q4gsw_tiled: - parameter_names_with_default_values: - DTYPE: float - IO_STORAGE: texture3d - WEIGHT_STORAGE: texture2d - TILE_M4: 1 - TILE_K4: 1 - TILE_N8: 1 - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: linear_q4gsw_tiled_texture3d_texture2d - - NAME: linear_q4gsw_tiled_texture3d_buffer - WEIGHT_STORAGE: buffer - - NAME: linear_q4gsw_tiled_buffer_texture2d - IO_STORAGE: buffer - WEIGHT_STORAGE: texture2d - - NAME: linear_q4gsw_tiled_buffer_buffer - IO_STORAGE: buffer - WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q8csw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_q8csw_tiled.glsl deleted file mode 100644 index b6d932f0015..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_q8csw_tiled.glsl +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, IO_STORAGE)} -#define T ${texel_load_component_type(DTYPE, IO_STORAGE)} - -$if IO_STORAGE == "buffer": - #define OUTPUT_BUFFER - #define INPUT_BUFFER -$if WEIGHT_STORAGE == "buffer": - #define WEIGHT_BUFFER - -#define TILE_M4 ${TILE_M4} -#define TILE_K4 ${TILE_K4} -#define TILE_N4 ${TILE_N4} - -#define TILE_M ${TILE_M4 * 4} -#define TILE_K ${TILE_K4 * 4} -#define TILE_N ${TILE_N4 * 4} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_output", DTYPE, IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", DTYPE, IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} - -${layout_declare_ubo(B, "ivec4", "output_sizes")} -${layout_declare_ubo(B, "ivec4", "input_sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "uint", "apply_bias", "0")} - -#include "linear_fp_input_tile_load.glslh" -#include "linear_int8_weight_tile_load.glslh" -#include "linear_fp_weight_tile.glslh" -#include "linear_fp_output_tile_fp_compute.glslh" -#include "linear_fp_output_tile_fp_int8_compute.glslh" -#include "linear_fp_output_tile_store.glslh" -#include "linear_fp_weight_scales_load.glslh" -#include "linear_fp_bias_load.glslh" - -void main() { - // Each thread writes out a 4 wide x 4 high tile of output values - const int out_tile_x = int(gl_GlobalInvocationID.x); - const int out_tile_y = int(gl_GlobalInvocationID.y); - - const int n = out_tile_x * TILE_N; - const int m = out_tile_y * TILE_M; - - const int n4 = div_4(n); - const int m4 = div_4(m); - - if (n >= output_sizes.x || m >= output_sizes.y) { - return; - } - - const int M = input_sizes.y; - const int K4 = div_up_4(input_sizes.x); - const int N4 = div_up_4(output_sizes.x); - - FPOutTile out_tile; - 
initialize(out_tile); - - FPInputTile in_tile; - Int8WeightTile int8_weight_tile; - - const bool dont_check_bounds = (M - m) >= TILE_M; - if (dont_check_bounds) { - for (int k4 = 0; k4 < K4; k4 += TILE_K4) { - load_input_tile_no_checks(in_tile, k4, m, K4, M); - load_int8_weight_tile(int8_weight_tile, n4, k4, N4); - fp_accumulate_with_int8_weight(out_tile, in_tile, int8_weight_tile); - } - } else { - for (int k4 = 0; k4 < K4; k4 += TILE_K4) { - load_input_tile_with_checks(in_tile, k4, m, K4, M); - load_int8_weight_tile(int8_weight_tile, n4, k4, N4); - fp_accumulate_with_int8_weight(out_tile, in_tile, int8_weight_tile); - } - } - - FPPerOutChannelParams weight_scales_tile; - load_weight_scales_tile(weight_scales_tile, n4); - - if (apply_bias > 0) { - FPPerOutChannelParams bias_tile; - load_bias_tile(bias_tile, n4); - - apply_scales_and_biases(out_tile, weight_scales_tile, bias_tile); - } - else { - apply_scales(out_tile, weight_scales_tile); - } - - if (dont_check_bounds) { - write_output_tile_no_checks(out_tile, n4, m, N4, M); - } else { - write_output_tile_with_checks(out_tile, n4, m, N4, M); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q8csw_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_q8csw_tiled.yaml deleted file mode 100644 index 242c4471b3d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_q8csw_tiled.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -linear_q8csw_tiled: - parameter_names_with_default_values: - DTYPE: float - IO_STORAGE: texture3d - WEIGHT_STORAGE: texture2d - TILE_M4: 1 - TILE_N4: 1 - TILE_K4: 1 - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: linear_q8csw_tiled_texture3d_texture2d - - NAME: linear_q8csw_tiled_texture3d_buffer - WEIGHT_STORAGE: buffer - - NAME: linear_q8csw_tiled_buffer_texture2d - IO_STORAGE: buffer - WEIGHT_STORAGE: texture2d - - NAME: linear_q8csw_tiled_buffer_buffer - IO_STORAGE: buffer - WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q8ta_q8csw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_q8ta_q8csw_tiled.glsl deleted file mode 100644 index 9f7e00e3317..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_q8ta_q8csw_tiled.glsl +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, OUTPUT_STORAGE)} -#define T int - -$if OUTPUT_STORAGE == "buffer": - #define OUTPUT_BUFFER -$if PACKED_INT8_INPUT_STORAGE == "buffer": - #define PACKED_INT8_INPUT_BUFFER -$if WEIGHT_STORAGE == "buffer": - #define WEIGHT_BUFFER - -#define TILE_M4 ${TILE_M4} -#define TILE_K4 ${TILE_K4} -#define TILE_N4 ${TILE_N4} - -#define TILE_M ${TILE_M4 * 4} -#define TILE_K ${TILE_K4 * 4} -#define TILE_N ${TILE_N4 * 4} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_output", DTYPE, OUTPUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_packed_int8_input", "int", PACKED_INT8_INPUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_packed_int8_weight", "int", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_sums", "int", "buffer", is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_scales", DTYPE, "buffer", is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=False)} - -${layout_declare_spec_const(C, "int", "apply_bias", "0")} - -${layout_declare_ubo(B, "ivec4", "output_sizes")} -${layout_declare_ubo(B, "ivec4", "input_sizes")} - -layout(push_constant) uniform restrict Block { - float input_scale; - int input_zp; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#include "linear_int8_input_tile_load.glslh" -#include "linear_int8_weight_tile_load.glslh" -#include "linear_fp_output_tile_int8_int8_compute.glslh" -#include "linear_fp_output_tile_store.glslh" -#include "linear_fp_weight_scales_load.glslh" -#include "linear_int_weight_sums_load.glslh" -#include "linear_fp_bias_load.glslh" - -void main() { - // Each thread writes out a 4 wide x 4 high tile of output values - const int out_tile_x = int(gl_GlobalInvocationID.x); - const int out_tile_y = int(gl_GlobalInvocationID.y); - - const int n = out_tile_x * TILE_N; - const int m = out_tile_y * TILE_M; - - const int n4 = div_4(n); - const int m4 = div_4(m); - - if (n >= output_sizes.x || m >= output_sizes.y) { - return; - } - - const int M = output_sizes.y; - const int K4 = div_up_4(input_sizes.x); - const int N4 = div_up_4(output_sizes.x); - - Int32Accum out_accum; - initialize(out_accum); - - Int8InputTile int8_in_tile; - Int8WeightTile int8_weight_tile; - - // No checks are needed since packed input and weight are structured in units - // of 4x4 blocks. 
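Since the weights are quantized symmetrically per output channel while the activations carry a zero point, the int32 accumulator can be converted to a floating-point output with the help of the per-channel weight sums: sum(x * w) = input_scale * w_scale * (sum(q_x * q_w) - input_zp * sum(q_w)). The helper `accumulate_out_tile_with_int_accum` used below is defined in an included header that is not part of this diff, but it presumably applies this standard identity, which is also why `t_weight_sums` is precomputed rather than re-summed at inference time. The sketch below only checks the identity numerically with arbitrary values.

```python
import numpy as np

rng = np.random.default_rng(0)
K = 64
input_scale, input_zp = 0.05, 3   # per-tensor activation quantization (example values)
w_scale = 0.02                    # per-output-channel weight scale (one channel shown)

q_x = rng.integers(-128, 128, size=K)   # int8 activations, quantized with a zero point
q_w = rng.integers(-128, 128, size=K)   # int8 weights, symmetric (no zero point)

int_accum = int(np.dot(q_x, q_w))       # what the shader accumulates in int32
weight_sum = int(q_w.sum())             # what t_weight_sums stores for this channel

reference = np.dot(input_scale * (q_x - input_zp), w_scale * q_w)
corrected = input_scale * w_scale * (int_accum - input_zp * weight_sum)
assert np.isclose(reference, corrected)
```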
- for (int k4 = 0; k4 < K4; k4 += TILE_K4) { - load_int8_input_tile(int8_in_tile, k4, m4, K4); - load_int8_weight_tile(int8_weight_tile, n4, k4, N4); - - int_accumulate_with_int8_weight(out_accum, int8_in_tile, int8_weight_tile); - } - - FPPerOutChannelParams weight_scales_tile; - load_weight_scales_tile(weight_scales_tile, n4); - - IntPerOutChannelParams weight_sums_tile; - load_weight_sums_tile(weight_sums_tile, n4); - - FPOutTile out_tile; - initialize(out_tile); - - if (apply_bias > 0) { - FPPerOutChannelParams bias_tile; - load_bias_tile(bias_tile, n4); - - accumulate_out_tile_with_int_accum( - out_tile, - out_accum, - input_scale, - input_zp, - weight_sums_tile, - weight_scales_tile, - bias_tile); - } - else { - accumulate_out_tile_with_int_accum( - out_tile, - out_accum, - input_scale, - input_zp, - weight_sums_tile, - weight_scales_tile); - } - - if (M - m >= TILE_M) { - write_output_tile_no_checks(out_tile, n4, m, N4, M); - } else { - write_output_tile_with_checks(out_tile, n4, m, N4, M); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_q8ta_q8csw_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_q8ta_q8csw_tiled.yaml deleted file mode 100644 index aa1de3077fc..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_q8ta_q8csw_tiled.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -linear_q8ta_q8csw_tiled: - parameter_names_with_default_values: - DTYPE: float - OUTPUT_STORAGE: texture3d - PACKED_INT8_INPUT_STORAGE: buffer - WEIGHT_STORAGE: texture2d - TILE_M4: 1 - TILE_N4: 1 - TILE_K4: 1 - generate_variant_forall: - DTYPE: - - VALUE: float - shader_variants: - - NAME: linear_q8ta_q8csw_tiled_texture3d_buffer_texture2d - - NAME: linear_q8ta_q8csw_tiled_buffer_buffer_texture2d - OUTPUT_STORAGE: buffer - PACKED_INT8_INPUT_STORAGE: buffer - WEIGHT_STORAGE: texture2d diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl deleted file mode 100644 index 4dd83f0d4ed..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} -#define FLOAT_T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type(STORAGE)} - -${define_required_extensions(DTYPE)} -$if STORAGE == "buffer": - ${define_required_extensions("int8")} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_mat1", DTYPE, STORAGE)} -${layout_declare_tensor(2, "r", "t_qmat2", "int8", STORAGE)} -${layout_declare_tensor(3, "r", "t_scales", DTYPE, STORAGE)} - -$if STORAGE == "buffer": - layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 out_strides; - ivec4 mat1_sizes; - ivec4 mat1_strides; - ivec4 qmat2_strides; - ivec4 scales_strides; - int out_numel; - }; -$else: - layout(push_constant) uniform restrict Block { - ivec3 out_limits; - ivec4 mat1_sizes; - }; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -// This header file must be defined after the layout descriptors have been -// declared because the functions in the header assume some variables have been -// declared as layout descriptors. - -#ifdef USING_BUFFER - -#ifndef FLOAT_T -#define FLOAT_T float -#endif - -void main() { - const int out_bufi = int(gl_GlobalInvocationID.x); - if (out_bufi >= out_numel) { - return; - } - - const ivec4 out_tidx = contiguous_bufi_to_tidx(out_bufi, out_strides); - - const FLOAT_T scale = t_scales[out_tidx.x]; - - FLOAT_T outval = FLOAT_T(0.0); - - int mat1_offset = out_tidx.y * mat1_strides.y + out_tidx.z * qmat2_strides.z; - int qmat2_offset = out_tidx.x; - - // TODO(ssjia): optimize memory access pattern by traversing mat1 x in inner loop - for (int i = 0; i < mat1_sizes.x; i++) { - const FLOAT_T mat1_val = t_mat1[mat1_offset]; - const FLOAT_T mat2_val = FLOAT_T(t_qmat2[qmat2_offset]); - - outval += mat1_val * mat2_val; - - mat1_offset++; - qmat2_offset += qmat2_strides.y; - } - - t_out[out_bufi] = outval * scale; -} - -#else // USING_TEXTURE - -void main() { - const ivec2 out_pos = ivec2( - gl_GlobalInvocationID.x % out_limits.x, - gl_GlobalInvocationID.x / out_limits.x); - - if (out_pos.y >= out_limits.y) { - return; - } - - const int qmat2_pos_x = out_pos.x; - - VEC4_T outtex = VEC4_T(0); - - const VEC4_T scales = load_texel(t_scales, ivec3(out_pos.x, 0, 0)); - - VEC4_T mat1_tex; - VEC4_T mat2_tex[4]; - for ( - int i = 0, x = 0; - i < mat1_sizes.x; - i += 4, x++) - { - mat1_tex = load_texel(t_mat1, ivec3(x, out_pos.y, 0)); - - mat2_tex[0] = load_texel(t_qmat2, ivec3(out_pos.x, i, 0)); - mat2_tex[1] = load_texel(t_qmat2, ivec3(out_pos.x, i + 1, 0)); - mat2_tex[2] = load_texel(t_qmat2, ivec3(out_pos.x, i + 2, 0)); - mat2_tex[3] = load_texel(t_qmat2, ivec3(out_pos.x, i + 3, 0)); - - outtex += mat1_tex.x * mat2_tex[0] + mat1_tex.y * mat2_tex[1] + mat1_tex.z * mat2_tex[2] + mat1_tex.w * mat2_tex[3]; - } - - outtex *= scales; - write_texel(t_out, ivec3(out_pos, 0), outtex); -} - -#endif diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.yaml deleted file mode 100644 index 800007406f0..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.yaml +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -linear_qcsnw: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - MAT1_PACKING: W_packed - MAT2_PACKING: W_packed - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - STORAGE: - - VALUE: texture3d - - VALUE: buffer - shader_variants: - - NAME: linear_qcs8w_W_packed_W_packed - - NAME: linear_qcs8w_W_packed_H_packed - MAT2_PACKING: H_packed diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.glsl deleted file mode 100644 index c766a3cd7d0..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.glsl +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} -#define VEC4_T ${buffer_gvec_type(DTYPE, 4)} - -#define TILE_ROWS ${TILE_ROWS} -#define TILE_TXCOLS ${TILE_TXCOLS} - -#define NGROUPS 8 -#define NWORKERS 8 - -${define_required_extensions(DTYPE)} - -$if WEIGHT_STORAGE == "buffer": - ${define_required_extensions("int8")} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, IN_STORAGE, is_scalar_array=False)} -$if QUANT_NBITS == 4: - ${layout_declare_tensor(B, "r", "t_weight", "uint8", WEIGHT_STORAGE, is_scalar_array=False)} -$else: - ${layout_declare_tensor(B, "r", "t_weight", "int8", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_scales", DTYPE, SCALES_STORAGE, is_scalar_array=False)} - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; - ivec4 weight_sizes; -}; - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -shared VEC4_T partial_sums[NGROUPS][NWORKERS][TILE_ROWS][TILE_TXCOLS]; - -void main() { - // txcol stands for "texel column". One txcol corresponds to 4 scalar columns. 
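The index math at the top of `main()` in these shaders maps the flat invocation index to a (texel column, row) tile origin: one texel column covers 4 scalar output columns, and `divup` determines how many column tiles each output row needs. A small sketch of that mapping for the default TILE_TXCOLS=1, TILE_ROWS=4 configuration (function and parameter names are illustrative):

```python
def divup(a, b):
    return (a + b - 1) // b

def decode_invocation(gid_x, out_width, tile_txcols=1, tile_rows=4):
    # Mirror of the index math at the top of the qcsnw shaders: map a flat
    # invocation index to the (texel column, row) origin of its output tile.
    global_wg_x = divup(out_width, 4 * tile_txcols)   # column tiles per output row
    out_txcol = (gid_x % global_wg_x) * tile_txcols   # 1 txcol == 4 scalar columns
    out_row = (gid_x // global_wg_x) * tile_rows
    return out_txcol, out_row

print(decode_invocation(0, out_width=40))   # (0, 0)
print(decode_invocation(9, out_width=40))   # (9, 0) -> scalar columns 36..39
print(decode_invocation(10, out_width=40))  # (0, 4) -> wraps to the next row tile
```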
- $if TILE_TXCOLS > 1: - const uint global_wg_x = uint(divup(out_sizes.x, 4 * TILE_TXCOLS)); - const uint out_txcol = uint( - (gl_GlobalInvocationID.x % global_wg_x) * TILE_TXCOLS); - $else: - const uint global_wg_x = uint(divup4(out_sizes.x)); - const uint out_txcol = uint(gl_GlobalInvocationID.x % global_wg_x); - - const uint out_row = uint( - (gl_GlobalInvocationID.x / global_wg_x) * TILE_ROWS); - - $if QUANT_NBITS == 4: - const uint weight_txcol = uint(out_txcol / 2); - - const int gid = int(gl_LocalInvocationID.x); // group id - const int wid = int(gl_LocalInvocationID.z); // worker id - - if (out_row >= out_sizes.y) { - return; - } - - VEC4_T mat1[TILE_ROWS]; - VEC4_T qmat2[4][TILE_TXCOLS]; - VEC4_T local_sums[TILE_ROWS][TILE_TXCOLS]; - - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $for c in range(TILE_TXCOLS): - local_sums[r][${c}] = VEC4_T(0.0); - } - - VEC4_T scales[TILE_TXCOLS]; - $for c in range(TILE_TXCOLS): - $if SCALES_STORAGE == "buffer": - scales[${c}] = VEC4_T(t_scales[out_txcol + ${c}]); - $else: - scales[${c}] = VEC4_T( - texelFetch(t_scales, ivec2(out_txcol + ${c}, 0), 0)); - - for (int pos = (4 * wid), txpos = wid; - pos < in_sizes.x; - pos += (4 * NWORKERS), txpos += NWORKERS) { - $if WEIGHT_STORAGE == "buffer": - uint qmat2_bufi; - uint weight_row_txstride = div4(weight_sizes.x); - - // Preload weight tensor - [[unroll]] for (int r = 0; r < 4; r++) { - $if QUANT_NBITS == 4: - $for c in range(0, TILE_TXCOLS, 2): - $if WEIGHT_STORAGE == "buffer": - qmat2_bufi = (pos + r) * weight_row_txstride + weight_txcol; - const u8vec4 packed_weight_tex = t_weight[qmat2_bufi + ${c}] - $else: - const uvec4 packed_weight_tex = texelFetch( - t_weight, ivec2(weight_txcol + ${c}, pos + r), 0); - - qmat2[r][${c}] = (VEC4_T((packed_weight_tex & 0xF0) >> 4) - 8.0); - qmat2[r][${c + 1}] = (VEC4_T(packed_weight_tex & 0x0F) - 8.0); - $else: - $for c in range(TILE_TXCOLS): - $if WEIGHT_STORAGE == "buffer": - qmat2_bufi = (pos + r) * weight_row_txstride + out_txcol; - qmat2[r][${c}] = t_weight[qmat2_bufi + ${c}]; - $else: - qmat2[r][${c}] = VEC4_T( - texelFetch(t_weight, ivec2(out_txcol + ${c}, pos + r), 0)); - } - - $if IN_STORAGE == "buffer": - uint in_row_txstride = div4(in_sizes.x); - - // Preload input tensor - [[unroll]] for (int i = 0; i < TILE_ROWS; i++) { - $if IN_STORAGE == "buffer": - mat1[i] = t_in[(out_row + i) * in_row_txstride + txpos]; - $else: - mat1[i] = VEC4_T( - texelFetch(t_in, ivec3(txpos, out_row + i, 0), 0)); - } - - // Accumulate partial output - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $for c in range(TILE_TXCOLS): - local_sums[r][${c}] += mat1[r].x * qmat2[0][${c}] + - mat1[r].y * qmat2[1][${c}] + - mat1[r].z * qmat2[2][${c}] + - mat1[r].w * qmat2[3][${c}]; - } - } - - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $for c in range(TILE_TXCOLS): - partial_sums[gid][wid][r][${c}] = local_sums[r][${c}]; - } - - memoryBarrierShared(); - barrier(); - - if (wid != 0) { - return; - } - - VEC4_T sums[TILE_ROWS][TILE_TXCOLS]; - - for (int r = 0; r < TILE_ROWS; ++r) { - $for c in range(TILE_TXCOLS): - sums[r][${c}] = VEC4_T(0.0); - - [[unroll]] for (int worker = 0; worker < NWORKERS; ++worker) { - $for c in range(TILE_TXCOLS): - sums[r][${c}] += partial_sums[gid][worker][r][${c}]; - } - } - - $if OUT_STORAGE == "buffer": - uint out_bufi; - uint out_row_txstride = div4(out_sizes.x); - - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $for c in range(TILE_TXCOLS): - $if OUT_STORAGE == "buffer": - if (out_row + r < out_sizes.y) { - out_bufi = (out_row + r) 
* out_row_txstride + out_txcol; - t_out[out_bufi + ${c}] = sums[r][${c}] * scales[${c}]; - } - $else: - imageStore( - t_out, - ivec3(out_txcol + ${c}, out_row + r, 0), - sums[r][${c}] * scales[${c}]); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.yaml deleted file mode 100644 index 3dff6855142..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_coop.yaml +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -linear_qcsnw_coop: - parameter_names_with_default_values: - DTYPE: float - IN_STORAGE: texture3d - OUT_STORAGE: texture3d - WEIGHT_STORAGE: texture2d - SCALES_STORAGE: texture2d - TILE_ROWS: 4 - TILE_TXCOLS: 1 - QUANT_NBITS: 8 - generate_variant_forall: - TILE_ROWS: - - VALUE: 1 - SUFFIX: o4x1 - shader_variants: - - NAME: linear_qcs8w_coop_texture3d_texture3d_texture2d_texture2d_float - - NAME: linear_qcs8w_coop_buffer_buffer_texture2d_texture2d_float - IN_STORAGE: buffer - OUT_STORAGE: buffer - - NAME: linear_qcs8w_coop_buffer_buffer_buffer_buffer_float - IN_STORAGE: buffer - OUT_STORAGE: buffer - WEIGHT_STORAGE: buffer - SCALES_STORAGE: buffer - - NAME: linear_qcs4w_coop_texture3d_texture3d_texture2d_texture2d_float - TILE_TXCOLS: 2 - QUANT_NBITS: 4 - - NAME: linear_qcs4w_coop_buffer_buffer_texture2d_texture2d_float - IN_STORAGE: buffer - OUT_STORAGE: buffer - TILE_TXCOLS: 2 - QUANT_NBITS: 4 diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl deleted file mode 100644 index f6f05aab7ca..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.glsl +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} -#define VEC4_T ${buffer_gvec_type(DTYPE, 4)} - -#define TILE_ROWS ${TILE_ROWS} -#define TILE_TXCOLS ${TILE_TXCOLS} - -${define_required_extensions(DTYPE)} - -$if WEIGHT_STORAGE == "buffer": - ${define_required_extensions("int8")} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, IN_STORAGE, is_scalar_array=False)} -$if QUANT_NBITS == 4: - ${layout_declare_tensor(B, "r", "t_weight", "uint8", WEIGHT_STORAGE, is_scalar_array=False)} -$else: - ${layout_declare_tensor(B, "r", "t_weight", "int8", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_scales", DTYPE, SCALES_STORAGE, is_scalar_array=False)} - - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; - ivec4 weight_sizes; -}; - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require - -void main() { - // txcol stands for "texel column". One txcol corresponds to 4 scalar columns. 
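When QUANT_NBITS is 4, the cooperative shader above and the tiled shader below store two weights per byte and recover them with `((b & 0xF0) >> 4) - 8` and `(b & 0x0F) - 8`, i.e. unsigned nibbles 0..15 are shifted down to the signed range -8..7. A small sketch of that unpacking (illustration only):

```python
def unpack_q4_byte(b):
    # Mirror of the 4-bit unpacking in the qcs4w shader variants: the high
    # nibble feeds the even texel column, the low nibble the odd one.
    hi = ((b & 0xF0) >> 4) - 8
    lo = (b & 0x0F) - 8
    return hi, lo

print(unpack_q4_byte(0x00))  # (-8, -8)
print(unpack_q4_byte(0xFF))  # (7, 7)
print(unpack_q4_byte(0x8A))  # (0, 2)
```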
- $if TILE_TXCOLS > 1: - const uint16_t global_wg_x = uint16_t(divup(out_sizes.x, 4 * TILE_TXCOLS)); - const uint16_t out_txcol = uint16_t( - (gl_GlobalInvocationID.x % global_wg_x) * TILE_TXCOLS); - $else: - const uint16_t global_wg_x = uint16_t(divup4(out_sizes.x)); - const uint16_t out_txcol = uint16_t(gl_GlobalInvocationID.x % global_wg_x); - - const uint16_t out_row = uint16_t( - (gl_GlobalInvocationID.x / global_wg_x) * TILE_ROWS); - - $if QUANT_NBITS == 4: - const uint16_t weight_txcol = uint16_t(out_txcol / 2); - - if (out_row >= uint16_t(out_sizes.y)) { - return; - } - - VEC4_T mat1[TILE_ROWS]; - VEC4_T qmat2[4][TILE_TXCOLS]; - VEC4_T sums[TILE_ROWS][TILE_TXCOLS]; - - VEC4_T scales[TILE_TXCOLS]; - $for c in range(TILE_TXCOLS): - $if SCALES_STORAGE == "buffer": - scales[${c}] = VEC4_T(t_scales[out_txcol + ${c}]); - $else: - scales[${c}] = VEC4_T( - texelFetch(t_scales, u16vec2(out_txcol + ${c}, 0), 0)); - - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $for c in range(TILE_TXCOLS): - sums[r][${c}] = VEC4_T(0.0); - } - - for (uint16_t pos = uint16_t(0), txpos = uint16_t(0); - pos < uint16_t(in_sizes.x); - pos += uint16_t(4), txpos += uint16_t(1)) { - $if WEIGHT_STORAGE == "buffer": - uint qmat2_bufi; - uint weight_row_txstride = div4(weight_sizes.x); - - // Preload weight tensor - [[unroll]] for (int r = 0; r < 4; r++) { - $if QUANT_NBITS == 4: - $for c in range(0, TILE_TXCOLS, 2): - $if WEIGHT_STORAGE == "buffer": - qmat2_bufi = (pos + r) * weight_row_txstride + weight_txcol; - const u8vec4 packed_weight_tex = t_weight[qmat2_bufi + ${c}] - $else: - const uvec4 packed_weight_tex = texelFetch( - t_weight, u16vec2(weight_txcol + ${c}, pos + r), 0); - - qmat2[r][${c}] = (VEC4_T((packed_weight_tex & 0xF0) >> 4) - 8.0); - qmat2[r][${c + 1}] = (VEC4_T(packed_weight_tex & 0x0F) - 8.0); - $else: - $for c in range(TILE_TXCOLS): - $if WEIGHT_STORAGE == "buffer": - qmat2_bufi = (pos + r) * weight_row_txstride + out_txcol; - qmat2[r][${c}] = t_weight[qmat2_bufi + ${c}]; - $else: - qmat2[r][${c}] = VEC4_T( - texelFetch(t_weight, u16vec2(out_txcol + ${c}, pos + r), 0)); - } - - $if IN_STORAGE == "buffer": - uint in_row_txstride = div4(in_sizes.x); - - // Preload input tensor - [[unroll]] for (int i = 0; i < TILE_ROWS; i++) { - $if IN_STORAGE == "buffer": - mat1[i] = t_in[(out_row + i) * in_row_txstride + txpos]; - $else: - mat1[i] = VEC4_T( - texelFetch(t_in, u16vec3(txpos, out_row + i, 0), 0)); - } - - // Accumulate output - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $for c in range(TILE_TXCOLS): - sums[r][${c}] += mat1[r].x * qmat2[0][${c}] + - mat1[r].y * qmat2[1][${c}] + - mat1[r].z * qmat2[2][${c}] + - mat1[r].w * qmat2[3][${c}]; - } - } - - // Store to output tensor - $if OUT_STORAGE == "buffer": - uint out_bufi; - uint out_row_txstride = div4(out_sizes.x); - - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $for c in range(TILE_TXCOLS): - $if OUT_STORAGE == "buffer": - if (out_row + r < out_sizes.y) { - out_bufi = (out_row + r) * out_row_txstride + out_txcol; - t_out[out_bufi + ${c}] = sums[r][${c}] * scales[${c}]; - } - $else: - imageStore( - t_out, - ivec3(out_txcol + ${c}, out_row + r, 0), - sums[r][${c}] * scales[${c}]); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.yaml deleted file mode 100644 index 1c9ec4e524a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw_tiled.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. 
and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -linear_qcsnw_tiled: - parameter_names_with_default_values: - DTYPE: float - IN_STORAGE: texture3d - OUT_STORAGE: texture3d - WEIGHT_STORAGE: texture2d - SCALES_STORAGE: texture2d - TILE_ROWS: 4 - TILE_TXCOLS: 1 - QUANT_NBITS: 8 - generate_variant_forall: - TILE_ROWS: - - VALUE: 1 - SUFFIX: o4x1 - - VALUE: 2 - SUFFIX: o4x2 - - VALUE: 4 - SUFFIX: o4x4 - shader_variants: - - NAME: linear_qcs8w_tiled_texture3d_texture3d_texture2d_texture2d_float - - NAME: linear_qcs8w_tiled_buffer_buffer_texture2d_texture2d_float - IN_STORAGE: buffer - OUT_STORAGE: buffer - - NAME: linear_qcs8w_tiled_buffer_buffer_buffer_buffer_float - IN_STORAGE: buffer - OUT_STORAGE: buffer - WEIGHT_STORAGE: buffer - SCALES_STORAGE: buffer - - NAME: linear_qcs4w_tiled_texture3d_texture3d_texture2d_texture2d_float - TILE_TXCOLS: 2 - QUANT_NBITS: 4 - - NAME: linear_qcs4w_tiled_buffer_buffer_texture2d_texture2d_float - IN_STORAGE: buffer - OUT_STORAGE: buffer - TILE_TXCOLS: 2 - QUANT_NBITS: 4 diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_coop.glsl deleted file mode 100644 index 150efeef1ad..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_coop.glsl +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${texel_load_component_type(DTYPE, IO_STORAGE)} -#define VEC4_T ${texel_load_type(DTYPE, IO_STORAGE)} - -#define WGS ${WGS} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "w", "t_output", DTYPE, IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", DTYPE, IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_qmat2", "uint", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_qparams", DTYPE, "buffer", is_scalar_array=False)} - -layout(push_constant) uniform restrict Block { - ivec4 output_sizes; - ivec4 input_sizes; - ivec4 weight_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int group_size = 64; - -shared VEC4_T partial_sums[WGS][2]; - -$if IO_STORAGE == "buffer": - #define BUFFER_IO -$if WEIGHT_STORAGE == "buffer": - #define BUFFER_WEIGHT - -#include "qlinear_utils.glslh" - -void main() { - const uint lid = gl_LocalInvocationID.x; - const uint n8 = gl_GlobalInvocationID.y; - // The output tensor will have a shape of [n, 1, 1, 1]. Each thread computes - // 8 output elements, so each thread will write to 8 elements starting at the - // tensor index (gid.x * 8, 0, 0, 0). - const uint n = MUL_8(n8); - const uint K4 = DIV_UP_4(input_sizes.x); - - if (n >= output_sizes.x) { - return; - } - - VEC4_T out_texels[2]; - out_texels[0] = VEC4_T(0); - out_texels[1] = VEC4_T(0); - - // initialize the group index to a value larger than the largest possible - uint cur_group_idx = input_sizes.x; - - // Each thread in the work group accumulates a partial result. 
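In the loop that follows, the per-group quantization parameters are read from `t_qparams`, whose 4-wide texels interleave (scale, zero) pairs for two adjacent output channels; the `.xz`/`.yw` swizzles then de-interleave four texels into scale and zero vectors covering 8 channels. The sketch below mirrors that decoding; the flattened buffer layout is inferred from the `qparams_bufi` index math, and the helper name is illustrative.

```python
import numpy as np

def load_group_qparams(qparams, n, group_idx, N):
    # Decode (scale, zero) for 8 adjacent output channels of one quantization
    # group. qparams is assumed flattened to [num_groups * N/2, 4] texels, each
    # texel holding (scale_c, zero_c, scale_c+1, zero_c+1) for two channels.
    base = group_idx * (N // 2) + n // 2
    texels = qparams[base:base + 4]           # 4 texels -> 8 output channels
    scales = texels[:, [0, 2]].reshape(-1)    # the .xz components
    zeros = texels[:, [1, 3]].reshape(-1)     # the .yw components
    return scales, zeros

# Tiny example: 8 output channels, 1 group; channel c has scale c+1 and zero -c.
N = 8
qparams = np.array([[c + 1, -c, c + 2, -(c + 1)] for c in range(0, N, 2)], dtype=float)
scales, zeros = load_group_qparams(qparams, n=0, group_idx=0, N=N)
print(scales)  # [1. 2. 3. 4. 5. 6. 7. 8.]
print(zeros)   # [ 0. -1. -2. -3. -4. -5. -6. -7.]
```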
- for (uint k4 = lid; k4 < DIV_UP_4(input_sizes.x); k4 += WGS) { - const uint k = MUL_4(k4); - const uint group_idx = k / group_size; - - VEC4_T scales[2]; - VEC4_T zeros[2]; - - // Only update the scales/zeros if the current iteration is now working on a - // new quantization group. - if (group_idx != cur_group_idx) { - // The qparams tensor contains the quantization scales and zeros, with - // shape [2, N, K / group_size, 1]. - // Loading a texel from the qparams tensor will return 2 scales and 2 - // zeros for 2 adjacent output channels. - uint qparams_bufi = group_idx * DIV_2(output_sizes.x) + DIV_2(n); - VEC4_T scales_zeros_texels[4]; - $for comp in range(4): - scales_zeros_texels[${comp}] = t_qparams[qparams_bufi++]; - - scales[0] = VEC4_T(scales_zeros_texels[0].xz, scales_zeros_texels[1].xz); - zeros[0] = VEC4_T(scales_zeros_texels[0].yw, scales_zeros_texels[1].yw); - - scales[1] = VEC4_T(scales_zeros_texels[2].xz, scales_zeros_texels[3].xz); - zeros[1] = VEC4_T(scales_zeros_texels[2].yw, scales_zeros_texels[3].yw); - - cur_group_idx = group_idx; - } - // The input tensor will have a shape of [K, 1, 1, 1]; in each iteration, - // load 4 elements starting from the tensor index (k, 0, 0, 0). - VEC4_T in_texel = load_input_texel_1d(k4); - // Extract each element of the in_texel into a separate vectorized variable; - // these are used to "broadcast" the input values in subsequent fma calls. - VEC4_T in_texel_val[4]; - $for comp in range(4): - in_texel_val[${comp}] = VEC4_T(in_texel[${comp}]); - - uvec4 packed_weight_block = load_transposed_weight_block(k4, n8, K4); - - VEC4_T weight_texels[2]; - $for comp in range(4): - { - weight_texels[0].x = extract_4bit_from_transposed_block(packed_weight_block, 0, ${comp}); - weight_texels[0].y = extract_4bit_from_transposed_block(packed_weight_block, 1, ${comp}); - weight_texels[0].z = extract_4bit_from_transposed_block(packed_weight_block, 2, ${comp}); - weight_texels[0].w = extract_4bit_from_transposed_block(packed_weight_block, 3, ${comp}); - - weight_texels[1].x = extract_4bit_from_transposed_block(packed_weight_block, 4, ${comp}); - weight_texels[1].y = extract_4bit_from_transposed_block(packed_weight_block, 5, ${comp}); - weight_texels[1].z = extract_4bit_from_transposed_block(packed_weight_block, 6, ${comp}); - weight_texels[1].w = extract_4bit_from_transposed_block(packed_weight_block, 7, ${comp}); - - weight_texels[0] = fma(weight_texels[0], scales[0], zeros[0]); - weight_texels[1] = fma(weight_texels[1], scales[1], zeros[1]); - - out_texels[0] = fma(in_texel_val[${comp}], weight_texels[0], out_texels[0]); - out_texels[1] = fma(in_texel_val[${comp}], weight_texels[1], out_texels[1]); - } - } - - partial_sums[lid][0] = out_texels[0]; - partial_sums[lid][1] = out_texels[1]; - - memoryBarrierShared(); - barrier(); - - // Tree reduction to compute the overall result. 
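The reduction that follows repeatedly folds the upper half of the shared-memory partial sums into the lower half, so the combined result is ready in the first slot after log2(WGS) steps; this assumes the work-group size is a power of two, which holds for the default WGS of 64. A minimal sketch of the pattern:

```python
def tree_reduce(partials):
    # Mirror of the shared-memory reduction in the co-op shaders: each step,
    # the first half of the slots absorbs the second half, halving the count.
    vals = list(partials)
    i = len(vals) // 2
    while i > 0:
        for lid in range(i):
            vals[lid] += vals[lid + i]
        i //= 2
    return vals[0]

print(tree_reduce([1, 2, 3, 4, 5, 6, 7, 8]))  # 36, after log2(8) = 3 folding steps
```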
- for (int i = WGS / 2; i > 0; i /= 2) { - if (lid < i) { - partial_sums[lid][0] = partial_sums[lid][0] + partial_sums[lid + i][0]; - partial_sums[lid][1] = partial_sums[lid][1] + partial_sums[lid + i][1]; - } - memoryBarrierShared(); - barrier(); - } - - // Only the first thread will write out result - if (lid == 0) { - out_texels[0] = partial_sums[0][0]; - out_texels[1] = partial_sums[0][1]; - - uint n4 = DIV_4(n); - write_output_texel_1d(out_texels[0], n4); - if (n + 4 < output_sizes.x) { - write_output_texel_1d(out_texels[1], n4 + 1); - } - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_coop.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_coop.yaml deleted file mode 100644 index 04e803a2e94..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_coop.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -linear_qga4w_coop: - parameter_names_with_default_values: - DTYPE: float - IO_STORAGE: texture3d - WEIGHT_STORAGE: texture2d - WGS: 64 - shader_variants: - - NAME: linear_qga4w_coop_texture3d_texture3d_texture2d_float - - NAME: linear_qga4w_coop_buffer_buffer_texture2d_float - IO_STORAGE: buffer - - NAME: linear_qga4w_coop_buffer_buffer_buffer_float - IO_STORAGE: buffer - WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_tiled.glsl deleted file mode 100644 index 97327ea5818..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_tiled.glsl +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${texel_load_component_type(DTYPE, IO_STORAGE)} -#define VEC4_T ${texel_load_type(DTYPE, IO_STORAGE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "w", "t_output", DTYPE, IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", DTYPE, IO_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_qmat2", "uint", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_qparams", DTYPE, "buffer", is_scalar_array=False)} - -layout(push_constant) uniform restrict Block { - ivec4 output_sizes; - ivec4 input_sizes; - ivec4 weight_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int group_size = 64; - -$if IO_STORAGE == "buffer": - #define BUFFER_IO -$if WEIGHT_STORAGE == "buffer": - #define BUFFER_WEIGHT - -#include "qlinear_utils.glslh" - -void main() { - // Each thread writes out a 8 wide x 4 high tile of output values - const uint n8 = gl_GlobalInvocationID.x; - const uint m4 = gl_GlobalInvocationID.y; - - const uint n = MUL_8(n8); // output col idx - const uint m = MUL_4(m4); // output row idx - const uint n4 = MUL_2(n8); // output col texel idx - - const uint group_num = input_sizes.x / group_size; - const uint group_ntexels = DIV_UP_4(group_size); - - if (n >= output_sizes.x || m >= output_sizes.y) { - return; - } - - const uint K4 = DIV_UP_4(input_sizes.x); - const uint N4 = DIV_UP_4(output_sizes.x); // number of texels in each row - - VEC4_T out_texels[4][2]; - // Initialize to 0 - $for row_i in range(4): - $for col_i in range(2): - out_texels[${row_i}][${col_i}] = VEC4_T(0.00); - - for (uint group_i = 0; group_i < group_num; ++group_i) { - // Load quantization scales and zeros for the current group - VEC4_T scales[2]; - VEC4_T zeros[2]; - { - uint qparams_bufi = group_i * DIV_2(output_sizes.x) + DIV_2(n); - - VEC4_T scales_zeros_texels[4]; - $for comp in range(4): - scales_zeros_texels[${comp}] = t_qparams[qparams_bufi++]; - - scales[0] = VEC4_T(scales_zeros_texels[0].xz, scales_zeros_texels[1].xz); - zeros[0] = VEC4_T(scales_zeros_texels[0].yw, scales_zeros_texels[1].yw); - - scales[1] = VEC4_T(scales_zeros_texels[2].xz, scales_zeros_texels[3].xz); - zeros[1] = VEC4_T(scales_zeros_texels[2].yw, scales_zeros_texels[3].yw); - } - - for (uint inner_k4 = 0; inner_k4 < group_ntexels; inner_k4++) { - const uint k4 = group_i * group_ntexels + inner_k4; - - // Load 4x4 block of the input tensor, with the top left corner of the - // block at (k, m) - VEC4_T in_texels[4]; - $for comp in range(4): - in_texels[${comp}] = load_input_texel_2d(k4, m + ${comp}, K4); - - uvec4 packed_weight_block = load_transposed_weight_block(k4, n8, K4); - - VEC4_T weight_texels[2]; - $for tile_k in range(4): - // Process weight row k + comp - { - // Weight columns n + 0, 1, 2, 3 - weight_texels[0].x = extract_4bit_from_transposed_block(packed_weight_block, 0, ${tile_k}); - weight_texels[0].y = extract_4bit_from_transposed_block(packed_weight_block, 1, ${tile_k}); - weight_texels[0].z = extract_4bit_from_transposed_block(packed_weight_block, 2, ${tile_k}); - weight_texels[0].w = extract_4bit_from_transposed_block(packed_weight_block, 3, ${tile_k}); - - // Weight colums n + 4, 5, 6, 7 - weight_texels[1].x = extract_4bit_from_transposed_block(packed_weight_block, 4, ${tile_k}); - weight_texels[1].y = 
extract_4bit_from_transposed_block(packed_weight_block, 5, ${tile_k}); - weight_texels[1].z = extract_4bit_from_transposed_block(packed_weight_block, 6, ${tile_k}); - weight_texels[1].w = extract_4bit_from_transposed_block(packed_weight_block, 7, ${tile_k}); - - weight_texels[0] = fma(weight_texels[0], scales[0], zeros[0]); - weight_texels[1] = fma(weight_texels[1], scales[1], zeros[1]); - - $for tile_m in range(4): - out_texels[${tile_m}][0] = fma(VEC4_T(in_texels[${tile_m}][${tile_k}]), weight_texels[0], out_texels[${tile_m}][0]); - out_texels[${tile_m}][1] = fma(VEC4_T(in_texels[${tile_m}][${tile_k}]), weight_texels[1], out_texels[${tile_m}][1]); - } - } - } - - for (uint row_i = 0; row_i < 4 && m + row_i < output_sizes.y; ++row_i) { - write_output_texel_2d(out_texels[row_i][0], n4, m + row_i, N4); - if (n + 4 < output_sizes.x) { - write_output_texel_2d(out_texels[row_i][1], n4 + 1, m + row_i, N4); - } - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_tiled.yaml deleted file mode 100644 index 94d10dcf978..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qga4w_tiled.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -linear_qga4w_tiled: - parameter_names_with_default_values: - DTYPE: float - IO_STORAGE: texture3d - WEIGHT_STORAGE: texture2d - shader_variants: - - NAME: linear_qga4w_tiled_texture3d_texture3d_texture2d_float - - NAME: linear_qga4w_tiled_buffer_buffer_texture2d_float - IO_STORAGE: buffer - - NAME: linear_qga4w_tiled_buffer_buffer_buffer_float - IO_STORAGE: buffer - WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_coop.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_coop.glsl deleted file mode 100644 index 174ea1cc9bb..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_coop.glsl +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} -#define VEC4_T ${buffer_gvec_type(DTYPE, 4)} - -#define TILE_ROWS ${TILE_ROWS} - -#define NGROUPS 8 -#define NWORKERS 8 - -${define_required_extensions(DTYPE)} -$if IN_STORAGE == "buffer": - ${define_required_extensions("int8")} -$if WEIGHT_STORAGE == "buffer": - ${define_required_extensions("uint8")} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_mat1", "int8", IN_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_qmat2", "uint8", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_scales", "float", PARAMS_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_zeros", "int", PARAMS_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input_scale", "float", PARAMS_STORAGE, is_scalar_array=True)} -${layout_declare_tensor(B, "r", "t_input_zero_point", "int", PARAMS_STORAGE, is_scalar_array=True)} - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 mat1_sizes; - ivec4 qmat2_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int group_size = 64; - -shared vec4 partial_results[NGROUPS][NWORKERS][TILE_ROWS][2]; - -/* - * This shader computes a linear operator between a quantized int8 input matrix - * x and a weights matrix that is quantized to 4 bits, producing a float output. - * - * This shader implements a co-operative algorithm to compute the output. The - * work group size is {NGROUPS, 1, NWORKERS}, and each group of NWORKERS threads - * cooperates to compute TILE_ROWS * 2 output texels. Therefore, - * NGROUPS * TILE_ROWS * 2 output texels are computed across one work group. - * - * The threads co-operate by each thread computing a partial reduction along the - * K dimension. To illustrate the computation, consider a scalar variant of the - * algorithm that computes the dot product of 2 vectors. Also assume that - * NWORKERS is 8. - * - * Thread 1 in each group will compute: - * (mat1[0] * mat2[0]) + (mat1[8] * mat2[8]) + (mat1[16] * mat2[16]) + ... - * - * Thread 2 in each group will compute: - * (mat1[1] * mat2[1]) + (mat1[9] * mat2[9]) + (mat1[17] * mat2[17]) + ... - * - * Thread 3 in each group will compute: - * (mat1[2] * mat2[2]) + (mat1[10] * mat2[10]) + (mat1[18] * mat2[18]) + ... - * - * The partial accumulations are structured such that memory accesses in each - * loop iteration can be coalesced. - * - * Then, at the end, the first thread in each group will accumulate the partial - * accumulations computed by each thread to obtain the final result. - * - * Note that this shader assumes that all tensors are width packed.
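- *
- * For reference, the per-block accumulation below relies on the factorization
- * (writing s_x / x_zp for the per-token input scale and zero point, and
- * s_w / w_zp for the per-group weight scale and zero point; the constant 8
- * re-centers the unsigned 4-bit weight values):
- *
- *   sum_k (x_q[k] - x_zp) * s_x * ((w_q[k] - 8) - w_zp) * s_w
- *     = s_x * s_w * (sum_k (x_q[k] - x_zp) * (w_q[k] - 8)
- *                    - w_zp * sum_k (x_q[k] - x_zp))
- *
- * so the inner loop only needs to track integer dot products (int32_sums) and
- * the sum of the centered inputs (input_sums), while the float scales are
- * applied once per quantization group.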
- */ - -void main() { - const uint out_row = gl_GlobalInvocationID.y * TILE_ROWS; - const uint out_col = gl_GlobalInvocationID.x << 3; - const int out_col_texel_idx = int(gl_GlobalInvocationID.x) << 1; - - const uint gid = gl_LocalInvocationID.x; // group id - const uint wid = gl_LocalInvocationID.z; // worker id - - if (out_col >= out_sizes.x || out_row >= out_sizes.y) { - return; - } - - const int num_blocks = mat1_sizes.x / group_size; - - ivec4 mat1_quantized[TILE_ROWS]; - ivec4 qmat2_quantized[4][2]; - vec4 final_result[TILE_ROWS][2]; - - // Initialize accumulators - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - final_result[r][0] = vec4(0.0); - final_result[r][1] = vec4(0.0); - } - - vec4 scales[2]; - vec4 zeros[2]; - - $if WEIGHT_STORAGE == "buffer": - const int qmat2_stride = qmat2_sizes.x >> 2; - $if PARAMS_STORAGE == "buffer": - const int qparams_stride = out_sizes.x >> 2; - - for (int block_idx = 0; block_idx < num_blocks; ++block_idx) { - $if PARAMS_STORAGE == "buffer": - scales[0] = t_weight_scales[block_idx * qparams_stride + out_col_texel_idx]; - scales[1] = t_weight_scales[block_idx * qparams_stride + out_col_texel_idx + 1]; - - zeros[0] = vec4(t_weight_zeros[block_idx * qparams_stride + out_col_texel_idx]); - zeros[1] = vec4(t_weight_zeros[block_idx * qparams_stride + out_col_texel_idx + 1]); - $else: - scales[0] = texelFetch(t_weight_scales, ivec3(out_col_texel_idx, block_idx, 0), 0); - scales[1] = texelFetch(t_weight_scales, ivec3(out_col_texel_idx + 1, block_idx, 0), 0); - - zeros[0] = vec4(texelFetch(t_weight_zeros, ivec3(out_col_texel_idx, block_idx, 0), 0)); - zeros[1] = vec4(texelFetch(t_weight_zeros, ivec3(out_col_texel_idx + 1, block_idx, 0), 0)); - - ivec4 int32_sums[TILE_ROWS][2]; - int input_sums[TILE_ROWS]; - - // Initialize accumulators for this block - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - int32_sums[r][0] = ivec4(0); - int32_sums[r][1] = ivec4(0); - input_sums[r] = 0; - } - - for (int g_idx = 4 * int(wid); g_idx < group_size; g_idx += (4 * NWORKERS)) { - const int k = block_idx * group_size + g_idx; - - // Preload B (weights) - keep as quantized integers - [[unroll]] for (int r = 0; r < 4; ++r) { - $if WEIGHT_STORAGE == "buffer": - const u8vec4 packed_weight_tex = t_qmat2[(k + r) * qmat2_stride + gl_GlobalInvocationID.x]; - $else: - const uvec4 packed_weight_tex = texelFetch( - t_qmat2, - ivec2(gl_GlobalInvocationID.x, k + r), - 0); - - // Unpack 4-bit weights to integers and subtract zero point (8 for 4-bit) - qmat2_quantized[r][0] = ivec4((packed_weight_tex & 0xF0) >> 4) - 8; - qmat2_quantized[r][1] = ivec4(packed_weight_tex & 0x0F) - 8; - } - - // Preload A (quantized input) - keep as quantized integers - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $if IN_STORAGE == "buffer": - mat1_quantized[r] = t_mat1[((out_row + r) * mat1_sizes.x + k) >> 2] - t_input_zero_point[int(out_row) + r]; - $else: - mat1_quantized[r] = texelFetch(t_mat1, ivec3(k >> 2, out_row + r, 0), 0) - t_input_zero_point[int(out_row) + r]; - } - - // Accumulate in integer arithmetic: (input_quantized - input_zero_point) * (weight_quantized - weight_zero_point) - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - input_sums[r] += mat1_quantized[r].x + mat1_quantized[r].y + mat1_quantized[r].z + mat1_quantized[r].w; - - int32_sums[r][0] += mat1_quantized[r].x * qmat2_quantized[0][0] - + mat1_quantized[r].y * qmat2_quantized[1][0] - + mat1_quantized[r].z * qmat2_quantized[2][0] - + mat1_quantized[r].w * qmat2_quantized[3][0]; - - int32_sums[r][1] += 
mat1_quantized[r].x * qmat2_quantized[0][1] - + mat1_quantized[r].y * qmat2_quantized[1][1] - + mat1_quantized[r].z * qmat2_quantized[2][1] - + mat1_quantized[r].w * qmat2_quantized[3][1]; - } - } - - // Incorporates this block's results into the final accumulation - // Following proper quantization paradigm: result = input_scale * weight_scale * - // Sum((input_quantized - input_zero) * (weight_quantized - weight_zero)) - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - if (out_row + r >= out_sizes.y) { - continue; - } - - float input_scale = t_input_scale[int(out_row) + r]; - float input_sum_scalar = float(input_sums[r]); - - // Apply proper quantization paradigm: input_scale * weight_scale * (accumulator - weight_zero * input_sum) - final_result[r][0] += input_scale * scales[0] * (vec4(int32_sums[r][0]) - zeros[0] * input_sum_scalar); - final_result[r][1] += input_scale * scales[1] * (vec4(int32_sums[r][1]) - zeros[1] * input_sum_scalar); - } - } - - // Store worker results in shared memory - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - partial_results[gid][wid][r][0] = final_result[r][0]; - partial_results[gid][wid][r][1] = final_result[r][1]; - } - - memoryBarrierShared(); - barrier(); - - // Only the first worker in each group accumulates and writes output - if (wid != 0) { - return; - } - - vec4 cooperative_result[TILE_ROWS][2]; - - for (int r = 0; r < TILE_ROWS; ++r) { - cooperative_result[r][0] = vec4(0.0); - cooperative_result[r][1] = vec4(0.0); - [[unroll]] for (int worker = 0; worker < NWORKERS; ++worker) { - cooperative_result[r][0] += partial_results[gid][worker][r][0]; - cooperative_result[r][1] += partial_results[gid][worker][r][1]; - } - } - - // Apply final output quantization - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $if OUT_STORAGE == "buffer": - t_out[((out_row + r) * out_sizes.x + out_col) >> 2] = cooperative_result[r][0]; - t_out[((out_row + r) * out_sizes.x + out_col + 4) >> 2] = cooperative_result[r][1]; - $else: - imageStore(t_out, ivec3(out_col_texel_idx, out_row + r, 0), cooperative_result[r][0]); - imageStore(t_out, ivec3(out_col_texel_idx + 1, out_row + r, 0), cooperative_result[r][1]); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_coop.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_coop.yaml deleted file mode 100644 index 9f6db77094a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_coop.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -linear_qta8a_qga4w_coop: - parameter_names_with_default_values: - DTYPE: float - OUT_STORAGE: texture3d - IN_STORAGE: texture3d - WEIGHT_STORAGE: texture2d - PARAMS_STORAGE: buffer - TILE_ROWS: 1 - shader_variants: - - NAME: linear_qta8a_qga4w_coop_texture3d_texture3d_texture2d_float - - NAME: linear_qta8a_qga4w_coop_buffer_buffer_texture2d_float - OUT_STORAGE: buffer - IN_STORAGE: buffer - - NAME: linear_qta8a_qga4w_coop_buffer_buffer_buffer_float - OUT_STORAGE: buffer - IN_STORAGE: buffer - WEIGHT_STORAGE: buffer - - NAME: linear_qta8a_qga4w_coop_buffer_texture2d_buffer_float - OUT_STORAGE: buffer - WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_tiled.glsl deleted file mode 100644 index dbb7da998f4..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_tiled.glsl +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} -#define VEC4_T ${buffer_gvec_type(DTYPE, 4)} - -#define TILE_ROWS ${TILE_ROWS} - -${define_required_extensions(DTYPE)} -$if IN_STORAGE == "buffer": - ${define_required_extensions("int8")} -$if WEIGHT_STORAGE == "buffer": - ${define_required_extensions("uint8")} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_mat1", "int8", IN_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_qmat2", "uint8", WEIGHT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_scales", "float", PARAMS_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_weight_zeros", "int", PARAMS_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input_scale", "float", "buffer", is_scalar_array=True)} -${layout_declare_tensor(B, "r", "t_input_zero_point", "int", "buffer", is_scalar_array=True)} - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 mat1_sizes; - ivec4 qmat2_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int group_size = 64; - -/* - * This shader computes a linear operator between a quantized int8 input matrix - * x and a weights matrix that is quantized to 4 bits, producing a float output. - * - * The (W, H, C) shape of each tensor is: - * - x: (K, M) - quantized int8 input with per-token quantization - * - weights: (N / 2, K) - * - The weights tensor has a data type of `uint8`. Each element in the tensor - * contains 2 4-bit values packed into a uint8. - * - See the pack_int4_linear_weight_transposed_interleave shader to see more - * details on how the weight tensor is stored. - * - qparams: (2, N, number_of_groups) - * - This tensor contains the scales and zeros quantization parameters for the - * weights tensor. The weight tensor is quantized group-wise, which means - * that every `group_size` elements along the K dimension of the weights - * tensor has independent quantization parameters. Along the width dim, the - * first value contains the scale for the group and the second value - * contains the zero point for the group. 
- * - input_scale: (num_tokens,) - per-token scale values for input quantization - * - input_zero_point: (num_tokens,) - per-token zero points for input quantization - * - output: (N, M) - float output - * - * Each thread computes a tile of TILE_ROWS * 2 texels of the output tensor. - * - * Note that this shader assumes that all tensors are width packed. - */ - -void main() { - const uint out_row = gl_GlobalInvocationID.y * TILE_ROWS; - const uint out_col = gl_GlobalInvocationID.x << 3; - const int out_col_texel_idx = int(gl_GlobalInvocationID.x) << 1; - - if (out_col >= out_sizes.x || out_row >= out_sizes.y) { - return; - } - - const int num_blocks = mat1_sizes.x / group_size; - - ivec4 mat1_quantized[TILE_ROWS]; - ivec4 qmat2_quantized[4][2]; - vec4 final_result[TILE_ROWS][2]; - - // Initialize accumulators - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - final_result[r][0] = vec4(0.0); - final_result[r][1] = vec4(0.0); - } - - vec4 scales[2]; - vec4 zeros[2]; - - $if WEIGHT_STORAGE == "buffer": - const int qmat2_stride = qmat2_sizes.x >> 2; - $if PARAMS_STORAGE == "buffer": - const int qparams_stride = out_sizes.x >> 2; - - for (int block_idx = 0; block_idx < num_blocks; ++block_idx) { - $if PARAMS_STORAGE == "buffer": - scales[0] = t_weight_scales[block_idx * qparams_stride + out_col_texel_idx]; - scales[1] = t_weight_scales[block_idx * qparams_stride + out_col_texel_idx + 1]; - - zeros[0] = vec4(t_weight_zeros[block_idx * qparams_stride + out_col_texel_idx]); - zeros[1] = vec4(t_weight_zeros[block_idx * qparams_stride + out_col_texel_idx + 1]); - $else: - scales[0] = texelFetch(t_weight_scales, ivec3(out_col_texel_idx, block_idx, 0), 0); - scales[1] = texelFetch(t_weight_scales, ivec3(out_col_texel_idx + 1, block_idx, 0), 0); - - zeros[0] = vec4(texelFetch(t_weight_zeros, ivec3(out_col_texel_idx, block_idx, 0), 0)); - zeros[1] = vec4(texelFetch(t_weight_zeros, ivec3(out_col_texel_idx + 1, block_idx, 0), 0)); - - ivec4 int32_sums[TILE_ROWS][2]; - int input_sums[TILE_ROWS]; - - // Initialize accumulators - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - int32_sums[r][0] = ivec4(0); - int32_sums[r][1] = ivec4(0); - input_sums[r] = 0; - } - - for (int g_idx = 0; g_idx < group_size; g_idx += 4) { - const int k = block_idx * group_size + g_idx; - - // Preload B (weights) - keep as quantized integers - [[unroll]] for (int r = 0; r < 4; ++r) { - $if WEIGHT_STORAGE == "buffer": - const u8vec4 packed_weight_tex = t_qmat2[(k + r) * qmat2_stride + gl_GlobalInvocationID.x]; - $else: - const uvec4 packed_weight_tex = texelFetch( - t_qmat2, - ivec2(gl_GlobalInvocationID.x, k + r), - 0); - - // Unpack 4-bit weights to integers (subtract 8 as the 4-bit zero point) - qmat2_quantized[r][0] = ivec4((packed_weight_tex & 0xF0) >> 4) - 8; - qmat2_quantized[r][1] = ivec4(packed_weight_tex & 0x0F) - 8; - } - - // Preload A (quantized input) - keep as quantized integers - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $if IN_STORAGE == "buffer": - mat1_quantized[r] = t_mat1[((out_row + r) * mat1_sizes.x + k) >> 2] - t_input_zero_point[int(out_row) + r]; - $else: - mat1_quantized[r] = texelFetch(t_mat1, ivec3(k >> 2, out_row + r, 0), 0) - t_input_zero_point[int(out_row) + r]; - } - - // Accumulate in integer arithmetic: (input_quantized - input_zero_point) * (weight_quantized - weight_zero_point) - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - input_sums[r] += mat1_quantized[r].x + mat1_quantized[r].y + mat1_quantized[r].z + mat1_quantized[r].w; - - int32_sums[r][0] += mat1_quantized[r].x *
qmat2_quantized[0][0] - + mat1_quantized[r].y * qmat2_quantized[1][0] - + mat1_quantized[r].z * qmat2_quantized[2][0] - + mat1_quantized[r].w * qmat2_quantized[3][0]; - - int32_sums[r][1] += mat1_quantized[r].x * qmat2_quantized[0][1] - + mat1_quantized[r].y * qmat2_quantized[1][1] - + mat1_quantized[r].z * qmat2_quantized[2][1] - + mat1_quantized[r].w * qmat2_quantized[3][1]; - } - } - - // Incorporates this block's results into the final accumulation - // Following proper quantization paradigm: result = input_scale * weight_scale * - // Sum((input_quantized - input_zero) * (weight_quantized - weight_zero)) - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - if (out_row + r >= out_sizes.y) { - continue; - } - - float input_scale = t_input_scale[int(out_row) + r]; - float input_sum_scalar = float(input_sums[r]); - - // Apply proper quantization paradigm: input_scale * weight_scale * (accumulator - weight_zero * input_sum) - final_result[r][0] += input_scale * scales[0] * (vec4(int32_sums[r][0]) - zeros[0] * input_sum_scalar); - final_result[r][1] += input_scale * scales[1] * (vec4(int32_sums[r][1]) - zeros[1] * input_sum_scalar); - } - } - - // Apply ALL scaling at the very end - [[unroll]] for (int r = 0; r < TILE_ROWS; ++r) { - $if OUT_STORAGE == "buffer": - if (out_row + r < out_sizes.y) { - t_out[((out_row + r) * out_sizes.x + out_col) >> 2] = final_result[r][0]; - t_out[((out_row + r) * out_sizes.x + out_col + 4) >> 2] = final_result[r][1]; - } - $else: - imageStore(t_out, ivec3(out_col_texel_idx, out_row + r, 0), final_result[r][0]); - imageStore(t_out, ivec3(out_col_texel_idx + 1, out_row + r, 0), final_result[r][1]); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_tiled.yaml deleted file mode 100644 index c96d693834b..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qta8a_qga4w_tiled.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -linear_qta8a_qga4w_tiled: - parameter_names_with_default_values: - DTYPE: float - OUT_STORAGE: texture3d - IN_STORAGE: texture3d - WEIGHT_STORAGE: texture2d - PARAMS_STORAGE: buffer - TILE_ROWS: 3 - shader_variants: - - NAME: linear_qta8a_qga4w_tiled_texture3d_texture3d_texture2d_float - - NAME: linear_qta8a_qga4w_tiled_buffer_buffer_texture2d_float - OUT_STORAGE: buffer - IN_STORAGE: buffer - - NAME: linear_qta8a_qga4w_tiled_buffer_buffer_buffer_float - OUT_STORAGE: buffer - IN_STORAGE: buffer - WEIGHT_STORAGE: buffer - - NAME: linear_qta8a_qga4w_tiled_buffer_texture2d_buffer_float - OUT_STORAGE: buffer - WEIGHT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.glsl deleted file mode 100644 index 28afe5a822f..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.glsl +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define FLT_MIN -3.402823466e+38 - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "w", "t_idx", "int", STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec3", "out_limits")} -${layout_declare_ubo(B, "ivec4", "in_sizes")} -${layout_declare_ubo(B, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "write_indices", "1")} - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - const ivec2 ipos = pos.xy * stride - padding; - - const ivec2 start = ipos; - const ivec2 end = ipos + kernel_size * dilation; - - vec4 out_texel = vec4(FLT_MIN); - ivec4 idx_texel = ivec4(0); - - for (int y = start.y; y < end.y; y += dilation.y) { - for (int x = start.x; x < end.x; x += dilation.x) { - if ((x >= 0 && x < in_sizes.x) && (y >= 0 && y < in_sizes.y)) { - const vec4 cur_texel = load_texel(t_in, ivec3(x, y, pos.z)); - - // Set idx if value is greatest in the pool; else, keep the existing idx. - ivec4 cur_idx = ivec4(x + int(in_sizes.x) * y); - ivec4 mask = ivec4(greaterThan(cur_texel, out_texel)); - idx_texel = ivec4(mix(idx_texel, cur_idx, mask)); - - out_texel = max(cur_texel, out_texel); - } - } - } - - imageStore(t_out, pos, out_texel); - if (write_indices > 0) { - imageStore(t_idx, pos, idx_texel); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.yaml b/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.yaml deleted file mode 100644 index d8e3aa599f5..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/max_pool2d.yaml +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -max_pool2d: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: max_pool2d diff --git a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl deleted file mode 100644 index 7897f0e8133..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl +++ /dev/null @@ -1,314 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#include "broadcasting_utils.h" -#include "indexing_utils.h" - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#define T ${texel_component_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "w", "t_mean", DTYPE, STORAGE)} -${layout_declare_tensor(B, "w", "t_rstd", DTYPE, STORAGE)} - -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_weight", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_bias", DTYPE, STORAGE)} - -layout(push_constant) uniform PRECISION restrict Block { - ivec3 out_limits; - ivec4 sizes; - float epsilon; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); -const lowp int in_packed_dim = unhash_packed_dim(in_layout); - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int out_packed_dim = unhash_packed_dim(out_layout); - -#define MAX_WORKGROUP_SIZE 64 - -// Shared memory factor increases shared memory allocation by a scale that should be either 1 or a power of 2. -// -// Increasing the factor allows more data to be stored in shared memory and increases thread utilization during reduction. -// Why? Because when performing reduction, the number of active threads is halved in each iteration. -// Increasing the scaling factor increases thread occupancy and hence utilizes the GPU better. -// e.g. -// If the local thread size in the x dimension is 32, and SHARED_MEMORY_FACTOR is 1, 32 elements will be loaded into shared memory. -// First iteration of reduce will have 16 threads sum up 32 elements. -// Second iteration will have 8 threads sum up 16 elements from the previous iteration and so on. -// So thread utilization starts at 50%. -// -// By contrast, if the local thread size in the x dimension is 32, and SHARED_MEMORY_FACTOR is 2, 64 elements will be loaded into shared memory. -// First iteration of reduce will have 32 threads sum up 64 elements. -// Second iteration will have 16 threads sum up 32 elements from the previous iteration and so on. -// Thus thread utilization starts at 100%.
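-//
-// The tradeoff is a proportionally larger shared memory allocation: the
-// shared_input array below holds MAX_WORKGROUP_SIZE * SHARED_MEMORY_FACTOR
-// texels (plus the padding added by offset_pos_index), e.g. 64 texels for a
-// factor of 1 versus 128 texels for a factor of 2.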
-#define SHARED_MEMORY_FACTOR 1 - -#define offset_pos_index(index) ((index) + ((index) >> 3)) - -shared VEC4_T shared_input[offset_pos_index(MAX_WORKGROUP_SIZE * SHARED_MEMORY_FACTOR)]; - -// Function to reduce input data in workgroup's x dimension -// -// The implementation resembles reduction as depicted below -// | 10 | 1 | 8 | 1 | 0 | 2 | 3 | 5 | 2 | 3 | 2 | 7 | 0 | 11 | 0 | 2 | current_stride -> 1 -// | / | / | / | / | / | / | / | / -// | / | / | / | / | / | / | / | / -// | / | / | / | / | / | / | / | / -// | 11 | 1 | 9 | 1 | 2 | 2 | 8 | 5 | 5 | 3 | 9 | 7 | 11 | 11 | 2 | 2 | current_stride -> 2 -// | / | / | / | / -// | / | / | / | / -// | / | / | / | / -// | 20 | 1 | 9 | 1 | 10 | 2 | 8 | 5 |14 | 3 | 9 | 7 |13 | 11 | 2 | 2 | current_stride -> 4 -// | / | / -// | / | / -// | / | / -// | / | / -// | / | / -// | 30 | 1 | 9 | 1 | 10 | 2 | 8 | 5 |27 | 3 | 9 | 7 |13 | 11 | 2 | 2 | current_stride -> 8 -// | / -// | / -// | / -// | / -// | / -// | / -// | / -// | / -// | / -// | 57 | 1 | 9 | 1 | 10 | 2 | 8 | 5 |27 | 3 | 9 | 7 |13 | 11 | 2 | 2 | current_stride = -> 16 -// -// Threads access shared index in following pattern -// Thread | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | current_stride -> 1 -// Shared Index | 0 | 2 | 4 | 6 | 8 | 10 | 12 | 14 | X | X | X | X | X | X | X | X | index *= 1 -// -// Thread | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | current_stride -> 2 -// Shared Index | 0 | 4 | 8 | 12 | X | X | X | X | X | X | X | X | X | X | X | X | index *= 2 -// -// Thread | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | current_stride -> 4 -// Shared Index | 0 | 8 | X | X | X | X | X | X | X | X | X | X | X | X | X | X | index *= 4 -// -// Thread | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | current_stride -> 8 -// Shared Index | 0 | X | X | X | X | X | X | X | X | X | X | X | X | X | X | X | index *= 8 - -void reduce_input(const int width_stride, const int shared_idx_offset) { - // wait for all shared memory writes to finish - memoryBarrierShared(); - barrier(); - - // loop log(width_stride) times - for (int current_stride = 1, index = int(gl_LocalInvocationID.x << 1); current_stride < width_stride; current_stride *= 2, index <<= 1) { - // if the index at this thread is within the width stride - if (index < width_stride) { - const int local_shared_idx = shared_idx_offset + index; - // add the value at current stride to this thread's value - shared_input[offset_pos_index(local_shared_idx)] += shared_input[offset_pos_index(local_shared_idx + current_stride)]; - } - - memoryBarrierShared(); - barrier(); - } -} - -void reduce_non_packed_dim() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - const int width = int(sizes.x); - ivec3 in_pos = lpos_to_pos(lpos, in_axis_map); - - // width batch read stride - const int width_stride = int(gl_WorkGroupSize.x) * SHARED_MEMORY_FACTOR; - - // local memory starting offset for this thread - const int shared_idx_offset = width_stride * int(gl_WorkGroupSize.y * gl_LocalInvocationID.z + gl_LocalInvocationID.y); - - // local memory index for this thread - const int shared_idx = shared_idx_offset + int(gl_LocalInvocationID.x); - - VEC4_T mean = VEC4_T(0); - VEC4_T var = VEC4_T(0); - - // Loop over the width in stride increments - for (int width_offset = 0; width_offset < width; width_offset += width_stride) { - // Read input in shared memory - for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) { - in_pos[in_axis_map.x] = width_offset + 
int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x); - - VEC4_T in_val = VEC4_T(0); - if (all(lessThan(in_pos, out_limits))) { - in_val = load_texel(t_in, in_pos); - } - mean += in_val; - } - } - - shared_input[offset_pos_index(shared_idx)] = mean; - reduce_input(width_stride, shared_idx_offset); - mean = shared_input[offset_pos_index(shared_idx_offset)] / width; - - memoryBarrierShared(); - barrier(); - - // Loop over the width in stride increments - for (int width_offset = 0; width_offset < width; width_offset += width_stride) { - // Read input in shared memory - for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) { - in_pos[in_axis_map.x] = width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x); - - VEC4_T in_val = mean; - if (all(lessThan(in_pos, out_limits))) { - in_val = load_texel(t_in, in_pos); - } - - const VEC4_T delta = in_val - mean; - var += delta * delta; - } - } - - shared_input[offset_pos_index(shared_idx)] = var; - reduce_input(width_stride, shared_idx_offset); - var = shared_input[offset_pos_index(shared_idx_offset)] / width; - - VEC4_T rstd = pow(var + epsilon, VEC4_T(-0.5)); - VEC4_T offset = -rstd * mean; - - VEC4_T v = load_texel(t_in, lpos); - VEC4_T weight = load_texel(t_weight, ivec3(lpos.x, 0, 0)).xxxx; - VEC4_T bias = load_texel(t_bias, ivec3(lpos.x, 0, 0)).xxxx; - VEC4_T outtex = (v * rstd + offset) * weight + bias; - - if (all(lessThan(lpos, out_limits))) { - write_texel_lpos(t_out, lpos, outtex, out_axis_map); - } - - if (gl_GlobalInvocationID.x == 0) { - write_texel(t_mean, lpos, mean); - write_texel(t_rstd, lpos, rstd); - } -} - -void reduce_packed_dim() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - const int width = int(sizes.x); - ivec3 in_pos = lpos_to_pos(lpos, in_axis_map); - - // width batch read stride - const int width_stride = int(gl_WorkGroupSize.x) * SHARED_MEMORY_FACTOR; - - // local memory starting offset for this thread - const int shared_idx_offset = width_stride * int(gl_WorkGroupSize.y * gl_LocalInvocationID.z + gl_LocalInvocationID.y); - - // local memory index for this thread - const int shared_idx = shared_idx_offset + int(gl_LocalInvocationID.x); - - const int last_packed_width_index = divup4(width) - 1; - T mean = T(0); - T var = T(0); - const int remain = width & 3; - - const int in_pos_x_limit = out_limits[in_axis_map.x]; - - VEC4_T accum = VEC4_T(0); - // Loop over the width in stride increments - for (int width_offset = 0; width_offset <= last_packed_width_index; width_offset += width_stride) { - // Read input in shared memory - for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) { - const int in_pos_x = width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x); - in_pos[in_axis_map.x] = in_pos_x; - - VEC4_T in_val = VEC4_T(0); - if (in_pos_x < in_pos_x_limit) { - in_val = load_texel(t_in, in_pos); - } - - if (in_pos_x == last_packed_width_index && remain != 0) { - const int remain_inv = 4 - remain; - in_val.y = mix(in_val.y, T(0), remain_inv > 2); - in_val.z = mix(in_val.z, T(0), remain_inv > 1); - in_val.w = mix(in_val.w, T(0), remain_inv > 0); - } - accum += in_val; - } - } - - shared_input[offset_pos_index(shared_idx)] = accum; - reduce_input(width_stride, shared_idx_offset); - VEC4_T val = shared_input[offset_pos_index(shared_idx_offset)]; - mean = (val.x + val.y + val.z + val.w) / width; - - memoryBarrierShared(); - barrier(); - - VEC4_T delta2 = VEC4_T(0); - - // Loop over the width in stride increments - for (int width_offset = 0; width_offset <= last_packed_width_index; width_offset += 
width_stride) { - // Read input in shared memory - for (int si = 0; si < SHARED_MEMORY_FACTOR; si++) { - const int in_pos_x = width_offset + int(gl_LocalInvocationID.x + si * gl_WorkGroupSize.x); - in_pos[in_axis_map.x] = in_pos_x; - - VEC4_T in_val = VEC4_T(mean); - if (in_pos_x < in_pos_x_limit) { - in_val = load_texel(t_in, in_pos); - } - - if (in_pos_x == last_packed_width_index && remain != 0) { - const int remain_inv = 4 - remain; - in_val.y = mix(in_val.y, mean.x, remain_inv > 2); - in_val.z = mix(in_val.z, mean.x, remain_inv > 1); - in_val.w = mix(in_val.w, mean.x, remain_inv > 0); - } - - const VEC4_T delta = in_val - mean; - delta2 += delta * delta; - } - } - - shared_input[offset_pos_index(shared_idx)] = delta2; - reduce_input(width_stride, shared_idx_offset); - val = shared_input[offset_pos_index(shared_idx_offset)]; - var = (val.x + val.y + val.z + val.w) / width; - - T rstd = pow(var + epsilon, T(-0.5)); - T offset = -rstd * mean; - - VEC4_T v = load_texel(t_in, lpos); - VEC4_T weight = load_texel(t_weight, ivec3(lpos.x, 0, 0)); - VEC4_T bias = load_texel(t_bias, ivec3(lpos.x, 0, 0)); - VEC4_T outtex = (v * rstd + offset) * weight + bias; - - if (all(lessThan(lpos, out_limits))) { - write_texel_lpos(t_out, lpos, outtex, out_axis_map); - } - - if (gl_GlobalInvocationID.x == 0) { - write_texel(t_mean, lpos, VEC4_T(mean)); - write_texel(t_rstd, lpos, VEC4_T(rstd)); - } -} - -void main() { - // if packed dimension width - if (in_packed_dim != W_DIM) { - reduce_non_packed_dim(); - } else { - reduce_packed_dim(); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.yaml b/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.yaml deleted file mode 100644 index ac478599f8a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -native_layer_norm: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: native_layer_norm diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl deleted file mode 100644 index 1a2c257baec..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} - -${define_active_storage_type(STORAGE)} - -#include "indexing_utils.h" - -layout(std430) buffer; - -#extension GL_EXT_control_flow_attributes : require - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_buffer(B, "r", "nchw_in", "int")} - -$if USE_PUSH_CONST: - layout(push_constant) uniform restrict Block { - ivec4 sizes; - }; -$else: - ${layout_declare_ubo(B, "ivec4", "sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "transpose_hw", "0")} - -const lowp ivec4 axis_map = unhash_axis_map(t_layout); -const lowp int packed_dim = unhash_packed_dim(t_layout); - -/* - * Extends sign of int8 - */ -int extend_sign(int x) { - return x | mix(0, 0xFFFFFF00, x >= (1 << 7)); -} - -ivec4 read_texel(ivec4 tidx) { - const ivec4 tidx_to_use = ivec4(mix(tidx.xy, tidx.yx, bvec2(transpose_hw == 1)), tidx.zw); - const ivec4 sizes_to_use = ivec4(mix(sizes.xy, sizes.yx, bvec2(transpose_hw == 1)), sizes.zw); - const int packed_dim_to_use = mix(packed_dim, packed_dim ^ transpose_hw, packed_dim < 2); - - const ivec4 buf_indices = tidx_to_nchwi( - tidx_to_use, sizes_to_use, packed_dim_to_use); - - const int mask = (1 << 8) - 1; - - ivec4 out_tex = ivec4(0); - - [[unroll]] for (int i = 0; i < 4; ++i) { - if (tidx[packed_dim] + i < sizes[packed_dim]) { - const int in_texel = nchw_in[buf_indices[i] >> 2]; - int extracted_val = (in_texel >> (8 * (buf_indices[i] & 3))) & mask; - extracted_val = extend_sign(extracted_val); - out_tex[i] = extracted_val; - } - } - - return out_tex; -} - -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - ivec4 tidx = lpos_to_tidx(lpos, sizes, axis_map.w, packed_dim); - - if (any(greaterThanEqual(tidx, sizes))) { - return; - } - - write_texel(t_out, lpos_to_pos(lpos, axis_map), VEC4_T(read_texel(tidx))); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml deleted file mode 100644 index 0b8bbecb7bd..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -nchw_to_bitw8_image_nobitw8buffer: - parameter_names_with_default_values: - STORAGE: texture3d - DTYPE: int8 - USE_PUSH_CONST: True - generate_variant_forall: - STORAGE: - - VALUE: texture2d - - VALUE: texture3d - DTYPE: - - VALUE: int8 - - VALUE: uint8 - shader_variants: - - NAME: nchw_to_bitw8_image_nobitw8buffer - - NAME: nchw_to_bitw8_image_nobitw8buffer_no_pc - USE_PUSH_CONST: False diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl deleted file mode 100644 index 074624dc37e..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl +++ /dev/null @@ -1,48 +0,0 @@ -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing.glslh" - -${layout_declare_tensor(B, "w", "t_outp", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "nchw_in", DTYPE, STORAGE)} - -${layout_declare_ubo(B, "BufferMetadata", "outp")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -// This constant is unused in this shader but is kept so that the signature is -// consistent with nchw_to_image. -${layout_declare_spec_const(C, "int", "unused", "0")} -${layout_declare_spec_const(C, "int", "transpose_hw", "0")} - -void main() { - const uint outp_bufi = int(gl_GlobalInvocationID.x); - if (outp_bufi >= numel(outp)) { - return; - } - - TensorIndex outp_tidx; - uint nchwi; - - linear_idx_to_tensor_idx(outp, outp_bufi, outp_tidx); - - if (transpose_hw == 1) { - BufferMetadata transposed_meta = outp; - transposed_meta.sizes[0].xy = transposed_meta.sizes[0].yx; - outp_tidx.data[0].xy = outp_tidx.data[0].yx; - nchwi = tensor_idx_to_contiguous_idx(transposed_meta, outp_tidx); - } - // Normal case - else { - nchwi = tensor_idx_to_contiguous_idx(outp, outp_tidx); - } - - t_outp[outp_bufi] = nchw_in[nchwi]; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml deleted file mode 100644 index 9d6c3aa76a9..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -nchw_to_buffer: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - USE_PUSH_CONST: True - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - - VALUE: int8 - - VALUE: uint8 - - VALUE: int32 - shader_variants: - - NAME: nchw_to_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl deleted file mode 100644 index f3f604e10cd..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} -#define SCALAR_T ${texel_load_component_type(DTYPE, STORAGE)} - -${define_active_storage_type(STORAGE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_buffer(B, "r", "buf_in", DTYPE)} - -$if USE_PUSH_CONST: - layout(push_constant) uniform restrict Block { - ivec4 sizes; - $if not FROM_STAGING: - ivec4 buf_strides; - }; -$else: - ${layout_declare_ubo(B, "ivec4", "sizes")} - $if not FROM_STAGING: - ${layout_declare_ubo(B, "ivec4", "buf_strides")} - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "t_layout", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "transpose_hw", "0")} - -const lowp ivec4 axis_map = unhash_axis_map(t_layout); -const lowp int packed_dim = unhash_packed_dim(t_layout); - -VEC4_T read_texel(ivec4 tidx) { - ivec4 tidx_to_use = tidx; - ivec4 sizes_to_use = sizes; - int packed_dim_to_use = packed_dim; - if (transpose_hw == 1) { - sizes_to_use.xy = sizes_to_use.yx; - tidx_to_use.xy = tidx.yx; - - if (packed_dim == 1) { - packed_dim_to_use = 0; - } - if (packed_dim == 0) { - packed_dim_to_use = 1; - } - } - - $if FROM_STAGING: - const ivec4 buf_indices = tidx_to_nchwi(tidx_to_use, sizes_to_use, packed_dim_to_use); - $else: - const ivec4 buf_indices = tidx_to_4bufi(tidx_to_use, buf_strides, packed_dim_to_use); - - VEC4_T texel = VEC4_T(0); - if (tidx[packed_dim] < sizes[packed_dim]) { - texel.x = SCALAR_T(buf_in[buf_indices.x]); - } - if (tidx[packed_dim] + 1 < sizes[packed_dim]) { - texel.y = SCALAR_T(buf_in[buf_indices.y]); - } - if (tidx[packed_dim] + 2 < sizes[packed_dim]) { - texel.z = SCALAR_T(buf_in[buf_indices.z]); - } - if (tidx[packed_dim] + 3 < sizes[packed_dim]) { - texel.w = SCALAR_T(buf_in[buf_indices.w]); - } - return texel; -} - -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - const ivec4 tidx = lpos_to_tidx(lpos, sizes, axis_map.w, packed_dim); - if (any(greaterThanEqual(tidx, sizes))) { - return; - } - - $if DTYPE == "double" and DTYPE == "int64": - VEC4_T texel = read_texel(tidx); - write_texel(t_out, lpos_to_pos(lpos, axis_map), texel); - $else: - write_texel(t_out, lpos_to_pos(lpos, axis_map), read_texel(tidx)); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml deleted file mode 100644 index 85119c8d508..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.yaml +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -nchw_to_image: - parameter_names_with_default_values: - STORAGE: texture3d - DTYPE: float - FROM_STAGING: True - USE_PUSH_CONST: True - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - - VALUE: int8 - - VALUE: uint8 - - VALUE: int32 - shader_variants: - - NAME: nchw_to_image_texture3d - - NAME: nchw_to_image_texture2d - STORAGE: texture2d - - NAME: clone_buffer_to_image - FROM_STAGING: False - - NAME: nchw_to_image_no_pc_texture3d - USE_PUSH_CONST: False - - NAME: nchw_to_image_no_pc_texture2d - STORAGE: texture2d - USE_PUSH_CONST: False - - NAME: clone_buffer_to_image_no_pc - FROM_STAGING: False - USE_PUSH_CONST: False diff --git a/backends/vulkan/runtime/graph/ops/glsl/no_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/no_op.glsl deleted file mode 100644 index 325635a5716..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/no_op.glsl +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -${define_required_extensions(DTYPE)} - -#include "broadcasting_utils.h" -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "r", "t_out", DTYPE, STORAGE)} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() {} diff --git a/backends/vulkan/runtime/graph/ops/glsl/no_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/no_op.yaml deleted file mode 100644 index f888e8661d3..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/no_op.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -no_op: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - VALUE: uint32 - - VALUE: int8 - - VALUE: uint8 - STORAGE: - - VALUE: texture3d - - VALUE: texture2d - - VALUE: buffer - shader_variants: - - NAME: no_op diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_block_4x8.glsl b/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_block_4x8.glsl deleted file mode 100644 index e42cf05dd7f..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_block_4x8.glsl +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "w", "t_qmat2", "uint", STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", "uint", "buffer")} - -layout(push_constant) uniform restrict Block { - ivec4 qmat2_sizes; - ivec2 orig_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -$if STORAGE == "buffer": - #define BUFFER_WEIGHT - -#include "qlinear_weight_pack_utils.glslh" - -#define extract_4bit(input_block_data, col, row) \ - (extract_4bit_from_packed_uint_le(input_block_data[row], col)) - -/* - * This shader packs the weight tensor into blocks for efficient consumption. - * - * The input tensor has shape [K/2, N] where each element is a uint8 containing - * 2 packed 4-bit values. The logical tensor shape is [K, N] of 4-bit values. - * - * The transformation partitions the tensor into blocks of size 4x8 (4-bit values) - * and transposes each block to 8x4, then packs the result so that each uvec4 - * contains an entire transposed block. - * - * Original block (4x8 4-bit values, shown as 2x8 uint8 values): - * w00|w10, w20|w30, - * w01|w11, w21|w31, - * w02|w12, w22|w32, - * w03|w13, w23|w33, - * w04|w14, w24|w34, - * w05|w15, w25|w35, - * w06|w16, w26|w36, - * w07|w17, w27|w37, - * - * Transposed block (8x4 4-bit values, packed into uvec4): - * w00|w01, w02|w03, w04|w05, w06|w07 - * w10|w11, w12|w13, w14|w15, w16|w17 - * w20|w21, w22|w23, w24|w25, w26|w27 - * w30|w31, w32|w33, w34|w35, w36|w37 - */ -void main() { - // Each thread writes out 2 adjacent 8 wide x 4 high transposed blocks. Each - // block is packed as one uvec4. - ivec2 block_pos = ivec2( - MUL_2(gl_GlobalInvocationID.x), - gl_GlobalInvocationID.y); - - // There are K wide x N high 4-bit values in the original weight tensor - const int input_width = orig_sizes.x; // K - const int input_height = orig_sizes.y; // N - - const int input_width_uint = DIV_UP_8(input_width); - - // Original block spans 4 wide x 8 high 4-bit values. Since uint is used to - // read the input tensor, each block spans 0.5 wide x 8 high uint values. - const ivec2 block_start = ivec2( - DIV_2(block_pos.x), - MUL_8(block_pos.y)); - - // Check bounds - if (block_start.x >= input_width_uint || block_start.y >= input_height) { - return; - } - - // Read input block. Note that this block will contain the source data for - // both output blocks, as it contains 1 wide x 8 high uint values, which is - // equivalent to 8 wide x 8 high 4-bit values. - uint input_block_data[8]; - - // Read in 8 rows along the same column of uints, each uint contains 8 4-bit - // values. This will be the source data for the transposed block.
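- // Each extract_4bit(input_block_data, col, row) call below extracts the
- // 4-bit value at column col of row row, and pack_8x4bit_into_uint packs a
- // full original column (8 rows) into a single uint, so each uvec4 written
- // out is one 8 wide x 4 high transposed block.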
- for (int i = 0; i < 8; ++i) { - uint input_bufi = (block_start.y + i) * input_width_uint + block_start.x; - input_block_data[i] = t_input[input_bufi]; - } - - for (int col_offset = 0; col_offset <= 4; col_offset+=4) { - uvec4 output_block; - - output_block.x = pack_8x4bit_into_uint( - extract_4bit(input_block_data, col_offset, 0), - extract_4bit(input_block_data, col_offset, 1), - extract_4bit(input_block_data, col_offset, 2), - extract_4bit(input_block_data, col_offset, 3), - extract_4bit(input_block_data, col_offset, 4), - extract_4bit(input_block_data, col_offset, 5), - extract_4bit(input_block_data, col_offset, 6), - extract_4bit(input_block_data, col_offset, 7)); - - output_block.y = pack_8x4bit_into_uint( - extract_4bit(input_block_data, col_offset + 1, 0), - extract_4bit(input_block_data, col_offset + 1, 1), - extract_4bit(input_block_data, col_offset + 1, 2), - extract_4bit(input_block_data, col_offset + 1, 3), - extract_4bit(input_block_data, col_offset + 1, 4), - extract_4bit(input_block_data, col_offset + 1, 5), - extract_4bit(input_block_data, col_offset + 1, 6), - extract_4bit(input_block_data, col_offset + 1, 7)); - - output_block.z = pack_8x4bit_into_uint( - extract_4bit(input_block_data, col_offset + 2, 0), - extract_4bit(input_block_data, col_offset + 2, 1), - extract_4bit(input_block_data, col_offset + 2, 2), - extract_4bit(input_block_data, col_offset + 2, 3), - extract_4bit(input_block_data, col_offset + 2, 4), - extract_4bit(input_block_data, col_offset + 2, 5), - extract_4bit(input_block_data, col_offset + 2, 6), - extract_4bit(input_block_data, col_offset + 2, 7)); - - output_block.w = pack_8x4bit_into_uint( - extract_4bit(input_block_data, col_offset + 3, 0), - extract_4bit(input_block_data, col_offset + 3, 1), - extract_4bit(input_block_data, col_offset + 3, 2), - extract_4bit(input_block_data, col_offset + 3, 3), - extract_4bit(input_block_data, col_offset + 3, 4), - extract_4bit(input_block_data, col_offset + 3, 5), - extract_4bit(input_block_data, col_offset + 3, 6), - extract_4bit(input_block_data, col_offset + 3, 7)); - - const uint qmat2_texel_stride_x = DIV_UP_4(qmat2_sizes.x); - write_transposed_weight_block( - output_block, - block_pos.x, - block_pos.y, - qmat2_texel_stride_x); - - if (MUL_8(block_start.x) + 4 >= input_width) { - return; - } - // Otherwise, implement the block position to write to the next block in the - // following iteration. - block_pos.x += 1; - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_block_4x8.yaml b/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_block_4x8.yaml deleted file mode 100644 index c72a2cc1df6..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_block_4x8.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -pack_int4_linear_weight_transposed_block_4x8: - parameter_names_with_default_values: - STORAGE: buffer - shader_variants: - - NAME: pack_int4_linear_weight_transposed_block_4x8_buffer - STORAGE: buffer - - NAME: pack_int4_linear_weight_transposed_block_4x8_texture2d - STORAGE: texture2d diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.glsl b/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.glsl deleted file mode 100644 index 0079526c248..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.glsl +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -$if not NO_INT8_BUFFERS: - ${define_required_extensions("uint8")} -$if STORAGE == "buffer": - ${define_required_extensions("int8")} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_qmat2", "uint8", STORAGE, is_scalar_array=False)} -$if NO_INT8_BUFFERS: - ${layout_declare_tensor(B, "r", "nchw_4x2", "uint", "buffer")} -$else: - ${layout_declare_tensor(B, "r", "nchw_4x2", "uint8", "buffer")} - -layout(push_constant) uniform restrict Block { - ivec4 qmat2_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -$if NO_INT8_BUFFERS: - #define BUF_T uint -$else: - #define BUF_T uint8_t - -$if STORAGE == "buffer": - #define UVEC4_T u8vec4 -$else: - #define UVEC4_T uvec4 - -uint get_first(const BUF_T packed) { - return (packed & 0xF0) >> 4; -} - -uint get_second(const BUF_T packed) { - return packed & 0x0F; -} - -uint combine(const uint first, const uint second) { - return (first << 4 | second); -} - -$if NO_INT8_BUFFERS: - uint extract_comp(const uint packed4, const uint idx) { - return (packed4 >> (idx * 8)) & 0xFF; - } - -/* - * This shader packs the weight tensor into a texture. - * - * The original tensor has a (W, H) shape of (K / 2, N) and each scalar element - * is a uint8_t, which contains 2 packed 4 bit uint values. - * - * The transform performed by this shader is to first transpose the tensor, so - * the shape of the packed tensor becomes (N / 2, K). Then, the 4 bit integers - * are re-packed in groups of 8. For each 4 uint8_t values, the "left" 4-bits - * of each value contain the 0, 1, 2, 3 4-bit values, and the "right" 4-bits of - * each value contain the 4, 5, 6, 7 4-bit values. - * - * As a concrete example, consider the following weight tensor. The | demarks - * the packing boundary, so 1| 2 represents a single uint8_t value with 1 in the - * leftmost 4 bits and 2 in the rightmost 4 bits. - * - * 1| 2, 3| 4, 5| 6, 7| 8, - * 9|10, 11|12, 13|14, 15|16, - * 17|18, 19|20, 21|22, 23|24, - * 25|26, 27|28, 29|30, 31|32, - * 33|34, 35|36, 37|38, 39|40, - * 41|42, 43|44, 45|46, 47|48, - * 49|50, 51|52, 53|54, 55|56, - * 57|58, 59|60, 61|62, 63|64, - * - * After packing, the packed tensor would contain - * - * 1|33, 9|41, 17|49, 25|57, - * 2|34, 10|42, 18|50, 26|58, - * 3|35, 11|43, 19|51, 27|59, - * 4|36, 12|44, 20|52, 28|60, - * 5|37, 13|45, 21|53, 29|61, - * 6|38, 14|46, 22|54, 30|62, - * 7|39, 15|47, 23|55, 31|63, - * 8|40, 16|48, 24|56, 32|64, - * - * The purpose of interleaving is to make it easier to extract the unpacked - * values in order using the u8vec4 vectorized type. 
With the packing in place, - * The 4-bit values can be extracted via - * - * u8vec4 packed; - * u8vec4 vals_0123 = (packed & 0xF0) >> 4; - * u8vec4 vals_4567 = (packed | 0x0F); - */ -void main() { - // Each thread writes 2 output texels along the height axis - ivec2 packed_pos = ivec2( - gl_GlobalInvocationID.x, - gl_GlobalInvocationID.y << 1); - - // The packed tensor is width packed - if ((packed_pos.x << 2) >= qmat2_sizes.x || packed_pos.y >= qmat2_sizes.y) { - return; - } - - int out_col = packed_pos.x << 3; - int out_row = packed_pos.y; - - int in_col = out_row; - int in_int8_col = in_col >> 1; - int in_row = out_col; - - int in_numrows = qmat2_sizes.x << 1; - int in_numcols = qmat2_sizes.y; - int in_num_int8_cols = qmat2_sizes.y >> 1; - - uint in_vals[8][2]; - for (int r = 0; r < 8; ++r) { - if (in_row + r < in_numrows) { - uint scalar_idx = (in_row + r) * in_num_int8_cols + in_int8_col; - $if NO_INT8_BUFFERS: - BUF_T in_val_packed_texel = nchw_4x2[scalar_idx >> 2]; - const uint packed_idx = scalar_idx % 4; - uint in_val_packed = extract_comp(in_val_packed_texel, packed_idx); - $else: - BUF_T in_val_packed = nchw_4x2[scalar_idx]; - - in_vals[r][0] = get_first(in_val_packed); - in_vals[r][1] = get_second(in_val_packed); - } else { - in_vals[r][0] = uint(0); - in_vals[r][1] = uint(0); - } - } - - UVEC4_T out_tex_1 = UVEC4_T( - combine(in_vals[0][0], in_vals[4][0]), - combine(in_vals[1][0], in_vals[5][0]), - combine(in_vals[2][0], in_vals[6][0]), - combine(in_vals[3][0], in_vals[7][0])); - - UVEC4_T out_tex_2 = UVEC4_T( - combine(in_vals[0][1], in_vals[4][1]), - combine(in_vals[1][1], in_vals[5][1]), - combine(in_vals[2][1], in_vals[6][1]), - combine(in_vals[3][1], in_vals[7][1])); - - $if STORAGE == "buffer": - int stride = qmat2_sizes.x >> 2; - t_qmat2[packed_pos.y * stride + packed_pos.x] = out_tex_1; - t_qmat2[(packed_pos.y + 1) * stride + packed_pos.x] = out_tex_2; - $else: - imageStore(t_qmat2, packed_pos.xy, out_tex_1); - imageStore(t_qmat2, ivec2(packed_pos.x, packed_pos.y + 1), out_tex_2); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.yaml b/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.yaml deleted file mode 100644 index 145f4301f14..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -pack_int4_linear_weight_transposed_interleaved: - parameter_names_with_default_values: - STORAGE: texture2d - NO_INT8_BUFFERS: false - shader_variants: - - NAME: pack_int4_linear_weight_transposed_interleaved_texture2d - - NAME: pack_int4_linear_weight_transposed_interleaved_buffer - STORAGE: buffer - - NAME: pack_int4_linear_weight_transposed_interleaved_nobitw8buffer_texture2d - NO_INT8_BUFFERS: true diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_q4_linear_weight.glsl b/backends/vulkan/runtime/graph/ops/glsl/pack_q4_linear_weight.glsl deleted file mode 100644 index b9f5c994910..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/pack_q4_linear_weight.glsl +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-${define_active_storage_type(STORAGE)}
-
-layout(std430) buffer;
-
-${layout_declare_tensor(B, "w", "t_packed_int4_weight", "int", STORAGE, is_scalar_array=False)}
-${layout_declare_tensor(B, "r", "t_int4_weight", "uint", "buffer")}
-
-layout(push_constant) uniform restrict Block {
-  ivec4 qmat2_sizes;
-  ivec2 orig_sizes;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-#include "common.glslh"
-#include "linear_int4_weight_block.glslh"
-
-void main() {
-  const int k8 = int(gl_GlobalInvocationID.x);
-  const int n8 = int(gl_GlobalInvocationID.y);
-
-  const int K = orig_sizes.x;
-  const int N = orig_sizes.y;
-
-  // Each shader invocation processes a 4x8 block of the input data.
-  const int K4 = div_up_4(K);
-  const int K8 = div_up_8(K);
-  const int N8 = div_up_8(N);
-
-  // Check bounds
-  if (n8 >= N8 || k8 >= K8) {
-    return;
-  }
-
-  Int4Weight2xBlockSourceData src_data;
-  const int n = mul_8(n8);
-  if (N - n >= 8) {
-    load_block_source_data_no_checks(src_data, k8, n, K8, N);
-  } else {
-    load_block_source_data_with_checks(src_data, k8, n, K8, N);
-  }
-
-  // An 8Kx8N block of the weight matrix is loaded into memory. This will be
-  // split into two blocks, each holding 4Kx8N worth of data.
-  // The first block contains data for k + (0, 1, 2, 3), i.e. the first 4 columns
-  // of the loaded weight block.
-  Int4WeightBlockPacked packed_block_1;
-  // The second block contains data for k + (4, 5, 6, 7), i.e. the second 4 cols
-  // of the loaded weight block.
-  Int4WeightBlockPacked packed_block_2;
-  create_packed_blocks(packed_block_1, packed_block_2, src_data);
-
-  const int k4 = mul_2(k8);
-  write_packed_block(packed_block_1, k4, n8, K4);
-  write_packed_block(packed_block_2, k4 + 1, n8, K4);
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_q4_linear_weight.yaml b/backends/vulkan/runtime/graph/ops/glsl/pack_q4_linear_weight.yaml
deleted file mode 100644
index 7a145ec95d7..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/pack_q4_linear_weight.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-pack_q4_linear_weight:
-  parameter_names_with_default_values:
-    STORAGE: buffer
-  shader_variants:
-    - NAME: pack_q4_linear_weight_buffer
-      STORAGE: buffer
-    - NAME: pack_q4_linear_weight_texture2d
-      STORAGE: texture2d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_q8_linear_weight.glsl b/backends/vulkan/runtime/graph/ops/glsl/pack_q8_linear_weight.glsl
deleted file mode 100644
index f2c74b67283..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/pack_q8_linear_weight.glsl
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-${define_active_storage_type(STORAGE)}
-
-layout(std430) buffer;
-
-${layout_declare_tensor(B, "w", "t_packed_int8_weight", "int", STORAGE, is_scalar_array=False)}
-${layout_declare_tensor(B, "r", "t_int8_weight", "int", "buffer")}
-
-layout(push_constant) uniform restrict Block {
-  ivec4 qmat2_sizes;
-  ivec2 orig_sizes;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-#include "common.glslh"
-#include "linear_int8_weight_block.glslh"
-
-void main() {
-  // The size of the source weight tensor is [W=K, H=N]. Each shader invocation
-  // processes a 4x4 block. The thread position corresponds to the block index.
-  int n4 = int(gl_GlobalInvocationID.x);
-  int k4 = int(gl_GlobalInvocationID.y);
-
-  const int K = orig_sizes.x;
-  const int N = orig_sizes.y;
-
-  // Determine the total number of blocks and check bounds
-  const int N4 = div_up_4(N);
-  const int K4 = div_up_4(K);
-  if (n4 >= N4 || k4 >= K4) {
-    return;
-  }
-
-  // Each block is represented as an ivec4. Each int corresponds to a row, i.e.
-  // the N dim of the weight tensor, and contains data for 4 columns, i.e. the K dim.
-  Int8WeightBlock block;
-  const int n = mul_4(n4);
-  if (N - n >= 4) {
-    load_block_data_no_checks(block, k4, n, K4, N);
-  } else {
-    load_block_data_with_checks(block, k4, n, K4, N);
-  }
-
-  // The weight blocks are stored in a transposed manner, such that weight blocks
-  // are indexed like packed_weight[k4][n4]. This is to optimize memory
-  // coalescing when computing tiled GEMM.
-  write_weight_block(block, n4, k4, N4);
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/pack_q8_linear_weight.yaml b/backends/vulkan/runtime/graph/ops/glsl/pack_q8_linear_weight.yaml
deleted file mode 100644
index 13e6d43b2c5..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/pack_q8_linear_weight.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
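# Template for the shaders that repack an 8-bit weight matrix into 4x4 blocks
# (one ivec4 per block), stored transposed so that blocks are indexed as
# packed_weight[k4][n4] for coalesced reads during tiled GEMM. A buffer and a
# texture2d variant are generated via the STORAGE parameter.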
- -pack_q8_linear_weight: - parameter_names_with_default_values: - STORAGE: buffer - shader_variants: - - NAME: pack_q8_linear_weight_buffer - STORAGE: buffer - - NAME: pack_q8_linear_weight_texture2d - STORAGE: texture2d diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/pad_channel.glsl deleted file mode 100644 index 8c01ebef897..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/pad_channel.glsl +++ /dev/null @@ -1,80 +0,0 @@ -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(2, "ivec4", "out_sizes")} -${layout_declare_ubo(3, "ivec4", "in_sizes")} -${layout_declare_ubo(4, "int", "pad_left", "int", "pad_top", "int", "pad_front")} -${layout_declare_ubo(5, "float", "fill_value")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim); - - if (pos_out_of_bounds(pos, out_sizes, packed_dim)) { - return; - } - - VEC4_T outtex = VEC4_T(fill_value); - // mask_z/y/x is used to determine whether need to fecth data from input tensor - bool mask_z = (idx.z + 3) < pad_front || idx.z > (pad_front + in_sizes.z - 1); - bool mask_y = idx.y >= pad_top && idx.y <= pad_top + in_sizes.y - 1; - bool mask_x = idx.x >= pad_left && idx.x <= pad_left + in_sizes.x - 1; - - if (!mask_z && mask_y && mask_x) { - // channel_mask is to determine the situation that when padding channel dimension, - // in one texel, some elements are filled vaule and some value are from input tensor - ivec4 c_ind = ivec4(idx.z) + ivec4(0, 1, 2, 3); - ivec4 channel_mask = ivec4(lessThan(c_ind, ivec4(pad_front))) + ivec4(greaterThan(c_ind, ivec4(pad_front + in_sizes.z - 1))); - - ivec4 in_idx = idx; - in_idx.x -= pad_left; - in_idx.y -= pad_top; - in_idx.z -= divup4(pad_front) * 4; - const int shift = pad_front % 4; - VEC4_T cur_in_texel = texelFetch(t_in, to_texture_pos(in_idx, in_sizes, packed_dim), 0); - VEC4_T next_in_texel; - // When shift is not 0, we need to read 2 texels from input tensor to write into output - // for example: - // input texel is [[1 2 3 4], [5 6 x x]] and front_pad = 2 - // output texel is [[p p 1 2], [3 4 5 6]], where p is the filled value then need to fetch 2 texels to fill [3 4 5 6]. 
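    // Illustrative note: shift == pad_front % 4. The first `shift` elements of
    // the gathered texel come from the tail of cur_in_texel, and the remaining
    // (4 - shift) elements come from the head of next_in_texel.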
- if (shift != 0) { - in_idx.z += 4; - next_in_texel = texelFetch(t_in, to_texture_pos(in_idx, in_sizes, packed_dim), 0); - } else { - next_in_texel = cur_in_texel; - } - - VEC4_T inter_texel; - for (int i = 0; i < 4; i++) { - if (i < shift) { - inter_texel[i] = cur_in_texel[4-shift+i]; - } else { - inter_texel[i] = next_in_texel[i-shift]; - } - } - outtex = inter_texel * (VEC4_T(1) - channel_mask) + outtex * channel_mask; - } - - int packed_idx = idx[packed_dim]; - const int packed_dim_size = out_sizes[packed_dim]; - if (packed_idx + 3 >= packed_dim_size) { - ivec4 packed_ind = ivec4(packed_idx) + ivec4(0, 1, 2, 3); - VEC4_T valid_idx = VEC4_T(lessThan(packed_ind, ivec4(packed_dim_size))); - outtex = outtex * valid_idx; - } - - imageStore(t_out, pos, outtex); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_channel.yaml b/backends/vulkan/runtime/graph/ops/glsl/pad_channel.yaml deleted file mode 100644 index 02afc3846a2..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/pad_channel.yaml +++ /dev/null @@ -1,12 +0,0 @@ -pad_channel: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - PACKING: C_packed - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: pad_channel diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.glsl b/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.glsl deleted file mode 100644 index c5b2c692bdc..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.glsl +++ /dev/null @@ -1,50 +0,0 @@ -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(2, "ivec4", "out_sizes")} -${layout_declare_ubo(3, "ivec4", "in_sizes")} -${layout_declare_ubo(4, "int", "pad_left", "int", "pad_top", "int", "pad_front")} -${layout_declare_ubo(5, "float", "fill_value")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, out_sizes, packed_dim); - - if (pos_out_of_bounds(pos, out_sizes, packed_dim)) { - return; - } - - bool mask_height = idx.y >= pad_top && idx.y <= pad_top + in_sizes.y - 1; - bool mask_width = idx.x >= pad_left && idx.x <= pad_left + in_sizes.x - 1; - - VEC4_T outtex = VEC4_T(fill_value); - if (mask_height && mask_width) { - ivec4 in_idx = idx; - in_idx.x -= pad_left; - in_idx.y -= pad_top; - outtex = texelFetch(t_in, to_texture_pos(in_idx, in_sizes, packed_dim), 0); - } - - int packed_idx = idx[packed_dim]; - const int packed_dim_size = out_sizes[packed_dim]; - if (packed_idx + 3 >= packed_dim_size) { - ivec4 packed_ind = ivec4(packed_idx) + ivec4(0, 1, 2, 3); - VEC4_T valid_idx = VEC4_T(lessThan(packed_ind, ivec4(packed_dim_size))); - outtex = outtex * valid_idx; - } - - imageStore(t_out, pos, outtex); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.yaml b/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.yaml deleted file mode 100644 index dd74ec9cc28..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/pad_height_width.yaml +++ /dev/null @@ -1,12 +0,0 @@ -pad_height_width: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - PACKING: C_packed - STORAGE: texture3d - 
generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - shader_variants: - - NAME: pad_height_width diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.glsl deleted file mode 100644 index 3447ab07552..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.glsl +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type("buffer")} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing.glslh" - -${layout_declare_tensor(B, "w", "t_outp", DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_inp", DTYPE, "buffer")} - -${layout_declare_ubo(B, "BufferMetadata", "outp")} -${layout_declare_ubo(B, "BufferMetadata", "inp")} - -${layout_declare_ubo(B, "ivec4[DIMLIMIT_DIV4]", "permute_order")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const uint inp_bufi = gl_GlobalInvocationID.x; - if (inp_bufi >= numel(inp)) { - return; - } - - TensorIndex inp_tidx; - linear_idx_to_tensor_idx(inp, inp_bufi, inp_tidx); - - TensorIndex outp_tidx = inp_tidx; - permute(outp_tidx, permute_order); - - const uint outp_bufi = tensor_idx_to_linear_idx(outp, outp_tidx); - // Copy data from input to output - t_outp[outp_bufi] = t_inp[inp_bufi]; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.yaml deleted file mode 100644 index 81675ae8917..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/permute_buffer.yaml +++ /dev/null @@ -1,10 +0,0 @@ -permute_buffer: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: permute_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl deleted file mode 100644 index 274077f4181..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type("texture3d")} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")} - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; - ivec4 permute_dims; // Permutation mapping: permute_dims[i] = j means output dim i comes from input dim j -}; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int out_packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); -const lowp int in_packed_dim = unhash_packed_dim(in_layout); - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -// Convert output tensor index to input tensor index based on permutation -ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) { - ivec4 in_tidx; - - // Apply the permutation mapping: in_tidx[permute_dims[i]] = out_tidx[i] - in_tidx[permute_dims.x] = out_tidx.x; - in_tidx[permute_dims.y] = out_tidx.y; - in_tidx[permute_dims.z] = out_tidx.z; - in_tidx[permute_dims.w] = out_tidx.w; - - return in_tidx; -} - -// Check if we can use the fast path where texels from the input tensor can be -// copied directly into the output tensor. This occurs when the packed dimension -// is preserved in the permutation, i.e. reading a texel from the output tensor -// produces 4 texels along the same dimension as reading a texel from the input -// tensor. -bool can_use_fast_path() { - // Fast path is possible when the packed dimension is preserved in the permutation - // This means permute_dims[out_packed_dim] == in_packed_dim - return permute_dims[out_packed_dim] == in_packed_dim; -} - -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim); - - if (any(greaterThanEqual(out_tidx, out_sizes))) { - return; - } - - if (can_use_fast_path()) { - // Fast path: packed dimension is preserved, so we can copy texels directly - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); - ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); - VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos)); - - write_texel_lpos(t_out, lpos, in_texel, out_axis_map); - } - else { - // Slow path: packed dimension is not preserved, so each element of the - // output texel may be "sourced" from a different texel in the input tensor. - // Therefore each output texel element is processed individually. 
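    // Example (illustrative): transposing the W and H dims of a width-packed
    // tensor. The output's packed (width) dim then maps to the input's height
    // dim, so the 4 elements of one output texel come from 4 different input
    // texels and must be gathered one element at a time.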
-    VEC4_T out_texel = VEC4_T(0);
-
-    for (int texel_i = 0; texel_i < 4; ++texel_i) {
-      ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
-      ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim);
-      int element_idx = in_tidx[in_packed_dim] % 4;
-
-      VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos));
-      T selected_value = T(in_texel[element_idx]);
-
-      out_texel[texel_i] = selected_value;
-
-      out_tidx[out_packed_dim]++;
-    }
-
-    write_texel_lpos(t_out, lpos, out_texel, out_axis_map);
-  }
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/permute_texture.yaml
deleted file mode 100644
index f68b8dcdd3d..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/permute_texture.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-permute_texture:
-  parameter_names_with_default_values:
-    DTYPE: float
-  generate_variant_forall:
-    DTYPE:
-      - VALUE: half
-      - VALUE: float
-      - VALUE: int32
-  shader_variants:
-    - NAME: permute_texture3d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/qlinear_utils.glslh b/backends/vulkan/runtime/graph/ops/glsl/qlinear_utils.glslh
deleted file mode 100644
index 80ec44c153a..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/qlinear_utils.glslh
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#ifndef QLINEAR_UTILS_H
-#define QLINEAR_UTILS_H
-
-/***********************************
- * Packed Weight data read/write functions
- *
- * These functions assume that t_qmat2 is declared in the shader layout as a storage
- * buffer or storage image.
- */
-
-#ifdef BUFFER_WEIGHT
-
-uvec4 load_transposed_weight_block(const uint k4, const uint n8, const uint K4) {
-  return t_qmat2[n8 * K4 + k4];
-}
-
-#else // TEXTURE_WEIGHT
-
-uvec4 load_transposed_weight_block(const uint k4, const uint n8, const uint K4) {
-  return texelFetch(t_qmat2, ivec2(k4, n8), 0);
-}
-
-#endif // BUFFER_WEIGHT
-
-/***********************************
- * Packed weight data extraction functions
- */
-
-/*
- * uvec4 block contains a packed 4 high x 8 wide matrix of 4-bit signed integers. This
- * function extracts the 4-bit values at the given column and row index.
- *
- * Each uint in the uvec4 corresponds to one row; thus the desired row can be extracted
- * via block[row]. From there, column 0 is packed in bits 28-31, column 1 is packed into
- * bits 24-27, column 2 is packed into bits 20-23, and so on. To extract the desired
- * value:
- *
- * 1. First, shift the row uint by 4 * (7 - col) bits
- * 2. Apply a mask of 0b1111 = 15
- *
- * Finally, convert the masked value to int and subtract 8 from it to obtain the desired
- * signed integer.
- */
-T extract_4bit_from_transposed_block(const uvec4 block, const uint col, const uint row) {
-  return T(int((block[row] >> (4 * (7 - col))) & 15) - 8);
-}
-
-/***********************************
- * Input/Output read/write functions
- *
- * These functions assume that t_input and t_output are declared in the shader layout as
- * storage buffers or storage images.
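 * When BUFFER_IO is defined, the buffer-backed variants below are used;
 * otherwise the texture-backed variants are used. Both sets share the same
 * signatures, so calling shaders can remain storage-agnostic.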
- */ - -#ifdef BUFFER_IO - -VEC4_T load_input_texel_1d(const uint k4) { - return t_input[k4]; -} - -VEC4_T load_input_texel_2d( - const uint k4, - const uint m, - const uint K4) { - return t_input[(m * K4) + k4]; -} - -void write_output_texel_1d(const VEC4_T out_texel, const uint n4) { - t_output[n4] = out_texel; -} - -void write_output_texel_2d( - const VEC4_T out_texel, - const uint n4, - const uint m, - const uint N4) { - t_output[m * N4 + n4] = out_texel; -} - -#else // TEXTURE_IO - -VEC4_T load_input_texel_1d(const uint k4) { - return texelFetch(t_input, ivec3(k4, 0, 0), 0); -} - -VEC4_T load_input_texel_2d( - const uint k4, - const uint m, - const uint K4) { - return texelFetch(t_input, ivec3(k4, m, 0), 0); -} - - -void write_output_texel_1d(const VEC4_T out_texel, const uint n4) { - imageStore(t_output, ivec3(n4, 0, 0), out_texel); -} - -void write_output_texel_2d( - const VEC4_T out_texel, - const uint n4, - const uint m, - const uint N4) { - imageStore(t_output, ivec3(n4, m, 0), out_texel); -} - -#endif // BUFFER_IO - -#endif // QLINEAR_UTILS_H diff --git a/backends/vulkan/runtime/graph/ops/glsl/qlinear_weight_pack_utils.glslh b/backends/vulkan/runtime/graph/ops/glsl/qlinear_weight_pack_utils.glslh deleted file mode 100644 index 1f481f4f859..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/qlinear_weight_pack_utils.glslh +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef QLINEAR_WEIGHT_PACK_UTILS_H -#define QLINEAR_WEIGHT_PACK_UTILS_H - -/*********************************** - * Packed Weight data write functions - * - * These functions assume that t_qmat2 has been defined in the shader layout as either - * a storage buffer or a storage image. - */ - -#ifdef BUFFER_WEIGHT - -void write_transposed_weight_block(const uvec4 block, const uint k4, const uint n8, const uint K4) { - t_qmat2[n8 * K4 + k4] = block; -} - -#else // TEXTURE_WEIGHT - -void write_transposed_weight_block(const uvec4 block, const uint k4, const uint n8, const uint K4) { - imageStore(t_qmat2, ivec2(k4, n8), block); -} - -#endif // BUFFER_WEIGHT - -/*********************************** - * Utilities for packing weight data - */ - -uint extract_4bit_from_packed_uint_le(const uint packed, const uint i) { - // account for little endian - uint byte = packed >> (8 * (i / 2)) & 255; - return (byte >> (4 - 4 * (i % 2))) & 15; -} - -uint pack_8x4bit_into_uint( - const uint val0, - const uint val1, - const uint val2, - const uint val3, - const uint val4, - const uint val5, - const uint val6, - const uint val7) { - return uint( - (val0 << 28) | (val1 << 24) | (val2 << 20) | (val3 << 16) | (val4 << 12) | - (val5 << 8) | (val6 << 4) | val7 - ); -} - -#endif // QLINEAR_WEIGHT_PACK_UTILS_H diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize.glslh b/backends/vulkan/runtime/graph/ops/glsl/quantize.glslh deleted file mode 100644 index cde72e41ac7..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/quantize.glslh +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#ifndef QUANTIZE_GLSLH -#define QUANTIZE_GLSLH - -OUT_T quantize_val(IN_T value, float scale_val, int zero_point_val) { - float inv_scale = 1.0 / scale_val; - - float rounded_float = round(inv_scale * float(value)); - - int qvalue = zero_point_val + int(rounded_float); - - qvalue = max(qvalue, quant_min); - qvalue = min(qvalue, quant_max); - - return OUT_T(qvalue); -} - -#endif // QUANTIZE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_im2col.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_im2col.glsl deleted file mode 100644 index 450d6376537..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_im2col.glsl +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, INPUT_STORAGE)} -#define T ${texel_load_component_type(DTYPE, INPUT_STORAGE)} - -$if OUTPUT_STORAGE == "buffer": - #define OUTPUT_BUFFER -$if INPUT_STORAGE == "buffer": - #define INPUT_BUFFER - -#define TILE_M4 1 -#define TILE_N4 1 -#define TILE_K4 1 - -#define TILE_M 4 -#define TILE_N 4 -#define TILE_K 4 - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "conv2d_common.glslh" - -${layout_declare_tensor(B, "w", "t_packed_int8_input", "int", OUTPUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)} - -// Sizes of the im2col matrix of the convolution input -${layout_declare_ubo(B, "ivec4", "matrix_sizes")} -// Sizes of the input image -${layout_declare_ubo(B, "ivec4", "input_sizes")} -// Sizes of the output image -${layout_declare_ubo(B, "ivec4", "output_sizes")} - -${layout_declare_ubo(B, "Conv2DParams", "conv2d_params")} - -layout(push_constant) uniform restrict Block { - float inv_scale; - int zp; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#include "conv2d_fp_im2col_block_load.glslh" -#include "linear_int8_input_block.glslh" - -void main() { - // The quantized and packed im2col matrix can be conceptualized as a 2D matrix - // with K/4 columns and M/4 rows. Each element of the matrix is a ivec4 which - // contains packed data for a 4 wide x 4 high block of the original im2col - // matrix. Each shader invocation works on writing out one ivec4, i.e. one - // block of the quantized and packed matrix. - - // Thread id corresponds to the block index - const int k4 = int(gl_GlobalInvocationID.x); - const int m4 = int(gl_GlobalInvocationID.y); - - // Convert block idx to tensor idx - const int k = mul_4(k4); - const int m = mul_4(m4); - - const int logical_K = conv2d_params.logical_K; - // Similarly, compute the logical size of the M dim. 
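  // (One row of the im2col matrix corresponds to one output spatial position
  // per batch element, hence logical_M = out_W * out_H * N.)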
- const int logical_M = output_sizes.x * output_sizes.y * output_sizes.w; - - // Check if tensor indices are out of bounds - if (k >= logical_K || m >= logical_M) { - return; - } - - FPInputTile in_tile; - load_input_im2col_tile(in_tile, k4, m4, logical_K, logical_M); - - Int8InputBlock packed_block; - quantize_and_pack(packed_block, in_tile, inv_scale, zp); - - // Number of texels in the x dim of the output matrix - const int K4 = div_4(matrix_sizes.x); - write_block(packed_block, k4, m4, K4); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_im2col.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_im2col.yaml deleted file mode 100644 index 93f8269d607..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_im2col.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -quantize_and_pack_im2col: - parameter_names_with_default_values: - DTYPE: float - OUTPUT_STORAGE: buffer - INPUT_STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: float - shader_variants: - - NAME: quantize_and_pack_im2col_buffer_texture3d - - NAME: quantize_and_pack_im2col_texture3d_texture3d - OUTPUT_STORAGE: texture3d diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_linear_input.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_linear_input.glsl deleted file mode 100644 index 6ba9343f10d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_linear_input.glsl +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, INPUT_STORAGE)} -#define T ${texel_load_component_type(DTYPE, INPUT_STORAGE)} - -$if OUTPUT_STORAGE == "buffer": - #define OUTPUT_BUFFER -$if INPUT_STORAGE == "buffer": - #define INPUT_BUFFER - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "common.glslh" - -${layout_declare_tensor(B, "w", "t_packed_int8_input", "int", OUTPUT_STORAGE, is_scalar_array=False)} -${layout_declare_tensor(B, "r", "t_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)} - -$if GRANULARITY == "per_channel": - ${layout_declare_tensor(B, "r", "t_scale", DTYPE, "buffer")} - -${layout_declare_ubo(B, "ivec4", "input_sizes")} - -layout(push_constant) uniform restrict Block { - float inv_scale; - int zp; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#include "linear_int8_input_block.glslh" -#include "linear_fp_input_tile_load.glslh" - -void main() { - // Each input block contains 4x4 int8 quantized values, which are packed into - // a ivec4. k4 and m4 represent the "block index" of the current block being - // processed. - int k4 = int(gl_GlobalInvocationID.x); - int m4 = int(gl_GlobalInvocationID.y); - - const int K = input_sizes.x; - const int M = input_sizes.y; - - // K4 and M4 represent the number of blocks in each dimension. - const int K4 = div_up_4(K); - const int M4 = div_up_4(M); - - if (k4 >= K4 || m4 >= M4) { - return; - } - - // row of the input tensor to start loading from. 
Note the input tensor is - // interpreted as a t - const int m = mul_4(m4); - - const bool dont_check_bounds = (M - m) >= 4; - - FPInputTile in_tile; - if (dont_check_bounds) { - load_input_tile_no_checks(in_tile, k4, m, K4, M); - } else { - load_input_tile_with_checks(in_tile, k4, m, K4, M); - } - - Int8InputBlock packed_block; - quantize_and_pack(packed_block, in_tile, inv_scale, zp); - - write_block(packed_block, k4, m4, K4); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_linear_input.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_linear_input.yaml deleted file mode 100644 index 37721db1ba8..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/quantize_and_pack_linear_input.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -quantize_and_pack_linear_input: - parameter_names_with_default_values: - DTYPE: float - OUTPUT_STORAGE: texture3d - INPUT_STORAGE: texture3d - STORAGE: texture3d - GRANULARITY: per_tensor - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: quantize_and_pack_linear_input_per_tensor_texture3d_texture3d - - NAME: quantize_and_pack_linear_input_per_tensor_buffer_texture3d - OUTPUT_STORAGE: buffer - - NAME: quantize_and_pack_linear_input_per_tensor_buffer_buffer - OUTPUT_STORAGE: buffer - INPUT_STORAGE: buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.glsl deleted file mode 100644 index 7bf3a932c6c..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.glsl +++ /dev/null @@ -1,257 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define IN_T ${buffer_scalar_type(IN_DTYPE)} -#define OUT_T ${buffer_scalar_type(OUT_DTYPE)} -#define SCALE_T ${buffer_scalar_type(SCALE_DTYPE)} -#define ZP_T ${buffer_scalar_type(ZP_DTYPE)} - -#define ${MODE} - -${define_active_storage_type("buffer")} -${define_required_extensions(IN_DTYPE)} -${define_required_extensions(OUT_DTYPE)} -${define_required_extensions(SCALE_DTYPE)} -${define_required_extensions(ZP_DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "w", "t_out", OUT_DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "buffer")} - -$if MODE == "per_tensor": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int quant_min; - int quant_max; - }; -$if MODE == "per_token": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int num_tokens; - int quant_min; - int quant_max; - }; -$if MODE == "per_channel": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int axis; - int num_channels; - int quant_min; - int quant_max; - }; -$if MODE == "block_wise": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - ivec4 blockSize; // bW, bH, bC, bN - ivec4 numBlocks; // tW/bW, tH/bH, tC/bC, tN/bN - ivec4 blockStride; // pre-computed linear strides for the block grid - int quant_min; - int quant_max; - }; - -${layout_declare_ubo(B, "int", "out_numel")} -${layout_declare_ubo(B, "ivec4", "t_in_sizes")} -${layout_declare_ubo(B, "ivec4", "t_in_strides")} -${layout_declare_ubo(B, "ivec4", "t_out_sizes")} -${layout_declare_ubo(B, "ivec4", "t_out_strides")} - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} - -#include "quantize.glslh" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); -const lowp ivec4 in_dim_order = unhash_dim_order(in_layout); - -/* - Quantization Shader (Buffer Storage) - This shader converts floating-point tensor values to n-bit integer representations - using pre-computed quantization parameters (scale and zero_point). The quantization - maps floating-point values to a discrete integer range while preserving the original - data distribution as much as possible. - - Important Considerations: - (+) All input tensors are assumed to be WIDTH_PACKED (i.e., contiguous in the last dimension) - (+) The axis map layout is assumed to be a standard layout for scales and zero_points - (++) The scale and zero_point tensors must be implemented as buffers - - Workgroup Configuration: - - quantize_per_tensor - This mode applies uniform quantization across the entire tensor using a single scale - and zero_point value. 
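    For example, with scale = 0.05 and zero_point = 0, an input value of 1.26
    maps to round(1.26 / 0.05) + 0 = 25, before clamping to [quant_min, quant_max].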
- - (*) global_wg_size: default - (*) local_wg_size: default - - - quantize_per_token - This mode applies quantization individually to each token (or element) in the input, - using separate scale and zero_point values for each token. For instance if we have - a tensor of shape [B, S, H] then we have B*S tokens (and s+zp pairs) of H elements each. - - (*) global_wg_size: default - (*) local_wg_size: default - - - quantize_per_channel - This mode applies quantization separately to each channel of the input tensor, using - distinct scale and zero_point values for each channel. For example, if the tensor shape - is [B, C, H, W] and axis = 1, quantization parameters are computed per channel C, allowing - each channel to be quantized independently. - - (*) global_wg_size: default - (*) local_wg_size: default - - - quantize_block_wise - This mode applies quantization in blocks or groups of elements, allowing different scale - and zero_point values for each block. It is equivalent to quantize_affine, where quantization - parameters are affine transformations applied per block. For example, if the tensor shape - is [6, 9, 4] and blockSize = [3, 3, 2], then we have 12 blocks each with 18 elements. - - (*) global_wg_size: default - (*) local_wg_size: default - - Quantization Formula: - qvalue = clamp(round(value / scale) + zero_point, quant_min, quant_max). -*/ - -#ifdef per_tensor - -void quantize_per_tensor() { - const int out_bufi = int(gl_GlobalInvocationID.x); - - if (out_bufi >= out_numel) { - return; - } - - const ivec4 out_tidx = bufi_to_tidx(out_bufi, t_out_strides, out_dim_order); - const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides); - - IN_T value = t_in[in_bufi]; - OUT_T qvalue = quantize_val(value, float(t_scale[0]), int(t_zero_point[0])); - - t_out[out_bufi] = qvalue; -} - -#elif defined(per_token) - -void quantize_per_token() { - const int out_bufi = int(gl_GlobalInvocationID.x); - - if (out_bufi >= out_numel) { - return; - } - - const ivec4 out_tidx = bufi_to_tidx(out_bufi, t_out_strides, out_dim_order); - const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides); - - IN_T value = t_in[in_bufi]; - - int token_idx = 0; - - if (t_out_sizes.w > 1) { - // 4D tensor - token_idx = out_tidx.w * (t_out_sizes.z * t_out_sizes.y) + out_tidx.z * t_out_sizes.y + out_tidx.y; - } else if (t_out_sizes.z > 1) { - // 3D tensor - token_idx = out_tidx.z * t_out_sizes.y + out_tidx.y; - } else if (t_out_sizes.y > 1) { - // 2D tensor - token_idx = out_tidx.y; - } - // For 1D tensor, token_idx remains 0 - - token_idx = min(token_idx, num_tokens - 1); - - OUT_T qvalue = quantize_val(value, float(t_scale[token_idx]), int(t_zero_point[token_idx])); - - t_out[out_bufi] = qvalue; -} - -#elif defined(per_channel) - -void quantize_per_channel() { - const int out_bufi = int(gl_GlobalInvocationID.x); - - if (out_bufi >= out_numel) { - return; - } - - const ivec4 out_tidx = bufi_to_tidx(out_bufi, t_out_strides, out_dim_order); - const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides); - - IN_T value = t_in[in_bufi]; - - // Calculate channel index based on the quantization axis (already converted to WHCN) - // The axis parameter is now in WHCN coordinate system: - // axis 0 -> W dimension (tidx.x) - // axis 1 -> H dimension (tidx.y) - // axis 2 -> C dimension (tidx.z) - // axis 3 -> N dimension (tidx.w) - int channel_idx = 0; - - if (axis == 0) { - channel_idx = out_tidx.x; - } else if (axis == 1) { - channel_idx = out_tidx.y; - } else if (axis == 2) { - channel_idx = out_tidx.z; - } else if (axis == 3) { - 
channel_idx = out_tidx.w; - } - - channel_idx = min(channel_idx, num_channels - 1); - - OUT_T qvalue = quantize_val(value, float(t_scale[channel_idx]), int(t_zero_point[channel_idx])); - - t_out[out_bufi] = qvalue; -} - -#else // block_wise - -void quantize_block_wise() { - const int out_bufi = int(gl_GlobalInvocationID.x); - - if (out_bufi >= out_numel) { - return; - } - - const ivec4 out_tidx = bufi_to_tidx(out_bufi, t_out_strides, out_dim_order); - const int in_bufi = tidx_to_bufi(out_tidx, t_in_strides); - - IN_T value = t_in[in_bufi]; - - const ivec4 bcoord = out_tidx / blockSize; - - const int block_id = bcoord.x * blockStride.x + bcoord.y * blockStride.y + bcoord.z * blockStride.z + bcoord.w * blockStride.w; - - const OUT_T qvalue = quantize_val(value, float(t_scale[block_id]), int(t_zero_point[block_id])); - - t_out[out_bufi] = qvalue; -} - -#endif - -void main() { - quantize_${MODE}(); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml deleted file mode 100644 index fb5853ecd20..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/quantize_buffer.yaml +++ /dev/null @@ -1,31 +0,0 @@ -quantize_buffer: - parameter_names_with_default_values: - IN_DTYPE: float - OUT_DTYPE: int32 - SCALE_DTYPE: float - ZP_DTYPE: int32 - MODE: per_tensor - generate_variant_forall: - IN_DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - OUT_DTYPE: - - VALUE: uint8 - - VALUE: int8 - - VALUE: int32 - SCALE_DTYPE: - - VALUE: float - ZP_DTYPE: - - VALUE: int8 - - VALUE: int32 - - VALUE: float - shader_variants: - - NAME: quantize_per_tensor_buffer - MODE: per_tensor - - NAME: quantize_per_token_buffer - MODE: per_token - - NAME: quantize_per_channel_buffer - MODE: per_channel - - NAME: quantize_block_wise_buffer - MODE: block_wise diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.glsl deleted file mode 100644 index 12e5769f50d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.glsl +++ /dev/null @@ -1,312 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define IN_T ${buffer_scalar_type(IN_DTYPE)} -#define FVEC4_T ${texel_load_type(IN_DTYPE, "texture3d")} - -#define OUT_T ${buffer_scalar_type(OUT_DTYPE)} -#define IVEC4_T ${texel_load_type(OUT_DTYPE, "texture3d")} -#define SCALE_T ${buffer_scalar_type(SCALE_DTYPE)} -#define ZP_T ${buffer_scalar_type(ZP_DTYPE)} - -#define ${MODE} - -${define_active_storage_type("texture3d")} -${define_required_extensions(IN_DTYPE)} -${define_required_extensions(OUT_DTYPE)} -${define_required_extensions(SCALE_DTYPE)} -${define_required_extensions(ZP_DTYPE)} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "w", "t_out", OUT_DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_in", IN_DTYPE, "texture3d")} - -$if MODE == "per_tensor": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int quant_min; - int quant_max; - }; -$if MODE == "per_token": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int num_tokens; - int quant_min; - int quant_max; - }; -$if MODE == "per_channel": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict Block { - int axis; - int num_channels; - int quant_min; - int quant_max; - }; -$if MODE == "block_wise": - ${layout_declare_tensor(B, "r", "t_scale", SCALE_DTYPE, "buffer")} - ${layout_declare_tensor(B, "r", "t_zero_point", ZP_DTYPE, "buffer")} - - layout(push_constant) uniform restrict BlockPC { - ivec4 blockSize; // WHCN - ivec4 numBlocks; // (#W,#H,#C,#N) - ivec4 blockStride; // {1, #W, #W * #H, #W * #H * #C} - int quant_min; - int quant_max; - }; - -${layout_declare_ubo(B, "ivec3", "t_in_limits")} -${layout_declare_ubo(B, "ivec3", "t_out_limits")} - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} - -#include "quantize.glslh" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -/* - Quantization Shader (Texture Storage) - This shader converts floating-point tensor values to n-bit integer representations - using pre-computed quantization parameters (scale and zero_point). The quantization - maps floating-point values to a discrete integer range while preserving the original - data distribution as much as possible. - - Important Considerations: - (+) All input tensors are assumed to be WIDTH_PACKED (i.e., contiguous in the last dimension) - (+) The axis map layout is assumed to be a standard layout for scales and zero_points - (++) The scale and zero_point tensors must be implemented as buffers - - Workgroup Configuration: - - quantize_per_tensor - This mode applies uniform quantization across the entire tensor using a single scale - and zero_point value. - - (*) global_wg_size: default - (*) local_wg_size: default - - - quantize_per_token - This mode applies quantization individually to each token (or element) in the input, - using separate scale and zero_point values for each token. For instance if we have - a tensor of shape [B, S, H] then we have B*S tokens (and s+zp pairs) of H elements each. 
- - (*) global_wg_size: default - (*) local_wg_size: default - - - quantize_per_channel - This mode applies quantization separately to each channel of the input tensor, using - distinct scale and zero_point values for each channel. For example, if the tensor shape - is [B, C, H, W] and axis = 1, quantization parameters are computed per channel C, allowing - each channel to be quantized independently. - - (*) global_wg_size: default - (*) local_wg_size: Default with special handling for batch dimension. When quantizing along - the batch axis, Z dimension is set to 1 to ensure correct workgroup dispatching. Otherwise, - uses standard workgroup size derived from global workgroup dimensions. - - - quantize_block_wise - This mode applies quantization in blocks or groups of elements, allowing different scale - and zero_point values for each block. It is equivalent to quantize_affine, where quantization - parameters are affine transformations applied per block. For example, if the tensor shape - is [6, 9, 4] and blockSize = [3, 3, 2], then we have 12 blocks each with 18 elements. - - (*) global_wg_size: default - (*) local_wg_size: Default with special handling for batch dimension. When quantizing along - the batch axis, Z dimension is set to 1 to ensure correct workgroup dispatching. Otherwise, - uses standard workgroup size derived from global workgroup dimensions. - - Quantization Formula: - qvalue = clamp(round(value / scale) + zero_point, quant_min, quant_max). -*/ - -#ifdef per_tensor - -void quantize_per_tensor() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, t_in_limits))) { - return; - } - - FVEC4_T intex = load_texel(t_in, pos); - IVEC4_T outtex; - - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T value = IN_T(intex[i]); - OUT_T qvalue = quantize_val(value, float(t_scale[0]), int(t_zero_point[0])); - outtex[i] = qvalue; - } - write_texel(t_out, pos, outtex); -} - -#elif defined(per_token) - -void quantize_per_token() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, t_in_limits))) { - return; - } - - FVEC4_T intex = load_texel(t_in, pos); - - int token_idx = 0; - ivec3 dims = t_in_limits; - - if (dims.z > 1) { - // 3D tensor - token_idx = pos.z * dims.y + pos.y; - } else if (dims.y > 1) { - // 2D tensor - token_idx = pos.y; - } - // For 1D tensor, token_idx remains 0 - - token_idx = min(token_idx, num_tokens - 1); - - // Scale and zero_point are prepacked as buffers, so direct access - float scale_val = float(t_scale[token_idx]); - int zero_point_val = int(t_zero_point[token_idx]); - - IVEC4_T outtex; - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T value = IN_T(intex[i]); - OUT_T qvalue = quantize_val(value, scale_val, zero_point_val); - outtex[i] = qvalue; - } - - write_texel(t_out, pos, outtex); -} - -#elif defined(per_channel) - -void quantize_per_channel() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, t_in_limits))) { - return; - } - - FVEC4_T intex = load_texel(t_in, pos); - IVEC4_T outtex; - - // Calculate channel index based on the quantization axis (already converted to WHCN) - // The axis parameter is now in WHCN coordinate system: - // axis 0 -> W dimension (pos.x for texture, but width-packed so pos.x * 4 + component) - // axis 1 -> H dimension (pos.y) - // axis 2 -> C dimension (pos.z / C), but for 4D tensors this includes batch-channel folding - // axis 3 -> N dimension (pos.z / N), but for 4D tensors this includes batch-channel folding - - if (axis == 0) { - 
// Width dimension - each texel component has different channel index - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T value = IN_T(intex[i]); - int channel_idx = pos.x * 4 + i; - channel_idx = min(channel_idx, num_channels - 1); - - float scale_val = float(t_scale[channel_idx]); - int zero_point_val = int(t_zero_point[channel_idx]); - OUT_T qvalue = quantize_val(value, scale_val, zero_point_val); - outtex[i] = qvalue; - } - } else if (axis == 1) { - // Height dimension - all texel components use same channel index - int channel_idx = pos.y; - channel_idx = min(channel_idx, num_channels - 1); - float scale_val = float(t_scale[channel_idx]); - int zero_point_val = int(t_zero_point[channel_idx]); - - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T value = IN_T(intex[i]); - OUT_T qvalue = quantize_val(value, scale_val, zero_point_val); - outtex[i] = qvalue; - } - } else if (axis == 2) { - // Channel dimension - for 4D tensors, need to account for batch-channel folding - // The Z coordinate contains folded batch*channel information - // We need to extract the actual channel index from the folded dimension - int folded_idx = pos.z; - int channel_idx = folded_idx % num_channels; - - float scale_val = float(t_scale[channel_idx]); - int zero_point_val = int(t_zero_point[channel_idx]); - - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T value = IN_T(intex[i]); - OUT_T qvalue = quantize_val(value, scale_val, zero_point_val); - outtex[i] = qvalue; - } - } else if (axis == 3) { - // Batch dimension - for 4D tensors, need to account for batch-channel folding - // The Z coordinate contains folded batch*channel information - // We need to extract the actual batch index from the folded dimension - int folded_idx = pos.z; - int batch_idx = folded_idx / num_channels; - - float scale_val = float(t_scale[batch_idx]); - int zero_point_val = int(t_zero_point[batch_idx]); - - [[unroll]] for (int i = 0; i < 4; ++i) { - IN_T value = IN_T(intex[i]); - OUT_T qvalue = quantize_val(value, scale_val, zero_point_val); - outtex[i] = qvalue; - } - } - - write_texel(t_out, pos, outtex); -} - -#else // block_wise - -void quantize_block_wise() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, t_in_limits))) - return; - - FVEC4_T intex = load_texel(t_in, pos); - IVEC4_T outtex; - - ivec4 base_tidx = ivec4(pos.x * 4, pos.y, pos.z, 0); - int foldedZ = pos.z; - - int C_total = numBlocks.z * blockSize.z; - - [[unroll]] for (int i = 0; i < 4; ++i) { - ivec4 tidx = ivec4(base_tidx.x + i, base_tidx.y, (foldedZ % C_total), (foldedZ / C_total)); - - ivec4 bcoord = tidx / blockSize; - int block_id = bcoord.x * blockStride.x + bcoord.y * blockStride.y + bcoord.z * blockStride.z + bcoord.w * blockStride.w; - - IN_T value = IN_T(intex[i]); - OUT_T qvalue = quantize_val(value, float(t_scale[block_id]), int(t_zero_point[block_id])); - outtex[i] = qvalue; - } - - write_texel(t_out, pos, outtex); -} - -#endif - -void main() { - quantize_${MODE}(); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml deleted file mode 100644 index 03d418ff2f7..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/quantize_texture.yaml +++ /dev/null @@ -1,31 +0,0 @@ -quantize_texture: - parameter_names_with_default_values: - IN_DTYPE: float - OUT_DTYPE: int32 - SCALE_DTYPE: float - ZP_DTYPE: int32 - MODE: per_tensor - generate_variant_forall: - IN_DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - OUT_DTYPE: - - VALUE: uint8 - 
- VALUE: int8 - - VALUE: int32 - SCALE_DTYPE: - - VALUE: float - ZP_DTYPE: - - VALUE: int8 - - VALUE: int32 - - VALUE: float - shader_variants: - - NAME: quantize_per_tensor_texture3d - MODE: per_tensor - - NAME: quantize_per_token_texture3d - MODE: per_token - - NAME: quantize_per_channel_texture3d - MODE: per_channel - - NAME: quantize_block_wise_texture3d - MODE: block_wise diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl b/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl deleted file mode 100644 index 7a6263d9f55..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/reduce.glsl +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} - -${define_active_storage_type(STORAGE)} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "tout", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)} - -${layout_declare_ubo(B, "ivec3", "tin_limits")} -${layout_declare_ubo(B, "ivec4", "tin_sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = 0; -layout(constant_id = 4) const int reduce_dim = 0; -layout(constant_id = 5) const int group_dim = 1; - -// A more verbose name would be NWORKERS_PER_GROUP. This describes the number of -// threads that will co-operate to compute one reduction output. There may be -// multiple groups computing distinct reduction outputs within one work group. -#define NWORKERS 4 - -// Sets an upper limit on the total size of a work group based on how many -// elements are allocated in the shared memory array below. Each thread in the -// work group will write into its assigned element in the shared array. -#define MAX_NTHREADS 16 - - -shared vec4 shared_vecs[MAX_NTHREADS]; - -#include "indexing_utils.h" - -int tid_to_smi(const ivec2 tid) { - return tid.x + tid.y * NWORKERS; -} - -/* - * The functions below compute reduction along a single dimension for a tensor. - * The shader template generalize reduction by abstracting the initial value of - * the accumulator, the calculation used to update the accumulator with new - * values, and a postprocessing calculation that can be used to modify the - * accumulator before writing to output. - * - * This shader also utilize shared memory to have multiple threads help compute - * the max and sum reduction operations. A total of NGROUPS x NWORKERS threads - * are expected to be launched. Each group works on a unique reduction "row", and - * within a group NWORKERS threads co-operate to compute the max and sum of one - * "row". Each worker in the group is responsible for computing a partial output - * of the "row" and uploading it to shared memory; the overall reduction output - * can then be determined by aggregating the partial outputs stored in shared - * memory. - * - * As a caveat, this shader does not currently support cases where `batch` > 1 - * and the reduce dim happens to also be the batch concatenation dim. To support - * this, there will need to be additional logic to set the starting value of - * `scan_pos[reduce_dim]`. 
Since this is not expected to be a common use-case, - * supporting this case is left as an exercise for when it is required. - */ - -// Initializing the accumulator accepts the first value in the reduction row, -// since some reduction operations (i.e. amax, amin) prefer to initialize with -// a data point instead of a static value. -#define INIT_ACCUM(first_val) ${INIT_ACCUM} -#define UPDATE_ACCUM(accum, new_val) ${UPDATE_ACCUM} -// Useful for operators such as mean which want to perform a final calculation -// with the accumulator. -#define POSTPROCESS(accum) ${POSTPROCESS} - -/* - * Computes reduction where the reduction dim is orthogonal to the packed dim. - * This case is simpler because each element of a texel belongs to a separate - * reduction "group", meaning we don't have to perform reduction along a texel. - */ -void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { - // shared memory index of this thread - const int smi = tid_to_smi(tid); - - scan_pos[reduce_dim] = 0; - vec4 accum = INIT_ACCUM(load_texel(tin, scan_pos)); - - scan_pos[reduce_dim] = tid.x; - // Partially accumulate over elements i, i + NWORKERS, i + 2*NWORKERS, ... of - // the reduction row - for (int i = tid.x; i < tin_sizes[reduce_dim]; - i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { - accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos)); - } - // Write partial output to shared memory and synchronize work group - shared_vecs[smi] = accum; - barrier(); - - // Since the reduction row is reduced to only one element, only the "main" - // thread in the group needs aggregate the partial outputs - if (tid.x == 0) { - // Iterate over the partial outputs to obtain the overall output - int group_i = tid.y * NWORKERS; - accum = shared_vecs[group_i++]; - for (int i = 1; i < NWORKERS; i++, group_i++) { - accum = UPDATE_ACCUM(accum, shared_vecs[group_i]); - } - - // Determine if there are any padding elements in the final texel of the - // packed dimension - const int nspill = mod4(tin_sizes[packed_dim]); - // Detect if this thread is working on the final texels of the packed - // dimension, which may have padding elements - const bool is_last_texel = - scan_pos[packed_dim] == (tin_limits[packed_dim] - 1); - - // Explicitly set padding elements to 0 - if (is_last_texel && nspill > 0) { - [[unroll]] for (int i = nspill; i < 4; i++) { - accum[i] = 0; - } - } - scan_pos[reduce_dim] = tid.x; - write_texel(tout, scan_pos, POSTPROCESS(accum)); - } -} - -/* - * Compute reduction where the reduction dim is also the packed dim. This case is - * complex because the reduction needs to occur over the individual texels. - * Therefore, in this algorithm each element of the accumulator texels are - * themselves partial outputs. Special care has to be taken to ignore padding - * elements in texels (which occur when the size of the packed dim is not a - * multiple of 4) so that they do not influence the output of reduction. - */ -void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { - // shared memory index of this thread - const int smi = tid_to_smi(tid); - - // Number of non-padding elements in the last texel in the reduction row - const int nspill = mod4(tin_sizes[packed_dim]); - // Only reduce up to the last "complete" texel. The last texel will need to be - // handled specially if it has padding elements. 
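
The cooperative scheme described in the comments above — NWORKERS threads each accumulate a strided slice of a reduction row, write their partial result to shared memory, and the group's main thread combines the partials — can be sketched on the host side. A minimal NumPy sketch, with a generic `update` callable standing in for UPDATE_ACCUM; the function and names are illustrative only, not part of the shader library:

```python
import numpy as np

NWORKERS = 4  # mirrors the NWORKERS define above

def cooperative_reduce(row, update, init_from_first=False):
    # Each worker accumulates elements i, i + NWORKERS, i + 2*NWORKERS, ...
    # and writes its partial result into a "shared memory" slot.
    partials = []
    for worker in range(NWORKERS):
        acc = row[0] if init_from_first else 0.0
        for x in row[worker::NWORKERS]:
            acc = update(acc, x)
        partials.append(acc)
    # The "main" thread of the group then aggregates the partial outputs.
    out = partials[0]
    for p in partials[1:]:
        out = update(out, p)
    return out

row = np.arange(10, dtype=np.float32)
assert np.isclose(cooperative_reduce(row, lambda a, b: a + b), row.sum())
assert np.isclose(cooperative_reduce(row, max, init_from_first=True), row.max())
```
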
- const int reduce_len = tin_sizes[packed_dim] - nspill; - - scan_pos[reduce_dim] = 0; - vec4 accum = INIT_ACCUM(vec4(load_texel(tin, scan_pos).x)); - - // Partially accumulate over elements i, i + NWORKERS, i + 2*NWORKERS, ... of - // the reduction row - scan_pos[reduce_dim] = tid.x; - for (int i = tid.x * 4; i < reduce_len; - i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { - accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos)); - } - // For the last texel in the dim, if there are padding elements then each - // element of the texel needs to be processed individually such that the - // padding elements are ignored - if (scan_pos[reduce_dim] == tin_limits[reduce_dim] - 1 && nspill > 0) { - const vec4 intex = load_texel(tin, scan_pos); - for (int i = 0; i < nspill; i++) { - accum.x = UPDATE_ACCUM(accum.x, intex[i]); - } - } - // Write partial output to shared memory and synchronize work group - shared_vecs[smi] = accum; - barrier(); - - // Since the reduction row is reduced to only one element, only the "main" - // thread in the group needs aggregate the partial outputs - if (tid.x == 0) { - // Iterate over the partial maximums to obtain the overall maximum - int group_i = tid.y * NWORKERS; - accum = shared_vecs[group_i++]; - for (int i = 1; i < NWORKERS; i++, group_i++) { - accum = UPDATE_ACCUM(accum, shared_vecs[group_i]); - } - // Each element of the texel is itself a partial maximum; iterate over the - // texel to find the actual maximum - float accum_final = accum.x; - [[unroll]] for (int i = 1; i < 4; i++) { - accum_final = UPDATE_ACCUM(accum[i], accum_final); - } - - scan_pos[reduce_dim] = tid.x; - write_texel(tout, scan_pos, POSTPROCESS(vec4(accum_final, 0, 0, 0))); - } -} - -void main() { - ivec3 scan_pos = ivec3(gl_GlobalInvocationID); - scan_pos[reduce_dim] = 0; - - const ivec2 tid = ivec2( - gl_LocalInvocationID[reduce_dim], - gl_LocalInvocationID[group_dim]); - - if (any(greaterThanEqual(scan_pos, tin_limits))) { - return; - } - - if (reduce_dim != packed_dim) { - reduce_nonpacked_dim(tid, scan_pos); - } else { - reduce_packed_dim(tid, scan_pos); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce.yaml b/backends/vulkan/runtime/graph/ops/glsl/reduce.yaml deleted file mode 100644 index 21a7132b8db..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/reduce.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -reduce: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - INIT_ACCUM: VEC4_T(0) - UPDATE_ACCUM: accum + new_val - POSTPROCESS: accum - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: sum - - NAME: mean - POSTPROCESS: (accum / tin_sizes[reduce_dim]) - - NAME: amax - INIT_ACCUM: first_val - UPDATE_ACCUM: max(accum, new_val) - POSTPROCESS: accum - - NAME: amin - INIT_ACCUM: first_val - UPDATE_ACCUM: min(accum, new_val) - POSTPROCESS: accum diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl deleted file mode 100644 index 98370a9bcde..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} - -${define_active_storage_type(STORAGE)} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "tout", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)} - -${layout_declare_ubo(B, "ivec3", "tin_limits")} -${layout_declare_ubo(B, "ivec4", "tin_sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = 0; -layout(constant_id = 4) const int reduce_dim1 = 0; -layout(constant_id = 5) const int reduce_dim2 = 1; -layout(constant_id = 6) const int group_dim = 2; - -// A more verbose name would be NWORKERS_PER_GROUP. This describes the number of -// threads that will co-operate to compute one reduction output. There may be -// multiple groups computing distinct reduction outputs within one work group. -#define NWORKERS 4 - -// Sets an upper limit on the total size of a work group based on how many -// elements are allocated in the shared memory array below. Each thread in the -// work group will write into its assigned element in the shared array. -#define MAX_NTHREADS 16 - - -shared vec4 shared_vecs[MAX_NTHREADS]; - -#include "indexing_utils.h" - -int tid_to_smi(const ivec2 tid) { - return tid.x + tid.y * NWORKERS; -} - -// Initializing the accumulator accepts the first value in the reduction row, -// since some reduction operations (i.e. amax, amin) prefer to initialize with -// a data point instead of a static value. -#define INIT_ACCUM(first_val) ${INIT_ACCUM} -#define UPDATE_ACCUM(accum, new_val) ${UPDATE_ACCUM} -// Useful for operators such as mean which want to perform a final calculation -// with the accumulator. 
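
From the reduce.yaml entries above, each reduction variant is fully determined by how INIT_ACCUM, UPDATE_ACCUM and POSTPROCESS are filled in. A small Python sketch of that template pattern; the triples paraphrase the YAML, and the function itself is illustrative rather than part of the runtime:

```python
# (init, update, postprocess) triples mirroring the shader_variants in reduce.yaml
VARIANTS = {
    "sum":  (lambda first: 0.0,   lambda a, x: a + x,     lambda a, n: a),
    "mean": (lambda first: 0.0,   lambda a, x: a + x,     lambda a, n: a / n),
    "amax": (lambda first: first, lambda a, x: max(a, x), lambda a, n: a),
    "amin": (lambda first: first, lambda a, x: min(a, x), lambda a, n: a),
}

def reduce_row(row, variant):
    init, update, post = VARIANTS[variant]
    acc = init(row[0])
    for x in row:
        acc = update(acc, x)
    return post(acc, len(row))

assert reduce_row([1.0, 2.0, 3.0], "mean") == 2.0
assert reduce_row([1.0, 5.0, 3.0], "amax") == 5.0
```

reduce2d applies the same triple across two dimensions, which is why its mean2d variant divides by the product of both reduced sizes.
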
-#define POSTPROCESS(accum) ${POSTPROCESS} - -void reduce_2d_non_packed_dim(const ivec2 tid, ivec3 scan_pos) { - // shared memory index of this thread - const int smi = tid_to_smi(tid); - - scan_pos[reduce_dim1] = 0; - scan_pos[reduce_dim2] = 0; - vec4 accum = INIT_ACCUM(load_texel(tin, scan_pos)); - - // First dimension reduction - scan_pos[reduce_dim1] = tid.x; - for (int i = tid.x; i < tin_sizes[reduce_dim1]; - i += NWORKERS, scan_pos[reduce_dim1] += NWORKERS) { - - // Second dimension reduction - scan_pos[reduce_dim2] = 0; - for (int j = 0; j < tin_sizes[reduce_dim2]; j++, scan_pos[reduce_dim2]++) { - accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos)); - } - } - - // Write partial output to shared memory and synchronize - shared_vecs[smi] = accum; - barrier(); - - // Main thread aggregates results - if (tid.x == 0) { - // Iterate over the partial outputs to obtain the overall output - int group_i = tid.y * NWORKERS; - accum = shared_vecs[group_i++]; - for (int i = 1; i < NWORKERS; i++, group_i++) { - accum = UPDATE_ACCUM(accum, shared_vecs[group_i]); - } - - // Determine if there are any padding elements in the final texel of the - // packed dimension - const int nspill = mod4(tin_sizes[packed_dim]); - // Detect if this thread is working on the final texels of the packed - // dimension, which may have padding elements - const bool is_last_texel = - scan_pos[packed_dim] == (tin_limits[packed_dim] - 1); - - // Explicitly set padding elements to 0 - if (is_last_texel && nspill > 0) { - [[unroll]] for (int i = nspill; i < 4; i++) { - accum[i] = 0; - } - } - scan_pos[reduce_dim1] = 0; - scan_pos[reduce_dim2] = 0; - write_texel(tout, scan_pos, POSTPROCESS(accum)); - } -} - -void main() { - ivec3 scan_pos = ivec3(gl_GlobalInvocationID); - scan_pos[reduce_dim1] = 0; - scan_pos[reduce_dim2] = 0; - - const ivec2 tid = ivec2( - gl_LocalInvocationID[reduce_dim1], - gl_LocalInvocationID[group_dim]); - - if (any(greaterThanEqual(scan_pos, tin_limits))) { - return; - } - - reduce_2d_non_packed_dim(tid, scan_pos); -} \ No newline at end of file diff --git a/backends/vulkan/runtime/graph/ops/glsl/reduce2d.yaml b/backends/vulkan/runtime/graph/ops/glsl/reduce2d.yaml deleted file mode 100644 index fdc5eb9f105..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/reduce2d.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -reduce2d: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - INIT_ACCUM: VEC4_T(0) - UPDATE_ACCUM: accum + new_val - POSTPROCESS: accum - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: sum2d - - NAME: mean2d - POSTPROCESS: (accum / (tin_sizes[reduce_dim1] * tin_sizes[reduce_dim2])) - - NAME: amax2d - INIT_ACCUM: first_val - UPDATE_ACCUM: max(accum, new_val) - POSTPROCESS: accum - - NAME: amin2d - INIT_ACCUM: first_val - UPDATE_ACCUM: min(accum, new_val) - POSTPROCESS: accum diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat.glsl b/backends/vulkan/runtime/graph/ops/glsl/repeat.glsl deleted file mode 100644 index 441cd57c17d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/repeat.glsl +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} - -layout(push_constant) uniform restrict Block { - ivec4 range; - // source tensor sizes in WHCB dims respectively - ivec4 src_dims; - // destination tensor repeats in WHCB dims respectively - ivec4 dst_repeats; -}; - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - -void main() { - ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, range.xyz))) { - return; - } - - // expand position in packed dim - pos[packed_dim] <<= 2; - - // channel size aligned by 4 when tensors are channel packed raw value otherwise - const int channel_size = (packed_dim == C_DIM ? alignup4(src_dims.z) : src_dims.z); - - // find input texel's WHCB index - const int width_index = pos.x % src_dims.x; - const int height_index = pos.y % src_dims.y; - int channel_index; - int batch_index; - - // if tensors are channel packed - if (packed_dim == C_DIM) { - // the output channels in a batch will be channel size * channel repetitions aligned by 4 - const int out_channel_size = alignup4(src_dims.z * dst_repeats.z); - - // batch index in the output - const int out_pos_batch_index = pos.z / out_channel_size; - - // source batch index for based on current output pos - batch_index = out_pos_batch_index % src_dims.w; - - // batch repetition count for current output pos - const int batch_repetition_index = out_pos_batch_index / src_dims.w; - - // calculate input channel index based on current output pos and batch index - // its done this way because we want source channel to restart from zero when a batch index increments - // also batch_index will reset to zero after hitting batch repetition count - // so track the current repetition in batch_repetition_index so it can be used for determining current_index - channel_index = (pos.z - (batch_index + batch_repetition_index * src_dims.w) * out_channel_size) % src_dims.z; - } else { - // the output channels in a batch will be channel size * channel repetitions - const int out_channel_size = src_dims.z * dst_repeats.z; - - // source batch index for based on current output pos - batch_index = (pos.z / out_channel_size) % src_dims.w; - - // source channel index is current output pos wrapped based on channel count - channel_index = pos.z % src_dims.z; - } - - // input texel's WCB position - const ivec3 in_pos = ivec3(width_index, height_index, channel_index); - - // squeeze position in packed dim - pos[packed_dim] >>= 2; - - // packed dim index of texel last fetched - int fetched_in_pos_packed_dim = -1; - - // fetched input texel - VEC4_T in_value; - - // output texel value - VEC4_T out_value = VEC4_T(0); - - int src_lane_offset = in_pos[packed_dim]; - - for (int i=0; i<4; i++) { - if ((src_lane_offset >> 2) != fetched_in_pos_packed_dim) { - fetched_in_pos_packed_dim = (src_lane_offset >> 2); - - ivec3 
curr_in_pos = in_pos; - curr_in_pos[packed_dim] = src_lane_offset; - curr_in_pos.z = curr_in_pos.z + batch_index * channel_size; - curr_in_pos[packed_dim] >>= 2; - - in_value = VEC4_T(load_texel_lpos(t_in, curr_in_pos, in_axis_map)); - } - - out_value[i] = in_value[src_lane_offset & 0x3]; - - src_lane_offset++; - // if packed index exceeded source packed dim round to zero - src_lane_offset = mix(src_lane_offset, 0, src_lane_offset >= src_dims[packed_dim]); - } - - write_texel_lpos( - t_out, - pos, - out_value, - out_axis_map); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat.yaml b/backends/vulkan/runtime/graph/ops/glsl/repeat.yaml deleted file mode 100644 index f40d94142e1..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/repeat.yaml +++ /dev/null @@ -1,14 +0,0 @@ -repeat: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - VALUE: int8 - - VALUE: uint8 - shader_variants: - - NAME: repeat diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.glsl b/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.glsl deleted file mode 100644 index 42c7f86aea8..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.glsl +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out; -layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; - -layout(set = 0, binding = 2) uniform PRECISION restrict RepeatArgs { - // With input_size (n, c_i, h, w) and repeat r - // out_size == (n, c_i * r, h, w) - ivec4 out_sizes; - ivec4 in_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; - - -void main() { - const ivec3 out_pos = ivec3(gl_GlobalInvocationID); - - const ivec4 out_whcn = to_tensor_idx(out_pos, out_sizes, packed_dim); - - if (any(greaterThanEqual(out_whcn, out_sizes))) { - return; - } - - VEC4_T v; - // Loop over the 4 elements in texel, calculate the corresponding elem, and - // fetch. Not most efficient algorithm because likely we fetch same texel - // multiple times in this loop. 
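
As the comments above note, the repeat shaders resolve every output element back to a source element; ignoring texel packing and alignment, that mapping is simply a modulo by the source size along each WHCB dimension. A hedged NumPy reference of that element-level relationship (a host-side check, not the shaders' texel-aligned implementation):

```python
import numpy as np

def repeat_reference(src, repeats):
    # Every output index maps to the source index modulo the source size
    # along each dimension, which is the relationship the shaders compute per texel.
    out_shape = tuple(s * r for s, r in zip(src.shape, repeats))
    out = np.empty(out_shape, dtype=src.dtype)
    for idx in np.ndindex(out_shape):
        src_idx = tuple(i % s for i, s in zip(idx, src.shape))
        out[idx] = src[src_idx]
    return out

x = np.arange(6, dtype=np.float32).reshape(1, 2, 3, 1)
assert np.array_equal(repeat_reference(x, (2, 1, 2, 1)), np.tile(x, (2, 1, 2, 1)))
```
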
- - for (int i=0; i<4;i++) { - ivec4 in_whcn = out_whcn; - in_whcn.z = (out_whcn.z + i) % in_sizes.z; - - ivec4 in_elem_pos = to_texture_elem_pos(in_whcn, in_sizes, packed_dim); - - v[i] = VEC4_T(texelFetch(image_in, in_elem_pos.xyz, 0))[in_elem_pos.w]; - } - - imageStore(image_out, out_pos, v); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.yaml b/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.yaml deleted file mode 100644 index 4147e82965a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/repeat_channel.yaml +++ /dev/null @@ -1,10 +0,0 @@ -repeat_channel: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: repeat_channel diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.glsl b/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.glsl deleted file mode 100644 index 1a8e677a38f..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.glsl +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "tout", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec3", "tin_limits")} - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "tout_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 tout_axis_map = unhash_axis_map(tout_layout); - -${layout_declare_spec_const(C, "int", "tin_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 tin_axis_map = unhash_axis_map(tin_layout); - -${layout_declare_spec_const(C, "int", "nrepeats", "1")} -${layout_declare_spec_const(C, "int", "repeat_dim", "1")} - -void main() { - const ivec3 tin_lpos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(tin_lpos, tin_limits))) { - return; - } - - const VEC4_T intex = load_texel_lpos(tin, tin_lpos, tin_axis_map); - - ivec3 tout_lpos = tin_lpos; - tout_lpos[repeat_dim] *= nrepeats; - - for (int i = 0; i < nrepeats; ++i, tout_lpos[repeat_dim]++) { - write_texel_lpos(tout, tout_lpos, intex, tout_axis_map); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.yaml b/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.yaml deleted file mode 100644 index 5c284a580c9..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/repeat_interleave.yaml +++ /dev/null @@ -1,10 +0,0 @@ -repeat_interleave: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: repeat_interleave diff --git a/backends/vulkan/runtime/graph/ops/glsl/rotary_embedding.glsl b/backends/vulkan/runtime/graph/ops/glsl/rotary_embedding.glsl deleted file mode 100644 index 30375728921..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/rotary_embedding.glsl +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_load_type(DTYPE, STORAGE)}
-
-${define_required_extensions(DTYPE)}
-
-layout(std430) buffer;
-
-${layout_declare_tensor(B, "w", "xqout", DTYPE, STORAGE)}
-${layout_declare_tensor(B, "w", "xkout", DTYPE, STORAGE)}
-${layout_declare_tensor(B, "r", "xq", DTYPE, STORAGE)}
-${layout_declare_tensor(B, "r", "xk", DTYPE, STORAGE)}
-${layout_declare_tensor(B, "r", "freqs_cos", DTYPE, STORAGE)}
-${layout_declare_tensor(B, "r", "freqs_sin", DTYPE, STORAGE)}
-${layout_declare_ubo(B, "ivec3", "xqout_limits")}
-${layout_declare_ubo(B, "ivec3", "xkout_limits")}
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-layout(constant_id = 3) const int packed_dim = 0;
-
-#include "indexing_utils.h"
-
-/*
- * This shader computes rotary positional embeddings which are used in the Llama
- * model architecture. There are 4 input tensors with the following shapes.
- * Note that head_dim = embedding_dim / num_heads
- *
- * 1. xq (batch_size, sequence_len, num_heads, head_dim)
- * 2. xk (batch_size, sequence_len, num_kv_heads, head_dim)
- * 3. freqs_cos (sequence_len, head_dim / 2)
- * 4. freqs_sin (sequence_len, head_dim / 2)
- *
- * Two output tensors are produced, with the same shapes as xq and xk
- * respectively.
- *
- * The computation of rotary positional embeddings can be summarized with the
- * following equations:
- *
- * xq_out[2i] = xq[2i] * freqs_cos[i] - xq[2i + 1] * freqs_sin[i]
- * xq_out[2i + 1] = xq[2i] * freqs_sin[i] + xq[2i + 1] * freqs_cos[i]
- *
- * Essentially, taking each row along head_dim of the xq and xk tensors, each
- * row is split into even and odd elements (xq[2i] and xq[2i + 1] respectively).
- * The even components of the output multiply the even components of the inputs
- * with the freqs_cos tensor, and the odd components of the inputs with the
- * freqs_sin tensor. The odd components of the output swap this. Throughout the
- * implementation the even components have the _r suffix and the odd components
- * have the _i suffix; this is a reference to complex numbers which can be used
- * to represent rotations.
- *
- * Note that this implementation assumes that all input tensors have the width
- * dim as the packed dim.
- */
-void main() {
-  // Each thread will write to two output locations to maximize data re-use.
-  // One texel loaded from the freqs_cos/freqs_sin tensors can be used to
-  // calculate two output texels.
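
The equations in the comment above can be checked directly on the host. A minimal NumPy sketch of the even/odd pairing, assuming a single position's freqs_cos/freqs_sin row; the code is illustrative and not part of the library:

```python
import numpy as np

def apply_rotary(x, freqs_cos, freqs_sin):
    # x: (..., head_dim); freqs_cos/freqs_sin: (head_dim // 2,) for one position.
    x_r = x[..., 0::2]                      # even elements, the "_r" parts
    x_i = x[..., 1::2]                      # odd elements, the "_i" parts
    out_r = x_r * freqs_cos - x_i * freqs_sin
    out_i = x_r * freqs_sin + x_i * freqs_cos
    out = np.empty_like(x)
    out[..., 0::2] = out_r
    out[..., 1::2] = out_i
    return out

head_dim = 8
x = np.random.randn(2, head_dim).astype(np.float32)
theta = np.random.randn(head_dim // 2).astype(np.float32)
y = apply_rotary(x, np.cos(theta), np.sin(theta))
# Each (even, odd) pair is rotated by the same angle, so per-row norms are preserved.
assert np.allclose(np.linalg.norm(y, axis=-1), np.linalg.norm(x, axis=-1), atol=1e-5)
```
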
- const ivec3 x_pos_1 = ivec3( - gl_GlobalInvocationID.x * 2, gl_GlobalInvocationID.yz); - const ivec3 x_pos_2 = ivec3(x_pos_1.x + 1, x_pos_1.yz); - - if (any(greaterThanEqual(x_pos_2, xqout_limits))) { - return; - } - - const ivec3 freqs_pos = ivec3(gl_GlobalInvocationID.xz, 0); - - VEC4_T cos_tex = load_texel(freqs_cos, freqs_pos); - VEC4_T sin_tex = load_texel(freqs_sin, freqs_pos); - - // Compute xqout - - VEC4_T x_tex_1 = load_texel(xq, x_pos_1); - VEC4_T x_tex_2 = load_texel(xq, x_pos_2); - - // Separate into even and odd elements - VEC4_T x_r = VEC4_T(x_tex_1.xz, x_tex_2.xz); - VEC4_T x_i = VEC4_T(x_tex_1.yw, x_tex_2.yw); - - VEC4_T xout_r = x_r * cos_tex - x_i * sin_tex; - VEC4_T xout_i = x_r * sin_tex + x_i * cos_tex; - - VEC4_T xout_tex_1 = VEC4_T(xout_r.x, xout_i.x, xout_r.y, xout_i.y); - VEC4_T xout_tex_2 = VEC4_T(xout_r.z, xout_i.z, xout_r.w, xout_i.w); - - write_texel(xqout, x_pos_1, xout_tex_1); - write_texel(xqout, x_pos_2, xout_tex_2); - - // n_heads will be greater than or equal to n_kv_heads, therefore xq and xqout - // may have a larger height dim than xk and xkout. Only compute xkout if this - // invocation is still within bounds. - if (any(greaterThanEqual(x_pos_2, xkout_limits))) { - return; - } - - // Compute xkout - - x_tex_1 = load_texel(xk, x_pos_1); - x_tex_2 = load_texel(xk, x_pos_2); - - x_r = VEC4_T(x_tex_1.xz, x_tex_2.xz); - x_i = VEC4_T(x_tex_1.yw, x_tex_2.yw); - - xout_r = x_r * cos_tex - x_i * sin_tex; - xout_i = x_r * sin_tex + x_i * cos_tex; - - xout_tex_1 = VEC4_T(xout_r.x, xout_i.x, xout_r.y, xout_i.y); - xout_tex_2 = VEC4_T(xout_r.z, xout_i.z, xout_r.w, xout_i.w); - - write_texel(xkout, x_pos_1, xout_tex_1); - write_texel(xkout, x_pos_2, xout_tex_2); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/rotary_embedding.yaml b/backends/vulkan/runtime/graph/ops/glsl/rotary_embedding.yaml deleted file mode 100644 index a81fd564d10..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/rotary_embedding.yaml +++ /dev/null @@ -1,10 +0,0 @@ -rotary_embedding: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: rotary_embedding diff --git a/backends/vulkan/runtime/graph/ops/glsl/scalar_tensor.glsl b/backends/vulkan/runtime/graph/ops/glsl/scalar_tensor.glsl deleted file mode 100644 index 09857451f7c..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/scalar_tensor.glsl +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define BUF_T ${buffer_scalar_type(DTYPE)} -#define VEC4_T ${texel_type(DTYPE)} - -${define_active_storage_type(STORAGE)} -${define_required_extensions(DTYPE)} -${define_required_extensions(SCALAR_VALUE_TYPE)} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_ubo(B, buffer_scalar_type(SCALAR_VALUE_TYPE), "scalar_value")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#ifdef USING_BUFFER - -void main() { - const int i = int(gl_GlobalInvocationID.x); - - if (i > 0) { - return; - } - - t_out[i] = BUF_T(scalar_value); -} - -# else // !USING_BUFFER - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - // Scalar tensor is a special case where the packed dim is always 1. - if (any(greaterThanEqual(pos, ivec3(1)))) { - return; - } - - VEC4_T outtex = VEC4_T(scalar_value); - write_texel(t_out, pos, outtex); -} - -#endif // !USING_BUFFER diff --git a/backends/vulkan/runtime/graph/ops/glsl/scalar_tensor.yaml b/backends/vulkan/runtime/graph/ops/glsl/scalar_tensor.yaml deleted file mode 100644 index cd45b80c4dc..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/scalar_tensor.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -scalar_tensor: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - SCALAR_VALUE_TYPE: float - PACKING: C_packed - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - STORAGE: - - VALUE: texture3d - - VALUE: buffer - SCALAR_VALUE_TYPE: - - VALUE: float - - VALUE: int32 - - VALUE: bool - shader_variants: - - NAME: scalar_tensor diff --git a/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.glsl b/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.glsl deleted file mode 100644 index 1e854bf7f85..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.glsl +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type(STORAGE)} -${define_required_extensions(DTYPE)} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "rw", "attn_weight", DTYPE, STORAGE)} - -$if STORAGE == "buffer": - ${layout_declare_ubo(B, "ivec4", "attn_weight_sizes")} - ${layout_declare_ubo(B, "ivec4", "attn_weight_strides")} -$else: - ${layout_declare_ubo(B, "ivec3", "attn_weight_limits")} - -${layout_declare_ubo(B, "int", "input_pos")} -${layout_declare_ubo(B, "float", "scale")} - - -#include "indexing_utils.h" - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -// Negative infinity is represented by having sign bit be 1, all exponent bits -// be 1, all mantissa bits be 0. 
-#define NEGATIVE_INF_BITS 0xFF800000
-const float negative_infinity = NEGATIVE_INF_BITS;
-
-#ifdef USING_BUFFER
-
-/*
- * This implementation applies a scale and mask to the attention weight tensor
- * of an SDPA block. The size of the attention weight is
- * (batch_size, n_heads, seq_len, input_pos + seq_len)
- * Conceptually the weights represent the relationship between each token in the
- * sequence with each token preceding it.
- *
- * The scale applied is 1.0 / sqrt(head_dim_length)
- *
- * The mask applied is a bit more complicated. Imagine you create a square
- * matrix of size (input_pos + seq_len, input_pos + seq_len), and then set the
- * upper triangular section of the matrix to -inf. Then, slice the matrix along
- * the row dimension starting from input_pos to input_pos + seq_len. You end up
- * with a partial mask with size (seq_len, input_pos + seq_len). This is the
- * mask that is applied to the attention weight.
- *
- * In the shader, instead of generating the mask, the index of the element is
- * inspected to determine if it would have been masked. Given an element at
- * tensor index (n, c, h, w), it would be masked if w > h + input_pos.
- */
-
-/***************************
- ** Buffer Implementation **
- ***************************/
-
-void main() {
-  const ivec4 attn_weight_idx = ivec4(
-      gl_GlobalInvocationID.x,
-      gl_GlobalInvocationID.y,
-      gl_GlobalInvocationID.z,
-      0);
-
-  if (any(greaterThanEqual(attn_weight_idx, attn_weight_sizes))) {
-    return;
-  }
-
-  const T scale_conv = T(scale);
-
-  const int attn_weight_id = tidx_to_bufi(attn_weight_idx, attn_weight_strides);
-  if (attn_weight_idx.x <= attn_weight_idx.y + input_pos) {
-    attn_weight[attn_weight_id] = attn_weight[attn_weight_id] * scale_conv;
-  } else {
-    attn_weight[attn_weight_id] = T(negative_infinity);
-  }
-}
-
-#else
-
-/****************************
- ** Texture Implementation **
- ****************************/
-
-/*
- * This implementation assumes that the attention weight is width packed, i.e.
- * the packed dim of the attn_weight is 0.
- */
-void main() {
-  const ivec3 attn_weight_pos = ivec3(gl_GlobalInvocationID);
-
-  if (any(greaterThanEqual(attn_weight_pos, attn_weight_limits))) {
-    return;
-  }
-
-  vec4 outtex = imageLoad(attn_weight, attn_weight_pos) * scale;
-
-  // Mask out the upper triangular of attn_weight to -inf
-  [[unroll]] for (int i = 0; i < 4; ++i) {
-    if (attn_weight_pos.x * 4 + i > attn_weight_pos.y + input_pos) {
-      outtex[i] = negative_infinity;
-    }
-  }
-
-  write_texel(attn_weight, attn_weight_pos, outtex);
-}
-
-#endif // USING_BUFFER
diff --git a/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.yaml b/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.yaml
deleted file mode 100644
index ca8806fe000..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/sdpa_attn_weight_scale_and_mask.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-sdpa_attn_weight_scale_and_mask:
-  parameter_names_with_default_values:
-    DTYPE: float
-    STORAGE: buffer
-  generate_variant_forall:
-    STORAGE:
-      - VALUE: buffer
-      - VALUE: texture3d
-    DTYPE:
-      - VALUE: half
-      - VALUE: float
-  shader_variants:
-    - NAME: sdpa_attn_weight_scale_and_mask
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select.glslh b/backends/vulkan/runtime/graph/ops/glsl/select.glslh
deleted file mode 100644
index 6509015b4b6..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select.glslh
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
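
A NumPy sketch of the scale-and-mask semantics documented above, materializing the mask for clarity where the shader instead tests each element's index; all names here are illustrative:

```python
import numpy as np

def scale_and_mask(attn_weight, head_dim, input_pos):
    # attn_weight: (batch, n_heads, seq_len, input_pos + seq_len)
    scale = 1.0 / np.sqrt(head_dim)
    seq_len, ctx_len = attn_weight.shape[-2:]
    h = np.arange(seq_len)[:, None]          # query (row) index
    w = np.arange(ctx_len)[None, :]          # key (column) index
    out = attn_weight * scale
    out[..., w > h + input_pos] = -np.inf    # mask positions in the query's future
    return out

aw = np.random.randn(1, 2, 3, 5).astype(np.float32)
masked = scale_and_mask(aw, head_dim=64, input_pos=2)
assert np.isinf(masked[0, 0, 0, 4]) and not np.isinf(masked[0, 0, 0, 2])
```
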
- * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef SELECT_GLSLH -#define SELECT_GLSLH - -#ifndef USING_BUFFER - -/* - * Enable the fast path if a texel loaded from the input texture can be used as - * is to store to the output texture. The following conditions must be met: - * - * 1. The input and output textures have the same packed dimension. - * 2. The selected_dim must not be the packed dimension of the input. - * 3. The packed dimension of the input must "map" to the packed dimension of - * the output. This occurs if selected_dim is greater than the packed dimension - * of the input. - */ -bool can_use_fast_path() { - if (out_packed_dim != in_packed_dim) { - return false; - } - if (selected_dim <= in_packed_dim) { - return false; - } - return true; -} - -#endif // USING_BUFFER - -/* - * Given an output tensor index, return the corresponding input tensor index for - * the select operator. This is done by "inserting" the select index at the - * selected_dim in the input tensor index. - * - * A simple example is (note all tensor index are in WHCN order): - * out_tidx = [7, 5, 9] - * selected_dim = 2 - * index = 3 - * in_tidx = [7, 3, 5, 9] - * - * This function assumes that the following variables are defined in the layout: - * - in_sizes - * - selected_dim - * - index - */ -ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) { - ivec4 in_tidx = ivec4(0); - - int adjusted_index = index; - if (index < 0) { - adjusted_index = index + in_sizes[selected_dim]; - } - - // Handle different dimensions for selection - if (selected_dim == 0) { - // Select from width dimension - in_tidx = ivec4(adjusted_index, out_tidx.x, out_tidx.y, out_tidx.z); - } else if (selected_dim == 1) { - // Select from height dimension - in_tidx = ivec4(out_tidx.x, adjusted_index, out_tidx.y, out_tidx.z); - } else if (selected_dim == 2) { - // Select from channel dimension - in_tidx = ivec4(out_tidx.x, out_tidx.y, adjusted_index, out_tidx.z); - } else if (selected_dim == 3) { - // Select from batch dimension - in_tidx = ivec4(out_tidx.x, out_tidx.y, out_tidx.z, adjusted_index); - } - - return in_tidx; -} - -#endif // SELECT_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/set_zero.glsl b/backends/vulkan/runtime/graph/ops/glsl/set_zero.glsl deleted file mode 100644 index d01780b9e30..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/set_zero.glsl +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
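
The two out_tidx_to_in_tidx helpers above (select.glslh and slice.glslh) perform small index rewrites that are easy to mirror on the host. A sketch of both mappings, assuming WHCN index tuples; the helper names are reused only for readability and the code is not part of the runtime:

```python
def select_out_to_in(out_tidx, selected_dim, index, in_sizes):
    # Insert the (possibly negative) select index at selected_dim.
    if index < 0:
        index += in_sizes[selected_dim]
    in_tidx = list(out_tidx)
    in_tidx.insert(selected_dim, index)
    return tuple(in_tidx)

def slice_out_to_in(out_tidx, selected_dim, start, step, in_sizes):
    # Offset the sliced dim by start (wrapped if negative) and stride by step.
    if start < 0:
        start += in_sizes[selected_dim]
    in_tidx = list(out_tidx)
    in_tidx[selected_dim] = start + out_tidx[selected_dim] * step
    return tuple(in_tidx)

assert select_out_to_in((2, 4, 6), 1, -1, (8, 5, 7, 3)) == (2, 4, 4, 6)
assert slice_out_to_in((2, 4, 6, 0), 1, 1, 2, (8, 16, 7, 3)) == (2, 9, 6, 0)
```
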
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type("buffer")} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")} - -${layout_declare_ubo(B, "int", "out_numel")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const int out_bufi = ivec3(gl_GlobalInvocationID).x; - if (out_bufi >= out_numel) { - return; - } - - t_out[out_bufi] = T(0); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/set_zero.yaml b/backends/vulkan/runtime/graph/ops/glsl/set_zero.yaml deleted file mode 100644 index cee87c468b1..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/set_zero.yaml +++ /dev/null @@ -1,8 +0,0 @@ -set_zero: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: int32 - shader_variants: - - NAME: set_zero diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh deleted file mode 100644 index 87325754f4d..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#ifndef SLICE_GLSLH -#define SLICE_GLSLH - -#ifndef USING_BUFFER - -/** - * Enable the fast path if a texel loaded from the input texture can be used as - * is to store to the output texture. The following conditions must be met: - * - * 1. The input and output textures have the same packed dimension. - * 2. The select_dim must not be the packed dimension of the input. - */ -bool can_use_fast_path() { - if (out_packed_dim != in_packed_dim) { - return false; - } - if (in_packed_dim == selected_dim) { - return false; - } - return true; -} - -#endif // USING_BUFFER - -/* - * Converts output tensor indices to input tensor indices for the slice operation. - * This function maps the output indices to the corresponding input indices based on - * the slice parameters (start, step, selected_dim). - * - * Parameters assumed to be defined in the layout specifier: - * - in_sizes - * - selected_dim - * - start - * - step - */ -ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) { - ivec4 in_tidx = out_tidx; - - int adjusted_start = start; - if (start < 0) { - adjusted_start = start + in_sizes[selected_dim]; - } - - in_tidx[selected_dim] = adjusted_start + out_tidx[selected_dim] * step; - - return in_tidx; -} - -#endif // SLICE_GLSLH diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl b/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl deleted file mode 100644 index d35492bc367..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/softmax.glsl +++ /dev/null @@ -1,267 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define op1(X) ${OPERATOR1} - -#define op2(X, Y) ${OPERATOR2} - -${define_active_storage_type(STORAGE)} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "tout", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)} - -${layout_declare_ubo(B, "ivec3", "tout_limits")} -${layout_declare_ubo(B, "ivec4", "tin_sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = 0; -layout(constant_id = 4) const int reduce_dim = 0; -layout(constant_id = 5) const int group_dim = 1; - -// A more verbose name would be NWORKERS_PER_GROUP. This describes the number of -// threads that will co-operate to compute one reduction output. There may be -// multiple groups computing distinct reduction outputs within one work group. -#define NWORKERS 4 - -// Sets an upper limit on the total size of a work group based on how many -// elements are allocated in the shared memory array below. Each thread in the -// work group will write into its assigned element in the shared array. -#define MAX_NTHREADS 16 - -shared vec4 shared_vecs[MAX_NTHREADS]; - -#include "indexing_utils.h" - -int tid_to_smi(const ivec2 tid) { - return tid.x + tid.y * NWORKERS; -} - -/* - * The shaders below compute softmax for a tensor. Softmax is an interesting mix - * between a reduction operator and a unary elementwise operator, defined as - * exp(x) / (sum of exp(x)). The general flow of the computation is: - * - * First, find the maximum element along the reduction dim. The maximum element - * is used to preserve numerical stability, since division of exponents is - * translation invariant. - * - * Next, compute the sum of exp(x - max_element) along the reduction dim. - * - * Finally, for each element along the reduction dim, we compute the output as - * exp(x - max_element) / sum_of_exponents. - * - * The shaders below also utilize shared memory to have multiple threads help - * compute the max and sum reduction operations. A total of NGROUPS x NWORKERS - * threads are launched. Each group works on a unique reduction "row", and - * within a group NWORKERS threads co-operate to compute the max and sum of one - * "row". Each worker in the group is responsible for computing a partial output - * of the "row" and uploading it to shared memory; the overall reduction output - * can then be determined by aggregating the partial outputs stored in shared - * memory. - * - * As a caveat, this shader does not currently support cases where `batch` > 1 - * and the reduce dim happens to also be the batch concatenation dim. To support - * this, there will need to be additional logic to set the starting value of - * `scan_pos[reduce_dim]`. Since this is not expected to be a common use-case, - * supporting this case is left as an exercise for when it is required. - * - * As a final note, log softmax is supported with this shader as well since via - * the op1 and op2 macro definitions. See the corresponding YAML file for more - * details. - */ - -/* - * Computes softmax where the reduction dim is orthogonal to the packed dim. - * This case is simpler because each element of a texel belongs to a separate - * reduction dim, meaning we don't have to perform reduction along a texel. 
- */ -void softmax_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { - // shared memory index of this thread - const int smi = tid_to_smi(tid); - // used to iterate over all shared memory in the group - int group_i; - - scan_pos[reduce_dim] = tid.x; - vec4 max_elements = load_texel(tin, scan_pos); - // This thread computes a partial maximum - for (int i = tid.x; i < tin_sizes[reduce_dim]; - i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { - max_elements = max(max_elements, load_texel(tin, scan_pos)); - } - shared_vecs[smi] = max_elements; - barrier(); - // Iterate over the partial maximums to obtain the overall maximum - group_i = tid.y * NWORKERS; - max_elements = shared_vecs[group_i++]; - for (int i = 1; i < NWORKERS; ++i, group_i++) { - max_elements = max(max_elements, shared_vecs[group_i]); - } - - scan_pos[reduce_dim] = tid.x; - vec4 denominators = vec4(0); - // Compute partial sum - for (int i = tid.x; i < tin_sizes[reduce_dim]; - i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { - denominators += exp(load_texel(tin, scan_pos) - max_elements); - } - shared_vecs[smi] = denominators; - barrier(); - // Iterate over the partial sums to obtain the overall sum - group_i = tid.y * NWORKERS; - denominators = shared_vecs[group_i++]; - for (int i = 1; i < NWORKERS; ++i, group_i++) { - denominators += shared_vecs[group_i]; - } - - // Determine if there are any padding elements in the final texel of the - // packed dimension - const int nspill = mod4(tin_sizes[packed_dim]); - // Detect if this thread is working on the final texels of the packed - // dimension, which may have padding elements - const bool is_last_texel = - scan_pos[packed_dim] == (tout_limits[packed_dim] - 1); - - scan_pos[reduce_dim] = tid.x; - for (int i = tid.x; i < tin_sizes[reduce_dim]; - i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { - const vec4 numerators = op1(load_texel(tin, scan_pos) - max_elements); - vec4 outtex = op2(numerators, denominators); - // For the last texel in the packed dim, make sure that the padding elements - // are explicitly set to 0. Otherwise, they may influence computations later - // down the line. - if (is_last_texel && nspill > 0) { - [[unroll]] for (int i = nspill; i < 4; ++i) { - outtex[i] = 0; - } - } - write_texel(tout, scan_pos, outtex); - } -} - -/* - * Compute softmax where the reduction dim is also the packed dim. This case is - * complex because the reduction needs to occur over the individual texels. - * Therefore, in this algorithm each element of the accumulator texels are - * themselves partial outputs. Special care has to be taken to ignore padding - * elements in texels (which occur when the size of the packed dim is not a - * multiple of 4) so that they do not influence the output of reduction. 
- */ -void softmax_packed_dim(const ivec2 tid, ivec3 scan_pos) { - // shared memory index of this thread - const int smi = tid_to_smi(tid); - // used to iterate over all shared memory in the group - int group_i; - - const int nspill = mod4(tin_sizes[packed_dim]); - const int reduce_len = tin_sizes[packed_dim] - nspill; - - scan_pos[reduce_dim] = tid.x; - vec4 max_elements = vec4(load_texel(tin, scan_pos).x); - for (int i = tid.x * 4; i < reduce_len; - i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { - max_elements = max(max_elements, load_texel(tin, scan_pos)); - } - // For the last texel in the dim, if there are padding elements then each - // element of the texel needs to be processed individually such that the - // padding elements are ignored - if (scan_pos[reduce_dim] == tout_limits[reduce_dim] - 1 && nspill > 0) { - const vec4 intex = load_texel(tin, scan_pos); - for (int i = 0; i < nspill; ++i) { - max_elements.x = max(intex[i], max_elements.x); - } - } - shared_vecs[smi] = max_elements; - barrier(); - // Iterate over the partial maximums to obtain the overall maximum - group_i = tid.y * NWORKERS; - max_elements = shared_vecs[group_i++]; - for (int i = 1; i < NWORKERS; ++i, group_i++) { - max_elements = max(max_elements, shared_vecs[group_i]); - } - // Each element of the texel is itself a partial maximum; iterate over the - // texel to find the actual maximum - float max_element = max_elements.x; - [[unroll]] for (int i = 1; i < 4; ++i) { - max_element = max(max_elements[i], max_element); - } - - scan_pos[reduce_dim] = tid.x; - vec4 denominators = vec4(0); - for (int i = tid.x * 4; i < reduce_len; - i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { - denominators += exp(load_texel(tin, scan_pos) - max_element); - } - // For the last texel in the dim, if there are padding elements then each - // element of the texel needs to be processed individually such that the - // padding elements are ignored - if (nspill > 0 && scan_pos[reduce_dim] == tout_limits[reduce_dim] - 1) { - const vec4 intex = load_texel(tin, scan_pos); - for (int i = 0; i < nspill; ++i) { - denominators.x += exp(intex[i] - max_element); - } - } - shared_vecs[smi] = denominators; - barrier(); - // Iterate over the partial sums to obtain the overall sum - group_i = tid.y * NWORKERS; - denominators = shared_vecs[group_i++]; - for (int i = 1; i < NWORKERS; ++i, group_i++) { - denominators += shared_vecs[group_i]; - } - // Reduce over the accumulated texel to find the overall sum - float denominator = 0; - [[unroll]] for (int i = 0; i < 4; ++i) { - denominator += denominators[i]; - } - - scan_pos[reduce_dim] = tid.x; - for (int i = tid.x * 4; i < reduce_len; - i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { - const vec4 numerators = op1(load_texel(tin, scan_pos) - max_element); - write_texel(tout, scan_pos, op2(numerators, denominator)); - } - // For the last texel in the dim, if there are padding elements then the - // padding elements need to be set to 0 explicitly, otherwise they may - // influence subsequent operations. 
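
Both the packed-dim reduction earlier and softmax_packed_dim below hinge on the same guard: when the packed dim is not a multiple of 4, the last texel carries padding lanes that must not contribute to the result. A small NumPy sketch of that guard for a max reduction (host-side and illustrative only):

```python
import numpy as np

def packed_max(row):
    row = np.asarray(row, dtype=np.float32)
    nspill = len(row) % 4                     # mod4(size): valid lanes in the last texel
    pad = (4 - nspill) % 4
    texels = np.pad(row, (0, pad)).reshape(-1, 4)
    acc = np.full(4, row[0])                  # INIT_ACCUM(first_val)
    full = texels[:-1] if nspill else texels  # whole texels reduce lane-wise
    for t in full:
        acc = np.maximum(acc, t)
    if nspill:                                # partial texel: only touch the valid lanes
        for lane in texels[-1][:nspill]:
            acc[0] = max(acc[0], lane)
    return float(acc.max())

vals = [-3.0, -7.5, -2.0, -1.0, -4.0]         # all negative: padding zeros must be ignored
assert packed_max(vals) == -1.0
```
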
- if (nspill > 0 && scan_pos[reduce_dim] == tout_limits[reduce_dim] - 1) { - const vec4 numerator = op1(load_texel(tin, scan_pos) - max_element); - vec4 outtex = op2(numerator, denominator); - [[unroll]] for (int i = nspill; i < 4; ++i) { - outtex[i] = 0; - } - write_texel(tout, scan_pos, outtex); - } -} - -void main() { - ivec3 scan_pos = ivec3(gl_GlobalInvocationID); - scan_pos[reduce_dim] = 0; - - const ivec2 tid = ivec2( - gl_LocalInvocationID[reduce_dim], - gl_LocalInvocationID[group_dim]); - - if (any(greaterThanEqual(scan_pos, tout_limits))) { - return; - } - - if (reduce_dim != packed_dim) { - softmax_nonpacked_dim(tid, scan_pos); - } else { - softmax_packed_dim(tid, scan_pos); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/softmax.yaml b/backends/vulkan/runtime/graph/ops/glsl/softmax.yaml deleted file mode 100644 index d50bbb85f33..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/softmax.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -softmax: - parameter_names_with_default_values: - OPERATOR1: exp(X) - OPERATOR2: X / Y - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: softmax - - NAME: log_softmax - OPERATOR1: X - OPERATOR2: X - log(Y) diff --git a/backends/vulkan/runtime/graph/ops/glsl/tan.glsl b/backends/vulkan/runtime/graph/ops/glsl/tan.glsl deleted file mode 100644 index 876cd43ad08..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/tan.glsl +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
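
The three-pass flow the softmax shader above implements (max for numerical stability, sum of shifted exponents, then normalize) has a direct host-side analogue, with log_softmax expressed through the same op1/op2 hooks named in its YAML variants. A minimal NumPy reference; the function is illustrative, not the shader itself:

```python
import numpy as np

def softmax_ref(x, dim, log=False):
    m = np.max(x, axis=dim, keepdims=True)    # pass 1: max, for numerical stability
    e = np.exp(x - m)
    s = np.sum(e, axis=dim, keepdims=True)    # pass 2: denominators
    if log:                                   # log_softmax: op1(X) = X, op2(X, Y) = X - log(Y)
        return (x - m) - np.log(s)
    return e / s                              # softmax:     op1(X) = exp(X), op2(X, Y) = X / Y

x = np.random.randn(3, 5).astype(np.float32)
assert np.allclose(softmax_ref(x, dim=1).sum(axis=1), 1.0, atol=1e-6)
assert np.allclose(np.exp(softmax_ref(x, dim=1, log=True)), softmax_ref(x, dim=1), atol=1e-6)
```
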
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type(STORAGE)} - -#include "indexing_utils.h" - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} -$if STORAGE == "buffer": - ${layout_declare_ubo(2, "int", "numel")} -$else: - ${layout_declare_ubo(2, "ivec3", "out_limits")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#include "activations.h" - -#ifdef USING_BUFFER - -void main() { - const int i = int(gl_GlobalInvocationID.x); - if (i >= numel) { - return; - } - - float in_val = float(t_in[i]); - t_out[i] = T(tan(in_val)); -} - -#else - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - VEC4_T in_texel = texelFetch(t_in, pos, 0); - imageStore(t_out, pos, VEC4_T(tan(in_texel))); -} - -#endif diff --git a/backends/vulkan/runtime/graph/ops/glsl/tan.yaml b/backends/vulkan/runtime/graph/ops/glsl/tan.yaml deleted file mode 100644 index ad0755bfad0..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/tan.yaml +++ /dev/null @@ -1,13 +0,0 @@ -tan: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - STORAGE: - - VALUE: texture3d - - VALUE: buffer - shader_variants: - - NAME: tan diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl deleted file mode 100644 index 7605c59c72f..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define UBO_PARAMS ${UBO_PARAMS} - -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type("buffer")} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" -${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")} - -$if UBO_PARAMS: - $if OP_NAME == "slice": - ${layout_declare_ubo(B, "int", "start")} - ${layout_declare_ubo(B, "int", "step")} - - $if OP_NAME == "select": - ${layout_declare_ubo(B, "int", "index")} - -layout(push_constant) uniform restrict Block { - ivec4 in_sizes; - ivec4 out_strides; - ivec4 in_strides; - int out_numel; - int selected_dim; - $if not UBO_PARAMS: - $if OP_NAME == "slice": - int start; - int step; - - $if OP_NAME == "select": - int index; -}; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} - -const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#include "${OP_NAME}.glslh" - -void main() { - const int out_bufi = ivec3(gl_GlobalInvocationID).x; - if (out_bufi >= out_numel) { - return; - } - - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); - - const int in_bufi = tidx_to_bufi(in_tidx, in_strides); - t_out[out_bufi] = t_in[in_bufi]; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.yaml deleted file mode 100644 index f68b2bd1250..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.yaml +++ /dev/null @@ -1,21 +0,0 @@ -transfer_buffer: - parameter_names_with_default_values: - DTYPE: float - OP_NAME: select - UBO_PARAMS: False - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: select_buffer - OP_NAME: select - - NAME: slice_buffer - OP_NAME: slice - - NAME: select_ubo_buffer - OP_NAME: select - UBO_PARAMS: True - - NAME: slice_ubo_buffer - OP_NAME: slice - UBO_PARAMS: True diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl deleted file mode 100644 index 0f34713cb43..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
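
transfer_buffer.glsl above walks out_bufi → out_tidx → in_tidx → in_bufi using the stride helpers from indexing_utils.h, which is not part of this diff. A hedged sketch of that round trip under two assumptions — that a buffer index is the dot product of the tensor index with the strides, and that dim_order lists dimensions from largest stride to smallest — both of which are stated here because the header itself is not shown:

```python
import numpy as np

def tidx_to_bufi(tidx, strides):
    return int(np.dot(tidx, strides))         # linear index = dot(tidx, strides)

def bufi_to_tidx(bufi, strides, dim_order):
    # Peel off dimensions from the largest stride inward (assumed dim_order meaning).
    tidx = [0, 0, 0, 0]
    for d in dim_order:
        tidx[d], bufi = divmod(bufi, strides[d])
    return tidx

strides = [1, 4, 12, 24]                      # contiguous WHCN strides for sizes (4, 3, 2, 1)
for bufi in range(24):
    assert tidx_to_bufi(bufi_to_tidx(bufi, strides, (3, 2, 1, 0)), strides) == bufi
```
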
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define UBO_PARAMS ${UBO_PARAMS} - -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type("texture3d")} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -#include "indexing_utils.h" - -${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")} - -$if UBO_PARAMS: - $if OP_NAME == "slice": - ${layout_declare_ubo(B, "int", "start")} - ${layout_declare_ubo(B, "int", "step")} - - $if OP_NAME == "select": - ${layout_declare_ubo(B, "int", "index")} - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in_sizes; - int selected_dim; - $if not UBO_PARAMS: - $if OP_NAME == "slice": - int start; - int step; - - $if OP_NAME == "select": - int index; -}; - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); -const lowp int out_packed_dim = unhash_packed_dim(out_layout); - -${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} -const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); -const lowp int in_packed_dim = unhash_packed_dim(in_layout); - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#include "${OP_NAME}.glslh" - -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim); - - if (any(greaterThanEqual(out_tidx, out_sizes))) { - return; - } - - if (can_use_fast_path()) { - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); - ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); - VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos)); - - write_texel_lpos(t_out, lpos, in_texel, out_axis_map); - } - else { - VEC4_T out_texel = VEC4_T(0); - for (int texel_i = 0; texel_i < 4; ++texel_i) { - ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); - ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim); - int element_idx = in_tidx[in_packed_dim] % 4; - - VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos)); - T selected_value = T(in_texel[element_idx]); - - out_texel[texel_i] = selected_value; - - out_tidx[out_packed_dim]++; - } - - write_texel_lpos(t_out, lpos, out_texel, out_axis_map); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.yaml deleted file mode 100644 index 6922f120e49..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.yaml +++ /dev/null @@ -1,21 +0,0 @@ -transfer_texture: - parameter_names_with_default_values: - DTYPE: float - OP_NAME: select - UBO_PARAMS: False - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - shader_variants: - - NAME: select_texture3d - OP_NAME: select - - NAME: slice_texture3d - OP_NAME: slice - - NAME: select_ubo_texture3d - OP_NAME: select - UBO_PARAMS: True - - NAME: slice_ubo_texture3d - OP_NAME: slice - UBO_PARAMS: True diff --git a/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl deleted file mode 100644 index bb7ce482a7a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/unary_op.glsl +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} -#define T ${buffer_scalar_type(DTYPE)} - -#define op(X, A, B) ${OPERATOR} - -${define_active_storage_type(STORAGE)} - -#include "indexing_utils.h" - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} - -layout(push_constant) uniform restrict Block { -$if STORAGE == "buffer": - int numel; -$else: - ivec4 out_limits; -float minimum; -float maximum; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#include "activations.h" - -#ifdef USING_BUFFER - -void main() { - const int i = int(gl_GlobalInvocationID.x); - if (i >= numel) { - return; - } - - float in_val = float(t_in[i]); - t_out[i] = T(op(in_val, minimum, maximum)); -} - -#else - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits.xyz))) { - return; - } - - VEC4_T in_texel = texelFetch(t_in, pos, 0); - imageStore(t_out, pos, VEC4_T(op(in_texel, minimum, maximum))); -} - -#endif diff --git a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml b/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml deleted file mode 100644 index 47f538aee6c..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/unary_op.yaml +++ /dev/null @@ -1,48 +0,0 @@ -unary_op: - parameter_names_with_default_values: - OPERATOR: clamp(X, A, B) - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - STORAGE: - - VALUE: texture3d - - VALUE: buffer - shader_variants: - - NAME: abs - OPERATOR: abs(X) - - NAME: clamp - OPERATOR: clamp(X, A, B) - - NAME: clamp_int32 - OPERATOR: clamp(X, A, B) - DTYPE: int32 - - NAME: cos - OPERATOR: cos(X) - - NAME: exp - OPERATOR: exp(X) - - NAME: gelu - OPERATOR: 0.5 * X * (1 + tanh(sqrt(2 / 3.141593) * (X + 0.044715 * X * X * X))) - - NAME: neg - OPERATOR: -X - - NAME: sigmoid - OPERATOR: 1 / (1 + exp(-1 * X)) - - NAME: sin - OPERATOR: sin(X) - - NAME: sqrt - OPERATOR: sqrt(X) - - NAME: rsqrt - OPERATOR: (1 / sqrt(X)) - - NAME: tanh - OPERATOR: tanh(clamp(X, -15.0, 15.0)) - - NAME: hardshrink - OPERATOR: hardshrink(X, A, B) - - NAME: hardswish - OPERATOR: hardswish(X) - - NAME: hardsigmoid - OPERATOR: hardsigmoid(X) - - NAME: leaky_relu - OPERATOR: leaky_relu(X, A) - - NAME: round - OPERATOR: round(X) diff --git a/backends/vulkan/runtime/graph/ops/glsl/update_concat_offset.glsl b/backends/vulkan/runtime/graph/ops/glsl/update_concat_offset.glsl deleted file mode 100644 index ba02da1c301..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/update_concat_offset.glsl +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type("buffer")} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "concat_offset", DTYPE, "buffer")} - -${layout_declare_ubo(B, "int", "concat_dim")} - -$for i in range(NUM_INPUTS): - ${layout_declare_ubo(B, "ivec4", "in" + str(i+1) + "_sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - // Only one thread needs to update the offset - if (gl_GlobalInvocationID.x != 0) { - return; - } - - // Sum up the sizes along the concat dimension for all input tensors - int total_size = 0; - $for i in range(NUM_INPUTS): - total_size += in${i+1}_sizes[concat_dim]; - - // Add to the current offset - concat_offset[0] += T(total_size); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/update_concat_offset.yaml b/backends/vulkan/runtime/graph/ops/glsl/update_concat_offset.yaml deleted file mode 100644 index 35e8740e0a3..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/update_concat_offset.yaml +++ /dev/null @@ -1,13 +0,0 @@ -update_concat_offset: - parameter_names_with_default_values: - DTYPE: float - NUM_INPUTS: 2 - generate_variant_forall: - DTYPE: - - VALUE: int32 - shader_variants: - - NAME: update_concat_offset_1 - NUM_INPUTS: 1 - - NAME: update_concat_offset_2 - - NAME: update_concat_offset_3 - NUM_INPUTS: 3 diff --git a/backends/vulkan/runtime/graph/ops/glsl/upsample_2d.glsl b/backends/vulkan/runtime/graph/ops/glsl/upsample_2d.glsl deleted file mode 100644 index 85b63ad20ba..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/upsample_2d.glsl +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} -${layout_declare_ubo(B, "ivec3", "out_limits")} -${layout_declare_ubo(B, "ivec3", "in_limits")} -${layout_declare_ubo(B, "vec2", "recip_scales")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int align_corners = 0; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - ivec2 max_in_xy = in_limits.xy - 1; - vec2 scaled_xy; - - if (align_corners == 1) { - scaled_xy = pos.xy * recip_scales; - } else { - scaled_xy = (pos.xy + 0.5) * recip_scales - 0.5; - } - - $if MODE == "nearest": - const ivec2 ipos = clamp(ivec2(round(scaled_xy)), ivec2(0), max_in_xy); - VEC4_T out_tex = texelFetch(t_in, ivec3(ipos, pos.z), 0); - $elif MODE == "bilinear": - vec2 upper_xy = ceil(scaled_xy); - vec2 lower_xy = floor(scaled_xy); - - // Clamp coordinates to valid input range - upper_xy = clamp(upper_xy, ivec2(0), max_in_xy); - lower_xy = clamp(lower_xy, ivec2(0), max_in_xy); - - // Calculate interpolation weights - vec2 interp_weights = (scaled_xy - lower_xy); - - // Sample the four nearest texels - VEC4_T sample00 = texelFetch(t_in, ivec3(lower_xy.x, lower_xy.y, pos.z), 0); - VEC4_T sample10 = texelFetch(t_in, ivec3(upper_xy.x, lower_xy.y, pos.z), 0); - VEC4_T sample01 = texelFetch(t_in, ivec3(lower_xy.x, upper_xy.y, pos.z), 0); - VEC4_T sample11 = texelFetch(t_in, ivec3(upper_xy.x, upper_xy.y, pos.z), 0); - - // Perform bilinear interpolation - VEC4_T out_tex = mix( - mix(sample00, sample10, interp_weights.x), - mix(sample01, sample11, interp_weights.x), - interp_weights.y - ); - - imageStore(t_out, pos, out_tex); -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/upsample_2d.yaml b/backends/vulkan/runtime/graph/ops/glsl/upsample_2d.yaml deleted file mode 100644 index 3bd1c282e13..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/upsample_2d.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -upsample_2d: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - MODE: nearest - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: upsample_nearest2d - - NAME: upsample_bilinear2d - MODE: bilinear diff --git a/backends/vulkan/runtime/graph/ops/glsl/var_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/var_buffer.glsl deleted file mode 100644 index 30f283d6f01..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/var_buffer.glsl +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define T ${buffer_scalar_type(DTYPE)} - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "out_buf", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "in_buf", DTYPE, STORAGE)} - -${layout_declare_ubo(B, "ivec4", "in_sizes")} -${layout_declare_ubo(B, "ivec4", "in_strides")} -${layout_declare_ubo(B, "ivec4", "out_sizes")} -${layout_declare_ubo(B, "ivec4", "out_strides")} - -layout(push_constant) uniform PushConstants { - int unbiased; -} pc; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int reduce_dim = 0; - -#define NWORKERS 4 -#define MAX_THREADS 16 - -shared T shared_sum[NWORKERS]; -shared T shared_sum_sq[NWORKERS]; -shared int shared_count[NWORKERS]; - -#include "indexing_utils.h" - -void main() { - const ivec4 out_idx = ivec4( - gl_GlobalInvocationID.x, - gl_GlobalInvocationID.y, - gl_GlobalInvocationID.z % out_sizes.z, - gl_GlobalInvocationID.z / out_sizes.z); - - const uint tid = gl_LocalInvocationID[reduce_dim]; - - shared_sum[tid] = T(0); - shared_sum_sq[tid] = T(0); - shared_count[tid] = 0; - barrier(); - - const int R = in_sizes[reduce_dim]; - const uint N = gl_WorkGroupSize[reduce_dim]; - - // Each workgroup processes a contiguous chunk of the input tensor - // along the reduce_dim. Each thread processes a part of this chunk. - uint q = R / N; - uint rem = R % N; - - uint len = q + (tid < rem ? 1u : 0u); - uint base = tid * q + min(tid, rem); - - T sum = T(0); - T sum_sq = T(0); - int count = 0; - - ivec4 in_idx = out_idx; - for (uint off = 0u; off < len; ++off) { - uint i = base + off; - in_idx[reduce_dim] = int(i); - - // out_idx is a 4D index, so for tensors with reduce_dim == 2, - // we need to set the reduce_dim + 1 to 0 as gl_GlobalInvocationID.z - // is influenced by the tid - if (reduce_dim == 2) { - in_idx[reduce_dim + 1] -= int(tid); - } - - T v = in_buf[tidx_to_bufi(in_idx, in_strides)]; - - sum += v; - sum_sq += v * v; - count += 1; - } - - shared_sum[tid] = sum; - shared_sum_sq[tid] = sum_sq; - shared_count[tid] = count; - barrier(); - - if (tid == 0u) { - T tot_sum = T(0); - T tot_sum_sq = T(0); - int tot_count = 0; - - for (uint i = 0; i < N; ++i) { - tot_sum += shared_sum[i]; - tot_sum_sq += shared_sum_sq[i]; - tot_count += shared_count[i]; - } - - T var; - if (tot_count > 0) { - T mean = tot_sum / T(tot_count); - var = (tot_sum_sq / T(tot_count)) - (mean * mean); - if (pc.unbiased != 0 && tot_count > 1) { - var *= T(tot_count) / T(tot_count - 1); - } - } else{ - // NaN to match PyTorch behavior - var = T(0.0/0.0); - } - - out_buf[tidx_to_bufi(out_idx, out_strides)] = var; - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/var_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/var_buffer.yaml deleted file mode 100644 index 7cb783775c9..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/var_buffer.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -var_buffer: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: var_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl deleted file mode 100644 index faeac01fcd2..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.glsl +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} - -${define_active_storage_type(STORAGE)} - -#extension GL_EXT_control_flow_attributes : require - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "tout", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)} - -${layout_declare_ubo(B, "ivec3", "tin_limits")} -${layout_declare_ubo(B, "ivec4", "tin_sizes")} - -layout(push_constant) uniform PushConstants { - int unbiased; -} pc; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = 0; -layout(constant_id = 4) const int reduce_dim = 0; -layout(constant_id = 5) const int group_dim = 1; - -// A more verbose name would be NWORKERS_PER_GROUP. This describes the number of -// threads that will co-operate to compute one reduction output. There may be -// multiple groups computing distinct reduction outputs within one work group. -#define NWORKERS 4 - -// Sets an upper limit on the total size of a work group based on how many -// elements are allocated in the shared memory array below. Each thread in the -// work group will write into its assigned element in the shared array. 
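// [Editor's note, not part of the original shader] A minimal worked example of
// the indexing scheme described above, assuming a local work group size of
// {NWORKERS, 4, 1} = {4, 4, 1}: each of the 4 groups (tid.y = 0..3) has 4
// cooperating workers (tid.x = 0..3), and tid_to_smi() below maps thread
// (tid.x, tid.y) to shared slot tid.x + tid.y * 4, i.e. slots 0..15. This is
// why the shared arrays are sized by MAX_NTHREADS = 16 rather than by NWORKERS.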
-#define MAX_NTHREADS 16 - -shared VEC4_T shared_sum[MAX_NTHREADS]; -shared VEC4_T shared_sum_sq[MAX_NTHREADS]; -shared int shared_count[MAX_NTHREADS]; - -#include "indexing_utils.h" - -int tid_to_smi(const ivec2 tid) { - return tid.x + tid.y * NWORKERS; -} - -VEC4_T calculate_variance(VEC4_T sum, VEC4_T sum_sq, int count) { - VEC4_T mean = sum / float(count); - VEC4_T variance = (sum_sq / float(count)) - (mean * mean); - - if ((pc.unbiased != 0) && (count > 1)) { - variance = variance * (float(count) / float(count - 1.0)); - } - - return variance; -} - -void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) { - // shared memory index of this thread - const int smi = tid_to_smi(tid); - - VEC4_T sum = VEC4_T(0); - VEC4_T sum_sq = VEC4_T(0); - int count = 0; - - scan_pos[reduce_dim] = tid.x; - for (int i = tid.x; i < tin_sizes[reduce_dim]; - i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) { - VEC4_T val = load_texel(tin, scan_pos); - sum += val; - sum_sq += val * val; - count += 1; - } - // Write partial output to shared memory and synchronize work group - shared_sum[smi] = sum; - shared_sum_sq[smi] = sum_sq; - shared_count[smi] = count; - barrier(); - - // Since the reduction row is reduced to only one element, only the "main" - // thread in the group needs aggregate the partial outputs - if (tid.x == 0) { - int group_i = tid.y * NWORKERS; - sum = shared_sum[group_i]; - sum_sq = shared_sum_sq[group_i]; - count = shared_count[group_i]; - - for (int i = 1; i < NWORKERS; i++) { - int idx = tid.y * NWORKERS + i; - sum += shared_sum[idx]; - sum_sq += shared_sum_sq[idx]; - count += shared_count[idx]; - } - - // Determine if there are any padding elements in the final texel of the - // packed dimension - const int nspill = mod4(tin_sizes[packed_dim]); - // Detect if this thread is working on the final texels of the packed - // dimension, which may have padding elements - const bool is_last_texel = - scan_pos[packed_dim] == (tin_limits[packed_dim] - 1); - - VEC4_T variance = calculate_variance(sum, sum_sq, count); - - // Explicitly set padding elements to 0 - if (is_last_texel && nspill > 0) { - [[unroll]] for (int i = nspill; i < 4; i++) { - variance[i] = 0; - } - } - - scan_pos[reduce_dim] = tid.x; - write_texel(tout, scan_pos, variance); - } -} - -/* - * Compute reduction where the reduction dim is also the packed dim. This case is - * complex because the reduction needs to occur over the individual texels. - * Therefore, in this algorithm each element of the accumulator texels are - * themselves partial outputs. Special care has to be taken to ignore padding - * elements in texels (which occur when the size of the packed dim is not a - * multiple of 4) so that they do not influence the output of reduction. - */ -void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) { - // shared memory index of this thread - const int smi = tid_to_smi(tid); - - // Number of non-padding elements in the last texel in the reduction row - const int nspill = mod4(tin_sizes[packed_dim]); - // Only reduce up to the last "complete" texel. The last texel will need to be - // handled specially if it has padding elements. - const int reduce_len = tin_sizes[packed_dim] - nspill; - - VEC4_T sum = VEC4_T(0); - VEC4_T sum_sq = VEC4_T(0); - int count = 0; - - // Partially accumulate over elements i, i + NWORKERS, i + 2*NWORKERS, ... 
of - // the reduction row - scan_pos[reduce_dim] = tid.x; - for (int i = tid.x * 4; i < reduce_len; - i += NWORKERS * 4, scan_pos[reduce_dim] += NWORKERS) { - VEC4_T val = load_texel(tin, scan_pos); - sum += val; - sum_sq += val * val; - count += 4; - } - // For the last texel in the dim, if there are padding elements then each - // element of the texel needs to be processed individually such that the - // padding elements are ignored - if (scan_pos[reduce_dim] == tin_limits[reduce_dim] - 1 && nspill > 0) { - const VEC4_T val = load_texel(tin, scan_pos); - for (int i = 0; i < nspill; i++) { - sum.x += val[i]; - sum_sq.x += val[i] * val[i]; - count += 1; - } - } - // Write partial output to shared memory and synchronize work group - shared_sum[smi] = sum; - shared_sum_sq[smi] = sum_sq; - shared_count[smi] = count; - barrier(); - - // Since the reduction row is reduced to only one element, only the "main" - // thread in the group needs aggregate the partial outputs - if (tid.x == 0) { - sum = shared_sum[tid.y * NWORKERS]; - sum_sq = shared_sum_sq[tid.y * NWORKERS]; - count = shared_count[tid.y * NWORKERS]; - for (int i = 1; i < NWORKERS; i++) { - int idx = tid.y * NWORKERS + i; - sum += shared_sum[idx]; - sum_sq += shared_sum_sq[idx]; - count += shared_count[idx]; - } - - // Combine across the elements of the combined state - float total_sum = sum.x + sum.y + sum.z + sum.w; - float total_sum_sq = sum_sq.x + sum_sq.y + sum_sq.z + sum_sq.w; - int total_count = count; - - float mean = total_sum / float(total_count); - float variance = (total_sum_sq / float(total_count)) - (mean * mean); - - if ((pc.unbiased != 0) && (total_count > 1)) { - variance = variance * (float(total_count) / float(total_count - 1.0)); - } - - scan_pos[reduce_dim] = tid.x; - write_texel(tout, scan_pos, VEC4_T(variance, 0, 0, 0)); - } -} - -void main() { - ivec3 scan_pos = ivec3(gl_GlobalInvocationID); - scan_pos[reduce_dim] = 0; - - const ivec2 tid = ivec2( - gl_LocalInvocationID[reduce_dim], - gl_LocalInvocationID[group_dim]); - - if (any(greaterThanEqual(scan_pos, tin_limits))) { - return; - } - - if (reduce_dim != packed_dim) { - reduce_nonpacked_dim(tid, scan_pos); - } else { - reduce_packed_dim(tid, scan_pos); - } -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.yaml deleted file mode 100644 index 9cecbedca1a..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/var_texture3d.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -var_texture3d: - parameter_names_with_default_values: - DTYPE: float - STORAGE: texture3d - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: var_texture3d diff --git a/backends/vulkan/runtime/graph/ops/glsl/view.glsl b/backends/vulkan/runtime/graph/ops/glsl/view.glsl deleted file mode 100644 index 599879514e3..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/view.glsl +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
-${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
-
-layout(push_constant) uniform PRECISION restrict Block {
-  ivec4 out_sizes;
-  ivec4 in_sizes;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-layout(constant_id = 3) const int in_packed_dim = C_DIM;
-
-layout(constant_id = 4) const int out_packed_dim = C_DIM;
-
-void main() {
-  const ivec3 out_pos = ivec3(gl_GlobalInvocationID);
-  ivec4 out_tensor_idx = to_tensor_idx(out_pos, out_sizes, out_packed_dim);
-
-  if (all(greaterThanEqual(out_tensor_idx, out_sizes))) {
-    return;
-  }
-
-  // Assume there is a virtual contiguous buffer in nchw format. From the output
-  // pos, we first calculate the index in the virtual buffer, and then calculate
-  // the input position from that index.
-  const ivec4 buf_indices = tidx_to_nchwi(out_tensor_idx, out_sizes, out_packed_dim);
-
-  VEC4_T value = VEC4_T(0);
-  // Need to look up the 4 values in the output texel separately.
-  for (int i = 0 ; i < 4; i++) {
-    if (out_tensor_idx[out_packed_dim]++ < out_sizes[out_packed_dim]) {
-      ivec4 user_coor = nchwi_to_tidx(buf_indices[i], in_sizes);
-      ivec4 in_pos_elem = to_texture_elem_pos(user_coor, in_sizes, in_packed_dim);
-      VEC4_T intex = texelFetch(t_in, in_pos_elem.xyz, 0);
-      value[i] = intex[in_pos_elem.w];
-    }
-  }
-
-  imageStore(t_out, out_pos, value);
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/view.yaml b/backends/vulkan/runtime/graph/ops/glsl/view.yaml
deleted file mode 100644
index 33364a25225..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/view.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-view:
-  parameter_names_with_default_values:
-    DTYPE: float
-    NDIM: 3
-    STORAGE: texture3d
-  generate_variant_forall:
-    DTYPE:
-      - VALUE: half
-      - VALUE: float
-      - VALUE: int32
-  shader_variants:
-    - NAME: view
diff --git a/backends/vulkan/runtime/graph/ops/glsl/view_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/view_buffer.glsl
deleted file mode 100644
index 2c02803a9b1..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/view_buffer.glsl
+++ /dev/null
@@ -1,44 +0,0 @@
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define T ${buffer_scalar_type(DTYPE)}
-
-${define_required_extensions(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing.glslh"
-
-${layout_declare_tensor(B, "w", "t_outp", DTYPE, STORAGE)}
-${layout_declare_tensor(B, "r", "t_inp", DTYPE, STORAGE)}
-
-${layout_declare_ubo(B, "BufferMetadata", "outp")}
-${layout_declare_ubo(B, "BufferMetadata", "inp")}
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-/*
- * The insight behind the view operation is that the contiguous index of each
- * tensor element in the input and output tensors is the same.
- */
-void main() {
-  const uint outp_bufi = gl_GlobalInvocationID.x;
-  if (outp_bufi >= numel(outp)) {
-    return;
-  }
-
-  TensorIndex outp_tidx;
-  linear_idx_to_tensor_idx(outp, outp_bufi, outp_tidx);
-
-  // To map the output to the input, find the input element that has the same
-  // contiguous index as the output element.
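  // [Editor's note, illustrative example; not part of the original shader]
  // Assuming standard row-major (contiguous) ordering: viewing a 2x3 input as
  // a 3x2 output, the output element at index (row=1, col=0) has contiguous
  // index 1*2 + 0 = 2. The input element with contiguous index 2 sits at
  // (row=0, col=2) of the 2x3 layout, so that is the element copied into this
  // output position. The code below performs this round trip through the
  // contiguous index.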
- const uint contig_idx = tensor_idx_to_contiguous_idx(outp, outp_tidx); - - TensorIndex inp_tidx; - contiguous_idx_to_tensor_idx(inp, contig_idx, inp_tidx); - - const uint inp_bufi = tensor_idx_to_linear_idx(inp, inp_tidx); - - t_outp[outp_bufi] = t_inp[inp_bufi]; -} diff --git a/backends/vulkan/runtime/graph/ops/glsl/view_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/view_buffer.yaml deleted file mode 100644 index ec92bf483c8..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/view_buffer.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -view_buffer: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: double - - VALUE: int8 - - VALUE: uint8 - - VALUE: int32 - shader_variants: - - NAME: view_buffer diff --git a/backends/vulkan/runtime/graph/ops/glsl/where.glsl b/backends/vulkan/runtime/graph/ops/glsl/where.glsl deleted file mode 100644 index fe6304c0fa0..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/where.glsl +++ /dev/null @@ -1,99 +0,0 @@ -// where.glsl - -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_load_type(DTYPE, STORAGE)} -#define T ${buffer_scalar_type(DTYPE)} -#define COND_T ${buffer_scalar_type("bool")} - -${define_active_storage_type(STORAGE)} -${define_required_extensions(DTYPE)} -${define_required_extensions("bool")} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_condition", "bool", STORAGE)} -${layout_declare_tensor(B, "r", "t_self", DTYPE, STORAGE)} -${layout_declare_tensor(B, "r", "t_other", DTYPE, STORAGE)} - - -#include "indexing_utils.h" - -$if STORAGE == "buffer": - ${layout_declare_ubo(B, "int", "out_numl")} - ${layout_declare_ubo(B, "ivec4", "out_strides")} - ${layout_declare_ubo(B, "ivec4", "cond_strides")} - ${layout_declare_ubo(B, "ivec4", "self_strides")} - ${layout_declare_ubo(B, "ivec4", "other_strides")} -$else: - ${layout_declare_ubo(B, "ivec3", "out_limits")} - -${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_DIM_ORDER")} - -const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#ifdef USING_BUFFER - -void main() { - int out_bufi = int(gl_GlobalInvocationID.x); - if (out_bufi >= out_numl) { - return; - } - - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); - - const int cond_bufi = tidx_to_bufi(out_tidx, cond_strides); - const int self_bufi = tidx_to_bufi(out_tidx, self_strides); - const int other_bufi = tidx_to_bufi(out_tidx, other_strides); - - COND_T cond = t_condition[cond_bufi] ; - T v_self = t_self[self_bufi]; - T v_other = t_other[other_bufi]; - - if (cond > 0) { - t_out[out_bufi] = v_self; - } else { - t_out[out_bufi] = v_other; - } -} - -#else // !USING_BUFFER - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - - if (any(greaterThanEqual(pos, out_limits))) { - return; - } - - vec4 cond = load_texel(t_condition, pos); - VEC4_T selftex = 
load_texel(t_self, pos); - VEC4_T othertex = load_texel(t_other, pos); - - VEC4_T outtex; - - for (int idx = 0; idx < 4; ++idx) { - if (cond[idx] == 1) { - outtex[idx] = selftex[idx]; - } else { - outtex[idx] = othertex[idx]; - } - } - write_texel(t_out, pos, outtex); -} - #endif // !USING_BUFFER diff --git a/backends/vulkan/runtime/graph/ops/glsl/where.yaml b/backends/vulkan/runtime/graph/ops/glsl/where.yaml deleted file mode 100644 index edbd843a336..00000000000 --- a/backends/vulkan/runtime/graph/ops/glsl/where.yaml +++ /dev/null @@ -1,12 +0,0 @@ -where: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - STORAGE: - - VALUE: texture3d - - VALUE: buffer - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: where diff --git a/backends/vulkan/runtime/graph/ops/impl/Arange.cpp b/backends/vulkan/runtime/graph/ops/impl/Arange.cpp deleted file mode 100644 index 3171fbeb488..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Arange.cpp +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include - -#include - -namespace vkcompute { - -void resize_arange_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - - int start_val = 0; - int step_val = 1; - if (!graph->val_is_none(extra_args.at(0))) { - start_val = graph->extract_scalar(extra_args.at(0)); - } - const int end_val = graph->extract_scalar(extra_args.at(1)); - if (!graph->val_is_none(extra_args.at(2))) { - step_val = graph->extract_scalar(extra_args.at(2)); - } - - const std::vector out_sizes = { - utils::div_up(end_val - start_val, step_val)}; - - graph->virtual_resize(out, out_sizes); -} - -void check_arange_input( - ComputeGraph& graph, - const ValueRef start, - const ValueRef end, - const ValueRef step) { - if (!graph.val_is_none(start) && !graph.val_is_int(end)) { - VK_THROW("arange: start must be int!"); - } - if (!graph.val_is_none(end) && !graph.val_is_int(end)) { - VK_THROW("arange: end must be int!"); - } - if (!graph.val_is_none(step) && !graph.val_is_int(end)) { - VK_THROW("arange: step must be int!"); - } -} - -void add_arange_node( - ComputeGraph& graph, - const ValueRef start, - const ValueRef end, - const ValueRef step, - const ValueRef out) { - float start_val = 0.0f; - float step_val = 1.0f; - - if (graph.val_is_none(end)) { - VK_THROW("arange: end must be specified!"); - } - - if (!graph.val_is_none(start)) { - if (graph.val_is_int(start)) { - start_val = static_cast(graph.extract_scalar(start)); - } else { - start_val = graph.extract_scalar(start); - } - } - if (!graph.val_is_none(step)) { - if (graph.val_is_int(step)) { - step_val = static_cast(graph.extract_scalar(step)); - } else { - step_val = graph.extract_scalar(step); - } - } - - std::string kernel_name("arange"); - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}}, - // Shader params buffers - {graph.sizes_ubo(out), - graph.create_params_buffer(start_val), - graph.create_params_buffer(step_val)}, - // Push Constants - {}, - // 
Specialization Constants - {}, - // Resize Args - {start, end, step}, - // Resizing Logic - resize_arange_node)); -} - -void arange(ComputeGraph& graph, const std::vector& args) { - return add_arange_node(graph, args[0], args[1], args[2], args[7]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.arange.start_step, arange); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp deleted file mode 100644 index 757afd06849..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -#include -#include -#include - -#include - -namespace vkcompute { - -ValueRef check_and_prepack_arg( - ComputeGraph& graph, - ValueRef arg_ref, - const utils::StorageType stype, - int64_t num_channels, - const std::string& debug_name) { - VK_CHECK_COND( - graph.val_is_tref(arg_ref), - "native_batch_norm requires ", - debug_name, - " to be a constant tensorref"); - VK_CHECK_COND(graph.get_tref(arg_ref)->sizes[0] == num_channels); - - // batch_norm's param are broadcasted on the channel dimension. - // In this implementation, we pack the weights along the x dimension, and - // in the shader, we lookup using the along the x. - return prepack_standard(graph, arg_ref, stype, utils::kWidthPacked); -} - -void add_native_batch_norm_node( - ComputeGraph& graph, - ValueRef in_ref, - ValueRef weight_ref, - ValueRef bias_ref, - ValueRef mean_ref, - ValueRef var_ref, - ValueRef eps_ref, - ValueRef out_tuple_ref) { - const std::vector in_sizes = graph.sizes_of(in_ref); - const std::vector out_sizes = graph.sizes_of(in_ref); - - VK_CHECK_COND(in_sizes.size() == 4, "BatchNorm only support 4d tensor"); - VK_CHECK_COND(out_sizes.size() == 4, "BatchNorm only support 4d tensor"); - - // Only the first element of the return value is propagated. The remaining 2 - // elements are zero-size dummy tensor. 
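  // [Editor's note, for reference; not part of the original source] The node
  // assembled below implements inference-mode batch norm, which for each
  // channel c computes
  //   out[n][c][h][w] = (in[n][c][h][w] - mean[c]) / sqrt(var[c] + eps) * weight[c] + bias[c]
  // using the prepacked per-channel weight/bias/mean/var tensors and the eps
  // value extracted from eps_ref.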
- const ValueRef out_ref = graph.get_value_list(out_tuple_ref)->at(0); - - const utils::StorageType stype = graph.storage_type_of(out_ref); - - const int64_t num_channels = dim_at(in_sizes); - - const ValueRef arg_weight = - check_and_prepack_arg(graph, weight_ref, stype, num_channels, "weight"); - const ValueRef arg_bias = - check_and_prepack_arg(graph, bias_ref, stype, num_channels, "bias"); - const ValueRef arg_mean = - check_and_prepack_arg(graph, mean_ref, stype, num_channels, "mean"); - const ValueRef arg_var = - check_and_prepack_arg(graph, var_ref, stype, num_channels, "var"); - const float epsilon = graph.extract_scalar(eps_ref); - - VK_CHECK_COND(!graph.val_is_tref(out_ref), "Output should not be tref"); - - const std::vector out_tensor_sizes = graph.sizes_of(out_ref); - VK_CHECK_COND( - dim_at(out_tensor_sizes) == num_channels, - "out channel must match in channel"); - - std::string kernel_name = "batchnorm"; - add_dtype_suffix(kernel_name, graph.dtype_of(out_ref)); - - const int32_t num_texel_per_batch = - utils::div_up_4((dim_at(in_sizes))); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - {{out_ref, vkapi::kWrite}, - {{in_ref, arg_weight, arg_bias, arg_mean, arg_var}, vkapi::kRead}}, - {graph.logical_limits_ubo(out_ref), - graph.create_params_buffer(epsilon), - graph.create_params_buffer(num_texel_per_batch)}, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void native_batch_norm(ComputeGraph& graph, const std::vector& args) { - // args[5] is momentum. It is not used in the calculation. - return add_native_batch_norm_node( - graph, args[0], args[1], args[2], args[3], args[4], args[6], args[7]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP( - aten._native_batch_norm_legit_no_training.default, native_batch_norm); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp deleted file mode 100644 index 025b483eab7..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -#include -#include - -#include - -namespace vkcompute { - -void check_binary_op_args( - ComputeGraph& graph, - const ValueRef self, - const ValueRef other, - const ValueRef out) { - VK_CHECK_COND(graph.packed_dim_of(self) == graph.packed_dim_of(other)); - VK_CHECK_COND(graph.packed_dim_of(self) == graph.packed_dim_of(out)); - - const std::vector self_sizes = graph.sizes_of(self); - const std::vector other_sizes = graph.sizes_of(other); - const std::vector out_sizes = graph.sizes_of(out); - - std::vector broadcasted_sizes = - calculate_broadcasted_output_size(self_sizes, other_sizes); - VK_CHECK_COND(out_sizes == broadcasted_sizes); -} - -void resize_binary_op_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - - // TODO(T183442143): Verify tensors are broadcastable. 
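  // [Editor's note, illustrative; not part of the original source] Assuming
  // calculate_broadcasted_output_size follows standard PyTorch/NumPy
  // broadcasting rules (align trailing dimensions; a dimension of size 1
  // stretches to match), e.g. self sizes {2, 1, 4} and other sizes {3, 1}
  // broadcast to an output size of {2, 3, 4}.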
- const ValueRef self = args.at(1).refs.at(0); - const ValueRef other = args.at(1).refs.at(1); - - const std::vector self_sizes = graph->sizes_of(self); - const std::vector other_sizes = graph->sizes_of(other); - const std::vector new_out_sizes = - calculate_broadcasted_output_size(self_sizes, other_sizes); - - graph->virtual_resize(out, new_out_sizes); -} - -void add_binary_op_texture_node( - ComputeGraph& graph, - const ValueRef in1, - const ValueRef in2, - const ValueRef alpha, - const ValueRef out, - const std::string& op_name) { - ValueRef arg1 = prepack_standard_like(graph, in1, out, true); - ValueRef arg2 = prepack_standard_like(graph, in2, out, true); - - check_binary_op_args(graph, arg1, arg2, out); - - float alpha_val = 1.0f; - // String is checked since floor_div passes in an unused string argument in - // place of alpha - if (is_valid(alpha) && !graph.val_is_string(alpha)) { - alpha_val = graph.extract_scalar(alpha); - } - - const struct BinaryOpsParams { - const utils::ivec2 broadcast_params; - const float alpha_val; - } binary_ops_params{create_broadcast_params(graph, arg1, arg2), alpha_val}; - - std::string kernel_name("binary_"); - kernel_name.reserve(kShaderNameReserve); - kernel_name += op_name; - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(in1)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{arg1, arg2}, vkapi::kRead}}, - // Shader params buffers - {}, - // Push Constants - {{graph.sizes_pc_of(out), - graph.sizes_pc_of(arg1), - graph.sizes_pc_of(arg2), - PushConstantDataInfo(&binary_ops_params, sizeof(binary_ops_params))}}, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(arg1), - graph.hashed_layout_of(arg2)}, - // Resize Args - {}, - // Resizing Logic - resize_binary_op_node)); -} - -void add_binary_op_buffer_node( - ComputeGraph& graph, - const ValueRef in1, - const ValueRef in2, - const ValueRef alpha, - const ValueRef out, - const std::string& op_name) { - // check_binary_op_args(*t_in1, *t_in2, *t_out); - - float alpha_val = 1.0f; - // String is checked since floor_div passes in an unused string argument in - // place of alpha - if (is_valid(alpha) && !graph.val_is_string(alpha)) { - alpha_val = graph.extract_scalar(alpha); - } - - std::string kernel_name("binary_"); - kernel_name.reserve(kShaderNameReserve); - kernel_name += op_name; - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - - add_dtype_suffix(kernel_name, graph.dtype_of(in1)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{in1, in2}, vkapi::kRead}}, - // Shader params buffers - {graph.buffer_meta_ubo(out), - graph.buffer_meta_ubo(in1), - graph.buffer_meta_ubo(in2)}, - // Push Constants - {{ - PushConstantDataInfo(&alpha_val, sizeof(float)), - }}, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(in1), - graph.hashed_layout_of(in2)}, - // Resize Args - {}, - // Resizing Logic - resize_binary_op_node)); -} - -void add_binary_op_node( - ComputeGraph& graph, - const ValueRef in1, - const ValueRef in2, - const ValueRef alpha, - const ValueRef out, - const std::string& op_name) { - if 
(graph.is_buffer_storage(out)) { - add_binary_op_buffer_node(graph, in1, in2, alpha, out, op_name); - } else { - add_binary_op_texture_node(graph, in1, in2, alpha, out, op_name); - } -} - -#define DEFINE_BINARY_OP_WITH_ALPHA_FN(op_name) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - return add_binary_op_node( \ - graph, args[0], args[1], args[2], args[3], #op_name); \ - } - -#define DEFINE_BINARY_OP_FN(op_name) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - return add_binary_op_node( \ - graph, args[0], args[1], kDummyValueRef, args[2], #op_name); \ - } - -DEFINE_BINARY_OP_WITH_ALPHA_FN(add); -DEFINE_BINARY_OP_WITH_ALPHA_FN(sub); - -// Floor div does not have an alpha, but a string argument (which is unused) is -// passed in at the same location as the alpha argument in other op. -DEFINE_BINARY_OP_WITH_ALPHA_FN(floor_divide); - -DEFINE_BINARY_OP_FN(mul); -DEFINE_BINARY_OP_FN(div); -DEFINE_BINARY_OP_FN(pow); -DEFINE_BINARY_OP_FN(minimum); -DEFINE_BINARY_OP_FN(eq); -DEFINE_BINARY_OP_FN(lt); -DEFINE_BINARY_OP_FN(le); -DEFINE_BINARY_OP_FN(gt); -DEFINE_BINARY_OP_FN(ge); - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.add.Tensor, add); - VK_REGISTER_OP(aten.sub.Tensor, sub); - VK_REGISTER_OP(aten.mul.Tensor, mul); - VK_REGISTER_OP(aten.div.Tensor, div); - VK_REGISTER_OP(aten.div.Tensor_mode, floor_divide); - VK_REGISTER_OP(aten.pow.Tensor_Tensor, pow); - VK_REGISTER_OP(aten.minimum.default, minimum); - VK_REGISTER_OP(aten.eq.Tensor, eq); - VK_REGISTER_OP(aten.lt.Tensor, lt); - VK_REGISTER_OP(aten.le.Tensor, le); - VK_REGISTER_OP(aten.gt.Tensor, gt); - VK_REGISTER_OP(aten.ge.Tensor, ge); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/ChooseQParams.cpp b/backends/vulkan/runtime/graph/ops/impl/ChooseQParams.cpp deleted file mode 100644 index 0d0be08bb38..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/ChooseQParams.cpp +++ /dev/null @@ -1,815 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -#include - -namespace vkcompute { - -void resize_choose_qparams_per_row( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - - ValueRef input_scales = args.at(0).refs.at(0); - ValueRef input_zeros = args.at(0).refs.at(1); - ValueRef input = args.at(1).refs.at(0); - - std::vector new_sizes = graph->sizes_of(input_scales); - const size_t ndim = new_sizes.size(); - - const int64_t input_height = graph->size_at(-2, input); - new_sizes.at(ndim - 1) = input_height; - - graph->virtual_resize(input_scales, new_sizes); - graph->virtual_resize(input_zeros, new_sizes); -} - -utils::uvec3 choose_qparams_pick_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - - // For per-tensor quantization, we want a single workgroup that can handle - // all elements with proper reduction. The shader uses NWORKERS=64 threads. 
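  // [Editor's note; not part of the original source] Together with the
  // {64u, 1u, 1u} local size returned by choose_qparams_pick_local_wg_size
  // below, the {1u, 1u, 1u} global size used for buffer storage should result
  // in a single 64-thread work group being dispatched (assuming the dispatcher
  // rounds the global extent up to whole work groups), with each thread
  // striding over the tensor and the partial results combined by the shader's
  // shared-memory reduction.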
- const ValueRef input = args.at(1).refs.at(0); - - if (graph->is_buffer_storage(input)) { - // For buffer storage, use a single workgroup in X dimension - // The shader will handle strided access across all elements - return {1u, 1u, 1u}; - } else { - // For texture storage, use the default logic - return graph->create_global_wg_size(args.at(0).refs.at(0)); - } -} - -utils::uvec3 choose_qparams_pick_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - - const ValueRef input = args.at(1).refs.at(0); - - if (graph->is_buffer_storage(input)) { - // For buffer storage, use 64 threads in X dimension to match NWORKERS - // This ensures the shared memory arrays are properly sized - return {64u, 1u, 1u}; - } else { - // For texture storage, use the default logic - return graph->create_local_wg_size(global_workgroup_size); - } -} - -utils::uvec3 choose_qparams_per_token_pick_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - - const ValueRef input = args.at(1).refs.at(0); - - if (graph->is_buffer_storage(input)) { - // For per-token quantization, we need one workgroup per token - // Calculate number of tokens (product of all dimensions except the last - // one) - const auto input_sizes = graph->sizes_of(input); - int64_t num_tokens = 1; - for (size_t i = 0; i < input_sizes.size() - 1; i++) { - num_tokens *= input_sizes[i]; - } - - return {static_cast(num_tokens), 1u, 1u}; - } else { - // For texture storage, use the default logic - return graph->create_global_wg_size(args.at(0).refs.at(0)); - } -} - -utils::uvec3 choose_qparams_per_token_pick_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - - const ValueRef input = args.at(1).refs.at(0); - - if (graph->is_buffer_storage(input)) { - return {1u, 1u, 1u}; - } else { - // For texture storage, use the default logic - return graph->create_local_wg_size(global_workgroup_size); - } -} - -utils::uvec3 choose_qparams_block_wise_pick_global_wg_size( - ComputeGraph* g, - const vkapi::ShaderInfo&, - const std::vector& a, - const std::vector& r) { - const ValueRef input = a.at(2).refs.at(0); - const auto blkRef = r.at(0); - const auto inSz = g->sizes_of(input); - const auto blkList = g->get_int_list(blkRef); - - // Use same code as in add_choose_qparams_block_wise_node - utils::ivec4 block_size_vec = utils::make_whcn_ivec4(*blkList); - utils::ivec4 tensor_size_whcn = utils::make_whcn_ivec4(inSz); - - // Calculate numBlocks: ceil(tensorSize / blockSize) (both in WHCN order) - utils::ivec4 nBlk = { - (tensor_size_whcn[0] + block_size_vec[0] - 1) / block_size_vec[0], - (tensor_size_whcn[1] + block_size_vec[1] - 1) / block_size_vec[1], - (tensor_size_whcn[2] + block_size_vec[2] - 1) / block_size_vec[2], - (tensor_size_whcn[3] + block_size_vec[3] - 1) / block_size_vec[3]}; - - uint32_t nBlocks = nBlk[0] * nBlk[1] * nBlk[2] * nBlk[3]; - - // For texture storage, use more threads to better utilize GPU parallelism - // Each thread can process multiple blocks with stride - if (g->is_buffer_storage(input)) { - return {nBlocks, 1u, 1u}; - } else { - // For texture storage, use more workgroups to better utilize GPU - 
// Aim for ~64-256 threads per workgroup for good occupancy - uint32_t preferred_threads_per_wg = 64; - uint32_t num_workgroups = - (nBlocks + preferred_threads_per_wg - 1) / preferred_threads_per_wg; - num_workgroups = std::max(1u, std::min(num_workgroups, nBlocks)); - return {num_workgroups * preferred_threads_per_wg, 1u, 1u}; - } -} - -utils::uvec3 choose_qparams_block_wise_pick_local_wg_size( - ComputeGraph* g, - const vkapi::ShaderInfo&, - const utils::uvec3& global_wg_size, - const std::vector& a, - const std::vector&) { - const ValueRef input = a.at(2).refs.at(0); - - if (g->is_buffer_storage(input)) { - return {1u, 1u, 1u}; - } else { - // For texture storage, use 64 threads per workgroup for better occupancy - uint32_t local_size = std::min(64u, global_wg_size[0]); - return {local_size, 1u, 1u}; - } -} - -vkapi::ShaderInfo pick_choose_qparams_per_row_shader( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - - const ValueRef input = args.at(1).refs.at(0); - - // number of output channels - const int64_t width = graph->size_at(-1, input); - const int64_t height = graph->size_at(-2, input); - - std::string kernel_name = "choose_qparams_per_row"; - if (width > 256 || height == 1) { - kernel_name += "_o1w64"; - } else { - kernel_name += "_o4w16"; - } - add_storage_type_suffix(kernel_name, graph->storage_type_of(input)); - add_dtype_suffix(kernel_name, graph->dtype_of(input)); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -utils::uvec3 pick_choose_qparams_per_row_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - - const ValueRef input = args.at(1).refs.at(0); - const uint32_t height = graph->size_at(-2, input); - return {1u, height, 1u}; -} - -utils::uvec3 pick_choose_qparams_per_row_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)global_workgroup_size; - (void)args; - (void)resize_args; - - uint32_t outputs_per_wg = 1u; - uint32_t workers_per_output = 64u; - - if (shader.kernel_name.find("o4w16") != std::string::npos) { - outputs_per_wg = 4u; - workers_per_output = 16u; - } - - return {workers_per_output, outputs_per_wg, 1u}; -} - -void add_choose_qparams_tensor_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& quant_min, - const ValueRef& quant_max, - const ValueRef& eps, - const ValueRef& scale_out, - const ValueRef& zero_point_out) { - std::string kernel_name("choose_qparams_tensor"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale_out)); - add_dtype_suffix(kernel_name, graph.dtype_of(zero_point_out)); - - // Handle optional quant_min and quant_max parameters independently - auto bounds = get_dtype_bounds(graph.dtype_of(zero_point_out)); - - int quant_min_val, quant_max_val; - - // Handle quant_min - if (graph.val_is_none(quant_min)) { - quant_min_val = bounds.first; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_min), - "quant_min must be an integer, got type: ", - graph.get_val_type(quant_min)); - quant_min_val = static_cast(graph.get_int(quant_min)); - } - - // Handle quant_max - if (graph.val_is_none(quant_max)) { - quant_max_val = bounds.second; - } else { - VK_CHECK_COND( - 
graph.val_is_int(quant_max), - "quant_max must be an integer, got type: ", - graph.get_val_type(quant_max)); - quant_max_val = static_cast(graph.get_int(quant_max)); - } - float eps_val = static_cast(graph.get_double(eps)); - - vkapi::ParamsBindList param_ubos; - std::vector push_constants; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(scale_out), - graph.strides_ubo(scale_out), - graph.sizes_ubo(zero_point_out), - graph.strides_ubo(zero_point_out)}; - } else { - param_ubos = { - graph.logical_limits_ubo(input), - graph.logical_limits_ubo(scale_out), - graph.logical_limits_ubo(zero_point_out)}; - } - - push_constants = { - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - PushConstantDataInfo(&eps_val, sizeof(float)), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - choose_qparams_pick_global_wg_size, - choose_qparams_pick_local_wg_size, - // Inputs and Outputs - {{scale_out, vkapi::kWrite}, - {zero_point_out, vkapi::kWrite}, - {input, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void add_choose_qparams_per_token_asymmetric_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& scale_out, - const ValueRef& zero_point_out) { - std::string kernel_name("choose_qparams_per_token_asymmetric"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale_out)); - add_dtype_suffix(kernel_name, graph.dtype_of(zero_point_out)); - - // Calculate number of tokens (product of all dimensions except the last one) - int64_t num_tokens = 1; - const auto input_sizes = graph.sizes_of(input); - for (size_t i = 0; i < input_sizes.size() - 1; i++) { - num_tokens *= input_sizes[i]; - } - - int num_tokens_val = static_cast(num_tokens); - int quant_min_val = -128; // Fixed for asymmetric quantization - int quant_max_val = 127; // Fixed for asymmetric quantization - - vkapi::ParamsBindList param_ubos; - std::vector push_constants; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(scale_out), - graph.strides_ubo(scale_out), - graph.sizes_ubo(zero_point_out), - graph.strides_ubo(zero_point_out)}; - } else { - param_ubos = { - graph.logical_limits_ubo(input), - graph.logical_limits_ubo(scale_out), - graph.logical_limits_ubo(zero_point_out)}; - } - - push_constants = { - PushConstantDataInfo(&num_tokens_val, sizeof(int)), - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - choose_qparams_per_token_pick_global_wg_size, - choose_qparams_per_token_pick_local_wg_size, - // Inputs and Outputs - {{scale_out, vkapi::kWrite}, - {zero_point_out, vkapi::kWrite}, - {input, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void add_choose_qparams_per_row_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& quant_min, - const ValueRef& quant_max, - 
const ValueRef& input_scales, - const ValueRef& input_zps) { - int32_t quant_min_val = -128; - int32_t quant_max_val = 127; - - // Int8 range by default - if (graph.val_is_none(quant_min)) { - quant_min_val = -128; - } else { - quant_min_val = graph.extract_scalar(quant_min); - } - - // Int8 range by default - if (graph.val_is_none(quant_min)) { - quant_max_val = 127; - } else { - quant_max_val = graph.extract_scalar(quant_max); - } - - vkapi::ParamsBindList param_ubos = { - graph.sizes_ubo(input), - }; - std::vector push_constants = { - PushConstantDataInfo(&quant_min_val, sizeof(int32_t)), - PushConstantDataInfo(&quant_max_val, sizeof(int32_t)), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - pick_choose_qparams_per_row_shader, - pick_choose_qparams_per_row_global_wg_size, - pick_choose_qparams_per_row_local_wg_size, - // Inputs and Outputs - {{{input_scales, input_zps}, vkapi::kWrite}, {input, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - resize_choose_qparams_per_row)); -} - -void add_choose_qparams_block_wise_node( - ComputeGraph& graph, - ValueRef input, - ValueRef block_size, - int mapping_type, // 0 / 1 / 2 - ValueRef quant_min, - ValueRef quant_max, - ValueRef eps, - ValueRef scale_out, - ValueRef zp_out) { - const auto input_sizes = graph.sizes_of(input); - const auto block_size_list = graph.get_int_list(block_size); - - // For shader compatibility, we still need to convert to WHCN order - // but the output shape calculation is now handled correctly in resize - // function - utils::ivec4 block_size_vec = utils::make_whcn_ivec4(*block_size_list); - utils::ivec4 tensor_size_whcn = utils::make_whcn_ivec4(input_sizes); - - // Calculate numBlocks: ceil(tensorSize / blockSize) (both in WHCN order) - utils::ivec4 num_blocks_vec = { - (tensor_size_whcn[0] + block_size_vec[0] - 1) / block_size_vec[0], - (tensor_size_whcn[1] + block_size_vec[1] - 1) / block_size_vec[1], - (tensor_size_whcn[2] + block_size_vec[2] - 1) / block_size_vec[2], - (tensor_size_whcn[3] + block_size_vec[3] - 1) / block_size_vec[3]}; - - // Calculate blockStride: pre-computed linear strides for the block grid - utils::ivec4 block_stride_vec = { - 1, - num_blocks_vec[0], - num_blocks_vec[0] * num_blocks_vec[1], - num_blocks_vec[0] * num_blocks_vec[1] * num_blocks_vec[2]}; - - // Handle optional quant_min and quant_max parameters - int qmin, qmax; - if (graph.val_is_none(quant_min) || graph.val_is_none(quant_max)) { - // Use default values based on target_dtype (similar to - // _get_and_check_qmin_qmax) For now, assume int8 range as default - this - // should match the Python implementation - qmin = -128; - qmax = 127; - } else { - qmin = static_cast(graph.get_int(quant_min)); - qmax = static_cast(graph.get_int(quant_max)); - } - - float eps_val; - if (graph.val_is_none(eps)) { - // Use default eps value (similar to Python implementation) - eps_val = 1.192092896e-07f; // torch.finfo(torch.float32).eps - } else { - eps_val = static_cast(graph.get_double(eps)); - } - - // Create push constants vector - std::vector push_constants = { - PushConstantDataInfo(&block_size_vec, sizeof(block_size_vec)), - PushConstantDataInfo(&num_blocks_vec, sizeof(num_blocks_vec)), - PushConstantDataInfo(&block_stride_vec, sizeof(block_stride_vec)), - PushConstantDataInfo(&mapping_type, sizeof(int)), - PushConstantDataInfo(&qmin, sizeof(int)), - PushConstantDataInfo(&qmax, 
sizeof(int)), - PushConstantDataInfo(&eps_val, sizeof(float))}; - - std::string kernel_name("choose_qparams_block_wise"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale_out)); - add_dtype_suffix(kernel_name, graph.dtype_of(zp_out)); - - vkapi::ParamsBindList param_ubos; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(scale_out), - graph.strides_ubo(scale_out), - graph.sizes_ubo(zp_out), - graph.strides_ubo(zp_out)}; - } else { - // For texture input, the shader uses buffer storage for outputs - // so we need buffer UBOs for the output tensors - param_ubos = { - graph.logical_limits_ubo(input), - graph.sizes_ubo(scale_out), - graph.strides_ubo(scale_out), - graph.sizes_ubo(zp_out), - graph.strides_ubo(zp_out)}; - } - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - choose_qparams_block_wise_pick_global_wg_size, - choose_qparams_block_wise_pick_local_wg_size, - // Inputs and Outputs - {{scale_out, vkapi::kWrite}, - {zp_out, vkapi::kWrite}, - {input, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - {}, - // Resize Args - {block_size}, - // Resizing Logic - nullptr)); -} - -void choose_qparams_tensor_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef eps = args[arg_idx++]; - const ValueRef dtype = args[arg_idx++]; - const ValueRef out_tuple_ref = args[arg_idx++]; - - ValueRef scale_out = kDummyValueRef; - ValueRef zero_point_out = kDummyValueRef; - - { - const ValueListPtr out_tuple = graph.get_value_list(out_tuple_ref); - scale_out = out_tuple->at(0); - zero_point_out = out_tuple->at(1); - } - - // Void the unused dtype parameter to match ATen signature - (void)dtype; - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale_out)); - VK_CHECK_COND(graph.val_is_tensor(zero_point_out)); - - // Verify input is a floating point type - VK_CHECK_COND(graph.dtype_of(input) == vkapi::kFloat); - - // Get scale and zero point output dtypes - vkapi::ScalarType scale_out_dtype = graph.dtype_of(scale_out); - vkapi::ScalarType zero_point_out_dtype = graph.dtype_of(zero_point_out); - - // Verify supported output types for scale (fp32 only for now) - VK_CHECK_COND(scale_out_dtype == vkapi::kFloat); - - // Verify supported output types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_out_dtype == vkapi::kInt || - zero_point_out_dtype == vkapi::kChar || - zero_point_out_dtype == vkapi::kFloat); - - // Check that texture storage is width packed - if (!graph.is_buffer_storage(input)) { - VK_CHECK_COND(graph.packed_dim_of(input) == WHCN::kWidthDim); - } - - add_choose_qparams_tensor_node( - graph, input, quant_min, quant_max, eps, scale_out, zero_point_out); -} - -void choose_qparams_per_token_asymmetric_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef dtype = args[arg_idx++]; - const ValueRef out_tuple_ref = args[arg_idx++]; - - ValueRef scale_out = kDummyValueRef; - ValueRef zero_point_out = kDummyValueRef; - - { - const ValueListPtr 
out_tuple = graph.get_value_list(out_tuple_ref); - scale_out = out_tuple->at(0); - zero_point_out = out_tuple->at(1); - } - - // Void the unused parameter to match ATen signature - (void)dtype; - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale_out)); - VK_CHECK_COND(graph.val_is_tensor(zero_point_out)); - - // Verify input is a floating point type - VK_CHECK_COND(graph.dtype_of(input) == vkapi::kFloat); - - // Get scale and zero point output dtypes - vkapi::ScalarType scale_out_dtype = graph.dtype_of(scale_out); - vkapi::ScalarType zero_point_out_dtype = graph.dtype_of(zero_point_out); - - // Verify supported output types for scale (fp32 only for now) - VK_CHECK_COND(scale_out_dtype == vkapi::kFloat); - - // Verify supported output types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_out_dtype == vkapi::kInt || - zero_point_out_dtype == vkapi::kChar || - zero_point_out_dtype == vkapi::kFloat); - - // Check that texture storage is width packed - if (!graph.is_buffer_storage(input)) { - VK_CHECK_COND(graph.packed_dim_of(input) == WHCN::kWidthDim); - } - - add_choose_qparams_per_token_asymmetric_node( - graph, input, scale_out, zero_point_out); -} - -bool can_use_choose_qparams_per_row( - ComputeGraph& graph, - const ValueRef input, - const ValueRef block_size, - const ValueRef input_zero_point) { - if (!graph.is_vectorizable_contiguous_2d_matrix(input)) { - return false; - } - - std::vector input_sizes = graph.sizes_of(input); - const IntListPtr block_size_vals = graph.get_int_list(block_size); - const size_t ndim = block_size_vals->size(); - - // Check for per y - dim quantization - if (utils::val_at(-1, input_sizes) != utils::val_at(-1, *block_size_vals)) { - return false; - } - - for (int d = 0; d < ndim - 1; ++d) { - if (block_size_vals->at(d) != 1) { - return false; - } - } - return true; -} - -void choose_qparams_affine_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef mapping_type = args[arg_idx++]; - const ValueRef block_size = args[arg_idx++]; - const ValueRef target_dtype = args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef eps = args[arg_idx++]; - const ValueRef scale_dtype = args[arg_idx++]; - const ValueRef zero_point_dtype = args[arg_idx++]; - const ValueRef out_tuple_ref = args[arg_idx++]; - - // Suppress unused variable warnings - (void)target_dtype; - (void)scale_dtype; - (void)zero_point_dtype; - - ValueRef scale_out = kDummyValueRef; - ValueRef zero_point_out = kDummyValueRef; - - { - const ValueListPtr out_tuple = graph.get_value_list(out_tuple_ref); - scale_out = out_tuple->at(0); - zero_point_out = out_tuple->at(1); - } - - // Use fast path if certain conditions are met - if (can_use_choose_qparams_per_row( - graph, input, block_size, zero_point_out)) { - return add_choose_qparams_per_row_node( - graph, input, quant_min, quant_max, scale_out, zero_point_out); - } - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale_out)); - VK_CHECK_COND(graph.val_is_tensor(zero_point_out)); - - // Verify input is a floating point type - VK_CHECK_COND(graph.dtype_of(input) == vkapi::kFloat); - - // Get scale and zero point dtypes from arguments - vkapi::ScalarType scale_out_dtype = graph.dtype_of(scale_out); - vkapi::ScalarType zero_point_out_dtype = graph.dtype_of(zero_point_out); - - // 
Verify supported output types for scale (fp32 only for now) - VK_CHECK_COND(scale_out_dtype == vkapi::kFloat); - - // Verify supported output types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_out_dtype == vkapi::kInt || - zero_point_out_dtype == vkapi::kChar || - zero_point_out_dtype == vkapi::kFloat); - - // Check that texture storage is width packed - if (!graph.is_buffer_storage(input)) { - VK_CHECK_COND(graph.packed_dim_of(input) == WHCN::kWidthDim); - } - - const auto input_sizes = graph.sizes_of(input); - const auto block_size_list = graph.get_int_list(block_size); - VK_CHECK_COND(block_size_list->size() == input_sizes.size()); - - std::string mapping_type_str = graph.get_string(mapping_type); - int mapping_type_val = 0; // Default to ASYMMETRIC - - if (mapping_type_str == "ASYMMETRIC" || mapping_type_str.empty()) { - mapping_type_val = 0; // ASYMMETRIC - } else if (mapping_type_str == "SYMMETRIC") { - mapping_type_val = 1; - } else if (mapping_type_str == "SYMMETRIC_NO_CLIPPING_ERR") { - mapping_type_val = 2; - } else { - VK_THROW("Unsupported mapping_type: ", mapping_type_str); - } - - add_choose_qparams_block_wise_node( - graph, - input, - block_size, - mapping_type_val, - quant_min, - quant_max, - eps, - scale_out, - zero_point_out); -} - -void choose_qparams_per_row( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef input_scales = args[arg_idx++]; - const ValueRef input_zps = args[arg_idx++]; - - // ValueRef scale_out = kDummyValueRef; - // ValueRef zero_point_out = kDummyValueRef; - // - // { - // const ValueListPtr out_tuple = graph.get_value_list(out_tuple_ref); - // scale_out = out_tuple->at(0); - // zero_point_out = out_tuple->at(1); - // } - // - - add_choose_qparams_per_row_node( - graph, input, quant_min, quant_max, input_scales, input_zps); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP( - quantized_decomposed.choose_qparams.tensor, choose_qparams_tensor_impl); - VK_REGISTER_OP( - quantized_decomposed.choose_qparams_per_token_asymmetric.default, - choose_qparams_per_token_asymmetric_impl); - - // Register the per-channel quantization operator - VK_REGISTER_OP(etvk.choose_qparams_per_row.default, choose_qparams_per_row); - - // TorchAO affine choose_qparams operators - VK_REGISTER_OP( - torchao.choose_qparams_affine.default, choose_qparams_affine_impl); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp deleted file mode 100644 index 0ae9d53a481..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include - -#include -#include -#include - -namespace vkcompute { - -void resize_clone_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - // TODO: support for when dimensionality doesn't match, i.e. clone is used to - // implement squeeze. 
- if (graph->dim_of(out) == graph->dim_of(in)) { - graph->virtual_resize(out, graph->sizes_of(in)); - } -} - -void add_clone_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef out) { - std::string kernel_name = "clone"; - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Parameter Buffers - {graph.logical_limits_ubo(out)}, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - resize_clone_node)); -} - -utils::uvec3 clone_image_to_buffer_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef image = args.at(1).refs.at(0); - return graph->create_global_wg_size(image); -} - -void add_image_to_buffer_node( - ComputeGraph& graph, - const ValueRef image, - const ValueRef buffer) { - std::string kernel_name = "clone_image_to_buffer"; - add_dtype_suffix(kernel_name, graph.dtype_of(image)); - vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - shader, - clone_image_to_buffer_global_wg_size, - default_pick_local_wg_size, - // Input and Outputs - {{buffer, vkapi::kWrite}, {image, vkapi::kRead}}, - // Parameter Buffers - {}, - // Push Constants - {graph.sizes_pc_of(image), graph.strides_pc_of(buffer)}, - // Specialization Constants - {graph.hashed_layout_of(image)}, - // Resize Args - {}, - // Resizing Logic - resize_clone_node)); -} - -void add_buffer_to_image_node( - ComputeGraph& graph, - const ValueRef buffer, - const ValueRef image) { - std::string kernel_name = "clone_buffer_to_image"; - add_dtype_suffix(kernel_name, graph.dtype_of(image)); - vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - shader, - default_pick_global_wg_size, - default_pick_local_wg_size, - // Input and Outputs - {{image, vkapi::kWrite}, {buffer, vkapi::kRead}}, - // Parameter Buffers - {}, - // Push Constants - {graph.sizes_pc_of(image), graph.strides_pc_of(buffer)}, - // Specialization Constants - {graph.hashed_layout_of(image)}, - // Resize Args - {}, - // Resizing Logic - resize_clone_node)); -} - -void clone(ComputeGraph& graph, const std::vector& args) { - const ValueRef src = args[0]; - const ValueRef dst = args[2]; - - const utils::StorageType src_storage = graph.storage_type_of(src); - const utils::StorageType dst_storage = graph.storage_type_of(dst); - if (src_storage == utils::kTexture3D && dst_storage == utils::kTexture3D) { - if (graph.hashed_layout_of(src) == graph.hashed_layout_of(dst)) { - return add_clone_node(graph, src, dst); - } else { - return add_view_node(graph, src, kDummyValueRef, dst); - } - } - if (src_storage == utils::kTexture3D && dst_storage == utils::kBuffer) { - return add_image_to_buffer_node(graph, src, dst); - } - if (src_storage == utils::kBuffer && dst_storage == utils::kTexture3D) { - return add_buffer_to_image_node(graph, src, dst); - } - - std::vector extra_args = {}; - // Buffer to buffer copy - return add_view_copy_buffer_node( - graph, src, dst, extra_args, resize_clone_node); -} - -// Clone node is not the most efficient implementation for the aten.clone -// operation. 
A more efficient implementation can be achieved during vulkan -// export with the use of shared object. This clone node is introduced to enable -// a "copy" mechanism if there is no alternative (e.g. during direct -// ComputeGraph manipulation, we need to make a copy of a Tensor). - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.clone.default, clone); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Clone.h b/backends/vulkan/runtime/graph/ops/impl/Clone.h deleted file mode 100644 index 8efaa259a24..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Clone.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -namespace vkcompute { - -void add_clone_node(ComputeGraph& graph, const ValueRef in, const ValueRef out); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.cpp b/backends/vulkan/runtime/graph/ops/impl/Common.cpp deleted file mode 100644 index 6c701224f7f..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Common.cpp +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace vkcompute { - -utils::uvec3 default_pick_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - return graph->create_global_wg_size(out); -} - -utils::uvec3 default_pick_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)args; - (void)resize_args; - return graph->create_local_wg_size(global_workgroup_size); -} - -utils::uvec3 pick_hw_square_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)graph; - (void)shader; - (void)args; - (void)resize_args; - // Some inactive invocations are okay; set 6 as the threshold to use the - // a square wg size. - if (global_workgroup_size[0u] >= 6 && global_workgroup_size[1u] >= 6) { - return {8u, 8u, 1u}; - } - // If width dim is sufficiently small, then bias towards height dim to reduce - // the number of inactive invocations. - if (global_workgroup_size[0u] < 6u) { - return {4u, 16u, 1u}; - } - return {16u, 4u, 1u}; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.h b/backends/vulkan/runtime/graph/ops/impl/Common.h deleted file mode 100644 index 1831ab2a845..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Common.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -namespace vkcompute { - -/** - * Creates a global workgroup size based on the first output tensor in the args. 
- * This is a utility function that extracts the output tensor from - * args.at(0).refs.at(0) and calls graph->create_global_wg_size(out) on it. - */ -utils::uvec3 default_pick_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args); - -/** - * Creates a local workgroup size based on the first output tensor in the args. - * This is a utility function that extracts the output tensor from - * args.at(0).refs.at(0) and calls graph->create_local_wg_size(out) on it. - */ -utils::uvec3 default_pick_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args); - -/** - * Constructs a local work group size with the shape {W, H, 1}. The function - * will try to set W == H == sqrt(num_invocations), where num_invocations is - * typically 64. This configuration is good for ops like matrix multiplication - * as it reduces the total volume of unique data that the entire work group - * will need to read from input tensors in order to produce the output data. - * To compute an output tile of {W, H, 1}, the work group will need to read - * H unique rows = H * K unique elements from the input tensor and W unique cols - * = W * K elements from the weight tensor, resulting in (W + H) * K unique - * elements in total. - */ -utils::uvec3 pick_hw_square_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Concat.cpp b/backends/vulkan/runtime/graph/ops/impl/Concat.cpp deleted file mode 100644 index 0a4acb6cef3..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Concat.cpp +++ /dev/null @@ -1,356 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include -#include -#include -#include -#include - -namespace vkcompute { - -std::vector get_concat_sizes( - ComputeGraph& graph, - ValueRef all_input_refs, - const int64_t concat_dim) { - ValueListPtr in_value_refs = graph.get_value_list(all_input_refs); - // Get the sizes of the first input tensor as a starting point - std::vector new_out_sizes = graph.sizes_of(in_value_refs->at(0)); - - // Sum up the sizes along the concatenation dimension - for (size_t i = 1; i < in_value_refs->size(); ++i) { - const std::vector in_sizes = graph.sizes_of(in_value_refs->at(i)); - new_out_sizes.at(concat_dim) += in_sizes.at(concat_dim); - } - - return new_out_sizes; -} - -void resize_concat_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef all_inputs = extra_args.at(0); - - int64_t concat_dim = graph->extract_scalar(extra_args.at(1)); - - // Normalize concat_dim if negative - const int64_t ndim = graph->dim_of(out); - if (concat_dim < 0) { - concat_dim += ndim; - } - - // Calculate the new sizes - std::vector new_out_sizes = - get_concat_sizes(*graph, all_inputs, concat_dim); - - // Resize the output tensor - graph->virtual_resize(out, new_out_sizes); -} - -utils::uvec3 concat_pick_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& extra_args) { - (void)shader; - (void)extra_args; - - const ValueRef out = args.at(0).refs.at(0); - const std::vector inputs_in_batch = args.at(1).refs; - - int64_t concat_dim = graph->extract_scalar(extra_args.at(1)); - - // Normalize concat_dim if negative - const int64_t ndim = graph->dim_of(out); - if (concat_dim < 0) { - concat_dim += ndim; - } - - // The concat shader concatenates N input tensors at a time to the output - // tensor. Since the shader may need to be invoked multiple times to finish - // concatenation when the number of input tensors is >N, the global workgroup - // is based on the volume of input data being concatenated in this batch, - // as opposed to the overall size of the output tensor. Conceptually, the - // global work group size represents which elements of the output tensor will - // be written to during this dispatch. - - uint32_t total_input_numel = 0; - int64_t concat_dim_numel = 0; - for (const ValueRef input : inputs_in_batch) { - total_input_numel += graph->numel_of(input); - concat_dim_numel += graph->size_at(concat_dim, input); - } - - if (graph->is_buffer_storage(out)) { - return {total_input_numel, 1, 1}; - } - - // The texture implementation is similar, except each invocation writes out 4 - // output elements along the packed dim (i.e. one texel). In this case, the - // global work group size represents the number of output texels that will be - // written to in this batch, rather than the number of output elements. Note - // that to update an element of the output, the entire texel that contains it - // will need to be loaded, updated, then written back. - - std::vector inp_volume_sizes = graph->sizes_of(out); - inp_volume_sizes.at(concat_dim) = concat_dim_numel; - - // Calculate what the image extents would be of a tensor with the input - // volume's sizes. This produces the number of texels that would need to be - // written to. 
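// Worked example (assumed numbers): with 10 input elements along the packed
// dim and a concat offset that is not 4-aligned, the dispatch covers roughly
// div_up_4(10) + 1 = 3 + 1 = 4 texels along that axis, since the first and
// last texels may be shared with data written by a previous batch.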
- const int32_t packed_dim = graph->packed_dim_of(out); - std::vector inp_volume_texel_sizes = - api::calculate_padded_sizes(inp_volume_sizes, packed_dim); - // If the concat_dim is the same as the packed dim, and the concat_offset for - // this input batch is not a multiple of 4, then the data from an input texel - // may be split up between two output texels. For example: - // I0 , I1 , I2 , I2 - // O0 , O1 , O2 , X | X , X , X , X - // Therefore, 1 texel is added to the packed dim to account for this. - inp_volume_texel_sizes.at(3 - packed_dim) = - utils::div_up_4(inp_volume_texel_sizes.at(3 - packed_dim)) + 1; - - const uint32_t inp_volume_texel_numel = - utils::multiply_integers(inp_volume_texel_sizes); - - return {inp_volume_texel_numel, 1, 1}; - - // The texture implementation is similar, expect each thread is responsible - // for writing out an entire output texel. Therefore, the overall global work - // group size will be the concatenation of the texture extents of the input - // tensors in this batch. - - // One complication is when the previous concatenation batch does not write - // up to a texel boundary. An example is if the previous concatenation batch - // only wrote 7 elements along the concatenation dim. The first input element - // would then have to be inserted at the last element of the final texel - // written by the previous batch. To account for this, initialize the - // workgroup size at the concatenation dim to 1 (need to read N total texels - // along the concat dim for input tensors + up to 1 texel from the output - // tensor). - - // The axis along which to concatenate the input texture extents - int64_t extent_concat_axis = nchw_dim_to_whcn_dim(concat_dim, ndim); - // For batch concatenation, the concat axis is the batch-concatenation axis - if (concat_dim == 4) { - extent_concat_axis = graph->concat_dim_of(out); - } - - utils::uvec3 global_workgroup_size = graph->create_global_wg_size(out); - global_workgroup_size[concat_dim] = 0; - for (const ValueRef input : inputs_in_batch) { - utils::uvec3 texture_extents = graph->logical_limits_of(input); - global_workgroup_size[extent_concat_axis] += texture_extents[concat_dim]; - } - - return global_workgroup_size; -} - -void add_concat_node( - ComputeGraph& graph, - const ValueRef tensors_ref, - const ValueRef dim_ref, - const ValueRef out) { - std::vector in_value_refs; - - { - const ValueListPtr tensors = graph.get_value_list(tensors_ref); - - for (const ValueRef in : *tensors) { - in_value_refs.push_back(in); - } - } - - const int64_t dim = graph.extract_scalar(dim_ref); - - const int64_t ndim = graph.dim_of(in_value_refs.at(0)); - int64_t normalized_dim = dim; - if (normalized_dim < 0) { - normalized_dim += ndim; - } - - const int64_t dim_whcn = nchw_dim_to_whcn_dim(normalized_dim, ndim); - const ValueRef dim_whcn_ref = graph.get_or_add_value_for_int(dim_whcn); - - // Create a temporary tensor to hold the concat offset - TmpTensor concat_offset( - &graph, {1}, vkapi::kInt, utils::kBuffer, utils::kWidthPacked); - - // Add node to set concat_offset to 0 - { - std::string kernel_name = "set_zero"; - add_dtype_suffix(kernel_name, graph.dtype_of(concat_offset)); - - vkapi::ParamsBindList param_buffers = {graph.numel_ubo(concat_offset)}; - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - {1, 1, 1}, - {1, 1, 1}, - // Inputs and Outputs - {{concat_offset, vkapi::kWrite}}, - // Parameter buffers - param_buffers, - // Push Constants - {}, - // Specialization Constants - 
{}, - // Resize Args - {}, - // Resizing Logic - nullptr)); - } - - // Process inputs in batches of up to 3 tensors - const size_t batch_size = 3; - for (size_t batch_start = 0; batch_start < in_value_refs.size(); - batch_start += batch_size) { - const size_t batch_end = - std::min(batch_start + batch_size, in_value_refs.size()); - const size_t current_batch_size = batch_end - batch_start; - - std::vector batch_inputs; - for (size_t i = batch_start; i < batch_end; ++i) { - batch_inputs.push_back(in_value_refs.at(i)); - } - - // Add concat node for this batch - { - vkapi::ParamsBindList param_buffers = { - graph.get_or_create_int_param_buffer(dim_whcn_ref, 0)}; - - std::vector push_constants; - vkapi::SpecVarList spec_vars; - - if (graph.is_buffer_storage(out)) { - param_buffers.append(graph.sizes_ubo(out)); - param_buffers.append(graph.strides_ubo(out)); - - for (const ValueRef in_ref : batch_inputs) { - param_buffers.append(graph.sizes_ubo(in_ref)); - param_buffers.append(graph.strides_ubo(in_ref)); - } - - param_buffers.append(graph.numel_ubo(out)); - - spec_vars = {graph.hashed_layout_of(out)}; - } else { - push_constants = {graph.sizes_pc_of(out)}; - - spec_vars = {graph.hashed_layout_of(out)}; - - for (const ValueRef in_ref : batch_inputs) { - push_constants.push_back(graph.sizes_pc_of(in_ref)); - spec_vars.append(graph.hashed_layout_of(in_ref)); - } - } - - std::string kernel_name = "concat"; - if (current_batch_size == 1) { - kernel_name += "_1"; - } else if (current_batch_size == 2) { - kernel_name += "_2"; - } else if (current_batch_size == 3) { - kernel_name += "_3"; - } - if (graph.is_buffer_storage(out)) { - kernel_name += "_buffer"; - } else { - kernel_name += "_texture3d"; - } - - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - DispatchNode::ResizeFunction resize_fn = nullptr; - if (batch_start == 0) { - resize_fn = resize_concat_node; - } - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - concat_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kReadWrite}, - {batch_inputs, vkapi::kRead}, - {concat_offset, vkapi::kRead}}, - // Parameter buffers - param_buffers, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - {tensors_ref, dim_ref}, - // Resizing Logic - resize_fn)); - } - - // Add node to update concat_offset (except for the last batch) - if (batch_end < in_value_refs.size()) { - vkapi::ParamsBindList param_buffers = { - graph.get_or_create_int_param_buffer(dim_whcn_ref, 0)}; - - for (const ValueRef in_ref : batch_inputs) { - param_buffers.append(graph.sizes_ubo(in_ref)); - } - - std::string kernel_name = "update_concat_offset"; - if (current_batch_size == 1) { - kernel_name += "_1"; - } else if (current_batch_size == 2) { - kernel_name += "_2"; - } else if (current_batch_size == 3) { - kernel_name += "_3"; - } - add_dtype_suffix(kernel_name, graph.dtype_of(concat_offset)); - - vkapi::SpecVarList spec_vars = {}; - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - {1u, 1u, 1u}, - {1u, 1u, 1u}, - // Inputs and Outputs - {{concat_offset, vkapi::kWrite}}, - // Parameter buffers - param_buffers, - // Push Constants - {}, - // Specialization Constants - spec_vars, - // Resize Args - {}, - // Resizing Logic - nullptr)); - } - } -} - -void cat_tensor(ComputeGraph& graph, const std::vector& args) { - // Extract arguments - const ValueRef tensors_ref = args.at(0); - const 
ValueRef dim_ref = args.at(1); - const ValueRef out = args.at(2); - - // Add concat node - add_concat_node(graph, tensors_ref, dim_ref, out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.cat.default, cat_tensor); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp deleted file mode 100644 index ded1defe973..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp +++ /dev/null @@ -1,787 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -#include - -#include -#include - -#include - -namespace vkcompute { - -enum class Conv2dMethod : uint8_t { - Depthwise, - Pointwise, - SlidingWindow, - Transposed, -}; - -void resize_conv2d_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef self = args.at(1).refs.at(0); - - size_t ndim = graph->dim_of(self); - std::vector new_out_sizes(ndim); - const bool transposed = graph->get_bool(extra_args.at(4)); - - std::vector self_sizes = graph->sizes_of(self); - // Batch, Channel - if (ndim == 4) { - new_out_sizes.at(ndim - 4) = self_sizes.at(ndim - 4); - } - - TensorRefPtr weight_ref = graph->get_tref(extra_args.at(0)); - const auto& weight_sizes = weight_ref->sizes; - new_out_sizes.at(ndim - 3) = - transposed ? weight_sizes.at(ndim - 3) : weight_sizes.at(ndim - 4); - - // Height, Width - const auto& new_out_sizes_hw = calc_out_sizes_hw( - *graph, - self_sizes, - extra_args.at(0), - /*kernel_size_only = */ false, - {extra_args.at(1), extra_args.at(2), extra_args.at(3), extra_args.at(5)}, - transposed); - new_out_sizes.at(ndim - 2) = new_out_sizes_hw.at(0); - new_out_sizes.at(ndim - 1) = new_out_sizes_hw.at(1); - - graph->virtual_resize(out, new_out_sizes); -} - -void resize_conv1d_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef self = args.at(1).refs.at(0); - TensorRefPtr weight_ref = graph->get_tref(extra_args.at(0)); - - const int64_t stride_size = graph->get_int_list(extra_args.at(1))->at(0); - const int64_t padding_size = graph->get_int_list(extra_args.at(2))->at(0); - const int64_t dilation_size = graph->get_int_list(extra_args.at(3))->at(0); - - const std::vector& weight_sizes = weight_ref->sizes; - - const std::vector in_sizes = graph->sizes_of(self); - const size_t ndim = in_sizes.size(); - std::vector new_out_sizes(ndim); - - const int64_t kernel_size = weight_sizes.at(2); - const int64_t in_length = in_sizes.at(2); - - new_out_sizes.at(0) = in_sizes.at(0); - new_out_sizes.at(1) = weight_sizes.at(0); - new_out_sizes.at(2) = calc_out_size( - in_length, kernel_size, stride_size, padding_size, dilation_size, false); - - graph->virtual_resize(out, new_out_sizes); -} - -ValueRef prepack_biases( - ComputeGraph& graph, - const ValueRef vref, - const ValueRef weight, - const bool transposed, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout memory_layout) { - auto sizes = graph.sizes_of(weight); - const int64_t out_channels = transposed ? 
sizes.at(1) : sizes.at(0); - - ValueRef v = graph.add_tensor( - {out_channels}, graph.dtype_of(weight), storage_type, memory_layout); - - vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(graph, v); - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - shader, - graph.create_global_wg_size(v), - graph.create_local_wg_size(v), - vref, - v, - {}, - // Specialization constants - {graph.hashed_layout_of(v)}, - {graph.sizes_pc_of(v)})); - - return v; -} - -vkapi::ShaderInfo get_conv2d_shader( - ComputeGraph& graph, - const ValueRef out, - const bool prepack_weights, - const Conv2dMethod method, - const ValueRef weight, - const bool clamp_out = false, - const bool stride_equals_dilation = false, - const bool stride_1_padding_0 = false) { - std::string kernel_name; - kernel_name.reserve(kShaderNameReserve); - switch (method) { - case Conv2dMethod::Depthwise: - kernel_name = "conv2d_dw"; - if (!prepack_weights) { - if (!stride_equals_dilation) { - kernel_name += "_sned"; - } - const auto& weight_sizes = graph.get_tref(weight)->sizes; - if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) { - kernel_name += "_output_tile_3x3"; - } - if (weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5) { - kernel_name += "_output_tile_5x5"; - } - } - break; - case Conv2dMethod::Pointwise: - if (prepack_weights) { - kernel_name = "conv2d"; - } else { - kernel_name = stride_1_padding_0 ? "conv2d_pw_s1p0" : "conv2d_pw"; - } - break; - case Conv2dMethod::SlidingWindow: - kernel_name = "conv2d"; - break; - case Conv2dMethod::Transposed: - kernel_name = "conv_transpose2d"; - break; - } - if (prepack_weights) { - kernel_name += "_prepack_weights"; - } else if (clamp_out) { - kernel_name += "_clamp"; - } - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -std::vector get_final_sizes( - const std::vector& original_sizes, - const Conv2dMethod method) { - int64_t batch_padded = utils::align_up_4(utils::val_at(-4, original_sizes)); - int64_t channels_padded = - utils::align_up_4(utils::val_at(-3, original_sizes)); - int64_t height = utils::val_at(-2, original_sizes); - int64_t width = utils::val_at(-1, original_sizes); - - switch (method) { - case Conv2dMethod::Depthwise: - return std::vector{4, batch_padded / 4, height * width}; - case Conv2dMethod::Pointwise: - case Conv2dMethod::SlidingWindow: - return std::vector{ - 4, batch_padded * height / 4, channels_padded * width}; - case Conv2dMethod::Transposed: - return std::vector{ - 4, channels_padded * height / 4, batch_padded * width}; - } -} - -ValueRef prepack_weights( - ComputeGraph& graph, - const ValueRef vref, - const Conv2dMethod method) { - const auto original_sizes = graph.sizes_of(vref); - const auto final_sizes = get_final_sizes(original_sizes, method); - - ValueRef v = graph.add_tensor( - final_sizes, - graph.dtype_of(vref), - utils::kTexture2D, - utils::kChannelsPacked); - - vkapi::ShaderInfo shader = - get_conv2d_shader(graph, v, /*prepack_weights = */ true, method, vref); - - const auto original_sizes_pc = - utils::make_ivec4(original_sizes, /*reverse = */ true); - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - shader, - graph.create_global_wg_size(v), - graph.create_local_wg_size(v), - vref, - v, - {}, - // Specialization constants - {graph.packed_dim_of(v)}, - {graph.sizes_pc_of(v), - PushConstantDataInfo(&original_sizes_pc, sizeof(original_sizes_pc))})); - - return v; -} - -void check_conv_args( - ComputeGraph& graph, - const ValueRef in, - const ValueRef out) { - 
VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kChannelsDim); - VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kChannelsDim); -} - -struct Conv2dParams final { - utils::ivec2 overlay_region; - int in_group_size; -}; - -struct OutputParams final { - float out_min; - float out_max; -}; - -Conv2dParams create_conv2d_params( - ComputeGraph& graph, - const ValueRef weight, - const Kernel2dParams& p, - const bool transposed) { - const auto& overlay_region = utils::make_ivec2({ - p.kernel_size[0] + (p.kernel_size[0] - 1) * (p.dilation[0] - 1), - p.kernel_size[1] + (p.kernel_size[1] - 1) * (p.dilation[1] - 1), - }); - const auto weight_sizes = graph.sizes_of(weight); - const int32_t in_group_size = utils::safe_downcast( - utils::align_up_4(transposed ? weight_sizes.at(0) : weight_sizes.at(1))); - return {overlay_region, in_group_size}; -} - -void check_conv2d_params(const Kernel2dParams& p, const bool transposed) { - if (transposed) { - if (p.dilation[0] > 1 || p.dilation[1] > 1) { - VK_THROW( - "aten.convolution.default: transposed = true, dilation > 1 is not supported yet!"); - } - } -} - -Conv2dMethod get_conv2d_method( - ComputeGraph& graph, - const ValueRef weight, - const int64_t groups, - const bool transposed) { - const auto weight_sizes = graph.sizes_of(weight); - if (!transposed && weight_sizes.at(0) == groups && weight_sizes.at(1) == 1) { - return Conv2dMethod::Depthwise; - } - if (transposed) { - return Conv2dMethod::Transposed; - } - if (weight_sizes.at(2) == 1 && weight_sizes.at(3) == 1) { - return Conv2dMethod::Pointwise; - } - return Conv2dMethod::SlidingWindow; -} - -utils::uvec2 get_conv2d_dw_dispatch_divisor( - const std::vector& weight_sizes) { - if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) { - return {4u, 2u}; - } - if (weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5) { - return {4u, 2u}; - } - return {4u, 2u}; -} - -utils::uvec3 create_conv2d_global_wg_size( - ComputeGraph& graph, - const Conv2dMethod method, - const ValueRef out, - const ValueRef weight_data, - const bool stride_equals_dilation) { - if (method == Conv2dMethod::Pointwise) { - const utils::uvec3 image_extents = graph.logical_limits_of(out); - return { - utils::div_up(image_extents[0u], 1u), - utils::div_up(image_extents[1u], 4u), - image_extents[2u]}; - } else if (method == Conv2dMethod::Depthwise && stride_equals_dilation) { - const utils::uvec3 image_extents = graph.create_global_wg_size(out); - const utils::uvec2 div = - get_conv2d_dw_dispatch_divisor(graph.get_tref(weight_data)->sizes); - return { - utils::div_up(image_extents[0], div[0]), - utils::div_up(image_extents[1], div[1]), - image_extents[2]}; - } else { - return graph.create_global_wg_size(out); - } -} - -// Custom global workgroup size function for conv2d -utils::uvec3 conv2d_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef weight_data = resize_args.at(0); - - // Determine method from shader name - Conv2dMethod method; - if (shader.kernel_name.find("conv2d_dw") != std::string::npos) { - method = Conv2dMethod::Depthwise; - } else if ( - shader.kernel_name.find("conv2d_pw") != std::string::npos || - (shader.kernel_name.find("conv2d") != std::string::npos && - shader.kernel_name.find("conv_transpose2d") == std::string::npos)) { - // Check if it's pointwise by examining weight sizes - const auto& weight_sizes = graph->get_tref(weight_data)->sizes; - if (weight_sizes.at(2) == 1 && 
weight_sizes.at(3) == 1) { - method = Conv2dMethod::Pointwise; - } else { - method = Conv2dMethod::SlidingWindow; - } - } else if (shader.kernel_name.find("conv_transpose2d") != std::string::npos) { - method = Conv2dMethod::Transposed; - } else { - method = Conv2dMethod::SlidingWindow; - } - - // Determine stride_equals_dilation from shader name - bool stride_equals_dilation = - shader.kernel_name.find("_sned") == std::string::npos; - - utils::uvec3 wg_size = create_conv2d_global_wg_size( - *graph, method, out, weight_data, stride_equals_dilation); - - if (method == Conv2dMethod::Depthwise || method == Conv2dMethod::Pointwise) { - wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1}; - } - - return wg_size; -} - -// Custom local workgroup size function for conv2d -utils::uvec3 conv2d_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)args; - (void)resize_args; - - // Determine method from shader name - Conv2dMethod method; - if (shader.kernel_name.find("conv2d_dw") != std::string::npos) { - method = Conv2dMethod::Depthwise; - } else if ( - shader.kernel_name.find("conv2d_pw") != std::string::npos || - (shader.kernel_name.find("conv2d") != std::string::npos && - shader.kernel_name.find("conv_transpose2d") == std::string::npos)) { - method = Conv2dMethod::Pointwise; - } else { - method = Conv2dMethod::SlidingWindow; - } - - if (method == Conv2dMethod::Pointwise) { - uint32_t local_wg_size_y = 1; - if (global_workgroup_size[1] % 8 == 0) { - local_wg_size_y = 8; - } else if (global_workgroup_size[1] % 4 == 0) { - local_wg_size_y = 4; - } else if (global_workgroup_size[1] % 2 == 0) { - local_wg_size_y = 2; - } - return {64 / local_wg_size_y, local_wg_size_y, 1}; - } else if (method == Conv2dMethod::Depthwise) { - return {64, 1, 1}; - } else { - return graph->create_local_wg_size(global_workgroup_size); - } -} - -// Custom global workgroup size function for conv1d -utils::uvec3 conv1d_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - - return {// out length - graph->size_at(-1, out), - // out channels - static_cast(graph->size_at(-2, out)), - // out batches - utils::div_up_4(graph->size_at(-3, out))}; -} - -void add_conv2d_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef weight_data, - const ValueRef bias, - const ValueRef stride, - const ValueRef padding, - const ValueRef dilation, - const ValueRef transposed, - const ValueRef output_padding, - const ValueRef groups, - const ValueRef out_min, - const ValueRef out_max, - const ValueRef out, - const bool clamp_out) { - const bool transposed_val = graph.get_bool(transposed); - - float out_min_val = 0.0f; - float out_max_val = 0.0f; - if (out_min != kDummyValueRef) { - out_min_val = graph.extract_scalar(out_min); - } - if (out_max != kDummyValueRef) { - out_max_val = graph.extract_scalar(out_max); - } - - const int64_t groups_val = graph.get_int(groups); - - const Conv2dMethod method = - get_conv2d_method(graph, weight_data, groups_val, transposed_val); - - ValueRef arg_weight = prepack_weights(graph, weight_data, method); - ValueRef arg_bias = prepack_biases( - graph, - bias, - weight_data, - transposed_val, - /* storage_type = */ utils::kTexture2D, - /* memory_layout = */ utils::kWidthPacked); - - const std::vector in_sizes = 
graph.sizes_of(in); - if (in_sizes.at(0) > 1) { - VK_THROW("conv2d: input batch size > 1 is not supported yet!"); - } - - check_conv_args(graph, in, out); - - Kernel2dParams kernel_params = create_kernel2d_params( - graph, - weight_data, - /*kernel_size_only = */ false, - stride, - padding, - dilation); - Conv2dParams extra_params = - create_conv2d_params(graph, weight_data, kernel_params, transposed_val); - - const bool stride_equals_dilation = - (kernel_params.stride[0] == kernel_params.dilation[0] && - kernel_params.stride[1] == kernel_params.dilation[1]); - - const bool stride_1_padding_0 = - (kernel_params.stride[0] == 1 && kernel_params.stride[1] == 1 && - kernel_params.padding[0] == 0 && kernel_params.padding[1] == 0); - - OutputParams out_params = {out_min_val, out_max_val}; - - check_conv2d_params(kernel_params, transposed_val); - - vkapi::ShaderInfo shader = get_conv2d_shader( - graph, - out, - /*prepack_weights = */ false, - method, - weight_data, - clamp_out, - stride_equals_dilation, - stride_1_padding_0); - - utils::uvec3 wg_size = create_conv2d_global_wg_size( - graph, method, out, weight_data, stride_equals_dilation); - - utils::uvec3 local_wg_size; - if (method == Conv2dMethod::Depthwise || method == Conv2dMethod::Pointwise) { - wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1}; - } - - if (method == Conv2dMethod::Pointwise) { - uint32_t local_wg_size_y = 1; - if (wg_size[1] % 8 == 0) { - local_wg_size_y = 8; - } else if (wg_size[1] % 4 == 0) { - local_wg_size_y = 4; - } else if (wg_size[1] % 2 == 0) { - local_wg_size_y = 2; - } - local_wg_size = {64 / local_wg_size_y, local_wg_size_y, 1}; - } else if (method == Conv2dMethod::Depthwise) { - local_wg_size = {64, 1, 1}; - } else { - local_wg_size = graph.create_local_wg_size(wg_size); - } - - vkapi::ParamsBindList param_buffers; - std::vector push_constants; - if (method == Conv2dMethod::Pointwise) { - const utils::ivec4 kernel_param_stride_pad = { - kernel_params.stride[0], - kernel_params.stride[1], - kernel_params.padding[0], - kernel_params.padding[1], - }; - - struct Conv2dPWParams final { - int in_group_size; - int dummy_padding; - OutputParams out_params; - } param{extra_params.in_group_size, 0, out_params}; - - push_constants = { - graph.logical_limits_pc_of(out), - PushConstantDataInfo( - &kernel_param_stride_pad, sizeof(kernel_param_stride_pad)), - PushConstantDataInfo(¶m, sizeof(param)), - }; - } else if (method == Conv2dMethod::Depthwise) { - const utils::ivec4 kernel_param_size_stride = { - kernel_params.kernel_size[0], - kernel_params.kernel_size[1], - kernel_params.stride[0], - kernel_params.stride[1]}; - - const utils::ivec4 kernel_param_pad_dial = { - kernel_params.padding[0], - kernel_params.padding[1], - kernel_params.dilation[0], - kernel_params.dilation[1]}; - - push_constants = { - graph.logical_limits_pc_of(out), - graph.sizes_pc_of(in), - PushConstantDataInfo( - &kernel_param_size_stride, sizeof(kernel_param_size_stride)), - PushConstantDataInfo( - &kernel_param_pad_dial, sizeof(kernel_param_pad_dial)), - PushConstantDataInfo( - &extra_params, sizeof(extra_params), sizeof(utils::ivec4)), - PushConstantDataInfo(&out_params, sizeof(out_params)), - }; - } else { - param_buffers = { - graph.logical_limits_ubo(out), - graph.sizes_ubo(in), - graph.create_params_buffer(kernel_params), - graph.create_params_buffer(extra_params), - graph.create_params_buffer(out_params), - }; - } - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - shader, - conv2d_global_wg_size, - conv2d_local_wg_size, 
- // Inputs and Outputs - {{out, vkapi::kWrite}, {{in, arg_weight, arg_bias}, vkapi::kRead}}, - // Shader params buffers - param_buffers, - // Push Constants - push_constants, - // Specialization Constants - {utils::safe_downcast(groups_val)}, - // Resize Args - {weight_data, stride, padding, dilation, transposed, output_padding}, - // Resizing Logic - resize_conv2d_node)); -} - -void add_conv1d_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef weight, - const ValueRef bias, - const ValueRef stride, - const ValueRef padding, - const ValueRef dilation, - const ValueRef groups, - const ValueRef out_min, - const ValueRef out_max, - const ValueRef out, - const bool clamp_out) { - ValueRef arg_weight = prepack_standard( - graph, - weight, - graph.storage_type_of(out), - utils::kChannelsPacked, - /* passthrough = */ false, - utils::kOptimizedAxisMap); - ValueRef arg_bias = prepack_biases( - graph, - bias, - weight, - /*transposed = */ false, - /*storage_type = */ utils::kTexture3D, - /*memory_layout = */ utils::kWidthPacked); - - float out_min_val = 0.0f; - float out_max_val = 0.0f; - if (out_min != kDummyValueRef) { - out_min_val = graph.extract_scalar(out_min); - } - if (out_max != kDummyValueRef) { - out_max_val = graph.extract_scalar(out_max); - } - - const int64_t groups_val = graph.get_int(groups); - - const std::vector in_sizes = graph.sizes_of(in); - const std::vector weight_sizes = graph.sizes_of(arg_weight); - const std::vector out_sizes = graph.sizes_of(out); - - check_conv_args(graph, in, out); - - const int32_t in_channels = in_sizes.at(1); - const int32_t out_channels = weight_sizes.at(0); - const int32_t kernel_size = weight_sizes.at(2); - const int32_t stride_size = graph.get_int_list(stride)->at(0); - const int32_t padding_size = graph.get_int_list(padding)->at(0); - const int32_t dilation_size = graph.get_int_list(dilation)->at(0); - const int32_t in_group_size = static_cast(in_channels / groups_val); - const int32_t out_group_size = - static_cast(out_channels / groups_val); - - Kernel1dParams kernel_params = { - kernel_size, - stride_size, - padding_size, - dilation_size, - in_group_size, - out_group_size}; - - const OutputParams out_params = {out_min_val, out_max_val}; - - std::string kernel_name("conv1d"); - if (clamp_out) { - kernel_name += "_clamp"; - } - kernel_name.reserve(kShaderNameReserve); - - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - conv1d_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{in, arg_weight, arg_bias}, vkapi::kRead}}, - // Shader params buffers - { - graph.logical_limits_ubo(out), - graph.sizes_ubo(in), - graph.create_params_buffer(kernel_params), - graph.create_params_buffer(out_params), - }, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - graph.hashed_layout_of(arg_weight), - graph.hashed_layout_of(arg_bias)}, - // Resize Args - {weight, stride, padding, dilation}, - // Resizing Logic - resize_conv1d_node)); -} - -void conv(ComputeGraph& graph, const std::vector& args) { - int64_t in_ndim = graph.dim_of(args[0]); - if (in_ndim == 4) { - if (args.size() == 10) { - // ordinary conv2d - return add_conv2d_node( - graph, - args[0], - args[1], - args[2], - args[3], - args[4], - args[5], - args[6], - args[7], - args[8], - /*out_min = */ kDummyValueRef, - /*out_max = */ kDummyValueRef, - args[9], - false); 
- } else { - // conv2d with clamp - return add_conv2d_node( - graph, - args[0], - args[1], - args[2], - args[3], - args[4], - args[5], - args[6], - args[7], - args[8], - args[9], - args[10], - args[11], - true); - } - } else { - if (args.size() == 10) { - // ordinary conv1d - return add_conv1d_node( - graph, - args[0], - args[1], - args[2], - args[3], - args[4], - args[5], - args[8], - /*out_min = */ kDummyValueRef, - /*out_max = */ kDummyValueRef, - args[9], - false); - } else { - // conv1d with clamp - return add_conv1d_node( - graph, - args[0], - args[1], - args[2], - args[3], - args[4], - args[5], - args[8], - args[9], - args[10], - args[11], - true); - } - } -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.convolution.default, conv); - VK_REGISTER_OP(conv_with_clamp.default, conv); - VK_REGISTER_OP(et_vk.conv_with_clamp.default, conv); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp b/backends/vulkan/runtime/graph/ops/impl/Copy.cpp deleted file mode 100644 index bd648dbae2d..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.cpp +++ /dev/null @@ -1,317 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include -#include -#include -#include - -namespace vkcompute { - -using utils::ivec3; -using utils::ivec4; -using utils::uvec3; - -void add_copy_offset_node( - ComputeGraph& graph, - const ValueRef in, - const ivec3& range, - const ivec4& src_offset, - const ivec4& dst_offset, - const ValueRef out, - bool calc_out_pos_using_src_chnl, - bool calc_in_pos_using_dst_chnl) { - std::string kernel_name = "copy_offset"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - - auto shader = VK_KERNEL_FROM_STR(kernel_name); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - { - {out, vkapi::kWrite}, - {in, vkapi::kRead}, - }, - // Parameter buffers - {}, - // Push Constants - { - PushConstantDataInfo(&range, sizeof(range), sizeof(ivec4)), - PushConstantDataInfo(&src_offset, sizeof(src_offset), sizeof(ivec4)), - PushConstantDataInfo(&dst_offset, sizeof(dst_offset), sizeof(ivec4)), - }, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - (calc_out_pos_using_src_chnl ? 1 - : calc_in_pos_using_dst_chnl ? 2 - : 0)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void add_copy_packed_dim_offset_node( - ComputeGraph& graph, - const ValueRef in, - const ivec3& range, - const ivec4& src_offset, - const ivec4& dst_offset, - const ValueRef out) { - // Check the packed dimension is same for both tensors, also check if the - // packed dimension is Width or Height. Since the function does not support - // channel packing. 
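// Illustration (hypothetical layouts): copying between two width-packed or
// height-packed textures is handled by this function, whereas channels-packed
// tensors are expected to go through add_copy_channel_offset_node below.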
- VK_CHECK_COND( - graph.packed_dim_of(in) == graph.packed_dim_of(out) && - (graph.packed_dim_of(in) == WHCN::kWidthDim || - graph.packed_dim_of(in) == WHCN::kHeightDim)); - - std::string kernel_name = "copy_packed_dim_offset"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - const std::vector in_sizes = graph.sizes_of(in); - const std::vector out_sizes = graph.sizes_of(out); - - // A copy of range with the last element set to batch size of the input tensor - ivec4 final_range = { - range[0], range[1], range[2], dim_at(in_sizes, kBatch4D)}; - ivec3 global_wg_size = graph.logical_limits_of(out); - - const auto packed_dim = graph.packed_dim_of(in); - // The starting offset in a texel where this tensor will start copying from - const auto src_lane_offset = src_offset[packed_dim] & 0x3; - // The starting offset in a texel where this tensor will start copying to - const auto dst_lane_offset = dst_offset[packed_dim] & 0x3; - - // The total packed texels this tensor will be copied from - // The first texel of tensor data in packed dimension will be copied from - // remaining lanes from current source Hence (4 - src_lane_offset) is added - // to tensor size in packed dimension - const auto src_packed_size = utils::div_up_4( - (4 - src_lane_offset) + utils::val_at(-packed_dim, out_sizes)); - - // The total packed texels this tensor will be copied to - // The first texel of tensor data in packed dimension will be copied to - // remaining lanes from previous write Hence (4 - dst_lane_offset) is added - // to tensor size in packed dimension - const auto dst_packed_size = utils::div_up_4( - (4 - dst_lane_offset) + utils::val_at(-packed_dim, in_sizes)); - - // If the starting src offset is not 0, and the total packed texels is - // greater than the source texel range - const bool has_additional_src_work = - src_lane_offset != 0 && src_packed_size > final_range[packed_dim]; - // If the starting dst offset is not 0, and the total packed texels is - // greater than the source texel range - const bool has_additional_dst_work = - dst_lane_offset != 0 && dst_packed_size > final_range[packed_dim]; - - if (has_additional_src_work || has_additional_dst_work) { - global_wg_size[packed_dim]++; // Increase the global work group size in - // packed dimension - final_range[packed_dim]++; // Increase the range in packed dimension - } - - auto shader = VK_KERNEL_FROM_STR(kernel_name); - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - graph.create_local_wg_size(global_wg_size), - // Inputs and Outputs - { - {out, vkapi::kWrite}, - {out, vkapi::kRead}, - {in, vkapi::kRead}, - }, - // Parameter buffers - {}, - // Push Constants - { - PushConstantDataInfo( - &final_range, sizeof(final_range), sizeof(ivec4)), - PushConstantDataInfo(&src_offset, sizeof(src_offset), sizeof(ivec4)), - PushConstantDataInfo(&dst_offset, sizeof(dst_offset), sizeof(ivec4)), - }, - // Specialization Constants - {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void add_copy_channel_offset_node( - ComputeGraph& graph, - const ValueRef in, - int32_t channel_range, - int32_t src_channel_offset, - int32_t dst_channel_offset, - const ValueRef out) { - // Likely need to prepad these numbers. 
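The lane-offset logic above (the & 0x3 masks, div_up_4, and the has_additional_*_work checks) decides whether the copy spills into one extra texel along the packed dimension. The following standalone sketch models that bookkeeping for one side of the copy; the values in main are invented.

```cpp
#include <cstdint>
#include <iostream>

// div_up_4: number of 4-wide texels needed to hold n elements.
inline int64_t div_up_4(int64_t n) { return (n + 3) / 4; }

// Models the lane-offset bookkeeping in add_copy_packed_dim_offset_node: a
// copy that starts at `offset` elements along the packed dimension begins
// partway through a texel, so the first texel only contributes
// (4 - lane_offset) usable lanes.
struct PackedCopyExtent {
  int64_t lane_offset;     // offset within the first texel (0..3)
  int64_t packed_texels;   // texels touched once the partial first texel counts
  bool needs_extra_slice;  // true if the copy spills past `range_texels`
};

PackedCopyExtent packed_copy_extent(int64_t offset, int64_t num_elements,
                                    int64_t range_texels) {
  PackedCopyExtent e;
  e.lane_offset = offset & 0x3;
  e.packed_texels = div_up_4((4 - e.lane_offset) + num_elements);
  e.needs_extra_slice =
      e.lane_offset != 0 && e.packed_texels > range_texels;
  return e;
}

int main() {
  // Copying 12 elements starting at element 3 of a width-packed tensor.
  PackedCopyExtent e = packed_copy_extent(/*offset=*/3, /*num_elements=*/12,
                                          /*range_texels=*/3);
  std::cout << e.lane_offset << " " << e.packed_texels << " "
            << e.needs_extra_slice << "\n";  // 3 4 1
  return 0;
}
```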
- const std::vector in_sizes = graph.sizes_of(in); - const std::vector out_sizes = graph.sizes_of(out); - - VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kChannelsDim); - VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kChannelsDim); - - // NOTE: This function should be able to support 1d and 2d tensors when - // range=1, src_offset=dst_offset=1. - VK_CHECK_COND(graph.dim_of(in) >= 3, "Src dim should be at least 3"); - VK_CHECK_COND(graph.dim_of(out) >= 3, "Dst dim should be at least 3"); - - VK_CHECK_COND( - dim_at(in_sizes) >= src_channel_offset + channel_range, - "Src channel (", - src_channel_offset, - ") and range (", - channel_range, - ") should be less than or equal to input tensor's channel size (", - dim_at(in_sizes), - ")"); - - VK_CHECK_COND( - dim_at(out_sizes) >= dst_channel_offset + channel_range, - "Dst channel (", - dst_channel_offset, - ") and range (", - channel_range, - ") should be less than or equal to input tensor's channel size (", - dim_at(out_sizes), - ")"); - - VK_CHECK_COND(channel_range >= 0, "Channel range must be non-negative"); - VK_CHECK_COND( - src_channel_offset >= 0, "Src channel offset must be non-negative"); - VK_CHECK_COND( - dst_channel_offset >= 0, "Dst channel offset must be non-negative"); - - std::string kernel_name = "copy_channel_offset"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - int32_t out_channels = dim_at(out_sizes); - - // Copy one batch at a time. - for (int batch_idx = 0; batch_idx < dim_at(in_sizes); batch_idx++) { - // Mapping the tensor NCHW coordinates into texture XYZ coordinates - int32_t dst_first_z = dst_channel_offset / 4; - int32_t dst_last_z = (dst_channel_offset + channel_range - 1) / 4; - - // We copy the entire width and height dimension. For the channel dimension, - // we use the z-dimension of the global_size to specify the texture range. - // The shader combines the global invocation id and the dst_offset to get - // the actual coordinate. 
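The channel-offset copy above folds NCHW channels into 4-wide texels, so a channel slice maps to a contiguous z-range of the texture and each batch is shifted by div_up_4(out_channels) slices. A minimal model of that mapping, with invented example sizes:

```cpp
#include <cstdint>
#include <iostream>

inline int64_t div_up_4(int64_t n) { return (n + 3) / 4; }

// Models how add_copy_channel_offset_node maps an NCHW channel slice onto the
// z-extent of a channel-packed texture: channels are folded into texels of 4,
// and each batch occupies div_up_4(C) consecutive z-slices.
struct ZRange {
  int64_t first_z;   // first texture z-slice touched within the batch
  int64_t extent_z;  // number of z-slices the channel range spans
};

ZRange channel_range_to_z(int64_t channel_offset, int64_t channel_range) {
  const int64_t first_z = channel_offset / 4;
  const int64_t last_z = (channel_offset + channel_range - 1) / 4;
  return {first_z, last_z - first_z + 1};
}

int64_t batch_z_offset(int64_t batch_idx, int64_t out_channels,
                       int64_t first_z) {
  return first_z + batch_idx * div_up_4(out_channels);
}

int main() {
  // Copy channels [6, 6 + 5) of a tensor with 16 output channels, batch 1.
  ZRange z = channel_range_to_z(/*channel_offset=*/6, /*channel_range=*/5);
  std::cout << z.first_z << " " << z.extent_z << " "
            << batch_z_offset(/*batch_idx=*/1, /*out_channels=*/16, z.first_z)
            << "\n";  // 1 2 5
  return 0;
}
```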
- - const ivec3 dst_offset{ - 0, 0, dst_first_z + batch_idx * utils::div_up_4(out_channels)}; - - const uvec3 global_size{ - utils::safe_downcast(dim_at(in_sizes)), - utils::safe_downcast(dim_at(in_sizes)), - utils::safe_downcast(dst_last_z - dst_first_z + 1)}; - const uvec3 local_size = graph.create_local_wg_size(global_size); - - const utils::ivec4 range_params = { - static_cast(global_size[0]), - static_cast(global_size[1]), - static_cast(global_size[2]), - channel_range}; - - const ivec4 offset_params = { - dst_offset[0], dst_offset[1], dst_offset[2], dst_channel_offset}; - - auto shader = VK_KERNEL_FROM_STR(kernel_name); - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_size, - local_size, - // Inputs and Outputs - { - {out, vkapi::kWrite}, - {out, vkapi::kRead}, - {in, vkapi::kRead}, - }, - // Parameter buffers - {}, - // Push Constants - {graph.sizes_pc_of(out), - graph.sizes_pc_of(in), - PushConstantDataInfo(&range_params, sizeof(range_params)), - PushConstantDataInfo(&offset_params, sizeof(offset_params)), - PushConstantDataInfo(&src_channel_offset, sizeof(src_channel_offset))}, - // Specialization Constants - {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); - } -} - -void add_copy_offset_node( - ComputeGraph& graph, - ValueRef in, - ValueRef range_ref, - ValueRef src_offset_ref, - ValueRef dst_offset_ref, - ValueRef out) { - ivec3 range = utils::make_ivec3(*graph.get_int_list(range_ref)); - ivec3 src = utils::make_ivec3(*graph.get_int_list(src_offset_ref)); - ivec3 dst = utils::make_ivec3(*graph.get_int_list(dst_offset_ref)); - - ivec4 src_offset = {src[0], src[1], src[2], 0}; - ivec4 dst_offset = {dst[0], dst[1], dst[2], 0}; - - add_copy_offset_node( - graph, in, range, src_offset, dst_offset, out, false, false); -} - -void copy_offset(ComputeGraph& graph, const std::vector& args) { - add_copy_offset_node(graph, args[0], args[1], args[2], args[3], args[4]); -} - -void copy_channel_offset( - ComputeGraph& graph, - const std::vector& args) { - ValueRef in = args[0]; - ValueRef channel_range_ref = args[1]; - ValueRef src_channel_offset_ref = args[2]; - ValueRef dst_channel_offset_ref = args[3]; - ValueRef out = args[4]; - - auto channel_range = graph.extract_scalar(channel_range_ref); - auto src_channel_offset = - graph.extract_scalar(src_channel_offset_ref); - auto dst_channel_offset = - graph.extract_scalar(dst_channel_offset_ref); - - add_copy_channel_offset_node( - graph, in, channel_range, src_channel_offset, dst_channel_offset, out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(etvk.copy_offset, copy_offset); - VK_REGISTER_OP(etvk.copy_channel_offset, copy_channel_offset); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Copy.h b/backends/vulkan/runtime/graph/ops/impl/Copy.h deleted file mode 100644 index 41956d482d9..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Copy.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -namespace vkcompute { - -// add_copy_offset_node resumes the vkCmdCopyImage command. 
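As a mental model of what the dispatches above implement, the channel-offset copy is equivalent to the plain NCHW slice assignment out[:, dst:dst+range, :, :] = in[:, src:src+range, :, :]. The CPU reference below captures only those semantics (dense float buffers, no texture packing); it is a sketch, not the delegate's code.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Plain NCHW reference for the channel-offset copy:
//   out[:, dst : dst + range, :, :] = in[:, src : src + range, :, :]
// Both tensors are dense float buffers with shape {N, C, H, W}.
void copy_channel_offset_reference(
    const std::vector<float>& in, std::vector<float>& out,
    int64_t N, int64_t C_in, int64_t C_out, int64_t H, int64_t W,
    int64_t channel_range, int64_t src_offset, int64_t dst_offset) {
  assert(src_offset + channel_range <= C_in);
  assert(dst_offset + channel_range <= C_out);
  for (int64_t n = 0; n < N; ++n) {
    for (int64_t c = 0; c < channel_range; ++c) {
      for (int64_t h = 0; h < H; ++h) {
        for (int64_t w = 0; w < W; ++w) {
          const int64_t src_idx = ((n * C_in + src_offset + c) * H + h) * W + w;
          const int64_t dst_idx = ((n * C_out + dst_offset + c) * H + h) * W + w;
          out[dst_idx] = in[src_idx];
        }
      }
    }
  }
}

int main() {
  // in has shape {1, 3, 1, 2}; copy channels [1, 3) into channels [0, 2) of
  // out with shape {1, 2, 1, 2}.
  std::vector<float> in = {0, 1, 10, 11, 20, 21};
  std::vector<float> out(4, -1.f);
  copy_channel_offset_reference(in, out, 1, 3, 2, 1, 2,
                                /*channel_range=*/2, /*src=*/1, /*dst=*/0);
  // out == {10, 11, 20, 21}
  return 0;
}
```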
It copies the
-// texture extents specified by the range, src_offset, and dst_offset (all
-// in texture coordinates (x, y, z)) from the input image to the output image.
-// src_offset.w and dst_offset.w may contain channel size information.
-//
-// It is possible for the input and output to point to the same image
-// object, but when the source range and destination range overlap, the
-// behavior is undefined.
-//
-// The boolean flags calc_out_pos_using_src_chnl and calc_in_pos_using_dst_chnl
-// can be used to specify an indexing function in the shader.
-// If calc_out_pos_using_src_chnl is set to true, the channel and batch index
-// will be calculated based on the source channel size and used to determine
-// the destination texel position.
-//
-// If calc_in_pos_using_dst_chnl is set to true, the channel and batch index
-// will be calculated based on the destination channel size and used to
-// determine the source texel position.
-//
-// If both are true, calc_out_pos_using_src_chnl is picked. If both are false,
-// no index calculation happens.
-void add_copy_offset_node(
-    ComputeGraph& graph,
-    const ValueRef in,
-    const utils::ivec3& range,
-    const utils::ivec4& src_offset,
-    const utils::ivec4& dst_offset,
-    const ValueRef out,
-    bool calc_out_pos_using_src_chnl,
-    bool calc_in_pos_using_dst_chnl);
-
-// add_copy_packed_dim_offset_node behaves similarly to add_copy_offset_node,
-// except that it is used when copying along the packed dimension of a width-
-// or height-packed tensor.
-// src_offset.w and dst_offset.w may contain channel size information.
-//
-// It copies the texture extents specified by the range, src_offset, and
-// dst_offset (all in texture coordinates (x, y, z)) from the input image to
-// the output image.
-void add_copy_packed_dim_offset_node(
-    ComputeGraph& graph,
-    const ValueRef in,
-    const utils::ivec3& range,
-    const utils::ivec4& src_offset,
-    const utils::ivec4& dst_offset,
-    const ValueRef out);
-
-// add_copy_channel_offset_node behaves similarly to add_copy_offset_node,
-// except that it works on the channel dimension of the tensor (up to 4
-// dimensions in NCHW). The range and offset arguments are in tensor
-// coordinates. It assumes the underlying texture is channel-packed.
-//
-// This function is a specialized implementation for copying channel-packed
-// values. The complication is that when reading / writing the channel
-// dimension at indices that are not aligned to the packing, we need to be
-// careful about the boundaries.
-//
-// It achieves the following:
-// out[:, dst_channel_offset:dst_channel_offset + channel_range, :, :] =
-//   in[:, src_channel_offset:src_channel_offset + channel_range, :, :]
-void add_copy_channel_offset_node(
-    ComputeGraph& graph,
-    const ValueRef in,
-    int32_t channel_range,
-    int32_t src_channel_offset,
-    int32_t dst_channel_offset,
-    const ValueRef out);
-
-} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Dequantize.cpp b/backends/vulkan/runtime/graph/ops/impl/Dequantize.cpp
deleted file mode 100644
index a217734653d..00000000000
--- a/backends/vulkan/runtime/graph/ops/impl/Dequantize.cpp
+++ /dev/null
@@ -1,843 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */ - -#include -#include -#include - -#include -#include -#include -#include - -namespace vkcompute { - -void resize_dequantize_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - - const std::vector in_sizes = graph->sizes_of(in); - graph->virtual_resize(out, in_sizes); -} - -utils::uvec3 dequantize_per_channel_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)args; - (void)resize_args; - - const ValueRef input = args.at(1).refs.at(0); - - utils::uvec3 local_wg_size = - graph->create_local_wg_size(global_workgroup_size); - - // WORKAROUND: The CommandBuffer::dispatch function divides - // global_workgroup_size by local_workgroup_size to get the number of - // workgroups to dispatch. We need to ensure that we dispatch the correct - // number of workgroups in the Z dimension to cover all batch-channel - // combinations. - // - // If local_wg_size[2] > 1, then div_up(global_workgroup_size[2], - // local_wg_size[2]) might reduce the number of workgroups dispatched. To - // ensure we dispatch global_workgroup_size[2] workgroups in the Z dimension, - // we set local_wg_size[2] = 1. - const auto input_sizes = graph->sizes_of(input); - if (input_sizes.size() == 4 && !graph->is_buffer_storage(input) && - global_workgroup_size[2] > 1) { - local_wg_size[2] = 1; - } - - return local_wg_size; -} - -utils::uvec3 dequantize_block_wise_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef input = args.at(1).refs.at(0); - - utils::uvec3 local_wg_size = - graph->create_local_wg_size(global_workgroup_size); - - // WORKAROUND: The CommandBuffer::dispatch function divides - // global_workgroup_size by local_workgroup_size to get the number of - // workgroups to dispatch. We need to ensure that we dispatch the correct - // number of workgroups in the Z dimension to cover all batch-channel - // combinations. - // - // If local_wg_size[2] > 1, then div_up(global_workgroup_size[2], - // local_wg_size[2]) might reduce the number of workgroups dispatched. To - // ensure we dispatch global_workgroup_size[2] workgroups in the Z dimension, - // we set local_wg_size[2] = 1. 
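The workaround comment above hinges on how the dispatch size is derived: workgroups per axis = div_up(global, local), so a local Z size greater than 1 collapses several batch-channel slices onto one workgroup id. A small model of that arithmetic, assuming a hypothetical global Z extent of 6:

```cpp
#include <cstdint>
#include <initializer_list>
#include <iostream>

inline uint32_t div_up(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

// Models the dispatch-count math the workaround above guards against: the
// number of workgroups launched per axis is div_up(global, local), so only
// div_up(global_z, local_z) distinct workgroup ids exist in Z. Forcing
// local_z = 1 makes the Z workgroup id enumerate every batch-channel slice
// exactly once.
int main() {
  const uint32_t global_z = 6;  // hypothetical number of batch-channel slices

  for (uint32_t local_z : {4u, 1u}) {
    const uint32_t groups_z = div_up(global_z, local_z);
    std::cout << "local_z=" << local_z << " -> " << groups_z
              << " workgroups in Z\n";
  }
  // local_z=4 -> 2 workgroups in Z (ids 0..1)
  // local_z=1 -> 6 workgroups in Z (ids 0..5), one per slice
  return 0;
}
```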
- const auto input_sizes = graph->sizes_of(input); - if (input_sizes.size() == 4 && !graph->is_buffer_storage(input) && - global_workgroup_size[2] > 1) { - local_wg_size[2] = 1; - } - - return local_wg_size; -} - -void add_dequantize_per_tensor_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& scale, - const ValueRef& zero_point, - const ValueRef& quant_min, - const ValueRef& quant_max, - const ValueRef& output) { - std::string kernel_name("dequantize_per_tensor"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(output)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale)); - add_dtype_suffix(kernel_name, graph.dtype_of(zero_point)); - - // Handle optional quant_min and quant_max parameters independently - auto bounds = get_dtype_bounds(graph.dtype_of(input)); - - int quant_min_val, quant_max_val; - - // Handle quant_min - if (graph.val_is_none(quant_min)) { - quant_min_val = bounds.first; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_min), - "quant_min must be an integer, got type: ", - graph.get_val_type(quant_min)); - quant_min_val = static_cast(graph.get_int(quant_min)); - } - - // Handle quant_max - if (graph.val_is_none(quant_max)) { - quant_max_val = bounds.second; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_max), - "quant_max must be an integer, got type: ", - graph.get_val_type(quant_max)); - quant_max_val = static_cast(graph.get_int(quant_max)); - } - - vkapi::ParamsBindList param_ubos; - std::vector push_constants; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.numel_ubo(input), - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(output), - graph.strides_ubo(output)}; - } else { - param_ubos = { - graph.logical_limits_ubo(input), graph.logical_limits_ubo(output)}; - } - - push_constants = { - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - - vkapi::SpecVarList spec_vars = { - graph.hashed_layout_of(output), - graph.hashed_layout_of(input), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, - {input, vkapi::kRead}, - {{scale, zero_point}, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - {}, - // Resizing Logic - resize_dequantize_node)); -} - -void add_dequantize_per_token_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& scale, - const ValueRef& zero_point, - const ValueRef& quant_min, - const ValueRef& quant_max, - const ValueRef& output) { - std::string kernel_name("dequantize_per_token"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(output)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale)); - add_dtype_suffix(kernel_name, graph.dtype_of(zero_point)); - - // Handle optional quant_min and quant_max parameters independently - auto bounds = get_dtype_bounds(graph.dtype_of(input)); - - int quant_min_val, quant_max_val; - - // Handle quant_min - if (graph.val_is_none(quant_min)) { - quant_min_val = bounds.first; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_min), - "quant_min must 
be an integer, got type: ", - graph.get_val_type(quant_min)); - quant_min_val = static_cast(graph.get_int(quant_min)); - } - - // Handle quant_max - if (graph.val_is_none(quant_max)) { - quant_max_val = bounds.second; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_max), - "quant_max must be an integer, got type: ", - graph.get_val_type(quant_max)); - quant_max_val = static_cast(graph.get_int(quant_max)); - } - - int num_tokens = static_cast(graph.sizes_of(scale)[0]); - - vkapi::ParamsBindList param_ubos; - std::vector push_constants; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.numel_ubo(input), - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(output), - graph.strides_ubo(output)}; - } else { - param_ubos = { - graph.logical_limits_ubo(input), graph.logical_limits_ubo(output)}; - } - - push_constants = { - PushConstantDataInfo(&num_tokens, sizeof(int)), - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - - vkapi::SpecVarList spec_vars = { - graph.hashed_layout_of(output), - graph.hashed_layout_of(input), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, - {input, vkapi::kRead}, - {{scale, zero_point}, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - {}, - // Resizing Logic - resize_dequantize_node)); -} - -void add_dequantize_per_channel_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& scale, - const ValueRef& zero_point, - const ValueRef& axis, - const ValueRef& quant_min, - const ValueRef& quant_max, - const ValueRef& output) { - std::string kernel_name("dequantize_per_channel"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(output)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale)); - add_dtype_suffix(kernel_name, graph.dtype_of(zero_point)); - - int axis_val = static_cast(graph.get_int(axis)); - - // Handle optional quant_min and quant_max parameters independently - auto bounds = get_dtype_bounds(graph.dtype_of(input)); - - int quant_min_val, quant_max_val; - - // Handle quant_min - if (graph.val_is_none(quant_min)) { - quant_min_val = bounds.first; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_min), - "quant_min must be an integer, got type: ", - graph.get_val_type(quant_min)); - quant_min_val = static_cast(graph.get_int(quant_min)); - } - - // Handle quant_max - if (graph.val_is_none(quant_max)) { - quant_max_val = bounds.second; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_max), - "quant_max must be an integer, got type: ", - graph.get_val_type(quant_max)); - quant_max_val = static_cast(graph.get_int(quant_max)); - } - - // Normalize axis and convert from NCHW to WHCN using utility functions - const auto input_sizes = graph.sizes_of(input); - const int64_t ndim = graph.dim_of(input); - - // Normalize axis to handle negative indices - axis_val = normalize(axis_val, ndim); - - // Convert from NCHW axis to WHCN axis for shader (vulkan representation) - int axis_whcn = nchw_dim_to_whcn_dim(axis_val, ndim); - - int num_channels; - if (axis_val == 0 && ndim == 4 && !graph.is_buffer_storage(input)) { - // For batch dimension 
dequantization in 4D tensors, pass the actual number - // of channels so the shader can correctly unfold the batch-channel folding - num_channels = static_cast(input_sizes[1]); // Channel dimension - } else { - num_channels = static_cast(input_sizes[axis_val]); - } - - vkapi::ParamsBindList param_ubos; - std::vector push_constants; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.numel_ubo(input), - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(output), - graph.strides_ubo(output)}; - } else { - param_ubos = { - graph.logical_limits_ubo(input), graph.logical_limits_ubo(output)}; - } - - push_constants = { - PushConstantDataInfo(&axis_whcn, sizeof(int)), - PushConstantDataInfo(&num_channels, sizeof(int)), - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - - vkapi::SpecVarList spec_vars = { - graph.hashed_layout_of(output), - graph.hashed_layout_of(input), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - dequantize_per_channel_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, - {input, vkapi::kRead}, - {{scale, zero_point}, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - {}, - // Resizing Logic - resize_dequantize_node)); -} - -void add_dequantize_block_wise_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& block_size, - const ValueRef& scale, - const ValueRef& zero_point, - const ValueRef& quant_min, - const ValueRef& quant_max, - const ValueRef& output) { - std::string kernel_name("dequantize_block_wise"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(output)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale)); - add_dtype_suffix(kernel_name, graph.dtype_of(zero_point)); - - // Handle optional quant_min and quant_max parameters independently - auto bounds = get_dtype_bounds(graph.dtype_of(input)); - - int quant_min_val, quant_max_val; - - // Handle quant_min - if (graph.val_is_none(quant_min)) { - quant_min_val = bounds.first; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_min), - "quant_min must be an integer, got type: ", - graph.get_val_type(quant_min)); - quant_min_val = static_cast(graph.get_int(quant_min)); - } - - // Handle quant_max - if (graph.val_is_none(quant_max)) { - quant_max_val = bounds.second; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_max), - "quant_max must be an integer, got type: ", - graph.get_val_type(quant_max)); - quant_max_val = static_cast(graph.get_int(quant_max)); - } - - const auto input_sizes = graph.sizes_of(input); - const auto block_size_list = graph.get_int_list(block_size); - - // Convert dimensions to WHCN order for shader - utils::ivec4 block_size_vec = utils::make_whcn_ivec4(*block_size_list); - utils::ivec4 tensor_size_whcn = utils::make_whcn_ivec4(input_sizes); - - // Calculate numBlocks: tensorSize / blockSize (both in WHCN order) - utils::ivec4 num_blocks_vec = { - tensor_size_whcn[0] / block_size_vec[0], - tensor_size_whcn[1] / block_size_vec[1], - tensor_size_whcn[2] / block_size_vec[2], - tensor_size_whcn[3] / block_size_vec[3]}; - - // Calculate blockStride: pre-computed linear strides for the block grid - utils::ivec4 block_stride_vec = { - 1, - 
num_blocks_vec[0], - num_blocks_vec[0] * num_blocks_vec[1], - num_blocks_vec[0] * num_blocks_vec[1] * num_blocks_vec[2]}; - - vkapi::ParamsBindList param_ubos; - std::vector push_constants; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.numel_ubo(input), - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(output), - graph.strides_ubo(output)}; - } else { - param_ubos = { - graph.logical_limits_ubo(input), graph.logical_limits_ubo(output)}; - } - - push_constants = { - PushConstantDataInfo(&block_size_vec, sizeof(block_size_vec)), - PushConstantDataInfo(&num_blocks_vec, sizeof(num_blocks_vec)), - PushConstantDataInfo(&block_stride_vec, sizeof(block_stride_vec)), - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - - vkapi::SpecVarList spec_vars = { - graph.hashed_layout_of(output), - graph.hashed_layout_of(input), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - dequantize_block_wise_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, - {input, vkapi::kRead}, - {{scale, zero_point}, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - {}, - // Resizing Logic - resize_dequantize_node)); -} - -void dequantize_per_tensor_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - const ValueRef zero_point = args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef dtype = args[arg_idx++]; - const ValueRef output_dtype = args[arg_idx++]; - const ValueRef output = args[arg_idx++]; - - // Suppress unused variable warnings - dtype and output_dtype are inferred - (void)dtype; - (void)output_dtype; - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale)); - VK_CHECK_COND(graph.val_is_tensor(zero_point)); - VK_CHECK_COND(graph.val_is_tensor(output)); - - // Verify input is an integer type - VK_CHECK_COND( - graph.dtype_of(input) == vkapi::kByte || - graph.dtype_of(input) == vkapi::kChar || - graph.dtype_of(input) == vkapi::kInt); - - // Get scale and zero point dtypes - vkapi::ScalarType scale_dtype = graph.dtype_of(scale); - vkapi::ScalarType zero_point_dtype = graph.dtype_of(zero_point); - - // Verify supported types for scale (fp32 only for now) - VK_CHECK_COND(scale_dtype == vkapi::kFloat); - - // Verify supported types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_dtype == vkapi::kInt || zero_point_dtype == vkapi::kChar || - zero_point_dtype == vkapi::kFloat); - - // Check that scale and zero_point have buffer storage and width packing - VK_CHECK_COND(graph.is_buffer_storage(scale)); - VK_CHECK_COND(graph.packed_dim_of(scale) == WHCN::kWidthDim); - VK_CHECK_COND(graph.is_buffer_storage(zero_point)); - VK_CHECK_COND(graph.packed_dim_of(zero_point) == WHCN::kWidthDim); - - // Check that tensors with texture storage have standard axis map - if (!graph.is_buffer_storage(input)) { - VK_CHECK_COND(graph.has_standard_axis_map(input)); - } - if (!graph.is_buffer_storage(output)) { - VK_CHECK_COND(graph.has_standard_axis_map(output)); - } - - add_dequantize_per_tensor_node( - graph, input, scale, zero_point, quant_min, quant_max, output); -} - -void 
dequantize_per_token_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - const ValueRef zero_point = args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef dtype = args[arg_idx++]; - const ValueRef output_dtype = args[arg_idx++]; - const ValueRef output = args[arg_idx++]; - - // Suppress unused variable warnings - dtype and output_dtype are inferred - (void)dtype; - (void)output_dtype; - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale)); - VK_CHECK_COND(graph.val_is_tensor(zero_point)); - VK_CHECK_COND(graph.val_is_tensor(output)); - - // Verify input is an integer type - VK_CHECK_COND( - graph.dtype_of(input) == vkapi::kByte || - graph.dtype_of(input) == vkapi::kChar || - graph.dtype_of(input) == vkapi::kInt); - - // Get scale and zero point dtypes - vkapi::ScalarType scale_dtype = graph.dtype_of(scale); - vkapi::ScalarType zero_point_dtype = graph.dtype_of(zero_point); - - // Verify supported types for scale (fp32 only for now) - VK_CHECK_COND(scale_dtype == vkapi::kFloat); - - // Verify supported types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_dtype == vkapi::kInt || zero_point_dtype == vkapi::kChar || - zero_point_dtype == vkapi::kFloat); - - // Check that scale and zero_point have buffer storage and width packing - VK_CHECK_COND(graph.is_buffer_storage(scale)); - VK_CHECK_COND(graph.packed_dim_of(scale) == WHCN::kWidthDim); - VK_CHECK_COND(graph.is_buffer_storage(zero_point)); - VK_CHECK_COND(graph.packed_dim_of(zero_point) == WHCN::kWidthDim); - - // Check that tensors with texture storage have standard axis map - if (!graph.is_buffer_storage(input)) { - VK_CHECK_COND(graph.has_standard_axis_map(input)); - } - if (!graph.is_buffer_storage(output)) { - VK_CHECK_COND(graph.has_standard_axis_map(output)); - } - - // Calculate number of tokens (product of all dimensions except the last one) - int64_t num_tokens = 1; - const auto input_sizes = graph.sizes_of(input); - for (size_t i = 0; i < input_sizes.size() - 1; i++) { - num_tokens *= input_sizes[i]; - } - - const auto scale_sizes = graph.sizes_of(scale); - const auto zero_point_sizes = graph.sizes_of(zero_point); - - // Calculate total number of elements in scale and zero_point tensors - int64_t scale_numel = 1; - for (size_t i = 0; i < scale_sizes.size(); i++) { - scale_numel *= scale_sizes[i]; - } - - int64_t zero_point_numel = 1; - for (size_t i = 0; i < zero_point_sizes.size(); i++) { - zero_point_numel *= zero_point_sizes[i]; - } - - // Check that the total number of elements matches num_tokens - // This allows for both 1D tensors (size [num_tokens]) and reshaped tensors - // (size [num_tokens, 1]) - VK_CHECK_COND(scale_numel == num_tokens); - VK_CHECK_COND(zero_point_numel == num_tokens); - - add_dequantize_per_token_node( - graph, input, scale, zero_point, quant_min, quant_max, output); -} - -void dequantize_per_channel_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - const ValueRef zero_point = args[arg_idx++]; - const ValueRef axis = args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef dtype = args[arg_idx++]; - const ValueRef output_dtype = args[arg_idx++]; - const ValueRef 
output = args[arg_idx++]; - - // Suppress unused variable warnings - dtype and output_dtype are inferred - (void)dtype; - (void)output_dtype; - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale)); - VK_CHECK_COND(graph.val_is_tensor(zero_point)); - VK_CHECK_COND(graph.val_is_tensor(output)); - - // Verify input is an integer type - VK_CHECK_COND( - graph.dtype_of(input) == vkapi::kByte || - graph.dtype_of(input) == vkapi::kChar || - graph.dtype_of(input) == vkapi::kInt); - - // Get scale and zero point dtypes - vkapi::ScalarType scale_dtype = graph.dtype_of(scale); - vkapi::ScalarType zero_point_dtype = graph.dtype_of(zero_point); - - // Verify supported types for scale (fp32 only for now) - VK_CHECK_COND(scale_dtype == vkapi::kFloat); - - // Verify supported types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_dtype == vkapi::kInt || zero_point_dtype == vkapi::kChar || - zero_point_dtype == vkapi::kFloat); - - // Check that scale and zero_point have buffer storage and width packing - VK_CHECK_COND(graph.is_buffer_storage(scale)); - VK_CHECK_COND(graph.packed_dim_of(scale) == WHCN::kWidthDim); - VK_CHECK_COND(graph.is_buffer_storage(zero_point)); - VK_CHECK_COND(graph.packed_dim_of(zero_point) == WHCN::kWidthDim); - - // Check that tensors with texture storage have standard axis map - if (!graph.is_buffer_storage(input)) { - VK_CHECK_COND(graph.has_standard_axis_map(input)); - } - if (!graph.is_buffer_storage(output)) { - VK_CHECK_COND(graph.has_standard_axis_map(output)); - } - - // Normalize axis - int axis_val = static_cast(graph.get_int(axis)); - const auto input_sizes = graph.sizes_of(input); - int ndim = graph.dim_of(input); - if (axis_val < 0) { - axis_val += ndim; - } - - // Verify axis is valid - VK_CHECK_COND(axis_val >= 0 && axis_val < ndim); - - // Get number of channels along the specified axis - int64_t num_channels = input_sizes[axis_val]; - - const auto scale_sizes = graph.sizes_of(scale); - const auto zero_point_sizes = graph.sizes_of(zero_point); - - // Calculate total number of elements in scale and zero_point tensors - int64_t scale_numel = 1; - for (size_t i = 0; i < scale_sizes.size(); i++) { - scale_numel *= scale_sizes[i]; - } - - int64_t zero_point_numel = 1; - for (size_t i = 0; i < zero_point_sizes.size(); i++) { - zero_point_numel *= zero_point_sizes[i]; - } - - // Check that the total number of elements matches num_channels - VK_CHECK_COND(scale_numel == num_channels); - VK_CHECK_COND(zero_point_numel == num_channels); - - add_dequantize_per_channel_node( - graph, input, scale, zero_point, axis, quant_min, quant_max, output); -} - -void dequantize_affine_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef block_size = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - const ValueRef zero_point = args[arg_idx++]; - const ValueRef input_dtype = args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef output_dtype = args[arg_idx++]; - const ValueRef output = args[arg_idx++]; - - // Suppress unused variable warnings - (void)input_dtype; - (void)output_dtype; - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale)); - VK_CHECK_COND(graph.val_is_tensor(zero_point)); - VK_CHECK_COND(graph.val_is_tensor(output)); - - // Verify input is an integer type - VK_CHECK_COND( - 
graph.dtype_of(input) == vkapi::kByte || - graph.dtype_of(input) == vkapi::kChar || - graph.dtype_of(input) == vkapi::kInt); - - // Get scale and zero point dtypes - vkapi::ScalarType scale_dtype = graph.dtype_of(scale); - vkapi::ScalarType zero_point_dtype = graph.dtype_of(zero_point); - - // Verify supported types for scale (fp32 only for now) - VK_CHECK_COND(scale_dtype == vkapi::kFloat); - - // Verify supported types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_dtype == vkapi::kInt || zero_point_dtype == vkapi::kChar || - zero_point_dtype == vkapi::kFloat); - - // Check that scale and zero_point have buffer storage and width packing - VK_CHECK_COND(graph.is_buffer_storage(scale)); - VK_CHECK_COND(graph.packed_dim_of(scale) == WHCN::kWidthDim); - VK_CHECK_COND(graph.is_buffer_storage(zero_point)); - VK_CHECK_COND(graph.packed_dim_of(zero_point) == WHCN::kWidthDim); - - // Check that tensors with texture storage have standard axis map - if (!graph.is_buffer_storage(input)) { - VK_CHECK_COND(graph.has_standard_axis_map(input)); - } - if (!graph.is_buffer_storage(output)) { - VK_CHECK_COND(graph.has_standard_axis_map(output)); - } - - // Verify block_size is valid (each dimension must divide evenly into input - // size) - const auto input_sizes = graph.sizes_of(input); - const auto block_size_list = graph.get_int_list(block_size); - VK_CHECK_COND(block_size_list->size() == input_sizes.size()); - - for (size_t i = 0; i < input_sizes.size(); i++) { - if ((*block_size_list)[i] > 1) { - VK_CHECK_COND( - input_sizes[i] % (*block_size_list)[i] == 0, - "Input size at dimension ", - i, - " (", - input_sizes[i], - ") must be divisible by block_size at dimension ", - i, - " (", - (*block_size_list)[i], - ")"); - } - } - - add_dequantize_block_wise_node( - graph, - input, - block_size, - scale, - zero_point, - quant_min, - quant_max, - output); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP( - quantized_decomposed.dequantize_per_tensor.tensor, - dequantize_per_tensor_impl); - VK_REGISTER_OP( - quantized_decomposed.dequantize_per_token.default, - dequantize_per_token_impl); - VK_REGISTER_OP( - quantized_decomposed.dequantize_per_channel.default, - dequantize_per_channel_impl); - - // TorchAO affine dequantization operators - VK_REGISTER_OP(torchao.dequantize_affine.default, dequantize_affine_impl); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp b/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp deleted file mode 100644 index 475e7796b09..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -#include -#include - -#include - -#include - -namespace vkcompute { - -using utils::GPUMemoryLayout; -using utils::StorageType; - -void check_embedding_args( - ComputeGraph& graph, - const ValueRef weight, - const ValueRef in, - const ValueRef out) { - // The packing logic may not be trivial here. Input and output are Channel - // Packed, which is default for the Vulkan backend. However, weight vector is - // height-packed instead of channel-packed for space reason. 
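Stepping back to the block-wise dequantization set up above: each element's block coordinate is its WHCN coordinate divided by block_size, the block id is linearized with the {1, nb0, nb0*nb1, nb0*nb1*nb2} strides, and the value is recovered as (q - zero_point[block]) * scale[block]. The sketch below models that on the CPU; the tensor and block sizes in main are invented.

```cpp
#include <array>
#include <cstdint>
#include <iostream>
#include <vector>

// Standalone model of block-wise affine dequantization: the tensor is tiled
// into blocks of `block_size` along each (WHCN-ordered) dimension, each block
// has its own scale / zero point, and
//   value = (q - zero_point[block]) * scale[block].
using ivec4 = std::array<int64_t, 4>;

int64_t block_id(const ivec4& coord, const ivec4& block_size,
                 const ivec4& num_blocks) {
  // Pre-computed linear strides over the block grid, matching the
  // block_stride push constant: {1, nb0, nb0*nb1, nb0*nb1*nb2}.
  const ivec4 block_stride = {
      1, num_blocks[0], num_blocks[0] * num_blocks[1],
      num_blocks[0] * num_blocks[1] * num_blocks[2]};
  int64_t id = 0;
  for (int d = 0; d < 4; ++d) {
    id += (coord[d] / block_size[d]) * block_stride[d];
  }
  return id;
}

float dequantize_block_wise(int32_t q, const ivec4& coord,
                            const ivec4& block_size, const ivec4& num_blocks,
                            const std::vector<float>& scale,
                            const std::vector<int32_t>& zero_point) {
  const int64_t id = block_id(coord, block_size, num_blocks);
  return static_cast<float>(q - zero_point[id]) * scale[id];
}

int main() {
  // An 8x8x1x1 (WHCN) tensor quantized in 4x4 blocks -> a 2x2 block grid.
  const ivec4 block_size = {4, 4, 1, 1};
  const ivec4 num_blocks = {2, 2, 1, 1};
  const std::vector<float> scale = {0.1f, 0.2f, 0.3f, 0.4f};
  const std::vector<int32_t> zero_point = {0, 10, 0, 0};

  // Element at (w=5, h=6) falls in block (1, 1) -> linear block id 3.
  std::cout << dequantize_block_wise(7, {5, 6, 0, 0}, block_size, num_blocks,
                                     scale, zero_point)
            << "\n";  // (7 - 0) * 0.4 = 2.8
  return 0;
}
```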
- VK_CHECK_COND(graph.packed_dim_of(weight) == WHCN::kHeightDim); - VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kChannelsDim); - VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kChannelsDim); -} - -void add_embedding_node( - ComputeGraph& graph, - ValueRef weight, - ValueRef in, - ValueRef out) { - check_embedding_args(graph, weight, in, out); - - std::string kernel_name = "embedding"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - {{out, vkapi::kWrite}, {{in, weight}, vkapi::kRead}}, - { - graph.sizes_ubo(out), - }, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - graph.hashed_layout_of(weight)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void embedding(ComputeGraph& graph, const std::vector& args) { - ValueRef in = args[1]; - ValueRef out = args[5]; - - ValueRef weight = prepack_standard( - graph, - args[0], - StorageType::TEXTURE_2D, - GPUMemoryLayout::TENSOR_HEIGHT_PACKED); - - add_embedding_node(graph, weight, in, out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.embedding.default, embedding); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Expand.cpp b/backends/vulkan/runtime/graph/ops/impl/Expand.cpp deleted file mode 100644 index 1623a26b2a1..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Expand.cpp +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
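The embedding op prepacked above is, semantically, a row gather from the weight table. A plain CPU reference of that lookup with invented example values, independent of the texture packing discussed in the comment:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Reference semantics of aten.embedding.default: every index selects one row
// of the (num_embeddings x embedding_dim) weight table, so the output has
// shape indices.shape + {embedding_dim}.
std::vector<float> embedding_reference(
    const std::vector<float>& weight, int64_t embedding_dim,
    const std::vector<int64_t>& indices) {
  std::vector<float> out(indices.size() * embedding_dim);
  for (size_t i = 0; i < indices.size(); ++i) {
    for (int64_t d = 0; d < embedding_dim; ++d) {
      out[i * embedding_dim + d] = weight[indices[i] * embedding_dim + d];
    }
  }
  return out;
}

int main() {
  // 3 embeddings of dimension 2.
  const std::vector<float> weight = {0.f, 1.f, 10.f, 11.f, 20.f, 21.f};
  const std::vector<float> out = embedding_reference(weight, 2, {2, 0, 2});
  for (float v : out) std::cout << v << " ";  // 20 21 0 1 20 21
  std::cout << "\n";
  return 0;
}
```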
- */ - -#include - -#include -#include -#include -#include -#include - -namespace vkcompute { - -void add_expand_buffer_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef size, - const ValueRef out) { - std::string kernel_name = "expand"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - vkapi::ParamsBindList param_buffers = { - graph.buffer_meta_ubo(out), - graph.buffer_meta_ubo(in), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Parameter buffers - param_buffers, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {size}, - // Resizing Logic - nullptr)); -} - -void expand(ComputeGraph& graph, const std::vector& args) { - int idx = 0; - const ValueRef in = args.at(idx++); - const ValueRef size = args.at(idx++); - const ValueRef implicit = args.at(idx++); - (void)implicit; - const ValueRef out = args.at(idx++); - - if (graph.is_buffer_storage(out)) { - return add_expand_buffer_node(graph, in, size, out); - } - - VK_THROW("Expand operator only supports buffer storage"); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.expand_copy.default, expand); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Flip.cpp b/backends/vulkan/runtime/graph/ops/impl/Flip.cpp deleted file mode 100644 index 52288734704..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Flip.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
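aten.expand_copy, which the buffer-only node above implements, resolves its target sizes with the usual broadcasting rules: -1 keeps the existing size, size-1 dims may be broadcast, other dims must match, and leading dims may be prepended. A sketch of that size resolution under those (standard aten) assumptions, not taken from the deleted shader:

```cpp
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <vector>

// Resolves the output sizes of an expand: -1 keeps the existing size, dims of
// size 1 may be broadcast, other dims must match, and extra leading
// dimensions may be prepended.
std::vector<int64_t> resolve_expand_sizes(
    const std::vector<int64_t>& in_sizes,
    const std::vector<int64_t>& requested) {
  if (requested.size() < in_sizes.size()) {
    throw std::invalid_argument("expand: too few target dimensions");
  }
  const size_t lead = requested.size() - in_sizes.size();
  std::vector<int64_t> out(requested.size());
  for (size_t i = 0; i < requested.size(); ++i) {
    const int64_t want = requested[i];
    if (i < lead) {
      if (want < 0) throw std::invalid_argument("expand: -1 on a new dim");
      out[i] = want;
      continue;
    }
    const int64_t have = in_sizes[i - lead];
    if (want == -1) {
      out[i] = have;
    } else if (have == 1 || want == have) {
      out[i] = want;
    } else {
      throw std::invalid_argument("expand: incompatible dimension");
    }
  }
  return out;
}

int main() {
  const auto out = resolve_expand_sizes({1, 3}, {4, 2, -1, 3});
  for (int64_t s : out) std::cout << s << " ";  // 4 2 1 3
  std::cout << "\n";
  return 0;
}
```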
- */ - -#include - -#include -#include -#include -#include -#include - -namespace vkcompute { - -// Custom global workgroup size function for flip -utils::uvec3 flip_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - return graph->create_global_wg_size(out); -} - -void check_flip_args( - ComputeGraph& graph, - const ValueRef in, - const ValueRef out) { - VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kChannelsDim); - VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kChannelsDim); -} - -void resize_flip_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - - graph->virtual_resize(out, graph->sizes_of(in)); -} - -utils::ivec4 create_whcn_bitmap( - const std::vector& list, - const int64_t ndim) { - std::vector bm(4, 0); - for (const auto e : list) { - auto x = (e % ndim + ndim) % ndim; // normalize - x = ndim - 1 - x; // reverse - bm.at(x) = 1; - } - return utils::make_ivec4(bm); -} - -void add_flip_node( - ComputeGraph& graph, - const ValueRef in, - const std::vector& dim_list, - const ValueRef out) { - check_flip_args(graph, in, out); - - const auto dim_bitmap = create_whcn_bitmap(dim_list, graph.dim_of(in)); - - std::string kernel_name("flip"); - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - flip_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - { - {out, vkapi::kWrite}, - {in, vkapi::kRead}, - }, - // Parameter buffers - { - graph.logical_limits_ubo(out), - graph.sizes_ubo(out), - graph.create_params_buffer(dim_bitmap), - }, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - resize_flip_node)); -} - -void flip(ComputeGraph& graph, const std::vector& args) { - ValueRef in = args[0]; - auto dims = graph.get_int_list(args[1]); - ValueRef out = args[2]; - - add_flip_node(graph, in, *dims, out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.flip.default, flip); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Full.cpp b/backends/vulkan/runtime/graph/ops/impl/Full.cpp deleted file mode 100644 index fe2676e91e0..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Full.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
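The create_whcn_bitmap helper above is small enough to restate standalone: normalize each possibly-negative NCHW dim, mirror it into WHCN order, and set one bit per flipped axis. The same logic as a self-contained function:

```cpp
#include <array>
#include <cstdint>
#include <iostream>
#include <vector>

// Standalone version of the create_whcn_bitmap helper: normalize each
// (possibly negative) NCHW dim, then mirror it into WHCN order so the shader
// can test one flag per axis.
std::array<int32_t, 4> create_whcn_bitmap(const std::vector<int64_t>& dims,
                                          int64_t ndim) {
  std::array<int32_t, 4> bm = {0, 0, 0, 0};
  for (int64_t d : dims) {
    int64_t x = (d % ndim + ndim) % ndim;  // normalize negative dims
    x = ndim - 1 - x;                      // reverse: NCHW index -> WHCN index
    bm.at(x) = 1;
  }
  return bm;
}

int main() {
  // Flipping dims {-1, 1} of a 4D NCHW tensor marks the W and C axes.
  const auto bm = create_whcn_bitmap({-1, 1}, 4);
  for (int32_t b : bm) std::cout << b << " ";  // 1 0 1 0
  std::cout << "\n";
  return 0;
}
```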
- */ - -#include - -#include -#include -#include - -#include - -namespace vkcompute { - -void resize_full_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - std::vector out_sizes; - if (graph->val_is_tensor(extra_args.at(0))) { - out_sizes = graph->sizes_of(extra_args.at(0)); - } else { - out_sizes = *graph->get_int_list(extra_args.at(0)); - } - - graph->virtual_resize(out, out_sizes); -} - -void add_full_node( - ComputeGraph& graph, - const ValueRef size_or_in, - const ValueRef fill_value, - const ValueRef out) { - float fill_value_val = graph.extract_scalar(fill_value); - - std::string kernel_name("full"); - kernel_name.reserve(kShaderNameReserve); - - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}}, - // Shader params buffers - {graph.sizes_ubo(out), graph.create_params_buffer(fill_value_val)}, - // Push Constants - {}, - // Specialization Constants - {graph.packed_dim_of(out)}, - // Resize Args - {size_or_in}, - // Resizing Logic - resize_full_node)); -} - -void full(ComputeGraph& graph, const std::vector& args) { - return add_full_node(graph, args[0], args[1], args[args.size() - 1]); -} - -void zeros(ComputeGraph& graph, const std::vector& args) { - return add_full_node( - graph, args[0], graph.add_scalar(0), args[args.size() - 1]); -} - -void ones(ComputeGraph& graph, const std::vector& args) { - return add_full_node( - graph, args[0], graph.add_scalar(1), args[args.size() - 1]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.full.default, full); - VK_REGISTER_OP(aten.full_like.default, full); - VK_REGISTER_OP(aten.zeros.default, zeros); - VK_REGISTER_OP(aten.zeros_like.default, zeros); - VK_REGISTER_OP(aten.ones.default, ones); - VK_REGISTER_OP(aten.ones_like.default, ones); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/GridPriors.cpp b/backends/vulkan/runtime/graph/ops/impl/GridPriors.cpp deleted file mode 100644 index 5f39c16d405..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/GridPriors.cpp +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include - -#include - -namespace vkcompute { - -struct GridPriorsParam final { - int32_t stride; - float offset; -}; - -void resize_grid_priors_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = extra_args.at(0); - const std::vector in_sizes = graph->sizes_of(in); - const int64_t height = in_sizes.at(in_sizes.size() - 2); - const int64_t width = in_sizes.at(in_sizes.size() - 1); - const std::vector sizes = {height * width, 2}; - graph->virtual_resize(out, sizes); -} - -void add_grid_priors_node( - ComputeGraph& graph, - const ValueRef& in, - const ValueRef& stride_ref, - const ValueRef& offset_ref, - const ValueRef& out) { - const int32_t stride = graph.extract_scalar(stride_ref); - const float offset = graph.extract_scalar(offset_ref); - - std::string kernel_name = "grid_priors"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - const GridPriorsParam param = {stride, offset}; - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - { - {out, vkapi::kWrite}, - }, - // Shader params buffers - { - graph.sizes_ubo(in), - graph.sizes_ubo(out), - graph.create_params_buffer(param), - }, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {in}, - // Resizing Logic - resize_grid_priors_node)); -} - -void grid_priors(ComputeGraph& graph, const std::vector& args) { - return add_grid_priors_node(graph, args[0], args[1], args[2], args[3]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(et_vk.grid_priors.default, grid_priors); -} -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/GroupNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/GroupNorm.cpp deleted file mode 100644 index 368b95c9d3b..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/GroupNorm.cpp +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
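The grid_priors resize logic above fixes the output shape at {H * W, 2} with a per-op stride and offset. The conventional interpretation is one anchor center per input pixel at ((x + offset) * stride, (y + offset) * stride); the reference below assumes that formula, which is not confirmed by the deleted shader itself.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical CPU reference for grid_priors: one (x, y) anchor center per
// input pixel, flattened row-major into an {H * W, 2} buffer.
std::vector<float> grid_priors_reference(int64_t height, int64_t width,
                                         int32_t stride, float offset) {
  std::vector<float> out;
  out.reserve(height * width * 2);
  for (int64_t y = 0; y < height; ++y) {
    for (int64_t x = 0; x < width; ++x) {
      out.push_back((static_cast<float>(x) + offset) * stride);
      out.push_back((static_cast<float>(y) + offset) * stride);
    }
  }
  return out;
}

int main() {
  const auto pts = grid_priors_reference(/*height=*/2, /*width=*/2,
                                         /*stride=*/8, /*offset=*/0.5f);
  for (size_t i = 0; i < pts.size(); i += 2) {
    std::cout << "(" << pts[i] << ", " << pts[i + 1] << ") ";
  }
  std::cout << "\n";  // (4, 4) (12, 4) (4, 12) (12, 12)
  return 0;
}
```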
- */ - -#include - -#include -#include - -#include - -#include - -namespace vkcompute { - -utils::uvec3 group_norm_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)graph; - (void)shader; - (void)global_workgroup_size; - (void)args; - (void)resize_args; - - return {1, 64, 1}; -} - -void resize_group_norm_texture_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - // Extract tensor references from args - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - const ValueRef mean = args.at(1).refs.at(3); - const ValueRef rstd = args.at(1).refs.at(4); - - // Extract group from resize args - const int64_t group_val = graph->extract_scalar(resize_args.at(0)); - - // Get input tensor sizes using ComputeGraph APIs - const std::vector in_sizes = graph->sizes_of(in); - - // Output tensor should have the same size as input - graph->virtual_resize(out, in_sizes); - - // Mean and rstd tensors should have size {num_batches, num_groups} - const int64_t N = in_sizes.at(0); // batch dimension - const std::vector mean_rstd_sizes = {N, group_val}; - - // Resize mean and rstd tensors - graph->virtual_resize(mean, mean_rstd_sizes); - graph->virtual_resize(rstd, mean_rstd_sizes); -} - -void add_native_group_norm_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef weight_data, - const ValueRef bias_data, - const ValueRef N, - const ValueRef C, - const ValueRef HxW, - const ValueRef group, - const ValueRef eps, - const ValueRef out, - const ValueRef mean, - const ValueRef rstd) { - (void)N; - (void)C; - (void)HxW; - - const ValueRef arg_weight = prepack_standard( - graph, - weight_data, - graph.storage_type_of(in), - utils::kWidthPacked, - false); - const ValueRef arg_bias = prepack_standard( - graph, bias_data, graph.storage_type_of(in), utils::kWidthPacked, false); - - const int64_t group_val = graph.extract_scalar(group); - const float epsilon = graph.extract_scalar(eps); - - const std::vector in_sizes = graph.sizes_of(in); - - std::string kernel_name("group_norm_reduce_texture"); - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - const struct { - int32_t group; - float epsilon; - } params_uniform = {static_cast(group_val), epsilon}; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - group_norm_local_wg_size, - // Inputs and Outputs - {{{mean, rstd}, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - { - graph.strides_ubo(mean), - graph.numel_ubo(mean), - graph.logical_limits_ubo(in), - graph.sizes_ubo(in), - }, - // Push Constants - { - PushConstantDataInfo(¶ms_uniform, sizeof(params_uniform)), - }, - // Specialization Constants - { - graph.hashed_layout_of(mean), - }, - // Resize Args - {group}, - // Resizing Logic - nullptr)); - - // Compute element-wise normalization, now that mean and rstd have been - // computed. 
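The two dispatches above split group norm into a reduction pass (mean and rstd per (batch, group), stored as an {N, groups} buffer) and an element-wise pass that applies the per-channel weight and bias. A CPU reference of that math, using the biased variance as native_group_norm does; example shapes in main are invented.

```cpp
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

// CPU reference for the two-pass structure: pass 1 reduces each (batch, group)
// slice of an NCHW tensor to a mean and rstd = 1/sqrt(var + eps); pass 2
// normalizes element-wise and applies the per-channel weight and bias.
void group_norm_reference(
    const std::vector<float>& in, std::vector<float>& out,
    std::vector<float>& mean, std::vector<float>& rstd,
    const std::vector<float>& weight, const std::vector<float>& bias,
    int64_t N, int64_t C, int64_t H, int64_t W, int64_t groups, float eps) {
  const int64_t cpg = C / groups;          // channels per group
  const int64_t group_numel = cpg * H * W;

  for (int64_t n = 0; n < N; ++n) {
    for (int64_t g = 0; g < groups; ++g) {
      // Pass 1: reduce to mean / rstd for this (batch, group) pair.
      double sum = 0.0, sum_sq = 0.0;
      for (int64_t c = g * cpg; c < (g + 1) * cpg; ++c) {
        for (int64_t i = 0; i < H * W; ++i) {
          const float v = in[(n * C + c) * H * W + i];
          sum += v;
          sum_sq += static_cast<double>(v) * v;
        }
      }
      const double m = sum / group_numel;
      const double var = sum_sq / group_numel - m * m;  // biased variance
      mean[n * groups + g] = static_cast<float>(m);
      rstd[n * groups + g] = static_cast<float>(1.0 / std::sqrt(var + eps));

      // Pass 2: element-wise normalization with per-channel affine params.
      for (int64_t c = g * cpg; c < (g + 1) * cpg; ++c) {
        for (int64_t i = 0; i < H * W; ++i) {
          const int64_t idx = (n * C + c) * H * W + i;
          out[idx] = (in[idx] - mean[n * groups + g]) * rstd[n * groups + g] *
              weight[c] + bias[c];
        }
      }
    }
  }
}

int main() {
  const int64_t N = 1, C = 4, H = 1, W = 2, groups = 2;
  std::vector<float> in = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<float> out(in.size()), mean(N * groups), rstd(N * groups);
  std::vector<float> weight(C, 1.f), bias(C, 0.f);
  group_norm_reference(in, out, mean, rstd, weight, bias,
                       N, C, H, W, groups, 1e-5f);
  std::cout << mean[0] << " " << mean[1] << "\n";  // 2.5 6.5
  return 0;
}
```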
- std::string norm_kernel_name("group_norm_texture"); - norm_kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(norm_kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(norm_kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, - {{in, arg_weight, arg_bias, mean, rstd}, vkapi::kRead}}, - // Shader params buffers - { - graph.logical_limits_ubo(out), - graph.sizes_ubo(out), - graph.logical_limits_ubo(arg_weight), - graph.strides_ubo(mean), - }, - // Push Constants - { - PushConstantDataInfo(¶ms_uniform, sizeof(params_uniform)), - }, - // Specialization Constants - { - graph.hashed_layout_of(in), - }, - // Resize Args - {group}, - // Resizing Logic - resize_group_norm_texture_node)); -} - -void native_group_norm(ComputeGraph& graph, const std::vector& args) { - // Assign each element of the args vector to const ValueRef variables - const ValueRef in = args.at(0); - const ValueRef weight_data = args.at(1); - const ValueRef bias_data = args.at(2); - const ValueRef N = args.at(3); - const ValueRef C = args.at(4); - const ValueRef HxW = args.at(5); - const ValueRef group = args.at(6); - const ValueRef eps = args.at(7); - const ValueRef out_tuple_ref = args.at(8); - - ValueRef out = kDummyValueRef; - ValueRef mean = kDummyValueRef; - ValueRef rstd = kDummyValueRef; - - { - const ValueListPtr out_tuple = graph.get_value_list(out_tuple_ref); - out = out_tuple->at(0); - mean = out_tuple->at(1); - rstd = out_tuple->at(2); - } - - VK_CHECK_COND(graph.val_is_tref(weight_data)); - VK_CHECK_COND(graph.val_is_tref(bias_data)); - - // Check expected storage types and memory layouts for tensor variables - VK_CHECK_COND(graph.is_standard_channels_packed_texture_tensor(in)); - VK_CHECK_COND(graph.is_standard_channels_packed_texture_tensor(out)); - - VK_CHECK_COND(graph.is_contiguous_buffer_tensor(mean)); - VK_CHECK_COND(graph.is_contiguous_buffer_tensor(rstd)); - - return add_native_group_norm_node( - graph, - in, - weight_data, - bias_data, - N, - C, - HxW, - group, - eps, - out, - mean, - rstd); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.native_group_norm.default, native_group_norm); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp b/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp deleted file mode 100644 index 576711a86f1..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/IndexSelect.cpp +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include - -#include -#include - -#include - -namespace vkcompute { - -void check_index_select_args( - ComputeGraph& graph, - const ValueRef in, - const ValueRef idx, - const ValueRef out) { - VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kChannelsDim); - VK_CHECK_COND(graph.packed_dim_of(idx) == WHCN::kChannelsDim); - VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kChannelsDim); -} - -void add_index_select_channel_node( - ComputeGraph& graph, - ValueRef in, - ValueRef idx, - ValueRef out) { - check_index_select_args(graph, in, idx, out); - - std::string kernel_name = "index_select_channel"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - {{out, vkapi::kWrite}, {{in, idx}, vkapi::kRead}}, - {graph.sizes_ubo(out), graph.sizes_ubo(in)}, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -struct IndexSelectParams final { - int32_t gpu_dim; - int32_t stride; -}; - -IndexSelectParams create_index_select_params( - ComputeGraph& graph, - const int64_t dim_idx, - const ValueRef in) { - if (dim_idx == kWidth4D) { - return {0, 1}; - } else if (dim_idx == kHeight4D) { - return {1, 1}; - } else if (dim_idx == kBatch4D) { - const std::vector in_sizes = graph.sizes_of(in); - int64_t n_channels = dim_at(in_sizes, kChannel4D); - int64_t stride = utils::div_up_4(n_channels); - return {2, static_cast(stride)}; - } else { - VK_THROW("Unexpected dim_idx!"); - } -} - -void add_index_select_node( - ComputeGraph& graph, - ValueRef in, - const int64_t dim_idx, - ValueRef idx, - ValueRef out) { - check_index_select_args(graph, in, idx, out); - - IndexSelectParams params = create_index_select_params(graph, dim_idx, in); - - std::string kernel_name = "index_select"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - {{out, vkapi::kWrite}, {{in, idx}, vkapi::kRead}}, - {graph.sizes_ubo(out), graph.create_params_buffer(params)}, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -int64_t get_dim_idx(ComputeGraph& graph, ValueRef in, ValueRef dim_ref) { - int64_t dim = graph.extract_scalar(dim_ref); - const int64_t ndim = graph.dim_of(in); - dim = normalize(dim, ndim); - - // Convert to DimIndex - this replicates normalize_to_dim_index logic - return dim < 0 ? 
dim : dim - ndim; -} - -void index_select(ComputeGraph& graph, const std::vector& args) { - ValueRef in = args[0]; - ValueRef dim_ref = args[1]; - ValueRef idx = args[2]; - ValueRef out = args[3]; - - const int64_t dim_idx = get_dim_idx(graph, in, dim_ref); - if (dim_idx == kChannel4D) { - add_index_select_channel_node(graph, in, idx, out); - } else { - add_index_select_node(graph, in, dim_idx, idx, out); - } -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.index_select.default, index_select); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp deleted file mode 100644 index 38d70271f4f..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp +++ /dev/null @@ -1,430 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include -#include - -#include -#include - -#include - -namespace vkcompute { - -// Custom global workgroup size function for addmm_naive_texture -utils::uvec3 addmm_naive_texture_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - return graph->logical_limits_of(out); -} - -// Custom global workgroup size function for addmm_naive_buffer -utils::uvec3 addmm_naive_buffer_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - return { - graph->size_at(-1, out), - graph->size_at(-2, out), - graph->size_at(-3, out) * graph->size_at(-4, out)}; -} - -// Custom global workgroup size function for addmm_optimized -utils::uvec3 addmm_optimized_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mat1 = args.at(1).refs.at(0); - - std::vector mat1_sizes = graph->sizes_of(mat1); - int mat1_dims = mat1_sizes.size(); - - utils::uvec3 global_size = graph->logical_limits_of(out); - - if (mat1_sizes.at(mat1_dims - 2) < 8) { - global_size = utils::divup_vec(global_size, {4, 2, 1}); - } else { - global_size = utils::divup_vec(global_size, {4, 4, 1}); - } - return global_size; -} - -// Custom local workgroup size function for addmm_optimized -utils::uvec3 addmm_optimized_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)args; - (void)resize_args; - return adaptive_work_group_size(global_workgroup_size); -} - -void check_addmm_args( - ComputeGraph& graph, - const ValueRef self, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef beta, - const ValueRef alpha, - const ValueRef out) { - (void)alpha; - (void)beta; - - std::vector self_sizes = graph.sizes_of(self); - std::vector mat1_sizes = graph.sizes_of(mat1); - std::vector mat2_sizes = graph.sizes_of(mat2_data); - - VK_CHECK_COND(mat1_sizes.size() == 2 || mat1_sizes.size() == 3); - VK_CHECK_COND(mat1_sizes.size() == mat2_sizes.size()); - - 
VK_CHECK_COND(graph.packed_dim_of(mat1) == graph.packed_dim_of(out)); - - VK_CHECK_COND(utils::val_at(-1, mat1_sizes) == utils::val_at(-2, mat2_sizes)); - - if (utils::val_at(-1, self_sizes) != 1) { - VK_CHECK_COND( - utils::val_at(-1, self_sizes) == utils::val_at(-1, mat2_sizes)); - } - if (utils::val_at(-2, self_sizes) != 1) { - VK_CHECK_COND( - utils::val_at(-2, self_sizes) == utils::val_at(-2, mat1_sizes)); - } -} - -void resize_addmm_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mat1 = args.at(1).refs.at(0); - const ValueRef mat2 = args.at(1).refs.at(1); - - const bool mat2_is_transposed = graph->get_bool(extra_args.at(0)); - - const std::vector mat1_sizes = graph->sizes_of(mat1); - const std::vector mat2_sizes = graph->sizes_of(mat2); - - const int out_cols = utils::val_at(-2, mat1_sizes); - const int out_rows = mat2_is_transposed ? utils::val_at(-2, mat2_sizes) - : utils::val_at(-1, mat2_sizes); - - std::vector new_out_sizes(3); - if (mat1_sizes.size() == 2) { - new_out_sizes.resize(2); - new_out_sizes.at(0) = out_cols; - new_out_sizes.at(1) = out_rows; - } else { - new_out_sizes.at(0) = mat1_sizes.at(0); - new_out_sizes.at(1) = out_cols; - new_out_sizes.at(2) = out_rows; - } - - graph->virtual_resize(out, new_out_sizes); -} - -struct Params final { - float alpha; - float beta; -}; - -void add_addmm_naive_texture_node( - ComputeGraph& graph, - const ValueRef self_data, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef beta, - const ValueRef alpha, - const ValueRef out, - const Params& params, - const ValueRef mat2_is_transposed) { - utils::StorageType stype = graph.storage_type_of(out); - ValueRef self = prepack_standard( - graph, self_data, stype, utils::kWidthPacked, /*passthrough = */ true); - ValueRef mat2 = prepack_standard( - graph, mat2_data, stype, utils::kHeightPacked, /*passthrough = */ true); - - std::string kernel_name = - graph.get_bool(mat2_is_transposed) ? 
"linear_naive" : "addmm_naive"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - utils::uvec3 global_wg_size = graph.logical_limits_of(out); - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - addmm_naive_texture_global_wg_size, - pick_hw_square_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{mat1, mat2, self}, vkapi::kRead}}, - // Shader params buffers - { - graph.sizes_ubo(out), - graph.logical_limits_ubo(out), - graph.sizes_ubo(mat1), - graph.sizes_ubo(mat2), - graph.sizes_ubo(self), - graph.create_params_buffer(params), - }, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(mat1), - graph.hashed_layout_of(mat2), - graph.hashed_layout_of(self)}, - // Resize Args - {mat2_is_transposed}, - // Resizing Logic - resize_addmm_node)); -} - -void add_addmm_naive_buffer_node( - ComputeGraph& graph, - const ValueRef self_data, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef beta, - const ValueRef alpha, - const ValueRef out, - const Params& params, - const ValueRef mat2_is_transposed) { - (void)beta; - (void)alpha; - ValueRef mat2 = prepack_standard( - graph, - mat2_data, - graph.storage_type_of(out), - utils::kHeightPacked, - /*passthrough = */ true); - ValueRef self = prepack_standard( - graph, - self_data, - graph.storage_type_of(out), - utils::kWidthPacked, - /*passthrough = */ true); - - std::string kernel_name = "addmm_naive_buffer"; - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - utils::uvec3 global_size = { - graph.size_at(-1, out), - graph.size_at(-2, out), - graph.size_at(-3, out) * graph.size_at(-4, out)}; - - int mat2_is_transposed_val = (mat2_is_transposed != kDummyValueRef && - graph.get_bool(mat2_is_transposed)) - ? 
1 - : 0; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - addmm_naive_buffer_global_wg_size, - pick_hw_square_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{mat1, mat2, self}, vkapi::kRead}}, - // Shader params buffers - { - graph.sizes_ubo(out), - graph.strides_ubo(out), - graph.sizes_ubo(mat1), - graph.strides_ubo(mat1), - graph.sizes_ubo(mat2), - graph.strides_ubo(mat2), - graph.numel_ubo(out), - graph.create_params_buffer(params), - }, - // Push Constants - {}, - // Specialization Constants - {mat2_is_transposed_val}, - // Resize Args - {mat2_is_transposed}, - // Resizing Logic - resize_addmm_node)); -} - -void add_addmm_optimized_node( - ComputeGraph& graph, - const ValueRef self_data, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef beta, - const ValueRef alpha, - const ValueRef out, - const Params& params, - const ValueRef mat2_is_transposed) { - utils::StorageType stype = graph.storage_type_of(out); - ValueRef self = prepack_standard( - graph, self_data, stype, utils::kChannelsPacked, /*passthrough=*/true); - ValueRef mat2 = prepack_standard( - graph, mat2_data, stype, utils::kHeightPacked, /*passthrough=*/true); - - // Ensure mat1 is width packed - ValueRef mat1_W_packed = graph.add_tensor_like(mat1, utils::kWidthPacked); - auto viewFn = VK_GET_OP_FN("aten.view_copy.default"); - viewFn(graph, {mat1, graph.add_none(), mat1_W_packed}); - - const bool mat2_is_transposed_val = graph.get_bool(mat2_is_transposed); - - // Ensure mat2 is height packed - ValueRef mat2_packed = mat2; - const utils::GPUMemoryLayout mat2_layout = - mat2_is_transposed_val ? utils::kWidthPacked : utils::kHeightPacked; - if (graph.estimate_memory_layout_of(mat2) != mat2_layout) { - mat2_packed = graph.add_tensor_like(mat2, mat2_layout); - viewFn(graph, {mat2, graph.add_none(), mat2_packed}); - } - - std::string kernel_name = graph.get_bool(mat2_is_transposed) - ? 
"linear_optimized" - : "addmm_optimized"; - - std::vector mat1_sizes = graph.sizes_of(mat1_W_packed); - int mat1_dims = mat1_sizes.size(); - if (mat1_dims == 3) { - kernel_name = "batch_" + kernel_name; - } - if (mat1_sizes.at(mat1_dims - 2) < 8) { - kernel_name += "_tile_row_2"; - } else { - kernel_name += "_tile_row_4"; - } - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - addmm_optimized_global_wg_size, - addmm_optimized_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, - {{mat1_W_packed, mat2_packed, self}, vkapi::kRead}}, - // Shader params buffers - { - graph.sizes_ubo(out), - graph.sizes_ubo(mat1_W_packed), - graph.sizes_ubo(mat2_packed), - graph.sizes_ubo(self), - graph.create_params_buffer(params), - }, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(mat1_W_packed), - graph.hashed_layout_of(mat2_packed), - graph.hashed_layout_of(self)}, - // Resize Args - {mat2_is_transposed}, - // Resizing Logic - resize_addmm_node)); -} - -void add_addmm_node( - ComputeGraph& graph, - const ValueRef self, - const ValueRef mat1, - const ValueRef mat2, - const ValueRef beta, - const ValueRef alpha, - const ValueRef out, - const ValueRef mat2_is_transposed) { - float alpha_val = 1.0f; - float beta_val = 1.0f; - - if (alpha != kDummyValueRef) { - alpha_val = graph.extract_scalar(alpha); - } - if (beta != kDummyValueRef) { - beta_val = graph.extract_scalar(beta); - } - - Params params = {alpha_val, beta_val}; - if (graph.is_buffer_storage(out)) { - add_addmm_naive_buffer_node( - graph, self, mat1, mat2, beta, alpha, out, params, mat2_is_transposed); - } else if (graph.packed_dim_of(mat1) == WHCN::kChannelsDim) { - add_addmm_optimized_node( - graph, self, mat1, mat2, beta, alpha, out, params, mat2_is_transposed); - } else if (graph.packed_dim_of(mat1) == WHCN::kWidthDim) { - add_addmm_naive_texture_node( - graph, self, mat1, mat2, beta, alpha, out, params, mat2_is_transposed); - } else { - VK_THROW("Input should be channel packed or width packed."); - } -} - -void addmm(ComputeGraph& graph, const std::vector& args) { - check_addmm_args(graph, args[0], args[1], args[2], args[3], args[4], args[5]); - ValueRef mat2_is_transposed = graph.add_scalar(false); - return add_addmm_node( - graph, - args[0], - args[1], - args[2], - args[3], - args[4], - args[5], - mat2_is_transposed); -} - -void linear(ComputeGraph& graph, const std::vector& args) { - ValueRef input = args.at(0); - ValueRef weight_data = args.at(1); - ValueRef bias = args.at(2); - ValueRef out = args.at(3); - ValueRef weight = prepack_standard( - graph, - weight_data, - graph.storage_type_of(out), - utils::kWidthPacked, - /*passthrough = */ true); - ValueRef mat2_is_transposed = graph.add_scalar(true); - - if (graph.val_is_none(bias)) { - return add_matmul_node(graph, input, weight, out, mat2_is_transposed); - } else { - return add_addmm_node( - graph, - bias, - input, - weight, - kDummyValueRef, - kDummyValueRef, - out, - mat2_is_transposed); - } -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.addmm.default, addmm); - VK_REGISTER_OP(aten.linear.default, linear); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp deleted file mode 100644 index 47ecf5f18d2..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp +++ /dev/null @@ -1,327 +0,0 @@ -/* - * 
Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include -#include - -#include -#include - -#include - -namespace vkcompute { - -void check_matmul_args( - const ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef out) { - std::vector mat1_sizes = graph.sizes_of(mat1); - std::vector mat2_sizes = graph.sizes_of(mat2_data); - - VK_CHECK_COND(mat1_sizes.size() == 2 || mat1_sizes.size() == 3); - VK_CHECK_COND(mat1_sizes.size() == mat2_sizes.size()); - - VK_CHECK_COND(graph.packed_dim_of(mat1) == graph.packed_dim_of(out)); - - VK_CHECK_COND(utils::val_at(-1, mat1_sizes) == utils::val_at(-2, mat2_sizes)); -} - -void resize_matmul_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mat1 = args.at(1).refs.at(0); - const ValueRef mat2 = args.at(1).refs.at(1); - - bool mat2_is_transposed = graph->get_bool(resize_args.at(0)); - - const std::vector mat1_sizes = graph->sizes_of(mat1); - const std::vector mat2_sizes = graph->sizes_of(mat2); - - const int out_cols = utils::val_at(-2, mat1_sizes); - const int out_rows = mat2_is_transposed ? utils::val_at(-2, mat2_sizes) - : utils::val_at(-1, mat2_sizes); - - const int64_t out_dim = graph->dim_of(out); - std::vector new_out_sizes(mat1_sizes); - new_out_sizes.at(out_dim - 1) = out_rows; - new_out_sizes.at(out_dim - 2) = out_cols; - - graph->virtual_resize(out, new_out_sizes); -} - -/** - * Custom global workgroup size function for naive buffer matmul operations. - */ -utils::uvec3 matmul_naive_buffer_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - return { - graph->size_at(-1, out), - graph->size_at(-2, out), - graph->size_at(-3, out) * graph->size_at(-4, out)}; -} - -void add_matmul_naive_buffer_node( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef out, - const ValueRef mat2_is_transposed) { - ValueRef mat2 = prepack_standard( - graph, - mat2_data, - graph.storage_type_of(out), - utils::kHeightPacked, - /*passthrough = */ true); - - std::string kernel_name = "matmul_naive_buffer"; - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - int mat2_is_transposed_val = (mat2_is_transposed != kDummyValueRef && - graph.get_bool(mat2_is_transposed)) - ? 
1 - : 0; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - matmul_naive_buffer_global_wg_size, - pick_hw_square_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{mat1, mat2}, vkapi::kRead}}, - // Shader params buffers - { - graph.sizes_ubo(out), - graph.strides_ubo(out), - graph.sizes_ubo(mat1), - graph.strides_ubo(mat1), - graph.sizes_ubo(mat2), - graph.strides_ubo(mat2), - graph.numel_ubo(out), - }, - // Push Constants - {}, - // Specialization Constants - {mat2_is_transposed_val}, - // Resize Args - {mat2_is_transposed}, - // Resizing Logic - resize_matmul_node)); -} - -vkapi::ShaderInfo pick_matmul_naive_texture3d_shader( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef out = args.at(0).refs.at(0); - const bool is_transposed = graph->get_bool(resize_args.at(0)); - - std::string kernel_name = - is_transposed ? "matmul_transposed_naive" : "matmul_naive"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph->storage_type_of(out)); - add_dtype_suffix(kernel_name, graph->dtype_of(out)); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -void add_matmul_naive_texture3d_node( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef out, - const ValueRef mat2_is_transposed) { - ValueRef mat2 = prepack_standard( - graph, - mat2_data, - graph.storage_type_of(out), - utils::kHeightPacked, - /*passthrough = */ true); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - pick_matmul_naive_texture3d_shader, - default_pick_global_wg_size, - pick_hw_square_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{mat1, mat2}, vkapi::kRead}}, - // Shader params buffers - { - graph.sizes_ubo(out), - graph.logical_limits_ubo(out), - graph.sizes_ubo(mat1), - graph.sizes_ubo(mat2), - }, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(mat1), - graph.hashed_layout_of(mat2)}, - // Resize Args - {mat2_is_transposed}, - // Resizing Logic - resize_matmul_node)); -} - -vkapi::ShaderInfo pick_matmul_optimized_shader( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mat1_W_packed = resize_args.at(1); - const bool mat2_is_transposed_val = graph->get_bool(resize_args.at(0)); - - std::string kernel_name = mat2_is_transposed_val - ? 
"matmul_transposed_optimized" - : "matmul_optimized"; - - std::vector mat1_sizes = graph->sizes_of(mat1_W_packed); - size_t mat1_dims = mat1_sizes.size(); - if (mat1_dims == 3) { - kernel_name = "batch_" + kernel_name; - } - if (mat1_sizes.at(mat1_dims - 2) < 8) { - kernel_name += "_tile_row_2"; - } else { - kernel_name += "_tile_row_4"; - } - - add_dtype_suffix(kernel_name, graph->dtype_of(out)); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -utils::uvec3 matmul_optimized_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mat1_W_packed = resize_args.at(1); - - const std::vector mat1_sizes = graph->sizes_of(mat1_W_packed); - const size_t mat1_dims = mat1_sizes.size(); - - utils::uvec3 global_size = graph->logical_limits_of(out); - if (mat1_sizes.at(mat1_dims - 2) < 8) { - // Use `logical_extents` instead of `image_extents` because the workgroup - // axes need to correspond to tensor dimensions. - global_size = utils::divup_vec(global_size, {4, 2, 1}); - } else { - global_size = utils::divup_vec(global_size, {4, 4, 1}); - } - - return global_size; -} - -void add_matmul_optimized_node( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef out, - const ValueRef mat2_is_transposed) { - ValueRef mat2 = prepack_standard( - graph, - mat2_data, - graph.storage_type_of(out), - utils::kHeightPacked, - /*passthrough = */ true); - - // Ensure mat1 is width packed - TmpTensor mat1_tmp( - &graph, graph.sizes_of(mat1), graph.dtype_of(mat1), utils::kWidthPacked); - ValueRef mat1_W_packed = mat1; - auto viewFn = VK_GET_OP_FN("aten.view_copy.default"); - if (graph.packed_dim_of(mat1) != WHCN::kWidthDim) { - mat1_W_packed = mat1_tmp; - viewFn(graph, {mat1, graph.add_none(), mat1_W_packed}); - } - - const bool mat2_is_transposed_val = graph.get_bool(mat2_is_transposed); - - // Ensure mat2 to height packed - ValueRef mat2_packed = mat2; - const utils::GPUMemoryLayout mat2_layout = - mat2_is_transposed_val ? 
utils::kWidthPacked : utils::kHeightPacked; - TmpTensor mat2_tmp( - &graph, graph.sizes_of(mat2), graph.dtype_of(mat2), mat2_layout); - if (graph.estimate_memory_layout_of(mat2) != mat2_layout) { - mat2_packed = mat2_tmp; - viewFn(graph, {mat2, graph.add_none(), mat2_packed}); - } - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - pick_matmul_optimized_shader, - matmul_optimized_global_wg_size, - pick_hw_square_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{mat1_W_packed, mat2_packed}, vkapi::kRead}}, - // Shader params buffers - { - graph.sizes_ubo(out), - graph.sizes_ubo(mat1_W_packed), - graph.sizes_ubo(mat2_packed), - }, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(mat1_W_packed), - graph.hashed_layout_of(mat2_packed)}, - // Resize Args - {mat2_is_transposed, mat1_W_packed}, - // Resizing Logic - resize_matmul_node)); -} - -void add_matmul_node( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef out, - const ValueRef mat2_is_transposed) { - if (graph.is_buffer_storage(out)) { - add_matmul_naive_buffer_node( - graph, mat1, mat2_data, out, mat2_is_transposed); - } else if (graph.packed_dim_of(mat1) == WHCN::kChannelsDim) { - add_matmul_optimized_node(graph, mat1, mat2_data, out, mat2_is_transposed); - } else if (graph.packed_dim_of(mat1) == WHCN::kWidthDim) { - add_matmul_naive_texture3d_node( - graph, mat1, mat2_data, out, mat2_is_transposed); - } else { - VK_THROW("Input texture should be channel packed or width packed."); - } -} - -void matmul(ComputeGraph& graph, const std::vector& args) { - check_matmul_args(graph, args[0], args[1], args[2]); - const ValueRef mat2_is_transposed = graph.add_scalar(false); - return add_matmul_node(graph, args[0], args[1], args[2], mat2_is_transposed); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.mm.default, matmul); - VK_REGISTER_OP(aten.bmm.default, matmul); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.h b/backends/vulkan/runtime/graph/ops/impl/MatMul.h deleted file mode 100644 index 38f7907f1b6..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/MatMul.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace vkcompute { - -void add_matmul_node( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef out, - const ValueRef mat2_is_transposed); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp b/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp deleted file mode 100644 index 8e15b56b208..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include - -#include -#include - -#include - -namespace vkcompute { - -std::vector calc_out_mean_sizes( - const std::vector& self_sizes, - int64_t normalized_shape_dim) { - std::vector output_size = self_sizes; - int64_t self_dim = self_sizes.size(); - for (int64_t i = 0; i < normalized_shape_dim; ++i) { - output_size.at(self_dim - i - 1) = 1; - } - return output_size; -} - -void resize_native_layer_norm_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mean = args.at(0).refs.at(1); - const ValueRef rstd = args.at(0).refs.at(2); - const ValueRef in = args.at(1).refs.at(0); - const std::vector in_sizes = graph->sizes_of(in); - - const auto normalized_shape_dim = - graph->get_int_list(extra_args.at(0))->size(); - - const std::vector mean_size = - calc_out_mean_sizes(in_sizes, normalized_shape_dim); - - graph->virtual_resize(out, in_sizes); - graph->virtual_resize(mean, mean_size); - graph->virtual_resize(rstd, mean_size); -} - -void add_native_layer_norm_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef normalized_shape, - const ValueRef weight_data, - const ValueRef bias_data, - const ValueRef eps, - const ValueRef out) { - const auto normalized_shape_dim = - graph.get_int_list(normalized_shape)->size(); - if (normalized_shape_dim > 1) { - VK_THROW("native_layer_norm only supports normalized_shape with dim == 1"); - } - - if (graph.val_is_none(weight_data)) { - VK_THROW("native_layer_norm requires weight to be non-None"); - } - - if (graph.val_is_none(bias_data)) { - VK_THROW("native_layer_norm requires bias to be non-None"); - } - - ValueRef arg_weight = prepack_standard_like(graph, weight_data, in); - ValueRef arg_bias = prepack_standard_like(graph, bias_data, in); - - const auto out_val = graph.get_value_list(out); - const ValueRef out_tensor = out_val->at(0); - const ValueRef mean_tensor = out_val->at(1); - const ValueRef rstd_tensor = out_val->at(2); - - float epsilon = graph.extract_scalar(eps); - - VK_CHECK_COND(check_same_packed_dim(graph, in, out_tensor)); - - const std::vector in_sizes = graph.sizes_of(in); - - utils::uvec3 global_size = graph.logical_limits_of(out_tensor); - utils::uvec3 local_size; - - // The shader sets a shared memory scale factor > 1 when the dispatch is - // larger than the maximum WG size. Setting the WG size in the X axis to the - // maximum WG size gives the best thread utilization. - if (global_size[0] > 64) { - local_size = {64, 1, 1}; - } else { - // If the thread count in the X axis is smaller than or equal to the maximum - // WG size, we can let the function decide the best WG size.
- local_size = graph.create_local_wg_size(global_size); - } - - std::string kernel_name("native_layer_norm"); - kernel_name.reserve(kShaderNameReserve); - - add_dtype_suffix(kernel_name, graph.dtype_of(out_tensor)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{{out_tensor, mean_tensor, rstd_tensor}, vkapi::kWrite}, - {{in, arg_weight, arg_bias}, vkapi::kRead}}, - // Shader params buffers - {}, - // Push Constants - { - graph.logical_limits_pc_of(out_tensor), - graph.sizes_pc_of(out_tensor), - PushConstantDataInfo(&epsilon, sizeof(epsilon)), - }, - // Specialization Constants - { - graph.hashed_layout_of(in), - graph.hashed_layout_of(out_tensor), - }, - // Resize Args - {normalized_shape}, - // Resizing Logic - resize_native_layer_norm_node)); -} - -void native_layer_norm(ComputeGraph& graph, const std::vector& args) { - return add_native_layer_norm_node( - graph, args[0], args[1], args[2], args[3], args[4], args[5]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.native_layer_norm.default, native_layer_norm); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Pad.cpp b/backends/vulkan/runtime/graph/ops/impl/Pad.cpp deleted file mode 100644 index d225af05633..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Pad.cpp +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include -#include - -#include - -namespace vkcompute { - -struct PadParam final { - int32_t left; - int32_t top; - int32_t front; -}; - -PadParam creat_pad_param(const std::vector& pad) { - if (pad.size() == 2) { - return PadParam{static_cast(pad[0]), 0, 0}; - } else if (pad.size() == 4) { - return PadParam{ - static_cast(pad[0]), static_cast(pad[2]), 0}; - } else if (pad.size() == 6) { - return PadParam{ - static_cast(pad[0]), - static_cast(pad[2]), - static_cast(pad[4])}; - } else { - VK_THROW("invalid pad form"); - } -} - -void resize_constant_pad_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef self = args.at(1).refs.at(0); - const IntListPtr pad_vec = graph->get_int_list(extra_args.at(0)); - std::vector in_size = graph->sizes_of(self); - int dim = in_size.size() - 1; - for (int i = 0; i < pad_vec->size(); i += 2) { - in_size.at(dim) += pad_vec->at(i) + pad_vec->at(i + 1); - dim--; - } - - graph->virtual_resize(out, in_size); -} - -void add_constant_pad_nd_node( - ComputeGraph& graph, - const ValueRef& in, - const ValueRef& pad, - const ValueRef& fill_value, - const ValueRef& out) { - const float fill_value_val = graph.extract_scalar(fill_value); - const IntListPtr pad_vec = graph.get_int_list(pad); - - std::string kernel_name = ""; - const PadParam pad_param = creat_pad_param(*pad_vec); - - if (pad_vec->size() <= 4) { - kernel_name = "pad_height_width"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - } else { - kernel_name = "pad_channel"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - } - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - 
default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - {graph.sizes_ubo(out), - graph.sizes_ubo(in), - graph.create_params_buffer(pad_param), - graph.create_params_buffer(fill_value_val)}, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {pad}, - // Resizing Logic - resize_constant_pad_node)); -} - -void constant_pad_nd(ComputeGraph& graph, const std::vector& args) { - return add_constant_pad_nd_node(graph, args[0], args[1], args[2], args[3]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.constant_pad_nd.default, constant_pad_nd); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp b/backends/vulkan/runtime/graph/ops/impl/Permute.cpp deleted file mode 100644 index 9ac4c963bc3..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Permute.cpp +++ /dev/null @@ -1,251 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include -#include -#include -#include - -namespace vkcompute { - -using utils::ivec2; -using utils::ivec3; -using utils::ivec4; -using utils::uvec4; - -namespace { - -void check_args( - ComputeGraph& graph, - const ValueRef in, - const ValueRef permute_dims, - const ValueRef out) { - (void)permute_dims; - VK_CHECK_COND(check_same_packed_dim(graph, in, out)); - - // This implementation does not require the input tensor to have the same - // dim size as the argument. The code will work as long as the input tensor's - // dim size is shorter than the permute dim array. In this case, the code - // assumes a size of 1 at the higher dimensions.
-} - -} // namespace - -void resize_permute_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef out = args[0].refs[0]; - const ValueRef in = args[1].refs[0]; - - const std::vector in_sizes = graph->sizes_of(in); - const std::vector out_sizes = graph->sizes_of(out); - - const std::vector permute_dims = - graph->extract_int_or_symint_list(resize_args[0]); - - if (in_sizes.size() == out_sizes.size() && - in_sizes.size() == permute_dims.size()) { - std::vector new_out_sizes(out_sizes.size(), 1); - const int64_t out_ndim = std::max(in_sizes.size(), out_sizes.size()); - for (int i = 0; i < out_ndim; i++) { - const int64_t permute_dim = permute_dims.at(i); - new_out_sizes.at(i) = in_sizes.at(permute_dim); - } - graph->virtual_resize(out, new_out_sizes); - } - // Case where permute is being used to implement squeeze - else if ( - in_sizes.size() > out_sizes.size() && - in_sizes.size() == permute_dims.size()) { - std::vector new_out_sizes(out_sizes.size(), 1); - const size_t offset = in_sizes.size() - out_sizes.size(); - for (int i = 0; i < out_sizes.size(); i++) { - const int64_t permute_dim = permute_dims.at(i + offset); - new_out_sizes.at(i) = in_sizes.at(permute_dim); - } - graph->virtual_resize(out, new_out_sizes); - } - // Case where Permute is being used to implement unsqueeze - else if ( - in_sizes.size() < out_sizes.size() && - out_sizes.size() == permute_dims.size()) { - std::vector new_out_sizes(out_sizes.size(), 1); - const size_t offset = out_sizes.size() - in_sizes.size(); - for (int i = 0; i < out_sizes.size(); i++) { - int64_t permute_dim = permute_dims.at(i) - offset; - if (permute_dim >= 0) { - new_out_sizes.at(i) = in_sizes.at(permute_dim); - } - } - graph->virtual_resize(out, new_out_sizes); - } else { - VK_THROW("Invalid permute dims"); - } -} - -void add_permute_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef permute_dims, - const ValueRef out) { - check_args(graph, in, permute_dims, out); - - // Convert the permute dims to WHCN dimension order, which is the standard in - // our compute shaders. The following transformations are applied. - // 1. Change dimension index values from NCHW order valueto WHCN order value - // 2. 
Reverse the order of the permute array from NCHW order to WHCN order - ivec4 whcn_permute_dims{0, 1, 2, 3}; - { - IntListPtr permute_dims_ptr = graph.get_int_list(permute_dims); - const int32_t permute_ndim = - utils::safe_downcast(permute_dims_ptr->size()); - - for (int32_t nchw_i = permute_ndim - 1, whcn_i = 0; nchw_i >= 0; - nchw_i--, whcn_i++) { - const int32_t permute_dim_nchw = permute_dims_ptr->at(nchw_i); - const int32_t permute_dim_whcn = permute_ndim - 1 - permute_dim_nchw; - - whcn_permute_dims[whcn_i] = permute_dim_whcn; - } - } - - std::string kernel_name = "permute"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - vkapi::ParamsBindList param_buffers; - std::vector push_constants; - vkapi::SpecVarList spec_vars; - - const int32_t out_channels = dim_at(graph.sizes_of(out)); - const int32_t in_channels = dim_at(graph.sizes_of(in)); - - const int32_t packed_dim = graph.packed_dim_of(in); - ivec2 channel_info = {out_channels, in_channels}; - if (packed_dim == WHCN::kChannelsDim) { - channel_info[0] = utils::align_up_4(channel_info[0]); - channel_info[1] = utils::align_up_4(channel_info[1]); - } - - push_constants = { - graph.sizes_pc_of(out), - graph.sizes_pc_of(in), - PushConstantDataInfo(&whcn_permute_dims, sizeof(whcn_permute_dims))}; - - spec_vars = {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Parameter buffers - param_buffers, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - {permute_dims}, - // Resizing Logic - resize_permute_node)); -} - -struct WHCNPermuteDims { - int32_t whcn_permute_dims[api::kTensorDimLimit]; - - void initialize(const std::vector& permute_dims) { - const int32_t permute_ndim = permute_dims.size(); - for (int32_t whcn_i = 0; whcn_i < permute_ndim; whcn_i++) { - const int32_t nchw_i = permute_ndim - 1 - whcn_i; - int64_t index_val = permute_dims.at(nchw_i); - if (index_val < 0) { - index_val += permute_ndim; - } - const int32_t permute_dim_whcn = permute_ndim - 1 - index_val; - whcn_permute_dims[whcn_i] = permute_dim_whcn; - } - for (int32_t whcn_i = permute_ndim; whcn_i < api::kTensorDimLimit; - whcn_i++) { - whcn_permute_dims[whcn_i] = whcn_i; - } - } -}; - -void add_permute_buffer_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef permute_dims, - const ValueRef out) { - check_args(graph, in, permute_dims, out); - - WHCNPermuteDims whcn_permute_dims; - // Convert the permute dims to WHCN dimension order, which is the standard in - // our compute shaders. The following transformations are applied. - // 1. Change dimension index values from NCHW order valueto WHCN order value - // 2. 
Extend the permute array to kTensorDimLimit - { - IntListPtr permute_dims_ptr = graph.get_int_list(permute_dims); - whcn_permute_dims.initialize(*permute_dims_ptr); - } - - std::string kernel_name = "permute"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - vkapi::ParamsBindList param_buffers = { - graph.buffer_meta_ubo(out), - graph.buffer_meta_ubo(in), - graph.create_params_buffer(whcn_permute_dims), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Parameter buffers - param_buffers, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {permute_dims}, - // Resizing Logic - resize_permute_node)); -} - -void permute(ComputeGraph& graph, const std::vector& args) { - int idx = 0; - const ValueRef in = args.at(idx++); - const ValueRef permute_dims = args.at(idx++); - const ValueRef out = args.at(idx++); - - if (graph.is_buffer_storage(args[2])) { - return add_permute_buffer_node(graph, in, permute_dims, out); - } - return add_permute_node(graph, in, permute_dims, out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.permute.default, permute); - VK_REGISTER_OP(aten.permute_copy.default, permute); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Permute.h b/backends/vulkan/runtime/graph/ops/impl/Permute.h deleted file mode 100644 index 0f17a4a26b0..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Permute.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -#include - -namespace vkcompute { - -void add_permute_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef permute_dims, - const ValueRef out); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp b/backends/vulkan/runtime/graph/ops/impl/Pool.cpp deleted file mode 100644 index 250fcdd5490..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Pool.cpp +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include - -#include -#include - -#include - -namespace vkcompute { - -void check_pool2d_args( - ComputeGraph& graph, - const ValueRef in, - const ValueRef out) { - VK_CHECK_COND(graph.packed_dim_of(in) == WHCN::kChannelsDim); - VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kChannelsDim); -} - -void resize_pool2d_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - bool is_max_pool2d = extra_args.at(3) != kDummyValueRef; - - const ValueRef out = args.at(0).refs.at(0); - const ValueRef self = args.at(1).refs.at(0); - - const std::vector self_sizes = graph->sizes_of(self); - size_t ndim = self_sizes.size(); - std::vector new_out_sizes(ndim); - - // Batch, Channel - if (ndim == 4) { - new_out_sizes.at(ndim - 4) = self_sizes.at(ndim - 4); - } - new_out_sizes.at(ndim - 3) = self_sizes.at(ndim - 3); - - // Height, Width - const auto& new_out_sizes_hw = calc_out_sizes_hw( - *graph, - self_sizes, - extra_args.at(0), - /*kernel_size_only = */ true, - {extra_args.at(1), extra_args.at(2), extra_args.at(3), extra_args.at(4)}); - new_out_sizes.at(ndim - 2) = new_out_sizes_hw.at(0); - new_out_sizes.at(ndim - 1) = new_out_sizes_hw.at(1); - - graph->virtual_resize(out, new_out_sizes); - - if (is_max_pool2d) { - const ValueRef indices = args.at(0).refs.at(1); - graph->virtual_resize(indices, new_out_sizes); - } -} - -// -// max_pool2d -// - -void add_max_pool2d_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef kernel_size, - const ValueRef stride, - const ValueRef padding, - const ValueRef dilation, - const ValueRef ceil_mode, - const ValueRef out) { - ValueRef out_tensor = out; - // Placeholder tensor to fill binding slot for indices tensor in case we are - // computing max_pool2d instead of max_pool2d_with_indices. 
- TmpTensor tmp_indices_tensor = - TmpTensor(&graph, {}, graph.dtype_of(in), graph.storage_type_of(in)); - ValueRef indices_tensor = tmp_indices_tensor.vref; - int32_t write_indices = 0; - if (graph.val_is_value_list(out)) { - const auto out_val = graph.get_value_list(out); - out_tensor = out_val->at(0); - indices_tensor = out_val->at(1); - write_indices = 1; - } - - check_pool2d_args(graph, in, out_tensor); - - std::string kernel_name("max_pool2d"); - add_dtype_suffix(kernel_name, graph.dtype_of(out_tensor)); - - Kernel2dParams kernel_params = create_kernel2d_params( - graph, - kernel_size, - /*kernel_size_only = */ true, - stride, - padding, - dilation); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{{out_tensor, indices_tensor}, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - { - graph.logical_limits_ubo(out_tensor), - graph.sizes_ubo(in), - graph.create_params_buffer(kernel_params), - }, - // Push Constants - {}, - // Specialization Constants - {write_indices}, - // Resize Args - {kernel_size, stride, padding, dilation, ceil_mode}, - // Resizing Logic - resize_pool2d_node)); -} - -void max_pool2d(ComputeGraph& graph, const std::vector& args) { - return add_max_pool2d_node( - graph, args[0], args[1], args[2], args[3], args[4], args[5], args[6]); -} - -// -// avg_pool2d -// - -struct DivisorParams final { - int32_t divisor_override; - bool count_include_pad; -}; - -DivisorParams create_divisor_params( - ComputeGraph& graph, - const ValueRef divisor_override, - const ValueRef count_include_pad) { - return { - graph.val_is_int(divisor_override) - ? static_cast(graph.get_int(divisor_override)) - : 0, - graph.get_bool(count_include_pad)}; -} - -void add_avg_pool2d_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef kernel_size, - const ValueRef stride, - const ValueRef padding, - const ValueRef ceil_mode, - const ValueRef count_include_pad, - const ValueRef divisor_override, - const ValueRef out) { - check_pool2d_args(graph, in, out); - - std::string kernel_name("avg_pool2d"); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - Kernel2dParams kernel_params = - create_kernel2d_params(graph, kernel_size, stride, padding); - - DivisorParams divisor_params = - create_divisor_params(graph, divisor_override, count_include_pad); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - {graph.logical_limits_ubo(out), - graph.sizes_ubo(in), - graph.create_params_buffer(kernel_params), - graph.create_params_buffer(divisor_params)}, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {kernel_size, - stride, - padding, - /*dilation= */ kDummyValueRef, - ceil_mode}, - // Resizing Logic - resize_pool2d_node)); -} - -void avg_pool2d(ComputeGraph& graph, const std::vector& args) { - return add_avg_pool2d_node( - graph, - args[0], - args[1], - args[2], - args[3], - args[4], - args[5], - args[6], - args[7]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.avg_pool2d.default, avg_pool2d); - VK_REGISTER_OP(aten.max_pool2d_with_indices.default, max_pool2d); - VK_REGISTER_OP(aten.max_pool2d.default, max_pool2d); -} - -} // namespace vkcompute diff --git 
a/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp b/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp deleted file mode 100644 index 88f77261f4f..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Quantize.cpp +++ /dev/null @@ -1,836 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include - -#include -#include - -#include - -namespace vkcompute { - -void resize_quantize_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - - const std::vector in_sizes = graph->sizes_of(in); - graph->virtual_resize(out, in_sizes); -} - -utils::uvec3 quantize_per_channel_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)args; - (void)resize_args; - - const ValueRef input = args.at(1).refs.at(0); - - utils::uvec3 local_wg_size = - graph->create_local_wg_size(global_workgroup_size); - - // WORKAROUND: The CommandBuffer::dispatch function divides - // global_workgroup_size by local_workgroup_size to get the number of - // workgroups to dispatch. For per-channel quantization along the batch axis, - // we need to ensure that we dispatch the correct number of workgroups in the - // Z dimension to cover all batch-channel combinations. - // - // If local_wg_size[2] > 1, then div_up(global_workgroup_size[2], - // local_wg_size[2]) might reduce the number of workgroups dispatched. To - // ensure we dispatch global_workgroup_size[2] workgroups in the Z dimension, - // we set local_wg_size[2] = 1. - const auto input_sizes = graph->sizes_of(input); - if (input_sizes.size() == 4 && !graph->is_buffer_storage(input) && - global_workgroup_size[2] > 1) { - local_wg_size[2] = 1; - } - - return local_wg_size; -} - -utils::uvec3 quantize_block_wise_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef input = args.at(1).refs.at(0); - - utils::uvec3 local_wg_size = - graph->create_local_wg_size(global_workgroup_size); - - // WORKAROUND: The CommandBuffer::dispatch function divides - // global_workgroup_size by local_workgroup_size to get the number of - // workgroups to dispatch. For per-channel quantization along the batch axis, - // we need to ensure that we dispatch the correct number of workgroups in the - // Z dimension to cover all batch-channel combinations. - // - // If local_wg_size[2] > 1, then div_up(global_workgroup_size[2], - // local_wg_size[2]) might reduce the number of workgroups dispatched. To - // ensure we dispatch global_workgroup_size[2] workgroups in the Z dimension, - // we set local_wg_size[2] = 1. 
- const auto input_sizes = graph->sizes_of(input); - if (input_sizes.size() == 4 && !graph->is_buffer_storage(input) && - global_workgroup_size[2] > 1) { - local_wg_size[2] = 1; - } - - return local_wg_size; -} - -void add_quantize_per_tensor_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& scale, - const ValueRef& zero_point, - const ValueRef& quant_min, - const ValueRef& quant_max, - const ValueRef& output) { - std::string kernel_name("quantize_per_tensor"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(output)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale)); - add_dtype_suffix(kernel_name, graph.dtype_of(zero_point)); - - // Handle optional quant_min and quant_max parameters independently - auto bounds = get_dtype_bounds(graph.dtype_of(output)); - - int quant_min_val, quant_max_val; - - // Handle quant_min - if (graph.val_is_none(quant_min)) { - quant_min_val = bounds.first; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_min), - "quant_min must be an integer, got type: ", - graph.get_val_type(quant_min)); - quant_min_val = static_cast(graph.get_int(quant_min)); - } - - // Handle quant_max - if (graph.val_is_none(quant_max)) { - quant_max_val = bounds.second; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_max), - "quant_max must be an integer, got type: ", - graph.get_val_type(quant_max)); - quant_max_val = static_cast(graph.get_int(quant_max)); - } - - vkapi::ParamsBindList param_ubos; - std::vector push_constants; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.numel_ubo(input), - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(output), - graph.strides_ubo(output)}; - } else { - param_ubos = { - graph.logical_limits_ubo(input), graph.logical_limits_ubo(output)}; - } - - push_constants = { - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - - vkapi::SpecVarList spec_vars = { - graph.hashed_layout_of(output), - graph.hashed_layout_of(input), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, - {input, vkapi::kRead}, - {{scale, zero_point}, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - {}, - // Resizing Logic - resize_quantize_node)); -} - -void add_quantize_per_token_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& scale, - const ValueRef& zero_point, - const ValueRef& quant_min, - const ValueRef& quant_max, - const ValueRef& output) { - std::string kernel_name("quantize_per_token"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(output)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale)); - add_dtype_suffix(kernel_name, graph.dtype_of(zero_point)); - - // Handle optional quant_min and quant_max parameters independently - auto bounds = get_dtype_bounds(graph.dtype_of(output)); - - int quant_min_val, quant_max_val; - - // Handle quant_min - if (graph.val_is_none(quant_min)) { - quant_min_val = bounds.first; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_min), - "quant_min must be an 
integer, got type: ", - graph.get_val_type(quant_min)); - quant_min_val = static_cast(graph.get_int(quant_min)); - } - - // Handle quant_max - if (graph.val_is_none(quant_max)) { - quant_max_val = bounds.second; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_max), - "quant_max must be an integer, got type: ", - graph.get_val_type(quant_max)); - quant_max_val = static_cast(graph.get_int(quant_max)); - } - - int num_tokens = static_cast(graph.sizes_of(scale)[0]); - - vkapi::ParamsBindList param_ubos; - std::vector push_constants; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.numel_ubo(input), - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(output), - graph.strides_ubo(output), - }; - push_constants = { - PushConstantDataInfo(&num_tokens, sizeof(int)), - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - } else { - param_ubos = { - graph.logical_limits_ubo(input), - graph.logical_limits_ubo(output), - }; - push_constants = { - PushConstantDataInfo(&num_tokens, sizeof(int)), - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - } - - vkapi::SpecVarList spec_vars = { - graph.hashed_layout_of(output), - graph.hashed_layout_of(input), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, - {input, vkapi::kRead}, - {{scale, zero_point}, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - {}, - // Resizing Logic - resize_quantize_node)); -} - -void add_quantize_per_channel_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& scale, - const ValueRef& zero_point, - const ValueRef& axis, - const ValueRef& quant_min, - const ValueRef& quant_max, - const ValueRef& output) { - std::string kernel_name("quantize_per_channel"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(output)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale)); - add_dtype_suffix(kernel_name, graph.dtype_of(zero_point)); - - int axis_val = static_cast(graph.get_int(axis)); - - // Handle optional quant_min and quant_max parameters independently - auto bounds = get_dtype_bounds(graph.dtype_of(output)); - - int quant_min_val, quant_max_val; - - // Handle quant_min - if (graph.val_is_none(quant_min)) { - quant_min_val = bounds.first; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_min), - "quant_min must be an integer, got type: ", - graph.get_val_type(quant_min)); - quant_min_val = static_cast(graph.get_int(quant_min)); - } - - // Handle quant_max - if (graph.val_is_none(quant_max)) { - quant_max_val = bounds.second; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_max), - "quant_max must be an integer, got type: ", - graph.get_val_type(quant_max)); - quant_max_val = static_cast(graph.get_int(quant_max)); - } - - // Normalize axis and convert from NCHW to WHCN using utility functions - const auto input_sizes = graph.sizes_of(input); - const int64_t ndim = graph.dim_of(input); - - // Normalize axis to handle negative indices - axis_val = normalize(axis_val, ndim); - - // Convert from NCHW axis to WHCN axis for shader (vulkan representation) - 
int axis_whcn = nchw_dim_to_whcn_dim(axis_val, ndim); - - int num_channels; - if (axis_val == 0 && ndim == 4 && !graph.is_buffer_storage(input)) { - // For batch dimension quantization in 4D tensors, pass the actual number of - // channels so the shader can correctly unfold the batch-channel folding - num_channels = static_cast(input_sizes[1]); // Channel dimension - } else { - num_channels = static_cast(input_sizes[axis_val]); - } - - vkapi::ParamsBindList param_ubos; - std::vector push_constants; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.numel_ubo(input), - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(output), - graph.strides_ubo(output), - }; - push_constants = { - PushConstantDataInfo(&axis_whcn, sizeof(int)), - PushConstantDataInfo(&num_channels, sizeof(int)), - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - } else { - param_ubos = { - graph.logical_limits_ubo(input), - graph.logical_limits_ubo(output), - }; - push_constants = { - PushConstantDataInfo(&axis_whcn, sizeof(int)), - PushConstantDataInfo(&num_channels, sizeof(int)), - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - } - - vkapi::SpecVarList spec_vars = { - graph.hashed_layout_of(output), - graph.hashed_layout_of(input), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - quantize_per_channel_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, - {input, vkapi::kRead}, - {{scale, zero_point}, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - {}, - // Resizing Logic - resize_quantize_node)); -} - -void add_quantize_block_wise_node( - ComputeGraph& graph, - const ValueRef& input, - const ValueRef& block_size, - const ValueRef& scale, - const ValueRef& zero_point, - const ValueRef& quant_min, - const ValueRef& quant_max, - const ValueRef& output) { - std::string kernel_name("quantize_block_wise"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(input)); - add_dtype_suffix(kernel_name, graph.dtype_of(output)); - add_dtype_suffix(kernel_name, graph.dtype_of(scale)); - add_dtype_suffix(kernel_name, graph.dtype_of(zero_point)); - - // Handle optional quant_min and quant_max parameters independently - auto bounds = get_dtype_bounds(graph.dtype_of(output)); - - int quant_min_val, quant_max_val; - - // Handle quant_min - if (graph.val_is_none(quant_min)) { - quant_min_val = bounds.first; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_min), - "quant_min must be an integer, got type: ", - graph.get_val_type(quant_min)); - quant_min_val = static_cast(graph.get_int(quant_min)); - } - - // Handle quant_max - if (graph.val_is_none(quant_max)) { - quant_max_val = bounds.second; - } else { - VK_CHECK_COND( - graph.val_is_int(quant_max), - "quant_max must be an integer, got type: ", - graph.get_val_type(quant_max)); - quant_max_val = static_cast(graph.get_int(quant_max)); - } - - const auto input_sizes = graph.sizes_of(input); - const auto block_size_list = graph.get_int_list(block_size); - - // Convert PyTorch dimensions to WHCN order for shader - utils::ivec4 block_size_vec = utils::make_whcn_ivec4(*block_size_list); - utils::ivec4 tensor_size_whcn = 
utils::make_whcn_ivec4(input_sizes); - - // Calculate numBlocks: tensorSize / blockSize (both in WHCN order) - utils::ivec4 num_blocks_vec = { - tensor_size_whcn[0] / block_size_vec[0], - tensor_size_whcn[1] / block_size_vec[1], - tensor_size_whcn[2] / block_size_vec[2], - tensor_size_whcn[3] / block_size_vec[3]}; - - // Calculate blockStride: pre-computed linear strides for the block grid - utils::ivec4 block_stride_vec = { - 1, - num_blocks_vec[0], - num_blocks_vec[0] * num_blocks_vec[1], - num_blocks_vec[0] * num_blocks_vec[1] * num_blocks_vec[2]}; - - vkapi::ParamsBindList param_ubos; - std::vector push_constants; - - if (graph.is_buffer_storage(input)) { - param_ubos = { - graph.numel_ubo(input), - graph.sizes_ubo(input), - graph.strides_ubo(input), - graph.sizes_ubo(output), - graph.strides_ubo(output)}; - } else { - param_ubos = { - graph.logical_limits_ubo(input), graph.logical_limits_ubo(output)}; - } - - push_constants = { - PushConstantDataInfo(&block_size_vec, sizeof(block_size_vec)), - PushConstantDataInfo(&num_blocks_vec, sizeof(num_blocks_vec)), - PushConstantDataInfo(&block_stride_vec, sizeof(block_stride_vec)), - PushConstantDataInfo(&quant_min_val, sizeof(int)), - PushConstantDataInfo(&quant_max_val, sizeof(int)), - }; - - vkapi::SpecVarList spec_vars = { - graph.hashed_layout_of(output), - graph.hashed_layout_of(input), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - quantize_block_wise_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, - {input, vkapi::kRead}, - {{scale, zero_point}, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - {}, - // Resizing Logic - resize_quantize_node)); -} - -void quantize_per_tensor_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - const ValueRef zero_point = args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef dtype = args[arg_idx++]; - const ValueRef output = args[arg_idx++]; - - // Suppress unused variable warning - dtype is inferred from output - (void)dtype; - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale)); - VK_CHECK_COND(graph.val_is_tensor(zero_point)); - VK_CHECK_COND(graph.val_is_tensor(output)); - - // Verify input is a floating point type - VK_CHECK_COND( - graph.dtype_of(input) == vkapi::kDouble || - graph.dtype_of(input) == vkapi::kFloat || - graph.dtype_of(input) == vkapi::kHalf); - - // Get scale and zero point dtypes - vkapi::ScalarType scale_dtype = graph.dtype_of(scale); - vkapi::ScalarType zero_point_dtype = graph.dtype_of(zero_point); - - // Verify supported types for scale (fp32 only for now) - VK_CHECK_COND(scale_dtype == vkapi::kFloat); - - // Verify supported types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_dtype == vkapi::kInt || zero_point_dtype == vkapi::kChar || - zero_point_dtype == vkapi::kFloat); - - add_quantize_per_tensor_node( - graph, input, scale, zero_point, quant_min, quant_max, output); -} - -void quantize_per_token_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - const ValueRef zero_point = 
args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef dtype = args[arg_idx++]; - const ValueRef output = args[arg_idx++]; - - // Suppress unused variable warning - dtype is inferred from output - (void)dtype; - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale)); - VK_CHECK_COND(graph.val_is_tensor(zero_point)); - VK_CHECK_COND(graph.val_is_tensor(output)); - - // Verify input is a floating point type - VK_CHECK_COND( - graph.dtype_of(input) == vkapi::kDouble || - graph.dtype_of(input) == vkapi::kFloat || - graph.dtype_of(input) == vkapi::kHalf); - - // Get scale and zero point dtypes - vkapi::ScalarType scale_dtype = graph.dtype_of(scale); - vkapi::ScalarType zero_point_dtype = graph.dtype_of(zero_point); - - // Verify supported types for scale (fp32 only for now) - VK_CHECK_COND(scale_dtype == vkapi::kFloat); - - // Verify supported types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_dtype == vkapi::kInt || zero_point_dtype == vkapi::kChar || - zero_point_dtype == vkapi::kFloat); - - // Check that scale and zero_point have buffer storage and width packing - VK_CHECK_COND(graph.is_buffer_storage(scale)); - VK_CHECK_COND(graph.packed_dim_of(scale) == WHCN::kWidthDim); - VK_CHECK_COND(graph.is_buffer_storage(zero_point)); - VK_CHECK_COND(graph.packed_dim_of(zero_point) == WHCN::kWidthDim); - - // Check that tensors with texture storage have standard axis map - if (!graph.is_buffer_storage(input)) { - VK_CHECK_COND(graph.has_standard_axis_map(input)); - } - if (!graph.is_buffer_storage(output)) { - VK_CHECK_COND(graph.has_standard_axis_map(output)); - } - - // Calculate number of tokens (product of all dimensions except the last one) - int64_t num_tokens = 1; - const auto input_sizes = graph.sizes_of(input); - for (size_t i = 0; i < input_sizes.size() - 1; i++) { - num_tokens *= input_sizes[i]; - } - - const auto scale_sizes = graph.sizes_of(scale); - const auto zero_point_sizes = graph.sizes_of(zero_point); - - // Calculate total number of elements in scale and zero_point tensors - int64_t scale_numel = 1; - for (size_t i = 0; i < scale_sizes.size(); i++) { - scale_numel *= scale_sizes[i]; - } - - int64_t zero_point_numel = 1; - for (size_t i = 0; i < zero_point_sizes.size(); i++) { - zero_point_numel *= zero_point_sizes[i]; - } - - // Check that the total number of elements matches num_tokens - // This allows for both 1D tensors (size [num_tokens]) and reshaped tensors - // (size [num_tokens, 1]) - VK_CHECK_COND(scale_numel == num_tokens); - VK_CHECK_COND(zero_point_numel == num_tokens); - - add_quantize_per_token_node( - graph, input, scale, zero_point, quant_min, quant_max, output); -} - -void quantize_per_channel_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - const ValueRef zero_point = args[arg_idx++]; - const ValueRef axis = args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef dtype = args[arg_idx++]; - const ValueRef output = args[arg_idx++]; - - // Suppress unused variable warning - dtype is inferred from output - (void)dtype; - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale)); - VK_CHECK_COND(graph.val_is_tensor(zero_point)); - VK_CHECK_COND(graph.val_is_tensor(output)); - - // Verify 
input is a floating point type - VK_CHECK_COND( - graph.dtype_of(input) == vkapi::kDouble || - graph.dtype_of(input) == vkapi::kFloat || - graph.dtype_of(input) == vkapi::kHalf); - - // Get scale and zero point dtypes - vkapi::ScalarType scale_dtype = graph.dtype_of(scale); - vkapi::ScalarType zero_point_dtype = graph.dtype_of(zero_point); - - // Verify supported types for scale (fp32 only for now) - VK_CHECK_COND(scale_dtype == vkapi::kFloat); - - // Verify supported types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_dtype == vkapi::kInt || zero_point_dtype == vkapi::kChar || - zero_point_dtype == vkapi::kFloat); - - // Check that scale and zero_point have buffer storage and width packing - VK_CHECK_COND(graph.is_buffer_storage(scale)); - VK_CHECK_COND(graph.packed_dim_of(scale) == WHCN::kWidthDim); - VK_CHECK_COND(graph.is_buffer_storage(zero_point)); - VK_CHECK_COND(graph.packed_dim_of(zero_point) == WHCN::kWidthDim); - - // Check that tensors with texture storage have standard axis map - if (!graph.is_buffer_storage(input)) { - VK_CHECK_COND(graph.has_standard_axis_map(input)); - } - if (!graph.is_buffer_storage(output)) { - VK_CHECK_COND(graph.has_standard_axis_map(output)); - } - - // Normalize axis - int axis_val = static_cast(graph.get_int(axis)); - const auto input_sizes = graph.sizes_of(input); - int64_t ndim = graph.dim_of(input); - if (axis_val < 0) { - axis_val += ndim; - } - - // Verify axis is valid - VK_CHECK_COND(axis_val >= 0 && axis_val < ndim); - - // Get number of channels along the specified axis - int64_t num_channels = input_sizes[axis_val]; - - const auto scale_sizes = graph.sizes_of(scale); - const auto zero_point_sizes = graph.sizes_of(zero_point); - - // Calculate total number of elements in scale and zero_point tensors - int64_t scale_numel = 1; - for (size_t i = 0; i < scale_sizes.size(); i++) { - scale_numel *= scale_sizes[i]; - } - - int64_t zero_point_numel = 1; - for (size_t i = 0; i < zero_point_sizes.size(); i++) { - zero_point_numel *= zero_point_sizes[i]; - } - - // Check that the total number of elements matches num_channels - VK_CHECK_COND(scale_numel == num_channels); - VK_CHECK_COND(zero_point_numel == num_channels); - - add_quantize_per_channel_node( - graph, input, scale, zero_point, axis, quant_min, quant_max, output); -} - -void quantize_affine_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef input = args[arg_idx++]; - const ValueRef block_size = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - const ValueRef zero_point = args[arg_idx++]; - const ValueRef output_dtype = args[arg_idx++]; - const ValueRef quant_min = args[arg_idx++]; - const ValueRef quant_max = args[arg_idx++]; - const ValueRef output = args[arg_idx++]; - - // Suppress unused variable warnings - (void)output_dtype; - - // Check tensor types - VK_CHECK_COND(graph.val_is_tensor(input)); - VK_CHECK_COND(graph.val_is_tensor(scale)); - VK_CHECK_COND(graph.val_is_tensor(zero_point)); - VK_CHECK_COND(graph.val_is_tensor(output)); - - // Verify input is a floating point type - VK_CHECK_COND( - graph.dtype_of(input) == vkapi::kDouble || - graph.dtype_of(input) == vkapi::kFloat || - graph.dtype_of(input) == vkapi::kHalf); - - // Get scale and zero point dtypes - vkapi::ScalarType scale_dtype = graph.dtype_of(scale); - vkapi::ScalarType zero_point_dtype = graph.dtype_of(zero_point); - - // Verify supported types for scale (fp32 only for now) - VK_CHECK_COND(scale_dtype == vkapi::kFloat); - - // Verify supported 
types for zero point (int32, int8, fp32) - VK_CHECK_COND( - zero_point_dtype == vkapi::kInt || zero_point_dtype == vkapi::kChar || - zero_point_dtype == vkapi::kFloat); - - // Check that scale and zero_point have buffer storage and width packing - VK_CHECK_COND(graph.is_buffer_storage(scale)); - VK_CHECK_COND(graph.packed_dim_of(scale) == WHCN::kWidthDim); - VK_CHECK_COND(graph.is_buffer_storage(zero_point)); - VK_CHECK_COND(graph.packed_dim_of(zero_point) == WHCN::kWidthDim); - - // Check that tensors with texture storage have standard axis map - if (!graph.is_buffer_storage(input)) { - VK_CHECK_COND(graph.has_standard_axis_map(input)); - } - if (!graph.is_buffer_storage(output)) { - VK_CHECK_COND(graph.has_standard_axis_map(output)); - } - - // Verify block_size is valid (each dimension must divide evenly into input - // size) - const auto input_sizes = graph.sizes_of(input); - const auto block_size_list = graph.get_int_list(block_size); - VK_CHECK_COND(block_size_list->size() == input_sizes.size()); - - for (size_t i = 0; i < input_sizes.size(); i++) { - if ((*block_size_list)[i] > 1) { - VK_CHECK_COND( - input_sizes[i] % (*block_size_list)[i] == 0, - "Input size at dimension ", - i, - " (", - input_sizes[i], - ") must be divisible by block_size at dimension ", - i, - " (", - (*block_size_list)[i], - ")"); - } - } - - add_quantize_block_wise_node( - graph, - input, - block_size, - scale, - zero_point, - quant_min, - quant_max, - output); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP( - quantized_decomposed.quantize_per_tensor.tensor, - quantize_per_tensor_impl); - VK_REGISTER_OP( - quantized_decomposed.quantize_per_token.default, quantize_per_token_impl); - VK_REGISTER_OP( - quantized_decomposed.quantize_per_channel.default, - quantize_per_channel_impl); - - // TorchAO affine quantization operators - VK_REGISTER_OP(torchao.quantize_affine.default, quantize_affine_impl); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp deleted file mode 100644 index 51f8138485e..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedConvolution.cpp +++ /dev/null @@ -1,695 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include -#include -#include - -namespace vkcompute { - -// -// Utility functions -// - -struct Conv2DParams { - utils::ivec2 kernel_size; - utils::ivec2 stride; - utils::ivec2 padding; - utils::ivec2 dilation; - int32_t groups; - int32_t out_channels_per_group; - int32_t in_channels_per_group; - int32_t logical_K_per_group; - int32_t K_per_group; - int32_t K4_per_group; - int32_t logical_K; - int32_t K; - int32_t K4; -}; - -Conv2DParams create_conv2d_params( - ComputeGraph& graph, - const ValueRef& conv_input, - const ValueRef& conv_output, - const ValueRef& kernel_size, - const ValueRef& stride, - const ValueRef& padding, - const ValueRef& dilation, - const ValueRef& groups) { - const auto kernel_size_list = graph.get_int_list(kernel_size); - const auto stride_list = graph.get_int_list(stride); - const auto padding_list = graph.get_int_list(padding); - const auto dilation_list = graph.get_int_list(dilation); - const int32_t groups_val = graph.get_int(groups); - - // Pre-compute input and output channels per group - - std::vector out_sizes = graph.sizes_of(conv_output); - const int32_t out_channels = utils::val_at(-3, out_sizes); - const int32_t out_channels_per_group = out_channels / groups_val; - - std::vector in_sizes = graph.sizes_of(conv_input); - const int32_t in_channels = utils::val_at(-3, in_sizes); - const int32_t in_channels_per_group = in_channels / groups_val; - - // Pre-compute the number of elements along the K dimension per group. This - // quantity is aligned to the next multiple of 4 to ensure data loads are - // aligned to texel boundaries. - - const int32_t logical_K_per_group = - kernel_size_list->at(0) * kernel_size_list->at(1) * in_channels_per_group; - const int32_t K_per_group = utils::align_up_4(logical_K_per_group); - const int32_t K4_per_group = K_per_group / 4; - - // Pre-compute the "theoretical" size of the K dim of the input im2col matrix, - // which represents the flattened convolution window used to compute an output - // element. This is used for bounds checking. 
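// Illustrative example (not in the original source): with a 3x3 kernel,
// groups = 2 and in_channels = 10, in_channels_per_group = 5, so
// logical_K_per_group = 3 * 3 * 5 = 45, K_per_group = align_up_4(45) = 48 and
// K4_per_group = 12. The padded K dim is then K = 48 * 2 = 96 (K4 = 24), while
// logical_K = 3 * 3 * 10 = 90 is retained for bounds checking.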
- - const int32_t logical_K = - kernel_size_list->at(0) * kernel_size_list->at(1) * in_channels; - - const int32_t K = K_per_group * groups_val; - // Used for texel stride calculations - const int32_t K4 = K / 4; - - return Conv2DParams{ - // Swap the order from HW to WH - utils::make_ivec2({kernel_size_list->at(1), kernel_size_list->at(0)}), - utils::make_ivec2({stride_list->at(1), stride_list->at(0)}), - utils::make_ivec2({padding_list->at(1), padding_list->at(0)}), - utils::make_ivec2({dilation_list->at(1), dilation_list->at(0)}), - groups_val, - out_channels_per_group, - in_channels_per_group, - logical_K_per_group, - K_per_group, - K4_per_group, - logical_K, - K, - K4, - }; -} - -std::vector calculate_input_im2col_sizes( - ComputeGraph* graph, - const ValueRef& input, - const ValueRef& output, - const ValueRef& kernel_size, - const ValueRef& groups) { - std::vector in_sizes = graph->sizes_of(input); - const int64_t in_channels = utils::val_at(-3, in_sizes); - - std::vector out_sizes = graph->sizes_of(output); - const int64_t batches = utils::val_at(-4, out_sizes); - const int64_t out_height = utils::val_at(-2, out_sizes); - const int64_t out_width = utils::val_at(-1, out_sizes); - - // Represents the number of channel groups - const int64_t groups_val = graph->extract_scalar(groups); - // No need to div_up because in_channels % groups_val = 0 - const int64_t in_channels_per_group = in_channels / groups_val; - - const auto kernel_size_list = graph->get_int_list(kernel_size); - - // Align to the next multiple of 4 to ensure that data loads align nicely with - // texel boundaries. We want to ensure that the first data element of each - // group is at the start of its texel. - const int64_t flattened_kernel_len = utils::align_up_4( - in_channels_per_group * kernel_size_list->at(0) * - kernel_size_list->at(1)); - - // K -> flattened convolution window (adjusted) - const int64_t K = flattened_kernel_len * groups_val; - // M -> number of elements in 2D output plane. This is aligned to the next - // multiple of 4 since the im2col shader operates on 4x4 blocks. 
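// Illustrative example (not in the original source): for an output of shape
// [1, 16, 5, 7] (NCHW), out_height * out_width * batches = 5 * 7 * 1 = 35, so
// M = align_up_4(35) = 36; the extra rows are padding that lets the im2col
// shader always write complete 4x4 blocks.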
- const int64_t M = utils::align_up_4(out_height * out_width * batches); - - return {M, K}; -} - -std::vector calculate_output_im2col_sizes( - ComputeGraph* graph, - const ValueRef& output) { - std::vector out_sizes = graph->sizes_of(output); - const int64_t batches = utils::val_at(-4, out_sizes); - const int64_t out_channels = utils::val_at(-3, out_sizes); - const int64_t out_height = utils::val_at(-2, out_sizes); - const int64_t out_width = utils::val_at(-1, out_sizes); - - // N -> output channels - const int64_t N = out_channels; - // M -> number of elements in 2D output plane - const int64_t M = out_height * out_width * batches; - - return {M, N}; -} - -// -// Shader dispatch utilities -// - -utils::uvec3 im2col_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef input_image = args.at(1).refs.at(0); - const ValueRef output_image = resize_args.at(0); - const ValueRef kernel_size = resize_args.at(1); - const ValueRef groups = resize_args.at(2); - - std::vector im2col_sizes = calculate_input_im2col_sizes( - graph, input_image, output_image, kernel_size, groups); - const uint32_t K = utils::safe_downcast(im2col_sizes[1]); - const uint32_t M = utils::safe_downcast(im2col_sizes[0]); - - // 1 output tile is 4x4 elements - const uint32_t K4 = utils::div_up(K, 4u); - const uint32_t M4 = utils::div_up(M, 4u); - - return {K4, M4, 1}; -} - -utils::uvec3 col2im_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef output = args.at(0).refs.at(0); - - std::vector im2col_sizes = - calculate_output_im2col_sizes(graph, output); - const uint32_t N = utils::safe_downcast(im2col_sizes[1]); - const uint32_t M = utils::safe_downcast(im2col_sizes[0]); - - // 1 output tile is 4x4 elements - const uint32_t N4 = utils::div_up(N, 4u); - const uint32_t M4 = utils::div_up(M, 4u); - - return {N4, M4, 1}; -} - -// -// Dispatch nodes -// - -void add_input_im2col_node( - ComputeGraph& graph, - const ValueRef input_image, - const ValueRef kernel_size, - const ValueRef stride, - const ValueRef padding, - const ValueRef dilation, - const ValueRef groups, - const ValueRef output_image, - const ValueRef input_im2col) { - Conv2DParams conv_params = create_conv2d_params( - graph, - input_image, - output_image, - kernel_size, - stride, - padding, - dilation, - groups); - - std::string kernel_name = "im2col"; - add_storage_type_suffix(kernel_name, graph.storage_type_of(input_im2col)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input_image)); - add_dtype_suffix(kernel_name, graph.dtype_of(output_image)); - - vkapi::ParamsBindList param_buffers = { - graph.sizes_ubo(input_im2col), - graph.sizes_ubo(input_image), - graph.sizes_ubo(output_image), - graph.create_params_buffer(conv_params)}; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - im2col_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{input_im2col, vkapi::kWrite}, {input_image, vkapi::kRead}}, - // Shader params buffers - param_buffers, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize args - {output_image, kernel_size, groups}, - // Resizing Logic - nullptr)); -} - -void add_quantize_and_pack_im2col_node( - ComputeGraph& graph, - const ValueRef input_image, - const ValueRef input_scale, - const ValueRef input_zp, - const ValueRef kernel_size, - 
const ValueRef stride, - const ValueRef padding, - const ValueRef dilation, - const ValueRef groups, - const ValueRef output_image, - const ValueRef input_int_im2col) { - Conv2DParams conv_params = create_conv2d_params( - graph, - input_image, - output_image, - kernel_size, - stride, - padding, - dilation, - groups); - - float inv_scale = 1.0f / graph.extract_scalar(input_scale); - int32_t zp = graph.extract_scalar(input_zp); - - // Get shader for quantized conv2d linear tiled - std::string kernel_name = "quantize_and_pack_im2col"; - add_storage_type_suffix(kernel_name, graph.storage_type_of(input_int_im2col)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input_image)); - add_dtype_suffix(kernel_name, graph.dtype_of(output_image)); - - vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); - - vkapi::ParamsBindList param_buffers = { - graph.sizes_ubo(input_int_im2col), - graph.sizes_ubo(input_image), - graph.sizes_ubo(output_image), - graph.create_params_buffer(conv_params)}; - - std::vector push_constants = { - PushConstantDataInfo(&inv_scale, sizeof(inv_scale)), - PushConstantDataInfo(&zp, sizeof(zp)), - }; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - im2col_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{input_int_im2col, vkapi::kWrite}, {input_image, vkapi::kRead}}, - // Shader params buffers - param_buffers, - // Push Constants - push_constants, - // Specialization Constants - {}, - // Resize args - {output_image, kernel_size, groups}, - // Resizing Logic - nullptr)); -} - -void add_conv2d_q8csw_linear_node( - ComputeGraph& graph, - const ValueRef input_im2col, - const ValueRef input_image, - const ValueRef packed_weight, - const ValueRef packed_weight_scales, - const ValueRef bias_data, - const ValueRef packed_bias, - const ValueRef kernel_size, - const ValueRef stride, - const ValueRef padding, - const ValueRef dilation, - const ValueRef groups, - const ValueRef output_image) { - Conv2DParams conv_params = create_conv2d_params( - graph, - input_image, - output_image, - kernel_size, - stride, - padding, - dilation, - groups); - - // One limitation of the current implementation is that for grouped convs, - // the number of output_image channels per group must be a multiple of 4. One - // loaded 4x4 weight tile must all belong to the same group. 
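// Illustrative example (not in the original source): groups = 4 with 32 output
// channels gives 8 channels per group and passes the check below, whereas
// groups = 4 with 24 output channels gives 6 per group and would be rejected.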
- if (conv_params.groups > 1) { - VK_CHECK_COND(conv_params.out_channels_per_group % 4 == 0); - } - - std::string kernel_name = "conv2d_q8csw_linear_tiled"; - add_storage_type_suffix(kernel_name, graph.storage_type_of(output_image)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input_im2col)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); - add_dtype_suffix(kernel_name, graph.dtype_of(output_image)); - vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); - - vkapi::ParamsBindList param_buffers = { - graph.sizes_ubo(output_image), - graph.sizes_ubo(input_image), - graph.create_params_buffer(conv_params)}; - - uint32_t apply_bias = 1; - if (graph.val_is_none(bias_data)) { - apply_bias = 0; - } - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - col2im_global_wg_size, - quantized_linear_local_wg_size, - // Inputs and Outputs - {{output_image, vkapi::kWrite}, - {{input_im2col, packed_weight, packed_weight_scales, packed_bias}, - vkapi::kRead}}, - // Shader params buffers - param_buffers, - // Push Constants - {}, - // Specialization Constants - {apply_bias}, - // Resize args - {}, - // Resizing Logic - nullptr)); -} - -void add_conv2d_q8ta_q8csw_linear_node( - ComputeGraph& graph, - const ValueRef input_int_im2col, - const ValueRef input_image, - const ValueRef input_scale, - const ValueRef input_zp, - const ValueRef weight_data, - const ValueRef packed_weight, - const ValueRef packed_weight_sums, - const ValueRef packed_weight_scales, - const ValueRef bias_data, - const ValueRef packed_bias, - const ValueRef kernel_size, - const ValueRef stride, - const ValueRef padding, - const ValueRef dilation, - const ValueRef groups, - const ValueRef output_image) { - Conv2DParams conv_params = create_conv2d_params( - graph, - input_image, - output_image, - kernel_size, - stride, - padding, - dilation, - groups); - - // One limitation of the current implementation is that for grouped convs, - // the number of output channels per group must be a multiple of 4. One loaded - // 4x4 weight tile must all belong to the same group. 
- if (conv_params.groups > 1) { - VK_CHECK_COND(conv_params.out_channels_per_group % 4 == 0); - } - - float scale = graph.extract_scalar(input_scale); - int32_t zp = graph.extract_scalar(input_zp); - - std::string kernel_name = "conv2d_q8ta_q8csw_linear_tiled"; - add_storage_type_suffix(kernel_name, graph.storage_type_of(output_image)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(input_int_im2col)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); - add_dtype_suffix(kernel_name, graph.dtype_of(output_image)); - vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); - - vkapi::ParamsBindList param_buffers = { - graph.sizes_ubo(output_image), - graph.sizes_ubo(input_image), - graph.create_params_buffer(conv_params)}; - - std::vector push_constants = { - PushConstantDataInfo(&scale, sizeof(scale)), - PushConstantDataInfo(&zp, sizeof(zp)), - }; - - uint32_t apply_bias = 1; - if (graph.val_is_none(bias_data)) { - apply_bias = 0; - } - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - col2im_global_wg_size, - quantized_linear_local_wg_size, - // Inputs and Outputs - {{output_image, vkapi::kWrite}, - {{input_int_im2col, - packed_weight, - packed_weight_sums, - packed_weight_scales, - packed_bias}, - vkapi::kRead}}, - // Shader params buffers - param_buffers, - // Push Constants - push_constants, - // Specialization Constants - {apply_bias}, - // Resize args - {weight_data}, - // Resizing Logic - nullptr)); -} - -// -// High level operator impl -// - -void quantized_conv2d_impl( - ComputeGraph& graph, - const QuantizationConfig& input_quant_config, - const QuantizationConfig& weight_quant_config, - const ValueRef input_image, - const ValueRef input_scale, - const ValueRef input_zp, - const ValueRef weight_data, - const ValueRef weight_sums_data, - const ValueRef weight_scales_data, - const ValueRef bias_data, - const ValueRef kernel_size, - const ValueRef stride, - const ValueRef padding, - const ValueRef dilation, - const ValueRef groups, - const ValueRef output_image) { - VK_CHECK_COND(weight_quant_config.granularity == kPerChannel); - VK_CHECK_COND(weight_quant_config.nbits == 8); - VK_CHECK_COND(weight_quant_config.is_symmetric); - - const ValueRef packed_weight = - prepack_quantized_linear_weight(graph, weight_quant_config, weight_data); - ValueRef packed_weight_scales = prepack_standard( - graph, weight_scales_data, utils::kBuffer, utils::kWidthPacked); - - // Create a dummy tensor to fill the binding slot of the bias tensor if it is - // not provided. This helps simplify dispatch logic and makes it so that - // fewer shader variants need to be generated. - TmpTensor dummy_bias( - &graph, - {}, - graph.dtype_of(output_image), - utils::kBuffer, - utils::kWidthPacked); - - ValueRef packed_bias = dummy_bias.vref; - if (!graph.val_is_none(bias_data)) { - packed_bias = - prepack_standard(graph, bias_data, utils::kBuffer, utils::kWidthPacked); - } - - std::vector input_im2col_sizes = calculate_input_im2col_sizes( - &graph, input_image, output_image, kernel_size, groups); - - // Use weight only quantized conv2d if at least one is true: - // 1. Device does not support int8 dot product - // 2. 
Input is not quantized - if (!graph.can_use_int8_dot_product() || - input_quant_config.granularity == kNoQuantization) { - TmpTensor input_im2col( - &graph, - input_im2col_sizes, - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked); - - add_input_im2col_node( - graph, - input_image, - kernel_size, - stride, - padding, - dilation, - groups, - output_image, - input_im2col); - - add_conv2d_q8csw_linear_node( - graph, - input_im2col, - input_image, - packed_weight, - packed_weight_scales, - bias_data, - packed_bias, - kernel_size, - stride, - padding, - dilation, - groups, - output_image); - return; - } else { - // Otherwise, use activation + weight quantized conv2d - VK_CHECK_COND(input_quant_config.granularity == kPerTensor); - VK_CHECK_COND(weight_quant_config.nbits == 8); - VK_CHECK_COND(!weight_quant_config.is_dynamic); - - ValueRef packed_weight_sums = prepack_standard( - graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked); - - // Allocate quantized + packed im2col matrix for input - const int64_t num_blocks_M = utils::div_up_4(input_im2col_sizes.at(0)); - const int64_t num_blocks_K = utils::div_up_4(input_im2col_sizes.at(1)); - - TmpTensor input_int_im2col( - &graph, - {num_blocks_M, num_blocks_K * 4}, - vkapi::kInt, - utils::kBuffer, - utils::kWidthPacked); - - add_quantize_and_pack_im2col_node( - graph, - input_image, - input_scale, - input_zp, - kernel_size, - stride, - padding, - dilation, - groups, - output_image, - input_int_im2col); - - add_conv2d_q8ta_q8csw_linear_node( - graph, - input_int_im2col, - input_image, - input_scale, - input_zp, - weight_data, - packed_weight, - packed_weight_sums, - packed_weight_scales, - bias_data, - packed_bias, - kernel_size, - stride, - padding, - dilation, - groups, - output_image); - return; - }; -} - -void conv2d_q8ta_q8csw(ComputeGraph& graph, const std::vector& args) { - int32_t idx = 0; - const ValueRef input_image = args.at(idx++); - const ValueRef input_scale = args.at(idx++); - const ValueRef input_zp = args.at(idx++); - const ValueRef weight_data = args.at(idx++); - const ValueRef weight_sums_data = args.at(idx++); - const ValueRef weight_scales_data = args.at(idx++); - const ValueRef bias_data = args.at(idx++); - const ValueRef kernel_size = args.at(idx++); - const ValueRef stride = args.at(idx++); - const ValueRef padding = args.at(idx++); - const ValueRef dilation = args.at(idx++); - const ValueRef groups = args.at(idx++); - const ValueRef output_image = args.at(idx++); - - const int64_t K = graph.size_at(-1, weight_data); - - QuantizationConfig input_quant_config(8, kPerTensor, {}, false); - QuantizationConfig weight_quant_config(8, kPerChannel, {K}); - - quantized_conv2d_impl( - graph, - input_quant_config, - weight_quant_config, - input_image, - input_scale, - input_zp, - weight_data, - weight_sums_data, - weight_scales_data, - bias_data, - kernel_size, - stride, - padding, - dilation, - groups, - output_image); -} - -void conv2d_q8csw(ComputeGraph& graph, const std::vector& args) { - int32_t idx = 0; - const ValueRef input_image = args.at(idx++); - const ValueRef weight_data = args.at(idx++); - const ValueRef weight_scales_data = args.at(idx++); - const ValueRef bias_data = args.at(idx++); - const ValueRef kernel_size = args.at(idx++); - const ValueRef stride = args.at(idx++); - const ValueRef padding = args.at(idx++); - const ValueRef dilation = args.at(idx++); - const ValueRef groups = args.at(idx++); - const ValueRef output_image = args.at(idx++); - - const int64_t K = graph.size_at(-1, weight_data); - - 
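// A nominal 32-bit / kNoQuantization input config signals that the activation
// stays in floating point; quantized_conv2d_impl checks this granularity (and
// int8 dot product support) to select the weight-only quantized path.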
QuantizationConfig input_quant_config(32, kNoQuantization, {}); - QuantizationConfig weight_quant_config(8, kPerChannel, {K}); - - quantized_conv2d_impl( - graph, - input_quant_config, - weight_quant_config, - input_image, - kDummyValueRef, // input scale - kDummyValueRef, // input zero point - weight_data, - kDummyValueRef, // weight sums - weight_scales_data, - bias_data, - kernel_size, - stride, - padding, - dilation, - groups, - output_image); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(et_vk.conv2d_q8ta_q8csw.default, conv2d_q8ta_q8csw); - VK_REGISTER_OP(et_vk.conv2d_q8csw.default, conv2d_q8csw); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp deleted file mode 100644 index 4831c6f2f85..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp +++ /dev/null @@ -1,728 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include -#include -#include - -namespace vkcompute { - -// -// Shader dispatch utilities -// - -bool is_gemv(ComputeGraph* graph, const ValueRef& fp_input) { - return graph->size_at(-2, fp_input) == 1; -} - -void resize_linear_qw_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - - ValueRef output = args.at(0).refs.at(0); - ValueRef fp_input = args.at(1).refs.at(0); - ValueRef weight_data = extra_args.at(1); - - std::vector mat1_sizes = graph->sizes_of(fp_input); - std::vector mat2_sizes = graph->sizes_of(weight_data); - - const int64_t out_cols = utils::val_at(-2, mat1_sizes); - const int64_t out_rows = utils::val_at(-2, mat2_sizes); - - std::vector new_out_sizes(3); - if (mat1_sizes.size() == 2) { - new_out_sizes.resize(2); - new_out_sizes.at(0) = out_cols; - new_out_sizes.at(1) = out_rows; - } else { - new_out_sizes.at(0) = mat1_sizes.at(0); - new_out_sizes.at(1) = out_cols; - new_out_sizes.at(2) = out_rows; - } - - graph->virtual_resize(output, new_out_sizes); -} - -utils::uvec3 quantized_linear_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef out = args.at(0).refs.at(0); - - std::vector out_sizes = graph->sizes_of(out); - // height - const uint32_t M = utils::val_at(-2, out_sizes); - // width - const uint32_t N = utils::val_at(-1, out_sizes); - - const uint32_t M4 = utils::div_up(M, 4u); - const uint32_t N4 = utils::div_up(N, 4u); - - // For 4-bit weights, each output tile contains 8 columns and 4 rows - if (shader.kernel_name.find("q4") != std::string::npos) { - const uint32_t N8 = utils::div_up(N, 8u); - - const bool using_coop_algorithm = - shader.kernel_name.find("_coop") != std::string::npos; - // TODO: explain - if (using_coop_algorithm) { - return {64, N8, M}; - } - return {N8, M4, 1}; - } - - // Otherwise, each output tile contains 4 columns and 4 rows - return {N4, M4, 1}; -} - -utils::uvec3 quantized_linear_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - const bool use_coop_algorithm = - shader.kernel_name.find("_coop") != std::string::npos; - - if (use_coop_algorithm) { - return {64, 1, 1}; - } else { - return 
pick_hw_square_wg_size( - graph, shader, global_workgroup_size, args, resize_args); - } -} - -std::tuple get_quantized_input_num_blocks( - ComputeGraph& graph, - const ValueRef input) { - std::vector input_sizes = graph.sizes_of(input); - const int64_t ndim = graph.dim_of(input); - - const int64_t M = input_sizes.at(ndim - 2); - const int64_t K = input_sizes.at(ndim - 1); - - const int64_t num_blocks_M = utils::div_up(M, int64_t(4)); - const int64_t num_blocks_K = utils::div_up(K, int64_t(4)); - - return std::make_tuple(num_blocks_M, num_blocks_K); -} - -utils::uvec3 quant_pack_input_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef input = args.at(1).refs.at(0); - int64_t num_blocks_M, num_blocks_K; - std::tie(num_blocks_M, num_blocks_K) = - get_quantized_input_num_blocks(*graph, input); - - return { - utils::safe_downcast(num_blocks_K), - utils::safe_downcast(num_blocks_M), - 1u}; -} - -vkapi::ShaderInfo pick_linear_qw_shader( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - - const ValueRef output = args.at(0).refs.at(0); - const ValueRef fp_input = args.at(1).refs.at(0); - const ValueRef packed_int_weight = args.at(1).refs.at(1); - - const bool weight_is_4bit = resize_args.at(0) != kDummyValueRef; - const bool is_gemv_case = is_gemv(graph, fp_input); - - std::string kernel_name = "linear_"; - if (weight_is_4bit) { - kernel_name += "q4gsw"; - } else { - kernel_name += "q8csw"; - } - - if (weight_is_4bit && is_gemv_case) { - kernel_name += "_coop"; - } else { - kernel_name += "_tiled"; - } - add_storage_type_suffix(kernel_name, graph->storage_type_of(output)); - add_storage_type_suffix( - kernel_name, graph->storage_type_of(packed_int_weight)); - add_dtype_suffix(kernel_name, graph->dtype_of(output)); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -// -// Prepacking nodes -// - -ValueRef prepack_quantized_linear_weight( - ComputeGraph& graph, - const QuantizationConfig& weight_quant_config, - const ValueRef qmat2_data) { - VK_CHECK_COND( - weight_quant_config.nbits == 8 || weight_quant_config.nbits == 4); - - std::vector qmat2_orig_sizes = graph.sizes_of(qmat2_data); - const int64_t ndim = graph.dim_of(qmat2_data); - - int64_t qmat2_width = qmat2_orig_sizes.at(ndim - 1); - int64_t qmat2_height = qmat2_orig_sizes.at(ndim - 2); - - int64_t K; - int64_t N; - if (weight_quant_config.nbits == 4) { - // For 4-bit quantization, weight source data has shape [N, K/2]. Each byte - // contains 2 * 4-bit values. - K = qmat2_width * 2; - N = qmat2_height; - } else { - // For 8-bit quantization, the weight source data has shape [N, K] - K = qmat2_width; - N = qmat2_height; - } - - // Sanity check that assumptions are correct. Data loads along the innermost - // dimension must be well aligned along texel boundaries. - if (weight_quant_config.nbits == 4) { - VK_CHECK_COND(K % 8 == 0); - } else { - VK_CHECK_COND(K % 4 == 0); - } - - // The packing format packs the weight tensor into blocks of 4 columns (K) and - // 4 rows (N) - int64_t N_per_block = 4; - int64_t K_per_block = 4; - - // For 4 bit, quantization, the amount of information contained in one block - // can be doubled. Each block will contain data for 8 rows (N) instead of the - // usual 4. - if (weight_quant_config.nbits == 4) { - N_per_block = 8; - } - - // To figure out the size of the output tensor, determine the number of blocks - // along each dimension. 
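// Illustrative example (not in the original source): for 8-bit weights with
// K = 64 and N = 10, num_blocks_K = 16 and num_blocks_N = div_up(10, 4) = 3,
// so the packed tensor computed below is 16 rows by 3 * 4 = 12 int32 columns.
// With 4-bit weights each block instead covers 8 rows (N), roughly halving
// num_blocks_N for the same N.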
- const int64_t num_blocks_K = utils::div_up(K, K_per_block); - const int64_t num_blocks_N = utils::div_up(N, N_per_block); - - // The blocks are arranged in a transposed manner, such that the transposed - // weight block is indexed like packed_weights[k4][n4] - this is to allow for - // optimal memory coalescing when computing GEMM. - int64_t output_height = num_blocks_K; - // The base dtype of the packed tensor is int32 (each int32 contains 4x 8bit - // values) and each block is represented as a ivec4. Therefore the width dim - // of the packed tensor is multiplied by 4. - int64_t output_width = num_blocks_N * 4; - - // For 4 bit quantization, The blocks are arranged without the transposition, - // such that a weight block is accessed like packed_weights[n8][k4]. This is - // an optimization targeted for LLMs, which need to compute GEMV as well as - // GEMM. This memory layout provides better performance for the co-operative - // algorithm used to compute GEMV, at the cost of slightly reducing GEMM - // performance. - if (weight_quant_config.nbits == 4) { - output_height = num_blocks_N; - output_width = num_blocks_K * 4; - } - - // Store the original sizes of the weight data to pass to the shader - utils::ivec2 orig_sizes = { - utils::safe_downcast(K), utils::safe_downcast(N)}; - - std::vector qmat2_sizes{output_height, output_width}; - - utils::StorageType storage_type = utils::kTexture2D; - uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); - if (output_width > max_extent * 4 || output_height > max_extent) { - storage_type = utils::kBuffer; - } - - ValueRef qmat2 = graph.add_tensor( - qmat2_sizes, vkcompute::vkapi::kInt, storage_type, utils::kWidthPacked); - - utils::uvec3 global_wg_size; - if (weight_quant_config.nbits == 4) { - // For 4-bit quantization, each thread writes out two adjacent blocks - global_wg_size = { - utils::safe_downcast(utils::div_up(num_blocks_K, int64_t(2))), - utils::safe_downcast(num_blocks_N), - 1u}; - } else { - global_wg_size = { - utils::safe_downcast(num_blocks_N), - utils::safe_downcast(num_blocks_K), - 1u}; - } - - std::string kernel_name = weight_quant_config.nbits == 4 - ? "pack_q4_linear_weight" - : "pack_q8_linear_weight"; - add_storage_type_suffix(kernel_name, storage_type); - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - graph.create_local_wg_size(global_wg_size), - // Inputs and Outputs - qmat2_data, - qmat2, - // UBOs - {}, - // Specialization Constants - {}, - // Push Constants - {graph.sizes_pc_of(qmat2), - PushConstantDataInfo(&orig_sizes, sizeof(utils::ivec2))})); - - return qmat2; -} - -// -// Dispatch nodes -// - -/* - * Shader dispatch for linear with quantized weight but fp activations. 
- */ -DynamicDispatchNode make_linear_qw_node( - ComputeGraph& graph, - const QuantizationConfig& weight_quant_config, - const ValueRef fp_input, - const ValueRef weight_data, - const ValueRef packed_weight, - const ValueRef packed_weight_scales, - const ValueRef packed_weight_zeros, - const ValueRef group_size, - const ValueRef bias_data, - const ValueRef packed_bias, - const ValueRef output) { - // Only certain quantization types supported at the moment - VK_CHECK_COND( - weight_quant_config.granularity == kPerChannel || - weight_quant_config.granularity == kPerGroup); - VK_CHECK_COND(weight_quant_config.is_symmetric); - VK_CHECK_COND( - weight_quant_config.nbits == 8 || weight_quant_config.nbits == 4); - - vkapi::ParamsBindList param_buffers = { - graph.sizes_ubo(output), graph.sizes_ubo(fp_input)}; - - uint32_t apply_bias = 1; - if (graph.val_is_none(bias_data)) { - apply_bias = 0; - } - - int32_t K4_per_group = 0; - if (weight_quant_config.nbits == 4) { - int32_t group_size_val = graph.extract_scalar(group_size); - K4_per_group = utils::div_up(group_size_val, int32_t(4)); - } - - const ValueRef is_4bit_flag = - weight_quant_config.nbits == 4 ? group_size : kDummyValueRef; - - return DynamicDispatchNode( - graph, - pick_linear_qw_shader, - quantized_linear_global_wg_size, - quantized_linear_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, - {{fp_input, packed_weight, packed_weight_scales, packed_bias}, - vkapi::kRead}}, - // Shader params buffers - param_buffers, - // Push Constants - {}, - // Specialization Constants - {apply_bias, K4_per_group}, - // Resize args - {is_4bit_flag, weight_data}, - // Resizing Logic - resize_linear_qw_node); -} - -DynamicDispatchNode make_quantize_and_pack_linear_input_node( - ComputeGraph& graph, - const QuantizationConfig& input_quant_config, - const ValueRef fp_input, - const ValueRef packed_input_scale, - const ValueRef packed_input_zp, - const ValueRef input_scale_data, - const ValueRef input_zp_data, - const ValueRef packed_int_input, - const ValueRef group_size) { - // Only certain quantization types supported at the moment - VK_CHECK_COND(input_quant_config.granularity == kPerTensor); - - int64_t num_blocks_M, num_blocks_K; - std::tie(num_blocks_M, num_blocks_K) = - get_quantized_input_num_blocks(graph, fp_input); - - float inv_scale = 1.0f / graph.extract_scalar(input_scale_data); - int32_t zp = graph.extract_scalar(input_zp_data); - - std::string shader_name = "quantize_and_pack_linear_input_per_tensor"; - add_storage_type_suffix(shader_name, graph.storage_type_of(packed_int_input)); - add_storage_type_suffix(shader_name, graph.storage_type_of(fp_input)); - add_dtype_suffix(shader_name, graph.dtype_of(fp_input)); - - vkapi::ParamsBindList param_buffers = {graph.sizes_ubo(fp_input)}; - - std::vector push_constants = { - PushConstantDataInfo(&inv_scale, sizeof(inv_scale)), - PushConstantDataInfo(&zp, sizeof(zp)), - }; - - return DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(shader_name), - quant_pack_input_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{packed_int_input, vkapi::kWrite}, {fp_input, vkapi::kRead}}, - // Shader params buffers - param_buffers, - // Push Constants - push_constants, - // Specialization Constants - {}, - // Resize args - {}); -} - -DynamicDispatchNode make_linear_qa_qw_node( - ComputeGraph& graph, - const QuantizationConfig& input_quant_config, - const QuantizationConfig& weight_quant_config, - const ValueRef fp_input, - const ValueRef packed_int_input, - const ValueRef 
packed_input_scale, - const ValueRef packed_input_zp, - const ValueRef input_scale_data, - const ValueRef input_zp_data, - const ValueRef weight_data, - const ValueRef packed_weight, - const ValueRef packed_weight_sums, - const ValueRef packed_weight_scales, - const ValueRef group_size, - const ValueRef bias_data, - const ValueRef packed_bias, - const ValueRef output) { - VK_CHECK_COND(input_quant_config.granularity == kPerTensor); - VK_CHECK_COND(input_quant_config.nbits == 8); - VK_CHECK_COND(weight_quant_config.granularity == kPerChannel); - VK_CHECK_COND(weight_quant_config.is_symmetric); - VK_CHECK_COND(weight_quant_config.nbits == 8); - - float scale = graph.extract_scalar(input_scale_data); - int32_t zp = graph.extract_scalar(input_zp_data); - - // Get shader for quantized linear - std::string kernel_name = "linear_q8ta_q8csw_tiled"; - add_storage_type_suffix(kernel_name, graph.storage_type_of(output)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_int_input)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); - add_dtype_suffix(kernel_name, graph.dtype_of(output)); - vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); - - vkapi::ParamsBindList param_buffers = { - graph.sizes_ubo(output), graph.sizes_ubo(packed_int_input)}; - - std::vector push_constants = { - PushConstantDataInfo(&scale, sizeof(scale)), - PushConstantDataInfo(&zp, sizeof(zp)), - }; - - uint32_t apply_bias = 1; - if (graph.val_is_none(bias_data)) { - apply_bias = 0; - } - - // Add the compute node - return DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - quantized_linear_global_wg_size, - quantized_linear_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, - {{packed_int_input, - packed_weight, - packed_weight_sums, - packed_weight_scales, - packed_bias}, - vkapi::kRead}}, - // Shader params buffers - param_buffers, - // Push Constants - push_constants, - // Specialization Constants - {apply_bias}, - // Resize args - {fp_input}, - // Resizing Logic - nullptr); -} - -// -// High level operator impl -// - -void quantized_linear_impl( - ComputeGraph& graph, - const QuantizationConfig& input_quant_config, - const QuantizationConfig& weight_quant_config, - const ValueRef fp_input, - const ValueRef input_scale, - const ValueRef input_zp, - const ValueRef weight_data, - const ValueRef weight_sums_data, - const ValueRef weight_scales_data, - const ValueRef weight_zeros_data, - const ValueRef group_size, - const ValueRef bias_data, - const ValueRef output) { - std::vector input_sizes = graph.sizes_of(fp_input); - std::vector weight_sizes = graph.sizes_of(weight_data); - - const int64_t K = utils::val_at(-1, input_sizes); - // K (input channels) must be a multiple of 4 to ensure that reading a group - // of 4 input channels from the input tensor will be aligned on a texel - // boundary. - VK_CHECK_COND(K % 4 == 0); - - // Prepack weight data - - const ValueRef packed_weight = - prepack_quantized_linear_weight(graph, weight_quant_config, weight_data); - const ValueRef packed_weight_scales = prepack_standard( - graph, weight_scales_data, utils::kBuffer, utils::kWidthPacked); - // Weight affine quant not supported at the moment - const ValueRef packed_weight_zeros = kDummyValueRef; - - // Prepack bias data - - // Create a dummy tensor to fill the binding slot of the bias tensor if it is - // not provided. This helps simplify dispatch logic and makes it so that - // fewer shdaer variants need to be generated. 
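// The dummy tensor only fills the bias descriptor binding; whether the shader
// actually reads the bias is gated by the apply_bias specialization constant
// computed in make_linear_qw_node / make_linear_qa_qw_node.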
- TmpTensor dummy_bias( - &graph, {}, graph.dtype_of(output), utils::kBuffer, utils::kWidthPacked); - - ValueRef packed_bias = dummy_bias.vref; - if (graph.val_is_not_none(bias_data)) { - packed_bias = - prepack_standard(graph, bias_data, utils::kBuffer, utils::kWidthPacked); - } - - // Use weight only quantized linear if at least one is true: - // 1. Device does not support int8 dot product - // 2. Input is not quantized - if (!graph.can_use_int8_dot_product() || - input_quant_config.granularity == kNoQuantization) { - DynamicDispatchNode linear_qw_node(make_linear_qw_node( - graph, - weight_quant_config, - fp_input, - weight_data, - packed_weight, - packed_weight_scales, - packed_weight_zeros, - group_size, - bias_data, - packed_bias, - output)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode(linear_qw_node)); - return; - } else { - // Otherwise, use input and weight quantized linear computed with integer - // accumulation - - // Input scale/zero point only used for activation & weight quantized linear - ValueRef packed_input_scale = input_scale; - ValueRef packed_input_zp = input_zp; - if (graph.val_is_tref(input_scale)) { - VK_CHECK_COND(graph.val_is_tref(packed_input_zp)); - packed_input_scale = prepack_standard( - graph, input_scale, utils::kBuffer, utils::kWidthPacked); - packed_input_zp = prepack_standard( - graph, input_zp, utils::kBuffer, utils::kWidthPacked); - } - - // Pre-computed per quant group weight sums are needed for int accumulation, - // but not for weight only - const ValueRef packed_weight_sums = prepack_standard( - graph, weight_sums_data, utils::kBuffer, utils::kWidthPacked); - - // Allocate temporary tensor to store quantized and packed input - - int64_t num_blocks_M, num_blocks_K; - std::tie(num_blocks_M, num_blocks_K) = - get_quantized_input_num_blocks(graph, fp_input); - - const int64_t int_input_height = num_blocks_M; - const int64_t int_input_width = num_blocks_K * 4; - - TmpTensor packed_int_input( - &graph, - {int_input_height, int_input_width}, - vkapi::kInt, - utils::kBuffer, - utils::kWidthPacked); - - DynamicDispatchNode quantize_and_pack_linear_node( - make_quantize_and_pack_linear_input_node( - graph, - input_quant_config, - fp_input, - packed_input_scale, - packed_input_zp, - input_scale, - input_zp, - packed_int_input, - group_size)); - - graph.execute_nodes().emplace_back( - new DynamicDispatchNode(quantize_and_pack_linear_node)); - - DynamicDispatchNode linear_qa_qw_node(make_linear_qa_qw_node( - graph, - input_quant_config, - weight_quant_config, - fp_input, - packed_int_input, - packed_input_scale, - packed_input_zp, - input_scale, - input_zp, - weight_data, - packed_weight, - packed_weight_sums, - packed_weight_scales, - group_size, - bias_data, - packed_bias, - output)); - - graph.execute_nodes().emplace_back( - new DynamicDispatchNode(linear_qa_qw_node)); - } -} - -void linear_q8ta_q8csw(ComputeGraph& graph, const std::vector& args) { - int32_t idx = 0; - const ValueRef fp_input = args.at(idx++); - const ValueRef input_scale = args.at(idx++); - const ValueRef input_zp = args.at(idx++); - const ValueRef weight_data = args.at(idx++); - const ValueRef weight_sums_data = args.at(idx++); - const ValueRef weight_scales_data = args.at(idx++); - const ValueRef bias_data = args.at(idx++); - const ValueRef output = args.at(idx++); - - const int64_t K = graph.size_at(-1, fp_input); - - QuantizationConfig input_quant_config(8, kPerTensor, {}, false); - QuantizationConfig weight_quant_config(8, kPerChannel, {K}); - - quantized_linear_impl( 
- graph, - input_quant_config, - weight_quant_config, - fp_input, - input_scale, - input_zp, - weight_data, - weight_sums_data, - weight_scales_data, - kDummyValueRef, // weight_zeros_data - kDummyValueRef, // group_size - bias_data, - output); -} - -void linear_q8csw(ComputeGraph& graph, const std::vector& args) { - int32_t idx = 0; - const ValueRef fp_input = args.at(idx++); - const ValueRef weight_data = args.at(idx++); - const ValueRef weight_scales_data = args.at(idx++); - const ValueRef bias_data = args.at(idx++); - const ValueRef output = args.at(idx++); - - const int64_t K = graph.size_at(-1, fp_input); - - QuantizationConfig input_quant_config(32, kNoQuantization, {}); - QuantizationConfig weight_quant_config(8, kPerChannel, {K}); - - quantized_linear_impl( - graph, - input_quant_config, - weight_quant_config, - fp_input, - kDummyValueRef, // input scale - kDummyValueRef, // input zp - weight_data, - kDummyValueRef, // weight sums - weight_scales_data, - kDummyValueRef, // weight zeros - kDummyValueRef, // group size - bias_data, - output); -} - -void linear_q4gsw(ComputeGraph& graph, const std::vector& args) { - int32_t idx = 0; - const ValueRef fp_input = args.at(idx++); - const ValueRef weight_data = args.at(idx++); - const ValueRef weight_scales_data = args.at(idx++); - const ValueRef group_size = args.at(idx++); - const ValueRef bias_data = args.at(idx++); - const ValueRef output = args.at(idx++); - - const int64_t group_size_val = graph.extract_scalar(group_size); - - QuantizationConfig input_quant_config(32, kNoQuantization, {}); - QuantizationConfig weight_quant_config(4, kPerGroup, {group_size_val}); - - quantized_linear_impl( - graph, - input_quant_config, - weight_quant_config, - fp_input, - kDummyValueRef, // input scale - kDummyValueRef, // input zp - weight_data, - kDummyValueRef, // weight sums - weight_scales_data, - kDummyValueRef, // weight zeros - group_size, // group size - bias_data, - output); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(et_vk.linear_q8ta_q8csw.default, linear_q8ta_q8csw); - VK_REGISTER_OP(et_vk.linear_q8csw.default, linear_q8csw); - VK_REGISTER_OP(et_vk.linear_q4gsw.default, linear_q4gsw); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.h b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.h deleted file mode 100644 index 7b62c98390d..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include - -namespace vkcompute { - -utils::uvec3 quantized_linear_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args); - -ValueRef prepack_quantized_linear_weight( - ComputeGraph& graph, - const QuantizationConfig& weight_quant_config, - const ValueRef qmat2_data); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp deleted file mode 100644 index 89c9e847724..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. 
- * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -#include -#include - -namespace vkcompute { - -// Custom global workgroup size function for linear_qcs8w -utils::uvec3 linear_qcs8w_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - return {static_cast(graph->numel_of(out)), 1, 1}; -} - -// Custom local workgroup size function for linear_qcs8w -utils::uvec3 linear_qcs8w_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)graph; - (void)shader; - (void)global_workgroup_size; - (void)args; - (void)resize_args; - return {64, 1, 1}; -} - -// Custom global workgroup size function for linear_qcsnw_tiled -utils::uvec3 linear_qcsnw_tiled_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mat1 = args.at(1).refs.at(0); - - // Determine quantization bits from shader name - int quant_nbits = 8; - if (shader.kernel_name.find("qcs4w") != std::string::npos) { - quant_nbits = 4; - } - - std::vector mat1_sizes = graph->sizes_of(mat1); - const int64_t M = utils::val_at(-2, mat1_sizes); - uint32_t out_tile_nrows = 4; - if (M % 6 == 0) { - out_tile_nrows = 2; - } else if (M % 4 == 0) { - out_tile_nrows = 4; - } else if (M % 1 == 0) { - out_tile_nrows = 1; - } else { - out_tile_nrows = 4; - } - - // Number of output texels in the output tile - uint32_t out_tile_ntxcols = 1; - if (quant_nbits == 4) { - out_tile_ntxcols = 2; - } - - utils::uvec3 out_limits = graph->logical_limits_of(out); - uint32_t global_wg_x = utils::div_up(out_limits[0], out_tile_ntxcols); - return { - global_wg_x * (utils::div_up(out_limits[1], out_tile_nrows)), - 1, - out_limits[2]}; -} - -// Custom local workgroup size function for linear_qcsnw_tiled -utils::uvec3 linear_qcsnw_tiled_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)graph; - (void)global_workgroup_size; - (void)args; - (void)resize_args; - - // Check if using cooperative algorithm from shader name - bool use_coop_algorithm = - shader.kernel_name.find("_coop") != std::string::npos; - - if (use_coop_algorithm) { - return {8, 1, 8}; - } else { - return {64, 1, 1}; - } -} - -void check_linear_qcsnw_args( - const ComputeGraph& graph, - const int quant_nbits, - const ValueRef mat1, - const ValueRef qmat2_data, - const ValueRef scales, - const ValueRef out) { - std::vector mat1_sizes = graph.sizes_of(mat1); - std::vector qmat2_sizes = graph.sizes_of(qmat2_data); - std::vector scales_sizes = graph.sizes_of(scales); - - VK_CHECK_COND(mat1_sizes.size() == 2 || mat1_sizes.size() == 3); - VK_CHECK_COND(qmat2_sizes.size() == 2); - VK_CHECK_COND(scales_sizes.size() == 1); - - VK_CHECK_COND(graph.packed_dim_of(mat1) == graph.packed_dim_of(out)); - - if (quant_nbits == 4) { - VK_CHECK_COND( - utils::val_at(-1, mat1_sizes) == utils::val_at(-1, qmat2_sizes) * 2); - VK_CHECK_COND( - utils::val_at(-1, scales_sizes) == utils::val_at(-2, 
qmat2_sizes)); - } else { - VK_CHECK_COND( - utils::val_at(-1, mat1_sizes) == utils::val_at(-1, qmat2_sizes)); - VK_CHECK_COND( - utils::val_at(-1, scales_sizes) == utils::val_at(-2, qmat2_sizes)); - } - - if (graph.is_buffer_storage(out)) { - VK_CHECK_COND(graph.is_contiguous(out)); - } -} - -void resize_linear_qcsnw_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mat1 = args.at(1).refs.at(0); - const ValueRef qmat2 = args.at(1).refs.at(1); - - const std::vector mat1_sizes = graph->sizes_of(mat1); - const std::vector qmat2_sizes = graph->sizes_of(qmat2); - - const int out_cols = utils::val_at(-2, mat1_sizes); - int out_rows = utils::val_at(-1, qmat2_sizes); - // Byte dtype suggests 4-bit quantization in which case the weight tensor is - // packed with 2 values per byte. - if (graph->dtype_of(qmat2) == vkapi::kByte) { - out_rows *= 2; - } - - std::vector new_out_sizes(3); - if (mat1_sizes.size() == 2) { - new_out_sizes.resize(2); - new_out_sizes.at(0) = out_cols; - new_out_sizes.at(1) = out_rows; - } else { - new_out_sizes.at(0) = mat1_sizes.at(0); - new_out_sizes.at(1) = out_cols; - new_out_sizes.at(2) = out_rows; - } - - graph->virtual_resize(out, new_out_sizes); -} - -void add_linear_qcs8w_node( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef q_mat2_data, - const ValueRef scales_data, - const ValueRef out) { - auto viewFn = VK_GET_OP_FN("aten.view_copy.default"); - ValueRef mat1_W_packed = mat1; - ValueRef out_W_packed = out; - // Create temporary tensors to store the width packed versions of mat1 and out - TmpTensor mat1_tmp( - &graph, graph.sizes_of(mat1), graph.dtype_of(mat1), utils::kWidthPacked); - TmpTensor out_tmp( - &graph, graph.sizes_of(out), graph.dtype_of(out), utils::kWidthPacked); - if (!graph.is_buffer_storage(out) && - graph.packed_dim_of(mat1) != WHCN::kWidthDim) { - // Ensure mat1 is width packed - mat1_W_packed = mat1_tmp; - viewFn(graph, {mat1, graph.add_none(), mat1_W_packed}); - // Ensure out is packed correctly - out_W_packed = out_tmp; - } - ValueRef q_mat2 = prepack_standard_hw_transposed( - graph, q_mat2_data, graph.storage_type_of(out), utils::kWidthPacked); - ValueRef scales = prepack_standard( - graph, scales_data, graph.storage_type_of(out), utils::kWidthPacked); - - std::string kernel_name = "linear_qcs8w"; - kernel_name.reserve(kShaderNameReserve); - add_packed_dim_suffix(kernel_name, graph.packed_dim_of(mat1_W_packed)); - add_packed_dim_suffix(kernel_name, graph.packed_dim_of(q_mat2)); - add_dtype_suffix(kernel_name, graph.dtype_of(out_W_packed)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out_W_packed)); - - std::vector pcs; - if (graph.is_buffer_storage(out_W_packed)) { - pcs = { - graph.sizes_pc_of(out_W_packed), - graph.strides_pc_of(out_W_packed), - graph.sizes_pc_of(mat1_W_packed), - graph.strides_pc_of(mat1), - graph.strides_pc_of(q_mat2), - graph.strides_pc_of(scales), - graph.numel_pc_of(out_W_packed)}; - } else { - pcs = { - graph.logical_limits_pc_of(out_W_packed), - graph.sizes_pc_of(mat1_W_packed)}; - } - - const utils::uvec3 global_wg = { - static_cast(graph.numel_of(out_W_packed)), 1, 1}; - const utils::uvec3 local_wg{64, 1, 1}; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - linear_qcs8w_global_wg_size, - linear_qcs8w_local_wg_size, - // Inputs and Outputs - {{out_W_packed, vkapi::MemoryAccessType::WRITE}, - 
{{mat1_W_packed, q_mat2, scales}, vkapi::MemoryAccessType::READ}}, - // Shader params buffers - {}, - // Push Constants - pcs, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - resize_linear_qcsnw_node)); - if (!graph.is_buffer_storage(out) && - graph.packed_dim_of(out) != WHCN::kWidthDim) { - viewFn(graph, {out_W_packed, graph.add_none(), out}); - } -} - -void add_linear_qcsnw_tiled_node( - ComputeGraph& graph, - const bool use_coop_algorithm, - const int quant_nbits, - const ValueRef mat1, - const ValueRef q_mat2_data, - const ValueRef scales_data, - const ValueRef out) { - uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); - std::vector qmat2_orig_sizes = graph.sizes_of(q_mat2_data); - const int64_t ndim = graph.dim_of(q_mat2_data); - const int64_t K = qmat2_orig_sizes.at(ndim - 1); - const int64_t N = qmat2_orig_sizes.at(ndim - 2); - - ValueRef q_mat2; - if (quant_nbits == 4) { - q_mat2 = - prepack_int4_linear_weight_transposed_interleaved(graph, q_mat2_data); - } else { - utils::StorageType q_mat2_storage = utils::kTexture2D; - if (N > max_extent * 4 || K > max_extent) { - q_mat2_storage = utils::kBuffer; - } - - q_mat2 = prepack_standard_hw_transposed( - graph, q_mat2_data, q_mat2_storage, utils::kWidthPacked); - } - - utils::StorageType scales_storage = utils::kTexture2D; - if (N > max_extent) { - scales_storage = utils::kBuffer; - } - ValueRef scales = - prepack_standard(graph, scales_data, scales_storage, utils::kWidthPacked); - - std::string kernel_name; - if (quant_nbits == 4) { - kernel_name = - use_coop_algorithm ? "linear_qcs4w_coop" : "linear_qcs4w_tiled"; - } else { - kernel_name = - use_coop_algorithm ? "linear_qcs8w_coop" : "linear_qcs8w_tiled"; - } - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(mat1)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(q_mat2)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(scales)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - std::vector mat1_sizes = graph.sizes_of(mat1); - const int64_t M = utils::val_at(-2, mat1_sizes); - uint32_t out_tile_nrows = 4; - if (M % 6 == 0) { - kernel_name += "_o4x2"; - out_tile_nrows = 2; - } else if (M % 4 == 0) { - kernel_name += "_o4x4"; - out_tile_nrows = 4; - } else if (M % 1 == 0) { - kernel_name += "_o4x1"; - out_tile_nrows = 1; - } else { - kernel_name += "_o4x4"; - out_tile_nrows = 4; - } - - // Number of output texels in the output tile - uint32_t out_tile_ntxcols = 1; - if (quant_nbits == 4) { - out_tile_ntxcols = 2; - } - - utils::uvec3 out_limits = graph.logical_limits_of(out); - uint32_t global_wg_x = utils::div_up(out_limits[0], out_tile_ntxcols); - utils::uvec3 global_wg_size = { - global_wg_x * (utils::div_up(out_limits[1], out_tile_nrows)), - 1, - out_limits[2]}; - - utils::uvec3 local_wg_size{64, 1, 1}; - if (use_coop_algorithm) { - local_wg_size = {8, 1, 8}; - } - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - linear_qcsnw_tiled_global_wg_size, - linear_qcsnw_tiled_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{mat1, q_mat2, scales}, vkapi::kRead}}, - // Shader params buffers - {}, - // Push Constants - {{graph.sizes_pc_of(out), graph.sizes_pc_of(mat1)}}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - resize_linear_qcsnw_node)); -} - -bool can_use_tiled_impl( 
- ComputeGraph& graph, - const ValueRef mat1, - const ValueRef q_mat2_data, - const ValueRef scales_data, - const ValueRef out) { - (void)q_mat2_data; - (void)scales_data; - - // Check if mat1 is not a 3D tensor or that batches = 1 - // TODO(ssjia): Add support for batches in the tiled impl - if (graph.dim_of(mat1) == 3 && graph.size_at(0, mat1) != 1) { - return false; - } - // Check that K is a multiple of 4 - if (graph.size_at(-1, mat1) % 4 != 0) { - return false; - } - // Check that N is a multiple of 4 - if (graph.size_at(-1, out) % 4 != 0) { - return false; - } - // Check that the packed dim is the width dim - if (graph.packed_dim_of(mat1) != WHCN::kWidthDim && - graph.packed_dim_of(out) != WHCN::kWidthDim) { - return false; - } - // Check that no special axis mapping is used for the input - // TODO(ssjia): Add support for non-standard axis mapping in the tiled impl - if (!graph.has_standard_axis_map(mat1)) { - return false; - } - // Check that no special axis mapping is used for the output - // TODO(ssjia): Add support for non-standard axis mapping in the tiled impl - if (!graph.has_standard_axis_map(out)) { - return false; - } - - return true; -} - -bool can_use_coop_impl(ComputeGraph& graph, const ValueRef mat1) { - // Do not use coop algorithm for Adreno 702; manual experimentation shows that - // it performs worse than the tiled algorithm. - // TODO(ssjia): Determine a more robust heuristic to determine when the coop - // algorithm should be used, instead of depending on specific device identity. - if (graph.device_is_adreno() && graph.device_name_contains("702")) { - return false; - } - // Check that the computation is vector * matrix - return (graph.size_at(-2, mat1) == 1); -} - -void weight_int8pack_mm( - ComputeGraph& graph, - const std::vector& args) { - check_linear_qcsnw_args(graph, 8, args[0], args[1], args[2], args[3]); - if (can_use_tiled_impl(graph, args[0], args[1], args[2], args[3])) { - bool use_coop_algorithm = can_use_coop_impl(graph, args[0]); - return add_linear_qcsnw_tiled_node( - graph, use_coop_algorithm, 8, args[0], args[1], args[2], args[3]); - } - return add_linear_qcs8w_node(graph, args[0], args[1], args[2], args[3]); -} - -void linear_qcs4w(ComputeGraph& graph, const std::vector& args) { - check_linear_qcsnw_args(graph, 4, args[0], args[1], args[2], args[3]); - - VK_CHECK_COND(can_use_tiled_impl(graph, args[0], args[1], args[2], args[3])); - bool use_coop_algorithm = can_use_coop_impl(graph, args[0]); - return add_linear_qcsnw_tiled_node( - graph, use_coop_algorithm, 4, args[0], args[1], args[2], args[3]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten._weight_int8pack_mm.default, weight_int8pack_mm); - VK_REGISTER_OP(et_vk.linear_qcs4w.default, linear_qcs4w); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQGANW.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQGANW.cpp deleted file mode 100644 index 52cf75e28b5..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQGANW.cpp +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
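For reference, a minimal standalone sketch of the shape-based dispatch heuristic implemented by can_use_tiled_impl and can_use_coop_impl in the file above. It takes plain size vectors instead of a ComputeGraph, so the helper signatures are hypothetical, and the packed-dim, axis-map, and Adreno-702 checks are omitted.

```cpp
#include <cstdint>
#include <vector>

// Tiled shader requirements (layout checks omitted): a single batch, and both
// the reduction dim K and the output channel count N must be multiples of 4.
bool can_use_tiled(const std::vector<int64_t>& mat1_sizes, int64_t out_channels) {
  const bool single_batch = mat1_sizes.size() < 3 || mat1_sizes.front() == 1;
  const int64_t K = mat1_sizes.back();
  return single_batch && (K % 4 == 0) && (out_channels % 4 == 0);
}

// The cooperative shader targets GEMV, i.e. a single output row (M == 1).
bool can_use_coop(const std::vector<int64_t>& mat1_sizes) {
  const int64_t M = mat1_sizes.at(mat1_sizes.size() - 2);
  return M == 1;
}
```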
- */ - -#include - -#include -#include - -#include -#include - -namespace vkcompute { - -void check_linear_qga4w_args( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef group_size, - const ValueRef scales_and_zeros, - const ValueRef out) { - VK_CHECK_COND(graph.val_is_tensor(mat1)); - VK_CHECK_COND(graph.val_is_tref(mat2_data)); - VK_CHECK_COND(graph.val_is_tref(scales_and_zeros)); - - VK_CHECK_COND(graph.dim_of(mat1) <= 3); - VK_CHECK_COND(graph.dim_of(mat2_data) == 2); - VK_CHECK_COND(graph.dim_of(scales_and_zeros) == 3); - - VK_CHECK_COND(graph.size_at(-3, mat1) == 1); - const int K = graph.size_at(-1, mat1); - VK_CHECK_COND(graph.size_at(-1, mat2_data) * 2 == K); - - const int group_size_val = graph.extract_scalar(group_size); - VK_CHECK_COND(K % group_size_val == 0); - // Due to the way weight packing works, group size needs to be a multiple of 8 - VK_CHECK_COND(group_size_val % 8 == 0); - - VK_CHECK_COND(graph.has_standard_axis_map(mat1)); - VK_CHECK_COND(graph.has_standard_axis_map(out)); -} - -void resize_linear_qga4w_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - - ValueRef out = args.at(0).refs.at(0); - ValueRef mat1 = args.at(1).refs.at(0); - ValueRef mat2_data = extra_args.at(0); - - std::vector mat1_sizes = graph->sizes_of(mat1); - std::vector mat2_sizes = graph->sizes_of(mat2_data); - - const int64_t out_cols = utils::val_at(-2, mat1_sizes); - const int64_t out_rows = utils::val_at(-2, mat2_sizes); - - std::vector new_out_sizes(3); - if (mat1_sizes.size() == 2) { - new_out_sizes.resize(2); - new_out_sizes.at(0) = out_cols; - new_out_sizes.at(1) = out_rows; - } else { - new_out_sizes.at(0) = mat1_sizes.at(0); - new_out_sizes.at(1) = out_cols; - new_out_sizes.at(2) = out_rows; - } - - graph->virtual_resize(out, new_out_sizes); -} - -/** - * Determines if the cooperative algorithm should be used based on input tensor - * dimensions. Apply the coop algorithm for gemv cases, i.e. mat1 is avector as - * as opposed to a matrix. 
- */ -bool should_use_coop_algorithm(ComputeGraph* graph, const ValueRef& mat1) { - return graph->size_at(-2, mat1) == 1; -} - -vkapi::ShaderInfo pick_linear_qga4w_shader( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mat1 = args.at(1).refs.at(0); - const ValueRef mat2 = args.at(1).refs.at(1); - - const bool use_coop_algorithm = should_use_coop_algorithm(graph, mat1); - - std::string kernel_name = "linear_qga4w"; - if (use_coop_algorithm) { - kernel_name += "_coop"; - } else { - kernel_name += "_tiled"; - } - add_storage_type_suffix(kernel_name, graph->storage_type_of(out)); - add_storage_type_suffix(kernel_name, graph->storage_type_of(mat1)); - add_storage_type_suffix(kernel_name, graph->storage_type_of(mat2)); - add_dtype_suffix(kernel_name, graph->dtype_of(out)); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -utils::uvec3 linear_qga4w_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - - const bool use_coop_algorithm = - shader.kernel_name.find("_coop") != std::string::npos; - - if (!use_coop_algorithm) { - // Constructing the global workgroup size for the tiled algorithm - utils::uvec3 global_wg_size = graph->logical_limits_of(out); - // Each shader thread computes a 4 high x 8 wide tile of the output matrix, - // which is equivalent to 4 x 2 texels. Since the output tensor must be - // width packed, div-up the "texel-width" of the output by 2 and the height - // of the output tensor by 4 to obtain the number of tiles that need to be - // computed. - global_wg_size[0] = utils::div_up(global_wg_size[0], uint32_t(2)); - global_wg_size[1] = utils::div_up(global_wg_size[1], uint32_t(4)); - return global_wg_size; - } - - uint32_t output_channels = graph->size_at(-1, out); - uint32_t batch_size = graph->size_at(-2, out); - - // Constructing the global workgroup size of the co-operative algorithm. The - // local work group size is 64, and each local work group co-operates to - // compute 8 output channels of the output. Therefore, a total of - // (output_channels / 8 x 64) threads should be launched, assuming a batch - // size of 1. 
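// For example (hypothetical numbers, not taken from any particular model):
// with output_channels = 4096 and batch_size = 1, the dispatch returned below
// is {64, div_up(4096, 8u), 1} = {64, 512, 1}, i.e. 512 workgroups of 64
// threads, each cooperatively producing 8 output channels.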
- return {64, utils::div_up(output_channels, 8u), batch_size}; -} - -utils::uvec3 linear_qga4w_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)args; - (void)resize_args; - const bool use_coop_algorithm = - shader.kernel_name.find("_coop") != std::string::npos; - - if (use_coop_algorithm) { - return {64, 1, 1}; - } else { - return pick_hw_square_wg_size( - graph, shader, global_workgroup_size, args, resize_args); - } -} - -void add_linear_qga4w_node( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2_data, - const ValueRef group_size, - const ValueRef scales_and_zeros_data, - const ValueRef out) { - check_linear_qga4w_args( - graph, mat1, mat2_data, group_size, scales_and_zeros_data, out); - - const uint32_t group_size_val = graph.extract_scalar(group_size); - - ValueRef mat2 = - prepack_int4_linear_weight_transposed_block_4x8(graph, mat2_data); - - ValueRef scales_and_zeros = prepack_standard( - graph, scales_and_zeros_data, utils::kBuffer, utils::kWidthPacked); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - pick_linear_qga4w_shader, - linear_qga4w_global_wg_size, - linear_qga4w_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{mat1, mat2, scales_and_zeros}, vkapi::kRead}}, - // Shader params buffers - {}, - // Push Constants - {graph.sizes_pc_of(out), - graph.sizes_pc_of(mat1), - graph.sizes_pc_of(mat2)}, - // Specialization Constants - {SV(group_size_val)}, - // Resize Args - {mat2_data}, - // Resizing Logic - resize_linear_qga4w_node)); -} - -void linear_weight_int4( - ComputeGraph& graph, - const std::vector& args) { - return add_linear_qga4w_node( - graph, - args[0], // mat1 - args[1], // mat2 - args[2], // group_size - args[3], // scales_and_zeros - // There is an unused variable inner_k_tiles which is used to call - // _convert_weight_to_int4pack in the AOT custom op, which is why the 4th - // argument is skipped. - args[5] // out - ); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(et_vk.linear_weight_int4.default, linear_weight_int4); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear_QTA8A_QGA4W.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear_QTA8A_QGA4W.cpp deleted file mode 100644 index e3443ca34e6..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinear_QTA8A_QGA4W.cpp +++ /dev/null @@ -1,270 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
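Both 4-bit linear variants above check that the innermost weight dimension is K / 2 and that group_size is a multiple of 8, because two 4-bit values share one byte. The sketch below illustrates that packing; the nibble order and signedness are illustrative assumptions, since the actual layout is defined by the prepack_int4_* routines and their shaders, which are not shown here.

```cpp
#include <cassert>
#include <cstdint>

// Two 4-bit weights per byte: a [N, K] weight matrix is stored with an
// innermost dimension of K / 2 bytes.
uint8_t pack_int4_pair(int8_t lo, int8_t hi) {
  assert(lo >= -8 && lo <= 7 && hi >= -8 && hi <= 7);
  return static_cast<uint8_t>((lo & 0x0F) | ((hi & 0x0F) << 4));
}

int8_t unpack_int4_lo(uint8_t packed) {
  // Shift the low nibble into the sign position, then arithmetic-shift back.
  return static_cast<int8_t>(static_cast<int8_t>(packed << 4) >> 4);
}

int8_t unpack_int4_hi(uint8_t packed) {
  return static_cast<int8_t>(static_cast<int8_t>(packed) >> 4);
}
```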
- */ - -#include - -#include -#include - -#include -#include - -namespace vkcompute { - -void check_linear_qta8a_qga4w_args( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat1_scale, - const ValueRef mat1_zero_point, - const ValueRef mat2_data, - const ValueRef group_size, - const ValueRef weight_scales, - const ValueRef weight_zeros, - const ValueRef out) { - VK_CHECK_COND(graph.val_is_tensor(mat1)); - VK_CHECK_COND(graph.val_is_tensor(mat1_scale)); - VK_CHECK_COND(graph.val_is_tensor(mat1_zero_point)); - VK_CHECK_COND(graph.val_is_tref(mat2_data)); - VK_CHECK_COND(graph.val_is_tref(weight_scales)); - VK_CHECK_COND(graph.val_is_tref(weight_zeros)); - - VK_CHECK_COND(graph.dim_of(mat1) <= 3); - VK_CHECK_COND(graph.dim_of(mat2_data) == 2); - VK_CHECK_COND(graph.dim_of(weight_scales) == 2); - VK_CHECK_COND(graph.dim_of(weight_zeros) == 2); - - VK_CHECK_COND(graph.size_at(-3, mat1) == 1); - const int K = graph.size_at(-1, mat1); - VK_CHECK_COND(graph.size_at(-1, mat2_data) * 2 == K); - - const int group_size_val = graph.extract_scalar(group_size); - VK_CHECK_COND(K % group_size_val == 0); - // Due to the way weight packing works, group size needs to be a multiple of 8 - VK_CHECK_COND(group_size_val % 8 == 0); - - VK_CHECK_COND(graph.has_standard_axis_map(mat1)); - VK_CHECK_COND(graph.has_standard_axis_map(out)); - - // Check that scale and zero_point tensors are buffer storage with width - // packing - VK_CHECK_COND(graph.is_buffer_storage(mat1_scale)); - VK_CHECK_COND(graph.packed_dim_of(mat1_scale) == WHCN::kWidthDim); - VK_CHECK_COND(graph.is_buffer_storage(mat1_zero_point)); - VK_CHECK_COND(graph.packed_dim_of(mat1_zero_point) == WHCN::kWidthDim); - - // Calculate number of tokens for input - int64_t input_num_tokens = 1; - const auto mat1_sizes = graph.sizes_of(mat1); - for (size_t i = 0; i < mat1_sizes.size() - 1; i++) { - input_num_tokens *= mat1_sizes[i]; - } - - // Verify scale and zero_point tensor sizes match number of tokens - const auto mat1_scale_sizes = graph.sizes_of(mat1_scale); - const auto mat1_zero_point_sizes = graph.sizes_of(mat1_zero_point); - - VK_CHECK_COND( - utils::val_at(-1, mat1_scale_sizes) == input_num_tokens); - VK_CHECK_COND( - utils::val_at(-1, mat1_zero_point_sizes) == input_num_tokens); - - // Verify weight scales and zeros have the same shape - const auto weight_scales_sizes = graph.sizes_of(weight_scales); - const auto weight_zeros_sizes = graph.sizes_of(weight_zeros); - VK_CHECK_COND(weight_scales_sizes == weight_zeros_sizes); -} - -void resize_linear_qta8a_qga4w_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mat1 = args.at(1).refs.at(0); - const ValueRef mat2 = args.at(1).refs.at(1); - - const std::vector mat1_sizes = graph->sizes_of(mat1); - const std::vector mat2_sizes = graph->sizes_of(mat2); - - const int64_t out_cols = utils::val_at(-2, mat1_sizes); - const int64_t out_rows = utils::val_at(-1, mat2_sizes) * 2; - - std::vector new_out_sizes(3); - if (mat1_sizes.size() == 2) { - new_out_sizes.resize(2); - new_out_sizes.at(0) = out_cols; - new_out_sizes.at(1) = out_rows; - } else { - new_out_sizes.at(0) = mat1_sizes.at(0); - new_out_sizes.at(1) = out_cols; - new_out_sizes.at(2) = out_rows; - } - - graph->virtual_resize(out, new_out_sizes); -} - -/** - * Determines if the cooperative algorithm should be used based on input tensor - * dimensions. 
Apply the coop algorithm for vectors (GEMV cases), tiled for - * matrices (GEMM cases). - */ -bool should_use_coop_algorithm_qta8a_qga4w( - ComputeGraph* graph, - const ValueRef& mat1) { - const uint32_t M = graph->size_at(-2, mat1); - // Use coop algorithm for vectors (GEMV), tiled for larger matrices (GEMM) - return M == 1; -} - -vkapi::ShaderInfo pick_linear_qta8a_qga4w_shader( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - - const ValueRef out = args.at(0).refs.at(0); - const ValueRef mat1 = args.at(1).refs.at(0); - const ValueRef mat2 = args.at(1).refs.at(1); - - const bool use_coop_algorithm = - should_use_coop_algorithm_qta8a_qga4w(graph, mat1); - - std::string kernel_name = "linear_qta8a_qga4w"; - if (use_coop_algorithm) { - kernel_name += "_coop"; - } else { - kernel_name += "_tiled"; - } - add_storage_type_suffix(kernel_name, graph->storage_type_of(out)); - add_storage_type_suffix(kernel_name, graph->storage_type_of(mat1)); - add_storage_type_suffix(kernel_name, graph->storage_type_of(mat2)); - add_dtype_suffix(kernel_name, graph->dtype_of(out)); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -utils::uvec3 linear_qta8a_qga4w_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - - const bool use_coop_algorithm = - shader.kernel_name.find("_coop") != std::string::npos; - - // C = 1, H = 2, W = 3 - // global_wg_size = {round_up(C / 2f), round_up(H / 3f), W} --> (2W, 1H, 0C) - // --> {1, 1, 3} global - - utils::uvec3 global_wg_size = graph->logical_limits_of(out); - global_wg_size[0] = utils::div_up(global_wg_size[0], uint32_t(2)); - if (!use_coop_algorithm) { // GEMM - TILED - global_wg_size[1] = utils::div_up(global_wg_size[1], uint32_t(3)); - } - - return global_wg_size; -} - -utils::uvec3 linear_qta8a_qga4w_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)args; - (void)resize_args; - - const bool use_coop_algorithm = - shader.kernel_name.find("_coop") != std::string::npos; - - utils::uvec3 local_wg_size; - if (use_coop_algorithm) { // GEMV - COOP - local_wg_size = {8, 1, 8}; - } else { // GEMM - TILED - local_wg_size = graph->create_local_wg_size(global_workgroup_size); - } - - return local_wg_size; -} - -void add_linear_qta8a_qga4w_node( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat1_scale, - const ValueRef mat1_zero_point, - const ValueRef mat2_data, - const ValueRef group_size, - const ValueRef weight_scales_data, - const ValueRef weight_zeros_data, - const ValueRef out) { - check_linear_qta8a_qga4w_args( - graph, - mat1, - mat1_scale, - mat1_zero_point, - mat2_data, - group_size, - weight_scales_data, - weight_zeros_data, - out); - const uint32_t group_size_val = graph.extract_scalar(group_size); - - ValueRef mat2 = - prepack_int4_linear_weight_transposed_interleaved(graph, mat2_data); - ValueRef weight_scales = prepack_standard( - graph, weight_scales_data, utils::kBuffer, utils::kWidthPacked); - ValueRef weight_zeros = prepack_standard( - graph, weight_zeros_data, utils::kBuffer, utils::kWidthPacked); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - pick_linear_qta8a_qga4w_shader, - linear_qta8a_qga4w_global_wg_size, - linear_qta8a_qga4w_local_wg_size, - // Inputs and Outputs - 
{{out, vkapi::kWrite}, - {{mat1, mat2, weight_scales, weight_zeros, mat1_scale, mat1_zero_point}, - vkapi::kRead}}, - // Shader params buffers - {}, - // Push Constants - {graph.sizes_pc_of(out), - graph.sizes_pc_of(mat1), - graph.sizes_pc_of(mat2)}, - // Specialization Constants - {SV(group_size_val)}, - // Resize Args - {}, - // Resizing Logic - resize_linear_qta8a_qga4w_node)); -} - -void linear_qta8a_qga4w( - ComputeGraph& graph, - const std::vector& args) { - return add_linear_qta8a_qga4w_node( - graph, - args[0], // quantized input (char tensor) - args[1], // input_scale (float buffer tensor) - args[2], // input_zero_point (int buffer tensor) - args[3], // quantized weights (4-bit packed, byte) - args[4], // group_size (int) - args[5], // weight_scales (float tensor) - args[6], // weight_zeros (int tensor) - args[7] // float output tensor - ); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(et_vk.linear_qta8a_qga4w.default, linear_qta8a_qga4w); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Reduce.cpp b/backends/vulkan/runtime/graph/ops/impl/Reduce.cpp deleted file mode 100644 index 6ad1d7f371d..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Reduce.cpp +++ /dev/null @@ -1,269 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -#include -#include - -namespace vkcompute { - -using namespace utils; - -void resize_reduce_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - - const int32_t reduce_dim_nchw = - graph->extract_scalar(resize_args.at(0)); - - std::vector new_sizes = graph->sizes_of(in); - new_sizes.at(normalize(reduce_dim_nchw, new_sizes.size())) = 1; - graph->virtual_resize(out, new_sizes); -} - -void resize_reduce2d_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - - // Extract the dimensions to reduce over - const std::vector dims_list = - graph->extract_int_or_symint_list(resize_args.at(0)); - int32_t reduce_dim1_nchw = dims_list[0]; - int32_t reduce_dim2_nchw = dims_list[1]; - - std::vector new_sizes = graph->sizes_of(in); - new_sizes.at(normalize(reduce_dim1_nchw, new_sizes.size())) = 1; - new_sizes.at(normalize(reduce_dim2_nchw, new_sizes.size())) = 1; - graph->virtual_resize(out, new_sizes); -} - -utils::uvec3 reduce_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - const ValueRef out = args.at(0).refs.at(0); - const int32_t reduce_dim_whcn = - graph->extract_scalar(resize_args.at(1)); - - utils::uvec3 global_wg_size = graph->logical_limits_of(out); - global_wg_size[reduce_dim_whcn] = 1; - return global_wg_size; -} - -utils::uvec3 reduce_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)args; - (void)global_workgroup_size; - - const int32_t reduce_dim_whcn = - graph->extract_scalar(resize_args.at(1)); - const int64_t group_dim_whcn = - graph->extract_scalar(resize_args.at(2)); - - // This should 
match the value of MAX_NTHREADS in the reduce shader. - constexpr uint32_t max_nthreads = 16; - - const uint32_t nworkers_per_group = 4; - const uint32_t ngroups = 4; - VK_CHECK_COND(nworkers_per_group * ngroups <= max_nthreads); - - utils::uvec3 local_wg_size{1, 1, 1}; - local_wg_size[reduce_dim_whcn] = nworkers_per_group; - local_wg_size[group_dim_whcn] = ngroups; - - return local_wg_size; -} - -void add_reduce_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef dim_ref, - const ValueRef out, - const std::string& op_name) { - VK_CHECK_COND( - !graph.is_buffer_storage(in) && !graph.is_buffer_storage(out), - "Vulkan reduction only supports texture storage"); - - const int64_t ndim = graph.dim_of(in); - - int32_t reduce_dim = graph.extract_scalar(dim_ref); - reduce_dim = normalize(reduce_dim, ndim); - reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim); - - // Check that the concat dim is not the reduction dim, if the tensor has a - // batch dim greater than 1. - if (graph.dim_of(in) == 4 && graph.size_at(0, in) > 1) { - VK_CHECK_COND(graph.concat_dim_of(in) != reduce_dim); - VK_CHECK_COND(graph.concat_dim_of(out) != reduce_dim); - } - - std::string kernel_name = op_name; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - // Calculate group_dim for specialization constants - const int other_dim_1 = (reduce_dim + 1) % 3; - const int other_dim_2 = (reduce_dim + 2) % 3; - int32_t group_dim; - utils::uvec3 limits = graph.logical_limits_of(out); - if (limits[other_dim_1] > limits[other_dim_2]) { - group_dim = other_dim_1; - } else { - group_dim = other_dim_2; - } - - const ValueRef reduce_dim_whcn_ref = - graph.get_or_add_value_for_int(reduce_dim); - const ValueRef group_dim_whcn_ref = graph.get_or_add_value_for_int(group_dim); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - reduce_global_wg_size, - reduce_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - {graph.logical_limits_ubo(in), graph.sizes_ubo(in)}, - // Push Constants - {}, - // Specialization Constants - {graph.packed_dim_of(out), reduce_dim, group_dim}, - // Resize Args - {dim_ref, reduce_dim_whcn_ref, group_dim_whcn_ref}, - // Resizing Logic - resize_reduce_node)); -} - -void add_reduce2d_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef dims_ref, - const ValueRef out, - const std::string& op_name) { - VK_CHECK_COND( - !graph.is_buffer_storage(in) && !graph.is_buffer_storage(out), - "Vulkan reduction only supports texture storage"); - - const int64_t ndim = graph.dim_of(in); - - // Extract the two dimensions to reduce over - const std::vector dims_list = - graph.extract_int_or_symint_list(dims_ref); - VK_CHECK_COND( - dims_list.size() == 2, "reduce2d requires exactly 2 dimensions"); - - int32_t reduce_dim1 = normalize(dims_list[0], ndim); - int32_t reduce_dim2 = normalize(dims_list[1], ndim); - - // Convert to WHCN format - reduce_dim1 = nchw_dim_to_whcn_dim(reduce_dim1, ndim); - reduce_dim2 = nchw_dim_to_whcn_dim(reduce_dim2, ndim); - - // Check that none of the reduction dims are packed - VK_CHECK_COND(graph.packed_dim_of(in) != reduce_dim1); - VK_CHECK_COND(graph.packed_dim_of(in) != reduce_dim2); - VK_CHECK_COND(graph.packed_dim_of(out) != reduce_dim1); - VK_CHECK_COND(graph.packed_dim_of(out) != reduce_dim2); - - // Check that the concat dim is not one of the reduction dims - if (graph.dim_of(in) == 4 && graph.size_at(0, in) 
> 1) { - VK_CHECK_COND(graph.concat_dim_of(in) != reduce_dim1); - VK_CHECK_COND(graph.concat_dim_of(in) != reduce_dim2); - VK_CHECK_COND(graph.concat_dim_of(out) != reduce_dim1); - VK_CHECK_COND(graph.concat_dim_of(out) != reduce_dim2); - } - - std::string kernel_name = op_name + "2d"; // Add "2d" suffix - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - // Calculate group_dim for specialization constants (use remaining dimension) - int32_t group_dim = 0; - for (int i = 0; i < 3; i++) { - if (i != reduce_dim1 && i != reduce_dim2) { - group_dim = i; - break; - } - } - - const ValueRef reduce_dim1_whcn_ref = - graph.get_or_add_value_for_int(reduce_dim1); - const ValueRef reduce_dim2_whcn_ref = - graph.get_or_add_value_for_int(reduce_dim2); - const ValueRef group_dim_whcn_ref = graph.get_or_add_value_for_int(group_dim); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - reduce_global_wg_size, - reduce_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - {graph.logical_limits_ubo(in), graph.sizes_ubo(in)}, - // Push Constants - {}, - // Specialization Constants - {graph.packed_dim_of(out), reduce_dim1, reduce_dim2, group_dim}, - // Resize Args - {dims_ref, - reduce_dim1_whcn_ref, - reduce_dim2_whcn_ref, - group_dim_whcn_ref}, - // Resizing Logic - resize_reduce2d_node)); -} - -#define DEFINE_REDUCE_FN(op_name, out_arg_idx) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - const std::vector dims_list = \ - graph.extract_int_or_symint_list(args[1]); \ - if (dims_list.size() == 1) { \ - const int64_t dim_val = dims_list.at(0); \ - const ValueRef dim_ref = graph.get_or_add_value_for_int(dim_val); \ - return add_reduce_node( \ - graph, args[0], dim_ref, args[out_arg_idx], #op_name); \ - } \ - if (dims_list.size() == 2) { \ - return add_reduce2d_node( \ - graph, args[0], args[1], args[out_arg_idx], #op_name); \ - } \ - VK_CHECK_COND(false, "Only 1 or 2 dimensions supported"); \ - } - -DEFINE_REDUCE_FN(sum, 4) -DEFINE_REDUCE_FN(mean, 4) -DEFINE_REDUCE_FN(amax, 3) -DEFINE_REDUCE_FN(amin, 3) - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.sum.dim_IntList, sum); - VK_REGISTER_OP(aten.mean.dim, mean); - VK_REGISTER_OP(aten.amax.default, amax); - VK_REGISTER_OP(aten.amin.default, amin); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp b/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp deleted file mode 100644 index 72c1637a2c9..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Repeat.cpp +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
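The reduce implementation above converts PyTorch-style NCHW dim indices into the WHCN indexing used by the Vulkan backend before picking workgroup sizes. A small sketch of that conversion, where the helper names are stand-ins for normalize and nchw_dim_to_whcn_dim:

```cpp
#include <cstdint>

// Wrap a possibly negative dim into [0, ndim).
int64_t normalize_dim(int64_t dim, int64_t ndim) {
  return (dim % ndim + ndim) % ndim;
}

// NCHW dim index -> WHCN dim index: the innermost NCHW dim (width) becomes
// dim 0 in WHCN order, so the mapping is a simple reversal.
int64_t nchw_to_whcn(int64_t dim, int64_t ndim) {
  return ndim - 1 - dim;
}

// Example: reducing dim = -1 of a 4-D tensor normalizes to 3 and maps to
// WHCN dim 0 (the width axis), which is then set to 1 in the global
// workgroup size so each invocation reduces along that axis.
```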
- */ - -#include - -#include -#include -#include -#include -#include - -#include - -namespace vkcompute { - -namespace { - -void check_args( - ComputeGraph& graph, - const ValueRef in, - const std::vector& repeats, - const ValueRef out) { - VK_CHECK_COND(graph.packed_dim_of(in) == graph.packed_dim_of(out)); - - VK_CHECK_COND(graph.storage_type_of(in) == graph.storage_type_of(out)); - if (graph.storage_type_of(in) == utils::kTexture2D) { - VK_CHECK_COND(graph.dim_of(in) <= 2); - } - - const int64_t in_dim = graph.dim_of(in); - VK_CHECK_COND( - in_dim <= repeats.size(), - "Input tensor dim size must be not greater than the repeat argument's size"); - - const std::vector in_sizes = graph.sizes_of(in); - const std::vector out_sizes = graph.sizes_of(out); - - VK_CHECK_COND( - dim_at(in_sizes) * dim_at(repeats) == - dim_at(out_sizes), - "Output's width doesn't match input's width * repeat count"); - - VK_CHECK_COND( - dim_at(in_sizes) * dim_at(repeats) == - dim_at(out_sizes), - "Output's height doesn't match input's height * repeat count"); - - VK_CHECK_COND( - dim_at(in_sizes) * dim_at(repeats) == - dim_at(out_sizes), - "Output's channel doesn't match input's channel * repeat count"); - - VK_CHECK_COND( - dim_at(in_sizes) * dim_at(repeats) == - dim_at(out_sizes), - "Output's batch doesn't match input's batch * repeat count"); -} - -} // namespace - -void add_repeat_node( - ComputeGraph& graph, - ValueRef in, - ValueRef repeats_ref, - ValueRef out) { - const std::vector repeats = *(graph.get_int_list(repeats_ref)); - - check_args(graph, in, repeats, out); - - const std::vector in_sizes = graph.sizes_of(in); - const utils::ivec4 src_dims{ - dim_at(in_sizes), - dim_at(in_sizes), - dim_at(in_sizes), - dim_at(in_sizes)}; - const utils::ivec4 dst_repeats{ - dim_at(repeats), - dim_at(repeats), - dim_at(repeats), - dim_at(repeats)}; - - std::string kernel_name = "repeat"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - // A copy of range with the last element set to batch size of the input tensor - const utils::ivec3 wg_size = graph.logical_limits_of(out); - - const auto shader = VK_KERNEL_FROM_STR(kernel_name); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - { - {out, vkapi::kWrite}, - {in, vkapi::kRead}, - }, - // Parameter buffers - {}, - // Push Constants - { - PushConstantDataInfo(&wg_size, sizeof(wg_size), sizeof(utils::ivec4)), - PushConstantDataInfo( - &src_dims, sizeof(src_dims), sizeof(utils::ivec4)), - PushConstantDataInfo( - &dst_repeats, sizeof(dst_repeats), sizeof(utils::ivec4)), - }, - // Specialization Constants - {graph.hashed_layout_of(out), graph.hashed_layout_of(in)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void repeat(ComputeGraph& graph, const std::vector& args) { - add_repeat_node(graph, args[0], args[1], args[2]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.repeat.default, repeat); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.cpp b/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.cpp deleted file mode 100644 index 221d0d23f51..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.cpp +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
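The shape rule enforced by check_args in Repeat.cpp above follows aten.repeat semantics: each output dimension equals the corresponding input dimension multiplied by its repeat count, with dimensions aligned from the innermost side. A small sketch of that rule:

```cpp
#include <cstdint>
#include <vector>

// Output sizes for a repeat op: align input sizes and repeat counts from the
// innermost dimension, then multiply element-wise.
std::vector<int64_t> repeat_out_sizes(
    std::vector<int64_t> in_sizes,
    const std::vector<int64_t>& repeats) {
  // Left-pad the input sizes with 1s so both vectors have the same rank.
  while (in_sizes.size() < repeats.size()) {
    in_sizes.insert(in_sizes.begin(), 1);
  }
  std::vector<int64_t> out_sizes(in_sizes.size());
  for (size_t i = 0; i < in_sizes.size(); ++i) {
    out_sizes[i] = in_sizes[i] * repeats[i];
  }
  return out_sizes;
}

// e.g. in_sizes = {2, 3}, repeats = {2, 1, 4} -> out_sizes = {2, 2, 12}.
```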
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -#include -#include - -namespace vkcompute { - -void resize_repeat_interleave_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - - const int64_t nrepeats = graph->extract_scalar(extra_args.at(0)); - int64_t repeat_dim = graph->extract_scalar(extra_args.at(1)); - - std::vector new_sizes = graph->sizes_of(in); - repeat_dim = normalize(repeat_dim, new_sizes.size()); - new_sizes.at(repeat_dim) *= nrepeats; - - graph->virtual_resize(out, new_sizes); -} - -void add_repeat_interleave_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef num_repeats, - const ValueRef dim, - const ValueRef out) { - const int32_t nrepeats = graph.extract_scalar(num_repeats); - const int32_t repeat_dim = - graph.extract_whcn_dim(dim, graph.dim_of(in)); - - VK_CHECK_COND(repeat_dim != graph.packed_dim_of(out)); - VK_CHECK_COND(repeat_dim != graph.packed_dim_of(in)); - - std::string kernel_name = "repeat_interleave"; - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::MemoryAccessType::WRITE}, - {in, vkapi::MemoryAccessType::READ}}, - // Parameter buffers - {graph.logical_limits_ubo(in)}, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - nrepeats, - repeat_dim}, - // Resize Args - {num_repeats, dim}, - // Resizing Logic - resize_repeat_interleave_node)); -} - -void repeat_interleave(ComputeGraph& graph, const std::vector& args) { - int args_i = 0; - const ValueRef in = args[args_i++]; - const ValueRef num_repeats = args[args_i++]; - const ValueRef dim = args[args_i++]; - const ValueRef output_size = args[args_i++]; - const ValueRef out = args[args_i++]; - - // Output size is not used in the kernel - (void)output_size; - - add_repeat_interleave_node(graph, in, num_repeats, dim, out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.repeat_interleave.self_int, repeat_interleave); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.h b/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.h deleted file mode 100644 index f29a817e86e..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -namespace vkcompute { - -void add_repeat_interleave_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef num_repeats, - const ValueRef dim, - const ValueRef out); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/RotaryEmbedding.cpp b/backends/vulkan/runtime/graph/ops/impl/RotaryEmbedding.cpp deleted file mode 100644 index fcc8fe4b265..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/RotaryEmbedding.cpp +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. 
- * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include - -namespace vkcompute { - -void resize_rotary_embedding_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - - const ValueRef xq_out = args.at(0).refs.at(0); - const ValueRef xk_out = args.at(0).refs.at(1); - - const ValueRef xq = args.at(1).refs.at(0); - const ValueRef xk = args.at(1).refs.at(1); - - const std::vector xq_sizes = graph->sizes_of(xq); - const std::vector xk_sizes = graph->sizes_of(xk); - - graph->virtual_resize(xq_out, xq_sizes); - graph->virtual_resize(xk_out, xk_sizes); -} - -utils::uvec3 rotary_embedding_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - - const ValueRef xq_out = args.at(0).refs.at(0); - - utils::uvec3 global_wg_size = graph->logical_limits_of(xq_out); - global_wg_size[0] /= 2; - - return global_wg_size; -} - -void add_rotary_embedding_node( - ComputeGraph& graph, - const ValueRef xq, - const ValueRef xk, - const ValueRef freqs_cos, - const ValueRef freqs_sin, - const ValueRef xq_out, - const ValueRef xk_out) { - VK_CHECK_COND(graph.size_at(-1, xq) == graph.size_at(-1, xk)); - VK_CHECK_COND(graph.size_at(-3, xq) == graph.size_at(-3, xk)); - VK_CHECK_COND( - graph.size_at(-1, xq) == graph.size_at(-1, freqs_cos) * 2); - VK_CHECK_COND(graph.sizes_of(freqs_cos) == graph.sizes_of(freqs_sin)); - - VK_CHECK_COND(graph.packed_dim_of(xq) == WHCN::kWidthDim); - VK_CHECK_COND(graph.packed_dim_of(xk) == WHCN::kWidthDim); - VK_CHECK_COND(graph.packed_dim_of(freqs_cos) == WHCN::kWidthDim); - VK_CHECK_COND(graph.packed_dim_of(freqs_sin) == WHCN::kWidthDim); - VK_CHECK_COND(graph.has_standard_axis_map(xq)); - VK_CHECK_COND(graph.has_standard_axis_map(xk)); - VK_CHECK_COND(graph.has_standard_axis_map(freqs_cos)); - VK_CHECK_COND(graph.has_standard_axis_map(freqs_sin)); - - std::string kernel_name = "rotary_embedding"; - add_dtype_suffix(kernel_name, graph.dtype_of(xq_out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - rotary_embedding_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{{xq_out, xk_out}, vkapi::kWrite}, - {{xq, xk, freqs_cos, freqs_sin}, vkapi::kRead}}, - // Parameter buffers - {graph.logical_limits_ubo(xq_out), graph.logical_limits_ubo(xk_out)}, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - resize_rotary_embedding_node)); -} - -void apply_rotary_emb(ComputeGraph& graph, const std::vector& args) { - const ValueListPtr out_tuple = graph.get_value_list(args[4]); - const ValueRef xq_out = out_tuple->at(0); - const ValueRef xk_out = out_tuple->at(1); - - add_rotary_embedding_node( - graph, args[0], args[1], args[2], args[3], xq_out, xk_out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(et_vk.apply_rotary_emb.default, apply_rotary_emb); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/SDPA.cpp b/backends/vulkan/runtime/graph/ops/impl/SDPA.cpp deleted file mode 100644 index 2cc7455cd4a..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/SDPA.cpp +++ /dev/null @@ -1,566 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
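In the rotary embedding op above, freqs_cos and freqs_sin have half the last-dim size of xq, and the shader's global workgroup width is divided by 2, because each invocation rotates one pair of elements with one (cos, sin) pair. A scalar reference of that rotation, assuming the interleaved pairing convention used by the Llama reference implementation (the exact pairing is defined by the shader, which is not shown here):

```cpp
// Rotate one (even, odd) element pair by the angle encoded in (cos_f, sin_f).
void rotate_pair(float& x0, float& x1, float cos_f, float sin_f) {
  const float r0 = x0 * cos_f - x1 * sin_f;
  const float r1 = x0 * sin_f + x1 * cos_f;
  x0 = r0;
  x1 = r1;
}
```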
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -#include -#include -#include -#include -#include - -#include - -#include -#include - -namespace vkcompute { - -void resize_sdpa_out( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)args; - - int arg_idx = 0; - const ValueRef q_projected = extra_args[arg_idx++]; - const ValueRef out = extra_args[arg_idx++]; - graph->virtual_resize(out, graph->sizes_of(q_projected)); -} - -void resize_flash_attention_out( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - - // Find the output tensor in the args - it's the first tensor in the first - // ArgGroup - const ValueRef out = args.at(0).refs.at(0); - const ValueRef q_projected = args.at(1).refs.at(0); - graph->virtual_resize(out, graph->sizes_of(q_projected)); -} - -utils::uvec3 flash_attention_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - - const ValueRef q_projected = resize_args.at(0); - const ValueRef block_size_r = resize_args.at(1); - - // Get tensor dimensions - PyTorch format is [B, N, H, D] - // But Vulkan uses negative indexing: -4=B, -3=N, -2=H, -1=D - const int32_t B = graph->size_at(-4, q_projected); // batch - const int32_t N = graph->size_at(-3, q_projected); // sequence length - const int32_t H = graph->size_at(-2, q_projected); // num heads - const int32_t Br = - static_cast(graph->extract_scalar(block_size_r)); - - // Calculate number of row blocks - const int32_t Tr = (N + Br - 1) / Br; - - return {static_cast(B * H * Tr), 1, 1}; -} - -void flash_attention_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef q_projected = args[arg_idx++]; - const ValueRef k_cache = args[arg_idx++]; - const ValueRef v_cache = args[arg_idx++]; - const ValueRef input_pos_symint = args[arg_idx++]; - const ValueRef attn_mask = args[arg_idx++]; - const ValueRef dropout_p = args[arg_idx++]; - const ValueRef is_causal = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - - const ValueRef out = args[arg_idx++]; - - // Extract input_pos value for causal masking - const int32_t input_pos_val = graph.read_symint(input_pos_symint); - - const ValueRef k_cache_tensor = k_cache; - const ValueRef v_cache_tensor = v_cache; - - // Validation checks - re-enable with correct indexing - VK_CHECK_COND(graph.size_at(-4, q_projected) == 1); // batch size = 1 - VK_CHECK_COND(graph.size_at(-4, k_cache_tensor) == 1); - VK_CHECK_COND(graph.size_at(-4, v_cache_tensor) == 1); - VK_CHECK_COND( - graph.sizes_of(k_cache_tensor) == graph.sizes_of(v_cache_tensor)); - VK_CHECK_COND( - graph.size_at(-1, q_projected) == - graph.size_at(-1, k_cache_tensor)); // head_dim must match - VK_CHECK_COND( - graph.val_is_none(dropout_p) || - graph.extract_scalar(dropout_p) == 0); - VK_CHECK_COND(graph.val_is_none(scale)); - VK_CHECK_COND( - graph.val_is_none(is_causal) || graph.extract_scalar(is_causal)); - VK_CHECK_COND(graph.val_is_none(attn_mask)); - - if (graph.is_buffer_storage(q_projected)) { - VK_CHECK_COND(graph.is_buffer_storage(k_cache_tensor)); - VK_CHECK_COND(graph.is_buffer_storage(v_cache_tensor)); - VK_CHECK_COND(graph.is_buffer_storage(out)); - } - - // Calculate scale factor - const int32_t head_dim_size = graph.size_at(-1, q_projected); - const 
float scale_val = 1.0f / std::sqrt(static_cast(head_dim_size)); - - // Get number of heads for multi-query attention support - const int32_t num_heads = graph.size_at(-2, q_projected); - const int32_t num_kv_heads = graph.size_at(-2, k_cache_tensor); - - const int32_t block_size_r = 32; // Row block size - const int32_t block_size_c = 32; // Column block size - - // l and m have shape [B, H, N] - std::vector lm_sizes = { - graph.size_at(-4, q_projected), // B (batch) - graph.size_at(-2, q_projected), // H (num heads) - graph.size_at(-3, q_projected) // N (sequence length) - }; - - // t_l stores row-wise normalization sums for softmax computation - // t_m stores row-wise maximum values for numerical stability in softmax - TmpTensor t_l(&graph, lm_sizes, vkapi::kFloat, graph.storage_type_of(out)); - TmpTensor t_m(&graph, lm_sizes, vkapi::kFloat, graph.storage_type_of(out)); - - // Choose kernel name based on storage type - std::string kernel_name = "flash_attention"; - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - vkapi::ParamsBindList param_ubos = { - graph.sizes_ubo(q_projected), // Q_sizes - graph.sizes_ubo(k_cache_tensor), // K_sizes - graph.sizes_ubo(v_cache_tensor), // V_sizes - graph.sizes_ubo(out), // O_sizes - graph.sizes_ubo(t_l), // l_sizes - graph.sizes_ubo(t_m), // m_sizes - graph.create_params_buffer(scale_val), // scale - graph.create_params_buffer(block_size_r), // block_size_r - graph.create_params_buffer(block_size_c), // block_size_c - graph.create_params_buffer(input_pos_val), // input_pos - graph.create_params_buffer(num_heads), // num_heads - graph.create_params_buffer(num_kv_heads) // num_kv_heads - }; - - // Create block size references for dispatch calculation - const ValueRef block_size_r_ref = - graph.add_scalar(static_cast(block_size_r)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - flash_attention_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - { - {{out, t_l, t_m}, vkapi::kReadWrite}, - {{q_projected, k_cache_tensor, v_cache_tensor}, vkapi::kRead}, - }, - // Shader param buffers - param_ubos, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {q_projected, block_size_r_ref}, - // Resizing Logic - resize_flash_attention_out)); -} - -utils::uvec3 kv_cache_update_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - - const ValueRef cache = args.at(0).refs.at(0); - const ValueRef projected = args.at(1).refs.at(0); - - if (graph->is_buffer_storage(cache)) { - return graph->create_global_wg_size(projected); - } else { - return graph->logical_limits_of(projected); - } -} - -void add_kv_cache_update_node( - ComputeGraph& graph, - const ValueRef input_pos_symint, - const ValueRef projected, - const ValueRef cache) { - std::string kernel_name("kv_cache_update"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(projected)); - add_dtype_suffix(kernel_name, graph.dtype_of(projected)); - - vkapi::ParamsBindList param_ubos; - - if (graph.is_buffer_storage(cache)) { - param_ubos = { - graph.numel_ubo(projected), - graph.strides_ubo(cache), - graph.get_or_create_int_param_buffer(input_pos_symint)}; - } else { - param_ubos = { - graph.logical_limits_ubo(projected), - graph.get_or_create_int_param_buffer(input_pos_symint)}; - } - - 
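A concrete check of the flash-attention dispatch computed by flash_attention_global_wg_size earlier in this file; the shapes below are hypothetical and chosen only to make the arithmetic easy to follow.

```cpp
#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical prefill step: batch B = 1, heads H = 32, sequence length
  // N = 128, row block size Br = 32 (the block_size_r constant above).
  const int32_t B = 1, H = 32, N = 128, Br = 32;
  // Number of row blocks, rounded up, exactly as in
  // flash_attention_global_wg_size: Tr = (N + Br - 1) / Br.
  const int32_t Tr = (N + Br - 1) / Br; // 4
  // One invocation per (batch, head, row block): {128, 1, 1}.
  std::cout << "global_wg = {" << B * H * Tr << ", 1, 1}\n";
  return 0;
}
```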
graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - kv_cache_update_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{cache, vkapi::kWrite}, {projected, vkapi::kRead}}, - // Shader param buffers - param_ubos, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -utils::uvec3 attn_weight_scale_and_mask_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - - const ValueRef attn_weight = args.at(0).refs.at(0); - - if (graph->is_buffer_storage(attn_weight)) { - return { - graph->size_at(-1, attn_weight), - graph->size_at(-2, attn_weight), - graph->size_at(-3, attn_weight), - }; - } else { - return graph->logical_limits_of(attn_weight); - } -} - -void add_attn_weight_scale_and_mask_node( - ComputeGraph& graph, - const ValueRef input_pos_symint, - const ValueRef q_projected, - const ValueRef attn_weight) { - std::string kernel_name("sdpa_attn_weight_scale_and_mask"); - add_storage_type_suffix(kernel_name, graph.storage_type_of(attn_weight)); - add_dtype_suffix(kernel_name, graph.dtype_of(attn_weight)); - - const int32_t head_dim_size = graph.size_at(-1, q_projected); - const float scale_val = 1.0f / std::sqrt(static_cast(head_dim_size)); - - vkapi::ParamsBindList param_ubos; - - if (graph.is_buffer_storage(attn_weight)) { - param_ubos = { - graph.sizes_ubo(attn_weight), - graph.strides_ubo(attn_weight), - graph.create_params_buffer(scale_val)}; - } else { - param_ubos = { - graph.logical_limits_ubo(attn_weight), - graph.get_or_create_int_param_buffer(input_pos_symint), - graph.create_params_buffer(scale_val)}; - } - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - attn_weight_scale_and_mask_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{attn_weight, vkapi::kReadWrite}}, - // Shader param buffers - param_ubos, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -std::vector get_cache_slice_sizes( - ComputeGraph& graph, - ValueRef cache, - ValueRef input_pos_symint, - ValueRef q_projected) { - std::vector slice_sizes = graph.sizes_of(cache); - - // Cache slicing will always be in the channels dim - const int32_t input_pos_val = graph.read_symint(input_pos_symint); - const int64_t q_seq_len = graph.size_at(1, q_projected); - slice_sizes.at(1) = input_pos_val + q_seq_len; - return slice_sizes; -} - -void resize_cache_slice_view_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)args; - std::vector slice_sizes = get_cache_slice_sizes( - *graph, extra_args[0], extra_args[1], extra_args[2]); - - graph->virtual_resize(extra_args[3], slice_sizes); -} - -void add_cache_slice_view_node( - ComputeGraph& graph, - ValueRef cache, - ValueRef input_pos_symint, - ValueRef q_projected, - ValueRef cache_sliced, - const int64_t max_seq_len) { - std::vector slice_sizes = - get_cache_slice_sizes(graph, cache, input_pos_symint, q_projected); - // Initialize the slice to the maximum possible size to start - slice_sizes.at(1) = max_seq_len; - - graph.virtual_resize(cache_sliced, slice_sizes); - - graph.execute_nodes().emplace_back(new ExecuteNode( - resize_cache_slice_view_node, - {cache, input_pos_symint, q_projected, cache_sliced})); -} - -void 
update_cache_impl(ComputeGraph& graph, const std::vector& args) { - int arg_idx = 0; - const ValueRef value = args[arg_idx++]; - const ValueRef cache = args[arg_idx++]; - const ValueRef input_pos_symint = args[arg_idx++]; - const ValueRef out = args[arg_idx++]; - - // Unused variables - (void)out; - - VK_CHECK_COND(graph.size_at(-4, value) == 1); - VK_CHECK_COND(graph.size_at(-4, cache) == 1); - VK_CHECK_COND( - graph.size_at(-1, value) == graph.size_at(-1, cache)); - VK_CHECK_COND( - graph.size_at(-2, value) == graph.size_at(-2, cache)); - - add_kv_cache_update_node(graph, input_pos_symint, value, cache); -} - -void sdpa_impl(ComputeGraph& graph, const std::vector& args) { - int arg_idx = 0; - const ValueRef q_projected = args[arg_idx++]; - const ValueRef k_cache = args[arg_idx++]; - const ValueRef v_cache = args[arg_idx++]; - const ValueRef input_pos_symint = args[arg_idx++]; - const ValueRef attn_mask = args[arg_idx++]; - const ValueRef dropout_p = args[arg_idx++]; - const ValueRef is_causal = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - - // Output tensors - const ValueRef out = args[arg_idx++]; - - // Batches must be 1 - VK_CHECK_COND(graph.size_at(-4, q_projected) == 1); - VK_CHECK_COND(graph.size_at(-4, k_cache) == 1); - VK_CHECK_COND(graph.size_at(-4, v_cache) == 1); - // k and v projected must have the same shape - VK_CHECK_COND(graph.sizes_of(k_cache) == graph.sizes_of(v_cache)); - // head dim must match between tensors - VK_CHECK_COND( - graph.size_at(-1, q_projected) == - graph.size_at(-1, k_cache)); - // All tensors must have the packed dim be the width (head) dimension - VK_CHECK_COND(graph.packed_dim_of(q_projected) == WHCN::kWidthDim); - VK_CHECK_COND(graph.packed_dim_of(k_cache) == WHCN::kWidthDim); - VK_CHECK_COND(graph.packed_dim_of(v_cache) == WHCN::kWidthDim); - // Some variables are not supported yet - VK_CHECK_COND( - graph.val_is_none(dropout_p) || - graph.extract_scalar(dropout_p) == 0); - VK_CHECK_COND(graph.val_is_none(scale)); - // is_causal is assumed to be true in the current implementation. 
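The decomposed SDPA path scales the attention logits by 1.0f / sqrt(head_dim) and applies a causal mask offset by input_pos (see add_attn_weight_scale_and_mask_node above). The scalar reference below is a sketch of that step under the assumption that the shader follows the standard causal convention; it is not the shader's actual code.

```cpp
#include <cmath>
#include <limits>
#include <vector>

// Scale each attention logit by 1/sqrt(head_dim) and mask out future key
// positions. Query row r corresponds to absolute position input_pos + r, so
// keys at later positions are set to -inf before the softmax.
void scale_and_mask(
    std::vector<std::vector<float>>& attn_weight, // [seq_len][context_len]
    int head_dim,
    size_t input_pos) {
  const float scale = 1.0f / std::sqrt(static_cast<float>(head_dim));
  for (size_t r = 0; r < attn_weight.size(); ++r) {
    for (size_t c = 0; c < attn_weight[r].size(); ++c) {
      attn_weight[r][c] = (c <= r + input_pos)
          ? attn_weight[r][c] * scale
          : -std::numeric_limits<float>::infinity();
    }
  }
}
```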
- VK_CHECK_COND( - graph.val_is_none(is_causal) || graph.extract_scalar(is_causal)); - VK_CHECK_COND(graph.val_is_none(attn_mask)); - - const int32_t max_seq_len = graph.size_at(1, k_cache); - - // Slice caches from 0 to input_pos + sequence_len - const ValueRef k_cache_sliced = graph.add_tensor_view(k_cache); - const ValueRef v_cache_sliced = graph.add_tensor_view(v_cache); - add_cache_slice_view_node( - graph, - k_cache, - input_pos_symint, - q_projected, - k_cache_sliced, - max_seq_len); - add_cache_slice_view_node( - graph, - v_cache, - input_pos_symint, - q_projected, - v_cache_sliced, - max_seq_len); - - // Scalar values for various dims - const ValueRef channels = graph.add_scalar(1); - const ValueRef height = graph.add_scalar(2); - const ValueRef width = graph.add_scalar(3); - - // Repeat interleave - const int64_t num_heads = graph.size_at(2, q_projected); - const int64_t num_kv_heads = graph.size_at(2, k_cache); - - const ValueRef num_repeats = - graph.add_scalar(num_heads / num_kv_heads); - - std::vector cache_slice_repeated_sizes(graph.sizes_of(q_projected)); - cache_slice_repeated_sizes.at(1) = max_seq_len; - - TmpTensor k_cache_sliced_repeated( - &graph, cache_slice_repeated_sizes, graph.dtype_of(k_cache_sliced)); - TmpTensor v_cache_sliced_repeated( - &graph, cache_slice_repeated_sizes, graph.dtype_of(v_cache_sliced)); - - add_repeat_interleave_node( - graph, k_cache_sliced, num_repeats, height, k_cache_sliced_repeated); - add_repeat_interleave_node( - graph, v_cache_sliced, num_repeats, height, v_cache_sliced_repeated); - - // Transpose sequence and head dims - const ValueRef q_transposed = graph.add_tensor_view(q_projected); - const ValueRef k_transposed = graph.add_tensor_view(k_cache_sliced_repeated); - const ValueRef v_transposed = graph.add_tensor_view(v_cache_sliced_repeated); - - add_transpose_view_node(graph, q_projected, channels, height, q_transposed); - add_transpose_view_node( - graph, k_cache_sliced_repeated, channels, height, k_transposed); - add_transpose_view_node( - graph, v_cache_sliced_repeated, channels, height, v_transposed); - - // Transpose K again to prepare for matmul - const ValueRef k_transposed_2 = graph.add_tensor_view(k_transposed); - add_transpose_view_node(graph, k_transposed, height, width, k_transposed_2); - - // Initialize attn_weight to the maximum possible size - std::vector attn_weight_full_sizes = graph.sizes_of(q_transposed); - attn_weight_full_sizes.at(2) = max_seq_len; - attn_weight_full_sizes.at(3) = max_seq_len; - TmpTensor attn_weight( - &graph, attn_weight_full_sizes, graph.dtype_of(q_transposed)); - - // Resize attn_weight to the correct dim - std::vector attn_weight_sizes = attn_weight_full_sizes; - attn_weight_sizes.at(2) = graph.size_at(2, q_transposed); - attn_weight_sizes.at(3) = graph.size_at(2, k_transposed); - graph.virtual_resize(attn_weight, attn_weight_sizes); - - // Calculate attention weight, which is a matmul of Q and K - const ValueRef mat2_is_transposed = graph.add_scalar(false); - add_matmul_node( - graph, q_transposed, k_transposed_2, attn_weight, mat2_is_transposed); - - // Apply scale and mask to the attention weight - add_attn_weight_scale_and_mask_node( - graph, input_pos_symint, q_projected, attn_weight); - - TmpTensor attn_weight_softmax( - &graph, attn_weight_full_sizes, graph.dtype_of(q_transposed)); - graph.virtual_resize(attn_weight_softmax, attn_weight_sizes); - add_softmax_node(graph, attn_weight, width, attn_weight_softmax, false); - - // Calculate final output - const ValueRef out_transposed = 
graph.add_tensor_view(out); - add_transpose_view_node(graph, out, channels, height, out_transposed); - add_matmul_node( - graph, - attn_weight_softmax, - v_transposed, - out_transposed, - mat2_is_transposed); - - graph.execute_nodes().emplace_back( - new ExecuteNode(resize_sdpa_out, {q_projected, out})); -} - -void sdpa_with_kv_cache_impl( - ComputeGraph& graph, - const std::vector& args) { - int arg_idx = 0; - const ValueRef q_projected = args[arg_idx++]; - const ValueRef k_projected = args[arg_idx++]; - const ValueRef v_projected = args[arg_idx++]; - const ValueRef k_cache_data = args[arg_idx++]; - const ValueRef v_cache_data = args[arg_idx++]; - const ValueRef input_pos_symint = args[arg_idx++]; - const ValueRef sequence_len = args[arg_idx++]; - const ValueRef attn_mask = args[arg_idx++]; - const ValueRef dropout_p = args[arg_idx++]; - const ValueRef is_causal = args[arg_idx++]; - const ValueRef scale = args[arg_idx++]; - - // Output tensors - const ValueRef out = args[arg_idx++]; - - (void)sequence_len; - - const ValueRef k_cache = - prepack_standard_like(graph, k_cache_data, q_projected); - const ValueRef v_cache = - prepack_standard_like(graph, v_cache_data, q_projected); - - update_cache_impl(graph, {k_projected, k_cache, input_pos_symint, -1}); - update_cache_impl(graph, {v_projected, v_cache, input_pos_symint, -1}); - - sdpa_impl( - graph, - {q_projected, - k_cache, - v_cache, - input_pos_symint, - attn_mask, - dropout_p, - is_causal, - scale, - out}); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(sdpa_with_kv_cache.default, sdpa_with_kv_cache_impl); - VK_REGISTER_OP(update_cache.default, update_cache_impl); - VK_REGISTER_OP(llama.custom_sdpa.default, sdpa_impl); - VK_REGISTER_OP(llama.flash_attention.default, flash_attention_impl); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/ScalarTensor.cpp b/backends/vulkan/runtime/graph/ops/impl/ScalarTensor.cpp deleted file mode 100644 index 82fc5c977d3..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/ScalarTensor.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include - -namespace vkcompute { - -void scalar_tensor(ComputeGraph& graph, const std::vector& args) { - // Extract the scalar value from the first argument - ValueRef scalar_in = args[0]; - float scalar_value = graph.extract_scalar(scalar_in); - - // Get the output tensor reference - ValueRef out = args[args.size() - 1]; - - std::string kernel_name("scalar_tensor"); - kernel_name.reserve(kShaderNameReserve); - - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(scalar_in)); - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - graph.create_global_wg_size(out), - graph.create_local_wg_size(out), - // Inputs and Outputs - {{out, vkapi::kWrite}}, - // Shader params buffers - {graph.create_params_buffer(scalar_value)}, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.scalar_tensor.default, scalar_tensor); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Select.cpp b/backends/vulkan/runtime/graph/ops/impl/Select.cpp deleted file mode 100644 index 69d49e8283b..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Select.cpp +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include -#include - -namespace vkcompute { - -void resize_select_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - ValueRef out = args.at(0).refs.at(0); - ValueRef in = args.at(1).refs.at(0); - int64_t dim = graph->extract_scalar(extra_args.at(0)); - - int64_t in_ndim = graph->dim_of(in); - - if (dim < 0) { - dim += in_ndim; - } - - std::vector new_out_sizes; - for (int64_t i = 0; i < in_ndim; ++i) { - if (i != dim) { - new_out_sizes.push_back(graph->size_at(i, in)); - } - } - - graph->virtual_resize(out, new_out_sizes); -} - -void check_select_args( - ComputeGraph& graph, - const ValueRef in, - const ValueRef dim_ref, - const ValueRef index_ref, - const ValueRef out) { - int64_t dim = graph.extract_scalar(dim_ref); - int64_t index = graph.extract_optional_scalar(index_ref, 0); - int64_t in_ndim = graph.dim_of(in); - - if (dim < 0) { - dim += in_ndim; - } - - VK_CHECK_COND( - dim >= 0 && dim < in_ndim, - "Dimension out of range (expected to be in range of [", - -in_ndim, - ", ", - in_ndim - 1, - "], but got ", - dim, - ")"); - - const int64_t in_size_at_dim = graph.size_at(dim, in); - - if (index < 0) { - index += in_size_at_dim; - } - - VK_CHECK_COND( - index >= 0 && index < in_size_at_dim, - "select(): index ", - index, - " out of range for tensor of size ", - in_size_at_dim, - " at dimension ", - dim); - - // Check that output tensor has correct dimensions - int64_t out_dim = graph.dim_of(out); - VK_CHECK_COND( - out_dim == in_ndim - 1, - "Output tensor dimension mismatch (expected ", - in_size_at_dim - 1, - ", but got ", - out_dim, - ")"); - - // Check that output tensor has correct sizes - int64_t out_idx = 0; - for (int64_t i = 0; i < in_size_at_dim; ++i) { - if (i != dim) { - VK_CHECK_COND( - graph.size_at(out_idx, out) == graph.size_at(i, in), - "Output size mismatch at dimension ", - out_idx, - " 
(expected ", - graph.size_at(i, in), - ", but got ", - graph.size_at(out_idx, out), - ")"); - out_idx++; - } - } -} - -/** - * Adds a select operation node to the compute graph. - * - * The select operator extracts a slice from a tensor along a specified - * dimension at a given index. It effectively reduces the dimensionality of the - * input tensor by one, by selecting a single slice at the specified index along - * the given dimension. For example, if input is a 3D tensor with shape [2,3,4] - * and we select dimension 1, index 2, the output will be a 2D tensor with shape - * [2,4]. - */ -void add_select_copy_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef dim_ref, - const ValueRef index_ref, - const ValueRef out) { - check_select_args(graph, in, dim_ref, index_ref, out); - - add_transfer_copy_node( - graph, - TransferType::SELECT, - in, - dim_ref, - index_ref, - kDummyValueRef, - kDummyValueRef, - out, - {dim_ref, index_ref}, - resize_select_node); -} - -void select_int(ComputeGraph& graph, const std::vector& args) { - return add_select_copy_node(graph, args[0], args[1], args[2], args[3]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.select.int, select_int); - VK_REGISTER_OP(aten.select_copy.int, select_int); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp deleted file mode 100644 index 67d714d10aa..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -#include -#include -#include - -namespace vkcompute { - -inline int64_t normalize_idx( - const int64_t index, - const int64_t max, - const int64_t default_value) { - // INT64_MAX is passed when value is unspecified - if (index == INT64_MAX) { - return default_value; - } - if (index == default_value) { - return index; - } - return normalize(index, max); -} - -void resize_slice_copy_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - ValueRef out_ref = args.at(0).refs.at(0); - ValueRef in_ref = args.at(1).refs.at(0); - - int64_t dim = graph->extract_scalar(extra_args.at(0)); - std::optional opt_start = - graph->extract_optional_scalar(extra_args.at(1)); - std::optional opt_end = - graph->extract_optional_scalar(extra_args.at(2)); - int64_t step = graph->extract_scalar(extra_args.at(3)); - - // Normalize dim - if (dim < 0) { - dim += graph->dim_of(in_ref); - } - - const std::vector in_sizes = graph->sizes_of(in_ref); - int64_t dim_size = in_sizes.at(dim); - - int64_t start = opt_start.value_or(0); - int64_t end = opt_end.value_or(dim_size); - - // Normalize start and end indices - start = normalize_idx(start, dim_size, 0); - end = normalize_idx(end, dim_size, dim_size); - - // Calculate output size - std::vector new_out_sizes = in_sizes; - new_out_sizes.at(dim) = (end - start + step - 1) / step; // Ceiling division - - graph->virtual_resize(out_ref, new_out_sizes); -} - -/** - * Adds a slice_copy operation node to the compute graph. - * - * The slice operator extracts a portion of a tensor along a specified - * dimension. It creates a new tensor that contains a subset of the input - * tensor's data, defined by start, end, and step parameters along the given - * dimension. 
- * - * For example, if input is a tensor with shape [4,5,6] and we slice along - * dimension 1 with start=1, end=4, step=2, the output will have shape [4,2,6], - * containing elements from the input at positions 1 and 3 along dimension 1. - */ -void add_slice_copy_node( - ComputeGraph& graph, - ValueRef in, - ValueRef dim_ref, - ValueRef opt_start_ref, - ValueRef opt_end_ref, - ValueRef step_ref, - ValueRef out) { - add_transfer_copy_node( - graph, - TransferType::SLICE, - in, - dim_ref, - opt_start_ref, - opt_end_ref, - step_ref, - out, - {dim_ref, opt_start_ref, opt_end_ref, step_ref}, - resize_slice_copy_node); -} - -std::vector get_slice_sizes( - ComputeGraph& graph, - ValueRef in_ref, - ValueRef dim_ref, - ValueRef opt_start_ref, - ValueRef opt_end_ref) { - const int64_t dim = graph.extract_scalar(dim_ref); - std::optional opt_start = - graph.extract_optional_scalar(opt_start_ref); - std::optional opt_end = - graph.extract_optional_scalar(opt_end_ref); - - int64_t dim_size = graph.size_at(dim, in_ref); - int64_t start = opt_start.value_or(0); - int64_t end = opt_end.value_or(dim_size); - - start = normalize_idx(start, dim_size, 0); - end = normalize_idx(end, dim_size, dim_size); - - std::vector new_out_sizes = graph.sizes_of(in_ref); - new_out_sizes.at(dim) = end - start; - - return new_out_sizes; -} - -void resize_slice_view_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)args; - ValueRef out_ref = extra_args.at(0); - - std::vector new_out_sizes = get_slice_sizes( - *graph, - extra_args.at(1), // input - extra_args.at(2), // dim - extra_args.at(3), // optional start - extra_args.at(4)); // optional end - - graph->virtual_resize(out_ref, new_out_sizes); -} - -void check_slice_view_args( - ComputeGraph& graph, - ValueRef in_ref, - ValueRef dim_ref, - ValueRef opt_start_ref, - ValueRef opt_end_ref, - ValueRef opt_step_ref, - ValueRef out_ref) { - VK_CHECK_COND( - graph.val_is_view_of(out_ref, in_ref), - "output must be a view of the input"); - - const int64_t dim = graph.extract_scalar(dim_ref); - const int64_t dim_size = graph.size_at(dim, in_ref); - - int64_t start = - graph.extract_optional_scalar(opt_start_ref).value_or(0); - int64_t end = graph.extract_optional_scalar(opt_end_ref).value_or(0); - int64_t step = - graph.extract_optional_scalar(opt_step_ref).value_or(1); - - start = normalize_idx(start, dim_size, 0); - end = normalize_idx(end, dim_size, dim_size); - - // The start idx must be 0; this is to ensure that the start of the slice view - // does not have any offset with respect to the base buffer storage. If the - // offset is nonzero, then it will potentially change upon a resize; however - // the buffer offset of the view tensor will have been "locked in" when the - // descriptor for its buffer storage is bound to a compute shader. Therefore - // there is no way to update the offset of the view once it has been bound. - VK_CHECK_COND(start == 0, "start must be 0 for slice view"); - VK_CHECK_COND(step == 1, "step must be 1 for slice view"); - - VK_CHECK_COND( - end < dim_size, "end must be less than dim size for slice view"); - - // We must also check that all earlier dims in the dim order have a size of 1. - // This ensures that the slice view encompasses a contiguous memory region of - // the source tensor's memory buffer. 
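The dim-order condition enforced just below can be restated as a standalone predicate; the helper name and the example values here are hypothetical, but the logic mirrors the loop that follows.

```cpp
#include <cstdint>
#include <vector>

// A zero-offset slice view along `dim` only covers a contiguous region of
// the source buffer when every dimension that precedes `dim` in the dim
// order has size 1. Hypothetical helper mirroring the check below.
bool slice_view_is_contiguous(
    const std::vector<int64_t>& sizes,
    const std::vector<int64_t>& dim_order,
    int64_t dim) {
  for (size_t i = 0; i < dim_order.size(); ++i) {
    if (dim_order[i] == dim) {
      return true; // reached the sliced dim; all outer dims were size 1
    }
    if (sizes[dim_order[i]] != 1) {
      return false; // an outer dim has extent > 1, slice would be strided
    }
  }
  return false; // dim not present in the dim order
}

// e.g. sizes {1, 1, 8, 32} with dim_order {0, 1, 2, 3}: slicing dim 2 is
// allowed (dims 0 and 1 have size 1), but slicing dim 3 is rejected because
// dim 2 has size 8 and precedes it in the dim order.
```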
- std::vector in_sizes = graph.sizes_of(in_ref); - std::vector in_dim_order = graph.dim_order_of(in_ref); - for (int i = 0; i < in_dim_order.size(); ++i) { - if (in_dim_order[i] == dim) { - break; - } - VK_CHECK_COND(in_sizes[in_dim_order[i]] == 1); - } -} - -void add_slice_view_node( - ComputeGraph& graph, - ValueRef in_ref, - ValueRef dim_ref, - ValueRef opt_start_ref, - ValueRef opt_end_ref, - ValueRef opt_step_ref, - ValueRef out_ref) { - check_slice_view_args( - graph, - in_ref, - dim_ref, - opt_start_ref, - opt_end_ref, - opt_step_ref, - out_ref); - - std::vector new_out_sizes = - get_slice_sizes(graph, in_ref, dim_ref, opt_start_ref, opt_end_ref); - - graph.virtual_resize(out_ref, new_out_sizes); - - graph.execute_nodes().emplace_back(new ExecuteNode( - resize_slice_view_node, - {out_ref, in_ref, dim_ref, opt_start_ref, opt_end_ref, opt_step_ref})); -} - -void slice_copy(ComputeGraph& graph, const std::vector& args) { - return add_slice_copy_node( - graph, - args.at(0), - args.at(1), // dim - args.at(2), // optional start - args.at(3), // optional end - args.at(4), // step - args.at(5)); -} - -void slice(ComputeGraph& graph, const std::vector& args) { - ValueRef in = args.at(0); - ValueRef out = args.at(5); - - // Special case if out is a view of in - if (graph.val_is_view_of(out, in)) { - add_slice_view_node( - graph, - in, - args.at(1), // dim - args.at(2), // optional start - args.at(3), // optional end - args.at(4), // step - out); - return; - } - - add_slice_copy_node( - graph, - in, - args.at(1), // dim - args.at(2), // optional start - args.at(3), // optional end - args.at(4), // step - out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.slice_copy.Tensor, slice_copy); - VK_REGISTER_OP(aten.slice.Tensor, slice); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.h b/backends/vulkan/runtime/graph/ops/impl/Slice.h deleted file mode 100644 index 220066ff1bb..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Slice.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -#include - -namespace vkcompute { - -void add_slice_view_node( - ComputeGraph& graph, - ValueRef in_ref, - ValueRef dim_ref, - ValueRef opt_start_ref, - ValueRef opt_end_ref, - ValueRef opt_step_ref, - ValueRef out_ref); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp b/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp deleted file mode 100644 index 5e645e29e3d..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include - -#include -#include - -namespace vkcompute { - -using namespace utils; - -utils::uvec3 pick_softmax_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - - const ValueRef out = args.at(0).refs.at(0); - const int32_t reduce_dim_xyz = - graph->extract_scalar(resize_args.at(1)); - - utils::uvec3 global_size = graph->logical_limits_of(out); - global_size[reduce_dim_xyz] = 1; - return global_size; -} - -utils::uvec3 pick_softmax_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)global_workgroup_size; - (void)args; - - const int64_t group_dim_xyz = - graph->extract_scalar(resize_args.at(2)); - - const int32_t reduce_dim_xyz = - graph->extract_scalar(resize_args.at(1)); - - // These values are hardcoded in add_softmax_node - const uint32_t nworkers_per_group = 4; - const uint32_t ngroups = 4; - - utils::uvec3 local_wg_size{1, 1, 1}; - local_wg_size[reduce_dim_xyz] = nworkers_per_group; - local_wg_size[group_dim_xyz] = ngroups; - - return local_wg_size; -} - -void resize_softmax_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - - const std::vector in_sizes = graph->sizes_of(in); - graph->virtual_resize(out, in_sizes); -} - -void add_softmax_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef dim_ref, - const ValueRef out, - bool log_softmax) { - VK_CHECK_COND( - !graph.is_buffer_storage(in) && !graph.is_buffer_storage(out), - "Vulkan softmax only supports texture storage"); - - const int64_t ndim = graph.dim_of(in); - - int32_t reduce_dim_nchw = graph.extract_scalar(dim_ref); - reduce_dim_nchw = normalize(reduce_dim_nchw, ndim); - const int32_t reduce_dim_xyz = nchw_dim_to_whcn_dim(reduce_dim_nchw, ndim); - - // Check that the concat dim is not the reduction dim, if the tensor has a - // batch dim greater than 1. - if (graph.dim_of(in) == 4 && graph.size_at(0, in) > 1) { - VK_CHECK_COND( - graph.concat_dim_of(in) != reduce_dim_xyz, - "Softmax shader currently does not support concat dim == reduce dim"); - VK_CHECK_COND( - graph.concat_dim_of(out) != reduce_dim_xyz, - "Softmax shader currently does not support concat dim == reduce dim"); - } - - vkapi::ShaderInfo shader_descriptor; - std::string kernel_name = "softmax"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - if (log_softmax) { - kernel_name = "log_" + kernel_name; - } - - // This should match the value of MAX_NTHREADS in the softmax shader. 
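Putting the pieces together, the dispatch sizing works out as in the sketch below: the reduction axis is collapsed in the global size by pick_softmax_global_wg_size above, while the local size assigns the per-group workers along the reduction axis and spreads groups along whichever remaining axis is larger, as selected later in this function. Helper name and example values are hypothetical; the constants match those declared next.

```cpp
#include <array>
#include <cstdint>

struct SoftmaxWgSizes {
  std::array<uint32_t, 3> global;
  std::array<uint32_t, 3> local;
};

// Hypothetical standalone restatement of the workgroup sizing logic.
SoftmaxWgSizes softmax_wg_sizes(
    std::array<uint32_t, 3> limits, int reduce_dim_xyz) {
  constexpr uint32_t nworkers_per_group = 4; // workers cooperating per reduction
  constexpr uint32_t ngroups = 4;            // reductions handled per workgroup

  std::array<uint32_t, 3> global = limits;
  global[reduce_dim_xyz] = 1; // one workgroup column per reduced row

  const int other1 = (reduce_dim_xyz + 1) % 3;
  const int other2 = (reduce_dim_xyz + 2) % 3;
  const int group_dim = limits[other1] > limits[other2] ? other1 : other2;

  std::array<uint32_t, 3> local = {1u, 1u, 1u};
  local[reduce_dim_xyz] = nworkers_per_group;
  local[group_dim] = ngroups;
  return {global, local};
}

// e.g. limits {64, 32, 8} with reduction along x (width) gives
// global {1, 32, 8} and local {4, 4, 1}, i.e. 16 threads per workgroup,
// which stays within the MAX_NTHREADS budget noted above.
```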
- constexpr uint32_t max_nthreads = 16; - - const uint32_t nworkers_per_group = 4; - const uint32_t ngroups = 4; - VK_CHECK_COND(nworkers_per_group * ngroups <= max_nthreads); - - // Determine the group dimension - const int other_dim_1 = (reduce_dim_xyz + 1) % 3; - const int other_dim_2 = (reduce_dim_xyz + 2) % 3; - int32_t group_dim; - utils::uvec3 global_wg_size = graph.logical_limits_of(out); - if (global_wg_size[other_dim_1] > global_wg_size[other_dim_2]) { - group_dim = other_dim_1; - } else { - group_dim = other_dim_2; - } - - const ValueRef reduce_dim_xyz_ref = - graph.get_or_add_value_for_int(reduce_dim_xyz); - const ValueRef group_dim_xyz_ref = graph.get_or_add_value_for_int(group_dim); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - pick_softmax_global_wg_size, - pick_softmax_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - {graph.logical_limits_ubo(out), graph.sizes_ubo(in)}, - // Push Constants - {}, - // Specialization Constants - {graph.packed_dim_of(out), reduce_dim_xyz, group_dim}, - // Resize Args - {dim_ref, reduce_dim_xyz_ref, group_dim_xyz_ref}, - // Resizing Logic - resize_softmax_node)); -} - -void softmax(ComputeGraph& graph, const std::vector& args) { - // args[1] bool half_to_float is unused - return add_softmax_node( - graph, args[0], args[1], args[3], /* log_softmax = */ false); -} - -void log_softmax(ComputeGraph& graph, const std::vector& args) { - // args[1] bool half_to_float is unused - return add_softmax_node( - graph, args[0], args[1], args[3], /* log_softmax = */ true); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten._softmax.default, softmax); - VK_REGISTER_OP(aten._log_softmax.default, log_softmax); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Softmax.h b/backends/vulkan/runtime/graph/ops/impl/Softmax.h deleted file mode 100644 index 58fcfb93404..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Softmax.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -#include - -namespace vkcompute { - -void add_softmax_node( - ComputeGraph& graph, - ValueRef in, - ValueRef dim, - ValueRef out, - bool log_softmax); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Split.cpp b/backends/vulkan/runtime/graph/ops/impl/Split.cpp deleted file mode 100644 index f87af08ee69..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Split.cpp +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include -#include -#include - -namespace vkcompute { - -void add_split_with_sizes_default_node( - ComputeGraph& graph, - ValueRef in, - const std::vector& split_sizes, - int64_t dim, - ValueRef out_list_ref) { - const ValueListPtr out_list = graph.get_value_list(out_list_ref); - - const int64_t input_ndim = graph.dim_of(in); - const DimIndex dim_index = dim < 0 ? 
static_cast(dim) - : static_cast(dim - input_ndim); - - VK_CHECK_COND(out_list->size() == split_sizes.size()); - - for (int split_idx = 0; split_idx < split_sizes.size(); split_idx++) { - const int64_t split_size = split_sizes.at(split_idx); - const ValueRef out_ref = out_list->at(split_idx); - - VK_CHECK_COND(dim_at(graph.sizes_of(out_ref), dim_index) == split_size); - } - - const auto packed_dim = graph.packed_dim_of(in); - const auto packed_dim_index = static_cast(kWidth4D - packed_dim); - - // Index of dimension to be concatenated in (w, h, c * b) coordinate system - const auto dim_xyz_index = std::min(2, -dim_index - 1); - - utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false); - utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false); - - const bool is_splitting_channel = (dim_index == kChannel4D); - - // if splitting channels - if (is_splitting_channel) { - // set source offset w as channel size of the input tensor - src_offset[3] = dim_at(graph.sizes_of(in), kChannel4D); - } - - for (ValueRef out_ref : *out_list) { - // Doesn't need to use split_size since we have already verified that the - // output tensor's size matches with the split_size. - const auto out_channel_size = dim_at(graph.sizes_of(out_ref), kChannel4D); - const utils::ivec3 range = graph.logical_limits_of(out_ref); - - if (dim_index == packed_dim_index) { - // if splitting channels, use add_copy_channel_offset_node function as - // add_copy_packed_dim_offset_node does not support channel packing - if (is_splitting_channel) { - add_copy_channel_offset_node( - graph, in, out_channel_size, src_offset[2], dst_offset[2], out_ref); - src_offset[dim_xyz_index] += out_channel_size; - } else { - // dst_offset[3] is not used now but will be used in the future when - // add_copy_packed_dim_offset_node will support channel packing - // - // set destination offset w as channel size of the output tensor if - // splitting channel - dst_offset[3] = is_splitting_channel ? out_channel_size : 0; - add_copy_packed_dim_offset_node( - graph, in, range, src_offset, dst_offset, out_ref); - src_offset[dim_xyz_index] += - dim_at(graph.sizes_of(out_ref), packed_dim_index); - } - } else { - // set destination offset w as channel size of the output tensor if - // splitting channels - dst_offset[3] = is_splitting_channel ? out_channel_size : 0; - add_copy_offset_node( - graph, in, range, src_offset, dst_offset, out_ref, false, true); - src_offset[dim_xyz_index] += - is_splitting_channel ? out_channel_size : range[dim_xyz_index]; - } - } -} - -void add_split_with_sizes_default_node( - ComputeGraph& graph, - ValueRef in, - ValueRef split_sizes_ref, - ValueRef dim_ref, - ValueRef out) { - int64_t dim = graph.extract_scalar(dim_ref); - std::vector split_sizes = *(graph.get_int_list(split_sizes_ref)); - - add_split_with_sizes_default_node(graph, in, split_sizes, dim, out); -} - -void split_with_sizes_copy_default( - ComputeGraph& graph, - const std::vector& args) { - add_split_with_sizes_default_node(graph, args[0], args[1], args[2], args[3]); -} - -void add_split_tensor_node( - ComputeGraph& graph, - ValueRef in, - ValueRef split_size_ref, - ValueRef dim_ref, - ValueRef out) { - const int64_t split_size = graph.extract_scalar(split_size_ref); - const int64_t dim = graph.extract_scalar(dim_ref); - - const int64_t input_ndim = graph.dim_of(in); - const DimIndex dim_index = dim < 0 ? 
static_cast(dim) - : static_cast(dim - input_ndim); - const int64_t size = dim_at(graph.sizes_of(in), dim_index); - const std::vector split_sizes(size / split_size, split_size); - - add_split_with_sizes_default_node(graph, in, split_sizes, dim, out); -} - -void split_tensor(ComputeGraph& graph, const std::vector& args) { - add_split_tensor_node(graph, args[0], args[1], args[2], args[3]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP( - aten.split_with_sizes_copy.default, split_with_sizes_copy_default); - VK_REGISTER_OP(aten.split.Tensor, split_tensor); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp b/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp deleted file mode 100644 index 13801b45cc7..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Squeeze.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include -#include -#include -#include - -namespace vkcompute { - -void add_squeeze_copy_dims_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef dims_ref, - const ValueRef out) { - const int64_t in_dim = graph.dim_of(in); - const std::vector in_sizes = graph.sizes_of(in); - const std::vector out_sizes = graph.sizes_of(in); - - const std::vector dims = graph.extract_int_or_symint_list(dims_ref); - std::vector squeeze_dims; - // Filter out edge cases that we don't need squeeze: - // 1. The size of squeeze dim is larger than 1. - // 2. Squeeze outter most dim - // For these cases, just pass input to output via clone. - for (int i = 0; i < dims.size(); ++i) { - if (dims.at(i) != 0 && in_sizes.at(dims.at(i)) == 1) { - squeeze_dims.push_back(dims.at(i)); - } - } - if (squeeze_dims.size() == 0) { - add_clone_node(graph, in, out); - } else { - std::vector permute_dims(in_dim); - for (int i = 0; i < in_dim; ++i) { - permute_dims.at(i) = i; - } - for (auto& elem : squeeze_dims) { - auto it = std::find(permute_dims.begin(), permute_dims.end(), elem); - VK_CHECK_COND( - it != permute_dims.end(), "Squeeze dim not found in permute_dims"); - std::rotate(permute_dims.begin(), it, it + 1); - } - - const ValueRef permute_dims_ref = - graph.add_scalar_list(std::vector(permute_dims)); - add_permute_node(graph, in, permute_dims_ref, out); - } -} - -void resize_squeeze_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - const ValueRef dims_ref = extra_args.at(0); - - const IntListPtr dims = graph->get_int_list(dims_ref); - - std::vector out_sizes = graph->sizes_of(in); - - // Remove the dimensions specified in dims if their size is 1 - for (int64_t dim : *dims) { - if (dim >= 0 && dim < static_cast(out_sizes.size()) && - out_sizes[dim] == 1) { - out_sizes.erase(out_sizes.begin() + dim); - // After erasing, all subsequent dims shift left by one - // So we need to decrement all subsequent dims in dims - for (auto& d : *dims) { - if (d > dim) { - --d; - } - } - } - } - - graph->virtual_resize(out, out_sizes); -} - -void squeeze_copy_dims(ComputeGraph& graph, const std::vector& args) { - int idx = 0; - const ValueRef in = args.at(idx++); - const ValueRef dims = args.at(idx++); - const ValueRef out = args.at(idx++); - - std::vector resize_args = {dims}; - - if (graph.is_buffer_storage(in)) 
{ - return add_view_copy_buffer_node( - graph, in, out, resize_args, resize_squeeze_node); - } - return add_squeeze_copy_dims_node(graph, in, dims, out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.squeeze_copy.dims, squeeze_copy_dims); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp deleted file mode 100644 index 6cd5115563a..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ /dev/null @@ -1,407 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include -#include -#include - -#include -#include - -namespace vkcompute { - -void add_staging_to_tensor_node( - ComputeGraph& graph, - const ValueRef in_staging, - const ValueRef out_tensor) { - VK_CHECK_COND(graph.val_is_staging(in_staging)); - - vkapi::ShaderInfo shader = get_nchw_to_tensor_shader( - graph, out_tensor, graph.int8_buffers_enabled()); - - vkapi::ParamsBindList param_buffers = {}; - if (graph.is_buffer_storage(out_tensor)) { - param_buffers.append(graph.buffer_meta_ubo(out_tensor)); - } - - std::vector pcs; - if (graph.is_texture_storage(out_tensor)) { - pcs = {graph.sizes_pc_of(out_tensor)}; - } - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - shader, - default_pick_global_wg_size, - default_pick_local_wg_size, - // Input and Outputs - {{out_tensor, vkapi::kWrite}, {in_staging, vkapi::kRead}}, - // Parameter Buffers - param_buffers, - // Push Constants - pcs, - // Specialization Constants - {graph.hashed_layout_of(out_tensor)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -const std::string kBitw8PrefixStr = "bitw8_image_to_nchw_nobitw8buffer"; - -bool is_bitw8_shader(const vkapi::ShaderInfo& shader) { - const auto size = kBitw8PrefixStr.size(); - const std::string& shader_prefix_str = shader.kernel_name.substr(0, size); - return shader_prefix_str == kBitw8PrefixStr; -} - -vkapi::ShaderInfo get_tensor_to_staging_shader( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - const ValueRef in_tensor = args.at(1).refs.at(0); - return get_tensor_to_nchw_shader( - *graph, in_tensor, graph->int8_buffers_enabled()); -} - -utils::uvec3 tensor_to_staging_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - const ValueRef in_tensor = args.at(1).refs.at(0); - const ValueRef out_staging = args.at(0).refs.at(0); - - utils::uvec3 global_wg_size = graph->create_global_wg_size(in_tensor); - - // Normally, the image_to_nchw shader is structured so that each thread reads - // one texel from the input texture and writes each component of the texel - // into the corresponding location in the output buffer. However, this shader - // is structured slightly differently in that each thread writes out a - // complete 32 bit integer (containing 4 packed 8-bit integers) into the - // output buffer. Therefore, the global work group size for this shader will - // be the number of elements in the output buffer divided by 4, as opposed to - // the extents of the input texture. 
- if (is_bitw8_shader(shader)) { - const uint32_t buffer_len = utils::safe_downcast( - graph->get_staging(out_staging)->numel() / 4); - global_wg_size = {buffer_len, 1, 1}; - } - - return global_wg_size; -} - -void add_tensor_to_staging_node( - ComputeGraph& graph, - const ValueRef in_tensor, - const ValueRef out_staging) { - VK_CHECK_COND(graph.val_is_staging(out_staging)); - - vkapi::ShaderInfo shader = - get_tensor_to_nchw_shader(graph, in_tensor, graph.int8_buffers_enabled()); - - vkapi::ParamsBindList param_buffers = {}; - if (graph.is_buffer_storage(in_tensor)) { - param_buffers.append(graph.buffer_meta_ubo(in_tensor)); - } - - std::vector pcs; - if (graph.is_texture_storage(in_tensor)) { - pcs = {graph.sizes_pc_of(in_tensor)}; - } - - if (is_bitw8_shader(shader)) { - pcs.push_back(graph.numel_pc_of(in_tensor)); - } - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - shader, - tensor_to_staging_global_wg_size, - default_pick_local_wg_size, - // Input and Outputs - {{out_staging, vkapi::kWrite}, {in_tensor, vkapi::kRead}}, - // Parameter Buffers - param_buffers, - // Push Constants - pcs, - // Specialization Constants - {graph.hashed_layout_of(in_tensor)}, - // Resize Args - {}, - // Resizing Logic - nullptr)); -} - -void add_prepack_standard_node( - ComputeGraph& graph, - const ValueRef tensor_data, - const ValueRef tensor, - const bool transpose_hw = false) { - vkapi::ShaderInfo shader = - get_nchw_to_tensor_shader(graph, tensor, graph.int8_buffers_enabled()); - - vkapi::ParamsBindList param_buffers = {}; - if (graph.is_buffer_storage(tensor)) { - param_buffers.append(graph.buffer_meta_ubo(tensor)); - } - - std::vector pcs; - if (graph.is_buffer_storage(tensor)) { - pcs = { - graph.sizes_pc_of(tensor), - graph.strides_pc_of(tensor), - graph.numel_pc_of(tensor)}; - } else { - pcs = {graph.sizes_pc_of(tensor)}; - } - - int transpose_hw_spec = transpose_hw ? 
1 : 0; - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - shader, - graph.create_global_wg_size(tensor), - graph.create_local_wg_size(tensor), - // Input and Outputs - tensor_data, - tensor, - // Parameter Buffers - param_buffers, - // Specialization Constants - {graph.hashed_layout_of(tensor), transpose_hw_spec}, - pcs)); -} - -ValueRef prepack_standard( - ComputeGraph& graph, - const ValueRef tensor_data, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout layout, - const bool passthrough, - const utils::AxisMapLayout axis_map_layout) { - if (passthrough && graph.val_is_tensor(tensor_data)) { - return tensor_data; - } - VK_CHECK_COND(graph.val_is_tref(tensor_data)); - ValueRef tensor = - graph.add_tensor_like(tensor_data, storage_type, layout, axis_map_layout); - add_prepack_standard_node(graph, tensor_data, tensor); - return tensor; -} - -ValueRef prepack_standard_hw_transposed( - ComputeGraph& graph, - const ValueRef tensor_data, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout layout, - const bool passthrough, - const utils::AxisMapLayout axis_map_layout) { - (void)passthrough; - - VK_CHECK_COND(graph.val_is_tref(tensor_data)); - std::vector new_out_sizes = graph.sizes_of(tensor_data); - const int w_dim = new_out_sizes.size() - 1; - const int h_dim = new_out_sizes.size() - 2; - const int64_t tmp = new_out_sizes.at(w_dim); - new_out_sizes.at(w_dim) = new_out_sizes.at(h_dim); - new_out_sizes.at(h_dim) = tmp; - ValueRef tensor = graph.add_tensor( - new_out_sizes, - graph.dtype_of(tensor_data), - storage_type, - layout, - -1, - axis_map_layout); - add_prepack_standard_node(graph, tensor_data, tensor, true); - return tensor; -} - -ValueRef prepack_standard_like( - ComputeGraph& graph, - const ValueRef tensor_data, - const ValueRef to_copy, - const bool passthrough) { - VK_CHECK_COND(graph.val_is_tensor(to_copy)); - return prepack_standard( - graph, - tensor_data, - graph.storage_type_of(to_copy), - graph.estimate_memory_layout_of(to_copy), - passthrough); -} - -void add_prepack_direct_copy_buffer_node( - ComputeGraph& graph, - const ValueRef tensor_data, - const ValueRef tensor) { - std::string kernel_name = "buffer_to_buffer"; - add_dtype_suffix(kernel_name, graph.dtype_of(tensor_data)); - vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name); - - vkapi::ParamsBindList ubos; - ubos.append({graph.numel_ubo(tensor)}); - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - shader, - graph.create_global_wg_size(tensor), - graph.create_local_wg_size(tensor), - // Input and Outputs - tensor_data, - tensor, - // Parameter Buffers - ubos, - // Specialization Constants - {})); -} - -ValueRef prepack_direct_copy_buffer( - ComputeGraph& graph, - const ValueRef tensor_data) { - VK_CHECK_COND(graph.val_is_tref(tensor_data)); - ValueRef tensor = - graph.add_tensor_like(tensor_data, utils::kBuffer, utils::kWidthPacked); - add_prepack_direct_copy_buffer_node(graph, tensor_data, tensor); - return tensor; -} - -ValueRef prepack_int4_linear_weight_transposed_interleaved( - ComputeGraph& graph, - const ValueRef qmat2_data) { - std::vector qmat2_orig_sizes = graph.sizes_of(qmat2_data); - const int64_t ndim = graph.dim_of(qmat2_data); - - const int64_t K = qmat2_orig_sizes.at(ndim - 1) * 2; - const int64_t N = qmat2_orig_sizes.at(ndim - 2); - const int64_t N_div2 = N / int64_t(2); - - utils::StorageType storage_type = utils::kTexture2D; - uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); - if (N_div2 > 
max_extent * 4 || K > max_extent) { - storage_type = utils::kBuffer; - } - - std::vector qmat2_sizes{K, N_div2}; - ValueRef qmat2 = graph.add_tensor( - qmat2_sizes, vkcompute::vkapi::kByte, storage_type, utils::kWidthPacked); - - utils::uvec3 global_wg_size; - global_wg_size = graph.logical_limits_of(qmat2); - global_wg_size[1] = utils::div_up(global_wg_size[1], uint32_t(2)); - - std::string kernel_name = - graph.context()->adapter_ptr()->has_full_int8_buffers_support() - ? "pack_int4_linear_weight_transposed_interleaved" - : "pack_int4_linear_weight_transposed_interleaved_nobitw8buffer"; - add_storage_type_suffix(kernel_name, storage_type); - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - graph.create_local_wg_size(global_wg_size), - // Inputs and Outputs - qmat2_data, - qmat2, - // UBOs - {}, - // Specialization Constants - {}, - // Push Constants - {graph.sizes_pc_of(qmat2)})); - - return qmat2; -} - -ValueRef prepack_int4_linear_weight_transposed_block_4x8( - ComputeGraph& graph, - const ValueRef qmat2_data) { - std::vector qmat2_orig_sizes = graph.sizes_of(qmat2_data); - const int64_t ndim = graph.dim_of(qmat2_data); - - const int64_t K_div2 = qmat2_orig_sizes.at(ndim - 1); // Input is [N, K/2] - const int64_t N = qmat2_orig_sizes.at(ndim - 2); - // Logical K dimension. Each value in the tensor is a uint8 that contains 2 - // packed 4-bit values. - const int64_t K = K_div2 * 2; - - // This packing format partitions the weight tensor into 4 wide x 8 high - // blocks. To figure out the size of the output tensor, determine the number - // of blocks along the width and height dims. - const int64_t num_blocks_K = utils::div_up(K, int64_t(4)); - const int64_t num_blocks_N = utils::div_up(N, int64_t(8)); - // Each transposed block is 8 wide x 4 high. In terms of 8-bit values, the - // block is 4 wide x 4 high. To maximize memory loading efficiency, the packed - // weight tensor will use a base data type of uint32_t; in terms of uint32_t, - // each block is 1 wide x 4 high. However, each block is also flattened as it - // is stored, so that the whole block can be loaded at once. As a result, the - // stored block will be 4 wide x 1 high. 
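The output sizes computed next follow directly from that block layout. As a standalone check of the arithmetic (helper name and example values are hypothetical):

```cpp
#include <cstdint>
#include <utility>

// Input weights are [N, K/2] uint8 (two 4-bit values per byte); the packed
// output is a uint32 tensor of size {num_blocks_N, num_blocks_K * 4}.
std::pair<int64_t, int64_t> packed_int4_block_4x8_sizes(int64_t N, int64_t K) {
  const int64_t num_blocks_K = (K + 3) / 4; // blocks are 4 wide along K
  const int64_t num_blocks_N = (N + 7) / 8; // blocks are 8 high along N
  const int64_t output_width = num_blocks_K * 4; // each block flattens to 4 uint32s
  const int64_t output_height = num_blocks_N;    // one output row per block row
  return {output_height, output_width};
}

// e.g. N = 4096, K = 4096 gives a packed size of {512, 4096}; since each
// uint32 holds eight 4-bit values, total storage is unchanged at N * K / 2
// bytes.
```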
- const int64_t output_width = num_blocks_K * 4; - const int64_t output_height = num_blocks_N; - - // Store the original sizes of the tensor to pass to the shader - utils::ivec2 orig_sizes{ - utils::safe_downcast(K), utils::safe_downcast(N)}; - - std::vector qmat2_sizes{output_height, output_width}; - - utils::StorageType storage_type = utils::kTexture2D; - uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); - if (output_width > max_extent * 4 || output_height > max_extent) { - storage_type = utils::kBuffer; - } - - ValueRef qmat2 = graph.add_tensor( - qmat2_sizes, vkcompute::vkapi::kUInt, storage_type, utils::kWidthPacked); - - // Global workgroup size: each thread writes out two adjacent blocks - utils::uvec3 global_wg_size{ - utils::div_up(utils::safe_downcast(num_blocks_K), uint32_t(2)), - utils::safe_downcast(num_blocks_N), - 1u}; - - std::string kernel_name = "pack_int4_linear_weight_transposed_block_4x8"; - add_storage_type_suffix(kernel_name, storage_type); - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - global_wg_size, - graph.create_local_wg_size(global_wg_size), - // Inputs and Outputs - qmat2_data, - qmat2, - // UBOs - {}, - // Specialization Constants - {}, - // Push Constants - {graph.sizes_pc_of(qmat2), - PushConstantDataInfo(&orig_sizes, sizeof(utils::ivec2))})); - - return qmat2; -} - -void prepack_op(ComputeGraph& graph, const std::vector& args) { - return add_prepack_standard_node(graph, args[0], args[1]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(et_vk.prepack.default, prepack_op); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.h b/backends/vulkan/runtime/graph/ops/impl/Staging.h deleted file mode 100644 index 0b1568ca139..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -namespace vkcompute { - -// -// Staging Buffer <-> Tensor -// - -void add_staging_to_tensor_node( - ComputeGraph& graph, - const ValueRef in_staging, - const ValueRef out_tensor); - -void add_tensor_to_staging_node( - ComputeGraph& graph, - const ValueRef in_tensor, - const ValueRef out_staging); - -// -// Standard Prepack -// - -/* - * Given that `v` is a `TensorRef`, create a new `Tensor` value with the - * specified `storage_type` and `memory_layout`, and add a a prepacking node to - * transfer the `TensorRef` data to the new `Tensor` object via a staging to - * tensor shader. The created `Tensor` value is then returned. - * - * If `passthrough` is `true`, then `v` may be a `Tensor` as well. If `v` is a - * `Tensor`, then it is returned as-is. If `passthrough` is `false` (default), - * then an exception will be thrown. - */ - -ValueRef prepack_standard( - ComputeGraph& graph, - const ValueRef tensor_data, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout layout, - const bool passthrough = false, - const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap); - -/* - * Same as prepack_standard, but transpose the height and width dimensions of - * the tensor while packing. 
- */ -ValueRef prepack_standard_hw_transposed( - ComputeGraph& graph, - const ValueRef tensor_data, - const utils::StorageType storage_type, - const utils::GPUMemoryLayout layout, - const bool passthrough = false, - const utils::AxisMapLayout axis_map_layout = utils::kDefaultAxisMap); - -/* - * Equivalent to `prepack_standard()` function, except the `storage_type` and - * `memory_layout` are set to match `to_copy`, which must be a `Tensor`. - */ -ValueRef prepack_standard_like( - ComputeGraph& graph, - const ValueRef tensor_data, - const ValueRef to_copy, - const bool passthrough = false); - -// -// Direct buffer copy prepack -// - -/* - * Given that `v` is a `TensorRef`, create a new `Tensor` value with buffer - * storage and `kWidthPacked` memory layout, and add a prepacking node to - * transfer the `TensorRef` data to the new `Tensor` object via a direct buffer - * to buffer copy shader. - */ -ValueRef prepack_direct_copy_buffer( - ComputeGraph& graph, - const ValueRef tensor_data); - -// -// Op specific prepack functions -// - -ValueRef prepack_int4_linear_weight_transposed_interleaved( - ComputeGraph& graph, - const ValueRef qmat2_data); - -ValueRef prepack_int4_linear_weight_transposed_block_4x8( - ComputeGraph& graph, - const ValueRef qmat2_data); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/SymIntOps.cpp b/backends/vulkan/runtime/graph/ops/impl/SymIntOps.cpp deleted file mode 100644 index f07522d2578..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/SymIntOps.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include - -namespace vkcompute { - -// -// sym_size -// - -void sym_size_impl(ComputeGraph* graph, const std::vector& args) { - const ValueRef in_tensor = args.at(0); - const ValueRef dim = args.at(1); - const ValueRef out_symint = args.at(2); - - const int64_t dim_val = graph->extract_scalar(dim); - const int64_t size_at_dim = graph->size_at(dim_val, in_tensor); - - graph->set_symint(out_symint, static_cast(size_at_dim)); -} - -void resize_sym_size_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)args; // Unused parameter - sym_size_impl(graph, resize_args); -} - -/* - * This operator takes a tensor and an integer dimension as inputs, and produces - * a symint as output. The symint's value is the size of the tensor at the - * specified dimension. - */ -void sym_size_int(ComputeGraph& graph, const std::vector& args) { - sym_size_impl(&graph, args); - - graph.execute_nodes().emplace_back( - new ExecuteNode(resize_sym_size_node, args)); -} - -// -// binary operators -// - -void sym_add_impl(ComputeGraph* graph, const std::vector& args) { - const ValueRef a = args.at(0); - const ValueRef b = args.at(1); - const ValueRef out = args.at(2); - - const int32_t a_val = graph->read_symint(a); - const int32_t b_val = graph->read_symint(b); - const int32_t result = a_val + b_val; - - graph->set_symint(out, result); -} - -void resize_sym_add_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)args; // Unused parameter - sym_add_impl(graph, resize_args); -} - -/* - * This operator takes two symints as inputs and produces a symint as output. - * The output symint's value is the sum of the two input symints. 
- */ -void sym_add(ComputeGraph& graph, const std::vector& args) { - sym_add_impl(&graph, args); - - graph.execute_nodes().emplace_back( - new ExecuteNode(resize_sym_add_node, args)); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(sym_size.int, sym_size_int); - VK_REGISTER_OP(add, sym_add); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Tan.cpp b/backends/vulkan/runtime/graph/ops/impl/Tan.cpp deleted file mode 100644 index 687b3923354..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Tan.cpp +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include -#include - -namespace vkcompute { - -using namespace utils; - -void resize_tan_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef self = args.at(1).refs.at(0); - - const std::vector self_sizes = graph->sizes_of(self); - graph->virtual_resize(out, self_sizes); -} - -void add_tan_node(ComputeGraph& graph, const ValueRef in, const ValueRef out) { - std::string kernel_name = "tan"; - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - - vkapi::ParamsBindList ubos({}); - ubos.append({graph.logical_limits_ubo(out)}); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - ubos, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - resize_tan_node)); -} - -void tan(ComputeGraph& graph, const std::vector& args) { - return add_tan_node(graph, args[0], args[1]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.tan.default, tan); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/ToCopy.cpp b/backends/vulkan/runtime/graph/ops/impl/ToCopy.cpp deleted file mode 100644 index b7e0218823a..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/ToCopy.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include -#include -#include -#include - -namespace vkcompute { - -void resize_to_copy_op_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef self = args.at(1).refs.at(0); - - graph->virtual_resize(out, graph->sizes_of(self)); -} - -void add_to_copy_node(ComputeGraph& graph, ValueRef in, ValueRef out) { - static std::set supported_types = { - vkapi::ScalarType::Float, vkapi::ScalarType::Half}; - - VK_CHECK_COND( - supported_types.find(graph.dtype_of(in)) != supported_types.end() && - supported_types.find(graph.dtype_of(out)) != supported_types.end(), - "Unsupported dtype for to_copy, only Float and Half are currently supported, recieved ", - vkapi::to_string(graph.dtype_of(in)), - " <-> ", - vkapi::to_string(graph.dtype_of(out))); - - graph.execute_nodes().emplace_back(new BlitNode(graph, in, out)); -} - -void to_copy(ComputeGraph& graph, const std::vector& args) { - return add_to_copy_node(graph, args[0], args[7]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten._to_copy.default, to_copy); -} -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp deleted file mode 100644 index 60127ecf9bd..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include - -#include -#include -#include - -namespace vkcompute { - -/** - * Adds a transfer copy operation node to the compute graph. - * This function handles both SELECT and SLICE operations based on the - * transfer_type parameter. - */ -void add_transfer_copy_node( - ComputeGraph& graph, - TransferType transfer_type, - const ValueRef in, - const ValueRef dim_ref, - const ValueRef index_or_start_ref, - const ValueRef end_ref, - const ValueRef step_ref, - const ValueRef out, - const std::vector& resize_args, - const ExecuteNode::ResizeFunction& resize_fn) { - int64_t ndim = graph.dim_of(in); - int64_t dim = graph.extract_scalar(dim_ref); - - if (dim < 0) { - dim += ndim; - } - - int64_t dim_whcn = nchw_dim_to_whcn_dim(dim, ndim); - - struct TransferParams { - int32_t dim; - int32_t index_or_start_ref; - int32_t step_ref; - } transfer_params{static_cast(dim_whcn), 0, 0}; - - const bool param_is_scalar = graph.is_scalar_or_none(index_or_start_ref) && - (transfer_type == TransferType::SELECT || - graph.is_scalar_or_none(step_ref)); - - vkapi::ParamsBindList param_buffers; - if (!param_is_scalar) { - if (transfer_type == TransferType::SELECT) { - param_buffers = { - graph.get_or_create_int_param_buffer(index_or_start_ref, 0)}; - } else { // TransferType::SLICE - param_buffers = { - graph.get_or_create_int_param_buffer(index_or_start_ref, 0), - graph.get_or_create_int_param_buffer(step_ref, 1)}; - } - } else { - transfer_params.index_or_start_ref = - graph.extract_scalar_or(index_or_start_ref, 0); - if (transfer_type != TransferType::SELECT) { - transfer_params.step_ref = graph.extract_scalar_or(step_ref, 1); - } - } - - std::vector push_constants; - push_constants.reserve(graph.is_buffer_storage(out) ? 
5 : 3); - - if (graph.is_buffer_storage(out)) { - push_constants.emplace_back(graph.sizes_pc_of(in)); - push_constants.emplace_back(graph.strides_pc_of(out)); - push_constants.emplace_back(graph.strides_pc_of(in)); - push_constants.emplace_back(graph.numel_pc_of(out)); - } else { - push_constants.emplace_back(graph.sizes_pc_of(out)); - push_constants.emplace_back(graph.sizes_pc_of(in)); - } - - if (param_is_scalar) { - push_constants.emplace_back(&transfer_params, sizeof(transfer_params)); - } else { - push_constants.emplace_back( - &transfer_params.dim, sizeof(transfer_params.dim)); - } - - vkapi::SpecVarList spec_vars = { - graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - }; - - // Determine the shader directly - std::string kernel_name; - if (transfer_type == TransferType::SELECT) { - kernel_name = "select"; - } else { // TransferType::SLICE - kernel_name = "slice"; - } - if (!param_is_scalar) { - kernel_name += "_ubo"; - } - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - // Create and add the dispatch node - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Parameter buffers - param_buffers, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize Args - resize_args, - // Resizing Logic - resize_fn)); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Transfer.h b/backends/vulkan/runtime/graph/ops/impl/Transfer.h deleted file mode 100644 index 09aae144994..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Transfer.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include - -namespace vkcompute { - -enum class TransferType { SELECT, SLICE }; - -/** - * Adds a transfer copy operation node to the compute graph, which implements - * operators for which each element of the output tensor maps to a unique - * element of the input tensor. - * - * This function currently handles the following operations: - * - select - * - slice - */ -void add_transfer_copy_node( - ComputeGraph& graph, - TransferType transfer_type, - const ValueRef in, - const ValueRef dim_ref, - const ValueRef index_or_start_ref, - const ValueRef end_ref, - const ValueRef step_ref, - const ValueRef out, - const std::vector& resize_args, - const ExecuteNode::ResizeFunction& resize_fn = nullptr); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp b/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp deleted file mode 100644 index b797536d817..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Transpose.cpp +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include - -#include - -#include - -#include - -namespace vkcompute { - -void resize_transpose_view_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)args; - const ValueRef out = extra_args.at(0); - const ValueRef in = extra_args.at(1); - - const int64_t dim0 = graph->extract_scalar(extra_args.at(2)); - const int64_t dim1 = graph->extract_scalar(extra_args.at(3)); - - std::vector new_sizes = graph->sizes_of(in); - // Transpose the resized input sizes - std::iter_swap(new_sizes.begin() + dim0, new_sizes.begin() + dim1); - graph->virtual_resize(out, new_sizes); -} - -void check_transpose_view_args( - ComputeGraph& graph, - ValueRef in_ref, - const int64_t dim0, - const int64_t dim1, - ValueRef out_ref) { - VK_CHECK_COND( - graph.val_is_view_of(out_ref, in_ref), - "output tensor must be a view of the input tensor"); - - const int64_t in_ndim = graph.dim_of(in_ref); - VK_CHECK_COND( - dim0 >= 0 && dim0 < in_ndim, "dim0 is not in the range of [0, in_ndim)"); - VK_CHECK_COND( - dim1 >= 0 && dim1 < in_ndim, "dim1 is not in the range of [0, in_ndim)"); -} - -void add_transpose_view_node( - ComputeGraph& graph, - ValueRef input_ref, - ValueRef dim0_ref, - ValueRef dim1_ref, - ValueRef out_ref) { - const int64_t dim0 = graph.extract_scalar(dim0_ref); - const int64_t dim1 = graph.extract_scalar(dim1_ref); - - check_transpose_view_args(graph, input_ref, dim0, dim1, out_ref); - graph.virtual_clone(out_ref, input_ref); - graph.virtual_transpose(out_ref, dim0, dim1); - - graph.execute_nodes().emplace_back(new ExecuteNode( - resize_transpose_view_node, {out_ref, input_ref, dim0_ref, dim1_ref})); -} - -void transpose(ComputeGraph& graph, const std::vector& args) { - const ValueRef out = args[3]; - return add_transpose_view_node( - graph, - args[0], // input - args[1], // dim0 - args[2], // dim1 - out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.transpose.int, transpose); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Transpose.h b/backends/vulkan/runtime/graph/ops/impl/Transpose.h deleted file mode 100644 index a4fc4029222..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Transpose.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -#include - -namespace vkcompute { - -void add_transpose_view_node( - ComputeGraph& graph, - ValueRef input_ref, - ValueRef dim0_ref, - ValueRef dim1_ref, - ValueRef out_ref); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp deleted file mode 100644 index 9830a8e8784..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include - -#include -#include - -#include - -namespace vkcompute { - -constexpr float kDummyFloat = -1.0f; -const std::string kClampShaderName = "clamp"; - -void resize_unary_op_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef self = args.at(1).refs.at(0); - - const std::vector self_sizes = graph->sizes_of(self); - graph->virtual_resize(out, self_sizes); -} - -void add_unary_op_node( - ComputeGraph& graph, - const ValueRef in, - const float min, - const float max, - const ValueRef out, - const std::string& op_name) { - std::string kernel_name(op_name); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - - const utils::vec2 min_max = {min, max}; - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - {}, - // Push Constants - { - graph.is_buffer_storage(out) ? graph.numel_pc_of(out) - : graph.logical_limits_pc_of(out), - PushConstantDataInfo(&min_max, sizeof(min_max)), - }, - // pcs, - // Specialization Constants - {}, - // Resize Args - {}, - // Resizing Logic - resize_unary_op_node)); -} - -float get_val_or_inf(ComputeGraph& graph, const ValueRef& val, bool max) { - if (!graph.val_is_none(val)) { - return graph.extract_scalar(val); - } - return max ? std::numeric_limits::infinity() - : -std::numeric_limits::infinity(); -} - -#define DEFINE_ACTIVATION_FN(op_name) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - return add_unary_op_node( \ - graph, args[0], kDummyFloat, kDummyFloat, args[1], #op_name); \ - } - -#define DEFINE_CLAMP_FN(op_name) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - return add_unary_op_node( \ - graph, \ - args[0], \ - get_val_or_inf(graph, args[1], /*max = */ false), \ - get_val_or_inf(graph, args[2], /*max = */ true), \ - args[3], \ - kClampShaderName); \ - } - -#define DEFINE_RELU_FN(op_name) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - return add_unary_op_node( \ - graph, \ - args[0], \ - 0, \ - std::numeric_limits::infinity(), \ - args[1], \ - kClampShaderName); \ - } - -#define DEFINE_RELU6_FN(op_name) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - return add_unary_op_node(graph, args[0], 0, 6, args[1], kClampShaderName); \ - } - -#define DEFINE_HARDSHRINK_FN(op_name) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - return add_unary_op_node( \ - graph, \ - args[0], \ - get_val_or_inf(graph, args[1], /*max = */ false), \ - -get_val_or_inf(graph, args[1], /*max = */ true), \ - args[2], \ - "hardshrink"); \ - } - -#define DEFINE_LEAKY_RELU_FN(op_name) \ - void op_name(ComputeGraph& graph, const std::vector& args) { \ - return add_unary_op_node( \ - graph, \ - args[0], \ - get_val_or_inf(graph, args[1], /*neg slope*/ false), \ - kDummyFloat, \ - args[2], \ - "leaky_relu"); \ - } - -void gelu(ComputeGraph& graph, const std::vector& args) { - // args[1] is the `approximate` string - // https://fburl.com/code/9omngmyo - // currently only `approximate = "tanh"` is supported - return add_unary_op_node( - graph, args[0], kDummyFloat, kDummyFloat, args[2], "gelu"); -} - -DEFINE_ACTIVATION_FN(abs); -DEFINE_ACTIVATION_FN(cos); 
-DEFINE_ACTIVATION_FN(exp); -DEFINE_ACTIVATION_FN(neg); -DEFINE_ACTIVATION_FN(sigmoid); -DEFINE_ACTIVATION_FN(sin); -DEFINE_ACTIVATION_FN(sqrt); -DEFINE_ACTIVATION_FN(rsqrt); -DEFINE_ACTIVATION_FN(tanh); -DEFINE_CLAMP_FN(clamp); -DEFINE_CLAMP_FN(hardtanh); -DEFINE_RELU_FN(relu); -DEFINE_RELU6_FN(relu6); -DEFINE_HARDSHRINK_FN(hardshrink); -DEFINE_ACTIVATION_FN(hardswish); -DEFINE_ACTIVATION_FN(hardsigmoid); -DEFINE_LEAKY_RELU_FN(leaky_relu); -DEFINE_ACTIVATION_FN(round); - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.abs.default, abs); - VK_REGISTER_OP(aten.clamp.default, clamp); - VK_REGISTER_OP(aten.cos.default, cos); - VK_REGISTER_OP(aten.exp.default, exp); - VK_REGISTER_OP(aten.gelu.default, gelu); - VK_REGISTER_OP(aten.hardtanh.default, hardtanh); - VK_REGISTER_OP(aten.neg.default, neg); - VK_REGISTER_OP(aten.relu.default, relu); - VK_REGISTER_OP(aten.relu6.default, relu6); - VK_REGISTER_OP(aten.sigmoid.default, sigmoid); - VK_REGISTER_OP(aten.sin.default, sin); - VK_REGISTER_OP(aten.sqrt.default, sqrt); - VK_REGISTER_OP(aten.rsqrt.default, rsqrt); - VK_REGISTER_OP(aten.tanh.default, tanh); - VK_REGISTER_OP(aten.hardshrink.default, hardshrink); - VK_REGISTER_OP(aten.hardswish.default, hardswish); - VK_REGISTER_OP(aten.hardsigmoid.default, hardsigmoid); - VK_REGISTER_OP(aten.leaky_relu.default, leaky_relu); - VK_REGISTER_OP(aten.round.default, round); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Unsqueeze.cpp b/backends/vulkan/runtime/graph/ops/impl/Unsqueeze.cpp deleted file mode 100644 index 0a98f6d8f43..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Unsqueeze.cpp +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include -#include -#include - -namespace vkcompute { - -void add_unsqueeze_node( - ComputeGraph& graph, - const ValueRef in, - const ValueRef dim_ref, - const ValueRef out) { - const int64_t in_dim = graph.dim_of(in); - const int64_t out_dim = graph.dim_of(out); - - VK_CHECK_COND( - in_dim < 4, "Cannot unsqueeze a tensor with more than 3 dimensions"); - - int64_t dim = graph.extract_scalar(dim_ref); - if (dim < 0) { - dim += out_dim; - } - - std::vector permute_dims(out_dim); - for (int i = 1; i <= dim; i++) { - permute_dims[i - 1] = i; - } - permute_dims[dim] = 0; - - for (int i = dim + 1; i < out_dim; i++) { - permute_dims[i] = i; - } - - const ValueRef permute_dims_ref = - graph.add_scalar_list(std::vector(permute_dims)); - add_permute_node(graph, in, permute_dims_ref, out); -} - -void resize_unsqueeze_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - const ValueRef dims_ref = extra_args.at(0); - - const IntListPtr dims = graph->get_int_list(dims_ref); - - std::vector out_sizes = graph->sizes_of(in); - - // Insert singleton dimensions at the specified positions - for (auto dim : *dims) { - int64_t d = dim; - if (d < 0) { - d += static_cast(out_sizes.size()) + 1; - } - out_sizes.insert(out_sizes.begin() + d, 1); - } - - graph->virtual_resize(out, out_sizes); -} - -void unsqueeze(ComputeGraph& graph, const std::vector& args) { - int idx = 0; - const ValueRef in = args.at(idx++); - const ValueRef dims = args.at(idx++); - const ValueRef out = args.at(idx++); - - std::vector resize_args = {dims}; - if (graph.is_buffer_storage(in)) { - return add_view_copy_buffer_node( - graph, in, out, resize_args, resize_unsqueeze_node); - } - return add_unsqueeze_node(graph, in, dims, out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.unsqueeze_copy.default, unsqueeze); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp b/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp deleted file mode 100644 index 6662ae367c5..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Upsample.cpp +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include - -#include - -#include - -namespace vkcompute { - -enum class UpsampleMode : int { NEAREST, BILINEAR }; - -void resize_upsample_nearest2d_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef self = args.at(1).refs.at(0); - std::vector out_sizes = graph->sizes_of(self); // NCHW - - const ValueRef output_sizes = extra_args.at(0); // HW - const ValueRef scale_factors = extra_args.at(1); // HW - if (!graph->val_is_none(output_sizes)) { - IntListPtr output_size_ref = graph->get_int_list(output_sizes); - out_sizes.at(2) = output_size_ref->at(0); - out_sizes.at(3) = output_size_ref->at(1); - } else { - DoubleListPtr scales = graph->get_double_list(scale_factors); - out_sizes.at(2) *= scales->at(0); - out_sizes.at(3) *= scales->at(1); - } - - graph->virtual_resize(out, out_sizes); -} - -void add_upsample_nearest2d_node( - ComputeGraph& graph, - const UpsampleMode mode, - const ValueRef in, - const ValueRef output_sizes, - const ValueRef align_corners, - const ValueRef scale_factors, - const ValueRef out) { - if (graph.val_is_none(output_sizes) && graph.val_is_none(scale_factors)) { - VK_THROW( - "Invalid input, must provide either output_sizes or scale_factors"); - } - if (!graph.val_is_none(output_sizes) && !graph.val_is_none(scale_factors)) { - VK_THROW( - "Invalid input, must provide ONLY one of output_sizes or scale_factors"); - } - - int align_corners_val = 0; - if (is_valid(align_corners) && graph.get_bool(align_corners)) { - align_corners_val = 1; - } - - utils::uvec3 in_limits = graph.logical_limits_of(in); - utils::uvec3 out_limits = graph.logical_limits_of(out); - - uint32_t out_width = out_limits[0u]; - uint32_t out_height = out_limits[1u]; - - float scale_factor_x = float(in_limits[0u]) / float(out_width); - float scale_factor_y = float(in_limits[1u]) / float(out_height); - - float recip_scale_factor_x = 1.0f / scale_factor_x; - float recip_scale_factor_y = 1.0f / scale_factor_y; - - if (!graph.val_is_none(output_sizes)) { - IntListPtr output_size_ref = graph.get_int_list(output_sizes); - out_width = output_size_ref->at(1); - out_height = output_size_ref->at(0); - - VK_CHECK_COND(out_width == out_limits[0u]); - VK_CHECK_COND(out_height == out_limits[1u]); - - } else { - DoubleListPtr scales = graph.get_double_list(scale_factors); - scale_factor_x = scales->at(1); - scale_factor_y = scales->at(0); - - VK_CHECK_COND(in_limits[0u] * scale_factor_x == out_width); - VK_CHECK_COND(in_limits[1u] * scale_factor_y == out_height); - } - - if (align_corners_val == 1) { - recip_scale_factor_x = float(in_limits[0u] - 1) / float(out_width - 1); - recip_scale_factor_y = float(in_limits[1u] - 1) / float(out_height - 1); - } else { - recip_scale_factor_x = float(in_limits[0u]) / float(out_width); - recip_scale_factor_y = float(in_limits[1u]) / float(out_height); - } - - utils::vec2 recip_scales = {recip_scale_factor_x, recip_scale_factor_y}; - - std::string kernel_name; - kernel_name.reserve(kShaderNameReserve); - switch (mode) { - case UpsampleMode::NEAREST: - kernel_name = "upsample_nearest2d"; - break; - case UpsampleMode::BILINEAR: - kernel_name = "upsample_bilinear2d"; - break; - } - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, 
vkapi::MemoryAccessType::WRITE}, - {in, vkapi::MemoryAccessType::READ}}, - // Shader params buffers - {graph.logical_limits_ubo(out), - graph.logical_limits_ubo(in), - graph.create_params_buffer(recip_scales)}, - // Push Constants - {}, - // Specialization Constants - {align_corners_val}, - // Resize Args - {output_sizes, scale_factors}, - // Resizing Logic - resize_upsample_nearest2d_node)); -} - -void upsample_nearest2d( - ComputeGraph& graph, - const std::vector& args) { - return add_upsample_nearest2d_node( - graph, - UpsampleMode::NEAREST, - args[0], - args[1], - kDummyValueRef, - args[2], - args[3]); -} - -void upsample_bilinear2d( - ComputeGraph& graph, - const std::vector& args) { - return add_upsample_nearest2d_node( - graph, - UpsampleMode::BILINEAR, - args[0], - args[1], - args[2], - args[3], - args[4]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.upsample_nearest2d.vec, upsample_nearest2d); - VK_REGISTER_OP(aten.upsample_bilinear2d.vec, upsample_bilinear2d); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Var.cpp b/backends/vulkan/runtime/graph/ops/impl/Var.cpp deleted file mode 100644 index d8fd367f18a..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Var.cpp +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ -#include - -#include -#include -#include - -namespace vkcompute { - -using namespace utils; - -// Custom global workgroup size function for var_buffer -utils::uvec3 var_buffer_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - return { - graph->size_at(-1, out), - graph->size_at(-2, out), - graph->size_at(-3, out) * graph->size_at(-4, out)}; -} - -// Custom local workgroup size function for var_buffer -utils::uvec3 var_buffer_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - (void)global_workgroup_size; - const ValueRef in = args.at(1).refs.at(0); - const int dim = resize_args.at(0); - - const int64_t ndim = graph->dim_of(in); - int32_t reduce_dim = normalize(dim, ndim); - reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim); - - const uint32_t nworkers_per_group = 4; - utils::uvec3 local_wg_size{1, 1, 1}; - local_wg_size[reduce_dim] = nworkers_per_group; - return local_wg_size; -} - -// Custom global workgroup size function for var_texture -utils::uvec3 var_texture_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - const int dim = resize_args.at(0); - - const int64_t ndim = graph->dim_of(in); - int32_t reduce_dim = normalize(dim, ndim); - reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim); - - utils::uvec3 global_wg_size = graph->logical_limits_of(out); - global_wg_size[reduce_dim] = 1; - return global_wg_size; -} - -// Custom local workgroup size function for var_texture -utils::uvec3 var_texture_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, 
- const std::vector& args, - const std::vector& resize_args) { - (void)shader; - const ValueRef in = args.at(1).refs.at(0); - const int dim = resize_args.at(0); - - const int64_t ndim = graph->dim_of(in); - int32_t reduce_dim = normalize(dim, ndim); - reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim); - - const uint32_t nworkers_per_group = 4; - const uint32_t ngroups = 4; - - utils::uvec3 local_wg_size{1, 1, 1}; - local_wg_size[reduce_dim] = nworkers_per_group; - const int other_dim_1 = (reduce_dim + 1) % 3; - const int other_dim_2 = (reduce_dim + 2) % 3; - if (global_workgroup_size[other_dim_1] > global_workgroup_size[other_dim_2]) { - local_wg_size[other_dim_1] = ngroups; - } else { - local_wg_size[other_dim_2] = ngroups; - } - return local_wg_size; -} - -void resize_var_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - - const int dim = extra_args.at(0); - - std::vector new_sizes = graph->sizes_of(in); - if (!new_sizes.empty()) { - new_sizes.at(normalize(dim, new_sizes.size())) = 1; - } - - graph->virtual_resize(out, new_sizes); -} - -void add_var_buffer_node( - ComputeGraph& graph, - ValueRef in, - const int dim, - bool unbiased, - ValueRef out) { - const int64_t ndim = graph.dim_of(in); - int32_t reduce_dim = normalize(dim, ndim); - reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim); - - // Check that the concat dim is not the reduction dim, if the tensor has a - // batch dim greater than 1 - if (graph.dim_of(in) == 4 && graph.size_at(0, in) > 1) { - VK_CHECK_COND(graph.concat_dim_of(in) != reduce_dim); - VK_CHECK_COND(graph.concat_dim_of(out) != reduce_dim); - } - - std::string kernel_name = "var"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - const uint32_t nworkers_per_group = 4; - - utils::uvec3 global_wg_size = { - graph.size_at(-1, out), - graph.size_at(-2, out), - graph.size_at(-3, out) * graph.size_at(-4, out)}; - - utils::uvec3 local_wg_size{1, 1, 1}; - local_wg_size[reduce_dim] = nworkers_per_group; - - std::vector push_constants; - int32_t unbiased_int = static_cast(unbiased); - push_constants.emplace_back(&unbiased_int, sizeof(unbiased_int)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - var_buffer_global_wg_size, - var_buffer_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - { - graph.sizes_ubo(in), - graph.strides_ubo(in), - graph.sizes_ubo(out), - graph.strides_ubo(out), - }, - // Push Constants - push_constants, - // Specialization Constants - {reduce_dim}, - // Resize Args - {dim}, - // Resizing Logic - resize_var_node)); -} - -void add_var_texture_node( - ComputeGraph& graph, - ValueRef in, - const int dim, - bool unbiased, - ValueRef out) { - const int64_t ndim = graph.dim_of(in); - - int32_t reduce_dim = dim; - reduce_dim = normalize(reduce_dim, ndim); - reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim); - - // Check that the concat dim is not the reduction dim, if the tensor has a - // batch dim greater than 1. 
- if (graph.dim_of(in) == 4 && graph.size_at(0, in) > 1) { - VK_CHECK_COND(graph.concat_dim_of(in) != reduce_dim); - VK_CHECK_COND(graph.concat_dim_of(out) != reduce_dim); - } - - std::string kernel_name = "var"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - // This should match the value of MAX_NTHREADS in the softmax shader. - constexpr uint32_t max_nthreads = 16; - - const uint32_t nworkers_per_group = 4; - const uint32_t ngroups = 4; - VK_CHECK_COND(nworkers_per_group * ngroups <= max_nthreads); - - utils::uvec3 global_wg_size = graph.logical_limits_of(out); - global_wg_size[reduce_dim] = 1; - - utils::uvec3 local_wg_size{1, 1, 1}; - local_wg_size[reduce_dim] = nworkers_per_group; - const int other_dim_1 = (reduce_dim + 1) % 3; - const int other_dim_2 = (reduce_dim + 2) % 3; - int32_t group_dim; - if (global_wg_size[other_dim_1] > global_wg_size[other_dim_2]) { - local_wg_size[other_dim_1] = ngroups; - group_dim = other_dim_1; - } else { - local_wg_size[other_dim_2] = ngroups; - group_dim = other_dim_2; - } - - std::vector push_constants; - int32_t unbiased_int = static_cast(unbiased); - push_constants.emplace_back(&unbiased_int, sizeof(unbiased_int)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - var_texture_global_wg_size, - var_texture_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Shader params buffers - {graph.logical_limits_ubo(in), graph.sizes_ubo(in)}, - // Push Constants - push_constants, - // Specialization Constants - {graph.packed_dim_of(out), reduce_dim, group_dim}, - // Resize Args - {dim}, - // Resizing Logic - resize_var_node)); -} - -void add_var_node( - ComputeGraph& graph, - ValueRef in, - const int dim, - bool unbiased, - ValueRef out) { - bool is_buffer = graph.is_buffer_storage(in) || graph.is_buffer_storage(out); - - if (is_buffer) { - add_var_buffer_node(graph, in, dim, unbiased, out); - } else { - add_var_texture_node(graph, in, dim, unbiased, out); - } -} - -void var(ComputeGraph& graph, const std::vector& args) { - const IntListPtr dims_list = graph.get_int_list(args[1]); - VK_CHECK_COND(dims_list->size() == 1); - bool unbiased = true; - if (args.size() > 2) { - unbiased = graph.get_bool(args[2]); - } - return add_var_node( - graph, args[0], static_cast(dims_list->at(0)), unbiased, args[4]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.var.dim, var); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/View.cpp b/backends/vulkan/runtime/graph/ops/impl/View.cpp deleted file mode 100644 index 8701a6246b0..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/View.cpp +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include - -#include -#include -#include - -namespace vkcompute { - -std::vector compute_out_sizes( - std::vector orig_sizes, - std::vector& view_sizes) { - std::vector out_sizes(view_sizes.begin(), view_sizes.end()); - int64_t numel = 1; - int64_t transferred_numel = 1; - - for (int i = 0; i < orig_sizes.size(); i++) { - numel *= orig_sizes.at(i); - } - for (int i = 0; i < view_sizes.size(); i++) { - if (view_sizes.at(i) > 0) { - transferred_numel *= view_sizes.at(i); - } - } - for (int i = 0; i < out_sizes.size(); i++) { - if (out_sizes.at(i) == -1) { - out_sizes.at(i) = numel / transferred_numel; - } - } - return out_sizes; -} - -void resize_view_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - if (extra_args.at(0) == kDummyValueRef || - graph->val_is_none(extra_args.at(0))) { - const std::vector in_sizes = graph->sizes_of(in); - graph->virtual_resize(out, in_sizes); - } else { - std::vector view_sizes = - graph->extract_int_or_symint_list(extra_args.at(0)); - const std::vector in_sizes = graph->sizes_of(in); - const std::vector out_sizes = - compute_out_sizes(in_sizes, view_sizes); - graph->virtual_resize(out, out_sizes); - } -} - -void add_view_node( - ComputeGraph& graph, - ValueRef in, - ValueRef sizes, - ValueRef out) { - std::string kernel_name = "view"; - kernel_name.reserve(kShaderNameReserve); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::MemoryAccessType::WRITE}, - {in, vkapi::MemoryAccessType::READ}}, - // Parameter Buffers - {}, - // Push Constants - {{graph.sizes_pc_of(out), graph.sizes_pc_of(in)}}, - // Specialization Constants - {graph.packed_dim_of(in), graph.packed_dim_of(out)}, - // Resize Args - {sizes}, - // Resizing Logic - resize_view_node)); -} - -void add_view_copy_buffer_node( - ComputeGraph& graph, - ValueRef in, - ValueRef out, - const std::vector& resize_args, - const ExecuteNode::ResizeFunction& resize_fn) { - std::string kernel_name = "view_buffer"; - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {in, vkapi::kRead}}, - // Parameter Buffers - {graph.buffer_meta_ubo(out), graph.buffer_meta_ubo(in)}, - // Push Constants - {}, - // Specialization Constants - {}, - // Resize Args - resize_args, - // Resizing Logic - resize_fn)); -} - -void view(ComputeGraph& graph, const std::vector& args) { - int idx = 0; - const ValueRef in = args.at(idx++); - const ValueRef sizes = args.at(idx++); - const ValueRef out = args.at(idx++); - - std::vector resize_args = {sizes}; - - if (graph.is_buffer_storage(out)) { - return add_view_copy_buffer_node( - graph, in, out, resize_args, resize_view_node); - } - return add_view_node(graph, in, sizes, out); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.view_copy.default, view); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/View.h b/backends/vulkan/runtime/graph/ops/impl/View.h deleted file mode 100644 index 7a7a8d57742..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/View.h +++ /dev/null @@ 
-1,33 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace vkcompute { - -/* - * Dispatches the view_copy compute shader. This can be used to implement ops - * that preserve the "contiguous" indexes of elements between the input and - * output such as view_copy, squeeze_copy, unsqueeze_copy, etc. - */ -void add_view_copy_buffer_node( - ComputeGraph& graph, - ValueRef in, - ValueRef out, - const std::vector& resize_args, - const ExecuteNode::ResizeFunction& resize_fn); - -void add_view_node( - ComputeGraph& graph, - ValueRef in, - ValueRef sizes, - ValueRef out); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Where.cpp b/backends/vulkan/runtime/graph/ops/impl/Where.cpp deleted file mode 100644 index c1c482d9967..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Where.cpp +++ /dev/null @@ -1,114 +0,0 @@ -// Where.cpp - -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include - -namespace vkcompute { - -void resize_where_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& extra_args) { - (void)extra_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in = args.at(1).refs.at(0); - - const std::vector in_sizes = graph->sizes_of(in); - graph->virtual_resize(out, in_sizes); -} - -void add_where_texture_node( - ComputeGraph& graph, - const ValueRef cond, - const ValueRef self, - const ValueRef other, - const ValueRef out) { - std::string kernel_name = "where"; - - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{cond, self, other}, vkapi::kRead}}, - // Parameter buffers - {graph.logical_limits_ubo(self)}, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out)}, - // Resize Arguments - {}, - // Resizing Logic - resize_where_node)); -} - -void add_where_buffer_node( - ComputeGraph& graph, - const ValueRef cond, - const ValueRef self, - const ValueRef other, - const ValueRef out) { - std::string kernel_name = "where"; - - add_storage_type_suffix(kernel_name, graph.storage_type_of(out)); - add_dtype_suffix(kernel_name, graph.dtype_of(out)); - - vkapi::ParamsBindList ubos = { - graph.numel_ubo(out), - graph.strides_ubo(out), - graph.strides_ubo(cond), - graph.strides_ubo(self), - graph.strides_ubo(other)}; - - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - default_pick_global_wg_size, - default_pick_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{cond, self, other}, vkapi::kRead}}, - // Parameter buffers - ubos, - // Push Constants - {}, - // Specialization Constants - {graph.hashed_layout_of(out)}, - // Resize Arguments - {}, - // Resizing Logic - resize_where_node)); -} - -void where(ComputeGraph& graph, const std::vector& args) { - int args_i = 0; - const ValueRef cond = args[args_i++]; - const ValueRef self 
= args[args_i++]; - const ValueRef other = args[args_i++]; - const ValueRef out = args[args_i++]; - if (graph.is_buffer_storage(out)) { - add_where_buffer_node(graph, cond, self, other, out); - } else { - add_where_texture_node(graph, cond, self, other, out); - } -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.where.self, where); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h deleted file mode 100644 index 5ed07dece38..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace vkcompute { - -/* - * Maps a semantic dimension name to an integer that corresponds to its - * innermost ordering in a 4D tensor in NCHW format. In a way, it is the - * "negative index" associated with a dim. For instance: in a NCHW tensor, Width - * is the innermost dimension, so it corresponds to 1, height is the next - * innermost, so it corresponds to 2, and so on. - */ -enum DimIndex : int32_t { - DIM_LAST = -1, - DIM_2ND_LAST = -2, - DIM_3RD_LAST = -3, - DIM_4TH_LAST = -4, -}; - -constexpr DimIndex kWidth4D = DimIndex::DIM_LAST; -constexpr DimIndex kHeight4D = DimIndex::DIM_2ND_LAST; -constexpr DimIndex kChannel4D = DimIndex::DIM_3RD_LAST; -constexpr DimIndex kBatch4D = DimIndex::DIM_4TH_LAST; - -/* - * Semantic dimension names for a 1D tensor - */ -struct Dim1D { - static constexpr uint32_t Length = 1u; -}; - -/* - * Semantic dimension names for a 2D Convolution kernel. - */ -struct DimConv2DKernel { - static constexpr uint32_t Width = 1u; - static constexpr uint32_t Height = 2u; - static constexpr uint32_t InChannels = 3u; - static constexpr uint32_t OutChannels = 4u; -}; - -/* - * The same as the above, except for a 2D Transposed Convolution kernel. - */ -struct DimTConv2DKernel { - static constexpr uint32_t Width = 1u; - static constexpr uint32_t Height = 2u; - static constexpr uint32_t OutChannels = 3u; - static constexpr uint32_t InChannels = 4u; -}; - -/* - * The functions below safely return the size of the dimension at the N-th - * innermost index. If the dimensionality of the size array is not sufficient - * then 1 will be returned. The structs above are intended to be used with - * these functions. - */ - -inline int32_t dim_at(const std::vector& sizes, DimIndex dim_index) { - const uint32_t dims = sizes.size(); - // Recall that dim_index is a negative index. - return dims < -dim_index - ? 
1 - : utils::safe_downcast(sizes[dims + dim_index]); -} - -template -int32_t dim_at(const std::vector& sizes) { - return dim_at(sizes, DI); -} - -inline std::ostream& operator<<(std::ostream& os, DimIndex dim_index) { - switch (dim_index) { - case kWidth4D: - os << "kWidth4D"; - break; - case kHeight4D: - os << "kHeight4D"; - break; - case kChannel4D: - os << "kChannel4D"; - break; - case kBatch4D: - os << "kBatch4D"; - break; - default: - os << "kDim4DUnknown"; - break; - } - return os; -} -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp deleted file mode 100644 index 2fb0f60b249..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.cpp +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace vkcompute { - -utils::ivec2 make_ivec2_from_list(ComputeGraph& graph, ValueRef vref) { - return utils::make_ivec2(*graph.get_int_list(vref), /*reverse = */ true); -} - -utils::ivec2 make_ivec2_kernel_size( - ComputeGraph& graph, - const ValueRef weight, - const bool kernel_size_only) { - if (kernel_size_only) { - return make_ivec2_from_list(graph, weight); - } else { - const auto weight_sizes = graph.get_tref(weight)->sizes; - return utils::make_ivec2({weight_sizes.at(3), weight_sizes.at(2)}); - } -} - -Kernel2dParams create_kernel2d_params( - ComputeGraph& graph, - const ValueRef weight, - const bool kernel_size_only, - const ValueRef stride, - const ValueRef padding, - const ValueRef dilation) { - return { - make_ivec2_kernel_size(graph, weight, kernel_size_only), - make_ivec2_from_list(graph, stride), - make_ivec2_from_list(graph, padding), - make_ivec2_from_list(graph, dilation), - }; -} - -Kernel2dParams create_kernel2d_params( - ComputeGraph& graph, - const ValueRef kernel_size, - const ValueRef stride, - const ValueRef padding) { - return { - make_ivec2_kernel_size(graph, kernel_size, /*kernel_size_only = */ true), - make_ivec2_from_list(graph, stride), - make_ivec2_from_list(graph, padding), - {}, - }; -} - -int64_t calc_out_size( - const int64_t in_size, - const int64_t kernel_size, - const int64_t stride, - const int64_t padding, - const int64_t dilation, - const bool ceil_mode) { - int64_t c = ceil_mode ? 
stride - 1 : 0; - int64_t out_size = - (in_size + 2 * padding - dilation * (kernel_size - 1) - 1 + c) / stride + - 1; - if (ceil_mode && (out_size - 1) * stride >= in_size + padding) { - --out_size; - } - VK_CHECK_COND(out_size >= 1); - return out_size; -} - -std::vector calc_out_sizes_hw( - const std::vector& in_sizes, - const utils::ivec2& kernel_size, - const utils::ivec2& stride, - const utils::ivec2& padding, - const utils::ivec2& dilation, - const bool ceil_mode) { - const int64_t ndim = in_sizes.size(); - std::vector out_sizes(2); - - // Height - out_sizes.at(0) = calc_out_size( - in_sizes.at(ndim - 2), - kernel_size[1], - stride[1], - padding[1], - dilation[1], - ceil_mode); - // Width - out_sizes.at(1) = calc_out_size( - in_sizes.at(ndim - 1), - kernel_size[0], - stride[0], - padding[0], - dilation[0], - ceil_mode); - - return out_sizes; -} - -int64_t calc_transpose_out_size( - const int64_t in_size, - const int64_t kernel, - const int64_t stride, - const int64_t padding, - const int64_t dilation, - const int64_t output_padding) { - int64_t out_size = (in_size - 1) * stride - 2 * padding + - dilation * (kernel - 1) + output_padding + 1; - VK_CHECK_COND(out_size >= 1); - return out_size; -} - -std::vector calc_transpose_out_sizes_hw( - const std::vector& in_sizes, - const utils::ivec2& kernel_size, - const utils::ivec2& stride, - const utils::ivec2& padding, - const utils::ivec2& dilation, - const utils::ivec2& output_padding) { - const int64_t ndim = in_sizes.size(); - std::vector out_sizes(2); - - // Height - out_sizes.at(0) = calc_transpose_out_size( - in_sizes.at(ndim - 2), - kernel_size[1], - stride[1], - padding[1], - dilation[1], - output_padding[1]); - // Width - out_sizes.at(1) = calc_transpose_out_size( - in_sizes.at(ndim - 1), - kernel_size[0], - stride[0], - padding[0], - dilation[0], - output_padding[0]); - - return out_sizes; -} - -std::vector calc_out_sizes_hw( - ComputeGraph& graph, - const std::vector& in_sizes, - const ValueRef weight, - const bool kernel_size_only, - const std::vector& args, - const bool transposed) { - const auto kernel_size = - make_ivec2_kernel_size(graph, weight, kernel_size_only); - const auto stride = make_ivec2_from_list(graph, args[0]); - const auto padding = make_ivec2_from_list(graph, args[1]); - const auto dilation = args[2] == kDummyValueRef - ? utils::ivec2{1, 1} - : make_ivec2_from_list(graph, args[2]); - - if (transposed) { - const auto output_padding = make_ivec2_from_list(graph, args[3]); - return calc_transpose_out_sizes_hw( - in_sizes, kernel_size, stride, padding, dilation, output_padding); - } else { - const bool ceil_mode = - graph.val_is_bool(args[3]) ? graph.get_bool(args[3]) : false; - - return calc_out_sizes_hw( - in_sizes, kernel_size, stride, padding, dilation, ceil_mode); - } -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h deleted file mode 100644 index 1e8b5b0f7a4..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include - -#include - -#include - -namespace vkcompute { - -struct Kernel1dParams final { - int kernel_size; - int stride; - int padding; - int dilation; - int in_group_size; - int out_group_size; -}; - -struct Kernel2dParams final { - utils::ivec2 kernel_size; - utils::ivec2 stride; - utils::ivec2 padding; - utils::ivec2 dilation; -}; - -Kernel2dParams create_kernel2d_params( - ComputeGraph& graph, - const ValueRef weight, - const bool kernel_size_only, - const ValueRef stride, - const ValueRef padding, - const ValueRef dilation); - -Kernel2dParams create_kernel2d_params( - ComputeGraph& graph, - const ValueRef kernel_size, - const ValueRef stride, - const ValueRef padding); - -int64_t calc_out_size( - const int64_t in_size, - const int64_t kernel_size, - const int64_t stride, - const int64_t padding, - const int64_t dilation, - const bool ceil_mode); - -std::vector calc_out_sizes_hw( - ComputeGraph& graph, - const std::vector& in_sizes, - const ValueRef weight, - const bool kernel_size_only, - const std::vector& args, - const bool transposed = false); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.cpp deleted file mode 100644 index 4cf678a9dcb..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.cpp +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace vkcompute { - -void pack4(const uint8_t* w_ptr, uint8_t* b_ptr, uint32_t N, uint32_t K) { - for (int32_t n = 0; n < N; n++) { - for (int32_t k2 = 0; k2 < K / 2; k2++) { - uint8_t src_val0 = w_ptr[n * K + k2 * 2]; - uint8_t src_val1 = w_ptr[n * K + k2 * 2 + 1]; - b_ptr[n * (K / 2) + k2] = (uint8_t(src_val1) << 4) | uint8_t(src_val0); - } - } -} - -std::vector int4mm_pack_weights( - const std::vector& W_sizes, - const uint8_t* w_ptr) { - const int32_t N = utils::val_at(-1, W_sizes); - const int32_t K = utils::val_at(-2, W_sizes); - - const auto numel = K * N; - std::vector w_ptr_T(numel); - std::vector b_ptr(utils::div_up(numel, 2)); - - // Transpose the weights - for (int32_t k = 0; k < K; k++) { - for (int32_t n = 0; n < N; n++) { - w_ptr_T[n * K + k] = w_ptr[k * N + n]; - } - } - - // Pack two int4s into each int8 - pack4(w_ptr_T.data(), b_ptr.data(), N, K); - - return b_ptr; -} - -std::vector int4mm_dequantize_weights( - const std::vector& W_sizes, - const uint8_t* w_ptr, - const uint32_t group_size, - const float* scales_and_zeros) { - const int64_t N = utils::val_at(-1, W_sizes); - const int64_t K = utils::val_at(-2, W_sizes); - - std::vector w_ptr_deq(K * N); - const int k_groups = K / group_size; - const int zeros_stride = k_groups * N; - - for (int k = 0; k < K; k++) { - for (int n = 0; n < N; n++) { - const int kb = k / group_size; - const int scale_idx = k_groups * n + kb; - const float scale = scales_and_zeros[scale_idx]; - const float zero = - scales_and_zeros[scale_idx + zeros_stride] - scale * 8.0; - w_ptr_deq[k * N + n] = w_ptr[k * N + n] * scale + zero; - } - } - - return w_ptr_deq; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.h deleted file mode 100644 index 4c4cf26d504..00000000000 --- 
a/backends/vulkan/runtime/graph/ops/impl/utils/QPackUtils.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace vkcompute { - -std::vector int4mm_pack_weights( - const std::vector& W_sizes, - const uint8_t* w_ptr); - -std::vector int4mm_dequantize_weights( - const std::vector& W_sizes, - const uint8_t* w_ptr, - const uint32_t group_size, - const float* scales_and_zeros); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/QuantizationConfig.h b/backends/vulkan/runtime/graph/ops/impl/utils/QuantizationConfig.h deleted file mode 100644 index 4bc8c7c3bfc..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/utils/QuantizationConfig.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace vkcompute { - -enum class QuantizationGranularity { - PerChannel, - PerTensor, - PerGroup, - NoQuantization, -}; - -static constexpr QuantizationGranularity kPerChannel = - QuantizationGranularity::PerChannel; -static constexpr QuantizationGranularity kPerTensor = - QuantizationGranularity::PerTensor; -static constexpr QuantizationGranularity kPerGroup = - QuantizationGranularity::PerGroup; -static constexpr QuantizationGranularity kNoQuantization = - QuantizationGranularity::NoQuantization; - -struct QuantizationConfig { - int nbits; - QuantizationGranularity granularity; - std::vector granularity_sizes; - bool is_symmetric; - bool is_dynamic; - - QuantizationConfig() - : nbits(8), - granularity(kPerTensor), - granularity_sizes(), - is_symmetric(true), - is_dynamic(false) {} - - QuantizationConfig( - int nbits_, - QuantizationGranularity granularity_, - const std::vector& granularity_sizes_, - bool is_symmetric_ = true, - bool is_dynamic_ = false) - : nbits(nbits_), - granularity(granularity_), - granularity_sizes(granularity_sizes_), - is_symmetric(is_symmetric_), - is_dynamic(is_dynamic_) {} -}; - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/ScalarUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/ScalarUtils.h deleted file mode 100644 index 270bdd1cd6b..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/utils/ScalarUtils.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include - -#include - -namespace vkcompute { - -template -T extract_scalar(const Value& value) { - if (value.isInt()) { - return static_cast(value.toInt()); - } - if (value.isDouble()) { - return static_cast(value.toDouble()); - } - if (value.isBool()) { - return static_cast(value.toBool()); - } - VK_THROW("Cannot extract scalar from Value with type ", value.type()); -} - -// Helper function to get default quant_min and quant_max based on dtype -// This matches the logic in _get_and_check_qmin_qmax from quant_primitives.py -inline std::pair get_dtype_bounds(vkapi::ScalarType dtype) { - switch (dtype) { - case vkapi::kByte: // uint8 - return {0, 255}; - case vkapi::kChar: // int8 - return {-128, 127}; - case vkapi::kShort: // int16 - return {-(1 << 15), (1 << 15) - 1}; - case vkapi::kInt: // int32 - return {-(1LL << 31), (1LL << 31) - 1}; - default: - // For unsupported types, throw an error instead of assuming int8 - VK_THROW("Unsupported dtype for quantization bounds: ", dtype); - } -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp deleted file mode 100644 index a52572289a4..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace vkcompute { - -// -// Tensor output size calculation functions -// - -std::vector calculate_broadcasted_output_size( - const std::vector& sizes1, - const std::vector& sizes2) { - std::vector out_sizes(std::max(sizes1.size(), sizes2.size())); - - // Match the sizes in reverse because sizes are in NCHW order - for (int i = -1; i >= -out_sizes.size(); --i) { - out_sizes.at(out_sizes.size() + i) = - std::max(utils::val_at(i, sizes1), utils::val_at(i, sizes2)); - } - - return out_sizes; -} - -// -// Tensor property checking functions -// - -bool check_same_packed_dim( - ComputeGraph& graph, - const ValueRef in, - const ValueRef out) { - return graph.packed_dim_of(in) == graph.packed_dim_of(out); -} - -// -// Broadcast flag functions -// - -bool is_packed_dim_broadcasted( - ComputeGraph& graph, - const ValueRef sndr, - const ValueRef rcvr) { - // We assume that the tensors are broadcastable. If values aren't equal at - // some index, then the value of rcvr is 1 and hence should be broadcasted. 
- const std::vector sndr_sizes = graph.sizes_of(sndr); - const std::vector rcvr_sizes = graph.sizes_of(rcvr); - - switch (graph.packed_dim_of(sndr)) { - case WHCN::kChannelsDim: - return utils::val_at(-3, sndr_sizes) > utils::val_at(-3, rcvr_sizes); - case WHCN::kHeightDim: - return utils::val_at(-2, sndr_sizes) > utils::val_at(-2, rcvr_sizes); - case WHCN::kWidthDim: - return utils::val_at(-1, sndr_sizes) > utils::val_at(-1, rcvr_sizes); - default: - VK_THROW("Invalid packed dim"); - } -} - -utils::ivec2 create_broadcast_params( - ComputeGraph& graph, - const ValueRef t1, - const ValueRef t2) { - return utils::make_ivec2( - {is_packed_dim_broadcasted(graph, t2, t1), - is_packed_dim_broadcasted(graph, t1, t2)}); -} - -// -// Work group size calculation functions -// - -utils::uvec3 adaptive_work_group_size(const utils::uvec3& global_work_group) { - utils::uvec3 local_group_size = {4, 4, 4}; - if (global_work_group[2u] == 1) { - if (global_work_group[1u] < 8) { - local_group_size[0u] = 16; - local_group_size[1u] = 4; - local_group_size[2u] = 1; - } else { - local_group_size[0u] = 8; - local_group_size[1u] = 8; - local_group_size[2u] = 1; - } - } - return local_group_size; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h b/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h deleted file mode 100644 index b62bf661995..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -namespace vkcompute { - -// -// Tensor output size calculation functions -// - -std::vector calculate_broadcasted_output_size( - const std::vector& sizes1, - const std::vector& sizes2); - -// -// Tensor property checking functions -// - -bool check_same_packed_dim( - ComputeGraph& graph, - const ValueRef in, - const ValueRef out); - -// -// Broadcast flag functions -// - -bool is_packed_dim_broadcasted( - ComputeGraph& graph, - const ValueRef sndr, - const ValueRef rcvr); - -utils::ivec2 create_broadcast_params( - ComputeGraph& graph, - const ValueRef t1, - const ValueRef t2); - -// -// Work group size calculation functions -// - -utils::uvec3 adaptive_work_group_size(const utils::uvec3& global_work_group); - -// -// Tensor dim utilities -// - -template < - typename T, - typename std::enable_if< - std::is_integral::value && std::is_signed::value, - int>::type = 0> -T normalize(const T& nchw_dim, const int64_t ndim) { - return (nchw_dim % ndim + ndim) % ndim; -} - -template < - typename T, - typename std::enable_if< - std::is_integral::value && std::is_signed::value, - int>::type = 0> -T nchw_dim_to_whcn_dim(const T& nchw_dim, const int64_t ndim) { - return ndim - 1 - nchw_dim; -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp deleted file mode 100644 index e829f355fe2..00000000000 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -namespace vkcompute { - -uint32_t bind_values_to_descriptor_set( - ComputeGraph* graph, - const std::vector& args, - vkapi::PipelineBarrier& pipeline_barrier, - vkapi::DescriptorSet& descriptor_set, - const uint32_t base_idx) { - uint32_t idx = base_idx; - for (auto& arg : args) { - for (auto& ref : arg.refs) { - graph->bind_value_to_descriptor_set( - ref, pipeline_barrier, arg.access, descriptor_set, idx++); - } - } - return idx; -} - -uint32_t bind_params_to_descriptor_set( - const vkapi::ParamsBindList& params, - vkapi::DescriptorSet& descriptor_set, - const uint32_t base_idx) { - uint32_t idx = base_idx; - for (auto& param : params.bind_infos) { - descriptor_set.bind(idx++, param); - } - return idx; -} - -void bind_staging_to_descriptor_set( - api::StagingBuffer& staging, - vkapi::DescriptorSet& descriptor_set, - const uint32_t idx) { - descriptor_set.bind(idx, staging.buffer()); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h b/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h deleted file mode 100644 index 307bec154f3..00000000000 --- a/backends/vulkan/runtime/graph/ops/utils/BindingUtils.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace vkcompute { - -// -// For objects in the graph -// - -uint32_t bind_values_to_descriptor_set( - ComputeGraph* graph, - const std::vector& args, - vkapi::PipelineBarrier& pipeline_barrier, - vkapi::DescriptorSet& descriptor_set, - const uint32_t base_idx); - -// -// For objects NOT in the graph -// - -uint32_t bind_params_to_descriptor_set( - const vkapi::ParamsBindList& params, - vkapi::DescriptorSet& descriptor_set, - const uint32_t base_idx); - -void bind_staging_to_descriptor_set( - api::StagingBuffer& staging, - vkapi::DescriptorSet& descriptor_set, - const uint32_t idx); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp deleted file mode 100644 index 231e6d0c7f6..00000000000 --- a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -namespace vkcompute { - -void add_storage_type_suffix( - std::string& kernel_name, - const utils::StorageType storage_type) { - switch (storage_type) { - case utils::kBuffer: - kernel_name += "_buffer"; - break; - case utils::kTexture3D: - kernel_name += "_texture3d"; - break; - case utils::kTexture2D: - kernel_name += "_texture2d"; - break; - } -} - -void add_dtype_suffix(std::string& kernel_name, const vkapi::ScalarType dtype) { - switch (dtype) { - case vkapi::kDouble: - kernel_name += "_double"; - break; - case vkapi::kFloat: - kernel_name += "_float"; - break; - case vkapi::kHalf: - kernel_name += "_half"; - break; - case vkapi::kChar: - case vkapi::kQInt8: - kernel_name += "_int8"; - break; - case vkapi::kByte: - case vkapi::kBool: - case vkapi::kQUInt8: - kernel_name += "_uint8"; - break; - case vkapi::kShort: - kernel_name += "_int16"; - break; - case vkapi::kUInt16: - kernel_name += "_uint16"; - break; - case vkapi::kInt: - kernel_name += "_int32"; - break; - case vkapi::kUInt: - kernel_name += "_uint32"; - break; - case vkapi::kLong: - kernel_name += "_int64"; - break; - case vkapi::kUInt64: - kernel_name += "_uint64"; - break; - default: - break; - } -} - -void add_packed_dim_suffix(std::string& kernel_name, const int32_t packed_dim) { - switch (packed_dim) { - case WHCN::kWidthDim: - kernel_name += "_W_packed"; - break; - case WHCN::kHeightDim: - kernel_name += "_H_packed"; - break; - case WHCN::kChannelsDim: - kernel_name += "_C_packed"; - break; - default: - VK_THROW("Invalid packed dim!"); - } -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h b/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h deleted file mode 100644 index 4a2fddb5cf2..00000000000 --- a/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -namespace vkcompute { - -constexpr size_t kShaderNameReserve = 64u; - -void add_storage_type_suffix( - std::string& kernel_name, - const utils::StorageType storage_type); - -void add_dtype_suffix(std::string& kernel_name, const vkapi::ScalarType dtype); - -void add_ndim_suffix(std::string& kernel_name, const size_t ndim); - -void add_packed_dim_suffix(std::string& kernel_name, const int32_t packed_dim); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp deleted file mode 100644 index c90bfa402bb..00000000000 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -// @lint-ignore-every CLANGTIDY facebook-security-vulnerable-memcpy - -#include -#include - -#include - -namespace vkcompute { - -bool is_bitw8(vkapi::ScalarType dtype) { - return dtype == vkapi::kByte || dtype == vkapi::kChar || - dtype == vkapi::kQInt8 || dtype == vkapi::kQUInt8; -} - -vkapi::ShaderInfo get_nchw_to_tensor_shader( - ComputeGraph& graph, - const ValueRef dst, - bool int8_buffer_enabled, - bool push_constant_variant) { - std::string kernel_name; - kernel_name.reserve(kShaderNameReserve); - - const vkapi::ScalarType dst_dtype = graph.dtype_of(dst); - const utils::StorageType dst_storage_type = graph.storage_type_of(dst); - - if (is_bitw8(dst_dtype) && dst_storage_type != utils::kBuffer && - !int8_buffer_enabled) { - kernel_name = "nchw_to_bitw8_image_nobitw8buffer"; - if (!push_constant_variant) { - kernel_name += "_no_pc"; - } - add_storage_type_suffix(kernel_name, dst_storage_type); - add_dtype_suffix(kernel_name, dst_dtype); - return VK_KERNEL_FROM_STR(kernel_name); - } - - if (dst_storage_type == utils::kBuffer) { - kernel_name = "nchw_to_buffer"; - add_dtype_suffix(kernel_name, dst_dtype); - return VK_KERNEL_FROM_STR(kernel_name); - } - - kernel_name = "nchw_to_image"; - if (!push_constant_variant) { - kernel_name += "_no_pc"; - } - add_storage_type_suffix(kernel_name, dst_storage_type); - add_dtype_suffix(kernel_name, dst_dtype); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -vkapi::ShaderInfo get_tensor_to_nchw_shader( - ComputeGraph& graph, - const ValueRef src, - bool int8_buffer_enabled, - bool push_constant_variant) { - std::string kernel_name; - kernel_name.reserve(kShaderNameReserve); - - const vkapi::ScalarType src_dtype = graph.dtype_of(src); - const utils::StorageType src_storage_type = graph.storage_type_of(src); - - if (is_bitw8(src_dtype) && src_storage_type != utils::kBuffer && - !int8_buffer_enabled) { - kernel_name = "bitw8_image_to_nchw_nobitw8buffer"; - if (!push_constant_variant) { - kernel_name += "_no_pc"; - } - add_storage_type_suffix(kernel_name, src_storage_type); - add_dtype_suffix(kernel_name, src_dtype); - return VK_KERNEL_FROM_STR(kernel_name); - } - - if (src_storage_type == utils::kBuffer) { - kernel_name = "buffer_to_nchw"; - add_dtype_suffix(kernel_name, src_dtype); - return VK_KERNEL_FROM_STR(kernel_name); - } - - kernel_name = "image_to_nchw"; - if (!push_constant_variant) { - kernel_name += "_no_pc"; - } - add_storage_type_suffix(kernel_name, src_storage_type); - add_dtype_suffix(kernel_name, src_dtype); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h deleted file mode 100644 index 71c92b833b7..00000000000 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
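For reference, a concrete reading of the branching above: with the default arguments, a float tensor stored as a 3D texture resolves to the kernel named `nchw_to_image_texture3d_float`, while a half-precision tensor with buffer storage resolves to `nchw_to_buffer_half` (the buffer path takes no storage suffix). A brief usage sketch, with `graph` and `dst` assumed to be the caller's `ComputeGraph` and staged `ValueRef`:

```
// Illustrative sketch, not part of the deleted sources.
vkcompute::vkapi::ShaderInfo shader =
    vkcompute::get_nchw_to_tensor_shader(graph, dst);
// Resolves to e.g. "nchw_to_image_texture3d_float" for a float texture3d
// destination, or "nchw_to_buffer_half" for a half buffer destination.
```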
- */ - -#pragma once - -#include - -namespace vkcompute { - -vkapi::ShaderInfo get_nchw_to_tensor_shader( - ComputeGraph& graph, - const ValueRef dst, - bool int8_buffer_enabled = true, - bool push_constant_variant = true); -vkapi::ShaderInfo get_tensor_to_nchw_shader( - ComputeGraph& graph, - const ValueRef src, - bool int8_buffer_enabled = true, - bool push_constant_variant = true); - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/utils/MacroUtils.h b/backends/vulkan/runtime/utils/MacroUtils.h deleted file mode 100644 index a182f9046b7..00000000000 --- a/backends/vulkan/runtime/utils/MacroUtils.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// Suppress an unused variable. Copied from [[maybe_unused]] -#if defined(_MSC_VER) && !defined(__clang__) -#define VK_UNUSED __pragma(warning(suppress : 4100 4101)) -#else -#define VK_UNUSED __attribute__((__unused__)) -#endif //_MSC_VER diff --git a/backends/vulkan/runtime/utils/StorageUtils.h b/backends/vulkan/runtime/utils/StorageUtils.h deleted file mode 100644 index 20addf88c53..00000000000 --- a/backends/vulkan/runtime/utils/StorageUtils.h +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace vkcompute { - -// Convenience constexpr to attach semantic names to WHCN dimension index -namespace WHCN { - -constexpr int32_t kWidthDim = 0; -constexpr int32_t kHeightDim = 1; -constexpr int32_t kChannelsDim = 2; - -} // namespace WHCN - -namespace utils { - -// -// GPU Storage Options -// - -/** - * The enum below is used to describe what type of GPU memory will be used to - * store a particular tensor's data. - * - * BUFFER means that a SSBO (Shader Storage Buffer Object) will be used. - * TEXTURE_3D means that a 3-dimensional image texture will be used. - * TEXTURE_2D means that a 2-dimensional image texture will be used. - * - * UNKNOWN is not expected to be used. - */ -enum class StorageType : uint8_t { - BUFFER, - TEXTURE_3D, - TEXTURE_2D, -}; - -static constexpr StorageType kBuffer = StorageType::BUFFER; -static constexpr StorageType kTexture3D = StorageType::TEXTURE_3D; -static constexpr StorageType kTexture2D = StorageType::TEXTURE_2D; - -/* - * A tensor's memory layout is defined in one of two ways: - * - * 1. If it's a buffer backed tensor, the memory layout is defined by its - * `dim_order`, and by extension its `strides`. - * 2. If it's a texture backed tensor, the memory layout is defined by the - * combination of its `axis_map` and its `packed_dim`. - * - * Providing explicit memory layout metadata upon tensor construction is not - * very convenient from an API perspective, so the `GPUMemoryLayout` serves as - * an abstraction that is used to determine how to initialize a tensor's layout - * metadata based on the developer's intent. A `GPUMemoryLayout` is provided to - * the constructor of `vTensor`, which will use it to determine how to set its - * `dim_order` if it's a buffer backed tensor, or how to set its `axis_map` and - * `packed_dim` if it's a texture backed tensor. 
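As a concrete illustration of the abstraction described above (an editorial aside, not part of the original header comment): requesting a channels-packed layout means the channels dim becomes the packed dim for texture storage, or the last dim in the dim order for buffer storage.

```
// Illustrative sketch, not part of the deleted sources. The explicit template
// argument to to_packed_dim (defined later in this header) is assumed to be
// the integral return type.
using namespace vkcompute;
const utils::GPUMemoryLayout layout = utils::kChannelsPacked;
const int32_t packed_dim = utils::to_packed_dim<int32_t>(layout);
// packed_dim == WHCN::kChannelsDim == 2; for texture storage the axis map
// defaults to {0, 1, 2, 2}, per the enum documentation below.
```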
- * - * Note that GPUMemoryLayout is not stored as a tensor property, as it does not - * have any meaning after the vTensor is constructed. After construction, - * methods such as `virtual_transpose()` may be used to modify the tensor's - * layout metadata that cannot be represented by any `GPUMemoryLayout` entry. - * Nonetheless, a "best guess" of the closest memory layout can be produced via - * the `estimate_memory_layout()` API of `vTensor`. - * - * Currently, only 3 memory layouts are provided, but more will be added in the - * future that will enable different functionality such as minimizing texture - * memory footprint. - */ -enum class GPUMemoryLayout : uint8_t { - /* - * The below memory layouts will produce a `vTensor` with the following - * properties: - * - * 1. For buffer backed tensors, the `dim_order` will be the same as a - * contiguous dim order, but with the specified dim last in the dim order. - * 2. For texture backed tensors, the packed dim will be the specified dim. - * The axis map will be `{0, 1, 2, 2}`. - */ - TENSOR_WIDTH_PACKED = 0u, - TENSOR_HEIGHT_PACKED = 1u, - TENSOR_CHANNELS_PACKED = 2u, -}; - -static constexpr GPUMemoryLayout kWidthPacked = - GPUMemoryLayout::TENSOR_WIDTH_PACKED; - -static constexpr GPUMemoryLayout kHeightPacked = - GPUMemoryLayout::TENSOR_HEIGHT_PACKED; - -static constexpr GPUMemoryLayout kChannelsPacked = - GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - -template -T to_packed_dim(const GPUMemoryLayout layout) { - switch (layout) { - case kWidthPacked: - return 0; - case kHeightPacked: - return 1; - case kChannelsPacked: - return 2; - }; - // Should be unreachable - return 0; -} - -inline std::ostream& operator<<( - std::ostream& os, - const StorageType storage_type) { - switch (storage_type) { - case kBuffer: - os << "BUFFER"; - break; - case kTexture3D: - os << "TEXTURE_3D"; - break; - case kTexture2D: - os << "TEXTURE_2D"; - break; - } - return os; -} - -inline std::ostream& operator<<( - std::ostream& os, - const GPUMemoryLayout layout) { - switch (layout) { - case kWidthPacked: - os << "TENSOR_WIDTH_PACKED"; - break; - case kHeightPacked: - os << "TENSOR_HEIGHT_PACKED"; - break; - case kChannelsPacked: - os << "TENSOR_CHANNELS_PACKED"; - break; - } - return os; -} - -enum class AxisMapLayout : uint8_t { - DEFAULT = 0u, - OPTIMIZED = 1u, -}; - -constexpr AxisMapLayout kDefaultAxisMap = AxisMapLayout::DEFAULT; - -constexpr AxisMapLayout kOptimizedAxisMap = AxisMapLayout::OPTIMIZED; - -} // namespace utils -} // namespace vkcompute diff --git a/backends/vulkan/runtime/utils/StringUtils.h b/backends/vulkan/runtime/utils/StringUtils.h deleted file mode 100644 index 986b58c3303..00000000000 --- a/backends/vulkan/runtime/utils/StringUtils.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once -// @lint-ignore-every CLANGTIDY facebook-hte-LocalUncheckedArrayBounds - -#include -#include -#include - -namespace vkcompute { -namespace utils { - -namespace detail { - -struct CompileTimeEmptyString { - operator const std::string&() const { - static const std::string empty_string_literal; - return empty_string_literal; - } - operator const char*() const { - return ""; - } -}; - -template -struct CanonicalizeStrTypes { - using type = const T&; -}; - -template -struct CanonicalizeStrTypes { - using type = const char*; -}; - -inline std::ostream& _str(std::ostream& ss) { - return ss; -} - -template -inline std::ostream& _str(std::ostream& ss, const T& t) { - ss << t; - return ss; -} - -template <> -inline std::ostream& _str( - std::ostream& ss, - const CompileTimeEmptyString&) { - return ss; -} - -template -inline std::ostream& _str(std::ostream& ss, const T& t, const Args&... args) { - return _str(_str(ss, t), args...); -} - -template -struct _str_wrapper final { - static std::string call(const Args&... args) { - std::ostringstream ss; - _str(ss, args...); - return ss.str(); - } -}; - -template <> -struct _str_wrapper<> final { - static CompileTimeEmptyString call() { - return CompileTimeEmptyString(); - } -}; - -} // namespace detail - -template -inline std::string concat_str(const Args&... args) { - return detail::_str_wrapper< - typename detail::CanonicalizeStrTypes::type...>::call(args...); -} - -} // namespace utils -} // namespace vkcompute diff --git a/backends/vulkan/runtime/utils/VecUtils.h b/backends/vulkan/runtime/utils/VecUtils.h deleted file mode 100644 index d84eb54d2b9..00000000000 --- a/backends/vulkan/runtime/utils/VecUtils.h +++ /dev/null @@ -1,556 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
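The template machinery above exists so that `concat_str` can stream-format heterogeneous arguments into a single `std::string` without manual conversions; for example (with `ndim` as an assumed placeholder variable):

```
// Illustrative sketch, not part of the deleted sources.
const std::string msg =
    vkcompute::utils::concat_str("expected 4 dims, got ", ndim);
```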
- */ - -#pragma once - -#include - -#include - -#include -#include -#include -#include - -namespace vkcompute { -namespace utils { - -// -// Hashing -// - -/** - * hash_combine is taken from c10/util/hash.h, which in turn is based on - * implementation from Boost - */ -inline size_t hash_combine(size_t seed, size_t value) { - return seed ^ (value + 0x9e3779b9 + (seed << 6u) + (seed >> 2u)); -} - -// -// Alignment -// - -template -inline constexpr Type align_down(const Type& number, const Type& multiple) { - return (number / multiple) * multiple; -} - -template -inline constexpr Type align_up(const Type& number, const Type& multiple) { - return align_down(number + multiple - 1, multiple); -} - -template -inline constexpr Type align_up_4(const Type& numerator) { - return (numerator + 3) & -4; -} - -template -inline constexpr Type div_up(const Type& numerator, const Type& denominator) { - return (numerator + denominator - 1) / denominator; -} - -template -inline constexpr Type div_up_4(const Type& numerator) { - return (numerator + 3) / 4; -} - -// -// Casting Utilities -// - -namespace detail { - -/* - * x cannot be less than 0 if x is unsigned - */ -template -static inline constexpr bool is_negative( - const T& /*x*/, - std::true_type /*is_unsigned*/) { - return false; -} - -/* - * check if x is less than 0 if x is signed - */ -template -static inline constexpr bool is_negative( - const T& x, - std::false_type /*is_unsigned*/) { - return x < T(0); -} - -/* - * Returns true if x < 0 - */ -template -inline constexpr bool is_negative(const T& x) { - return is_negative(x, std::is_unsigned()); -} - -/* - * Returns true if x < lowest(Limit); standard comparison - */ -template -static inline constexpr bool less_than_lowest( - const T& x, - std::false_type /*limit_is_unsigned*/, - std::false_type /*x_is_unsigned*/) { - return x < std::numeric_limits::lowest(); -} - -/* - * Limit can contained negative values, but x cannot; return false - */ -template -static inline constexpr bool less_than_lowest( - const T& /*x*/, - std::false_type /*limit_is_unsigned*/, - std::true_type /*x_is_unsigned*/) { - return false; -} - -/* - * Limit cannot contained negative values, but x can; check if x is negative - */ -template -static inline constexpr bool less_than_lowest( - const T& x, - std::true_type /*limit_is_unsigned*/, - std::false_type /*x_is_unsigned*/) { - return x < T(0); -} - -/* - * Both x and Limit cannot be negative; return false - */ -template -static inline constexpr bool less_than_lowest( - const T& /*x*/, - std::true_type /*limit_is_unsigned*/, - std::true_type /*x_is_unsigned*/) { - return false; -} - -/* - * Returns true if x is less than the lowest value of type T - */ -template -inline constexpr bool less_than_lowest(const T& x) { - return less_than_lowest( - x, std::is_unsigned(), std::is_unsigned()); -} - -// Suppress sign compare warning when compiling with GCC -// as later does not account for short-circuit rule before -// raising the warning, see https://godbolt.org/z/Tr3Msnz99 -#ifdef __GNUC__ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wsign-compare" -#endif - -/* - * Returns true if x is greater than the greatest value of the type Limit - */ -template -inline constexpr bool greater_than_max(const T& x) { - constexpr bool can_overflow = - std::numeric_limits::digits > std::numeric_limits::digits; - return can_overflow && x > std::numeric_limits::max(); -} - -#ifdef __GNUC__ -#pragma GCC diagnostic pop -#endif - -template -std::enable_if_t< - std::is_integral::value && 
!std::is_same::value, - bool> -overflows(From f) { - using limit = std::numeric_limits; - // Casting from signed to unsigned; allow for negative numbers to wrap using - // two's complement arithmetic. - if (!limit::is_signed && std::numeric_limits::is_signed) { - return greater_than_max(f) || - (is_negative(f) && -static_cast(f) > limit::max()); - } - // standard case, check if f is outside the range of type To - else { - return less_than_lowest(f) || greater_than_max(f); - } -} - -template -std::enable_if_t::value, bool> overflows(From f) { - using limit = std::numeric_limits; - if (limit::has_infinity && std::isinf(static_cast(f))) { - return false; - } - return f < limit::lowest() || f > limit::max(); -} - -template -inline constexpr To safe_downcast(const From& v) { - VK_CHECK_COND(!overflows(v), "Cast failed: out of range!"); - return static_cast(v); -} - -template -inline constexpr bool is_signed_to_unsigned() { - return std::is_signed::value && std::is_unsigned::value; -} - -} // namespace detail - -template < - typename To, - typename From, - std::enable_if_t(), bool> = true> -inline constexpr To safe_downcast(const From& v) { - VK_CHECK_COND(v >= From{}, "Cast failed: negative signed to unsigned!"); - return detail::safe_downcast(v); -} - -template < - typename To, - typename From, - std::enable_if_t(), bool> = true> -inline constexpr To safe_downcast(const From& v) { - return detail::safe_downcast(v); -} - -// -// Vector Types -// - -namespace detail { - -template -struct vec final { - // NOLINTNEXTLINE - Type data[N]; - - vec() = default; - - // Standard constructor with initializer list - vec(std::initializer_list values) { - VK_CHECK_COND(values.size() == N); - std::copy(values.begin(), values.end(), data); - } - - // Conversion constructor from an _integral_ vec type. Note that this is only - // defined if `OtherType` is an integral type to disallow implicit narrowing. - template < - typename OtherType, - typename std::enable_if< - !std::is_same::value && - std::is_integral::value, - int>::type = 0> - /* implicit */ vec(const vec& other) { - for (int i = 0; i < N; ++i) { - data[i] = safe_downcast(other[i]); - } - } - - template < - typename IndexType, - typename = std::enable_if_t::value>> - const Type& operator[](const IndexType& i) const { - VK_CHECK_COND(i >= 0 && i < N, "Index out of bounds!"); - return data[i]; - } - - template < - typename IndexType, - typename = std::enable_if_t::value>> - Type& operator[](const IndexType& i) { - VK_CHECK_COND(i >= 0 && i < N, "Index out of bounds!"); - return data[i]; - } - - bool operator==(const vec& other) const { - for (uint32_t i = 0; i < N; ++i) { - if (data[i] != other.data[i]) { - return false; - } - } - return true; - } - - bool operator!=(const vec& other) const { - return !(*this == other); - } -}; - -} // namespace detail - -template -using ivec = detail::vec; -using ivec2 = ivec<2u>; -using ivec3 = ivec<3u>; -using ivec4 = ivec<4u>; - -template -using uvec = detail::vec; -using uvec2 = uvec<2u>; -using uvec3 = uvec<3u>; -using uvec4 = uvec<4u>; - -template -using vec = detail::vec; -using vec2 = vec<2u>; -using vec3 = vec<3u>; -using vec4 = vec<4u>; - -// uvec3 is the type representing tensor extents. Useful for debugging. 
-inline std::ostream& operator<<(std::ostream& os, const uvec3& v) { - os << "(" << v[0u] << ", " << v[1u] << ", " << v[2u] << ")"; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const ivec3& v) { - os << "(" << v[0u] << ", " << v[1u] << ", " << v[2u] << ")"; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const uvec4& v) { - os << "(" << v[0u] << ", " << v[1u] << ", " << v[2u] << ", " << v[3u] << ")"; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const ivec4& v) { - os << "(" << v[0u] << ", " << v[1u] << ", " << v[2u] << ", " << v[3u] << ")"; - return os; -} - -template -inline detail::vec divup_vec( - const detail::vec& a, - const detail::vec& b) { - detail::vec result; - for (uint32_t i = 0; i < N; ++i) { - result[i] = utils::div_up(a[i], b[i]); - } - return result; -} - -// -// std::vector Handling -// - -/* - * Utility function to perform indexing on an std::vector. Negative indexing - * is allowed. For instance, passing an index of -1 will retrieve the last - * element. If the requested index is out of bounds, then 1u will be returned. - */ -template -inline T val_at(const int64_t index, const std::vector& sizes) { - const int64_t ndim = static_cast(sizes.size()); - if (index >= 0) { - return index >= ndim ? 1 : sizes[index]; - } else { - return ndim + index < 0 ? 1 : sizes[ndim + index]; - } -} - -inline ivec2 make_ivec2( - const std::vector& ints, - bool reverse = false) { - VK_CHECK_COND(ints.size() == 2); - if (reverse) { - return {safe_downcast(ints[1]), safe_downcast(ints[0])}; - } else { - return {safe_downcast(ints[0]), safe_downcast(ints[1])}; - } -} - -inline ivec3 make_ivec3( - const std::vector& ints, - bool reverse = false) { - VK_CHECK_COND(ints.size() == 3); - if (reverse) { - return { - safe_downcast(ints[2]), - safe_downcast(ints[1]), - safe_downcast(ints[0]), - }; - } else { - return { - safe_downcast(ints[0]), - safe_downcast(ints[1]), - safe_downcast(ints[2]), - }; - } -} - -inline ivec4 make_ivec4( - const std::vector& ints, - bool reverse = false) { - VK_CHECK_COND(ints.size() == 4); - if (reverse) { - return { - safe_downcast(ints[3]), - safe_downcast(ints[2]), - safe_downcast(ints[1]), - safe_downcast(ints[0]), - }; - } else { - return { - safe_downcast(ints[0]), - safe_downcast(ints[1]), - safe_downcast(ints[2]), - safe_downcast(ints[3]), - }; - } -} - -inline ivec4 make_ivec4_prepadded1(const std::vector& ints) { - VK_CHECK_COND(ints.size() <= 4); - - ivec4 result = {1, 1, 1, 1}; - size_t base = 4 - ints.size(); - for (size_t i = 0; i < ints.size(); ++i) { - result[i + base] = safe_downcast(ints[i]); - } - - return result; -} - -inline ivec3 make_ivec3(uvec3 ints) { - return { - safe_downcast(ints[0u]), - safe_downcast(ints[1u]), - safe_downcast(ints[2u])}; -} - -inline uvec3 make_uvec3(ivec3 ints) { - return { - safe_downcast(ints[0u]), - safe_downcast(ints[1u]), - safe_downcast(ints[2u])}; -} - -/* - * Given an vector of up to 4 uint64_t representing the sizes of a tensor, - * constructs a uvec4 containing those elements in reverse order. - */ -inline uvec4 make_whcn_uvec4(const std::vector& arr) { - uint32_t w = safe_downcast(val_at(-1, arr)); - uint32_t h = safe_downcast(val_at(-2, arr)); - uint32_t c = safe_downcast(val_at(-3, arr)); - uint32_t n = safe_downcast(val_at(-4, arr)); - - return {w, h, c, n}; -} - -/* - * Given an vector of up to 4 int64_t representing the sizes of a tensor, - * constructs an ivec4 containing those elements in reverse order. 
- */ -inline ivec4 make_whcn_ivec4(const std::vector& arr) { - int32_t w = val_at(-1, arr); - int32_t h = val_at(-2, arr); - int32_t c = val_at(-3, arr); - int32_t n = val_at(-4, arr); - - return {w, h, c, n}; -} - -/* - * Wrapper around std::accumulate that accumulates values of a container of - * integral types into int64_t. Taken from `multiply_integers` in - * - */ -template < - typename C, - std::enable_if_t::value, int> = 0> -inline int64_t multiply_integers(const C& container) { - return std::accumulate( - container.begin(), - container.end(), - static_cast(1), - std::multiplies<>()); -} - -/* - * Product of integer elements referred to by iterators; accumulates into the - * int64_t datatype. Taken from `multiply_integers` in - */ -template < - typename Iter, - std::enable_if_t< - std::is_integral< - typename std::iterator_traits::value_type>::value, - int> = 0> -inline int64_t multiply_integers(Iter begin, Iter end) { - // std::accumulate infers return type from `init` type, so if the `init` type - // is not large enough to hold the result, computation can overflow. We use - // `int64_t` here to avoid this. - return std::accumulate( - begin, end, static_cast(1), std::multiplies<>()); -} - -class WorkgroupSize final { - uint32_t val; - - public: - explicit WorkgroupSize() : val(0) {} - explicit WorkgroupSize(const uint32_t x, const uint32_t y, const uint32_t z) { - // shift numbers by multiple of 11 bits, since each local workgroup axis can - // be 1024 at most and which is 0x400. only z axis can't store 1024, because - // it would overflow uint32_t storage. - if (z == 1024) { - throw std::runtime_error( - "Workgroup size in z axis cannot be 1024 because it would overflow uint32_t storage"); - } - val = x | (y << 11) | (z << 22); - } - - explicit WorkgroupSize(const uvec3& vec) { - // shift numbers by multiple of 11 bits, since each local workgroup axis can - // be 1024 at most and which is 0x400. only z axis can't store 1024, because - // it would overflow uint32_t storage. - if (vec[2u] == 1024) { - throw std::runtime_error( - "Workgroup size in z axis cannot be 1024 because it would overflow uint32_t storage"); - } - val = vec[0u] | (vec[1u] << 11) | (vec[2u] << 22); - } - - explicit inline operator uvec3() const { - return { - val & 0x7ffu, - (val >> 11) & 0x7ffu, - (val >> 22), - }; - } - - explicit inline operator uint32_t() const { - return val; - } - - inline constexpr uint32_t operator[](const int idx) const { - return (val >> (11 * idx)) & 0x7ffu; - } - - // Equality operator - bool operator==(const WorkgroupSize& other) const { - return val == other.val; - } - - // Inequality operator (optional, for completeness) - bool operator!=(const WorkgroupSize& other) const { - return !(*this == other); - } -}; - -} // namespace utils -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Adapter.cpp b/backends/vulkan/runtime/vk_api/Adapter.cpp deleted file mode 100644 index 0e87dde1922..00000000000 --- a/backends/vulkan/runtime/vk_api/Adapter.cpp +++ /dev/null @@ -1,566 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
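For clarity, the bit layout used by `WorkgroupSize` above: each of the three local workgroup dimensions is stored in 11 bits of a single `uint32_t` (x in bits 0-10, y in bits 11-21, z in bits 22-31). A z value of 1024 sets bit 10 of its field, which after the 22-bit shift becomes bit 32 and no longer fits in a `uint32_t`, hence the explicit rejection. A small sketch:

```
// Illustrative sketch, not part of the deleted sources.
using vkcompute::utils::WorkgroupSize;

const WorkgroupSize wgs(64u, 4u, 1u);
const uint32_t packed = static_cast<uint32_t>(wgs); // 64 | (4 << 11) | (1 << 22)
const uint32_t x = wgs[0]; // 64
const uint32_t y = wgs[1]; // 4
const uint32_t z = wgs[2]; // 1
```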
- */ - -// @lint-ignore-every CLANGTIDY clang-diagnostic-missing-field-initializers - -#include - -#include - -namespace vkcompute { -namespace vkapi { - -namespace { - -void find_compute_queues( - const PhysicalDevice& physical_device, - const uint32_t num_queues_to_create, - std::vector& queue_create_infos, - std::vector>& queues_to_get) { - queue_create_infos.reserve(num_queues_to_create); - queues_to_get.reserve(num_queues_to_create); - - uint32_t remaining_queues = num_queues_to_create; - for (uint32_t family_i = 0; family_i < physical_device.queue_families.size(); - ++family_i) { - const VkQueueFamilyProperties& queue_properties = - physical_device.queue_families.at(family_i); - // Check if this family has compute capability - if (queue_properties.queueFlags & VK_QUEUE_COMPUTE_BIT) { - const uint32_t queues_to_init = - std::min(remaining_queues, queue_properties.queueCount); - - const std::vector queue_priorities(queues_to_init, 1.0f); - queue_create_infos.push_back({ - VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - family_i, // queueFamilyIndex - queues_to_init, // queueCount - queue_priorities.data(), // pQueuePriorities - }); - - for (size_t queue_i = 0; queue_i < queues_to_init; ++queue_i) { - // Use this to get the queue handle once device is created - queues_to_get.emplace_back(family_i, queue_i); - } - remaining_queues -= queues_to_init; - } - if (remaining_queues == 0) { - break; - } - } -} - -void populate_queue_info( - const PhysicalDevice& physical_device, - VkDevice logical_device, - const std::vector>& queues_to_get, - std::vector& queues, - std::vector& queue_usage) { - queues.reserve(queues_to_get.size()); - queue_usage.reserve(queues_to_get.size()); - - // Obtain handles for the created queues and initialize queue usage heuristic - - for (const std::pair& queue_idx : queues_to_get) { - VkQueue queue_handle = VK_NULL_HANDLE; - VkQueueFlags flags = - physical_device.queue_families.at(queue_idx.first).queueFlags; - vkGetDeviceQueue( - logical_device, queue_idx.first, queue_idx.second, &queue_handle); - queues.push_back({queue_idx.first, queue_idx.second, flags, queue_handle}); - // Initial usage value - queue_usage.push_back(0); - } -} - -VkDevice create_logical_device( - const PhysicalDevice& physical_device, - const uint32_t num_queues_to_create, - std::vector& queues, - std::vector& queue_usage) { - // Find compute queues up to the requested number of queues - - std::vector queue_create_infos; - std::vector> queues_to_get; - find_compute_queues( - physical_device, num_queues_to_create, queue_create_infos, queues_to_get); - - // Create the VkDevice - std::vector requested_device_extensions{ -#ifdef VK_KHR_portability_subset - VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME, -#endif /* VK_KHR_portability_subset */ -#ifdef VK_ANDROID_external_memory_android_hardware_buffer - VK_ANDROID_EXTERNAL_MEMORY_ANDROID_HARDWARE_BUFFER_EXTENSION_NAME, -#endif /* VK_ANDROID_external_memory_android_hardware_buffer */ -#ifdef VK_KHR_16bit_storage - VK_KHR_16BIT_STORAGE_EXTENSION_NAME, -#endif /* VK_KHR_16bit_storage */ -#ifdef VK_KHR_8bit_storage - VK_KHR_8BIT_STORAGE_EXTENSION_NAME, -#endif /* VK_KHR_8bit_storage */ -#ifdef VK_KHR_shader_float16_int8 - VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, -#endif /* VK_KHR_shader_float16_int8 */ -#ifdef VK_KHR_shader_integer_dot_product - VK_KHR_SHADER_INTEGER_DOT_PRODUCT_EXTENSION_NAME, -#endif /* VK_KHR_shader_integer_dot_product */ -#if defined(VK_KHR_pipeline_executable_properties) && 
defined(VULKAN_DEBUG) - VK_KHR_PIPELINE_EXECUTABLE_PROPERTIES_EXTENSION_NAME, -#endif /* VK_KHR_pipeline_executable_properties */ - }; - - std::vector enabled_device_extensions; - find_requested_device_extensions( - physical_device.handle, - enabled_device_extensions, - requested_device_extensions); - - VkDeviceCreateInfo device_create_info{ - VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - static_cast(queue_create_infos.size()), // queueCreateInfoCount - queue_create_infos.data(), // pQueueCreateInfos - 0u, // enabledLayerCount - nullptr, // ppEnabledLayerNames - static_cast( - enabled_device_extensions.size()), // enabledExtensionCount - enabled_device_extensions.data(), // ppEnabledExtensionNames - nullptr, // pEnabledFeatures - }; - - void* extension_list_top = nullptr; - -#ifdef VK_KHR_16bit_storage - VkPhysicalDevice16BitStorageFeatures shader_16bit_storage{ - physical_device.shader_16bit_storage}; - - shader_16bit_storage.pNext = extension_list_top; - extension_list_top = &shader_16bit_storage; -#endif /* VK_KHR_16bit_storage */ - -#ifdef VK_KHR_8bit_storage - VkPhysicalDevice8BitStorageFeatures shader_8bit_storage{ - physical_device.shader_8bit_storage}; - - shader_8bit_storage.pNext = extension_list_top; - extension_list_top = &shader_8bit_storage; -#endif /* VK_KHR_8bit_storage */ - -#ifdef VK_KHR_shader_float16_int8 - VkPhysicalDeviceShaderFloat16Int8Features shader_float16_int8_types{ - physical_device.shader_float16_int8_types}; - - shader_float16_int8_types.pNext = extension_list_top; - extension_list_top = &shader_float16_int8_types; -#endif /* VK_KHR_shader_float16_int8 */ - -#ifdef VK_KHR_shader_integer_dot_product - VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR - shader_int_dot_product_features{ - physical_device.shader_int_dot_product_features}; - shader_int_dot_product_features.pNext = extension_list_top; - extension_list_top = &shader_int_dot_product_features; -#endif /* VK_KHR_shader_integer_dot_product */ - - device_create_info.pNext = extension_list_top; - - VkDevice handle = nullptr; - VK_CHECK(vkCreateDevice( - physical_device.handle, &device_create_info, nullptr, &handle)); - -#ifdef USE_VULKAN_VOLK - volkLoadDevice(handle); -#endif /* USE_VULKAN_VOLK */ - - populate_queue_info( - physical_device, handle, queues_to_get, queues, queue_usage); - - return handle; -} - -bool test_linear_tiling_3d_image_support(VkDevice device) { - // Test creating a 3D image with linear tiling to see if it is supported. - // According to the Vulkan spec, linear tiling may not be supported for 3D - // images. 
- VkExtent3D image_extents{1u, 1u, 1u}; - const VkImageCreateInfo image_create_info{ - VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - VK_IMAGE_TYPE_3D, // imageType - VK_FORMAT_R32G32B32A32_SFLOAT, // format - image_extents, // extents - 1u, // mipLevels - 1u, // arrayLayers - VK_SAMPLE_COUNT_1_BIT, // samples - VK_IMAGE_TILING_LINEAR, // tiling - VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT, // usage - VK_SHARING_MODE_EXCLUSIVE, // sharingMode - 0u, // queueFamilyIndexCount - nullptr, // pQueueFamilyIndices - VK_IMAGE_LAYOUT_UNDEFINED, // initialLayout - }; - VkImage image = VK_NULL_HANDLE; - VkResult res = vkCreateImage(device, &image_create_info, nullptr, &image); - - if (res == VK_SUCCESS) { - vkDestroyImage(device, image, nullptr); - } - - return res == VK_SUCCESS; -} - -} // namespace - -// -// Adapter -// - -Adapter::Adapter( - VkInstance instance, - PhysicalDevice physical_device, - const uint32_t num_queues, - const std::string& cache_data_path) - : queue_usage_mutex_{}, - physical_device_(std::move(physical_device)), - queues_{}, - queue_usage_{}, - queue_mutexes_{}, - instance_(instance), - device_(create_logical_device( - physical_device_, - num_queues, - queues_, - queue_usage_)), - shader_layout_cache_(device_.handle), - shader_cache_(device_.handle), - pipeline_layout_cache_(device_.handle), - compute_pipeline_cache_(device_.handle, cache_data_path), - sampler_cache_(device_.handle), - vma_(instance_, physical_device_.handle, device_.handle), - linear_tiling_3d_enabled_{ - test_linear_tiling_3d_image_support(device_.handle)}, - owns_device_{true} {} - -Adapter::Adapter( - VkInstance instance, - VkPhysicalDevice physical_device, - VkDevice logical_device, - const uint32_t num_queues, - const std::string& cache_data_path) - : queue_usage_mutex_{}, - physical_device_(physical_device), - queues_{}, - queue_usage_{}, - queue_mutexes_{}, - instance_(instance), - device_(logical_device), - shader_layout_cache_(device_.handle), - shader_cache_(device_.handle), - pipeline_layout_cache_(device_.handle), - compute_pipeline_cache_(device_.handle, cache_data_path), - sampler_cache_(device_.handle), - vma_(instance_, physical_device_.handle, device_.handle), - linear_tiling_3d_enabled_{ - test_linear_tiling_3d_image_support(device_.handle)}, - owns_device_{false} { - std::vector queue_create_infos; - std::vector> queues_to_get; - find_compute_queues( - physical_device_, num_queues, queue_create_infos, queues_to_get); - populate_queue_info( - physical_device_, device_.handle, queues_to_get, queues_, queue_usage_); -} - -Adapter::~Adapter() { - if (!owns_device_) { - device_.handle = VK_NULL_HANDLE; - } -} - -Adapter::Queue Adapter::request_queue() { - // Lock the mutex as multiple threads can request a queue at the same time - std::lock_guard lock(queue_usage_mutex_); - - uint32_t min_usage = UINT32_MAX; - uint32_t min_used_i = 0; - for (size_t i = 0; i < queues_.size(); ++i) { - if (queue_usage_[i] < min_usage) { - min_used_i = i; - min_usage = queue_usage_[i]; - } - } - queue_usage_[min_used_i] += 1; - - return queues_[min_used_i]; -} - -void Adapter::return_queue(Adapter::Queue& compute_queue) { - for (size_t i = 0; i < queues_.size(); ++i) { - if ((queues_[i].family_index == compute_queue.family_index) && - (queues_[i].queue_index == compute_queue.queue_index)) { - std::lock_guard lock(queue_usage_mutex_); - queue_usage_[i] -= 1; - break; - } - } -} - -void Adapter::submit_cmd( - const Adapter::Queue& device_queue, - VkCommandBuffer cmd, - 
VkFence fence, - VkSemaphore wait_semaphore, - VkSemaphore signal_semaphore) { - const VkPipelineStageFlags flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - const bool set_wait_semaphore = wait_semaphore != VK_NULL_HANDLE; - const bool set_signal_semaphore = signal_semaphore != VK_NULL_HANDLE; - const VkSubmitInfo submit_info{ - VK_STRUCTURE_TYPE_SUBMIT_INFO, // sType - nullptr, // pNext - set_wait_semaphore ? 1u : 0u, // waitSemaphoreCount - set_wait_semaphore ? &wait_semaphore : nullptr, // pWaitSemaphores - &flags, // pWaitDstStageMask - 1u, // commandBufferCount - &cmd, // pCommandBuffers - set_signal_semaphore ? 1u : 0u, // signalSemaphoreCount - set_signal_semaphore ? &signal_semaphore : nullptr, // pSignalSemaphores - }; - - std::lock_guard queue_lock( - queue_mutexes_[device_queue.queue_index % NUM_QUEUE_MUTEXES]); - - VK_CHECK(vkQueueSubmit(device_queue.handle, 1u, &submit_info, fence)); -} - -std::string Adapter::stringize() const { - std::stringstream ss; - - VkPhysicalDeviceProperties properties = physical_device_.properties; - uint32_t v_major = VK_VERSION_MAJOR(properties.apiVersion); - uint32_t v_minor = VK_VERSION_MINOR(properties.apiVersion); - std::string device_type = get_device_type_str(properties.deviceType); - VkPhysicalDeviceLimits limits = properties.limits; - - ss << "{" << std::endl; - ss << " Physical Device Info {" << std::endl; - ss << " apiVersion: " << v_major << "." << v_minor << std::endl; - ss << " driverversion: " << properties.driverVersion << std::endl; - ss << " deviceType: " << device_type << std::endl; - ss << " deviceName: " << properties.deviceName << std::endl; - -#define PRINT_BOOL(value, name) \ - ss << " " << std::left << std::setw(36) << #name << value << std::endl; - -#define PRINT_PROP(struct, name) \ - ss << " " << std::left << std::setw(36) << #name << struct.name \ - << std::endl; - -#define PRINT_PROP_VEC3(struct, name) \ - ss << " " << std::left << std::setw(36) << #name << struct.name[0] \ - << "," << struct.name[1] << "," << struct.name[2] << std::endl; - - ss << " Physical Device Limits {" << std::endl; - PRINT_PROP(limits, maxImageDimension1D); - PRINT_PROP(limits, maxImageDimension2D); - PRINT_PROP(limits, maxImageDimension3D); - PRINT_PROP(limits, maxStorageBufferRange); - PRINT_PROP(limits, maxTexelBufferElements); - PRINT_PROP(limits, maxPushConstantsSize); - PRINT_PROP(limits, maxMemoryAllocationCount); - PRINT_PROP(limits, maxSamplerAllocationCount); - PRINT_PROP(limits, maxComputeSharedMemorySize); - PRINT_PROP_VEC3(limits, maxComputeWorkGroupCount); - PRINT_PROP(limits, maxComputeWorkGroupInvocations); - PRINT_PROP_VEC3(limits, maxComputeWorkGroupSize); - ss << " }" << std::endl; - -#ifdef VK_KHR_16bit_storage - ss << " 16bit Storage Features {" << std::endl; - PRINT_PROP(physical_device_.shader_16bit_storage, storageBuffer16BitAccess); - PRINT_PROP( - physical_device_.shader_16bit_storage, - uniformAndStorageBuffer16BitAccess); - PRINT_PROP(physical_device_.shader_16bit_storage, storagePushConstant16); - PRINT_PROP(physical_device_.shader_16bit_storage, storageInputOutput16); - ss << " }" << std::endl; -#endif /* VK_KHR_16bit_storage */ - -#ifdef VK_KHR_8bit_storage - ss << " 8bit Storage Features {" << std::endl; - PRINT_PROP(physical_device_.shader_8bit_storage, storageBuffer8BitAccess); - PRINT_PROP( - physical_device_.shader_8bit_storage, uniformAndStorageBuffer8BitAccess); - PRINT_PROP(physical_device_.shader_8bit_storage, storagePushConstant8); - ss << " }" << std::endl; -#endif /* VK_KHR_8bit_storage */ - - ss << " 
Shader 16bit and 8bit Features {" << std::endl; - PRINT_BOOL(physical_device_.supports_int16_shader_types, shaderInt16) -#ifdef VK_KHR_shader_float16_int8 - PRINT_PROP(physical_device_.shader_float16_int8_types, shaderFloat16); - PRINT_PROP(physical_device_.shader_float16_int8_types, shaderInt8); -#endif /* VK_KHR_shader_float16_int8 */ - ss << " }" << std::endl; - -#ifdef VK_KHR_shader_integer_dot_product - ss << " Shader Integer Dot Product Features {" << std::endl; - PRINT_PROP( - physical_device_.shader_int_dot_product_features, - shaderIntegerDotProduct); - ss << " }" << std::endl; - - ss << " Shader Integer Dot Product Properties {" << std::endl; - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct8BitUnsignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct8BitSignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct8BitMixedSignednessAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct4x8BitPackedUnsignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct4x8BitPackedSignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct4x8BitPackedMixedSignednessAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct16BitUnsignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct16BitSignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct16BitMixedSignednessAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct32BitUnsignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct32BitSignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct32BitMixedSignednessAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct64BitUnsignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct64BitSignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProduct64BitMixedSignednessAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating8BitUnsignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating8BitSignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating16BitUnsignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating16BitSignedAccelerated); - PRINT_PROP( - 
physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating32BitUnsignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating32BitSignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating64BitUnsignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating64BitSignedAccelerated); - PRINT_PROP( - physical_device_.shader_int_dot_product_properties, - integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated); - ss << " }" << std::endl; -#endif /* VK_KHR_shader_integer_dot_product */ - - const VkPhysicalDeviceMemoryProperties& mem_props = - physical_device_.memory_properties; - - ss << " }" << std::endl; - ss << " Memory Info {" << std::endl; - ss << " Memory Types [" << std::endl; - for (size_t i = 0; i < mem_props.memoryTypeCount; ++i) { - ss << " " << " [Heap " << mem_props.memoryTypes[i].heapIndex << "] " - << get_memory_properties_str(mem_props.memoryTypes[i].propertyFlags) - << std::endl; - } - ss << " ]" << std::endl; - ss << " Memory Heaps [" << std::endl; - for (size_t i = 0; i < mem_props.memoryHeapCount; ++i) { - ss << " " << mem_props.memoryHeaps[i].size << std::endl; - } - ss << " ]" << std::endl; - ss << " }" << std::endl; - - ss << " Queue Families {" << std::endl; - for (const VkQueueFamilyProperties& queue_family_props : - physical_device_.queue_families) { - ss << " (" << queue_family_props.queueCount << " Queues) " - << get_queue_family_properties_str(queue_family_props.queueFlags) - << std::endl; - } - ss << " }" << std::endl; - ss << " VkDevice: " << device_.handle << std::endl; - ss << " Compute Queues [" << std::endl; - for (const Adapter::Queue& compute_queue : queues_) { - ss << " Family " << compute_queue.family_index << ", Queue " - << compute_queue.queue_index << ": " << compute_queue.handle - << std::endl; - ; - } - ss << " ]" << std::endl; - ss << "}"; - -#undef PRINT_PROP -#undef PRINT_PROP_VEC3 - - return ss.str(); -} - -std::ostream& operator<<(std::ostream& os, const Adapter& adapter) { - os << adapter.stringize() << std::endl; - return os; -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Adapter.h b/backends/vulkan/runtime/vk_api/Adapter.h deleted file mode 100644 index 6a68b487348..00000000000 --- a/backends/vulkan/runtime/vk_api/Adapter.h +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include -#include - -#include - -#include - -namespace vkcompute { -namespace vkapi { - -// -// A Vulkan Adapter represents a logical device and all its properties. It -// manages all relevant properties of the underlying physical device, a -// handle to the logical device, and a number of compute queues available to -// the device. 
It is primarily responsible for managing the VkDevice handle -// which points to the logical device object on the GPU. -// -// This class is primarily used by the Runtime class, which holds one Adapter -// instance for each physical device visible to the VkInstance. Upon -// construction, this class will populate the physical device properties, but -// will not create the logical device until specifically requested via the -// init_device() function. -// -// init_device() will create the logical device and obtain the VkDevice handle -// for it. It will also create a number of compute queues up to the amount -// requested when the Adapter instance was constructed. -// -// Contexts (which represent one thread of execution) will request a compute -// queue from an Adapter. The Adapter will then select a compute queue to -// assign to the Context, attempting to balance load between all available -// queues. This will allow different Contexts (which typically execute on -// separate threads) to run concurrently. -// - -#define NUM_QUEUE_MUTEXES 4 - -class Adapter final { - public: - explicit Adapter( - VkInstance instance, - PhysicalDevice physical_device, - const uint32_t num_queues, - const std::string& cache_data_path); - - explicit Adapter( - VkInstance instance, - VkPhysicalDevice physical_device, - VkDevice logical_device, - const uint32_t num_queues, - const std::string& cache_data_path); - - Adapter(const Adapter&) = delete; - Adapter& operator=(const Adapter&) = delete; - - Adapter(Adapter&&) = delete; - Adapter& operator=(Adapter&&) = delete; - - ~Adapter(); - - struct Queue { - uint32_t family_index; - uint32_t queue_index; - VkQueueFlags capabilities; - VkQueue handle; - }; - - private: - // Use a mutex to manage queue usage info since - // it can be accessed from multiple threads - std::mutex queue_usage_mutex_; - // Physical Device Info - PhysicalDevice physical_device_; - // Queue Management - std::vector queues_; - std::vector queue_usage_; - std::array queue_mutexes_; - // Handles - VkInstance instance_; - DeviceHandle device_; - // Device-level resource caches - ShaderLayoutCache shader_layout_cache_; - ShaderCache shader_cache_; - PipelineLayoutCache pipeline_layout_cache_; - ComputePipelineCache compute_pipeline_cache_; - // Memory Management - SamplerCache sampler_cache_; - Allocator vma_; - // Miscellaneous - bool linear_tiling_3d_enabled_; - bool owns_device_; - - public: - // Physical Device metadata - - inline VkPhysicalDevice physical_handle() const { - return physical_device_.handle; - } - - inline VkDevice device_handle() const { - return device_.handle; - } - - inline bool has_unified_memory() const { - return physical_device_.has_unified_memory; - } - - inline uint32_t num_compute_queues() const { - return physical_device_.num_compute_queues; - } - - inline bool timestamp_compute_and_graphics() const { - return physical_device_.has_timestamps; - } - - inline float timestamp_period() const { - return physical_device_.timestamp_period; - } - - // Device Identity - inline const std::string& device_name() const { - return physical_device_.device_name; - } - - inline vkapi::DeviceType device_type() const { - return physical_device_.device_type; - } - - // Queue Management - - Queue request_queue(); - void return_queue(Queue&); - - // Caches - - inline ShaderLayoutCache& shader_layout_cache() { - return shader_layout_cache_; - } - - inline ShaderCache& shader_cache() { - return shader_cache_; - } - - inline PipelineLayoutCache& pipeline_layout_cache() { - return 
pipeline_layout_cache_; - } - - inline ComputePipelineCache& compute_pipeline_cache() { - return compute_pipeline_cache_; - } - - // Memory Allocation - - inline SamplerCache& sampler_cache() { - return sampler_cache_; - } - - inline Allocator& vma() { - return vma_; - } - - inline bool linear_tiling_3d_enabled() const { - return linear_tiling_3d_enabled_; - } - - // Physical Device Features - - inline bool supports_16bit_storage_buffers() { -#ifdef VK_KHR_16bit_storage - return physical_device_.shader_16bit_storage.storageBuffer16BitAccess == - VK_TRUE; -#else - return false; -#endif /* VK_KHR_16bit_storage */ - } - - inline bool supports_8bit_storage_buffers() { -#ifdef VK_KHR_8bit_storage - return physical_device_.shader_8bit_storage.storageBuffer8BitAccess == - VK_TRUE; -#else - return false; -#endif /* VK_KHR_8bit_storage */ - } - - inline bool supports_float16_shader_types() { -#ifdef VK_KHR_shader_float16_int8 - return physical_device_.shader_float16_int8_types.shaderFloat16 == VK_TRUE; -#else - return false; -#endif /* VK_KHR_shader_float16_int8 */ - } - - inline bool supports_int8_shader_types() { -#ifdef VK_KHR_shader_float16_int8 - return physical_device_.shader_float16_int8_types.shaderInt8 == VK_TRUE; -#else - return false; -#endif /* VK_KHR_shader_float16_int8 */ - } - - inline bool supports_int8_dot_product() { -#ifdef VK_KHR_shader_integer_dot_product - return physical_device_.shader_int_dot_product_features - .shaderIntegerDotProduct == VK_TRUE; -#else - return false; -#endif /* VK_KHR_shader_integer_dot_product */ - } - - inline bool supports_int16_shader_types() { - return physical_device_.supports_int16_shader_types; - } - - inline bool has_full_float16_buffers_support() { - return supports_16bit_storage_buffers() && supports_float16_shader_types(); - } - - inline bool has_full_int8_buffers_support() { - return supports_8bit_storage_buffers() && supports_int8_shader_types(); - } - - inline size_t min_ubo_alignment() const { - return physical_device_.min_ubo_alignment; - } - - inline uint32_t max_texture2d_dim() const { - return physical_device_.properties.limits.maxImageDimension2D; - } - - inline uint32_t max_texture3d_dim() const { - return physical_device_.properties.limits.maxImageDimension3D; - } - - inline uint32_t max_buffer_numel() const { - return physical_device_.properties.limits.maxStorageBufferRange; - } - - // Command Buffer Submission - - void submit_cmd( - const Queue&, - VkCommandBuffer, - VkFence fence = VK_NULL_HANDLE, - VkSemaphore wait_semaphore = VK_NULL_HANDLE, - VkSemaphore signal_semaphore = VK_NULL_HANDLE); - - std::string stringize() const; - friend std::ostream& operator<<(std::ostream&, const Adapter&); -}; - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Command.cpp b/backends/vulkan/runtime/vk_api/Command.cpp deleted file mode 100644 index 84e1f68dc68..00000000000 --- a/backends/vulkan/runtime/vk_api/Command.cpp +++ /dev/null @@ -1,354 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
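A brief sketch of how a caller (e.g. a Context) uses the queue management API declared above; `adapter`, `cmd_handle`, and `fence_handle` are assumed to be in scope.

```
// Illustrative sketch, not part of the deleted sources. request_queue() picks
// the least-used compute queue so concurrent Contexts spread across queues.
vkcompute::vkapi::Adapter::Queue queue = adapter.request_queue();
adapter.submit_cmd(queue, cmd_handle, fence_handle);
// ... wait on the fence for completion ...
adapter.return_queue(queue); // decrement the usage count for this queue
```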
- */ - -#include -#include - -#include - -namespace vkcompute { -namespace vkapi { - -// -// CommandBuffer -// - -CommandBuffer::CommandBuffer( - VkCommandBuffer handle, - const VkCommandBufferUsageFlags flags) - : handle_(handle), - flags_(flags), - state_(CommandBuffer::State::NEW), - bound_{} {} - -CommandBuffer::CommandBuffer(CommandBuffer&& other) noexcept - : handle_(other.handle_), - flags_(other.flags_), - state_(other.state_), - bound_(other.bound_) { - other.handle_ = VK_NULL_HANDLE; - other.bound_.reset(); -} - -CommandBuffer& CommandBuffer::operator=(CommandBuffer&& other) noexcept { - handle_ = other.handle_; - flags_ = other.flags_; - state_ = other.state_; - bound_ = other.bound_; - - other.handle_ = VK_NULL_HANDLE; - other.bound_.reset(); - other.state_ = CommandBuffer::State::INVALID; - - return *this; -} - -void CommandBuffer::begin() { - VK_CHECK_COND( - state_ == CommandBuffer::State::NEW, - "Vulkan CommandBuffer: called begin() on a command buffer whose state " - "is not NEW."); - - const VkCommandBufferBeginInfo begin_info{ - VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, - nullptr, - flags_, - nullptr, - }; - - VK_CHECK(vkBeginCommandBuffer(handle_, &begin_info)); - state_ = CommandBuffer::State::RECORDING; -} - -void CommandBuffer::end() { - VK_CHECK_COND( - state_ == CommandBuffer::State::RECORDING || - state_ == CommandBuffer::State::SUBMITTED, - "Vulkan CommandBuffer: called end() on a command buffer whose state " - "is not RECORDING or SUBMITTED."); - - if (state_ == CommandBuffer::State::RECORDING) { - VK_CHECK(vkEndCommandBuffer(handle_)); - } - state_ = CommandBuffer::State::READY; -} - -void CommandBuffer::bind_pipeline( - VkPipeline pipeline, - VkPipelineLayout pipeline_layout, - const utils::WorkgroupSize local_workgroup_size) { - VK_CHECK_COND( - state_ == CommandBuffer::State::RECORDING, - "Vulkan CommandBuffer: called bind_pipeline() on a command buffer whose state " - "is not RECORDING."); - - if (pipeline != bound_.pipeline) { - vkCmdBindPipeline(handle_, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); - - bound_.pipeline = pipeline; - } - - bound_.pipeline_layout = pipeline_layout; - bound_.local_workgroup_size = local_workgroup_size; - - state_ = CommandBuffer::State::PIPELINE_BOUND; -} - -void CommandBuffer::bind_descriptors(VkDescriptorSet descriptors) { - VK_CHECK_COND( - state_ == CommandBuffer::State::PIPELINE_BOUND, - "Vulkan CommandBuffer: called bind_descriptors() on a command buffer whose state " - "is not PIPELINE_BOUND."); - - if (descriptors != bound_.descriptors) { - vkCmdBindDescriptorSets( - handle_, // commandBuffer - VK_PIPELINE_BIND_POINT_COMPUTE, // pipelineBindPoint - bound_.pipeline_layout, // layout - 0u, // firstSet - 1u, // descriptorSetCount - &descriptors, // pDescriptorSets - 0u, // dynamicOffsetCount - nullptr); // pDynamicOffsets - } - - bound_.descriptors = descriptors; - - state_ = CommandBuffer::State::DESCRIPTORS_BOUND; -} - -void CommandBuffer::set_push_constants( - VkPipelineLayout pipeline_layout, - const void* push_constants_data, - uint32_t push_constants_size) { - if (push_constants_data != nullptr && push_constants_size > 0) { - vkCmdPushConstants( - handle_, - pipeline_layout, - VK_SHADER_STAGE_COMPUTE_BIT, - 0, - push_constants_size, - push_constants_data); - } -} - -void CommandBuffer::insert_barrier(PipelineBarrier& pipeline_barrier) { - VK_CHECK_COND( - state_ == CommandBuffer::State::DESCRIPTORS_BOUND || - state_ == CommandBuffer::State::RECORDING, - "Vulkan CommandBuffer: called insert_barrier() on a command 
buffer whose state " - "is not DESCRIPTORS_BOUND or RECORDING."); - - if (pipeline_barrier) { - if (!pipeline_barrier.buffer_barrier_handles.empty()) { - pipeline_barrier.buffer_barrier_handles.clear(); - } - for (const BufferMemoryBarrier& memory_barrier : pipeline_barrier.buffers) { - pipeline_barrier.buffer_barrier_handles.push_back(memory_barrier.handle); - } - - if (!pipeline_barrier.image_barrier_handles.empty()) { - pipeline_barrier.image_barrier_handles.clear(); - } - for (const ImageMemoryBarrier& memory_barrier : pipeline_barrier.images) { - pipeline_barrier.image_barrier_handles.push_back(memory_barrier.handle); - } - vkCmdPipelineBarrier( - handle_, // commandBuffer - pipeline_barrier.stage.src, // srcStageMask - pipeline_barrier.stage.dst, // dstStageMask - 0u, // dependencyFlags - 0u, // memoryBarrierCount - nullptr, // pMemoryBarriers - pipeline_barrier.buffers.size(), // bufferMemoryBarrierCount - !pipeline_barrier.buffers.empty() - ? pipeline_barrier.buffer_barrier_handles.data() - : nullptr, // pMemoryBarriers - pipeline_barrier.images.size(), // imageMemoryBarrierCount - !pipeline_barrier.images.empty() - ? pipeline_barrier.image_barrier_handles.data() - : nullptr); // pImageMemoryBarriers - } - - state_ = CommandBuffer::State::BARRIERS_INSERTED; -} - -void CommandBuffer::dispatch(const utils::uvec3& global_workgroup_size) { - VK_CHECK_COND( - state_ == CommandBuffer::State::BARRIERS_INSERTED, - "Vulkan CommandBuffer: called dispatch() on a command buffer whose state " - "is not BARRIERS_INSERTED."); - - vkCmdDispatch( - handle_, - utils::div_up(global_workgroup_size[0u], bound_.local_workgroup_size[0u]), - utils::div_up(global_workgroup_size[1u], bound_.local_workgroup_size[1u]), - utils::div_up( - global_workgroup_size[2u], bound_.local_workgroup_size[2u])); - - state_ = CommandBuffer::State::RECORDING; -} - -void CommandBuffer::blit(vkapi::VulkanImage& src, vkapi::VulkanImage& dst) { - VK_CHECK_COND( - state_ == CommandBuffer::State::BARRIERS_INSERTED, - "Vulkan CommandBuffer: called blit() on a command buffer whose state " - "is not BARRIERS_INSERTED."); - - auto src_extents = src.extents(); - auto dst_extents = dst.extents(); - - VkImageBlit blit{}; - blit.srcOffsets[0] = {0, 0, 0}, - blit.srcOffsets[1] = - {static_cast(src_extents.width), - static_cast(src_extents.height), - static_cast(src_extents.depth)}, - blit.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, - blit.srcSubresource.mipLevel = 0, blit.srcSubresource.baseArrayLayer = 0, - blit.srcSubresource.layerCount = 1, blit.dstOffsets[0] = {0, 0, 0}, - blit.dstOffsets[1] = - {static_cast(dst_extents.width), - static_cast(dst_extents.height), - static_cast(dst_extents.depth)}, - blit.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, - blit.dstSubresource.mipLevel = 0, blit.dstSubresource.baseArrayLayer = 0, - blit.dstSubresource.layerCount = 1, - - vkCmdBlitImage( - handle_, - src.handle(), - src.layout(), - dst.handle(), - dst.layout(), - 1, - &blit, - VK_FILTER_NEAREST); - - state_ = CommandBuffer::State::RECORDING; -} - -void CommandBuffer::write_timestamp(VkQueryPool querypool, const uint32_t idx) - const { - VK_CHECK_COND( - state_ == CommandBuffer::State::RECORDING, - "Vulkan CommandBuffer: called write_timestamp() on a command buffer whose state " - "is not RECORDING."); - - vkCmdWriteTimestamp( - handle_, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, querypool, idx); -} - -void CommandBuffer::reset_querypool( - VkQueryPool querypool, - const uint32_t first_idx, - const uint32_t count) const { - 
VK_CHECK_COND( - state_ == CommandBuffer::State::RECORDING, - "Vulkan CommandBuffer: called reset_querypool() on a command buffer whose state " - "is not RECORDING."); - - vkCmdResetQueryPool(handle_, querypool, first_idx, count); -} - -VkCommandBuffer CommandBuffer::get_submit_handle(const bool final_use) { - VK_CHECK_COND( - state_ == CommandBuffer::State::READY, - "Vulkan CommandBuffer: called begin() on a command buffer whose state " - "is not READY."); - - VkCommandBuffer handle = handle_; - - if (!is_reusable() || final_use) { - invalidate(); - } - state_ = CommandBuffer::State::SUBMITTED; - - return handle; -} - -// -// CommandPool -// - -CommandPool::CommandPool( - VkDevice device, - const uint32_t queue_family_idx, - const CommandPoolConfig& config) - : device_(device), - queue_family_idx_(queue_family_idx), - pool_(VK_NULL_HANDLE), - config_(config), - mutex_{}, - buffers_{}, - in_use_(0u) { - const VkCommandPoolCreateInfo create_info{ - VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, - nullptr, - VK_COMMAND_POOL_CREATE_TRANSIENT_BIT, - queue_family_idx_, - }; - - VK_CHECK(vkCreateCommandPool(device_, &create_info, nullptr, &pool_)); - - // Pre-allocate some command buffers - allocate_new_batch(config_.cmd_pool_initial_size); -} - -CommandPool::~CommandPool() { - if (pool_ == VK_NULL_HANDLE) { - return; - } - vkDestroyCommandPool(device_, pool_, nullptr); -} - -CommandBuffer CommandPool::get_new_cmd(bool reusable) { - std::lock_guard lock(mutex_); - - // No-ops if there are command buffers available - allocate_new_batch(config_.cmd_pool_batch_size); - - VkCommandBuffer handle = buffers_[in_use_]; - - VkCommandBufferUsageFlags cmd_flags = 0u; - if (!reusable) { - cmd_flags |= VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - } - - in_use_++; - return CommandBuffer(handle, cmd_flags); -} - -void CommandPool::flush() { - std::lock_guard lock(mutex_); - VK_CHECK(vkResetCommandPool(device_, pool_, 0u)); - in_use_ = 0u; -} - -void CommandPool::allocate_new_batch(const uint32_t count) { - // No-ops if there are still command buffers available - if (in_use_ < buffers_.size()) { - return; - } - - buffers_.resize(buffers_.size() + count); - - const VkCommandBufferAllocateInfo allocate_info{ - VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, // sType - nullptr, // pNext - pool_, // commandPool - VK_COMMAND_BUFFER_LEVEL_PRIMARY, // level - count, // commandBufferCount - }; - - VK_CHECK(vkAllocateCommandBuffers( - device_, &allocate_info, buffers_.data() + in_use_)); -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Command.h b/backends/vulkan/runtime/vk_api/Command.h deleted file mode 100644 index ff1e5934a5c..00000000000 --- a/backends/vulkan/runtime/vk_api/Command.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
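The CommandBuffer and CommandPool implementation above enforces a strict state machine (NEW → RECORDING → PIPELINE_BOUND → DESCRIPTORS_BOUND → BARRIERS_INSERTED → back to RECORDING, then READY and SUBMITTED). A minimal sketch of driving one compute dispatch through that sequence follows; the pool, pipeline, layout, descriptor set, workgroup sizes, and include path are assumptions for illustration only, not part of the deleted sources.

```
// Illustrative sketch: one compute dispatch recorded through the CommandBuffer
// state machine. All handles are assumed to have been created elsewhere; the
// include path mirrors the file layout shown in this diff.
#include <executorch/backends/vulkan/runtime/vk_api/Command.h>

using namespace vkcompute;

void record_one_dispatch(
    vkapi::CommandPool& pool,
    VkPipeline pipeline,
    VkPipelineLayout pipeline_layout,
    VkDescriptorSet descriptor_set) {
  vkapi::CommandBuffer cmd = pool.get_new_cmd(); // one-time-submit by default
  cmd.begin();                                   // NEW -> RECORDING

  cmd.bind_pipeline(
      pipeline, pipeline_layout, utils::WorkgroupSize{8u, 8u, 1u});
  cmd.bind_descriptors(descriptor_set);

  // Even with nothing to synchronize, an (empty) barrier advances the state
  // machine to BARRIERS_INSERTED, which dispatch() requires.
  vkapi::PipelineBarrier barrier{};
  cmd.insert_barrier(barrier);

  cmd.dispatch(utils::uvec3{64u, 64u, 1u}); // divided by the local workgroup size
  cmd.end();                                // RECORDING -> READY

  // READY -> SUBMITTED; the returned VkCommandBuffer can be queue-submitted.
  VkCommandBuffer handle = cmd.get_submit_handle();
  (void)handle;
}
```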
- */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include -#include -#include - -#include -#include - -namespace vkcompute { -namespace vkapi { - -class CommandBuffer final { - public: - explicit CommandBuffer(VkCommandBuffer, const VkCommandBufferUsageFlags); - - CommandBuffer(const CommandBuffer&) = delete; - CommandBuffer& operator=(const CommandBuffer&) = delete; - - CommandBuffer(CommandBuffer&&) noexcept; - CommandBuffer& operator=(CommandBuffer&&) noexcept; - - ~CommandBuffer() = default; - - // The lifecycle of a command buffer is as follows: - enum State { - INVALID, // Used to indicate the command buffer is moved from - NEW, // Set during constructor - RECORDING, // Set during call to begin() and dispatch() - PIPELINE_BOUND, // Set during call to bind_pipeline() - DESCRIPTORS_BOUND, // Set during call to bind_descriptors() - BARRIERS_INSERTED, // Set during call to insert_barrier() - READY, // Set during call to end() - SUBMITTED, // Set during call to get_submit_handle() - }; - - struct Bound { - VkPipeline pipeline; - VkPipelineLayout pipeline_layout; - utils::WorkgroupSize local_workgroup_size; - VkDescriptorSet descriptors; - - explicit Bound() - : pipeline{VK_NULL_HANDLE}, - pipeline_layout{VK_NULL_HANDLE}, - local_workgroup_size{0u, 0u, 0u}, - descriptors{VK_NULL_HANDLE} {} - - inline void reset() { - pipeline = VK_NULL_HANDLE; - pipeline_layout = VK_NULL_HANDLE; - local_workgroup_size = utils::WorkgroupSize{0u, 0u, 0u}; - descriptors = VK_NULL_HANDLE; - } - }; - - private: - VkCommandBuffer handle_; - VkCommandBufferUsageFlags flags_; - State state_; - Bound bound_; - - public: - inline bool is_reusable() { - return !(flags_ & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT); - } - - inline void invalidate() { - handle_ = VK_NULL_HANDLE; - bound_.reset(); - } - - void begin(); - void end(); - - void bind_pipeline(VkPipeline, VkPipelineLayout, const utils::WorkgroupSize); - void bind_descriptors(VkDescriptorSet); - void set_push_constants(VkPipelineLayout, const void*, uint32_t); - - void insert_barrier(PipelineBarrier& pipeline_barrier); - void dispatch(const utils::uvec3&); - void blit(vkapi::VulkanImage& src, vkapi::VulkanImage& dst); - - void write_timestamp(VkQueryPool, const uint32_t) const; - void reset_querypool(VkQueryPool, const uint32_t, const uint32_t) const; - - VkCommandBuffer get_submit_handle(const bool final_use = false); - - inline operator bool() const { - return handle_ != VK_NULL_HANDLE; - } -}; - -struct CommandPoolConfig final { - uint32_t cmd_pool_initial_size; - uint32_t cmd_pool_batch_size; -}; - -class CommandPool final { - public: - explicit CommandPool(VkDevice, const uint32_t, const CommandPoolConfig&); - - CommandPool(const CommandPool&) = delete; - CommandPool& operator=(const CommandPool&) = delete; - - CommandPool(CommandPool&&) = delete; - CommandPool& operator=(CommandPool&&) = delete; - - ~CommandPool(); - - private: - VkDevice device_; - uint32_t queue_family_idx_; - VkCommandPool pool_; - CommandPoolConfig config_; - // New Buffers - std::mutex mutex_; - std::vector buffers_; - size_t in_use_; - - public: - CommandBuffer get_new_cmd(bool reusable = false); - - void flush(); - - private: - void allocate_new_batch(const uint32_t); -}; - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Descriptor.cpp b/backends/vulkan/runtime/vk_api/Descriptor.cpp deleted file mode 100644 index 9e8394ffa9c..00000000000 --- 
a/backends/vulkan/runtime/vk_api/Descriptor.cpp +++ /dev/null @@ -1,353 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include - -namespace vkcompute { -namespace vkapi { - -// -// BufferBindInfo -// - -BufferBindInfo::BufferBindInfo() - : handle(VK_NULL_HANDLE), offset(0u), range(0u) {} - -BufferBindInfo::BufferBindInfo( - const VulkanBuffer& buffer_p, - const uint32_t offset_p) - : handle(buffer_p.handle()), - offset(buffer_p.mem_offset() + offset_p), - range(buffer_p.mem_range() - offset_p) {} - -BufferBindInfo::BufferBindInfo( - const VulkanBuffer& buffer_p, - const size_t offset_p, - const size_t range_p) - : handle(buffer_p.handle()), - offset(buffer_p.mem_offset() + offset_p), - range(range_p) { - VK_CHECK_COND(range_p <= (buffer_p.mem_range() - offset_p)); -} - -// -// ParamsBindList -// - -ParamsBindList::ParamsBindList( - std::initializer_list init_list) { - bind_infos.resize(init_list.size()); - std::copy(init_list.begin(), init_list.end(), bind_infos.begin()); -} - -void ParamsBindList::append(const BufferBindInfo& bind_info) { - bind_infos.emplace_back(bind_info); -} - -void ParamsBindList::append(const ParamsBindList& other) { - bind_infos.insert( - bind_infos.end(), other.bind_infos.begin(), other.bind_infos.end()); -} - -// -// DescriptorSet -// - -DescriptorSet::DescriptorSet( - VkDevice device, - VkDescriptorSet handle, - ShaderLayout::Signature shader_layout_signature) - : device_(device), - handle_(handle), - shader_layout_signature_(std::move(shader_layout_signature)), - bindings_{} {} - -DescriptorSet::DescriptorSet(DescriptorSet&& other) noexcept - : device_(other.device_), - handle_(other.handle_), - shader_layout_signature_(std::move(other.shader_layout_signature_)), - bindings_(std::move(other.bindings_)) { - other.handle_ = VK_NULL_HANDLE; -} - -DescriptorSet& DescriptorSet::operator=(DescriptorSet&& other) noexcept { - device_ = other.device_; - handle_ = other.handle_; - shader_layout_signature_ = std::move(other.shader_layout_signature_); - bindings_ = std::move(other.bindings_); - - other.handle_ = VK_NULL_HANDLE; - - return *this; -} - -DescriptorSet& DescriptorSet::bind( - const uint32_t idx, - const VulkanBuffer& buffer) { - VK_CHECK_COND( - buffer.has_memory(), - "Buffer must be bound to memory for it to be usable"); - - DescriptorSet::ResourceBinding binder{}; - binder.binding_idx = idx; // binding_idx - binder.descriptor_type = shader_layout_signature_[idx]; // descriptor_type - binder.is_image = false; // is_image - binder.resource_info.buffer_info.buffer = buffer.handle(); // buffer - binder.resource_info.buffer_info.offset = buffer.mem_offset(); // offset - binder.resource_info.buffer_info.range = buffer.mem_range(); // range - add_binding(binder); - - return *this; -} - -DescriptorSet& DescriptorSet::bind( - const uint32_t idx, - const BufferBindInfo& bind_info) { - DescriptorSet::ResourceBinding binder{}; - binder.binding_idx = idx; // binding_idx - binder.descriptor_type = shader_layout_signature_[idx]; // descriptor_type - binder.is_image = false; // is_image - binder.resource_info.buffer_info.buffer = bind_info.handle; // buffer - binder.resource_info.buffer_info.offset = bind_info.offset; // offset - binder.resource_info.buffer_info.range = bind_info.range; // range - add_binding(binder); - - return *this; -} - 
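DescriptorSet::bind() records one resource per binding slot and can be chained; the slot index must line up with the shader layout signature so the correct VkDescriptorType is written. A brief sketch using the buffer overload above together with the image overload and get_bind_handle() shown next; the pool, layout, signature, resources, and recording CommandBuffer `cmd` are assumed to already exist.

```
// Illustrative sketch: fill a two-binding descriptor set whose signature is
// {STORAGE_IMAGE, UNIFORM_BUFFER}. `descriptor_pool`, `set_layout`,
// `signature`, `out_image` and `params_ubo` are assumed to be valid and
// already bound to memory.
vkapi::DescriptorSet desc_set =
    descriptor_pool.get_descriptor_set(set_layout, signature);

desc_set.bind(0u, out_image)    // storage image; layout forced to GENERAL
        .bind(1u, params_ubo);  // uniform buffer; uses its full offset/range

// Flushes the recorded bindings with vkUpdateDescriptorSets and returns the
// raw handle expected by CommandBuffer::bind_descriptors().
cmd.bind_descriptors(desc_set.get_bind_handle());
```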
-DescriptorSet& DescriptorSet::bind( - const uint32_t idx, - const VulkanImage& image) { - // If the image does not have an allocator attached, then it is externally - // allocated; assume it is already bound to memory. Otherwise, it must be - // bound to a VmaAllocation to be used. - VK_CHECK_COND( - image.vma_allocator() == VK_NULL_HANDLE || image.has_memory(), - "Image must be bound to memory for it to be usable"); - - VkImageLayout binding_layout = image.layout(); - if (shader_layout_signature_[idx] == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE) { - binding_layout = VK_IMAGE_LAYOUT_GENERAL; - } - - DescriptorSet::ResourceBinding binder{}; - binder.binding_idx = idx; // binding_idx - binder.descriptor_type = shader_layout_signature_[idx]; // descriptor_type - binder.is_image = true; // is_image - binder.resource_info.image_info.sampler = image.sampler(); // buffer - binder.resource_info.image_info.imageView = image.image_view(); // imageView - binder.resource_info.image_info.imageLayout = binding_layout; // imageLayout - add_binding(binder); - - return *this; -} - -VkDescriptorSet DescriptorSet::get_bind_handle() const { - std::vector write_descriptor_sets; - write_descriptor_sets.reserve(bindings_.size()); - - for (const ResourceBinding& binding : bindings_) { - VkWriteDescriptorSet write{ - VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, // sType - nullptr, // pNext - handle_, // dstSet - binding.binding_idx, // dstBinding - 0u, // dstArrayElement - 1u, // descriptorCount - binding.descriptor_type, // descriptorType - nullptr, // pImageInfo - nullptr, // pBufferInfo - nullptr, // pTexelBufferView - }; - - if (binding.is_image) { - write.pImageInfo = &binding.resource_info.image_info; - } else { - write.pBufferInfo = &binding.resource_info.buffer_info; - } - - write_descriptor_sets.emplace_back(write); - } - - vkUpdateDescriptorSets( - device_, - write_descriptor_sets.size(), - write_descriptor_sets.data(), - 0u, - nullptr); - - return handle_; -} - -void DescriptorSet::add_binding(const ResourceBinding& binding) { - const auto bindings_itr = std::find_if( - bindings_.begin(), - bindings_.end(), - [binding_idx = binding.binding_idx](const ResourceBinding& other) { - return other.binding_idx == binding_idx; - }); - - if (bindings_.end() == bindings_itr) { - bindings_.emplace_back(binding); - } else { - *bindings_itr = binding; - } -} - -// -// DescriptorSetPile -// - -DescriptorSetPile::DescriptorSetPile( - const uint32_t pile_size, - VkDescriptorSetLayout descriptor_set_layout, - VkDevice device, - VkDescriptorPool descriptor_pool) - : pile_size_{pile_size}, - set_layout_{descriptor_set_layout}, - device_{device}, - pool_{descriptor_pool}, - descriptors_{}, - in_use_(0u) { - descriptors_.resize(pile_size_); - allocate_new_batch(); -} - -VkDescriptorSet DescriptorSetPile::get_descriptor_set() { - // No-ops if there are descriptor sets available - allocate_new_batch(); - - VkDescriptorSet handle = descriptors_[in_use_]; - descriptors_[in_use_] = VK_NULL_HANDLE; - - in_use_++; - return handle; -} - -void DescriptorSetPile::allocate_new_batch() { - // No-ops if there are still descriptor sets available - if (in_use_ < descriptors_.size() && - descriptors_[in_use_] != VK_NULL_HANDLE) { - return; - } - - std::vector layouts(descriptors_.size()); - fill(layouts.begin(), layouts.end(), set_layout_); - - const VkDescriptorSetAllocateInfo allocate_info{ - VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, // sType - nullptr, // pNext - pool_, // descriptorPool - utils::safe_downcast(layouts.size()), // 
descriptorSetCount - layouts.data(), // pSetLayouts - }; - - VK_CHECK( - vkAllocateDescriptorSets(device_, &allocate_info, descriptors_.data())); - - in_use_ = 0u; -} - -// -// DescriptorPool -// - -DescriptorPool::DescriptorPool( - VkDevice device, - const DescriptorPoolConfig& config) - : device_(device), - pool_(VK_NULL_HANDLE), - config_(config), - mutex_{}, - piles_{} { - if (config.descriptor_pool_max_sets > 0) { - init(config); - } -} - -DescriptorPool::~DescriptorPool() { - if (pool_ == VK_NULL_HANDLE) { - return; - } - vkDestroyDescriptorPool(device_, pool_, nullptr); -} - -void DescriptorPool::init(const DescriptorPoolConfig& config) { - VK_CHECK_COND( - pool_ == VK_NULL_HANDLE, - "Trying to init a DescriptorPool that has already been created!"); - - config_ = config; - - std::vector type_sizes{ - { - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, - config_.descriptor_uniform_buffer_count, - }, - { - VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - config_.descriptor_storage_buffer_count, - }, - { - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - config_.descriptor_combined_sampler_count, - }, - { - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, - config_.descriptor_storage_buffer_count, - }, - }; - - const VkDescriptorPoolCreateInfo create_info{ - VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - config_.descriptor_pool_max_sets, // maxSets - static_cast(type_sizes.size()), // poolSizeCounts - type_sizes.data(), // pPoolSizes - }; - - VK_CHECK(vkCreateDescriptorPool(device_, &create_info, nullptr, &pool_)); -} - -DescriptorSet DescriptorPool::get_descriptor_set( - VkDescriptorSetLayout set_layout, - const ShaderLayout::Signature& signature) { - VK_CHECK_COND( - pool_ != VK_NULL_HANDLE, "DescriptorPool has not yet been initialized!"); - - auto it = piles_.find(set_layout); - if (piles_.cend() == it) { - it = piles_ - .insert({ - set_layout, - DescriptorSetPile( - config_.descriptor_pile_sizes, set_layout, device_, pool_), - }) - .first; - } - - VkDescriptorSet handle = it->second.get_descriptor_set(); - - return DescriptorSet(device_, handle, signature); -} - -void DescriptorPool::flush() { - if (pool_ != VK_NULL_HANDLE) { - VK_CHECK(vkResetDescriptorPool(device_, pool_, 0u)); - piles_.clear(); - } -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Descriptor.h b/backends/vulkan/runtime/vk_api/Descriptor.h deleted file mode 100644 index 15ea5e23e33..00000000000 --- a/backends/vulkan/runtime/vk_api/Descriptor.h +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include -#include - -#include - -namespace vkcompute { -namespace vkapi { - -/* - * Stores the binding information of a Vulkan Buffer so that the buffer can be - * bound at a later time. This struct should only be used if the buffer to be - * bound is guaranteed to be active at the time of binding. 
- */ -struct BufferBindInfo final { - VkBuffer handle; - VkDeviceSize offset; - VkDeviceSize range; - - BufferBindInfo(); - BufferBindInfo(const VulkanBuffer& buffer_p, const uint32_t offset_p = 0u); - BufferBindInfo( - const VulkanBuffer& buffer_p, - const size_t offset_p, - const size_t range_p); -}; - -struct ParamsBindList final { - std::vector bind_infos; - - ParamsBindList() = default; - ParamsBindList(std::initializer_list init_list); - - void append(const BufferBindInfo& bind_info); - void append(const ParamsBindList& other); -}; - -class DescriptorSet final { - public: - explicit DescriptorSet(VkDevice, VkDescriptorSet, ShaderLayout::Signature); - - DescriptorSet(const DescriptorSet&) = delete; - DescriptorSet& operator=(const DescriptorSet&) = delete; - - DescriptorSet(DescriptorSet&&) noexcept; - DescriptorSet& operator=(DescriptorSet&&) noexcept; - - ~DescriptorSet() = default; - - struct ResourceBinding final { - uint32_t binding_idx; - VkDescriptorType descriptor_type; - bool is_image; - - union { - VkDescriptorBufferInfo buffer_info; - VkDescriptorImageInfo image_info; - } resource_info; - }; - - private: - VkDevice device_; - VkDescriptorSet handle_; - ShaderLayout::Signature shader_layout_signature_; - std::vector bindings_; - - public: - DescriptorSet& bind(const uint32_t, const BufferBindInfo&); - DescriptorSet& bind(const uint32_t, const VulkanBuffer&); - DescriptorSet& bind(const uint32_t, const VulkanImage&); - - VkDescriptorSet get_bind_handle() const; - - private: - void add_binding(const ResourceBinding& resource); -}; - -class DescriptorSetPile final { - public: - DescriptorSetPile( - const uint32_t, - VkDescriptorSetLayout, - VkDevice, - VkDescriptorPool); - - DescriptorSetPile(const DescriptorSetPile&) = delete; - DescriptorSetPile& operator=(const DescriptorSetPile&) = delete; - - DescriptorSetPile(DescriptorSetPile&&) = default; - DescriptorSetPile& operator=(DescriptorSetPile&&) = default; - - ~DescriptorSetPile() = default; - - private: - uint32_t pile_size_; - VkDescriptorSetLayout set_layout_; - VkDevice device_; - VkDescriptorPool pool_; - std::vector descriptors_; - size_t in_use_; - - public: - VkDescriptorSet get_descriptor_set(); - - private: - void allocate_new_batch(); -}; - -struct DescriptorPoolConfig final { - // Overall Pool capacity - uint32_t descriptor_pool_max_sets; - // DescriptorCounts by type - uint32_t descriptor_uniform_buffer_count; - uint32_t descriptor_storage_buffer_count; - uint32_t descriptor_combined_sampler_count; - uint32_t descriptor_storage_image_count; - // Pile size for pre-allocating descriptor sets - uint32_t descriptor_pile_sizes; -}; - -class DescriptorPool final { - public: - explicit DescriptorPool(VkDevice, const DescriptorPoolConfig&); - - DescriptorPool(const DescriptorPool&) = delete; - DescriptorPool& operator=(const DescriptorPool&) = delete; - - DescriptorPool(DescriptorPool&&) = delete; - DescriptorPool& operator=(DescriptorPool&&) = delete; - - ~DescriptorPool(); - - private: - VkDevice device_; - VkDescriptorPool pool_; - DescriptorPoolConfig config_; - // New Descriptors - std::mutex mutex_; - std::unordered_map piles_; - - public: - operator bool() const { - return (pool_ != VK_NULL_HANDLE); - } - - void init(const DescriptorPoolConfig& config); - - DescriptorSet get_descriptor_set( - VkDescriptorSetLayout handle, - const ShaderLayout::Signature& signature); - - void flush(); -}; - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Device.cpp 
b/backends/vulkan/runtime/vk_api/Device.cpp deleted file mode 100644 index a21130f1231..00000000000 --- a/backends/vulkan/runtime/vk_api/Device.cpp +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -// @lint-ignore-every CLANGTIDY clang-diagnostic-missing-field-initializers - -#include - -#include - -#include -#include -#include -#include - -namespace vkcompute { -namespace vkapi { - -PhysicalDevice::PhysicalDevice(VkPhysicalDevice physical_device_handle) - : handle(physical_device_handle), - properties{}, - memory_properties{}, -#ifdef VK_KHR_16bit_storage - shader_16bit_storage{ - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES}, -#endif /* VK_KHR_16bit_storage */ -#ifdef VK_KHR_8bit_storage - shader_8bit_storage{ - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES}, -#endif /* VK_KHR_8bit_storage */ -#ifdef VK_KHR_shader_float16_int8 - shader_float16_int8_types{ - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT16_INT8_FEATURES_KHR}, -#endif /* VK_KHR_shader_float16_int8 */ -#ifdef VK_KHR_shader_integer_dot_product - shader_int_dot_product_features{ - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR}, - shader_int_dot_product_properties{ - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_PROPERTIES_KHR}, -#endif - queue_families{}, - num_compute_queues(0), - supports_int16_shader_types(false), - has_unified_memory(false), - has_timestamps(false), - timestamp_period(0), - min_ubo_alignment(0), - device_name{}, - device_type{DeviceType::UNKNOWN} { - // Extract physical device properties - vkGetPhysicalDeviceProperties(handle, &properties); - - // Extract fields of interest - has_timestamps = properties.limits.timestampComputeAndGraphics; - timestamp_period = properties.limits.timestampPeriod; - min_ubo_alignment = properties.limits.minUniformBufferOffsetAlignment; - - vkGetPhysicalDeviceMemoryProperties(handle, &memory_properties); - - VkPhysicalDeviceFeatures2 features2{ - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2}; - - // Create linked list to query availability of extensions - - void* extension_list_top = nullptr; - -#ifdef VK_KHR_16bit_storage - shader_16bit_storage.pNext = extension_list_top; - extension_list_top = &shader_16bit_storage; -#endif /* VK_KHR_16bit_storage */ - -#ifdef VK_KHR_8bit_storage - shader_8bit_storage.pNext = extension_list_top; - extension_list_top = &shader_8bit_storage; -#endif /* VK_KHR_8bit_storage */ - -#ifdef VK_KHR_shader_float16_int8 - shader_float16_int8_types.pNext = extension_list_top; - extension_list_top = &shader_float16_int8_types; -#endif /* VK_KHR_shader_float16_int8 */ - -#ifdef VK_KHR_shader_integer_dot_product - shader_int_dot_product_features.pNext = extension_list_top; - extension_list_top = &shader_int_dot_product_features; - shader_int_dot_product_properties.pNext = extension_list_top; - extension_list_top = &shader_int_dot_product_properties; -#endif /* VK_KHR_shader_integer_dot_product */ - - features2.pNext = extension_list_top; - - vkGetPhysicalDeviceFeatures2(handle, &features2); - - if (features2.features.shaderInt16 == VK_TRUE) { - supports_int16_shader_types = true; - } - - // Check if there are any memory types have both the HOST_VISIBLE and the - // DEVICE_LOCAL property flags - const VkMemoryPropertyFlags unified_memory_flags = - VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT & 
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; - for (size_t i = 0; i < memory_properties.memoryTypeCount; ++i) { - if (memory_properties.memoryTypes[i].propertyFlags | unified_memory_flags) { - has_unified_memory = true; - break; - } - } - - uint32_t queue_family_count = 0; - vkGetPhysicalDeviceQueueFamilyProperties( - handle, &queue_family_count, nullptr); - - queue_families.resize(queue_family_count); - vkGetPhysicalDeviceQueueFamilyProperties( - handle, &queue_family_count, queue_families.data()); - - // Find the total number of compute queues - for (const VkQueueFamilyProperties& p : queue_families) { - // Check if this family has compute capability - if (p.queueFlags & VK_QUEUE_COMPUTE_BIT) { - num_compute_queues += p.queueCount; - } - } - - // Obtain device identity metadata - device_name = std::string(properties.deviceName); - std::transform( - device_name.begin(), - device_name.end(), - device_name.begin(), - [](unsigned char c) { return std::tolower(c); }); - - if (device_name.find("adreno") != std::string::npos) { - device_type = DeviceType::ADRENO; - } else if (device_name.find("swiftshader") != std::string::npos) { - device_type = DeviceType::SWIFTSHADER; - } else if (device_name.find("nvidia") != std::string::npos) { - device_type = DeviceType::NVIDIA; - } else if (device_name.find("mali") != std::string::npos) { - device_type = DeviceType::MALI; - } -} - -// -// DeviceHandle -// - -DeviceHandle::DeviceHandle(VkDevice device) : handle(device) {} - -DeviceHandle::~DeviceHandle() { - if (handle == VK_NULL_HANDLE) { - return; - } - vkDestroyDevice(handle, nullptr); -} - -// -// Utils -// - -void find_requested_device_extensions( - VkPhysicalDevice physical_device, - std::vector& enabled_extensions, - const std::vector& requested_extensions) { - uint32_t device_extension_properties_count = 0; - VK_CHECK(vkEnumerateDeviceExtensionProperties( - physical_device, nullptr, &device_extension_properties_count, nullptr)); - std::vector device_extension_properties( - device_extension_properties_count); - VK_CHECK(vkEnumerateDeviceExtensionProperties( - physical_device, - nullptr, - &device_extension_properties_count, - device_extension_properties.data())); - - std::vector enabled_device_extensions; - - for (const auto& requested_extension : requested_extensions) { - for (const auto& extension : device_extension_properties) { - if (strcmp(requested_extension, extension.extensionName) == 0) { - enabled_extensions.push_back(requested_extension); - break; - } - } - } -} - -std::string get_device_type_str(const VkPhysicalDeviceType type) { - switch (type) { - case VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU: - return "INTEGRATED_GPU"; - case VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU: - return "DISCRETE_GPU"; - case VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU: - return "VIRTUAL_GPU"; - case VK_PHYSICAL_DEVICE_TYPE_CPU: - return "CPU"; - default: - return "UNKNOWN"; - } -} - -std::string get_memory_properties_str(const VkMemoryPropertyFlags flags) { - std::bitset<10> values(flags); - std::stringstream ss("|"); - if (values[0]) { - ss << " DEVICE_LOCAL |"; - } - if (values[1]) { - ss << " HOST_VISIBLE |"; - } - if (values[2]) { - ss << " HOST_COHERENT |"; - } - if (values[3]) { - ss << " HOST_CACHED |"; - } - if (values[4]) { - ss << " LAZILY_ALLOCATED |"; - } - - return ss.str(); -} - -std::string get_queue_family_properties_str(const VkQueueFlags flags) { - std::bitset<10> values(flags); - std::stringstream ss("|"); - if (values[0]) { - ss << " GRAPHICS |"; - } - if (values[1]) { - ss << " COMPUTE |"; - } - if (values[2]) 
{ - ss << " TRANSFER |"; - } - - return ss.str(); -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Device.h b/backends/vulkan/runtime/vk_api/Device.h deleted file mode 100644 index f5b7154d260..00000000000 --- a/backends/vulkan/runtime/vk_api/Device.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include -#include - -namespace vkcompute { -namespace vkapi { - -enum class DeviceType : uint32_t { - UNKNOWN, - NVIDIA, - MALI, - ADRENO, - SWIFTSHADER, -}; - -struct PhysicalDevice final { - // Handle - VkPhysicalDevice handle; - - // Properties obtained from Vulkan - VkPhysicalDeviceProperties properties; - VkPhysicalDeviceMemoryProperties memory_properties; - - // Additional features available from extensions -#ifdef VK_KHR_16bit_storage - VkPhysicalDevice16BitStorageFeatures shader_16bit_storage; -#endif /* VK_KHR_16bit_storage */ -#ifdef VK_KHR_8bit_storage - VkPhysicalDevice8BitStorageFeatures shader_8bit_storage; -#endif /* VK_KHR_8bit_storage */ -#ifdef VK_KHR_shader_float16_int8 - VkPhysicalDeviceShaderFloat16Int8Features shader_float16_int8_types; -#endif /* VK_KHR_shader_float16_int8 */ -#ifdef VK_KHR_shader_integer_dot_product - VkPhysicalDeviceShaderIntegerDotProductFeatures - shader_int_dot_product_features; - VkPhysicalDeviceShaderIntegerDotProductProperties - shader_int_dot_product_properties; -#endif /* VK_KHR_shader_integer_dot_product */ - - // Available GPU queues - std::vector queue_families; - - // Metadata - uint32_t num_compute_queues; - bool supports_int16_shader_types; - bool has_unified_memory; - bool has_timestamps; - float timestamp_period; - size_t min_ubo_alignment; - - // Device identity - std::string device_name; - DeviceType device_type; - - explicit PhysicalDevice(VkPhysicalDevice); -}; - -struct DeviceHandle final { - VkDevice handle; - - explicit DeviceHandle(VkDevice); - ~DeviceHandle(); -}; - -void find_requested_device_extensions( - VkPhysicalDevice physical_device, - std::vector& enabled_extensions, - const std::vector& requested_extensions); - -std::string get_device_type_str(const VkPhysicalDeviceType type); - -std::string get_memory_properties_str(const VkMemoryPropertyFlags flags); - -std::string get_queue_family_properties_str(const VkQueueFlags flags); - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Exception.cpp b/backends/vulkan/runtime/vk_api/Exception.cpp deleted file mode 100644 index c07349fa7ca..00000000000 --- a/backends/vulkan/runtime/vk_api/Exception.cpp +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
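PhysicalDevice gathers the properties, queue families, and extension-dependent features needed to pick and configure a device. A rough sketch of how it might be used during device selection follows; the VkInstance, the reconstructed parameter types of find_requested_device_extensions, and the specific extension queried are assumptions.

```
// Illustrative sketch: wrap the first enumerated GPU and probe it. `instance`
// is assumed to be a valid VkInstance with the loader already initialized.
uint32_t device_count = 0u;
VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, nullptr));
VK_CHECK_COND(device_count > 0u, "No Vulkan devices found");

std::vector<VkPhysicalDevice> device_handles(device_count);
VK_CHECK(vkEnumeratePhysicalDevices(
    instance, &device_count, device_handles.data()));

vkapi::PhysicalDevice gpu(device_handles.at(0));
if (gpu.num_compute_queues == 0u) {
  VK_THROW("Selected device has no compute-capable queue families");
}

// Request 16-bit storage only if the driver actually advertises it.
std::vector<const char*> enabled_extensions;
const std::vector<const char*> requested_extensions{
    VK_KHR_16BIT_STORAGE_EXTENSION_NAME};
vkapi::find_requested_device_extensions(
    gpu.handle, enabled_extensions, requested_extensions);
```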
- */ - -#include - -#include - -namespace vkcompute { -namespace vkapi { - -#define VK_RESULT_CASE(code) \ - case code: \ - out << #code; \ - break; - -std::ostream& operator<<(std::ostream& out, const VkResult result) { - switch (result) { - VK_RESULT_CASE(VK_SUCCESS) - VK_RESULT_CASE(VK_NOT_READY) - VK_RESULT_CASE(VK_TIMEOUT) - VK_RESULT_CASE(VK_EVENT_SET) - VK_RESULT_CASE(VK_EVENT_RESET) - VK_RESULT_CASE(VK_INCOMPLETE) - VK_RESULT_CASE(VK_ERROR_OUT_OF_HOST_MEMORY) - VK_RESULT_CASE(VK_ERROR_OUT_OF_DEVICE_MEMORY) - VK_RESULT_CASE(VK_ERROR_INITIALIZATION_FAILED) - VK_RESULT_CASE(VK_ERROR_DEVICE_LOST) - VK_RESULT_CASE(VK_ERROR_MEMORY_MAP_FAILED) - VK_RESULT_CASE(VK_ERROR_LAYER_NOT_PRESENT) - VK_RESULT_CASE(VK_ERROR_EXTENSION_NOT_PRESENT) - VK_RESULT_CASE(VK_ERROR_FEATURE_NOT_PRESENT) - VK_RESULT_CASE(VK_ERROR_INCOMPATIBLE_DRIVER) - VK_RESULT_CASE(VK_ERROR_TOO_MANY_OBJECTS) - VK_RESULT_CASE(VK_ERROR_FORMAT_NOT_SUPPORTED) - VK_RESULT_CASE(VK_ERROR_FRAGMENTED_POOL) - default: - out << "VK_ERROR_UNKNOWN (VkResult " << result << ")"; - break; - } - return out; -} - -#undef VK_RESULT_CASE - -// -// SourceLocation -// - -std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) { - out << loc.function << " at " << loc.file << ":" << loc.line; - return out; -} - -// -// Exception -// - -Error::Error(SourceLocation source_location, std::string msg) - : msg_(std::move(msg)), source_location_{source_location} { - std::ostringstream oss; - oss << "Exception raised from " << source_location_ << ": "; - oss << msg_; - what_ = oss.str(); -} - -Error::Error(SourceLocation source_location, const char* cond, std::string msg) - : msg_(std::move(msg)), source_location_{source_location} { - std::ostringstream oss; - oss << "Exception raised from " << source_location_ << ": "; - oss << "(" << cond << ") is false! "; - oss << msg_; - what_ = oss.str(); -} - -// -// ShaderNotSupportedError -// - -std::ostream& operator<<(std::ostream& out, const VulkanExtension result) { - switch (result) { - case VulkanExtension::SHADER_INT16: - out << "shaderInt16"; - break; - case VulkanExtension::INT16_STORAGE: - out << "VK_KHR_16bit_storage"; - break; - case VulkanExtension::INT8_STORAGE: - out << "VK_KHR_8bit_storage"; - break; - case VulkanExtension::INTEGER_DOT_PRODUCT: - out << "VK_KHR_shader_integer_dot_product"; - break; - } - return out; -} - -ShaderNotSupportedError::ShaderNotSupportedError( - std::string shader_name, - VulkanExtension extension) - : shader_name_(std::move(shader_name)), extension_{extension} { - std::ostringstream oss; - oss << "Shader " << shader_name_ << " "; - oss << "not compatible with device. "; - oss << "Missing support for extension or physical device feature: "; - oss << extension_; - what_ = oss.str(); -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Exception.h b/backends/vulkan/runtime/vk_api/Exception.h deleted file mode 100644 index a883a68fefc..00000000000 --- a/backends/vulkan/runtime/vk_api/Exception.h +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include -#include -#include -#include - -#define VK_CHECK(function) \ - do { \ - const VkResult result = (function); \ - if (VK_SUCCESS != result) { \ - throw ::vkcompute::vkapi::Error( \ - {__func__, __FILE__, static_cast(__LINE__)}, \ - ::vkcompute::utils::concat_str(#function, " returned ", result)); \ - } \ - } while (false) - -#define VK_CHECK_COND(cond, ...) \ - do { \ - if (!(cond)) { \ - throw ::vkcompute::vkapi::Error( \ - {__func__, __FILE__, static_cast(__LINE__)}, \ - #cond, \ - ::vkcompute::utils::concat_str(__VA_ARGS__)); \ - } \ - } while (false) - -#define VK_THROW(...) \ - do { \ - throw ::vkcompute::vkapi::Error( \ - {__func__, __FILE__, static_cast(__LINE__)}, \ - ::vkcompute::utils::concat_str(__VA_ARGS__)); \ - } while (false) - -namespace vkcompute { -namespace vkapi { - -std::ostream& operator<<(std::ostream& out, const VkResult loc); - -struct SourceLocation { - const char* function; - const char* file; - uint32_t line; -}; - -std::ostream& operator<<(std::ostream& out, const SourceLocation& loc); - -class Error : public std::exception { - public: - Error(SourceLocation source_location, std::string msg); - Error(SourceLocation source_location, const char* cond, std::string msg); - - private: - std::string msg_; - SourceLocation source_location_; - std::string what_; - - public: - const std::string& msg() const { - return msg_; - } - - const char* what() const noexcept override { - return what_.c_str(); - } -}; - -enum class VulkanExtension : uint8_t { - SHADER_INT16, - INT16_STORAGE, - INT8_STORAGE, - INTEGER_DOT_PRODUCT, -}; - -class ShaderNotSupportedError : public std::exception { - public: - ShaderNotSupportedError(std::string shader_name, VulkanExtension extension); - - private: - std::string shader_name_; - VulkanExtension extension_; - std::string what_; - - public: - const char* what() const noexcept override { - return what_.c_str(); - } -}; - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Fence.cpp b/backends/vulkan/runtime/vk_api/Fence.cpp deleted file mode 100644 index d359990e634..00000000000 --- a/backends/vulkan/runtime/vk_api/Fence.cpp +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
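The VK_CHECK, VK_CHECK_COND and VK_THROW macros above are the error-handling backbone of the runtime: every raw Vulkan call that returns a VkResult goes through VK_CHECK, and failed checks surface as vkapi::Error carrying the throwing source location. A small sketch of the pattern, assuming a valid VkDevice `device`:

```
// Illustrative sketch of the checking pattern; `device` is assumed valid.
#include <iostream>

VkFence fence = VK_NULL_HANDLE;
const VkFenceCreateInfo fence_create_info{
    VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, nullptr, 0u};

try {
  // Throws vkapi::Error if the call returns anything other than VK_SUCCESS.
  VK_CHECK(vkCreateFence(device, &fence_create_info, nullptr, &fence));
  // Throws with the stringified condition plus the custom message.
  VK_CHECK_COND(fence != VK_NULL_HANDLE, "fence creation produced a null handle");
} catch (const vkcompute::vkapi::Error& e) {
  // what() reads: "Exception raised from <func> at <file>:<line>: <message>"
  std::cerr << e.what() << std::endl;
}
```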
- */ - -#include - -namespace vkcompute { -namespace vkapi { - -VulkanFence::VulkanFence() - : device_(VK_NULL_HANDLE), handle_(VK_NULL_HANDLE), waiting_(false) {} - -VulkanFence::VulkanFence(VkDevice device) - : device_(device), handle_(VK_NULL_HANDLE), waiting_(VK_NULL_HANDLE) { - const VkFenceCreateInfo fence_create_info{ - VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - }; - - VK_CHECK(vkCreateFence(device_, &fence_create_info, nullptr, &handle_)); -} - -VulkanFence::VulkanFence(VulkanFence&& other) noexcept - : device_(other.device_), handle_(other.handle_), waiting_(other.waiting_) { - other.handle_ = VK_NULL_HANDLE; - other.waiting_ = false; -} - -VulkanFence& VulkanFence::operator=(VulkanFence&& other) noexcept { - device_ = other.device_; - handle_ = other.handle_; - waiting_ = other.waiting_; - - other.device_ = VK_NULL_HANDLE; - other.handle_ = VK_NULL_HANDLE; - other.waiting_ = false; - - return *this; -} - -VulkanFence::~VulkanFence() { - if (handle_ == VK_NULL_HANDLE) { - return; - } - vkDestroyFence(device_, handle_, nullptr); -} - -void VulkanFence::wait() { - // if get_submit_handle() has not been called, then this will no-op - if (waiting_) { - VkResult fence_status = VK_NOT_READY; - // Run the wait in a loop to keep the CPU hot. A single call to - // vkWaitForFences with no timeout may cause the calling thread to be - // scheduled out. - do { - // The timeout (last) arg is in units of ns - fence_status = vkWaitForFences(device_, 1u, &handle_, VK_TRUE, 100000); - - VK_CHECK_COND( - fence_status != VK_ERROR_DEVICE_LOST, - "Vulkan Fence: Device lost while waiting for fence!"); - } while (fence_status != VK_SUCCESS); - - VK_CHECK(vkResetFences(device_, 1u, &handle_)); - - waiting_ = false; - } -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Fence.h b/backends/vulkan/runtime/vk_api/Fence.h deleted file mode 100644 index 52fa24de55b..00000000000 --- a/backends/vulkan/runtime/vk_api/Fence.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include - -namespace vkcompute { -namespace vkapi { - -class VulkanFence final { - public: - // TODO: This is required for the lazy allocation pattern in api::vTensor. - // It will be disabled pending future refactors. - explicit VulkanFence(); - - explicit VulkanFence(VkDevice); - - VulkanFence(const VulkanFence&) = delete; - VulkanFence& operator=(const VulkanFence&) = delete; - - VulkanFence(VulkanFence&&) noexcept; - VulkanFence& operator=(VulkanFence&&) noexcept; - - ~VulkanFence(); - - private: - VkDevice device_; - VkFence handle_; - bool waiting_; - - public: - // Used to get the handle for a queue submission. - VkFence get_submit_handle() { - if (handle_ != VK_NULL_HANDLE) { - // Indicate we are now waiting for this fence to be signaled - waiting_ = true; - } - return handle_; - } - - VkFence handle() { - return handle_; - } - - // Trigger a synchronous wait for the fence to be signaled - void wait(); - - bool waiting() const { - return waiting_; - } - - operator bool() const { - return (handle_ != VK_NULL_HANDLE); - } -}; - -// A pool to track created Fences and reuse ones that are available. 
-// Only intended to be modified by one thread at a time. -struct FencePool final { - VkDevice device_; - - std::stack pool_; - - explicit FencePool(VkDevice device) : device_(device), pool_{} {} - - // Returns an rvalue reference to a fence, so that it can be moved - inline VulkanFence get_fence() { - if (pool_.empty()) { - VulkanFence new_fence = VulkanFence(device_); - return new_fence; - } - - VulkanFence top_fence = std::move(pool_.top()); - pool_.pop(); - - return top_fence; - } - - // Marks the fence as available - inline void return_fence(VulkanFence& fence) { - pool_.push(std::move(fence)); - } -}; - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Pipeline.cpp b/backends/vulkan/runtime/vk_api/Pipeline.cpp deleted file mode 100644 index 994b46b8c76..00000000000 --- a/backends/vulkan/runtime/vk_api/Pipeline.cpp +++ /dev/null @@ -1,589 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -namespace vkcompute { -namespace vkapi { - -// -// Utility Functions -// - -VkAccessFlags vk_access( - const PipelineStageFlags stage, - const MemoryAccessFlags access) { - VkAccessFlags vk_access = 0u; - - if (access & MemoryAccessType::READ) { - if (stage & PipelineStage::COMPUTE) { - vk_access |= VK_ACCESS_SHADER_READ_BIT; - } - - if (stage & PipelineStage::HOST) { - vk_access |= VK_ACCESS_HOST_READ_BIT; - } - - if (stage & PipelineStage::TRANSFER) { - vk_access |= VK_ACCESS_TRANSFER_READ_BIT; - } - } - - if (access & MemoryAccessType::WRITE) { - if (stage & PipelineStage::COMPUTE) { - vk_access |= VK_ACCESS_SHADER_WRITE_BIT; - } - - if (stage & PipelineStage::HOST) { - vk_access |= VK_ACCESS_HOST_WRITE_BIT; - } - - if (stage & PipelineStage::TRANSFER) { - vk_access |= VK_ACCESS_TRANSFER_WRITE_BIT; - } - } - - return vk_access; -} - -VkPipelineStageFlags vk_stage(const PipelineStageFlags stage) { - VkPipelineStageFlags vk_stage = 0u; - - if (stage & PipelineStage::COMPUTE) { - vk_stage |= VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - } - - if (stage & PipelineStage::HOST) { - vk_stage |= VK_PIPELINE_STAGE_HOST_BIT; - } - - if (stage & PipelineStage::TRANSFER) { - vk_stage |= VK_PIPELINE_STAGE_TRANSFER_BIT; - } - - return vk_stage; -} - -VkImageLayout vk_layout( - const PipelineStageFlags stage, - const MemoryAccessFlags access) { - switch (stage) { - case PipelineStage::COMPUTE: - switch (access) { - case MemoryAccessType::READ: - return VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - default: - return VK_IMAGE_LAYOUT_GENERAL; - } - break; - case PipelineStage::TRANSFER: - switch (access) { - case MemoryAccessType::READ: - return VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; - case MemoryAccessType::WRITE: - return VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; - default: - VK_THROW("Invalid memory access type for transfer stage!"); - } - break; - default: - VK_THROW("Cannot determine appropriate image layout"); - } - - return VK_IMAGE_LAYOUT_UNDEFINED; -} - -// -// SpecVar -// - -SpecVar::SpecVar() : type(SpecVar::Type::INT) { - value.as_int32 = 0; -} - -SpecVar::SpecVar(const float val) : type(SpecVar::Type::FLOAT) { - value.as_float = val; -} - -SpecVar::SpecVar(const int32_t val) : type(SpecVar::Type::INT) { - value.as_int32 = val; -} - -SpecVar::SpecVar(const uint32_t val) : type(SpecVar::Type::UINT) { - value.as_uint32 = val; -} - -SpecVar::SpecVar(const bool val) : 
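VulkanFence and FencePool together cover CPU/GPU synchronization for submissions: get_submit_handle() marks the fence as pending, wait() spins in short vkWaitForFences slices until it signals and then resets it, and the pool recycles fences across submissions. A brief sketch, assuming the device, queue and VkSubmitInfo are prepared elsewhere:

```
// Illustrative sketch: guard one queue submission with a pooled fence.
// `device`, `queue` and `submit_info` are assumed to be set up elsewhere.
vkapi::FencePool fence_pool(device);

vkapi::VulkanFence fence = fence_pool.get_fence();
VK_CHECK(vkQueueSubmit(queue, 1u, &submit_info, fence.get_submit_handle()));

fence.wait();                   // blocks (in 100 us slices) until signaled, then resets
fence_pool.return_fence(fence); // hand the fence back for reuse
```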
type(SpecVar::Type::BOOL) { - value.as_bool = val; -} - -uint32_t SpecVar::val_size() const { - switch (type) { - case SpecVar::Type::FLOAT: - return sizeof(float); - case SpecVar::Type::INT: - return sizeof(int32_t); - case SpecVar::Type::UINT: - return sizeof(uint32_t); - case SpecVar::Type::BOOL: - return sizeof(bool); - } - return 4; -} - -uint32_t SpecVar::val_offset() const { - return utils::safe_downcast(offsetof(SpecVar, value)); -} - -bool operator==(const SpecVar& lhs, const SpecVar& rhs) { - if (lhs.type != rhs.type) { - return false; - } - switch (lhs.type) { - case SpecVar::Type::FLOAT: - return lhs.value.as_float == rhs.value.as_float; - case SpecVar::Type::INT: - return lhs.value.as_int32 == rhs.value.as_int32; - case SpecVar::Type::UINT: - return lhs.value.as_uint32 == rhs.value.as_uint32; - case SpecVar::Type::BOOL: - return lhs.value.as_bool == rhs.value.as_bool; - } - return false; -} - -bool operator!=(const SpecVar& lhs, const SpecVar& rhs) { - return !(lhs == rhs); -} - -SpecVarList::SpecVarList() {} - -SpecVarList::SpecVarList(std::initializer_list init_list) { - vars.resize(init_list.size()); - std::copy(init_list.begin(), init_list.end(), vars.begin()); -} - -void SpecVarList::append(const SpecVarList& other) { - vars.insert(vars.end(), other.vars.begin(), other.vars.end()); -} - -void SpecVarList::reserve(const size_t size) { - vars.reserve(size); -} - -void SpecVarList::append(const SpecVar& other) { - vars.push_back(other); -} - -std::vector SpecVarList::generate_map_entries() - const { - std::vector map_entries; - map_entries.resize(vars.size()); - uint32_t cur_offset = 0u; - for (uint32_t i = 0; i < vars.size(); ++i) { - map_entries.at(i) = { - i, cur_offset + vars.at(i).val_offset(), vars.at(i).val_size()}; - cur_offset += sizeof(SpecVar); - } - return map_entries; -} - -bool operator==(const SpecVarList& lhs, const SpecVarList& rhs) { - if (lhs.size() != rhs.size()) { - return false; - } - for (uint32_t i = 0; i < lhs.size(); ++i) { - if (lhs.vars.at(i) != rhs.vars.at(i)) { - return false; - } - } - return true; -} - -// -// PipelineLayout -// - -PipelineLayout::PipelineLayout( - VkDevice device, - VkDescriptorSetLayout descriptor_layout, - const uint32_t push_constants_size) - : device_(device), handle_{VK_NULL_HANDLE} { - VkPushConstantRange pc_range{ - VK_SHADER_STAGE_COMPUTE_BIT, // stageFlags - 0u, // offset - push_constants_size, // size - }; - uint32_t num_push_constants = 0u; - VkPushConstantRange* pc_ranges_ptr = nullptr; - if (push_constants_size > 0u) { - num_push_constants = 1u; - pc_ranges_ptr = &pc_range; - } - - const VkPipelineLayoutCreateInfo pipeline_layout_create_info{ - VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - 1u, // setLayoutCount - &descriptor_layout, // pSetLayouts - num_push_constants, // pushConstantRangeCount - pc_ranges_ptr, // pPushConstantRanges - }; - - VK_CHECK(vkCreatePipelineLayout( - device_, &pipeline_layout_create_info, nullptr, &handle_)); -} - -PipelineLayout::PipelineLayout(PipelineLayout&& other) noexcept - : device_(other.device_), handle_(other.handle_) { - other.handle_ = VK_NULL_HANDLE; -} - -PipelineLayout::~PipelineLayout() { - if (handle_ == VK_NULL_HANDLE) { - return; - } - vkDestroyPipelineLayout(device_, handle_, nullptr); - handle_ = VK_NULL_HANDLE; -} - -void swap(PipelineLayout& lhs, PipelineLayout& rhs) noexcept { - VkDevice tmp_device = lhs.device_; - VkPipelineLayout tmp_handle = lhs.handle_; - - lhs.device_ = rhs.device_; - lhs.handle_ = rhs.handle_; 
- - rhs.device_ = tmp_device; - rhs.handle_ = tmp_handle; -} - -// -// ComputePipeline -// - -ComputePipeline::ComputePipeline(VkDevice device, VkPipeline handle) - : device_{device}, handle_{handle} {} - -ComputePipeline::ComputePipeline( - VkDevice device, - const ComputePipeline::Descriptor& descriptor, - VkPipelineCache pipeline_cache) - : device_(device), handle_{VK_NULL_HANDLE} { - map_entries_ = descriptor.specialization_constants.generate_map_entries(); - - const VkSpecializationInfo specialization_info{ - descriptor.specialization_constants.size(), // mapEntryCount - map_entries_.data(), // pMapEntries - descriptor.specialization_constants.data_nbytes(), // dataSize - descriptor.specialization_constants.data(), // pData - }; - - const VkPipelineShaderStageCreateInfo shader_stage_create_info{ - VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - VK_SHADER_STAGE_COMPUTE_BIT, // stage - descriptor.shader_module, // module - "main", // pName - &specialization_info, // pSpecializationInfo - }; - - VkPipelineCreateFlags flags = 0u; -#if defined(VULKAN_DEBUG) && defined(VK_KHR_pipeline_executable_properties) - flags = VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR | - VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR | flags; -#endif /* VULKAN_DEBUG && VK_KHR_pipeline_executable_properties */ - - const VkComputePipelineCreateInfo compute_pipeline_create_info{ - VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, // sType - nullptr, // pNext - flags, // flags - shader_stage_create_info, // stage - descriptor.pipeline_layout, // layout - VK_NULL_HANDLE, // basePipelineHandle - 0u, // basePipelineIndex - }; - - VK_CHECK(vkCreateComputePipelines( - device_, - pipeline_cache, - 1u, - &compute_pipeline_create_info, - nullptr, - &handle_)); -} - -ComputePipeline::ComputePipeline(ComputePipeline&& other) noexcept - : device_(other.device_), - handle_(other.handle_), - map_entries_(std::move(other.map_entries_)) { - other.handle_ = VK_NULL_HANDLE; -} - -ComputePipeline::~ComputePipeline() { - if (handle_ == VK_NULL_HANDLE) { - return; - } - vkDestroyPipeline(device_, handle_, nullptr); - handle_ = VK_NULL_HANDLE; -} - -void swap(ComputePipeline& lhs, ComputePipeline& rhs) noexcept { - VkDevice tmp_device = lhs.device_; - VkPipeline tmp_handle = lhs.handle_; - - lhs.device_ = rhs.device_; - lhs.handle_ = rhs.handle_; - - rhs.device_ = tmp_device; - rhs.handle_ = tmp_handle; -} - -bool operator==( - const ComputePipeline::Descriptor& _1, - const ComputePipeline::Descriptor& _2) { - return ( - _1.pipeline_layout == _2.pipeline_layout && - _1.shader_module == _2.shader_module && - _1.specialization_constants == _2.specialization_constants); -} - -// -// PipelineLayoutCache -// - -PipelineLayoutCache::PipelineLayoutCache(VkDevice device) - : cache_mutex_{}, device_(device), cache_{} {} - -PipelineLayoutCache::PipelineLayoutCache(PipelineLayoutCache&& other) noexcept - : cache_mutex_{}, device_(other.device_), cache_(std::move(other.cache_)) { - std::lock_guard lock(other.cache_mutex_); -} - -PipelineLayoutCache::~PipelineLayoutCache() { - purge(); -} - -VkPipelineLayout PipelineLayoutCache::retrieve( - const VkDescriptorSetLayout layout, - const uint32_t push_constants_size) { - PipelineLayoutCache::Key key{layout, push_constants_size}; - std::lock_guard lock(cache_mutex_); - - auto it = cache_.find(key); - if (cache_.cend() == it) { - it = cache_ - .insert( - {key, - PipelineLayoutCache::Value( - device_, layout, push_constants_size)}) - .first; - } 
- - return it->second.handle(); -} - -void PipelineLayoutCache::purge() { - std::lock_guard lock(cache_mutex_); - cache_.clear(); -} - -// -// ComputePipelineCache -// - -ComputePipelineCache::ComputePipelineCache( - VkDevice device, - const std::string& cache_data_path) - : cache_mutex_{}, - device_(device), - pipeline_cache_{VK_NULL_HANDLE}, - cache_{}, - cache_data_path_(cache_data_path) { - VkPipelineCacheCreateInfo pipeline_cache_create_info{}; - - auto buffer = load_cache(); - - pipeline_cache_create_info = { - VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - buffer.size(), // initialDataSize - buffer.data(), // pInitialData - }; - - VK_CHECK(vkCreatePipelineCache( - device, &pipeline_cache_create_info, nullptr, &pipeline_cache_)); -} - -ComputePipelineCache::ComputePipelineCache( - ComputePipelineCache&& other) noexcept - : cache_mutex_{}, - device_(other.device_), - pipeline_cache_(other.pipeline_cache_), - cache_(std::move(other.cache_)) { - std::lock_guard lock(other.cache_mutex_); - - other.pipeline_cache_ = VK_NULL_HANDLE; -} - -ComputePipelineCache::~ComputePipelineCache() { - purge(); - - if (pipeline_cache_ == VK_NULL_HANDLE) { - return; - } - - vkDestroyPipelineCache(device_, pipeline_cache_, nullptr); - pipeline_cache_ = VK_NULL_HANDLE; -} - -bool ComputePipelineCache::contains(const ComputePipelineCache::Key& key) { - std::lock_guard lock(cache_mutex_); - - auto it = cache_.find(key); - return it != cache_.cend(); -} - -void ComputePipelineCache::create_pipelines( - const std::unordered_set& descriptors) { - std::lock_guard lock(cache_mutex_); - - const auto num_pipelines = descriptors.size(); - std::vector pipelines(num_pipelines); - - std::vector> map_entries; - map_entries.reserve(num_pipelines); - - std::vector specialization_infos; - specialization_infos.reserve(num_pipelines); - - std::vector shader_stage_create_infos; - shader_stage_create_infos.reserve(num_pipelines); - - std::vector create_infos; - create_infos.reserve(num_pipelines); - - for (auto& key : descriptors) { - map_entries.push_back(key.specialization_constants.generate_map_entries()); - - specialization_infos.push_back(VkSpecializationInfo{ - key.specialization_constants.size(), // mapEntryCount - map_entries.back().data(), // pMapEntries - key.specialization_constants.data_nbytes(), // dataSize - key.specialization_constants.data(), // pData - }); - - shader_stage_create_infos.push_back(VkPipelineShaderStageCreateInfo{ - VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - VK_SHADER_STAGE_COMPUTE_BIT, // stage - key.shader_module, // module - "main", // pName - &specialization_infos.back(), // pSpecializationInfo - }); - - create_infos.push_back(VkComputePipelineCreateInfo{ - VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - shader_stage_create_infos.back(), // stage - key.pipeline_layout, // layout - VK_NULL_HANDLE, // basePipelineHandle - 0u, // basePipelineIndex - }); - } - - VK_CHECK(vkCreateComputePipelines( - device_, - pipeline_cache_, - create_infos.size(), - create_infos.data(), - nullptr, - pipelines.data())); - - uint32_t i = 0; - for (auto& key : descriptors) { - auto it = cache_.find(key); - if (it != cache_.cend()) { - continue; - } - cache_.insert({key, ComputePipelineCache::Value(device_, pipelines[i])}); - ++i; - } -} - -VkPipeline ComputePipelineCache::retrieve( - const ComputePipelineCache::Key& key) { - std::lock_guard lock(cache_mutex_); - - auto it 
= cache_.find(key); - if (it == cache_.cend()) { - it = cache_ - .insert( - {key, - ComputePipelineCache::Value(device_, key, pipeline_cache_)}) - .first; - } - return it->second.handle(); -} - -void ComputePipelineCache::purge() { - cache_.clear(); -} - -std::vector ComputePipelineCache::load_cache() { - // No optimization if path is unspecified - if (cache_data_path_.empty()) { - return {}; - } - - // Return if file doesn't exist; this is expected on first model-load - std::ifstream file(cache_data_path_, std::ios::binary | std::ios::ate); - if (file.fail()) { - return {}; - } - - auto size = file.tellg(); - file.seekg(0, std::ios::beg); - - std::vector buffer(size); - file.read(buffer.data(), size); - - return buffer; -} - -void ComputePipelineCache::save_cache() { - // No optimization if path is unspecified - if (cache_data_path_.empty()) { - return; - } - - // Return if file exists; the cache is already saved - std::ifstream ifile(cache_data_path_); - if (ifile.good()) { - return; - } - - size_t size{}; - vkGetPipelineCacheData(device_, pipeline_cache_, &size, nullptr); - - std::vector buffer(size); - vkGetPipelineCacheData(device_, pipeline_cache_, &size, buffer.data()); - - std::ofstream file(cache_data_path_, std::ios::binary); - file.write(buffer.data(), buffer.size()); -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Pipeline.h b/backends/vulkan/runtime/vk_api/Pipeline.h deleted file mode 100644 index 67dfaebe75b..00000000000 --- a/backends/vulkan/runtime/vk_api/Pipeline.h +++ /dev/null @@ -1,314 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
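ComputePipelineCache layers an in-memory pipeline map on top of a VkPipelineCache that can be serialized to disk, so the expensive SPIR-V-to-binary compilation is only paid on the first run. A sketch of the round trip follows; the device, pipeline layout, shader module, specialization constants and on-disk path are assumptions for illustration.

```
// Illustrative sketch: retrieve (building on a cache miss) a compute pipeline
// and persist the driver cache. The path is only an example location.
vkapi::ComputePipelineCache pipeline_cache(
    device, "/data/local/tmp/executorch_vk_pipeline_cache.bin");

const vkapi::ComputePipelineCache::Key key{
    pipeline_layout,            // VkPipelineLayout
    shader_module,              // VkShaderModule
    {SV(8u), SV(8u), SV(1u)},   // specialization constants (SpecVarList)
};

VkPipeline pipeline = pipeline_cache.retrieve(key);
(void)pipeline;

// No-op if the file already exists or the path is empty; otherwise writes the
// vkGetPipelineCacheData blob so later loads skip shader recompilation.
pipeline_cache.save_cache();
```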
- */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include -#include - -#include -#include -#include - -#define SV(x) ::vkcompute::vkapi::SpecVar(x) - -namespace vkcompute { -namespace vkapi { - -struct SpecVar final { - enum class Type : uint8_t { - FLOAT, - INT, - UINT, - BOOL, - }; - - union Value { - int32_t as_int32; - uint32_t as_uint32; - float as_float; - bool as_bool; - }; - - Value value; - Type type; - - SpecVar(); - SpecVar(const float val); - SpecVar(const int32_t val); - SpecVar(const uint32_t val); - SpecVar(const bool val); - - uint32_t val_size() const; - uint32_t val_offset() const; -}; - -bool operator==(const SpecVar& lhs, const SpecVar& rhs); - -bool operator!=(const SpecVar& lhs, const SpecVar& rhs); - -class SpecVarList final { - std::vector vars; - - public: - SpecVarList(); - SpecVarList(std::initializer_list init_list); - - inline const SpecVar& at(const size_t index) const { - return vars.at(index); - } - - inline const SpecVar* data() const { - return vars.data(); - } - - inline uint32_t size() const { - return utils::safe_downcast(vars.size()); - } - - inline uint32_t data_nbytes() const { - return vars.size() * sizeof(SpecVar); - } - - void append(const SpecVarList& other); - - void reserve(const size_t size); - - void append(const SpecVar& other); - - std::vector generate_map_entries() const; - - friend bool operator==(const SpecVarList& lhs, const SpecVarList& rhs); -}; - -bool operator==(const SpecVarList& lhs, const SpecVarList& rhs); - -struct PipelineBarrier final { - struct Stages final { - VkPipelineStageFlags src; - VkPipelineStageFlags dst; - } stage; - - std::vector buffers; - std::vector images; - std::vector buffer_barrier_handles; - std::vector image_barrier_handles; - - inline operator bool() const { - return (0u != stage.src) || (0u != stage.dst) || !buffers.empty() || - !images.empty(); - } -}; - -using PipelineStageFlags = uint8_t; - -enum PipelineStage : PipelineStageFlags { - NO_STAGE = 0u << 0u, - COMPUTE = 1u << 0u, - HOST = 1u << 1u, - TRANSFER = 1u << 2u, -}; - -VkAccessFlags vk_access(const PipelineStageFlags, const MemoryAccessFlags); -VkPipelineStageFlags vk_stage(const PipelineStageFlags); -VkImageLayout vk_layout(const PipelineStageFlags, const MemoryAccessFlags); - -class PipelineLayout final { - public: - explicit PipelineLayout(VkDevice, VkDescriptorSetLayout, const uint32_t); - - PipelineLayout(const PipelineLayout&) = delete; - PipelineLayout& operator=(const PipelineLayout&) = delete; - - PipelineLayout(PipelineLayout&&) noexcept; - PipelineLayout& operator=(PipelineLayout&&) = delete; - - ~PipelineLayout(); - - private: - VkDevice device_; - VkPipelineLayout handle_; - - public: - VkPipelineLayout handle() const { - return handle_; - } - - // We need to define a custom swap function since this class - // does not allow for move assignment. The swap function will - // be used in the hash map. 
- friend void swap(PipelineLayout& lhs, PipelineLayout& rhs) noexcept; -}; - -class ComputePipeline final { - public: - struct Descriptor final { - VkPipelineLayout pipeline_layout; - VkShaderModule shader_module; - SpecVarList specialization_constants; - }; - - explicit ComputePipeline(VkDevice device, VkPipeline handle); - - explicit ComputePipeline( - VkDevice device, - const Descriptor& descriptor, - VkPipelineCache pipeline_cache); - - ComputePipeline(const ComputePipeline&) = delete; - ComputePipeline& operator=(const ComputePipeline&) = delete; - - ComputePipeline(ComputePipeline&&) noexcept; - ComputePipeline& operator=(ComputePipeline&&) = delete; - - ~ComputePipeline(); - - private: - VkDevice device_; - VkPipeline handle_; - std::vector map_entries_; - - public: - inline VkPipeline handle() const { - return handle_; - } - - // We need to define a custom swap function since this class - // does not allow for move assignment. The swap function will - // be used in the hash map. - friend void swap(ComputePipeline& lhs, ComputePipeline& rhs) noexcept; - - friend bool operator==( - const ComputePipeline::Descriptor& _1, - const ComputePipeline::Descriptor& _2); -}; - -class PipelineLayoutCache final { - public: - explicit PipelineLayoutCache(VkDevice device); - - PipelineLayoutCache(const PipelineLayoutCache&) = delete; - PipelineLayoutCache& operator=(const PipelineLayoutCache&) = delete; - - PipelineLayoutCache(PipelineLayoutCache&&) noexcept; - PipelineLayoutCache& operator=(PipelineLayoutCache&&) = delete; - - ~PipelineLayoutCache(); - using Key = std::pair; - using Value = PipelineLayout; - - struct Hasher { - inline size_t operator()( - std::pair key) const { - size_t seed = 0; - seed = utils::hash_combine( - seed, std::hash()(key.first)); - seed = utils::hash_combine(seed, std::hash()(key.second)); - return seed; - } - }; - - private: - // Multiple threads could potentially be adding entries into the cache, so use - // a mutex to manage access - std::mutex cache_mutex_; - - VkDevice device_; - std::unordered_map cache_; - - public: - VkPipelineLayout retrieve(const VkDescriptorSetLayout, const uint32_t); - void purge(); -}; - -class ComputePipelineCache final { - public: - explicit ComputePipelineCache( - VkDevice device, - const std::string& cache_data_path); - - ComputePipelineCache(const ComputePipelineCache&) = delete; - ComputePipelineCache& operator=(const ComputePipelineCache&) = delete; - - ComputePipelineCache(ComputePipelineCache&&) noexcept; - ComputePipelineCache& operator=(ComputePipelineCache&&) = delete; - - ~ComputePipelineCache(); - - using Key = ComputePipeline::Descriptor; - using Value = ComputePipeline; - - struct Hasher { - inline size_t operator()( - const ComputePipeline::Descriptor& descriptor) const { - size_t seed = 0; - seed = utils::hash_combine( - seed, std::hash()(descriptor.pipeline_layout)); - seed = utils::hash_combine( - seed, std::hash()(descriptor.shader_module)); - - const SpecVarList& spec_vars = descriptor.specialization_constants; - seed = utils::hash_combine(seed, std::hash()(spec_vars.size())); - - for (int i = 0; i < spec_vars.size(); ++i) { - const SpecVar& spec_var = spec_vars.at(i); - size_t new_seed = 0; - switch (spec_var.type) { - case SpecVar::Type::FLOAT: - new_seed = std::hash()(spec_var.value.as_float); - break; - case SpecVar::Type::INT: - new_seed = std::hash()(spec_var.value.as_int32); - break; - case SpecVar::Type::UINT: - new_seed = std::hash()(spec_var.value.as_uint32); - break; - case SpecVar::Type::BOOL: - new_seed 
= std::hash()(spec_var.value.as_bool); - break; - } - seed = utils::hash_combine(seed, new_seed); - } - - return seed; - } - }; - - void save_cache(); - - private: - std::vector load_cache(); - - // Multiple threads could potentially be adding entries into the cache, so use - // a mutex to manage access - std::mutex cache_mutex_; - - VkDevice device_; - VkPipelineCache pipeline_cache_; - std::unordered_map cache_; - const std::string cache_data_path_; - - public: - bool contains(const Key&); - void create_pipelines(const std::unordered_set&); - VkPipeline retrieve(const Key&); - void purge(); -}; - -// -// Impl -// - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/QueryPool.cpp b/backends/vulkan/runtime/vk_api/QueryPool.cpp deleted file mode 100644 index e8b3ca55206..00000000000 --- a/backends/vulkan/runtime/vk_api/QueryPool.cpp +++ /dev/null @@ -1,314 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -// @lint-ignore-every CLANGTIDY facebook-hte-BadImplicitCast - -#include - -#include - -#include -#include -#include -#include - -namespace vkcompute { -namespace vkapi { - -namespace { - -// On Mali gpus timestamp_period seems to return 0. -// For some reason when 52.08 is used op runtimes seem to make more sense -// TODO: Figure out what is special about 52.08 -constexpr int64_t kDefaultNsPerTick = 52; // lround(52.08f); - -} // namespace - -#define EARLY_RETURN_IF_UNINITIALIZED() \ - if (querypool_ == VK_NULL_HANDLE) { \ - return; \ - } - -QueryPool::QueryPool(const QueryPoolConfig& config, const Adapter* adapter_p) - : config_(config), - ns_per_tick_(1u), - device_(VK_NULL_HANDLE), - querypool_(VK_NULL_HANDLE), - num_queries_(0u), - shader_durations_(0), - mutex_{} { - initialize(adapter_p); -} - -QueryPool::~QueryPool() { - EARLY_RETURN_IF_UNINITIALIZED(); - vkDestroyQueryPool(device_, querypool_, nullptr); -} - -void QueryPool::initialize(const Adapter* adapter_p) { - // No-op if adapter_p is nullptr or querypool is already created - if (!adapter_p || querypool_ != VK_NULL_HANDLE) { - return; - } - - device_ = adapter_p->device_handle(); - - ns_per_tick_ = std::lround(adapter_p->timestamp_period()); - ns_per_tick_ = (ns_per_tick_ == 0) ? 
kDefaultNsPerTick : ns_per_tick_; - - shader_durations_.reserve(config_.initial_reserve_size); - - const VkQueryPoolCreateInfo info{ - VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - VK_QUERY_TYPE_TIMESTAMP, // queryType - config_.max_query_count, // queryCount - 0u, // pipelineStatistics - }; - - VK_CHECK(vkCreateQueryPool(device_, &info, nullptr, &querypool_)); -} - -size_t QueryPool::write_timestamp(const CommandBuffer& cmd) { - VK_CHECK_COND( - num_queries_ < config_.max_query_count, - "Vulkan QueryPool: Exceeded the maximum number of queries " - "allowed by the queryPool (", - config_.max_query_count, - ")!"); - - cmd.write_timestamp(querypool_, num_queries_++); - return num_queries_ - 1; -} - -void QueryPool::reset_querypool(const CommandBuffer& cmd) { - EARLY_RETURN_IF_UNINITIALIZED(); - std::lock_guard lock(mutex_); - - cmd.reset_querypool(querypool_, 0u, config_.max_query_count); - reset_state(); -} - -void QueryPool::reset_state() { - num_queries_ = 0u; - shader_durations_.clear(); -} - -void QueryPool::shader_profile_begin( - const CommandBuffer& cmd, - const uint32_t dispatch_id, - const std::string& kernel_name, - const VkExtent3D global_workgroup_size, - const VkExtent3D local_workgroup_size) { - EARLY_RETURN_IF_UNINITIALIZED(); - std::lock_guard lock(mutex_); - - uint32_t query_idx = write_timestamp(cmd); - - ShaderDuration log_entry{ - utils::safe_downcast(shader_durations_.size()), - // Execution Properties - dispatch_id, - kernel_name, - global_workgroup_size, - local_workgroup_size, - // Query indexes - query_idx, // start query idx - UINT32_MAX, // end query idx - // Timings - 0u, // start time - 0u, // end time - 0u, // duration - }; - - shader_durations_.emplace_back(log_entry); -} - -void QueryPool::shader_profile_end(const CommandBuffer& cmd) { - EARLY_RETURN_IF_UNINITIALIZED(); - std::lock_guard lock(mutex_); - - size_t query_idx = write_timestamp(cmd); - shader_durations_.back().end_query_idx = query_idx; -} - -void QueryPool::extract_results() { - EARLY_RETURN_IF_UNINITIALIZED(); - std::lock_guard lock(mutex_); - - const VkQueryResultFlags flags = VK_QUERY_RESULT_64_BIT; - - std::vector query_data; - query_data.resize(num_queries_); - - VK_CHECK(vkGetQueryPoolResults( - device_, - querypool_, - 0u, // firstQuery - num_queries_, // queryCount - sizeof(uint64_t) * num_queries_, // dataSize - query_data.data(), // pData - sizeof(uint64_t), // stride - flags)); // flags - - for (ShaderDuration& entry : shader_durations_) { - entry.start_time_ns = query_data.at(entry.start_query_idx) * ns_per_tick_; - entry.end_time_ns = query_data.at(entry.end_query_idx) * ns_per_tick_; - entry.execution_duration_ns = entry.end_time_ns - entry.start_time_ns; - } -} - -std::ostream& operator<<(std::ostream& os, const VkExtent3D& extents) { - os << "{" << extents.width << ", " << extents.height << ", " << extents.depth - << "}"; - return os; -} - -std::string stringize(const VkExtent3D& extents) { - std::stringstream ss; - ss << "{" << extents.width << ", " << extents.height << ", " << extents.depth - << "}"; - return ss.str(); -} - -std::vector QueryPool::get_shader_timestamp_data() { - if (querypool_ == VK_NULL_HANDLE) { - return {}; - } - std::lock_guard lock(mutex_); - std::vector shader_result; - for (ShaderDuration& entry : shader_durations_) { - shader_result.push_back(ShaderResult{ - /* .kernel_name = */ entry.kernel_name, - /* .dispatch_id = */ entry.dispatch_id, - /* .start_time_ns = */ entry.start_time_ns, - /* .end_time_ns = */ 
entry.end_time_ns, - /* .metadata = */ - ShaderMetadata{ - /* .global_workgroup_size = */ - {entry.global_workgroup_size.width, - entry.global_workgroup_size.height, - entry.global_workgroup_size.depth}, - /* .local_workgroup_size = */ - {entry.local_workgroup_size.width, - entry.local_workgroup_size.height, - entry.local_workgroup_size.depth}, - }}); - } - return shader_result; -} - -std::string QueryPool::generate_string_report() { - std::lock_guard lock(mutex_); - - std::stringstream ss; - - int kernel_name_w = 120; - int global_size_w = 25; - int local_size_w = 25; - int duration_w = 25; - - ss << std::left; - ss << std::setw(kernel_name_w) << "Kernel Name"; - ss << std::setw(global_size_w) << "Global Workgroup Size"; - ss << std::setw(local_size_w) << "Local Workgroup Size"; - ss << std::right << std::setw(duration_w) << "Duration (ns)"; - ss << std::endl; - - ss << std::left; - ss << std::setw(kernel_name_w) << "==========="; - ss << std::setw(global_size_w) << "====================="; - ss << std::setw(local_size_w) << "===================="; - ss << std::right << std::setw(duration_w) << "============="; - ss << std::endl; - - for (ShaderDuration& entry : shader_durations_) { - std::chrono::duration exec_duration_ns( - entry.execution_duration_ns); - - ss << std::left; - ss << std::setw(kernel_name_w) << entry.kernel_name; - ss << std::setw(global_size_w) << stringize(entry.global_workgroup_size); - ss << std::setw(local_size_w) << stringize(entry.local_workgroup_size); - ss << std::right << std::setw(duration_w) << exec_duration_ns.count(); - ss << std::endl; - } - - return ss.str(); -} - -std::string QueryPool::generate_tsv_string_report() { - std::lock_guard lock(mutex_); - - std::stringstream ss; - - ss << "Kernel Name\t"; - ss << "Global Workgroup Size\t"; - ss << "Local Workgroup Size\t"; - ss << "Duration (ns)\t"; - ss << std::endl; - - ss << "===========\t"; - ss << "=====================\t"; - ss << "====================\t"; - ss << "=============\t"; - ss << std::endl; - - for (ShaderDuration& entry : shader_durations_) { - std::chrono::duration exec_duration_ns( - entry.execution_duration_ns); - - ss << entry.kernel_name << "\t"; - ss << stringize(entry.global_workgroup_size) << "\t"; - ss << stringize(entry.local_workgroup_size) << "\t"; - ss << exec_duration_ns.count() << "\t"; - ss << std::endl; - } - - return ss.str(); -} - -void QueryPool::print_results(const bool tsv_format) { - EARLY_RETURN_IF_UNINITIALIZED(); - if (tsv_format) { - std::cout << generate_tsv_string_report() << std::endl; - } else { - std::cout << generate_string_report() << std::endl; - } -} - -unsigned long QueryPool::get_total_shader_ns(std::string kernel_name) { - for (ShaderDuration& entry : shader_durations_) { - if (entry.kernel_name == kernel_name) { - std::chrono::duration exec_duration_ns( - entry.execution_duration_ns); - return exec_duration_ns.count(); - } - } - return 0; -} - -unsigned long QueryPool::get_mean_shader_ns(std::string kernel_name) { - uint64_t total_ns = 0; - uint32_t count = 0; - for (ShaderDuration& entry : shader_durations_) { - if (entry.kernel_name == kernel_name) { - std::chrono::duration exec_duration_ns( - entry.execution_duration_ns); - total_ns += exec_duration_ns.count(); - count++; - } - } - if (count == 0) { - return 0; - } - return total_ns / count; -} -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/QueryPool.h b/backends/vulkan/runtime/vk_api/QueryPool.h deleted file mode 100644 index 94bd99584eb..00000000000 --- 
a/backends/vulkan/runtime/vk_api/QueryPool.h +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include -#include -#include - -#include -#include - -#ifndef VULKAN_QUERY_POOL_SIZE -#define VULKAN_QUERY_POOL_SIZE 4096u -#endif - -namespace vkcompute { -namespace vkapi { - -struct ShaderMetadata final { - const uint32_t global_workgroup_size[3]; - const uint32_t local_workgroup_size[3]; -}; - -struct ShaderResult final { - const std::string kernel_name; - const uint32_t dispatch_id; - const uint64_t start_time_ns; - const uint64_t end_time_ns; - const ShaderMetadata metadata; -}; - -struct QueryPoolConfig final { - uint32_t max_query_count = VULKAN_QUERY_POOL_SIZE; - uint32_t initial_reserve_size = 256u; -}; - -struct ShaderDuration final { - uint32_t idx; - - // Execution Properties - uint32_t dispatch_id; - std::string kernel_name; - VkExtent3D global_workgroup_size; - VkExtent3D local_workgroup_size; - - // Query indexes - uint32_t start_query_idx; - uint32_t end_query_idx; - - // Timings - uint64_t start_time_ns; - uint64_t end_time_ns; - uint64_t execution_duration_ns; -}; - -class QueryPool final { - // Configuration - QueryPoolConfig config_; - uint64_t ns_per_tick_; - - // Vulkan handles - VkDevice device_; - VkQueryPool querypool_; - - // Internal State - uint32_t num_queries_; - std::vector shader_durations_; - - std::mutex mutex_; - - public: - explicit QueryPool(const QueryPoolConfig&, const Adapter* adapter_p); - - QueryPool(const QueryPool&) = delete; - QueryPool& operator=(const QueryPool&) = delete; - - QueryPool(QueryPool&&) = delete; - QueryPool& operator=(QueryPool&&) = delete; - - ~QueryPool(); - - void initialize(const Adapter* adapter_p); - - private: - size_t write_timestamp(const CommandBuffer&); - - public: - void reset_querypool(const CommandBuffer&); - - void reset_state(); - - void shader_profile_begin( - const CommandBuffer&, - const uint32_t, - const std::string&, - const VkExtent3D, - const VkExtent3D); - - void shader_profile_end(const CommandBuffer&); - - void extract_results(); - - std::vector get_shader_timestamp_data(); - void print_results(const bool tsv_format = false); - unsigned long get_total_shader_ns(std::string kernel_name); - unsigned long get_mean_shader_ns(std::string kernel_name); - - operator bool() const { - return querypool_ != VK_NULL_HANDLE; - } - - private: - std::string generate_string_report(); - std::string generate_tsv_string_report(); -}; - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Runtime.cpp b/backends/vulkan/runtime/vk_api/Runtime.cpp deleted file mode 100644 index c3376e2ccbf..00000000000 --- a/backends/vulkan/runtime/vk_api/Runtime.cpp +++ /dev/null @@ -1,449 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include -#include - -#ifdef USE_VOLK_HEADER_ONLY -// For volk.h, define this before including volk.h in exactly one CPP file. 
-#define VOLK_IMPLEMENTATION -#include -#endif /* USE_VOLK_HEADER_ONLY */ - -namespace vkcompute { -namespace vkapi { - -#define PRINT_CASE(name) \ - case MemoryAccessType::name: \ - out << #name; \ - break; - -std::ostream& operator<<(std::ostream& out, const MemoryAccessType& tag) { - switch (tag) { - PRINT_CASE(NONE) - PRINT_CASE(READ) - PRINT_CASE(WRITE) - } - return out; -} - -#undef PRINT_CASE - -namespace { - -void find_requested_layers_and_extensions( - std::vector& enabled_layers, - std::vector& enabled_extensions, - const std::vector& requested_layers, - const std::vector& requested_extensions) { - // Get supported instance layers - uint32_t layer_count = 0; - VK_CHECK(vkEnumerateInstanceLayerProperties(&layer_count, nullptr)); - - std::vector layer_properties(layer_count); - VK_CHECK(vkEnumerateInstanceLayerProperties( - &layer_count, layer_properties.data())); - - // Search for requested layers - for (const auto& requested_layer : requested_layers) { - for (const auto& layer : layer_properties) { - if (strcmp(requested_layer, layer.layerName) == 0) { - enabled_layers.push_back(requested_layer); - break; - } - } - } - - // Get supported instance extensions - uint32_t extension_count = 0; - VK_CHECK(vkEnumerateInstanceExtensionProperties( - nullptr, &extension_count, nullptr)); - - std::vector extension_properties(extension_count); - VK_CHECK(vkEnumerateInstanceExtensionProperties( - nullptr, &extension_count, extension_properties.data())); - - // Search for requested extensions - for (const auto& requested_extension : requested_extensions) { - for (const auto& extension : extension_properties) { - if (strcmp(requested_extension, extension.extensionName) == 0) { - enabled_extensions.push_back(requested_extension); - break; - } - } - } -} - -VkInstance create_instance(const RuntimeConfig& config) { - const VkApplicationInfo application_info{ - VK_STRUCTURE_TYPE_APPLICATION_INFO, // sType - nullptr, // pNext - "PyTorch Vulkan Backend", // pApplicationName - 0, // applicationVersion - nullptr, // pEngineName - 0, // engineVersion - VK_API_VERSION_1_1, // apiVersion - }; - - std::vector enabled_layers; - std::vector enabled_extensions; - - std::vector requested_layers; - std::vector requested_extensions; - - if (config.enable_validation_messages) { - requested_layers.emplace_back("VK_LAYER_KHRONOS_validation"); -#ifdef VK_EXT_debug_report - requested_extensions.emplace_back(VK_EXT_DEBUG_REPORT_EXTENSION_NAME); -#endif /* VK_EXT_debug_report */ - } - - VkInstanceCreateFlags instance_flags = 0; -#ifdef __APPLE__ - instance_flags |= VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR; - requested_extensions.emplace_back( - VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME); -#endif - - find_requested_layers_and_extensions( - enabled_layers, - enabled_extensions, - requested_layers, - requested_extensions); - - const void* instance_create_next = nullptr; - // VkConfig on Mac platforms does not expose debugPrintf settings for whatever - // reason so it has to be enabled manually. 
-#if defined(__APPLE__) && defined(VULKAN_DEBUG) - std::vector enabled_validation_features{ - VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT, - }; - VkValidationFeaturesEXT validation_features = { - VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT, // sType - nullptr, // pNext - static_cast( - enabled_validation_features.size()), // enabledValidationFeatureCount - enabled_validation_features.data(), // pEnabledValidationFeatures - 0, - nullptr, // pDisabledValidationFeatures - }; - instance_create_next = &validation_features; -#endif /* __APPLE__ && VULKAN_DEBUG */ - - const VkInstanceCreateInfo instance_create_info{ - VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, // sType - instance_create_next, // pNext - instance_flags, // flags - &application_info, // pApplicationInfo - static_cast(enabled_layers.size()), // enabledLayerCount - enabled_layers.data(), // ppEnabledLayerNames - static_cast(enabled_extensions.size()), // enabledExtensionCount - enabled_extensions.data(), // ppEnabledExtensionNames - }; - - VkInstance instance{}; - VK_CHECK(vkCreateInstance(&instance_create_info, nullptr, &instance)); - VK_CHECK_COND(instance, "Invalid Vulkan instance!"); - -#ifdef USE_VULKAN_VOLK - volkLoadInstance(instance); -#endif /* USE_VULKAN_VOLK */ - - return instance; -} - -std::vector create_physical_devices( - VkInstance instance) { - if (instance == VK_NULL_HANDLE) { - return std::vector(); - } - - uint32_t device_count = 0; - VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, nullptr)); - - std::vector devices(device_count); - VK_CHECK(vkEnumeratePhysicalDevices(instance, &device_count, devices.data())); - - std::vector device_mappings; - device_mappings.reserve(device_count); - for (VkPhysicalDevice physical_device : devices) { - device_mappings.emplace_back(PhysicalDevice(physical_device), -1); - } - - return device_mappings; -} - -VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn( - const VkDebugReportFlagsEXT flags, - const VkDebugReportObjectTypeEXT /* object_type */, - const uint64_t /* object */, - const size_t /* location */, - const int32_t message_code, - const char* const layer_prefix, - const char* const message, - void* const /* user_data */) { - (void)flags; - - std::stringstream stream; - stream << layer_prefix << " " << message_code << " " << message << std::endl; - const std::string log = stream.str(); - - std::cout << log; - - return VK_FALSE; -} - -VkDebugReportCallbackEXT create_debug_report_callback( - VkInstance instance, - const RuntimeConfig config) { - if (instance == VK_NULL_HANDLE || !config.enable_validation_messages) { - return VkDebugReportCallbackEXT{}; - } - - const VkDebugReportCallbackCreateInfoEXT debugReportCallbackCreateInfo{ - VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, // sType - nullptr, // pNext - VK_DEBUG_REPORT_INFORMATION_BIT_EXT | VK_DEBUG_REPORT_WARNING_BIT_EXT | - VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT | - VK_DEBUG_REPORT_ERROR_BIT_EXT | - VK_DEBUG_REPORT_DEBUG_BIT_EXT, // flags - debug_report_callback_fn, // pfnCallback - nullptr, // pUserData - }; - - const auto vkCreateDebugReportCallbackEXT = - (PFN_vkCreateDebugReportCallbackEXT)vkGetInstanceProcAddr( - instance, "vkCreateDebugReportCallbackEXT"); - - VK_CHECK_COND( - vkCreateDebugReportCallbackEXT, - "Could not load vkCreateDebugReportCallbackEXT"); - - VkDebugReportCallbackEXT debug_report_callback{}; - VK_CHECK(vkCreateDebugReportCallbackEXT( - instance, - &debugReportCallbackCreateInfo, - nullptr, - &debug_report_callback)); - - VK_CHECK_COND(debug_report_callback, 
"Invalid Vulkan debug report callback!"); - - return debug_report_callback; -} - -// -// Adapter selection methods -// - -uint32_t select_first(const std::vector& devices) { - if (devices.empty()) { - return devices.size() + 1; // return out of range to signal invalidity - } - - // Select the first adapter that has compute capability - for (size_t i = 0; i < devices.size(); ++i) { - if (devices[i].first.num_compute_queues > 0) { - return i; - } - } - - return devices.size() + 1; -} - -// -// Global runtime initialization -// - -std::unique_ptr init_global_vulkan_runtime( - const std::string& cache_data_path) { - // Load Vulkan drivers -#if defined(USE_VULKAN_VOLK) - if (VK_SUCCESS != volkInitialize()) { - return std::unique_ptr(nullptr); - } -#elif defined(USE_VULKAN_WRAPPER) - if (!InitVulkan()) { - return std::unique_ptr(nullptr); - } -#endif /* USE_VULKAN_VOLK, USE_VULKAN_WRAPPER */ - - const bool enable_validation_messages = -#if defined(VULKAN_DEBUG) - true; -#else - false; -#endif /* VULKAN_DEBUG */ - const bool init_default_device = true; - const uint32_t num_requested_queues = 1; // TODO: raise this value - - const RuntimeConfig default_config{ - enable_validation_messages, - init_default_device, - AdapterSelector::First, - num_requested_queues, - cache_data_path, - }; - - try { - return std::make_unique(default_config); - } catch (...) { - } - - return std::unique_ptr(nullptr); -} - -} // namespace - -Runtime::Runtime(const RuntimeConfig config) - : config_(config), - instance_(create_instance(config_)), - device_mappings_(create_physical_devices(instance_)), - adapters_{}, - default_adapter_i_(UINT32_MAX), - debug_report_callback_(create_debug_report_callback(instance_, config_)) { - // List of adapters will never exceed the number of physical devices - adapters_.reserve(device_mappings_.size()); - - if (config.init_default_device) { - try { - switch (config.default_selector) { - case AdapterSelector::First: - default_adapter_i_ = create_adapter(select_first); - } - } catch (...) { - } - } -} - -Runtime::~Runtime() { - if (instance_ == VK_NULL_HANDLE) { - return; - } - - // Clear adapters list to trigger device destruction before destroying - // VkInstance - adapters_.clear(); - - // Instance must be destroyed last as its used to destroy the debug report - // callback. - if (debug_report_callback_) { - const auto vkDestroyDebugReportCallbackEXT = - (PFN_vkDestroyDebugReportCallbackEXT)vkGetInstanceProcAddr( - instance_, "vkDestroyDebugReportCallbackEXT"); - - if (vkDestroyDebugReportCallbackEXT) { - vkDestroyDebugReportCallbackEXT( - instance_, debug_report_callback_, nullptr); - } - - debug_report_callback_ = {}; - } - - vkDestroyInstance(instance_, nullptr); - instance_ = VK_NULL_HANDLE; -} - -uint32_t Runtime::create_adapter(const Selector& selector) { - VK_CHECK_COND( - !device_mappings_.empty(), - "Pytorch Vulkan Runtime: Could not initialize adapter because no " - "devices were found by the Vulkan instance."); - - uint32_t physical_device_i = selector(device_mappings_); - VK_CHECK_COND( - physical_device_i < device_mappings_.size(), - "Pytorch Vulkan Runtime: no suitable device adapter was selected! 
" - "Device could not be initialized"); - - Runtime::DeviceMapping& device_mapping = device_mappings_[physical_device_i]; - // If an Adapter has already been created, return that - int32_t adapter_i = device_mapping.second; - if (adapter_i >= 0) { - return adapter_i; - } - // Otherwise, create an adapter for the selected physical device - adapter_i = utils::safe_downcast(adapters_.size()); - adapters_.emplace_back(new Adapter( - instance_, - device_mapping.first, - config_.num_requested_queues, - config_.cache_data_path)); - device_mapping.second = adapter_i; - - return adapter_i; -} - -std::string& set_and_get_pipeline_cache_data_path( - const std::string& file_path) { - // The global cache data path is declared as a static local variable for the - // same reasons as the global runtime below. -#if defined(ETVK_DEFAULT_CACHE_PATH) - static std::string global_cache_data_path = ETVK_DEFAULT_CACHE_PATH; -#else - static std::string global_cache_data_path; -#endif /* ETVK_DEFAULT_CACHE_PATH */ - - if (file_path.size() > 0) { - global_cache_data_path = file_path; - } - return global_cache_data_path; -} - -Runtime* runtime() { - // The global vulkan runtime is declared as a static local variable within a - // non-static function to ensure it has external linkage. If it were a global - // static variable there would be one copy per translation unit that includes - // Runtime.h as it would have internal linkage. - static const std::unique_ptr p_runtime = - init_global_vulkan_runtime(set_and_get_pipeline_cache_data_path("")); - - VK_CHECK_COND( - p_runtime, - "Pytorch Vulkan Runtime: The global runtime could not be retrieved " - "because it failed to initialize."); - - return p_runtime.get(); -} - -std::unique_ptr init_external_adapter( - const VkInstance instance, - const VkPhysicalDevice physical_device, - const VkDevice logical_device, - const uint32_t num_queues, - const std::string& cache_data_path) { - if (instance == VK_NULL_HANDLE || physical_device == VK_NULL_HANDLE || - logical_device == VK_NULL_HANDLE) { - return std::unique_ptr(nullptr); - } - - return std::make_unique( - instance, physical_device, logical_device, num_queues, cache_data_path); -} - -Adapter* set_and_get_external_adapter( - const VkInstance instance, - const VkPhysicalDevice physical_device, - const VkDevice logical_device) { - static const std::unique_ptr p_external_adapter = - init_external_adapter( - instance, - physical_device, - logical_device, - 1, - set_and_get_pipeline_cache_data_path("")); - - return p_external_adapter.get(); -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Runtime.h b/backends/vulkan/runtime/vk_api/Runtime.h deleted file mode 100644 index 3706d6c73d0..00000000000 --- a/backends/vulkan/runtime/vk_api/Runtime.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include -#include - -namespace vkcompute { -namespace vkapi { - -// -// A Vulkan Runtime initializes a Vulkan instance and decouples the concept of -// Vulkan instance initialization from initialization of, and subsequent -// interactions with, Vulkan [physical and logical] devices as a precursor to -// multi-GPU support. 
The Vulkan Runtime can be queried for available Adapters -// (i.e. physical devices) in the system which in turn can be used for creation -// of a Vulkan Context (i.e. logical devices). All Vulkan tensors in PyTorch -// are associated with a Context to make tensor <-> device affinity explicit. -// - -enum AdapterSelector { - First, -}; - -struct RuntimeConfig final { - bool enable_validation_messages; - bool init_default_device; - AdapterSelector default_selector; - uint32_t num_requested_queues; - std::string cache_data_path; -}; - -class Runtime final { - public: - explicit Runtime(const RuntimeConfig); - - // Do not allow copying. There should be only one global instance of this - // class. - Runtime(const Runtime&) = delete; - Runtime& operator=(const Runtime&) = delete; - - Runtime(Runtime&&) = delete; - Runtime& operator=(Runtime&&) = delete; - - ~Runtime(); - - using DeviceMapping = std::pair; - using AdapterPtr = std::unique_ptr; - - private: - RuntimeConfig config_; - - VkInstance instance_; - - std::vector device_mappings_; - std::vector adapters_; - uint32_t default_adapter_i_; - - VkDebugReportCallbackEXT debug_report_callback_; - - public: - inline VkInstance instance() const { - return instance_; - } - - inline Adapter* get_adapter_p() { - VK_CHECK_COND( - default_adapter_i_ >= 0 && default_adapter_i_ < adapters_.size(), - "Pytorch Vulkan Runtime: Default device adapter is not set correctly!"); - return adapters_[default_adapter_i_].get(); - } - - inline Adapter* get_adapter_p(uint32_t i) { - VK_CHECK_COND( - i >= 0 && i < adapters_.size(), - "Pytorch Vulkan Runtime: Adapter at index ", - i, - " is not available!"); - return adapters_[i].get(); - } - - inline uint32_t default_adapter_i() const { - return default_adapter_i_; - } - - using Selector = - std::function&)>; - uint32_t create_adapter(const Selector&); -}; - -std::string& set_and_get_pipeline_cache_data_path(const std::string& file_path); - -// The global runtime is retrieved using this function, where it is declared as -// a static local variable. -Runtime* runtime(); - -// Used to share instance + devices between client code and ETVK -Adapter* set_and_get_external_adapter( - const VkInstance instance = VK_NULL_HANDLE, - const VkPhysicalDevice physical_device = VK_NULL_HANDLE, - const VkDevice logical_device = VK_NULL_HANDLE); - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Shader.cpp b/backends/vulkan/runtime/vk_api/Shader.cpp deleted file mode 100644 index 4356f92efe7..00000000000 --- a/backends/vulkan/runtime/vk_api/Shader.cpp +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include - -namespace vkcompute { -namespace vkapi { - -// -// ShaderInfo -// - -ShaderInfo::ShaderInfo() - : src_code{ - nullptr, - 0u, - } {} - -ShaderInfo::ShaderInfo( - std::string name, - const uint32_t* const spirv_bin, - const uint32_t size, - std::vector layout, - const utils::uvec3 tile_size, - const bool requires_shader_int16_ext, - const bool requires_16bit_storage_ext, - const bool requires_8bit_storage_ext, - const bool requires_integer_dot_product_ext) - : src_code{ - spirv_bin, - size, - }, - kernel_name{std::move(name)}, - kernel_layout{std::move(layout)}, - out_tile_size(tile_size), - requires_shader_int16(requires_shader_int16_ext), - requires_16bit_storage(requires_16bit_storage_ext), - requires_8bit_storage(requires_8bit_storage_ext), - requires_integer_dot_product(requires_integer_dot_product_ext) { -} - -bool operator==(const ShaderInfo& _1, const ShaderInfo& _2) { - return ( - _1.src_code.bin == _2.src_code.bin && - _1.src_code.size == _2.src_code.size); -} - -// -// ShaderLayout -// - -ShaderLayout::ShaderLayout( - VkDevice device, - const ShaderLayout::Signature& signature) - : device_(device), handle_{VK_NULL_HANDLE} { - std::vector bindings; - bindings.reserve(signature.size()); - - uint32_t binding_num = 0u; - for (const VkDescriptorType type : signature) { - bindings.emplace_back(VkDescriptorSetLayoutBinding{ - binding_num++, // binding - type, // descriptorType - 1u, // descriptorCount - VK_SHADER_STAGE_COMPUTE_BIT, // stageFlags - nullptr, // pImmutableSamplers - }); - } - - const VkDescriptorSetLayoutCreateInfo descriptor_set_layout_create_info{ - VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - static_cast(bindings.size()), // bindingCount - bindings.data(), // pBindings - }; - - VK_CHECK(vkCreateDescriptorSetLayout( - device_, &descriptor_set_layout_create_info, nullptr, &handle_)); -} - -ShaderLayout::ShaderLayout(ShaderLayout&& other) noexcept - : device_(other.device_), handle_(other.handle_) { - other.handle_ = VK_NULL_HANDLE; -} - -ShaderLayout::~ShaderLayout() { - if (handle_ == VK_NULL_HANDLE) { - return; - } - vkDestroyDescriptorSetLayout(device_, handle_, nullptr); - handle_ = VK_NULL_HANDLE; -} - -void swap(ShaderLayout& lhs, ShaderLayout& rhs) noexcept { - VkDevice tmp_device = lhs.device_; - VkDescriptorSetLayout tmp_handle = lhs.handle_; - - lhs.device_ = rhs.device_; - lhs.handle_ = rhs.handle_; - - rhs.device_ = tmp_device; - rhs.handle_ = tmp_handle; -} - -// -// ShaderModule -// - -ShaderModule::ShaderModule(VkDevice device, const ShaderInfo& source) - : device_(device), handle_{VK_NULL_HANDLE} { - const uint32_t* code = source.src_code.bin; - uint32_t size = source.src_code.size; - - const VkShaderModuleCreateInfo shader_module_create_info{ - VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - size, // codeSize - code, // pCode - }; - - VK_CHECK(vkCreateShaderModule( - device_, &shader_module_create_info, nullptr, &handle_)); -} - -ShaderModule::ShaderModule(ShaderModule&& other) noexcept - : device_(other.device_), handle_(other.handle_) { - other.handle_ = VK_NULL_HANDLE; -} - -ShaderModule::~ShaderModule() { - if (handle_ == VK_NULL_HANDLE) { - return; - } - vkDestroyShaderModule(device_, handle_, nullptr); - handle_ = VK_NULL_HANDLE; -} - -void swap(ShaderModule& lhs, ShaderModule& rhs) noexcept { - VkDevice tmp_device = lhs.device_; - VkShaderModule tmp_handle = lhs.handle_; - - lhs.device_ = rhs.device_; - lhs.handle_ = 
rhs.handle_; - - rhs.device_ = tmp_device; - rhs.handle_ = tmp_handle; -} - -// -// ShaderLayoutCache -// - -ShaderLayoutCache::ShaderLayoutCache(VkDevice device) - : cache_mutex_{}, device_(device), cache_{} {} - -ShaderLayoutCache::ShaderLayoutCache(ShaderLayoutCache&& other) noexcept - : cache_mutex_{}, device_(other.device_), cache_(std::move(other.cache_)) { - std::lock_guard lock(other.cache_mutex_); -} - -ShaderLayoutCache::~ShaderLayoutCache() { - purge(); -} - -VkDescriptorSetLayout ShaderLayoutCache::retrieve( - const ShaderLayoutCache::Key& key) { - std::lock_guard lock(cache_mutex_); - - auto it = cache_.find(key); - if (cache_.cend() == it) { - it = cache_.insert({key, ShaderLayoutCache::Value(device_, key)}).first; - } - - return it->second.handle(); -} - -void ShaderLayoutCache::purge() { - std::lock_guard lock(cache_mutex_); - cache_.clear(); -} - -// -// ShaderCache -// - -ShaderCache::ShaderCache(VkDevice device) - : cache_mutex_{}, device_(device), cache_{} {} - -ShaderCache::ShaderCache(ShaderCache&& other) noexcept - : cache_mutex_{}, device_(other.device_), cache_(std::move(other.cache_)) { - std::lock_guard lock(other.cache_mutex_); -} - -ShaderCache::~ShaderCache() { - purge(); -} - -VkShaderModule ShaderCache::retrieve(const ShaderCache::Key& key) { - std::lock_guard lock(cache_mutex_); - - auto it = cache_.find(key); - if (cache_.cend() == it) { - it = cache_.insert({key, ShaderCache::Value(device_, key)}).first; - } - - return it->second.handle(); -} - -void ShaderCache::purge() { - cache_.clear(); -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Shader.h b/backends/vulkan/runtime/vk_api/Shader.h deleted file mode 100644 index 21332381406..00000000000 --- a/backends/vulkan/runtime/vk_api/Shader.h +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include - -#include -#include - -namespace vkcompute { -namespace vkapi { - -class ShaderLayout final { - public: - using Signature = std::vector; - - explicit ShaderLayout(VkDevice, const Signature&); - - ShaderLayout(const ShaderLayout&) = delete; - ShaderLayout& operator=(const ShaderLayout&) = delete; - - ShaderLayout(ShaderLayout&&) noexcept; - ShaderLayout& operator=(ShaderLayout&&) = delete; - - ~ShaderLayout(); - - private: - VkDevice device_; - VkDescriptorSetLayout handle_; - - public: - VkDescriptorSetLayout handle() const { - return handle_; - } - - // We need to define a custom swap function since this class - // does not allow for move assignment. The swap function will - // be used in the hash map. 
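`ShaderLayoutCache::retrieve()` and `ShaderCache::retrieve()` above share the same shape: take the mutex, look the key up, construct the value in place on a miss, and return the cached object. A generic sketch of that pattern, with the value construction factored into a caller-supplied factory:

```
#include <mutex>
#include <unordered_map>

template <typename Key, typename Value, typename Hasher = std::hash<Key>>
class HandleCache {
 public:
  // Returns the cached value for `key`, constructing it on first use.
  // The mutex makes concurrent retrieve() calls safe.
  template <typename Factory>
  Value& retrieve(const Key& key, Factory&& make_value) {
    std::lock_guard<std::mutex> lock(mutex_);
    auto it = cache_.find(key);
    if (it == cache_.cend()) {
      it = cache_.emplace(key, make_value(key)).first;
    }
    return it->second;
  }

  void purge() {
    std::lock_guard<std::mutex> lock(mutex_);
    cache_.clear();
  }

 private:
  std::mutex mutex_;
  std::unordered_map<Key, Value, Hasher> cache_;
};
```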
- friend void swap(ShaderLayout& lhs, ShaderLayout& rhs) noexcept; -}; - -struct ShaderInfo final { - struct { - const uint32_t* bin = nullptr; - uint32_t size = 0u; - } src_code; - - std::string kernel_name{""}; - ShaderLayout::Signature kernel_layout{}; - - // Shader Metadata - utils::WorkgroupSize out_tile_size{1u, 1u, 1u}; - bool requires_shader_int16 = false; - bool requires_16bit_storage = false; - bool requires_8bit_storage = false; - bool requires_integer_dot_product = false; - - explicit ShaderInfo(); - - explicit ShaderInfo( - std::string, - const uint32_t*, - const uint32_t, - std::vector, - const utils::uvec3 tile_size, - const bool requires_shader_int16_ext, - const bool requires_16bit_storage_ext, - const bool requires_8bit_storage_ext, - const bool requires_integer_dot_product_ext); - - operator bool() const { - return src_code.bin != nullptr; - }; -}; - -bool operator==(const ShaderInfo& _1, const ShaderInfo& _2); - -class ShaderModule final { - public: - explicit ShaderModule(VkDevice device, const ShaderInfo& source); - - ShaderModule(const ShaderModule&) = delete; - ShaderModule& operator=(const ShaderModule&) = delete; - - ShaderModule(ShaderModule&&) noexcept; - ShaderModule& operator=(ShaderModule&&) = delete; - - ~ShaderModule(); - - private: - VkDevice device_; - VkShaderModule handle_; - - public: - inline VkShaderModule handle() const { - return handle_; - } - - // We need to define a custom swap function since this class - // does not allow for move assignment. The swap function will - // be used in the hash map. - friend void swap(ShaderModule& lhs, ShaderModule& rhs) noexcept; -}; - -class ShaderLayoutCache final { - public: - explicit ShaderLayoutCache(VkDevice device); - - ShaderLayoutCache(const ShaderLayoutCache&) = delete; - ShaderLayoutCache& operator=(const ShaderLayoutCache&) = delete; - - ShaderLayoutCache(ShaderLayoutCache&&) noexcept; - ShaderLayoutCache& operator=(ShaderLayoutCache&&) = delete; - - ~ShaderLayoutCache(); - - using Key = ShaderLayout::Signature; - using Value = ShaderLayout; - - struct Hasher { - inline size_t operator()(const ShaderLayout::Signature& signature) const { - size_t hashed = 0u; - - for (const VkDescriptorType type : signature) { - hashed = - utils::hash_combine(hashed, std::hash()(type)); - } - - return hashed; - } - }; - - private: - // Multiple threads could potentially be adding entries into the cache, so use - // a mutex to manage access - std::mutex cache_mutex_; - - VkDevice device_; - std::unordered_map cache_; - - public: - VkDescriptorSetLayout retrieve(const Key&); - void purge(); -}; - -class ShaderCache final { - public: - explicit ShaderCache(VkDevice device); - - ShaderCache(const ShaderCache&) = delete; - ShaderCache& operator=(const ShaderCache&) = delete; - - ShaderCache(ShaderCache&&) noexcept; - ShaderCache& operator=(ShaderCache&&) = delete; - - ~ShaderCache(); - - using Key = ShaderInfo; - using Value = ShaderModule; - - struct Hasher { - inline size_t operator()(const ShaderInfo& source) const { - size_t seed = 0; - seed = utils::hash_combine( - seed, std::hash()(source.src_code.bin)); - seed = utils::hash_combine( - seed, std::hash()(source.src_code.size)); - - return seed; - } - }; - - private: - // Multiple threads could potentially be adding entries into the cache, so use - // a mutex to manage access - std::mutex cache_mutex_; - - VkDevice device_; - std::unordered_map cache_; - - public: - VkShaderModule retrieve(const Key&); - void purge(); -}; - -} // namespace vkapi -} // namespace 
vkcompute - -inline bool operator==( - const VkDescriptorSetLayoutBinding& _1, - const VkDescriptorSetLayoutBinding& _2) { - return ( - _1.binding == _2.binding && _1.descriptorType == _2.descriptorType && - _1.descriptorCount == _2.descriptorCount && - _1.stageFlags == _2.stageFlags && - _1.pImmutableSamplers == _2.pImmutableSamplers); -} diff --git a/backends/vulkan/runtime/vk_api/Types.h b/backends/vulkan/runtime/vk_api/Types.h deleted file mode 100644 index b3309aa6c69..00000000000 --- a/backends/vulkan/runtime/vk_api/Types.h +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY bugprone-branch-clone - -#include - -#include - -#include -#include - -// X11 headers via volk define Bool, so we need to undef it -#if defined(__linux__) -#undef Bool -#endif - -#ifdef USE_VULKAN_FP16_INFERENCE -#define VK_FORMAT_FLOAT4 VK_FORMAT_R16G16B16A16_SFLOAT -#else -#define VK_FORMAT_FLOAT4 VK_FORMAT_R32G32B32A32_SFLOAT -#endif /* USE_VULKAN_FP16_INFERENCE */ - -#define VK_FORALL_SCALAR_TYPES(_) \ - _(uint8_t, VK_FORMAT_R8G8B8A8_UINT, Byte) \ - _(uint8_t, VK_FORMAT_R8G8B8A8_UINT, Bool) \ - _(int8_t, VK_FORMAT_R8G8B8A8_SINT, Char) \ - _(uint16_t, VK_FORMAT_R16G16B16A16_SFLOAT, Half) \ - _(uint16_t, VK_FORMAT_R16G16B16A16_UINT, UInt16) \ - _(int16_t, VK_FORMAT_R16G16B16A16_SINT, Short) \ - _(uint32_t, VK_FORMAT_R32G32B32A32_UINT, UInt) \ - _(int32_t, VK_FORMAT_R32G32B32A32_SINT, Int) \ - _(uint64_t, VK_FORMAT_R64G64B64A64_UINT, UInt64) \ - _(int64_t, VK_FORMAT_R64G64B64A64_SINT, Long) \ - _(float, VK_FORMAT_FLOAT4, Float) \ - _(double, VK_FORMAT_R64G64B64A64_SFLOAT, Double) \ - _(int8_t, VK_FORMAT_R8G8B8A8_SINT, QInt8) \ - _(uint8_t, VK_FORMAT_R8G8B8A8_UINT, QUInt8) \ - _(int32_t, VK_FORMAT_R32G32B32A32_SINT, QInt32) - -namespace vkcompute { -namespace vkapi { - -// -// Scalar Types -// - -enum class ScalarType : int8_t { -#define DEFINE_ENUM_VAL_(ctype, vkformat, name) name, - VK_FORALL_SCALAR_TYPES(DEFINE_ENUM_VAL_) -#undef DEFINE_ENUM_VAL_ - Undefined, - NumOptions -}; - -#define DEFINE_CONSTANT(ctype, vkformat, name) \ - constexpr ScalarType k##name = ScalarType::name; - -VK_FORALL_SCALAR_TYPES(DEFINE_CONSTANT) -#undef DEFINE_CONSTANT - -/* - * Given a `ScalarType`, return the corresponding `VkFormat` that should be used - * for image texture storage. The `ScalarType` to `VkFormat` mapping is dictated - * by the `VK_FORALL_SCALAR_TYPE` macro in `api/Types.h` - */ -inline VkFormat to_vkformat(const ScalarType t) { -#define CASE_VK_FORMAT(ctype, vkformat, name) \ - case ScalarType::name: \ - return vkformat; - - switch (t) { - VK_FORALL_SCALAR_TYPES(CASE_VK_FORMAT) - default: - VK_THROW("Unknown ScalarType: ", t); - } -#undef CASE_VK_FORMAT -} - -/* - * Given a `VkFormat`, return the `ScalarType` that best represents the data - * type of invidivual elements in an image texture of the `VkFormat`. Note that - * this mapping is different from the `to_vkformat()` function, since different - * `ScalarType`s may use the same `VkFormat`. 
- */ -inline ScalarType element_scalartype(const VkFormat vkformat) { - switch (vkformat) { - case VK_FORMAT_R64G64B64A64_SFLOAT: - return kDouble; - case VK_FORMAT_R32G32B32A32_SFLOAT: - return kFloat; - case VK_FORMAT_R16G16B16A16_SFLOAT: - return kHalf; - case VK_FORMAT_R8G8B8A8_SINT: - return kChar; - case VK_FORMAT_R8G8B8A8_UINT: - case VK_FORMAT_R8G8B8A8_UNORM: - return kByte; - case VK_FORMAT_R16G16B16A16_SINT: - return kShort; - case VK_FORMAT_R16G16B16A16_UINT: - return kUInt16; - case VK_FORMAT_R32G32B32A32_SINT: - return kInt; - case VK_FORMAT_R32G32B32A32_UINT: - return kUInt; - case VK_FORMAT_R64G64B64A64_SINT: - return kLong; - case VK_FORMAT_R64G64B64A64_UINT: - return kUInt64; - default: - VK_THROW("No corresponding scalar type for unknown VkFormat: ", vkformat); - } -} - -/* - * Given a ScalarType, return `sizeof(ctype)` where ctype is the C type - * corresponding to the ScalarType. The C type to ScalarType mapping is dictated - * by the VK_FORALL_SCALAR_TYPE macro in api/Types.h - */ -inline size_t element_size(const ScalarType t) { -#define CASE_ELEMENTSIZE_CASE(ctype, vkformat, name) \ - case ScalarType::name: \ - return sizeof(ctype); - - switch (t) { - VK_FORALL_SCALAR_TYPES(CASE_ELEMENTSIZE_CASE) - default: - VK_THROW("Unknown ScalarType: ", t); - } -#undef CASE_ELEMENTSIZE_CASE -} - -inline const char* to_string(const ScalarType t) { -#define CASE_TO_STRING(ctype, vkformat, name) \ - case ScalarType::name: \ - return #name; - - switch (t) { - VK_FORALL_SCALAR_TYPES(CASE_TO_STRING) - default: - return "UNKNOWN_SCALAR_TYPE"; - } -#undef CASE_TO_STRING -} - -inline std::ostream& operator<<(std::ostream& os, const ScalarType dtype) { - return os << to_string(dtype); -} - -// -// Map ScalarTypes to C++ types -// - -template -struct ScalarTypeToCType; - -#define SPECIALIZE_ScalarTypeToCType(ctype, vkformat, scalar_type) \ - template <> \ - struct ScalarTypeToCType<::vkcompute::vkapi::ScalarType::scalar_type> { \ - using type = ctype; \ - }; - -VK_FORALL_SCALAR_TYPES(SPECIALIZE_ScalarTypeToCType) - -#undef SPECIALIZE_ScalarTypeToCPPType - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/VkUtils.h b/backends/vulkan/runtime/vk_api/VkUtils.h deleted file mode 100644 index b765d417d41..00000000000 --- a/backends/vulkan/runtime/vk_api/VkUtils.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -namespace vkcompute { -namespace vkapi { - -inline VkExtent3D create_extent3d(const utils::uvec3& extents) { - return VkExtent3D{extents[0u], extents[1u], extents[2u]}; -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/memory/Allocation.cpp b/backends/vulkan/runtime/vk_api/memory/Allocation.cpp deleted file mode 100644 index fc2de39c811..00000000000 --- a/backends/vulkan/runtime/vk_api/memory/Allocation.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
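As a quick check of the macro-generated helpers above (`to_vkformat`, `element_scalartype`, `element_size`), assuming the Types.h shown here is reachable as `executorch/backends/vulkan/runtime/vk_api/Types.h` and `USE_VULKAN_FP16_INFERENCE` is not defined:

```
#include <cassert>
#include <cstdint>

#include <executorch/backends/vulkan/runtime/vk_api/Types.h>

void scalar_type_examples() {
  using namespace vkcompute::vkapi;

  // ScalarType -> texture format, as dictated by VK_FORALL_SCALAR_TYPES
  assert(to_vkformat(kFloat) == VK_FORMAT_R32G32B32A32_SFLOAT);
  assert(to_vkformat(kHalf) == VK_FORMAT_R16G16B16A16_SFLOAT);

  // Texture format -> the scalar type of a single texel channel
  assert(element_scalartype(VK_FORMAT_R32G32B32A32_SINT) == kInt);

  // Size of the C type backing each ScalarType
  assert(element_size(kInt) == sizeof(int32_t));   // 4
  assert(element_size(kDouble) == sizeof(double)); // 8
}
```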
- */ - -#include - -#define PRINT_FIELD(struct, field) #field << ": " << struct.field << std::endl - -std::ostream& operator<<(std::ostream& out, VmaTotalStatistics stats) { - VmaDetailedStatistics total_stats = stats.total; - out << "VmaTotalStatistics: " << std::endl; - out << " " << PRINT_FIELD(total_stats.statistics, blockCount); - out << " " << PRINT_FIELD(total_stats.statistics, allocationCount); - out << " " << PRINT_FIELD(total_stats.statistics, blockBytes); - out << " " << PRINT_FIELD(total_stats.statistics, allocationBytes); - return out; -} - -#undef PRINT_FIELD - -namespace vkcompute { -namespace vkapi { - -Allocation::Allocation() - : allocator(VK_NULL_HANDLE), allocation(VK_NULL_HANDLE), is_copy_(false) {} - -Allocation::Allocation( - VmaAllocator vma_allocator, - const VkMemoryRequirements& mem_props, - const VmaAllocationCreateInfo& create_info) - : allocator(vma_allocator), allocation(VK_NULL_HANDLE), is_copy_(false) { - VK_CHECK(vmaAllocateMemory( - allocator, &mem_props, &create_info, &allocation, nullptr)); -} - -Allocation::Allocation(const Allocation& other) noexcept - : allocator(other.allocator), - allocation(other.allocation), - is_copy_(true) {} - -Allocation::Allocation(Allocation&& other) noexcept - : allocator(other.allocator), - allocation(other.allocation), - is_copy_(other.is_copy_) { - other.allocation = VK_NULL_HANDLE; -} - -Allocation& Allocation::operator=(Allocation&& other) noexcept { - VmaAllocation tmp_allocation = allocation; - - allocator = other.allocator; - allocation = other.allocation; - is_copy_ = other.is_copy_; - - other.allocation = tmp_allocation; - - return *this; -} - -Allocation::~Allocation() { - // Do not destroy the VmaAllocation if this class instance is a copy of some - // other class instance, since this means that this class instance does not - // have ownership of the underlying resource. - if (allocation != VK_NULL_HANDLE && !is_copy_) { - vmaFreeMemory(allocator, allocation); - } -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/memory/Allocation.h b/backends/vulkan/runtime/vk_api/memory/Allocation.h deleted file mode 100644 index e56605e14b2..00000000000 --- a/backends/vulkan/runtime/vk_api/memory/Allocation.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include - -#include - -std::ostream& operator<<(std::ostream& out, VmaTotalStatistics stats); - -namespace vkcompute { -namespace vkapi { - -struct Allocation final { - explicit Allocation(); - - explicit Allocation( - const VmaAllocator, - const VkMemoryRequirements&, - const VmaAllocationCreateInfo&); - - protected: - /* - * The Copy constructor allows for creation of a class instance that are - * "aliases" of another class instance. The resulting class instance will not - * have ownership of the underlying VmaAllocation. - * - * This behaviour is analogous to creating a copy of a pointer, thus it is - * unsafe, as the original class instance may be destroyed before the copy. 
- * These constructors are therefore marked protected so that they may be used - * only in situations where the lifetime of the original class instance is - * guaranteed to exceed, or at least be the same as, the lifetime of the - * copied class instance. - */ - Allocation(const Allocation&) noexcept; - - public: - // To discourage creating copies, the assignment operator is still deleted. - Allocation& operator=(const Allocation&) = delete; - - Allocation(Allocation&&) noexcept; - Allocation& operator=(Allocation&&) noexcept; - - ~Allocation(); - - // The allocator object this was allocated from - VmaAllocator allocator; - // Handles to the allocated memory - VmaAllocation allocation; - - private: - // Indicates whether this class instance is a copy of another class instance, - // in which case it does not have ownership of the underlying VmaAllocation - bool is_copy_; - - public: - operator bool() const { - return (allocation != VK_NULL_HANDLE); - } - - inline bool is_copy() const { - return is_copy_; - } - - friend class VulkanBuffer; - friend class VulkanImage; -}; - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp b/backends/vulkan/runtime/vk_api/memory/Allocator.cpp deleted file mode 100644 index 7976d0ddee5..00000000000 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.cpp +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace vkcompute { -namespace vkapi { - -Allocator::Allocator( - VkInstance instance, - VkPhysicalDevice physical_device, - VkDevice device) - : instance_{}, - physical_device_(physical_device), - device_(device), - allocator_{VK_NULL_HANDLE} { - VmaVulkanFunctions vk_functions{}; - vk_functions.vkGetInstanceProcAddr = vkGetInstanceProcAddr; - vk_functions.vkGetDeviceProcAddr = vkGetDeviceProcAddr; - - const VmaAllocatorCreateInfo allocator_create_info{ - 0u, // flags - physical_device_, // physicalDevice - device_, // device - 0u, // preferredLargeHeapBlockSize - nullptr, // pAllocationCallbacks - nullptr, // pDeviceMemoryCallbacks - nullptr, // pHeapSizeLimit - &vk_functions, // pVulkanFunctions - instance, // instance - VK_API_VERSION_1_0, // vulkanApiVersion - nullptr, // pTypeExternalMemoryHandleTypes - }; - - VK_CHECK(vmaCreateAllocator(&allocator_create_info, &allocator_)); -} - -Allocator::Allocator(Allocator&& other) noexcept - : instance_(other.instance_), - physical_device_(other.physical_device_), - device_(other.device_), - allocator_(other.allocator_) { - other.allocator_ = VK_NULL_HANDLE; - other.device_ = VK_NULL_HANDLE; - other.physical_device_ = VK_NULL_HANDLE; - other.instance_ = VK_NULL_HANDLE; -} - -Allocator::~Allocator() { - if (allocator_ == VK_NULL_HANDLE) { - return; - } - vmaDestroyAllocator(allocator_); -} - -VmaAllocationCreateInfo Allocator::gpuonly_resource_create_info() { - VmaAllocationCreateInfo alloc_create_info = {}; - alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; - return alloc_create_info; -} - -Allocation Allocator::create_allocation( - const VkMemoryRequirements& memory_requirements, - const VmaAllocationCreateInfo& create_info) { - VmaAllocationCreateInfo alloc_create_info = create_info; - // Protect against using VMA_MEMORY_USAGE_AUTO_* flags when allocating 
memory - // directly, since those usage flags require that VkBufferCreateInfo and/or - // VkImageCreateInfo also be available. - switch (create_info.usage) { - // The logic for the below usage options are too complex, therefore prevent - // those from being used with direct memory allocation. - case VMA_MEMORY_USAGE_AUTO: - case VMA_MEMORY_USAGE_AUTO_PREFER_HOST: - VK_THROW( - "Only the VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE usage flag is compatible with create_allocation()"); - break; - // Most of the time, VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE will simply set the - // DEVICE_LOCAL_BIT as a preferred memory flag. Therefore the below is a - // decent approximation for VMA behaviour. - case VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE: - alloc_create_info.preferredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; - alloc_create_info.usage = VMA_MEMORY_USAGE_UNKNOWN; - break; - default: - break; - } - - return Allocation(allocator_, memory_requirements, alloc_create_info); -} - -VulkanImage Allocator::create_image( - const VkDevice device, - const VkExtent3D& extents, - const VkFormat image_format, - const VkImageType image_type, - const VkImageTiling image_tiling, - const VkImageViewType image_view_type, - const VulkanImage::SamplerProperties& sampler_props, - VkSampler sampler, - const bool allow_transfer, - const bool allocate_memory) { - VkImageUsageFlags usage = - VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT; - if (allow_transfer) { - usage |= - (VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT); - } - - VmaAllocationCreateInfo alloc_create_info = gpuonly_resource_create_info(); - - const VulkanImage::ImageProperties image_props{ - image_type, - image_format, - extents, - image_tiling, - usage, - }; - - const VulkanImage::ViewProperties view_props{ - image_view_type, - image_format, - }; - - const VkImageLayout initial_layout = VK_IMAGE_LAYOUT_UNDEFINED; - - return VulkanImage( - device, - allocator_, - alloc_create_info, - image_props, - view_props, - sampler_props, - sampler, - initial_layout, - allocate_memory); -} - -VulkanBuffer Allocator::create_staging_buffer(const VkDeviceSize size) { - const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - - VmaAllocationCreateInfo alloc_create_info = {}; - alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE; - - // Staging buffers are accessed by both the CPU and GPU, so set the - // appropriate flags to indicate that the host device will be accessing - // the data from this buffer. 
- alloc_create_info.flags |= - VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | - VMA_ALLOCATION_CREATE_MAPPED_BIT; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_HOST; - alloc_create_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; - alloc_create_info.preferredFlags = - VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT; - - return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); -} - -VulkanBuffer Allocator::create_storage_buffer( - const VkDeviceSize size, - const bool allocate_memory) { - const VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; - - VmaAllocationCreateInfo alloc_create_info = gpuonly_resource_create_info(); - return VulkanBuffer( - allocator_, size, alloc_create_info, buffer_usage, allocate_memory); -} - -VulkanBuffer Allocator::create_uniform_buffer(const VkDeviceSize size) { - VmaAllocationCreateInfo alloc_create_info = {}; - alloc_create_info.flags = DEFAULT_ALLOCATION_STRATEGY | - VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT; - alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO; - - VkBufferUsageFlags buffer_usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; - - return VulkanBuffer(allocator_, size, alloc_create_info, buffer_usage); -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/memory/Allocator.h b/backends/vulkan/runtime/vk_api/memory/Allocator.h deleted file mode 100644 index 8f76ca932b7..00000000000 --- a/backends/vulkan/runtime/vk_api/memory/Allocator.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include - -#include -#include -#include - -namespace vkcompute { -namespace vkapi { - -constexpr VmaAllocationCreateFlags DEFAULT_ALLOCATION_STRATEGY = - VMA_ALLOCATION_CREATE_STRATEGY_MIN_MEMORY_BIT; - -class Allocator final { - public: - explicit Allocator( - VkInstance instance, - VkPhysicalDevice physical_device, - VkDevice device); - - Allocator(const Allocator&) = delete; - Allocator& operator=(const Allocator&) = delete; - - Allocator(Allocator&&) noexcept; - Allocator& operator=(Allocator&&) = delete; - - ~Allocator(); - - private: - VkInstance instance_; - VkPhysicalDevice physical_device_; - VkDevice device_; - VmaAllocator allocator_; - - public: - VmaAllocationCreateInfo gpuonly_resource_create_info(); - - Allocation create_allocation( - const VkMemoryRequirements& memory_requirements, - const VmaAllocationCreateInfo& create_info); - - VulkanImage create_image( - const VkDevice, - const VkExtent3D&, - const VkFormat, - const VkImageType, - const VkImageTiling, - const VkImageViewType, - const VulkanImage::SamplerProperties&, - VkSampler, - const bool allow_transfer = false, - const bool allocate_memory = true); - - VulkanBuffer create_staging_buffer(const VkDeviceSize); - - VulkanBuffer create_storage_buffer( - const VkDeviceSize, - const bool allocate_memory = true); - - /* - * Create a uniform buffer with a specified size - */ - VulkanBuffer create_uniform_buffer(const VkDeviceSize); - - /* - * Create a uniform buffer containing the data in an arbitrary struct - */ - template - VulkanBuffer create_params_buffer(const Block& block); - - VmaTotalStatistics get_memory_statistics() const { - VmaTotalStatistics stats = {}; - vmaCalculateStatistics(allocator_, &stats); - return stats; - } -}; - -// -// Impl -// - -template -inline VulkanBuffer Allocator::create_params_buffer(const Block& block) { - VulkanBuffer uniform_buffer = create_uniform_buffer(sizeof(Block)); - - // Fill the uniform buffer with data in block - { - MemoryMap mapping(uniform_buffer, MemoryAccessType::WRITE); - Block* data_ptr = mapping.template data(); - - *data_ptr = block; - } - - return uniform_buffer; -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp b/backends/vulkan/runtime/vk_api/memory/Buffer.cpp deleted file mode 100644 index f10e40abdbb..00000000000 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.cpp +++ /dev/null @@ -1,252 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace vkcompute { -namespace vkapi { - -// -// VulkanBuffer -// - -VulkanBuffer::VulkanBuffer() - : buffer_properties_{}, - allocator_(VK_NULL_HANDLE), - memory_{}, - owns_memory_(false), - memory_bundled_(false), - is_copy_(false), - handle_(VK_NULL_HANDLE) {} - -VulkanBuffer::VulkanBuffer( - VmaAllocator vma_allocator, - const VkDeviceSize size, - const VmaAllocationCreateInfo& allocation_create_info, - const VkBufferUsageFlags usage, - const bool allocate_memory) - : buffer_properties_({size, 0u, size}), - allocator_(vma_allocator), - memory_{}, - owns_memory_(allocate_memory), - memory_bundled_(allocate_memory), - is_copy_(false), - handle_(VK_NULL_HANDLE) { - // If the buffer size is 0, allocate a buffer with a size of 1 byte. 
This is - // to ensure that there will be some resource that can be bound to a shader. - if (size == 0) { - buffer_properties_.size = 1u; - buffer_properties_.mem_range = 1u; - } - - const VkBufferCreateInfo buffer_create_info{ - VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - buffer_properties_.size, // size - usage, // usage - VK_SHARING_MODE_EXCLUSIVE, // sharingMode - 0u, // queueFamilyIndexCount - nullptr, // pQueueFamilyIndices - }; - - if (allocate_memory) { - VK_CHECK(vmaCreateBuffer( - allocator_, - &buffer_create_info, - &allocation_create_info, - &handle_, - &(memory_.allocation), - nullptr)); - } else { - VmaAllocatorInfo allocator_info{}; - vmaGetAllocatorInfo(allocator_, &allocator_info); - VK_CHECK(vkCreateBuffer( - allocator_info.device, &buffer_create_info, nullptr, &handle_)); - } -} - -VulkanBuffer::VulkanBuffer( - const VulkanBuffer& other, - const VkDeviceSize offset, - const VkDeviceSize range) noexcept - : buffer_properties_(other.buffer_properties_), - allocator_(other.allocator_), - memory_(other.memory_), - owns_memory_(false), - memory_bundled_(false), - is_copy_(true), - handle_(other.handle_) { - // TODO: set the offset and range appropriately - buffer_properties_.mem_offset = other.buffer_properties_.mem_offset + offset; - if (range != VK_WHOLE_SIZE) { - buffer_properties_.mem_range = range; - } -} - -VulkanBuffer::VulkanBuffer(VulkanBuffer&& other) noexcept - : buffer_properties_(other.buffer_properties_), - allocator_(other.allocator_), - memory_(std::move(other.memory_)), - owns_memory_(other.owns_memory_), - memory_bundled_(other.memory_bundled_), - is_copy_(other.is_copy_), - handle_(other.handle_) { - other.handle_ = VK_NULL_HANDLE; -} - -VulkanBuffer& VulkanBuffer::operator=(VulkanBuffer&& other) noexcept { - VkBuffer tmp_buffer = handle_; - bool tmp_owns_memory = owns_memory_; - bool tmp_memory_bundled = memory_bundled_; - - buffer_properties_ = other.buffer_properties_; - allocator_ = other.allocator_; - memory_ = std::move(other.memory_); - owns_memory_ = other.owns_memory_; - memory_bundled_ = other.memory_bundled_; - is_copy_ = other.is_copy_; - handle_ = other.handle_; - - other.handle_ = tmp_buffer; - other.owns_memory_ = tmp_owns_memory; - other.memory_bundled_ = tmp_memory_bundled; - - return *this; -} - -VulkanBuffer::~VulkanBuffer() { - // Do not destroy the VkBuffer if this class instance is a copy of another - // class instance, since this means that this class instance does not have - // ownership of the underlying resource. 
- if (handle_ != VK_NULL_HANDLE && !is_copy_) { - if (owns_memory_) { - if (memory_bundled_) { - vmaDestroyBuffer(allocator_, handle_, memory_.allocation); - // Prevent the underlying memory allocation from being freed; it was - // freed by vmaDestroyImage - memory_.allocation = VK_NULL_HANDLE; - } else { - vkDestroyBuffer(this->device(), handle_, nullptr); - // Allow underlying memory allocation to be freed by the destructor of - // Allocation class - } - } else { - vkDestroyBuffer(this->device(), handle_, nullptr); - // Prevent the underlying memory allocation from being freed since this - // object doesn't own it - memory_.allocation = VK_NULL_HANDLE; - } - } -} - -VmaAllocationInfo VulkanBuffer::allocation_info() const { - VmaAllocationInfo info; - vmaGetAllocationInfo(allocator_, memory_.allocation, &info); - return info; -} - -void VulkanBuffer::bind_allocation_impl(const Allocation& memory) { - VK_CHECK_COND(!memory_, "Cannot bind an already bound allocation!"); - if (!is_copy_) { - VK_CHECK(vmaBindBufferMemory(allocator_, memory.allocation, handle_)); - } -} - -void VulkanBuffer::bind_allocation(const Allocation& memory) { - bind_allocation_impl(memory); - memory_.allocation = memory.allocation; -} - -void VulkanBuffer::acquire_allocation(Allocation&& memory) { - bind_allocation_impl(memory); - memory_ = std::move(memory); - owns_memory_ = true; -} - -VkMemoryRequirements VulkanBuffer::get_memory_requirements() const { - VkMemoryRequirements memory_requirements; - vkGetBufferMemoryRequirements(this->device(), handle_, &memory_requirements); - return memory_requirements; -} - -// -// MemoryMap -// - -MemoryMap::MemoryMap(const VulkanBuffer& buffer, const uint8_t access) - : access_(access), - allocator_(buffer.vma_allocator()), - allocation_(buffer.allocation()), - data_(nullptr), - data_len_{buffer.mem_size()} { - if (allocation_) { - VK_CHECK(vmaMapMemory(allocator_, allocation_, &data_)); - } -} - -MemoryMap::MemoryMap(MemoryMap&& other) noexcept - : access_(other.access_), - allocator_(other.allocator_), - allocation_(other.allocation_), - data_(other.data_), - data_len_{other.data_len_} { - other.allocation_ = VK_NULL_HANDLE; - other.data_ = nullptr; -} - -MemoryMap::~MemoryMap() { - if (!data_) { - return; - } - - if (allocation_) { - if (access_ & MemoryAccessType::WRITE) { - // Call will be ignored by implementation if the memory type this - // allocation belongs to is not HOST_VISIBLE or is HOST_COHERENT, which is - // the behavior we want. Don't check the result here as the destructor - // cannot throw. - vmaFlushAllocation(allocator_, allocation_, 0u, VK_WHOLE_SIZE); - } - - vmaUnmapMemory(allocator_, allocation_); - } -} - -void MemoryMap::invalidate() { - if (access_ & MemoryAccessType::READ && allocation_) { - // Call will be ignored by implementation if the memory type this allocation - // belongs to is not HOST_VISIBLE or is HOST_COHERENT, which is the behavior - // we want. 
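The `bind_allocation`/`acquire_allocation` pair above lets a buffer be created first and receive separately allocated memory later. A minimal sketch of that flow, assuming a `vkapi::Allocator` named `allocator` and a byte count `nbytes` (names are illustrative, not taken from the deleted sources):

```cpp
// Create the VkBuffer without any backing memory.
vkapi::VulkanBuffer buffer =
    allocator.create_storage_buffer(nbytes, /*allocate_memory=*/false);

// Size, alignment, and memory-type requirements come from the buffer itself.
const VkMemoryRequirements reqs = buffer.get_memory_requirements();

// Allocate device-local memory separately, then hand ownership to the buffer;
// acquire_allocation() binds the memory via vmaBindBufferMemory internally.
vkapi::Allocation memory = allocator.create_allocation(
    reqs, allocator.gpuonly_resource_create_info());
buffer.acquire_allocation(std::move(memory));
```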
- VK_CHECK( - vmaInvalidateAllocation(allocator_, allocation_, 0u, VK_WHOLE_SIZE)); - } -} - -// -// BufferMemoryBarrier -// - -BufferMemoryBarrier::BufferMemoryBarrier( - const VkAccessFlags src_access_flags, - const VkAccessFlags dst_access_flags, - const VulkanBuffer& buffer) - : handle{ - VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, // sType - nullptr, // pNext - src_access_flags, // srcAccessMask - dst_access_flags, // dstAccessMask - VK_QUEUE_FAMILY_IGNORED, // srcQueueFamilyIndex - VK_QUEUE_FAMILY_IGNORED, // dstQueueFamilyIndex - buffer.handle_, // buffer - buffer.buffer_properties_.mem_offset, // offset - buffer.buffer_properties_.mem_range, // size - } {} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.h b/backends/vulkan/runtime/vk_api/memory/Buffer.h deleted file mode 100644 index 582b537465d..00000000000 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.h +++ /dev/null @@ -1,234 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include - -#include - -namespace vkcompute { - -// Forward declare vTensor classes such that they can be set as friend classes -namespace api { -class vTensorStorage; -} // namespace api - -namespace vkapi { - -using MemoryAccessFlags = uint8_t; - -enum MemoryAccessType : MemoryAccessFlags { - NONE = 0u << 0u, - READ = 1u << 0u, - WRITE = 1u << 1u, -}; - -static constexpr MemoryAccessFlags kReadWrite = - MemoryAccessType::WRITE | MemoryAccessType::READ; - -static constexpr MemoryAccessFlags kRead = MemoryAccessType::READ; - -static constexpr MemoryAccessFlags kWrite = MemoryAccessType::WRITE; - -class VulkanBuffer final { - public: - struct BufferProperties final { - VkDeviceSize size; - VkDeviceSize mem_offset; - VkDeviceSize mem_range; - }; - - explicit VulkanBuffer(); - - explicit VulkanBuffer( - const VmaAllocator, - const VkDeviceSize, - const VmaAllocationCreateInfo&, - const VkBufferUsageFlags, - const bool allocate_memory = true); - - protected: - /* - * The Copy constructor and allows for creation of a class instance that are - * "aliases" of another class instance. The resulting class instance will not - * have ownership of the underlying VkBuffer. - * - * This behaviour is analogous to creating a copy of a pointer, thus it is - * unsafe, as the original class instance may be destroyed before the copy. - * These constructors are therefore marked protected so that they may be used - * only in situations where the lifetime of the original class instance is - * guaranteed to exceed, or at least be the same as, the lifetime of the - * copied class instance. - */ - VulkanBuffer( - const VulkanBuffer& other, - const VkDeviceSize offset = 0u, - const VkDeviceSize range = VK_WHOLE_SIZE) noexcept; - - public: - // To discourage creating copies, the assignment operator is still deleted. 
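The `BufferMemoryBarrier` defined just above only fills in a `VkBufferMemoryBarrier` struct; recording it is left to the caller. A hedged sketch of how such a barrier might be recorded between two compute dispatches (the command buffer and the pipeline stage choices are assumptions, not taken from the deleted code):

```cpp
// Guard a shader write to `buffer` before a subsequent shader read.
vkapi::BufferMemoryBarrier barrier(
    VK_ACCESS_SHADER_WRITE_BIT, // srcAccessMask
    VK_ACCESS_SHADER_READ_BIT,  // dstAccessMask
    buffer);

vkCmdPipelineBarrier(
    cmd_buffer,
    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // srcStageMask
    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, // dstStageMask
    0u,                                   // dependencyFlags
    0u, nullptr,                          // memory barriers
    1u, &barrier.handle,                  // buffer memory barriers
    0u, nullptr);                         // image memory barriers
```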
- VulkanBuffer& operator=(const VulkanBuffer& other) = delete; - - VulkanBuffer(VulkanBuffer&&) noexcept; - VulkanBuffer& operator=(VulkanBuffer&&) noexcept; - - ~VulkanBuffer(); - - struct Package final { - VkBuffer handle; - VkDeviceSize buffer_offset; - VkDeviceSize buffer_range; - }; - - friend struct BufferMemoryBarrier; - - private: - BufferProperties buffer_properties_; - VmaAllocator allocator_; - Allocation memory_; - // Indicates whether the underlying memory is owned by this resource - bool owns_memory_; - // Indicates whether the allocation for the buffer was created with the buffer - // via vmaCreateBuffer; if this is false, the memory is owned but was bound - // separately via vmaBindBufferMemory - bool memory_bundled_; - // Indicates whether this VulkanBuffer was copied from another VulkanBuffer, - // thus it does not have ownership of the underlying VKBuffer - bool is_copy_; - VkBuffer handle_; - - public: - inline VkDevice device() const { - VmaAllocatorInfo allocator_info{}; - vmaGetAllocatorInfo(allocator_, &allocator_info); - return allocator_info.device; - } - - inline VmaAllocator vma_allocator() const { - return allocator_; - } - - inline VmaAllocation allocation() const { - return memory_.allocation; - } - - VmaAllocationInfo allocation_info() const; - - inline VkBuffer handle() const { - return handle_; - } - - inline VkDeviceSize mem_offset() const { - return buffer_properties_.mem_offset; - } - - inline VkDeviceSize mem_range() const { - return buffer_properties_.mem_range; - } - - inline VkDeviceSize mem_size() const { - return buffer_properties_.size; - } - - inline size_t mem_size_as_size_t() const { - return utils::safe_downcast(mem_size()); - } - - inline bool has_memory() const { - return (memory_.allocation != VK_NULL_HANDLE); - } - - inline bool owns_memory() const { - return owns_memory_; - } - - inline bool is_copy() const { - return is_copy_; - } - - operator bool() const { - return (handle_ != VK_NULL_HANDLE); - } - - inline bool is_copy_of(const VulkanBuffer& other) const { - return (handle_ == other.handle_) && is_copy_; - } - - private: - void bind_allocation_impl(const Allocation& memory); - - public: - /* - * Given a memory allocation, bind it to the underlying VkImage. The lifetime - * of the memory allocation is assumed to be managed externally. - */ - void bind_allocation(const Allocation& memory); - - /* - * Given a rvalue memory allocation, bind it to the underlying VkImage and - * also acquire ownership of the memory allocation. 
- */ - void acquire_allocation(Allocation&& memory); - - VkMemoryRequirements get_memory_requirements() const; - - friend class api::vTensorStorage; -}; - -class MemoryMap final { - public: - explicit MemoryMap( - const VulkanBuffer& buffer, - const MemoryAccessFlags access); - - MemoryMap(const MemoryMap&) = delete; - MemoryMap& operator=(const MemoryMap&) = delete; - - MemoryMap(MemoryMap&&) noexcept; - MemoryMap& operator=(MemoryMap&&) = delete; - - ~MemoryMap(); - - private: - uint8_t access_; - VmaAllocator allocator_; - VmaAllocation allocation_; - void* data_; - VkDeviceSize data_len_; - - public: - template - T* data(const uint32_t offset = 0) { - return reinterpret_cast(static_cast(data_) + offset); - } - - inline size_t nbytes() { - return utils::safe_downcast(data_len_); - } - - void invalidate(); -}; - -struct BufferMemoryBarrier final { - VkBufferMemoryBarrier handle; - - BufferMemoryBarrier( - const VkAccessFlags src_access_flags, - const VkAccessFlags dst_access_flags, - const VulkanBuffer& buffer); -}; - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/memory/Image.cpp b/backends/vulkan/runtime/vk_api/memory/Image.cpp deleted file mode 100644 index cadeb779c83..00000000000 --- a/backends/vulkan/runtime/vk_api/memory/Image.cpp +++ /dev/null @@ -1,433 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -namespace vkcompute { -namespace vkapi { - -// -// ImageSampler -// - -bool operator==( - const ImageSampler::Properties& _1, - const ImageSampler::Properties& _2) { - return ( - _1.filter == _2.filter && _1.mipmap_mode == _2.mipmap_mode && - _1.address_mode == _2.address_mode && _1.border_color == _2.border_color); -} - -ImageSampler::ImageSampler( - VkDevice device, - const ImageSampler::Properties& props) - : device_(device), handle_(VK_NULL_HANDLE) { - const VkSamplerCreateInfo sampler_create_info{ - VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - props.filter, // magFilter - props.filter, // minFilter - props.mipmap_mode, // mipmapMode - props.address_mode, // addressModeU - props.address_mode, // addressModeV - props.address_mode, // addressModeW - 0.0f, // mipLodBias - VK_FALSE, // anisotropyEnable - 1.0f, // maxAnisotropy, - VK_FALSE, // compareEnable - VK_COMPARE_OP_NEVER, // compareOp - 0.0f, // minLod - VK_LOD_CLAMP_NONE, // maxLod - props.border_color, // borderColor - VK_FALSE, // unnormalizedCoordinates - }; - - VK_CHECK(vkCreateSampler(device_, &sampler_create_info, nullptr, &handle_)); -} - -ImageSampler::ImageSampler(ImageSampler&& other) noexcept - : device_(other.device_), handle_(other.handle_) { - other.handle_ = VK_NULL_HANDLE; -} - -ImageSampler::~ImageSampler() { - if (handle_ == VK_NULL_HANDLE) { - return; - } - vkDestroySampler(device_, handle_, nullptr); -} - -size_t ImageSampler::Hasher::operator()( - const ImageSampler::Properties& props) const { - size_t seed = 0; - seed = utils::hash_combine(seed, std::hash()(props.filter)); - seed = utils::hash_combine( - seed, std::hash()(props.mipmap_mode)); - seed = utils::hash_combine( - seed, std::hash()(props.address_mode)); - seed = - utils::hash_combine(seed, std::hash()(props.border_color)); - return seed; -} - -void swap(ImageSampler& lhs, ImageSampler& rhs) noexcept { - VkDevice tmp_device = lhs.device_; - VkSampler tmp_handle = 
lhs.handle_; - - lhs.device_ = rhs.device_; - lhs.handle_ = rhs.handle_; - - rhs.device_ = tmp_device; - rhs.handle_ = tmp_handle; -} - -// -// VulkanImage -// - -VulkanImage::VulkanImage() - : device_{VK_NULL_HANDLE}, - image_properties_{}, - view_properties_{}, - sampler_properties_{}, - allocator_(VK_NULL_HANDLE), - memory_{}, - owns_memory_(false), - memory_bundled_(false), - owns_view_(false), - is_copy_(false), - handles_{ - VK_NULL_HANDLE, - VK_NULL_HANDLE, - VK_NULL_HANDLE, - }, - layout_{} {} - -VulkanImage::VulkanImage( - VkDevice device, - VmaAllocator vma_allocator, - const VmaAllocationCreateInfo& allocation_create_info, - const ImageProperties& image_props, - const ViewProperties& view_props, - const SamplerProperties& sampler_props, - VkSampler sampler, - const VkImageLayout layout, - const bool allocate_memory) - : device_{device}, - image_properties_(image_props), - view_properties_(view_props), - sampler_properties_(sampler_props), - allocator_(vma_allocator), - memory_{}, - owns_memory_{allocate_memory}, - memory_bundled_(allocate_memory), - owns_view_(false), - is_copy_(false), - handles_{ - VK_NULL_HANDLE, - VK_NULL_HANDLE, - sampler, - }, - layout_(layout) { - VmaAllocatorInfo allocator_info{}; - vmaGetAllocatorInfo(allocator_, &allocator_info); - - // If any dims are zero, then allocate a 1x1x1 image texture. This is to - // ensure that there will be some resource that can be bound to a shader. - if (image_props.image_extents.width == 0 || - image_props.image_extents.height == 0 || - image_props.image_extents.depth == 0) { - image_properties_.image_extents.width = 1u; - image_properties_.image_extents.height = 1u; - image_properties_.image_extents.depth = 1u; - } - - const VkImageCreateInfo image_create_info{ - VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - image_properties_.image_type, // imageType - image_properties_.image_format, // format - image_properties_.image_extents, // extents - 1u, // mipLevels - 1u, // arrayLayers - VK_SAMPLE_COUNT_1_BIT, // samples - image_properties_.image_tiling, // tiling - image_properties_.image_usage, // usage - VK_SHARING_MODE_EXCLUSIVE, // sharingMode - 0u, // queueFamilyIndexCount - nullptr, // pQueueFamilyIndices - layout_, // initialLayout - }; - - if (allocate_memory) { - VK_CHECK(vmaCreateImage( - allocator_, - &image_create_info, - &allocation_create_info, - &(handles_.image), - &(memory_.allocation), - nullptr)); - // Only create the image view if the image has been bound to memory - owns_view_ = true; - create_image_view(); - } else { - VK_CHECK(vkCreateImage( - allocator_info.device, &image_create_info, nullptr, &(handles_.image))); - } -} - -VulkanImage::VulkanImage( - VkDevice device, - const ImageProperties& image_props, - VkImage image, - VkImageView image_view, - VkSampler sampler, - const VkImageLayout layout) - : device_{device}, - image_properties_{image_props}, - view_properties_{}, - sampler_properties_{}, - allocator_(VK_NULL_HANDLE), - memory_{}, - owns_memory_(false), - memory_bundled_(false), - is_copy_(false), - handles_{ - image, - image_view, - sampler, - }, - layout_{layout} {} - -VulkanImage::VulkanImage(const VulkanImage& other) noexcept - : device_(other.device_), - image_properties_(other.image_properties_), - view_properties_(other.view_properties_), - sampler_properties_(other.sampler_properties_), - allocator_(other.allocator_), - memory_(other.memory_), - owns_memory_{false}, - owns_view_{false}, - is_copy_(true), - handles_(other.handles_), - 
layout_(other.layout_) {} - -VulkanImage::VulkanImage(VulkanImage&& other) noexcept - : device_(other.device_), - image_properties_(other.image_properties_), - view_properties_(other.view_properties_), - sampler_properties_(other.sampler_properties_), - allocator_(other.allocator_), - memory_(std::move(other.memory_)), - owns_memory_(other.owns_memory_), - memory_bundled_(other.memory_bundled_), - owns_view_(other.owns_view_), - is_copy_(other.is_copy_), - handles_(other.handles_), - layout_(other.layout_) { - other.handles_.image = VK_NULL_HANDLE; - other.handles_.image_view = VK_NULL_HANDLE; - other.handles_.sampler = VK_NULL_HANDLE; - other.owns_memory_ = false; - other.memory_bundled_ = false; -} - -VulkanImage& VulkanImage::operator=(VulkanImage&& other) noexcept { - VkImage tmp_image = handles_.image; - VkImageView tmp_image_view = handles_.image_view; - bool tmp_owns_memory = owns_memory_; - bool tmp_memory_bundled = memory_bundled_; - - device_ = other.device_; - image_properties_ = other.image_properties_; - view_properties_ = other.view_properties_; - sampler_properties_ = other.sampler_properties_; - allocator_ = other.allocator_; - memory_ = std::move(other.memory_); - owns_memory_ = other.owns_memory_; - memory_bundled_ = other.memory_bundled_; - is_copy_ = other.is_copy_; - handles_ = other.handles_; - layout_ = other.layout_; - - other.handles_.image = tmp_image; - other.handles_.image_view = tmp_image_view; - other.owns_memory_ = tmp_owns_memory; - other.memory_bundled_ = tmp_memory_bundled; - - return *this; -} - -VulkanImage::~VulkanImage() { - if (owns_view_ && handles_.image_view != VK_NULL_HANDLE) { - vkDestroyImageView(this->device(), handles_.image_view, nullptr); - } - - // Do not destroy any resources if this class instance is a copy of another - // class instance, since this means that this class instance does not have - // ownership of the underlying resource. 
- if (is_copy_) { - return; - } - - if (handles_.image != VK_NULL_HANDLE) { - if (owns_memory_) { - if (memory_bundled_) { - vmaDestroyImage(allocator_, handles_.image, memory_.allocation); - // Prevent the underlying memory allocation from being freed; it was - // freed by vmaDestroyImage - memory_.allocation = VK_NULL_HANDLE; - } else { - vkDestroyImage(this->device(), handles_.image, nullptr); - // Allow underlying memory allocation to be freed by the destructor of - // Allocation class - } - } else { - vkDestroyImage(this->device(), handles_.image, nullptr); - // Prevent the underlying memory allocation from being freed since this - // object doesn't own it - memory_.allocation = VK_NULL_HANDLE; - } - } -} - -void VulkanImage::create_image_view() { - VmaAllocatorInfo allocator_info{}; - vmaGetAllocatorInfo(allocator_, &allocator_info); - - const VkComponentMapping component_mapping{ - VK_COMPONENT_SWIZZLE_IDENTITY, // r - VK_COMPONENT_SWIZZLE_IDENTITY, // g - VK_COMPONENT_SWIZZLE_IDENTITY, // b - VK_COMPONENT_SWIZZLE_IDENTITY, // a - }; - - const VkImageSubresourceRange subresource_range{ - VK_IMAGE_ASPECT_COLOR_BIT, // aspectMask - 0u, // baseMipLevel - VK_REMAINING_MIP_LEVELS, // levelCount - 0u, // baseArrayLayer - VK_REMAINING_ARRAY_LAYERS, // layerCount - }; - - const VkImageViewCreateInfo image_view_create_info{ - VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - handles_.image, // image - view_properties_.view_type, // viewType - view_properties_.view_format, // format - component_mapping, // components - subresource_range, // subresourceRange - }; - - VK_CHECK(vkCreateImageView( - allocator_info.device, - &(image_view_create_info), - nullptr, - &(handles_.image_view))); -} - -void VulkanImage::bind_allocation_impl(const Allocation& memory) { - VK_CHECK_COND(!memory_, "Cannot bind an already bound allocation!"); - // To prevent multiple instances of binding the same VkImage to a memory - // block, do not actually bind memory if this VulkanImage is a copy. Assume - // that the original VulkanImage is responsible for binding the image. 
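As with `VulkanBuffer`, a `VulkanImage` can be created without backing memory and have an allocation attached afterwards; the image view is only created once memory is bound. A sketch under the same assumptions as the buffer example earlier (an `Allocator` instance named `allocator`, and an `image` created with `allocate_memory = false`):

```cpp
// The VkImage exists, but has no memory and therefore no image view yet.
const VkMemoryRequirements reqs = image.get_memory_requirements();

vkapi::Allocation memory = allocator.create_allocation(
    reqs, allocator.gpuonly_resource_create_info());

// acquire_allocation() binds the memory (vmaBindImageMemory) and then
// creates the VkImageView, making the image usable by shaders.
image.acquire_allocation(std::move(memory));
```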
- if (!is_copy_) { - VK_CHECK(vmaBindImageMemory(allocator_, memory.allocation, handles_.image)); - } - - // Only create the image view if the image has been bound to memory - owns_view_ = true; - create_image_view(); -} - -void VulkanImage::bind_allocation(const Allocation& memory) { - bind_allocation_impl(memory); - memory_.allocation = memory.allocation; -} - -void VulkanImage::acquire_allocation(Allocation&& memory) { - bind_allocation_impl(memory); - memory_ = std::move(memory); - owns_memory_ = true; -} - -VkMemoryRequirements VulkanImage::get_memory_requirements() const { - VkMemoryRequirements memory_requirements; - vkGetImageMemoryRequirements( - this->device(), handles_.image, &memory_requirements); - return memory_requirements; -} - -// -// ImageMemoryBarrier -// - -ImageMemoryBarrier::ImageMemoryBarrier( - const VkAccessFlags src_access_flags, - const VkAccessFlags dst_access_flags, - const VkImageLayout src_layout_flags, - const VkImageLayout dst_layout_flags, - const VulkanImage& image) - : handle{ - VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, // sType - nullptr, // pNext - src_access_flags, // srcAccessMask - dst_access_flags, // dstAccessMask - src_layout_flags, // oldLayout - dst_layout_flags, // newLayout - VK_QUEUE_FAMILY_IGNORED, // srcQueueFamilyIndex - VK_QUEUE_FAMILY_IGNORED, // dstQueueFamilyIndex - image.handles_.image, // image - { - // subresourceRange - VK_IMAGE_ASPECT_COLOR_BIT, // aspectMask - 0u, // baseMipLevel - VK_REMAINING_MIP_LEVELS, // levelCount - 0u, // baseArrayLayer - VK_REMAINING_ARRAY_LAYERS, // layerCount - }, - } {} - -// -// SamplerCache -// - -SamplerCache::SamplerCache(VkDevice device) - : cache_mutex_{}, device_(device), cache_{} {} - -SamplerCache::SamplerCache(SamplerCache&& other) noexcept - : cache_mutex_{}, device_(other.device_), cache_(std::move(other.cache_)) { - std::lock_guard lock(other.cache_mutex_); -} - -SamplerCache::~SamplerCache() { - purge(); -} - -VkSampler SamplerCache::retrieve(const SamplerCache::Key& key) { - std::lock_guard lock(cache_mutex_); - - auto it = cache_.find(key); - if (cache_.cend() == it) { - it = cache_.insert({key, SamplerCache::Value(device_, key)}).first; - } - - return it->second.handle(); -} - -void SamplerCache::purge() { - std::lock_guard lock(cache_mutex_); - cache_.clear(); -} - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/memory/Image.h b/backends/vulkan/runtime/vk_api/memory/Image.h deleted file mode 100644 index db632c34378..00000000000 --- a/backends/vulkan/runtime/vk_api/memory/Image.h +++ /dev/null @@ -1,311 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
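The `ImageMemoryBarrier` above pairs access masks with an image layout transition, but recording the barrier and updating the tracked layout are the caller's responsibility. A hedged sketch (the command buffer, stages, and layouts here are illustrative assumptions):

```cpp
// Transition `image` from its current layout to GENERAL before a compute
// shader writes to it.
vkapi::ImageMemoryBarrier barrier(
    0u,                          // srcAccessMask (no prior access)
    VK_ACCESS_SHADER_WRITE_BIT,  // dstAccessMask
    image.layout(),              // oldLayout
    VK_IMAGE_LAYOUT_GENERAL,     // newLayout
    image);

vkCmdPipelineBarrier(
    cmd_buffer,
    VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
    0u,
    0u, nullptr,   // memory barriers
    0u, nullptr,   // buffer memory barriers
    1u, &barrier.handle);

// Keep the wrapper's view of the layout in sync with the transition.
image.set_layout(VK_IMAGE_LAYOUT_GENERAL);
```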
- */ - -#pragma once - -// @lint-ignore-every CLANGTIDY facebook-hte-BadMemberName - -#include - -#include - -#include - -#include - -#include -#include - -namespace vkcompute { - -// Forward declare vTensor classes such that they can be set as friend classes -namespace api { -class vTensorStorage; -} // namespace api - -namespace vkapi { - -class ImageSampler final { - public: - struct Properties final { - VkFilter filter; - VkSamplerMipmapMode mipmap_mode; - VkSamplerAddressMode address_mode; - VkBorderColor border_color; - }; - - explicit ImageSampler(VkDevice, const Properties&); - - ImageSampler(const ImageSampler&) = delete; - ImageSampler& operator=(const ImageSampler&) = delete; - - ImageSampler(ImageSampler&&) noexcept; - ImageSampler& operator=(ImageSampler&&) = delete; - - ~ImageSampler(); - - private: - VkDevice device_; - VkSampler handle_; - - public: - VkSampler handle() const { - return handle_; - } - - struct Hasher { - size_t operator()(const Properties&) const; - }; - - // We need to define a custom swap function since this class - // does not allow for move assignment. The swap function will - // be used in the hash map. - friend void swap(ImageSampler& lhs, ImageSampler& rhs) noexcept; -}; - -class VulkanImage final { - public: - struct ImageProperties final { - VkImageType image_type; - VkFormat image_format; - VkExtent3D image_extents; - VkImageTiling image_tiling; - VkImageUsageFlags image_usage; - }; - - struct ViewProperties final { - VkImageViewType view_type; - VkFormat view_format; - }; - - using SamplerProperties = ImageSampler::Properties; - - struct Handles final { - VkImage image; - VkImageView image_view; - VkSampler sampler; - }; - - explicit VulkanImage(); - - explicit VulkanImage( - VkDevice, - const VmaAllocator, - const VmaAllocationCreateInfo&, - const ImageProperties&, - const ViewProperties&, - const SamplerProperties&, - VkSampler, - const VkImageLayout, - const bool allocate_memory = true); - - explicit VulkanImage( - VkDevice, - const ImageProperties&, - VkImage, - VkImageView, - VkSampler, - const VkImageLayout); - - protected: - /* - * The Copy constructor allows for creation of a class instance that are - * "aliases" of another class instance. The resulting class instance will not - * have ownership of the underlying VkImage. - * - * This behaviour is analogous to creating a copy of a pointer, thus it is - * unsafe, as the original class instance may be destroyed before the copy. - * These constructors are therefore marked protected so that they may be used - * only in situations where the lifetime of the original class instance is - * guaranteed to exceed, or at least be the same as, the lifetime of the - * copied class instance. - */ - VulkanImage(const VulkanImage& other) noexcept; - - public: - // To discourage creating copies, the assignment operator is still deleted. 
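`ImageSampler::Properties` together with its `Hasher` is what lets samplers be deduplicated by the `SamplerCache` implemented in `Image.cpp` above. A small usage sketch, assuming a valid `VkDevice` handle; the filter and address-mode choices below are arbitrary examples, not defaults from the deleted code:

```cpp
vkapi::SamplerCache sampler_cache(device);

const vkapi::ImageSampler::Properties props{
    VK_FILTER_NEAREST,                        // filter
    VK_SAMPLER_MIPMAP_MODE_NEAREST,           // mipmap_mode
    VK_SAMPLER_ADDRESS_MODE_REPEAT,           // address_mode
    VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK,  // border_color
};

// Creates the VkSampler on first use; later calls with equal Properties
// return the cached handle.
VkSampler sampler = sampler_cache.retrieve(props);
```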
- VulkanImage& operator=(const VulkanImage&) = delete; - - VulkanImage(VulkanImage&&) noexcept; - VulkanImage& operator=(VulkanImage&&) noexcept; - - ~VulkanImage(); - - struct Package final { - VkImage handle; - VkImageLayout image_layout; - VkImageView image_view; - VkSampler image_sampler; - }; - - friend struct ImageMemoryBarrier; - - private: - VkDevice device_; - ImageProperties image_properties_; - ViewProperties view_properties_; - SamplerProperties sampler_properties_; - // The allocator object this was allocated from - VmaAllocator allocator_; - // Handles to the allocated memory - Allocation memory_; - // Indicates whether the underlying memory is owned by this resource - bool owns_memory_; - // Indicates whether the allocation for the image was created with the image - // via vmaCreateImage; if this is false, the memory is owned but was bound - // separately via vmaBindImageMemory - bool memory_bundled_; - // In some cases, a VulkanImage may be a copy of another VulkanImage but still - // own a unique view of the VkImage. - bool owns_view_; - // Indicates whether this VulkanImage was copied from another VulkanImage, - // thus it does not have ownership of the underlying VKBuffer - bool is_copy_; - Handles handles_; - // Layout - VkImageLayout layout_; - - public: - void create_image_view(); - - inline VkDevice device() const { - return device_; - } - - inline VmaAllocator vma_allocator() const { - return allocator_; - } - - inline VmaAllocation allocation() const { - return memory_.allocation; - } - - inline VkImageType type() const { - return image_properties_.image_type; - } - - inline VkFormat format() const { - return image_properties_.image_format; - } - - inline VkExtent3D extents() const { - return image_properties_.image_extents; - } - - inline VkImage handle() const { - return handles_.image; - } - - inline VkImageView image_view() const { - return handles_.image_view; - } - - inline VkSampler sampler() const { - return handles_.sampler; - } - - Package package() const { - return { - handles_.image, - layout_, - handles_.image_view, - handles_.sampler, - }; - } - - inline VkImageLayout layout() const { - return layout_; - } - - inline void set_layout(const VkImageLayout layout) { - layout_ = layout; - } - - inline bool has_memory() const { - return (memory_.allocation != VK_NULL_HANDLE); - } - - inline bool owns_memory() const { - return owns_memory_; - } - - inline bool is_copy() const { - return is_copy_; - } - - inline operator bool() const { - return (handles_.image != VK_NULL_HANDLE); - } - - inline bool is_copy_of(const VulkanImage& other) const { - return (handles_.image == other.handles_.image) && is_copy_; - } - - private: - void bind_allocation_impl(const Allocation& memory); - - public: - /* - * Given a memory allocation, bind it to the underlying VkImage. The lifetime - * of the memory allocation is assumed to be managed externally. - */ - void bind_allocation(const Allocation& memory); - - /* - * Given a rvalue memory allocation, bind it to the underlying VkImage and - * also acquire ownership of the memory allocation. 
- */ - void acquire_allocation(Allocation&& memory); - - VkMemoryRequirements get_memory_requirements() const; - - friend class api::vTensorStorage; -}; - -struct ImageMemoryBarrier final { - VkImageMemoryBarrier handle; - - ImageMemoryBarrier( - const VkAccessFlags src_access_flags, - const VkAccessFlags dst_access_flags, - const VkImageLayout src_layout_flags, - const VkImageLayout dst_layout_flags, - const VulkanImage& image); -}; - -class SamplerCache final { - public: - explicit SamplerCache(VkDevice device); - - SamplerCache(const SamplerCache&) = delete; - SamplerCache& operator=(const SamplerCache&) = delete; - - SamplerCache(SamplerCache&&) noexcept; - SamplerCache& operator=(SamplerCache&&) = delete; - - ~SamplerCache(); - - using Key = ImageSampler::Properties; - using Value = ImageSampler; - using Hasher = ImageSampler::Hasher; - - private: - // Multiple threads could potentially be adding entries into the cache, so use - // a mutex to manage access - std::mutex cache_mutex_; - - VkDevice device_; - std::unordered_map cache_; - - public: - VkSampler retrieve(const Key&); - void purge(); -}; - -} // namespace vkapi -} // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/memory/vma_api.cpp b/backends/vulkan/runtime/vk_api/memory/vma_api.cpp deleted file mode 100644 index c5a1b588f19..00000000000 --- a/backends/vulkan/runtime/vk_api/memory/vma_api.cpp +++ /dev/null @@ -1,10 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#define VMA_IMPLEMENTATION -#include diff --git a/backends/vulkan/runtime/vk_api/memory/vma_api.h b/backends/vulkan/runtime/vk_api/memory/vma_api.h deleted file mode 100644 index 16205a3b619..00000000000 --- a/backends/vulkan/runtime/vk_api/memory/vma_api.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -// -// Do NOT include vk_mem_alloc.h directly. -// Always include this file (vma_api.h) instead. -// - -#define VMA_VULKAN_VERSION 1000000 - -#ifdef USE_VULKAN_WRAPPER -#define VMA_STATIC_VULKAN_FUNCTIONS 0 -#else -#define VMA_DYNAMIC_VULKAN_FUNCTIONS 0 -#endif /* USE_VULKAN_WRAPPER */ - -#define VMA_DEFAULT_LARGE_HEAP_BLOCK_SIZE (4ull * 1024 * 1024) -#define VMA_SMALL_HEAP_MAX_SIZE (256ull * 1024 * 1024) - -#define VMA_STATS_STRING_ENABLED 0 - -#ifdef VULKAN_DEBUG -#define VMA_DEBUG_ALIGNMENT 4096 -#define VMA_DEBUG_ALWAYS_DEDICATED_MEMORY 0 -#define VMA_DEBUG_DETECT_CORRUPTION 1 -#define VMA_DEBUG_GLOBAL_MUTEX 1 -#define VMA_DEBUG_INITIALIZE_ALLOCATIONS 1 -#define VMA_DEBUG_MARGIN 64 -#define VMA_DEBUG_MIN_BUFFER_IMAGE_GRANULARITY 256 -#define VMA_RECORDING_ENABLED 1 - -#define VMA_DEBUG_LOG(format, ...) -/* -#define VMA_DEBUG_LOG(format, ...) 
do { \ - printf(format, __VA_ARGS__); \ - printf("\n"); \ -} while(false) -*/ -#endif /* VULKAN_DEBUG */ - -#ifdef __clang__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wnullability-completeness" -#pragma clang diagnostic ignored "-Wunused-variable" -#endif /* __clang__ */ - -#include - -#ifdef __clang__ -#pragma clang diagnostic pop -#endif /* __clang__ */ diff --git a/backends/vulkan/runtime/vk_api/vk_api.h b/backends/vulkan/runtime/vk_api/vk_api.h deleted file mode 100644 index e3fbf057f8b..00000000000 --- a/backends/vulkan/runtime/vk_api/vk_api.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#ifdef USE_VULKAN_WRAPPER -#ifdef USE_VULKAN_VOLK -#ifdef VK_ANDROID_external_memory_android_hardware_buffer -#include -#include -#include -#endif /* VK_ANDROID_external_memory_android_hardware_buffer */ - -#include -#else -#include -#endif /* USE_VULKAN_VOLK */ -#else -#include -#endif /* USE_VULKAN_WRAPPER */ diff --git a/backends/vulkan/serialization b/backends/vulkan/serialization new file mode 120000 index 00000000000..9d6671ad5f4 --- /dev/null +++ b/backends/vulkan/serialization @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/serialization \ No newline at end of file diff --git a/backends/vulkan/serialization/TARGETS b/backends/vulkan/serialization/TARGETS deleted file mode 100644 index 41893d29274..00000000000 --- a/backends/vulkan/serialization/TARGETS +++ /dev/null @@ -1,4 +0,0 @@ -load(":targets.bzl", "define_common_targets") -oncall("executorch") - -define_common_targets(is_fbcode = True) diff --git a/backends/vulkan/serialization/schema.fbs b/backends/vulkan/serialization/schema.fbs deleted file mode 100644 index b6670b6f53d..00000000000 --- a/backends/vulkan/serialization/schema.fbs +++ /dev/null @@ -1,152 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. - -namespace vkgraph; - -// Update after any BC breaking changes. -file_identifier "VK00"; - -table OperatorCall { - node_id:uint; - name:string; - args:[int]; -} - -enum VkDataType : byte { - BOOL = 0, - UINT8 = 1, - INT8 = 2, - INT32 = 3, - FLOAT16 = 4, - FLOAT32 = 5, - FLOAT64 = 6, - INT64 = 7, -} - -// Describes what kind of GPU resource should be used to represent a tensor. The -// int values assigned to each entry must match the corresponding entry in -// api::StorageType. -enum VkStorageType : ubyte { - BUFFER = 0, - TEXTURE_3D = 1, - TEXTURE_2D = 2, - DEFAULT_STORAGE = 255, -} - -// Describes how memory should be laid out in GPU memory. See the GPUMemoryLayout -// enum class in PyTorch Vulkan for more details. The int values assigned to each -// entry must match the corresponding entry in utils::GPUMemoryLayout. -enum VkMemoryLayout : ubyte { - TENSOR_WIDTH_PACKED = 0, - TENSOR_HEIGHT_PACKED = 1, - TENSOR_CHANNELS_PACKED = 2, - DEFAULT_LAYOUT = 255, -} - -table VkTensor { - // Type of the tensor elements. - datatype:VkDataType; - // Shape dimensions. - dims:[uint]; - // Index to the program's constant data. Negative indicates tensor is non-constant. - constant_id:int; - // Index to the shared memory object. Negative indicates the tensor doesn't share memory. 
- mem_obj_id:int; - // Storage type that should be used to represent this tensor - storage_type:VkStorageType = DEFAULT_STORAGE; - // Memory layout that should be used to represent this tensor - memory_layout:VkMemoryLayout = DEFAULT_LAYOUT; -} - -table Null {} - -table Int { - int_val:long; -} - -table Bool { - bool_val:bool; -} - -table Double { - double_val:double; -} - -table String { - string_val:string; -} - -table IntList { - items:[long]; -} - -table DoubleList { - items:[double]; -} - -table BoolList { - items:[bool]; -} - -table ValueList { - items:[int]; -} - -table SymInt { - value:int; -} - -union GraphTypes { - Null, - Int, - Double, - Bool, - VkTensor, - IntList, - DoubleList, - BoolList, - ValueList, - String, - SymInt, -} - -table VkValue { - value:GraphTypes; -} - -// Abstraction to represent a region of bytes in a raw data buffer. Useful for referencing raw data -// serialized outside of the flatbuffer. -table VkBytes { - offset:ulong; - length:ulong; - named_key:string; -} - -table VkGraph { - // Schema version. - version:string; - - // Objects - chain:[OperatorCall]; - values:[VkValue]; - - // Indices - input_ids:[uint]; - output_ids:[uint]; - - // Raw Objects (e.g. weight tensors and custom shaders) - constants:[VkBytes]; - shaders:[VkBytes]; - - // Graph configuration - // As per flatbuffer BC/FC policy, new fields can be freely added to this - // section. It is recommended to provide default values, since older blobs - // without the field will be deserialized with the default value. - - // Sets an override for the storage type and memory layout that will be used - // to represent a VkTensor if the VkTensor is not serialized with a particular - // storage type or memory layout setting - storage_type_override:VkStorageType = DEFAULT_STORAGE; - memory_layout_override:VkMemoryLayout = DEFAULT_LAYOUT; -} - -root_type VkGraph; diff --git a/backends/vulkan/serialization/targets.bzl b/backends/vulkan/serialization/targets.bzl deleted file mode 100644 index 15ec61e70b0..00000000000 --- a/backends/vulkan/serialization/targets.bzl +++ /dev/null @@ -1,60 +0,0 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -def define_common_targets(is_fbcode = False): - runtime.genrule( - name = "gen_vk_delegate_schema", - srcs = ["schema.fbs"], - # We're only generating a single file, so it seems like we could use - # `out`, but `flatc` takes a directory as a parameter, not a single - # file. Use `outs` so that `${OUT}` is expanded as the containing - # directory instead of the file itself. 
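On the C++ side, the schema above is consumed through the header generated by `flatc` (see the genrule below). As a rough sketch of what reading a serialized graph could look like with the standard FlatBuffers C++ API (the accessor names follow the usual `flatc --cpp --scoped-enums` output and the include path is illustrative; the runtime's actual usage may differ):

```cpp
#include <cstdio>

#include "schema_generated.h" // path is an assumption for this sketch

void dump_graph(const void* flatbuffer_data) {
  // Root accessor and identifier check are generated for root_type VkGraph.
  if (!vkgraph::VkGraphBufferHasIdentifier(flatbuffer_data)) {
    return; // not a "VK00" buffer
  }
  const vkgraph::VkGraph* graph = vkgraph::GetVkGraph(flatbuffer_data);

  // Each OperatorCall names an operator and the value ids it consumes.
  for (const vkgraph::OperatorCall* call : *graph->chain()) {
    std::printf("%s takes %u args\n", call->name()->c_str(), call->args()->size());
  }
}
```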
- outs = { - "schema_generated.h": ["schema_generated.h"], - }, - cmd = " ".join([ - "$(exe {})".format(runtime.external_dep_location("flatc")), - "--cpp", - "--cpp-std c++11", - "--scoped-enums", - "-o ${OUT}", - "${SRCS}", - ]), - default_outs = ["."], - ) - - runtime.cxx_library( - name = "vk_delegate_schema", - srcs = [], - visibility = [ - "//executorch/backends/vulkan/...", - ], - exported_headers = { - "schema_generated.h": ":gen_vk_delegate_schema[schema_generated.h]", - }, - exported_external_deps = [ - "flatbuffers-api", - ], - ) - - if is_fbcode: - runtime.python_library( - name = "lib", - srcs = [ - "vulkan_graph_builder.py", - "vulkan_graph_schema.py", - "vulkan_graph_serialize.py", - ], - resources = [ - "schema.fbs", - ], - visibility = [ - "//executorch/...", - "//executorch/vulkan/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//executorch/exir:graph_module", - "//executorch/exir/_serialize:_bindings", - "//executorch/exir/_serialize:lib", - ], - ) diff --git a/backends/vulkan/serialization/vulkan_graph_builder.py b/backends/vulkan/serialization/vulkan_graph_builder.py deleted file mode 100644 index 78ac51c8808..00000000000 --- a/backends/vulkan/serialization/vulkan_graph_builder.py +++ /dev/null @@ -1,471 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -import ctypes -import hashlib -import logging -import operator -from types import NoneType -from typing import cast, List, Optional, Union - -import executorch.backends.vulkan.serialization.vulkan_graph_schema as vk_graph_schema - -import torch - -from executorch.backends.vulkan.serialization.vulkan_graph_schema import ( - VkMemoryLayout, - VkStorageType, -) -from executorch.backends.vulkan.utils import ( - is_constant, - is_get_attr_node, - is_mutable_buffer_node, - is_param_node, - is_symint_node, - TensorRepr, -) -from executorch.exir._serialize._named_data_store import NamedDataStore -from executorch.exir.backend.utils import DelegateMappingBuilder - -from executorch.exir.tensor import TensorSpec -from torch._export.utils import get_buffer, get_param, is_buffer, is_param -from torch.export import ExportedProgram -from torch.fx import Node - -_ScalarType = Union[bool, int, float] -_Argument = Union[ - Node, NoneType, _ScalarType, TensorSpec, List[_ScalarType], List[Node], str -] - -logger: logging.Logger = logging.getLogger("") -logger.setLevel(logging.INFO) - - -class VkGraphBuilder: - def __init__( - self, - program: ExportedProgram, - delegate_mapping_builder: DelegateMappingBuilder, - downcast_64_bit: bool = True, - ) -> None: - self.program = program - self.delegate_mapping_builder = delegate_mapping_builder - self.downcast_64_bit = downcast_64_bit - self.chain = [] - self.values = [] - self.input_ids = [] - self.output_ids = [] - self.const_tensors = [] - self.named_data_store = NamedDataStore() - - # Mapping from Node to VkValue id - self.node_to_value_ids = {} - # Mapping from const scalar value to created VkValue id - self.const_scalar_to_value_ids = {} - - # For logging - self.seen_ops = set() - - @staticmethod - def get_vk_datatype(torch_dtype: torch.dtype) -> vk_graph_schema.VkDataType: - if torch_dtype == torch.bool: - return vk_graph_schema.VkDataType.BOOL - elif torch_dtype == torch.uint8: - return vk_graph_schema.VkDataType.UINT8 - elif torch_dtype == torch.int8: - return vk_graph_schema.VkDataType.INT8 - elif torch_dtype == 
torch.int32: - return vk_graph_schema.VkDataType.INT32 - elif torch_dtype == torch.int64: - return vk_graph_schema.VkDataType.INT64 - elif torch_dtype == torch.float16: - return vk_graph_schema.VkDataType.FLOAT16 - elif torch_dtype == torch.float32: - return vk_graph_schema.VkDataType.FLOAT32 - elif torch_dtype == torch.float64: - return vk_graph_schema.VkDataType.FLOAT64 - else: - raise AssertionError(f"Invalid dtype for vulkan_preprocess ({torch_dtype})") - - def get_constant(self, node: Node) -> Optional[torch.Tensor]: - """ - Returns the constant associated with the given node in the exported program. - Returns None if the node is not a constant within the exported program - """ - if is_constant(self.program, node): - constant_name = ( - self.program.graph_signature.inputs_to_lifted_tensor_constants[ - node.name - ] - ) - if constant_name in self.program.constants: - return self.program.constants[constant_name] - else: - return None - - return None - - def get_param_tensor(self, node: Node) -> torch.Tensor: - tensor = None - if node is None: - raise RuntimeError("node is None") - elif is_param(self.program, node): - tensor = get_param(self.program, node) - elif is_buffer(self.program, node): - tensor = get_buffer(self.program, node) - elif is_constant(self.program, node): - tensor = self.get_constant(node) - elif is_get_attr_node(node): - # This is a hack to support both lifted and unlifted graph - try: - tensor = getattr(node.graph.owning_module, node.target) - except AttributeError: - tensor = getattr(self.program.graph_module, node.target) - else: - raise RuntimeError(f"unsupported param type, {node.op}.") - - assert tensor is not None - return tensor - - def maybe_add_constant_tensor(self, node: Node) -> int: - constant_id = -1 - if is_param_node(self.program, node): - tensor = self.get_param_tensor(node) - - # Serialize tensor data to bytes - tensor = tensor.contiguous() - size = tensor.untyped_storage().nbytes() - - if size > 0: - array_type = ctypes.c_char * size - array = ctypes.cast( - tensor.untyped_storage().data_ptr(), - ctypes.POINTER(array_type), - ).contents - - # Generate SHA256 hash as the named key - tensor_bytes = bytes(array) - sha256_hash = hashlib.sha256(tensor_bytes) - named_key = sha256_hash.hexdigest() - - # Add to named data store with 16-byte alignment (matching XNNPACK) - self.named_data_store.add_named_data( - named_key, tensor_bytes, alignment=16 - ) - - # Create VkBytes entry with named_key and set offset to indicate named data usage - constant_id = len(self.const_tensors) - self.const_tensors.append((named_key, size)) - else: - # Handle empty tensors - constant_id = len(self.const_tensors) - self.const_tensors.append(None) - - return constant_id - - def create_node_value(self, node: Node) -> int: - # If the node has been marked as a scalar tensor, create a SymInt instead of a tensor - if is_symint_node(node) or node.meta.get("etvk_is_scalar_tensor", False): - new_id = self.create_symint_value() - self.node_to_value_ids[node] = new_id - return new_id - - spec = node.meta.get("spec") - if isinstance(spec, TensorSpec): - constant_id = self.maybe_add_constant_tensor(node) - new_id = self.create_tensor_value(spec, constant_id) - self.node_to_value_ids[node] = new_id - return new_id - elif isinstance(spec, list) or isinstance(spec, tuple): - # pyre-ignore[6]: pyre having hard time to infer Node type inside - # the container. 
- new_id = self.create_value_list_value(spec) - self.node_to_value_ids[node] = new_id - return new_id - else: - raise RuntimeError( - f"Cannot create value for node {node} with spec of type {type(spec)}" - ) - - def create_null_value(self) -> int: - new_id = len(self.values) - self.values.append(vk_graph_schema.VkValue(vk_graph_schema.Null())) - return new_id - - def get_or_create_scalar_value(self, scalar: _ScalarType) -> int: - scalar_key = scalar - # Since Python considers 1 and True to be "equivalent" (as well as 0 and False) - # to distinguish entries in the dictionary, if scalar is bool then convert it - # to a string representation to use as a key for the dictionary - if isinstance(scalar, bool): - scalar_key = str(scalar) - - if scalar_key in self.const_scalar_to_value_ids: - return self.const_scalar_to_value_ids[scalar_key] - - new_id = len(self.values) - if isinstance(scalar, bool): - self.values.append(vk_graph_schema.VkValue(vk_graph_schema.Bool(scalar))) - elif isinstance(scalar, int): - self.values.append(vk_graph_schema.VkValue(vk_graph_schema.Int(scalar))) - elif isinstance(scalar, float): - self.values.append(vk_graph_schema.VkValue(vk_graph_schema.Double(scalar))) - - self.const_scalar_to_value_ids[scalar_key] = new_id - return new_id - - def create_symint_value(self) -> int: - new_id = len(self.values) - self.values.append(vk_graph_schema.VkValue(vk_graph_schema.SymInt(0))) - return new_id - - def create_tensor_value(self, spec: TensorSpec, constant_id: int = -1) -> int: - # Negative id indicates that this tensor will have its own dedicated memory. - mem_obj_id = -1 - if spec.mem_obj_id is not None: - mem_obj_id = spec.mem_obj_id - - storage_type = VkStorageType.DEFAULT_STORAGE - memory_layout = VkMemoryLayout.DEFAULT_LAYOUT - if hasattr(spec, "etvk_node_repr"): - # pyre-ignore[16] - assert isinstance(spec.etvk_node_repr, TensorRepr) - storage_type = spec.etvk_node_repr.storage_type - memory_layout = spec.etvk_node_repr.memory_layout - - # Apply downcast logic before getting VK datatype - effective_dtype = spec.dtype - if self.downcast_64_bit and spec.dtype == torch.float64: - effective_dtype = torch.float32 - elif self.downcast_64_bit and spec.dtype == torch.int64: - effective_dtype = torch.int32 - - datatype = self.get_vk_datatype(effective_dtype) - - new_id = len(self.values) - self.values.append( - vk_graph_schema.VkValue( - value=vk_graph_schema.VkTensor( - datatype=datatype, - dims=spec.shape, - constant_id=constant_id, - mem_obj_id=mem_obj_id, - storage_type=storage_type, - memory_layout=memory_layout, - ) - ) - ) - return new_id - - def create_scalar_list_value(self, arg: List[_ScalarType]) -> int: - new_id = len(self.values) - - if len(arg) == 0: - self.values.append( - vk_graph_schema.VkValue(vk_graph_schema.IntList(items=[])) - ) - - all_bool = True - all_int = True - all_float = True - all_int_or_symint = True - - for val in arg: - if not isinstance(val, bool): - all_bool = False - if not isinstance(val, int): - all_int = False - if not (isinstance(val, Node) and is_symint_node(val)): - all_int_or_symint = False - if not isinstance(val, float): - all_float = False - - if all_bool: - self.values.append( - vk_graph_schema.VkValue( - vk_graph_schema.BoolList(items=[cast(bool, e) for e in arg]) - ) - ) - if all_int: - self.values.append( - vk_graph_schema.VkValue( - vk_graph_schema.IntList(items=[cast(int, e) for e in arg]) - ) - ) - elif all_float: - self.values.append( - vk_graph_schema.VkValue( - vk_graph_schema.DoubleList(items=[cast(float, e) for e in arg]) 
- ) - ) - elif all_int_or_symint: - return self.create_value_list_value(arg) - else: - raise NotImplementedError(f"Cannot add value for list {arg}") - - return new_id - - def create_value_list_value(self, arg: tuple | list) -> int: - self.values.append( - vk_graph_schema.VkValue( - vk_graph_schema.ValueList( - items=[self.get_or_create_value_for(e) for e in arg] - ) - ) - ) - return len(self.values) - 1 - - def create_string_value(self, string: str) -> int: - new_id = len(self.values) - self.values.append( - vk_graph_schema.VkValue(vk_graph_schema.String(string_val=string)) - ) - return new_id - - def get_or_create_value_for(self, arg: _Argument): - if isinstance(arg, Node): - # If the Node has already been processed, return the existing id. - if arg in self.node_to_value_ids: - return self.node_to_value_ids[arg] - return self.create_node_value(arg) - elif ( - isinstance(arg, NoneType) - or isinstance(arg, torch.device) - or isinstance(arg, torch.dtype) - or isinstance(arg, torch.layout) - or isinstance(arg, torch.memory_format) - ): - return self.create_null_value() - elif isinstance(arg, _ScalarType): - return self.get_or_create_scalar_value(arg) - elif isinstance(arg, TensorSpec): - return self.create_tensor_value(arg) - elif isinstance(arg, list) and ( - len(arg) == 0 or any(isinstance(val, _ScalarType) for val in arg) - ): - # pyre-ignore[6] - return self.create_scalar_list_value(arg) - elif isinstance(arg, list) and isinstance(arg[0], Node): - return self.create_value_list_value(arg) - elif isinstance(arg, torch.fx.immutable_collections.immutable_list): - return self.create_value_list_value(arg) - elif isinstance(arg, str): - return self.create_string_value(arg) - else: - raise RuntimeError(f"Cannot create value for arg of type {type(arg)}") - - def process_placeholder_node(self, node: Node) -> None: - # ignores any tensors that don't get used in any ops - if len(node.users) == 0: - return None - ids = self.create_node_value(node) - if not is_param_node(self.program, node): - if isinstance(ids, int): - self.input_ids.append(ids) - else: - self.input_ids += ids - - def process_getitem_node(self, node: Node) -> None: - # Find ValueList id from the collection node. - collection_node = node.all_input_nodes[0] - list_id = self.node_to_value_ids[collection_node] - - # Extract the target Value id from ValueList. - valuelist_id = node.args[1] - value_id = self.values[list_id].value.items[valuelist_id] - - # Map Node to Value id. - self.node_to_value_ids[node] = value_id - - def process_call_function_node(self, node) -> None: - operator_call_args = [] - - self.seen_ops.add(node.target) - - if hasattr(node.target, "_schema"): - for i, schema_arg in enumerate(node.target._schema.arguments): - if not schema_arg.kwarg_only and i < len(node.args): - function_arg = node.args[i] - elif schema_arg.name in node.kwargs: - function_arg = node.kwargs[schema_arg.name] - else: - function_arg = schema_arg.default_value - - # Create a Value for each function argument. If the argument has been - # previously encountered, then use the existing Value id. 
- operator_call_args.append(self.get_or_create_value_for(function_arg)) - else: - for _, arg_node in enumerate(node.args): - operator_call_args.append(self.get_or_create_value_for(arg_node)) - - # Add output node - operator_call_args.append(self.create_node_value(node)) - operator_node_id = ( - 0 - if not self.delegate_mapping_builder - else self.delegate_mapping_builder.insert_delegate_mapping_entry(node) - ) - self.chain.append( - vk_graph_schema.OperatorCall( - node_id=operator_node_id, # pyre-ignore[6]: this is going to be an int - name=node.target.__name__, - args=operator_call_args, - ), - ) - - def process_getattr_node(self, node: Node) -> None: - self.create_node_value(node) - - def process_output_node(self, node: Node) -> None: - for out_node in node.all_input_nodes: - if out_node not in self.node_to_value_ids: - raise AssertionError( - "Cannot find input to output node in node_to_value_ids. This means " - "the output node is being serialized before its corresponding " - "internal node which is not allowed." - ) - # Mutable buffers outputs are not included as an output to the - # delegate call. Skip marking them as an output. - if is_mutable_buffer_node(out_node, self.program): - continue - - self.output_ids.append(self.node_to_value_ids[out_node]) - - def process_node(self, node: Node, call_node_debug_hdl: int) -> None: - if node.op == "placeholder": - self.process_placeholder_node(node) - elif node.op == "call_function": - if node.target == operator.getitem: - self.process_getitem_node(node) - else: - node.meta["debug_handle"] = call_node_debug_hdl - self.process_call_function_node(node) - elif node.op == "get_attr": - self.process_getattr_node(node) - elif node.op == "output": - self.process_output_node(node) - else: - raise AssertionError(f"Unsupported node op: {node.op}") - - def build_graph(self) -> vk_graph_schema.VkGraph: - call_node_debug_hdl = 0 - for node in self.program.graph_module.graph.nodes: - self.process_node(node, call_node_debug_hdl) - call_node_debug_hdl += 1 - - logger.info("Operators included in this Vulkan partition: ") - for op in self.seen_ops: - logger.info(f" {op.__name__}") - - return vk_graph_schema.VkGraph( - version="0", - chain=self.chain, - values=self.values, - input_ids=self.input_ids, - output_ids=self.output_ids, - constants=[], - shaders=[], - ) diff --git a/backends/vulkan/serialization/vulkan_graph_schema.py b/backends/vulkan/serialization/vulkan_graph_schema.py deleted file mode 100644 index aa7641bd927..00000000000 --- a/backends/vulkan/serialization/vulkan_graph_schema.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-strict - -""" -Please refer to fbcode/caffe2/executorch/backends/vulkan/serialization/schema/schema.fbs for the schema definitions -""" - -from dataclasses import dataclass -from enum import IntEnum -from typing import List, Union - - -@dataclass -class OperatorCall: - node_id: int - name: str - args: List[int] - - -class VkDataType(IntEnum): - BOOL = 0 - UINT8 = 1 - INT8 = 2 - INT32 = 3 - FLOAT16 = 4 - FLOAT32 = 5 - FLOAT64 = 6 - INT64 = 7 - - -class VkStorageType(IntEnum): - BUFFER = 0 - TEXTURE_3D = 1 - TEXTURE_2D = 2 - DEFAULT_STORAGE = 255 - - def __str__(self) -> str: - return self.name - - -class VkMemoryLayout(IntEnum): - TENSOR_WIDTH_PACKED = 0 - TENSOR_HEIGHT_PACKED = 1 - TENSOR_CHANNELS_PACKED = 2 - DEFAULT_LAYOUT = 255 - - def __str__(self) -> str: - return self.name - - -@dataclass -class VkTensor: - datatype: VkDataType - dims: List[int] - constant_id: int - mem_obj_id: int - storage_type: VkStorageType = VkStorageType.DEFAULT_STORAGE - memory_layout: VkMemoryLayout = VkMemoryLayout.DEFAULT_LAYOUT - - -@dataclass -class Null: - pass - - -@dataclass -class Int: - int_val: int - - -@dataclass -class Bool: - bool_val: bool - - -@dataclass -class Double: - double_val: float - - -@dataclass -class IntList: - items: List[int] - - -@dataclass -class DoubleList: - items: List[float] - - -@dataclass -class BoolList: - items: List[bool] - - -@dataclass -class ValueList: - items: List[int] - - -@dataclass -class String: - string_val: str - - -@dataclass -class SymInt: - value: int - - -GraphTypes = Union[ - Null, - Int, - Double, - Bool, - VkTensor, - IntList, - BoolList, - DoubleList, - ValueList, - String, - SymInt, -] - - -@dataclass -class VkValue: - value: "GraphTypes" - - -@dataclass -class VkBytes: - offset: int - length: int - named_key: str = "" - - -@dataclass -class VkGraph: - version: str - - chain: List[OperatorCall] - values: List[VkValue] - - input_ids: List[int] - output_ids: List[int] - - constants: List[VkBytes] - shaders: List[VkBytes] - - storage_type_override: VkStorageType = VkStorageType.DEFAULT_STORAGE - memory_layout_override: VkMemoryLayout = VkMemoryLayout.DEFAULT_LAYOUT diff --git a/backends/vulkan/serialization/vulkan_graph_serialize.py b/backends/vulkan/serialization/vulkan_graph_serialize.py deleted file mode 100644 index 96f944560a8..00000000000 --- a/backends/vulkan/serialization/vulkan_graph_serialize.py +++ /dev/null @@ -1,274 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# Copyright 2025 Arm Limited and/or its affiliates. -# -# pyre-strict -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
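The dataclasses above mirror the types declared in schema.fbs, and the serialization module that follows converts a populated VkGraph into a flatbuffer payload. As a rough, illustrative sketch only (it assumes the executorch Python package is importable and that flatc is on the PATH, since the converter shells out to the flatbuffer compiler), a trivial graph could be built and serialized like this:

```
# Minimal sketch, not part of the deleted sources: build a trivial VkGraph from the
# schema dataclasses above and serialize it with the helpers defined below.
from executorch.backends.vulkan.serialization.vulkan_graph_schema import (
    Int,
    VkGraph,
    VkValue,
)
from executorch.backends.vulkan.serialization.vulkan_graph_serialize import (
    serialize_vulkan_graph,
)

graph = VkGraph(
    version="0",
    chain=[],                               # no operator calls
    values=[VkValue(value=Int(int_val=3))], # a single scalar value
    input_ids=[],
    output_ids=[],
    constants=[],
    shaders=[],
)

# Result is header + flatbuffer payload + (empty) constant data, each padded to 16 bytes.
blob = serialize_vulkan_graph(graph, const_tensors=[], custom_shaders=[])
print(len(blob))
```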
- -import ctypes -import importlib.resources as _resources -import json -import os -import tempfile -from dataclasses import dataclass -from typing import ClassVar, List - -import executorch.backends.vulkan.serialization as serialization_package - -import torch - -from executorch.backends.vulkan.serialization.vulkan_graph_schema import ( - VkBytes, - VkGraph, -) -from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass -from executorch.exir._serialize._flatbuffer import _flatc_compile, _flatc_decompile - - -def convert_to_flatbuffer(vk_graph: VkGraph) -> bytes: - vk_graph_json = json.dumps(vk_graph, cls=_DataclassEncoder) - - with tempfile.TemporaryDirectory() as d: - schema_path = os.path.join(d, "schema.fbs") - with open(schema_path, "wb") as schema_file: - schema_file.write( - _resources.read_binary(serialization_package, "schema.fbs") - ) - json_path = os.path.join(d, "schema.json") - with open(json_path, "wb") as json_file: - json_file.write(vk_graph_json.encode("ascii")) - _flatc_compile(d, schema_path, json_path) - output_path = os.path.join(d, "schema.bin") - with open(output_path, "rb") as output_file: - return output_file.read() - - -def flatbuffer_to_vk_graph(flatbuffers: bytes) -> VkGraph: - # Following similar (de)serialization logic on other backends: - # https://github.com/pytorch/executorch/blob/main/backends/qualcomm/serialization/qc_schema_serialize.py#L33 - with tempfile.TemporaryDirectory() as d: - schema_path = os.path.join(d, "schema.fbs") - with open(schema_path, "wb") as schema_file: - schema_file.write( - _resources.read_binary(serialization_package, "schema.fbs") - ) - - bin_path = os.path.join(d, "schema.bin") - with open(bin_path, "wb") as bin_file: - bin_file.write(flatbuffers) - - _flatc_decompile(d, schema_path, bin_path, ["--raw-binary"]) - - json_path = os.path.join(d, "schema.json") - with open(json_path, "rb") as output_file: - return _json_to_dataclass(json.load(output_file), VkGraph) - - -def extract_vk_flatbuffer(data: bytes) -> bytes: - h: VulkanDelegateHeader = VulkanDelegateHeader.from_bytes( - data[: VulkanDelegateHeader.EXPECTED_LENGTH] - ) - start = h.flatbuffer_offset - end = h.flatbuffer_offset + h.flatbuffer_size - return data[start:end] - - -@dataclass -class VulkanDelegateHeader: - # Defines the byte region that each component of the header corresponds to - MAGIC_IX: ClassVar[slice] = slice(4, 8) - HEADER_SIZE_IX: ClassVar[slice] = slice(8, 10) - FLATBUFFER_OFFSET_IX: ClassVar[slice] = slice(10, 14) - FLATBUFFER_SIZE_IX: ClassVar[slice] = slice(14, 18) - BYTES_OFFSET_IX: ClassVar[slice] = slice(18, 22) - BYTES_SIZE_IX: ClassVar[slice] = slice(22, 30) - - # magic bytes that should be at the beginning of the header - EXPECTED_MAGIC: ClassVar[bytes] = b"VH00" - # The length of the header in bytes - EXPECTED_LENGTH: ClassVar[int] = 30 - - # Instance attributes, @dataclass will turn these into constructor args - flatbuffer_offset: int - flatbuffer_size: int - bytes_offset: int - bytes_size: int - - @staticmethod - def from_bytes(data: bytes) -> "VulkanDelegateHeader": - if len(data) > VulkanDelegateHeader.EXPECTED_LENGTH: - raise ValueError( - f"Expected header to be {VulkanDelegateHeader.EXPECTED_LENGTH} bytes, " - f"but got {len(data)} bytes." - ) - - magic_b: bytes = data[VulkanDelegateHeader.MAGIC_IX] - - if magic_b != VulkanDelegateHeader.EXPECTED_MAGIC: - raise ValueError( - f"Expected magic bytes to be {VulkanDelegateHeader.EXPECTED_MAGIC}, " - f"but got {magic_b}." 
- ) - - length: int = int.from_bytes( - data[VulkanDelegateHeader.HEADER_SIZE_IX], byteorder="little" - ) - - if length != VulkanDelegateHeader.EXPECTED_LENGTH: - raise ValueError( - f"Expected header to be {VulkanDelegateHeader.EXPECTED_LENGTH} bytes, " - f"but got {length} bytes." - ) - - flatbuffer_offset_b: bytes = data[VulkanDelegateHeader.FLATBUFFER_OFFSET_IX] - flatbuffer_size_b: bytes = data[VulkanDelegateHeader.FLATBUFFER_SIZE_IX] - bytes_offset_b: bytes = data[VulkanDelegateHeader.BYTES_OFFSET_IX] - bytes_size_b: bytes = data[VulkanDelegateHeader.BYTES_SIZE_IX] - - return VulkanDelegateHeader( - flatbuffer_offset=int.from_bytes(flatbuffer_offset_b, byteorder="little"), - flatbuffer_size=int.from_bytes(flatbuffer_size_b, byteorder="little"), - bytes_offset=int.from_bytes(bytes_offset_b, byteorder="little"), - bytes_size=int.from_bytes(bytes_size_b, byteorder="little"), - ) - - def is_valid(self) -> bool: - if self.flatbuffer_size <= 0: - return False - - expected_offset = self.flatbuffer_offset + self.flatbuffer_size - if self.bytes_offset < expected_offset: - return False - - if self.bytes_size < 0: - return False - - return True - - def to_bytes(self) -> bytes: - if not self.is_valid(): - raise ValueError("VulkanDelegateHeader instance contains invalid values") - - data: bytes = ( - # 4 bytes of padding for magic bytes, this is so that the header magic - # bytes is in the same position as the flatbuffer header magic bytes - b"\x00\x00\x00\x00" - + self.EXPECTED_MAGIC - + self.EXPECTED_LENGTH.to_bytes(2, byteorder="little") - + self.flatbuffer_offset.to_bytes(4, byteorder="little") - + self.flatbuffer_size.to_bytes(4, byteorder="little") - + self.bytes_offset.to_bytes(4, byteorder="little") - + self.bytes_size.to_bytes(8, byteorder="little") - ) - - assert len(data) == VulkanDelegateHeader.EXPECTED_LENGTH - - return data - - -def padding_required(data_len: int, alignment: int = 16) -> int: - remainder: int = data_len % alignment - if remainder != 0: - return alignment - remainder - return 0 - - -def aligned_size(data_len: int, alignment: int = 16) -> int: - return data_len + padding_required(data_len, alignment) - - -def pad_to(data: bytes, size: int) -> bytes: - if size > len(data): - data += b"\x00" * (size - len(data)) - return data - - -def serialize_constant_tensors( - vk_graph: VkGraph, - const_tensors: List[torch.Tensor], - raw_bytes: bytearray, -) -> None: - # Make sure that the graph does not have any registered constants prior to calling - # this function. 
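For reference, here is a small sketch (illustrative, not part of the deleted sources) of how the 30-byte delegate header defined above behaves when round-tripped. The offset values are hypothetical but chosen to satisfy is_valid(), with the header padded to a 16-byte boundary in the same way serialize_vulkan_graph does:

```
# Round-trip the 30-byte Vulkan delegate header described above.
from executorch.backends.vulkan.serialization.vulkan_graph_serialize import (
    VulkanDelegateHeader,
)

header = VulkanDelegateHeader(
    flatbuffer_offset=32,          # header padded to 16 bytes -> payload starts at 32
    flatbuffer_size=1024,
    bytes_offset=32 + 1024,        # constant data follows the flatbuffer payload
    bytes_size=4096,
)
data = header.to_bytes()

assert len(data) == VulkanDelegateHeader.EXPECTED_LENGTH  # 30 bytes
assert data[4:8] == VulkanDelegateHeader.EXPECTED_MAGIC   # b"VH00" after 4 padding bytes
assert VulkanDelegateHeader.from_bytes(data) == header    # fields survive the round trip
```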
- assert len(vk_graph.constants) == 0 - - current_offset = len(raw_bytes) - for tensor in const_tensors: - # The tensor data is stored in the named data map - if isinstance(tensor, tuple): - named_key, size = tensor - vk_graph.constants.append( - VkBytes( - offset=18446744073709551615, # UINT64_MAX to indicate named data - length=size, - named_key=named_key, - ) - ) - elif tensor is None or ( - isinstance(tensor, torch.Tensor) and tensor.numel() == 0 - ): - vk_graph.constants.append(VkBytes(current_offset, 0)) - elif isinstance(tensor, torch.Tensor): - array_type = ctypes.c_char * tensor.untyped_storage().nbytes() - array = ctypes.cast( - tensor.untyped_storage().data_ptr(), - ctypes.POINTER(array_type), - ).contents - - tensor_bytes = bytes(array) - # Pad the tensor bytes to the next 16 byte boundary - raw_bytes += tensor_bytes - raw_bytes += b"\x00" * padding_required(len(tensor_bytes)) - - vk_graph.constants.append(VkBytes(current_offset, len(tensor_bytes))) - current_offset += aligned_size(len(tensor_bytes)) - else: - raise ValueError(f"Unsupported constant tensor type: {type(tensor)}") - - -def serialize_custom_shaders( - vk_graph: VkGraph, - custom_shaders: List[str], - raw_bytes: bytearray, -) -> bytes: - # Make sure that the graph deos not have any registered shaders prior to calling - # this function. - assert len(vk_graph.shaders) == 0 - - if len(custom_shaders) == 0: - return b"" - - else: - raise NotImplementedError("Serializing Custom shaders are not yet supported") - - -def serialize_vulkan_graph( - vk_graph: VkGraph, const_tensors: List[torch.Tensor], custom_shaders: List[str] -) -> bytes: - raw_bytes = bytearray() - serialize_constant_tensors(vk_graph, const_tensors, raw_bytes) - serialize_custom_shaders(vk_graph, custom_shaders, raw_bytes) - raw_bytes = bytes(raw_bytes) - - flatbuffer_payload = convert_to_flatbuffer(vk_graph) - - header_len = aligned_size(VulkanDelegateHeader.EXPECTED_LENGTH) - flatbuffer_payload_len = aligned_size(len(flatbuffer_payload)) - raw_bytes_len = aligned_size(len(raw_bytes)) - - header: bytes = VulkanDelegateHeader( - flatbuffer_offset=header_len, - flatbuffer_size=len(flatbuffer_payload), - bytes_offset=header_len + flatbuffer_payload_len, - bytes_size=len(raw_bytes), - ).to_bytes() - - return b"".join( - [ - pad_to(header, header_len), - pad_to(flatbuffer_payload, flatbuffer_payload_len), - pad_to(raw_bytes, raw_bytes_len), - ] - ) diff --git a/backends/vulkan/targets.bzl b/backends/vulkan/targets.bzl deleted file mode 100644 index 775341d420d..00000000000 --- a/backends/vulkan/targets.bzl +++ /dev/null @@ -1,394 +0,0 @@ -load("@fbsource//tools/target_determinator/macros:ci.bzl", "ci") -load("@fbcode_macros//build_defs:native_rules.bzl", "buck_genrule") -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -load("@fbsource//tools/build_defs:platform_defs.bzl", "ANDROID", "CXX", "FBCODE", "APPLE") - - -def get_vulkan_compiler_flags(): - return select({ - "DEFAULT": [ - "-Wno-global-constructors", - "-Wno-missing-prototypes", - ], - "ovr_config//os:windows": [], - }) - -def get_vulkan_preprocessor_flags(no_volk, is_fbcode): - VK_API_PREPROCESSOR_FLAGS = [] - - default_flags = [] - android_flags = [] - - if not no_volk: - for flags in [default_flags, android_flags]: - flags.append("-DUSE_VULKAN_WRAPPER") - flags.append("-DUSE_VULKAN_VOLK") - flags.append("-DUSE_VOLK_HEADER_ONLY") - android_flags.append("-DVK_ANDROID_external_memory_android_hardware_buffer") - - if not is_fbcode: - link_moltenvk = no_volk and 
read_config("etvk", "link_moltenvk", "1") == "1" - mac_flags = default_flags - if link_moltenvk: - mac_flags = [] - - VK_API_PREPROCESSOR_FLAGS += select({ - "DEFAULT": default_flags, - "ovr_config//os:android": android_flags, - "ovr_config//os:macos": mac_flags, - }) + select({ - "//third-party/cuda:windows-cuda-11": [ - "-DVK_USE_PLATFORM_WIN32_KHR", - ], - "DEFAULT": [], - "ovr_config//os:android": [ - "-DVK_USE_PLATFORM_ANDROID_KHR", - ], - "ovr_config//os:linux": [ - "-DVK_USE_PLATFORM_XLIB_KHR", - ], - "ovr_config//os:macos": [ - "-DVK_USE_PLATFORM_MACOS_MVK", - ], - "ovr_config//os:windows": [ - "-DVK_USE_PLATFORM_WIN32_KHR", - ], - }) - - etvk_default_cache_path = read_config("etvk", "default_cache_path", "") - if etvk_default_cache_path != "": - VK_API_PREPROCESSOR_FLAGS += ["-DETVK_DEFAULT_CACHE_PATH={}".format(etvk_default_cache_path)] - - debug_mode = read_config("etvk", "debug", "0") == "1" - if debug_mode: - VK_API_PREPROCESSOR_FLAGS += ["-DVULKAN_DEBUG"] - - return VK_API_PREPROCESSOR_FLAGS - -def get_labels(no_volk): - if no_volk: - return ci.labels(ci.linux(ci.mode("fbsource//arvr/mode/android/mac/dbg"))) - else: - return [] - -def get_platforms(): - return [ANDROID, APPLE, CXX] - -def vulkan_spv_shader_lib(name, spv_filegroups, is_fbcode = False, no_volk = False): - gen_vulkan_spv_target = "//xplat/executorch/backends/vulkan:gen_vulkan_spv_bin" - glslc_path = "//xplat/caffe2/fb/vulkan/dotslash:glslc" - - if is_fbcode: - gen_vulkan_spv_target = "//executorch/backends/vulkan:gen_vulkan_spv_bin" - glslc_path = "//caffe2/fb/vulkan/tools:glslc" - - glsl_paths = [] - - # TODO(ssjia): remove the need for subpath once subdir_glob is enabled in OSS - for target, subpath in spv_filegroups.items(): - glsl_paths.append("$(location {})/{}".format(target, subpath)) - - genrule_cmd = ( - "$(exe {}) ".format(gen_vulkan_spv_target) + - "--glsl-paths {} ".format(" ".join(glsl_paths)) + - "--output-path $OUT " + - "--glslc-path=$(exe {}) ".format(glslc_path) + - "--tmp-dir-path=shader_cache " + - ("-f " if read_config("etvk", "force_shader_rebuild", "0") == "1" else " ") + - select({ - "DEFAULT": "", - "ovr_config//os:android": "--optimize", - "ovr_config//os:linux": "--replace-u16vecn", - "ovr_config//os:windows": "--optimize --spv_debug", - }) - ) - - genrule_name = "gen_{}_cpp".format(name) - buck_genrule( - name = genrule_name, - outs = { - "{}.cpp".format(name): ["spv.cpp"], - }, - cmd = genrule_cmd, - default_outs = ["."], - labels = ["uses_dotslash"], - ) - - suffix = "_no_volk" if no_volk else "" - runtime.cxx_library( - name = name, - srcs = [ - ":{}[{}.cpp]".format(genrule_name, name), - ], - compiler_flags = get_vulkan_compiler_flags(), - labels = get_labels(no_volk), - platforms = get_platforms(), - define_static_target = True, - # Static initialization is used to register shaders to the global shader registry, - # therefore link_whole must be True to make sure unused symbols are not discarded. - # @lint-ignore BUCKLINT: Avoid `link_whole=True` - link_whole = True, - # Define a soname that can be used for dynamic loading in Java, Python, etc. 
- soname = "lib{}.$(ext)".format(name), - exported_deps = [ - "//executorch/backends/vulkan:vulkan_compute_api{}".format(suffix), - ], - ) - -def define_common_targets(is_fbcode = False): - runtime.python_library( - name = "gen_vulkan_spv_lib", - srcs = [ - "runtime/gen_vulkan_spv.py", - ], - base_module = "", - external_deps = ["torchgen"], - ) - - runtime.python_binary( - name = "gen_vulkan_spv_bin", - main_module = "runtime.gen_vulkan_spv", - visibility = [ - "//executorch/backends/vulkan/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - ":gen_vulkan_spv_lib", - ], - ) - - runtime.filegroup( - name = "vulkan_graph_runtime_shaders", - srcs = native.glob([ - "runtime/graph/ops/glsl/*", - ]), - ) - - for no_volk in [True, False]: - # No volk builds only available on xplat to build for Android - if no_volk and is_fbcode: - continue - - suffix = "_no_volk" if no_volk else "" - - VK_API_DEPS = [ - "fbsource//third-party/VulkanMemoryAllocator/3.0.1:VulkanMemoryAllocator_xplat", - ] - - default_deps = [] - android_deps = ["fbsource//third-party/toolchains:android"] - - if no_volk: - for deps in [default_deps, android_deps]: - deps.append("fbsource//third-party/toolchains:vulkan") - deps.append("fbsource//third-party/khronos:vulkan-headers") - else: - for deps in [default_deps, android_deps]: - deps.append("fbsource//third-party/volk:volk-header") - - if is_fbcode: - VK_API_DEPS += [ - "fbsource//third-party/swiftshader:swiftshader_vk_headers", - "fbsource//third-party/swiftshader/lib/linux-x64:libvk_swiftshader_fbcode", - "fbsource//third-party/swiftshader/lib/linux-x64:libvk_swiftshader_so", - ] - else: - link_moltenvk = no_volk and read_config("etvk", "link_moltenvk", "1") == "1" - mac_deps = default_deps - if link_moltenvk: - mac_deps = [ - "//third-party/khronos:moltenVK_static" - ] - - VK_API_DEPS += select({ - "DEFAULT": default_deps, - "ovr_config//os:android": android_deps, - "ovr_config//os:macos": mac_deps, - }) + select({ - "DEFAULT": [], - "ovr_config//os:linux": [ - "//arvr/third-party/libX11:libX11", - ] - }) - - runtime.cxx_library( - name = "vulkan_compute_api{}".format(suffix), - compiler_flags = get_vulkan_compiler_flags(), - srcs = native.glob([ - "runtime/api/**/*.cpp", - "runtime/utils/**/*.cpp", - "runtime/vk_api/**/*.cpp", - ]), - exported_headers = native.glob([ - "runtime/api/**/*.h", - "runtime/utils/**/*.h", - "runtime/vk_api/**/*.h", - ]), - labels = get_labels(no_volk), - platforms = get_platforms(), - visibility = [ - "//executorch/backends/vulkan/...", - "@EXECUTORCH_CLIENTS", - ], - fbobjc_frameworks = select({ - "DEFAULT": [], - "ovr_config//os:macos": [ - "$SDKROOT/System/Library/Frameworks/CoreGraphics.framework", - "$SDKROOT/System/Library/Frameworks/Foundation.framework", - "$SDKROOT/System/Library/Frameworks/AppKit.framework", - "$SDKROOT/System/Library/Frameworks/Metal.framework", - "$SDKROOT/System/Library/Frameworks/QuartzCore.framework", - ], - }), - exported_preprocessor_flags = get_vulkan_preprocessor_flags(no_volk, is_fbcode), - exported_deps = VK_API_DEPS, - ) - - runtime.cxx_library( - name = "vulkan_graph_runtime{}".format(suffix), - srcs = native.glob([ - "runtime/graph/**/*.cpp", - ]), - compiler_flags = get_vulkan_compiler_flags(), - exported_headers = native.glob([ - "runtime/graph/**/*.h", - ]), - labels = get_labels(no_volk), - platforms = get_platforms(), - visibility = [ - "//executorch/backends/...", - "//executorch/extension/pybindings/...", - "//executorch/test/...", - "@EXECUTORCH_CLIENTS", - ], - exported_deps = [ - 
":vulkan_graph_runtime_shaderlib{}".format(suffix), - "//executorch/runtime/backend:interface", - ], - define_static_target = True, - # Static initialization is used to register operators to the global operator registry, - # therefore link_whole must be True to make sure unused symbols are not discarded. - # @lint-ignore BUCKLINT: Avoid `link_whole=True` - link_whole = True, - # Define an soname that can be used for dynamic loading in Java, Python, etc. - soname = "libvulkan_graph_runtime.$(ext)", - ) - - vulkan_spv_shader_lib( - name = "vulkan_graph_runtime_shaderlib{}".format(suffix), - spv_filegroups = { - ":vulkan_graph_runtime_shaders": "runtime/graph/ops/glsl", - }, - is_fbcode = is_fbcode, - no_volk = no_volk, - ) - - runtime.cxx_library( - name = "vulkan_backend_lib{}".format(suffix), - srcs = native.glob([ - "runtime/*.cpp", - ]), - compiler_flags = get_vulkan_compiler_flags(), - headers = native.glob([ - "runtime/*.h", - ]), - labels = get_labels(no_volk), - platforms = get_platforms(), - visibility = [ - "//executorch/backends/...", - "//executorch/extension/pybindings/...", - "//executorch/test/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - ":vulkan_graph_runtime{}".format(suffix), - "//executorch/backends/vulkan/serialization:vk_delegate_schema", - "//executorch/runtime/core:event_tracer", - "//executorch/runtime/core/exec_aten/util:tensor_util", - "//executorch/runtime/core:named_data_map", - ], - define_static_target = True, - # VulkanBackend.cpp needs to compile with executor as whole - # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) - link_whole = True, - ) - - ## - ## AOT targets - ## - if is_fbcode: - runtime.python_library( - name = "utils_lib", - srcs = [ - "utils.py", - ], - visibility = [ - "//executorch/backends/vulkan/...", - ], - deps = [ - "//caffe2:torch", - "//executorch/exir:tensor", - "//executorch/exir/backend/canonical_partitioners:config_partitioner_lib", - "//executorch/backends/vulkan/serialization:lib", - ] - ) - - runtime.python_library( - name = "custom_ops_lib", - srcs = [ - "custom_ops_lib.py" - ], - visibility = [ - "//executorch/...", - "//executorch/vulkan/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//caffe2:torch", - "//executorch/backends/vulkan/patterns:vulkan_patterns", - ] - ) - - runtime.python_library( - name = "op_registry", - srcs = [ - "op_registry.py", - ], - visibility = [ - "//executorch/...", - "//executorch/vulkan/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - ":custom_ops_lib", - ":utils_lib", - "//caffe2:torch", - "//executorch/exir/dialects:lib", - "//executorch/backends/vulkan/serialization:lib", - ] - ) - - runtime.python_library( - name = "vulkan_preprocess", - srcs = [ - "vulkan_preprocess.py", - ], - visibility = [ - "//executorch/...", - "//executorch/vulkan/...", - "@EXECUTORCH_CLIENTS", - ], - deps = [ - "//executorch/backends/transforms:addmm_mm_to_linear", - "//executorch/backends/transforms:fuse_batch_norm_with_conv", - "//executorch/backends/transforms:fuse_conv_with_clamp", - "//executorch/backends/transforms:fuse_view_copy", - "//executorch/backends/transforms:remove_clone_ops", - "//executorch/backends/transforms:view_copy_to_squeeze_unsqueeze", - "//executorch/backends/vulkan/_passes:vulkan_passes", - "//executorch/backends/vulkan/serialization:lib", - "//executorch/backends/transforms:remove_getitem_op", - "//executorch/backends/xnnpack/_passes:xnnpack_passes", - "//executorch/exir/backend:backend_details", - ], - ) diff --git a/backends/vulkan/targets.bzl 
b/backends/vulkan/targets.bzl new file mode 120000 index 00000000000..f44d6f73587 --- /dev/null +++ b/backends/vulkan/targets.bzl @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/targets.bzl \ No newline at end of file diff --git a/backends/vulkan/test b/backends/vulkan/test new file mode 120000 index 00000000000..4de6140b88e --- /dev/null +++ b/backends/vulkan/test @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/test \ No newline at end of file diff --git a/backends/vulkan/test/CMakeLists.txt b/backends/vulkan/test/CMakeLists.txt deleted file mode 100644 index e3bce1d8baf..00000000000 --- a/backends/vulkan/test/CMakeLists.txt +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# ### Editing this file ### -# -# This file should be formatted with -# ~~~ -# cmake-format -i CMakeLists.txt -# ~~~ -# It should also be cmake-lint clean. -# -# The targets in this file will be built if EXECUTORCH_BUILD_VULKAN is ON - -cmake_minimum_required(VERSION 3.19) -project(executorch) - -if(ANDROID) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH) -endif() - -find_package(executorch CONFIG REQUIRED COMPONENTS vulkan_backend) -find_package(GTest CONFIG REQUIRED) - -# Only build tests if Vulkan was compiled -if(TARGET vulkan_backend) - if(NOT EXECUTORCH_ROOT) - set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) - endif() - - if(NOT PYTHON_EXECUTABLE) - set(PYTHON_EXECUTABLE python3) - endif() - - # Include this file to access executorch_target_link_options_shared_lib This - # is required to provide access to executorch_target_link_options_shared_lib - # which allows libraries to be linked with the --whole-archive flag. This is - # required for libraries that perform dynamic registration via static - # initialization. - include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) - - include(../cmake/ShaderLibrary.cmake) - - # Third party include paths - - set(VULKAN_THIRD_PARTY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../third-party) - - set(GTEST_INCLUDE_PATH - ${EXECUTORCH_ROOT}/third-party/googletest/googletest/include - ) - set(VULKAN_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/Vulkan-Headers/include) - set(VOLK_PATH ${VULKAN_THIRD_PARTY_PATH}/volk) - set(VMA_PATH ${VULKAN_THIRD_PARTY_PATH}/VulkanMemoryAllocator) - - set(COMMON_INCLUDES ${EXECUTORCH_ROOT}/.. 
${VULKAN_HEADERS_PATH} ${VOLK_PATH} - ${VMA_PATH} ${GTEST_INCLUDE_PATH} ${PYTORCH_PATH} - ) - - # Test Utility files - - set(TEST_UTILS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/utils) - file(GLOB TEST_UTILS_CPP ${CMAKE_CURRENT_SOURCE_DIR}/utils/*.cpp) - - # Test shaders - - set(TEST_SHADERS_PATH ${CMAKE_CURRENT_SOURCE_DIR}/glsl) - gen_vulkan_shader_lib_cpp(${TEST_SHADERS_PATH}) - vulkan_shader_lib(test_shaderlib ${generated_spv_cpp}) - - # API Test binary - - set(COMPUTE_API_TEST_CPP - ${CMAKE_CURRENT_SOURCE_DIR}/vulkan_compute_api_test.cpp - ) - - executorch_target_link_options_shared_lib(vulkan_backend) - - add_executable( - vulkan_compute_api_test ${COMPUTE_API_TEST_CPP} ${TEST_UTILS_CPP} - ) - target_include_directories(vulkan_compute_api_test PRIVATE ${COMMON_INCLUDES}) - target_link_libraries( - vulkan_compute_api_test PRIVATE GTest::gtest_main vulkan_backend - executorch_core test_shaderlib - ) - target_compile_options(vulkan_compute_api_test PRIVATE ${VULKAN_CXX_FLAGS}) - - set_property(TARGET vulkan_compute_api_test PROPERTY CXX_STANDARD 17) - -endif() diff --git a/backends/vulkan/test/TARGETS b/backends/vulkan/test/TARGETS deleted file mode 100644 index 53fad86f90c..00000000000 --- a/backends/vulkan/test/TARGETS +++ /dev/null @@ -1,91 +0,0 @@ -load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") - -oncall("executorch") - -python_unittest( - name = "test_vulkan_delegate", - srcs = [ - "test_vulkan_delegate.py", - ], - preload_deps = [ - "fbsource//third-party/swiftshader/lib/linux-x64:libvk_swiftshader_fbcode", - "//executorch/backends/vulkan:vulkan_backend_lib", - "//executorch/kernels/portable:custom_ops_generated_lib", - ], - deps = [ - ":test_utils", - "//caffe2:torch", - "//executorch/backends/transforms:convert_dtype_pass", - "//executorch/backends/vulkan:vulkan_preprocess", - "//executorch/backends/vulkan/partitioner:vulkan_partitioner", - "//executorch/exir:lib", - "//executorch/extension/pybindings:portable_lib", # @manual - "//executorch/extension/pytree:pylib", - "//executorch/kernels/portable:custom_ops_generated_lib", - ], -) - -python_unittest( - name = "test_vulkan_passes", - srcs = [ - "test_vulkan_passes.py", - ], - deps = [ - "//caffe2:torch", - "//executorch/backends/vulkan/_passes:vulkan_passes", - "//executorch/backends/vulkan/quantizer:vulkan_quantizer", - "//executorch/backends/vulkan:vulkan_preprocess", - "//pytorch/ao:torchao", # @manual - ] -) - -python_unittest( - name = "test_vulkan_delegate_header", - srcs = [ - "test_vulkan_delegate_header.py", - ], - deps = [ - "//executorch/backends/vulkan:vulkan_preprocess", - ], -) - -python_unittest( - name = "test_serialization", - srcs = [ - "test_serialization.py", - ], - deps = [ - "//caffe2:torch", - "//executorch/backends/vulkan:vulkan_preprocess", - ], -) - -runtime.python_library( - name = "tester", - srcs = ["tester.py"], - deps = [ - "//executorch/backends/vulkan/partitioner:vulkan_partitioner", - "//executorch/backends/vulkan:vulkan_preprocess", - ] -) - -runtime.python_library( - name = "test_utils", - srcs = [ - "utils.py", - ], - deps = [ - "//caffe2:torch", - "//executorch/backends/vulkan:vulkan_preprocess", - "//executorch/backends/vulkan/partitioner:vulkan_partitioner", - "//executorch/backends/xnnpack:xnnpack_preprocess", - "//executorch/backends/xnnpack/quantizer:xnnpack_quantizer", - "//executorch/backends/xnnpack/partition:xnnpack_partitioner", - "//executorch/devtools:lib", - 
"//executorch/devtools/bundled_program/serialize:lib", - "//executorch/exir:lib", - "//executorch/extension/pybindings:portable_lib", # @manual - "//executorch/extension/pytree:pylib", - ], -) diff --git a/backends/vulkan/test/compute_api_tests.bzl b/backends/vulkan/test/compute_api_tests.bzl deleted file mode 100644 index db7bfe3c6ab..00000000000 --- a/backends/vulkan/test/compute_api_tests.bzl +++ /dev/null @@ -1,73 +0,0 @@ -load("@fbsource//tools/target_determinator/macros:ci.bzl", "ci") -load("@fbsource//tools/build_defs:fb_xplat_cxx_binary.bzl", "fb_xplat_cxx_binary") -load("@fbsource//tools/build_defs:fb_xplat_cxx_test.bzl", "fb_xplat_cxx_test") -load("@fbsource//tools/build_defs:platform_defs.bzl", "ANDROID", "MACOSX", "CXX") -load( - "@fbsource//xplat/executorch/backends/vulkan:targets.bzl", - "get_labels", - "get_platforms", - "vulkan_spv_shader_lib", -) - -def define_compute_api_test_targets(): - for no_volk in [True, False]: - suffix = "_no_volk" if no_volk else "" - - vulkan_spv_shader_lib( - name = "test_shader_lib{}".format(suffix), - spv_filegroups = { - ":test_shaders": "glsl", - }, - no_volk = no_volk, - ) - - fb_xplat_cxx_binary( - name = "vulkan_compute_api_test_bin{}".format(suffix), - srcs = [ - "utils/test_utils.cpp", - "vulkan_compute_api_test.cpp", - ], - headers = [ - "utils/test_utils.h", - ], - apple_sdks = MACOSX, - labels = get_labels(no_volk), - platforms = get_platforms(), - visibility = ["PUBLIC"], - deps = [ - ":test_shader_lib{}".format(suffix), - "//third-party/googletest:gtest_main", - "//xplat/executorch/backends/vulkan:vulkan_graph_runtime{}".format(suffix), - "//xplat/executorch/runtime/core/exec_aten:lib", - ], - ) - - # no_volk variant does not work under the flagfile used for instrumentation tests, - # but it is also not necessary to test it as an instrumentation test. Therefore do - # not generate a no_volk variant for the instrumentation test. - fb_xplat_cxx_test( - name = "vulkan_compute_api_test{}".format(suffix), - srcs = [ - "utils/test_utils.cpp", - "vulkan_compute_api_test.cpp", - ], - headers = [ - "utils/test_utils.h", - ], - contacts = ["oncall+ai_infra_mobile_platform@xmail.facebook.com"], - fbandroid_additional_loaded_sonames = [ - "test_shader_lib", - "vulkan_graph_runtime", - "vulkan_graph_runtime_shaderlib", - ], - # Since this is an Android instrumentation test, only generate for ANDROID - platforms = [ANDROID], - use_instrumentation_test = True, - visibility = ["PUBLIC"], - deps = [ - ":test_shader_lib{}".format(suffix), - "//third-party/googletest:gtest_main", - "//xplat/executorch/backends/vulkan:vulkan_graph_runtime{}".format(suffix), - "//xplat/executorch/runtime/core/exec_aten:lib", - ], - ) diff --git a/backends/vulkan/test/custom_ops/CMakeLists.txt b/backends/vulkan/test/custom_ops/CMakeLists.txt deleted file mode 100644 index fe58055f649..00000000000 --- a/backends/vulkan/test/custom_ops/CMakeLists.txt +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -cmake_minimum_required(VERSION 3.19) -project(prototyping_shaders) - -if(ANDROID) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH) -endif() - -find_package(executorch CONFIG REQUIRED COMPONENTS vulkan_backend) - -# Compile settings - -set(VULKAN_CXX_FLAGS "-fexceptions") -list(APPEND VULKAN_CXX_FLAGS "-DUSE_VULKAN_WRAPPER") -list(APPEND VULKAN_CXX_FLAGS "-DUSE_VULKAN_VOLK") - -message(STATUS "VULKAN_CXX_FLAGS: ${VULKAN_CXX_FLAGS}") - -# Only build if Vulkan was compiled -if(TARGET vulkan_backend) - if(NOT EXECUTORCH_ROOT) - set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) - endif() - - if(NOT PYTHON_EXECUTABLE) - set(PYTHON_EXECUTABLE python3) - endif() - - # Include this file to access executorch_target_link_options_shared_lib - include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) - include(${EXECUTORCH_ROOT}/backends/vulkan/cmake/ShaderLibrary.cmake) - - # Third party include paths - set(VULKAN_THIRD_PARTY_PATH ${EXECUTORCH_ROOT}/backends/vulkan/third-party) - set(VULKAN_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/Vulkan-Headers/include) - set(VOLK_PATH ${VULKAN_THIRD_PARTY_PATH}/volk) - set(VMA_PATH ${VULKAN_THIRD_PARTY_PATH}/VulkanMemoryAllocator) - - set(COMMON_INCLUDES ${EXECUTORCH_ROOT}/.. ${VULKAN_HEADERS_PATH} ${VOLK_PATH} - ${VMA_PATH} - ) - - # Prototyping utility files - set(PROTOTYPING_UTILS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}) - set(PROTOTYPING_UTILS_CPP ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp) - - # Prototyping shaders - message(STATUS "shader stuff") - set(PROTOTYPING_SHADERS_PATH ${CMAKE_CURRENT_SOURCE_DIR}/glsl) - gen_vulkan_shader_lib_cpp(${PROTOTYPING_SHADERS_PATH}) - vulkan_shader_lib(prototyping_shaderlib ${generated_spv_cpp}) - target_compile_options(prototyping_shaderlib PRIVATE ${VULKAN_CXX_FLAGS}) - message(STATUS "done shader stuff") - - # Operator implementations library - file(GLOB OPERATOR_IMPL_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) - add_library(operator_implementations STATIC ${OPERATOR_IMPL_SOURCES}) - target_include_directories( - operator_implementations PRIVATE ${COMMON_INCLUDES} - ) - target_link_libraries( - operator_implementations PRIVATE vulkan_backend executorch_core - prototyping_shaderlib - ) - target_compile_options(operator_implementations PRIVATE ${VULKAN_CXX_FLAGS}) - set_property(TARGET operator_implementations PROPERTY CXX_STANDARD 17) - - executorch_target_link_options_shared_lib(vulkan_backend) - executorch_target_link_options_shared_lib(operator_implementations) - - # Function to create operator prototype binaries - function(add_operator_prototype OPERATOR_NAME) - set(TARGET_NAME ${OPERATOR_NAME}) - set(SOURCE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${OPERATOR_NAME}.cpp) - - add_executable(${TARGET_NAME} ${SOURCE_FILE} ${PROTOTYPING_UTILS_CPP}) - target_include_directories(${TARGET_NAME} PRIVATE ${COMMON_INCLUDES}) - target_link_libraries( - ${TARGET_NAME} PRIVATE vulkan_backend executorch_core - prototyping_shaderlib operator_implementations - ) - target_compile_options(${TARGET_NAME} PRIVATE ${VULKAN_CXX_FLAGS}) - set_property(TARGET ${TARGET_NAME} PROPERTY CXX_STANDARD 17) - endfunction() - - # Define operator prototypes - add_operator_prototype(add) - add_operator_prototype(q8csw_linear) - add_operator_prototype(quantized_q4gaw_linear) - add_operator_prototype(quantized_int4_linear) - add_operator_prototype(q8csw_conv2d) -endif() diff --git a/backends/vulkan/test/custom_ops/TARGETS b/backends/vulkan/test/custom_ops/TARGETS deleted file mode 100644 index e84397dc20e..00000000000 --- 
a/backends/vulkan/test/custom_ops/TARGETS +++ /dev/null @@ -1,5 +0,0 @@ -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets(is_fbcode = True) diff --git a/backends/vulkan/test/custom_ops/add.cpp b/backends/vulkan/test/custom_ops/add.cpp deleted file mode 100644 index bc20246a7d1..00000000000 --- a/backends/vulkan/test/custom_ops/add.cpp +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// All rights reserved. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include "utils.h" - -using namespace executorch::vulkan::prototyping; - -// Generate test cases for add operation -std::vector generate_add_test_cases() { - std::vector test_cases; - - // Set the data generation type as a local variable - DataGenType data_gen_type = DataGenType::ONES; - - // Define different input size configurations - std::vector> size_configs = { - {1, 64, 64}, // Small square - {1, 128, 128}, // Medium square - {1, 256, 256}, // Large square - {1, 512, 512}, // Very large square - {1, 1, 1024}, // Wide tensor - {1, 1024, 1}, // Tall tensor - {32, 32, 32}, // 3D cube - {16, 128, 64}, // 3D rectangular - }; - - // Storage types to test - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - - // Data types to test - std::vector data_types = {vkapi::kFloat, vkapi::kHalf}; - - // Generate test cases for each combination - for (const auto& sizes : size_configs) { - for (const auto& storage_type : storage_types) { - for (const auto& data_type : data_types) { - TestCase test_case; - - // Create a descriptive name for the test case - std::string size_str = ""; - for (size_t i = 0; i < sizes.size(); ++i) { - size_str += std::to_string(sizes[i]); - if (i < sizes.size() - 1) - size_str += "x"; - } - - std::string storage_str = - (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; - std::string dtype_str = (data_type == vkapi::kFloat) ? 
"Float" : "Half"; - - // Add data generation type to the name for clarity - std::string test_name = - "Add_" + size_str + "_" + storage_str + "_" + dtype_str; - test_case.set_name(test_name); - - // Set the operator name for the test case - test_case.set_operator_name("etvk.add_prototype"); - - // Add two input tensors with the same size, type, storage, and data - // generation method - ValueSpec input_a( - sizes, data_type, storage_type, utils::kWidthPacked, data_gen_type); - ValueSpec input_b( - sizes, data_type, storage_type, utils::kWidthPacked, data_gen_type); - - // Add output tensor with the same size, type, and storage as inputs - // (output uses ZEROS by default) - ValueSpec output( - sizes, - data_type, - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - - test_case.add_input_spec(input_a); - test_case.add_input_spec(input_b); - test_case.add_output_spec(output); - - test_cases.push_back(test_case); - } - } - } - - return test_cases; -} - -// Custom FLOP calculator for add operation -// Add operation performs 1 FLOP (addition) per element -int64_t add_flop_calculator(const TestCase& test_case) { - // Calculate total elements from the first input tensor - int64_t total_elements = 1; - if (!test_case.empty() && test_case.num_inputs() > 0 && - test_case.inputs()[0].is_tensor()) { - const auto& sizes = test_case.inputs()[0].get_tensor_sizes(); - for (int64_t size : sizes) { - total_elements *= size; - } - } - - // Add operation: 1 FLOP per element (one addition) - return total_elements; -} - -// Reference implementation for add operator -void add_reference_compute(TestCase& test_case) { - const ValueSpec& input_a = test_case.inputs().at(0); - const ValueSpec& input_b = test_case.inputs().at(1); - - ValueSpec& output = test_case.outputs().at(0); - - if (input_a.dtype != vkapi::kFloat) { - throw std::invalid_argument("Unsupported dtype"); - } - - // Calculate number of elements - int64_t num_elements = input_a.numel(); - - auto& input_a_data = input_a.get_float_data(); - auto& input_b_data = input_b.get_float_data(); - - auto& ref_data = output.get_ref_float_data(); - ref_data.resize(num_elements); - for (int64_t i = 0; i < num_elements; ++i) { - ref_data[i] = input_a_data[i] + input_b_data[i]; - } -} - -int main(int argc, char* argv[]) { - set_print_output(false); // Disable output tensor printing - set_print_latencies(false); // Enable latency timing printing - set_use_gpu_timestamps(true); // Enable GPU timestamps - - print_performance_header(); - std::cout << "Add Operation Prototyping Framework" << std::endl; - print_separator(); - - // Initialize Vulkan context - try { - api::context()->initialize_querypool(); - } catch (const std::exception& e) { - std::cerr << "Failed to initialize Vulkan context: " << e.what() - << std::endl; - return 1; - } - - // Execute test cases using the new framework with custom FLOP calculator and - // reference compute - auto results = execute_test_cases( - generate_add_test_cases, - add_flop_calculator, - "Add", - 3, - 10, - add_reference_compute); - - return 0; -} diff --git a/backends/vulkan/test/custom_ops/build_and_run.sh b/backends/vulkan/test/custom_ops/build_and_run.sh deleted file mode 100755 index 2b9ce576e0e..00000000000 --- a/backends/vulkan/test/custom_ops/build_and_run.sh +++ /dev/null @@ -1,177 +0,0 @@ -#!/bin/zsh - -set -eux - -# Check that we're in the executorch directory -current_dir=$(pwd) -if [[ ! 
"$current_dir" =~ executorch$ ]]; then - echo "Error: This script must be run from a directory ending in 'executorch'" - echo "Current directory: $current_dir" - exit 1 -fi - -# Function to configure and build main project -configure_and_build_main() { - local android_args="" - if [[ "$ANDROID_MODE" == "true" ]]; then - cmake . \ - -DCMAKE_INSTALL_PREFIX=$CMAKE_OUT_DIR \ - -DEXECUTORCH_BUILD_VULKAN=ON \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=arm64-v8a \ - -DANDROID_PLATFORM=android-28 \ - -DGLSLC_PATH=$(which glslc) \ - -B$CMAKE_OUT_DIR - else - cmake . \ - -DCMAKE_INSTALL_PREFIX=$CMAKE_OUT_DIR \ - -DEXECUTORCH_BUILD_VULKAN=ON \ - -DGLSLC_PATH=$(which glslc) \ - -B$CMAKE_OUT_DIR - fi - - cmake --build $CMAKE_OUT_DIR -j16 --target install - # -DCMAKE_CXX_FLAGS="-DVULKAN_DEBUG" \ -} - -# Function to build main project only -build_main() { - cmake --build $CMAKE_OUT_DIR -j16 --target install -} - -# Function to configure and build tests -configure_and_build_tests() { - # Check if glslc is installed - if ! command -v glslc >/dev/null 2>&1; then - echo "Error: glslc is not installed or not found in PATH." - exit 1 - fi - - local android_args="" - if [[ "$ANDROID_MODE" == "true" ]]; then - cmake backends/vulkan/test/custom_ops/ \ - -DCMAKE_INSTALL_PREFIX=$CMAKE_OUT_DIR \ - -DCMAKE_BUILD_TYPE=Debug \ - -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=arm64-v8a \ - -DANDROID_PLATFORM=android-28 \ - -DGLSLC_PATH=$(which glslc) \ - -B$CMAKE_OUT_DIR/backends/vulkan/test/custom_ops - else - cmake backends/vulkan/test/custom_ops/ \ - -DCMAKE_INSTALL_PREFIX=$CMAKE_OUT_DIR \ - -DCMAKE_BUILD_TYPE=Debug \ - -DGLSLC_PATH=$(which glslc) \ - -B$CMAKE_OUT_DIR/backends/vulkan/test/custom_ops - fi - - cmake --build $CMAKE_OUT_DIR/backends/vulkan/test/custom_ops -j16 --target all - -} - -build_tests() { - cmake --build $CMAKE_OUT_DIR/backends/vulkan/test/custom_ops -j16 --target all -} - -# Function to rebuild both main and tests -rebuild_both() { - build_main - build_tests -} - -# Function to clean and rebuild everything -clean_and_rebuild() { - rm -rf $CMAKE_OUT_DIR - configure_and_build_main - configure_and_build_tests -} - -# Function to execute binary if specified -execute_binary() { - local binary_name="$1" - if [[ -n "$binary_name" ]]; then - local binary_path="$CMAKE_OUT_DIR/backends/vulkan/test/custom_ops/$binary_name" - echo "Executing binary: $binary_path" - - if [[ "$ANDROID_MODE" == "true" ]]; then - if [[ -f "$binary_path" ]]; then - echo "Pushing binary to Android device..." - adb push "$binary_path" /data/local/tmp/ - echo "Executing binary on Android device..." - adb shell "cd /data/local/tmp && ./$binary_name" - else - echo "Error: Binary '$binary_path' not found" - exit 1 - fi - else - if [[ -f "$binary_path" && -x "$binary_path" ]]; then - "$binary_path" - else - echo "Error: Binary '$binary_path' not found or not executable" - exit 1 - fi - fi - fi -} - -# Parse command line arguments -BINARY_TO_EXECUTE="" -ANDROID_MODE=false -CMAKE_OUT_DIR="cmake-out" - -# Check for --android flag and adjust arguments accordingly -if [[ "$1" == "--android" ]]; then - ANDROID_MODE=true - CMAKE_OUT_DIR="cmake-android-out" - shift # Remove --android from arguments - echo "Android mode enabled. Using $CMAKE_OUT_DIR as build directory." -fi - -case "${1:-}" in - --rebuild|-r) - echo "Rebuilding both main project and tests..." 
- BINARY_TO_EXECUTE="${2:-}" - rebuild_both - execute_binary "$BINARY_TO_EXECUTE" - ;; - --rebuild1|-r1) - echo "Rebuilding main project only..." - BINARY_TO_EXECUTE="${2:-}" - build_main - execute_binary "$BINARY_TO_EXECUTE" - ;; - --rebuild2|-r2) - echo "Rebuilding tests only..." - BINARY_TO_EXECUTE="${2:-}" - build_tests - execute_binary "$BINARY_TO_EXECUTE" - ;; - --clean|-c) - echo "WARNING: This will delete the entire $CMAKE_OUT_DIR directory and rebuild everything." - echo -n "Are you sure you want to continue? (y/N): " - read -r response - if [[ "$response" =~ ^[Yy]$ ]]; then - echo "Cleaning and rebuilding everything..." - BINARY_TO_EXECUTE="${2:-}" - clean_and_rebuild - execute_binary "$BINARY_TO_EXECUTE" - else - echo "Clean operation cancelled." - exit 0 - fi - ;; - "") - echo "Running full configure and build..." - configure_and_build_main - configure_and_build_tests - ;; - *) - # If first argument doesn't match any build option, treat it as binary name - # and use default build behavior - echo "Running full configure and build..." - BINARY_TO_EXECUTE="$1" - configure_and_build_main - configure_and_build_tests - execute_binary "$BINARY_TO_EXECUTE" - ;; -esac diff --git a/backends/vulkan/test/custom_ops/choose_qparams_per_row.cpp b/backends/vulkan/test/custom_ops/choose_qparams_per_row.cpp deleted file mode 100644 index aa2b21feab8..00000000000 --- a/backends/vulkan/test/custom_ops/choose_qparams_per_row.cpp +++ /dev/null @@ -1,363 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// All rights reserved. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include -#include -#include "utils.h" - -#include - -using namespace executorch::vulkan::prototyping; -using namespace vkcompute; - -static constexpr int64_t kRefDimSizeLimit = 2050; -static constexpr float SMALL_SCALE_THRESHOLD = 6.1e-5f; - -// ChooseQParams configuration struct -struct ChooseQParamsConfig { - int64_t num_channels; // Height dimension (number of channels) - int64_t channel_size; // Width dimension (size per channel) - int32_t quant_min = -128; - int32_t quant_max = 127; - std::string test_case_name = "placeholder"; - std::string op_name = "choose_qparams_per_row"; -}; - -// Utility function to create a test case from a ChooseQParamsConfig -TestCase create_test_case_from_config( - const ChooseQParamsConfig& config, - utils::StorageType storage_type, - vkapi::ScalarType input_dtype) { - TestCase test_case; - - // Create a descriptive name for the test case - std::string storage_str = - (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; - std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half"; - - std::string test_name = - config.test_case_name + "_" + storage_str + "_" + dtype_str; - test_case.set_name(test_name); - - // Set the operator name for the test case - std::string operator_name = "etvk." 
+ config.op_name + ".default"; - test_case.set_operator_name(operator_name); - - // Input tensor (float) - [num_channels, channel_size] - std::vector input_size = {config.num_channels, config.channel_size}; - ValueSpec input_tensor( - input_size, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::RANDOM); - - if (debugging()) { - print_valuespec_data(input_tensor, "input_tensor"); - } - - // Quantization parameters - ValueSpec quant_min(config.quant_min); - ValueSpec quant_max(config.quant_max); - - // Output scale tensor (float) - [num_channels] - ValueSpec scale_out( - {config.num_channels}, - vkapi::kFloat, - utils::kBuffer, // Always buffer as per requirement - utils::kWidthPacked, - DataGenType::ZEROS); - - // Output zero_point tensor (int8) - [num_channels] - ValueSpec zero_point_out( - {config.num_channels}, - vkapi::kChar, // int8 for quantized zero point - utils::kBuffer, // Always buffer as per requirement - utils::kWidthPacked, - DataGenType::ZEROS); - - // Add all specs to test case - test_case.add_input_spec(input_tensor); - test_case.add_input_spec(quant_min); - test_case.add_input_spec(quant_max); - test_case.add_output_spec(scale_out); - test_case.add_output_spec(zero_point_out); - - return test_case; -} - -// CPU reference implementation matching the behavior from op_choose_qparams.cpp -void calculate_scale_and_zero_point_reference( - float min_val, - float max_val, - int32_t qmin, - int32_t qmax, - float& scale, - int32_t& zero_point) { - // Extend the [min, max] interval to ensure that it contains 0 - min_val = std::min(min_val, 0.0f); - max_val = std::max(max_val, 0.0f); - - // Use double precision for intermediate computation but use single precision - // in final number to reflect the actual number used during quantization. - double scale_double = - (static_cast(max_val) - min_val) / (qmax - qmin); - - // If scale is 0 or too small so its reciprocal is infinity, we arbitrary - // adjust the scale to 0.1 . We want to avoid scale's reciprocal being - // infinity because some of fbgemm code pre-computes scale's reciprocal to do - // multiplication instead of division in the time critical part of code. - if (static_cast(scale_double) == 0.0f || - std::isinf(1.0f / static_cast(scale_double))) { - scale_double = 0.1; - } - - // Cut off small scale - if (scale_double < SMALL_SCALE_THRESHOLD) { - float org_scale = static_cast(scale_double); - scale_double = SMALL_SCALE_THRESHOLD; - // Adjust the min and max based on the new scale - if (min_val == 0.0f) { - max_val = SMALL_SCALE_THRESHOLD * (qmax - qmin); - } else if (max_val == 0.0f) { - min_val = -SMALL_SCALE_THRESHOLD * (qmax - qmin); - } else { - float amplifier = SMALL_SCALE_THRESHOLD / org_scale; - min_val *= amplifier; - max_val *= amplifier; - } - } - - // Zero-point computation. - // First the initial floating-point computation. The zero-point can be - // determined from solving an affine equation for any known pair - // (real value, corresponding quantized value). - // We know two such pairs: (rmin, qmin) and (rmax, qmax). - // The arithmetic error on the zero point computed from either pair - // will be roughly machine_epsilon * (sum of absolute values of terms) - // so we want to use the variant that adds the smaller terms. 
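As a standalone illustration of the derivation described in the comment above (a separate sketch, not part of the C++ test file), plugging one concrete range into the two equivalent zero-point formulas from the affine relation real = scale * (q - zero_point) gives:

```
# Worked example of the zero-point derivation, using hypothetical values.
qmin, qmax = -128, 127
rmin, rmax = -1.0, 2.0                  # range already extended to contain 0

scale = (rmax - rmin) / (qmax - qmin)   # 3 / 255 ~= 0.011765
zp_from_min = qmin - rmin / scale       # -128 - (-85) = -43
zp_from_max = qmax - rmax / scale       # 127 - 170   = -43

# Both pairs give the same answer here; the reference implementation picks the one
# with the smaller rounding error and nudges the result into [qmin, qmax].
assert round(zp_from_min) == round(zp_from_max) == -43
# With zero_point = -43, the quantized value q = -43 dequantizes exactly to 0.
```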
- double zero_point_from_min = qmin - min_val / scale_double; - double zero_point_from_max = qmax - max_val / scale_double; - double zero_point_from_min_error = - std::abs(qmin) - std::abs(min_val / scale_double); - double zero_point_from_max_error = - std::abs(qmax) - std::abs(max_val / scale_double); - double initial_zero_point = - zero_point_from_min_error < zero_point_from_max_error - ? zero_point_from_min - : zero_point_from_max; - - // Now we need to nudge the zero point to be an integer - // (our zero points are integer, and this is motivated by the requirement - // to be able to represent the real value "0" exactly as a quantized value, - // which is required in multiple places, for example in Im2col with zero - // padding). - int32_t nudged_zero_point = 0; - if (initial_zero_point < qmin) { - nudged_zero_point = qmin; - } else if (initial_zero_point > qmax) { - nudged_zero_point = qmax; - } else { - nudged_zero_point = - static_cast(nearbyint(static_cast(initial_zero_point))); - } - - scale = static_cast(scale_double); - zero_point = nudged_zero_point; -} - -// Generate easy test cases for choose_qparams_per_channel operation (for -// debugging) -std::vector generate_choose_qparams_per_channel_easy_cases() { - std::vector test_cases; - - // Single simple configuration for debugging - int num_channels = 4; - int channel_size = 8; - - ChooseQParamsConfig config = { - num_channels, // num_channels - channel_size, // channel_size - -128, // quant_min - 127, // quant_max - "simple", // test_case_name - }; - - // Test with both storage types - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - std::vector float_types = {vkapi::kFloat}; - - // Generate test cases for each combination - for (const auto& storage_type : storage_types) { - for (const auto& input_dtype : float_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, input_dtype)); - } - } - - return test_cases; -} - -// Generate test cases for choose_qparams_per_channel operation -std::vector generate_choose_qparams_per_channel_test_cases() { - std::vector test_cases; - - std::vector configs = { - {4, 16}, - {8, 32}, - {16, 64}, - {32, 128}, - {64, 256}, - {128, 512}, - {1, 512}, - // Performance cases - {256, 1024}, - {512, 2048}, - {1, 2048}, - {1, 8096}, - }; - - // Test with different storage types - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - - for (auto config : configs) { - std::string prefix = (config.num_channels < kRefDimSizeLimit && - config.channel_size < kRefDimSizeLimit) - ? 
"correctness_" - : "performance_"; - std::string generated_test_case_name = prefix + - std::to_string(config.num_channels) + "_" + - std::to_string(config.channel_size); - - config.test_case_name = generated_test_case_name; - - for (const auto& storage_type : storage_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, vkapi::kFloat)); - } - } - - return test_cases; -} - -// Reference implementation for choose_qparams_per_channel -void choose_qparams_per_channel_reference_impl(TestCase& test_case) { - int32_t idx = 0; - const ValueSpec& input_spec = test_case.inputs()[idx++]; - const ValueSpec& quant_min_spec = test_case.inputs()[idx++]; - const ValueSpec& quant_max_spec = test_case.inputs()[idx++]; - const ValueSpec& eps_spec = test_case.inputs()[idx++]; - const ValueSpec& dtype_spec = test_case.inputs()[idx++]; - (void)eps_spec; // Unused in reference implementation - (void)dtype_spec; // Unused in reference implementation - - // Extract output specifications - ValueSpec& scale_out_spec = test_case.outputs()[0]; - ValueSpec& zero_point_out_spec = test_case.outputs()[1]; - - // Get tensor dimensions - auto input_sizes = - input_spec.get_tensor_sizes(); // [num_channels, channel_size] - int64_t num_channels = input_sizes[0]; - int64_t channel_size = input_sizes[1]; - - // Skip for large tensors since computation time will be extremely slow - if (num_channels > kRefDimSizeLimit || channel_size > kRefDimSizeLimit) { - throw std::invalid_argument( - "One or more dimensions (num_channels, channel_size) exceed the allowed limit for reference implementation."); - } - - if (input_spec.dtype != vkapi::kFloat) { - throw std::invalid_argument("Unsupported dtype"); - } - - // Get raw data pointers - auto& input_data = input_spec.get_float_data(); - int32_t quant_min = quant_min_spec.get_int_value(); - int32_t quant_max = quant_max_spec.get_int_value(); - - // Prepare output data - auto& scale_ref_data = scale_out_spec.get_ref_float_data(); - auto& zero_point_ref_data = zero_point_out_spec.get_ref_int8_data(); - scale_ref_data.resize(num_channels); - zero_point_ref_data.resize(num_channels); - - // Process each channel - for (int64_t channel = 0; channel < num_channels; ++channel) { - // Find min and max for this channel - float min_val = std::numeric_limits::max(); - float max_val = std::numeric_limits::lowest(); - - for (int64_t i = 0; i < channel_size; ++i) { - int64_t input_idx = channel * channel_size + i; - float val = input_data[input_idx]; - min_val = std::min(min_val, val); - max_val = std::max(max_val, val); - } - - // Calculate scale and zero point for this channel - float scale; - int32_t zero_point; - calculate_scale_and_zero_point_reference( - min_val, max_val, quant_min, quant_max, scale, zero_point); - - // Store results (cast zero_point to int8) - scale_ref_data[channel] = scale; - zero_point_ref_data[channel] = static_cast(zero_point); - } -} - -void reference_impl(TestCase& test_case) { - choose_qparams_per_channel_reference_impl(test_case); -} - -int64_t choose_qparams_per_channel_flop_calculator(const TestCase& test_case) { - // Get input dimensions - const auto& input_sizes = test_case.inputs()[0].get_tensor_sizes(); - int64_t num_channels = input_sizes[0]; - int64_t channel_size = input_sizes[1]; - - // Calculate FLOPs for choose_qparams_per_channel operation - // Each channel requires: - // - Min/max finding: approximately 2 * channel_size comparisons - // - Scale calculation: ~5 operations (division, min/max operations) - // - Zero point 
calculation: ~10 operations (multiple arithmetic operations) - int64_t ops_per_channel = 2 * channel_size + 15; // Simplified estimate - - int64_t flop = num_channels * ops_per_channel; - - return flop; -} - -int main(int argc, char* argv[]) { - set_debugging(false); - set_print_output(false); - set_print_latencies(false); - set_use_gpu_timestamps(true); - - print_performance_header(); - std::cout << "Choose QParams Per Channel Operation Prototyping Framework" - << std::endl; - print_separator(); - - ReferenceComputeFunc ref_fn = reference_impl; - - auto results = execute_test_cases( - generate_choose_qparams_per_channel_test_cases, - choose_qparams_per_channel_flop_calculator, - "ChooseQParamsPerChannel", - 0, - 10, - ref_fn); - - return 0; -} diff --git a/backends/vulkan/test/custom_ops/glsl/add.yaml b/backends/vulkan/test/custom_ops/glsl/add.yaml deleted file mode 100644 index dd479cafd31..00000000000 --- a/backends/vulkan/test/custom_ops/glsl/add.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -add_buffer: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - PACKING: C_packed - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: add_buffer - -add_texture: - parameter_names_with_default_values: - NDIM: 3 - DTYPE: float - PACKING: C_packed - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - shader_variants: - - NAME: add_texture3d diff --git a/backends/vulkan/test/custom_ops/glsl/add_buffer.glsl b/backends/vulkan/test/custom_ops/glsl/add_buffer.glsl deleted file mode 100644 index 8a0ddc4dba7..00000000000 --- a/backends/vulkan/test/custom_ops/glsl/add_buffer.glsl +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} -#define T ${buffer_scalar_type(DTYPE)} - -${define_active_storage_type("buffer")} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")} -${layout_declare_tensor(B, "r", "t_other", DTYPE, "buffer")} - -layout(push_constant) uniform restrict Block { - int out_numel; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const int out_bufi = int(gl_GlobalInvocationID.x); - if (out_bufi >= out_numel) { - return; - } - - // Simple addition without broadcasting - t_out[out_bufi] = t_in[out_bufi] + t_other[out_bufi]; -} \ No newline at end of file diff --git a/backends/vulkan/test/custom_ops/glsl/add_texture.glsl b/backends/vulkan/test/custom_ops/glsl/add_texture.glsl deleted file mode 100644 index f64c8e25d71..00000000000 --- a/backends/vulkan/test/custom_ops/glsl/add_texture.glsl +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
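Stepping back to the choose_qparams_per_channel reference above: the per-channel min/max values feed a standard asymmetric quantization parameter search, where a candidate zero point is derived from each end of the range and the winner is nudged to an integer inside [qmin, qmax]. Below is a self-contained sketch of that calculation; the scale formula and the zero-range fallback are assumptions filled in from the usual recipe, while the candidate-selection and nudging logic mirror the code above.

```
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Sketch of asymmetric quantization parameter selection. Only the zero-point
// candidate selection and nudging follow the reference above; the scale
// formula and degenerate-range fallback are assumed.
void choose_qparams(
    float min_val,
    float max_val,
    int32_t qmin,
    int32_t qmax,
    float& scale,
    int32_t& zero_point) {
  // The range must contain zero so that the real value 0 is exactly
  // representable after quantization.
  min_val = std::min(min_val, 0.0f);
  max_val = std::max(max_val, 0.0f);

  double scale_d = (double(max_val) - double(min_val)) / (qmax - qmin);
  if (scale_d == 0.0) {
    scale_d = 0.1;  // assumed fallback for a constant all-zero range
  }

  // Candidate zero points from either end of the range; keep the one with
  // the smaller error, as in the reference implementation.
  double zp_from_min = qmin - min_val / scale_d;
  double zp_from_max = qmax - max_val / scale_d;
  double zp_from_min_err = std::abs(double(qmin)) - std::abs(min_val / scale_d);
  double zp_from_max_err = std::abs(double(qmax)) - std::abs(max_val / scale_d);
  double initial_zp =
      zp_from_min_err < zp_from_max_err ? zp_from_min : zp_from_max;

  // Nudge the zero point to an integer inside [qmin, qmax].
  int32_t nudged_zp;
  if (initial_zp < qmin) {
    nudged_zp = qmin;
  } else if (initial_zp > qmax) {
    nudged_zp = qmax;
  } else {
    nudged_zp = int32_t(std::nearbyint(initial_zp));
  }

  scale = float(scale_d);
  zero_point = nudged_zp;
}

int main() {
  float scale;
  int32_t zp;
  choose_qparams(-1.0f, 3.0f, -128, 127, scale, zp);
  std::printf("scale = %f, zero_point = %d\n", scale, zp);
  return 0;
}
```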
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -${define_active_storage_type("texture3d")} -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")} -${layout_declare_tensor(B, "r", "t_other", DTYPE, "texture3d")} - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -${layout_declare_spec_const(C, "int", "out_layout", "0")} - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - // Simple addition without broadcasting - same position for all tensors - VEC4_T in_texel = texelFetch(t_in, pos, 0); - VEC4_T other_texel = texelFetch(t_other, pos, 0); - - imageStore(t_out, pos, in_texel + other_texel); -} diff --git a/backends/vulkan/test/custom_ops/glsl/float_canvas.glsl b/backends/vulkan/test/custom_ops/glsl/float_canvas.glsl deleted file mode 100644 index f821fa3586f..00000000000 --- a/backends/vulkan/test/custom_ops/glsl/float_canvas.glsl +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -${define_active_storage_type("texture3d")} - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", "float", "texture3d")} -${layout_declare_tensor(B, "r", "nchw_in", "uint", "buffer")} - -${layout_declare_ubo(B, "ivec3", "out_limits")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(lpos, out_limits))) { - return; - } - - // Placeholder: just copy input to output - vec4 in_texel = vec4(1.0f); - imageStore(t_out, lpos, in_texel); -} diff --git a/backends/vulkan/test/custom_ops/glsl/packed_int32_canvas_buffer.glsl b/backends/vulkan/test/custom_ops/glsl/packed_int32_canvas_buffer.glsl deleted file mode 100644 index c1d90fadf7e..00000000000 --- a/backends/vulkan/test/custom_ops/glsl/packed_int32_canvas_buffer.glsl +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
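The add_buffer and add_texture3d shaders above compute the same elementwise addition without broadcasting: the buffer variant walks a flat index guarded by out_numel, while the texture variant reads and writes whole texels at a shared position. As a rough CPU analogue (an illustrative sketch, not taken from the deleted sources):

```
#include <cstddef>
#include <stdexcept>
#include <vector>

// CPU analogue of the add shaders: out[i] = in[i] + other[i], with the size
// check standing in for the out_numel / out_limits guards.
std::vector<float> add_reference(
    const std::vector<float>& in,
    const std::vector<float>& other) {
  if (in.size() != other.size()) {
    throw std::invalid_argument("add_reference: size mismatch (no broadcasting)");
  }
  std::vector<float> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    out[i] = in[i] + other[i];
  }
  return out;
}
```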
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -${define_active_storage_type("texture3d")} - -#extension GL_EXT_debug_printf : enable - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", "int", "texture3d")} -${layout_declare_tensor(B, "r", "nchw_in", "uint", "buffer")} - -${layout_declare_ubo(B, "ivec3", "out_limits")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(lpos, out_limits))) { - return; - } - - // Pack four 8-bit values equal to 1 into a single uint - int packed = (1 << 0) | (1 << 8) | (1 << 16) | (1 << 24); - - debugPrintfEXT( - "t_out[%i, %i] = %i\\n", - lpos.x, lpos.y, - packed); - - - // Placeholder: just copy input to output - ivec4 in_texel = ivec4(packed); - imageStore(t_out, lpos, in_texel); -} diff --git a/backends/vulkan/test/custom_ops/glsl/packed_int32_canvas_texture3d.glsl b/backends/vulkan/test/custom_ops/glsl/packed_int32_canvas_texture3d.glsl deleted file mode 100644 index be6717efdaa..00000000000 --- a/backends/vulkan/test/custom_ops/glsl/packed_int32_canvas_texture3d.glsl +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -${define_active_storage_type("texture2d")} - -#extension GL_EXT_debug_printf : enable - -layout(std430) buffer; - -${layout_declare_tensor(B, "w", "t_out", "int", "texture3d")} -${layout_declare_tensor(B, "r", "nchw_in", "uint", "buffer")} - -${layout_declare_ubo(B, "ivec3", "out_limits")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 lpos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(lpos, out_limits))) { - return; - } - - // Pack four 8-bit values equal to 1 into a single uint - int packed = (1 << 0) | (1 << 8) | (1 << 16) | (1 << 24); - - debugPrintfEXT( - "t_out[%i, %i] = %i\\n", - lpos.x, lpos.y, - packed); - - - // Placeholder: just copy input to output - ivec4 in_texel = ivec4(packed); - imageStore(t_out, lpos, in_texel); -} diff --git a/backends/vulkan/test/custom_ops/impl/AddPrototype.cpp b/backends/vulkan/test/custom_ops/impl/AddPrototype.cpp deleted file mode 100644 index dc35153baf0..00000000000 --- a/backends/vulkan/test/custom_ops/impl/AddPrototype.cpp +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
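The two canvas shaders above build a 32-bit word from four 8-bit values of 1 with the pattern (v0 << 0) | (v1 << 8) | (v2 << 16) | (v3 << 24). A small standalone sketch of that packing and the matching unpack, with illustrative values:

```
#include <cstdint>
#include <cstdio>

// Pack four 8-bit values into one 32-bit word, least significant byte first,
// matching the shift-and-or pattern used by the canvas shaders above.
uint32_t pack4xint8(int8_t v0, int8_t v1, int8_t v2, int8_t v3) {
  return (uint32_t(uint8_t(v0)) << 0) | (uint32_t(uint8_t(v1)) << 8) |
      (uint32_t(uint8_t(v2)) << 16) | (uint32_t(uint8_t(v3)) << 24);
}

// Recover lane i (0..3) as a signed 8-bit value.
int8_t unpack_lane(uint32_t packed, int lane) {
  return int8_t((packed >> (8 * lane)) & 0xFF);
}

int main() {
  uint32_t packed = pack4xint8(1, 1, 1, 1);
  std::printf("packed = 0x%08X\n", packed);  // 0x01010101, as in the shaders
  for (int lane = 0; lane < 4; ++lane) {
    std::printf("lane %d = %d\n", lane, int(unpack_lane(packed, lane)));
  }
  return 0;
}
```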
- */ - -#include - -#include -#include - -namespace vkcompute { - -// Shader selection function for add operations -vkapi::ShaderInfo pick_add_shader( - ComputeGraph* graph, - const std::vector& args, - const std::vector& resize_args) { - (void)resize_args; - const ValueRef out = args.at(0).refs.at(0); - const ValueRef in1 = args.at(1).refs.at(0); - - // Build shader name following the binary_op pattern - std::string kernel_name = "add"; - add_storage_type_suffix(kernel_name, graph->storage_type_of(out)); - add_dtype_suffix(kernel_name, graph->dtype_of(in1)); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -// Global workgroup size function for add operations -utils::uvec3 add_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - return default_pick_global_wg_size(graph, shader, args, resize_args); -} - -// Local workgroup size function for add operations -utils::uvec3 add_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - return default_pick_local_wg_size( - graph, shader, global_workgroup_size, args, resize_args); -} - -void add_prototype(ComputeGraph& graph, const std::vector& args) { - int idx = 0; - const ValueRef input_a = args.at(idx++); - const ValueRef input_b = args.at(idx++); - const ValueRef output = args.at(idx++); - - // Prepare parameter buffers (empty for add operation) - vkapi::ParamsBindList param_buffers; - - // Prepare push constants based on storage type - std::vector push_constants; - push_constants.reserve(graph.is_buffer_storage(output) ? 1 : 1); - - if (graph.is_buffer_storage(output)) { - // Buffer storage: pass numel as push constant - push_constants.emplace_back(graph.numel_pc_of(output)); - } else { - // Texture storage: pass sizes as push constant - push_constants.emplace_back(graph.sizes_pc_of(output)); - } - - // Prepare specialization constants - vkapi::SpecVarList spec_vars; - if (graph.is_buffer_storage(output)) { - spec_vars = { - graph.hashed_layout_of(output), - graph.hashed_layout_of(input_a), - graph.hashed_layout_of(input_b)}; - } else { - spec_vars = {graph.hashed_layout_of(output)}; - } - - // Add the compute node - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - pick_add_shader, - add_global_wg_size, - add_local_wg_size, - // Inputs and Outputs - {{output, vkapi::kWrite}, {{input_a, input_b}, vkapi::kRead}}, - // Shader params buffers - param_buffers, - // Push Constants - push_constants, - // Specialization Constants - spec_vars, - // Resize args - {}, - // Resizing Logic - nullptr)); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(etvk.add_prototype, add_prototype); -} - -} // namespace vkcompute diff --git a/backends/vulkan/test/custom_ops/q4gsw_linear.cpp b/backends/vulkan/test/custom_ops/q4gsw_linear.cpp deleted file mode 100644 index 805b67c30a2..00000000000 --- a/backends/vulkan/test/custom_ops/q4gsw_linear.cpp +++ /dev/null @@ -1,373 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// All rights reserved. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
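pick_add_shader in AddPrototype.cpp above resolves the compute kernel purely by string composition: the base name "add" gets a storage suffix and a dtype suffix appended, which must line up with the variant names generated from add.yaml (add_buffer and add_texture3d, each produced for half and float). The sketch below illustrates that naming convention; the exact suffix strings are assumptions for illustration rather than the helpers' actual output.

```
#include <iostream>
#include <string>

enum class Storage { Buffer, Texture3D };
enum class Dtype { Float, Half };

// Assumed suffixes, mirroring the add.yaml variant names and the
// add_storage_type_suffix / add_dtype_suffix helpers used above.
std::string pick_add_shader_name(Storage storage, Dtype dtype) {
  std::string name = "add";
  name += (storage == Storage::Buffer) ? "_buffer" : "_texture3d";
  name += (dtype == Dtype::Float) ? "_float" : "_half";
  return name;
}

int main() {
  // Expected under the assumptions above: add_texture3d_half
  std::cout << pick_add_shader_name(Storage::Texture3D, Dtype::Half) << "\n";
  return 0;
}
```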
- -#include -#include -#include -#include -#include "utils.h" - -#include - -using namespace executorch::vulkan::prototyping; - -using namespace vkcompute; - -static constexpr int64_t kRefDimSizeLimit = 300; - -// Linear configuration struct -struct LinearConfig { - int64_t M; // Batch size / number of rows in input - int64_t K; // Input features / columns in input, rows in weight - int64_t N; // Output features / columns in weight - int64_t group_size; // Number of input channels per quantization group - bool has_bias = false; - std::string test_case_name = "placeholder"; - std::string op_name = "linear_q4gsw"; -}; - -// Helper function to unpack 4-bit values from uint8 -std::pair unpack_4bit(uint8_t packed) { - // Extract lower 4 bits and upper 4 bits - int8_t lower = packed & 0x0F; - int8_t upper = (packed >> 4) & 0x0F; - - // Subtract 8 from unpacked 4-bit values - lower -= 8; - upper -= 8; - - return std::make_pair(lower, upper); -} - -// Utility function to create a test case from a LinearConfig -TestCase create_test_case_from_config( - const LinearConfig& config, - utils::StorageType storage_type, - vkapi::ScalarType input_dtype) { - TestCase test_case; - - // Create a descriptive name for the test case - std::string storage_str = - (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; - std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half"; - - std::string test_name = - config.test_case_name + "_" + storage_str + "_" + dtype_str; - test_case.set_name(test_name); - - // Set the operator name for the test case - std::string operator_name = "et_vk." + config.op_name + ".default"; - test_case.set_operator_name(operator_name); - - // Derive sizes from M, K, N - std::vector input_size = {config.M, config.K}; - // Input tensor (float/half) - [M, K] - ValueSpec input_tensor( - input_size, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::RANDOM); - - if (debugging()) { - print_valuespec_data(input_tensor, "input_tensor"); - } - - // For 4-bit weights, packed size is [N, K/2] since 2 weights per byte - std::vector weight_size = {config.N, config.K / 2}; - // Quantized weight tensor (uint8, packed 4-bit) - [N, K/2] - ValueSpec quantized_weight( - weight_size, - vkapi::kByte, // uint8 for packed 4-bit quantized weights - storage_type, - utils::kWidthPacked, - DataGenType::RANDINT4); - quantized_weight.set_constant(true); - quantized_weight.set_int4(true); - - if (debugging()) { - print_valuespec_data(quantized_weight, "weight_tensor"); - } - - // Weight quantization scales (float/half, per-group) - // For group symmetric quantization: [K/group_size, N] - // Each group of input features has scales for all output features - std::vector weight_scales_size = { - config.K / config.group_size, config.N}; - ValueSpec weight_scales( - weight_scales_size, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::RANDOM_SCALES); - weight_scales.set_constant(true); - - // Group size parameter - ValueSpec group_size_spec(static_cast(config.group_size)); - - // Bias (optional, float/half) - [N] - ValueSpec bias( - {config.N}, // Per output feature - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - bias.set_constant(true); - if (!config.has_bias) { - bias.set_none(true); - } - - // Output tensor (float/half) - [M, N] - ValueSpec output( - {config.M, config.N}, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - - // Add all specs to test case for linear_q4gsw - 
test_case.add_input_spec(input_tensor); - test_case.add_input_spec(quantized_weight); - test_case.add_input_spec(weight_scales); - test_case.add_input_spec(group_size_spec); - test_case.add_input_spec(bias); - test_case.add_output_spec(output); - - return test_case; -} - -// Generate easy test cases for quantized linear operation (for debugging) -std::vector generate_quantized_linear_easy_cases() { - std::vector test_cases; - - // Single simple configuration for debugging - int M = 4; - int K = 32; - int N = 16; - int group_size = 8; - - LinearConfig config = { - M, // Batch size - K, // Input features - N, // Output features - group_size, // Group size - true, // has_bias - "simple", // test_case_name - }; - - // Test with both storage types and data types for completeness - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - std::vector float_types = {vkapi::kFloat}; - - // Generate test cases for each combination - for (const auto& storage_type : storage_types) { - for (const auto& input_dtype : float_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, input_dtype)); - } - } - - return test_cases; -} - -// Generate test cases for quantized linear operation -std::vector generate_quantized_linear_test_cases() { - std::vector test_cases; - - std::vector configs = { - // Gemv test cases - {1, 128, 64, 32}, - {1, 256, 128, 64}, - // Gemm - {4, 64, 32, 16}, - {4, 128, 64, 32}, - {4, 256, 128, 64}, - {32, 64, 32, 16}, - {32, 128, 64, 32}, - {32, 256, 128, 64}, - // No bias tests - {32, 128, 64, 32, false}, - {32, 256, 128, 64, false}, - // Performance test cases - {1, 2048, 2048, 128}, - {128, 2048, 2048, 128}, - {256, 2048, 2048, 128}, - {1024, 2048, 2048, 128}, - }; - - // Test with different storage types and data types - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - - for (auto config : configs) { - std::string prefix = - (config.M < kRefDimSizeLimit && config.K < kRefDimSizeLimit && - config.N < kRefDimSizeLimit) - ? 
"correctness_" - : "performance_"; - std::string generated_test_case_name = prefix + std::to_string(config.M) + - "_" + std::to_string(config.K) + "_" + std::to_string(config.N) + "_g" + - std::to_string(config.group_size); - if (!config.has_bias) { - generated_test_case_name += "_no_bias"; - } - - config.test_case_name = generated_test_case_name; - - for (const auto& storage_type : storage_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, vkapi::kFloat)); - } - } - - return test_cases; -} - -// Reference implementation for 4-bit group symmetric weight quantized linear -void linear_q4gsw_reference_impl(TestCase& test_case) { - int32_t idx = 0; - const ValueSpec& input_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_scales_spec = test_case.inputs()[idx++]; - const ValueSpec& group_size_spec = test_case.inputs()[idx++]; - const ValueSpec& bias_spec = test_case.inputs()[idx++]; - - // Extract output specification (mutable reference) - ValueSpec& output_spec = test_case.outputs()[0]; - - // Get tensor dimensions - auto input_sizes = input_spec.get_tensor_sizes(); // [batch_size, in_features] - auto weight_sizes = - weight_spec.get_tensor_sizes(); // [in_features, out_features/2] - auto output_sizes = - output_spec.get_tensor_sizes(); // [batch_size, out_features] - - int64_t batch_size = input_sizes[0]; - int64_t in_features = input_sizes[1]; - int64_t out_features = output_sizes[1]; - int64_t group_size = group_size_spec.get_int_value(); - - // Skip for large tensors since computation time will be extremely slow - if (batch_size > kRefDimSizeLimit || in_features > kRefDimSizeLimit || - out_features > kRefDimSizeLimit) { - throw std::invalid_argument( - "One or more dimensions (batch_size, in_features, out_features) exceed the allowed limit for reference implementation."); - } - - if (input_spec.dtype != vkapi::kFloat) { - throw std::invalid_argument("Unsupported dtype"); - } - - // Get raw data pointers - auto& input_data = input_spec.get_float_data(); - auto& weight_data = weight_spec.get_uint8_data(); - auto& weight_scales_data = weight_scales_spec.get_float_data(); - auto& bias_data = bias_spec.get_float_data(); - - // Calculate number of output elements - int64_t num_output_elements = batch_size * out_features; - - auto& ref_data = output_spec.get_ref_float_data(); - ref_data.resize(num_output_elements); - - // Perform quantized linear transformation (matrix multiplication) - for (int64_t b = 0; b < batch_size; ++b) { - for (int64_t out_f = 0; out_f < out_features; ++out_f) { - float sum = 0.0f; - - // Matrix multiplication: output[b][out_f] = sum(input[b][in_f] * - // weight[out_f][in_f]) - for (int64_t in_f = 0; in_f < in_features; ++in_f) { - // Get input value - int64_t input_idx = b * in_features + in_f; - float input_val = input_data[input_idx]; - - // Get weight value and dequantize (4-bit group symmetric quantization) - int64_t group_idx = in_f / group_size; - int64_t scales_idx = group_idx * out_features + out_f; - - // Get packed weight value - weight matrix is [N, K/2] - int64_t weight_idx = (out_f) * (in_features / 2) + (in_f / 2); - uint8_t packed_weight = weight_data[weight_idx]; - - // Unpack 4-bit weight - auto unpacked = unpack_4bit(packed_weight); - int8_t weight_4bit = (in_f % 2 == 0) ? 
unpacked.first : unpacked.second; - - // Dequantize weight using group symmetric quantization (no zero point) - float weight_scale = weight_scales_data[scales_idx]; - float dequant_weight = static_cast(weight_4bit) * weight_scale; - - sum += input_val * dequant_weight; - } - - // Add bias and store result - if (!bias_spec.is_none()) { - sum += bias_data[out_f]; - } - int64_t output_idx = b * out_features + out_f; - ref_data[output_idx] = sum; - } - } -} - -void reference_impl(TestCase& test_case) { - linear_q4gsw_reference_impl(test_case); -} - -int64_t quantized_linear_flop_calculator(const TestCase& test_case) { - // Get input and weight dimensions - const auto& input_sizes = test_case.inputs()[0].get_tensor_sizes(); - const auto& output_sizes = test_case.outputs()[0].get_tensor_sizes(); - - int64_t batch_size = input_sizes[0]; - int64_t in_features = input_sizes[1]; - int64_t out_features = output_sizes[1]; - - // Calculate FLOPs for quantized linear operation - // Each output element requires: - // - in_features multiply-accumulate operations - // - Additional operations for quantization/dequantization - int64_t output_elements = batch_size * out_features; - int64_t ops_per_output = in_features; - - // Add quantization overhead (approximate) - // - Unpack 4-bit weight: 1 op per weight element used - // - Dequantize weight: 1 op per weight element used - // - Add bias: 1 op per output element - int64_t quantization_ops = ops_per_output * 2 + 1; // Simplified estimate - - int64_t flop = output_elements * (ops_per_output + quantization_ops); - - return flop; -} - -int main(int argc, char* argv[]) { - set_debugging(false); - set_print_output(false); - set_print_latencies(false); - set_use_gpu_timestamps(true); - - print_performance_header(); - std::cout - << "4-bit Group Symmetric Weight Quantized Linear Operation Prototyping Framework" - << std::endl; - print_separator(); - - ReferenceComputeFunc ref_fn = reference_impl; - - // Execute easy test cases using the new framework with custom FLOP calculator - auto results = execute_test_cases( - generate_quantized_linear_test_cases, - quantized_linear_flop_calculator, - "QuantizedLinearQ4GSW", - 0, - 10, - ref_fn); - - return 0; -} diff --git a/backends/vulkan/test/custom_ops/q8csw_conv2d.cpp b/backends/vulkan/test/custom_ops/q8csw_conv2d.cpp deleted file mode 100644 index d566e5b2646..00000000000 --- a/backends/vulkan/test/custom_ops/q8csw_conv2d.cpp +++ /dev/null @@ -1,785 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// All rights reserved. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
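Two details carry the q4gsw_linear reference above: each packed weight byte holds two 4-bit values stored with a +8 offset (the low nibble is used for even in_f, the high nibble for odd in_f), and dequantization is symmetric, so a weight is simply the decoded value times the scale of its (in_f / group_size, out_f) entry. A self-contained sketch of that arithmetic with made-up values:

```
#include <cstdint>
#include <cstdio>
#include <utility>

// Split a packed byte into two signed 4-bit weights. Values are stored with a
// +8 offset, so the decoded range is [-8, 7]; low nibble first.
std::pair<int8_t, int8_t> unpack_4bit(uint8_t packed) {
  int8_t lower = int8_t(packed & 0x0F) - 8;
  int8_t upper = int8_t((packed >> 4) & 0x0F) - 8;
  return {lower, upper};
}

int main() {
  // 0xA3: low nibble 0x3 -> 3 - 8 = -5, high nibble 0xA -> 10 - 8 = 2.
  auto [w0, w1] = unpack_4bit(0xA3);

  // Group-symmetric dequantization: one scale per (K/group_size, N) entry and
  // no zero point. The scale here is just an illustrative constant.
  float scale = 0.05f;
  std::printf("w0 = %d -> %f\n", int(w0), w0 * scale);
  std::printf("w1 = %d -> %f\n", int(w1), w1 * scale);
  return 0;
}
```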
- -#include -#include -#include -#include -#include "utils.h" - -#include - -using namespace executorch::vulkan::prototyping; - -using namespace vkcompute; - -static constexpr int64_t kRefDimSizeLimit = 100; - -// Component structs for better readability -struct KernelSize { - int32_t h; - int32_t w; - - KernelSize(int32_t height, int32_t width) : h(height), w(width) {} -}; - -struct Stride { - int32_t h; - int32_t w; - - Stride(int32_t height, int32_t width) : h(height), w(width) {} -}; - -struct Padding { - int32_t h; - int32_t w; - - Padding(int32_t height, int32_t width) : h(height), w(width) {} -}; - -struct Dilation { - int32_t h; - int32_t w; - - Dilation(int32_t height = 1, int32_t width = 1) : h(height), w(width) {} -}; - -struct OutInChannels { - int32_t out; - int32_t in; - - OutInChannels(int32_t out_channels, int32_t in_channels) - : out(out_channels), in(in_channels) {} -}; - -struct InputSize2D { - int32_t h; - int32_t w; - - InputSize2D(int32_t height, int32_t width) : h(height), w(width) {} -}; - -// Conv2d configuration struct -struct Conv2dConfig { - OutInChannels channels; - InputSize2D input_size; - KernelSize kernel; - Stride stride; - Padding padding; - Dilation dilation; - int32_t groups; // Number of groups for grouped convolution - std::string test_case_name = "placeholder"; - std::string op_name = "conv2d_q8ta_q8csw"; - - // Calculate output dimensions - int64_t get_output_height() const { - return (input_size.h + 2 * padding.h - dilation.h * (kernel.h - 1) - 1) / - stride.h + - 1; - } - - int64_t get_output_width() const { - return (input_size.w + 2 * padding.w - dilation.w * (kernel.w - 1) - 1) / - stride.w + - 1; - } -}; - -// Utility function to create a test case from a Conv2dConfig -TestCase create_test_case_from_config( - const Conv2dConfig& config, - utils::StorageType storage_type, - vkapi::ScalarType input_dtype) { - TestCase test_case; - - // Create a descriptive name for the test case - std::string storage_str = - (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; - std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half"; - - std::string test_name = - config.test_case_name + "_" + storage_str + "_" + dtype_str; - test_case.set_name(test_name); - - // Set the operator name for the test case - std::string operator_name = "et_vk." 
+ config.op_name + ".default"; - test_case.set_operator_name(operator_name); - - // Calculate output dimensions - int64_t H_out = config.get_output_height(); - int64_t W_out = config.get_output_width(); - - // Input tensor (float/half) - [1, C_in, H_in, W_in] (batch size always 1) - std::vector input_size = { - 1, config.channels.in, config.input_size.h, config.input_size.w}; - - ValueSpec input_tensor( - input_size, - input_dtype, - storage_type, - utils::kChannelsPacked, - DataGenType::RANDOM); - - if (debugging()) { - print_valuespec_data(input_tensor, "input_tensor"); - } - - float input_scale_val = 0.07f; - ValueSpec input_scale(input_scale_val); - - int32_t input_zero_point_val = -3; - ValueSpec input_zero_point(input_zero_point_val); - - // Quantized weight tensor (int8) - [C_out, C_in_per_group * K_h * K_w] - // Memory layout: height, width, then channels - in_c is innermost (stride 1) - // in the second dimension - const int64_t in_channels_per_group = config.channels.in / config.groups; - const int64_t in_features = utils::align_up_4( - in_channels_per_group * config.kernel.h * config.kernel.w); - std::vector weight_size = {config.channels.out, in_features}; - ValueSpec quantized_weight( - weight_size, - vkapi::kChar, // int8 for quantized weights - storage_type, - utils::kWidthPacked, - DataGenType::RANDINT8); - quantized_weight.set_constant(true); - - if (debugging()) { - print_valuespec_data(quantized_weight, "weight_tensor"); - } - - // Weight quantization scales (float/half, per-channel) - ValueSpec weight_scales( - {config.channels.out}, // Per output channel - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::RANDOM_SCALES); - weight_scales.set_constant(true); - - ValueSpec weight_sums( - {config.channels.out}, // Per output channel - vkapi::kInt, - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - weight_sums.set_constant(true); - - // Compute weight_sums data based on quantized weights - compute_weight_sums( - weight_sums, quantized_weight, config.channels.out, in_features); - - // Bias (optional, float/half) - [C_out] - ValueSpec bias( - {config.channels.out}, // Per output channel - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::RANDOM); - bias.set_constant(true); - - // Stride and padding parameters - ValueSpec stride({config.stride.h, config.stride.w}); - ValueSpec padding({config.padding.h, config.padding.w}); - - // Dilation and groups parameters - ValueSpec dilation({config.dilation.h, config.dilation.w}); - ValueSpec groups(config.groups); - - // Kernel size parameters - ValueSpec kernel_size({config.kernel.h, config.kernel.w}); - - // Output tensor (float/half) - [1, C_out, H_out, W_out] (batch size always 1) - ValueSpec output( - {1, config.channels.out, H_out, W_out}, - input_dtype, - storage_type, - utils::kChannelsPacked, - DataGenType::ZEROS); - - // Add all specs to test case - if (config.op_name.find("q8ta") != std::string::npos) { - test_case.add_input_spec(input_tensor); - test_case.add_input_spec(input_scale); - test_case.add_input_spec(input_zero_point); - test_case.add_input_spec(quantized_weight); - test_case.add_input_spec(weight_sums); - test_case.add_input_spec(weight_scales); - test_case.add_input_spec(bias); - test_case.add_input_spec(kernel_size); - test_case.add_input_spec(stride); - test_case.add_input_spec(padding); - test_case.add_input_spec(dilation); - test_case.add_input_spec(groups); - } else { - test_case.add_input_spec(input_tensor); - test_case.add_input_spec(quantized_weight); - 
test_case.add_input_spec(weight_scales); - test_case.add_input_spec(bias); - test_case.add_input_spec(kernel_size); - test_case.add_input_spec(stride); - test_case.add_input_spec(padding); - test_case.add_input_spec(dilation); - test_case.add_input_spec(groups); - } - - test_case.add_output_spec(output); - - return test_case; -} - -// Generate easy test cases for quantized conv2d operation (for debugging) -std::vector generate_quantized_conv2d_easy_cases() { - std::vector test_cases; - - // Single simple configuration for debugging - Conv2dConfig config = { - OutInChannels(8, 3), // channels (out, in) - InputSize2D(8, 8), // input_size (h, w) - KernelSize(3, 3), // kernel - Stride(1, 1), // stride - Padding(0, 0), // padding - Dilation(1, 1), // dilation - 1, // groups - }; - - // Test with both storage types and data types for completeness - std::vector storage_types = {utils::kTexture3D}; - std::vector float_types = {vkapi::kFloat}; - - // Generate test cases for each combination - for (const auto& storage_type : storage_types) { - for (const auto& input_dtype : float_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, input_dtype)); - } - } - - return test_cases; -} - -// Generate test cases for quantized conv2d operation -std::vector generate_quantized_conv2d_test_cases() { - std::vector test_cases; - - std::vector configs = { - {OutInChannels(32, 3), - InputSize2D(64, 64), - KernelSize(3, 3), - Stride(2, 2), - Padding(1, 1), - Dilation(1, 1), - 1}, - {OutInChannels(32, 16), - InputSize2D(32, 32), - KernelSize(3, 3), - Stride(1, 1), - Padding(1, 1), - Dilation(1, 1), - 1}, - {OutInChannels(64, 32), - InputSize2D(16, 16), - KernelSize(3, 3), - Stride(1, 1), - Padding(1, 1), - Dilation(1, 1), - 1}, - // One output channel case - {OutInChannels(1, 32), - InputSize2D(55, 55), - KernelSize(3, 3), - Stride(1, 1), - Padding(1, 1), - Dilation(1, 1), - 1}, - - // Stride 2 convolutions - {OutInChannels(32, 3), - InputSize2D(64, 64), - KernelSize(3, 3), - Stride(2, 2), - Padding(1, 1), - Dilation(1, 1), - 1}, - {OutInChannels(64, 32), - InputSize2D(32, 32), - KernelSize(3, 3), - Stride(2, 2), - Padding(1, 1), - Dilation(1, 1), - 1}, - // Different kernel sizes - {OutInChannels(32, 16), - InputSize2D(28, 28), - KernelSize(5, 5), - Stride(1, 1), - Padding(2, 2), - Dilation(1, 1), - 1}, - {OutInChannels(64, 32), - InputSize2D(14, 14), - KernelSize(7, 7), - Stride(1, 1), - Padding(3, 3), - Dilation(1, 1), - 1}, - - // Dilated convolutions - {OutInChannels(32, 16), - InputSize2D(32, 32), - KernelSize(3, 3), - Stride(1, 1), - Padding(2, 2), - Dilation(2, 2), - 1}, - {OutInChannels(64, 32), - InputSize2D(16, 16), - KernelSize(3, 3), - Stride(1, 1), - Padding(3, 3), - Dilation(3, 3), - 1}, - - // Grouped convolutions - {OutInChannels(32, 32), - InputSize2D(32, 32), - KernelSize(3, 3), - Stride(1, 1), - Padding(1, 1), - Dilation(1, 1), - 4}, - {OutInChannels(64, 64), - InputSize2D(16, 16), - KernelSize(3, 3), - Stride(1, 1), - Padding(1, 1), - Dilation(1, 1), - 8}, - // Performance test cases - {OutInChannels(256, 128), - InputSize2D(128, 128), - KernelSize(1, 1), - Stride(1, 1), - Padding(1, 1), - Dilation(1, 1), - 8}, - {OutInChannels(128, 64), - InputSize2D(128, 128), - KernelSize(3, 3), - Stride(1, 1), - Padding(1, 1), - Dilation(1, 1), - 1}}; - - // Test with different storage types and data types - std::vector storage_types = {utils::kTexture3D}; - - // Generate test cases for each combination - for (auto& config : configs) { - for (const auto& storage_type : 
storage_types) { - // Generate test case name programmatically - bool is_performance = config.channels.out > kRefDimSizeLimit || - config.channels.in > kRefDimSizeLimit || - config.input_size.h > kRefDimSizeLimit || - config.input_size.w > kRefDimSizeLimit; - std::string prefix = is_performance ? "performance_" : "correctness_"; - std::string suffix = std::to_string(config.channels.out) + "/" + - std::to_string(config.channels.in) + "_" + - std::to_string(config.input_size.h) + "/" + - std::to_string(config.input_size.w) + "_" + - std::to_string(config.kernel.h) + "/" + - std::to_string(config.kernel.w); - - config.test_case_name = prefix + suffix; - // The default operator tested is activation + weight quantized conv2d; - // however, only test this if the int8 dot product extension is supported - if (vkcompute::api::context() - ->adapter_ptr() - ->supports_int8_dot_product()) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, vkapi::kFloat)); - } - - Conv2dConfig wo_quant_config = config; - wo_quant_config.op_name = "conv2d_q8csw"; - test_cases.push_back(create_test_case_from_config( - wo_quant_config, storage_type, vkapi::kFloat)); - } - } - - return test_cases; -} - -// Reference implementation for weight only quantized conv2d (fp accumulation) -void conv2d_q8csw_reference_impl(TestCase& test_case) { - int32_t idx = 0; - const ValueSpec& input_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_scales_spec = test_case.inputs()[idx++]; - const ValueSpec& bias_spec = test_case.inputs()[idx++]; - const ValueSpec& kernel_size_spec = test_case.inputs()[idx++]; - const ValueSpec& stride_spec = test_case.inputs()[idx++]; - const ValueSpec& padding_spec = test_case.inputs()[idx++]; - const ValueSpec& dilation_spec = test_case.inputs()[idx++]; - const ValueSpec& groups_spec = test_case.inputs()[idx++]; - - // Extract output specification (mutable reference) - ValueSpec& output_spec = test_case.outputs()[0]; - - // Get tensor dimensions - auto input_sizes = input_spec.get_tensor_sizes(); // [N, C_in, H_in, W_in] - auto weight_sizes = - weight_spec.get_tensor_sizes(); // [C_out, C_in_per_group * K_h * K_w] - auto output_sizes = - output_spec.get_tensor_sizes(); // [N, C_out, H_out, W_out] - - int64_t N = input_sizes[0]; - int64_t C_in = input_sizes[1]; - int64_t H_in = input_sizes[2]; - int64_t W_in = input_sizes[3]; - int64_t C_out = output_sizes[1]; - int64_t H_out = output_sizes[2]; - int64_t W_out = output_sizes[3]; - - // Get kernel dimensions from kernel_size ValueSpec - auto kernel_size_data = kernel_size_spec.get_int32_data(); - int64_t K_h = kernel_size_data[0]; - int64_t K_w = kernel_size_data[1]; - - // Get stride, padding, dilation, and groups - auto stride_data = stride_spec.get_int32_data(); - auto padding_data = padding_spec.get_int32_data(); - auto dilation_data = dilation_spec.get_int32_data(); - int64_t stride_h = stride_data[0]; - int64_t stride_w = stride_data[1]; - int64_t pad_h = padding_data[0]; - int64_t pad_w = padding_data[1]; - int64_t dilation_h = dilation_data[0]; - int64_t dilation_w = dilation_data[1]; - int64_t groups = groups_spec.get_int_value(); - - // Skip for large tensors since computation time will be extremely slow - if (N > kRefDimSizeLimit || C_in > kRefDimSizeLimit || - H_in > kRefDimSizeLimit || W_in > kRefDimSizeLimit || - C_out > kRefDimSizeLimit) { - throw std::invalid_argument( - "One or more dimensions exceed the allowed limit for reference 
implementation."); - } - - if (input_spec.dtype != vkapi::kFloat) { - throw std::invalid_argument("Unsupported dtype"); - } - - // Get raw data pointers - auto& input_data = input_spec.get_float_data(); - auto& weight_data = weight_spec.get_int8_data(); - auto& weight_scales_data = weight_scales_spec.get_float_data(); - auto& bias_data = bias_spec.get_float_data(); - - // Calculate channels per group for grouped convolution - int64_t C_in_per_group = C_in / groups; - int64_t C_out_per_group = C_out / groups; - - // Calculate number of output elements - int64_t num_output_elements = N * C_out * H_out * W_out; - - auto& ref_data = output_spec.get_ref_float_data(); - ref_data.resize(num_output_elements); - - const int in_features = utils::align_up_4(C_in_per_group * K_h * K_w); - - // Perform weight-only quantized conv2d operation (fp accumulation) - for (int64_t n = 0; n < N; ++n) { - for (int64_t out_c = 0; out_c < C_out; ++out_c) { - for (int64_t out_h = 0; out_h < H_out; ++out_h) { - for (int64_t out_w = 0; out_w < W_out; ++out_w) { - float sum = 0.0f; - - // Determine which group this output channel belongs to - int64_t group_idx = out_c / C_out_per_group; - int64_t in_c_start = group_idx * C_in_per_group; - int64_t in_c_end = (group_idx + 1) * C_in_per_group; - - // Convolution operation with dilation support and grouped convolution - for (int64_t in_c = in_c_start; in_c < in_c_end; ++in_c) { - for (int64_t kh = 0; kh < K_h; ++kh) { - for (int64_t kw = 0; kw < K_w; ++kw) { - // Calculate input position with dilation - int64_t in_h = out_h * stride_h - pad_h + kh * dilation_h; - int64_t in_w = out_w * stride_w - pad_w + kw * dilation_w; - - // Check bounds (zero padding) - if (in_h >= 0 && in_h < H_in && in_w >= 0 && in_w < W_in) { - // Get input value (keep as float) - int64_t input_idx = n * (C_in * H_in * W_in) + - in_c * (H_in * W_in) + in_h * W_in + in_w; - float input_val = input_data[input_idx]; - - // Get weight value and dequantize - // Weight layout: [C_out, C_in_per_group * K_h * K_w] - int64_t weight_idx = out_c * in_features + - (kh * (K_w * C_in_per_group) + kw * C_in_per_group + - (in_c % C_in_per_group)); - float dequant_weight = - (static_cast(weight_data[weight_idx])) * - weight_scales_data[out_c]; - - sum += input_val * dequant_weight; - } - } - } - } - - // Add bias and store result - sum += bias_data[out_c]; - int64_t output_idx = n * (C_out * H_out * W_out) + - out_c * (H_out * W_out) + out_h * W_out + out_w; - ref_data[output_idx] = sum; - } - } - } - } -} - -// Reference implementation for activation and weight quantized conv2d (int -// accumulation) -void conv2d_q8ta_q8csw_reference_impl(TestCase& test_case) { - // Extract input specifications - int32_t idx = 0; - const ValueSpec& input_spec = test_case.inputs()[idx++]; - const ValueSpec& input_scale_spec = test_case.inputs()[idx++]; - const ValueSpec& input_zeros_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_sums_spec = test_case.inputs()[idx++]; - (void)weight_sums_spec; - const ValueSpec& weight_scales_spec = test_case.inputs()[idx++]; - const ValueSpec& bias_spec = test_case.inputs()[idx++]; - const ValueSpec& kernel_size_spec = test_case.inputs()[idx++]; - const ValueSpec& stride_spec = test_case.inputs()[idx++]; - const ValueSpec& padding_spec = test_case.inputs()[idx++]; - const ValueSpec& dilation_spec = test_case.inputs()[idx++]; - const ValueSpec& groups_spec = test_case.inputs()[idx++]; - - // Extract output specification 
(mutable reference) - ValueSpec& output_spec = test_case.outputs()[0]; - - // Get tensor dimensions - auto input_sizes = input_spec.get_tensor_sizes(); // [N, C_in, H_in, W_in] - auto weight_sizes = - weight_spec.get_tensor_sizes(); // [C_out, C_in_per_group * K_h * K_w] - auto output_sizes = - output_spec.get_tensor_sizes(); // [N, C_out, H_out, W_out] - - int64_t N = input_sizes[0]; - int64_t C_in = input_sizes[1]; - int64_t H_in = input_sizes[2]; - int64_t W_in = input_sizes[3]; - int64_t C_out = output_sizes[1]; - int64_t H_out = output_sizes[2]; - int64_t W_out = output_sizes[3]; - - // Get kernel dimensions from kernel_size ValueSpec - auto kernel_size_data = kernel_size_spec.get_int32_data(); - int64_t K_h = kernel_size_data[0]; - int64_t K_w = kernel_size_data[1]; - - // Get stride, padding, dilation, and groups - auto stride_data = stride_spec.get_int32_data(); - auto padding_data = padding_spec.get_int32_data(); - auto dilation_data = dilation_spec.get_int32_data(); - int64_t stride_h = stride_data[0]; - int64_t stride_w = stride_data[1]; - int64_t pad_h = padding_data[0]; - int64_t pad_w = padding_data[1]; - int64_t dilation_h = dilation_data[0]; - int64_t dilation_w = dilation_data[1]; - int64_t groups = groups_spec.get_int_value(); - - // Skip for large tensors since computation time will be extremely slow - if (N > kRefDimSizeLimit || C_in > kRefDimSizeLimit || - H_in > kRefDimSizeLimit || W_in > kRefDimSizeLimit || - C_out > kRefDimSizeLimit) { - throw std::invalid_argument( - "One or more dimensions exceed the allowed limit for reference implementation."); - } - - if (input_spec.dtype != vkapi::kFloat) { - throw std::invalid_argument("Unsupported dtype"); - } - - // Get raw data pointers - auto& input_data = input_spec.get_float_data(); - const float input_scale = input_scale_spec.get_float_value(); - const int32_t input_zero_point = input_zeros_spec.get_int_value(); - - auto& weight_data = weight_spec.get_int8_data(); - auto& weight_scales_data = weight_scales_spec.get_float_data(); - auto& bias_data = bias_spec.get_float_data(); - - // Calculate channels per group for grouped convolution - int64_t C_in_per_group = C_in / groups; - int64_t C_out_per_group = C_out / groups; - - // Calculate number of output elements - int64_t num_output_elements = N * C_out * H_out * W_out; - - auto& ref_data = output_spec.get_ref_float_data(); - ref_data.resize(num_output_elements); - - const int in_features = utils::align_up_4(C_in_per_group * K_h * K_w); - - // Perform activation and weight quantized conv2d operation (int accumulation) - for (int64_t n = 0; n < N; ++n) { - for (int64_t out_c = 0; out_c < C_out; ++out_c) { - for (int64_t out_h = 0; out_h < H_out; ++out_h) { - for (int64_t out_w = 0; out_w < W_out; ++out_w) { - int32_t int_sum = 0; - int32_t weight_sum = 0; // Track weight sum on the fly - - // Determine which group this output channel belongs to - int64_t group_idx = out_c / C_out_per_group; - int64_t in_c_start = group_idx * C_in_per_group; - int64_t in_c_end = (group_idx + 1) * C_in_per_group; - - // Convolution operation with integer accumulation - for (int64_t in_c = in_c_start; in_c < in_c_end; ++in_c) { - for (int64_t kh = 0; kh < K_h; ++kh) { - for (int64_t kw = 0; kw < K_w; ++kw) { - // Calculate input position with dilation - int64_t in_h = out_h * stride_h - pad_h + kh * dilation_h; - int64_t in_w = out_w * stride_w - pad_w + kw * dilation_w; - - // Check bounds (zero padding) - if (in_h >= 0 && in_h < H_in && in_w >= 0 && in_w < W_in) { - // Get input value and 
quantize to int8 - int64_t input_idx = n * (C_in * H_in * W_in) + - in_c * (H_in * W_in) + in_h * W_in + in_w; - - float quant_input_f = - std::round(input_data[input_idx] / input_scale) + - input_zero_point; - quant_input_f = - std::min(std::max(quant_input_f, -128.0f), 127.0f); - int8_t quantized_input = static_cast(quant_input_f); - - // Get quantized weight (already int8) - // Weight layout: [C_out, C_in_per_group * K_h * K_w] - int64_t weight_idx = out_c * in_features + - (kh * (K_w * C_in_per_group) + kw * C_in_per_group + - (in_c % C_in_per_group)); - int8_t quantized_weight = weight_data[weight_idx]; - - // Integer multiplication and accumulation - int_sum += static_cast(quantized_input) * - static_cast(quantized_weight); - - // Track weight sum for this output channel on the fly - weight_sum += static_cast(quantized_weight); - } else { - // For zero padding, we still need to account for the weight - // in weight_sum when input is effectively 0 (but quantized 0 - // is input_zero_point) - int64_t weight_idx = out_c * in_features + - (kh * (K_w * C_in_per_group) + kw * C_in_per_group + - (in_c % C_in_per_group)); - int8_t quantized_weight = weight_data[weight_idx]; - - // Add contribution from zero-padded input (quantized zero = - // input_zero_point) - int_sum += static_cast(input_zero_point) * - static_cast(quantized_weight); - - // Track weight sum for this output channel on the fly - weight_sum += static_cast(quantized_weight); - } - } - } - } - - // Convert accumulated integer result to float and apply scales - // Final result = (int_sum - zero_point_correction) * input_scale * - // weight_scale + bias zero_point_correction = input_zero_point * - // sum_of_weights_for_this_output_channel - int32_t zero_point_correction = input_zero_point * weight_sum; - int32_t accum_adjusted = int_sum - zero_point_correction; - float float_result = - accum_adjusted * input_scale * weight_scales_data[out_c]; - - // Add bias and store result - float_result += bias_data[out_c]; - int64_t output_idx = n * (C_out * H_out * W_out) + - out_c * (H_out * W_out) + out_h * W_out + out_w; - ref_data[output_idx] = float_result; - } - } - } - } -} - -void reference_impl(TestCase& test_case) { - if (test_case.operator_name().find("q8ta") != std::string::npos) { - conv2d_q8ta_q8csw_reference_impl(test_case); - } else { - conv2d_q8csw_reference_impl(test_case); - } -} - -// Custom FLOP calculator for quantized conv2d operation -int64_t quantized_conv2d_flop_calculator(const TestCase& test_case) { - int kernel_idx = 4; - if (test_case.operator_name().find("q8ta") != std::string::npos) { - kernel_idx = 7; - } - // Get input and weight dimensions - const auto& input_sizes = test_case.inputs()[0].get_tensor_sizes(); - const auto& output_sizes = test_case.outputs()[0].get_tensor_sizes(); - - const auto& kernel_sizes = test_case.inputs()[kernel_idx].get_int32_data(); - - int64_t N = input_sizes[0]; - int64_t C_in = input_sizes[1]; - int64_t C_out = output_sizes[1]; - int64_t K_h = kernel_sizes[0]; - int64_t K_w = kernel_sizes[1]; - int64_t H_out = output_sizes[2]; - int64_t W_out = output_sizes[3]; - - // Calculate FLOPs for quantized conv2d operation - // Each output element requires: - // - C_in * K_h * K_w multiply-accumulate operations - // - Additional operations for quantization/dequantization - int64_t output_elements = N * C_out * H_out * W_out; - int64_t ops_per_output = C_in * K_h * K_w; - - int64_t flop = output_elements * (ops_per_output); - - return flop; -} - -int main(int argc, char* argv[]) { - 
set_debugging(false); - set_print_output(false); - set_print_latencies(false); - set_use_gpu_timestamps(true); - - print_performance_header(); - std::cout << "Quantized Conv2d Operation Prototyping Framework" << std::endl; - print_separator(); - - ReferenceComputeFunc ref_fn = reference_impl; - - // Execute test cases using the new framework with custom FLOP calculator - auto results = execute_test_cases( - generate_quantized_conv2d_test_cases, - quantized_conv2d_flop_calculator, - "QuantizedConv2d", - 0, - 10, - ref_fn); - - return 0; -} diff --git a/backends/vulkan/test/custom_ops/q8csw_linear.cpp b/backends/vulkan/test/custom_ops/q8csw_linear.cpp deleted file mode 100644 index 23973426fcc..00000000000 --- a/backends/vulkan/test/custom_ops/q8csw_linear.cpp +++ /dev/null @@ -1,479 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// All rights reserved. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include "utils.h" - -#include - -using namespace executorch::vulkan::prototyping; - -using namespace vkcompute; - -static constexpr int64_t kRefDimSizeLimit = 300; - -// Linear configuration struct -struct LinearConfig { - int64_t M; // Batch size / number of rows in input - int64_t K; // Input features / columns in input, rows in weight - int64_t N; // Output features / columns in weight - bool has_bias = true; - std::string test_case_name = "placeholder"; - std::string op_name = "linear_q8ta_q8csw"; -}; - -// Utility function to create a test case from a LinearConfig -TestCase create_test_case_from_config( - const LinearConfig& config, - utils::StorageType storage_type, - vkapi::ScalarType input_dtype) { - TestCase test_case; - - // Create a descriptive name for the test case - std::string storage_str = - (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; - std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half"; - - std::string test_name = - config.test_case_name + "_" + storage_str + "_" + dtype_str; - test_case.set_name(test_name); - - // Set the operator name for the test case - std::string operator_name = "et_vk." 
+ config.op_name + ".default"; - test_case.set_operator_name(operator_name); - - // Derive sizes from M, K, N - std::vector input_size = {config.M, config.K}; - std::vector weight_size = {config.N, config.K}; - - // Input tensor (float/half) - [M, K] - ValueSpec input_tensor( - input_size, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::RANDOM); - - if (debugging()) { - print_valuespec_data(input_tensor, "input_tensor"); - } - - float input_scale_val = 0.008f; - ValueSpec input_scale(input_scale_val); - - int32_t input_zero_point_val = -2; - ValueSpec input_zero_point(input_zero_point_val); - - // Quantized weight tensor (int8) - [K, N] - ValueSpec quantized_weight( - weight_size, - vkapi::kChar, // int8 for quantized weights - storage_type, - utils::kWidthPacked, - DataGenType::RANDINT8); - quantized_weight.set_constant(true); - - if (debugging()) { - print_valuespec_data(quantized_weight, "weight_tensor"); - } - - // Weight quantization scales (float/half, per-channel) - ValueSpec weight_scales( - {config.N}, // Per output feature - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::RANDOM_SCALES); - weight_scales.set_constant(true); - - ValueSpec weight_sums( - {config.N}, // Per output features - vkapi::kInt, - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - weight_sums.set_constant(true); - - // Compute weight_sums data based on quantized weights - int64_t in_features = config.K; - int64_t out_features = config.N; - compute_weight_sums(weight_sums, quantized_weight, out_features, in_features); - - // Bias (optional, float/half) - [N] - ValueSpec bias( - {config.N}, // Per output feature - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::RANDOM); - bias.set_constant(true); - if (!config.has_bias) { - bias.set_none(true); - } - - // Output tensor (float/half) - [M, N] - ValueSpec output( - {config.M, config.N}, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - - // Add all specs to test case - if (config.op_name.find("q8ta") != std::string::npos) { - test_case.add_input_spec(input_tensor); - test_case.add_input_spec(input_scale); - test_case.add_input_spec(input_zero_point); - test_case.add_input_spec(quantized_weight); - test_case.add_input_spec(weight_sums); - test_case.add_input_spec(weight_scales); - test_case.add_input_spec(bias); - test_case.add_output_spec(output); - } else { - test_case.add_input_spec(input_tensor); - test_case.add_input_spec(quantized_weight); - test_case.add_input_spec(weight_scales); - test_case.add_input_spec(bias); - test_case.add_output_spec(output); - } - - return test_case; -} - -// Generate easy test cases for quantized linear operation (for debugging) -std::vector generate_quantized_linear_easy_cases() { - std::vector test_cases; - - // Single simple configuration for debugging - int M = 4; - int K = 4; - int N = 4; - - LinearConfig config = { - M, // Batch size - K, // Input features - N, // Output features - true, // has_bias - "simple", // test_case_name - }; - - // Test with both storage types and data types for completeness - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - std::vector float_types = {vkapi::kFloat}; - - // Generate test cases for each combination - for (const auto& storage_type : storage_types) { - for (const auto& input_dtype : float_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, input_dtype)); - } - } - - return test_cases; -} - -// Generate test cases for quantized linear 
operation -std::vector generate_quantized_linear_test_cases() { - std::vector test_cases; - - std::vector configs = { - {4, 64, 32}, - {4, 128, 64}, - {4, 256, 128}, - {32, 64, 32}, - {32, 128, 64}, - {32, 256, 128}, - // No bias tests - {32, 128, 64, false}, - {32, 256, 128, false}, - {256, 2048, 2048}, - {512, 2048, 2048}, - {1024, 2048, 2048}, - }; - - // Test with different storage types and data types - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - - for (auto config : configs) { - std::string prefix = - (config.M < kRefDimSizeLimit && config.K < kRefDimSizeLimit && - config.N < kRefDimSizeLimit) - ? "correctness_" - : "performance_"; - std::string generated_test_case_name = prefix + std::to_string(config.M) + - "_" + std::to_string(config.K) + "_" + std::to_string(config.N); - if (!config.has_bias) { - generated_test_case_name += "_no_bias"; - } - - config.test_case_name = generated_test_case_name; - - for (const auto& storage_type : storage_types) { - if (vkcompute::api::context() - ->adapter_ptr() - ->supports_int8_dot_product()) { - // Test both activation+weight quantized and weight only quantized - test_cases.push_back( - create_test_case_from_config(config, storage_type, vkapi::kFloat)); - } - - LinearConfig wo_quant_config = config; - wo_quant_config.op_name = "linear_q8csw"; - test_cases.push_back(create_test_case_from_config( - wo_quant_config, storage_type, vkapi::kFloat)); - } - } - - return test_cases; -} - -// Reference implementation for weight only quantized linear -void linear_q8csw_reference_impl(TestCase& test_case) { - int32_t idx = 0; - const ValueSpec& input_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_scales_spec = test_case.inputs()[idx++]; - const ValueSpec& bias_spec = test_case.inputs()[idx++]; - - // Extract output specification (mutable reference) - ValueSpec& output_spec = test_case.outputs()[0]; - - // Get tensor dimensions - auto input_sizes = input_spec.get_tensor_sizes(); // [batch_size, in_features] - auto weight_sizes = - weight_spec.get_tensor_sizes(); // [out_features, in_features] - auto output_sizes = - output_spec.get_tensor_sizes(); // [batch_size, out_features] - - int64_t batch_size = input_sizes[0]; - int64_t in_features = input_sizes[1]; - int64_t out_features = weight_sizes[0]; - - // Skip for large tensors since computation time will be extremely slow - if (batch_size > kRefDimSizeLimit || in_features > kRefDimSizeLimit || - out_features > kRefDimSizeLimit) { - throw std::invalid_argument( - "One or more dimensions (batch_size, in_features, out_features) exceed the allowed limit for reference implementation."); - } - - if (input_spec.dtype != vkapi::kFloat) { - throw std::invalid_argument("Unsupported dtype"); - } - - // Get raw data pointers - auto& input_data = input_spec.get_float_data(); - - auto& weight_data = weight_spec.get_int8_data(); - auto& weight_scales_data = weight_scales_spec.get_float_data(); - auto& bias_data = bias_spec.get_float_data(); - - // Calculate number of output elements - int64_t num_output_elements = batch_size * out_features; - - auto& ref_data = output_spec.get_ref_float_data(); - ref_data.resize(num_output_elements); - - // Perform quantized linear transformation (matrix multiplication) - for (int64_t b = 0; b < batch_size; ++b) { - for (int64_t out_f = 0; out_f < out_features; ++out_f) { - float sum = 0.0f; - - // Matrix multiplication: output[b][out_f] = sum(input[b][in_f] * - // weight[out_f][in_f]) - for 
(int64_t in_f = 0; in_f < in_features; ++in_f) { - // Get input value and dequantize - int64_t input_idx = b * in_features + in_f; - float input_val = input_data[input_idx]; - - // Get weight value and dequantize - int64_t weight_idx = out_f * in_features + in_f; - float dequant_weight = (static_cast(weight_data[weight_idx])) * - weight_scales_data[out_f]; - - sum += input_val * dequant_weight; - } - - // Add bias and store result - if (!bias_spec.is_none()) { - sum += bias_data[out_f]; - } - int64_t output_idx = b * out_features + out_f; - ref_data[output_idx] = sum; - } - } -} - -void linear_q8ta_q8csw_reference_impl(TestCase& test_case) { - // Extract input specifications - int32_t idx = 0; - const ValueSpec& input_spec = test_case.inputs()[idx++]; - const ValueSpec& input_scale_spec = test_case.inputs()[idx++]; - const ValueSpec& input_zeros_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_sums_spec = test_case.inputs()[idx++]; - (void)weight_sums_spec; - const ValueSpec& weight_scales_spec = test_case.inputs()[idx++]; - const ValueSpec& bias_spec = test_case.inputs()[idx++]; - - // Extract output specification (mutable reference) - ValueSpec& output_spec = test_case.outputs()[0]; - - // Get tensor dimensions - auto input_sizes = input_spec.get_tensor_sizes(); // [batch_size, in_features] - auto weight_sizes = - weight_spec.get_tensor_sizes(); // [out_features, in_features] - auto output_sizes = - output_spec.get_tensor_sizes(); // [batch_size, out_features] - - int64_t batch_size = input_sizes[0]; - int64_t in_features = input_sizes[1]; - int64_t out_features = weight_sizes[0]; - - // Skip for large tensors since computation time will be extremely slow - if (batch_size > kRefDimSizeLimit || in_features > kRefDimSizeLimit || - out_features > kRefDimSizeLimit) { - throw std::invalid_argument( - "One or more dimensions (batch_size, in_features, out_features) exceed the allowed limit for reference implementation."); - } - - if (input_spec.dtype != vkapi::kFloat) { - throw std::invalid_argument("Unsupported dtype"); - } - - // Get raw data pointers - auto& input_data = input_spec.get_float_data(); - const float input_scale = input_scale_spec.get_float_value(); - const int32_t input_zero_point = input_zeros_spec.get_int_value(); - - auto& weight_data = weight_spec.get_int8_data(); - auto& weight_scales_data = weight_scales_spec.get_float_data(); - auto& bias_data = bias_spec.get_float_data(); - - // Calculate number of output elements - int64_t num_output_elements = batch_size * out_features; - - auto& ref_data = output_spec.get_ref_float_data(); - ref_data.resize(num_output_elements); - - // Perform quantized linear transformation (matrix multiplication) with - // integer accumulation - for (int64_t b = 0; b < batch_size; ++b) { - for (int64_t out_f = 0; out_f < out_features; ++out_f) { - int32_t int_sum = 0; - int32_t weight_sum = 0; // Track weight sum on the fly - - // Matrix multiplication with integer accumulation: - // int_sum = sum(quantized_input[b][in_f] * quantized_weight[out_f][in_f]) - for (int64_t in_f = 0; in_f < in_features; ++in_f) { - // Get input value and quantize to int8 - int64_t input_idx = b * in_features + in_f; - - float quant_input_f = - std::round(input_data[input_idx] / input_scale) + input_zero_point; - quant_input_f = std::min(std::max(quant_input_f, -128.0f), 127.0f); - int8_t quantized_input = static_cast(quant_input_f); - - // Get quantized weight (already int8) - int64_t weight_idx = 
out_f * in_features + in_f; - int8_t quantized_weight = weight_data[weight_idx]; - - // Integer multiplication and accumulation - int_sum += static_cast(quantized_input) * - static_cast(quantized_weight); - - // Track weight sum for this output channel on the fly - weight_sum += static_cast(quantized_weight); - } - - // Convert accumulated integer result to float and apply scales - // Final result = (int_sum - zero_point_correction) * input_scale * - // weight_scale + bias zero_point_correction = input_zero_point * - // sum_of_weights_for_this_output_channel - int32_t zero_point_correction = input_zero_point * weight_sum; - int32_t accum_adjusted = int_sum - zero_point_correction; - - float float_result = - accum_adjusted * input_scale * weight_scales_data[out_f]; - - // Add bias and store result - if (!bias_spec.is_none()) { - float_result += bias_data[out_f]; - } - int64_t output_idx = b * out_features + out_f; - ref_data[output_idx] = float_result; - } - } -} - -void reference_impl(TestCase& test_case) { - if (test_case.operator_name().find("q8ta") != std::string::npos) { - linear_q8ta_q8csw_reference_impl(test_case); - } else { - linear_q8csw_reference_impl(test_case); - } -} - -int64_t quantized_linear_flop_calculator(const TestCase& test_case) { - int input_idx = 0; - int weight_idx = 1; - if (test_case.operator_name().find("q8ta") != std::string::npos) { - input_idx = 0; - weight_idx = 3; - } - - // Get input and weight dimensions - const auto& input_sizes = test_case.inputs()[input_idx].get_tensor_sizes(); - const auto& weight_sizes = test_case.inputs()[weight_idx].get_tensor_sizes(); - - int64_t batch_size = input_sizes[0]; - int64_t in_features = input_sizes[1]; - int64_t out_features = weight_sizes[0]; - - // Calculate FLOPs for quantized linear operation - // Each output element requires: - // - in_features multiply-accumulate operations - // - Additional operations for quantization/dequantization - int64_t output_elements = batch_size * out_features; - int64_t ops_per_output = in_features; - - // Add quantization overhead (approximate) - // - Dequantize input: 1 op per input element used - // - Dequantize weight: 1 op per weight element used - // - Add bias: 1 op per output element - int64_t quantization_ops = ops_per_output + 1; // Simplified estimate - - int64_t flop = output_elements * (ops_per_output + quantization_ops); - - return flop; -} - -int main(int argc, char* argv[]) { - set_debugging(false); - set_print_output(false); - set_print_latencies(false); - set_use_gpu_timestamps(true); - - print_performance_header(); - std::cout << "Quantized Linear Operation Prototyping Framework" << std::endl; - print_separator(); - - ReferenceComputeFunc ref_fn = reference_impl; - - auto results = execute_test_cases( - generate_quantized_linear_test_cases, - quantized_linear_flop_calculator, - "QuantizedLinear", - 0, - 10, - ref_fn); - - return 0; -} diff --git a/backends/vulkan/test/custom_ops/quantized_int4_linear.cpp b/backends/vulkan/test/custom_ops/quantized_int4_linear.cpp deleted file mode 100644 index c125ce2d09c..00000000000 --- a/backends/vulkan/test/custom_ops/quantized_int4_linear.cpp +++ /dev/null @@ -1,366 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// All rights reserved. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
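Both reference paths above compute the same dequantized matmul: the weight-only path (`linear_q8csw`) scales each int8 weight by its per-output-channel scale and accumulates in float, while the activation+weight path (`q8ta_q8csw`) accumulates int8 products in int32 and removes the input zero point afterwards through the per-channel weight sum. A minimal, self-contained sketch of both for a single output element follows; the variable names are illustrative and not taken from the deleted file.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Weight-only channel-wise symmetric (q8csw): dequantize weights, accumulate in float.
float q8csw_out_element(
    const std::vector<float>& x,     // [K] float activations
    const std::vector<int8_t>& w_q,  // [K] int8 weights for one output channel
    float w_scale,                   // per-output-channel weight scale
    float bias) {
  float sum = 0.0f;
  for (size_t k = 0; k < x.size(); ++k) {
    sum += x[k] * static_cast<float>(w_q[k]) * w_scale;
  }
  return sum + bias;
}

// Activation + weight quantized (q8ta_q8csw): integer accumulation, then
//   result = (int_sum - z_x * sum_k w_q[k]) * x_scale * w_scale + bias
// which equals sum_k (q_x[k] - z_x) * x_scale * w_q[k] * w_scale + bias.
float q8ta_q8csw_out_element(
    const std::vector<float>& x,
    float x_scale,
    int32_t x_zero_point,
    const std::vector<int8_t>& w_q,
    float w_scale,
    float bias) {
  int32_t int_sum = 0;
  int32_t weight_sum = 0;
  for (size_t k = 0; k < x.size(); ++k) {
    // Quantize the activation to the int8 range, as the reference does.
    float q = std::round(x[k] / x_scale) + static_cast<float>(x_zero_point);
    q = std::min(std::max(q, -128.0f), 127.0f);
    int_sum += static_cast<int32_t>(q) * static_cast<int32_t>(w_q[k]);
    weight_sum += static_cast<int32_t>(w_q[k]);
  }
  const int32_t corrected = int_sum - x_zero_point * weight_sum;
  return static_cast<float>(corrected) * x_scale * w_scale + bias;
}
```

With the simplified FLOP model in the calculator above (roughly `M * N * (2K + 1)` once the per-element quantization overhead is folded in), the `{256, 2048, 2048}` performance case works out to about 2.15 GFLOP per invocation.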
- -#include -#include -#include -#include -#include "utils.h" - -#include - -using namespace executorch::vulkan::prototyping; - -using namespace vkcompute; - -// Linear configuration struct -struct LinearConfig { - int64_t M; // Batch size / number of rows in input - int64_t K; // Input features / columns in input, rows in weight - int64_t N; // Output features / columns in weight - int64_t group_size; // Number of input channels per quantization group - std::string name_suffix; - std::string shader_variant_name = "default"; -}; - -// Utility function to create a test case from a LinearConfig -TestCase create_test_case_from_config( - const LinearConfig& config, - utils::StorageType storage_type, - vkapi::ScalarType input_dtype) { - TestCase test_case; - - // Create a descriptive name for the test case - std::string storage_str = - (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; - std::string dtype_str = (input_dtype == vkapi::kFloat) ? "Float" : "Half"; - - std::string test_name = "QuantizedLinearInt4_" + config.name_suffix + "_" + - storage_str + "_" + dtype_str; - test_case.set_name(test_name); - - // Set the operator name for the test case - std::string operator_name = "et_vk.linear_weight_int4.default"; - test_case.set_operator_name(operator_name); - - // Derive sizes from M, K, N - std::vector input_size = {config.M, config.K}; - std::vector weight_size = { - config.N, config.K / 2}; // Packed 4-bit weights - - // Input tensor (float/half) - [M, K] - ValueSpec input_tensor( - input_size, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::ONES); - - if (debugging()) { - print_valuespec_data(input_tensor, "input_tensor"); - } - - // Quantized weight tensor (int8, packed 4-bit) - [N, K/2] - ValueSpec quantized_weight( - weight_size, - vkapi::kChar, // int8 for packed 4-bit quantized weights - storage_type, - utils::kWidthPacked, - DataGenType::ONES); - quantized_weight.set_constant(true); - quantized_weight.set_int4(true); - - if (debugging()) { - print_valuespec_data(quantized_weight, "weight_tensor"); - } - - // Group size parameter - ValueSpec group_size_spec(static_cast(config.group_size)); - - // Weight quantization scales and zeros (float/half, per-group) - - // [K/group_size, N, 2] - std::vector scales_and_zeros_size = { - config.K / config.group_size, config.N, 2}; - ValueSpec scales_and_zeros( - scales_and_zeros_size, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::ONES); - scales_and_zeros.set_constant(true); - - if (debugging()) { - print_valuespec_data(scales_and_zeros, "scales_and_zeros"); - } - - // Output tensor (float/half) - [M, N] - ValueSpec output( - {config.M, config.N}, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - - // Add all specs to test case - test_case.add_input_spec(input_tensor); - test_case.add_input_spec(quantized_weight); - test_case.add_input_spec(group_size_spec); - test_case.add_input_spec(scales_and_zeros); - // Add dummy value for inner_k_tiles (unused but required by operator - // signature) - ValueSpec dummy_inner_k_tiles(static_cast(8)); - test_case.add_input_spec(dummy_inner_k_tiles); - - test_case.add_output_spec(output); - - return test_case; -} - -// Generate easy test cases for quantized linear operation (for debugging) -std::vector generate_quantized_linear_easy_cases() { - std::vector test_cases; - - // Single simple configuration for debugging - int M = 8; - int K = 16; - int N = 16; - int group_size = 8; - - LinearConfig config = { - M, // Batch size - 
K, // Input features - N, // Output features - group_size, // Group size - "simple", // descriptive name - "default" // shader variant name - }; - - // Test with both storage types and data types for completeness - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - std::vector float_types = {vkapi::kFloat}; - - // Generate test cases for each combination - for (const auto& storage_type : storage_types) { - for (const auto& input_dtype : float_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, input_dtype)); - } - } - - return test_cases; -} - -// Generate test cases for quantized linear operation -std::vector generate_quantized_linear_test_cases() { - std::vector test_cases; - - std::vector configs = { - {8, 64, 32, 8, "correctness_8_64_32_g8"}, - {8, 128, 64, 16, "correctness_8_128_64_g16"}, - {8, 256, 128, 32, "correctness_8_256_128_g32"}, - {32, 64, 32, 8, "correctness_32_64_32_g8"}, - {32, 128, 64, 16, "correctness_32_128_64_g16"}, - {32, 256, 128, 32, "correctness_32_256_128_g32"}, - {1, 256, 128, 32, "correctness_32_256_128_g32"}, - // Performance test cases - {1, 2048, 2048, 128, "performance_128_2048_2048_g128"}, - {128, 2048, 2048, 128, "performance_128_2048_2048_g128"}, - {248, 2048, 2048, 128, "performance_128_2048_2048_g128"}, - {1024, 2048, 2048, 128, "performance_128_2048_2048_g128"}, - // {16384, 576, 128, 32, "performance_16384_576_128_g32"} - }; - - // Test with different storage types and data types - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - - // Generate test cases for each combination - for (const auto& config : configs) { - for (const auto& storage_type : storage_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, vkapi::kFloat)); - } - } - - return test_cases; -} - -// Helper function to unpack 4-bit values from int8 -std::pair unpack_4bit(int8_t packed) { - // Extract lower 4 bits and upper 4 bits - int8_t lower = packed & 0x0F; - int8_t upper = (packed >> 4) & 0x0F; - - // Sign extend from 4-bit to 8-bit - if (lower & 0x08) - lower |= 0xF0; - if (upper & 0x08) - upper |= 0xF0; - - return std::make_pair(lower, upper); -} - -// Reference implementation for quantized linear operation -void quantized_linear_reference_impl(TestCase& test_case) { - static constexpr int64_t kRefDimSizeLimit = 300; - // Extract input specifications - int32_t idx = 0; - const ValueSpec& input_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_spec = test_case.inputs()[idx++]; - const ValueSpec& group_size_spec = test_case.inputs()[idx++]; - const ValueSpec& scales_and_zeros_spec = test_case.inputs()[idx++]; - // Skip dummy inner_k_tiles - idx++; - - // Extract output specification (mutable reference) - ValueSpec& output_spec = test_case.outputs()[0]; - - // Get tensor dimensions - auto input_sizes = input_spec.get_tensor_sizes(); // [batch_size, in_features] - auto weight_sizes = - weight_spec.get_tensor_sizes(); // [out_features, in_features/2] - auto output_sizes = - output_spec.get_tensor_sizes(); // [batch_size, out_features] - - int64_t batch_size = input_sizes[0]; - int64_t in_features = input_sizes[1]; - int64_t out_features = output_sizes[1]; - int64_t group_size = group_size_spec.get_int_value(); - - // Skip for large tensors since computation time will be extremely slow - if (batch_size > kRefDimSizeLimit || in_features > kRefDimSizeLimit || - out_features > kRefDimSizeLimit) { - throw std::invalid_argument( - "One or more dimensions (batch_size, 
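The 4-bit paths above store two signed 4-bit weights per byte and recover them in `unpack_4bit` with manual sign extension. A small round-trip sketch of that packing scheme follows; the `pack_4bit` helper is hypothetical and added only to illustrate the format.

```cpp
#include <cassert>
#include <cstdint>
#include <utility>

// Pack two signed 4-bit values (each in [-8, 7]) into one byte:
// first value in the low nibble, second value in the high nibble.
inline int8_t pack_4bit(int8_t lo, int8_t hi) {
  return static_cast<int8_t>((lo & 0x0F) | ((hi & 0x0F) << 4));
}

// Mirror of unpack_4bit from the tests above: extract nibbles, then sign-extend.
inline std::pair<int8_t, int8_t> unpack_4bit_sketch(int8_t packed) {
  int8_t lo = packed & 0x0F;
  int8_t hi = (packed >> 4) & 0x0F;
  if (lo & 0x08) lo |= 0xF0;
  if (hi & 0x08) hi |= 0xF0;
  return {lo, hi};
}

int main() {
  for (int a = -8; a <= 7; ++a) {
    for (int b = -8; b <= 7; ++b) {
      auto [lo, hi] = unpack_4bit_sketch(pack_4bit(a, b));
      assert(lo == a && hi == b);  // the round trip preserves both nibbles
    }
  }
  return 0;
}
```

As a worked example of the resulting shapes, the simple debug configuration above (M=8, K=16, N=16, group_size=8) yields a packed weight tensor of size [16, 8] (two 4-bit values per byte along K) and a scales_and_zeros tensor of size [2, 16, 2].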
in_features, out_features) exceed the allowed limit for reference implementation."); - } - - if (input_spec.dtype != vkapi::kFloat) { - throw std::invalid_argument("Unsupported dtype"); - } - - // Get raw data pointers - auto& input_data = input_spec.get_float_data(); - auto& weight_data = weight_spec.get_int8_data(); - auto& scales_and_zeros_data = scales_and_zeros_spec.get_float_data(); - - // Calculate number of output elements - int64_t num_output_elements = batch_size * out_features; - - auto& ref_data = output_spec.get_ref_float_data(); - ref_data.resize(num_output_elements); - - // Perform quantized linear transformation (matrix multiplication) - for (int64_t b = 0; b < batch_size; ++b) { - for (int64_t out_f = 0; out_f < out_features; ++out_f) { - float sum = 0.0f; - - bool should_print = b == 0 && out_f == 0; - should_print = false; - - if (should_print) { - std::cout << "Weights seen: "; - } - - // Matrix multiplication: output[b][out_f] = sum(input[b][in_f] * - // weight[out_f][in_f]) - for (int64_t in_f = 0; in_f < in_features; ++in_f) { - // Get input value - int64_t input_idx = b * in_features + in_f; - float input_val = input_data[input_idx]; - - // Get weight value and dequantize (4-bit group affine quantization) - int64_t group_idx = in_f / group_size; - int64_t scales_and_zeros_idx = group_idx * out_features * 2 + out_f * 2; - - // Get packed weight value - int64_t weight_idx = out_f * (in_features / 2) + (in_f / 2); - int8_t packed_weight = weight_data[weight_idx]; - - // Unpack 4-bit weight - auto unpacked = unpack_4bit(packed_weight); - int8_t weight_4bit = (in_f % 2 == 0) ? unpacked.first : unpacked.second; - - // Dequantize weight using group affine quantization - float weight_scale = scales_and_zeros_data[scales_and_zeros_idx]; - float weight_zero = scales_and_zeros_data[scales_and_zeros_idx + 1]; - float dequant_weight = - (static_cast(weight_4bit) - 8.0f) * weight_scale + - weight_zero; - - if (should_print) { - std::cout << int(weight_4bit) << ", "; - } - - sum += input_val * dequant_weight; - } - - if (should_print) { - std::cout << std::endl; - } - - // Store result - int64_t output_idx = b * out_features + out_f; - ref_data[output_idx] = sum; - } - } -} - -// Custom FLOP calculator for quantized linear operation -int64_t quantized_linear_flop_calculator(const TestCase& test_case) { - if (test_case.num_inputs() < 4 || test_case.num_outputs() < 1) { - return 0; - } - - // Get input and weight dimensions - const auto& input_sizes = test_case.inputs()[0].get_tensor_sizes(); - const auto& output_sizes = test_case.outputs()[0].get_tensor_sizes(); - - int64_t batch_size = input_sizes[0]; - int64_t in_features = input_sizes[1]; - int64_t out_features = output_sizes[1]; - - // Calculate FLOPs for quantized linear operation - // Each output element requires: - // - in_features multiply-accumulate operations - // - Additional operations for quantization/dequantization - int64_t output_elements = batch_size * out_features; - int64_t ops_per_output = in_features; - - // Add quantization overhead (approximate) - // - Dequantize weight: 2 ops per weight element used (unpack + dequantize) - int64_t quantization_ops = ops_per_output * 2; // Simplified estimate - - int64_t flop = output_elements * (ops_per_output + quantization_ops); - - return flop; -} - -int main(int argc, char* argv[]) { - set_debugging(false); - set_print_output(false); - set_print_latencies(false); - set_use_gpu_timestamps(true); - - print_performance_header(); - std::cout << "Quantized 4-bit Int4 Linear 
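In the `linear_weight_int4` reference above, `scales_and_zeros` is a flattened [K/group_size, N, 2] tensor holding (scale, zero) per quantization group and output channel, and each unpacked 4-bit weight is offset by 8 before scaling. Just that per-element dequantization, isolated as an illustrative helper that mirrors the arithmetic of the reference implementation:

```cpp
#include <cstdint>

// Dequantize one unpacked 4-bit weight, as the reference above does:
//   w_dq = (w4 - 8) * scale + zero
// scales_and_zeros is the flattened [K/group_size, N, 2] buffer,
// indexed by quantization group and output channel.
inline float dequant_weight_int4(
    int8_t w4,
    const float* scales_and_zeros,
    int64_t group_idx,
    int64_t out_f,
    int64_t out_features) {
  const int64_t base = group_idx * out_features * 2 + out_f * 2;
  const float scale = scales_and_zeros[base];
  const float zero = scales_and_zeros[base + 1];
  return (static_cast<float>(w4) - 8.0f) * scale + zero;
}
```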
Operation Prototyping Framework" - << std::endl; - print_separator(); - - ReferenceComputeFunc ref_fn = quantized_linear_reference_impl; - - // Execute easy test cases using the new framework with custom FLOP - // calculator - auto results = execute_test_cases( - generate_quantized_linear_test_cases, - quantized_linear_flop_calculator, - "QuantizedLinearInt4", - 0, - 10, - ref_fn); - - return 0; -} diff --git a/backends/vulkan/test/custom_ops/quantized_q4gaw_linear.cpp b/backends/vulkan/test/custom_ops/quantized_q4gaw_linear.cpp deleted file mode 100644 index 084d718b502..00000000000 --- a/backends/vulkan/test/custom_ops/quantized_q4gaw_linear.cpp +++ /dev/null @@ -1,433 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// All rights reserved. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include "utils.h" - -#include - -using namespace executorch::vulkan::prototyping; - -using namespace vkcompute; - -// Linear configuration struct -struct LinearConfig { - int64_t M; // Batch size / number of rows in input - int64_t K; // Input features / columns in input, rows in weight - int64_t N; // Output features / columns in weight - int64_t group_size; // Number of input channels per quantization group - std::string name_suffix; - std::string shader_variant_name = "default"; -}; - -// Utility function to create a test case from a LinearConfig -TestCase create_test_case_from_config( - const LinearConfig& config, - utils::StorageType storage_type, - vkapi::ScalarType input_dtype) { - TestCase test_case; - - // Create a descriptive name for the test case - std::string storage_str = - (storage_type == utils::kTexture3D) ? "Texture3D" : "Buffer"; - std::string dtype_str = (input_dtype == vkapi::kFloat) ? 
"Float" : "Half"; - - std::string test_name = "QuantizedLinear4GAW_" + config.name_suffix + "_" + - storage_str + "_" + dtype_str; - test_case.set_name(test_name); - - // Set the operator name for the test case - std::string operator_name = "et_vk.linear_q8ta_q4gaw."; - operator_name += config.shader_variant_name; - test_case.set_operator_name(operator_name); - - // Derive sizes from M, K, N - std::vector input_size = {config.M, config.K}; - std::vector weight_size = { - config.K, config.N / 2}; // Packed 4-bit weights - - // Input tensor (float/half) - [M, K] - ValueSpec input_tensor( - input_size, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::RANDINT); - - if (debugging()) { - print_valuespec_data(input_tensor, "input_tensor"); - } - - float input_scale_val = 1.0f; - ValueSpec input_scale(input_scale_val); - - int32_t input_zero_point_val = 0; - ValueSpec input_zero_point(input_zero_point_val); - - // Group size parameter - ValueSpec group_size_spec(static_cast(config.group_size)); - - // Quantized weight tensor (int8, packed 4-bit) - [K, N/2] - ValueSpec quantized_weight( - weight_size, - vkapi::kChar, // int8 for packed 4-bit quantized weights - storage_type, - utils::kWidthPacked, - DataGenType::RANDINT4); - quantized_weight.set_constant(true); - quantized_weight.set_int4(true); - - if (debugging()) { - print_valuespec_data(quantized_weight, "weight_tensor"); - } - - // Weight quantization scales (float/half, per-group) - [N, K/group_size] - std::vector weight_scales_size = { - config.N, config.K / config.group_size}; - ValueSpec weight_scales( - weight_scales_size, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::RANDOM_SCALES); - weight_scales.set_constant(true); - - if (debugging()) { - print_valuespec_data(weight_scales, "weight_scales"); - } - - // Weight zeros (int32, per-group) - [N, K/group_size] - ValueSpec weight_zeros( - weight_scales_size, - vkapi::kInt, // int32 for zeros - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - weight_zeros.set_constant(true); - - ValueSpec weight_sums( - {config.N}, // Per output features - vkapi::kFloat, - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - weight_sums.set_constant(true); - - // Compute weight_sums data based on quantized weights - int64_t in_features = config.K; - int64_t out_features = config.N; - - ValueSpec orig_OC(static_cast(config.N)); - - // Bias (optional, float/half) - [N] - ValueSpec bias( - {config.N}, // Per output feature - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - bias.set_constant(true); - - // Output tensor (float/half) - [M, N] - ValueSpec output( - {config.M, config.N}, - input_dtype, - storage_type, - utils::kWidthPacked, - DataGenType::ZEROS); - - // Add all specs to test case - test_case.add_input_spec(input_tensor); - test_case.add_input_spec(input_scale); - test_case.add_input_spec(input_zero_point); - test_case.add_input_spec(quantized_weight); - test_case.add_input_spec(weight_sums); - test_case.add_input_spec(weight_scales); - test_case.add_input_spec(weight_zeros); - test_case.add_input_spec(orig_OC); - test_case.add_input_spec(group_size_spec); - test_case.add_input_spec(bias); - - test_case.add_output_spec(output); - - return test_case; -} - -// Generate easy test cases for quantized linear operation (for debugging) -std::vector generate_quantized_linear_easy_cases() { - std::vector test_cases; - - // Single simple configuration for debugging - int M = 4; - int K = 32; - int N = 32; - int 
group_size = 8; - - LinearConfig config = { - M, // Batch size - K, // Input features - N, // Output features - group_size, // Group size - "simple", // descriptive name - "noint8" // shader variant name - }; - - // Test with both storage types and data types for completeness - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - std::vector float_types = {vkapi::kFloat}; - - // Generate test cases for each combination - for (const auto& storage_type : storage_types) { - for (const auto& input_dtype : float_types) { - test_cases.push_back( - create_test_case_from_config(config, storage_type, input_dtype)); - } - } - - return test_cases; -} - -// Generate test cases for quantized linear operation -std::vector generate_quantized_linear_test_cases() { - std::vector test_cases; - - std::vector configs = { - {8, 64, 32, 8, "correctness_1_64_32_g8"}, - {8, 128, 64, 16, "correctness_1_128_64_g16"}, - {8, 256, 128, 32, "correctness_1_256_128_g32"}, - {32, 64, 32, 8, "correctness_32_64_32_g8"}, - {32, 128, 64, 16, "correctness_32_128_64_g16"}, - {32, 256, 128, 32, "correctness_32_256_128_g32"}, - {1, 256, 128, 32, "todo"}, - // Performance test cases - {1, 2048, 2048, 128, "todo"}, - {128, 2048, 2048, 128, "performance_128_2048_2048_g64"}, - {248, 2048, 2048, 128, "performance_128_2048_2048_g64"}, - {1024, 2048, 2048, 128, "performance_128_2048_2048_g64"}, - // {16384, 576, 128, 32, "performance_16384_576_128_g32"} - }; - - // Test with different storage types and data types - std::vector storage_types = { - utils::kTexture3D, utils::kBuffer}; - - // Generate test cases for each combination - for (const auto& config : configs) { - for (const auto& storage_type : storage_types) { - // Test both with and without shader int8 dot product - test_cases.push_back( - create_test_case_from_config(config, storage_type, vkapi::kFloat)); - - // LinearConfig no_int_config = config; - // no_int_config.name_suffix = config.name_suffix + "_noint8"; - // no_int_config.shader_variant_name = "noint8"; - - // test_cases.push_back(create_test_case_from_config( - // no_int_config, storage_type, vkapi::kFloat)); - } - } - - return test_cases; -} - -// Helper function to unpack 4-bit values from int8 -std::pair unpack_4bit(int8_t packed) { - // Extract lower 4 bits and upper 4 bits - int8_t lower = packed & 0x0F; - int8_t upper = (packed >> 4) & 0x0F; - - // Sign extend from 4-bit to 8-bit - if (lower & 0x08) - lower |= 0xF0; - if (upper & 0x08) - upper |= 0xF0; - - return std::make_pair(lower, upper); -} - -// Reference implementation for quantized linear operation -void quantized_linear_reference_impl(TestCase& test_case) { - static constexpr int64_t kRefDimSizeLimit = 300; - // Extract input specifications - int32_t idx = 0; - const ValueSpec& input_spec = test_case.inputs()[idx++]; - const ValueSpec& input_scale_spec = test_case.inputs()[idx++]; - const ValueSpec& input_zeros_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_sums_spec = test_case.inputs()[idx++]; - (void)weight_sums_spec; - const ValueSpec& weight_scales_spec = test_case.inputs()[idx++]; - const ValueSpec& weight_zeros_spec = test_case.inputs()[idx++]; - const ValueSpec& orig_OC = test_case.inputs()[idx++]; - (void)orig_OC; - const ValueSpec& group_size_spec = test_case.inputs()[idx++]; - const ValueSpec& bias_spec = test_case.inputs()[idx++]; - - // Extract output specification (mutable reference) - ValueSpec& output_spec = test_case.outputs()[0]; - - // Get tensor 
dimensions - auto input_sizes = input_spec.get_tensor_sizes(); // [batch_size, in_features] - auto weight_sizes = - weight_spec.get_tensor_sizes(); // [in_features, out_features/2] - auto output_sizes = - output_spec.get_tensor_sizes(); // [batch_size, out_features] - - int64_t batch_size = input_sizes[0]; - int64_t in_features = input_sizes[1]; - int64_t out_features = output_sizes[1]; - int64_t group_size = group_size_spec.get_int_value(); - - // Skip for large tensors since computation time will be extremely slow - if (batch_size > kRefDimSizeLimit || in_features > kRefDimSizeLimit || - out_features > kRefDimSizeLimit) { - throw std::invalid_argument( - "One or more dimensions (batch_size, in_features, out_features) exceed the allowed limit for reference implementation."); - } - - if (input_spec.dtype != vkapi::kFloat) { - throw std::invalid_argument("Unsupported dtype"); - } - - // Get raw data pointers - auto& input_data = input_spec.get_float_data(); - const float input_scale = input_scale_spec.get_float_value(); - const int32_t input_zero_point = input_zeros_spec.get_int_value(); - - auto& weight_data = weight_spec.get_int8_data(); - auto& weight_scales_data = weight_scales_spec.get_float_data(); - auto& weight_zeros_data = weight_zeros_spec.get_int32_data(); - auto& bias_data = bias_spec.get_float_data(); - - // Calculate number of output elements - int64_t num_output_elements = batch_size * out_features; - - auto& ref_data = output_spec.get_ref_float_data(); - ref_data.resize(num_output_elements); - - // Perform quantized linear transformation (matrix multiplication) - for (int64_t b = 0; b < batch_size; ++b) { - for (int64_t out_f = 0; out_f < out_features; ++out_f) { - float sum = 0.0f; - - bool should_print = b == 0 && out_f == 0; - should_print = false; - - if (should_print) { - std::cout << "Weights seen: "; - } - - // Matrix multiplication: output[b][out_f] = sum(input[b][in_f] * - // weight[out_f][in_f]) - for (int64_t in_f = 0; in_f < in_features; ++in_f) { - // Get input value and dequantize - int64_t input_idx = b * in_features + in_f; - - float quant_input = - std::round(input_data[input_idx] / input_scale) + input_zero_point; - quant_input = std::min(std::max(quant_input, -128.0f), 127.0f); - float dequant_input = (quant_input - input_zero_point) * input_scale; - - // Get weight value and dequantize (4-bit group affine quantization) - int64_t group_idx = in_f / group_size; - int64_t scales_idx = group_idx * out_features + out_f; - - // Get packed weight value - int64_t weight_idx = in_f * (out_features / 2) + (out_f / 2); - int8_t packed_weight = weight_data[weight_idx]; - - // Unpack 4-bit weight - auto unpacked = unpack_4bit(packed_weight); - int8_t weight_4bit = - (out_f % 2 == 0) ? 
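Before the 4-bit weight math, the q4gaw reference above pushes the float activation through a quantize→dequantize round trip so that its float accumulation matches what an int8-quantized input would produce. That round trip, isolated as a sketch:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Fake-quantize a float activation to int8 and back, as in the reference:
//   q  = clamp(round(x / scale) + zero_point, -128, 127)
//   x' = (q - zero_point) * scale
inline float fake_quantize_int8(float x, float scale, int32_t zero_point) {
  float q = std::round(x / scale) + static_cast<float>(zero_point);
  q = std::min(std::max(q, -128.0f), 127.0f);
  return (q - static_cast<float>(zero_point)) * scale;
}
```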
unpacked.first : unpacked.second; - - // Dequantize weight using group affine quantization - float weight_scale = weight_scales_data[scales_idx]; - int32_t weight_zero = weight_zeros_data[scales_idx]; - float dequant_weight = - (static_cast(weight_4bit) - weight_zero) * weight_scale; - - if (should_print) { - std::cout << int(weight_4bit) << ", "; - } - - sum += dequant_input * dequant_weight; - } - - if (should_print) { - std::cout << std::endl; - } - - // Add bias and store result - sum += bias_data[out_f]; - int64_t output_idx = b * out_features + out_f; - ref_data[output_idx] = sum; - } - } -} - -// Custom FLOP calculator for quantized linear operation -int64_t quantized_linear_flop_calculator(const TestCase& test_case) { - if (test_case.num_inputs() < 6 || test_case.num_outputs() < 1) { - return 0; - } - - // Get input and weight dimensions - const auto& input_sizes = test_case.inputs()[0].get_tensor_sizes(); - const auto& output_sizes = test_case.outputs()[0].get_tensor_sizes(); - - int64_t batch_size = input_sizes[0]; - int64_t in_features = input_sizes[1]; - int64_t out_features = output_sizes[1]; - - // Calculate FLOPs for quantized linear operation - // Each output element requires: - // - in_features multiply-accumulate operations - // - Additional operations for quantization/dequantization - int64_t output_elements = batch_size * out_features; - int64_t ops_per_output = in_features; - - // Add quantization overhead (approximate) - // - Dequantize input: 1 op per input element used - // - Dequantize weight: 2 ops per weight element used (unpack + dequantize) - // - Add bias: 1 op per output element - int64_t quantization_ops = ops_per_output * 2 + 1; // Simplified estimate - - int64_t flop = output_elements * (ops_per_output + quantization_ops); - - return flop; -} - -int main(int argc, char* argv[]) { - set_debugging(false); - set_print_output(false); - set_print_latencies(false); - set_use_gpu_timestamps(true); - - print_performance_header(); - std::cout - << "Quantized 4-bit Group Affine Weights Linear Operation Prototyping Framework" - << std::endl; - print_separator(); - - ReferenceComputeFunc ref_fn = quantized_linear_reference_impl; - - // Execute easy test cases using the new framework with custom FLOP - // calculator - auto results = execute_test_cases( - generate_quantized_linear_test_cases, - quantized_linear_flop_calculator, - "QuantizedLinear4GAW", - 0, - 10, - ref_fn); - - return 0; -} diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl deleted file mode 100644 index 3162857c2d3..00000000000 --- a/backends/vulkan/test/custom_ops/targets.bzl +++ /dev/null @@ -1,99 +0,0 @@ -load("@fbsource//tools/build_defs:platform_defs.bzl", "ANDROID") -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -load( - "@fbsource//xplat/executorch/backends/vulkan:targets.bzl", - "get_platforms", - "vulkan_spv_shader_lib", -) - -def define_custom_op_test_binary(custom_op_name, extra_deps = [], src_file = None): - deps_list = [ - ":prototyping_utils", - ":operator_implementations", - ":custom_ops_shaderlib", - "//executorch/backends/vulkan:vulkan_graph_runtime", - runtime.external_dep_location("libtorch"), - ] + extra_deps - - src_file_str = src_file if src_file else "{}.cpp".format(custom_op_name) - - runtime.cxx_binary( - name = custom_op_name, - srcs = [ - src_file_str, - ], - platforms = get_platforms(), - define_static_target = False, - deps = deps_list, - ) - -def define_common_targets(is_fbcode = False): - if 
is_fbcode: - return - - # Shader library from GLSL files - runtime.filegroup( - name = "custom_ops_shaders", - srcs = native.glob([ - "glsl/*.glsl", - "glsl/*.yaml", - ]), - visibility = [ - "//executorch/backends/vulkan/test/custom_ops/...", - "@EXECUTORCH_CLIENTS", - ], - ) - - vulkan_spv_shader_lib( - name = "custom_ops_shaderlib", - spv_filegroups = { - ":custom_ops_shaders": "glsl", - }, - is_fbcode = is_fbcode, - ) - - # Prototyping utilities library - runtime.cxx_library( - name = "prototyping_utils", - srcs = [ - "utils.cpp", - ], - headers = [ - "utils.h", - ], - exported_headers = [ - "utils.h", - ], - platforms = get_platforms(), - deps = [ - "//executorch/backends/vulkan:vulkan_graph_runtime", - ], - visibility = [ - "//executorch/backends/vulkan/test/custom_ops/...", - "@EXECUTORCH_CLIENTS", - ], - ) - - # Operator implementations library - runtime.cxx_library( - name = "operator_implementations", - srcs = native.glob([ - "impl/*.cpp", - ]), - platforms = get_platforms(), - deps = [ - "//executorch/backends/vulkan:vulkan_graph_runtime", - ":custom_ops_shaderlib", - ], - visibility = [ - "//executorch/backends/vulkan/test/custom_ops/...", - "@EXECUTORCH_CLIENTS", - ], - link_whole = True, - ) - - define_custom_op_test_binary("add") - define_custom_op_test_binary("q8csw_linear") - define_custom_op_test_binary("q8csw_conv2d") - define_custom_op_test_binary("choose_qparams_per_row") - define_custom_op_test_binary("q4gsw_linear") diff --git a/backends/vulkan/test/custom_ops/utils.cpp b/backends/vulkan/test/custom_ops/utils.cpp deleted file mode 100644 index 37e0060b3f2..00000000000 --- a/backends/vulkan/test/custom_ops/utils.cpp +++ /dev/null @@ -1,1717 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// All rights reserved. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include "utils.h" -#include -#include -#include -#include - -#include - -namespace executorch { -namespace vulkan { -namespace prototyping { - -int get_seed() { - static int seed = 42; - return seed++; -} - -// Forward declarations for data generation utilities -void generate_random_float_data( - std::vector& data, - float min_val = -1.0f, - float max_val = 1.0f); -void generate_random_int_data( - std::vector& data, - int min_val = -10, - int max_val = 10); -void generate_randint_float_data( - std::vector& data, - int min_val = -10, - int max_val = 10); -void generate_randint_half_data( - std::vector& data, - int min_val = -10, - int max_val = 10); -void generate_random_int8_data( - std::vector& data, - int8_t min_val = -10, - int8_t max_val = 10); -void generate_random_uint8_data( - std::vector& data, - uint8_t min_val = 0, - uint8_t max_val = 255); -void generate_random_2xint4_data(std::vector& data); -void generate_random_2xint4_data(std::vector& data); -void generate_random_int4_data( - std::vector& data, - int8_t min_val = -8, - int8_t max_val = 7); -void generate_ones_data(std::vector& data); -void generate_zeros_data(std::vector& data); - -// Output and latency printing utilities -namespace { -static int print_output_enabled = 0; -static int print_latencies_enabled = 0; -static int gpu_timestamps_enabled = 0; -static int debugging_enabled = 0; -} // namespace - -bool print_output() { - return print_output_enabled > 0; -} - -void set_print_output(bool print_output) { - print_output_enabled = print_output ? 
1 : 0; -} - -bool print_latencies() { - return print_latencies_enabled > 0; -} - -void set_print_latencies(bool print_latencies) { - print_latencies_enabled = print_latencies ? 1 : 0; -} - -bool use_gpu_timestamps() { - return gpu_timestamps_enabled > 0; -} - -void set_use_gpu_timestamps(bool use_timestamps) { - gpu_timestamps_enabled = use_timestamps ? 1 : 0; -} - -bool debugging() { - return debugging_enabled > 0; -} - -void set_debugging(bool enable_debugging) { - debugging_enabled = enable_debugging ? 1 : 0; -} - -// ValueSpec implementation -void ValueSpec::generate_tensor_data() { - if (spec_type != SpecType::Tensor) { - return; - } - - int64_t num_elements = numel(); - - switch (dtype) { - case vkapi::kFloat: { - float_data.resize(num_elements); - if (data_gen_type == DataGenType::RANDOM) { - generate_random_float_data(float_data); - } else if (data_gen_type == DataGenType::RANDOM_SCALES) { - generate_random_float_data(float_data, 0.005, 0.015); - } else if (data_gen_type == DataGenType::RANDINT) { - generate_randint_float_data(float_data); - } else if (data_gen_type == DataGenType::RANDINT8) { - generate_randint_float_data(float_data, -128, 127); - } else if (data_gen_type == DataGenType::RANDINT4) { - generate_randint_float_data(float_data, -8, 7); - } else if (data_gen_type == DataGenType::ONES) { - generate_ones_data(float_data); - } else if (data_gen_type == DataGenType::ZEROS) { - generate_zeros_data(float_data); - } else { - generate_zeros_data(float_data); - } - break; - } - case vkapi::kHalf: { - half_data.resize(num_elements); - if (data_gen_type == DataGenType::RANDOM) { - // Generate random float data first, then convert to half - std::vector temp_data(num_elements); - generate_random_float_data(temp_data); - for (size_t i = 0; i < temp_data.size(); ++i) { - // Simple conversion to uint16_t representation of half - half_data[i] = static_cast(temp_data[i] * 32767.0f); - } - } else if (data_gen_type == DataGenType::RANDINT) { - generate_randint_half_data(half_data); - } else if (data_gen_type == DataGenType::RANDINT8) { - generate_randint_half_data(half_data, -128, 127); - } else if (data_gen_type == DataGenType::RANDINT4) { - generate_randint_half_data(half_data, -8, 7); - } else if (data_gen_type == DataGenType::ONES) { - std::fill( - half_data.begin(), - half_data.end(), - static_cast(32767)); // 1.0 in half - } else if (data_gen_type == DataGenType::ZEROS) { - std::fill( - half_data.begin(), - half_data.end(), - static_cast(0)); // 0.0 in half - } else { - std::fill( - half_data.begin(), - half_data.end(), - static_cast(0)); // 0.0 in half - } - break; - } - case vkapi::kInt: { - int32_data.resize(num_elements); - if (data_gen_type == DataGenType::RANDOM) { - generate_random_int_data(int32_data); - } else if (data_gen_type == DataGenType::RANDINT) { - generate_random_int_data( - int32_data); // For int type, RANDINT is same as RANDOM - } else if (data_gen_type == DataGenType::RANDINT8) { - generate_random_int_data(int32_data, -128, 127); - } else if (data_gen_type == DataGenType::RANDINT4) { - generate_random_int_data(int32_data, -8, 7); - } else if (data_gen_type == DataGenType::ONES) { - std::fill(int32_data.begin(), int32_data.end(), 1); - } else if (data_gen_type == DataGenType::ZEROS) { - std::fill(int32_data.begin(), int32_data.end(), 0); - } else { - std::fill(int32_data.begin(), int32_data.end(), 0); - } - break; - } - case vkapi::kChar: { - int8_data.resize(num_elements); - if (data_gen_type == DataGenType::RANDOM) { - generate_random_int8_data(int8_data); - } 
else if (data_gen_type == DataGenType::RANDINT) { - generate_random_int8_data(int8_data); - } else if (data_gen_type == DataGenType::RANDINT8) { - generate_random_int8_data(int8_data, -128, 127); - } else if (data_gen_type == DataGenType::RANDINT4) { - generate_random_2xint4_data(int8_data); - } else if (data_gen_type == DataGenType::ONES) { - std::fill(int8_data.begin(), int8_data.end(), 1); - } else if (data_gen_type == DataGenType::ONES_INT4) { - int8_t packed_data = (1 << 4) | 1; - std::fill(int8_data.begin(), int8_data.end(), packed_data); - } else if (data_gen_type == DataGenType::ZEROS) { - std::fill(int8_data.begin(), int8_data.end(), 0); - } else { - std::fill(int8_data.begin(), int8_data.end(), 0); - } - break; - } - case vkapi::kByte: { - uint8_data.resize(num_elements); - if (data_gen_type == DataGenType::RANDOM) { - generate_random_uint8_data(uint8_data); - } else if (data_gen_type == DataGenType::RANDINT) { - generate_random_uint8_data(uint8_data); - } else if (data_gen_type == DataGenType::RANDINT8) { - generate_random_uint8_data(uint8_data, 0, 255); - } else if (data_gen_type == DataGenType::RANDINT4) { - generate_random_2xint4_data(uint8_data); - } else if (data_gen_type == DataGenType::ONES) { - std::fill(uint8_data.begin(), uint8_data.end(), 1); - } else if (data_gen_type == DataGenType::ZEROS) { - std::fill(uint8_data.begin(), uint8_data.end(), 0); - } else { - std::fill(uint8_data.begin(), uint8_data.end(), 0); - } - break; - } - default: - // Default to float - float_data.resize(num_elements); - if (data_gen_type == DataGenType::RANDOM) { - generate_random_float_data(float_data); - } else if (data_gen_type == DataGenType::RANDINT) { - generate_randint_float_data(float_data); - } else if (data_gen_type == DataGenType::ONES) { - generate_ones_data(float_data); - } else if (data_gen_type == DataGenType::ZEROS) { - generate_zeros_data(float_data); - } else { - generate_zeros_data(float_data); - } - break; - } -} - -int64_t ValueSpec::numel() const { - if (spec_type == SpecType::Int || spec_type == SpecType::Float || - spec_type == SpecType::Bool) { - return 1; - } else if (spec_type == SpecType::IntList) { - return sizes.empty() ? 0 : sizes[0]; - } else { // Tensor - int64_t total = 1; - for (int64_t size : sizes) { - total *= size; - } - return total; - } -} - -size_t ValueSpec::nbytes() const { - size_t element_size = 0; - switch (dtype) { - case vkapi::kFloat: - element_size = sizeof(float); - break; - case vkapi::kHalf: - element_size = sizeof(uint16_t); - break; - case vkapi::kInt: - element_size = sizeof(int32_t); - break; - case vkapi::kChar: - element_size = sizeof(int8_t); - break; - case vkapi::kByte: - element_size = sizeof(uint8_t); - break; - default: - element_size = sizeof(float); // Default fallback - break; - } - return numel() * element_size; -} - -std::string ValueSpec::to_string() const { - std::string result = "ValueSpec("; - - switch (spec_type) { - case SpecType::Tensor: - result += "type=Tensor, sizes=["; - break; - case SpecType::IntList: - result += "type=IntList, count="; - result += std::to_string(sizes.empty() ? 0 : sizes[0]); - result += ", data_gen="; - result += (data_gen_type == DataGenType::FIXED) ? "FIXED" : "RANDOM"; - result += ")"; - return result; - case SpecType::Int: - result += "type=Int, value="; - result += std::to_string(get_int_value()); - result += ", data_gen="; - result += (data_gen_type == DataGenType::FIXED) ? 
"FIXED" : "RANDOM"; - result += ")"; - return result; - case SpecType::Float: - result += "type=Float, value="; - result += std::to_string(get_float_value()); - result += ", data_gen="; - result += (data_gen_type == DataGenType::FIXED) ? "FIXED" : "RANDOM"; - result += ")"; - return result; - case SpecType::Bool: - result += "type=Bool, value="; - result += get_bool_value() ? "true" : "false"; - result += ", data_gen="; - result += (data_gen_type == DataGenType::FIXED) ? "FIXED" : "RANDOM"; - result += ")"; - return result; - } - - for (size_t i = 0; i < sizes.size(); ++i) { - result += std::to_string(sizes[i]); - if (i < sizes.size() - 1) - result += ", "; - } - result += "]"; - - if (spec_type == SpecType::Tensor) { - result += ", dtype="; - switch (dtype) { - case vkapi::kFloat: - result += "float"; - break; - case vkapi::kHalf: - result += "half"; - break; - case vkapi::kInt: - result += "int32"; - break; - case vkapi::kChar: - result += "int8"; - break; - case vkapi::kByte: - result += "uint8"; - break; - default: - result += "unknown"; - break; - } - - result += ", memory_layout="; - switch (memory_layout) { - case utils::kWidthPacked: - result += "WidthPacked"; - break; - case utils::kHeightPacked: - result += "HeightPacked"; - break; - case utils::kChannelsPacked: - result += "ChannelsPacked"; - break; - default: - result += "unknown"; - break; - } - - result += ", storage_type="; - switch (storage_type) { - case utils::kTexture3D: - result += "Texture3D"; - break; - case utils::kBuffer: - result += "Buffer"; - break; - default: - result += "unknown"; - break; - } - } - - result += ", data_gen="; - switch (data_gen_type) { - case DataGenType::FIXED: - result += "FIXED"; - break; - case DataGenType::RANDOM: - result += "RANDOM"; - break; - case DataGenType::RANDINT: - result += "RANDINT"; - break; - case DataGenType::RANDINT8: - result += "RANDINT8"; - break; - case DataGenType::RANDINT4: - result += "RANDINT4"; - break; - case DataGenType::ONES: - result += "ONES"; - break; - case DataGenType::ZEROS: - result += "ZEROS"; - break; - default: - result += "unknown"; - break; - } - result += ")"; - return result; -} - -// Additional ValueSpec methods -void ValueSpec::resize_data(size_t new_size) { - switch (dtype) { - case vkapi::kFloat: - float_data.resize(new_size); - break; - case vkapi::kHalf: - half_data.resize(new_size); - break; - case vkapi::kInt: - int32_data.resize(new_size); - break; - case vkapi::kChar: - int8_data.resize(new_size); - break; - case vkapi::kByte: - uint8_data.resize(new_size); - break; - default: - float_data.resize(new_size); - break; - } -} - -void* ValueSpec::get_mutable_data_ptr() { - switch (dtype) { - case vkapi::kFloat: - return float_data.data(); - case vkapi::kHalf: - return half_data.data(); - case vkapi::kInt: - return int32_data.data(); - case vkapi::kChar: - return int8_data.data(); - case vkapi::kByte: - return uint8_data.data(); - default: - return float_data.data(); - } -} - -float ValueSpec::get_element(size_t index) const { - if (index >= static_cast(numel())) { - return 0.0f; - } - - switch (dtype) { - case vkapi::kFloat: - return index < float_data.size() ? float_data[index] : 0.0f; - case vkapi::kHalf: - return index < half_data.size() ? (half_data[index] / 32767.0f) : 0.0f; - case vkapi::kInt: - return index < int32_data.size() ? static_cast(int32_data[index]) - : 0.0f; - case vkapi::kChar: - return index < int8_data.size() ? static_cast(int8_data[index]) - : 0.0f; - case vkapi::kByte: - return index < uint8_data.size() ? 
static_cast(uint8_data[index]) - : 0.0f; - default: - return 0.0f; - } -} - -const void* ValueSpec::get_data_ptr() const { - switch (dtype) { - case vkapi::kFloat: - return float_data.data(); - case vkapi::kHalf: - return half_data.data(); - case vkapi::kInt: - return int32_data.data(); - case vkapi::kChar: - return int8_data.data(); - case vkapi::kByte: - return uint8_data.data(); - default: - throw std::runtime_error("Unsupported data type for get_data_ptr"); - } -} - -void generate_random_float_data( - std::vector& data, - float min_val, - float max_val) { - std::mt19937 gen(get_seed()); - std::uniform_real_distribution dis(min_val, max_val); - for (auto& val : data) { - val = dis(gen); - } -} - -void generate_random_int_data( - std::vector& data, - int min_val, - int max_val) { - std::mt19937 gen(get_seed()); - std::uniform_int_distribution dis(min_val, max_val); - for (auto& val : data) { - val = dis(gen); - } -} - -void generate_randint_float_data( - std::vector& data, - int min_val, - int max_val) { - std::mt19937 gen(get_seed()); - std::uniform_int_distribution dis(min_val, max_val); - for (auto& val : data) { - val = static_cast(dis(gen)); - } -} - -void generate_randint_half_data( - std::vector& data, - int min_val, - int max_val) { - std::mt19937 gen(get_seed()); - std::uniform_int_distribution dis(min_val, max_val); - for (auto& val : data) { - val = static_cast(std::abs(dis(gen)) % 65536); - } -} - -void generate_ones_data(std::vector& data) { - std::fill(data.begin(), data.end(), 1.0f); -} - -void generate_random_int8_data( - std::vector& data, - int8_t min_val, - int8_t max_val) { - std::mt19937 gen(get_seed()); - std::uniform_int_distribution dis(min_val, max_val); - for (auto& val : data) { - val = static_cast(dis(gen)); - } -} - -void generate_random_uint8_data( - std::vector& data, - uint8_t min_val, - uint8_t max_val) { - std::mt19937 gen(get_seed()); - std::uniform_int_distribution dis(min_val, max_val); - for (auto& val : data) { - val = static_cast(dis(gen)); - } -} - -void generate_random_int4_data( - std::vector& data, - int8_t min_val, - int8_t max_val) { - std::mt19937 gen(get_seed()); - std::uniform_int_distribution dis(min_val, max_val); - for (auto& val : data) { - val = static_cast(dis(gen)); - } -} - -void generate_random_2xint4_data(std::vector& data) { - std::mt19937 gen(get_seed()); - std::uniform_int_distribution dis(-8, 7); // Signed 4-bit range - for (auto& val : data) { - // Generate two separate 4-bit values - int8_t lower_4bits = static_cast(dis(gen)) & 0x0F; - int8_t upper_4bits = static_cast(dis(gen)) & 0x0F; - // Pack them into a single 8-bit value - val = (upper_4bits << 4) | lower_4bits; - } -} - -void generate_random_2xint4_data(std::vector& data) { - std::mt19937 gen(get_seed()); - std::uniform_int_distribution dis(0, 15); // Unsigned 4-bit range - for (auto& val : data) { - // Generate two separate 4-bit values - uint8_t lower_4bits = static_cast(dis(gen)) & 0x0F; - uint8_t upper_4bits = static_cast(dis(gen)) & 0x0F; - // Pack them into a single 8-bit value - val = (upper_4bits << 4) | lower_4bits; - } -} - -void generate_zeros_data(std::vector& data) { - std::fill(data.begin(), data.end(), 0.0f); -} - -// Correctness checking against reference data -bool ValueSpec::validate_against_reference( - float abs_tolerance, - float rel_tolerance) const { - // Only validate float tensors as specified in requirements - if (dtype != vkapi::kFloat || !is_tensor()) { - return true; // Skip validation for non-float or non-tensor types - } - - const auto& 
computed_data = get_float_data(); - const auto& reference_data = get_ref_float_data(); - - // Skip validation if no reference data is available - if (reference_data.empty()) { - return true; - } - - // Check if sizes match - if (computed_data.size() != reference_data.size()) { - if (debugging()) { - std::cout << "Size mismatch: computed=" << computed_data.size() - << ", reference=" << reference_data.size() << std::endl; - } - return false; - } - - // Element-wise comparison with both absolute and relative tolerance - for (size_t i = 0; i < computed_data.size(); ++i) { - float diff = std::abs(computed_data[i] - reference_data[i]); - float abs_ref = std::abs(reference_data[i]); - - // Check if either absolute or relative tolerance condition is satisfied - bool abs_tolerance_ok = diff <= abs_tolerance; - bool rel_tolerance_ok = diff <= rel_tolerance * abs_ref; - - if (!abs_tolerance_ok && !rel_tolerance_ok) { - std::cout << "Mismatch at element " << i - << ": computed=" << computed_data[i] - << ", reference=" << reference_data[i] << ", diff=" << diff - << ", abs_tolerance=" << abs_tolerance - << ", rel_tolerance=" << rel_tolerance - << ", rel_threshold=" << (rel_tolerance * abs_ref) << std::endl; - return false; - } - } - - if (debugging()) { - std::cout << "Correctness validation PASSED" << std::endl; - } - return true; -} - -// Helper function to collect GPU timing from querypool -float collect_gpu_timing_us(ComputeGraph& graph) { - graph.context()->querypool().extract_results(); - const auto results = graph.context()->querypool().get_shader_timestamp_data(); - if (!results.empty()) { - // Sum durations of all shaders that don't contain nchw_to or to_nchw - float total_duration_us = 0.0f; - for (const auto& shader_result : results) { - if (shader_result.kernel_name.find("nchw_to") == std::string::npos && - shader_result.kernel_name.find("to_nchw") == std::string::npos) { - // Calculate duration from start and end times, convert from ns to μs - uint64_t duration_ns = - shader_result.end_time_ns - shader_result.start_time_ns; - total_duration_us += static_cast(duration_ns) / 1000.0f; - } - } - return total_duration_us; - } - return 0.0f; -} - -// BenchmarkResult implementation -void BenchmarkResult::add_iter_timing(float time_us) { - iter_timings.push_back(time_us); -} - -float BenchmarkResult::get_avg_time_us() const { - if (iter_timings.empty()) { - return 0.0f; - } - - float sum = 0.0f; - for (float timing : iter_timings) { - sum += timing; - } - return sum / iter_timings.size(); -} - -float BenchmarkResult::get_min_time_us() const { - if (iter_timings.empty()) { - return 0.0f; - } - - return *std::min_element(iter_timings.begin(), iter_timings.end()); -} - -float BenchmarkResult::get_max_time_us() const { - if (iter_timings.empty()) { - return 0.0f; - } - - return *std::max_element(iter_timings.begin(), iter_timings.end()); -} - -float BenchmarkResult::get_std_dev_us() const { - if (iter_timings.size() <= 1) { - return 0.0f; - } - - float mean = get_avg_time_us(); - float sum_sq_diff = 0.0f; - - for (float timing : iter_timings) { - float diff = timing - mean; - sum_sq_diff += diff * diff; - } - - return std::sqrt(sum_sq_diff / (iter_timings.size() - 1)); -} - -void BenchmarkResult::print_summary( - int case_number, - const std::string& size_info, - float total_gflops) const { - static constexpr int OPERATOR_NAME_WIDTH = 50; - static constexpr int KERNEL_NAME_WIDTH = 70; - static constexpr int SIZE_INFO_WIDTH = 20; - static constexpr int TIMING_WIDTH = 20; - static constexpr int 
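The validation loop above accepts an element if it satisfies either the absolute or the relative tolerance against the reference value. That predicate, pulled out as a standalone sketch:

```cpp
#include <cmath>

// An element passes if it meets EITHER tolerance, mirroring the check in
// ValueSpec::validate_against_reference above.
inline bool within_tolerance(
    float computed, float reference, float abs_tol, float rel_tol) {
  const float diff = std::fabs(computed - reference);
  return diff <= abs_tol || diff <= rel_tol * std::fabs(reference);
}
```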
GFLOPS_WIDTH = 20; - static constexpr int CORRECTNESS_WIDTH = 10; - - std::string correctness_str; - switch (correctness_status_) { - case CorrectnessStatus::SKIPPED: - correctness_str = "SKIPPED"; - break; - case CorrectnessStatus::PASSED: - correctness_str = "PASSED"; - break; - case CorrectnessStatus::FAILED: - correctness_str = "FAILED"; - break; - } - - std::cout << std::left << std::setw(OPERATOR_NAME_WIDTH) - << get_operator_name() << " " << std::left - << std::setw(KERNEL_NAME_WIDTH) << get_kernel_name() << std::right - << " " << std::setw(SIZE_INFO_WIDTH) << size_info - << std::setw(TIMING_WIDTH) << std::fixed << std::setprecision(3) - << get_avg_time_us() << " μs " << std::setw(GFLOPS_WIDTH) - << std::fixed << std::setprecision(3) << total_gflops << " GFLOP/s " - << std::setw(CORRECTNESS_WIDTH) << correctness_str << std::endl; -} - -// TestResult implementation -void TestResult::add_result(const BenchmarkResult& result) { - results_.push_back(result); -} - -void TestResult::add_result(BenchmarkResult&& result) { - results_.push_back(std::move(result)); -} - -void TestResult::print_summary() const { - static constexpr int CASE_WIDTH = 80; - static constexpr int KERNEL_NAME_WIDTH = 20; - static constexpr int TIMING_WIDTH = 12; - static constexpr int PASS_WIDTH = 8; - - if (results_.empty()) { - std::cout << "No results to display" << std::endl; - return; - } - - std::cout << "\n=== " << operation_name_ - << " Performance Summary ===" << std::endl; - print_separator(); - - std::cout << std::left << std::setw(CASE_WIDTH) << "Case" << std::left - << std::setw(KERNEL_NAME_WIDTH) << "Kernel Name" << std::left - << std::setw(TIMING_WIDTH) << "Avg (μs)" << std::left - << std::setw(TIMING_WIDTH) << "Min (μs)" << std::left - << std::setw(TIMING_WIDTH) << "Max (μs)" << std::left - << std::setw(TIMING_WIDTH) << "Std Dev" << std::left - << std::setw(PASS_WIDTH) << "Pass" << std::endl; - print_separator(); - - for (size_t i = 0; i < results_.size(); ++i) { - const auto& result = results_[i]; - bool vulkan_execute_succeeded = - result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; - std::cout << std::left << std::setw(CASE_WIDTH) << i + 1 << std::left - << std::setw(KERNEL_NAME_WIDTH) - << result.get_kernel_name().substr(0, KERNEL_NAME_WIDTH - 1) - << std::left << std::setw(TIMING_WIDTH) << std::fixed - << std::setprecision(3) << result.get_avg_time_us() << std::left - << std::setw(TIMING_WIDTH) << std::fixed << std::setprecision(3) - << result.get_min_time_us() << std::left - << std::setw(TIMING_WIDTH) << std::fixed << std::setprecision(3) - << result.get_max_time_us() << std::left - << std::setw(TIMING_WIDTH) << std::fixed << std::setprecision(3) - << result.get_std_dev_us() << std::left << std::setw(PASS_WIDTH) - << (vulkan_execute_succeeded ? "✓" : "✗") << std::endl; - } - - print_separator(); - std::cout << "Total cases: " << results_.size() - << ", Passed: " << get_passed_count() - << ", Failed: " << get_failed_count() << std::endl; - std::cout << "Overall GFLOP/s: " << std::fixed << std::setprecision(3) - << gflops_ << std::endl; - std::cout << "Overall correctness: " - << (correctness_passed_ ? 
"PASSED" : "FAILED") << std::endl; -} - -void TestResult::print_detailed_results() const { - if (results_.empty()) { - std::cout << "No results to display" << std::endl; - return; - } - - std::cout << "\n=== " << operation_name_ - << " Detailed Results ===" << std::endl; - - for (size_t i = 0; i < results_.size(); ++i) { - const auto& result = results_[i]; - bool vulkan_execute_succeeded = - result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; - std::cout << "\nCase " << i + 1 << ": " << result.get_kernel_name() - << std::endl; - std::cout << " Iterations: " << result.get_num_iterations() << std::endl; - std::cout << " Average: " << std::fixed << std::setprecision(3) - << result.get_avg_time_us() << " μs" << std::endl; - std::cout << " Min: " << std::fixed << std::setprecision(3) - << result.get_min_time_us() << " μs" << std::endl; - std::cout << " Max: " << std::fixed << std::setprecision(3) - << result.get_max_time_us() << " μs" << std::endl; - std::cout << " Std Dev: " << std::fixed << std::setprecision(3) - << result.get_std_dev_us() << " μs" << std::endl; - std::cout << " Correctness: " - << (vulkan_execute_succeeded ? "PASSED" : "FAILED") << std::endl; - - if (result.get_num_iterations() > 0) { - std::cout << " Individual timings (μs): "; - const auto& timings = result.get_iter_timings(); - for (size_t j = 0; j < std::min(size_t(10), timings.size()); ++j) { - std::cout << std::fixed << std::setprecision(1) << timings[j]; - if (j < std::min(size_t(10), timings.size()) - 1) - std::cout << ", "; - } - if (timings.size() > 10) { - std::cout << " ... (" << (timings.size() - 10) << " more)"; - } - std::cout << std::endl; - } - } - - std::cout << "\nOverall Results:" << std::endl; - std::cout << " Total GFLOP/s: " << std::fixed << std::setprecision(3) - << gflops_ << std::endl; - std::cout << " Overall correctness: " - << (correctness_passed_ ? 
"PASSED" : "FAILED") << std::endl; -} - -void TestResult::print_statistics() const { - if (results_.empty()) { - std::cout << "No results to display statistics for" << std::endl; - return; - } - - std::cout << "\n=== " << operation_name_ << " Statistics ===" << std::endl; - std::cout << "Total test cases: " << results_.size() << std::endl; - std::cout << "Passed: " << get_passed_count() << std::endl; - std::cout << "Failed: " << get_failed_count() << std::endl; - std::cout << "Success rate: " << std::fixed << std::setprecision(1) - << (100.0f * get_passed_count() / results_.size()) << "%" - << std::endl; - - if (get_passed_count() > 0) { - std::cout << "Total average time: " << std::fixed << std::setprecision(3) - << get_total_avg_time_us() << " μs" << std::endl; - std::cout << "Total GFLOP/s: " << std::fixed << std::setprecision(3) - << get_total_gflops() << std::endl; - - const auto* fastest = get_fastest_result(); - const auto* slowest = get_slowest_result(); - const auto* highest_gflops = get_highest_gflops_result(); - - if (fastest) { - std::cout << "Fastest case: " << fastest->get_kernel_name() << " (" - << std::fixed << std::setprecision(3) - << fastest->get_avg_time_us() << " μs)" << std::endl; - } - - if (slowest) { - std::cout << "Slowest case: " << slowest->get_kernel_name() << " (" - << std::fixed << std::setprecision(3) - << slowest->get_avg_time_us() << " μs)" << std::endl; - } - - if (highest_gflops) { - std::cout << "Best performing case: " << highest_gflops->get_kernel_name() - << " (" << std::fixed << std::setprecision(3) - << highest_gflops->get_avg_time_us() << " μs)" << std::endl; - } - } -} - -void TestResult::print_brief_summary() const { - print_separator(); - std::cout << "Summary Statistics:" << std::endl; - - if (get_passed_count() > 0) { - std::cout << "Average execution time: " << std::fixed - << std::setprecision(3) << get_total_avg_time_us() << " μs" - << std::endl; - std::cout << "Total throughput: " << std::fixed << std::setprecision(3) - << get_gflops() << " GFLOP/s" << std::endl; - std::cout << "Successful test cases: " << get_passed_count() << "/" - << size() << std::endl; - std::cout << "Overall correctness: " - << (get_correctness_passed() ? "PASSED" : "FAILED") << std::endl; - } else { - std::cout << "No successful test cases to report" << std::endl; - } -} - -float TestResult::get_total_avg_time_us() const { - if (results_.empty()) { - return 0.0f; - } - - float sum = 0.0f; - size_t count = 0; - - for (const auto& result : results_) { - bool vulkan_execute_succeeded = - result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; - if (vulkan_execute_succeeded) { - sum += result.get_avg_time_us(); - count++; - } - } - - return count > 0 ? 
sum / count : 0.0f; -} - -float TestResult::get_total_gflops() const { - return gflops_; -} - -size_t TestResult::get_passed_count() const { - size_t count = 0; - for (const auto& result : results_) { - bool vulkan_execute_succeeded = - result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; - if (vulkan_execute_succeeded) { - count++; - } - } - return count; -} - -size_t TestResult::get_failed_count() const { - return results_.size() - get_passed_count(); -} - -const BenchmarkResult* TestResult::get_fastest_result() const { - const BenchmarkResult* fastest = nullptr; - - for (const auto& result : results_) { - bool vulkan_execute_succeeded = - result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; - if (vulkan_execute_succeeded) { - if (!fastest || result.get_avg_time_us() < fastest->get_avg_time_us()) { - fastest = &result; - } - } - } - - return fastest; -} - -const BenchmarkResult* TestResult::get_slowest_result() const { - const BenchmarkResult* slowest = nullptr; - - for (const auto& result : results_) { - bool vulkan_execute_succeeded = - result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; - if (vulkan_execute_succeeded) { - if (!slowest || result.get_avg_time_us() > slowest->get_avg_time_us()) { - slowest = &result; - } - } - } - - return slowest; -} - -const BenchmarkResult* TestResult::get_highest_gflops_result() const { - // Since GFLOPS is now a TestResult-level metric rather than per-case, - // this method returns the fastest result as a proxy for highest performance - return get_fastest_result(); -} - -// Default FLOP calculation function (assumes 1 FLOP per element) -int64_t default_flop_calculator(const TestCase& test_case) { - // Calculate total elements from the first input tensor - int64_t total_elements = 1; - if (!test_case.empty() && test_case.num_inputs() > 0 && - test_case.inputs()[0].is_tensor()) { - const auto& sizes = test_case.inputs()[0].get_tensor_sizes(); - for (int64_t size : sizes) { - total_elements *= size; - } - } - - // Assume 1 FLOP per element for basic operations - return total_elements; -} - -ComputeGraph setup_compute_graph(TestCase& test_case, std::string op_name) { - GraphConfig config; - config.enable_querypool = true; - ComputeGraph graph(config); - - std::vector input_values; - - // Process input ValueSpecs - for (size_t i = 0; i < test_case.num_inputs(); ++i) { - const ValueSpec& input_spec = test_case.inputs()[i]; - - if (input_spec.is_none()) { - input_values.push_back(graph.add_none()); - } else if (input_spec.is_float()) { - ValueRef input_value = - graph.add_scalar(static_cast(input_spec.get_float_value())); - input_values.push_back(input_value); - } else if (input_spec.is_int()) { - ValueRef input_value = - graph.add_scalar(static_cast(input_spec.get_int_value())); - input_values.push_back(input_value); - } else if (input_spec.is_bool()) { - ValueRef input_value = graph.add_scalar(input_spec.get_bool_value()); - input_values.push_back(input_value); - } else if (input_spec.is_int_list()) { - // Convert int32_t list to int64_t list for ComputeGraph - const auto& int32_list = input_spec.get_int_list(); - std::vector int64_list; - int64_list.reserve(int32_list.size()); - for (int32_t val : int32_list) { - int64_list.push_back(static_cast(val)); - } - ValueRef input_value = graph.add_scalar_list(std::move(int64_list)); - input_values.push_back(input_value); - } else if (input_spec.is_constant()) { - ValueRef input_value = graph.add_tensorref( - input_spec.get_tensor_sizes(), - input_spec.dtype, - 
input_spec.get_data_ptr()); - input_values.push_back(input_value); - } else { - IOValueRef input_io = graph.add_input_tensor( - input_spec.get_tensor_sizes(), - input_spec.dtype, - input_spec.storage_type, - input_spec.memory_layout); - input_values.push_back(input_io.value); - } - } - - std::vector output_values; - - // Process output ValueSpecs - for (size_t i = 0; i < test_case.num_outputs(); ++i) { - const ValueSpec& output_spec = test_case.outputs()[i]; - - if (!output_spec.is_tensor()) { - throw std::runtime_error("All output specifications must be tensors"); - } - - // Create output tensor - ValueRef output_value = graph.add_tensor( - output_spec.get_tensor_sizes(), - output_spec.dtype, - output_spec.storage_type, - output_spec.memory_layout); - - output_values.push_back(output_value); - } - - // Get the operator function and call it - auto opFn = VK_GET_OP_FN(op_name); - - // Create arguments vector for the operator function - std::vector op_args = input_values; - op_args.insert(op_args.end(), output_values.begin(), output_values.end()); - - opFn(graph, op_args); - - for (size_t i = 0; i < output_values.size(); ++i) { - graph.set_output_value(output_values[i]); - } - return graph; -} - -// Test execution utilities -BenchmarkResult -execute_test_case(TestCase& test_case, int warmup_runs, int benchmark_runs) { - BenchmarkResult result( - test_case.name().empty() ? "unnamed_test_case" : test_case.name()); - - // Initialize querypool if using GPU timestamps - if (use_gpu_timestamps()) { - api::context()->initialize_querypool(); - } - - // Create the compute graph for this test case using setup_compute_graph - ComputeGraph graph = - setup_compute_graph(test_case, test_case.operator_name()); - - // Prepare the graph - graph.prepare(); - graph.prepack(); - - // Copy input data into the graph's staging buffers - for (size_t i = 0; i < test_case.num_inputs(); ++i) { - const ValueSpec& input_spec = test_case.inputs()[i]; - if (input_spec.is_tensor() && i < graph.inputs().size()) { - // Skip copying data for constant tensors - if (input_spec.is_constant()) { - continue; - } - - const auto& input_ref = graph.inputs()[i]; - - // Get the appropriate data based on dtype - const void* data_ptr = nullptr; - size_t data_numel = input_spec.numel(); - - switch (input_spec.dtype) { - case vkapi::kFloat: - data_ptr = input_spec.get_float_data().data(); - break; - case vkapi::kHalf: - data_ptr = input_spec.get_half_data().data(); - break; - case vkapi::kInt: - data_ptr = input_spec.get_int32_data().data(); - break; - case vkapi::kChar: - data_ptr = input_spec.get_int8_data().data(); - break; - case vkapi::kByte: - data_ptr = input_spec.get_uint8_data().data(); - break; - default: - throw std::runtime_error("Unsupported data type for input tensor"); - } - - // Copy data into staging buffer - graph.copy_into_staging(input_ref.staging, data_ptr, data_numel); - } - } - - // Warmup runs - for (int run = 0; run < warmup_runs; ++run) { - graph.execute(); - } - - // Benchmark runs - collect individual iteration timings - float total_cpu_time_us = 0.0f; - float total_gpu_time_us = 0.0f; - - for (int run = 0; run < benchmark_runs; ++run) { - // Measure CPU time for each execute() call - auto cpu_start = std::chrono::high_resolution_clock::now(); - graph.execute(); - auto cpu_end = std::chrono::high_resolution_clock::now(); - - auto cpu_duration = std::chrono::duration_cast( - cpu_end - cpu_start); - float cpu_time_us = static_cast(cpu_duration.count()); - total_cpu_time_us += cpu_time_us; - - // Collect GPU timing 
using helper function - float gpu_time_us = collect_gpu_timing_us(graph); - total_gpu_time_us += gpu_time_us; - - // Add the appropriate timing based on the flag - float iter_time_us = use_gpu_timestamps() ? gpu_time_us : cpu_time_us; - result.add_iter_timing(iter_time_us); - } - - // Calculate averages for display - float avg_cpu_time_us = total_cpu_time_us / benchmark_runs; - float avg_gpu_time_us = total_gpu_time_us / benchmark_runs; - - // Print both timings if latency printing is enabled - if (print_latencies()) { - if (use_gpu_timestamps()) { - graph.context()->querypool().print_results(); - } - std::cout << " CPU timing: " << std::fixed << std::setprecision(3) - << avg_cpu_time_us << " μs" << std::endl; - std::cout << " GPU timing: " << std::fixed << std::setprecision(3) - << avg_gpu_time_us << " μs" << std::endl; - std::cout << " Using " << (use_gpu_timestamps() ? "GPU" : "CPU") - << " timing for result" << std::endl; - } - - // Copy output data from the graph's staging buffers - for (size_t i = 0; i < test_case.num_outputs(); ++i) { - ValueSpec& output_spec = test_case.outputs()[i]; - - if (output_spec.is_tensor() && i < graph.outputs().size()) { - const auto& output_ref = graph.outputs()[i]; - - // Ensure output data vector is properly sized - size_t data_numel = output_spec.numel(); - output_spec.resize_data(data_numel); - - // Get mutable data pointer for the output - void* data_ptr = output_spec.get_mutable_data_ptr(); - - if (data_ptr != nullptr) { - // Copy data from staging buffer to output spec - graph.copy_from_staging(output_ref.staging, data_ptr, data_numel); - } - - // Print output tensor data if output printing is enabled - if (print_output()) { - std::string output_name = "Output[" + std::to_string(i) + "]"; - print_valuespec_data(output_spec, output_name); - } - } - } - - return result; -} - -TestResult execute_test_cases( - std::function()> test_case_generator, - FlopCalculatorFunc flop_calculator, - const std::string& operation_name, - int warmup_runs, - int benchmark_runs, - ReferenceComputeFunc reference_compute_func) { - TestResult results(operation_name); - - // Generate all test cases - std::vector test_cases = test_case_generator(); - - std::cout << "Executing " << test_cases.size() << " test cases for " - << operation_name << std::endl; - print_separator(); - - bool any_correctness_failed = false; - float total_gflops = 0.0f; - - for (size_t i = 0; i < test_cases.size(); ++i) { - TestCase& test_case = test_cases[i]; - - // Compute reference data if reference function is provided - bool skipped_reference_fn = true; - if (reference_compute_func) { - try { - reference_compute_func(test_case); - skipped_reference_fn = false; - } catch (const std::invalid_argument& e) { - if (debugging()) { - std::cout << "Compute reference skipped: " << e.what() << std::endl; - } - } - } - - // Execute single test case - BenchmarkResult result; - bool shader_not_supported = false; - try { - result = execute_test_case(test_case, warmup_runs, benchmark_runs); - result.set_operator_name(test_case.operator_name()); - } catch (const vkcompute::vkapi::ShaderNotSupportedError& e) { - result = BenchmarkResult( - test_case.name().empty() ? 
"unnamed_test_case" : test_case.name(), - test_case.operator_name()); - shader_not_supported = true; - } - - // Determine if this test case passed (has valid timing data) - bool vulkan_execute_succeeded = - result.get_num_iterations() > 0 && result.get_avg_time_us() > 0.0f; - - if (shader_not_supported) { - result.set_correctness_status(CorrectnessStatus::SKIPPED); - } else if (!vulkan_execute_succeeded) { - result.set_correctness_status(CorrectnessStatus::FAILED); - } else if (skipped_reference_fn) { - result.set_correctness_status(CorrectnessStatus::SKIPPED); - } else { - // Reference function provided and succeeded - validate outputs - bool correctness_passed = true; - - for (size_t output_idx = 0; output_idx < test_case.num_outputs(); - ++output_idx) { - const ValueSpec& output_spec = test_case.outputs()[output_idx]; - - if (!output_spec.validate_against_reference( - test_case.get_abs_tolerance(), test_case.get_rel_tolerance())) { - correctness_passed = false; - std::cout << " Correctness validation FAILED for test " - << result.get_kernel_name() << std::endl; - print_valuespec_data(output_spec, "vulkan output"); - print_valuespec_data(output_spec, "ref output", true); - - throw std::runtime_error("Correctness validation failed"); - } - } - - if (correctness_passed) { - result.set_correctness_status(CorrectnessStatus::PASSED); - } else { - any_correctness_failed = true; - result.set_correctness_status(CorrectnessStatus::FAILED); - } - } - - // Calculate GFLOPS for this test case using the provided FLOP calculator - float case_gflops = 0.0f; - if (vulkan_execute_succeeded) { - // Use the provided FLOP calculator to get total FLOPs for this test case - int64_t total_flops = flop_calculator(test_case); - float flops = static_cast(total_flops); - float avg_time_us = result.get_avg_time_us(); - if (avg_time_us > 0.0f && total_flops > 0) { - case_gflops = (flops / 1e9f) / (avg_time_us / 1e6f); - } - - total_gflops += case_gflops; - } else { - case_gflops = -1.0f; // Indicate failure - } - - // Calculate tensor info for display - std::string size_info = "["; - if (!test_case.empty() && test_case.num_inputs() > 0 && - test_case.inputs()[0].is_tensor()) { - const auto& sizes = test_case.inputs()[0].get_tensor_sizes(); - for (size_t j = 0; j < sizes.size(); ++j) { - size_info += std::to_string(sizes[j]); - if (j < sizes.size() - 1) - size_info += "x"; - } - } - size_info += "]"; - - // Print progress using the BenchmarkResult member function - result.print_summary(i + 1, size_info, case_gflops); - - // Add result to collection - results.add_result(std::move(result)); - } - - // Set the overall results on the TestResult - results.set_correctness_passed(!any_correctness_failed); - results.set_gflops(total_gflops); - - print_separator(); - std::cout << "Completed " << results.size() << " test cases" << std::endl; - - return results; -} - -// Convenience overload that uses the default FLOP calculator -TestResult execute_test_cases( - std::function()> test_case_generator, - const std::string& operation_name, - int warmup_runs, - int benchmark_runs, - ReferenceComputeFunc reference_compute_func) { - return execute_test_cases( - test_case_generator, - default_flop_calculator, - operation_name, - warmup_runs, - benchmark_runs, - reference_compute_func); -} - -// Utility functions for printing -void print_performance_header() { - std::cout << "\n=== Compute Shader Performance Benchmark ===" << std::endl; -} - -void print_separator() { - std::cout << std::string(70, '-') << std::endl; -} - -// ValueSpec 
data printing utilities -void print_valuespec_data( - const ValueSpec& spec, - const std::string& name, - const bool print_ref_data, - size_t max_elements, - int precision) { - std::cout << "\n" << name << " Data:" << std::endl; - std::cout << " Type: " << spec.to_string() << std::endl; - - if (!spec.is_tensor()) { - if (spec.is_int()) { - std::cout << " Value: " << spec.get_int_value() << std::endl; - } else if (spec.is_int_list()) { - const auto& int_list = spec.get_int_list(); - std::cout << " Values: ["; - size_t print_count = std::min(max_elements, int_list.size()); - for (size_t i = 0; i < print_count; ++i) { - std::cout << int_list[i]; - if (i < print_count - 1) - std::cout << ", "; - } - if (int_list.size() > max_elements) { - std::cout << ", ... (" << (int_list.size() - max_elements) << " more)"; - } - std::cout << "]" << std::endl; - } - return; - } - - // Print tensor data - size_t total_elements = spec.numel(); - size_t print_count = std::min(max_elements, total_elements); - - std::cout << " Total elements: " << total_elements << std::endl; - std::cout << " Data (first " << print_count << " elements): ["; - - std::cout << std::fixed << std::setprecision(precision); - - switch (spec.dtype) { - case vkapi::kFloat: { - auto data = spec.get_float_data().data(); - if (print_ref_data) { - data = spec.get_ref_float_data().data(); - } - for (size_t i = 0; i < print_count; ++i) { - std::cout << data[i]; - if (i < print_count - 1) - std::cout << ", "; - } - break; - } - case vkapi::kHalf: { - const auto& data = spec.get_half_data(); - for (size_t i = 0; i < print_count; ++i) { - // Convert uint16_t back to float for display - float value = data[i] / 32767.0f; - std::cout << value; - if (i < print_count - 1) - std::cout << ", "; - } - break; - } - case vkapi::kInt: { - const auto& data = spec.get_int32_data(); - for (size_t i = 0; i < print_count; ++i) { - std::cout << data[i]; - if (i < print_count - 1) - std::cout << ", "; - } - break; - } - case vkapi::kChar: { - const auto& data = spec.get_int8_data(); - if (spec.is_int4()) { - // Print each 4-bit value individually - size_t element_count = 0; - for (size_t i = 0; i < data.size() && element_count < print_count; - ++i) { - // Extract lower 4 bits (signed) - int8_t lower_4bits = data[i] & 0x0F; - if (lower_4bits > 7) - lower_4bits -= 16; // Convert to signed - std::cout << static_cast(lower_4bits); - element_count++; - - if (element_count < print_count) { - std::cout << ", "; - // Extract upper 4 bits (signed) - int8_t upper_4bits = (data[i] >> 4) & 0x0F; - if (upper_4bits > 7) - upper_4bits -= 16; // Convert to signed - std::cout << static_cast(upper_4bits); - element_count++; - - if (element_count < print_count) - std::cout << ", "; - } - } - } else { - for (size_t i = 0; i < print_count; ++i) { - std::cout << static_cast(data[i]); - if (i < print_count - 1) - std::cout << ", "; - } - } - break; - } - case vkapi::kByte: { - const auto& data = spec.get_uint8_data(); - if (spec.is_int4()) { - // Print each 4-bit value individually - size_t element_count = 0; - for (size_t i = 0; i < data.size() && element_count < print_count; - ++i) { - // Extract lower 4 bits - uint8_t lower_4bits = data[i] & 0x0F; - std::cout << static_cast(lower_4bits); - element_count++; - - if (element_count < print_count) { - std::cout << ", "; - // Extract upper 4 bits - uint8_t upper_4bits = (data[i] >> 4) & 0x0F; - std::cout << static_cast(upper_4bits); - element_count++; - - if (element_count < print_count) - std::cout << ", "; - } - } - } else { - for (size_t 
i = 0; i < print_count; ++i) { - std::cout << static_cast(data[i]); - if (i < print_count - 1) - std::cout << ", "; - } - } - break; - } - default: - std::cout << "unsupported data type"; - break; - } - - if (total_elements > max_elements) { - std::cout << ", ... (" << (total_elements - max_elements) << " more)"; - } - std::cout << "]" << std::endl; - - // Print some statistics for tensor data - if (total_elements > 0) { - float min_val = 0.0f, max_val = 0.0f, sum = 0.0f; - bool first = true; - - for (size_t i = 0; i < total_elements; ++i) { - float val = spec.get_element(i); - if (first) { - min_val = max_val = val; - first = false; - } else { - min_val = std::min(min_val, val); - max_val = std::max(max_val, val); - } - sum += val; - } - - float mean = sum / total_elements; - std::cout << " Statistics: min=" << std::setprecision(precision) << min_val - << ", max=" << max_val << ", mean=" << mean << ", sum=" << sum - << std::endl; - } -} - -ValueRef quantized_weights_canvas( - ComputeGraph& graph, - const ValueRef weight_ref) { - const auto original_sizes = graph.sizes_of(weight_ref); - - // Get the 2 highest values of original_sizes - std::vector sorted_sizes = original_sizes; - std::sort(sorted_sizes.begin(), sorted_sizes.end(), std::greater()); - int64_t largest1 = sorted_sizes.size() > 0 ? sorted_sizes[0] : 0; - int64_t largest2 = sorted_sizes.size() > 1 ? sorted_sizes[1] : 0; - - std::vector final_sizes = {1, largest1, largest1}; - - // Debug logging if debugging flag is set - if (debugging()) { - std::cout << "Debug: Creating quantized weights canvas tensor" << std::endl; - std::cout << "Debug: Original sizes: ["; - for (size_t i = 0; i < original_sizes.size(); ++i) { - std::cout << original_sizes[i]; - if (i < original_sizes.size() - 1) - std::cout << ", "; - } - std::cout << "]" << std::endl; - std::cout << "Debug: Canvas sizes: ["; - for (size_t i = 0; i < final_sizes.size(); ++i) { - std::cout << final_sizes[i]; - if (i < final_sizes.size() - 1) - std::cout << ", "; - } - std::cout << "]" << std::endl; - } - - ValueRef packed_weight = graph.add_tensor( - final_sizes, vkapi::kInt, utils::kTexture3D, utils::kWidthPacked); - - utils::uvec3 global_wg_size{ - utils::div_up(utils::safe_downcast(largest1), uint32_t(4)), - utils::safe_downcast(largest2), - utils::safe_downcast(std::min(largest1, int64_t(2048)))}; - - std::string kernel_name = "packed_int32_canvas"; - add_storage_type_suffix(kernel_name, graph.storage_type_of(packed_weight)); - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - VK_KERNEL_FROM_STR(kernel_name), - graph.create_global_wg_size(packed_weight), - graph.create_local_wg_size(packed_weight), - weight_ref, - packed_weight, - // UBOs - {graph.logical_limits_ubo(packed_weight)}, - // Specialization constants - {}, - // Push Constants - {})); - - return packed_weight; -} - -ValueRef float_tensor_canvas(ComputeGraph& graph, const ValueRef weight_ref) { - const auto original_sizes = graph.sizes_of(weight_ref); - - // Get the 2 highest values of original_sizes - std::vector sorted_sizes = original_sizes; - std::sort(sorted_sizes.begin(), sorted_sizes.end(), std::greater()); - int64_t largest1 = sorted_sizes.size() > 0 ? sorted_sizes[0] : 0; - int64_t largest2 = sorted_sizes.size() > 1 ? 
sorted_sizes[1] : 0; - - std::vector final_sizes = {1, largest1, largest1}; - - // Debug logging if debugging flag is set - if (debugging()) { - std::cout << "Debug: Creating float tensor canvas" << std::endl; - std::cout << "Debug: Original sizes: ["; - for (size_t i = 0; i < original_sizes.size(); ++i) { - std::cout << original_sizes[i]; - if (i < original_sizes.size() - 1) - std::cout << ", "; - } - std::cout << "]" << std::endl; - std::cout << "Debug: Canvas sizes: ["; - for (size_t i = 0; i < final_sizes.size(); ++i) { - std::cout << final_sizes[i]; - if (i < final_sizes.size() - 1) - std::cout << ", "; - } - std::cout << "]" << std::endl; - } - - ValueRef packed_weight = graph.add_tensor( - final_sizes, vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked); - - utils::uvec3 global_wg_size{ - utils::div_up(utils::safe_downcast(largest1), uint32_t(4)), - utils::safe_downcast(largest2), - utils::safe_downcast(std::min(largest1, int64_t(2048)))}; - - graph.prepack_nodes().emplace_back(new PrepackNode( - graph, - VK_KERNEL_FROM_STR("float_canvas"), - graph.create_global_wg_size(packed_weight), - graph.create_local_wg_size(packed_weight), - weight_ref, - packed_weight, - // UBOs - {graph.logical_limits_ubo(packed_weight)}, - // Specialization constants - {}, - // Push Constants - {})); - - return packed_weight; -} - -// Compute weight sums for quantized operations (linear and convolution) -void compute_weight_sums( - ValueSpec& weight_sums, - const ValueSpec& quantized_weight, - int64_t out_features, - int64_t elements_per_output_feature) { - auto& weight_sums_data = weight_sums.get_int32_data(); - auto& quantized_weight_data = quantized_weight.get_int8_data(); - - weight_sums_data.resize(out_features); - - // For each output feature, compute the sum of quantized weights - for (int64_t out_f = 0; out_f < out_features; ++out_f) { - int32_t sum = 0; - for (int64_t elem = 0; elem < elements_per_output_feature; ++elem) { - // Weight indexing depends on the layout: - // For linear: [out_features, in_features] -> out_f * - // elements_per_output_feature + elem For conv2d: [C_out, C_in * K_h * - // K_w] -> out_f * elements_per_output_feature + elem - int64_t weight_idx = out_f * elements_per_output_feature + elem; - sum += static_cast(quantized_weight_data[weight_idx]); - } - weight_sums_data[out_f] = sum; - } -} - -} // namespace prototyping -} // namespace vulkan -} // namespace executorch diff --git a/backends/vulkan/test/custom_ops/utils.h b/backends/vulkan/test/custom_ops/utils.h deleted file mode 100644 index 2440e225ef2..00000000000 --- a/backends/vulkan/test/custom_ops/utils.h +++ /dev/null @@ -1,661 +0,0 @@ -// Copyright (c) Meta Platforms, Inc. and affiliates. -// All rights reserved. -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
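As a sanity check on the compute_weight_sums helper above, here is a minimal standalone sketch with made-up sizes and values (a hypothetical 2x3 int8 linear weight, not data from this change); each output feature simply accumulates the signed sum of its quantized weights:

```
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical [out_features, in_features] = [2, 3] int8 weight matrix.
  const int64_t out_features = 2;
  const int64_t elements_per_output_feature = 3;
  const std::vector<int8_t> quantized_weight = {
      1, -2, 3, // output feature 0 -> 1 - 2 + 3 = 2
      4, 5, -6  // output feature 1 -> 4 + 5 - 6 = 3
  };

  // Same accumulation as compute_weight_sums: one int32 sum per output feature.
  std::vector<int32_t> weight_sums(out_features, 0);
  for (int64_t out_f = 0; out_f < out_features; ++out_f) {
    int32_t sum = 0;
    for (int64_t elem = 0; elem < elements_per_output_feature; ++elem) {
      sum += static_cast<int32_t>(
          quantized_weight[out_f * elements_per_output_feature + elem]);
    }
    weight_sums[out_f] = sum;
  }

  std::cout << weight_sums[0] << ", " << weight_sums[1] << std::endl; // prints 2, 3
  return 0;
}
```

These per-feature sums are the quantity that integer-accumulation kernels typically fold into the zero-point correction term, which is why the helper is shared by the quantized linear and convolution test cases.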
- -#pragma once - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace executorch { -namespace vulkan { -namespace prototyping { - -using namespace vkcompute; - -// -// Global configuration options -// - -bool print_output(); -void set_print_output(bool print_output); - -bool print_latencies(); -void set_print_latencies(bool print_latencies); - -bool use_gpu_timestamps(); -void set_use_gpu_timestamps(bool use_timestamps); - -bool debugging(); -void set_debugging(bool enable_debugging); - -// -// ValueSpec class -// - -enum class SpecType { Tensor, IntList, Int, Float, Bool }; - -// Data generation types -enum class DataGenType { - FIXED, - RANDOM, - RANDOM_SCALES, - RANDINT, - RANDINT8, - RANDINT4, - ONES, - ONES_INT4, - ZEROS -}; - -// Value specification struct -struct ValueSpec { - std::vector sizes; - vkapi::ScalarType dtype; - utils::GPUMemoryLayout memory_layout; - utils::StorageType storage_type; - SpecType spec_type; - DataGenType data_gen_type; - bool is_constant_tensor; - bool is_none_flag; - bool is_int4_tensor; - - std::vector float_data; - std::vector int32_data; - std::vector half_data; // Using uint16_t as substitute for half - std::vector int8_data; // For kChar (signed 8-bit) - std::vector uint8_data; // For kByte (unsigned 8-bit) - - std::vector ref_float_data; - std::vector ref_int32_data; - std::vector ref_half_data; - std::vector ref_int8_data; - std::vector ref_uint8_data; - - ValueSpec( - const std::vector& sizes, - vkapi::ScalarType dtype, - utils::StorageType storage_type = utils::kTexture3D, - utils::GPUMemoryLayout memory_layout = utils::kWidthPacked) - : sizes(sizes), - dtype(dtype), - memory_layout(memory_layout), - storage_type(storage_type), - spec_type(SpecType::Tensor), - data_gen_type(DataGenType::ZEROS), - is_constant_tensor(false), - is_none_flag(false), - is_int4_tensor(false) { - generate_tensor_data(); - } - - // Constructor for tensor with custom data generation type - ValueSpec( - const std::vector& sizes, - vkapi::ScalarType dtype, - utils::StorageType storage_type, - utils::GPUMemoryLayout memory_layout, - DataGenType data_gen_type) - : sizes(sizes), - dtype(dtype), - memory_layout(memory_layout), - storage_type(storage_type), - spec_type(SpecType::Tensor), - data_gen_type(data_gen_type), - is_constant_tensor(false), - is_none_flag(false), - is_int4_tensor(false) { - generate_tensor_data(); - } - - // Constructor for single int - ValueSpec(int32_t value) - : sizes({1}), - dtype(vkapi::kInt), - memory_layout(utils::kWidthPacked), - storage_type(utils::kTexture3D), - spec_type(SpecType::Int), - data_gen_type(DataGenType::FIXED), - is_constant_tensor(false), - is_none_flag(false), - is_int4_tensor(false) { - int32_data.push_back(value); - } - - // Constructor for single float - ValueSpec(float value) - : sizes({1}), - dtype(vkapi::kFloat), - memory_layout(utils::kWidthPacked), - storage_type(utils::kTexture3D), - spec_type(SpecType::Float), - data_gen_type(DataGenType::FIXED), - is_constant_tensor(false), - is_none_flag(false), - is_int4_tensor(false) { - float_data.push_back(value); - } - - // Constructor for single bool - ValueSpec(bool value) - : sizes({1}), - dtype(vkapi::kInt), - memory_layout(utils::kWidthPacked), - storage_type(utils::kTexture3D), - spec_type(SpecType::Bool), - data_gen_type(DataGenType::FIXED), - is_constant_tensor(false), - is_none_flag(false), - is_int4_tensor(false) { - int32_data.push_back(value ? 
1 : 0); - } - - // Constructor for int list - ValueSpec(const std::vector& values) - : sizes({static_cast(values.size())}), - dtype(vkapi::kInt), - memory_layout(utils::kWidthPacked), - storage_type(utils::kTexture3D), - spec_type(SpecType::IntList), - data_gen_type(DataGenType::FIXED), - is_constant_tensor(false), - is_none_flag(false), - is_int4_tensor(false), - int32_data(values) {} - - // Default constructor - ValueSpec() - : dtype(vkapi::kFloat), - memory_layout(utils::kWidthPacked), - storage_type(utils::kTexture3D), - spec_type(SpecType::Tensor), - data_gen_type(DataGenType::ZEROS), - is_constant_tensor(false), - is_none_flag(false), - is_int4_tensor(false) {} - - int64_t numel() const; - size_t nbytes() const; - std::string to_string() const; - - bool is_tensor() const { - return spec_type == SpecType::Tensor; - } - bool is_int_list() const { - return spec_type == SpecType::IntList; - } - bool is_int() const { - return spec_type == SpecType::Int; - } - bool is_float() const { - return spec_type == SpecType::Float; - } - bool is_bool() const { - return spec_type == SpecType::Bool; - } - - int32_t get_int_value() const { - return int32_data.empty() ? 0 : int32_data[0]; - } - float get_float_value() const { - return float_data.empty() ? 0.0f : float_data[0]; - } - bool get_bool_value() const { - return int32_data.empty() ? false : (int32_data[0] != 0); - } - const std::vector& get_int_list() const { - return int32_data; - } - const std::vector& get_tensor_sizes() const { - return sizes; - } - - const std::vector& get_float_data() const { - return float_data; - } - const std::vector& get_int32_data() const { - return int32_data; - } - const std::vector& get_half_data() const { - return half_data; - } - const std::vector& get_int8_data() const { - return int8_data; - } - const std::vector& get_uint8_data() const { - return uint8_data; - } - - std::vector& get_float_data() { - return float_data; - } - std::vector& get_int32_data() { - return int32_data; - } - std::vector& get_half_data() { - return half_data; - } - std::vector& get_int8_data() { - return int8_data; - } - std::vector& get_uint8_data() { - return uint8_data; - } - - const std::vector& get_ref_float_data() const { - return ref_float_data; - } - const std::vector& get_ref_int32_data() const { - return ref_int32_data; - } - const std::vector& get_ref_half_data() const { - return ref_half_data; - } - const std::vector& get_ref_int8_data() const { - return ref_int8_data; - } - const std::vector& get_ref_uint8_data() const { - return ref_uint8_data; - } - - std::vector& get_ref_float_data() { - return ref_float_data; - } - std::vector& get_ref_int32_data() { - return ref_int32_data; - } - std::vector& get_ref_half_data() { - return ref_half_data; - } - std::vector& get_ref_int8_data() { - return ref_int8_data; - } - std::vector& get_ref_uint8_data() { - return ref_uint8_data; - } - - void resize_data(size_t new_size); - void* get_mutable_data_ptr(); - float get_element(size_t index) const; - - // Set/get constant flag - bool is_constant() const { - return is_constant_tensor; - } - void set_constant(bool is_constant) { - is_constant_tensor = is_constant; - } - - // Set/get none flag - bool is_none() const { - return is_none_flag; - } - - void set_none(bool is_none) { - is_none_flag = is_none; - } - - // Set/get int4 flag - bool is_int4() const { - return is_int4_tensor; - } - void set_int4(bool is_int4) { - is_int4_tensor = is_int4; - } - - const void* get_data_ptr() const; - - // Correctness checking against reference data - // 
Returns true if computed data matches reference data within tolerance - // Only validates float tensors as specified in requirements - bool validate_against_reference( - float abs_tolerance = 2e-3f, - float rel_tolerance = 1e-3f) const; - - private: - void generate_tensor_data(); -}; - -// -// TestCase -// - -class TestCase { - public: - TestCase() : abs_tolerance_(2e-3f), rel_tolerance_(1e-3f) {} - TestCase(const std::string& name) - : name_(name), abs_tolerance_(2e-3f), rel_tolerance_(1e-3f) {} - - void set_name(const std::string& name) { - name_ = name; - } - const std::string& name() const { - return name_; - } - - void set_operator_name(const std::string& op_name) { - operator_name_ = op_name; - } - const std::string& operator_name() const { - return operator_name_; - } - - // Tolerance settings - void set_abs_tolerance(float abs_tolerance) { - abs_tolerance_ = abs_tolerance; - } - float get_abs_tolerance() const { - return abs_tolerance_; - } - - void set_rel_tolerance(float rel_tolerance) { - rel_tolerance_ = rel_tolerance; - } - float get_rel_tolerance() const { - return rel_tolerance_; - } - - void add_input_spec(const ValueSpec& spec) { - inputs_.push_back(spec); - } - - const std::vector& inputs() const { - return inputs_; - } - - std::vector& inputs() { - return inputs_; - } - - size_t num_inputs() const { - return inputs_.size(); - } - - void add_output_spec(const ValueSpec& spec) { - outputs_.push_back(spec); - } - - const std::vector& outputs() const { - return outputs_; - } - - std::vector& outputs() { - return outputs_; - } - - size_t num_outputs() const { - return outputs_.size(); - } - - bool empty() const { - return inputs_.empty() && outputs_.empty(); - } - void clear() { - inputs_.clear(); - outputs_.clear(); - name_.clear(); - operator_name_.clear(); - abs_tolerance_ = 2e-3f; - rel_tolerance_ = 1e-3f; - } - - private: - std::string name_; - std::string operator_name_; - std::vector inputs_; - std::vector outputs_; - float abs_tolerance_; - float rel_tolerance_; -}; - -// -// BenchmarkResult -// - -enum class CorrectnessStatus { - SKIPPED, // No reference function provided - PASSED, // Reference function provided and validation passed - FAILED // Reference function provided but validation failed -}; - -class BenchmarkResult { - public: - BenchmarkResult() : correctness_status_(CorrectnessStatus::SKIPPED) {} - - BenchmarkResult(const std::string& name) - : kernel_name(name), correctness_status_(CorrectnessStatus::SKIPPED) {} - - BenchmarkResult( - const std::string& kernel_name, - const std::string& operator_name) - : kernel_name(kernel_name), - operator_name(operator_name), - correctness_status_(CorrectnessStatus::SKIPPED) {} - - // Add timing for a single iteration - void add_iter_timing(float time_us); - - // Getters - const std::string& get_kernel_name() const { - return kernel_name; - } - const std::string& get_operator_name() const { - return operator_name; - } - float get_avg_time_us() const; - size_t get_num_iterations() const { - return iter_timings.size(); - } - const std::vector& get_iter_timings() const { - return iter_timings; - } - CorrectnessStatus get_correctness_status() const { - return correctness_status_; - } - - // Setters - void set_kernel_name(const std::string& name) { - kernel_name = name; - } - void set_operator_name(const std::string& name) { - operator_name = name; - } - void set_correctness_status(CorrectnessStatus status) { - correctness_status_ = status; - } - - // Statistics - float get_min_time_us() const; - float get_max_time_us() 
const; - float get_std_dev_us() const; - - // Clear all timings - void clear_timings() { - iter_timings.clear(); - } - - // Print progress for this benchmark result - void print_summary( - int case_number, - const std::string& size_info, - float total_gflops) const; - - private: - std::string kernel_name; - std::string operator_name; - std::vector - iter_timings; // Individual iteration timings in microseconds - CorrectnessStatus correctness_status_; -}; - -// Test result collection and processing -class TestResult { - public: - TestResult() : gflops_(0.0f), correctness_passed_(true) {} - TestResult(const std::string& operation_name) - : operation_name_(operation_name), - gflops_(0.0f), - correctness_passed_(true) {} - - // Add a benchmark result - void add_result(const BenchmarkResult& result); - void add_result(BenchmarkResult&& result); - - // Getters - const std::string& get_operation_name() const { - return operation_name_; - } - float get_gflops() const { - return gflops_; - } - bool get_correctness_passed() const { - return correctness_passed_; - } - size_t size() const { - return results_.size(); - } - bool empty() const { - return results_.empty(); - } - - // Setters - void set_gflops(float gflops_val) { - gflops_ = gflops_val; - } - void set_correctness_passed(bool passed) { - correctness_passed_ = passed; - } - - // Access results - const BenchmarkResult& operator[](size_t index) const { - return results_[index]; - } - BenchmarkResult& operator[](size_t index) { - return results_[index]; - } - const std::vector& get_results() const { - return results_; - } - - // Iterator support - std::vector::iterator begin() { - return results_.begin(); - } - std::vector::iterator end() { - return results_.end(); - } - std::vector::const_iterator begin() const { - return results_.begin(); - } - std::vector::const_iterator end() const { - return results_.end(); - } - - // Processing and analysis - void print_summary() const; - void print_detailed_results() const; - void print_statistics() const; - void print_brief_summary() const; - - // Get aggregate statistics - float get_total_avg_time_us() const; - float get_total_gflops() const; - size_t get_passed_count() const; - size_t get_failed_count() const; - - // Find best/worst performing results - const BenchmarkResult* get_fastest_result() const; - const BenchmarkResult* get_slowest_result() const; - const BenchmarkResult* get_highest_gflops_result() const; - - // Clear all results - void clear() { - results_.clear(); - } - - // Set operation name - void set_operation_name(const std::string& name) { - operation_name_ = name; - } - - private: - std::string operation_name_; - std::vector results_; - float gflops_; - bool correctness_passed_; -}; - -// -// Test case execution -// - -using FlopCalculatorFunc = std::function; - -// Default FLOP calculation function (assumes 1 FLOP per element) -int64_t default_flop_calculator(const TestCase& test_case); - -using ReferenceComputeFunc = std::function; - -BenchmarkResult execute_test_case( - TestCase& test_case, - int warmup_runs = 3, - int benchmark_runs = 10); - -TestResult execute_test_cases( - std::function()> test_case_generator, - FlopCalculatorFunc flop_calculator, - const std::string& operation_name = "Operation", - int warmup_runs = 3, - int benchmark_runs = 10, - ReferenceComputeFunc reference_compute_func = nullptr); - -TestResult execute_test_cases( - std::function()> test_case_generator, - const std::string& operation_name = "Operation", - int warmup_runs = 3, - int benchmark_runs = 10, - 
ReferenceComputeFunc reference_compute_func = nullptr); - -// -// Print utilities -// - -void print_performance_header(); -void print_separator(); - -void print_valuespec_data( - const ValueSpec& spec, - const std::string& name = "ValueSpec", - const bool print_ref_data = false, - size_t max_elements = 20, - int precision = 6); - -ValueRef quantized_weights_canvas( - ComputeGraph& graph, - const ValueRef weight_ref); - -ValueRef float_tensor_canvas(ComputeGraph& graph, const ValueRef weight_ref); - -// Compute weight sums for quantized operations (linear and convolution) -void compute_weight_sums( - ValueSpec& weight_sums, - const ValueSpec& quantized_weight, - int64_t out_features, - int64_t elements_per_output_feature); - -// Setup compute graph based on TestCase and operation name -ComputeGraph setup_compute_graph(TestCase& test_case, std::string op_name); - -} // namespace prototyping -} // namespace vulkan -} // namespace executorch diff --git a/backends/vulkan/test/glsl/all_shaders.yaml b/backends/vulkan/test/glsl/all_shaders.yaml deleted file mode 100644 index 4ef934eb105..00000000000 --- a/backends/vulkan/test/glsl/all_shaders.yaml +++ /dev/null @@ -1,68 +0,0 @@ -binary_op_nobroadcast__test: - parameter_names_with_default_values: - DTYPE: float - OPERATOR: X + Y - generate_variant_forall: - DTYPE: - - VALUE: "half" - SUFFIX: "half" - - VALUE: "float" - SUFFIX: "float" - shader_variants: - - NAME: binary_add_nobroadcast__test - OPERATOR: X + Y - - NAME: binary_sub_nobroadcast__test - OPERATOR: X - Y - - NAME: binary_mul_nobroadcast__test - OPERATOR: X * Y - - NAME: binary_div_nobroadcast__test - OPERATOR: X / Y - - NAME: binary_pow_nobroadcast__test - OPERATOR: pow(X, Y) - -fill_texture__test: - parameter_names_with_default_values: - DTYPE: float - NDIM: 3 - generate_variant_forall: - DTYPE: - - VALUE: "half" - SUFFIX: "half" - - VALUE: "float" - SUFFIX: "float" - shader_variants: - - NAME: fill_texture__test - -idx_fill_buffer: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - - VALUE: int8 - shader_variants: - - NAME: idx_fill_buffer - -idx_fill_texture: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: half - - VALUE: float - - VALUE: int32 - - VALUE: int8 - shader_variants: - - NAME: idx_fill_texture - -scalar_add_buffer: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DTYPE: - - VALUE: float - - VALUE: half - - VALUE: int8 - shader_variants: - - NAME: scalar_add_buffer diff --git a/backends/vulkan/test/glsl/binary_op_nobroadcast__test.glsl b/backends/vulkan/test/glsl/binary_op_nobroadcast__test.glsl deleted file mode 100644 index 7f72ac58972..00000000000 --- a/backends/vulkan/test/glsl/binary_op_nobroadcast__test.glsl +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
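Since utils.h above only declares the prototyping API, a hedged usage sketch may help readers; the include path, the operator name "etvk.add_prototype", the 64x64 shape, and the main() driver below are illustrative assumptions rather than code from this change:

```
// Hypothetical driver built on the benchmarking utilities declared above.
#include <cstdint>
#include <vector>

#include "utils.h" // path is an assumption for this sketch

using namespace executorch::vulkan::prototyping;

std::vector<TestCase> make_add_cases() {
  std::vector<TestCase> cases;

  std::vector<int64_t> sizes = {64, 64}; // placeholder shape
  TestCase tc("add_64x64_float");
  tc.set_operator_name("etvk.add_prototype"); // placeholder operator name
  tc.add_input_spec(ValueSpec(sizes, vkapi::kFloat));
  tc.add_input_spec(ValueSpec(sizes, vkapi::kFloat));
  tc.add_output_spec(ValueSpec(sizes, vkapi::kFloat));

  cases.push_back(tc);
  return cases;
}

int main() {
  print_performance_header();
  // Convenience overload: default FLOP calculator, 3 warmup runs, 10 timed
  // runs, and no reference function, so correctness is reported as SKIPPED.
  TestResult result = execute_test_cases(make_add_cases, "Add");
  result.print_statistics();
  return result.get_correctness_passed() ? 0 : 1;
}
```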
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define op(X, Y) ${OPERATOR} - -layout(std430) buffer; - -// clang-format off -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D image_out; -// clang-format on -layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in; -layout(set = 0, binding = 2) uniform PRECISION sampler3D image_other; - -layout(set = 0, binding = 3) uniform PRECISION restrict OutExtents { - uvec4 data; -} -out_extents; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_extents.data.xyz))) { - return; - } - - vec4 in_texel = texelFetch(image_in, pos, 0); - vec4 other_texel = texelFetch(image_other, pos, 0); - - imageStore(image_out, pos, op(in_texel, other_texel)); -} diff --git a/backends/vulkan/test/glsl/dynamic_dispatch_test.glsl b/backends/vulkan/test/glsl/dynamic_dispatch_test.glsl deleted file mode 100644 index 341da3eeacd..00000000000 --- a/backends/vulkan/test/glsl/dynamic_dispatch_test.glsl +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", "float", "texture3d")} -${layout_declare_tensor(1, "r", "t_in1", "float", "texture3d")} -${layout_declare_tensor(2, "r", "t_in2", "float", "texture3d")} - -layout(push_constant) uniform restrict Block { - ivec4 out_sizes; - ivec4 in1_sizes; - ivec4 in2_sizes; -}; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, out_sizes.xyz))) { - return; - } - - - vec4 out_texel = vec4(0.0); - for (int row = 0; row < in1_sizes.y; ++row) { - ivec3 in_pos = ivec3(pos.x, row, pos.z); - vec4 in1_texel = texelFetch(t_in1, in_pos, 0); - vec4 in2_texel = texelFetch(t_in2, in_pos, 0); - - out_texel += in1_texel * in2_texel; - } - - imageStore(t_out, pos, out_texel + ${OFFSET}); -} diff --git a/backends/vulkan/test/glsl/dynamic_dispatch_test.yaml b/backends/vulkan/test/glsl/dynamic_dispatch_test.yaml deleted file mode 100644 index 0f0f5f51685..00000000000 --- a/backends/vulkan/test/glsl/dynamic_dispatch_test.yaml +++ /dev/null @@ -1,7 +0,0 @@ -dynamic_dispatch_test: - parameter_names_with_default_values: - OFFSET: 2.25 - shader_variants: - - NAME: dynamic_dispatch_test_var1 - - NAME: dynamic_dispatch_test_var2 - OFFSET: 5.5 diff --git a/backends/vulkan/test/glsl/fill_buffer.glsl b/backends/vulkan/test/glsl/fill_buffer.glsl deleted file mode 100644 index 090d9e70d6c..00000000000 --- a/backends/vulkan/test/glsl/fill_buffer.glsl +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
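For readers skimming the GLSL, a rough scalar restatement of what dynamic_dispatch_test.glsl above computes per output position may help; vec4 texel packing is ignored here, and OFFSET stands for the per-variant codegen parameter (2.25 or 5.5 per the YAML):

```
#include <vector>

// Scalar sketch only: for each output column x, accumulate the elementwise
// product of the two inputs down the row dimension, then add the variant's
// OFFSET value. Inputs are assumed to be row-major [rows][width] arrays here.
float dynamic_dispatch_reference(
    const std::vector<float>& in1,
    const std::vector<float>& in2,
    int rows,
    int width,
    int x,
    float offset) {
  float acc = 0.0f;
  for (int row = 0; row < rows; ++row) {
    acc += in1[row * width + x] * in2[row * width + x];
  }
  return acc + offset;
}
```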
- */ - -#version 450 core - -$PRECISION = "highp" -$DTYPE = "float" - -#define PRECISION ${PRECISION} - -#define VEC4_T ${buffer_gvec_type(DTYPE, 4)} - -#include "indexing_utils.h" - -layout(std430) buffer; - -layout(set = 0, binding = 0) buffer PRECISION restrict writeonly Buffer { - VEC4_T data[]; -} -buffer_in; - -layout(set = 0, binding = 1) uniform PRECISION restrict Params { - int len; -} -params; - - - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const float scale = 1; -layout(constant_id = 4) const float offset = 0; - -void main() { - const int i = ivec3(gl_GlobalInvocationID).x; - - const int base = 4 * i; - if (base < params.len) { - buffer_in.data[i] = scale * (VEC4_T(base) + VEC4_T(0, 1, 2, 3)) + offset; - } -} diff --git a/backends/vulkan/test/glsl/fill_texture__test.glsl b/backends/vulkan/test/glsl/fill_texture__test.glsl deleted file mode 100644 index 76c630de55e..00000000000 --- a/backends/vulkan/test/glsl/fill_texture__test.glsl +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -layout(std430) buffer; - -layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} uOutput; -layout(set = 0, binding = 1) uniform PRECISION restrict Block { - ivec3 size; - int fill; - vec4 vals; -} params; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (any(greaterThanEqual(pos, params.size))) { - return; - } - - imageStore(uOutput, pos, params.vals); -} diff --git a/backends/vulkan/test/glsl/idx_fill_buffer.glsl b/backends/vulkan/test/glsl/idx_fill_buffer.glsl deleted file mode 100644 index d32c52c205e..00000000000 --- a/backends/vulkan/test/glsl/idx_fill_buffer.glsl +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define T ${buffer_scalar_type(DTYPE)} - -#include "indexing_utils.h" - -${define_required_extensions(DTYPE)} - -layout(std430) buffer; - -${layout_declare_buffer(0, "w", "out_buf", DTYPE, PRECISION, True)} -${layout_declare_ubo(1, "int", "numel")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const int t_id = ivec3(gl_GlobalInvocationID).x; - if (t_id >= numel) { - return; - } - - out_buf[t_id] = T(t_id); -} diff --git a/backends/vulkan/test/glsl/idx_fill_texture.glsl b/backends/vulkan/test/glsl/idx_fill_texture.glsl deleted file mode 100644 index 8914d2b8925..00000000000 --- a/backends/vulkan/test/glsl/idx_fill_texture.glsl +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
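fill_buffer.glsl above just writes an index ramp; as an illustration, a CPU-side sketch of the expected contents follows, assuming the shader's default specialization constants (scale = 1, offset = 0) and ignoring the up-to-3 elements of texel padding that may be written past len:

```
#include <cstdint>
#include <vector>

// Element j of the filled buffer holds scale * j + offset, matching the
// vec4-at-a-time writes in fill_buffer.glsl (base = 4 * i, plus {0, 1, 2, 3}).
std::vector<float> fill_buffer_reference(
    int32_t len,
    float scale = 1.0f,
    float offset = 0.0f) {
  std::vector<float> data(len);
  for (int32_t j = 0; j < len; ++j) {
    data[j] = scale * static_cast<float>(j) + offset;
  }
  return data;
}
```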
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -#define VEC4_T ${texel_type(DTYPE)} - -#include "indexing_utils.h" - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "image_out", DTYPE, "texture3d")} -${layout_declare_ubo(1, "ivec4", "sizes")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int packed_dim = C_DIM; -layout(constant_id = 4) const int offset = 10; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim); - - if (any(greaterThanEqual(idx, sizes))) { - return; - } - - const ivec4 buf_indices = get_texel_nchw_buffer_ixs(idx, sizes, packed_dim); - VEC4_T texel = VEC4_T(buf_indices) + offset; - imageStore(image_out, pos, texel); -} diff --git a/backends/vulkan/test/glsl/indexing_utils.h b/backends/vulkan/test/glsl/indexing_utils.h deleted file mode 100644 index 8563daaa5fb..00000000000 --- a/backends/vulkan/test/glsl/indexing_utils.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -// Width Dim Index, assuming (W, H, C, N) order -#define W_DIM 0 -// Height, assuming (W, H, C, N) order -#define H_DIM 1 -// Channels, assuming (W, H, C, N) order -#define C_DIM 2 - -/* - * Describes which texture axis the "batches" dimension runs along in a 4D - * texture. - * - * Currently it is set to 2 since we represent batches by concatenating along - * the channels dim, which has index 2 in (W, H, C, N) order and maps to the - * depth dimension of a texture, which also corresponds to index 2 in (x, y, z) - * order. - */ -#define BATCH_AXIS 2 - -// -// Basic Indexing Utility Macros and Functions -// - -/* - * Aligns input to the next multiple of 4 - */ -#define alignup4(x) ((x + 3) & -4) - -// -// (w, h, c, n) Tensor Index <-> Contiguous Buffer Index Conversion -// - -/* - * Input: (w, h, c, n) tensor index, (W, H, C, N) sizes of a tensor, which dim - * is packed along a texel - * Output: A ivec4 containing the buffer indices corresponding to each texel - * element. - */ -ivec4 get_texel_nchw_buffer_ixs(ivec4 idx, ivec4 sizes, int packed_dim) { - ivec4 strides = - ivec4(1, sizes.x, sizes.x * sizes.y, sizes.x * sizes.y * sizes.z); - - int base_i = idx.x * strides.x + idx.y * strides.y + idx.z * strides.z + - idx.w * strides.w; - - return base_i + ivec4(0, 1, 2, 3) * strides[packed_dim]; -} - -// -// (w, h, c, n) Tensor Index <-> (x, y, z) Texture Position Conversion -// - -/* - * Input: (x, y, z) texel position, (W, H, C, N) sizes of the tensor, which dim - * is packed along a texel - * Output: Whether the texel position is outside the bounds of the image texture - * given the size and packed dimension of the tensor. 
- */ -bool pos_out_of_bounds(ivec3 pos, ivec4 sizes, int packed_dim) { - // Align packed dim to next multiple of 4 to account for texel padding - sizes[packed_dim] = alignup4(sizes[packed_dim]); - - ivec3 max_pos = sizes.xyz; - max_pos[BATCH_AXIS] += sizes.w * sizes[BATCH_AXIS]; - max_pos[packed_dim] /= 4; - return (any(greaterThanEqual(pos, max_pos))); -} - -/* - * Input: (x, y, z) texel position, (W, H, C, N) sizes of the tensor, - * which dim is packed along a texel - * Returns: the (w, h, c, n) tensor index cooresponding to the first element of - * the texel at the specified position - */ -ivec4 to_tensor_idx(ivec3 pos, ivec4 sizes, int packed_dim) { - // Align packed dim to next multiple of 4 to account for texel padding - sizes[packed_dim] = alignup4(sizes[packed_dim]); - - // Packed dim contains 4 elements per texel - pos[packed_dim] *= 4; - // Construct the initial tensor index via swizzling -#if BATCH_AXIS == 2 - ivec4 tensor_idx = pos.xyzz; -#endif -#if BATCH_AXIS == 1 - ivec4 tensor_idx = pos.xyzy; -#endif -#if BATCH_AXIS == 0 - ivec4 tensor_idx = pos.xyzx; -#endif - // Adjust the axis that the batch dim runs along - tensor_idx[3] /= sizes[BATCH_AXIS]; - tensor_idx[BATCH_AXIS] %= sizes[BATCH_AXIS]; - - return tensor_idx; -} diff --git a/backends/vulkan/test/glsl/reference_matmul.glsl b/backends/vulkan/test/glsl/reference_matmul.glsl deleted file mode 100644 index 4d4e0ae8734..00000000000 --- a/backends/vulkan/test/glsl/reference_matmul.glsl +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION highp - -layout(std430) buffer; - -${layout_declare_tensor(0, "w", "t_out", "float", "buffer")} -${layout_declare_tensor(1, "r", "t_mat1", "float", "buffer")} -${layout_declare_tensor(2, "r", "t_mat2", "float", "buffer")} -${layout_declare_ubo(3, "ivec4", "out_sizes")} -${layout_declare_ubo(4, "ivec4", "out_strides")} -${layout_declare_ubo(5, "ivec4", "mat1_sizes")} -${layout_declare_ubo(6, "ivec4", "mat1_strides")} -${layout_declare_ubo(7, "ivec4", "mat2_sizes")} -${layout_declare_ubo(8, "ivec4", "mat2_strides")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -#include "reference_matmul_common_buffer.glslh" - -void main() { - const ivec2 out_idx = ivec2(gl_GlobalInvocationID.x, gl_GlobalInvocationID.y); - if (any(greaterThanEqual(out_idx, out_sizes.xy))) { - return; - } - - // Initial idx for mat1 is (0, out_idx.y) - int mat1_id = out_idx.y * mat1_strides.y; - // Initial idx for mat2 is (out_idx.x, 0) - int mat2_id = out_idx.x * mat2_strides.x; - - float sum = 0.0; - for (int i = 0; i < mat1_sizes.x; ++i) { - sum += perform_dot_product(out_idx.y, out_idx.x, i); - } - - const int out_id = out_idx.x * out_strides.x + out_idx.y * out_strides.y; - t_out[out_id] = sum; -} diff --git a/backends/vulkan/test/glsl/reference_matmul_common.glslh b/backends/vulkan/test/glsl/reference_matmul_common.glslh deleted file mode 100644 index 2f22b588b75..00000000000 --- a/backends/vulkan/test/glsl/reference_matmul_common.glslh +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef MATMUL_COMMON_${STORAGE}_H -#define MATMUL_COMMON_${STORAGE}_H - -$if STORAGE == "buffer": - float perform_dot_product( - const uint out_row, - const uint out_col, - const uint k) { - const uint mat1_bufi = out_row * mat1_strides.y + k * mat1_strides.x; - const uint 
mat2_bufi = k * mat2_strides.y + out_col * mat2_strides.x; - - return t_mat1[mat1_bufi] * t_mat2[mat2_bufi]; - } -$else: - vec4 perform_dot_product( - const uint out_row, - const uint out_col, - const uint k) { - vec4 mat1_tex = texelFetch(t_mat1, ivec3(k, out_row, 0), 0); - vec4 mat2_tex = texelFetch(t_mat2, ivec3(out_col, k, 0), 0); - - return dot(mat1_tex, mat2_tex); - } - -#endif diff --git a/backends/vulkan/test/glsl/reference_matmul_common.yaml b/backends/vulkan/test/glsl/reference_matmul_common.yaml deleted file mode 100644 index d19bbabf0d1..00000000000 --- a/backends/vulkan/test/glsl/reference_matmul_common.yaml +++ /dev/null @@ -1,9 +0,0 @@ -reference_matmul_common: - parameter_names_with_default_values: - STORAGE: buffer - generate_variant_forall: - STORAGE: - - VALUE: buffer - - VALUE: texture3d - shader_variants: - - NAME: reference_matmul_common diff --git a/backends/vulkan/test/glsl/scalar_add_buffer.glsl b/backends/vulkan/test/glsl/scalar_add_buffer.glsl deleted file mode 100644 index cd3a85a1655..00000000000 --- a/backends/vulkan/test/glsl/scalar_add_buffer.glsl +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -${define_required_extensions(DTYPE)} - -#define T ${buffer_scalar_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_tensor(0, "rw", "buffer_in", DTYPE, "buffer")} -${layout_declare_ubo(1, "int", "numel")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const float scalar = 2.0; - -void main() { - const int t_id = ivec3(gl_GlobalInvocationID).x; - if (t_id >= numel) { - return; - } - - buffer_in[t_id] = buffer_in[t_id] + T(scalar); -} diff --git a/backends/vulkan/test/glsl/scalar_add_texture.glsl b/backends/vulkan/test/glsl/scalar_add_texture.glsl deleted file mode 100644 index 992907d0c25..00000000000 --- a/backends/vulkan/test/glsl/scalar_add_texture.glsl +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -layout(std430) buffer; - -${layout_declare_tensor(0, "rw", "t_in", "float", "texture3d")} -${layout_declare_ubo(1, "ivec3", "extents")} -${layout_declare_ubo(2, "int", "scalar")} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - if (any(greaterThanEqual(pos, extents))) { - return; - } - - vec4 in_tex = imageLoad(t_in, pos); - imageStore(t_in, pos, imageLoad(t_in, pos) + float(scalar)); -} diff --git a/backends/vulkan/test/glsl/test_shader.glsl b/backends/vulkan/test/glsl/test_shader.glsl deleted file mode 100644 index 4804528346d..00000000000 --- a/backends/vulkan/test/glsl/test_shader.glsl +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
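The buffer variant of reference_matmul above is a plain row-by-column dot product. As a cross-check, here is a contiguous row-major C++ equivalent; the shader reads its strides from UBOs, so contiguous layout is an assumption of this sketch:

```
#include <cstddef>
#include <vector>

// out[row][col] = sum_k mat1[row][k] * mat2[k][col], mirroring
// perform_dot_product() accumulated over mat1_sizes.x in the shader.
std::vector<float> reference_matmul_cpu(
    const std::vector<float>& mat1, // M x K, row-major
    const std::vector<float>& mat2, // K x N, row-major
    size_t M,
    size_t K,
    size_t N) {
  std::vector<float> out(M * N, 0.0f);
  for (size_t row = 0; row < M; ++row) {
    for (size_t col = 0; col < N; ++col) {
      float sum = 0.0f;
      for (size_t k = 0; k < K; ++k) {
        sum += mat1[row * K + k] * mat2[k * N + col];
      }
      out[row * N + col] = sum;
    }
  }
  return out;
}
```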
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -layout(std430) buffer; - -layout(set = 0, binding = 0, rgba32f) uniform PRECISION restrict writeonly image3D uOutput; -layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; -layout(set = 0, binding = 2) uniform PRECISION restrict Block { - ivec4 size; -} uBlock; - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); - - if (all(lessThan(pos, uBlock.size.xyz))) { - const vec4 intex = texelFetch(uInput, pos, 0); - imageStore( - uOutput, - pos, - intex + 5); - } -} diff --git a/backends/vulkan/test/op_tests/CMakeLists.txt b/backends/vulkan/test/op_tests/CMakeLists.txt deleted file mode 100644 index 07a13c3f260..00000000000 --- a/backends/vulkan/test/op_tests/CMakeLists.txt +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# ### Editing this file ### -# -# This file should be formatted with -# ~~~ -# cmake-format -i CMakeLists.txt -# ~~~ -# It should also be cmake-lint clean. -# -# The targets in this file will be built if EXECUTORCH_BUILD_VULKAN is ON - -cmake_minimum_required(VERSION 3.19) -project(executorch) - -if(ANDROID) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH) -endif() - -find_package(executorch CONFIG REQUIRED COMPONENTS vulkan_backend) -find_package(GTest CONFIG REQUIRED) - -if(NOT EXECUTORCH_ROOT) - set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../..) -endif() - -# Include this file to access executorch_target_link_options_shared_lib This is -# required to provide access to executorch_target_link_options_shared_lib which -# allows libraries to be linked with the --whole-archive flag. This is required -# for libraries that perform dynamic registration via static initialization. -include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) - -get_torch_base_path(TORCH_BASE_PATH) -if(NOT TORCH_INSTALL_PREFIX) - set(TORCH_INSTALL_PREFIX ${TORCH_BASE_PATH}) -endif() - -# libtorch is needed for Vulkan correctness tests -find_library(LIB_TORCH torch HINTS ${TORCH_INSTALL_PREFIX}/lib) -find_library(LIB_TORCH_CPU torch_cpu HINTS ${TORCH_INSTALL_PREFIX}/lib) -find_library(LIB_C10 c10 HINTS ${TORCH_INSTALL_PREFIX}/lib) - -# Third party include paths - -set(VULKAN_THIRD_PARTY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party) - -set(GTEST_INCLUDE_PATH - ${EXECUTORCH_ROOT}/third-party/googletest/googletest/include -) -set(VULKAN_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/Vulkan-Headers/include) -set(VOLK_PATH ${VULKAN_THIRD_PARTY_PATH}/volk) -set(VMA_PATH ${VULKAN_THIRD_PARTY_PATH}/VulkanMemoryAllocator) - -set(COMMON_INCLUDES - ${EXECUTORCH_ROOT}/.. 
- ${VULKAN_HEADERS_PATH} - ${VOLK_PATH} - ${VMA_PATH} - ${GTEST_INCLUDE_PATH} - ${TORCH_BASE_PATH}/include - ${TORCH_BASE_PATH}/include/torch/csrc/api/include -) - -executorch_target_link_options_shared_lib(vulkan_backend) - -function(vulkan_op_test test_name test_src) - set(extra_deps ${ARGN}) - - add_executable(${test_name} ${test_src}) - target_include_directories(${test_name} PRIVATE ${COMMON_INCLUDES}) - target_link_libraries( - ${test_name} - PRIVATE GTest::gtest_main - vulkan_backend - executorch_core - ${LIB_TORCH} - ${LIB_TORCH_CPU} - ${LIB_C10} - ${extra_deps} - ) - - add_test(${test_name} ${test_name}) -endfunction() - -if(TARGET vulkan_backend AND LIB_TORCH) - add_library(test_utils ${CMAKE_CURRENT_SOURCE_DIR}/test_utils.cpp) - target_include_directories(test_utils PRIVATE ${COMMON_INCLUDES}) - target_link_libraries( - test_utils PRIVATE vulkan_backend ${LIB_TORCH} ${LIB_TORCH_CPU} - ) - - find_library( - CUSTOM_OPS_LIB custom_ops_aot_lib - HINTS ${CMAKE_INSTALL_PREFIX}/executorch/extension/llm/custom_ops - ) - if(CUSTOM_OPS_LIB) - vulkan_op_test( - vulkan_sdpa_test ${CMAKE_CURRENT_SOURCE_DIR}/sdpa_test.cpp - ${CUSTOM_OPS_LIB} test_utils - ) - else() - message( - STATUS "Skip building sdpa_test because custom_ops_aot_lib is not found" - ) - endif() - vulkan_op_test( - vulkan_rope_test ${CMAKE_CURRENT_SOURCE_DIR}/rotary_embedding_test.cpp - test_utils - ) - vulkan_op_test( - quantized_linear_test ${CMAKE_CURRENT_SOURCE_DIR}/quantized_linear_test.cpp - test_utils - ) - - # Only build generated op tests if a path to tags.yaml and - # native_functions.yaml is provided. These files are required for codegen. - if(TORCH_OPS_YAML_PATH) - set(GENERATED_VULKAN_TESTS_CPP_PATH ${CMAKE_CURRENT_BINARY_DIR}/vk_gen_cpp) - - # Generated operator correctness tests - - set(generated_test_cpp ${GENERATED_VULKAN_TESTS_CPP_PATH}/op_tests.cpp) - - add_custom_command( - COMMENT "Generating Vulkan operator correctness tests" - OUTPUT ${generated_test_cpp} - COMMAND - ${PYTHON_EXECUTABLE} - ${EXECUTORCH_ROOT}/backends/vulkan/test/op_tests/generate_op_correctness_tests.py - -o ${GENERATED_VULKAN_TESTS_CPP_PATH} --tags-path - ${TORCH_OPS_YAML_PATH}/tags.yaml --aten-yaml-path - ${TORCH_OPS_YAML_PATH}/native_functions.yaml - DEPENDS ${EXECUTORCH_ROOT}/backends/vulkan/test/op_tests/**/*.py - ) - - vulkan_op_test(vulkan_op_correctness_tests ${generated_test_cpp}) - - # Generated operator benchmarks (only built in google benchmark is - # installed) - find_package(benchmark CONFIG) - - if(benchmark_FOUND) - set(generated_benchmark_cpp - ${GENERATED_VULKAN_TESTS_CPP_PATH}/op_benchmarks.cpp - ) - - add_custom_command( - COMMENT "Generating Vulkan operator benchmarks" - OUTPUT ${generated_benchmark_cpp} - COMMAND - ${PYTHON_EXECUTABLE} - ${EXECUTORCH_ROOT}/backends/vulkan/test/op_tests/generate_op_benchmarks.py - -o ${GENERATED_VULKAN_TESTS_CPP_PATH} --tags-path - ${TORCH_OPS_YAML_PATH}/tags.yaml --aten-yaml-path - ${TORCH_OPS_YAML_PATH}/native_functions.yaml - DEPENDS ${EXECUTORCH_ROOT}/backends/vulkan/test/op_tests/**/*.py - ) - - vulkan_op_test(vulkan_op_benchmarks ${generated_benchmark_cpp}) - endif() - else() - message( - STATUS - "Skipping generated operator correctness tests and benchmarks. Please specify TORCH_OPS_YAML_PATH to build these tests." 
- ) - endif() -endif() diff --git a/backends/vulkan/test/op_tests/TARGETS b/backends/vulkan/test/op_tests/TARGETS deleted file mode 100644 index e84397dc20e..00000000000 --- a/backends/vulkan/test/op_tests/TARGETS +++ /dev/null @@ -1,5 +0,0 @@ -load(":targets.bzl", "define_common_targets") - -oncall("executorch") - -define_common_targets(is_fbcode = True) diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py deleted file mode 100644 index 8c5d0c4797b..00000000000 --- a/backends/vulkan/test/op_tests/cases.py +++ /dev/null @@ -1,1949 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - - -import itertools - -from collections import namedtuple -from typing import Callable - -from executorch.backends.vulkan.test.op_tests.utils.test_suite import VkTestSuite - - -# Prime numbers dim sizes for testing -XL = 113 -L = 89 -M2 = 41 -M1 = 37 -M = 29 -S2 = 11 -S1 = 7 -S = 5 -XS = 3 - -test_suites = {} - - -def register_test_suite(aten_op): - def test_suite_decorator(fn: Callable) -> Callable: - if isinstance(aten_op, str): - test_suites[aten_op] = fn() - elif isinstance(aten_op, list): - for op in aten_op: - test_suites[op] = fn() - return fn - - return test_suite_decorator - - -@register_test_suite( - ["aten.add.Tensor", "aten.sub.Tensor", "aten.div.Tensor", "aten.mul.Tensor"] -) -def get_binary_elementwise_inputs(): - test_suite = VkTestSuite( - [ - ((M1, M2), (M1, M2)), - ((M1, M2), (M1, 1), 2.0), - ((M1, M2), (1, M2)), - ((S, S1, S2), (S, S1, S2)), - ((S, S1, S2), (S, S1, 1), 2.0), - ((S, S1, S2), (S, 1, S2), 2.0), - ((XS, S, S1, S2), (XS, S, 1, 1), 2.0), - ((3, 64, 1), (1, 64, 1)), - ] - ) - test_suite.storage_types = [ - "utils::kBuffer", - "utils::kTexture3D", - ] - - highdim_test_suite = VkTestSuite( - [ - ((4, 5, 8, 1, 2, 1), (4, 5, 8, 1, 1, 1)), - ] - ) - highdim_test_suite.storage_types = [ - "utils::kBuffer", - ] - highdim_test_suite.test_name_suffix = "highdim" - - for suite in [test_suite, highdim_test_suite]: - suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - - return [test_suite, highdim_test_suite] - - -# Eq requires a different test generator so it was split from the other test case. 
-@register_test_suite( - [ - "aten.eq.Tensor", - "aten.gt.Tensor", - "aten.lt.Tensor", - "aten.ge.Tensor", - "aten.le.Tensor", - ] -) -def get_binary_elementwise_compare_inputs(): - test_suite = VkTestSuite( - [ - ((M1, M2), (M1, M2)), - ((M1, M2), (M1, 1), 2.0), - ((M1, M2), (1, M2)), - ((S, S1, S2), (S, S1, S2)), - ((S, S1, S2), (S, S1, 1), 2.0), - ((S, S1, S2), (S, 1, S2), 2.0), - ((XS, S, S1, S2), (XS, S, 1, 1), 2.0), - ((3, 64, 1), (1, 64, 1)), - ] - ) - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - test_suite.storage_types = [ - "utils::kBuffer", - "utils::kTexture3D", - ] - test_suite.data_gen = "make_casted_randint_tensor" - return test_suite - - -@register_test_suite("aten.mm.default") -def get_mm_inputs(): - test_suite = VkTestSuite( - [ - ((M1, L), (L, M2)), - ((S1, S2), (S2, M)), - ((6, 32), (32, 64)), - ], - ) - test_suite.prepacked_args = ["mat2"] - # ATen matmul doesn't support half - test_suite.dtypes = ["at::kFloat"] - test_suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"] - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - return test_suite - - -@register_test_suite("aten.bmm.default") -def get_bmm_inputs(): - test_suite = VkTestSuite( - [ - ((S, M1, L), (S, L, M2)), - ((M, S1, S2), (M, S2, M)), - ((4, 6, 32), (4, 32, 16)), - ], - ) - test_suite.prepacked_args = ["mat2"] - # ATen matmul doesn't support half - test_suite.dtypes = ["at::kFloat"] - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - return test_suite - - -@register_test_suite("aten.addmm.default") -def get_addmm_inputs(): - test_suite = VkTestSuite( - [ - ((1, S), (S1, S), (S, S), 1.0, 1.5), - ((S, 1), (S, S1), (S1, S1), 1.0, 1.0), - ((M1, M2), (M1, M2), (M2, M2)), - ((M1, M2), (M1, M2), (M2, M2), 4.2, 2.3), - ((M1, 1), (M1, L), (L, L), 2.0, 3.0), - ((M2), (M1, M2), (M2, M2)), - ((6, M2), (6, M2), (M2, M2)), - ] - ) - # ATen matmul doesn't support half - test_suite.dtypes = ["at::kFloat"] - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - return test_suite - - -common_MKN_list = [ - (S2, M2, M1), - (L, L, M1), -] - - -@register_test_suite("aten.linear.default") -def get_linear_inputs(): - MKN_list = common_MKN_list - - inputs_list = [((M, K), (N, K), None) for M, K, N in MKN_list] - inputs_list += [((M, K), (N, K), (N)) for M, K, N in MKN_list] - inputs_list += [((3, M, K), (N, K), None) for M, K, N in MKN_list] - inputs_list += [((3, M, K), (N, K), (N)) for M, K, N in MKN_list] - inputs_list += [((3, 6, K), (N, K), (N)) for M, K, N in MKN_list] - - test_suite = VkTestSuite(inputs_list) - test_suite.dtypes = ["at::kFloat"] - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] - return test_suite - - -@register_test_suite("aten._weight_int8pack_mm.default") -def get_weight_int8pack_mm_inputs(): - MKN_list = [ - [1, 480, 256], - [1, 1024, 1024], - [1, 1024, 256], - [3, 480, 256], - [6, 480, 256], - [6, 256, 1024], - [6, 1024, 256], - [6, 256, 256], - [6, 256, 512], - [4, 768, 4096], - [1024, 1024, 1024], - ] - - inputs_list = [((M, K), (N, K), (N)) for M, K, N in MKN_list] - - test_suite = VkTestSuite(inputs_list) - test_suite.dtypes = ["at::kFloat"] - test_suite.layouts = ["utils::kWidthPacked"] - test_suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"] - test_suite.prepacked_args = ["mat2", "scales"] - test_suite.requires_prepack = True - - 
test_suite.arg_dtype["mat2"] = "at::kChar" - test_suite.arg_data_range["mat2"] = (0, 100) - - test_suite.arg_data_range["scales"] = (0.0008, 0.001) - - return test_suite - - -@register_test_suite("aten.avg_pool2d.default") -def get_avg_pool2d_inputs(): - Test = namedtuple( - "VkAvgPoolTest", - [ - "self", - "kernel_size", - "stride", - "padding", - "ceil_mode", - "count_include_pad", - "divisor_override", - ], - ) - - test_cases = [] - for ceil_mode in [True, False]: - for count_include_pad in [True, False]: - for divisor_override in [None, 5]: - test_cases += [ - Test( - self=(S, M1, M2), - kernel_size=[2, 2], - stride=[1, 1], - padding=[0, 0], - ceil_mode=ceil_mode, - count_include_pad=count_include_pad, - divisor_override=divisor_override, - ), - ] - - test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) - test_suite.dtypes = ["at::kFloat"] - return test_suite - - -@register_test_suite( - ["aten.max_pool2d_with_indices.default", "aten.max_pool2d.default"] -) -def get_max_pool2d_inputs(): - test_suite = VkTestSuite( - [ - ((1, 7, 89, 77), [2, 2], [1, 1], [0, 0], [1, 1]), - ] - ) - return test_suite - - -@register_test_suite("aten.convolution.default") -def get_conv_inputs(): - Test = namedtuple( - "ConvTest", - [ - "self", - "weight", - "bias", - "stride", - "padding", - "dilation", - "transposed", - "output_padding", - "groups", - ], - ) - Test.__new__.__defaults__ = ( - None, - None, - None, - [1, 1], - [0, 0], - [1, 1], - False, - [9, 0], - 1, - ) - - test_cases = [ - Test( - self=(1, 64, 256, 256), - weight=(64, 32, 3, 3), - bias=None, - stride=[1, 1], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=2, - ), - Test( - self=(1, 16, 3, 3), - weight=(16, 8, 3, 3), - bias=None, - stride=[1, 1], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=2, - ), - Test( - self=(1, 6, 40, 50), - weight=(8, 6, 3, 3), - bias=(8,), - stride=[1, 2], - padding=[2, 3], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 6, 40, 50), - weight=(6, 8, 3, 3), - bias=(8,), - stride=[1, 2], - padding=[2, 3], - dilation=[1, 1], - transposed=True, - output_padding=[0, 1], - groups=1, - ), - Test( - self=(1, 6, 40, 50), - weight=(8, 6, 3, 3), - bias=None, - stride=[1, 2], - padding=[2, 3], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 6, 7), - weight=(6, 1, 3), - bias=(6,), - stride=[1], - padding=[0], - dilation=[1], - transposed=False, - output_padding=[0], - groups=6, - ), - Test( - self=(2, 20, 30), - weight=(10, 4, 6), - bias=(10,), - stride=[5], - padding=[5], - dilation=[3], - transposed=False, - output_padding=[0], - groups=5, - ), - Test( - self=(1, 9, 11), - weight=(9, 1, 3), - bias=None, - stride=[1], - padding=[0], - dilation=[1], - transposed=False, - output_padding=[0], - groups=9, - ), - Test( - self=(5, 15, 30), - weight=(20, 3, 3), - bias=None, - stride=[3], - padding=[5], - dilation=[7], - transposed=False, - output_padding=[0], - groups=5, - ), - Test( - self=(1, 8, 90, 77), - weight=(1, 8, 3, 3), - bias=(1,), - stride=[1, 1], - padding=[2, 2], - dilation=[2, 2], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - ] - - test_cases_pw = [ - Test( - self=(1, 16, 3, 5), - weight=(4, 16, 1, 1), - bias=(4,), - stride=[1, 1], - padding=[0, 0], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 5, 3, 5), - weight=(4, 5, 1, 1), - bias=(4,), - stride=[1, 1], 
- padding=[0, 0], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 5, 3, 5), - weight=(3, 5, 1, 1), - bias=(3,), - stride=[1, 1], - padding=[0, 0], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 5, 3, 5), - weight=(3, 5, 1, 1), - bias=(3,), - stride=[1, 1], - padding=[1, 0], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 5, 3, 5), - weight=(3, 5, 1, 1), - bias=(3,), - stride=[1, 1], - padding=[0, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 5, 3, 5), - weight=(3, 5, 1, 1), - bias=(3,), - stride=[2, 1], - padding=[1, 0], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 8, 72, 96), - weight=(8, 8, 1, 1), - bias=(8,), - stride=[1, 1], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 16, 240, 320), - weight=(64, 16, 1, 1), - bias=(64,), - stride=[1, 1], - padding=[0, 0], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 16, 240, 320), - weight=(64, 16, 1, 1), - bias=(64,), - stride=[2, 2], - padding=[0, 0], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 16, 240, 320), - weight=(64, 16, 1, 1), - bias=(64,), - stride=[4, 4], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 16, 240, 320), - weight=(64, 16, 1, 1), - bias=(64,), - stride=[1, 1], - padding=[4, 4], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - Test( - self=(1, 16, 672, 512), - weight=(64, 16, 1, 1), - bias=(64,), - stride=[1, 1], - padding=[0, 0], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=1, - ), - ] - - test_cases_dw = [ - Test( - self=(1, XS, S, S1), - weight=(XS, 1, 3, 3), - bias=(XS,), - stride=[1, 1], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=XS, - ), - Test( - self=(1, XS, S, S1), - weight=(XS, 1, 5, 5), - bias=(XS,), - stride=[1, 1], - padding=[2, 2], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=XS, - ), - Test( - self=(1, XS, S, S1), - weight=(XS, 1, 3, 3), - bias=(XS,), - stride=[2, 1], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=XS, - ), - Test( - self=(1, XS, S, S1), - weight=(XS, 1, 5, 5), - bias=(XS,), - stride=[1, 2], - padding=[2, 2], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=XS, - ), - Test( - self=(1, S2, S, S1), - weight=(S2, 1, 3, 3), - bias=(S2,), - stride=[1, 1], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=S2, - ), - Test( - self=(1, S2, S, S1), - weight=(S2, 1, 5, 5), - bias=(S2,), - stride=[1, 1], - padding=[2, 2], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=S2, - ), - Test( - self=(1, 8, 72, 96), - weight=(8, 1, 3, 3), - bias=(8,), - stride=[1, 1], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=8, - ), - Test( - self=(1, 8, 72, 96), - weight=(8, 1, 5, 5), - bias=(8,), - stride=[1, 1], - padding=[2, 2], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=8, - ), - Test( - self=(1, 4, 234, 234), - weight=(4, 1, 3, 3), - bias=(4,), - stride=[2, 1], - 
padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=4, - ), - Test( - self=(1, 4, 234, 234), - weight=(4, 1, 3, 3), - bias=(4,), - stride=[1, 2], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=4, - ), - Test( - self=(1, 4, 234, 234), - weight=(4, 1, 3, 3), - bias=(4,), - stride=[2, 2], - padding=[1, 1], - dilation=[1, 1], - transposed=False, - output_padding=[0, 0], - groups=4, - ), - ] - - test_suite = VkTestSuite(test_cases) - test_suite.layouts = [ - "utils::kChannelsPacked", - ] - - test_suite_pw = VkTestSuite(test_cases_pw) - test_suite_pw.layouts = [ - "utils::kChannelsPacked", - ] - test_suite_pw.test_name_suffix = "pw" - - test_suite_dw = VkTestSuite(test_cases_dw) - test_suite_dw.layouts = [ - "utils::kChannelsPacked", - ] - test_suite_dw.test_name_suffix = "dw" - return [test_suite, test_suite_pw, test_suite_dw] - - -@register_test_suite("aten.native_layer_norm.default") -def get_native_layer_norm_inputs(): - test_suite = VkTestSuite( - [ - ((S1, S2), [S2], (S2), (S2), 0.001), - ((M, M1, M2), [M2], (M2), (M2), 0.001), - ((S, XL, M1, M2), [M2], (M2), (M2), 0.001), - ] - ) - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - return test_suite - - -@register_test_suite("aten.native_group_norm.default") -def get_native_group_norm_inputs(): - test_suite = VkTestSuite( - [ - # (input_shape, weight_shape, bias_shape, N, C, HxW, group, eps) - # General test cases - ((1, 8, 4, 4), (8), (8), 1, 8, 16, 2, 0.001), - ((2, 8, 3, 3), (8), (8), 2, 8, 9, 4, 0.001), - ((1, 12, 2, 2), (12), (12), 1, 12, 4, 3, 0.001), - ((3, 16, 5, 5), (16), (16), 3, 16, 25, 8, 0.001), - ((3, 16, 13, 17), (16), (16), 3, 16, 13 * 17, 4, 0.001), - ((1, 4, 7, 7), (4), (4), 1, 4, 49, 2, 0.001), - ((2, 6, 1, 8), (6), (6), 2, 6, 8, 3, 0.001), - # Single group and prime number sizes - ((3, 7, 13, 11), (7), (7), 3, 7, 13 * 11, 1, 0.001), - # Each channel is it's own group and prime number sizes - ((1, 7, 13, 11), (7), (7), 1, 7, 13 * 11, 7, 0.001), - ] - ) - test_suite.layouts = [ - "utils::kChannelsPacked", - ] - test_suite.storage_types = [ - "utils::kTexture3D", - ] - test_suite.dtypes = [ - "at::kFloat", - "at::kHalf", - ] - test_suite.arg_storage_types = { - "out": [None, "utils::kBuffer", "utils::kBuffer"], - } - - test_suite.prepacked_args = ["weight", "bias"] - test_suite.requires_prepack = True - - return test_suite - - -def get_upsample_inputs(): - inputs_list = [ - # (input tensor shape, output 2D image size (H, W), output scaling factors) - ((2, 2, 2, 2), None, [1, 1]), - ((1, 1, 2, 2), None, [2, 2]), - ((1, 1, 2, 2), None, [2, 4]), - ((1, 1, 2, 2), None, [4, 2]), - ((1, 1, 2, 2), [2, 2], None), - ((1, 1, 2, 2), [2, 4], None), - ((1, 1, 2, 2), [3, 2], None), - ] - return inputs_list - - -@register_test_suite("aten.upsample_nearest2d.vec") -def get_upsample_nearest2d_inputs(): - inputs_list = get_upsample_inputs() - return VkTestSuite(inputs_list) - - -@register_test_suite("aten.upsample_bilinear2d.vec") -def get_upsample_bilinear2d_inputs(): - base_inputs_list = get_upsample_inputs() - inputs_list = [] - for input_case in base_inputs_list: - inputs_list.append((input_case[0], input_case[1], False, input_case[2])) - inputs_list.append((input_case[0], input_case[1], True, input_case[2])) - return VkTestSuite(inputs_list) - - -@register_test_suite(["aten.full.default", "aten.full_like.default"]) -def get_full_inputs(): - test_suite = VkTestSuite( - [ - ([S1, S2], 42.0), - ([M, 
M1, M2], 3.14), - ([L, M, M1, M2], 2.72), - ] - ) - return test_suite - - -@register_test_suite("aten.scalar_tensor.default") -def get_scalar_tensor_inputs(): - test_suite = VkTestSuite( - [ - (42.0,), - (3.14,), - (2.72,), - (0.0,), - (-1.0,), - (100.0,), - ] - ) - return test_suite - - -@register_test_suite( - [ - "aten.zeros.default", - "aten.zeros_like.default", - "aten.ones.default", - "aten.ones_like.default", - ] -) -def get_ones_inputs(): - test_suite = VkTestSuite( - [ - ([S1, S2]), - ([M, M1, M2]), - ([L, M, M1, M2]), - ] - ) - return test_suite - - -@register_test_suite(["aten.select.int", "aten.select_copy.int"]) -def get_select_int_inputs(): - test_suite = VkTestSuite( - [ - ((8, 8, 8), 0, -2), - ((8, 8, 8), 1, -3), - ((8, 8, 8), 2, -4), - ((6, 2, 7), 1, 0), - ((6, 2, 7), 2, 3), - ((6, 10, 7), 0, 3), - ((6, 10, 7), 1, 0), - ((6, 10, 7), 1, 9), - ((6, 10, 7), 2, 6), - ((9, 2, 9, 4), 0, 8), - ((9, 2, 9, 4), 1, 1), - ((9, 2, 9, 4), 2, 0), - ((9, 2, 9, 4), 2, 8), - ((9, 2, 9, 4), 3, 3), - ((8, 6, 1, 1), 0, 4), - ((8, 6, 1, 1), 1, 4), - ] - ) - test_suite.layouts = ["utils::kWidthPacked", "utils::kChannelsPacked"] - test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] - test_suite.dtypes = ["at::kFloat"] - test_suite.data_gen = "make_seq_tensor" - return test_suite - - -@register_test_suite(["aten.permute.default", "aten.permute_copy.default"]) -def get_permute_inputs(): - batch_tests = [ - ((9, 2, 5, 7), out_axis) for out_axis in itertools.permutations([0, 1, 2, 3]) - ] - channel_tests = [ - ((9, 2, 5), out_axis) for out_axis in itertools.permutations([0, 1, 2]) - ] - wh_tests = [((9, 2), out_axis) for out_axis in itertools.permutations([0, 1])] - test_suite = VkTestSuite(batch_tests + channel_tests + wh_tests) - - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - test_suite.storage_types = [ - "utils::kBuffer", - "utils::kTexture3D", - ] - test_suite.dtypes = [ - "at::kFloat", - ] - return test_suite - - -@register_test_suite("aten.view_copy.default") -def get_view_inputs(): - test_suite = VkTestSuite( - [ - ((3, 4, 5), [1, 1, -1]), - ((3, 4, 5), [1, -1, 1]), - ((3, 4, 5), [-1, 1, 1]), - ((8, 7, 2, 3), [4, 3, 7, 4]), - ((8, 7, 2, 3), [7, -1, 2, 1]), - ((8, 7, 2, 3), [1, 1, 1, -1]), - ((8, 7, 2, 3), [-1]), - ((2, 3, 3, 7), [2, -1, 1, 1]), - ((3, 5, 2, 7), [7, -1, 2, 1]), - ((2, 2, 8, 6), [2, 6, -1, 1]), - ((2, 2, 8, 6), [6, -1, 1]), - ((S1, S2, S1, S2), [S2, -1, 1, S1]), - ((S1, S2, S1, S2), [S1, 1, -1, S2]), - ((S1, S2, S1, S2), [-1, 1, S1, S2]), - ] - ) - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - - highdim_test_suite = VkTestSuite( - [ - ((1, 1, 3, 3, 3), (9, 3)), - ((2, 3, 4, 6, 5, 4), (6, 4, 6, 5, 4)), - ((2, 3, 3, 7, 8), (2, 3, 3, 8 * 7)), - ] - ) - highdim_test_suite.storage_types = [ - "utils::kBuffer", - ] - highdim_test_suite.test_name_suffix = "highdim" - highdim_test_suite.data_gen = "make_seq_tensor" - - for suite in [test_suite, highdim_test_suite]: - suite.layouts = [ - # "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - - return [test_suite, highdim_test_suite] - - -@register_test_suite("aten.slice_copy.Tensor") -def get_slice_out_inputs(): - Test = namedtuple("VkSliceTest", ["self", "dim", "start", "end", "step"]) - Test.__new__.__defaults__ = (None, 0, None, None, 1) - - # Slice by width and height - test_cases = [ - Test(self=[1, 1, 4, 10], dim=3, start=3), - Test(self=[1, 1, 4, 10], dim=3, 
start=3, step=2), - Test(self=[1, 1, 4, 10], dim=3, start=3, end=4, step=2), - Test(self=[1, 1, 4, 10], dim=2, start=3), - Test(self=[9, 9, 9, 9], dim=2, start=0, end=9, step=1), - Test(self=[9, 9, 9, 9], dim=2, start=1, end=8, step=1), - Test(self=[9, 9, 9, 9], dim=2, start=1, end=2, step=1), - Test(self=[9, 9, 9, 9], dim=3, start=1, end=5, step=1), - Test(self=[9, 9, 9, 9], dim=3, start=1, end=5, step=2), - Test(self=[9, 9, 9, 9], dim=-1, start=1, end=5, step=2), - Test(self=[9, 9, 9, 9], dim=-2, start=1, end=5, step=2), - Test(self=[9, 9, 9], dim=1, start=2, step=1), - Test(self=[9, 9, 9], dim=1, start=2, step=2), - Test(self=[9, 9, 9], dim=2, start=2, step=1), - Test(self=[9, 9, 9], dim=2, start=2, step=2), - Test(self=[9, 9], dim=0, start=2, step=1), - Test(self=[9, 9], dim=0, start=2, step=2), - Test(self=[9, 9], dim=1, start=2, step=1), - Test(self=[9, 9], dim=1, start=2, step=2), - ] - - # Slice by batch - test_cases += [ - Test(self=[6, 5, 3, 2], dim=0), - Test(self=[6, 5, 3, 2], dim=0, step=2), - Test(self=[13, 13, 3, 2], dim=0, step=2), - Test(self=[13, 13, 3, 2], dim=0, start=1, step=2), - Test(self=[13, 13, 3, 2], dim=0, start=1, step=5), - Test(self=[13, 13, 3, 2], dim=0, start=1, step=20), - Test(self=[13, 2, 3, 2], dim=0, start=1, step=2), - Test(self=[13, 2, 3, 2], dim=0, start=1, step=5), - Test(self=[13, 2, 3, 2], dim=0, start=1, step=20), - ] - - # Slice by channel - test_cases += [ - Test(self=[2, 5, 1, 10], dim=1), - Test(self=[2, 5, 1, 10], dim=1, start=1), - Test(self=[2, 5, 1, 10], dim=1, start=1, step=2), - Test(self=[5, 13, 1, 10], dim=1), - Test(self=[5, 13, 1, 10], dim=1, start=1), - Test(self=[5, 13, 1, 10], dim=1, start=1, step=2), - Test(self=[5, 13, 1, 10], dim=1, start=1, step=5), - Test(self=[5, 13, 1, 10], dim=1, start=1, step=20), - Test(self=[13, 1, 10], dim=0), - Test(self=[13, 1, 10], dim=0, start=1), - Test(self=[13, 1, 10], dim=0, start=1, step=2), - Test(self=[13, 1, 10], dim=0, start=1, step=5), - Test(self=[13, 1, 10], dim=0, start=1, step=20), - ] - - # Slice by negative/unspecified indices - INT64_MAX = 9223372036854775807 # represents arr[:] - test_cases += [ - Test(self=[8, 9], dim=0, start=-2, step=1), - Test(self=[8, 9], dim=0, start=-2, step=2), - Test(self=[8, 9], dim=0, end=-2, step=1), - Test(self=[8, 9], dim=0, end=-2, step=2), - Test(self=[8, 9], dim=0, end=INT64_MAX, step=1), - Test(self=[8, 9], dim=0, end=INT64_MAX, step=2), - Test(self=[8, 9], dim=1, start=-2, step=1), - Test(self=[8, 9], dim=1, start=-2, step=2), - Test(self=[8, 9], dim=1, end=-2, step=1), - Test(self=[8, 9], dim=1, end=-2, step=2), - Test(self=[8, 9], dim=1, end=INT64_MAX, step=1), - Test(self=[8, 9], dim=1, end=INT64_MAX, step=2), - ] - - test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) - - test_suite.dtypes = ["at::kFloat", "at::kHalf"] - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - test_suite.data_gen = "make_seq_tensor" - return test_suite - - -def get_slice_view_inputs(): - Test = namedtuple("VkSliceTest", ["self", "dim", "start", "end", "step"]) - Test.__new__.__defaults__ = (None, 0, None, None, 1) - - # Slice by channel - test_cases = [ - Test(self=[1, 17, 1, 10], dim=1, start=0, end=4), - Test(self=[1, 17, 1, 10], dim=1, start=0, end=8), - Test(self=[1, 17, 3, 7], dim=1, start=0, end=12), - ] - - test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) - - test_suite.dtypes = ["at::kFloat"] - test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] - test_suite.layouts 
= ["utils::kWidthPacked"] - test_suite.data_gen = "make_seq_tensor" - test_suite.is_view_op = True - - return test_suite - - -@register_test_suite(["aten.slice.Tensor"]) -def get_slice_inputs(): - texture_test_suite = get_slice_out_inputs() - texture_test_suite.test_name_suffix = "no_view" - - view_test_suite = get_slice_view_inputs() - view_test_suite.test_name_suffix = "view" - - return [view_test_suite, texture_test_suite] - - -@register_test_suite(["aten.transpose.int"]) -def get_transpose_inputs(): - Test = namedtuple("VkTransposeViewTest", ["self", "dim0", "dim1"]) - Test.__new__.__defaults__ = (None, 0, 1) - - test_cases = [ - Test(self=[M1, M2], dim0=0, dim1=1), - Test(self=[M1, S2, M], dim0=0, dim1=1), - Test(self=[M1, S2, M], dim0=0, dim1=2), - Test(self=[M1, S2, M], dim0=2, dim1=1), - Test(self=[S, M, S2, M2], dim0=3, dim1=2), - Test(self=[S, M, S2, M2], dim0=1, dim1=2), - Test(self=[S, M, S2, M2], dim0=3, dim1=1), - ] - - test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) - - test_suite.dtypes = ["at::kFloat"] - test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] - test_suite.layouts = ["utils::kWidthPacked", "utils::kChannelsPacked"] - test_suite.data_gen = "make_seq_tensor" - test_suite.is_view_op = True - return test_suite - - -@register_test_suite("aten.index_select.default") -def get_index_select_inputs(): - Test = namedtuple("VkIndexSelectTest", ["self", "dim", "index"]) - Test.__new__.__defaults__ = (None, 0, None) - - test_cases = [] - - for i in range(4): - test_cases += [ - Test(self=[9, 9, 9, 9], dim=i, index=[0]), - Test(self=[9, 9, 9, 9], dim=i, index=[2]), - Test(self=[9, 9, 9, 9], dim=i, index=[0, 2]), - Test(self=[9, 9, 9, 9], dim=i, index=[3, 1]), - Test(self=[9, 9, 9, 9], dim=i, index=[5, 5]), - Test(self=[9, 9, 9, 9], dim=i, index=[2, 3, 4, 5, 7]), - ] - - test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) - - test_suite.dtypes = ["at::kFloat"] - test_suite.layouts = ["utils::kChannelsPacked"] - return test_suite - - -@register_test_suite("aten.embedding.default") -def get_embedding_inputs(): - Test = namedtuple("VkEmbeddingTest", ["weight", "indices"]) - Test.__new__.__defaults__ = (None, None) - - test_cases = [ - Test(weight=[10, 9], indices=[0, 2]), - Test(weight=[10, 9], indices=[2, 3, 4, 5, 7]), - Test(weight=[10, 9], indices=[[0, 2], [1, 4], [7, 7]]), - Test(weight=[10, 9], indices=[[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]]), - Test(weight=[10, 9], indices=[[[3, 1, 4], [1, 5, 9]], [[2, 6, 5], [3, 5, 8]]]), - ] - - test_suite = VkTestSuite([tuple(tc) + (-1, "false", "false") for tc in test_cases]) - - test_suite.dtypes = ["at::kFloat"] - test_suite.layouts = ["utils::kChannelsPacked"] - return test_suite - - -@register_test_suite("aten.unsqueeze_copy.default") -def get_unsqueeze_inputs(): - test_suite = VkTestSuite( - [ - ((2, 3, 4), 0), - ((1, 1, 1), 0), - ((1, 1, 1), 1), - ((1, 1, 1), 2), - ((1, 1, 1), 3), - ((9, 9, 9), 0), - ((9, 9, 9), 1), - ((9, 9, 9), 2), - ((9, 9, 9), 3), - ((9, 9), 0), - ((9, 9), 1), - ((9, 9), 2), - ((9,), 0), - ((9,), 1), - ((1, 10), -1), - ] - ) - - highdim_test_suite = VkTestSuite( - [ - ((2, 3, 4, 5, 6), 0), - ((2, 3, 4, 5, 6), 1), - ((2, 3, 4, 5, 6), 5), - ((2, 3, 4, 5, 6), -1), - ((2, 3, 4, 5, 6), -2), - ((1, 2, 3, 4, 5), 0), - ((1, 2, 3, 4, 5), 3), - ((1, 2, 3, 4, 5), -1), - ((2, 3, 4, 5), 0), - ((1, 2, 3, 4), 1), - ] - ) - highdim_test_suite.storage_types = [ - "utils::kBuffer", - ] - highdim_test_suite.test_name_suffix = "highdim" - - for suite in [test_suite, highdim_test_suite]: - 
suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - suite.data_gen = "make_seq_tensor" - - return [test_suite, highdim_test_suite] - - -@register_test_suite("aten.clone.default") -def get_clone_inputs(): - test_suite = VkTestSuite( - [ - ((S2, S1, S2, S1),), - ((S2, S1, S2),), - ((S2, S1),), - ((S2,),), - ((XS, S1, XS, S1),), - ((XS, S1, XS),), - ((S1, XS, S1),), - ((XS, S1),), - ((S1, XS),), - ((S1,),), - ((XS,),), - ] - ) - - highdim_test_suite = VkTestSuite( - [ - ((2, 3, 4, 5, 6),), - ((2, 3, 4, 5, 1),), - ((1, 1, 3, 4, 5),), - ((2, 3, 4, 5, 6, 7),), - ((1, 2, 3, 4, 5, 6),), - ] - ) - highdim_test_suite.storage_types = [ - "utils::kBuffer", - ] - highdim_test_suite.test_name_suffix = "highdim" - - for suite in [test_suite, highdim_test_suite]: - suite.layouts = [ - "utils::kChannelsPacked", - ] - suite.data_gen = "make_seq_tensor" - - return [test_suite, highdim_test_suite] - - -@register_test_suite("aten.repeat.default") -def get_repeat_inputs(): - test_suite_2d = VkTestSuite( - [ - ((2, 3), [1, 4]), - ((2, 3), [4, 1]), - ((2, 3), [4, 4]), - ((2, 3), [3, 1, 4]), - ] - ) - test_suite_2d.layouts = [ - "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - test_suite_2d.storage_types = ["utils::kTexture3D"] - test_suite_2d.data_gen = "make_seq_tensor" - test_suite_2d.dtypes = ["at::kFloat"] - test_suite_2d.test_name_suffix = "2d" - - test_suite_3d = VkTestSuite( - [ - # Repeat channels only (most challenging case) - ((3, XS, S), [2, 1, 1]), - ((7, XS, S), [4, 1, 1]), - ((1, 7, XS, S), [1, 4, 1, 1]), - ((3, 7, XS, S), [1, 4, 1, 1]), - # Repat channels with other dims - ((1, 7, XS, S), [1, 4, 1, 3]), - ((3, 7, XS, S), [1, 4, 1, 3]), - ((3, 7, XS, S), [1, 4, 3, 1]), - ((3, 7, XS, S), [1, 4, 3, 3]), - # Repeat Batch - ((3, 7, XS, S), [3, 4, 3, 3]), - ((3, 7, XS, S), [3, 1, 3, 3]), - # More other cases - ((3, 7, 1, 1), [1, 4, 1, 1]), - ((2, 3), [1, 4]), - ((2, 3), [4, 1]), - ((2, 3), [4, 4]), - ((S1, S2, S2), [1, 3, 1]), - ((S1, S2, S2), [1, 3, 3]), - ((S1, S2, S2), [3, 3, 1]), - ((S1, S2, S2), [3, 3, 3]), - ((S1, S2, S2, S2), [1, 1, 3, 1]), - ((S1, S2, S2, S2), [1, 1, 1, 3]), - ((S1, S2, S2, S2), [1, 1, 3, 3]), - ((S1, S2, S2, S2), [1, 3, 1, 3]), - ((S1, S2, S2, S2), [3, 3, 3, 3]), - ((S1, S2, S2, S2), [3, 3, 1, 1]), - # Expanding cases - ((2, 3), [3, 1, 4]), - ((2, 3), [3, 3, 2, 4]), - ] - ) - test_suite_3d.layouts = [ - "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - test_suite_3d.storage_types = ["utils::kTexture3D"] - test_suite_3d.data_gen = "make_seq_tensor" - test_suite_3d.dtypes = ["at::kFloat"] - test_suite_3d.test_name_suffix = "3d" - - return [test_suite_2d, test_suite_3d] - - -@register_test_suite("aten.repeat_interleave.self_int") -def get_repeat_interleave_inputs(): - test_suite_W = VkTestSuite( - [ - ((4, 32, 256), 3, -2), - # Test repeat on each non-packed dim - ((16, 32, 64), 5, -2), - ((16, 32, 64), 5, -3), - # Test batched inputs - ((3, 5, 32, 64), 4, -2), - ((3, 5, 32, 64), 4, -3), - ] - ) - test_suite_W.layouts = [ - "utils::kWidthPacked", - ] - test_suite_W.data_gen = "make_seq_tensor" - test_suite_W.dtypes = ["at::kFloat"] - test_suite_W.test_name_suffix = "W_packed" - - test_suite_C = VkTestSuite( - [ - # Test repeat on each non-packed dim - ((32, 32, 16), 5, -1), - ((32, 32, 16), 5, -2), - # Test batched inputs - ((3, 16, 8, 64), 4, -1), - ((3, 16, 8, 64), 4, -2), - ] - ) - test_suite_C.layouts = [ - "utils::kChannelsPacked", - ] - test_suite_C.data_gen = "make_seq_tensor" - 
test_suite_C.dtypes = ["at::kFloat"] - test_suite_C.test_name_suffix = "C_packed" - - return [test_suite_W, test_suite_C] - - -@register_test_suite("aten.cat.default") -def get_cat_inputs(): - # TensorList must be specified as list of tuples - suite_inputs = [ - # Cat on Height - ([(M, M, 3, 5), (M, M, 0, 5)], 2), - ([(S1, S1, 3, 5), (S1, S1, 0, 5)], 2), - ([(M, M, 3, 5), (M, M, 4, 5)], 2), - ([(S1, S1, 3, 5), (S1, S1, 4, 5)], 2), - ([(M2, 3, 5), (M2, 4, 5)], 1), - ([(S1, 3, 5), (S1, 4, 5)], 1), - ([(3, 5), (4, 5)], 0), - ([(3, 5), (4, 5), (1, 5)], 0), - ( - [(3, 5)], - 0, - ), - # Cat on Width - ([(M, M, 5, 3), (M, M, 5, 4)], 3), - ([(S1, S1, 5, 3), (S1, S1, 5, 4)], 3), - ([(M, 5, 3), (M, 5, 4)], 2), - ([(S1, 5, 3), (S1, 5, 4)], 2), - ([(5, 0), (5, 4)], 1), - ([(5, 3), (5, 4)], 1), - ([(5, 3), (5, 4), (5, 1)], 1), - ( - [(5, 4)], - 1, - ), - ([(5,), (6,)], 0), - # Cat on Batch - ([(M, S1, 5, 4), (M1, S1, 5, 4)], 0), - ([(S, S1, 5, 4), (S1, S1, 5, 4)], 0), - ([(S, M, 5, 4), (S1, M, 5, 4)], 0), - ([(S, XS, 5, 4), (S1, XS, 5, 4)], 0), - ([(S, S2, 5, 4), (S1, S2, 5, 4)], 0), - ( - [ - (3, 1, 2, 5), - (3, 1, 2, 5), - (3, 1, 2, 5), - ], - 0, - ), - # Cat on Channel - ([(M, 5, 4), (0, 5, 4), (M1, 5, 4)], 0), - ([(S, 5, 4), (0, 5, 4), (S2, 5, 4)], 0), - ([(M, 5, 4), (M1, 5, 4), (M2, 5, 4)], 0), - ([(S, 5, 4), (S1, 5, 4), (S2, 5, 4)], 0), - ([(XS, 5, 4), (XS, 5, 4), (S2, 5, 4)], 0), - ([(XS, S, 5, 4), (XS, S1, 5, 4), (XS, S2, 5, 4)], 1), - ([(XS, XS, 5, 4), (XS, XS, 5, 4), (XS, S2, 5, 4)], 1), - ( - [ - (XS, 1, 2, 5), - (XS, 1, 2, 5), - (XS, 1, 2, 5), - ], - 1, - ), - ] - - high_number_cat_inputs = [] - for num_input in [6, 9]: - odd_size = (3, 7, 29, 31) - even_size = (3, 8, 29, 32) - ones = (3, 1, 1, 1) - - for input_size in [odd_size, even_size, ones]: - input_sizes = [input_size] * num_input - # Test cat on height, width, and batch dim - high_number_cat_inputs.append((input_sizes, 3)) - high_number_cat_inputs.append((input_sizes, 2)) - high_number_cat_inputs.append((input_sizes, 1)) - high_number_cat_inputs.append((input_sizes, 0)) - - test_suite = VkTestSuite(suite_inputs + high_number_cat_inputs) - - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - test_suite.storage_types = [ - "utils::kTexture3D", - "utils::kBuffer", - ] - test_suite.data_gen = "make_seq_tensor" - test_suite.dtypes = ["at::kFloat"] - return test_suite - - -@register_test_suite("aten.split_with_sizes_copy.default") -def get_split_with_sizes_inputs(): - Test = namedtuple("VkSliceTest", ["self", "sizes", "dim"]) - test_cases = [ - # Split on Width - Test(self=(S1, 7, 10, 11), sizes=[1, 3, 2, 5], dim=3), - Test(self=(S1, 7, 10, 10), sizes=[1, 2, 3, 4], dim=3), - Test(self=(7, 10, 11), sizes=[1, 3, 2, 5], dim=2), - Test(self=(7, 10, 10), sizes=[1, 2, 3, 4], dim=2), - Test(self=(7, 10, 11), sizes=[3, 8], dim=2), - Test(self=(7, 10, 10), sizes=[1, 9], dim=2), - Test(self=(10, 10), sizes=[1, 9], dim=1), - Test(self=(10,), sizes=[1, 9], dim=0), - # Split on Height - Test(self=(S1, 7, 11, 10), sizes=[1, 3, 2, 5], dim=2), - Test(self=(S1, 7, 10, 10), sizes=[1, 2, 3, 4], dim=2), - Test(self=(7, 11, 10), sizes=[1, 3, 2, 5], dim=1), - Test(self=(7, 10, 10), sizes=[1, 2, 3, 4], dim=1), - Test(self=(7, 11, 11), sizes=[3, 8], dim=1), - Test(self=(7, 10, 10), sizes=[10], dim=1), - Test(self=(7, 6, 10), sizes=[1, 1, 1, 1, 1, 1], dim=1), - Test(self=(10, 10), sizes=[1, 2, 3, 4], dim=0), - # Split on Batch - Test(self=(10, 7, 10, 10), sizes=[3, 6, 1], dim=0), - Test(self=(10, 7, 10, 10), sizes=[10], dim=0), - # 
Split on Channel - Test(self=(7, 13, 4, 8), sizes=[3, 5, 2, 3], dim=1), - Test(self=(7, 13, 4, 8), sizes=[3, 6, 1, 3], dim=1), - Test(self=(7, 13, 4, 8), sizes=[3, 2, 2, 5, 1], dim=1), - Test(self=(7, 13, 4, 8), sizes=[3, 3, 3, 3, 1], dim=1), - Test(self=(13, 4, 8), sizes=[3, 5, 2, 1, 2], dim=0), - Test(self=(13, 4, 8), sizes=[3, 3, 3, 3, 1], dim=0), - Test(self=(13, 4, 8), sizes=[2, 9, 2], dim=0), - Test(self=(13, 4, 8), sizes=[13], dim=0), - ] - test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) - - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - test_suite.data_gen = "make_seq_tensor" - test_suite.dtypes = ["at::kFloat"] - return test_suite - - -@register_test_suite("aten.split.Tensor") -def get_split_tensor_inputs(): - test_suite = VkTestSuite( - [ - # Split on Width - ((S1, 7, 10, 12), 12, 3), - ((S1, 7, 10, 12), 3, 3), - ((S1, 7, 10, 12), 1, 3), - ((7, 10, 12), 12, 2), - ((7, 10, 12), 3, 2), - ((7, 10, 12), 1, 2), - ((10, 12), 12, 1), - ((10, 12), 3, 1), - ((10, 12), 1, 1), - ((12,), 12, 0), - ((12,), 3, 0), - ((12,), 1, 0), - # Split on Height - ((S1, 7, 12, 8), 12, 2), - ((S1, 7, 12, 8), 3, 2), - ((S1, 7, 12, 8), 1, 2), - ((7, 12, 8), 12, 1), - ((7, 12, 8), 3, 1), - ((7, 12, 8), 1, 1), - ((12, 8), 12, 0), - ((12, 8), 3, 0), - ((12, 8), 1, 0), - # Split on Batch - ((12, 7, 10, 10), 12, 0), - ((12, 7, 10, 10), 3, 0), - ((12, 7, 10, 10), 1, 0), - # Split on Channel - ((7, 15, 10, 10), 15, 1), - ((7, 15, 10, 10), 5, 1), - ((7, 15, 10, 10), 3, 1), - ((7, 15, 10, 10), 1, 1), - ((15, 10, 10), 15, 0), - ((15, 10, 10), 5, 0), - ((15, 10, 10), 3, 0), - ((15, 10, 10), 1, 0), - ] - ) - - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - test_suite.data_gen = "make_seq_tensor" - test_suite.dtypes = ["at::kFloat"] - return test_suite - - -def get_reduce_inputs(is_softmax: bool = False): - bool_arg = False if is_softmax else True - return [ - ((L), 0, bool_arg), - ((L), -1, bool_arg), - ((M, L), 0, bool_arg), - ((M, L), 1, bool_arg), - ((L, M), -1, bool_arg), - ((M, L), -2, bool_arg), - ((S, S1, S2), 0, bool_arg), - ((S, S1, S2), 1, bool_arg), - ((S, S1, S2), 2, bool_arg), - ((S, S1, S2), -1, bool_arg), - ((S, S1, S2), -2, bool_arg), - ((S, S1, S2), -3, bool_arg), - ((1, S, S1, S2), 1, bool_arg), - ((1, S, S1, S2), 2, bool_arg), - ((1, S, S1, S2), 3, bool_arg), - ((1, S, S1, S2), -1, bool_arg), - ((1, S, S1, S2), -2, bool_arg), - ((1, S, S1, S2), -3, bool_arg), - # Test batches > 1 where the reduction dim is not the concat dim - ((S, S2, S1, 128), -1, bool_arg), - ] - - -@register_test_suite(["aten._softmax.default", "aten._log_softmax.default"]) -def get_softmax_inputs(): - test_suite = VkTestSuite(get_reduce_inputs(is_softmax=True)) - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - return test_suite - - -@register_test_suite( - ["aten.amax.default", "aten.amin.default", "aten.sum.dim_IntList", "aten.mean.dim"] -) -def get_reduce_op_inputs(): - test_suite = VkTestSuite(get_reduce_inputs()) - test_suite.layouts = [ - "utils::kChannelsPacked", - "utils::kWidthPacked", - ] - return test_suite - - -@register_test_suite(["aten.var.dim"]) -def get_var_inputs(): - test_cases = [] - shapes_and_dims = [ - ((L), 0), - ((L), -1), - ((M, L), 0), - ((M, L), 1), - ((L, M), -1), - ((M, L), -2), - ((S, S1, S2), 0), - ((S, S1, S2), 1), - ((S, S1, S2), 2), - ((S, S1, S2), -1), - ((S, S1, S2), -2), - ((S, S1, S2), -3), - ((1, S, S1, S2), 1), - ((1, S, S1, 
S2), 2), - ((1, S, S1, S2), 3), - ((1, S, S1, S2), -1), - ((1, S, S1, S2), -2), - ((1, S, S1, S2), -3), - # Test batches > 1 where the reduction dim is not the concat dim - ((S, L, S1, L), -1), - ((S, S2, S1, S), -2), - ((S, S2, M, M), 2), - ((S, M, S1, L), 3), - ] - - for i, (shape, dim) in enumerate(shapes_and_dims): - unbiased = (i % 2) == 0 - test_cases.append((shape, dim, unbiased, True)) - - # Texture-based tests - texture_test_suite = VkTestSuite(test_cases) - texture_test_suite.layouts = [ - "utils::kChannelsPacked", - "utils::kWidthPacked", - ] - texture_test_suite.storage_types = ["utils::kTexture3D"] - texture_test_suite.atol = "1e-4" - texture_test_suite.rtol = "1e-4" - texture_test_suite.test_name_suffix = "texture" - - # Buffer-based tests - buffer_test_suite = VkTestSuite(test_cases) - buffer_test_suite.layouts = [ - "utils::kChannelsPacked", - "utils::kWidthPacked", - ] - buffer_test_suite.storage_types = ["utils::kBuffer"] - buffer_test_suite.atol = "1e-4" - buffer_test_suite.rtol = "1e-4" - buffer_test_suite.test_name_suffix = "buffer" - - return [texture_test_suite, buffer_test_suite] - - -@register_test_suite( - [ - "aten.sqrt.default", - "aten.rsqrt.default", - "aten.exp.default", - "aten.hardshrink.default", - "aten.sin.default", - "aten.neg.default", - "aten.cos.default", - "aten.hardswish.default", - "aten.hardsigmoid.default", - "aten.leaky_relu.default", - "aten.round.default", - "aten.tan.default", - "aten.relu6.default", - ] -) -def get_unary_ops_inputs(): - test_suite = VkTestSuite( - [ - (M1,), - (M1, M2), - (S1, M1, M2), - (S1, S2, S2, M2), - ] - ) - test_suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"] - test_suite.atol = "1e-4" - test_suite.rtol = "1e-4" - return test_suite - - -# separate test suite from unary_ops for learning purposes -@register_test_suite("aten.tan.default") -def get_tan_inputs(): - test_suite = VkTestSuite( - [ - (M1,), - (M1, M2), - (S1, M1, M2), - (S1, S2, S2, M2), - ] - ) - test_suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"] - test_suite.dtypes = ["at::kFloat", "at::kHalf"] - return test_suite - - -@register_test_suite("aten._native_batch_norm_legit_no_training.default") -def get_native_batch_norm_inputs(): - Test = namedtuple( - "VkSliceTest", ["self", "weight", "bias", "mean", "var", "momentum", "eps"] - ) - - test_cases = [ - Test( - self=(1, 1, 2, 5), - weight=(1,), - bias=(1,), - mean=(1,), - var=(1,), - momentum=0.0, - eps=0.001, - ), - Test( - self=(S2, 1, 2, 5), - weight=(1,), - bias=(1,), - mean=(1,), - var=(1,), - momentum=0.0, - eps=0.001, - ), - Test( - self=(1, S2, 2, 5), - weight=(S2,), - bias=(S2,), - mean=(S2,), - var=(S2,), - momentum=0.0, - eps=0.001, - ), - Test( - self=(9, S1, 2, 5), - weight=(S1,), - bias=(S1,), - mean=(S1,), - var=(S1,), - momentum=0.0, - eps=0.01, - ), - Test( - self=(3, S1, 2, 5), - weight=(S1,), - bias=(S1,), - mean=(S1,), - var=(S1,), - momentum=0.0, - eps=0.001, - ), - Test( - self=(3, S2, 2, 5), - weight=(S2,), - bias=(S2,), - mean=(S2,), - var=(S2,), - momentum=0.0, - eps=0.001, - ), - Test( - self=(3, S2, 2, 5), - weight=(S2,), - bias=(S2,), - mean=(S2,), - var=(S2,), - momentum=0.0, - eps=0.000, - ), - ] - - test_suite = VkTestSuite(test_cases) - test_suite.requires_prepack = True - test_suite.prepacked_args = ["weight", "bias", "mean", "var"] - - return test_suite - - -@register_test_suite("aten.gelu.default") -def get_gelu_inputs(): - test_suite = VkTestSuite( - [ - ((M1), "tanh"), - ((M1, M2), "tanh"), - ((S1, M1, M2), "tanh"), - ((S1, S2, S2, M2), 
"tanh"), - ] - ) - return test_suite - - -@register_test_suite("aten.arange.start_step") -def get_arange_inputs(): - test_suite = VkTestSuite( - [ - (1, 13), - (1.0, 11), - (-13, 3), - (-11.0, 2), - (3, 15, 3), - (3, 23, 2), - (3, 23.0, 4), - (13, 1, -1), - (-3, -13, -2), - (13, -2.0, -4), - ], - ) - - test_suite.layouts = [ - "utils::kChannelsPacked", - ] - return test_suite - - -@register_test_suite("aten.constant_pad_nd.default") -def get_constant_pad_nd_inputs(): - test_suite = VkTestSuite( - [ - ([S1, S2], [1, 1], 24.0), - ([M, M1, M2], [2, 2], 23.2), - ([L, M, M1, M2], [3, 5], 12.2), - ([S1, S2], [1, 1, 1, 1], 24.0), - ([M, M1, M2], [2, 2, 2, 2], 23.2), - ([L, M, M1, M2], [3, 5, 3, 5], 12.2), - ([M, M1, M2], [1, 2, 3, 4, 5, 6], 23.2), - ([L, M, M1, M2], [3, 3, 3, 3, 3, 3], 12.2), - ] - ) - return test_suite - - -@register_test_suite("aten.minimum.default") -def get_minimum_inputs(): - test_suite = VkTestSuite( - [ - ((M1, M2), (M2)), - ((M1, M2), (M1, M2)), - ((M1, M2, M), (M2, M)), - ((M1, M1, S1, S2), (M1, M1, S1, S2)), - ((S1, S1, S2, S), (S1, S2, S)), - ((M1, S1, S2), (L, M1, S1, S2)), - ((S1, S2), (L, M1, S1, S2)), - ] - ) - return test_suite - - -@register_test_suite("aten.squeeze_copy.dims") -def get_squeeze_copy_dim_inputs(): - test_suite = VkTestSuite( - [ - ([S, S, S, 1], 3), - ([S, 1, S, S], 1), - ([S, 1, 1, S], [1, 2]), - ([1, S, S, S], 0), - ([S, S, S, S], 3), - ([S, S, S, S], 2), - ([S, S, S, S], 1), - ([M, M1, 1], 2), - ([M, 1, M1], 1), - ([1, M1, M1], 0), - ] - ) - - highdim_test_suite = VkTestSuite( - [ - ([1, 2, 3, 4, 5, 1], 0), - ([1, 2, 3, 4, 5, 1], 5), - ([1, 2, 3, 4, 5, 1], [0, 5]), - ([2, 1, 3, 1, 5, 6], 1), - ([2, 1, 3, 1, 5, 6], 3), - ([2, 1, 3, 1, 5, 6], [1, 3]), - ([1, 1, 3, 4, 5, 6], [0, 1]), - ([2, 3, 4, 1, 1, 6], [3, 4]), - ] - ) - highdim_test_suite.storage_types = [ - "utils::kBuffer", - ] - highdim_test_suite.test_name_suffix = "highdim" - - for suite in [test_suite, highdim_test_suite]: - suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - - return [test_suite, highdim_test_suite] - - -@register_test_suite("aten.flip.default") -def get_flip_inputs(): - Test = namedtuple("Flip", ["self", "dim"]) - Test.__new__.__defaults__ = (None, 0) - - test_cases = [ - Test(self=[9], dim=[0]), - Test(self=[9, 9], dim=[0, 1]), - Test(self=[9, 9, 9], dim=[0, 2]), - Test(self=[9, 9, 9], dim=[0, 1, 2]), - Test(self=[9, 9, 9, 9], dim=[0]), - Test(self=[9, 9, 9, 9], dim=[0, 2, 3]), - Test(self=[9, 9, 9, 9], dim=[1, 3]), - Test(self=[9, 9, 9, 9], dim=[0, 1, 2, 3]), - ] - - test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) - return test_suite - - -@register_test_suite("aten.expand_copy.default") -def get_expand_inputs(): - test_suite = VkTestSuite( - [ - # Basic expansion cases - ((1,), [5]), - ((1, 1), [3, 4]), - ((1, 3), [2, 3]), - ((3, 1), [3, 4]), - ((1, 1, 1), [2, 3, 4]), - # Expand with same size (no-op) - ((3, 4), [3, 4]), - ((2, 3, 4), [2, 3, 4]), - # Expand with additional dimensions - ((3,), [2, 3]), - ((3, 4), [2, 3, 4]), - ((2, 3), [1, 2, 3]), - # Mixed expansion cases - ((1, 3, 1, 4), [2, 3, 5, 4]), - ((1, 1, 3, 1), [2, 4, 3, 5]), - # Larger tensor cases - ((1, S1), [M, S1]), - ((S2, 1), [S2, M1]), - ((1, 1, S), [S1, S2, S]), - ((1, S1, 1, S2), [M, S1, M1, S2]), - ] - ) - test_suite.storage_types = [ - "utils::kBuffer", - ] - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kChannelsPacked", - ] - test_suite.dtypes = [ - "at::kFloat", - "at::kHalf", - ] - test_suite.data_gen = "make_seq_tensor" - return test_suite - - 
-@register_test_suite("aten.where.self") -def get_where_inputs(): - Test = namedtuple("Where", ["condition", "self", "other"]) - Test.__new__.__defaults__ = (None, None, None) - - test_cases = [ - Test(condition=[11], self=[11], other=[11]), - Test(condition=[10, 9], self=[10, 9], other=[10, 9]), - Test(condition=[10, 5, 3], self=[10, 5, 3], other=[10, 5, 3]), - Test(condition=[2, 10, 5, 3], self=[2, 10, 5, 3], other=[2, 10, 5, 3]), - ] - - test_suite = VkTestSuite([tuple(tc) for tc in test_cases]) - test_suite.arg_dtype["condition"] = "at::kBool" - test_suite.layouts = [ - "utils::kWidthPacked", - "utils::kHeightPacked", - "utils::kChannelsPacked", - ] - test_suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"] - test_suite.atol = "1e-4" - test_suite.rtol = "1e-4" - return test_suite diff --git a/backends/vulkan/test/op_tests/choose_qparams_test.cpp b/backends/vulkan/test/op_tests/choose_qparams_test.cpp deleted file mode 100644 index 3b1094a1e84..00000000000 --- a/backends/vulkan/test/op_tests/choose_qparams_test.cpp +++ /dev/null @@ -1,786 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include -#include - -#include -#include - -#include "test_utils.h" - -#include -#include - -namespace torch { -namespace executor { -namespace native { - -// Forward declarations of the functions we're testing -std::tuple choose_qparams_tensor_out( - const Tensor& input, - int64_t quant_min, - int64_t quant_max, - ET_UNUSED double eps, - ScalarType dtype, - Tensor& scale_out, - Tensor& zero_point_out); - -std::tuple choose_qparams_per_token_asymmetric_out( - const Tensor& input, - ScalarType dtype, - Tensor& scale_out, - Tensor& zero_point_out); - -// Wrapper function for choose_qparams_tensor_out without context -Tensor& choose_qparams_tensor_out_no_context( - const Tensor& input, - int64_t quant_min, - int64_t quant_max, - ET_UNUSED double eps, - ScalarType dtype, - Tensor& scale_out, - Tensor& zero_point_out) { - torch::executor::native::choose_qparams_tensor_out( - input, quant_min, quant_max, eps, dtype, scale_out, zero_point_out); - return scale_out; -} - -// Wrapper function for choose_qparams_per_token_asymmetric_out without context -Tensor& choose_qparams_per_token_asymmetric_out_no_context( - const Tensor& input, - ScalarType dtype, - Tensor& scale_out, - Tensor& zero_point_out) { - torch::executor::native::choose_qparams_per_token_asymmetric_out( - input, dtype, scale_out, zero_point_out); - return scale_out; -} - -// ATen wrapper for choose_qparams_tensor -std::tuple choose_qparams_tensor_aten( - const at::Tensor& input, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - auto scale_out = at::empty({}, at::device(at::kCPU).dtype(at::kDouble)); - auto zero_point_out = at::empty({}, at::device(at::kCPU).dtype(at::kLong)); - double eps = 1e-7; - - ScalarType et_dtype = at_scalartype_to_et_scalartype(dtype); - - // Use WRAP_TO_ATEN with the wrapper function - WRAP_TO_ATEN(choose_qparams_tensor_out_no_context, 5) - (input, quant_min, quant_max, eps, et_dtype, scale_out, zero_point_out); - - return {scale_out, zero_point_out}; -} - -// ATen wrapper for choose_qparams_per_token_asymmetric -std::tuple choose_qparams_per_token_asymmetric_aten( - const at::Tensor& input, - at::ScalarType dtype) { - // Calculate output sizes for scale and zero_point 
tensors - std::vector output_sizes; - for (int64_t i = 0; i < input.dim() - 1; i++) { - output_sizes.push_back(input.size(i)); - } - output_sizes.push_back(1); - - auto scale_out = - at::empty(output_sizes, at::device(at::kCPU).dtype(at::kDouble)); - auto zero_point_out = - at::empty(output_sizes, at::device(at::kCPU).dtype(at::kLong)); - - ScalarType et_dtype = at_scalartype_to_et_scalartype(dtype); - - // Use WRAP_TO_ATEN with the wrapper function - WRAP_TO_ATEN(choose_qparams_per_token_asymmetric_out_no_context, 2) - (input, et_dtype, scale_out, zero_point_out); - - return {scale_out, zero_point_out}; -} - -} // namespace native -} // namespace executor -} // namespace torch - -// -// Reference Implementation -// - -/* - * Reference implementation of choose_qparams_tensor - */ -std::tuple choose_qparams_tensor_reference_impl( - const at::Tensor& input, - int64_t quant_min, - int64_t quant_max) { - // Create output tensors - at::Tensor scale_out = at::empty({}, at::device(at::kCPU).dtype(at::kDouble)); - at::Tensor zero_point_out = - at::empty({}, at::device(at::kCPU).dtype(at::kLong)); - - // Find min and max values in the input tensor - float min_val = input.min().item(); - float max_val = input.max().item(); - - // Extend the [min, max] interval to ensure it contains 0 - min_val = std::min(min_val, 0.f); - max_val = std::max(max_val, 0.f); - - // Calculate scale - double scale = - (static_cast(max_val) - min_val) / (quant_max - quant_min); - - // Handle small scale - constexpr float SMALL_SCALE_THRESHOLD = 6.1e-5f; - if (float(scale) == 0.0f || std::isinf(1.0f / float(scale))) { - scale = 0.1; - } - - if (scale < SMALL_SCALE_THRESHOLD) { - float org_scale = scale; - scale = SMALL_SCALE_THRESHOLD; - // Adjust min and max based on new scale - if (min_val == 0.0f) { - max_val = SMALL_SCALE_THRESHOLD * (quant_max - quant_min); - } else if (max_val == 0.0f) { - min_val = -SMALL_SCALE_THRESHOLD * (quant_max - quant_min); - } else { - float amplifier = SMALL_SCALE_THRESHOLD / org_scale; - min_val *= amplifier; - max_val *= amplifier; - } - } - - // Calculate zero point - double zero_point_from_min = quant_min - min_val / static_cast(scale); - double zero_point_from_max = quant_max - max_val / static_cast(scale); - double zero_point_from_min_error = - std::abs(quant_min) - std::abs(min_val / static_cast(scale)); - double zero_point_from_max_error = - std::abs(quant_max) - std::abs(max_val / static_cast(scale)); - double initial_zero_point = - zero_point_from_min_error < zero_point_from_max_error - ? 
zero_point_from_min - : zero_point_from_max; - - // Nudge zero point to be an integer - int64_t nudged_zero_point = 0; - if (initial_zero_point < quant_min) { - nudged_zero_point = quant_min; - } else if (initial_zero_point > quant_max) { - nudged_zero_point = quant_max; - } else { - nudged_zero_point = std::nearbyint(static_cast(initial_zero_point)); - } - - // Set output values - use item_mutable() for scalar tensors - scale_out.fill_(scale); - zero_point_out.fill_(nudged_zero_point); - - return std::make_tuple(scale_out, zero_point_out); -} - -/* - * Reference implementation of choose_qparams_per_token_asymmetric - */ -std::tuple -choose_qparams_per_token_asymmetric_reference_impl( - const at::Tensor& input, - at::ScalarType dtype) { - // For per-token quantization, we need to compute scale and zero_point for - // each token - int64_t quant_min = -128; - int64_t quant_max = 127; - - // Calculate output sizes - std::vector output_sizes; - for (int64_t i = 0; i < input.dim() - 1; i++) { - output_sizes.push_back(input.size(i)); - } - output_sizes.push_back(1); - - // Create output tensors - at::Tensor scale_out = - at::empty(output_sizes, at::device(at::kCPU).dtype(at::kDouble)); - at::Tensor zero_point_out = - at::empty(output_sizes, at::device(at::kCPU).dtype(at::kLong)); - - // Calculate number of tokens - int64_t num_tokens = 1; - for (int64_t i = 0; i < input.dim() - 1; i++) { - num_tokens *= input.size(i); - } - - // Reshape input to [num_tokens, last_dim] - at::Tensor reshaped_input = input.reshape({num_tokens, input.size(-1)}); - - // Process each token - for (int64_t token_idx = 0; token_idx < num_tokens; token_idx++) { - at::Tensor token = reshaped_input[token_idx]; - - // Find min and max values for this token - float min_val = token.min().item(); - float max_val = token.max().item(); - - // Extend the [min, max] interval to ensure it contains 0 - min_val = std::min(min_val, 0.f); - max_val = std::max(max_val, 0.f); - - // Calculate scale - double scale = - (static_cast(max_val) - min_val) / (quant_max - quant_min); - - // Handle small scale - constexpr float SMALL_SCALE_THRESHOLD = 6.1e-5f; - if (float(scale) == 0.0f || std::isinf(1.0f / float(scale))) { - scale = 0.1; - } - - if (scale < SMALL_SCALE_THRESHOLD) { - float org_scale = scale; - scale = SMALL_SCALE_THRESHOLD; - // Adjust min and max based on new scale - if (min_val == 0.0f) { - max_val = SMALL_SCALE_THRESHOLD * (quant_max - quant_min); - } else if (max_val == 0.0f) { - min_val = -SMALL_SCALE_THRESHOLD * (quant_max - quant_min); - } else { - float amplifier = SMALL_SCALE_THRESHOLD / org_scale; - min_val *= amplifier; - max_val *= amplifier; - } - } - - // Calculate zero point - double zero_point_from_min = - quant_min - min_val / static_cast(scale); - double zero_point_from_max = - quant_max - max_val / static_cast(scale); - double zero_point_from_min_error = - std::abs(quant_min) - std::abs(min_val / static_cast(scale)); - double zero_point_from_max_error = - std::abs(quant_max) - std::abs(max_val / static_cast(scale)); - double initial_zero_point = - zero_point_from_min_error < zero_point_from_max_error - ? 
zero_point_from_min - : zero_point_from_max; - - // Nudge zero point to be an integer - int64_t nudged_zero_point = 0; - if (initial_zero_point < quant_min) { - nudged_zero_point = quant_min; - } else if (initial_zero_point > quant_max) { - nudged_zero_point = quant_max; - } else { - nudged_zero_point = - std::nearbyint(static_cast(initial_zero_point)); - } - - // Set output values for this token - use index_put_ for safety - scale_out.view({num_tokens, 1}).index_put_({token_idx, 0}, scale); - zero_point_out.view({num_tokens, 1}) - .index_put_({token_idx, 0}, nudged_zero_point); - } - - return std::make_tuple(scale_out, zero_point_out); -} - -// Forward declaration of implementation functions -void test_vulkan_choose_qparams_tensor_impl( - const std::vector& input_sizes, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage); - -void test_vulkan_choose_qparams_per_token_asymmetric_impl( - const std::vector& input_sizes, - at::ScalarType dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage); - -// Wrapper function to test both buffer and texture storage types -void test_vulkan_choose_qparams_tensor( - const std::vector& input_sizes, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - // Test with buffer storage - test_vulkan_choose_qparams_tensor_impl( - input_sizes, - quant_min, - quant_max, - dtype, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - // Test with texture storage - test_vulkan_choose_qparams_tensor_impl( - input_sizes, - quant_min, - quant_max, - dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -// Wrapper function to test both buffer and texture storage types -void test_vulkan_choose_qparams_per_token_asymmetric( - const std::vector& input_sizes, - at::ScalarType dtype) { - // Test with buffer storage - test_vulkan_choose_qparams_per_token_asymmetric_impl( - input_sizes, dtype, vkcompute::utils::kBuffer, vkcompute::utils::kBuffer); - - // Test with texture storage - test_vulkan_choose_qparams_per_token_asymmetric_impl( - input_sizes, - dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -void test_reference_choose_qparams_tensor( - const std::vector& input_sizes, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::rand(input_sizes_int64, at::device(at::kCPU).dtype(at::kFloat)); - - // Get reference output - auto [reference_scale, reference_zero_point] = - choose_qparams_tensor_reference_impl(input, quant_min, quant_max); - - // Get implementation output - auto [impl_scale, impl_zero_point] = - torch::executor::native::choose_qparams_tensor_aten( - input, quant_min, quant_max, dtype); - - // Compare outputs - const bool scale_correct = at::allclose(reference_scale, impl_scale); - const bool zero_point_correct = - at::equal(reference_zero_point, impl_zero_point); - - if (!scale_correct || !zero_point_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference scale:" << std::endl; - std::cout << reference_scale << std::endl; - std::cout << "implementation scale:" << std::endl; - 
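
The two reference implementations above (per-tensor and per-token) share the same asymmetric quantization-parameter math. The following is a minimal, self-contained restatement of those steps for readers skimming the diff, not the ExecuTorch implementation itself; the variable names and the sample range are illustrative, and the small-scale clamping branch is omitted for brevity.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  float min_val = -0.4f, max_val = 1.2f;   // observed range of the tensor
  const int64_t quant_min = -128, quant_max = 127;

  // The representable range must contain zero.
  min_val = std::min(min_val, 0.0f);
  max_val = std::max(max_val, 0.0f);

  const double scale =
      (static_cast<double>(max_val) - min_val) / (quant_max - quant_min);

  // Pick the zero point whose rounding error is smaller, then clamp and round.
  const double zp_from_min = quant_min - min_val / scale;
  const double zp_from_max = quant_max - max_val / scale;
  const double initial_zp =
      std::abs(quant_min) - std::abs(min_val / scale) <
              std::abs(quant_max) - std::abs(max_val / scale)
          ? zp_from_min
          : zp_from_max;
  const int64_t zero_point = static_cast<int64_t>(std::nearbyint(
      std::min<double>(quant_max, std::max<double>(quant_min, initial_zp))));

  std::printf("scale=%f zero_point=%lld\n", scale, (long long)zero_point);
  return 0;
}
```

The per-token variant in the reference code above applies exactly this computation once per token, where a token is the product of all leading dimensions.
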
std::cout << impl_scale << std::endl; - std::cout << "reference zero_point:" << std::endl; - std::cout << reference_zero_point << std::endl; - std::cout << "implementation zero_point:" << std::endl; - std::cout << impl_zero_point << std::endl; - } - - ASSERT_TRUE(scale_correct && zero_point_correct); -} - -void test_vulkan_choose_qparams_tensor_impl( - const std::vector& input_sizes, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage) { - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::rand(input_sizes_int64, at::device(at::kCPU).dtype(at::kFloat)); - - // Get reference output - auto [reference_scale, reference_zero_point] = - torch::executor::native::choose_qparams_tensor_aten( - input, quant_min, quant_max, dtype); - - // Build Vulkan choose_qparams_tensor graph - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - IOValueRef r_input = graph.add_input_tensor( - input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage); - - const ValueRef r_quant_min = graph.add_scalar(quant_min); - const ValueRef r_quant_max = graph.add_scalar(quant_max); - - // Output tensors - const ValueRef r_scale = graph.add_tensor({}, vkapi::kFloat, out_storage); - const ValueRef r_zero_point = graph.add_tensor({}, vkapi::kInt, out_storage); - - // Create output tuple - const ValueRef r_out_tuple = graph.add_value_list({r_scale, r_zero_point}); - - // Add eps and dtype parameters to match ATen signature - const ValueRef r_eps = graph.add_scalar(6.1e-5); - const ValueRef r_dtype = - graph.add_scalar(static_cast(dtype)); - - VK_GET_OP_FN("quantized_decomposed.choose_qparams.tensor") - (graph, - { - r_input.value, - r_quant_min, - r_quant_max, - r_eps, - r_dtype, - r_out_tuple, - }); - - ValueRef staging_scale = graph.set_output_tensor(r_scale); - ValueRef staging_zero_point = graph.set_output_tensor(r_zero_point); - - graph.prepare(); - - graph.prepack(); - - // Run Vulkan choose_qparams_tensor - graph.copy_into_staging( - r_input.staging, input.const_data_ptr(), input.numel()); - - graph.execute(); - - // Create output tensors to hold the results - use types that match GPU output - at::Tensor vk_scale = - at::empty({}, at::device(at::kCPU).dtype(at::kFloat)).contiguous(); - at::Tensor vk_zero_point = - at::empty({}, at::device(at::kCPU).dtype(at::kInt)).contiguous(); - - // Copy results from GPU to CPU - graph.copy_from_staging( - staging_scale, vk_scale.mutable_data_ptr(), vk_scale.numel()); - graph.copy_from_staging( - staging_zero_point, - vk_zero_point.mutable_data_ptr(), - vk_zero_point.numel()); - - // Convert reference values to match Vulkan output types for comparison - at::Tensor reference_scale_float = reference_scale.to(at::kFloat); - at::Tensor reference_zero_point_int = reference_zero_point.to(at::kInt); - - // Compare outputs - const bool scale_correct = at::allclose(reference_scale_float, vk_scale); - const bool zero_point_correct = - at::equal(reference_zero_point_int, vk_zero_point); - - if (!scale_correct || !zero_point_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " storage type: " - << (in_storage == vkcompute::utils::kBuffer ? 
"buffer" - : "texture") - << std::endl; - - // make sure that there arent a ton of elements in the input tensor - if (input.numel() < 100) { - std::cout << "input:" << std::endl; - std::cout << input << "\n" << std::endl; - std::cout << "reference scale:" << std::endl; - std::cout << reference_scale << std::endl; - std::cout << "vulkan scale:" << std::endl; - std::cout << vk_scale << "\n" << std::endl; - std::cout << "reference zero_point:" << std::endl; - std::cout << reference_zero_point << std::endl; - std::cout << "vulkan zero_point:" << std::endl; - std::cout << vk_zero_point << std::endl; - } - } - - ASSERT_TRUE(scale_correct && zero_point_correct); -} - -TEST(VulkanChooseQparamsTest, test_reference_choose_qparams_tensor_int8) { - test_reference_choose_qparams_tensor( - {2, 3, 4}, // input sizes - -128, // quant_min - 127, // quant_max - at::kChar); -} - -TEST(VulkanChooseQparamsTest, test_vulkan_choose_qparams_tensor_uint8_4D) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_choose_qparams_tensor( - {5, 3, 2, 4}, // input sizes - 0, // quant_min - 255, // quant_max - at::kByte); -} - -TEST(VulkanChooseQparamsTest, test_vulkan_choose_qparams_tensor_int8_2D) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_choose_qparams_tensor( - {5, 5}, // input sizes - -128, // quant_min - 127, // quant_max - at::kChar); -} - -TEST(VulkanChooseQparamsTest, test_vulkan_choose_qparams_tensor_int8_3D) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_choose_qparams_tensor( - {12, 8, 2}, // input sizes - -128, // quant_min - 127, // quant_max - at::kChar); -} - -TEST(VulkanChooseQparamsTest, test_vulkan_choose_qparams_tensor_int8_4D) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_choose_qparams_tensor( - {10, 10, 6, 4}, // input sizes - -128, // quant_min - 127, // quant_max - at::kChar); -} - -void test_reference_choose_qparams_per_token_asymmetric( - const std::vector& input_sizes, - at::ScalarType dtype) { - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::rand(input_sizes_int64, at::device(at::kCPU).dtype(at::kFloat)); - - // Get reference output - auto [reference_scale, reference_zero_point] = - choose_qparams_per_token_asymmetric_reference_impl(input, dtype); - - // Get implementation output - auto [impl_scale, impl_zero_point] = - torch::executor::native::choose_qparams_per_token_asymmetric_aten( - input, dtype); - - // Compare outputs - const bool scale_correct = at::allclose(reference_scale, impl_scale); - const bool zero_point_correct = - at::equal(reference_zero_point, impl_zero_point); - - if (!scale_correct || !zero_point_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference scale:" << std::endl; - std::cout << reference_scale << std::endl; - std::cout << "implementation scale:" << std::endl; - std::cout << impl_scale << std::endl; - std::cout << "reference zero_point:" << std::endl; - std::cout << reference_zero_point << std::endl; - std::cout << "implementation zero_point:" << std::endl; - std::cout << impl_zero_point << std::endl; - } - - ASSERT_TRUE(scale_correct && zero_point_correct); -} - -void 
test_vulkan_choose_qparams_per_token_asymmetric_impl( - const std::vector& input_sizes, - at::ScalarType dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage) { - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::rand(input_sizes_int64, at::device(at::kCPU).dtype(at::kFloat)); - - // Calculate output sizes - std::vector output_sizes; - for (int64_t i = 0; i < input.dim() - 1; i++) { - output_sizes.push_back(input.size(i)); - } - output_sizes.push_back(1); - - // Get reference output - auto [reference_scale, reference_zero_point] = - torch::executor::native::choose_qparams_per_token_asymmetric_aten( - input, dtype); - - // Build Vulkan choose_qparams_per_token_asymmetric graph - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - IOValueRef r_input = graph.add_input_tensor( - input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage); - - // Output tensors - const ValueRef r_scale = - graph.add_tensor(output_sizes, vkapi::kFloat, out_storage); - const ValueRef r_zero_point = - graph.add_tensor(output_sizes, vkapi::kInt, out_storage); - - // Create output tuple - const ValueRef r_out_tuple = graph.add_value_list({r_scale, r_zero_point}); - - // Add dtype parameter to match ATen signature - const ValueRef r_dtype = - graph.add_scalar(static_cast(dtype)); - - VK_GET_OP_FN( - "quantized_decomposed.choose_qparams_per_token_asymmetric.default") - (graph, - { - r_input.value, - r_dtype, - r_out_tuple, - }); - - ValueRef staging_scale = graph.set_output_tensor(r_scale); - ValueRef staging_zero_point = graph.set_output_tensor(r_zero_point); - - graph.prepare(); - - graph.prepack(); - - // Run Vulkan choose_qparams_per_token_asymmetric - graph.copy_into_staging( - r_input.staging, input.const_data_ptr(), input.numel()); - - graph.execute(); - - // Create output tensors to hold the results - use types that match GPU output - at::Tensor vk_scale = - at::empty(output_sizes, at::device(at::kCPU).dtype(at::kFloat)) - .contiguous(); - at::Tensor vk_zero_point = - at::empty(output_sizes, at::device(at::kCPU).dtype(at::kInt)) - .contiguous(); - - // Copy results from GPU to CPU - graph.copy_from_staging( - staging_scale, vk_scale.mutable_data_ptr(), vk_scale.numel()); - graph.copy_from_staging( - staging_zero_point, - vk_zero_point.mutable_data_ptr(), - vk_zero_point.numel()); - - // Convert reference values to match Vulkan output types for comparison - at::Tensor reference_scale_float = reference_scale.to(at::kFloat); - at::Tensor reference_zero_point_int = reference_zero_point.to(at::kInt); - - // Compare outputs - const bool scale_correct = at::allclose(reference_scale_float, vk_scale); - const bool zero_point_correct = - at::equal(reference_zero_point_int, vk_zero_point); - if (!scale_correct || !zero_point_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " storage type: " - << (in_storage == vkcompute::utils::kBuffer ? 
"buffer" - : "texture") - << std::endl; - - if (input.numel() < 100) { - std::cout << "input:" << std::endl; - std::cout << input << "\n" << std::endl; - std::cout << "reference scale:" << std::endl; - std::cout << reference_scale << std::endl; - std::cout << "vulkan scale:" << std::endl; - std::cout << vk_scale << "\n" << std::endl; - std::cout << "reference zero_point:" << std::endl; - std::cout << reference_zero_point << std::endl; - std::cout << "vulkan zero_point:" << std::endl; - std::cout << vk_zero_point << std::endl; - } - } - - ASSERT_TRUE(scale_correct && zero_point_correct); -} - -TEST( - VulkanChooseQparamsTest, - test_reference_choose_qparams_per_token_asymmetric_int8) { - test_reference_choose_qparams_per_token_asymmetric( - {2, 3, 4}, // input sizes (2*3=6 tokens) - at::kChar); -} - -TEST( - VulkanChooseQparamsTest, - test_vulkan_choose_qparams_per_token_asymmetric_int8_1D) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_choose_qparams_per_token_asymmetric({7}, at::kChar); -} - -TEST( - VulkanChooseQparamsTest, - test_vulkan_choose_qparams_per_token_asymmetric_int8_2D) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_choose_qparams_per_token_asymmetric({2, 2}, at::kChar); -} - -TEST( - VulkanChooseQparamsTest, - test_vulkan_choose_qparams_per_token_asymmetric_int8_3D) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_choose_qparams_per_token_asymmetric({3, 6, 4}, at::kChar); -} - -TEST( - VulkanChooseQparamsTest, - test_vulkan_choose_qparams_per_token_asymmetric_int8_4D) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_choose_qparams_per_token_asymmetric({128, 2, 16, 3}, at::kChar); -} diff --git a/backends/vulkan/test/op_tests/dequantize_test.cpp b/backends/vulkan/test/op_tests/dequantize_test.cpp deleted file mode 100644 index 9fca2c632d3..00000000000 --- a/backends/vulkan/test/op_tests/dequantize_test.cpp +++ /dev/null @@ -1,2492 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include - -#include -#include -#include - -#include -#include - -#include "test_utils.h" - -#include -#include -#include -#include - -namespace torch { -namespace executor { -namespace native { - -// Forward declarations of the functions we're testing -Tensor& dequantize_per_tensor_out( - const Tensor& input, - double scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - executorch::aten::optional out_dtype, - Tensor& out); - -Tensor& dequantize_per_token_out( - const Tensor& input, - const Tensor& scale, - const Tensor& zero_points, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - ScalarType out_dtype, - Tensor& out); - -Tensor& dequantize_per_channel_out( - const Tensor& input, - const Tensor& scale, - const std::optional& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - executorch::aten::optional out_dtype, - Tensor& out); - -Tensor& dequantize_per_tensor_tensor_args_out( - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - executorch::aten::optional out_dtype, - Tensor& out); - -// Wrapper function for dequantize_per_tensor_out without context -Tensor& dequantize_per_tensor_out_no_context( - const Tensor& input, - double scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - executorch::aten::optional out_dtype, - Tensor& out) { - return torch::executor::native::dequantize_per_tensor_out( - input, scale, zero_point, quant_min, quant_max, dtype, out_dtype, out); -} - -// Wrapper function for dequantize_per_token_out without context -Tensor& dequantize_per_token_out_no_context( - const Tensor& input, - const Tensor& scale, - const Tensor& zero_points, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - ScalarType out_dtype, - Tensor& out) { - return torch::executor::native::dequantize_per_token_out( - input, scale, zero_points, quant_min, quant_max, dtype, out_dtype, out); -} - -// Wrapper function for dequantize_per_channel_out without context -Tensor& dequantize_per_channel_out_no_context( - const Tensor& input, - const Tensor& scale, - const std::optional& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - executorch::aten::optional out_dtype, - Tensor& out) { - return torch::executor::native::dequantize_per_channel_out( - input, - scale, - zero_points, - axis, - quant_min, - quant_max, - dtype, - out_dtype, - out); -} - -// Wrapper function for dequantize_per_tensor_tensor_args_out without context -Tensor& dequantize_per_tensor_tensor_args_out_no_context( - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - executorch::aten::optional out_dtype, - Tensor& out) { - return torch::executor::native::dequantize_per_tensor_tensor_args_out( - input, scale, zero_point, quant_min, quant_max, dtype, out_dtype, out); -} - -// ATen wrapper for dequantize_per_tensor -at::Tensor dequantize_per_tensor_aten( - const at::Tensor& input, - double scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - auto out = at::empty_like(input, out_dtype); - // Convert at::ScalarType to executorch::ScalarType - ScalarType et_dtype = at_scalartype_to_et_scalartype(dtype); - ScalarType et_out_dtype = at_scalartype_to_et_scalartype(out_dtype); - - executorch::aten::optional 
opt_et_out_dtype(et_out_dtype); - - WRAP_TO_ATEN(dequantize_per_tensor_out_no_context, 7) - (input, - scale, - zero_point, - quant_min, - quant_max, - et_dtype, - opt_et_out_dtype, - out); - return out; -} - -// ATen wrapper for dequantize_per_token -at::Tensor dequantize_per_token_aten( - const at::Tensor& input, - const at::Tensor& scale, - const at::Tensor& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - auto out = at::empty_like(input, out_dtype); - // Convert at::ScalarType to executorch::ScalarType - ScalarType et_dtype = at_scalartype_to_et_scalartype(dtype); - ScalarType et_out_dtype = at_scalartype_to_et_scalartype(out_dtype); - - WRAP_TO_ATEN(dequantize_per_token_out_no_context, 7) - (input, - scale, - zero_points, - quant_min, - quant_max, - et_dtype, - et_out_dtype, - out); - return out; -} - -// ATen wrapper for dequantize_per_channel -at::Tensor dequantize_per_channel_aten( - const at::Tensor& input, - const at::Tensor& scale, - const std::optional& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - auto out = at::empty_like(input, out_dtype); - // Convert at::ScalarType to executorch::ScalarType - ScalarType et_dtype = at_scalartype_to_et_scalartype(dtype); - ScalarType et_out_dtype = at_scalartype_to_et_scalartype(out_dtype); - - executorch::aten::optional opt_et_out_dtype(et_out_dtype); - - WRAP_TO_ATEN(dequantize_per_channel_out_no_context, 8) - (input, - scale, - zero_points, - axis, - quant_min, - quant_max, - et_dtype, - opt_et_out_dtype, - out); - return out; -} - -// ATen wrapper for dequantize_per_tensor with tensor args -at::Tensor dequantize_per_tensor_tensor_args_aten( - const at::Tensor& input, - const at::Tensor& scale, - const at::Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - auto out = at::empty_like(input, out_dtype); - // Convert at::ScalarType to executorch::ScalarType - ScalarType et_dtype = at_scalartype_to_et_scalartype(dtype); - ScalarType et_out_dtype = at_scalartype_to_et_scalartype(out_dtype); - - executorch::aten::optional opt_et_out_dtype(et_out_dtype); - - WRAP_TO_ATEN(dequantize_per_tensor_tensor_args_out_no_context, 7) - (input, - scale, - zero_point, - quant_min, - quant_max, - et_dtype, - opt_et_out_dtype, - out); - return out; -} - -} // namespace native -} // namespace executor -} // namespace torch - -void check_dequantize_args( - int64_t quant_min, - int64_t quant_max, - c10::ScalarType in_dtype, - c10::ScalarType out_dtype) { - using namespace vkcompute; - - // Check that quant_min <= quant_max - VK_CHECK_COND( - quant_min <= quant_max, - "quant_min must be <= quant_max, got quant_min: ", - quant_min, - " quant_max: ", - quant_max); - - // Check that input dtype is a quantized type - switch (in_dtype) { - case c10::kByte: - case c10::kChar: - case c10::kShort: - case c10::kInt: - case c10::kLong: - break; - default: - VK_THROW( - "Unsupported input dtype: ", - scalar_type_name(in_dtype), - " (", - static_cast(in_dtype), - ")"); - } - - // Check that output dtype is a floating point type - switch (out_dtype) { - case c10::kHalf: - case c10::kFloat: - case c10::kDouble: - break; - default: - VK_THROW( - "Unsupported output dtype: ", - scalar_type_name(out_dtype), - " (", - static_cast(out_dtype), - ")"); - } -} - -/** - * Helper function to validate dequantize_per_channel arguments - * Similar to the validation in quantize_test.cpp 
- */ -void check_dequantize_per_channel_args( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t axis) { - // Normalize axis - int64_t normalized_axis = axis; - if (normalized_axis < 0) { - normalized_axis += input_sizes.size(); - } - - ASSERT_GE(normalized_axis, 0) - << "axis " << axis << " is not legal, normalized axis " << normalized_axis - << " should be >= 0"; - - ASSERT_LT(normalized_axis, static_cast(input_sizes.size())) - << "axis " << axis << " is not legal, normalized axis " << normalized_axis - << " should be < input.dim() " << input_sizes.size(); - - int64_t num_channels = input_sizes[normalized_axis]; - - ASSERT_EQ(num_channels, static_cast(scales.size())) - << "Expected scales.size() to match input.size(axis) (" << num_channels - << "), but got " << scales.size(); - - ASSERT_EQ(num_channels, static_cast(zero_points.size())) - << "Expected zero_points.size() to match input.size(axis) (" - << num_channels << "), but got " << zero_points.size(); -} - -// -// Reference Implementation -// - -/* - * Reference implementation of dequantize_per_tensor - */ -at::Tensor dequantize_per_tensor_reference_impl( - const at::Tensor& input, - double scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - // Create output tensor with the target dtype - at::Tensor out = at::empty_like(input, out_dtype); - - // Dequantize the input tensor - at::Tensor flat_input = input.flatten(); - at::Tensor flat_out = out.flatten(); - - // Store casted values to avoid repeated casting - const int32_t zero_point_int32 = static_cast(zero_point); - const float scale_float = static_cast(scale); - - for (int i = 0; i < flat_input.numel(); i++) { - double dequantized_value = 0.0; - - // Extract quantized value and dequantize based on input dtype - // Following the CPU implementation pattern: (input - zero_point) * scale - if (dtype == at::kByte) { - uint8_t qvalue = flat_input[i].item(); - dequantized_value = (qvalue - zero_point_int32) * scale_float; - } else if (dtype == at::kChar) { - int8_t qvalue = flat_input[i].item(); - dequantized_value = (qvalue - zero_point_int32) * scale_float; - } else if (dtype == at::kShort) { - int16_t qvalue = flat_input[i].item(); - dequantized_value = (qvalue - zero_point_int32) * scale_float; - } else if (dtype == at::kInt) { - int32_t qvalue = flat_input[i].item(); - dequantized_value = (qvalue - zero_point_int32) * scale_float; - } else if (dtype == at::kLong) { - int64_t qvalue = flat_input[i].item(); - dequantized_value = (qvalue - zero_point_int32) * scale_float; - } - - // Store result based on output dtype - if (out_dtype == at::kFloat) { - flat_out[i] = static_cast(dequantized_value); - } else if (out_dtype == at::kDouble) { - flat_out[i] = dequantized_value; - } else if (out_dtype == at::kHalf) { - flat_out[i] = static_cast(dequantized_value); - } - } - - return out.reshape(input.sizes()); -} - -/* - * Reference implementation of dequantize_per_token - */ -at::Tensor dequantize_per_token_reference_impl( - const at::Tensor& input, - const at::Tensor& scale, - const at::Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - // Create output tensor with the target dtype - at::Tensor out = at::empty_like(input, out_dtype); - - // Calculate number of tokens - int num_tokens = 1; - for (int i = 0; i < input.dim() - 1; i++) { - num_tokens *= input.size(i); - } - - // Verify that the number of 
tokens matches the size of scale and zero_point - // tensors - assert(num_tokens == scale.numel()); - assert(num_tokens == zero_point.numel()); - - // Reshape input to [num_tokens, last_dim] - at::Tensor reshaped_input = input.reshape({num_tokens, input.size(-1)}); - at::Tensor reshaped_out = out.reshape({num_tokens, input.size(-1)}); - - // Dequantize each token separately - for (int token_idx = 0; token_idx < num_tokens; token_idx++) { - // Get scale and zero_point for this token - float token_scale = scale[token_idx].item(); - int64_t token_zero_point = zero_point[token_idx].item(); - - // Store casted values to avoid repeated casting - const int32_t token_zero_point_int32 = - static_cast(token_zero_point); - - // Dequantize the token - for (int i = 0; i < input.size(-1); i++) { - double dequantized_value = 0.0; - - // Extract quantized value and dequantize based on input dtype - // Following the CPU implementation pattern: (input - zero_point) * scale - if (dtype == at::kByte) { - uint8_t qvalue = reshaped_input[token_idx][i].item(); - dequantized_value = (qvalue - token_zero_point_int32) * token_scale; - } else if (dtype == at::kChar) { - int8_t qvalue = reshaped_input[token_idx][i].item(); - dequantized_value = (qvalue - token_zero_point_int32) * token_scale; - } else if (dtype == at::kShort) { - int16_t qvalue = reshaped_input[token_idx][i].item(); - dequantized_value = (qvalue - token_zero_point_int32) * token_scale; - } else if (dtype == at::kInt) { - int32_t qvalue = reshaped_input[token_idx][i].item(); - dequantized_value = (qvalue - token_zero_point_int32) * token_scale; - } else if (dtype == at::kLong) { - int64_t qvalue = reshaped_input[token_idx][i].item(); - dequantized_value = (qvalue - token_zero_point_int32) * token_scale; - } else { - throw std::runtime_error("Unsupported input dtype"); - } - - // Store result based on output dtype - if (out_dtype == at::kFloat) { - reshaped_out[token_idx][i] = static_cast(dequantized_value); - } else if (out_dtype == at::kDouble) { - reshaped_out[token_idx][i] = dequantized_value; - } else if (out_dtype == at::kHalf) { - reshaped_out[token_idx][i] = static_cast(dequantized_value); - } - } - } - - return out; -} - -/* - * Reference implementation of dequantize_per_channel - */ -at::Tensor dequantize_per_channel_reference_impl( - const at::Tensor& input, - const at::Tensor& scale, - const std::optional& zero_point, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - // Normalize axis to handle negative values - int64_t normalized_axis = axis; - if (normalized_axis < 0) { - normalized_axis += input.dim(); - } - - // Create output tensor with the same shape as input but with target dtype - at::Tensor output = at::empty_like(input, out_dtype); - - // Get the number of channels along the quantization axis - int64_t num_channels = input.size(normalized_axis); - - // Calculate strides for efficient indexing - std::vector input_strides; - std::vector input_sizes; - for (int64_t i = 0; i < input.dim(); i++) { - input_sizes.push_back(input.size(i)); - input_strides.push_back(input.stride(i)); - } - - // Get data pointers - const double* scale_data = scale.const_data_ptr(); - const int64_t* zero_point_data = nullptr; - if (zero_point.has_value()) { - zero_point_data = zero_point.value().const_data_ptr(); - } - - // Iterate through all elements in the tensor - int64_t total_elements = input.numel(); - - // Helper lambda to convert flat index to multi-dimensional coordinates - auto 
flat_to_coords = [&](int64_t flat_idx, std::vector& coords) { - int64_t remaining = flat_idx; - for (int64_t dim = input.dim() - 1; dim >= 0; dim--) { - coords[dim] = remaining % input_sizes[dim]; - remaining /= input_sizes[dim]; - } - }; - - // Process each element - std::vector coords(input.dim()); - for (int64_t flat_idx = 0; flat_idx < total_elements; flat_idx++) { - // Convert flat index to coordinates - flat_to_coords(flat_idx, coords); - - // Get the channel index for this element - int64_t channel_idx = coords[normalized_axis]; - - // Get the quantization parameters for this channel - double channel_scale = scale_data[channel_idx]; - int64_t channel_zero_point = 0; - if (zero_point_data != nullptr) { - channel_zero_point = zero_point_data[channel_idx]; - } - - // Store casted values to avoid repeated casting - const int32_t channel_zero_point_int32 = - static_cast(channel_zero_point); - const float channel_scale_float = static_cast(channel_scale); - - // Get the input value and dequantize - double dequantized_value = 0.0; - - // Extract quantized value and dequantize based on input dtype - // Following the CPU implementation pattern: (input - zero_point) * scale - if (dtype == at::kByte) { - uint8_t qvalue = input.flatten()[flat_idx].item(); - dequantized_value = - (qvalue - channel_zero_point_int32) * channel_scale_float; - } else if (dtype == at::kChar) { - int8_t qvalue = input.flatten()[flat_idx].item(); - dequantized_value = - (qvalue - channel_zero_point_int32) * channel_scale_float; - } else if (dtype == at::kShort) { - int16_t qvalue = input.flatten()[flat_idx].item(); - dequantized_value = - (qvalue - channel_zero_point_int32) * channel_scale_float; - } else if (dtype == at::kInt) { - int32_t qvalue = input.flatten()[flat_idx].item(); - dequantized_value = - (qvalue - channel_zero_point_int32) * channel_scale_float; - } else if (dtype == at::kLong) { - int64_t qvalue = input.flatten()[flat_idx].item(); - dequantized_value = - (qvalue - channel_zero_point_int32) * channel_scale_float; - } else { - throw std::runtime_error("Unsupported input dtype"); - } - - // Store the result based on output dtype - if (out_dtype == at::kFloat) { - output.flatten()[flat_idx] = static_cast(dequantized_value); - } else if (out_dtype == at::kDouble) { - output.flatten()[flat_idx] = dequantized_value; - } else if (out_dtype == at::kHalf) { - output.flatten()[flat_idx] = static_cast(dequantized_value); - } - } - - return output; -} - -// Forward declaration of implementation functions -void test_vulkan_dequantize_per_token_impl( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage); - -void test_vulkan_dequantize_per_channel_impl( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage); - -void test_vulkan_dequantize_per_tensor_tensor_impl( - const std::vector& input_sizes, - float scale, - int zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage); - -// Wrapper 
function to test both buffer and texture storage types -void test_vulkan_dequantize_per_token( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - // Test with buffer storage - test_vulkan_dequantize_per_token_impl( - input_sizes, - scales, - zero_points, - quant_min, - quant_max, - dtype, - out_dtype, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - // Telling the system to expect a float instead of a double - // since the shader can only return 32bit anyways - if (out_dtype == at::kDouble) { - out_dtype = at::kFloat; - } - - // Test with texture storage - test_vulkan_dequantize_per_token_impl( - input_sizes, - scales, - zero_points, - quant_min, - quant_max, - dtype, - out_dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -// Wrapper function to test both buffer and texture storage types -void test_vulkan_dequantize_per_channel( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - // Test with buffer storage - test_vulkan_dequantize_per_channel_impl( - input_sizes, - scales, - zero_points, - axis, - quant_min, - quant_max, - dtype, - out_dtype, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - // Telling the system to expect a float instead of a double - // since the shader can only return 32bit anyways - if (out_dtype == at::kDouble) { - out_dtype = at::kFloat; - } - - // Test with texture storage - test_vulkan_dequantize_per_channel_impl( - input_sizes, - scales, - zero_points, - axis, - quant_min, - quant_max, - dtype, - out_dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -// Wrapper function to test both buffer and texture storage types -void test_vulkan_dequantize_per_tensor_tensor( - const std::vector& input_sizes, - float scale, - int zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - // Test with buffer storage - test_vulkan_dequantize_per_tensor_tensor_impl( - input_sizes, - scale, - zero_point, - quant_min, - quant_max, - dtype, - out_dtype, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - // Telling the system to expect a float instead of a double - // since the shader can only return 32bit anyways - if (out_dtype == at::kDouble) { - out_dtype = at::kFloat; - } - - // Test with texture storage - test_vulkan_dequantize_per_tensor_tensor_impl( - input_sizes, - scale, - zero_point, - quant_min, - quant_max, - dtype, - out_dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -void test_reference_dequantize_per_tensor( - const std::vector& input_sizes, - float scale, - int zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - check_dequantize_args(quant_min, quant_max, dtype, out_dtype); - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - - // Create a quantized input tensor with values from quant_min to quant_max - at::Tensor input; - if (dtype == at::kByte) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kByte)); - } else if (dtype == at::kChar) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kChar)); - } else if (dtype == at::kShort) { - input = - at::zeros(input_sizes_int64, 
at::device(at::kCPU).dtype(at::kShort)); - } else if (dtype == at::kInt) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kInt)); - } else { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kLong)); - } - - // Fill with a simple pattern: values from quant_min to quant_max in steps - float step = 1.0f; - if (input.numel() > 1) { - step = static_cast(quant_max - quant_min) / (input.numel() - 1); - } - - auto flat_input = input.flatten(); - for (int i = 0; i < flat_input.numel(); i++) { - int64_t qvalue = quant_min + i * step; - if (dtype == at::kByte) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kChar) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kShort) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kInt) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kLong) { - flat_input[i] = static_cast(qvalue); - } - } - - // Reshape back to original dimensions - input = flat_input.reshape(input_sizes_int64); - - // Get reference output - at::Tensor reference_out = dequantize_per_tensor_reference_impl( - input, scale, zero_point, quant_min, quant_max, dtype, out_dtype); - - // Get implementation output - at::Tensor impl_out = torch::executor::native::dequantize_per_tensor_aten( - input, scale, zero_point, quant_min, quant_max, dtype, out_dtype); - - // Compare outputs - const bool output_correct = at::allclose(reference_out, impl_out); - if (!output_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " scale: " << scale << std::endl; - std::cout << " zero_point: " << zero_point << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " input dtype: " << dtype << std::endl; - std::cout << " output dtype: " << out_dtype << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference:" << std::endl; - std::cout << reference_out << std::endl; - std::cout << "implementation:" << std::endl; - std::cout << impl_out << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -TEST( - VulkanDequantizePerTensorTest, - test_reference_dequantize_per_tensor_uint8_to_float) { - test_reference_dequantize_per_tensor( - {2, 3, 4}, // input sizes - 0.1, // scale - 5, // zero_point - 0, // quant_min - 255, // quant_max - at::kByte, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTensorTest, - test_reference_dequantize_per_tensor_int8_to_float) { - test_reference_dequantize_per_tensor( - {3, 4, 5}, // input sizes - 0.05, // scale - 0, // zero_point - -128, // quant_min - 127, // quant_max - at::kChar, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTensorTest, - test_reference_dequantize_per_tensor_int32_to_float) { - test_reference_dequantize_per_tensor( - {4, 6, 2}, // input sizes - 0.2, // scale - 2, // zero_point - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kInt, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTensorTest, - test_reference_dequantize_per_tensor_uint8_to_half) { - test_reference_dequantize_per_tensor( - {7, 4}, // input sizes - 0.1, // scale - 10, // zero_point - 0, // quant_min - 255, // quant_max - at::kByte, // input dtype (uint8) - at::kHalf); // output dtype -} - -TEST( - VulkanDequantizePerTensorTest, - test_reference_dequantize_per_tensor_int32_to_half) { - 
test_reference_dequantize_per_tensor( - {2, 6, 5}, // input sizes - 0.3, // scale - -10, // zero_point - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kInt, // input dtype - at::kHalf); // output dtype -} - -// No Vulkan tests for quantized_decomposed.dequantize_per_tensor.default -// because it is not going to be implemented in Vulkan since we will -// be handling any future calls to this op via the export stage - -void test_reference_dequantize_per_token( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - check_dequantize_args(quant_min, quant_max, dtype, out_dtype); - int num_tokens = 1; - for (int i = 0; i < input_sizes.size() - 1; i++) { - num_tokens *= input_sizes[i]; - } - - ASSERT_EQ(num_tokens, scales.size()); - ASSERT_EQ(num_tokens, zero_points.size()); - - // Create input tensor with quantized values - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input; - if (dtype == at::kByte) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kByte)); - } else if (dtype == at::kChar) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kChar)); - } else if (dtype == at::kShort) { - input = - at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kShort)); - } else if (dtype == at::kInt) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kInt)); - } else { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kLong)); - } - - // Fill with a simple pattern: values from quant_min to quant_max in steps - at::Tensor reshaped_input = input.reshape({num_tokens, input.size(-1)}); - for (int token_idx = 0; token_idx < num_tokens; token_idx++) { - float step = 1.0f; - if (input.size(-1) > 1) { - step = static_cast(quant_max - quant_min) / (input.size(-1) - 1); - } - - for (int i = 0; i < input.size(-1); i++) { - int64_t qvalue = quant_min + i * step; - if (dtype == at::kByte) { - reshaped_input[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kChar) { - reshaped_input[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kShort) { - reshaped_input[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kInt) { - reshaped_input[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kLong) { - reshaped_input[token_idx][i] = static_cast(qvalue); - } - } - } - - // Reshape back to original dimensions - input = reshaped_input.reshape(input_sizes_int64); - - // Create scale and zero_point tensors - at::Tensor scale_tensor = - at::tensor(scales, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor zero_point_tensor = - at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong)); - - // Get reference output - at::Tensor reference_out = dequantize_per_token_reference_impl( - input, - scale_tensor, - zero_point_tensor, - quant_min, - quant_max, - dtype, - out_dtype); - - // Get implementation output - at::Tensor impl_out = torch::executor::native::dequantize_per_token_aten( - input, - scale_tensor, - zero_point_tensor, - quant_min, - quant_max, - dtype, - out_dtype); - - // Compare outputs - const bool output_correct = at::allclose(reference_out, impl_out); - if (!output_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " scale(s):"; - for (size_t i = 0; i < scales.size(); i++) { - std::cout << " " << scales[i] << " "; - } 
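
As the reference implementations earlier in this file spell out, per-token dequantization is the same affine formula as the per-tensor case, `(q - zero_point) * scale`, applied with one (scale, zero_point) pair per token, where a token is all dimensions except the last collapsed together. The sketch below is a standalone illustration with made-up sizes and values, not the tested kernel.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical input of shape [2, 3, 4] => 2*3 = 6 tokens of width 4.
  const std::vector<int64_t> sizes = {2, 3, 4};
  int64_t num_tokens = 1;
  for (size_t i = 0; i + 1 < sizes.size(); ++i) num_tokens *= sizes[i];
  const int64_t width = sizes.back();

  std::vector<int8_t> q(num_tokens * width, 42);     // quantized values
  std::vector<float> scale(num_tokens, 0.05f);       // one scale per token
  std::vector<int32_t> zero_point(num_tokens, -5);   // one zero point per token
  std::vector<float> out(q.size());

  for (int64_t t = 0; t < num_tokens; ++t) {
    for (int64_t i = 0; i < width; ++i) {
      // Same affine formula as the per-tensor case, with per-token params.
      out[t * width + i] = (q[t * width + i] - zero_point[t]) * scale[t];
    }
  }
  std::printf("out[0]=%f\n", out[0]);  // (42 - (-5)) * 0.05 = 2.35
  return 0;
}
```
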
- std::cout << "" << std::endl; - std::cout << " zero_point(s):"; - for (size_t i = 0; i < zero_points.size(); i++) { - std::cout << " " << zero_points[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " input dtype: " << dtype << std::endl; - std::cout << " output dtype: " << out_dtype << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference:" << std::endl; - std::cout << reference_out << std::endl; - std::cout << "implementation:" << std::endl; - std::cout << impl_out << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -void test_vulkan_dequantize_per_token_impl( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage) { - check_dequantize_args(quant_min, quant_max, dtype, out_dtype); - int num_tokens = 1; - for (int i = 0; i < input_sizes.size() - 1; i++) { - num_tokens *= input_sizes[i]; - } - - ASSERT_EQ(num_tokens, scales.size()); - ASSERT_EQ(num_tokens, zero_points.size()); - - // Create input tensor with quantized values - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input; - if (dtype == at::kByte) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kByte)); - } else if (dtype == at::kChar) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kChar)); - } else if (dtype == at::kShort) { - input = - at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kShort)); - } else if (dtype == at::kInt) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kInt)); - } else { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kLong)); - } - - // Fill with a simple pattern: values from quant_min to quant_max in steps - at::Tensor reshaped_input = input.reshape({num_tokens, input.size(-1)}); - for (int token_idx = 0; token_idx < num_tokens; token_idx++) { - float step = 1.0f; - if (input.size(-1) > 1) { - step = static_cast(quant_max - quant_min) / (input.size(-1) - 1); - } - - for (int i = 0; i < input.size(-1); i++) { - int64_t qvalue = quant_min + i * step; - if (dtype == at::kByte) { - reshaped_input[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kChar) { - reshaped_input[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kShort) { - reshaped_input[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kInt) { - reshaped_input[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kLong) { - reshaped_input[token_idx][i] = static_cast(qvalue); - } - } - } - - // Reshape back to original dimensions - input = reshaped_input.reshape(input_sizes_int64); - - // Create scale and zero_point tensors - at::Tensor scale_tensor = - at::tensor(scales, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor zero_point_tensor = - at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong)); - - // Get reference output - at::Tensor reference_out = torch::executor::native::dequantize_per_token_aten( - input, - scale_tensor, - zero_point_tensor, - quant_min, - quant_max, - dtype, - out_dtype); - - // Build Vulkan dequantize_per_token graph - using namespace vkcompute; - - GraphConfig config; - 
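
Rather than random data, the dequantize tests above fill their quantized inputs deterministically: each token (or the whole flattened tensor in the per-tensor case) steps evenly from quant_min toward quant_max so every part of the range is exercised. A standalone sketch of that fill pattern, assuming an int8 range and an illustrative element count:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int64_t quant_min = -128, quant_max = 127;
  const int64_t n = 8;  // illustrative element count

  // Step chosen so the first element is quant_min and the last lands at
  // (approximately) quant_max after float truncation.
  const float step =
      n > 1 ? static_cast<float>(quant_max - quant_min) / (n - 1) : 1.0f;

  std::vector<int8_t> values(n);
  for (int64_t i = 0; i < n; ++i) {
    const int64_t qvalue = static_cast<int64_t>(quant_min + i * step);
    values[i] = static_cast<int8_t>(qvalue);
  }
  for (int8_t v : values) std::printf("%d ", v);  // values from -128 up to 127
  std::printf("\n");
  return 0;
}
```
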
config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - IOValueRef r_input = graph.add_input_tensor( - input.sizes().vec(), from_at_scalartype(dtype), in_storage); - IOValueRef r_scale = graph.add_input_tensor( - scale_tensor.sizes().vec(), - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked); - IOValueRef r_zero_point = graph.add_input_tensor( - zero_point_tensor.sizes().vec(), - vkapi::kInt, - utils::kBuffer, - utils::kWidthPacked); - - const ValueRef r_quant_min = graph.add_scalar(quant_min); - const ValueRef r_quant_max = graph.add_scalar(quant_max); - - const ValueRef r_out = graph.add_tensor( - input.sizes().vec(), from_at_scalartype(out_dtype), out_storage); - - const ValueRef r_dtype = - graph.add_scalar(static_cast(out_dtype)); - - VK_GET_OP_FN("quantized_decomposed.dequantize_per_token.default") - (graph, - { - r_input.value, - r_scale.value, - r_zero_point.value, - r_quant_min, - r_quant_max, - r_dtype, - r_dtype, - r_out, - }); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - - graph.prepack(); - - // Copy input data to GPU - graph.copy_into_staging( - r_input.staging, input.const_data_ptr(), input.numel()); - - // Convert scale tensor to float and copy to GPU - at::Tensor scale_float = scale_tensor.to(at::kFloat); - graph.copy_into_staging( - r_scale.staging, scale_float.const_data_ptr(), scale_float.numel()); - - // Convert zero_point tensor to int and copy to GPU - at::Tensor zero_point_int = zero_point_tensor.to(at::kInt); - graph.copy_into_staging( - r_zero_point.staging, - zero_point_int.const_data_ptr(), - zero_point_int.numel()); - - // Execute the graph - graph.execute(); - - // Copy output data back to CPU - at::Tensor vk_out = at::empty_like(reference_out).contiguous(); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - // Compare outputs with appropriate tolerance for half precision - bool output_correct; - if (out_dtype == at::kHalf) { - // Use higher tolerance for half precision due to limited precision - output_correct = - at::allclose(reference_out, vk_out, /*rtol=*/1e-2, /*atol=*/1e-2); - } else { - output_correct = - at::allclose(reference_out, vk_out, /*rtol=*/1e-5, /*atol=*/1e-5); - } - if (!output_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " scale(s):"; - for (size_t i = 0; i < scales.size(); i++) { - std::cout << " " << scales[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " zero_point(s):"; - for (size_t i = 0; i < zero_points.size(); i++) { - std::cout << " " << zero_points[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " storage type: " - << (in_storage == vkcompute::utils::kBuffer ? 
"buffer" - : "texture") - << std::endl; - std::cout << " input dtype: " << dtype << std::endl; - std::cout << " output dtype: " << out_dtype << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference:" << std::endl; - std::cout << reference_out << std::endl; - std::cout << "vulkan:" << std::endl; - std::cout << vk_out << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -TEST( - VulkanDequantizePerTokenTest, - test_reference_dequantize_per_token_uint8_to_float) { - std::vector scales = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6}; - std::vector zero_points = {5, 10, 15, 20, 25, 30}; - - test_reference_dequantize_per_token( - {2, 3, 4}, // input sizes (2*3=6 tokens) - scales, - zero_points, - 0, // quant_min - 255, // quant_max - at::kByte, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTokenTest, - test_reference_dequantize_per_token_int8_to_float) { - std::vector scales = {0.05, 0.1, 0.15, 0.2}; - std::vector zero_points = {0, -5, 5, 10}; - - test_reference_dequantize_per_token( - {2, 2, 5}, // input sizes (2*2=4 tokens) - scales, - zero_points, - -128, // quant_min - 127, // quant_max - at::kChar, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTokenTest, - test_reference_dequantize_per_token_int32_to_float) { - std::vector scales = {0.05, 0.1, 0.15, 0.2}; - std::vector zero_points = {0, -5, 5, 10}; - - test_reference_dequantize_per_token( - {2, 2, 10}, // input sizes (2*2=4 tokens) - scales, - zero_points, - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kInt, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTokenTest, - test_reference_dequantize_per_token_int8_to_half) { - std::vector scales = {0.05, 0.1, 0.15, 0.2}; - std::vector zero_points = {0, -5, 5, 10}; - - test_reference_dequantize_per_token( - {4, 1, 5}, // input sizes (4*1=4 tokens) - scales, - zero_points, - -128, // quant_min - 127, // quant_max - at::kChar, // input dtype (int8) - at::kHalf); // output dtype -} - -TEST( - VulkanDequantizePerTokenTest, - test_reference_dequantize_per_token_int32_to_half) { - std::vector scales = {0.05, 0.1}; - std::vector zero_points = {0, -5}; - - test_reference_dequantize_per_token( - {2, 2}, // input sizes (2 tokens) - scales, - zero_points, - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kInt, // input dtype - at::kHalf); // output dtype -} - -TEST( - VulkanDequantizePerTokenTest, - test_vulkan_dequantize_per_token_uint8_to_float) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6}; - std::vector zero_points = {5, 10, 15, 20, 25, 30}; - - test_vulkan_dequantize_per_token( - {2, 3, 6}, // input sizes (2*3=6 tokens) - scales, - zero_points, - 0, // quant_min - 255, // quant_max - at::kByte, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTokenTest, - test_vulkan_dequantize_per_token_int8_to_float) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.05, 0.0}; - std::vector zero_points = {10, -5}; - - test_vulkan_dequantize_per_token( - {2, 2}, // input sizes (2*2=4 tokens) - scales, - zero_points, - -128, // quant_min - 127, // quant_max - at::kChar, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTokenTest, - 
test_vulkan_dequantize_per_token_int32_to_float) { - std::vector scales = { - 0.0001, 0.0002, 0.0003, 0.0, 0.0011, 0.0102, 0.1003, 0.0}; - std::vector zero_points = {100, -100, 50, -50, 12, -6, 4, -24}; - - test_vulkan_dequantize_per_token( - {2, 2, 2, 12}, // input sizes (2*2=4 tokens) - scales, - zero_points, - -2147483648, // quant_min - 2147483647, // quant_max - at::kInt, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTokenTest, - test_vulkan_dequantize_per_token_int8_to_half) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_float16_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.05, 0.2}; - std::vector zero_points = {2, -5}; - - test_vulkan_dequantize_per_token( - {2, 2}, // input sizes (2=4 tokens) - scales, - zero_points, - -128, // quant_min - 127, // quant_max - at::kChar, // input dtype - at::kHalf); // output dtype -} - -TEST( - VulkanDequantizePerTokenTest, - test_vulkan_dequantize_per_token_int32_to_half) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_float16_buffers_support()) { - GTEST_SKIP(); - } - // Use much smaller scales to avoid overflow to infinity in half precision - // Half precision max value is ~65504, so with int32 values around 2e9, - // we need scales smaller than 65504/2e9 ≈ 3e-5 to avoid overflow - std::vector scales = {1e-5, 2e-5, 1.5e-5}; - std::vector zero_points = {20, -15, 1}; - - test_vulkan_dequantize_per_token( - {3, 6}, // input sizes (3 tokens) - scales, - zero_points, - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kInt, // input dtype - at::kHalf); // output dtype -} - -TEST( - VulkanDequantizePerTokenTest, - test_vulkan_dequantize_per_token_int8_to_double) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.05, 0.001}; - std::vector zero_points = {10, -5}; - - test_vulkan_dequantize_per_token( - {2, 2}, // input sizes (2 tokens) - scales, - zero_points, - -128, // quant_min - 127, // quant_max - at::kChar, // input dtype - at::kDouble); // output dtype -} - -void test_reference_dequantize_per_channel( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype) { - check_dequantize_args(quant_min, quant_max, dtype, out_dtype); - check_dequantize_per_channel_args(input_sizes, scales, zero_points, axis); - - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - - // Create input tensor with quantized values - at::Tensor input; - if (dtype == at::kByte) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kByte)); - } else if (dtype == at::kChar) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kChar)); - } else if (dtype == at::kShort) { - input = - at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kShort)); - } else if (dtype == at::kInt) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kInt)); - } else { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kLong)); - } - - // Fill with a simple pattern: values from quant_min to quant_max in steps - float step = 1.0f; - if (input.numel() > 1) { - step = static_cast(quant_max - quant_min) / (input.numel() - 1); - } - - 
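[Editor's note] The loop that follows fills the flattened input with a linear ramp from quant_min to quant_max, so the reference per-channel test exercises the full quantized range regardless of tensor size. A minimal standalone sketch of the same fill pattern, with illustrative names not taken from the test file:

```cpp
#include <cstdint>
#include <vector>

// Sketch: produce n quantized values evenly spaced between quant_min and quant_max.
std::vector<int64_t> make_ramp(int64_t quant_min, int64_t quant_max, int64_t n) {
  std::vector<int64_t> out(n);
  // Step between consecutive values; a single-element tensor just keeps quant_min.
  const float step =
      n > 1 ? static_cast<float>(quant_max - quant_min) / (n - 1) : 0.0f;
  for (int64_t i = 0; i < n; ++i) {
    out[i] = quant_min + static_cast<int64_t>(i * step);
  }
  return out;
}
```

For example, make_ramp(-128, 127, 6) yields {-128, -77, -26, 25, 76, 127}, matching the "values from quant_min to quant_max in steps" pattern used below.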
auto flat_input = input.flatten(); - for (int i = 0; i < flat_input.numel(); i++) { - int64_t qvalue = quant_min + i * step; - if (dtype == at::kByte) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kChar) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kShort) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kInt) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kLong) { - flat_input[i] = static_cast(qvalue); - } - } - - // Reshape back to original dimensions - input = flat_input.reshape(input_sizes_int64); - - // Create scale and zero_point tensors - at::Tensor scale_tensor = - at::tensor(scales, at::device(at::kCPU).dtype(at::kDouble)); - at::Tensor zero_point_tensor = - at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong)); - - // Get reference output - at::Tensor my_ref = dequantize_per_channel_reference_impl( - input, - scale_tensor, - zero_point_tensor, - axis, - quant_min, - quant_max, - dtype, - out_dtype); - - // Get implementation output - at::Tensor cpu_ref = torch::executor::native::dequantize_per_channel_aten( - input, - scale_tensor, - zero_point_tensor, - axis, - quant_min, - quant_max, - dtype, - out_dtype); - - // Compare outputs - const bool output_correct = at::allclose(my_ref, cpu_ref); - if (!output_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " axis: " << axis << std::endl; - std::cout << " input sizes:"; - for (size_t i = 0; i < input_sizes.size(); i++) { - std::cout << " " << input_sizes[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " scale(s):"; - for (size_t i = 0; i < scales.size(); i++) { - std::cout << " " << scales[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " zero_point(s):"; - for (size_t i = 0; i < zero_points.size(); i++) { - std::cout << " " << zero_points[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " input dtype: " << dtype << std::endl; - std::cout << " output dtype: " << out_dtype << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "cpu_ref:" << std::endl; - std::cout << cpu_ref << std::endl; - std::cout << "my_ref:" << std::endl; - std::cout << my_ref << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -void test_vulkan_dequantize_per_channel_impl( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage) { - check_dequantize_args(quant_min, quant_max, dtype, out_dtype); - check_dequantize_per_channel_args(input_sizes, scales, zero_points, axis); - - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - - // Create random float tensor - at::Tensor float_x = - at::rand(input_sizes_int64, at::device(at::kCPU).dtype(at::kFloat)); - - // Create scale and zero_point tensors - at::Tensor scale_tensor = - at::tensor(scales, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor zero_point_tensor = - at::tensor(zero_points, at::device(at::kCPU).dtype(at::kInt)); - - // Map the dtype to the corresponding quantized type and quantize the float - // tensor - c10::ScalarType qtype; - at::Tensor adjusted_zero_points = zero_point_tensor; - - if (dtype == 
at::kByte) { - qtype = c10::kQUInt8; - // ATEN ONLY: Adjust zero points for unsigned types (must be non-negative) - adjusted_zero_points = at::clamp_min(zero_point_tensor, 0); - } else if (dtype == at::kChar) { - qtype = c10::kQInt8; - } else if (dtype == at::kInt) { - qtype = c10::kQInt32; - } else { - std::cout << "invalid dtype for ATEN: " << dtype << std::endl; - std::cout << " --> Delegating to c10::kQInt32" << std::endl; - qtype = c10::kQInt32; - } - - // Normalize axis for ATen (ATen doesn't handle negative axes in - // quantize_per_channel) - int64_t normalized_axis = axis; - if (normalized_axis < 0) { - normalized_axis += input_sizes_int64.size(); - } - - // Quantize using ATen - at::Tensor quantized_aten = at::quantize_per_channel( - float_x, scale_tensor, adjusted_zero_points, normalized_axis, qtype); - - // Get ATen dequantized output - at::Tensor aten_out = at::dequantize(quantized_aten).to(out_dtype); - - // Extract the quantized values (int_repr) to use with our implementations - at::Tensor quantized_input = quantized_aten.int_repr().to(dtype); - - // Get reference output using - // torch::executor::native::dequantize_per_channel_aten - at::Tensor reference_out = - torch::executor::native::dequantize_per_channel_aten( - quantized_input, - scale_tensor.to(at::kDouble), - zero_point_tensor.to(at::kLong), - axis, - quant_min, - quant_max, - dtype, - out_dtype); - - // Build Vulkan dequantize_per_channel graph - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - // Add tensors to graph - IOValueRef r_input = graph.add_input_tensor( - quantized_input.sizes().vec(), - from_at_scalartype(quantized_input.scalar_type()), - in_storage); - - IOValueRef r_scale = graph.add_input_tensor( - scale_tensor.sizes().vec(), - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked); - - IOValueRef r_zero_point = graph.add_input_tensor( - adjusted_zero_points.sizes().vec(), - vkapi::kInt, - utils::kBuffer, - utils::kWidthPacked); - - ValueRef r_out = graph.add_tensor( - quantized_input.sizes().vec(), - from_at_scalartype(out_dtype), - out_storage); - - const ValueRef r_axis = graph.add_scalar(axis); - const ValueRef r_quant_min = graph.add_scalar(quant_min); - const ValueRef r_quant_max = graph.add_scalar(quant_max); - const ValueRef r_dtype = - graph.add_scalar(static_cast(dtype)); - const ValueRef r_output_dtype = - graph.add_scalar(static_cast(out_dtype)); - - VK_GET_OP_FN("quantized_decomposed.dequantize_per_channel.default") - (graph, - { - r_input.value, - r_scale.value, - r_zero_point.value, - r_axis, - r_quant_min, - r_quant_max, - r_dtype, - r_output_dtype, - r_out, - }); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - graph.prepack(); - - // Copy input data to GPU - graph.copy_into_staging( - r_input.staging, - quantized_input.const_data_ptr(), - quantized_input.numel()); - - // copy scale tensor to GPU - graph.copy_into_staging( - r_scale.staging, scale_tensor.const_data_ptr(), scale_tensor.numel()); - - // copy zero_point tensor to GPU - graph.copy_into_staging( - r_zero_point.staging, - zero_point_tensor.const_data_ptr(), - zero_point_tensor.numel()); - - // Execute the graph - graph.execute(); - - // Copy output data back to CPU - at::Tensor vk_out = at::empty_like(reference_out).contiguous(); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - // Compare outputs with appropriate tolerance for half precision - bool output_correct; 
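[Editor's note] For reference, per-channel dequantization applies a separate (scale, zero_point) pair to each slice along `axis`: out = (q - zero_point[c]) * scale[c]. The test takes its reference output from dequantize_per_channel_aten; the sketch below is only an illustration of that formula for a contiguous row-major tensor, with hypothetical names:

```cpp
#include <cstdint>
#include <vector>

// Sketch: dequantize q per channel along a non-negative `axis` of a contiguous
// tensor with the given sizes. outer/inner are element counts before/after axis.
std::vector<float> dequantize_per_channel(
    const std::vector<int8_t>& q,
    const std::vector<int64_t>& sizes,
    int64_t axis,
    const std::vector<double>& scale,
    const std::vector<int64_t>& zero_point) {
  int64_t outer = 1, inner = 1;
  for (int64_t d = 0; d < axis; ++d) outer *= sizes[d];
  for (size_t d = axis + 1; d < sizes.size(); ++d) inner *= sizes[d];
  const int64_t channels = sizes[axis];

  std::vector<float> out(q.size());
  for (int64_t o = 0; o < outer; ++o) {
    for (int64_t c = 0; c < channels; ++c) {
      for (int64_t i = 0; i < inner; ++i) {
        const int64_t idx = (o * channels + c) * inner + i;
        // Each element uses the scale/zero_point of its channel c.
        out[idx] = (q[idx] - zero_point[c]) * static_cast<float>(scale[c]);
      }
    }
  }
  return out;
}
```

The looser rtol/atol of 1e-2 in the half-precision branch that follows reflects fp16's limited precision compared to the 1e-5 tolerance used for float outputs.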
- if (out_dtype == at::kHalf) { - // Use higher tolerance for half precision due to limited precision - output_correct = - at::allclose(reference_out, vk_out, /*rtol=*/1e-2, /*atol=*/1e-2); - } else { - output_correct = - at::allclose(reference_out, vk_out, /*rtol=*/1e-5, /*atol=*/1e-5); - } - if (!output_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " axis: " << axis << std::endl; - std::cout << " input sizes:"; - for (size_t i = 0; i < input_sizes.size(); i++) { - std::cout << " " << input_sizes[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " scale(s):"; - for (size_t i = 0; i < scales.size(); i++) { - std::cout << " " << scales[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " zero_point(s):"; - for (size_t i = 0; i < zero_points.size(); i++) { - std::cout << " " << zero_points[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " input dtype: " << dtype << std::endl; - std::cout << " output dtype: " << out_dtype << std::endl; - std::cout << " storage: " << in_storage << std::endl; - std::cout << std::endl; - - std::cout << "\033[91m quantized_input: \033[0m" << std::endl; - std::cout << quantized_input << std::endl; - std::cout << "\033[91m aten: \033[0m" << std::endl; - std::cout << aten_out << std::endl; - std::cout << "\033[91m reference: \033[0m" << std::endl; - std::cout << reference_out << std::endl; - std::cout << "\033[91m vulkan: \033[0m" << std::endl; - std::cout << vk_out << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -TEST( - VulkanDequantizePerChannelTest, - test_reference_dequantize_per_channel_uint8_to_float_3D_axis0) { - std::vector scales = {0.1, 0.2, 0.3}; - std::vector zero_points = {0, 5, -2}; - - test_reference_dequantize_per_channel( - {3, 4, 2}, // input sizes - scales, - zero_points, - 0, // axis - 0, // quant_min - 255, // quant_max - at::kByte, - at::kFloat); -} - -TEST( - VulkanDequantizePerChannelTest, - test_reference_dequantize_per_channel_int8_to_float_3D_axis2) { - std::vector scales = {0.1, 0.2}; - std::vector zero_points = {0, 5}; - - test_reference_dequantize_per_channel( - {3, 4, 2}, // input sizes - scales, - zero_points, - 2, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); -} - -TEST( - VulkanDequantizePerChannelTest, - test_reference_dequantize_per_channel_int8_to_float_3D_axisn1) { - std::vector scales = {0.1, 0.2}; - std::vector zero_points = {0, 5}; - - test_reference_dequantize_per_channel( - {3, 4, 2}, // input sizes - scales, - zero_points, - -1, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); -} - -TEST( - VulkanDequantizePerChannelTest, - test_reference_dequantize_per_channel_int32_to_float_4D_axis0) { - std::vector scales = {0.1, 0.2, 0.00002}; - std::vector zero_points = {0, 5, -4}; - - test_reference_dequantize_per_channel( - {3, 4, 2, 5}, // input sizes - scales, - zero_points, - 0, // axis - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kInt, - at::kFloat); -} - -// END OF REFERENCE TESTS - -TEST( - VulkanDequantizePerChannelTest, - test_vulkan_dequantize_per_channel_int8_to_float_axis0) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales(9, 0.1f); - std::vector zero_points(9, 2); - - // 1D Tensor - test_vulkan_dequantize_per_channel( - {9}, // input 
sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); - - // 2D Tensor - test_vulkan_dequantize_per_channel( - {9, 14}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); - - // 3D Tensor - test_vulkan_dequantize_per_channel( - {9, 7, 11}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 17, 5, 5}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); - - // 4D Tensor (negative axis) - test_vulkan_dequantize_per_channel( - {5, 17, 5, 9}, // input sizes - scales, - zero_points, - -1, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); -} - -TEST( - VulkanDequantizePerChannelTest, - test_vulkan_dequantize_per_channel_int8_to_float_axis1) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales(14, 0.001f); - std::vector zero_points(14, -5); - - // 2D Tensor - test_vulkan_dequantize_per_channel( - {9, 14}, // input sizes - scales, - zero_points, - 1, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); - - // 3D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 11}, // input sizes - scales, - zero_points, - 1, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 5, 5}, // input sizes - scales, - zero_points, - 1, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); - - // 4D Tensor (negative axis) - test_vulkan_dequantize_per_channel( - {9, 7, 14, 5}, // input sizes - scales, - zero_points, - -2, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); -} - -TEST( - VulkanDequantizePerChannelTest, - test_vulkan_dequantize_per_channel_int8_to_float_axis2) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales(11, 0.5f); - std::vector zero_points(11, 12); - - // 3D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 11}, // input sizes - scales, - zero_points, - 2, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 11, 5}, // input sizes - scales, - zero_points, - 2, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); - - // 4D Tensor (negative axis) - test_vulkan_dequantize_per_channel( - {9, 11, 14, 5}, // input sizes - scales, - zero_points, - -3, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); -} - -TEST( - VulkanDequantizePerChannelTest, - test_vulkan_dequantize_per_channel_int8_to_float_axis3) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales(7, 0.5f); - std::vector zero_points(7, 12); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 11, 7}, // input sizes - scales, - zero_points, - 3, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); - - // 4D Tensor (negative axis) - test_vulkan_dequantize_per_channel( - {7, 14, 11, 7}, // input sizes - scales, - zero_points, - -4, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kFloat); -} - -TEST( - 
VulkanDequantizePerChannelTest, - test_vulkan_dequantize_per_channel_uint8_to_float_comprehensive) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.1, 0.2, 0.0001, 0.5, 0.02}; - std::vector zero_points = {0, 5, -5, 1, 12}; - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - 0, // axis - 0, // quant_min - 255, // quant_max - at::kByte, - at::kFloat); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 5, 11, 7}, // input sizes - scales, - zero_points, - 1, // axis - 0, // quant_min - 255, // quant_max - at::kByte, - at::kFloat); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 5, 7}, // input sizes - scales, - zero_points, - 2, // axis - 0, // quant_min - 255, // quant_max - at::kByte, - at::kFloat); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 11, 5}, // input sizes - scales, - zero_points, - 3, // axis - 0, // quant_min - 255, // quant_max - at::kByte, - at::kFloat); - - // 4D Tensor (negative axis) - test_vulkan_dequantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - -4, // axis - 0, // quant_min - 255, // quant_max - at::kByte, - at::kFloat); -} - -TEST( - VulkanDequantizePerChannelTest, - test_vulkan_dequantize_per_channel_8bit_to_half) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_float16_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.1, 0.2, 0.01, 0.5, 0.02}; - std::vector zero_points = {0, 5, 5, 1, 12}; - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kHalf); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 5, 11, 7}, // input sizes - scales, - zero_points, - 1, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kHalf); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 5, 7}, // input sizes - scales, - zero_points, - 2, // axis - 0, // quant_min - 255, // quant_max - at::kByte, - at::kHalf); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 11, 5}, // input sizes - scales, - zero_points, - 3, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kHalf); - - // 4D Tensor (negative axis) - test_vulkan_dequantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - -4, // axis - 0, // quant_min - 255, // quant_max - at::kByte, - at::kHalf); -} - -TEST( - VulkanDequantizePerChannelTest, - test_vulkan_dequantize_per_channel_8bit_to_double) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.1, 0.2, 0.01, 0.5, 0.02}; - std::vector zero_points = {0, 5, 5, 1, 12}; - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kDouble); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 5, 11, 7}, // input sizes - scales, - zero_points, - 1, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kDouble); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 5, 7}, // input sizes - scales, - zero_points, - 2, // axis - 0, // quant_min - 255, // quant_max - 
at::kByte, - at::kDouble); - - // 4D Tensor - test_vulkan_dequantize_per_channel( - {9, 14, 11, 5}, // input sizes - scales, - zero_points, - 3, // axis - -128, // quant_min - 127, // quant_max - at::kChar, - at::kDouble); - - // 4D Tensor (negative axis) - test_vulkan_dequantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - -4, // axis - 0, // quant_min - 255, // quant_max - at::kByte, - at::kDouble); -} - -void test_vulkan_dequantize_per_tensor_tensor_impl( - const std::vector& input_sizes, - float scale, - int zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype, - at::ScalarType out_dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage) { - check_dequantize_args(quant_min, quant_max, dtype, out_dtype); - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - - // Create a quantized input tensor with values from quant_min to quant_max - at::Tensor input; - if (dtype == at::kByte) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kByte)); - } else if (dtype == at::kChar) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kChar)); - } else if (dtype == at::kShort) { - input = - at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kShort)); - } else if (dtype == at::kInt) { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kInt)); - } else { - input = at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(at::kLong)); - } - - // Fill with a simple pattern: values from quant_min to quant_max in steps - float step = 1.0f; - if (input.numel() > 1) { - step = static_cast(quant_max - quant_min) / (input.numel() - 1); - } - - auto flat_input = input.flatten(); - for (int i = 0; i < flat_input.numel(); i++) { - int64_t qvalue = quant_min + i * step; - if (dtype == at::kByte) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kChar) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kShort) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kInt) { - flat_input[i] = static_cast(qvalue); - } else if (dtype == at::kLong) { - flat_input[i] = static_cast(qvalue); - } - } - - // Reshape back to original dimensions - input = flat_input.reshape(input_sizes_int64); - - // Create scale and zero_point as tensors (single element tensors) - at::Tensor scale_tensor = - at::tensor({scale}, at::device(at::kCPU).dtype(at::kDouble)); - at::Tensor zero_point_tensor = - at::tensor({zero_point}, at::device(at::kCPU).dtype(at::kLong)); - - // Get reference output using tensor variant - at::Tensor reference_out = - torch::executor::native::dequantize_per_tensor_tensor_args_aten( - input, - scale_tensor, - zero_point_tensor, - quant_min, - quant_max, - dtype, - out_dtype); - - // Build Vulkan dequantize_per_tensor.tensor graph - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - IOValueRef r_input = graph.add_input_tensor( - input.sizes().vec(), from_at_scalartype(dtype), in_storage); - - // Add scale and zero_point as tensor inputs (buffer storage, width packed) - IOValueRef r_scale = graph.add_input_tensor( - scale_tensor.sizes().vec(), - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked); - IOValueRef r_zero_point = graph.add_input_tensor( - zero_point_tensor.sizes().vec(), - vkapi::kInt, - utils::kBuffer, - utils::kWidthPacked); - - const ValueRef r_quant_min = graph.add_scalar(quant_min); - 
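[Editor's note] The `.tensor` overload being tested here differs from the scalar per-tensor variant only in how the quantization parameters are supplied: `scale` and `zero_point` arrive as one-element tensors, while the math remains the uniform affine mapping out = (q - zero_point) * scale applied to every element. A minimal sketch of that mapping, with illustrative names not taken from the test file:

```cpp
#include <cstdint>
#include <vector>

// Sketch: uniform (per-tensor) dequantization with a single scale/zero_point.
std::vector<float> dequantize_per_tensor(
    const std::vector<int8_t>& q, double scale, int64_t zero_point) {
  std::vector<float> out(q.size());
  for (size_t i = 0; i < q.size(); ++i) {
    out[i] = static_cast<float>((q[i] - zero_point) * scale);
  }
  return out;
}
```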
const ValueRef r_quant_max = graph.add_scalar(quant_max); - - const ValueRef r_out = graph.add_tensor( - input.sizes().vec(), from_at_scalartype(out_dtype), out_storage); - - const ValueRef r_dtype = - graph.add_scalar(static_cast(dtype)); - const ValueRef r_out_dtype = - graph.add_scalar(static_cast(out_dtype)); - - VK_GET_OP_FN("quantized_decomposed.dequantize_per_tensor.tensor") - (graph, - { - r_input.value, - r_scale.value, - r_zero_point.value, - r_quant_min, - r_quant_max, - r_dtype, - r_out_dtype, - r_out, - }); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - graph.prepack(); - - // Run Vulkan dequantize_per_tensor.tensor - graph.copy_into_staging( - r_input.staging, input.const_data_ptr(), input.numel()); - - // Convert scale tensor to float and copy to GPU - at::Tensor scale_float = scale_tensor.to(at::kFloat); - graph.copy_into_staging( - r_scale.staging, scale_float.const_data_ptr(), scale_float.numel()); - - // Convert zero_point tensor to int and copy to GPU - at::Tensor zero_point_int = zero_point_tensor.to(at::kInt); - graph.copy_into_staging( - r_zero_point.staging, - zero_point_int.const_data_ptr(), - zero_point_int.numel()); - - graph.execute(); - - at::Tensor vk_out = at::empty_like(reference_out).contiguous(); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - // Compare outputs with appropriate tolerance for half precision - bool output_correct; - if (out_dtype == at::kHalf) { - // Use higher tolerance for half precision due to limited precision - output_correct = - at::allclose(reference_out, vk_out, /*rtol=*/1e-2, /*atol=*/1e-2); - } else { - output_correct = - at::allclose(reference_out, vk_out, /*rtol=*/1e-5, /*atol=*/1e-5); - } - if (!output_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " scale: " << scale << std::endl; - std::cout << " zero_point: " << zero_point << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " storage type: " - << (in_storage == vkcompute::utils::kBuffer ? 
"buffer" - : "texture") - << std::endl; - std::cout << " input dtype: " << dtype << std::endl; - std::cout << " output dtype: " << out_dtype << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference:" << std::endl; - std::cout << reference_out << std::endl; - std::cout << "vulkan:" << std::endl; - std::cout << vk_out << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -TEST( - VulkanDequantizePerTensorTensorTest, - test_vulkan_dequantize_per_tensor_tensor_int8_to_float) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_dequantize_per_tensor_tensor( - {2, 3, 4}, // input sizes - 0.01, // scale - 1, // zero_point - -128, // quant_min - 127, // quant_max - at::kChar, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTensorTensorTest, - test_vulkan_dequantize_per_tensor_tensor_uint8_to_float) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_dequantize_per_tensor_tensor( - {2, 3, 4, 12}, // input sizes - 0.1, // scale - 5, // zero_point - 0, // quant_min - 255, // quant_max - at::kByte, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTensorTensorTest, - test_vulkan_dequantize_per_tensor_tensor_int32_to_float) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_dequantize_per_tensor_tensor( - {2, 3}, // input sizes - 0.01, // scale - 12, // zero_point - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kInt, // input dtype - at::kFloat); // output dtype -} - -TEST( - VulkanDequantizePerTensorTensorTest, - test_vulkan_dequantize_per_tensor_tensor_uint8_to_half) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_dequantize_per_tensor_tensor( - {3, 4}, // input sizes - 0.3, // scale - 2, // zero_point - 0, // quant_min - 255, // quant_max - at::kByte, // input dtype - at::kHalf); // output dtype -} - -TEST( - VulkanDequantizePerTensorTensorTest, - test_vulkan_dequantize_per_tensor_tensor_int8_to_double) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_dequantize_per_tensor_tensor( - {2, 3, 4}, // input sizes - 0.03, // scale - -2, // zero_point - -128, // quant_min - 127, // quant_max - at::kChar, // input dtype - at::kDouble); // output dtype -} diff --git a/backends/vulkan/test/op_tests/generate_op_benchmarks.py b/backends/vulkan/test/op_tests/generate_op_benchmarks.py deleted file mode 100644 index 7f286123df9..00000000000 --- a/backends/vulkan/test/op_tests/generate_op_benchmarks.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import os - -from typing import Dict - -from executorch.backends.vulkan.test.op_tests.cases import test_suites - -from executorch.backends.vulkan.test.op_tests.utils.gen_benchmark_vk import ( - VkBenchmarkFileGen, -) -from executorch.backends.vulkan.test.op_tests.utils.gen_computegraph import ( - ComputeGraphGen, -) -from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite -from torchgen import local - -from torchgen.gen import parse_native_yaml, ParsedYaml -from torchgen.model import DispatchKey, NativeFunction - - -def registry_name(f: NativeFunction) -> str: - name = str(f.namespace) + "." + str(f.func.name) - if len(f.func.name.overload_name) == 0: - name += ".default" - return name - - -def construct_f_map(parsed_yaml: ParsedYaml) -> Dict[str, NativeFunction]: - f_map: Dict[str, NativeFunction] = {} - for f in parsed_yaml.native_functions: - f_map[registry_name(f)] = f - return f_map - - -def process_test_suites( - cpp_generator: VkBenchmarkFileGen, - f_map: Dict[str, NativeFunction], - test_suites: Dict[str, TestSuite], -) -> None: - for registry_name, op_test_suites in test_suites.items(): - f = f_map[registry_name] - if isinstance(op_test_suites, list): - for suite in op_test_suites: - cpp_generator.add_suite(registry_name, f, suite) - else: - cpp_generator.add_suite(registry_name, f, op_test_suites) - - -@local.parametrize( - use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False -) -def generate_cpp( - native_functions_yaml_path: str, tags_path: str, output_dir: str -) -> None: - output_file = os.path.join(output_dir, "op_benchmarks.cpp") - cpp_generator = VkBenchmarkFileGen(output_file) - - parsed_yaml = parse_native_yaml(native_functions_yaml_path, tags_path) - f_map = construct_f_map(parsed_yaml) - - ComputeGraphGen.backend_key = parsed_yaml.backend_indices[DispatchKey.CPU] - - process_test_suites(cpp_generator, f_map, test_suites) - - with open(output_file, "w") as file: - file.write(cpp_generator.generate_cpp()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--aten-yaml-path", - help="path to native_functions.yaml file.", - ) - parser.add_argument( - "--tags-path", - help="Path to tags.yaml. Required by yaml parsing in gen_correctness_vk system.", - ) - - parser.add_argument("-o", "--output", help="Output directory", required=True) - args = parser.parse_args() - generate_cpp(args.aten_yaml_path, args.tags_path, args.output) diff --git a/backends/vulkan/test/op_tests/generate_op_correctness_tests.py b/backends/vulkan/test/op_tests/generate_op_correctness_tests.py deleted file mode 100644 index 8814070abd3..00000000000 --- a/backends/vulkan/test/op_tests/generate_op_correctness_tests.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import os - -from typing import Dict - -from executorch.backends.vulkan.test.op_tests.cases import test_suites -from executorch.backends.vulkan.test.op_tests.utils.gen_computegraph import ( - ComputeGraphGen, -) - -from executorch.backends.vulkan.test.op_tests.utils.gen_correctness_vk import ( - VkCorrectnessTestFileGen, -) -from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite -from torchgen import local - -from torchgen.gen import parse_native_yaml, ParsedYaml -from torchgen.model import DispatchKey, NativeFunction - - -def registry_name(f: NativeFunction) -> str: - name = str(f.namespace) + "." + str(f.func.name) - if len(f.func.name.overload_name) == 0: - name += ".default" - return name - - -def construct_f_map(parsed_yaml: ParsedYaml) -> Dict[str, NativeFunction]: - f_map: Dict[str, NativeFunction] = {} - for f in parsed_yaml.native_functions: - f_map[registry_name(f)] = f - return f_map - - -def process_test_suites( - cpp_generator: VkCorrectnessTestFileGen, - f_map: Dict[str, NativeFunction], - test_suites: Dict[str, TestSuite], -) -> None: - for registry_name, op_test_suites in test_suites.items(): - f = f_map[registry_name] - if isinstance(op_test_suites, list): - for suite in op_test_suites: - cpp_generator.add_suite(registry_name, f, suite) - else: - cpp_generator.add_suite(registry_name, f, op_test_suites) - - -@local.parametrize( - use_const_ref_for_mutable_tensors=False, use_ilistref_for_tensor_lists=False -) -def generate_cpp( - native_functions_yaml_path: str, tags_path: str, output_dir: str -) -> None: - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - output_file = os.path.join(output_dir, "op_tests.cpp") - cpp_generator = VkCorrectnessTestFileGen(output_file) - - parsed_yaml = parse_native_yaml(native_functions_yaml_path, tags_path) - f_map = construct_f_map(parsed_yaml) - - ComputeGraphGen.backend_key = parsed_yaml.backend_indices[DispatchKey.CPU] - - process_test_suites(cpp_generator, f_map, test_suites) - - with open(output_file, "w") as file: - file.write(cpp_generator.generate_cpp()) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--aten-yaml-path", - help="path to native_functions.yaml file.", - ) - parser.add_argument( - "--tags-path", - help="Path to tags.yaml. Required by yaml parsing in gen_correctness_vk system.", - ) - parser.add_argument("-o", "--output", help="Output directory", required=True) - args = parser.parse_args() - generate_cpp(args.aten_yaml_path, args.tags_path, args.output) diff --git a/backends/vulkan/test/op_tests/quantize_affine_test.cpp b/backends/vulkan/test/op_tests/quantize_affine_test.cpp deleted file mode 100644 index 1c0a6c2e6b9..00000000000 --- a/backends/vulkan/test/op_tests/quantize_affine_test.cpp +++ /dev/null @@ -1,1376 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include - -#include -#include -#include - -#include "test_utils.h" - -#include -#include -#include - -static inline void -_check_dims(c10::string_view name, int64_t expected, int64_t actual) { - VK_CHECK_COND( - expected == actual, - name, - " has rank ", - actual, - " but block_size has length ", - expected); -} - -at::Tensor quantize_affine_reference_impl( - const at::Tensor& input_, - const std::vector& block_size, - const at::Tensor& scale, - const c10::optional& zero_point_opt, - int64_t quant_min, - int64_t quant_max, - at::ScalarType out_dtype, - c10::optional zero_point_domain_opt = std::string("INT")) { - constexpr float kEps = 1e-7f; - - const int64_t ndim = input_.dim(); - _check_dims("input", block_size.size(), ndim); - - VK_CHECK_COND( - input_.scalar_type() == at::kFloat || input_.scalar_type() == at::kHalf || - input_.scalar_type() == at::kBFloat16, - "Unsupported input dtype: ", - input_.dtype()); - - auto zero_point_domain = - zero_point_domain_opt.has_value() ? *zero_point_domain_opt : "INT"; - - bool has_zp = zero_point_opt.has_value(); - VK_CHECK_COND( - has_zp || zero_point_domain == "NONE" || zero_point_domain == "", - "zero_point must be supplied unless zero_point_domain is NONE or null"); - - at::Tensor input = input_.contiguous(); - - std::vector shape_for_reduction; - std::vector reduction_dims; - int64_t cur_dim = 0; - - auto in_sizes = input.sizes(); - for (int64_t i = 0; i < ndim; ++i) { - const int64_t blk = block_size[i]; - const int64_t dim = in_sizes[i]; - - if (blk != dim && blk > 1) { - VK_CHECK_COND( - dim % blk == 0, - "Input size ", - dim, - " is not divisible by block_size ", - blk, - " at dimension ", - i); - shape_for_reduction.push_back(dim / blk); - shape_for_reduction.push_back(blk); - reduction_dims.push_back(cur_dim + 1); - cur_dim += 2; - } else { - shape_for_reduction.push_back(dim); - if (blk != 1) { - reduction_dims.push_back(cur_dim); - } - cur_dim += 1; - } - } - - at::Tensor input_reshaped = input.view(shape_for_reduction); - - std::vector shape_after_reduction = shape_for_reduction; - for (int64_t d : reduction_dims) { - shape_after_reduction[d] = 1; - } - - at::Tensor scale_b = - scale.view(shape_after_reduction).to(input_reshaped.scalar_type()); - - at::Tensor zp_b; - if (has_zp) { - zp_b = (*zero_point_opt).view(shape_after_reduction).toType(at::kFloat); - } - - scale_b = scale_b.clamp_min(kEps); - at::Tensor inv_scale = 1.0f / scale_b; - - at::Tensor q; - if (zero_point_domain == "INT") { - VK_CHECK_COND(has_zp, "INT zero_point_domain requires zero_point tensor"); - q = at::round(input_reshaped * inv_scale) + zp_b; - } else if (zero_point_domain == "NONE" || zero_point_domain.empty()) { - VK_CHECK_COND( - !has_zp, "zero_point must be None when domain is NONE / null"); - q = at::round(input_reshaped * inv_scale); - } else { - VK_CHECK_COND( - has_zp && zero_point_domain == "FLOAT", - "zero_point_domain must be INT, FLOAT, NONE or null"); - const float mid_point = (quant_max + quant_min + 1) * 0.5f; - at::Tensor min_val = zp_b - scale_b * mid_point; - q = at::round((input_reshaped - min_val) / scale_b); - } - - q = at::clamp(q, (double)quant_min, (double)quant_max); - - q = q.view(in_sizes).to(out_dtype); - - return q; -} - -at::Tensor dequantize_affine_reference_impl( - const at::Tensor& input_, - const std::vector& block_size, - const at::Tensor& scale, - const c10::optional& zero_point_opt, - int64_t quant_min, - int64_t quant_max, - at::ScalarType out_dtype, - c10::optional zero_point_domain_opt = 
std::string("INT")) { - const int64_t ndim = input_.dim(); - _check_dims("input", block_size.size(), ndim); - - VK_CHECK_COND( - input_.scalar_type() == at::kByte || input_.scalar_type() == at::kChar || - input_.scalar_type() == at::kShort || - input_.scalar_type() == at::kInt, - "Unsupported input dtype: ", - input_.dtype()); - - VK_CHECK_COND( - out_dtype == at::kFloat || out_dtype == at::kHalf || - out_dtype == at::kBFloat16, - "Unsupported output dtype: ", - out_dtype); - - auto zero_point_domain = - zero_point_domain_opt.has_value() ? *zero_point_domain_opt : "INT"; - - bool has_zp = zero_point_opt.has_value(); - VK_CHECK_COND( - has_zp || zero_point_domain == "NONE" || zero_point_domain == "", - "zero_point must be supplied unless zero_point_domain is NONE or null"); - - at::Tensor input = input_.contiguous(); - - std::vector shape_for_reduction; - std::vector reduction_dims; - int64_t cur_dim = 0; - - auto in_sizes = input.sizes(); - for (int64_t i = 0; i < ndim; ++i) { - const int64_t blk = block_size[i]; - const int64_t dim = in_sizes[i]; - - if (blk != dim && blk > 1) { - VK_CHECK_COND( - dim % blk == 0, - "Input size ", - dim, - " is not divisible by block_size ", - blk, - " at dimension ", - i); - shape_for_reduction.push_back(dim / blk); - shape_for_reduction.push_back(blk); - reduction_dims.push_back(cur_dim + 1); - cur_dim += 2; - } else { - shape_for_reduction.push_back(dim); - if (blk != 1) { - reduction_dims.push_back(cur_dim); - } - cur_dim += 1; - } - } - - at::Tensor input_reshaped = input.view(shape_for_reduction); - - std::vector shape_after_reduction = shape_for_reduction; - for (int64_t d : reduction_dims) { - shape_after_reduction[d] = 1; - } - - at::Tensor scale_b = scale.view(shape_after_reduction).to(out_dtype); - - at::Tensor zp_b; - if (has_zp) { - zp_b = (*zero_point_opt).view(shape_after_reduction).to(out_dtype); - } - - at::Tensor input_fp = input_reshaped.to(out_dtype); - at::Tensor dq; - - if (zero_point_domain == "INT") { - VK_CHECK_COND(has_zp, "INT zero_point_domain requires zero_point tensor"); - dq = (input_fp - zp_b) * scale_b; - } else if (zero_point_domain == "NONE" || zero_point_domain.empty()) { - VK_CHECK_COND( - !has_zp, "zero_point must be None when domain is NONE / null"); - dq = input_fp * scale_b; - } else { - VK_CHECK_COND( - has_zp && zero_point_domain == "FLOAT", - "zero_point_domain must be INT, FLOAT, NONE or null"); - const float mid_point = (quant_max + quant_min + 1) * 0.5f; - at::Tensor min_val = zp_b - scale_b * mid_point; - dq = input_fp * scale_b + min_val; - } - - dq = dq.view(in_sizes); - - return dq; -} - -// Wrapper function to maintain compatibility with existing test code (above is -// a good reference for how the python implementation works) -at::Tensor quantize_affine_reference_impl( - const at::Tensor& input, - const std::vector& block_size, - const at::Tensor& scale, - const at::Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - return quantize_affine_reference_impl( - input, - block_size, - scale, - c10::optional(zero_point), - quant_min, - quant_max, - dtype, - std::string("INT")); -} - -// Wrapper function for dequantize_affine -at::Tensor dequantize_affine_reference_impl( - const at::Tensor& input, - const std::vector& block_size, - const at::Tensor& scale, - const at::Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - return dequantize_affine_reference_impl( - input, - block_size, - scale, - c10::optional(zero_point), - quant_min, - 
quant_max, - dtype, - std::string("INT")); -} - -std::tuple choose_qparams_affine_reference_impl( - const at::Tensor& input_, - const std::string& mapping_type, - const std::vector& block_size, - int64_t quant_min, - int64_t quant_max, - double eps) { - const int64_t ndim = input_.dim(); - _check_dims("input", block_size.size(), ndim); - - VK_CHECK_COND( - input_.scalar_type() == at::kFloat || input_.scalar_type() == at::kHalf || - input_.scalar_type() == at::kBFloat16, - "Unsupported input dtype: ", - input_.dtype()); - - at::Tensor input = input_.contiguous(); - - std::vector shape_for_reduction; - std::vector reduction_dims; - int64_t cur_dim = 0; - - auto in_sizes = input.sizes(); - for (int64_t i = 0; i < ndim; ++i) { - const int64_t blk = block_size[i]; - const int64_t dim = in_sizes[i]; - - if (blk != dim && blk > 1) { - VK_CHECK_COND( - dim % blk == 0, - "Input size ", - dim, - " is not divisible by block_size ", - blk, - " at dimension ", - i); - shape_for_reduction.push_back(dim / blk); - shape_for_reduction.push_back(blk); - reduction_dims.push_back(cur_dim + 1); - cur_dim += 2; - } else { - shape_for_reduction.push_back(dim); - if (blk != 1) { - reduction_dims.push_back(cur_dim); - } - cur_dim += 1; - } - } - - at::Tensor input_reshaped = input.view(shape_for_reduction); - - std::vector shape_after_reduction = shape_for_reduction; - for (int64_t d : reduction_dims) { - shape_after_reduction[d] = 1; - } - - at::Tensor min_val = input_reshaped.amin(reduction_dims, /*keepdim=*/true); - at::Tensor max_val = input_reshaped.amax(reduction_dims, /*keepdim=*/true); - - at::Tensor scale, zero_point; - - if (mapping_type == "ASYMMETRIC") { - // Include zero in the range - min_val = at::minimum(min_val, at::zeros_like(min_val)); - max_val = at::maximum(max_val, at::zeros_like(max_val)); - - // Calculate scale - scale = (max_val - min_val) / (quant_max - quant_min); - scale = at::maximum(scale, at::full_like(scale, eps)); - - // Calculate zero_point - zero_point = at::round(quant_min - min_val / scale); - zero_point = at::clamp(zero_point, quant_min, quant_max); - } else if (mapping_type == "SYMMETRIC") { - // Include zero in the range - min_val = at::minimum(min_val, at::zeros_like(min_val)); - max_val = at::maximum(max_val, at::zeros_like(max_val)); - - // Calculate max absolute value - at::Tensor abs_min = at::abs(min_val); - at::Tensor abs_max = at::abs(max_val); - at::Tensor M = at::maximum(abs_min, abs_max); - - // Calculate scale - scale = M / ((quant_max - quant_min) * 0.5); - scale = at::maximum(scale, at::full_like(scale, eps)); - - // Calculate zero_point (mid-point) - zero_point = - at::full_like(scale, (quant_max + quant_min + 1) / 2, at::kInt); - } else if (mapping_type == "SYMMETRIC_NO_CLIPPING_ERR") { - // Include zero in the range - min_val = at::minimum(min_val, at::zeros_like(min_val)); - max_val = at::maximum(max_val, at::zeros_like(max_val)); - - // Calculate scale based on min/max values - at::Tensor s_min = at::abs(min_val) / std::abs(quant_min); - at::Tensor s_max = max_val / quant_max; - scale = at::maximum(s_min, s_max); - scale = at::maximum(scale, at::full_like(scale, eps)); - - // Calculate zero_point (mid-point) - zero_point = - at::full_like(scale, (quant_max + quant_min + 1) / 2, at::kInt); - } else { - VK_CHECK_COND( - false, - "Unsupported mapping_type: ", - mapping_type, - ". 
Expected ASYMMETRIC, SYMMETRIC, or SYMMETRIC_NO_CLIPPING_ERR"); - } - - std::vector output_shape; - for (size_t i = 0; i < shape_after_reduction.size(); ++i) { - if (shape_after_reduction[i] != 1 || - std::find(reduction_dims.begin(), reduction_dims.end(), i) == - reduction_dims.end()) { - output_shape.push_back(shape_after_reduction[i]); - } - } - - // Reshape scale and zero_point to final output shape - scale = scale.view(output_shape); - zero_point = zero_point.view(output_shape); - - return std::make_tuple(scale, zero_point); -} - -void test_vulkan_quantize_affine_impl( - const std::vector& input_sizes, - const std::vector& block_size, - const std::vector& scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt, - const vkcompute::utils::StorageType in_storage = - vkcompute::utils::kTexture3D, - const vkcompute::utils::StorageType out_storage = - vkcompute::utils::kTexture3D) { - // Create input tensor with random values - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::rand(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype)); - - // Create scale and zero_point tensors - at::Tensor scale_tensor = - at::tensor(scales, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor zero_point_tensor = - at::tensor(zero_points, at::device(at::kCPU).dtype(at::kInt)); - - // Get reference output - at::Tensor reference_out = quantize_affine_reference_impl( - input, - block_size, - scale_tensor, - zero_point_tensor, - quant_min, - quant_max, - dtype); - - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - IOValueRef r_input = graph.add_input_tensor( - input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage); - - std::vector block_size_copy(block_size); - const ValueRef r_block_size = - graph.add_scalar_list(std::move(block_size_copy)); - - IOValueRef r_scale = graph.add_input_tensor( - scale_tensor.sizes().vec(), - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked); - IOValueRef r_zero_point = graph.add_input_tensor( - zero_point_tensor.sizes().vec(), - vkapi::kInt, - utils::kBuffer, - utils::kWidthPacked); - - const ValueRef r_output_dtype = - graph.add_scalar(static_cast(dtype)); - const ValueRef r_quant_min = graph.add_scalar(quant_min); - const ValueRef r_quant_max = graph.add_scalar(quant_max); - - const ValueRef r_out = graph.add_tensor( - input.sizes().vec(), from_at_scalartype(dtype), out_storage); - - VK_GET_OP_FN("torchao.quantize_affine.default") - (graph, - { - r_input.value, - r_block_size, - r_scale.value, - r_zero_point.value, - r_output_dtype, - r_quant_min, - r_quant_max, - r_out, - }); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - graph.prepack(); - - // Copy input data to GPU - graph.copy_into_staging( - r_input.staging, input.const_data_ptr(), input.numel()); - - // Copy scale tensor to GPU - graph.copy_into_staging( - r_scale.staging, scale_tensor.const_data_ptr(), scale_tensor.numel()); - - // Copy zero_point tensor to GPU - graph.copy_into_staging( - r_zero_point.staging, - zero_point_tensor.const_data_ptr(), - zero_point_tensor.numel()); - - // Execute the graph - graph.execute(); - - // Copy output data back to CPU - at::Tensor vk_out = at::empty_like(reference_out).contiguous(); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - // Compare 
outputs - at::Tensor reference_int = reference_out.to(at::kInt); - at::Tensor vk_int = vk_out.to(at::kInt); - - // Tolerance is 1 to address rounding errors and fp math differences between - // CPU/GPU - const bool output_correct = - at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1); - if (!output_correct) { - std::cout << "\nFailed with parameters:" << std::endl; - std::cout << " input_sizes: ["; - for (size_t i = 0; i < input_sizes.size(); i++) { - std::cout << input_sizes[i] << (i < input_sizes.size() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << " block_size: ["; - for (size_t i = 0; i < block_size.size(); i++) { - std::cout << block_size[i] << (i < block_size.size() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << " scales: ["; - for (size_t i = 0; i < scales.size(); i++) { - std::cout << scales[i] << (i < scales.size() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << " zero_points: ["; - for (size_t i = 0; i < zero_points.size(); i++) { - std::cout << zero_points[i] << (i < zero_points.size() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " storage type: " - << (in_storage == vkcompute::utils::kBuffer ? "buffer" - : "texture") - << std::endl; - - std::cout << "input:" << std::endl << input << std::endl; - std::cout << "reference:" << std::endl << reference_int << std::endl; - std::cout << "vulkan:" << std::endl << vk_int << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -// Wrapper function to test both buffer and texture storage types -void test_vulkan_quantize_affine( - const std::vector& input_sizes, - const std::vector& block_size, - const std::vector& scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt) { - // Test with buffer storage - test_vulkan_quantize_affine_impl( - input_sizes, - block_size, - scales, - zero_points, - quant_min, - quant_max, - in_dtype, - dtype, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - // Test with texture storage - test_vulkan_quantize_affine_impl( - input_sizes, - block_size, - scales, - zero_points, - quant_min, - quant_max, - in_dtype, - dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -TEST(VulkanQuantizeAffineTest, test_1d_quantization) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - // 1D: 1x1x1x12 Tensor, block_size is 3 - test_vulkan_quantize_affine( - {12}, // input_sizes - {3}, // block_size - {0.1f, 0.2f, 0.15f, 0.25f}, // scales (4 blocks) - {10, -20, 5, 30}, // zero_points (4 blocks) - -128, // quant_min (char min) - 127, // quant_max (char max) - at::kFloat, // input dtype - at::kChar); // output dtype -} - -TEST(VulkanQuantizeAffineTest, test_2d_quantization) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - // 2D: 1x1x8x6 Tensor, block_size is 1x1x2x3 (8/2=4, 6/3=2, so 4*2=8 blocks) - test_vulkan_quantize_affine( - {8, 6}, // input_sizes - {2, 3}, // block_size (1/1=1, 1/1=1, 8/2=4, 6/3=2) - {0.1f, 0.2f, 0.15f, 0.25f, 0.3f, 0.05f, 0.4f, 0.35f}, // scales (8 blocks) - {-10, 15, 0, 25, -5, 20, 10, -15}, // zero_points (8 blocks) - -128, // quant_min (char min) - 127, // quant_max (char max) - at::kFloat, // input dtype - at::kChar); // output 
dtype -} - -TEST(VulkanQuantizeAffineTest, test_3d_quantization) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - // 3D: 1x6x4x6 Tensor, block_size is 3x2x2 (6/3=2, 4/2=2, 6/2=3, so 2*2*3=12 - // blocks) - test_vulkan_quantize_affine( - {6, 4, 6}, // input_sizes (changed 7->6 so divisible by 3) - {3, - 2, - 2}, // block_size (6 divisible by 3, 4 divisible by 2, 6 divisible by 2) - {0.1f, - 0.2f, - 0.15f, - 0.25f, - 0.3f, - 0.05f, - 0.4f, - 0.35f, - 0.12f, - 0.18f, - 0.22f, - 0.28f}, // scales (12 blocks) - {-15, 10, 5, -25, 20, -10, 15, -5, 8, -12, 18, -8}, // zero_points (12 - // blocks) - -128, // quant_min (char min) - 127, // quant_max (char max) - at::kFloat, // input dtype - at::kChar); // output dtype -} - -TEST(VulkanQuantizeAffineTest, test_4d_quantization) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - // 4D: 8x6x6x6 Tensor, block_size is 2x3x2x3 (8/2=4, 6/3=2, 6/2=3, 6/3=2, so - // 4*2*3*2=48 blocks) - test_vulkan_quantize_affine( - {8, 6, 6, 6}, // input_sizes - {2, 3, 2, 3}, // block_size (8/2=4, 6/3=2, 6/2=3, 6/3=2) - {0.1f, 0.2f, 0.15f, 0.25f, 0.3f, 0.05f, 0.4f, 0.35f, 0.12f, 0.18f, - 0.22f, 0.28f, 0.32f, 0.08f, 0.45f, 0.38f, 0.14f, 0.24f, 0.16f, 0.26f, - 0.34f, 0.06f, 0.44f, 0.36f, 0.11f, 0.21f, 0.13f, 0.23f, 0.31f, 0.07f, - 0.41f, 0.37f, 0.19f, 0.29f, 0.17f, 0.27f, 0.33f, 0.09f, 0.43f, 0.39f, - 0.10f, 0.20f, 0.14f, 0.24f, 0.30f, 0.04f, 0.40f, 0.34f}, // scales (48 - // blocks) - {-20, 10, 5, -15, 25, -10, 15, -5, 8, -12, 18, -8, 22, - -18, 12, -22, -25, 15, 0, -20, 30, -5, 20, -10, 5, -25, - 10, -15, 35, -15, 25, -35, -30, 20, -5, -25, 40, 0, 30, - -40, 10, -30, 15, -10, 45, -20, 35, -45}, // zero_points (48 blocks) - -128, // quant_min (char min) - 127, // quant_max (char max) - at::kFloat, // input dtype - at::kChar); // output dtype -} - -void test_vulkan_dequantize_affine_impl( - const std::vector& input_sizes, - const std::vector& block_size, - const std::vector& scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kChar, - at::ScalarType out_dtype = at::kFloat, - const vkcompute::utils::StorageType in_storage = - vkcompute::utils::kTexture3D, - const vkcompute::utils::StorageType out_storage = - vkcompute::utils::kTexture3D) { - // Create input tensor with random integer values within quant_min and - // quant_max - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = at::randint( - quant_min, - quant_max + 1, - input_sizes_int64, - at::device(at::kCPU).dtype(in_dtype)); - - // Create scale and zero_point tensors - at::Tensor scale_tensor = - at::tensor(scales, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor zero_point_tensor = - at::tensor(zero_points, at::device(at::kCPU).dtype(at::kInt)); - - // Get reference output - at::Tensor reference_out = dequantize_affine_reference_impl( - input, - block_size, - scale_tensor, - zero_point_tensor, - quant_min, - quant_max, - out_dtype); - - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - IOValueRef r_input = graph.add_input_tensor( - input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage); - - // Create block_size as IntList instead of Tensor - std::vector block_size_copy(block_size); - const ValueRef r_block_size = - graph.add_scalar_list(std::move(block_size_copy)); - - IOValueRef 
r_scale = graph.add_input_tensor( - scale_tensor.sizes().vec(), - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked); - IOValueRef r_zero_point = graph.add_input_tensor( - zero_point_tensor.sizes().vec(), - vkapi::kInt, - utils::kBuffer, - utils::kWidthPacked); - - // Create input_dtype scalar - const ValueRef r_input_dtype = - graph.add_scalar(static_cast(in_dtype)); - const ValueRef r_quant_min = graph.add_scalar(quant_min); - const ValueRef r_quant_max = graph.add_scalar(quant_max); - const ValueRef r_output_dtype = - graph.add_scalar(static_cast(out_dtype)); - - const ValueRef r_out = graph.add_tensor( - input.sizes().vec(), from_at_scalartype(out_dtype), out_storage); - - // Match the argument order in dequantize_affine_impl in Dequantize.cpp: - // input, block_size, scale, zero_point, input_dtype, quant_min, quant_max, - // output_dtype, output - VK_GET_OP_FN("torchao.dequantize_affine.default") - (graph, - { - r_input.value, - r_block_size, - r_scale.value, - r_zero_point.value, - r_input_dtype, - r_quant_min, - r_quant_max, - r_output_dtype, - r_out, - }); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - graph.prepack(); - - // Copy input data to GPU - graph.copy_into_staging( - r_input.staging, input.const_data_ptr(), input.numel()); - - // Copy scale tensor to GPU - graph.copy_into_staging( - r_scale.staging, scale_tensor.const_data_ptr(), scale_tensor.numel()); - - // Copy zero_point tensor to GPU - graph.copy_into_staging( - r_zero_point.staging, - zero_point_tensor.const_data_ptr(), - zero_point_tensor.numel()); - - // Execute the graph - graph.execute(); - - // Copy output data back to CPU - at::Tensor vk_out = at::empty_like(reference_out).contiguous(); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - // Compare outputs - const bool output_correct = - at::allclose(reference_out, vk_out, /*rtol=*/1e-5, /*atol=*/1e-5); - if (!output_correct) { - std::cout << "\nFailed with parameters:" << std::endl; - std::cout << " input_sizes: ["; - for (size_t i = 0; i < input_sizes.size(); i++) { - std::cout << input_sizes[i] << (i < input_sizes.size() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << " block_size: ["; - for (size_t i = 0; i < block_size.size(); i++) { - std::cout << block_size[i] << (i < block_size.size() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << " scales: ["; - for (size_t i = 0; i < scales.size(); i++) { - std::cout << scales[i] << (i < scales.size() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << " zero_points: ["; - for (size_t i = 0; i < zero_points.size(); i++) { - std::cout << zero_points[i] << (i < zero_points.size() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " storage type: " - << (in_storage == vkcompute::utils::kBuffer ? 
"buffer" - : "texture") - << std::endl; - - std::cout << "input:" << std::endl << input << std::endl; - std::cout << "reference:" << std::endl << reference_out << std::endl; - std::cout << "vulkan:" << std::endl << vk_out << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -// Wrapper function to test both buffer and texture storage types -void test_vulkan_dequantize_affine( - const std::vector& input_sizes, - const std::vector& block_size, - const std::vector& scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kChar, - at::ScalarType out_dtype = at::kFloat) { - // Test with buffer storage - test_vulkan_dequantize_affine_impl( - input_sizes, - block_size, - scales, - zero_points, - quant_min, - quant_max, - in_dtype, - out_dtype, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - // Test with texture storage - test_vulkan_dequantize_affine_impl( - input_sizes, - block_size, - scales, - zero_points, - quant_min, - quant_max, - in_dtype, - out_dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -TEST(VulkanDequantizeAffineTest, test_1d_dequantization) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - // 1D: 1x1x1x12 Tensor, block_size is 3 - test_vulkan_dequantize_affine( - {12}, // input_sizes - {3}, // block_size - {0.1f, 0.2f, 0.15f, 0.25f}, // scales (4 blocks) - {10, -20, 5, 30}, // zero_points (4 blocks) - -128, // quant_min (char min) - 127, // quant_max (char max) - at::kChar, // input dtype - at::kFloat); // output dtype -} - -TEST(VulkanDequantizeAffineTest, test_2d_dequantization) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - // 2D: 1x1x8x6 Tensor, block_size is 1x1x2x3 (8/2=4, 6/3=2, so 4*2=8 blocks) - test_vulkan_dequantize_affine( - {8, 6}, // input_sizes - {2, 3}, // block_size (1/1=1, 1/1=1, 8/2=4, 6/3=2) - {0.1f, 0.2f, 0.15f, 0.25f, 0.3f, 0.05f, 0.4f, 0.35f}, // scales (8 blocks) - {-10, 15, 0, 25, -5, 20, 10, -15}, // zero_points (8 blocks) - -128, // quant_min (char min) - 127, // quant_max (char max) - at::kChar, // input dtype - at::kFloat); // output dtype -} - -TEST(VulkanDequantizeAffineTest, test_3d_dequantization) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - // 3D: 1x6x4x6 Tensor, block_size is 3x2x2 (6/3=2, 4/2=2, 6/2=3, so 2*2*3=12 - // blocks) - test_vulkan_dequantize_affine( - {6, 4, 6}, // input_sizes (changed 7->6 so divisible by 3) - {3, - 2, - 2}, // block_size (6 divisible by 3, 4 divisible by 2, 6 divisible by 2) - {0.1f, - 0.2f, - 0.15f, - 0.25f, - 0.3f, - 0.05f, - 0.4f, - 0.35f, - 0.12f, - 0.18f, - 0.22f, - 0.28f}, // scales (12 blocks) - {-15, 10, 5, -25, 20, -10, 15, -5, 8, -12, 18, -8}, // zero_points (12 - // blocks) - -128, // quant_min (char min) - 127, // quant_max (char max) - at::kChar, // input dtype - at::kFloat); // output dtype -} - -TEST(VulkanDequantizeAffineTest, test_4d_dequantization) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - // 4D: 8x6x6x6 Tensor, block_size is 2x3x2x3 (8/2=4, 6/3=2, 6/2=3, 6/3=2, so - // 4*2*3*2=48 blocks) - test_vulkan_dequantize_affine( - {8, 6, 6, 6}, // input_sizes - {2, 3, 2, 3}, // block_size (8/2=4, 6/3=2, 6/2=3, 6/3=2) - {0.1f, 0.2f, 0.15f, 0.25f, 0.3f, 0.05f, 0.4f, 0.35f, 0.12f, 0.18f, - 0.22f, 0.28f, 0.32f, 0.08f, 0.45f, 0.38f, 0.14f, 
0.24f, 0.16f, 0.26f, - 0.34f, 0.06f, 0.44f, 0.36f, 0.11f, 0.21f, 0.13f, 0.23f, 0.31f, 0.07f, - 0.41f, 0.37f, 0.19f, 0.29f, 0.17f, 0.27f, 0.33f, 0.09f, 0.43f, 0.39f, - 0.10f, 0.20f, 0.14f, 0.24f, 0.30f, 0.04f, 0.40f, 0.34f}, // scales (48 - // blocks) - {-20, 10, 5, -15, 25, -10, 15, -5, 8, -12, 18, -8, 22, - -18, 12, -22, -25, 15, 0, -20, 30, -5, 20, -10, 5, -25, - 10, -15, 35, -15, 25, -35, -30, 20, -5, -25, 40, 0, 30, - -40, 10, -30, 15, -10, 45, -20, 35, -45}, // zero_points (48 blocks) - -128, // quant_min (char min) - 127, // quant_max (char max) - at::kChar, // input dtype - at::kFloat); // output dtype -} - -void test_vulkan_choose_qparams_affine_impl( - const std::vector& input_sizes, - const std::vector& block_size, - const std::string& mapping_type, - int64_t quant_min, - int64_t quant_max, - double eps, - at::ScalarType in_dtype = at::kFloat, - const vkcompute::utils::StorageType in_storage = - vkcompute::utils::kTexture3D, - const vkcompute::utils::StorageType out_storage = - vkcompute::utils::kBuffer) { - // Create input tensor with random values - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::rand(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype)); - - // Get reference output - auto reference_out = choose_qparams_affine_reference_impl( - input, mapping_type, block_size, quant_min, quant_max, eps); - - at::Tensor reference_scale = std::get<0>(reference_out); - at::Tensor reference_zero_point = std::get<1>(reference_out); - - reference_zero_point = reference_zero_point.to(at::kInt); - - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - IOValueRef r_input = graph.add_input_tensor( - input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage); - - // Create mapping_type as string - std::string mapping_type_copy = mapping_type; - const ValueRef r_mapping_type = - graph.add_string(std::move(mapping_type_copy)); - - // Create block_size as IntList - std::vector block_size_copy(block_size); - const ValueRef r_block_size = - graph.add_scalar_list(std::move(block_size_copy)); - - // Create target_dtype, quant_min, quant_max, eps - const ValueRef r_target_dtype = - graph.add_scalar(static_cast(at::kChar)); - const ValueRef r_quant_min = graph.add_scalar(quant_min); - const ValueRef r_quant_max = graph.add_scalar(quant_max); - const ValueRef r_eps = graph.add_scalar(eps); - - // Create scale_dtype and zero_point_dtype - const ValueRef r_scale_dtype = - graph.add_scalar(static_cast(at::kFloat)); - const ValueRef r_zero_point_dtype = - graph.add_scalar(static_cast(at::kInt)); - - // Create output tuple - std::vector out_tuple; - - // Create scale and zero_point output tensors - const ValueRef r_scale_out = graph.add_tensor( - reference_scale.sizes().vec(), vkapi::kFloat, out_storage); - const ValueRef r_zero_point_out = graph.add_tensor( - reference_zero_point.sizes().vec(), vkapi::kInt, out_storage); - - out_tuple.push_back(r_scale_out); - out_tuple.push_back(r_zero_point_out); - - const ValueRef r_out_tuple = graph.add_value_list(std::move(out_tuple)); - - VK_GET_OP_FN("torchao.choose_qparams_affine.default") - (graph, - { - r_input.value, - r_mapping_type, - r_block_size, - r_target_dtype, - r_quant_min, - r_quant_max, - r_eps, - r_scale_dtype, - r_zero_point_dtype, - r_out_tuple, - }); - - ValueRef staging_scale = graph.set_output_tensor(r_scale_out); - ValueRef staging_zero_point = graph.set_output_tensor(r_zero_point_out); - - 
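- // The rest of this helper follows the same pattern as the other Vulkan op
- // tests: prepare/prepack the graph, stage the CPU input, execute, then read
- // the computed scale and zero_point back from staging and compare them
- // against the reference implementation.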
graph.prepare(); - graph.prepack(); - - // Copy input data to GPU - graph.copy_into_staging( - r_input.staging, input.const_data_ptr(), input.numel()); - - // Execute the graph - graph.execute(); - - // Copy output data back to CPU - at::Tensor vk_scale = at::empty_like(reference_scale).contiguous(); - at::Tensor vk_zero_point = at::empty_like(reference_zero_point).contiguous(); - - graph.copy_from_staging( - staging_scale, vk_scale.mutable_data_ptr(), vk_scale.numel()); - graph.copy_from_staging( - staging_zero_point, - vk_zero_point.mutable_data_ptr(), - vk_zero_point.numel()); - - // Compare outputs - const bool scale_correct = - at::allclose(reference_scale, vk_scale, /*rtol=*/1e-3, /*atol=*/1e-3); - - // For zero point, we need to compare as integers since zero point should be - // an integer First convert both tensors to int if they aren't already - at::Tensor ref_zp_int = reference_zero_point.to(at::kInt); - at::Tensor vk_zp_int = vk_zero_point.to(at::kInt); - const bool zero_point_correct = at::equal(ref_zp_int, vk_zp_int); - - if (!scale_correct || !zero_point_correct) { - std::cout << "\nFailed with parameters:" << std::endl; - std::cout << " input_sizes: ["; - for (size_t i = 0; i < input_sizes.size(); i++) { - std::cout << input_sizes[i] << (i < input_sizes.size() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << " block_size: ["; - for (size_t i = 0; i < block_size.size(); i++) { - std::cout << block_size[i] << (i < block_size.size() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << " mapping_type: " << mapping_type << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " eps: " << eps << std::endl; - std::cout << " storage type: " - << (in_storage == vkcompute::utils::kBuffer ? 
"buffer" - : "texture") - << std::endl; - - if (!scale_correct || !zero_point_correct) { - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - - std::cout << "reference_scale:" << std::endl - << reference_scale << std::endl; - std::cout << "vulkan_scale:" << std::endl << vk_scale << std::endl; - - std::cout << "reference_zero_point:" << std::endl - << reference_zero_point << std::endl; - std::cout << "vulkan_zero_point:" << std::endl - << vk_zero_point << std::endl; - } - } - - ASSERT_TRUE(scale_correct); - ASSERT_TRUE(zero_point_correct); -} - -// Wrapper function to test both buffer and texture storage types -void test_vulkan_choose_qparams_affine( - const std::vector& input_sizes, - const std::vector& block_size, - const std::string& mapping_type, - int64_t quant_min, - int64_t quant_max, - double eps, - at::ScalarType in_dtype = at::kFloat) { - // Test with buffer storage for both input and output - test_vulkan_choose_qparams_affine_impl( - input_sizes, - block_size, - mapping_type, - quant_min, - quant_max, - eps, - in_dtype, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - // Test with texture storage for input and buffer storage for output - // (shader always uses buffer storage for outputs) - test_vulkan_choose_qparams_affine_impl( - input_sizes, - block_size, - mapping_type, - quant_min, - quant_max, - eps, - in_dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kBuffer); -} - -TEST(VulkanChooseQParamsAffineTest, test_1d_asymmetric) { - // 1D: 12 Tensor, block_size is 3 - test_vulkan_choose_qparams_affine( - {12}, // input_sizes - {3}, // block_size - "ASYMMETRIC", // mapping_type - -128, // quant_min (char min) - 127, // quant_max (char max) - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_2d_symmetric) { - // 2D: 8x6 Tensor, block_size is 2x3 - test_vulkan_choose_qparams_affine( - {8, 6}, // input_sizes - {2, 3}, // block_size - "SYMMETRIC", // mapping_type - -128, // quant_min (char min) - 127, // quant_max (char max) - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_3d_symmetric_no_clipping) { - // 3D: 6x4x6 Tensor, block_size is 3x2x2 - test_vulkan_choose_qparams_affine( - {6, 4, 6}, // input_sizes - {3, 2, 2}, // block_size - "SYMMETRIC_NO_CLIPPING_ERR", // mapping_type - -128, // quant_min (char min) - 127, // quant_max (char max) - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_4d_asymmetric) { - // 4D: 4x6x6x6 Tensor, block_size is 2x3x2x3 - test_vulkan_choose_qparams_affine( - {4, 6, 6, 6}, // input_sizes (reduced from 8 to 4 to make test faster) - {2, 3, 2, 3}, // block_size - "ASYMMETRIC", // mapping_type - -128, // quant_min (char min) - 127, // quant_max (char max) - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_per_tensor) { - // Per-tensor: block_size equals tensor size - test_vulkan_choose_qparams_affine( - {4, 6, 8}, // input_sizes - {4, 6, 8}, // block_size equals tensor size - "ASYMMETRIC", // mapping_type - -128, // quant_min (char min) - 127, // quant_max (char max) - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_per_token) { - // Per-token: block_size is all 1s except last dimension - test_vulkan_choose_qparams_affine( - {4, 6, 8}, // input_sizes - {1, 1, 8}, // block_size is all 1s except last dimension - "ASYMMETRIC", // mapping_type - -128, // quant_min (char min) - 127, // quant_max (char max) - 
1e-5, // eps - at::kFloat); // input dtype -} - -// Additional tests for choose_qparams_affine - -TEST(VulkanChooseQParamsAffineTest, test_uint8_range) { - // Test with uint8 range (0-255) - test_vulkan_choose_qparams_affine( - {6, 8}, // input_sizes - {2, 4}, // block_size - "ASYMMETRIC", // mapping_type - 0, // quant_min (uint8 min) - 255, // quant_max (uint8 max) - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_int16_range) { - // Test with int16 range (-32768 to 32767) - test_vulkan_choose_qparams_affine( - {6, 8}, // input_sizes - {2, 4}, // block_size - "SYMMETRIC", // mapping_type - -32768, // quant_min (int16 min) - 32767, // quant_max (int16 max) - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_larger_eps) { - // Test with larger epsilon value - test_vulkan_choose_qparams_affine( - {6, 8}, // input_sizes - {2, 4}, // block_size - "ASYMMETRIC", // mapping_type - -128, // quant_min - 127, // quant_max - 1e-2, // larger eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_per_channel_first_dim) { - // Per-channel quantization on first dimension - test_vulkan_choose_qparams_affine( - {8, 6, 4}, // input_sizes - {1, 6, 4}, // block_size (per-channel on dim 0) - "SYMMETRIC", // mapping_type - -128, // quant_min - 127, // quant_max - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_per_channel_middle_dim) { - // Per-channel quantization on middle dimension - test_vulkan_choose_qparams_affine( - {4, 8, 6}, // input_sizes - {4, 1, 6}, // block_size (per-channel on dim 1) - "SYMMETRIC", // mapping_type - -128, // quant_min - 127, // quant_max - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_mixed_block_sizes) { - // Mixed block sizes (some dimensions fully quantized, some partially) - test_vulkan_choose_qparams_affine( - {8, 6, 10}, // input_sizes - {4, 6, 2}, // block_size (mixed: partial, full, partial) - "ASYMMETRIC", // mapping_type - -128, // quant_min - 127, // quant_max - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_small_tensor) { - // Test with a small tensor - test_vulkan_choose_qparams_affine( - {2, 3}, // small input_sizes - {2, 3}, // block_size (full tensor) - "ASYMMETRIC", // mapping_type - -128, // quant_min - 127, // quant_max - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_asymmetric_narrow_range) { - // Test with a narrow quantization range - test_vulkan_choose_qparams_affine( - {6, 8}, // input_sizes - {2, 4}, // block_size - "ASYMMETRIC", // mapping_type - -10, // quant_min (narrow range) - 10, // quant_max (narrow range) - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_symmetric_narrow_range) { - // Test with a narrow quantization range with symmetric mapping - test_vulkan_choose_qparams_affine( - {6, 8}, // input_sizes - {2, 4}, // block_size - "SYMMETRIC", // mapping_type - -10, // quant_min (narrow range) - 10, // quant_max (narrow range) - 1e-5, // eps - at::kFloat); // input dtype -} - -TEST(VulkanChooseQParamsAffineTest, test_symmetric_no_clipping_narrow_range) { - // Test with a narrow quantization range with symmetric no clipping mapping - test_vulkan_choose_qparams_affine( - {6, 8}, // input_sizes - {2, 4}, // block_size - "SYMMETRIC_NO_CLIPPING_ERR", // mapping_type - -10, // quant_min (narrow range) - 10, // quant_max (narrow range) - 
1e-5, // eps - at::kFloat); // input dtype -} \ No newline at end of file diff --git a/backends/vulkan/test/op_tests/quantize_test.cpp b/backends/vulkan/test/op_tests/quantize_test.cpp deleted file mode 100644 index 86eebcf9b14..00000000000 --- a/backends/vulkan/test/op_tests/quantize_test.cpp +++ /dev/null @@ -1,2188 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include -#include - -#include -#include - -#include "test_utils.h" - -#include -#include -#include - -float eps = 1e-7; - -namespace torch { -namespace executor { -namespace native { - -// Forward declarations of the functions we're testing -Tensor& quantize_per_tensor_out( - const Tensor& input, - double scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out); - -Tensor& quantize_per_token_out( - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out); - -Tensor& quantize_per_channel_out( - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out); - -Tensor& quantize_per_tensor_tensor_args_out( - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out); - -// Wrapper function for quantize_per_tensor_out without context -Tensor& quantize_per_tensor_out_no_context( - const Tensor& input, - double scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out) { - return torch::executor::native::quantize_per_tensor_out( - input, scale, zero_point, quant_min, quant_max, dtype, out); -} - -// Wrapper function for quantize_per_token_out without context -Tensor& quantize_per_token_out_no_context( - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out) { - return torch::executor::native::quantize_per_token_out( - input, scale, zero_point, quant_min, quant_max, dtype, out); -} - -// Wrapper function for quantize_per_channel_out without context -Tensor& quantize_per_channel_out_no_context( - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out) { - return torch::executor::native::quantize_per_channel_out( - input, scale, zero_point, axis, quant_min, quant_max, dtype, out); -} - -// Wrapper function for quantize_per_tensor_tensor_args_out without context -Tensor& quantize_per_tensor_tensor_args_out_no_context( - const Tensor& input, - const Tensor& scale, - const Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - ScalarType dtype, - Tensor& out) { - return torch::executor::native::quantize_per_tensor_tensor_args_out( - input, scale, zero_point, quant_min, quant_max, dtype, out); -} - -// ATen wrapper for quantize_per_tensor -at::Tensor quantize_per_tensor_aten( - const at::Tensor& input, - double scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - auto out = at::empty_like(input, dtype); - ScalarType et_dtype = at_scalartype_to_et_scalartype(dtype); - - 
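- // WRAP_TO_ATEN adapts the ExecuTorch out-variant kernel so it can be called
- // with ATen tensors; the numeric argument is the index of the `out` tensor
- // in the wrapped function's argument list (6 here).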
WRAP_TO_ATEN(quantize_per_tensor_out_no_context, 6) - (input, scale, zero_point, quant_min, quant_max, et_dtype, out); - return out; -} - -// ATen wrapper for quantize_per_token -at::Tensor quantize_per_token_aten( - const at::Tensor& input, - const at::Tensor& scale, - const at::Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - auto out = at::empty_like(input, dtype); - ScalarType et_dtype = at_scalartype_to_et_scalartype(dtype); - - WRAP_TO_ATEN(quantize_per_token_out_no_context, 6) - (input, scale, zero_point, quant_min, quant_max, et_dtype, out); - return out; -} - -// ATen wrapper for quantize_per_channel -at::Tensor quantize_per_channel_aten( - const at::Tensor& input, - const at::Tensor& scale, - const at::Tensor& zero_point, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - auto out = at::empty_like(input, dtype); - ScalarType et_dtype = at_scalartype_to_et_scalartype(dtype); - - WRAP_TO_ATEN(quantize_per_channel_out_no_context, 7) - (input, scale, zero_point, axis, quant_min, quant_max, et_dtype, out); - return out; -} - -// ATen wrapper for quantize_per_tensor with tensor args -at::Tensor quantize_per_tensor_tensor_args_aten( - const at::Tensor& input, - const at::Tensor& scale, - const at::Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - auto out = at::empty_like(input, dtype); - ScalarType et_dtype = at_scalartype_to_et_scalartype(dtype); - - WRAP_TO_ATEN(quantize_per_tensor_tensor_args_out_no_context, 6) - (input, scale, zero_point, quant_min, quant_max, et_dtype, out); - return out; -} - -} // namespace native -} // namespace executor -} // namespace torch - -void check_quantize_args( - int64_t quant_min, - int64_t quant_max, - c10::ScalarType out_dtype) { - using namespace vkcompute; - int32_t quant_min_lower_bound = 0, quant_max_upper_bound = 0; - switch (out_dtype) { - case c10::kByte: - quant_min_lower_bound = - static_cast(std::numeric_limits::min()); - quant_max_upper_bound = - static_cast(std::numeric_limits::max()); - break; - case c10::kChar: - quant_min_lower_bound = - static_cast(std::numeric_limits::min()); - quant_max_upper_bound = - static_cast(std::numeric_limits::max()); - break; - case c10::kBits16: - case c10::kUInt16: - quant_min_lower_bound = std::numeric_limits::min(); - quant_max_upper_bound = std::numeric_limits::max(); - break; - case c10::kShort: - quant_min_lower_bound = std::numeric_limits::min(); - quant_max_upper_bound = std::numeric_limits::max(); - break; - case c10::kInt: - quant_min_lower_bound = std::numeric_limits::min(); - quant_max_upper_bound = std::numeric_limits::max(); - break; - default: - VK_CHECK_COND(false, "Unsupported dtype: ", scalar_type_name(out_dtype)); - } - VK_CHECK_COND( - quant_min >= quant_min_lower_bound, - "quant_min out of bound for dtype, expected quant_min_lower_bound: ", - quant_min_lower_bound, - " actual quant_min: ", - quant_min); - - VK_CHECK_COND( - quant_max <= quant_max_upper_bound, - "quant_max out of bound for dtype, expected quant_max_upper_bound: ", - quant_max_upper_bound, - " actual quant_max: ", - quant_max); -} - -/** - * Helper function to validate quantize_per_channel arguments - * Similar to the validation in op_quantize.cpp - */ -void check_quantize_per_channel_args( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t axis) { - // Normalize axis - int64_t normalized_axis = axis; - if (normalized_axis < 0) { - normalized_axis += 
input_sizes.size(); - } - - ASSERT_GE(normalized_axis, 0) - << "axis " << axis << " is not legal, normalized axis " << normalized_axis - << " should be >= 0"; - - ASSERT_LT(normalized_axis, static_cast(input_sizes.size())) - << "axis " << axis << " is not legal, normalized axis " << normalized_axis - << " should be < input.dim() " << input_sizes.size(); - - int64_t num_channels = input_sizes[normalized_axis]; - - ASSERT_EQ(num_channels, static_cast(scales.size())) - << "Expected scales.size() to match input.size(axis) (" << num_channels - << "), but got " << scales.size(); - - ASSERT_EQ(num_channels, static_cast(zero_points.size())) - << "Expected zero_points.size() to match input.size(axis) (" - << num_channels << "), but got " << zero_points.size(); -} - -// -// Reference Implementation -// - -/* - * Reference implementation of quantize_per_tensor - */ -at::Tensor quantize_per_tensor_reference_impl( - const at::Tensor& input, - double scale, - int64_t zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - // Create output tensor with the target dtype - at::Tensor out = at::empty_like(input, dtype); - - // Quantize the input tensor - float inv_scale = 1.0 / scale; - - // Iterate through the tensor and quantize each element - at::Tensor float_input = input.to(at::kFloat); - at::Tensor float_values = float_input.flatten(); - - auto out_flat = out.flatten(); - - for (int i = 0; i < float_values.numel(); i++) { - float value = float_values[i].item(); - int64_t qvalue = zero_point + std::nearbyint(inv_scale * value); - - qvalue = std::max(qvalue, quant_min); - qvalue = std::min(qvalue, quant_max); - - if (dtype == at::kByte) { - out_flat[i] = static_cast(qvalue); - } else if (dtype == at::kChar) { - out_flat[i] = static_cast(qvalue); - } else if (dtype == at::kShort) { - out_flat[i] = static_cast(qvalue); - } else if (dtype == at::kInt) { - out_flat[i] = static_cast(qvalue); - } else if (dtype == at::kLong) { - out_flat[i] = static_cast(qvalue); - } - } - - return out.reshape(input.sizes()); -} - -/* - * Reference implementation of quantize_per_token - */ -at::Tensor quantize_per_token_reference_impl( - const at::Tensor& input, - const at::Tensor& scale, - const at::Tensor& zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - // Create output tensor with the target dtype - at::Tensor out = at::empty_like(input, dtype); - - // Calculate number of tokens - int num_tokens = 1; - for (int i = 0; i < input.dim() - 1; i++) { - num_tokens *= input.size(i); - } - - // Verify that the number of tokens matches the size of scale and zero_point - // tensors - assert(num_tokens == scale.numel()); - assert(num_tokens == zero_point.numel()); - - // Reshape input to [num_tokens, last_dim] - at::Tensor reshaped_input = input.reshape({num_tokens, input.size(-1)}); - at::Tensor reshaped_out = out.reshape({num_tokens, input.size(-1)}); - - // Quantize each token separately - for (int token_idx = 0; token_idx < num_tokens; token_idx++) { - // Use float for scale since Vulkan doesn't support double - float token_scale = scale[token_idx].item(); - // Use int for zero_point since Vulkan doesn't support int64_t - int token_zero_point = zero_point[token_idx].item(); - - float inv_scale = 1.0 / token_scale; - - // Quantize the token - for (int i = 0; i < input.size(-1); i++) { - float value = reshaped_input[token_idx][i].item(); - int qvalue = token_zero_point + std::nearbyint(inv_scale * value); - - qvalue = std::max(qvalue, quant_min); - qvalue = std::min(qvalue, 
quant_max); - - if (dtype == at::kByte) { - reshaped_out[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kChar) { - reshaped_out[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kShort) { - reshaped_out[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kInt) { - reshaped_out[token_idx][i] = static_cast(qvalue); - } else if (dtype == at::kLong) { - reshaped_out[token_idx][i] = static_cast(qvalue); - } - } - } - - return out; -} - -/* - * Reference implementation of quantize_per_channel - */ -at::Tensor quantize_per_channel_reference_impl( - const at::Tensor& input, - const at::Tensor& scale, - const at::Tensor& zero_point, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType dtype) { - // Normalize axis to handle negative values - int64_t normalized_axis = axis; - if (normalized_axis < 0) { - normalized_axis += input.dim(); - } - - // Create output tensor with the same shape as input but with target dtype - at::Tensor output = at::empty_like(input, dtype); - - // Get the number of channels along the quantization axis - int64_t num_channels = input.size(normalized_axis); - - // Calculate strides for efficient indexing - std::vector input_strides; - std::vector input_sizes; - for (int64_t i = 0; i < input.dim(); i++) { - input_sizes.push_back(input.size(i)); - input_strides.push_back(input.stride(i)); - } - - // Get data pointers - const float* input_data = input.const_data_ptr(); - const double* scale_data = scale.const_data_ptr(); - const int64_t* zero_point_data = zero_point.const_data_ptr(); - - // Iterate through all elements in the tensor - int64_t total_elements = input.numel(); - - // Helper lambda to convert flat index to multi-dimensional coordinates - auto flat_to_coords = [&](int64_t flat_idx, std::vector& coords) { - int64_t remaining = flat_idx; - for (int64_t dim = input.dim() - 1; dim >= 0; dim--) { - coords[dim] = remaining % input_sizes[dim]; - remaining /= input_sizes[dim]; - } - }; - - // Process each element - std::vector coords(input.dim()); - for (int64_t flat_idx = 0; flat_idx < total_elements; flat_idx++) { - // Convert flat index to coordinates - flat_to_coords(flat_idx, coords); - - // Get the channel index for this element - int64_t channel_idx = coords[normalized_axis]; - - // Get the quantization parameters for this channel - double channel_scale = scale_data[channel_idx]; - int64_t channel_zero_point = zero_point_data[channel_idx]; - - // Get the input value - float input_value = input_data[flat_idx]; - - // Apply quantization formula: round(input / scale) + zero_point - float inv_scale = 1.0f / static_cast(channel_scale); - int64_t quantized_value = static_cast( - static_cast(channel_zero_point) + - std::nearbyint(static_cast(inv_scale * input_value))); - - // Clamp to quantization bounds - quantized_value = std::max(quantized_value, quant_min); - quantized_value = std::min(quantized_value, quant_max); - - // Store the result based on output dtype - switch (dtype) { - case at::kByte: { - uint8_t* output_data = output.mutable_data_ptr(); - output_data[flat_idx] = static_cast(quantized_value); - break; - } - case at::kChar: { - int8_t* output_data = output.mutable_data_ptr(); - output_data[flat_idx] = static_cast(quantized_value); - break; - } - case at::kShort: { - int16_t* output_data = output.mutable_data_ptr(); - output_data[flat_idx] = static_cast(quantized_value); - break; - } - case at::kInt: { - int32_t* output_data = output.mutable_data_ptr(); - output_data[flat_idx] = 
static_cast(quantized_value); - break; - } - default: - assert(false && "Unsupported output dtype"); - } - } - - return output; -} - -// Forward declaration of implementation functions -void test_vulkan_quantize_per_token_impl( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype, - at::ScalarType dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage); - -void test_vulkan_quantize_per_channel_impl( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype, - at::ScalarType dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage); - -void test_vulkan_quantize_per_tensor_tensor_impl( - const std::vector& input_sizes, - float scale, - int zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype, - at::ScalarType dtype, - const vkcompute::utils::StorageType in_storage, - const vkcompute::utils::StorageType out_storage); - -// Wrapper function to test both buffer and texture storage types -void test_vulkan_quantize_per_token( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt) { - // Test with buffer storage - test_vulkan_quantize_per_token_impl( - input_sizes, - scales, - zero_points, - quant_min, - quant_max, - in_dtype, - dtype, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - // If the in_dtype is a double, convert to float for texture implementation - // since they don't support 64bit as inputs - if (in_dtype == at::kDouble) { - in_dtype = at::kFloat; - } - - // Test with texture storage - test_vulkan_quantize_per_token_impl( - input_sizes, - scales, - zero_points, - quant_min, - quant_max, - in_dtype, - dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -// Wrapper function to test both buffer and texture storage types -void test_vulkan_quantize_per_channel( - const std::vector& input_sizes, - const std::vector& scales, - const std::vector& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt) { - // Test with buffer storage - test_vulkan_quantize_per_channel_impl( - input_sizes, - scales, - zero_points, - axis, - quant_min, - quant_max, - in_dtype, - dtype, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - // If the in_dtype is a double, convert to float for texture implementation - // since they don't support 64bit as inputs - if (in_dtype == at::kDouble) { - in_dtype = at::kFloat; - } - - test_vulkan_quantize_per_channel_impl( - input_sizes, - scales, - zero_points, - axis, - quant_min, - quant_max, - in_dtype, - dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -// Wrapper function to test both buffer and texture storage types -void test_vulkan_quantize_per_tensor_tensor( - const std::vector& input_sizes, - float scale, - int zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt) { - // Test with buffer storage - test_vulkan_quantize_per_tensor_tensor_impl( - input_sizes, - scale, - zero_point, - quant_min, - quant_max, - 
in_dtype, - dtype, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - // If the in_dtype is a double, convert to float for texture implementation - // since they don't support 64bit as inputs - if (in_dtype == at::kDouble) { - in_dtype = at::kFloat; - } - - // Test with texture storage - test_vulkan_quantize_per_tensor_tensor_impl( - input_sizes, - scale, - zero_point, - quant_min, - quant_max, - in_dtype, - dtype, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -void test_reference_quantize_per_tensor( - const std::vector& input_sizes, - float scale, - int zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt) { - check_quantize_args(quant_min, quant_max, dtype); - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype)); - - // Fill with a simple pattern: values from 0 to 1 in steps - float step = 1.0f / (input.numel() - 1); - auto flat_input = input.flatten(); - for (int i = 0; i < flat_input.numel(); i++) { - flat_input[i] = i * step; - } - - // Reshape back to original dimensions - input = flat_input.reshape(input_sizes_int64); - - scale = scale < eps ? eps : scale; - - // Get reference output - at::Tensor reference_out = quantize_per_tensor_reference_impl( - input, scale, zero_point, quant_min, quant_max, dtype); - - // Get implementation output - at::Tensor impl_out = torch::executor::native::quantize_per_tensor_aten( - input, scale, zero_point, quant_min, quant_max, dtype); - - // Convert to int for consistent display regardless of underlying type - at::Tensor reference_int = reference_out.to(at::kInt); - at::Tensor impl_int = impl_out.to(at::kInt); - - const bool output_correct = at::equal(reference_int, impl_int); - if (!output_correct) { - at::Tensor diffs = at::abs(reference_int - impl_int); - - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " scale: " << scale << std::endl; - std::cout << " zero_point: " << zero_point << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference:" << std::endl; - std::cout << reference_int << std::endl; - std::cout << "my_reference:" << std::endl; - std::cout << impl_int << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -TEST( - VulkanQuantizePerTensorTest, - test_reference_quantize_per_tensor_float_to_int8) { - test_reference_quantize_per_tensor( - {2, 3, 4}, // input sizes - 0.1, // scale - 0, // zero_point - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -TEST( - VulkanQuantizePerTensorTest, - test_reference_quantize_per_tensor_float_to_int32) { - test_reference_quantize_per_tensor( - {2, 3, 4}, // input sizes - 0.04, // scale - 5, // zero_point - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kFloat, - at::kInt); -} - -TEST( - VulkanQuantizePerTensorTest, - test_reference_quantize_per_tensor_half_to_uint8) { - test_reference_quantize_per_tensor( - {2, 3, 4}, // input sizes - 0.2, // scale - 2, // zero_point - 0, // quant_min - 255, // quant_max - at::kHalf, - at::kByte); -} - -TEST( - VulkanQuantizePerTensorTest, - test_reference_quantize_per_tensor_half_to_int32) { - test_reference_quantize_per_tensor( - {2, 3, 4}, // input sizes - 0.01, // scale - 1, // 
zero_point - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kHalf, - at::kInt); -} - -// No Vulkan tests for quantized_decomposed.quantize_per_tensor.default -// because it is not going to be implemented in Vulkan since we will -// be handling any future calls to this op via the export stage - -void test_reference_quantize_per_token( - const std::vector& input_sizes, - const std::vector& pre_scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt) { - check_quantize_args(quant_min, quant_max, dtype); - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype)); - - // Fill with a simple pattern: values from 0 to 1 in steps - float step = 1.0 / (input.numel() - 1); - auto flat_input = input.flatten(); - for (int i = 0; i < flat_input.numel(); i++) { - flat_input[i] = i * step; - } - - // Reshape back to original dimensions - input = flat_input.reshape(input_sizes_int64); - - // Calculate number of tokens - int num_tokens = 1; - for (int i = 0; i < input.dim() - 1; i++) { - num_tokens *= input.size(i); - } - - // Verify that the number of tokens matches the size of scales and zero_points - ASSERT_EQ(num_tokens, pre_scales.size()); - ASSERT_EQ(num_tokens, zero_points.size()); - - std::vector scales = pre_scales; - for (auto& s : scales) { - s = s < eps ? eps : s; - } - - // Create scale and zero_point tensors - at::Tensor scale_tensor = - at::tensor(scales, at::device(at::kCPU).dtype(at::kDouble)); - at::Tensor zero_point_tensor = - at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong)); - - // Get reference output - at::Tensor reference_out = quantize_per_token_reference_impl( - input, scale_tensor, zero_point_tensor, quant_min, quant_max, dtype); - - // Get implementation output - at::Tensor impl_out = torch::executor::native::quantize_per_token_aten( - input, scale_tensor, zero_point_tensor, quant_min, quant_max, dtype); - - // Convert to int for consistent display regardless of underlying type - at::Tensor reference_int = reference_out.to(at::kInt); - at::Tensor impl_int = impl_out.to(at::kInt); - - const bool output_correct = at::equal(reference_int, impl_out); - if (!output_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " scale(s):"; - for (size_t i = 0; i < scales.size(); i++) { - std::cout << " " << scales[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " zero_point(s):"; - for (size_t i = 0; i < zero_points.size(); i++) { - std::cout << " " << zero_points[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference:" << std::endl; - std::cout << reference_int << std::endl; - std::cout << "my_reference:" << std::endl; - std::cout << impl_out << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -void test_vulkan_quantize_per_token_impl( - const std::vector& input_sizes, - const std::vector& pre_scales, - const std::vector& zero_points, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt, - const vkcompute::utils::StorageType in_storage = - vkcompute::utils::kTexture3D, - const vkcompute::utils::StorageType 
out_storage = - vkcompute::utils::kTexture3D) { - check_quantize_args(quant_min, quant_max, dtype); - int num_tokens = 1; - for (int i = 0; i < input_sizes.size() - 1; i++) { - num_tokens *= input_sizes[i]; - } - - ASSERT_EQ(num_tokens, pre_scales.size()); - ASSERT_EQ(num_tokens, zero_points.size()); - - std::vector scales = pre_scales; - for (auto& s : scales) { - s = s < eps ? eps : s; - } - - // Create input tensor with random values - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::rand(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype)); - at::Tensor scale_tensor = - at::tensor(scales, at::device(at::kCPU).dtype(at::kDouble)); - at::Tensor zero_point_tensor = - at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong)); - - // Get reference output to show what we would compare against - at::Tensor reference_out = torch::executor::native::quantize_per_token_aten( - input, scale_tensor, zero_point_tensor, quant_min, quant_max, dtype); - - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - IOValueRef r_input = graph.add_input_tensor( - input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage); - IOValueRef r_scale = graph.add_input_tensor( - scale_tensor.sizes().vec(), - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked); - IOValueRef r_zero_point = graph.add_input_tensor( - zero_point_tensor.sizes().vec(), - vkapi::kInt, - utils::kBuffer, - utils::kWidthPacked); - - const ValueRef r_quant_min = graph.add_scalar(quant_min); - const ValueRef r_quant_max = graph.add_scalar(quant_max); - - const ValueRef r_out = graph.add_tensor( - input.sizes().vec(), from_at_scalartype(dtype), out_storage); - - const ValueRef r_dtype = - graph.add_scalar(static_cast(dtype)); - - VK_GET_OP_FN("quantized_decomposed.quantize_per_token.default") - (graph, - { - r_input.value, - r_scale.value, - r_zero_point.value, - r_quant_min, - r_quant_max, - r_dtype, - r_out, - }); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - - graph.prepack(); - - // Copy input data to GPU - graph.copy_into_staging( - r_input.staging, input.const_data_ptr(), input.numel()); - - // Convert scale tensor to float and copy to GPU - at::Tensor scale_float = scale_tensor.to(at::kFloat); - graph.copy_into_staging( - r_scale.staging, scale_float.const_data_ptr(), scale_float.numel()); - - // Convert zero_point tensor to int and copy to GPU - at::Tensor zero_point_int = zero_point_tensor.to(at::kInt); - graph.copy_into_staging( - r_zero_point.staging, - zero_point_int.const_data_ptr(), - zero_point_int.numel()); - - // Execute the graph - graph.execute(); - - // Copy output data back to CPU - at::Tensor vk_out = at::empty_like(reference_out).contiguous(); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - // Compare outputs - at::Tensor reference_int = reference_out.to(at::kInt); - at::Tensor vk_int = vk_out.to(at::kInt); - - // Tolerance is 1 to address rounding errors and fp math differences between - // CPU/GPU - const bool output_correct = - at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1); - if (!output_correct) { - at::Tensor diffs = at::abs(reference_int - vk_int); - - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " scale(s):"; - for (size_t i = 0; i < scales.size(); i++) { - std::cout << " " << scales[i] << " "; - } - std::cout << "" << std::endl; - std::cout 
<< " zero_point(s):"; - for (size_t i = 0; i < zero_points.size(); i++) { - std::cout << " " << zero_points[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " storage type: " - << (in_storage == vkcompute::utils::kBuffer ? "buffer" - : "texture") - << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference:" << std::endl; - std::cout << reference_int << std::endl; - std::cout << "vulkan:" << std::endl; - std::cout << vk_int << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -TEST( - VulkanQuantizePerTokenTest, - test_reference_quantize_per_token_float_to_int8) { - std::vector scales = {0.1, 0, 0.3, 0.1, 0.2, 0.3}; - std::vector zero_points = {1, 2, 3, 0, -1, -2}; - - test_reference_quantize_per_token( - {2, 3, 4}, // input sizes (2*3=6 tokens) - scales, - zero_points, - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -TEST( - VulkanQuantizePerTokenTest, - test_reference_quantize_per_token_float_to_int32) { - std::vector scales = {0.1, 0, 0.3, 0.1, 0.2, 0.3}; - std::vector zero_points = {1, 2, 3, 0, -1, -2}; - - test_reference_quantize_per_token( - {2, 3, 4}, // input sizes (2*3=6 tokens) - scales, - zero_points, - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kFloat, - at::kInt); -} - -TEST( - VulkanQuantizePerTokenTest, - test_reference_quantize_per_token_half_to_int32) { - std::vector scales = {0.1, 0, 0.3, 0.1, 0.2, 0.3}; - std::vector zero_points = {1, 2, 3, 0, -1, -2}; - - test_reference_quantize_per_token( - {2, 3, 4}, // input sizes (2*3=6 tokens) - scales, - zero_points, - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kHalf, - at::kInt); -} - -TEST( - VulkanQuantizePerTokenTest, - test_reference_quantize_per_token_half_to_uint8) { - std::vector scales = {0.1, 0, 0.3, 0.1, 0.2, 0.3}; - std::vector zero_points = {1, 2, 3, 0, -1, -2}; - - test_reference_quantize_per_token( - {2, 3, 4}, // input sizes (2*3=6 tokens) - scales, - zero_points, - 0, // quant_min - 255, // quant_max - at::kHalf, - at::kByte); -} - -TEST( - VulkanQuantizePerTokenTest, - test_vulkan_quantize_per_token_float_to_uint8) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = { - -0.5, -0.3, -0.2, 0, 0.1, 0.8, 0.1, 0.2, 0.3, 0.4}; - std::vector zero_points = {-8, 0, 15, 20, 19, 12, 47, 1, -50, -12}; - - test_vulkan_quantize_per_token( - {5, 2, 4}, // input sizes (5*2=10 tokens) - scales, - zero_points, - 0, // quant_min - 255, // quant_max - at::kFloat, - at::kByte); -} - -TEST(VulkanQuantizePerTokenTest, test_vulkan_quantize_per_token_float_to_int8) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = { - -0.5, -0.3, -0.2, 0, 0.1, 0.8, 0.1, 0.2, 0.3, 0.4}; - std::vector zero_points = {-8, 0, 15, 20, 19, 12, 47, 1, -50, -12}; - - test_vulkan_quantize_per_token( - {5, 2, 4}, // input sizes (5 tokens) - scales, - zero_points, - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -TEST( - VulkanQuantizePerTokenTest, - test_vulkan_quantize_per_token_float_to_int32) { - std::vector scales = { - -0.5, -0.3, -0.2, 0, 0.1, 0.8, 0.1, 0.2, 0.3, 0.4}; - std::vector zero_points = {-8, 0, 15, 20, 19, 12, 47, 1, -50, -12}; - - 
test_vulkan_quantize_per_token( - {5, 2, 4}, // input sizes (5*2=10 tokens) - scales, - zero_points, - -2147483648, // quant_min - 2147483647, // quant_max - at::kFloat, - at::kInt); -} - -TEST( - VulkanQuantizePerTokenTest, - test_vulkan_quantize_per_token_float_to_int32_small_scales) { - std::vector scales = { - 0, - 2.9387358770557188e-39f, - 1.40129846e-45f, - 1.17549435e-38f, - 0.0000000000001}; - std::vector zero_points = {20, -10, 15, 200, 50}; - - test_vulkan_quantize_per_token( - {5, 2}, // input sizes (3 tokens) - scales, - zero_points, - -2147483648, // quant_min - 2147483647, // quant_max - at::kFloat, - at::kInt); -} - -TEST( - VulkanQuantizePerTokenTest, - test_vulkan_quantize_per_token_float_to_uint8_many_tokens) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales(18, 0.1); - std::vector zero_points(18, 5); - - // Alternate scale values - for (size_t i = 0; i < scales.size(); i++) { - scales[i] = (i % 2 == 0) ? 0.3 : -0.5; - } - - test_vulkan_quantize_per_token( - {3, 3, 2, 3}, // input sizes (3*3*2=18 tokens) - scales, - zero_points, - 0, // quant_min - 125, // quant_max - at::kFloat, - at::kByte); -} - -TEST(VulkanQuantizePerTokenTest, test_vulkan_quantize_per_token_half_to_int8) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_float16_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.1, 0.2}; - std::vector zero_points = {0, 5}; - - test_vulkan_quantize_per_token( - {2, 2}, // input sizes (2*2=4 tokens) - scales, - zero_points, - -128, // quant_min - 127, // quant_max - at::kHalf, // input dtype - at::kChar); // output dtype -} - -TEST( - VulkanQuantizePerTokenTest, - test_vulkan_quantize_per_token_double_to_int8) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.1, 0.2}; - std::vector zero_points = {0, 5}; - - test_vulkan_quantize_per_token( - {2, 2}, // input sizes (2*2=4 tokens) - scales, - zero_points, - -128, // quant_min - 127, // quant_max - at::kDouble, // input dtype - at::kChar); // output dtype -} - -void test_reference_quantize_per_channel( - const std::vector& input_sizes, - const std::vector& pre_scales, - const std::vector& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt) { - check_quantize_args(quant_min, quant_max, dtype); - check_quantize_per_channel_args(input_sizes, pre_scales, zero_points, axis); - - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::zeros(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype)); - - // Fill with a simple pattern: values from 0 to 1 in steps - float step = 1.0f / (input.numel() - 1); - auto flat_input = input.flatten(); - for (int i = 0; i < flat_input.numel(); i++) { - flat_input[i] = i * step; - } - - // Reshape back to original dimensions - input = flat_input.reshape(input_sizes_int64); - - std::vector scales = pre_scales; - for (auto& s : scales) { - s = s < eps ? 
eps : s; - } - - // Create scale and zero_point tensors - at::Tensor scale_tensor = - at::tensor(scales, at::device(at::kCPU).dtype(at::kDouble)); - at::Tensor zero_point_tensor = - at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong)); - - // Get reference output - at::Tensor my_ref = quantize_per_channel_reference_impl( - input, - scale_tensor, - zero_point_tensor, - axis, - quant_min, - quant_max, - dtype); - - // Get implementation output - at::Tensor cpu_ref = torch::executor::native::quantize_per_channel_aten( - input, - scale_tensor, - zero_point_tensor, - axis, - quant_min, - quant_max, - dtype); - - // Get direct ATen implementation output - c10::ScalarType aten_dtype = dtype; - if (dtype == at::kChar) { - aten_dtype = c10::kQInt8; - } else if (dtype == at::kByte) { - aten_dtype = c10::kQUInt8; - } - - // Normalize axis for ATen (it doesn't handle negative values) - int64_t normalized_axis = axis; - if (normalized_axis < 0) { - normalized_axis += input.dim(); - } - - at::Tensor aten_ref = at::quantize_per_channel( - input, scale_tensor, zero_point_tensor, normalized_axis, aten_dtype); - - // Convert to int for consistent display regardless of underlying type - at::Tensor my_ref_int = my_ref.to(at::kInt); - at::Tensor cpu_ref_int = cpu_ref.to(at::kInt); - // For quantized tensors, we need to use int_repr() to get the underlying - // integer values - at::Tensor aten_ref_int = aten_ref.int_repr().to(at::kInt); - - const bool output_correct = at::equal(my_ref_int, cpu_ref_int); - if (!output_correct) { - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " axis: " << axis << std::endl; - std::cout << " input sizes:"; - for (size_t i = 0; i < input_sizes.size(); i++) { - std::cout << " " << input_sizes[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " scale(s):"; - for (size_t i = 0; i < scales.size(); i++) { - std::cout << " " << scales[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " zero_point(s):"; - for (size_t i = 0; i < zero_points.size(); i++) { - std::cout << " " << zero_points[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "aten_ref:" << std::endl; - std::cout << aten_ref_int << std::endl; - std::cout << "cpu_ref:" << std::endl; - std::cout << cpu_ref_int << std::endl; - std::cout << "my_ref:" << std::endl; - std::cout << my_ref_int << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -void test_vulkan_quantize_per_channel_impl( - const std::vector& input_sizes, - const std::vector& pre_scales, - const std::vector& zero_points, - int64_t axis, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt, - const vkcompute::utils::StorageType in_storage = - vkcompute::utils::kTexture3D, - const vkcompute::utils::StorageType out_storage = - vkcompute::utils::kTexture3D) { - check_quantize_args(quant_min, quant_max, dtype); - check_quantize_per_channel_args(input_sizes, pre_scales, zero_points, axis); - - std::vector scales = pre_scales; - for (auto& s : scales) { - s = s < eps ? 
eps : s; - } - - // Create input tensor with random values - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::rand(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype)); - at::Tensor scale_tensor = - at::tensor(scales, at::device(at::kCPU).dtype(at::kDouble)); - at::Tensor zero_point_tensor = - at::tensor(zero_points, at::device(at::kCPU).dtype(at::kLong)); - - // Get reference output - at::Tensor reference_out = torch::executor::native::quantize_per_channel_aten( - input, - scale_tensor, - zero_point_tensor, - axis, - quant_min, - quant_max, - dtype); - - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - IOValueRef r_input = graph.add_input_tensor( - input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage); - IOValueRef r_scale = graph.add_input_tensor( - scale_tensor.sizes().vec(), - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked); - IOValueRef r_zero_point = graph.add_input_tensor( - zero_point_tensor.sizes().vec(), - vkapi::kInt, - utils::kBuffer, - utils::kWidthPacked); - - const ValueRef r_axis = graph.add_scalar(axis); - const ValueRef r_quant_min = graph.add_scalar(quant_min); - const ValueRef r_quant_max = graph.add_scalar(quant_max); - - const ValueRef r_out = graph.add_tensor( - input.sizes().vec(), from_at_scalartype(dtype), out_storage); - - const ValueRef r_dtype = - graph.add_scalar(static_cast(dtype)); - - VK_GET_OP_FN("quantized_decomposed.quantize_per_channel.default") - (graph, - { - r_input.value, - r_scale.value, - r_zero_point.value, - r_axis, - r_quant_min, - r_quant_max, - r_dtype, - r_out, - }); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - graph.prepack(); - - // Copy input data to GPU - graph.copy_into_staging( - r_input.staging, input.const_data_ptr(), input.numel()); - - // Convert scale tensor to float and copy to GPU - at::Tensor scale_float = scale_tensor.to(at::kFloat); - graph.copy_into_staging( - r_scale.staging, scale_float.const_data_ptr(), scale_float.numel()); - - // Convert zero_point tensor to int and copy to GPU - at::Tensor zero_point_int = zero_point_tensor.to(at::kInt); - graph.copy_into_staging( - r_zero_point.staging, - zero_point_int.const_data_ptr(), - zero_point_int.numel()); - - // Execute the graph - graph.execute(); - - // Copy output data back to CPU - at::Tensor vk_out = at::empty_like(reference_out).contiguous(); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - // Compare outputs - at::Tensor reference_int = reference_out.to(at::kInt); - at::Tensor vk_int = vk_out.to(at::kInt); - - // Tolerance is 1 to address rounding errors and fp math differences between - // CPU/GPU - const bool output_correct = - at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1); - if (!output_correct) { - at::Tensor diffs = at::abs(reference_int - vk_int); - - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " axis: " << axis << std::endl; - std::cout << " input sizes:"; - for (size_t i = 0; i < input_sizes.size(); i++) { - std::cout << " " << input_sizes[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " scale(s):"; - for (size_t i = 0; i < scales.size(); i++) { - std::cout << " " << scales[i] << " "; - } - std::cout << "" << std::endl; - std::cout << " zero_point(s):"; - for (size_t i = 0; i < zero_points.size(); i++) { - std::cout << " " << zero_points[i] << " "; - } - 
std::cout << "" << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " storage type: " - << (in_storage == vkcompute::utils::kBuffer ? "buffer" - : "texture") - << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference:" << std::endl; - std::cout << reference_int << std::endl; - std::cout << "vulkan:" << std::endl; - std::cout << vk_int << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -TEST( - VulkanQuantizePerChannelTest, - test_reference_quantize_per_channel_float_to_int8_3D_axis0) { - std::vector scales = {0.1, 0.2, 0.3}; - std::vector zero_points = {0, 5, -2}; - - test_reference_quantize_per_channel( - {3, 4, 2}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -TEST( - VulkanQuantizePerChannelTest, - test_reference_quantize_per_channel_float_to_int8_3D_axis2) { - std::vector scales = {0.1, 0.2}; - std::vector zero_points = {0, 5}; - - test_reference_quantize_per_channel( - {3, 4, 2}, // input sizes - scales, - zero_points, - 2, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -TEST( - VulkanQuantizePerChannelTest, - test_reference_quantize_per_channel_float_to_int8_3D_axisn1) { - std::vector scales = {0.1, 0.2}; - std::vector zero_points = {0, 5}; - - test_reference_quantize_per_channel( - {3, 4, 2}, // input sizes - scales, - zero_points, - -1, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -TEST( - VulkanQuantizePerChannelTest, - test_reference_quantize_per_channel_float_to_int8_4D_axis0) { - std::vector scales = {0.1, 0.2, 0.00002}; - std::vector zero_points = {0, 5, -4}; - - test_reference_quantize_per_channel( - {3, 4, 2, 5}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -// END OF REFERENCE TESTS - -TEST( - VulkanQuantizePerChannelTest, - test_vulkan_quantize_per_channel_float_to_int8_axis0) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales(9, 0.1f); - std::vector zero_points(9, 2); - - // 1D Tensor - test_vulkan_quantize_per_channel( - {9}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); - - // 2D Tensor - test_vulkan_quantize_per_channel( - {9, 14}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); - - // 3D Tensor - test_vulkan_quantize_per_channel( - {9, 7, 11}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 17, 5, 5}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); - - // 4D Tensor (negative axis) - test_vulkan_quantize_per_channel( - {5, 17, 5, 9}, // input sizes - scales, - zero_points, - -1, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -TEST( - VulkanQuantizePerChannelTest, - test_vulkan_quantize_per_channel_float_to_int8_axis1) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales(14, 0.001f); - std::vector zero_points(14, -5); - - // 2D Tensor - 
test_vulkan_quantize_per_channel( - {9, 14}, // input sizes - scales, - zero_points, - 1, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); - - // 3D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 11}, // input sizes - scales, - zero_points, - 1, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 5, 5}, // input sizes - scales, - zero_points, - 1, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); - - // 4D Tensor (negative axis) - test_vulkan_quantize_per_channel( - {9, 7, 14, 5}, // input sizes - scales, - zero_points, - -2, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -TEST( - VulkanQuantizePerChannelTest, - test_vulkan_quantize_per_channel_float_to_int8_axis2) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales(11, 0.5f); - std::vector zero_points(11, 12); - - // 3D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 11}, // input sizes - scales, - zero_points, - 2, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 11, 5}, // input sizes - scales, - zero_points, - 2, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); - - // 4D Tensor (negative axis) - test_vulkan_quantize_per_channel( - {9, 11, 14, 5}, // input sizes - scales, - zero_points, - -3, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -TEST( - VulkanQuantizePerChannelTest, - test_vulkan_quantize_per_channel_float_to_int8_axis3) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales(7, 0.5f); - std::vector zero_points(7, 12); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 11, 7}, // input sizes - scales, - zero_points, - 3, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); - - // 4D Tensor (negative axis) - test_vulkan_quantize_per_channel( - {7, 14, 11, 7}, // input sizes - scales, - zero_points, - -4, // axis - -128, // quant_min - 127, // quant_max - at::kFloat, - at::kChar); -} - -TEST( - VulkanQuantizePerChannelTest, - test_vulkan_quantize_per_channel_float_to_uint8_comprehensive) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.1, 0.2, 0.0001, 0.5, 0.02}; - std::vector zero_points = {0, 5, -5, 1, 12}; - - // 4D Tensor - test_vulkan_quantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - 0, // axis - 0, // quant_min - 255, // quant_max - at::kFloat, - at::kByte); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 5, 11, 7}, // input sizes - scales, - zero_points, - 1, // axis - 0, // quant_min - 255, // quant_max - at::kFloat, - at::kByte); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 5, 7}, // input sizes - scales, - zero_points, - 2, // axis - 0, // quant_min - 255, // quant_max - at::kFloat, - at::kByte); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 11, 5}, // input sizes - scales, - zero_points, - 3, // axis - 0, // quant_min - 255, // quant_max - at::kFloat, - at::kByte); - - // 4D Tensor (negative axis) - test_vulkan_quantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - -4, // axis - 0, // quant_min - 
255, // quant_max - at::kFloat, - at::kByte); -} - -TEST( - VulkanQuantizePerChannelTest, - test_vulkan_quantize_per_channel_half_to_8bit) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_float16_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.1, 0.2, 0.01, 0.5, 0.02}; - std::vector zero_points = {0, 5, 5, 1, 12}; - - // 4D Tensor - test_vulkan_quantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kHalf, - at::kChar); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 5, 11, 7}, // input sizes - scales, - zero_points, - 1, // axis - -128, // quant_min - 127, // quant_max - at::kHalf, - at::kChar); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 5, 7}, // input sizes - scales, - zero_points, - 2, // axis - 0, // quant_min - 255, // quant_max - at::kHalf, - at::kByte); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 11, 5}, // input sizes - scales, - zero_points, - 3, // axis - -128, // quant_min - 127, // quant_max - at::kHalf, - at::kChar); - - // 4D Tensor (negative axis) - test_vulkan_quantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - -4, // axis - 0, // quant_min - 255, // quant_max - at::kHalf, - at::kByte); -} - -TEST( - VulkanQuantizePerChannelTest, - test_vulkan_quantize_per_channel_double_to_8bit) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - std::vector scales = {0.1, 0.2, 0.01, 0.5, 0.02}; - std::vector zero_points = {0, 5, 5, 1, 12}; - - // 4D Tensor - test_vulkan_quantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - 0, // axis - -128, // quant_min - 127, // quant_max - at::kDouble, - at::kChar); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 5, 11, 7}, // input sizes - scales, - zero_points, - 1, // axis - -128, // quant_min - 127, // quant_max - at::kDouble, - at::kChar); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 5, 7}, // input sizes - scales, - zero_points, - 2, // axis - 0, // quant_min - 255, // quant_max - at::kDouble, - at::kByte); - - // 4D Tensor - test_vulkan_quantize_per_channel( - {9, 14, 11, 5}, // input sizes - scales, - zero_points, - 3, // axis - -128, // quant_min - 127, // quant_max - at::kDouble, - at::kChar); - - // 4D Tensor (negative axis) - test_vulkan_quantize_per_channel( - {5, 14, 11, 7}, // input sizes - scales, - zero_points, - -4, // axis - 0, // quant_min - 255, // quant_max - at::kDouble, - at::kByte); -} - -void test_vulkan_quantize_per_tensor_tensor_impl( - const std::vector& input_sizes, - float scale, - int zero_point, - int64_t quant_min, - int64_t quant_max, - at::ScalarType in_dtype = at::kFloat, - at::ScalarType dtype = at::kInt, - const vkcompute::utils::StorageType in_storage = - vkcompute::utils::kTexture3D, - const vkcompute::utils::StorageType out_storage = - vkcompute::utils::kTexture3D) { - check_quantize_args(quant_min, quant_max, dtype); - std::vector input_sizes_int64( - input_sizes.begin(), input_sizes.end()); - at::Tensor input = - at::rand(input_sizes_int64, at::device(at::kCPU).dtype(in_dtype)); - - scale = scale < eps ? 
eps : scale; - - // Create scale and zero_point as tensors (single element tensors) - at::Tensor scale_tensor = - at::tensor({scale}, at::device(at::kCPU).dtype(at::kDouble)); - at::Tensor zero_point_tensor = - at::tensor({zero_point}, at::device(at::kCPU).dtype(at::kLong)); - - // Get reference output using tensor variant - at::Tensor reference_out = - torch::executor::native::quantize_per_tensor_tensor_args_aten( - input, scale_tensor, zero_point_tensor, quant_min, quant_max, dtype); - - // Build Vulkan quantize_per_tensor.tensor graph - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(in_storage); - ComputeGraph graph(config); - - IOValueRef r_input = graph.add_input_tensor( - input.sizes().vec(), from_at_scalartype(input.scalar_type()), in_storage); - - // Add scale and zero_point as tensor inputs (buffer storage, width packed) - IOValueRef r_scale = graph.add_input_tensor( - scale_tensor.sizes().vec(), - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked); - IOValueRef r_zero_point = graph.add_input_tensor( - zero_point_tensor.sizes().vec(), - vkapi::kInt, - utils::kBuffer, - utils::kWidthPacked); - - const ValueRef r_quant_min = graph.add_scalar(quant_min); - const ValueRef r_quant_max = graph.add_scalar(quant_max); - - const ValueRef r_out = graph.add_tensor( - input.sizes().vec(), from_at_scalartype(dtype), out_storage); - - const ValueRef r_dtype = - graph.add_scalar(static_cast(dtype)); - - VK_GET_OP_FN("quantized_decomposed.quantize_per_tensor.tensor") - (graph, - { - r_input.value, - r_scale.value, - r_zero_point.value, - r_quant_min, - r_quant_max, - r_dtype, - r_out, - }); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - graph.prepack(); - - // Run Vulkan quantize_per_tensor.tensor - graph.copy_into_staging( - r_input.staging, input.const_data_ptr(), input.numel()); - - // Convert scale tensor to float and copy to GPU - at::Tensor scale_float = scale_tensor.to(at::kFloat); - graph.copy_into_staging( - r_scale.staging, scale_float.const_data_ptr(), scale_float.numel()); - - // Convert zero_point tensor to int and copy to GPU - at::Tensor zero_point_int = zero_point_tensor.to(at::kInt); - graph.copy_into_staging( - r_zero_point.staging, - zero_point_int.const_data_ptr(), - zero_point_int.numel()); - - graph.execute(); - - at::Tensor vk_out = at::empty_like(reference_out).contiguous(); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - // Compare outputs - // For quantized types, we need to compare the actual integer values - at::Tensor reference_int = reference_out.to(at::kInt); - at::Tensor vk_int = vk_out.to(at::kInt); - - // Tolerance is 1 to address rounding errors and fp math differences between - // CPU/GPU - const bool output_correct = - at::allclose(reference_int, vk_int, /*rtol=*/1, /*atol=*/1); - if (!output_correct) { - at::Tensor diffs = at::abs(reference_int - vk_int); - - std::cout << "\n" - << "Failed with parameters: " << std::endl; - std::cout << " scale: " << scale << std::endl; - std::cout << " zero_point: " << zero_point << std::endl; - std::cout << " quant_min: " << quant_min << std::endl; - std::cout << " quant_max: " << quant_max << std::endl; - std::cout << " storage type: " - << (in_storage == vkcompute::utils::kBuffer ? 
"buffer" - : "texture") - << std::endl; - - std::cout << "input:" << std::endl; - std::cout << input << std::endl; - std::cout << "reference:" << std::endl; - std::cout << reference_int << std::endl; - std::cout << "vulkan:" << std::endl; - std::cout << vk_int << std::endl; - } - - ASSERT_TRUE(output_correct); -} - -TEST( - VulkanQuantizePerTensorTensorTest, - test_vulkan_quantize_per_tensor_tensor_float_to_int8) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_quantize_per_tensor_tensor( - {2, 3, 4}, // input sizes - 0.01, // scale - 1, // zero_point - -128, // quant_min - 127, // quant_max - at::kFloat, // input dtype - at::kChar); // output dtype -} - -TEST( - VulkanQuantizePerTensorTensorTest, - test_vulkan_quantize_per_tensor_tensor_float_to_uint8) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_quantize_per_tensor_tensor( - {2, 3, 4, 12}, // input sizes - 0.1, // scale - 5, // zero_point - 0, // quant_min - 255, // quant_max - at::kFloat, // input dtype - at::kByte); // output dtype -} - -TEST( - VulkanQuantizePerTensorTensorTest, - test_vulkan_quantize_per_tensor_tensor_float_to_int32) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_quantize_per_tensor_tensor( - {2, 3}, // input sizes - 0.01, // scale - 12, // zero_point - std::numeric_limits::min(), // quant_min - std::numeric_limits::max(), // quant_max - at::kFloat, // input dtype - at::kInt); // output dtype -} - -TEST( - VulkanQuantizePerTensorTensorTest, - test_vulkan_quantize_per_tensor_tensor_half_to_uint8) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_quantize_per_tensor_tensor( - {3, 4}, // input sizes - 0.3, // scale - 2, // zero_point - 0, // quant_min - 255, // quant_max - at::kHalf, // input dtype - at::kByte); // output dtype -} - -TEST( - VulkanQuantizePerTensorTensorTest, - test_vulkan_quantize_per_tensor_tensor_double_to_int8) { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_vulkan_quantize_per_tensor_tensor( - {2, 3, 4}, // input sizes - 0.03, // scale - -2, // zero_point - -128, // quant_min - 127, // quant_max - at::kDouble, // input dtype - at::kChar); // output dtype -} diff --git a/backends/vulkan/test/op_tests/quantized_linear_test.cpp b/backends/vulkan/test/op_tests/quantized_linear_test.cpp deleted file mode 100644 index db95f4a793f..00000000000 --- a/backends/vulkan/test/op_tests/quantized_linear_test.cpp +++ /dev/null @@ -1,900 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include - -#include -#include -#include - -#include "test_utils.h" - -#include - -class VulkanLinearQCS4WTest : public ::testing::Test { - public: - void SetUp() override { - if (!vkcompute::api::context() - ->adapter_ptr() - ->supports_int16_shader_types()) { - GTEST_SKIP(); - } - } - - void TearDown() override { - // Clean up any resources if needed - } -}; - -class VulkanLinearQTA8AQGA4WTest : public ::testing::Test { - public: - void SetUp() override { - if (!vkcompute::api::context() - ->adapter_ptr() - ->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - } - - void TearDown() override { - // Clean up any resources if needed - } -}; - -// -// Reference Implementations -// - -at::Tensor linear_qga4w_reference_impl( - const at::Tensor& x, - const at::Tensor& weights_4x2, - const int64_t groupsize, - const at::Tensor& scales_and_zeros, - const int64_t inner_k_tiles) { - const std::vector original_x_size(x.sizes().vec()); - const size_t ndim = original_x_size.size(); - const int64_t out_features = weights_4x2.size(0); - const at::Tensor x_flattened = x.reshape({-1, original_x_size[ndim - 1]}); - at::Tensor out = at::_weight_int4pack_mm_for_cpu( - x_flattened, weights_4x2, groupsize, scales_and_zeros); - std::vector out_shape( - original_x_size.begin(), original_x_size.end()); - out_shape.at(ndim - 1) = out_features; - return out.reshape(out_shape); -} - -at::Tensor unpack_weights_4x2(const at::Tensor& weights_4x2) { - std::vector weights_shape(weights_4x2.sizes().vec()); - weights_shape[1] *= 2; - - at::Tensor weights_unpacked = - at::empty(weights_shape, at::device(at::kCPU).dtype(at::kInt)); - - const int64_t N = weights_unpacked.size(0); - const int64_t K = weights_unpacked.size(1); - - for (int n = 0; n < N; n++) { - for (int k = 0; k < K; k += 2) { - const uint8_t packed_val = weights_4x2[n][k / 2].item().to(); - const uint8_t second_val = packed_val & 0x0F; - const uint8_t first_val = (packed_val & 0xF0) >> 4; - - weights_unpacked[n][k] = int(first_val); - weights_unpacked[n][k + 1] = int(second_val); - } - } - - return weights_unpacked; -} - -at::Tensor dequantize_and_linear_qga4w( - const at::Tensor& x, - const at::Tensor& weights_4x2, - const int64_t groupsize, - const at::Tensor& scales_and_zeros, - const int64_t inner_k_tiles) { - std::vector weights_shape(weights_4x2.sizes().vec()); - weights_shape[1] *= 2; - - at::Tensor weights_dequantized = - at::empty(weights_shape, at::device(at::kCPU).dtype(at::kFloat)); - - const int64_t N = weights_dequantized.size(0); - const int64_t K = weights_dequantized.size(1); - - const int k_groups = K / groupsize; - for (int n = 0; n < N; n++) { - for (int k = 0; k < K; k += 2) { - const int group_idx = k / groupsize; - // const int scale_idx = k_groups * n + group_idx; - const uint8_t packed_val = weights_4x2[n][k / 2].item().to(); - const uint8_t second_val = packed_val & 0x0F; - const uint8_t first_val = (packed_val & 0xF0) >> 4; - - const float scale = scales_and_zeros[group_idx][n][0].item().to(); - const float zero = scales_and_zeros[group_idx][n][1].item().to(); - - weights_dequantized[n][k] = (float(first_val) - 8.0) * scale + zero; - weights_dequantized[n][k + 1] = (float(second_val) - 8.0) * scale + zero; - } - } - - return at::linear(x, weights_dequantized); -} - -at::Tensor dequantize_and_linear_qcs4w( - const at::Tensor& x, - const at::Tensor& weights_4x2, - const at::Tensor& scales) { - std::vector weights_shape(weights_4x2.sizes().vec()); - weights_shape[1] *= 2; - - at::Tensor weights_dequantized = - 
at::empty(weights_shape, at::device(at::kCPU).dtype(at::kFloat)); - - const int64_t N = weights_dequantized.size(0); - const int64_t K = weights_dequantized.size(1); - - for (int n = 0; n < N; n++) { - for (int k = 0; k < K; k += 2) { - // const int scale_idx = k_groups * n + group_idx; - const uint8_t packed_val = weights_4x2[n][k / 2].item().to(); - const uint8_t second_val = packed_val & 0x0F; - const uint8_t first_val = (packed_val & 0xF0) >> 4; - - const float scale = scales[n].item().to(); - - weights_dequantized[n][k] = (float(first_val) - 8.0) * scale; - weights_dequantized[n][k + 1] = (float(second_val) - 8.0) * scale; - } - } - - return at::linear(x, weights_dequantized); -} - -at::Tensor linear_qcs4w_reference_impl( - const at::Tensor& x, - const at::Tensor& weights_4x2, - const at::Tensor& scales) { - const std::vector original_x_size(x.sizes().vec()); - const size_t ndim = original_x_size.size(); - const int64_t out_features = weights_4x2.size(0); - const at::Tensor x_flattened = x.reshape({-1, original_x_size[ndim - 1]}); - - const at::Tensor weights_unpacked = - (unpack_weights_4x2(weights_4x2) - 8).to(at::kChar); - at::Tensor out = - at::_weight_int8pack_mm(x_flattened, weights_unpacked, scales); - - std::vector out_shape( - original_x_size.begin(), original_x_size.end()); - out_shape.at(ndim - 1) = out_features; - return out.reshape(out_shape); -} - -at::Tensor linear_qta8a_qga4w_quantized_matmul( - const at::Tensor& quantized_input, // [B, M, K] int8 quantized input - const at::Tensor& input_scale, // [B*M] per-token input scales - const at::Tensor& input_zero_point, // [B*M] per-token input zero points - const at::Tensor& weights_4x2, // [N, K/2] 4-bit packed weights - const int64_t group_size, // Group size for weight quantization - const at::Tensor& weight_scales, // [K/group_size, N] weight scales - const at::Tensor& weight_zeros) { // [K/group_size, N] weight zeros - - const int64_t B = quantized_input.size(0); - const int64_t M = quantized_input.size(1); - const int64_t K = quantized_input.size(2); - const int64_t N = weights_4x2.size(0); - - // Create output tensor for floating point results - at::Tensor float_output = - at::zeros({B, M, N}, at::device(at::kCPU).dtype(at::kFloat)); - - // Accessors for efficient access - auto input_accessor = quantized_input.accessor(); - auto output_accessor = float_output.accessor(); - auto weights_accessor = weights_4x2.accessor(); - auto weight_scales_accessor = weight_scales.accessor(); - auto weight_zeros_accessor = weight_zeros.accessor(); - auto input_scale_accessor = input_scale.accessor(); - auto input_zero_accessor = input_zero_point.accessor(); - - // Perform quantized matrix multiplication following quantization.md equation - // (5): result_real_value = lhs_scale * rhs_scale * Sum_over_k( - // (lhs_quantized_value[k] - lhs_zero_point) * - // (rhs_quantized_value[k] - rhs_zero_point) - // ) - for (int64_t b = 0; b < B; b++) { - for (int64_t m = 0; m < M; m++) { - const int64_t token_idx = b * M + m; - const float lhs_scale = - input_scale_accessor[token_idx]; // Per-token input scale - const int32_t lhs_zero_point = - input_zero_accessor[token_idx]; // Per-token input zero point - - for (int64_t n = 0; n < N; n++) { - float result_real_value = 0.0f; - - for (int64_t k = 0; k < K; k++) { - // Get per-group weight quantization parameters - const int64_t group_idx = k / group_size; - const float rhs_scale = - weight_scales_accessor[group_idx][n]; // Per-group weight scale - const int32_t rhs_zero_point = - 
weight_zeros_accessor[group_idx] - [n]; // Per-group weight zero point - - // Unpack the 4-bit weight for this position - const uint8_t packed_val = weights_accessor[n][k / 2]; - uint8_t weight_4bit; - if (k % 2 == 0) { - weight_4bit = (packed_val & 0xF0) >> 4; // First weight in pair - } else { - weight_4bit = packed_val & 0x0F; // Second weight in pair - } - - // Get quantized values - const int32_t lhs_quantized_value = - static_cast(input_accessor[b][m][k]); - // Convert 4-bit weight to signed: subtract 8 to get range [-8, 7] - const int32_t rhs_quantized_value = - static_cast(weight_4bit) - 8; - - // Apply proper quantization paradigm from quantization.md equation - // (3): real_value = scale * (quantized_value - zero_point) Following - // equation (5): result = lhs_scale * rhs_scale * - // (lhs_quantized - lhs_zero) * (rhs_quantized - rhs_zero) - const float lhs_diff = - static_cast(lhs_quantized_value - lhs_zero_point); - const float rhs_diff = - static_cast(rhs_quantized_value - rhs_zero_point); - - result_real_value += lhs_scale * rhs_scale * lhs_diff * rhs_diff; - } - - output_accessor[b][m][n] = result_real_value; - } - } - } - - return float_output; -} - -at::Tensor linear_qta8a_qga4w_4bit_dequant_impl( - const at::Tensor& quantized_input, - const at::Tensor& input_scale, - const at::Tensor& input_zero_point, - const at::Tensor& weights_4x2, - const int64_t group_size, - const at::Tensor& weight_scales, - const at::Tensor& weight_zeros) { - // Calculate number of input tokens - int64_t input_num_tokens = 1; - for (size_t i = 0; i < quantized_input.sizes().size() - 1; i++) { - input_num_tokens *= quantized_input.size(i); - } - - // Manually dequantize the char tensor using per-token quantization - at::Tensor x_float = at::zeros_like(quantized_input, at::kFloat); - - // Apply per-token dequantization - auto input_accessor = quantized_input.accessor(); - auto output_accessor = x_float.accessor(); - - for (int64_t token_idx = 0; token_idx < input_num_tokens; token_idx++) { - float scale_val = input_scale[token_idx].item(); - int zero_point_val = input_zero_point[token_idx].item(); - - // Calculate batch and sequence indices for this token - int64_t b = token_idx / quantized_input.size(1); - int64_t m = token_idx % quantized_input.size(1); - - // Apply dequantization for all features in this token - for (int64_t k = 0; k < quantized_input.size(-1); k++) { - float dequant_val = - (input_accessor[b][m][k] - zero_point_val) * scale_val; - output_accessor[b][m][k] = dequant_val; - } - } - - std::vector weights_shape(weights_4x2.sizes().vec()); - weights_shape[1] *= 2; - - at::Tensor weights_dequantized = - at::empty(weights_shape, at::device(at::kCPU).dtype(at::kFloat)); - - const int64_t N = weights_dequantized.size(0); - const int64_t K = weights_dequantized.size(1); - - for (int n = 0; n < N; n++) { - for (int k = 0; k < K; k += 2) { - const int group_idx = k / group_size; - const uint8_t packed_val = weights_4x2[n][k / 2].item().to(); - const uint8_t second_val = packed_val & 0x0F; - const uint8_t first_val = (packed_val & 0xF0) >> 4; - - const float scale = weight_scales[group_idx][n].item().to(); - const int zero = weight_zeros[group_idx][n].item().to(); - - weights_dequantized[n][k] = - ((float(first_val) - 8.0) - float(zero)) * scale; - weights_dequantized[n][k + 1] = - ((float(second_val) - 8.0) - float(zero)) * scale; - } - } - - at::Tensor linear_result = at::linear(x_float, weights_dequantized); - - return linear_result; -} - -// -// Test functions -// - -void 
test_reference_linear_qga4w( - const int B, - const int M, - const int K, - const int N, - const int group_size = 32, - const int inner_k_tiles = 8) { - assert(K % group_size == 0); - - at::Tensor x = at::rand({B, M, K}, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor weights_4x2 = - at::randint(0, 256, {N, K / 2}, at::device(at::kCPU).dtype(at::kByte)); - at::Tensor weights_int = unpack_weights_4x2(weights_4x2); - - const int k_groups = K / group_size; - at::Tensor scales_and_zeros = - at::rand({k_groups, N, 2}, at::device(at::kCPU).dtype(at::kFloat)); - - at::Tensor out = linear_qga4w_reference_impl( - x, - at::_convert_weight_to_int4pack_for_cpu(weights_int, group_size), - group_size, - scales_and_zeros, - inner_k_tiles); - - at::Tensor out_ref = dequantize_and_linear_qga4w( - x, weights_4x2, group_size, scales_and_zeros, inner_k_tiles); - - ASSERT_TRUE(at::allclose(out, out_ref)); -} - -void test_reference_linear_qcs4w( - const int B, - const int M, - const int K, - const int N) { - at::Tensor x = at::rand({B, M, K}, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor weights_4x2 = - at::randint(0, 256, {N, K / 2}, at::device(at::kCPU).dtype(at::kByte)); - at::Tensor weights_int = unpack_weights_4x2(weights_4x2); - - at::Tensor scales = at::rand({N}, at::device(at::kCPU).dtype(at::kFloat)); - - at::Tensor out = linear_qcs4w_reference_impl(x, weights_4x2, scales); - - at::Tensor out_ref = dequantize_and_linear_qcs4w(x, weights_4x2, scales); - - ASSERT_TRUE(at::allclose(out, out_ref)); -} - -void test_vulkan_linear_qga4w_impl( - const int B, - const int M, - const int K, - const int N, - const int group_size = 32, - const int inner_k_tiles = 8, - const vkcompute::utils::StorageType in_storage = - vkcompute::utils::kTexture3D, - const vkcompute::utils::StorageType out_storage = - vkcompute::utils::kTexture3D) { - assert(K % group_size == 0); - - at::Tensor x = at::rand({B, M, K}, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor weights_4x2 = - at::randint(0, 256, {N, K / 2}, at::device(at::kCPU).dtype(at::kByte)); - - const int k_groups = K / group_size; - at::Tensor scales_and_zeros = - at::rand({k_groups, N, 2}, at::device(at::kCPU).dtype(at::kFloat)); - - at::Tensor weights_int = unpack_weights_4x2(weights_4x2); - at::Tensor out_ref = linear_qga4w_reference_impl( - x, - at::_convert_weight_to_int4pack_for_cpu(weights_int, group_size), - group_size, - scales_and_zeros, - inner_k_tiles); - - // Build Vulkan graph - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(utils::kTexture3D); - ComputeGraph graph(config); - -#define MAKE_TENSORREF_FOR(x) \ - ValueRef r_##x = graph.add_tensorref( \ - x.sizes().vec(), \ - from_at_scalartype(x.scalar_type()), \ - x.const_data_ptr()); - - MAKE_TENSORREF_FOR(weights_4x2); - MAKE_TENSORREF_FOR(scales_and_zeros); - - IOValueRef r_x = graph.add_input_tensor( - x.sizes().vec(), from_at_scalartype(x.scalar_type()), in_storage); - - const ValueRef r_out = graph.add_tensor( - out_ref.sizes().vec(), - from_at_scalartype(out_ref.scalar_type()), - out_storage); - - VK_GET_OP_FN("et_vk.linear_weight_int4.default") - (graph, - {r_x.value, - r_weights_4x2, - graph.add_scalar(group_size), - r_scales_and_zeros, - kDummyValueRef, - r_out}); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - - graph.prepack(); - - // - // Run model - // - - graph.propagate_resize(); - graph.copy_into_staging(r_x.staging, x.const_data_ptr(), x.numel()); - - graph.execute(); - - at::Tensor vk_out = 
at::empty_like(out_ref); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - ASSERT_TRUE(at::allclose(vk_out, out_ref, 1e-4, 1e-4)); -} - -void test_vulkan_linear_qga4w( - const int B, - const int M, - const int K, - const int N, - const int group_size = 32, - const int inner_k_tiles = 8) { - test_vulkan_linear_qga4w_impl( - B, - M, - K, - N, - group_size, - inner_k_tiles, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - test_vulkan_linear_qga4w_impl( - B, - M, - K, - N, - group_size, - inner_k_tiles, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -void test_vulkan_linear_qcs4w_impl( - const int B, - const int M, - const int K, - const int N, - const vkcompute::utils::StorageType in_storage = - vkcompute::utils::kTexture3D, - const vkcompute::utils::StorageType out_storage = - vkcompute::utils::kTexture3D) { - at::Tensor x = at::rand({B, M, K}, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor weights_4x2 = - at::randint(0, 256, {N, K / 2}, at::device(at::kCPU).dtype(at::kByte)); - - at::Tensor scales = at::rand({N}, at::device(at::kCPU).dtype(at::kFloat)); - - at::Tensor out_ref = linear_qcs4w_reference_impl(x, weights_4x2, scales); - - // Build Vulkan graph - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(utils::kTexture3D); - ComputeGraph graph(config); - -#define MAKE_TENSORREF_FOR(x) \ - ValueRef r_##x = graph.add_tensorref( \ - x.sizes().vec(), \ - from_at_scalartype(x.scalar_type()), \ - x.const_data_ptr()); - - MAKE_TENSORREF_FOR(weights_4x2); - MAKE_TENSORREF_FOR(scales); - - IOValueRef r_x = graph.add_input_tensor( - x.sizes().vec(), from_at_scalartype(x.scalar_type()), in_storage); - - const ValueRef r_out = graph.add_tensor( - out_ref.sizes().vec(), - from_at_scalartype(out_ref.scalar_type()), - out_storage); - - VK_GET_OP_FN("et_vk.linear_qcs4w.default") - (graph, {r_x.value, r_weights_4x2, r_scales, r_out}); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - - graph.prepack(); - - // - // Run model - // - - graph.propagate_resize(); - graph.copy_into_staging(r_x.staging, x.const_data_ptr(), x.numel()); - - graph.execute(); - - at::Tensor vk_out = at::empty_like(out_ref); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - ASSERT_TRUE(at::allclose(vk_out, out_ref, 1e-4, 1e-4)); -} - -void test_vulkan_linear_qcs4w( - const int B, - const int M, - const int K, - const int N) { - test_vulkan_linear_qcs4w_impl( - B, M, K, N, vkcompute::utils::kBuffer, vkcompute::utils::kBuffer); - - test_vulkan_linear_qcs4w_impl( - B, M, K, N, vkcompute::utils::kTexture3D, vkcompute::utils::kTexture3D); -} - -void test_vulkan_linear_qta8a_qga4w_impl( - const int B, - const int M, - const int K, - const int N, - const int group_size = 8, - const vkcompute::utils::StorageType in_storage = - vkcompute::utils::kTexture3D, - const vkcompute::utils::StorageType out_storage = - vkcompute::utils::kTexture3D) { - assert(K % group_size == 0); - - const int64_t input_num_tokens = B * M; - const int k_groups = K / group_size; - - at::Tensor input_scale = - at::rand({input_num_tokens}, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor input_zero_point = at::randint( - -10, 10, {input_num_tokens}, at::device(at::kCPU).dtype(at::kInt)); - - at::Tensor float_x = - at::rand({B, M, K}, at::device(at::kCPU).dtype(at::kFloat)); - - // Create a reference quantized tensor using per-token quantization - // Mimic per-token 
quantization using at::quantize_per_channel by reshaping - // [num_tokens, features] - at::Tensor float_x_reshaped = float_x.view({input_num_tokens, K}); - at::Tensor qx_ref_reshaped = at::quantize_per_channel( - float_x_reshaped, - input_scale.to(at::kDouble), - input_zero_point.to(at::kLong), - 0, // axis 0 for per-token (first dimension after reshape) - c10::ScalarType::QInt8); - - at::Tensor x = - at::int_repr(qx_ref_reshaped).view(float_x.sizes()).to(at::kChar); - - at::Tensor weights_4x2 = - at::randint(0, 256, {N, K / 2}, at::device(at::kCPU).dtype(at::kByte)); - at::Tensor weight_scales = - at::rand({k_groups, N}, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor weight_zeros = at::randint( - -128, 128, {k_groups, N}, at::device(at::kCPU).dtype(at::kInt)); - - at::Tensor out_ref = linear_qta8a_qga4w_4bit_dequant_impl( - x, - input_scale, - input_zero_point, - weights_4x2, - group_size, - weight_scales, - weight_zeros); - - // Build Vulkan graph - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(utils::kTexture3D); - ComputeGraph graph(config); - -#define MAKE_TENSORREF_FOR(x) \ - ValueRef r_##x = graph.add_tensorref( \ - x.sizes().vec(), \ - from_at_scalartype(x.scalar_type()), \ - x.const_data_ptr()); - - MAKE_TENSORREF_FOR(weights_4x2); - MAKE_TENSORREF_FOR(weight_scales); - MAKE_TENSORREF_FOR(weight_zeros); - - IOValueRef r_x = graph.add_input_tensor( - x.sizes().vec(), from_at_scalartype(x.scalar_type()), in_storage); - - IOValueRef r_input_scale = graph.add_input_tensor( - input_scale.sizes().vec(), - from_at_scalartype(input_scale.scalar_type()), - utils::kBuffer); - - IOValueRef r_input_zero_point = graph.add_input_tensor( - input_zero_point.sizes().vec(), - from_at_scalartype(input_zero_point.scalar_type()), - utils::kBuffer); - - const ValueRef r_out = graph.add_tensor( - out_ref.sizes().vec(), - from_at_scalartype(out_ref.scalar_type()), - out_storage); - - VK_GET_OP_FN("et_vk.linear_qta8a_qga4w.default") - (graph, - {r_x.value, - r_input_scale.value, - r_input_zero_point.value, - r_weights_4x2, - graph.add_scalar(group_size), - r_weight_scales, - r_weight_zeros, - r_out}); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - - graph.prepack(); - - // - // Run model - // - - graph.propagate_resize(); - graph.copy_into_staging(r_x.staging, x.const_data_ptr(), x.numel()); - graph.copy_into_staging( - r_input_scale.staging, input_scale.const_data_ptr(), input_scale.numel()); - graph.copy_into_staging( - r_input_zero_point.staging, - input_zero_point.const_data_ptr(), - input_zero_point.numel()); - - graph.execute(); - - at::Tensor vk_out = at::empty_like(out_ref); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - // This is a reference implementation that uses the quantized - // matmul paradigm. It should follow closely with how the vulkan - // implementation works, and demonstrates reasonably close results. 
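To make the accumulation described in the comment above concrete, here is a minimal standalone C++ sketch (illustrative only, not part of the deleted test file; the function name and example values are hypothetical) of the per-element reference computation: one output value is lhs_scale * rhs_scale * Σ_k (lhs_q[k] − lhs_zp) * (rhs_q[k] − rhs_zp), with each packed byte holding two 4-bit weights (high nibble for even k), values offset by 8 into [-8, 7], and per-group weight scale/zero-point indexed by k / group_size, matching the packing used by the reference implementation above.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Reference accumulation for one (token, output-channel) pair, assuming the
// same packing as the deleted test: two 4-bit weights per byte, high nibble
// for even k, mapped from [0, 15] to [-8, 7] by subtracting 8.
float quantized_dot(
    const std::vector<int8_t>& lhs_q, // K int8 activations for one token
    float lhs_scale,
    int32_t lhs_zero_point,
    const std::vector<uint8_t>& rhs_packed, // K / 2 bytes of packed 4-bit weights
    const std::vector<float>& rhs_scales, // K / group_size per-group scales
    const std::vector<int32_t>& rhs_zeros, // K / group_size per-group zero points
    size_t group_size) {
  float acc = 0.0f;
  for (size_t k = 0; k < lhs_q.size(); ++k) {
    const size_t group_idx = k / group_size;
    const uint8_t packed = rhs_packed[k / 2];
    const uint8_t nibble = (k % 2 == 0) ? (packed >> 4) : (packed & 0x0F);
    const int32_t rhs_q = static_cast<int32_t>(nibble) - 8;
    const float lhs_diff = static_cast<float>(lhs_q[k] - lhs_zero_point);
    const float rhs_diff = static_cast<float>(rhs_q - rhs_zeros[group_idx]);
    acc += lhs_scale * rhs_scales[group_idx] * lhs_diff * rhs_diff;
  }
  return acc;
}

int main() {
  // K = 4 with group_size = 2: packed bytes 0x9A and 0x07 decode to the
  // quantized weights {1, 2, -8, -1}.
  const std::vector<int8_t> lhs_q = {12, -3, 7, 0};
  const std::vector<uint8_t> rhs_packed = {0x9A, 0x07};
  const std::vector<float> rhs_scales = {0.5f, 0.25f};
  const std::vector<int32_t> rhs_zeros = {0, 1};
  std::printf(
      "%f\n",
      quantized_dot(lhs_q, 0.1f, 2, rhs_packed, rhs_scales, rhs_zeros, 2));
  return 0;
}
```

The full reference simply applies this accumulation over every (batch, token, output-channel) triple, which is what the loop nest in linear_qta8a_qga4w_quantized_matmul above does.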
- at::Tensor qmm_ref = linear_qta8a_qga4w_quantized_matmul( - x, - input_scale, - input_zero_point, - weights_4x2, - group_size, - weight_scales, - weight_zeros); - - // For quantized int8 operations, allow for 1-unit differences due to rounding - bool is_close = at::allclose(vk_out, out_ref, 5e-3, 5e-3); - if (!is_close) { - std::cout << "qmm_ref: \n" << qmm_ref << std::endl; - std::cout << "out_ref: \n" << out_ref << std::endl; - std::cout << "vk_out: \n" << vk_out << std::endl; - } - - ASSERT_TRUE(is_close); -} - -void test_vulkan_linear_qta8a_qga4w( - const int B, - const int M, - const int K, - const int N, - const int group_size = 32) { - test_vulkan_linear_qta8a_qga4w_impl( - B, - M, - K, - N, - group_size, - vkcompute::utils::kBuffer, - vkcompute::utils::kBuffer); - - test_vulkan_linear_qta8a_qga4w_impl( - B, - M, - K, - N, - group_size, - vkcompute::utils::kTexture3D, - vkcompute::utils::kTexture3D); -} - -// Test linear_qga4w operator - -TEST(VulkanLinearQGA4WTest, test_reference_impl) { - test_reference_linear_qga4w( - /*B = */ 1, - /*M = */ 4, - /*K = */ 128, - /*N = */ 32); -} - -TEST(VulkanLinearQGA4WTest, test_vulkan_impl_small_m) { - test_vulkan_linear_qga4w( - /*B = */ 1, - /*M = */ 4, - /*K = */ 128, - /*N = */ 32); - - test_vulkan_linear_qga4w( - /*B = */ 1, - /*M = */ 1, - /*K = */ 256, - /*N = */ 256); -} - -TEST(VulkanLinearQGA4WTest, test_vulkan_impl_gemm) { - test_vulkan_linear_qga4w( - /*B = */ 1, - /*M = */ 256, - /*K = */ 256, - /*N = */ 256); -} - -// Test linear_qcs4w operator - -TEST_F(VulkanLinearQCS4WTest, test_reference_impl) { - test_reference_linear_qcs4w( - /*B = */ 1, - /*M = */ 4, - /*K = */ 128, - /*N = */ 32); -} - -TEST_F(VulkanLinearQCS4WTest, test_vulkan_impl_small_m) { - test_vulkan_linear_qcs4w( - /*B = */ 1, - /*M = */ 4, - /*K = */ 128, - /*N = */ 32); - - test_vulkan_linear_qcs4w( - /*B = */ 1, - /*M = */ 1, - /*K = */ 256, - /*N = */ 256); -} - -TEST_F(VulkanLinearQCS4WTest, test_vulkan_impl_gemm) { - test_vulkan_linear_qcs4w( - /*B = */ 1, - /*M = */ 32, - /*K = */ 32, - /*N = */ 32); -} - -// Test linear_qta8a_qga4w operator - -TEST_F( - VulkanLinearQTA8AQGA4WTest, - test_vulkan_linear_quant_gemm_custom_groupsize) { - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 2, - /*K = */ 8, - /*N = */ 8, - /*group_size = */ 8); - - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 2, - /*K = */ 16, - /*N = */ 8, - /*group_size = */ 8); -} - -TEST_F(VulkanLinearQTA8AQGA4WTest, test_vulkan_linear_quant_gemm) { - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 4, - /*K = */ 64, - /*N = */ 32); - - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 4, - /*K = */ 128, - /*N = */ 32); - - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 8, - /*K = */ 64, - /*N = */ 16); - - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 256, - /*K = */ 256, - /*N = */ 256); -} - -TEST_F( - VulkanLinearQTA8AQGA4WTest, - test_vulkan_linear_quant_gemv_custom_groupsize) { - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 1, - /*K = */ 8, - /*N = */ 8, - /*group_size = */ 8); - - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 1, - /*K = */ 16, - /*N = */ 8, - /*group_size = */ 8); -} - -TEST_F(VulkanLinearQTA8AQGA4WTest, test_vulkan_linear_quant_gemv) { - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 1, - /*K = */ 32, - /*N = */ 32); - - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 1, - /*K = */ 64, - /*N = */ 16); - - test_vulkan_linear_qta8a_qga4w( - /*B = */ 1, - /*M = */ 1, - /*K = 
*/ 256, - /*N = */ 256); -} diff --git a/backends/vulkan/test/op_tests/rotary_embedding_test.cpp b/backends/vulkan/test/op_tests/rotary_embedding_test.cpp deleted file mode 100644 index 9f9bdef24aa..00000000000 --- a/backends/vulkan/test/op_tests/rotary_embedding_test.cpp +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include -#include - -#include "test_utils.h" - -#include - -// -// Reference Implementations -// - -std::pair rotary_embedding_impl( - const at::Tensor& xq, - const at::Tensor& xk, - const at::Tensor& freqs_cos, - const at::Tensor& freqs_sin) { - std::vector xq_even_odd = at::unbind( - xq.reshape({xq.size(0), xq.size(1), xq.size(2), xq.size(3) / 2, 2}), -1); - at::Tensor& xq_r = xq_even_odd[0]; - at::Tensor& xq_i = xq_even_odd[1]; - - std::vector xk_even_odd = at::unbind( - xk.reshape({xk.size(0), xk.size(1), xk.size(2), xk.size(3) / 2, 2}), -1); - at::Tensor& xk_r = xk_even_odd[0]; - at::Tensor& xk_i = xk_even_odd[1]; - - at::Tensor freqs_cos_reshape = - freqs_cos.reshape({1, freqs_cos.size(0), 1, freqs_cos.size(1)}); - at::Tensor freqs_sin_reshape = - freqs_sin.reshape({1, freqs_sin.size(0), 1, freqs_sin.size(1)}); - - at::Tensor xq_out_r = xq_r * freqs_cos_reshape - xq_i * freqs_sin_reshape; - at::Tensor xq_out_i = xq_r * freqs_sin_reshape + xq_i * freqs_cos_reshape; - at::Tensor xk_out_r = xk_r * freqs_cos_reshape - xk_i * freqs_sin_reshape; - at::Tensor xk_out_i = xk_r * freqs_sin_reshape + xk_i * freqs_cos_reshape; - - at::Tensor xq_out = at::flatten(at::stack({xq_out_r, xq_out_i}, -1), 3); - at::Tensor xk_out = at::flatten(at::stack({xk_out_r, xk_out_i}, -1), 3); - - return std::make_pair(xq_out, xk_out); -} - -// -// Test functions -// - -void test_reference( - const int n_heads = 4, - const int n_kv_heads = 2, - const int dim = 32, - const int seq_len = 1) { - const int head_dim = dim / n_heads; - - at::Tensor xq = at::rand( - {1, seq_len, n_heads, head_dim}, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor xk = at::rand( - {1, seq_len, n_kv_heads, head_dim}, - at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor freqs_cos = - at::rand({seq_len, head_dim / 2}, at::device(at::kCPU).dtype(at::kFloat)); - at::Tensor freqs_sin = - at::rand({seq_len, head_dim / 2}, at::device(at::kCPU).dtype(at::kFloat)); - - std::pair outs = - rotary_embedding_impl(xq, xk, freqs_cos, freqs_sin); - at::Tensor& xq_out = outs.first; - at::Tensor& xk_out = outs.second; - - // Build Vulkan graph - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(utils::kTexture3D); - ComputeGraph graph(config); - -#define MAKE_INPUT_FOR(x) \ - IOValueRef r_##x = graph.add_input_tensor( \ - x.sizes().vec(), from_at_scalartype(x.scalar_type())); - - MAKE_INPUT_FOR(xq); - MAKE_INPUT_FOR(xk); - MAKE_INPUT_FOR(freqs_cos); - MAKE_INPUT_FOR(freqs_sin); - - const ValueRef r_xq_out = graph.add_tensor( - xq_out.sizes().vec(), from_at_scalartype(xq_out.scalar_type())); - const ValueRef r_xk_out = graph.add_tensor( - xk_out.sizes().vec(), from_at_scalartype(xk_out.scalar_type())); - - VK_GET_OP_FN("et_vk.apply_rotary_emb.default") - (graph, - {r_xq.value, - r_xk.value, - r_freqs_cos.value, - r_freqs_sin.value, - graph.add_value_list({r_xq_out, r_xk_out})}); - - ValueRef staging_xq_out = graph.set_output_tensor(r_xq_out); - ValueRef 
staging_xk_out = graph.set_output_tensor(r_xk_out); - - graph.prepare(); - - graph.prepack(); - - // - // Run model - // - - graph.propagate_resize(); - graph.copy_into_staging(r_xq.staging, xq.const_data_ptr(), xq.numel()); - graph.copy_into_staging(r_xk.staging, xk.const_data_ptr(), xk.numel()); - graph.copy_into_staging( - r_freqs_cos.staging, freqs_cos.const_data_ptr(), freqs_cos.numel()); - graph.copy_into_staging( - r_freqs_sin.staging, freqs_sin.const_data_ptr(), freqs_sin.numel()); - - graph.execute(); - - at::Tensor vk_xq_out = at::empty_like(xq_out); - graph.copy_from_staging( - staging_xq_out, vk_xq_out.mutable_data_ptr(), vk_xq_out.numel()); - - at::Tensor vk_xk_out = at::empty_like(xk_out); - graph.copy_from_staging( - staging_xk_out, vk_xk_out.mutable_data_ptr(), vk_xk_out.numel()); - - EXPECT_TRUE(at::allclose(xq_out, vk_xq_out, 1e-4, 1e-4)); - EXPECT_TRUE(at::allclose(xk_out, vk_xk_out, 1e-4, 1e-4)); -} - -TEST(VulkanRotaryEmbeddingTest, rotary_embedding_test) { - test_reference(); -} - -TEST(VulkanRotaryEmbeddingTest, rotary_embedding_llama3_params_test) { - test_reference( - /*n_heads=*/32, - /*n_kv_heads=*/8, - /*dim=*/2048); -} - -TEST(VulkanRotaryEmbeddingTest, rotary_embedding_llama3_params_test_seq_len_3) { - test_reference( - /*n_heads=*/32, - /*n_kv_heads=*/8, - /*dim=*/2048, - /*seq_len=*/3); -} diff --git a/backends/vulkan/test/op_tests/sdpa_test.cpp b/backends/vulkan/test/op_tests/sdpa_test.cpp deleted file mode 100644 index e4b3f662c04..00000000000 --- a/backends/vulkan/test/op_tests/sdpa_test.cpp +++ /dev/null @@ -1,839 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include -#include -#include - -#include -#include -#include - -#include "test_utils.h" - -#include -#include - -namespace torch { -namespace executor { -namespace native { - -// The below are copied from executorch/extension/llm/custom_ops/op_sdpa_aot.cpp -// They are needed because the original definitions are inaccessible due to -// being defined in an anonymous namespace. 
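The reference SDPA path further down builds a causal additive mask with ATen ops (construct_attention_mask and convert_boolean_attn_mask below). As a plain-C++ illustration of that mask, here is a small hedged sketch (illustrative only, not part of the deleted test; the function and example values are hypothetical): rows cover query positions [start_pos, start_pos + seq_len), and an entry is 0 where the key position does not exceed the query position and -inf where attention must be masked out.

```cpp
#include <cstdio>
#include <limits>
#include <vector>

// Builds the additive causal mask for seq_len query rows starting at
// start_pos, over start_pos + seq_len key columns: 0 where attention is
// allowed (key position <= query position), -inf where it is masked out.
std::vector<std::vector<float>> causal_additive_mask(int start_pos, int seq_len) {
  const int length = start_pos + seq_len;
  const float neg_inf = -std::numeric_limits<float>::infinity();
  std::vector<std::vector<float>> mask(
      seq_len, std::vector<float>(length, neg_inf));
  for (int row = 0; row < seq_len; ++row) {
    const int query_pos = start_pos + row;
    for (int col = 0; col <= query_pos; ++col) {
      mask[row][col] = 0.0f; // lower-triangular entries remain unmasked
    }
  }
  return mask;
}

int main() {
  // start_pos = 2, seq_len = 2 -> a 2 x 4 mask.
  for (const auto& row : causal_additive_mask(2, 2)) {
    for (const float v : row) {
      std::printf("%6.1f ", v);
    }
    std::printf("\n");
  }
  return 0;
}
```

The ATen-based helpers below produce the same pattern, with the additional behavior that a non-boolean attn_mask is passed through unchanged as an additive mask.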
- -Tensor& sdpa_with_kv_cache_out_no_context( - const Tensor& q_projected, - const Tensor& k_projected, - const Tensor& v_projected, - Tensor& key_cache, - Tensor& value_cache, - const int64_t start_pos, - const int64_t seq_len, - // @lint-ignore CLANGTIDY facebook-hte-ConstantArgumentPassByValue - // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy - const optional attn_mask, - const double dropout_p, - const bool is_causal, - // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy - const optional scale, - Tensor& output) { - executorch::runtime::KernelRuntimeContext context{}; - return torch::executor::native::sdpa_with_kv_cache_out( - context, - q_projected, - k_projected, - v_projected, - key_cache, - value_cache, - start_pos, - seq_len, - attn_mask, - dropout_p, - is_causal, - scale, - output); -} - -at::Tensor sdpa_with_kv_cache_aten( - const at::Tensor& q_projected, - const at::Tensor& k_projected, - const at::Tensor& v_projected, - at::Tensor& key_cache, - at::Tensor& value_cache, - const int64_t start_pos, - const int64_t seq_len, - // @lint-ignore CLANGTIDY facebook-hte-ConstantArgumentPassByValue - // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy - const std::optional attn_mask, - const double dropout_p, - const bool is_causal, - // @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy - const std::optional scale) { - auto output = at::empty_like(q_projected); - WRAP_TO_ATEN(sdpa_with_kv_cache_out_no_context, 11) - (q_projected, - k_projected, - v_projected, - key_cache, - value_cache, - start_pos, - seq_len, - attn_mask, - dropout_p, - is_causal, - scale, - output); - return output; -} - -} // namespace native -} // namespace executor -} // namespace torch - -// -// Reference Implementation -// - -/* - * Converts a boolean mask to an additive mask. Values that are false are - * converted to -inf, and values that are true are converted to 0. - */ -at::Tensor convert_boolean_attn_mask( - const at::Tensor& attn_mask, - caffe2::TypeMeta dtype) { - // Convert boolean mask to additive mask; need to invert mask to indicate what - // to mask *out*. - if (attn_mask.dtype() == at::kBool) { - return at::where( - attn_mask.logical_not(), - -std::numeric_limits::infinity(), - at::scalar_tensor( - 0.0, at::TensorOptions().dtype(dtype).device(attn_mask.device()))); - } - // Otherwise, attn_mask represents an additive attention tensor - return attn_mask; -} - -/* - * Construct an attention mask for SDPA. - * 1. Construct a square matrix of ones with each dim equal to start_pos + - * seq_len - * 2. Keep the lower triangular elements as 1 and set the rest to 0 - * 3. Slice the mask to keep only seq_len rows starting from input_pos - * 4. 
Convert the mask to an additive mask - */ -at::Tensor construct_attention_mask( - const at::Tensor& q, - const at::Tensor& k_cache, - const int start_pos) { - const int max_seq_len = k_cache.size(1); - const int seq_len = q.size(1); - - const int length = start_pos + seq_len; - at::Tensor attn_mask_base = - at::ones({length, length}, q.options().dtype(at::kBool)).tril(); - - at::Tensor attn_mask_sliced = - at::slice(attn_mask_base, 0, start_pos, start_pos + seq_len); - - attn_mask_sliced = convert_boolean_attn_mask(attn_mask_sliced, q.dtype()); - return attn_mask_sliced; -} - -/* - * Reference implementation of SDPA - */ -at::Tensor sdpa_reference_impl( - const at::Tensor& q_projected, - const at::Tensor& k_projected, - const at::Tensor& v_projected, - at::Tensor& key_cache, - at::Tensor& value_cache, - const int64_t start_pos, - const int64_t seq_len, - const std::optional __attn_mask_ignored, - const double dropout_p, - const bool is_causal, - const std::optional scale) { - at::Tensor attn_mask = - construct_attention_mask(q_projected, key_cache, start_pos); - - // Cache update - at::Tensor key_cache_updated = at::slice_scatter( - key_cache, k_projected, 1, start_pos, start_pos + k_projected.size(1)); - at::Tensor value_cache_updated = at::slice_scatter( - value_cache, v_projected, 1, start_pos, start_pos + v_projected.size(1)); - - // Write back to input - key_cache = key_cache_updated; - value_cache = value_cache_updated; - - at::Tensor key_cache_sliced = - at::slice(key_cache_updated, 1, 0, start_pos + q_projected.size(1)); - - at::Tensor value_cache_sliced = - at::slice(value_cache_updated, 1, 0, start_pos + q_projected.size(1)); - - // Since n_heads may not be the same as n_kv_heads, the sliced k and v cache - // matrices need to be "expanded" to match - const int num_repeats = q_projected.size(2) / key_cache.size(2); - at::Tensor key_cache_sliced_repeated = - at::repeat_interleave(key_cache_sliced, num_repeats, 2); - at::Tensor value_cache_sliced_repeated = - at::repeat_interleave(value_cache_sliced, num_repeats, 2); - - at::Tensor q_transposed = q_projected.transpose(1, 2); - at::Tensor k_transposed = key_cache_sliced_repeated.transpose(1, 2); - at::Tensor v_transposed = value_cache_sliced_repeated.transpose(1, 2); - - at::Tensor k_transposed_2 = k_transposed.transpose(-2, -1); - at::Tensor attn_weight_prescale = at::matmul(q_transposed, k_transposed_2); - - float scale_factor = 1.0 / sqrt(q_transposed.size(-1)); - at::Tensor attn_weight = attn_weight_prescale * scale_factor + attn_mask; - - at::Tensor attn_weight_softmax = at::softmax(attn_weight, -1); - at::Tensor out = at::matmul(attn_weight_softmax, v_transposed); - - return out.transpose(1, 2); -} - -// -// Test functions -// - -void test_reference_sdpa( - const int start_input_pos, - const int sequence_len, - const int embedding_dim, - const int num_heads, - const int num_kv_heads, - const int batch_size, - const int max_seq_len, - at::ScalarType dtype = at::kFloat) { - const int head_dim = embedding_dim / num_heads; - - // K and V caches. 
Need an extra set for the reference implementation - - at::Tensor k_cache = at::zeros( - {batch_size, max_seq_len, num_kv_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - at::Tensor v_cache = at::zeros_like(k_cache); - - at::Tensor k_cache_ref = at::zeros_like(k_cache); - at::Tensor v_cache_ref = at::zeros_like(v_cache); - - for (int input_pos = start_input_pos; input_pos + sequence_len < max_seq_len; - input_pos += sequence_len) { - at::Tensor q = at::rand( - {batch_size, sequence_len, num_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - at::Tensor k = at::rand( - {batch_size, sequence_len, num_kv_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - at::Tensor v = at::rand_like(k); - - at::Tensor reference_impl_out = sdpa_reference_impl( - q, k, v, k_cache, v_cache, input_pos, sequence_len, {}, 0.0, true, {}); - - at::Tensor reference_out = torch::executor::native::sdpa_with_kv_cache_aten( - q, - k, - v, - k_cache_ref, - v_cache_ref, - input_pos, - sequence_len, - {}, - 0.0, - true, - {}); - - ASSERT_TRUE(at::allclose(reference_impl_out, reference_out)); - } -} - -void test_vulkan_sdpa( - const int start_input_pos, - const int base_sequence_len, - const int embedding_dim, - const int num_heads, - const int num_kv_heads, - const int batch_size, - const int max_seq_len, - const bool dynamic_seq_len = true, - at::ScalarType dtype = at::kFloat) { - const int head_dim = embedding_dim / num_heads; - - const int init_seq_len = dynamic_seq_len ? max_seq_len : base_sequence_len; - // K and V caches - - at::Tensor k_cache = at::zeros( - {batch_size, max_seq_len, num_kv_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - - at::Tensor v_cache = at::zeros_like(k_cache); - - // Reference input data - at::Tensor q = at::empty( - {batch_size, init_seq_len, num_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - at::Tensor k = at::empty( - {batch_size, init_seq_len, num_kv_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - at::Tensor v = at::empty_like(k); - - // Get reference output - at::Tensor out = at::empty_like(q); - - // Build Vulkan SDPA graph - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(utils::kTexture3D); - ComputeGraph graph(config); - - // "Data" variant for vulkan initialization - - at::Tensor k_cache_data = at::zeros_like(k_cache); - at::Tensor v_cache_data = at::zeros_like(v_cache); - -#define MAKE_TENSORREF_FOR(x) \ - ValueRef r_##x = graph.add_tensorref( \ - x.sizes().vec(), \ - from_at_scalartype(x.scalar_type()), \ - x.const_data_ptr()); - - MAKE_TENSORREF_FOR(k_cache_data); - MAKE_TENSORREF_FOR(v_cache_data); - -#define MAKE_INPUT_FOR(x) \ - IOValueRef r_##x = graph.add_input_tensor( \ - x.sizes().vec(), from_at_scalartype(x.scalar_type())); - - MAKE_INPUT_FOR(q); - MAKE_INPUT_FOR(k); - MAKE_INPUT_FOR(v); -#undef MAKE_INPUT_FOR - - const ValueRef r_input_pos_symint = graph.add_symint(start_input_pos); - const ValueRef r_out = graph.add_tensor( - out.sizes().vec(), from_at_scalartype(out.scalar_type())); - - VK_GET_OP_FN("sdpa_with_kv_cache.default") - (graph, - { - r_q.value, - r_k.value, - r_v.value, - r_k_cache_data, - r_v_cache_data, - r_input_pos_symint, - kDummyValueRef, // sequence_len - kDummyValueRef, // attn_mask - kDummyValueRef, // dropout_p - kDummyValueRef, // is_causal - kDummyValueRef, // scale - r_out, - }); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - - graph.prepack(); - - // - // Run model - // - -#define COPY_INPUT(x) \ - 
graph.copy_into_staging(r_##x.staging, x.const_data_ptr(), x.numel()); - -#define EXTRACT_TENSOR(x) \ - at::Tensor vk_##x = at::zeros_like(x).contiguous(); \ - graph.copy_from_staging( \ - staging_##x, vk_##x.mutable_data_ptr(), vk_##x.numel()); - - int seq_len = base_sequence_len; - for (int i = 0, input_pos = start_input_pos; - input_pos + seq_len < max_seq_len; - input_pos += seq_len, i++) { - q = at::rand( - {batch_size, seq_len, num_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - k = at::rand( - {batch_size, seq_len, num_kv_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - v = at::rand_like(k); - - at::Tensor reference_out = sdpa_reference_impl( - q, k, v, k_cache, v_cache, input_pos, seq_len, {}, 0.0, true, {}); - - graph.set_symint(r_input_pos_symint, input_pos); - graph.resize_input(0, q.sizes().vec()); - graph.resize_input(1, k.sizes().vec()); - graph.resize_input(2, v.sizes().vec()); - graph.propagate_resize(); - - // Run Vulkan SDPA - COPY_INPUT(q); - COPY_INPUT(k); - COPY_INPUT(v); - - graph.execute(); - - out = at::empty_like(q); - EXTRACT_TENSOR(out); - - const bool output_correct = at::allclose(reference_out, vk_out); - if (!output_correct) { - at::Tensor diffs = at::abs(reference_out - vk_out); - - std::cout << "Failed at input_pos " << input_pos << " with seq_len " - << seq_len << std::endl; - - std::cout << "Maximum difference: " << std::endl; - std::cout << at::max(diffs).item() << std::endl; - std::cout << "Found at index " << std::endl; - std::cout << at::argmax(diffs).item() << std::endl; - - std::cout << "Maximum value observed: " << std::endl; - std::cout << at::max(at::abs(at::cat({reference_out, vk_out}, -1))).item() - << std::endl; - } - ASSERT_TRUE(output_correct); - - if (dynamic_seq_len) { - seq_len = base_sequence_len + (i % 3); - } - } -} - -TEST(VulkanSDPATest, test_sdpa_op_small_params) { - const int starting_input_pos = 0; - const int base_sequence_len = 3; - const int embedding_dim = 18; - const int num_heads = 6; - const int num_kv_heads = 2; - const int batch_size = 1; - const int max_seq_len = 7; - - test_vulkan_sdpa( - starting_input_pos, - base_sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len, - false); -} - -TEST(VulkanSDPATest, test_sdpa_op_small_params_dynamic) { - const int starting_input_pos = 0; - const int base_sequence_len = 3; - const int embedding_dim = 18; - const int num_heads = 6; - const int num_kv_heads = 2; - const int batch_size = 1; - const int max_seq_len = 12; - - test_vulkan_sdpa( - starting_input_pos, - base_sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); -} - -TEST(VulkanSDPATest, test_sdpa_op_llama3_params_dynamic) { - const int starting_input_pos = 0; - const int base_sequence_len = 3; - const int embedding_dim = 2048; - const int num_heads = 32; - const int num_kv_heads = 8; - const int batch_size = 1; - const int max_seq_len = 128; - - test_vulkan_sdpa( - starting_input_pos, - base_sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); -} - -TEST(VulkanSDPATest, test_reference_impl) { - const int starting_input_pos = 0; - const int base_sequence_len = 3; - const int embedding_dim = 2048; - const int num_heads = 32; - const int num_kv_heads = 8; - const int batch_size = 1; - const int max_seq_len = 128; - - test_reference_sdpa( - starting_input_pos, - base_sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); -} - -void test_vulkan_flash_attention_impl( - 
const int start_input_pos, - const int sequence_len, - const int embedding_dim, - const int num_heads, - const int num_kv_heads, - const int batch_size, - const int max_seq_len, - vkcompute::utils::StorageType storage_type, - at::ScalarType dtype = at::kFloat) { - const int head_dim = embedding_dim / num_heads; - - at::Tensor k_cache = at::zeros( - {batch_size, max_seq_len, num_kv_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - at::Tensor v_cache = at::zeros_like(k_cache); - - at::Tensor q = at::rand( - {batch_size, sequence_len, num_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - at::Tensor k = at::rand( - {batch_size, sequence_len, num_kv_heads, head_dim}, - at::device(at::kCPU).dtype(dtype)); - at::Tensor v = at::rand_like(k); - - // Get reference output using existing SDPA - at::Tensor reference_out = sdpa_reference_impl( - q, - k, - v, - k_cache, - v_cache, - start_input_pos, - sequence_len, - {}, - 0.0, - true, - {}); - - using namespace vkcompute; - - GraphConfig config; - config.set_storage_type_override(storage_type); - ComputeGraph graph(config); - - // Create input references - IOValueRef r_q = graph.add_input_tensor( - q.sizes().vec(), from_at_scalartype(q.scalar_type())); - IOValueRef r_k = graph.add_input_tensor( - k.sizes().vec(), from_at_scalartype(k.scalar_type())); - IOValueRef r_v = graph.add_input_tensor( - v.sizes().vec(), from_at_scalartype(v.scalar_type())); - - // Create cache tensors (these would be updated by cache update operations in - // practice) - ValueRef r_k_cache = graph.add_tensorref( - k_cache.sizes().vec(), - from_at_scalartype(k_cache.scalar_type()), - k_cache.const_data_ptr()); - ValueRef r_v_cache = graph.add_tensorref( - v_cache.sizes().vec(), - from_at_scalartype(v_cache.scalar_type()), - v_cache.const_data_ptr()); - - const ValueRef r_input_pos_symint = graph.add_symint(start_input_pos); - const ValueRef r_out = - graph.add_tensor(q.sizes().vec(), from_at_scalartype(q.scalar_type())); - - // Call Flash Attention implementation - VK_GET_OP_FN("llama.flash_attention.default") - (graph, - { - r_q.value, - r_k.value, // Use actual K tensor, not cache - r_v.value, // Use actual V tensor, not cache - r_input_pos_symint, - kDummyValueRef, // attn_mask - kDummyValueRef, // dropout_p - kDummyValueRef, // is_causal - kDummyValueRef, // scale - r_out, - }); - - ValueRef staging_out = graph.set_output_tensor(r_out); - - graph.prepare(); - graph.prepack(); - - // Copy inputs and run - graph.copy_into_staging(r_q.staging, q.const_data_ptr(), q.numel()); - graph.copy_into_staging(r_k.staging, k.const_data_ptr(), k.numel()); - graph.copy_into_staging(r_v.staging, v.const_data_ptr(), v.numel()); - - graph.execute(); - - // Extract output - at::Tensor vk_out = at::zeros_like(q).contiguous(); - graph.copy_from_staging( - staging_out, vk_out.mutable_data_ptr(), vk_out.numel()); - - // Compare results - const bool output_correct = at::allclose(reference_out, vk_out, 1e-3, 1e-3); - - if (!output_correct) { - at::Tensor diffs = at::abs(reference_out - vk_out); - std::cout << "Maximum difference: " << at::max(diffs).item() << std::endl; - std::cout << "Maximum value observed: " - << at::max(at::abs(at::cat({reference_out, vk_out}, -1))).item() - << std::endl; - } - ASSERT_TRUE(output_correct); -} - -void test_vulkan_flash_attention( - const int start_input_pos, - const int sequence_len, - const int embedding_dim, - const int num_heads, - const int num_kv_heads, - const int batch_size, - const int max_seq_len, - at::ScalarType dtype = at::kFloat) { - 
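  // This wrapper just runs the same flash attention test case twice, once with
  // buffer storage and once with 3D texture storage, so every parameter
  // combination below is exercised against both GPU tensor representations.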
test_vulkan_flash_attention_impl( - start_input_pos, - sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len, - vkcompute::utils::kBuffer, - dtype); - - test_vulkan_flash_attention_impl( - start_input_pos, - sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len, - vkcompute::utils::kTexture3D, - dtype); -} - -// Flash Attention Tests (both Buffer and Texture) -TEST(VulkanSDPATest, test_flash_attention_small_params) { - const int starting_input_pos = 0; - const int sequence_len = 2; - const int embedding_dim = 4; - const int num_heads = 2; - const int num_kv_heads = 2; - const int batch_size = 1; - const int max_seq_len = 4; - - test_vulkan_flash_attention( - starting_input_pos, - sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); -} - -TEST(VulkanSDPATest, test_flash_attention_multi_tile) { - const int starting_input_pos = 0; - const int sequence_len = 48; - const int embedding_dim = 32; - const int num_heads = 2; - const int num_kv_heads = 2; - const int batch_size = 1; - const int max_seq_len = 64; - - test_vulkan_flash_attention( - starting_input_pos, - sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); -} - -TEST(VulkanSDPATest, test_flash_attention_op_small_params) { - const int starting_input_pos = 0; - const int sequence_len = 3; - const int embedding_dim = 18; - const int num_heads = 6; - const int num_kv_heads = 2; - const int batch_size = 1; - const int max_seq_len = 7; - - test_vulkan_flash_attention( - starting_input_pos, - sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); -} - -TEST(VulkanSDPATest, test_flash_attention_op_small_params_dynamic) { - const int starting_input_pos = 0; - const int sequence_len = 3; - const int embedding_dim = 18; - const int num_heads = 6; - const int num_kv_heads = 2; - const int batch_size = 1; - const int max_seq_len = 12; - - test_vulkan_flash_attention( - starting_input_pos, - sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); -} - -TEST(VulkanSDPATest, test_flash_attention_op_llama3_params) { - const int starting_input_pos = 0; - const int sequence_len = 3; - const int embedding_dim = 2048; - const int num_heads = 32; - const int num_kv_heads = 8; - const int batch_size = 1; - const int max_seq_len = 128; - - test_vulkan_flash_attention( - starting_input_pos, - sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); -} - -TEST(VulkanSDPATest, test_flash_attention_op_llama3_params_dynamic) { - const int starting_input_pos = 0; - const int embedding_dim = 2048; - const int num_heads = 32; - const int num_kv_heads = 8; - const int batch_size = 1; - const int max_seq_len = 128; - - // Test with different sequence lengths - std::vector sequence_lengths = {1, 3, 5, 7, 16, 32}; - - for (int seq_len : sequence_lengths) { - if (seq_len < max_seq_len) { - test_vulkan_flash_attention( - starting_input_pos, - seq_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); - } - } -} - -TEST(VulkanSDPATest, test_flash_attention_reference_impl) { - const int starting_input_pos = 0; - const int sequence_len = 3; - const int embedding_dim = 2048; - const int num_heads = 32; - const int num_kv_heads = 8; - const int batch_size = 1; - const int max_seq_len = 128; - - test_vulkan_flash_attention( - starting_input_pos, - sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - 
batch_size, - max_seq_len); -} - -TEST(VulkanSDPATest, test_flash_attention_reference_impl_small) { - const int starting_input_pos = 0; - const int sequence_len = 2; - const int embedding_dim = 32; - const int num_heads = 4; - const int num_kv_heads = 2; - const int batch_size = 1; - const int max_seq_len = 16; - - test_vulkan_flash_attention( - starting_input_pos, - sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); -} - -TEST(VulkanSDPATest, test_flash_attention_vec4_alignment) { - const int starting_input_pos = 0; - const int sequence_len = 8; - const int embedding_dim = 64; - const int num_heads = 4; - const int num_kv_heads = 2; - const int batch_size = 1; - const int max_seq_len = 16; - - test_vulkan_flash_attention( - starting_input_pos, - sequence_len, - embedding_dim, - num_heads, - num_kv_heads, - batch_size, - max_seq_len); -} - -TEST(VulkanSDPATest, test_flash_attention_edge_cases) { - // Test with single head (no multi-query complexity) - test_vulkan_flash_attention(0, 1, 8, 1, 1, 1, 4); - - // Test with equal heads (no multi-query complexity) - test_vulkan_flash_attention(0, 2, 16, 4, 4, 1, 8); - - // Test with large head dimension - test_vulkan_flash_attention(0, 2, 128, 2, 1, 1, 8); - - // Test with sequence length that exactly matches block size (32) - test_vulkan_flash_attention(0, 32, 64, 2, 1, 1, 64); - - // Test with sequence length slightly larger than block size - test_vulkan_flash_attention( - 0, 33, 68, 2, 1, 1, 64); // 68 = 4*17, good for vec4 -} diff --git a/backends/vulkan/test/op_tests/targets.bzl b/backends/vulkan/test/op_tests/targets.bzl deleted file mode 100644 index b9386f92772..00000000000 --- a/backends/vulkan/test/op_tests/targets.bzl +++ /dev/null @@ -1,224 +0,0 @@ -load("@fbsource//tools/build_defs:platform_defs.bzl", "ANDROID") -load("@fbsource//xplat/caffe2:pt_defs.bzl", "get_pt_ops_deps") -load("@fbsource//xplat/caffe2:pt_ops.bzl", "pt_operator_library") -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -load( - "@fbsource//xplat/executorch/backends/vulkan:targets.bzl", - "get_platforms", -) - -def define_test_targets(test_name, extra_deps = [], src_file = None, is_fbcode = False): - deps_list = [ - "//third-party/googletest:gtest_main", - "//executorch/backends/vulkan:vulkan_graph_runtime", - runtime.external_dep_location("libtorch"), - ] + extra_deps - - src_file_str = src_file if src_file else "{}.cpp".format(test_name) - - runtime.cxx_binary( - name = "{}_bin".format(test_name), - srcs = [ - src_file_str, - ], - compiler_flags = [ - "-Wno-unused-variable", - ], - platforms = get_platforms(), - define_static_target = False, - deps = deps_list, - ) - - runtime.cxx_test( - name = test_name, - srcs = [ - src_file_str, - ], - contacts = ["oncall+ai_infra_mobile_platform@xmail.facebook.com"], - fbandroid_additional_loaded_sonames = [ - "torch-code-gen", - "vulkan_graph_runtime", - "vulkan_graph_runtime_shaderlib", - ], - platforms = [ANDROID], - use_instrumentation_test = True, - deps = deps_list, - ) - - -def define_common_targets(is_fbcode = False): - if is_fbcode: - return - - runtime.python_library( - name = "generate_op_correctness_tests_lib", - srcs = native.glob(["utils/*.py"]) + [ - "generate_op_correctness_tests.py", - "cases.py", - ], - base_module = "executorch.backends.vulkan.test.op_tests", - deps = [ - "fbsource//third-party/pypi/expecttest:expecttest", - ], - external_deps = ["torchgen"], - ) - - runtime.python_library( - name = "generate_op_benchmarks_lib", - srcs = 
native.glob(["utils/*.py"]) + [ - "generate_op_benchmarks.py", - "cases.py", - ], - base_module = "executorch.backends.vulkan.test.op_tests", - deps = [ - "fbsource//third-party/pypi/expecttest:expecttest", - ], - external_deps = ["torchgen"], - ) - - runtime.python_binary( - name = "generate_op_correctness_tests", - main_module = "executorch.backends.vulkan.test.op_tests.generate_op_correctness_tests", - deps = [ - ":generate_op_correctness_tests_lib", - ], - ) - - runtime.python_binary( - name = "generate_op_benchmarks", - main_module = "executorch.backends.vulkan.test.op_tests.generate_op_benchmarks", - deps = [ - ":generate_op_benchmarks_lib", - ], - ) - - aten_src_path = runtime.external_dep_location("aten-src-path") - genrule_cmd = [ - "$(exe :generate_op_correctness_tests)", - "--tags-path $(location {})/aten/src/ATen/native/tags.yaml".format(aten_src_path), - "--aten-yaml-path $(location {})/aten/src/ATen/native/native_functions.yaml".format(aten_src_path), - "-o $OUT", - ] - - runtime.genrule( - name = "generated_op_correctness_tests_cpp", - outs = { - "op_tests.cpp": ["op_tests.cpp"], - }, - cmd = " ".join(genrule_cmd), - default_outs = ["."], - ) - - benchmarks_genrule_cmd = [ - "$(exe :generate_op_benchmarks)", - "--tags-path $(location {})/aten/src/ATen/native/tags.yaml".format(aten_src_path), - "--aten-yaml-path $(location {})/aten/src/ATen/native/native_functions.yaml".format(aten_src_path), - "-o $OUT", - ] - - runtime.genrule( - name = "generated_op_benchmarks_cpp", - outs = { - "op_benchmarks.cpp": ["op_benchmarks.cpp"], - }, - cmd = " ".join(benchmarks_genrule_cmd), - default_outs = ["."], - ) - - runtime.cxx_binary( - name = "compute_graph_op_benchmarks_bin", - srcs = [ - ":generated_op_benchmarks_cpp[op_benchmarks.cpp]", - ], - compiler_flags = [ - "-Wno-unused-variable", - ], - define_static_target = False, - deps = [ - "//third-party/benchmark:benchmark", - "//executorch/backends/vulkan:vulkan_graph_runtime", - runtime.external_dep_location("libtorch"), - ], - platforms = get_platforms(), - ) - - runtime.cxx_library( - name = "test_utils", - srcs = [ - "test_utils.cpp", - ], - headers = [ - "test_utils.h", - ], - exported_headers = [ - "test_utils.h", - ], - deps = [ - "//executorch/backends/vulkan:vulkan_graph_runtime", - "//executorch/runtime/core/exec_aten:lib", - runtime.external_dep_location("libtorch"), - ], - visibility = [ - "//executorch/backends/vulkan/test/op_tests/...", - "@EXECUTORCH_CLIENTS", - ], - ) - - define_test_targets( - "compute_graph_op_tests", - src_file=":generated_op_correctness_tests_cpp[op_tests.cpp]" - ) - - define_test_targets( - "sdpa_test", - extra_deps = [ - ":test_utils", - "//executorch/extension/llm/custom_ops:custom_ops_aot_lib", - "//executorch/extension/tensor:tensor", - ] - ) - define_test_targets( - "quantize_test", - extra_deps = [ - ":test_utils", - "//executorch/kernels/quantized/cpu:op_quantize", - "//executorch/extension/tensor:tensor", - "//executorch/extension/aten_util:aten_bridge", - ] - ) - define_test_targets( - "dequantize_test", - extra_deps = [ - ":test_utils", - "//executorch/kernels/quantized/cpu:op_dequantize", - "//executorch/extension/tensor:tensor", - "//executorch/extension/aten_util:aten_bridge", - ] - ) - define_test_targets( - "choose_qparams_test", - extra_deps = [ - ":test_utils", - "//executorch/kernels/quantized/cpu:op_choose_qparams", - "//executorch/extension/tensor:tensor", - "//executorch/extension/aten_util:aten_bridge", - ] - ) - define_test_targets( - "quantized_linear_test", - extra_deps = [ 
- ":test_utils", - ] - ) - define_test_targets( - "rotary_embedding_test", - extra_deps = [ - ":test_utils", - ] - ) - define_test_targets( - "quantize_affine_test", - extra_deps = [ - ":test_utils", - ] - ) diff --git a/backends/vulkan/test/op_tests/test_utils.cpp b/backends/vulkan/test/op_tests/test_utils.cpp deleted file mode 100644 index c5702abd079..00000000000 --- a/backends/vulkan/test/op_tests/test_utils.cpp +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include "test_utils.h" - -#include - -executorch::aten::ScalarType at_scalartype_to_et_scalartype( - at::ScalarType dtype) { - using ScalarType = executorch::aten::ScalarType; - switch (dtype) { - case at::kByte: - return ScalarType::Byte; - case at::kChar: - return ScalarType::Char; - case at::kShort: - return ScalarType::Short; - case at::kInt: - return ScalarType::Int; - case at::kLong: - return ScalarType::Long; - case at::kHalf: - return ScalarType::Half; - case at::kFloat: - return ScalarType::Float; - case at::kDouble: - return ScalarType::Double; - default: - throw std::runtime_error("Unsupported dtype"); - } -} - -std::string scalar_type_name(c10::ScalarType dtype) { - switch (dtype) { - case c10::kLong: - return "c10::kLong"; - case c10::kShort: - return "c10::kShort"; - case c10::kComplexHalf: - return "c10::kComplexHalf"; - case c10::kComplexFloat: - return "c10::kComplexFloat"; - case c10::kComplexDouble: - return "c10::kComplexDouble"; - case c10::kBool: - return "c10::kBool"; - case c10::kQInt8: - return "c10::kQInt8"; - case c10::kQUInt8: - return "c10::kQUInt8"; - case c10::kQInt32: - return "c10::kQInt32"; - case c10::kBFloat16: - return "c10::kBFloat16"; - case c10::kQUInt4x2: - return "c10::kQUInt4x2"; - case c10::kQUInt2x4: - return "c10::kQUInt2x4"; - case c10::kFloat: - return "c10::kFloat"; - case c10::kHalf: - return "c10::kHalf"; - case c10::kInt: - return "c10::kInt"; - case c10::kChar: - return "c10::kChar"; - case c10::kByte: - return "c10::kByte"; - case c10::kDouble: - return "c10::kDouble"; - case c10::kUInt16: - return "c10::kUInt16"; - case c10::kBits16: - return "c10::kBits16"; - default: - return "Unknown(" + std::to_string(static_cast(dtype)) + ")"; - } -} - -vkcompute::vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) { - using namespace vkcompute; - switch (at_scalartype) { - case c10::kHalf: - return vkapi::kHalf; - case c10::kFloat: - return vkapi::kFloat; - case c10::kDouble: - return vkapi::kDouble; - case c10::kInt: - return vkapi::kInt; - case c10::kLong: - // No support for 64-bit integers - return vkapi::kInt; - case c10::kChar: - return vkapi::kChar; - case c10::kByte: - return vkapi::kByte; - case c10::kShort: - return vkapi::kShort; - case c10::kUInt16: - return vkapi::kUInt16; - default: - VK_THROW( - "Unsupported at::ScalarType: ", - scalar_type_name(at_scalartype), - " (", - static_cast(at_scalartype), - ")"); - } -} diff --git a/backends/vulkan/test/op_tests/test_utils.h b/backends/vulkan/test/op_tests/test_utils.h deleted file mode 100644 index 369767007e0..00000000000 --- a/backends/vulkan/test/op_tests/test_utils.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include -#include -#include -#include - -/** - * Convert at::ScalarType to executorch::ScalarType - */ -executorch::aten::ScalarType at_scalartype_to_et_scalartype( - at::ScalarType dtype); - -/** - * Get the string name of a c10::ScalarType for better error messages - */ -std::string scalar_type_name(c10::ScalarType dtype); - -/** - * Convert c10::ScalarType to vkcompute::vkapi::ScalarType - */ -vkcompute::vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype); diff --git a/backends/vulkan/test/op_tests/utils/aten_types.py b/backends/vulkan/test/op_tests/utils/aten_types.py deleted file mode 100644 index 6ad2f568e91..00000000000 --- a/backends/vulkan/test/op_tests/utils/aten_types.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -#################### -## ATen C++ Types ## -#################### - -AT_INT_ARRAY_REF = "at::IntArrayRef" -AT_SCALAR = "at::Scalar" -AT_TENSOR = "at::Tensor" -AT_TENSOR_LIST = "at::TensorList" -BOOL = "bool" -DOUBLE = "double" -INT = "int64_t" -OPT_AT_DOUBLE_ARRAY_REF = "::std::optional>" -OPT_AT_INT_ARRAY_REF = "at::OptionalIntArrayRef" -OPT_AT_TENSOR = "::std::optional" -OPT_BOOL = "::std::optional" -OPT_INT64 = "::std::optional" -OPT_DEVICE = "::std::optional" -OPT_LAYOUT = "::std::optional" -OPT_MEMORY_FORMAT = "::std::optional" -OPT_SCALAR_TYPE = "::std::optional" -STRING = "std::string_view" -OLD_STRING = "c10::string_view" -TWO_TENSOR_TUPLE = "::std::tuple" -THREE_TENSOR_TUPLE = "::std::tuple" -TENSOR_VECTOR = "::std::vector" diff --git a/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py b/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py deleted file mode 100644 index 76eb9dbe838..00000000000 --- a/backends/vulkan/test/op_tests/utils/gen_benchmark_vk.py +++ /dev/null @@ -1,346 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
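# This module generates Google Benchmark C++ sources for Vulkan operators. The
# VkBenchmarkGen / VkBenchmarkFileGen classes defined below reuse
# ComputeGraphGen to emit, for every operator test suite, a benchmark fixture,
# one benchmark case per input combination, and the querypool plumbing that
# reports per-shader median execution times as benchmark counters.
#
# A rough, hypothetical usage sketch (the op name and output file name are
# placeholders; `f` is a torchgen NativeFunction and `suite` a VkTestSuite
# built from cases.py):
#
#   filegen = VkBenchmarkFileGen("op_benchmarks.cpp")
#   filegen.add_suite("aten.add.Tensor", f, suite)
#   cpp_source = filegen.generate_cpp()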
- -import re - -from executorch.backends.vulkan.test.op_tests.utils.gen_computegraph import ( - ComputeGraphGen, -) -from executorch.backends.vulkan.test.op_tests.utils.gen_correctness_base import ( - CorrectnessTestGen, -) -from executorch.backends.vulkan.test.op_tests.utils.test_suite import VkTestSuite - -from torchgen.model import NativeFunction - -########################## -## Test Suite Generation ## -########################## - -benchmark_fixture_template = """ -class GeneratedOpBenchmark_{op_name} : public ::benchmark::Fixture {{ - protected: - ComputeGraph* graph; - at::ScalarType test_dtype = at::kFloat; - float rtol = {rtol}; - float atol = {atol}; - - {arg_valuerefs} - - void SetUp(::benchmark::State& state) override {{ - GraphConfig config; - config.descriptor_pool_safety_factor = 2.0; - test_dtype = at::ScalarType(state.range(0)); - const utils::StorageType storage_type = utils::StorageType(state.range(1)); - const utils::GPUMemoryLayout memory_layout = utils::GPUMemoryLayout(state.range(2)); - config.set_storage_type_override(storage_type); - config.set_memory_layout_override(memory_layout); - config.enable_querypool = true; - graph = new ComputeGraph(config); - }} - - void TearDown(::benchmark::State& state) override {{ - delete graph; - graph = nullptr; - }} - - {build_graph_fn} - {benchmark_fn} -}}; -""" - -benchmark_template = """ -BENCHMARK_DEFINE_F(GeneratedOpBenchmark_{op_name}, {case_name})(benchmark::State& state) {{ - {skips} - {create_ref_data} - {call_build_graph} - ShaderTimes shader_times; - for (auto _ : state) {{ - {call_benchmark} - graph->context()->querypool().extract_results(); - QueryPoolResults results = graph->context()->querypool().get_shader_timestamp_data(); - process_querypool_results(results, shader_times); - }} - register_shader_time_counters(state, shader_times); -}} - -BENCHMARK_REGISTER_F(GeneratedOpBenchmark_{op_name}, {case_name})->Threads(1)->ArgsProduct({combos}); -""" - - -class VkBenchmarkGen(CorrectnessTestGen): - def __init__(self, op_reg_name: str, f: NativeFunction, inputs: VkTestSuite): - super().__init__(f, inputs) - self.op_reg_name = op_reg_name - self.generator = ComputeGraphGen( - self.op_reg_name, self.f, self.suite_def, inputs.force_io - ) - - def gen_call_benchmark(self, prepack=False) -> str: - test_str = f"benchmark_{self.op_name}(" - if prepack: - test_str = f"prepacked_benchmark_{self.op_name}(" - for binding in self.f_sig.arguments(): - arg = binding.argument - test_str += f"{arg.name}, " - test_str = test_str[:-2] + ");" - test_str = re.sub(r"^", " ", test_str, flags=re.M) - return test_str - - def gen_call_build_graph(self, prepack=False) -> str: - test_str = f"build_graph_{self.op_name}(" - if prepack: - test_str = f"prepacked_build_graph_{self.op_name}(" - for binding in self.f_sig.arguments(): - arg = binding.argument - test_str += f"{arg.name}, " - test_str = test_str[:-2] + ");" - test_str = re.sub(r"^", " ", test_str, flags=re.M) - return test_str - - def gen_combos(self, inputs) -> str: - dtypes_list = ", ".join(f"int({dtype})" for dtype in self.suite_def.dtypes) - storage_types_list = ", ".join( - f"int({storage_type})" for storage_type in self.suite_def.storage_types - ) - layouts_list = ", ".join(f"int({layout})" for layout in self.suite_def.layouts) - return f"{{ {{ {dtypes_list} }}, {{ {storage_types_list} }}, {{ {layouts_list} }} }}" - - def generate_benchmark_case(self, inputs, prepack=False) -> str: - return benchmark_template.format( - op_name=f"{self.op_name}", - case_name=self.gen_case_name(inputs, 
prepack), - skips=self.generator.gen_conditional_skips( - 'state.SkipWithError("unsupported type"); return;' - ), - create_ref_data=self.gen_create_ref_data(inputs), - call_build_graph=self.gen_call_build_graph(prepack), - call_benchmark=self.gen_call_benchmark(prepack), - combos=self.gen_combos(inputs), - ) - - def generate_benchmark(self) -> str: - benchmarks_cpp = "" - for inputs in self.suite_def.input_cases: - if not self.suite_def.requires_prepack: - benchmarks_cpp += self.generate_benchmark_case(inputs) - if self.suite_def.supports_prepack(): - benchmarks_cpp += self.generate_benchmark_case(inputs, prepack=True) - return benchmarks_cpp - - def generate_benchmark_fixture(self) -> str: - build_graph_fn = "" - benchmark_fn = "" - if not self.suite_def.requires_prepack: - build_graph_fn = self.generator.gen_build_graph_fn() - benchmark_fn = self.generator.gen_op_exec_graph_fn() - - prepacked_build_graph_fn = "" - prepacked_benchmark_fn = "" - if self.suite_def.supports_prepack(): - self.generator.should_prepack = True - prepacked_build_graph_fn = self.generator.gen_build_graph_fn() - build_graph_fn += "\n\n " - build_graph_fn += prepacked_build_graph_fn - prepacked_benchmark_fn = self.generator.gen_op_exec_graph_fn() - benchmark_fn += "\n\n " - benchmark_fn += prepacked_benchmark_fn - - return benchmark_fixture_template.format( - op_name=self.op_name, - build_graph_fn=build_graph_fn, - benchmark_fn=benchmark_fn, - rtol=self.suite_def.rtol, - arg_valuerefs=self.generator.gen_arg_valueref_decls(), - atol=self.suite_def.atol, - ) - - -########################## -## Test File Generation ## -########################## - -cpp_test_template = """ -#include -#include -#include - -#include -#include -#include - -using namespace vkcompute; -using TensorOptions = at::TensorOptions; - -vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) {{ - switch (at_scalartype) {{ - case c10::kDouble: - return vkapi::kDouble; - case c10::kFloat: - return vkapi::kFloat; - case c10::kHalf: - return vkapi::kHalf; - case c10::kInt: - return vkapi::kInt; - case c10::kLong: - return vkapi::kInt; - case c10::kChar: - return vkapi::kChar; - case c10::kBool: - return vkapi::kBool; - default: - VK_THROW("Unsupported at::ScalarType!"); - }} -}} - -at::Tensor make_casted_randint_tensor( - std::vector sizes, - at::ScalarType dtype = at::kFloat, - int low = 0, - int high = 10) {{ - - return at::randint(high, sizes, at::device(at::kCPU).dtype(dtype)); -}} - -at::Tensor make_rand_tensor( - std::vector sizes, - at::ScalarType dtype = at::kFloat, - float low = 0.0, - float high = 1.0) {{ - if (high == 1.0 && low == 0.0) - return at::rand(sizes, at::device(at::kCPU).dtype(dtype)); - - if (dtype == at::kChar) - return at::randint(high, sizes, at::device(at::kCPU).dtype(dtype)); - - return at::rand(sizes, at::device(at::kCPU).dtype(dtype)) * (high - low) + low; -}} - -at::Tensor make_seq_tensor( - std::vector sizes, - at::ScalarType dtype = at::kFloat, - float low = 0.0, - float high = 1.0) {{ - (void)low; - (void)high; - - int64_t n = 1; - for (auto size: sizes) {{ - n *= size; - }} - - std::vector values(n); - for (int i=0;i indices) {{ - at::ScalarType dtype = at::kInt; - std::vector sizes = {{static_cast(indices.size())}}; - - // Clone as original data will be deallocated upon return. 
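  // (at::from_blob only wraps the vector's storage without taking ownership,
  // so the detach().clone() below copies the data into tensor-owned memory
  // before the local vector is destroyed.)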
- return at::from_blob(indices.data(), sizes, dtype).detach().clone(); -}} - -at::Tensor make_index_tensor_2d(std::vector> indices) {{ - at::ScalarType dtype = at::kInt; - std::vector sizes = {{ - static_cast(indices.size()), - static_cast(indices[0].size())}}; - - // Flatten indices as from_blob reads garbage otherwise. - std::vector acc; - for (auto& vec: indices) {{ - acc.insert(acc.end(), vec.begin(), vec.end()); - }} - - // Clone as original data will be deallocated upon return. - return at::from_blob(acc.data(), sizes, dtype).detach().clone(); -}} - -at::Tensor make_index_tensor_3d(std::vector>> indices) {{ - at::ScalarType dtype = at::kInt; - std::vector sizes = {{ - static_cast(indices.size()), - static_cast(indices[0].size()), - static_cast(indices[0][0].size())}}; - - // Flatten indices as from_blob reads garbage otherwise. - std::vector acc; - for (auto& v: indices) {{ - for (auto& vv: v) {{ - acc.insert(acc.end(), vv.begin(), vv.end()); - }} - }} - - // Clone as original data will be deallocated upon return. - return at::from_blob(acc.data(), sizes, dtype).detach().clone(); -}} - -using QueryPoolResults = std::vector; -using ShaderTimes = std::unordered_map>; - -void process_querypool_results( - QueryPoolResults& results, - ShaderTimes& shader_times) {{ - for (const vkcompute::vkapi::ShaderResult& r : results) {{ - uint64_t duration_ns = r.end_time_ns - r.start_time_ns; - if (shader_times.find(r.kernel_name) == shader_times.end()) {{ - shader_times[r.kernel_name] = std::vector(); - }} - shader_times[r.kernel_name].emplace_back(duration_ns); - }} -}} - -void register_shader_time_counters( - benchmark::State& state, - ShaderTimes& shader_times) {{ - for (auto& times_list : shader_times) {{ - // Filter to_nchw and nchw_to shaders - if (times_list.first.find("to_nchw") != std::string::npos) {{ - continue; - }} - if (times_list.first.find("nchw_to") != std::string::npos) {{ - continue; - }} - - std::sort(times_list.second.begin(), times_list.second.end()); - uint64_t median_time; - median_time = times_list.second[times_list.second.size() / 2]; - state.counters[times_list.first + " median ns"] = median_time; - }} -}} - -{benchmark_fixtures} - -{def_benchmarks} -""" - - -class VkBenchmarkFileGen: - def __init__(self, out_path): - self.out_path = out_path - self.suites_gens = [] - - def add_suite(self, op_reg_name: str, f: NativeFunction, all_input_cases) -> None: - suites_gen = VkBenchmarkGen(op_reg_name, f, all_input_cases) - self.suites_gens.append(suites_gen) - - def generate_benchmarks_cpp(self) -> str: - return "\n".join([h.generate_benchmark() for h in self.suites_gens]) - - def generate_benchmark_fixtures(self) -> str: - return "\n".join([h.generate_benchmark_fixture() for h in self.suites_gens]) - - def generate_cpp(self) -> str: - return cpp_test_template.format( - benchmark_fixtures=self.generate_benchmark_fixtures(), - def_benchmarks=self.generate_benchmarks_cpp(), - ) diff --git a/backends/vulkan/test/op_tests/utils/gen_computegraph.py b/backends/vulkan/test/op_tests/utils/gen_computegraph.py deleted file mode 100644 index 490044340d6..00000000000 --- a/backends/vulkan/test/op_tests/utils/gen_computegraph.py +++ /dev/null @@ -1,788 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
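# ComputeGraphGen (defined below) is the core of the op test/benchmark code
# generator. Given a torchgen NativeFunction plus a TestSuite definition, it
# emits C++ that: (1) calls the ATen implementation to produce a reference
# `out`, (2) rebuilds the op as a vkcompute ComputeGraph, adding inputs,
# prepacked weights, scalars, and the output ValueRef, (3) stages the input
# data and executes the graph, and (4) copies the result back from staging and
# compares it against the ATen output.
#
# A minimal, hypothetical driver (the op name is a placeholder; `f` is a
# NativeFunction and `suite` a TestSuite):
#
#   gen = ComputeGraphGen("aten.add.Tensor", f, suite)
#   check_fn_cpp = gen.gen_op_check_fn()  # graph build + exec + output check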
- -import re -from dataclasses import dataclass -from typing import List, Optional, Union - -from executorch.backends.vulkan.test.op_tests.utils.aten_types import ( - AT_INT_ARRAY_REF, - AT_SCALAR, - AT_TENSOR, - AT_TENSOR_LIST, - BOOL, - DOUBLE, - INT, - OLD_STRING, - OPT_AT_DOUBLE_ARRAY_REF, - OPT_AT_INT_ARRAY_REF, - OPT_AT_TENSOR, - OPT_BOOL, - OPT_DEVICE, - OPT_INT64, - OPT_LAYOUT, - OPT_MEMORY_FORMAT, - OPT_SCALAR_TYPE, - STRING, - TENSOR_VECTOR, - THREE_TENSOR_TUPLE, - TWO_TENSOR_TUPLE, -) -from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite - -from torchgen.api import cpp -from torchgen.api.types import CppSignatureGroup -from torchgen.gen import generate_static_dispatch_backend_call, translate_args -from torchgen.gen_aoti_c_shim import gen_static_dispatch_backend_call_signature -from torchgen.model import NativeFunction, Variant - -################################### -## Compute Graph Code Generation ## -################################### - - -@dataclass -class ATenArg: - name: str - cpp_type: str - default: Optional[str] - - -@dataclass -class ValueRef: - name: str - src_cpp_name: str - src_cpp_type: str - is_in: bool = False - is_out: bool = False - fixed_storage_type: Optional[str] = None - fixed_memory_layout: Optional[str] = None - requires_prepack: bool = False - supports_prepack: bool = False - # When is_dynamic_size is true, the underlying object size is not known - # during code-gen. Example is the out value for aten.split where the out - # value is a vector. In these cases, we need to use an additional - # vector or at::TensorList to track these values. - is_dynamic_size: bool = False - - @property - def io_value_list_name(self): - assert self.is_dynamic_size - return f"{self.name}_io_value_list" - - @property - def value_list_name(self): - assert self.is_dynamic_size - return f"{self.name}_value_list" - - @property - def vk_out(self): - assert self.is_out - return f"vk_{self.name}" - - -ValueRefList = Union[ValueRef, List[ValueRef]] - -InableCppType = frozenset([AT_TENSOR, AT_TENSOR_LIST]) - - -class ComputeGraphGen: - backend_key = None - - def __init__( - self, - op_reg_name: str, - f: NativeFunction, - suite_def: TestSuite, - include_io: bool = True, - ): - self.op_reg_name = op_reg_name - self.f = f - self.suite_def = suite_def - self.include_io = include_io - - self.f_sig = CppSignatureGroup.from_native_function( - self.f, method=False, fallback_binding=self.f.manual_cpp_binding - ).most_faithful_signature() - - self.graph = "graph" - self.dot = "->" - - self.args = [] - self.refs = {} - - self.should_prepack = False - - for binding in self.f_sig.arguments(): - arg = binding.argument - ctype = cpp.argumenttype_type( - arg.type, mutable=arg.is_write, binds=arg.name - ) - cpp_type = ctype.cpp_type(strip_ref=True) - - self.args.append( - ATenArg(name=arg.name, cpp_type=cpp_type, default=arg.default) - ) - - # These are the argument will be passed as a "weight" tensor, the - # corresponding object will be TensorRef in the compute graph. 
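            # (Prepacked arguments are added to the graph as TensorRef
            # constants via add_tensorref, so their data is baked in when the
            # graph is built; ordinary tensor inputs become IOValueRefs with
            # staging buffers that are re-copied on every execution.)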
- requires_prepack = ( - "weight" in arg.name - or "bias" in arg.name - or "running_mean" in arg.name - or "running_var" in arg.name - ) - supports_prepack = False - if arg.name in self.suite_def.prepacked_args: - supports_prepack = True - - fixed_storage_type = None - if arg.name in self.suite_def.arg_storage_types: - fixed_storage_type = self.suite_def.arg_storage_types[arg.name] - - fixed_memory_layout = None - if arg.name in self.suite_def.arg_memory_layouts: - fixed_memory_layout = self.suite_def.arg_memory_layouts[arg.name] - - self.refs[arg.name] = ValueRef( - name=f"{arg.name}_ref", - src_cpp_name=arg.name, - src_cpp_type=cpp_type, - is_in=(cpp_type in InableCppType), - fixed_storage_type=fixed_storage_type, - fixed_memory_layout=fixed_memory_layout, - requires_prepack=requires_prepack, - supports_prepack=supports_prepack, - ) - - ret_type = cpp.returns_type(self.f.func.returns, symint=False).cpp_type() - self.out = ATenArg(name="out", cpp_type=ret_type, default=None) - - fixed_storage_type = None - if "out" in self.suite_def.arg_storage_types: - fixed_storage_type = self.suite_def.arg_storage_types["out"] - fixed_memory_layout = None - if "out" in self.suite_def.arg_memory_layouts: - fixed_memory_layout = self.suite_def.arg_memory_layouts["out"] - - if ret_type == AT_TENSOR: - self.refs["out"] = ValueRef( - name="out_ref", - src_cpp_name="out", - src_cpp_type=ret_type, - is_out=True, - fixed_storage_type=fixed_storage_type, - fixed_memory_layout=fixed_memory_layout, - ) - elif ret_type == TWO_TENSOR_TUPLE: - self.refs["out"] = [ - ValueRef( - name="out_ref_first", - src_cpp_name="std::get<0>(out)", - src_cpp_type="at::Tensor", - is_out=True, - fixed_storage_type=( - fixed_storage_type[0] if fixed_storage_type else None - ), - fixed_memory_layout=( - fixed_memory_layout[0] if fixed_memory_layout else None - ), - ), - ValueRef( - name="out_ref_second", - src_cpp_name="std::get<1>(out)", - src_cpp_type="at::Tensor", - is_out=True, - fixed_storage_type=( - fixed_storage_type[1] if fixed_storage_type else None - ), - fixed_memory_layout=( - fixed_memory_layout[1] if fixed_memory_layout else None - ), - ), - ValueRef( - name="out_ref", - src_cpp_name="out", - src_cpp_type=ret_type, - is_out=False, - ), - ] - elif ret_type == THREE_TENSOR_TUPLE: - self.refs["out"] = [ - ValueRef( - name="out_ref_first", - src_cpp_name="std::get<0>(out)", - src_cpp_type="at::Tensor", - is_out=True, - fixed_storage_type=( - fixed_storage_type[0] if fixed_storage_type else None - ), - fixed_memory_layout=( - fixed_memory_layout[0] if fixed_memory_layout else None - ), - ), - ValueRef( - name="out_ref_second", - src_cpp_name="std::get<1>(out)", - src_cpp_type="at::Tensor", - is_out=True, - fixed_storage_type=( - fixed_storage_type[1] if fixed_storage_type else None - ), - fixed_memory_layout=( - fixed_memory_layout[1] if fixed_memory_layout else None - ), - ), - ValueRef( - name="out_ref_third", - src_cpp_name="std::get<2>(out)", - src_cpp_type="at::Tensor", - is_out=True, - fixed_storage_type=( - fixed_storage_type[2] if fixed_storage_type else None - ), - fixed_memory_layout=( - fixed_memory_layout[2] if fixed_memory_layout else None - ), - ), - ValueRef( - name="out_ref", - src_cpp_name="out", - src_cpp_type=ret_type, - is_out=False, - ), - ] - elif ret_type == TENSOR_VECTOR: - self.refs["out"] = ValueRef( - name="out_ref", - src_cpp_name="out", - src_cpp_type=ret_type, - is_out=True, - is_dynamic_size=True, - ) - else: - raise NotImplementedError( - f"ret_type: {ret_type} not supported for out value" - ) 
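    # To make the bookkeeping above concrete: for a hypothetical op with
    # `input`, `weight` and `bias` tensor arguments returning a single
    # at::Tensor, self.refs would hold an input ValueRef for `input`
    # (is_in=True), prepack-required refs for `weight` and `bias`, and a
    # single output ValueRef under "out".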
- - ## ATen code generation - - def gen_decl(self, fn_name: str, ret_type: str = "void") -> str: - cpp_args = [a.decl() for a in self.f_sig.arguments()] - cpp_args_str = ", ".join(cpp_args) - return f"{ret_type} {fn_name}({cpp_args_str})" - - def create_aten_fn_call(self) -> str: - func_call = generate_static_dispatch_backend_call( - self.f_sig, self.f, ComputeGraphGen.backend_key - )[7:].replace("::cpu", "") - - return func_call - - def create_aten_method_call(self) -> str: - # For functions with only Method variant, we fallback to the function - # declared in MethodOperators.h - cpp_sig = gen_static_dispatch_backend_call_signature(self.f_sig, self.f) - exprs = translate_args(self.f_sig, cpp_sig) - func_call = f"at::_ops::{self.f_sig.name()}::call({exprs});" - return func_call - - def create_out_src(self, include_declarations: bool = True) -> str: - cpp_type = self.out.cpp_type if include_declarations else "" - if Variant.function in self.f.variants: - return f"{cpp_type} out = " + self.create_aten_fn_call() + "\n" - else: - return f"{cpp_type} out = " + self.create_aten_method_call() + "\n" - - ## Graph code generation utils - - def prepack_ref(self, ref: ValueRef) -> bool: - if ref.requires_prepack: - return True - else: - return ref.supports_prepack and self.should_prepack - - def create_value_decl_for(self, ref: ValueRefList) -> str: # noqa: C901 - if isinstance(ref, list): - ret_str = "" - for r in ref: - ret_str += self.create_value_decl_for(r) - return ret_str - - cpp_type = "IOValueRef" if (ref.is_in or ref.requires_prepack) else "ValueRef" - if ref.src_cpp_type == AT_TENSOR_LIST: - ret_str = f"std::vector {ref.name}_io_value_refs;\n" - ret_str += f"std::vector {ref.name}_value_refs;\n" - return ret_str - elif ref.src_cpp_type == TENSOR_VECTOR: - ret_str = f"std::vector {ref.io_value_list_name};\n" - ret_str += f"std::vector {ref.value_list_name};\n" - return ret_str - else: - return f"{cpp_type} {ref.name};\n" - - def create_value_for( # noqa: C901 - self, ref: ValueRefList, include_declarations: bool = True - ) -> str: - if isinstance(ref, list): - ret_str = "" - for r in ref: - ret_str += self.create_value_for(r) - return ret_str - - prepack = self.prepack_ref(ref) - ref_is_view = self.suite_def.is_view_op and ref.is_out - - # If skipping IO, force is_in to be False - if not self.include_io and ref.is_in: - ref.is_in = False - - cpp_type = "IOValueRef" if (ref.is_in and not prepack) else "ValueRef" - if not include_declarations: - cpp_type = "" - - if ref.src_cpp_type == OPT_AT_TENSOR: - ret_str = f"{cpp_type} {ref.name} = " - if prepack: - ret_str = "" - if include_declarations: - ret_str += f"IOValueRef {ref.name};\n" - ret_str += f"{ref.name}.value = " - ret_str += f"!{ref.src_cpp_name}.has_value() ? 
" - ret_str += f"{self.graph}{self.dot}add_none() : " - if not prepack: - ret_str += f"{self.graph}{self.dot}" - ret_str += "add_input_tensor(" if ref.is_in else "add_tensor(" - ret_str += f"{ref.src_cpp_name}->sizes().vec(), " - ret_str += f"from_at_scalartype({ref.src_cpp_name}->scalar_type()" - if ref.fixed_storage_type: - ret_str += f", {ref.fixed_storage_type}" - if ref.fixed_memory_layout: - ret_str += f", {ref.fixed_memory_layout}" - ret_str += "));\n" - elif prepack: - ret_str += f"{self.graph}{self.dot}" - ret_str += f"add_tensorref({ref.src_cpp_name}->sizes().vec(), " - ret_str += f"from_at_scalartype({ref.src_cpp_name}->scalar_type()), " - ret_str += f"{ref.src_cpp_name}->const_data_ptr()); \n" - return ret_str - elif ref.src_cpp_type == OPT_INT64: - ret_str = f"{cpp_type} {ref.name} = " - ret_str += f"!{ref.src_cpp_name}.has_value() ? " - ret_str += f"{self.graph}{self.dot}add_none() : " - ret_str += f"{self.graph}{self.dot}add_scalar" - ret_str += f"({ref.src_cpp_name}.value());\n" - return ret_str - elif ( - ref.src_cpp_type == OPT_AT_DOUBLE_ARRAY_REF - or ref.src_cpp_type == OPT_AT_INT_ARRAY_REF - ): - ret_str = f"{cpp_type} {ref.name} = " - ret_str += f"!{ref.src_cpp_name}.has_value() ? " - ret_str += f"{self.graph}{self.dot}add_none() : " - ret_str += f"{self.graph}{self.dot}add_scalar_list" - ret_str += f"({ref.src_cpp_name}->vec());\n" - return ret_str - elif ref.src_cpp_type == AT_TENSOR_LIST: - assert ref.is_in, "AT_TENSOR_LIST must be an input" - # This logic is a bit convoluted. We need to create a IOValueRef for - # each tensor, to facilate staging. On the other hand, we will - # use the .value tensor to create a ValueList, which will be passed - # to the corresponding ops. - ret_str = "" - if include_declarations: - ret_str += f"std::vector {ref.name}_io_value_refs;\n" - ret_str += f"std::vector {ref.name}_value_refs;\n" - ret_str += f"for (int i=0; i < {ref.src_cpp_name}.size(); i++) {{\n" - ret_str += ( - f" IOValueRef io_value_ref = {self.graph}{self.dot}add_input_tensor(\n" - ) - ret_str += f" {ref.src_cpp_name}[i].sizes().vec(),\n" - ret_str += ( - f" from_at_scalartype({ref.src_cpp_name}[i].scalar_type())); \n" - ) - ret_str += f" {ref.name}_value_refs.emplace_back(io_value_ref.value);\n" - ret_str += f" {ref.name}_io_value_refs.emplace_back(io_value_ref);\n" - ret_str += "}\n" - ret_str += f"ValueRef {ref.name} = {self.graph}{self.dot}add_value_list(std::move({ref.name}_value_refs));\n" - return ret_str - elif ref.src_cpp_type == TENSOR_VECTOR: - ret_str = "" - if include_declarations: - ret_str += f"std::vector {ref.io_value_list_name};\n" - ret_str += f"std::vector {ref.value_list_name};\n" - ret_str += f""" -for (int i=0; i({ref.src_cpp_name}.toDouble()); \n" - elif ref.src_cpp_type == AT_INT_ARRAY_REF: - ret_str += f"add_scalar_list({ref.src_cpp_name}.vec()); \n" - elif ref.src_cpp_type == BOOL: - ret_str += f"add_scalar({ref.src_cpp_name}); \n" - elif ref.src_cpp_type == INT: - ret_str += f"add_scalar({ref.src_cpp_name}); \n" - elif ref.src_cpp_type == DOUBLE: - ret_str += f"add_scalar({ref.src_cpp_name}); \n" - elif ( - ref.src_cpp_type == OPT_SCALAR_TYPE - or ref.src_cpp_type == OPT_LAYOUT - or ref.src_cpp_type == OPT_DEVICE - or ref.src_cpp_type == OPT_BOOL - or ref.src_cpp_type == OPT_MEMORY_FORMAT - ): - ret_str += "add_none(); \n" - elif ref.src_cpp_type == STRING or ref.src_cpp_type == OLD_STRING: - ret_str += f"add_string(std::string({ref.src_cpp_name})); \n" - elif ref.src_cpp_type == TWO_TENSOR_TUPLE: - ret_str += 
f"add_value_list({{{ref.name}_first, {ref.name}_second}}); \n" - elif ref.src_cpp_type == THREE_TENSOR_TUPLE: - ret_str += f"add_value_list({{{ref.name}_first, {ref.name}_second, {ref.name}_third}}); \n" - else: - raise RuntimeError(f"Unsupported cpp type {ref.src_cpp_type}") - - return ret_str - - def create_op_call(self) -> str: - deref = "*" if self.dot == "->" else "" - op_create_code = f'VK_GET_OP_FN("{self.op_reg_name}")({deref}{self.graph}, {{' - - for aten_arg in self.args: - ref = self.refs[aten_arg.name] - if ref.src_cpp_type == AT_TENSOR_LIST: - # Special case. Underlying tensors are input tensors, but the - # container itself is just a normal value. - op_create_code += f"{ref.name}, " - else: - op_create_code += ( - f"{ref.name}.value, " - if ref.is_in or ref.requires_prepack or ref.is_out - else f"{ref.name}, " - ) - # op_create_code += f"{ref.name}, " - - op_create_code += "out_ref});\n" - return op_create_code - - def gen_output_staging_valueref_decl(self, ref: ValueRefList) -> str: - if isinstance(ref, list): - ret_str = "" - for r in ref[:-1]: - ret_str += self.gen_output_staging_valueref_decl(r) - return ret_str - elif ref.src_cpp_type == TENSOR_VECTOR: - assert ref.is_out - ret_str = "" - return ret_str - - assert ref.src_cpp_type == AT_TENSOR and ref.is_out - return f"ValueRef {ref.name}_staging;\n" - - def set_output(self, ref: ValueRefList, include_declarations: bool = True) -> str: - if isinstance(ref, list): - ret_str = "" - for r in ref[:-1]: - ret_str += self.set_output(r, include_declarations) - return ret_str - elif ref.src_cpp_type == TENSOR_VECTOR: - assert ref.is_out - ret_str = f""" -for (int i=0; i str: - assert isinstance(ref, ValueRef) - assert ref.src_cpp_type in InableCppType and ref.is_in - if self.prepack_ref(ref): - return "" - - if ref.src_cpp_type == AT_TENSOR: - ret_str = f"{self.graph}{self.dot}virtual_resize({ref.name}.value, " - ret_str += f"{ref.src_cpp_name}.sizes().vec());\n" - elif ref.src_cpp_type == AT_TENSOR_LIST: - ret_str = "" - ret_str += f"for (int i=0; i < {ref.name}_io_value_refs.size(); i++) {{\n" - ret_str += f" {self.graph}{self.dot}virtual_resize({ref.name}_io_value_refs[i].value, " - ret_str += f"{ref.src_cpp_name}[i].sizes().vec());\n" - ret_str += "}\n" - else: - raise AssertionError(f"{ref.src_cpp_type} not expected") - - return ret_str - - def copy_into_staging(self, ref: ValueRefList) -> str: - assert isinstance(ref, ValueRef) - assert ref.src_cpp_type in InableCppType and ref.is_in - - if self.prepack_ref(ref): - return "" - - if ref.src_cpp_type == AT_TENSOR: - ret_str = f"{self.graph}{self.dot}copy_into_staging(" - ret_str += f"{ref.name}.staging, " - ret_str += f"{ref.src_cpp_name}.const_data_ptr(), " - ret_str += f"{ref.src_cpp_name}.numel());\n" - elif ref.src_cpp_type == AT_TENSOR_LIST: - ret_str = "" - ret_str += f"for (int i=0; i < {ref.name}_io_value_refs.size(); i++) {{\n" - ret_str += f" {self.graph}{self.dot}copy_into_staging(" - ret_str += f"{ref.name}_io_value_refs[i].staging, " - ret_str += f"{ref.src_cpp_name}[i].const_data_ptr(), " - ret_str += f"{ref.src_cpp_name}[i].numel());\n" - ret_str += "}\n" - else: - raise AssertionError(f"{ref.src_cpp_type} not expected") - return ret_str - - def declare_vk_out_for(self, ref: Union[ValueRef, List[ValueRef]]) -> str: - if isinstance(ref, list): - ret_str = "" - for r in ref[:-1]: - ret_str += self.declare_vk_out_for(r) - return ret_str - elif ref.src_cpp_type == TENSOR_VECTOR: - assert ref.is_out - ret_str = f""" -std::vector {ref.vk_out}; -for (int i=0; i str: 
- if isinstance(ref, list): - ret_str = "" - for r in ref[:-1]: - ret_str += self.copy_from_staging(r) - return ret_str - elif ref.src_cpp_type == TENSOR_VECTOR: - assert ref.is_out - ret_str = f""" -for (int i=0; i str: - if isinstance(ref, list): - ret_str = "" - for r in ref[:-1]: - ret_str += self.check_graph_out(r) - return ret_str - elif ref.src_cpp_type == TENSOR_VECTOR: - assert ref.is_out - ret_str = f""" -for (int i=0; i str: - ret_str = "" - for aten_arg in self.args: - ref = self.refs[aten_arg.name] - ret_str += self.create_value_decl_for(ref) - - ret_str += self.create_value_decl_for(self.refs["out"]) - ret_str += f"{self.out.cpp_type} out;\n" - ret_str += self.gen_output_staging_valueref_decl(self.refs["out"]) - return ret_str - - def gen_graph_build_code(self, include_declarations: bool = True) -> str: - graph_build = self.create_out_src(include_declarations) - for aten_arg in self.args: - graph_build += self.create_value_for( - self.refs[aten_arg.name], include_declarations - ) - - graph_build += self.create_value_for(self.refs["out"], include_declarations) - graph_build += self.create_op_call() - - if self.include_io: - graph_build += self.set_output(self.refs["out"], include_declarations) - - graph_build += f"{self.graph}{self.dot}prepare();\n" - graph_build += f"{self.graph}{self.dot}prepack();\n" - - graph_build += "\n" - return graph_build - - def gen_graph_exec_code(self, check_output=True) -> str: - graph_exec = "" - if self.include_io: - for aten_arg in self.args: - ref = self.refs[aten_arg.name] - if ref.is_in: - graph_exec += self.virtual_resize(ref) - graph_exec += self.copy_into_staging(ref) - - graph_exec += f"{self.graph}{self.dot}propagate_resize();\n" - - graph_exec += f"{self.graph}{self.dot}execute();\n" - - graph_exec += self.declare_vk_out_for(self.refs["out"]) - if self.include_io: - graph_exec += self.copy_from_staging(self.refs["out"]) - - if self.include_io and check_output: - graph_exec += self.check_graph_out(self.refs["out"]) - - graph_exec = re.sub(r"^", " ", graph_exec, flags=re.M) - graph_exec = "{\n" + graph_exec + "\n}" - - return graph_exec - - def gen_conditional_skips(self, skip_str: str = "GTEST_SKIP();") -> str: - fp16_skip = f"if (!{self.graph}{self.dot}context()->adapter_ptr()->has_full_float16_buffers_support()) {{\n" - fp16_skip += f" {skip_str}\n" - fp16_skip += "}" - fp16_skip = re.sub(r"^", " ", fp16_skip, flags=re.M) + "\n" - - int8_skip = f"if (!{self.graph}{self.dot}context()->adapter_ptr()->has_full_int8_buffers_support()) {{\n" - int8_skip += f" {skip_str};\n" - int8_skip += "}\n" - - skips = "" - - skips += "if (test_dtype == at::kHalf) {\n" - skips += fp16_skip - skips += "}\n" - - for _, dtype in self.suite_def.arg_dtype.items(): - if dtype == "at::kChar" or dtype == "at::kQInt8": - skips += int8_skip - continue - - skips += "\n" - return skips - - def gen_op_check_fn(self) -> str: - op_name = self.f.func.name.unambiguous_name() - if self.suite_def.test_name_suffix is not None: - op_name += "_" + self.suite_def.test_name_suffix - - op_check_fn = self.gen_decl(f"check_{op_name}") + " {\n" - if self.should_prepack: - op_check_fn = self.gen_decl(f"prepacked_check_{op_name}") + " {\n" - - op_check_fn_body = "" - op_check_fn_body += self.gen_graph_build_code() - op_check_fn_body += self.gen_graph_exec_code() - - op_check_fn_body = re.sub(r"^", " ", op_check_fn_body, flags=re.M) - - op_check_fn += op_check_fn_body - op_check_fn += "\n }" - - return op_check_fn - - def gen_build_graph_fn(self, include_declarations: bool = False) 
-> str: - op_name = self.f.func.name.unambiguous_name() - if self.suite_def.test_name_suffix is not None: - op_name += "_" + self.suite_def.test_name_suffix - op_build_graph_fn = self.gen_decl(f"build_graph_{op_name}") + " {\n" - if self.should_prepack: - op_build_graph_fn = ( - self.gen_decl(f"prepacked_build_graph_{op_name}") + " {\n" - ) - - op_build_graph_fn_body = "" - op_build_graph_fn_body += self.gen_graph_build_code(include_declarations) - - op_build_graph_fn += op_build_graph_fn_body - op_build_graph_fn += "\n }" - return op_build_graph_fn - - def gen_op_exec_graph_fn(self) -> str: - op_name = self.f.func.name.unambiguous_name() - if self.suite_def.test_name_suffix is not None: - op_name += "_" + self.suite_def.test_name_suffix - op_benchmark_fn = self.gen_decl(f"benchmark_{op_name}") + " {\n" - if self.should_prepack: - op_benchmark_fn = self.gen_decl(f"prepacked_benchmark_{op_name}") + " {\n" - - op_benchmark_fn_body = "" - op_benchmark_fn_body += self.gen_graph_exec_code(False) - - op_benchmark_fn_body = re.sub(r"^", " ", op_benchmark_fn_body, flags=re.M) - - op_benchmark_fn += op_benchmark_fn_body - op_benchmark_fn += "\n }" - return op_benchmark_fn diff --git a/backends/vulkan/test/op_tests/utils/gen_correctness_base.py b/backends/vulkan/test/op_tests/utils/gen_correctness_base.py deleted file mode 100644 index 80b4d5dead9..00000000000 --- a/backends/vulkan/test/op_tests/utils/gen_correctness_base.py +++ /dev/null @@ -1,417 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -import re -from typing import Any, List - -from executorch.backends.vulkan.test.op_tests.utils.aten_types import ( - AT_INT_ARRAY_REF, - AT_SCALAR, - AT_TENSOR, - AT_TENSOR_LIST, - BOOL, - DOUBLE, - INT, - OLD_STRING, - OPT_AT_DOUBLE_ARRAY_REF, - OPT_AT_INT_ARRAY_REF, - OPT_AT_TENSOR, - OPT_BOOL, - OPT_DEVICE, - OPT_INT64, - OPT_LAYOUT, - OPT_MEMORY_FORMAT, - OPT_SCALAR_TYPE, - STRING, -) -from executorch.backends.vulkan.test.op_tests.utils.test_suite import TestSuite - -from torchgen.api import cpp -from torchgen.api.types import CppSignatureGroup -from torchgen.model import Argument, NativeFunction - -########################## -## Test Suite Generation ## -########################## - -test_fixture_template = """ -class GeneratedOpsTest_{op_name} : public ::testing::Test {{ -}}; -""" - -test_suite_template = """ -TEST_P(GeneratedOpsTest_{op_name}, {case_name}) {{ -{create_ref_data} -try {{ -{create_and_check_out} -}} -catch (const vkcompute::vkapi::ShaderNotSupportedError& e) {{ - GTEST_SKIP() << e.what(); -}} -}} -""" - - -def init_list_str(pylist: Any) -> str: - if pylist == "[]": - return "{" + "}" - - if not isinstance(pylist, (list, tuple)): - pylist = [pylist] - - list_str = "{" - for s in pylist: - if isinstance(s, (list, tuple)): - list_str += f"{init_list_str(s)}, " - else: - list_str += f"{s}, " - list_str = list_str[:-2] + "}" - return list_str - - -def get_or_return_default(arg: Argument, inputs: List[Any], i: int): - if i < len(inputs): - return inputs[i] - else: - assert arg.default is not None - return arg.default - - -class CorrectnessTestGen: - def __init__(self, f: NativeFunction, test_suite: TestSuite): - self.f = f - self.suite_def = test_suite - self.op_name = f.func.name.unambiguous_name() - if test_suite.test_name_suffix is not None: - self.op_name += f"_{test_suite.test_name_suffix}" - - self.f_sig = 
CppSignatureGroup.from_native_function( - self.f, method=False, fallback_binding=self.f.manual_cpp_binding - ).most_faithful_signature() - - def gen_case_name_tuple(self, t) -> str: - return "x".join( - [ - ( - str(e) - if not isinstance(e, (list, tuple)) - else self.gen_case_name_tuple(e) - ) - for e in t - ] - ) - - def gen_case_name(self, inputs: List[Any], prepack: bool = False) -> str: - name_str = self.op_name - if prepack: - name_str += "_prepack" - for arg_sizes_or_val in inputs: - name_str += "_" - if isinstance(arg_sizes_or_val, tuple): - name_str += self.gen_case_name_tuple(arg_sizes_or_val) - elif isinstance(arg_sizes_or_val, list): - lst = [] - for size in arg_sizes_or_val: - if isinstance(size, (list, tuple)): - lst.append(self.gen_case_name_tuple(size)) - else: - lst.append(str(size)) - name_str += "c".join(lst) - else: - name_str += str(arg_sizes_or_val).replace(".", "p") - - # minus sign is a invalid char for test case. change to "n". - name_str = name_str.replace("-", "n") - return name_str - - def call_data_gen_fn(self, arg: Argument, data: Any, terminate: bool = True) -> str: - tensor_dtype = ( - "test_dtype" - if arg.name not in self.suite_def.arg_dtype - else self.suite_def.arg_dtype[arg.name] - ) - - data_range = ( - self.suite_def.data_range - if arg.name not in self.suite_def.arg_data_range - else self.suite_def.arg_data_range[arg.name] - ) - - data_gen_fn = ( - self.suite_def.data_gen - if arg.name not in self.suite_def.arg_data_gen_fn - else self.suite_def.arg_data_gen_fn[arg.name] - ) - - ret_str = f"{data_gen_fn}({init_list_str(data)}, {tensor_dtype}, {data_range[0]}, {data_range[1]})" - if terminate: - ret_str += ";" - - return ret_str - - def create_input_data(self, arg: Argument, data: Any) -> str: # noqa: C901 - ctype = cpp.argumenttype_type(arg.type, mutable=arg.is_write, binds=arg.name) - cpp_type = ctype.cpp_type(strip_ref=True) - - # Short cut exit for TENSORLIST, because it needs multiple lines of - # construction, deviates from the rest. 
- if cpp_type == AT_TENSOR_LIST: - ret_str = f"std::vector<{AT_TENSOR}> tensor_vec;\n" - for elem in data: - ret_str += f"tensor_vec.emplace_back({self.call_data_gen_fn(arg, elem, False)});\n" - ret_str += f"{cpp_type} {arg.name} = tensor_vec;\n" - return ret_str + "\n" - - if cpp_type == AT_INT_ARRAY_REF: - ret_str = f"std::vector {arg.name} = " - elif cpp_type == OPT_AT_DOUBLE_ARRAY_REF and str(data) != "None": - ret_str = f"std::vector {arg.name} = " - elif cpp_type == OPT_AT_INT_ARRAY_REF and str(data) != "None": - ret_str = f"std::vector {arg.name} = " - else: - ret_str = f"{cpp_type} {arg.name} = " - - if cpp_type == AT_TENSOR: - if arg.name == "index" or arg.name == "indices": - args_str = init_list_str(data) - if args_str[:3] == "{{{": - ret_str += f"make_index_tensor_3d({init_list_str(data)});" - elif args_str[:2] == "{{": - ret_str += f"make_index_tensor_2d({init_list_str(data)});" - else: - ret_str += f"make_index_tensor_1d({init_list_str(data)});" - else: - ret_str += self.call_data_gen_fn(arg, data) - elif cpp_type == OPT_AT_TENSOR: - if str(data) == "None": - ret_str += "std::nullopt;" - else: - ret_str += self.call_data_gen_fn(arg, data) - elif cpp_type == AT_SCALAR: - ret_str += f"{data};" - elif cpp_type == AT_INT_ARRAY_REF: - ret_str += f"{init_list_str(data)};" - elif cpp_type == OPT_AT_DOUBLE_ARRAY_REF or cpp_type == OPT_AT_INT_ARRAY_REF: - if str(data) == "None": - ret_str += "std::nullopt;" - else: - ret_str += f"{init_list_str(data)};" - elif cpp_type == BOOL: - ret_str += f"{str(data).lower()};" - elif cpp_type == INT: - ret_str += f"{str(data).lower()};" - elif cpp_type == DOUBLE: - ret_str += f"{str(data).lower()};" - elif cpp_type == OPT_INT64: - if str(data) == "None": - ret_str += "std::nullopt;" - else: - ret_str += f"{str(data)};" - elif cpp_type == STRING or cpp_type == OLD_STRING: - ret_str += f'std::string_view("{data}");' - elif ( - cpp_type == OPT_SCALAR_TYPE - or cpp_type == OPT_LAYOUT - or cpp_type == OPT_DEVICE - or cpp_type == OPT_BOOL - or cpp_type == OPT_MEMORY_FORMAT - ): - ret_str += "std::nullopt;" - else: - raise RuntimeError(f"Unsupported cpp type {cpp_type}") - return ret_str + "\n" - - def gen_create_ref_data(self, inputs: List[Any]) -> str: - ref_code = "" - - for i, binding in enumerate(self.f_sig.arguments()): - arg = binding.argument - arg_data = get_or_return_default(arg, inputs, i) - ref_code += self.create_input_data(arg, arg_data) - - ref_code = re.sub(r"^", " ", ref_code, flags=re.M) - return ref_code - - def gen_create_and_check_out(self, prepack=False) -> str: - test_str = f"check_{self.op_name}(" - if prepack: - test_str = f"prepacked_check_{self.op_name}(" - for binding in self.f_sig.arguments(): - arg = binding.argument - test_str += f"{arg.name}, " - test_str = test_str[:-2] + ");" - test_str = re.sub(r"^", " ", test_str, flags=re.M) - return test_str - - def gen_parameterization(self) -> str: - return "" - - def generate_fixture_cpp(self) -> str: - return test_fixture_template.format(op_name=self.f.func.name) - - def generate_case_cpp(self, inputs, prepack=False) -> str: - return test_suite_template.format( - op_name=f"{self.op_name}", - case_name=self.gen_case_name(inputs, prepack), - create_ref_data=self.gen_create_ref_data(inputs), - create_and_check_out=self.gen_create_and_check_out(prepack), - ) - - def generate_suite_cpp(self) -> str: - suite_cpp = self.generate_fixture_cpp() - for inputs in self.suite_def.input_cases: - if not self.suite_def.requires_prepack: - suite_cpp += self.generate_case_cpp(inputs) - if 
self.suite_def.supports_prepack(): - suite_cpp += self.generate_case_cpp(inputs, prepack=True) - - suite_cpp += self.gen_parameterization() - return suite_cpp - - -########################## -## Test File Generation ## -########################## - -cpp_test_template = """ -#include - -#include - -{preamble} - -at::Tensor make_casted_randint_tensor( - std::vector sizes, - at::ScalarType dtype = at::kFloat, - int low = 0, - int high = 10) {{ - - return at::randint(high, sizes, at::device(at::kCPU).dtype(dtype)); -}} - -at::Tensor make_rand_tensor( - std::vector sizes, - at::ScalarType dtype = at::kFloat, - float low = 0.0, - float high = 1.0) {{ - - if (dtype == at::kChar) - return at::randint(high, sizes, at::device(at::kCPU).dtype(dtype)); - - if (dtype == at::kBool) - return at::rand(sizes, at::device(at::kCPU)) > 0.5; - - if (high == 1.0 && low == 0.0) - return at::rand(sizes, at::device(at::kCPU).dtype(dtype)); - - return at::rand(sizes, at::device(at::kCPU).dtype(dtype)) * (high - low) + low; -}} - -at::Tensor make_zeros_tensor( - std::vector sizes, - at::ScalarType dtype = at::kFloat, - float low = 0.0, - float high = 1.0) {{ - return at::zeros(sizes, at::device(at::kCPU).dtype(dtype)); -}} - -at::Tensor make_ones_tensor( - std::vector sizes, - at::ScalarType dtype = at::kFloat, - float low = 0.0, - float high = 1.0) {{ - return at::ones(sizes, at::device(at::kCPU).dtype(dtype)); -}} - -at::Tensor make_seq_tensor( - std::vector sizes, - at::ScalarType dtype = at::kFloat, - float low = 0.0, - float high = 1.0) {{ - (void)low; - (void)high; - - int64_t n = 1; - for (auto size: sizes) {{ - n *= size; - }} - - std::vector values(n); - for (int i=0;i indices) {{ - at::ScalarType dtype = at::kInt; - std::vector sizes = {{static_cast(indices.size())}}; - - // Clone as original data will be deallocated upon return. - return at::from_blob(indices.data(), sizes, dtype).detach().clone(); -}} - -at::Tensor make_index_tensor_2d(std::vector> indices) {{ - at::ScalarType dtype = at::kInt; - std::vector sizes = {{ - static_cast(indices.size()), - static_cast(indices[0].size())}}; - - // Flatten indices as from_blob reads garbage otherwise. - std::vector acc; - for (auto& vec: indices) {{ - acc.insert(acc.end(), vec.begin(), vec.end()); - }} - - // Clone as original data will be deallocated upon return. - return at::from_blob(acc.data(), sizes, dtype).detach().clone(); -}} - -at::Tensor make_index_tensor_3d(std::vector>> indices) {{ - at::ScalarType dtype = at::kInt; - std::vector sizes = {{ - static_cast(indices.size()), - static_cast(indices[0].size()), - static_cast(indices[0][0].size())}}; - - // Flatten indices as from_blob reads garbage otherwise. - std::vector acc; - for (auto& v: indices) {{ - for (auto& vv: v) {{ - acc.insert(acc.end(), vv.begin(), vv.end()); - }} - }} - - // Clone as original data will be deallocated upon return. 
- return at::from_blob(acc.data(), sizes, dtype).detach().clone(); -}} - -{test_suites_cpp} -""" - - -class CorrectnessTestFileGen: - def __init__(self, out_path): - self.out_path = out_path - self.suites_gens = [] - - def generate_cpp(self) -> str: - return cpp_test_template.format( - preamble=self.generate_preamble(), - test_suites_cpp=self.generate_test_suites_cpp(), - ) - - def generate_preamble(self) -> str: - return "" - - def generate_test_suites_cpp(self) -> str: - return "\n".join([h.generate_suite_cpp() for h in self.suites_gens]) - - def add_suite(self, op_reg_name: str, f: NativeFunction, all_input_cases) -> None: - suites_gen = CorrectnessTestGen(f, all_input_cases) - self.suites_gens.append(suites_gen) diff --git a/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py b/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py deleted file mode 100644 index c368c23c539..00000000000 --- a/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from executorch.backends.vulkan.test.op_tests.utils.gen_computegraph import ( - ComputeGraphGen, -) -from executorch.backends.vulkan.test.op_tests.utils.gen_correctness_base import ( - CorrectnessTestFileGen, - CorrectnessTestGen, -) -from executorch.backends.vulkan.test.op_tests.utils.test_suite import VkTestSuite - -from torchgen.model import NativeFunction - -################################## -## Test Fixture Code Generation ## -################################## - -test_fixture_template = """ -class GeneratedOpsTest_{op_name} : public ::testing::TestWithParam< ::std::tuple> {{ - protected: - ComputeGraph* graph; - at::ScalarType test_dtype = at::kFloat; - float rtol = {rtol}; - float atol = {atol}; - - void SetUp() override {{ - GraphConfig config; - utils::StorageType default_storage_type; - utils::GPUMemoryLayout default_memory_layout; - std::tie(test_dtype, default_storage_type, default_memory_layout) = GetParam(); - config.set_storage_type_override(default_storage_type); - config.set_memory_layout_override(default_memory_layout); - graph = new ComputeGraph(config); - - if (test_dtype == at::kHalf) {{ - rtol = 1e-2; - atol = 1e-2; - }} - }} - - void TearDown() override {{ - delete graph; - graph = nullptr; - }} - - {check_fn} -}}; -""" - - -class VkCorrectnessTestGen(CorrectnessTestGen): - def __init__(self, op_reg_name: str, f: NativeFunction, inputs: VkTestSuite): - super().__init__(f, inputs) - self.op_reg_name = op_reg_name - self.generator = ComputeGraphGen(self.op_reg_name, self.f, self.suite_def) - - def generate_fixture_cpp(self) -> str: - check_fn = "" - if not self.suite_def.requires_prepack: - check_fn = self.generator.gen_op_check_fn() - - prepacked_check_fn = "" - if self.suite_def.supports_prepack(): - self.generator.should_prepack = True - prepacked_check_fn = self.generator.gen_op_check_fn() - check_fn += "\n\n " - check_fn += prepacked_check_fn - - return test_fixture_template.format( - op_name=self.op_name, - check_fn=check_fn, - rtol=self.suite_def.rtol, - atol=self.suite_def.atol, - ) - - def gen_parameterization(self) -> str: - dtypes = self.suite_def.dtypes - storage_types = self.suite_def.storage_types - layouts = self.suite_def.layouts - - return f""" -INSTANTIATE_TEST_SUITE_P( - Combos_{self.op_name}, - GeneratedOpsTest_{self.op_name}, - 
::testing::Combine( - ::testing::Values({', '.join(dtypes)}), - ::testing::Values({', '.join(storage_types)}), - ::testing::Values({', '.join(layouts)}))); - """ - - -############################## -## Test File Code Generation ## -############################### - -preamble_str = """ -#include -#include -#include - -#include - -using namespace vkcompute; -using TensorOptions = at::TensorOptions; - -vkapi::ScalarType from_at_scalartype(c10::ScalarType at_scalartype) { - switch (at_scalartype) { - case c10::kDouble: - return vkapi::kDouble; - case c10::kFloat: - return vkapi::kFloat; - case c10::kHalf: - return vkapi::kHalf; - case c10::kInt: - return vkapi::kInt; - case c10::kLong: - return vkapi::kInt; - case c10::kChar: - return vkapi::kChar; - case c10::kBool: - return vkapi::kBool; - default: - VK_THROW("Unsupported at::ScalarType!"); - } -} - -#ifdef USE_VULKAN_FP16_INFERENCE -bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-2, float atol=1e-2) { -#else -bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-5, float atol=1e-5) { -#endif - // Skip checking index tensors - if (t1.scalar_type() == at::kLong || t2.scalar_type() == at::kLong) { - return true; - } - bool is_close = at::allclose(t1, t2, rtol, atol); - if (!is_close && t1.numel() < 500) { - std::cout << "reference: " << std::endl; - print(t1, 150); - std::cout << std::endl; - std::cout << "vulkan: " << std::endl; - print(t2, 150); - std::cout << std::endl; - } - return is_close; -} -""" - - -class VkCorrectnessTestFileGen(CorrectnessTestFileGen): - def __init__(self, out_path: str): - super().__init__(out_path) - - def generate_preamble(self) -> str: - return preamble_str - - def add_suite(self, op_reg_name: str, f: NativeFunction, all_input_cases) -> None: - suites_gen = VkCorrectnessTestGen(op_reg_name, f, all_input_cases) - self.suites_gens.append(suites_gen) diff --git a/backends/vulkan/test/op_tests/utils/test_suite.py b/backends/vulkan/test/op_tests/utils/test_suite.py deleted file mode 100644 index 427864b0d5d..00000000000 --- a/backends/vulkan/test/op_tests/utils/test_suite.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -from dataclasses import dataclass -from typing import Any, Dict, List, Optional - -################################### -## Generic Test Suite definition ## -################################### - - -class TestSuite: - def __init__(self, input_cases: List[Any]): - self.input_cases: List[Any] = input_cases - self.prepacked_args: List[str] = [] - self.requires_prepack: bool = False - self.dtypes: List[str] = ["at::kFloat", "at::kHalf"] - - self.data_gen: str = "make_rand_tensor" - self.data_range = (0, 1) - - self.arg_dtype = {} - self.arg_data_gen_fn: Dict[str, str] = {} - self.arg_data_range = {} - - self.atol: str = "1e-5" - self.rtol: str = "1e-5" - - self.is_view_op: bool = False - self.test_name_suffix: Optional[str] = None - - def supports_prepack(self): - return len(self.prepacked_args) > 0 - - -################################## -## Vulkan Test Suite Definition ## -################################## - - -@dataclass -class VkTestSuite(TestSuite): - def __init__(self, input_cases: List[Any]): - super().__init__(input_cases) - self.storage_types: List[str] = ["utils::kTexture3D"] - self.layouts: List[str] = ["utils::kChannelsPacked"] - self.data_gen: str = "make_rand_tensor" - self.force_io: bool = True - self.arg_storage_types: Dict[str, str] = {} - self.arg_memory_layouts: Dict[str, str] = {} diff --git a/backends/vulkan/test/scripts/test_model.sh b/backends/vulkan/test/scripts/test_model.sh deleted file mode 100755 index 5f06d2c039b..00000000000 --- a/backends/vulkan/test/scripts/test_model.sh +++ /dev/null @@ -1,180 +0,0 @@ -#!/bin/bash -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -set -exu - -# Initialize variables -RUN_BUILD=false -RUN_CORRECTNESS_TEST=false -RUN_CLEAN=false -RUN_RECOMPILE=false -MODEL_NAME="" -OUTPUT_DIRECTORY="." - -# Parse arguments -SKIP_NEXT=false -for i in $(seq 1 $#); do - if [[ "$SKIP_NEXT" == true ]]; then - SKIP_NEXT=false - continue - fi - - arg="${!i}" - case $arg in - --build|-b) - RUN_BUILD=true - ;; - --clean|-c) - RUN_CLEAN=true - ;; - --recompile|-rc) - RUN_RECOMPILE=true - ;; - --output_directory|-o) - next_i=$((i + 1)) - if [[ $next_i -le $# ]]; then - OUTPUT_DIRECTORY="${!next_i}" - SKIP_NEXT=true - else - echo "Error: --output_directory|-o requires a value" - exit 1 - fi - ;; - --*|-*) - echo "Unknown argument: $arg" - exit 1 - ;; - *) - if [[ -z "$MODEL_NAME" ]]; then - MODEL_NAME="$arg" - else - echo "Multiple model names provided: $MODEL_NAME and $arg" - exit 1 - fi - ;; - esac -done - -# Determine execution mode based on parsed arguments -if [[ "$RUN_BUILD" == true ]] && [[ -z "$MODEL_NAME" ]]; then - # Build-only mode - RUN_CORRECTNESS_TEST=false -elif [[ "$RUN_BUILD" == true ]] && [[ -n "$MODEL_NAME" ]]; then - # Build and test mode - RUN_CORRECTNESS_TEST=true -elif [[ "$RUN_BUILD" == false ]] && [[ -n "$MODEL_NAME" ]]; then - # Test-only mode - RUN_CORRECTNESS_TEST=true -else - echo "Invalid argument combination. 
Usage:" - echo " $0 --build|-b [--clean|-c] [--recompile|-rc] [-o|--output_directory DIR] # Build-only mode" - echo " $0 model_name [--build|-b] [--clean|-c] [--recompile|-rc] [-o|--output_directory DIR] # Test mode or build+test mode" - exit 1 -fi - -if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then - PYTHON_EXECUTABLE=python3 -fi -which "${PYTHON_EXECUTABLE}" - -CMAKE_OUTPUT_DIR=cmake-out - -# Only set EXPORTED_MODEL if running correctness test -if [[ "${RUN_CORRECTNESS_TEST}" == true ]]; then - EXPORTED_MODEL=${MODEL_NAME}_vulkan -fi - - -clean_build_directory() { - echo "Cleaning build directory: ${CMAKE_OUTPUT_DIR}" - rm -rf ${CMAKE_OUTPUT_DIR} -} - -recompile() { - cmake --build cmake-out -j64 --target install -} - -build_core_libraries_and_devtools() { - echo "Building core libraries and devtools with comprehensive Vulkan support..." - - # Build core libraries with all required components - cmake . \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM_AOT=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ - -DEXECUTORCH_BUILD_DEVTOOLS=ON \ - -DEXECUTORCH_BUILD_VULKAN=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -Bcmake-out && \ - cmake --build cmake-out -j64 --target install - - # Build devtools example runner - cmake examples/devtools \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ - -DEXECUTORCH_BUILD_VULKAN=ON \ - -Bcmake-out/examples/devtools && \ - cmake --build cmake-out/examples/devtools -j16 --config Release -} - -run_example_runner() { - ./${CMAKE_OUTPUT_DIR}/examples/devtools/example_runner -bundled_program_path "${OUTPUT_DIRECTORY}/${EXPORTED_MODEL}.bpte" -output_verification -} - -test_bundled_model_with_vulkan() { - # Export model as bundled program with Vulkan backend - "${PYTHON_EXECUTABLE}" -m examples.vulkan.export --model_name="${MODEL_NAME}" --output_dir="${OUTPUT_DIRECTORY}" --bundled - - # Update exported model name for bundled program - EXPORTED_MODEL="${MODEL_NAME}_vulkan" - - # Verify the exported bundled model exists - if [[ ! -f "${OUTPUT_DIRECTORY}/${EXPORTED_MODEL}.bpte" ]]; then - echo "Error: Failed to export bundled model ${MODEL_NAME} with Vulkan backend" - exit 1 - fi - - # Note: Running bundled programs may require different executor runner - echo "Bundled program created successfully. Use appropriate bundled program runner to test." - - run_example_runner -} - - -# Main execution -if [[ "${RUN_BUILD}" == true ]]; then - if [[ "${RUN_CLEAN}" == true ]]; then - clean_build_directory - fi - build_core_libraries_and_devtools -fi - -if [[ "${RUN_RECOMPILE}" == true ]]; then - recompile -fi - -if [[ "${RUN_CORRECTNESS_TEST}" == true ]]; then - echo "Testing ${MODEL_NAME} with Vulkan backend..." - # Always use bundled program testing - test_bundled_model_with_vulkan - - # Check if test completed successfully - if [[ $? -eq 0 ]]; then - echo "Vulkan model test completed successfully!" - else - echo "Vulkan model test failed!" 
- exit 1 - fi -fi diff --git a/backends/vulkan/test/scripts/test_op.sh b/backends/vulkan/test/scripts/test_op.sh deleted file mode 100755 index 36920cb73cc..00000000000 --- a/backends/vulkan/test/scripts/test_op.sh +++ /dev/null @@ -1,258 +0,0 @@ -#!/bin/bash -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -set -exu - -# Initialize variables -RUN_BUILD=false -RUN_CLEAN=false -RUN_CLEAN_TESTS=false -RUN_RECOMPILE=false -RUN_TESTS=false -TEST_BINARY="" -ATEN_OP="" - -# Parse arguments -SKIP_NEXT=false -if [[ $# -eq 0 ]]; then - # No arguments provided - run default test - TEST_BINARY="vulkan_op_correctness_tests" - RUN_TESTS=true -else - for i in $(seq 1 $#); do - if [[ "$SKIP_NEXT" == true ]]; then - SKIP_NEXT=false - continue - fi - - arg="${!i}" - case $arg in - --build|-b) - RUN_BUILD=true - ;; - --clean|-c) - RUN_CLEAN=true - RUN_BUILD=true - ;; - --clean_tests|-ct) - RUN_CLEAN_TESTS=true - ;; - --recompile|-rc) - RUN_RECOMPILE=true - ;; - --test|-t) - RUN_TESTS=true - ;; - --aten) - next_i=$((i + 1)) - if [[ $next_i -le $# ]]; then - ATEN_OP="${!next_i}" - TEST_BINARY="vulkan_op_correctness_tests" - RUN_TESTS=true - SKIP_NEXT=true - else - echo "Error: --aten requires an operator name" - exit 1 - fi - ;; - --*|-*) - echo "Unknown argument: $arg" - exit 1 - ;; - *) - if [[ -z "$TEST_BINARY" ]]; then - TEST_BINARY="$arg" - RUN_TESTS=true - else - echo "Multiple test binaries provided: $TEST_BINARY and $arg" - exit 1 - fi - ;; - esac - done -fi - -# Determine execution mode based on parsed arguments -if [[ "$RUN_BUILD" == true ]] && [[ -z "$TEST_BINARY" ]] && [[ "$RUN_TESTS" == false ]]; then - # Build-only mode - echo "Build-only mode" -elif [[ "$RUN_BUILD" == true ]] && [[ -n "$TEST_BINARY" ]]; then - # Build and test mode - echo "Build and test mode for: $TEST_BINARY" -elif [[ "$RUN_BUILD" == false ]] && [[ -n "$TEST_BINARY" ]]; then - # Test-only mode - echo "Test-only mode for: $TEST_BINARY" -elif [[ "$RUN_TESTS" == true ]] && [[ -z "$TEST_BINARY" ]]; then - # Run all available tests - echo "Running all available operator tests" -elif [[ $# -eq 0 ]]; then - # No arguments provided - run default test - TEST_BINARY="vulkan_op_correctness_tests" - RUN_TESTS=true - echo "No arguments provided, running default test: $TEST_BINARY" -else - echo "Invalid argument combination. 
Usage:" - echo " $0 # Run default vulkan_op_correctness_tests" - echo " $0 --build|-b [--clean|-c] [--clean_tests|-ct] [--recompile|-rc] # Build-only mode" - echo " $0 [test_binary_name] [--build|-b] [--clean|-c] [--clean_tests|-ct] [--recompile|-rc] # Test mode or build+test mode" - echo " $0 --test|-t [--build|-b] [--clean|-c] [--clean_tests|-ct] [--recompile|-rc] # Run all tests mode" - echo " $0 --aten [--build|-b] [--clean|-c] [--clean_tests|-ct] [--recompile|-rc] # Run specific ATen operator test" - echo " $0 --clean_tests|-ct # Clean and rebuild only operator tests" - echo "" - echo "Available test binaries:" - echo " - vulkan_op_correctness_tests" - echo " - vulkan_op_benchmarks" - echo " - compute_graph_op_tests" - echo " - sdpa_test" - exit 1 -fi - -if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then - PYTHON_EXECUTABLE=python3 -fi -which "${PYTHON_EXECUTABLE}" - -CMAKE_OUTPUT_DIR=cmake-out - -clean_build_directory() { - echo "Cleaning build directory: ${CMAKE_OUTPUT_DIR}" - rm -rf ${CMAKE_OUTPUT_DIR} -} - -clean_test_directory() { - echo "Cleaning test build directory: ${CMAKE_OUTPUT_DIR}/backends/vulkan/test/op_tests" - rm -rf ${CMAKE_OUTPUT_DIR}/backends/vulkan/test/op_tests -} - -build_core_libraries() { - cmake . \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ - -DEXECUTORCH_BUILD_KERNELS_CUSTOM_AOT=ON \ - -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ - -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ - -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ - -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ - -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ - -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ - -DEXECUTORCH_BUILD_DEVTOOLS=ON \ - -DEXECUTORCH_BUILD_VULKAN=ON \ - -DEXECUTORCH_BUILD_XNNPACK=ON \ - -DEXECUTORCH_BUILD_TESTS=ON \ - -Bcmake-out && \ - cmake --build cmake-out -j64 --target install -} - -build_operator_tests() { - echo "Building Vulkan operator tests..." - - # Check if TORCH_OPS_YAML_PATH is set, if not use default - if [[ -z "${TORCH_OPS_YAML_PATH:-}" ]]; then - TORCH_OPS_YAML_PATH="$HOME/Github/pytorch/aten/src/ATen/native" - echo "Using default TORCH_OPS_YAML_PATH: $TORCH_OPS_YAML_PATH" - fi - - # Verify that TORCH_OPS_YAML_PATH exists - if [[ ! -d "$TORCH_OPS_YAML_PATH" ]]; then - echo "Error: TORCH_OPS_YAML_PATH directory does not exist: $TORCH_OPS_YAML_PATH" - echo "Please set TORCH_OPS_YAML_PATH to a valid PyTorch native operations directory" - echo "Example: export TORCH_OPS_YAML_PATH=/path/to/pytorch/aten/src/ATen/native" - exit 1 - fi - - # Verify required YAML files exist - if [[ ! -f "$TORCH_OPS_YAML_PATH/native_functions.yaml" ]]; then - echo "Error: Required file not found: $TORCH_OPS_YAML_PATH/native_functions.yaml" - exit 1 - fi - - if [[ ! -f "$TORCH_OPS_YAML_PATH/tags.yaml" ]]; then - echo "Error: Required file not found: $TORCH_OPS_YAML_PATH/tags.yaml" - exit 1 - fi - - echo "Using TORCH_OPS_YAML_PATH: $TORCH_OPS_YAML_PATH" - - # Build operator tests - cmake backends/vulkan/test/op_tests \ - -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ - -DTORCH_OPS_YAML_PATH="$TORCH_OPS_YAML_PATH" \ - -DCMAKE_CXX_STANDARD=17 \ - -Bcmake-out/backends/vulkan/test/op_tests && \ - cmake --build cmake-out/backends/vulkan/test/op_tests -j16 -} - -recompile() { - echo "Recompiling..." 
- cmake --build cmake-out -j64 --target install - cmake --build cmake-out/backends/vulkan/test/op_tests -j16 -} - -run_operator_test() { - local test_name="$1" - local test_binary_path="" - - case "$test_name" in - "aten") - test_binary_path="${CMAKE_OUTPUT_DIR}/backends/vulkan/test/op_tests/vulkan_op_correctness_tests" - ;; - *) - # Try to find the binary directly - test_binary_path="${CMAKE_OUTPUT_DIR}/backends/vulkan/test/op_tests/${test_name}" - ;; - esac - - if [[ -f "$test_binary_path" ]]; then - echo "Running test binary: $test_binary_path" - - # Add gtest filter if ATEN_OP is specified - if [[ -n "$ATEN_OP" ]]; then - echo "Filtering tests for ATen operator: $ATEN_OP" - "$test_binary_path" --gtest_filter="*${ATEN_OP}*" - else - "$test_binary_path" - fi - else - echo "Error: Test binary not found at $test_binary_path" - echo "Available binaries in ${CMAKE_OUTPUT_DIR}/backends/vulkan/test/op_tests/:" - ls -la "${CMAKE_OUTPUT_DIR}/backends/vulkan/test/op_tests/" 2>/dev/null || echo "Directory not found" - exit 1 - fi -} - -# Main execution -if [[ "${RUN_CLEAN_TESTS}" == true ]]; then - clean_test_directory - build_operator_tests -fi - -if [[ "${RUN_BUILD}" == true ]]; then - if [[ "${RUN_CLEAN}" == true ]]; then - clean_build_directory - fi - build_core_libraries - build_operator_tests -fi - -if [[ "${RUN_RECOMPILE}" == true ]]; then - recompile -fi - -if [[ "${RUN_TESTS}" == true ]]; then - run_operator_test "$TEST_BINARY" - - # Check if tests completed successfully - if [[ $? -eq 0 ]]; then - echo "Vulkan operator tests completed successfully!" - else - echo "Some Vulkan operator tests failed!" - exit 1 - fi -fi diff --git a/backends/vulkan/test/test_serialization.py b/backends/vulkan/test/test_serialization.py deleted file mode 100644 index c373f5216d2..00000000000 --- a/backends/vulkan/test/test_serialization.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# pyre-strict -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import ctypes -import random -import unittest -from typing import List - -import torch - -from executorch.backends.vulkan.serialization.vulkan_graph_schema import ( - IntList, - OperatorCall, - String, - VkGraph, - VkValue, -) - -from executorch.backends.vulkan.serialization.vulkan_graph_serialize import ( - convert_to_flatbuffer, - flatbuffer_to_vk_graph, - serialize_vulkan_graph, - VulkanDelegateHeader, -) - - -class TestSerialization(unittest.TestCase): - def _generate_random_const_tensors(self, num_tensors: int) -> List[torch.Tensor]: - """ - Helper function to generate `num_tensor` buffers of random sizes and random contents, - we return a tuple of (list_of_buffers, list_of_mem_sizes), - """ - tensors = [] - for _ in range(num_tensors): - width = random.randint(4, 100) - height = random.randint(4, 100) - channels = random.randint(2, 8) - - tensor = torch.randn(channels, width, height) - tensors.append(tensor) - - return tensors - - def test_serialize_vulkan_binary(self) -> None: - vk_graph = VkGraph( - version="0", - chain=[], - values=[], - input_ids=[], - output_ids=[], - constants=[], - shaders=[], - ) - const_tensors = self._generate_random_const_tensors(5) - - serialized_binary = serialize_vulkan_graph(vk_graph, const_tensors, []) - - # Check header - self.assertEqual(serialized_binary[0:4], b"\x00\x00\x00\x00") - self.assertEqual(serialized_binary[VulkanDelegateHeader.MAGIC_IX], b"VH00") - flatbuffer_offset = int.from_bytes( - serialized_binary[VulkanDelegateHeader.FLATBUFFER_OFFSET_IX], - byteorder="little", - ) - constants_offset = int.from_bytes( - serialized_binary[VulkanDelegateHeader.BYTES_OFFSET_IX], - byteorder="little", - ) - constants_size = int.from_bytes( - serialized_binary[VulkanDelegateHeader.BYTES_SIZE_IX], - byteorder="little", - ) - - # Flatbuffer magic should be in the same spot as the Header's magic - self.assertEqual( - serialized_binary[flatbuffer_offset:][VulkanDelegateHeader.MAGIC_IX], - b"VK00", - ) - - constant_data_payload = serialized_binary[ - constants_offset : constants_offset + constants_size - ] - - # We check that constant data indexes stored in the vk_graph correctly index - # into the correct buffer in the constant data section - self.assertEqual(len(vk_graph.constants), len(const_tensors)) - for bytes_range, tensor in zip(vk_graph.constants, const_tensors): - offset = bytes_range.offset - length = bytes_range.length - - constant_data_bytes = constant_data_payload[offset : offset + length] - - array_type = ctypes.c_char * tensor.untyped_storage().nbytes() - array = ctypes.cast( - tensor.untyped_storage().data_ptr(), - ctypes.POINTER(array_type), - ).contents - - tensor_bytes = bytes(array) - self.assertEqual(constant_data_bytes, tensor_bytes) - - def test_serialize_deserialize_vkgraph(self) -> None: - in_vk_graph = VkGraph( - version="1", - chain=[ - OperatorCall(node_id=1, name="foo", args=[1, 2, 3]), - OperatorCall(node_id=2, name="bar", args=[]), - ], - values=[ - VkValue( - value=String( - string_val="abc", - ), - ), - VkValue( - value=IntList( - items=[-1, -4, 2], - ), - ), - ], - input_ids=[], - output_ids=[], - constants=[], - shaders=[], - ) - - bs = convert_to_flatbuffer(in_vk_graph) - out_vk_graph = flatbuffer_to_vk_graph(bs) - - self.assertEqual(in_vk_graph, out_vk_graph) diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py deleted file mode 100644 index 00a357b0b67..00000000000 --- a/backends/vulkan/test/test_vulkan_delegate.py +++ /dev/null @@ -1,2652 +0,0 @@ -# Copyright (c) 
Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-unsafe - -import ctypes -import unittest -from typing import Tuple - -import executorch.backends.vulkan.test.utils as test_utils - -import torch - -from executorch.backends.transforms.convert_dtype_pass import I64toI32 - -from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner - -from executorch.backends.vulkan.vulkan_preprocess import VulkanBackend - -from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import ( - get_symmetric_quantization_config, - XNNPACKQuantizer, -) - -from executorch.exir import ( - EdgeCompileConfig, - EdgeProgramManager, - ExecutorchProgramManager, - to_edge_transform_and_lower, -) -from executorch.extension.pybindings.portable_lib import ( # @manual - _load_for_executorch_from_buffer, -) -from executorch.extension.pytree import tree_flatten -from torch.export import Dim, export, ExportedProgram - -from torchao.quantization.granularity import PerGroup - -from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e - -from torchao.quantization.pt2e.quantizer import Quantizer -from torchao.quantization.quant_api import IntxWeightOnlyConfig, quantize_ -from torchao.utils import unwrap_tensor_subclass - -try: - ctypes.CDLL("libvulkan.so.1") -except: - pass - - -def lower_module( - model: torch.nn.Module, sample_inputs: Tuple[torch.Tensor], dynamic_shapes=None -) -> EdgeProgramManager: - compile_options = {} - if dynamic_shapes is not None: - compile_options["require_dynamic_shapes"] = True - - edge_compile_config = EdgeCompileConfig( - _skip_dim_order=False, # TODO(T182928844): Delegate dim order op to backend. - ) - - program: ExportedProgram = export( - model, sample_inputs, dynamic_shapes=dynamic_shapes, strict=True - ) - - edge_program = to_edge_transform_and_lower( - program, - compile_config=edge_compile_config, - transform_passes=[ - I64toI32(edge_compile_config._skip_dim_order), - ], - partitioner=[VulkanPartitioner(compile_options)], - ) - - return edge_program - - -def quantize_and_lower_module( - model: torch.nn.Module, - sample_inputs: Tuple[torch.Tensor], - quantizer: Quantizer, - dynamic_shapes=None, -) -> EdgeProgramManager: - compile_options = {} - if dynamic_shapes is not None: - compile_options["require_dynamic_shapes"] = True - - edge_compile_config = EdgeCompileConfig( - _skip_dim_order=False, # TODO(T182928844): Delegate dim order op to backend. - ) - - program = export( - model, sample_inputs, dynamic_shapes=dynamic_shapes, strict=True - ).module() - - program = prepare_pt2e(program, quantizer) - # Calibrate - program(*sample_inputs) - - program = convert_pt2e(program) - - program = export(program, sample_inputs, dynamic_shapes=dynamic_shapes) - - edge_program = to_edge_transform_and_lower( - program, - compile_config=edge_compile_config, - transform_passes=[ - I64toI32(edge_compile_config._skip_dim_order), - ], - partitioner=[VulkanPartitioner(compile_options)], - ) - - return edge_program - - -class TestVulkanBackend(unittest.TestCase): - def assert_outputs_equal( - self, - model_output, - ref_output, - atol=1e-03, - rtol=1e-03, - first_output_only=False, - equal_nan=True, - ): - """ - Helper testing function that asserts that the model output and the reference output - are equal with some tolerance. 
Due to numerical differences between eager mode and - the Vulkan backend, we relax the tolerances such that the default absolute - tolerance is 1e-3 and the default relative tolerance is 1e-3. - """ - - # Compare the result from the executor and eager mode directly - if isinstance(ref_output, tuple) or isinstance(ref_output, list): - # Multiple outputs: the executor always returns a tuple, even if there is only one output - self.assertTrue(len(ref_output) == len(model_output)) - if first_output_only: - result = torch.allclose( - model_output[0], - ref_output[0], - atol=atol, - rtol=rtol, - equal_nan=equal_nan, - ) - if not result: - test_utils.print_tensor_comparison_errors( - model_output[0], ref_output[0], atol, rtol - ) - self.assertTrue(result) - else: - for i in range(len(ref_output)): - result = torch.allclose( - model_output[i], - ref_output[i], - atol=atol, - rtol=rtol, - equal_nan=equal_nan, - ) - if not result: - print(f"\n=== Output {i} comparison failed ===") - test_utils.print_tensor_comparison_errors( - model_output[i], ref_output[i], atol, rtol - ) - self.assertTrue(result) - else: - # If there is one output, eager mode returns a tensor while the executor returns a tuple of size 1 - result = torch.allclose( - model_output[0], - ref_output, - atol=atol, - rtol=rtol, - equal_nan=equal_nan, - ) - if not result: - test_utils.print_tensor_comparison_errors( - model_output[0], ref_output, atol, rtol - ) - self.assertTrue(result) - - def check_no_delegation(self, et_program: ExecutorchProgramManager): - self.assertEqual( - len(et_program.executorch_program.execution_plan[0].delegates), - 0, - ) - return - - def check_vk_delegation(self, et_program: ExecutorchProgramManager): - self.assertEqual( - et_program.executorch_program.execution_plan[0].delegates[0].id, - VulkanBackend.__name__, - ) - - def run_delegated_model_and_check_output( - self, - et_program: ExecutorchProgramManager, - model: torch.nn.Module, - sample_inputs: Tuple[torch.Tensor], - atol=1e-03, - rtol=1e-01, - test_inputs=None, - first_output_only=False, - ): - executorch_module = _load_for_executorch_from_buffer(et_program.buffer) - inputs_flattened, _ = tree_flatten(sample_inputs) - - model_output = executorch_module.run_method("forward", tuple(inputs_flattened)) - ref_output = model(*sample_inputs) - - self.assert_outputs_equal( - model_output, - ref_output, - atol=atol, - rtol=rtol, - first_output_only=first_output_only, - ) - - if test_inputs is not None: - for test_input in test_inputs: - test_inputs_flattened, _ = tree_flatten(test_input) - model_output = executorch_module.run_method( - "forward", tuple(test_inputs_flattened) - ) - ref_output = model(*test_input) - - self.assert_outputs_equal( - model_output, - ref_output, - atol=atol, - rtol=rtol, - first_output_only=first_output_only, - ) - - def lower_module_and_test_output( - self, - model: torch.nn.Module, - sample_inputs: Tuple[torch.Tensor], - atol=1e-03, - rtol=1e-01, - dynamic_shapes=None, - test_inputs=None, - first_output_only=False, - expect_no_delegates=False, - ): - """ - Helper testing function that takes a torch.nn.Module and lowers it to Vulkan with - the given sample inputs. It then runs the lowered module and compares its - outputs with the outputs of the eager module.
- """ - - # Validate that the model can execute in eager mode - model.eval() - model(*sample_inputs) - - edge_program = lower_module(model, sample_inputs, dynamic_shapes=dynamic_shapes) - - et_program = edge_program.to_executorch() - - if expect_no_delegates: - self.check_no_delegation(et_program) - return - - self.check_vk_delegation(et_program) - - self.run_delegated_model_and_check_output( - et_program, - model, - sample_inputs, - atol, - rtol, - test_inputs=test_inputs, - first_output_only=first_output_only, - ) - - def test_vulkan_backend_add(self): - # This test is the simplest test by manually lowering some submodules, we can use paritioner - # for auto detecting lowerable parts. - class AddModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y, w): - z = x + y - z = z + x - z = z + x - z = z + w - z = w + z - z = z + 3 # test scalar broadcasting - return z - - add_module = AddModule() - sample_inputs = ( - torch.rand(size=(2, 3), dtype=torch.float32), - torch.rand(size=(2, 3), dtype=torch.float32), - torch.rand(size=(2, 1), dtype=torch.float32), # test broadcasting - ) - - self.lower_module_and_test_output(add_module, sample_inputs) - - sample_inputs = ( - torch.rand(size=(4, 5, 2, 3), dtype=torch.float32), - torch.rand(size=(4, 5, 2, 3), dtype=torch.float32), - torch.rand( - size=(2, 3), dtype=torch.float32 - ), # test broadcasting on packed dim - ) - - self.lower_module_and_test_output(add_module, sample_inputs) - - def test_vulkan_backend_add_int(self): - class AddIntModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - z = x + y - return z - - add_int_module = AddIntModule() - sample_inputs = ( - torch.randint(low=-100, high=100, size=(2, 3), dtype=torch.int32), - torch.randint(low=-100, high=100, size=(2, 3), dtype=torch.int32), - ) - - self.lower_module_and_test_output(add_int_module, sample_inputs) - - def test_vulkan_backend_zero_dim_tensor(self): - class ZeroDimModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.zero = torch.full([], 1.3, dtype=torch.float32) - - def forward(self, x): - return x + self.zero - - internal_data_module = ZeroDimModule() - sample_inputs = (torch.rand(size=(2, 3), dtype=torch.float32),) - self.lower_module_and_test_output(internal_data_module, sample_inputs) - - def test_vulkan_backend_internal_data(self): - class InternalDataModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.weight = torch.rand(size=(2, 3), dtype=torch.float32) - - def forward(self, x, y): - inter1 = torch.add(x, y, alpha=2) - inter2 = torch.add(x, y, alpha=3.14) - inter3 = inter1 * self.weight - inter4 = inter2 * self.weight - return inter4 - inter3 - - internal_data_module = InternalDataModule() - sample_inputs = ( - torch.rand(size=(2, 3), dtype=torch.float32), - torch.rand(size=(2, 3), dtype=torch.float32), - ) - - self.lower_module_and_test_output(internal_data_module, sample_inputs) - - def test_vulkan_backend_sub(self): - class SubModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - z = torch.sub(x, y, alpha=2) - z = torch.sub(z, x, alpha=3.14) - z = z - x - return z - - sub_module = SubModule() - sample_inputs = ( - torch.rand(size=(2, 3), dtype=torch.float32), - torch.rand(size=(2, 3), dtype=torch.float32), - ) - - self.lower_module_and_test_output(sub_module, sample_inputs) - - def test_vulkan_backend_mul(self): - class MulModule(torch.nn.Module): - def __init__(self): - super().__init__() - - 
def forward(self, x, y): - z = x * y - z = z * x - z = z * x - return z - - mul_module = MulModule() - sample_inputs = ( - torch.rand(size=(2, 3), dtype=torch.float32), - torch.rand(size=(2, 3), dtype=torch.float32), - ) - - self.lower_module_and_test_output(mul_module, sample_inputs) - - def test_vulkan_backend_div(self): - class DivModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - z = x / y - z = z / x - z = z / x - return z - - div_module = DivModule() - sample_inputs = ( - torch.rand(size=(2, 3), dtype=torch.float32), - torch.rand(size=(2, 3), dtype=torch.float32), - ) - - self.lower_module_and_test_output(div_module, sample_inputs) - - def test_vulkan_backend_arithmetic(self): - class ArithmeticModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.weight = torch.rand(size=(2, 3), dtype=torch.float32) - - def forward(self, x, y): - z = x + y - z = z - x - z = z / x - z = z * self.weight - return z - - arithmetic_module = ArithmeticModule() - sample_inputs = ( - torch.rand(size=(2, 3), dtype=torch.float32), - torch.rand(size=(2, 3), dtype=torch.float32), - ) - - self.lower_module_and_test_output(arithmetic_module, sample_inputs) - - def test_vulkan_backend_floor_div(self): - class FloorDivModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - z = x // y - return z - - floor_div_module = FloorDivModule() - sample_inputs = ( - torch.rand(size=(2, 3), dtype=torch.float32) * 10.0, - torch.rand(size=(2, 3), dtype=torch.float32) + 1.0, - ) - - # absolute tolerance is 1 because of flooring - self.lower_module_and_test_output( - floor_div_module, sample_inputs, atol=1.0 + 1e-03 - ) - - def test_vulkan_backend_pow(self): - class PowModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - z = torch.pow(x, y) - return z - - pow_module = PowModule() - sample_inputs = ( - torch.rand(size=(2, 3), dtype=torch.float32), - torch.rand(size=(2, 3), dtype=torch.float32), - ) - - self.lower_module_and_test_output(pow_module, sample_inputs) - - def lower_unary_module_and_test_output(self, module): - batch = Dim("batch", max=8) - sample_inputs = (torch.randn(8, 16, 96, 92),) - - dynamic_shapes = {"x": {0: batch}} - test_inputs = [ - (torch.randn(3, 14, 15, 92),), - (torch.randn(6, 5, 35, 89),), - (torch.randn(7, 9, 32, 38),), - ] - - self.lower_module_and_test_output( - module, - sample_inputs, - dynamic_shapes=dynamic_shapes, - test_inputs=test_inputs, - ) - - def test_vulkan_backend_clamp(self): - class ClampModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.clamp(x, min=-3.14) - - self.lower_unary_module_and_test_output(ClampModule()) - - def test_vulkan_backend_clamp_int(self): - class ClampModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.clamp(x, min=-3) - - sample_inputs = ( - torch.randint(low=-100, high=100, size=(5, 5), dtype=torch.int32), - ) - - self.lower_module_and_test_output(ClampModule(), sample_inputs) - - def test_vulkan_backend_clamp_int64(self): - class ClampModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.clamp(x, min=-3) - - sample_inputs = ( - torch.randint(low=-100, high=100, size=(5, 5), dtype=torch.int64), - ) - - self.lower_module_and_test_output(ClampModule(), sample_inputs) - - def test_vulkan_backend_cos(self): - class CosModule(torch.nn.Module): - def 
__init__(self): - super().__init__() - - def forward(self, x): - return torch.cos(x) - - self.lower_unary_module_and_test_output(CosModule()) - - def test_vulkan_backend_hardtanh(self): - class HardTanHModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.tanh = torch.nn.Hardtanh(min_val=-3.14, max_val=6.28) - - def forward(self, x): - return self.tanh(x) - - self.lower_unary_module_and_test_output(HardTanHModule()) - - def test_vulkan_backend_exp(self): - class ExpModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.exp(x) - - self.lower_unary_module_and_test_output(ExpModule()) - - def test_vulkan_backend_neg(self): - class NegModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.neg(x) - - self.lower_unary_module_and_test_output(NegModule()) - - def test_vulkan_backend_sin(self): - class SinModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.sin(x) - - self.lower_unary_module_and_test_output(SinModule()) - - def test_vulkan_backend_relu(self): - class ReLUModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.relu(x) - - self.lower_unary_module_and_test_output(ReLUModule()) - - def test_vulkan_backend_sqrt(self): - class SqrtModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.sqrt(x) - - self.lower_unary_module_and_test_output(SqrtModule()) - - def test_vulkan_backend_hardshrink(self): - class HardshrinkModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.hardshrink = torch.nn.Hardshrink(lambd=0.3) - - def forward(self, x): - return self.hardshrink(x) - - self.lower_unary_module_and_test_output(HardshrinkModule()) - - def test_vulkan_backend_max_pool2d(self): - class MaxPool2dModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.max_pool = torch.nn.MaxPool2d( - kernel_size=(2, 3), - stride=(1, 1), - padding=0, - dilation=1, - ceil_mode=False, - return_indices=True, - ) - - def forward(self, x): - return self.max_pool(x) - - max_pool2d_module = MaxPool2dModule() - sample_inputs = (torch.randn(5, 13, 55, 68),) - - batch = Dim("batch", max=8) - dynamic_shapes = {"x": {0: batch}} - test_inputs = [ - (torch.randn(3, 14, 15, 9),), - (torch.randn(1, 1, 4, 6),), - (torch.randn(5, 10, 50, 40),), - ] - self.lower_module_and_test_output( - max_pool2d_module, - sample_inputs, - dynamic_shapes=dynamic_shapes, - test_inputs=test_inputs, - first_output_only=True, - ) - - def test_vulkan_backend_avg_pool2d(self): - class AvgPool2dModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.avg_pool = torch.nn.AvgPool2d( - kernel_size=(4, 4), - stride=(4, 4), - padding=(0, 0), - ceil_mode=True, - count_include_pad=True, - divisor_override=None, - ) - - def forward(self, x): - return self.avg_pool(x) - - avg_pool2d_module = AvgPool2dModule() - sample_inputs = (torch.randn(5, 13, 55, 68),) - - batch = Dim("batch", max=8) - dynamic_shapes = {"x": {0: batch}} - test_inputs = [ - (torch.randn(3, 14, 15, 9),), - (torch.randn(1, 1, 4, 6),), - (torch.randn(5, 10, 50, 40),), - ] - self.lower_module_and_test_output( - avg_pool2d_module, - sample_inputs, - dynamic_shapes=dynamic_shapes, - test_inputs=test_inputs, - ) - - def test_vulkan_backend_abs(self): - class AbsModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - 
return torch.abs(x) - - self.lower_unary_module_and_test_output(AbsModule()) - - def test_vulkan_backend_sigmoid(self): - class SigmoidModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.sigmoid(x) - - self.lower_unary_module_and_test_output(SigmoidModule()) - - def test_vulkan_backend_tanh(self): - class TanhModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.tanh(x) - - self.lower_unary_module_and_test_output(TanhModule()) - - def test_vulkan_backend_linear(self): - class LinearModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(128, 64, bias=False) - - def forward(self, x): - return self.linear(x) - - module = LinearModule() - sample_inputs = (torch.rand(size=(32, 128), dtype=torch.float32),) - batch = Dim("batch", max=32) - dynamic_shapes = {"x": {0: batch}} - - test_inputs = [ - (torch.rand(15, 128),), - (torch.rand(6, 128),), - (torch.rand(30, 128),), - (torch.rand(20, 128),), - (torch.rand(19, 128),), - ] - - self.lower_module_and_test_output( - module, - sample_inputs, - dynamic_shapes=dynamic_shapes, - test_inputs=test_inputs, - ) - - def test_vulkan_backend_partial(self): - class SimpleModel(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(10, 10) - self.offset_1 = torch.rand(size=(2, 10), dtype=torch.float32) - self.offset_2 = torch.rand(size=(2, 10), dtype=torch.float32) - - def forward(self, x): - return self.linear(x + self.offset_1) - self.offset_2 - - model = SimpleModel() - sample_inputs = (torch.rand(size=(2, 10), dtype=torch.float32),) - - self.lower_module_and_test_output(model, sample_inputs) - - @unittest.skip( - "Currently this test is failing due to weird partitioning because the eq scalar" - "operator is not supported yet. Re-enable when the operator is supported." 
- ) - def test_vulkan_backend_partial_dynamic_shapes(self): - class SimpleModel(torch.nn.Module): - def __init__(self): - super().__init__() - self.branch1 = torch.nn.Sequential( - torch.nn.Linear(64, 64), torch.nn.ReLU() - ) - self.branch2 = torch.nn.Sequential( - torch.nn.Linear(128, 64), torch.nn.ReLU() - ) - self.buffer_1 = torch.ones((1, 64)) * 0.5 - self.buffer_2 = torch.ones((1, 64)) * 1.4 - - def forward(self, x1, x2): - out1 = self.branch1(x1) - out2 = self.branch2(x2) - return (out1 + self.buffer_1 + out2) * self.buffer_2 - - model = SimpleModel() - sample_inputs = (torch.randn(32, 64), torch.randn(32, 128)) - batch = Dim("batch", max=32) - dynamic_shapes = {"x1": {0: batch}, "x2": {0: batch}} - - test_inputs = [ - (torch.randn(15, 64), torch.randn(15, 128)), - (torch.randn(6, 64), torch.randn(6, 128)), - (torch.randn(30, 64), torch.randn(30, 128)), - (torch.randn(20, 64), torch.randn(20, 128)), - (torch.randn(19, 64), torch.randn(19, 128)), - ] - - self.lower_module_and_test_output( - model, sample_inputs, dynamic_shapes=dynamic_shapes, test_inputs=test_inputs - ) - - def test_vulkan_backend_matmul(self): - class MatMulModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.weight = torch.ones(size=(63, 22), dtype=torch.float32) - - def forward(self, x): - return torch.matmul(x, self.weight) - - module = MatMulModule() - sample_inputs = (torch.ones(size=(31, 63), dtype=torch.float32),) - - self.lower_module_and_test_output(module, sample_inputs) - - def test_vulkan_backend_bmm(self): - class BMMModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.weight = torch.randn(size=(4, 4, 5), dtype=torch.float32) - - def forward(self, x): - return torch.bmm(x, self.weight) - - module = BMMModule() - sample_inputs = (torch.randn(size=(4, 3, 4), dtype=torch.float32),) - - self.lower_module_and_test_output(module, sample_inputs) - - @unittest.skip( - "Reduce shader does not support multiple reduction axes at the moment" - ) - def test_vulkan_backend_sum_dim_list(self): - class SumModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - x = torch.sum(x, (0, -1), keepdim=True) - x = torch.sum(x, 2, keepdim=False) - return x - - module = SumModule() - sample_inputs = (torch.ones(size=(3, 2, 7, 5), dtype=torch.float32),) - - self.lower_module_and_test_output( - module, - sample_inputs, - ) - - @unittest.skip( - "Reduce shader does not support multiple reduction axes at the moment" - ) - def test_vulkan_backend_sum(self): - class SumModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - x = torch.sum(x, (), keepdim=True) - x = torch.sum(x) - return x - - module = SumModule() - sample_inputs = (torch.rand(size=(3, 2, 7, 5), dtype=torch.float32),) - - self.lower_module_and_test_output( - module, - sample_inputs, - ) - - def test_vulkan_backend_conv2d(self): - class Conv2dModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv2d( - in_channels=6, - out_channels=8, - kernel_size=(3, 3), - padding=(2, 3), - stride=(1, 2), - dilation=1, - groups=1, - bias=True, - ) - - def forward(self, x): - return self.conv(x) - - conv2d_module = Conv2dModule() - sample_inputs = (torch.randn(size=(1, 6, 40, 50), dtype=torch.float32),) - - self.lower_module_and_test_output( - conv2d_module, - sample_inputs, - ) - - def test_vulkan_backend_conv_transpose2d(self): - class ConvTranspose2dModule(torch.nn.Module): - def __init__(self): - super().__init__() - 
self.conv = torch.nn.ConvTranspose2d( - in_channels=6, - out_channels=8, - kernel_size=(3, 3), - padding=(2, 3), - stride=(1, 2), - output_padding=(0, 1), - dilation=1, - groups=1, - bias=True, - ) - - def forward(self, x): - return self.conv(x) - - conv_transpose2d_module = ConvTranspose2dModule() - sample_inputs = (torch.randn(size=(1, 6, 40, 50), dtype=torch.float32),) - - self.lower_module_and_test_output( - conv_transpose2d_module, - sample_inputs, - ) - - def test_vulkan_backend_conv2d_dw(self): - class Conv2dModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv2d( - in_channels=8, - out_channels=8, - kernel_size=3, - padding=1, - groups=8, - bias=True, - ) - - def forward(self, x): - return self.conv(x) - - conv2d_module = Conv2dModule() - sample_inputs = (torch.randn(size=(1, 8, 72, 96), dtype=torch.float32),) - - self.lower_module_and_test_output( - conv2d_module, - sample_inputs, - ) - - def test_vulkan_backend_conv2d_pw(self): - class Conv2dModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv2d( - in_channels=8, - out_channels=8, - kernel_size=1, - padding=1, - groups=1, - bias=True, - ) - - def forward(self, x): - return self.conv(x) - - conv2d_module = Conv2dModule() - sample_inputs = (torch.randn(size=(1, 8, 72, 96), dtype=torch.float32),) - - self.lower_module_and_test_output( - conv2d_module, - sample_inputs, - ) - - def test_vulkan_backend_conv2d_bias_false(self): - class Conv2dModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv2d( - in_channels=6, - out_channels=8, - kernel_size=(3, 3), - padding=(2, 3), - stride=(1, 2), - dilation=1, - groups=1, - bias=False, - ) - - def forward(self, x): - return self.conv(x) - - conv2d_module = Conv2dModule() - sample_inputs = (torch.randn(size=(1, 6, 40, 50), dtype=torch.float32),) - - self.lower_module_and_test_output( - conv2d_module, - sample_inputs, - ) - - def test_vulkan_backend_conv1d(self): - class Conv1dModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv1d( - in_channels=20, - out_channels=10, - kernel_size=6, - stride=5, - padding=5, - dilation=3, - groups=5, - bias=True, - ) - - def forward(self, x): - return self.conv(x) - - conv1d_module = Conv1dModule() - sample_inputs = (torch.randn(size=(3, 20, 30), dtype=torch.float32),) - - self.lower_module_and_test_output( - conv1d_module, - sample_inputs, - ) - - def test_vulkan_backend_conv1d_bias_false(self): - class Conv1dModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = torch.nn.Conv1d( - in_channels=6, - out_channels=6, - kernel_size=3, - groups=6, - bias=False, - ) - - def forward(self, x): - return self.conv(x) - - conv1d_module = Conv1dModule() - sample_inputs = (torch.randn(size=(1, 6, 7), dtype=torch.float32),) - - self.lower_module_and_test_output( - conv1d_module, - sample_inputs, - ) - - @unittest.skip("layer norm compute shader not working with swiftshader") - def test_vulkan_backend_native_layer_norm(self): - class NativeLayerNormModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.layer_norm = torch.nn.LayerNorm(5) - - def forward(self, x): - return self.layer_norm(x) - - sample_inputs = (torch.randn(size=(3, 4, 5), dtype=torch.float32),) - - self.lower_module_and_test_output( - NativeLayerNormModule(), - sample_inputs, - ) - - def test_vulkan_backend_batch_norm(self): - class BatchNormModule(torch.nn.Module): - def __init__(self): - 
super().__init__() - self.bn = torch.nn.BatchNorm2d(num_features=3) - - def forward(self, x): - return self.bn(x) - - sample_inputs = (torch.randn(size=(4, 3, 2, 5), dtype=torch.float32),) - - self.lower_module_and_test_output( - BatchNormModule(), - sample_inputs, - ) - - def test_vulkan_backend_full(self): - class FullModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.full(x.shape, 42.0) - - class ZerosModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.zeros(x.shape) - - class OnesModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.ones(x.shape) - - sample_inputs = (torch.randn(size=(2, 3, 4, 5), dtype=torch.float32),) - - self.lower_module_and_test_output( - FullModule(), - sample_inputs, - ) - - self.lower_module_and_test_output( - ZerosModule(), - sample_inputs, - ) - - self.lower_module_and_test_output( - OnesModule(), - sample_inputs, - ) - - def test_vulkan_backend_full_like(self): - class FullLikeModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.full_like(x, 42.0) - - class ZerosLikeModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.zeros_like(x) - - class OnesLikeModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.ones_like(x) - - sample_inputs = (torch.randn(size=(2, 3, 4, 5), dtype=torch.float32),) - - self.lower_module_and_test_output( - FullLikeModule(), - sample_inputs, - ) - - self.lower_module_and_test_output( - ZerosLikeModule(), - sample_inputs, - ) - - self.lower_module_and_test_output( - OnesLikeModule(), - sample_inputs, - ) - - def test_vulkan_backend_upsample_nearest2d(self): - class UpsampleNearest2d(torch.nn.Module): - def __init__(self): - super().__init__() - self.upsample = torch.nn.Upsample(scale_factor=2, mode="nearest") - - def forward(self, x): - return self.upsample(x) - - sample_inputs = (torch.arange(1, 5, dtype=torch.float32).view(1, 1, 2, 2),) - - self.lower_module_and_test_output( - UpsampleNearest2d(), - sample_inputs, - ) - - def test_vulkan_backend_minimum(self): - class MinimumModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - return torch.minimum(x, y) - - sample_inputs = ( - torch.rand(size=(3, 5, 6, 4), dtype=torch.float32), - torch.rand(size=(6, 4), dtype=torch.float32), - ) - - self.lower_module_and_test_output( - MinimumModule(), - sample_inputs, - ) - - def test_vulkan_backend_reshape(self): - class ReshapeModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.reshape(x, [-1, x.size(-1)]) - - sample_inputs = (torch.randn(size=(5, 3, 4), dtype=torch.float32),) - - self.lower_module_and_test_output( - ReshapeModule(), - sample_inputs, - ) - - def test_vulkan_backend_view(self): - class ViewModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return x.view([-1, x.size(-1)]) - - sample_inputs = (torch.randn(size=(3, 2, 3, 4), dtype=torch.float32),) - - self.lower_module_and_test_output( - ViewModule(), - sample_inputs, - ) - - def test_vulkan_backend_view_int(self): - class ViewModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return x.view([-1, x.size(-1)]) - - sample_inputs = (torch.randint(size=(3, 6, 2, 7), 
high=100, dtype=torch.int32),) - - self.lower_module_and_test_output( - ViewModule(), - sample_inputs, - ) - - def test_vulkan_backend_unsqueeze(self): - class UnsqueezeModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - x = torch.unsqueeze(x, 1) - x = torch.unsqueeze(x, 0) - return x - - sample_inputs = (torch.randn(size=(3,), dtype=torch.float32),) - - self.lower_module_and_test_output( - UnsqueezeModule(), - sample_inputs, - ) - - def test_vulkan_backend_squeeze(self): - class SqueezeModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.squeeze(x, 0) - - sample_inputs = (torch.randn(size=(1, 2, 2, 1), dtype=torch.float32),) - - self.lower_module_and_test_output( - SqueezeModule(), - sample_inputs, - ) - - def test_vulkan_backend_select(self): - class SelectModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return x[0][3] - - sample_inputs = (torch.randn(size=(3, 6, 2, 7), dtype=torch.float32),) - - self.lower_module_and_test_output( - SelectModule(), - sample_inputs, - ) - - def test_vulkan_backend_permute_copy(self): - class PermuteModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.permute(x, [3, 0, 2, 1]) - - sample_inputs = (torch.randn(size=(3, 6, 2, 7), dtype=torch.float32),) - - self.lower_module_and_test_output( - PermuteModule(), - sample_inputs, - ) - - def test_vulkan_backend_permute_copy_int(self): - class PermuteModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.permute(x, [3, 0, 2, 1]) - - sample_inputs = (torch.randint(size=(3, 6, 2, 7), high=100, dtype=torch.int32),) - - self.lower_module_and_test_output( - PermuteModule(), - sample_inputs, - ) - - def test_vulkan_backend_cat(self): - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y, z): - return torch.cat([x, y, z], dim=1) - - sample_inputs = ( - torch.randn(size=(3, 6, 2, 7), dtype=torch.float32), - torch.randn(size=(3, 1, 2, 7), dtype=torch.float32), - torch.randn(size=(3, 9, 2, 7), dtype=torch.float32), - ) - - self.lower_module_and_test_output( - TestModule(), - sample_inputs, - ) - - def test_vulkan_backend_cat_with_zero_size(self): - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y, z, w): - return torch.cat([x, y, z, w], dim=1) - - sample_inputs = ( - torch.randn(size=(3, 6, 2, 7), dtype=torch.float32), - torch.randn(size=(3, 0, 2, 7), dtype=torch.float32), - torch.randn(size=(3, 0, 2, 7), dtype=torch.float32), - torch.randn(size=(3, 3, 2, 7), dtype=torch.float32), - ) - - self.lower_module_and_test_output( - TestModule(), - sample_inputs, - ) - - def test_vulkan_backend_slice(self): - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return x[:, 2:9:2, :] - - sample_inputs = (torch.randn(size=(3, 13, 7, 3), dtype=torch.float32),) - - self.lower_module_and_test_output( - TestModule(), - sample_inputs, - ) - - def test_vulkan_backend_split_with_sizes(self): - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.split(x, (3, 6, 1, 3), dim=1) - - sample_inputs = (torch.randn(size=(3, 13, 7, 3), dtype=torch.float32),) - - self.lower_module_and_test_output( - TestModule(), - sample_inputs, - ) - - def 
test_vulkan_backend_split_tensor(self): - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.tensor_split(x, 2, dim=1) - - sample_inputs = (torch.randn(size=(3, 14, 7, 3), dtype=torch.float32),) - - self.lower_module_and_test_output( - TestModule(), - sample_inputs, - ) - - def test_vulkan_backend_clone(self): - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.clone(x) - - sample_inputs = (torch.randn(size=(3, 14, 7, 3), dtype=torch.float32),) - - self.lower_module_and_test_output( - TestModule(), - sample_inputs, - ) - - def test_vulkan_backend_constant_pad_nd(self): - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.nn.functional.pad(x, (1, 2, 3, 4, 5, 6), "constant", 24.2) - - sample_inputs = (torch.randn(size=(3, 7, 5, 11), dtype=torch.float32),) - - self.lower_module_and_test_output( - TestModule(), - sample_inputs, - ) - - def test_vulkan_backend_repeat(self): - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return x.repeat([2, 3, 1, 2]) - - sample_inputs = (torch.randn(size=(3, 7, 5, 9), dtype=torch.float32),) - - self.lower_module_and_test_output( - TestModule(), - sample_inputs, - ) - - def test_vulkan_backend_t_default(self): - # aten.permute_copy.default is not enabled yet in partitioner - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - # torch.t is actually exported as aten::permute. - return torch.t(x) - - sample_inputs = (torch.randn(size=(3, 14), dtype=torch.float32),) - - self.lower_module_and_test_output( - TestModule(), - sample_inputs, - ) - - @unittest.skip( - "Softmax shader with shared memory does not work with swiftshader due to potential swiftshader bug" - ) - def test_vulkan_backend_softmax(self): - class SoftmaxModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - x = x.softmax(dim=0) - x = x.softmax(dim=1) - x = x.softmax(dim=2) - return x - - sample_inputs = (torch.randn(size=(3, 2, 7), dtype=torch.float32),) - - self.lower_module_and_test_output( - SoftmaxModule(), - sample_inputs, - ) - - @unittest.skip( - "Softmax shader with shared memory does not work with swiftshader due to potential swiftshader bug" - ) - def test_vulkan_backend_logsoftmax(self): - class LogSoftmaxModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - x = x.log_softmax(dim=0) - x = x.log_softmax(dim=1) - x = x.log_softmax(dim=2) - return x - - sample_inputs = (torch.randn(size=(3, 2, 7), dtype=torch.float32),) - - self.lower_module_and_test_output( - LogSoftmaxModule(), - sample_inputs, - ) - - def test_vulkan_backend_gelu(self): - class GeluModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.gelu = torch.nn.GELU(approximate="tanh") - - def forward(self, x): - return self.gelu(x) - - self.lower_unary_module_and_test_output(GeluModule()) - - @unittest.skip( - "Reduce shader does not support multiple reduction axes at the moment" - ) - def test_vulkan_backend_mean(self): - class MeanModule(torch.nn.Module): - def __init__(self, dims, keepdim=True): - super().__init__() - self.dims = dims - self.keepdim = keepdim - - def forward(self, x): - return torch.mean(x, self.dims, keepdim=self.keepdim) - - sample_inputs = ( - torch.arange(end=2 * 3 * 2 * 5, 
dtype=torch.float32).reshape(2, 3, 2, 5), - ) - - self.lower_module_and_test_output( - MeanModule(dims=[-1, -2]), - sample_inputs, - ) - - self.lower_module_and_test_output( - MeanModule(dims=[1]), - sample_inputs, - ) - - self.lower_module_and_test_output( - MeanModule(dims=[0, 1, 2, 3]), - sample_inputs, - ) - - self.lower_module_and_test_output( - MeanModule(dims=[-1, -2], keepdim=False), - sample_inputs, - ) - - self.lower_module_and_test_output( - MeanModule(dims=[1], keepdim=False), - sample_inputs, - ) - - def test_vulkan_backend_index_select_int(self): - class IndexSelectModule(torch.nn.Module): - def __init__(self, dim, indices): - super().__init__() - self.dim = dim - self.index = torch.tensor(indices) - - def forward(self, x): - return torch.index_select(x, self.dim, self.index) - - sample_inputs = (torch.arange(96).reshape(2, 8, 2, 3),) - - self.lower_module_and_test_output( - IndexSelectModule(dim=1, indices=[2, 3, 5, 6, 7]), - sample_inputs, - ) - - def test_vulkan_backend_index_select(self): - class IndexSelectModule(torch.nn.Module): - def __init__(self, dim, indices): - super().__init__() - self.dim = dim - self.index = torch.tensor(indices) - - def forward(self, x): - return torch.index_select(x, self.dim, self.index) - - sample_inputs = (torch.arange(144).reshape(12, 1, 3, 4).float(),) - - self.lower_module_and_test_output( - IndexSelectModule(dim=0, indices=[1, 3, 5, 7, 8, 9, 10, 11, 2, 3]), - sample_inputs, - ) - - def test_vulkan_backend_arange_int(self): - class ArangeModule(torch.nn.Module): - def __init__(self, input): - super().__init__() - self.input = input - - def forward(self, x): - return torch.arange(*self.input, dtype=torch.int32) - - # `torch.arange` could take one, two or three arguments as input. - # If only one argument is provided, it will be interpreted as `end`. - # If two arguments are provided, the first one will be interpreted as `start` - # and the second one will be interpreted as `end`. - # If three arguments are provided, the first one will be interpreted as `start`, - # the second one will be interpreted as `end` and the third one will be - # interpreted as `step`. 
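(Editorial aside, not part of the deleted test file: the argument forms described in the comment above behave as follows; this is a small illustrative snippet.)

```python
import torch

# One argument: interpreted as `end`, counting from 0 with step 1.
torch.arange(5)          # tensor([0, 1, 2, 3, 4])
# Two arguments: `start` and `end`.
torch.arange(-3, 5)      # tensor([-3, -2, -1,  0,  1,  2,  3,  4])
# Three arguments: `start`, `end`, and `step` (the step may be negative).
torch.arange(12, 1, -2)  # tensor([12, 10,  8,  6,  4,  2])
```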
- inputs = [ - [1], - [-3, 5], - [1, 11, 2], - [12, 1, -2], - ] - for i in inputs: - self.lower_module_and_test_output( - ArangeModule(i), - (torch.randn(size=(1,), dtype=torch.float32),), # dummy input - ) - - def test_vulkan_backend_arange_float(self): - class ArangeModule(torch.nn.Module): - def __init__(self, input): - super().__init__() - self.input = input - - def forward(self, x): - return torch.arange(*self.input) - - inputs = [ - [1.5], - [-3, 5.0], - [1.0, 11, 2], - [12, 1, -2.0], - ] - for i in inputs: - self.lower_module_and_test_output( - ArangeModule(i), - (torch.randn(size=(1,), dtype=torch.float32),), # dummy input - ) - - def test_vulkan_backend_arange_int64(self): - class ArangeModule(torch.nn.Module): - def __init__(self, input): - super().__init__() - self.input = input - - def forward(self, x): - return torch.arange(*self.input) - - inputs = [ - [1], - [-3, 5], - [1, 11, 2], - [12, 1, -2], - [1.5], - [-3, 5.0], - [1.0, 11, 2], - [12, 1, -2.0], - ] - for i in inputs: - self.lower_module_and_test_output( - ArangeModule(i), - (torch.randn(size=(1,), dtype=torch.float32),), # dummy input - ) - self.lower_module_and_test_output( - ArangeModule(i), - (torch.randint(low=-100, high=100, size=(5, 5)),), # dummy input - ) - - def test_vulkan_backend_embedding_1d(self): - class EmbeddingModule(torch.nn.Module): - def __init__(self, embedding): - super().__init__() - self.embedding = embedding - - def forward(self, x): - return self.embedding(x) - - self.lower_module_and_test_output( - EmbeddingModule(torch.nn.Embedding(5, 4)), - (torch.tensor([0, 1, 0, 4, 2, 0]),), - ) - - def test_vulkan_backend_embedding_2d(self): - class EmbeddingModule(torch.nn.Module): - def __init__(self, embedding): - super().__init__() - self.embedding = embedding - - def forward(self, x): - return self.embedding(x) - - self.lower_module_and_test_output( - EmbeddingModule(torch.nn.Embedding(5, 4)), - (torch.tensor([[0, 1, 0], [4, 2, 0]]),), - ) - - def test_vulkan_backend_embedding_3d(self): - class EmbeddingModule(torch.nn.Module): - def __init__(self, embedding): - super().__init__() - self.embedding = embedding - - def forward(self, x): - return self.embedding(x) - - self.lower_module_and_test_output( - EmbeddingModule(torch.nn.Embedding(5, 4)), - (torch.tensor([[[0, 1], [0, 1]], [[4, 2], [3, 3]]]),), - ) - - # def test_vulkan_backend_conv_with_dim_order(self): - # class Conv2dSequential(torch.nn.Module): - # def __init__(self, bias=True, channel_last=False): - # super().__init__() - # self.first = torch.nn.Conv2d( - # in_channels=1, - # out_channels=3, - # kernel_size=(3, 3), - # padding=1, - # bias=bias, - # ) - # self.second = torch.nn.Conv2d( - # in_channels=3, - # out_channels=2, - # kernel_size=(3, 3), - # padding=1, - # bias=bias, - # ) - - # def forward(self, x): - # x = x.to(memory_format=torch.channels_last) - # return self.second(self.first(x)) - - # self.lower_module_and_test_output( - # Conv2dSequential(), - # (torch.rand(size=[1, 1, 3, 3]),), - # - # ) - - def test_vulkan_backend_flip(self): - class FlipModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.flip(x, [0, 1, 2, 3]) - - self.lower_module_and_test_output( - FlipModule(), - (torch.arange(48).reshape(2, 3, 4, 2),), - ) - - def test_vulkan_backend_conv_with_clamp(self): - class ConvWithClampModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.weight = torch.randn(6, 8, 3, 3) - self.bias = torch.randn(8) - self.stride = (1, 2) - self.padding = (2, 3) - 
self.dilation = (1, 1) - self.transposed = True - self.output_padding = (0, 1) - self.groups = 1 - self.output_min = 0 - self.output_max = 10 - - def forward(self, x): - return torch.ops.et_vk.conv_with_clamp( - x, - self.weight, - self.bias, - self.stride, - self.padding, - self.dilation, - self.transposed, - self.output_padding, - self.groups, - self.output_min, - self.output_max, - ) - - self.lower_module_and_test_output( - ConvWithClampModule(), - (torch.randn(size=(1, 6, 40, 50), dtype=torch.float32),), - ) - - def test_vulkan_backend_grid_priors(self): - class GridPriorsModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return torch.ops.et_vk.grid_priors( - x, - stride=8, - offset=0.5, - ) - - self.lower_module_and_test_output( - GridPriorsModule(), - (torch.rand(size=[1, 5, 2, 3]),), - ) - - def test_vulkan_backend_large_linear_layer(self): - class LinearModel(torch.nn.Module): - def __init__(self, large_out_channels: int) -> None: - super(LinearModel, self).__init__() - self.fc0 = torch.nn.Linear(1024, 128) - self.fc1 = torch.nn.Linear(128, large_out_channels) - - def forward(self, x: torch.Tensor): - x = self.fc0(x) - out = self.fc1(x) - return out - - large_out_channels = 2**16 - - self.lower_module_and_test_output( - LinearModel(large_out_channels), - (torch.ones(1024),), - ) - - def test_vulkan_backend_sym_size_int(self): - """ - Test the sym_size.int operator with a model that: - 1. Takes an input tensor with shape [1, M, K] - 2. Reshapes it to [M, K] - 3. Applies a linear layer - 4. Reshapes the output back to [1, M, N] - """ - K = 64 # Input feature dimension - N = 32 # Output feature dimension - - class SymSizeModel(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(K, N) - - def forward(self, x): - M = x.size(1) - - reshaped = torch.reshape(x, [M, K]) - output = self.linear(reshaped) - return torch.reshape(output, [1, M, N]) - - sample_inputs = (torch.randn(1, 64, K),) - - batch = Dim("batch", min=1, max=128) - dynamic_shapes = {"x": {1: batch}} - - test_inputs = [ - (torch.randn(1, 32, K),), - (torch.randn(1, 96, K),), - (torch.randn(1, 128, K),), - ] - - self.lower_module_and_test_output( - SymSizeModel(), - sample_inputs, - dynamic_shapes=dynamic_shapes, - test_inputs=test_inputs, - ) - - def test_select_last_height_dynamic_shapes(self): - """ - Test selecting the last element along the height dimension with dynamic shapes. - The height dimension (dim=1) is variable. - """ - - class SelectLastHeightModule(torch.nn.Module): - """ - Module that selects the last element along the height dimension (dim=1) of a 3D tensor. 
- This is equivalent to the operation: x[:, -1, :] - """ - - def __init__(self): - super().__init__() - - def forward(self, x): - # Select the last element along dimension 1 (height) - return x[:, -1, :] - - # Create the module - module = SelectLastHeightModule() - - # Create sample inputs with a specific shape - # Shape: [batch_size, height, width] - sample_inputs = (torch.arange(1, 61).reshape(2, 10, 3).float(),) - - # Define dynamic shapes for the height dimension - height = Dim("height", min=1, max=10) - dynamic_shapes = {"x": {1: height}} - - # Create test inputs with different heights - test_inputs = [ - (torch.arange(1, 7).reshape(2, 1, 3).float(),), # Minimum height - (torch.arange(1, 19).reshape(2, 3, 3).float(),), # Small height - (torch.arange(1, 43).reshape(2, 7, 3).float(),), # Medium height - (torch.arange(1, 31).reshape(2, 5, 3).float(),), # Maximum height - ] - - # Use the testing infrastructure from TestVulkanBackend - test_backend = TestVulkanBackend() - test_backend.lower_module_and_test_output( - module, - sample_inputs, - dynamic_shapes=dynamic_shapes, - test_inputs=test_inputs, - ) - - def test_vulkan_backend_group_norm(self): - class ConvGroupNormModule(torch.nn.Module): - def __init__(self): - super().__init__() - # Conv2d: 3 input channels -> 16 output channels - self.conv = torch.nn.Conv2d( - in_channels=3, - out_channels=16, - kernel_size=3, - padding=1, - bias=True, - ) - # GroupNorm: 4 groups for 16 channels (16 % 4 == 0) - self.group_norm = torch.nn.GroupNorm( - num_groups=4, - num_channels=16, - eps=1e-5, - affine=True, - ) - - def forward(self, x): - x = self.conv(x) - x = self.group_norm(x) - return x - - # Create sample inputs: [batch, channels, height, width] - sample_inputs = (torch.randn(size=(1, 3, 32, 32), dtype=torch.float32),) - - # Test with static shapes first - self.lower_module_and_test_output( - ConvGroupNormModule(), - sample_inputs, - ) - - def test_vulkan_backend_group_norm_different_groups(self): - class GroupNormModule(torch.nn.Module): - def __init__(self, num_groups, num_channels): - super().__init__() - self.group_norm = torch.nn.GroupNorm( - num_groups=num_groups, - num_channels=num_channels, - eps=1e-5, - affine=True, - ) - - def forward(self, x): - return self.group_norm(x) - - # Test different group configurations - test_configs = [ - (2, 8), # 2 groups, 8 channels - (4, 16), # 4 groups, 16 channels - (8, 32), # 8 groups, 32 channels - ] - - for num_groups, num_channels in test_configs: - with self.subTest(num_groups=num_groups, num_channels=num_channels): - sample_inputs = ( - torch.randn(size=(2, num_channels, 16, 16), dtype=torch.float32), - ) - - self.lower_module_and_test_output( - GroupNormModule(num_groups, num_channels), - sample_inputs, - ) - - def test_vulkan_backend_full_quantization_workflow(self): - class FullQuantizationWorkflowModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - # Step 1: Choose quantization parameters per tensor - scale, zero_point = ( - torch.ops.quantized_decomposed.choose_qparams.tensor( - x, - quant_min=-2147483648, # int32 min - quant_max=2147483647, # int32 max - eps=1e-5, - dtype=torch.int32, - ) - ) - - # Step 2: Quantize using the calculated parameters - quantized = torch.ops.quantized_decomposed.quantize_per_tensor.tensor( - x, - scale, - zero_point, - quant_min=-2147483648, # int32 min - quant_max=2147483647, # int32 max - dtype=torch.int32, - ) - - # Step 3: Dequantize back to float - dequantized = ( - 
torch.ops.quantized_decomposed.dequantize_per_tensor.tensor( - quantized, - scale, - zero_point, - quant_min=-2147483648, # int32 min - quant_max=2147483647, # int32 max - dtype=torch.int32, - ) - ) - - return dequantized - - full_workflow_module = FullQuantizationWorkflowModule() - sample_inputs = (torch.rand(size=(2, 3, 4), dtype=torch.float32),) - - # Use higher tolerance since quantization introduces some error - self.lower_module_and_test_output( - full_workflow_module, sample_inputs, atol=5e-3, rtol=5e-3 - ) - - def test_vulkan_backend_full_per_token_quantization_workflow(self): - class FullPerTokenQuantizationWorkflowModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - # Step 1: Choose quantization parameters per token - scale, zero_point = ( - torch.ops.quantized_decomposed.choose_qparams_per_token_asymmetric.default( - x, - dtype=torch.int32, - ) - ) - - # Step 2: Quantize using the calculated parameters per token - quantized = torch.ops.quantized_decomposed.quantize_per_token.default( - x, - scale, - zero_point, - quant_min=-2147483648, # int32 min - quant_max=2147483647, # int32 max - dtype=torch.int32, - ) - - # Step 3: Dequantize back to float per token - dequantized = ( - torch.ops.quantized_decomposed.dequantize_per_token.default( - quantized, - scale, - zero_point, - quant_min=-2147483648, # int32 min - quant_max=2147483647, # int32 max - dtype=torch.int32, - output_dtype=torch.float32, - ) - ) - - return dequantized - - full_per_token_workflow_module = FullPerTokenQuantizationWorkflowModule() - sample_inputs = (torch.rand(size=(6, 4), dtype=torch.float32),) - - # Use higher tolerance since quantization introduces some error - self.lower_module_and_test_output( - full_per_token_workflow_module, sample_inputs, atol=5e-3, rtol=5e-3 - ) - - def test_vulkan_backend_different_required_reprs(self): - class ComplexModule(torch.nn.Module): - """ - This Module tests the tag memory metadata pass. The first few ops executed - are binary ops, which don't require any specific representation for input - and output tensors. - - This is followed by a linear layer, which requires the input tensor to be - width packed. - - Three linear layer outputs are then concatenated, and the result is passed - to a convolution layer which requires channels packing. Finally, group norm - is called and the output is postprocessed by a binary op before returning. - - In addition to requiring memory layout transitions between the linear and - conv stages, the module also contains ops which have "non-standard" - torch.fx.Nodes; cat will contain an argument node that is a list of nodes, - and group norm's node will be associated with multiple output tensors. 
- """ - - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(10, 10) - self.conv = torch.nn.Conv2d( - in_channels=3, # Assuming concatenation triples the channels - out_channels=16, - kernel_size=3, - padding=1, - ) - self.group_norm = torch.nn.GroupNorm(num_groups=4, num_channels=16) - - def forward(self, x, a, b, c, d): - w = a + b - y = a + c - z = a + d - - b1 = x + y - b2 = x + z - b3 = x + w - - l1 = self.linear(b1).unsqueeze(0) - l2 = self.linear(b2).unsqueeze(0) - l3 = self.linear(b3).unsqueeze(0) - - concat = torch.cat([l1, l2, l3], dim=0) # Concatenate along channels - conv = self.conv(concat + a) - g = self.group_norm(conv.unsqueeze(0)) - return g + x - - complex_module = ComplexModule() - sample_inputs = ( - torch.rand(size=(10, 10), dtype=torch.float32), # x - torch.rand(size=(10, 10), dtype=torch.float32), # a - torch.rand(size=(10, 10), dtype=torch.float32), # b - torch.rand(size=(10, 10), dtype=torch.float32), # c - torch.rand(size=(10, 10), dtype=torch.float32), # d - ) - - self.lower_module_and_test_output(complex_module, sample_inputs) - - def test_vulkan_backend_cat_different_reprs(self): - class CustomComplexModule(torch.nn.Module): - """ - This test validates that the memory metadata tagging pass can handle - transitioning arguments to the cat operator. Linear layers require width - packing, while conv layers require channels packing. Before executing the - cat operator, all input tensors should use the same representation. - """ - - def __init__(self): - super().__init__() - self.linear1 = torch.nn.Linear(10, 10) - self.linear2 = torch.nn.Linear(10, 10) - self.conv = torch.nn.Conv2d( - in_channels=4, # Assuming input b has 3 channels - out_channels=8, - kernel_size=3, - padding=1, - ) - - def forward(self, a, b): - x1 = self.linear1(a).unsqueeze(0) - x2 = self.linear2(a).unsqueeze(0) - y = self.conv(b) - return torch.cat([x1, x2, y], dim=0) - - custom_complex_module = CustomComplexModule() - sample_inputs = ( - torch.rand(size=(10, 10), dtype=torch.float32), # a - torch.rand(size=(4, 10, 10), dtype=torch.float32), # b - ) - - self.lower_module_and_test_output(custom_complex_module, sample_inputs) - - def test_vulkan_backend_cat_width_dynamic_shapes(self): - class CatWidthModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x1, x2, x3, x4, x5, x6): - return torch.cat([x1, x2, x3, x4, x5, x6], dim=3) - - cat_width_module = CatWidthModule() - - # Create 6 tensors with different widths but same batch, channel, and height dimensions - sample_inputs = ( - torch.randn(size=(2, 3, 4, 5), dtype=torch.float32), # width=5 - torch.randn(size=(2, 3, 4, 3), dtype=torch.float32), # width=3 - torch.randn(size=(2, 3, 4, 7), dtype=torch.float32), # width=7 - torch.randn(size=(2, 3, 4, 2), dtype=torch.float32), # width=2 - torch.randn(size=(2, 3, 4, 4), dtype=torch.float32), # width=4 - torch.randn(size=(2, 3, 4, 6), dtype=torch.float32), # width=6 - ) - - # Define dynamic shapes for the width dimension (dim=3) for each input - width1 = Dim("width1", min=1, max=10) - width2 = Dim("width2", min=1, max=10) - width3 = Dim("width3", min=1, max=10) - width4 = Dim("width4", min=1, max=10) - width5 = Dim("width5", min=1, max=10) - width6 = Dim("width6", min=1, max=10) - - dynamic_shapes = { - "x1": {3: width1}, - "x2": {3: width2}, - "x3": {3: width3}, - "x4": {3: width4}, - "x5": {3: width5}, - "x6": {3: width6}, - } - - # Create test inputs with different width combinations - test_inputs = [ - ( - torch.randn(2, 3, 4, 2), # width=2 
- torch.randn(2, 3, 4, 1), # width=1 - torch.randn(2, 3, 4, 3), # width=3 - torch.randn(2, 3, 4, 1), # width=1 - torch.randn(2, 3, 4, 2), # width=2 - torch.randn(2, 3, 4, 4), # width=4 - ), - ( - torch.randn(2, 3, 4, 8), # width=8 - torch.randn(2, 3, 4, 2), # width=2 - torch.randn(2, 3, 4, 1), # width=1 - torch.randn(2, 3, 4, 3), # width=3 - torch.randn(2, 3, 4, 5), # width=5 - torch.randn(2, 3, 4, 1), # width=1 - ), - ( - torch.randn(2, 3, 4, 1), # width=1 - torch.randn(2, 3, 4, 9), # width=9 - torch.randn(2, 3, 4, 2), # width=2 - torch.randn(2, 3, 4, 4), # width=4 - torch.randn(2, 3, 4, 1), # width=1 - torch.randn(2, 3, 4, 3), # width=3 - ), - ] - - self.lower_module_and_test_output( - cat_width_module, - sample_inputs, - dynamic_shapes=dynamic_shapes, - test_inputs=test_inputs, - ) - - def test_vulkan_backend_cat_channels_dynamic_shapes(self): - class CatChannelsModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x1, x2, x3, x4, x5, x6): - return torch.cat([x1, x2, x3, x4, x5, x6], dim=1) - - cat_channels_module = CatChannelsModule() - - # Create 6 tensors with different channel counts but same batch, height, and width dimensions - sample_inputs = ( - torch.randn(size=(2, 8, 8, 6), dtype=torch.float32), # channels=4 - torch.randn(size=(2, 8, 8, 6), dtype=torch.float32), # channels=2 - torch.randn(size=(2, 8, 8, 6), dtype=torch.float32), # channels=6 - torch.randn(size=(2, 8, 8, 6), dtype=torch.float32), # channels=1 - torch.randn(size=(2, 8, 8, 6), dtype=torch.float32), # channels=3 - torch.randn(size=(2, 8, 8, 6), dtype=torch.float32), # channels=5 - ) - - # Define dynamic shapes for the channels dimension (dim=1) for each input - channels1 = Dim("channels1", min=1, max=8) - channels2 = Dim("channels2", min=1, max=8) - channels3 = Dim("channels3", min=1, max=8) - channels4 = Dim("channels4", min=1, max=8) - channels5 = Dim("channels5", min=1, max=8) - channels6 = Dim("channels6", min=1, max=8) - - dynamic_shapes = { - "x1": {1: channels1}, - "x2": {1: channels2}, - "x3": {1: channels3}, - "x4": {1: channels4}, - "x5": {1: channels5}, - "x6": {1: channels6}, - } - - # Create test inputs with different channel combinations - test_inputs = [ - ( - torch.randn(2, 1, 8, 6), # channels=1 - torch.randn(2, 2, 8, 6), # channels=2 - torch.randn(2, 1, 8, 6), # channels=1 - torch.randn(2, 3, 8, 6), # channels=3 - torch.randn(2, 1, 8, 6), # channels=1 - torch.randn(2, 2, 8, 6), # channels=2 - ), - ( - torch.randn(2, 6, 8, 6), # channels=6 - torch.randn(2, 1, 8, 6), # channels=1 - torch.randn(2, 3, 8, 6), # channels=3 - torch.randn(2, 2, 8, 6), # channels=2 - torch.randn(2, 4, 8, 6), # channels=4 - torch.randn(2, 1, 8, 6), # channels=1 - ), - ( - torch.randn(2, 2, 8, 6), # channels=2 - torch.randn(2, 7, 8, 6), # channels=7 - torch.randn(2, 1, 8, 6), # channels=1 - torch.randn(2, 1, 8, 6), # channels=1 - torch.randn(2, 3, 8, 6), # channels=3 - torch.randn(2, 2, 8, 6), # channels=2 - ), - ] - - self.lower_module_and_test_output( - cat_channels_module, - sample_inputs, - dynamic_shapes=dynamic_shapes, - test_inputs=test_inputs, - ) - - def test_vulkan_backend_high_dimensional_tensors(self): - class HighDimTensorModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - # Unsqueeze inputs twice to create 5-dim tensors - x_5d = torch.unsqueeze(torch.unsqueeze(x, 0), 0) - y_5d = torch.unsqueeze(torch.unsqueeze(y, 0), 0) - # Add tensors together - result = x_5d + y_5d - return result - - high_dim_module = HighDimTensorModule() - 
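(Editorial aside, not part of the deleted test file: the dynamic-shape tests above all rely on the `torch.export.Dim` API to mark individual tensor dimensions as variable. The sketch below shows the general pattern with a made-up module and sizes; it is illustrative only.)

```python
import torch
from torch.export import Dim, export

class AddOne(torch.nn.Module):
    def forward(self, x):
        return x + 1.0

# Declare dim 0 of the input named "x" as dynamic, bounded between 1 and 32.
batch = Dim("batch", min=1, max=32)
exported = export(AddOne(), (torch.randn(8, 4),), dynamic_shapes={"x": {0: batch}})

# The exported program now accepts any batch size within the declared range.
print(exported.module()(torch.randn(3, 4)).shape)  # torch.Size([3, 4])
```

In the tests above, the same kind of `dynamic_shapes` dictionary is presumably forwarded to the export step by the test harness, and the `test_inputs` lists exercise several sizes within each declared range.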
# Create 2 4-dim inputs - sample_inputs = ( - torch.rand(size=(2, 3, 4, 5), dtype=torch.float32), - torch.rand(size=(2, 3, 4, 5), dtype=torch.float32), - ) - - self.lower_module_and_test_output(high_dim_module, sample_inputs) - - def test_vulkan_backend_torchao_wo_quantized_linear(self): - in_features = 1024 - out_features = 512 - bias = False - group_size = 64 - weight_bits = 4 - - class TorchAOQuantizedLinearModule(torch.nn.Module): - def __init__( - self, - in_features: int, - out_features: int, - bias: bool = False, - group_size: int = 64, - weight_bits: int = 4, - ): - super().__init__() - self.linear = torch.nn.Linear(in_features, out_features, bias=bias) - self.group_size = group_size - self.weight_bits = weight_bits - - if self.weight_bits == 4: - self.weight_dtype = torch.int4 - else: - self.weight_dtype = torch.int8 - - self.quant_granularity = PerGroup(self.group_size) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - return self.linear(x) - - def apply_quantization(self): - """Apply TorchAO weight-only quantization to the linear layer.""" - q_config = IntxWeightOnlyConfig( - weight_dtype=self.weight_dtype, - granularity=self.quant_granularity, - ) - quantize_(self, q_config) - unwrap_tensor_subclass(self) - return self - - # Test with GEMV pattern (batch_size=1, seq_len=1) - quantized_linear_module = TorchAOQuantizedLinearModule( - in_features=in_features, - out_features=out_features, - bias=bias, - group_size=group_size, - weight_bits=weight_bits, - ) - - # Apply quantization - quantized_linear_module = quantized_linear_module.apply_quantization() - - # Test with 2D input (GEMV pattern) - sample_inputs = (torch.randn(size=(1, in_features), dtype=torch.float32),) - - # Use higher tolerance since quantization introduces some error - self.lower_module_and_test_output( - quantized_linear_module, sample_inputs, atol=1e-2, rtol=1e-2 - ) - - # Test with GEMM pattern (batch_size > 1) - quantized_linear_module_gemm = TorchAOQuantizedLinearModule( - in_features=in_features, - out_features=out_features, - bias=bias, - group_size=group_size, - weight_bits=weight_bits, - ) - - # Apply quantization - quantized_linear_module_gemm = quantized_linear_module_gemm.apply_quantization() - - # Test with 3D input (GEMM pattern) - sample_inputs_gemm = ( - torch.randn(size=(1, 248, in_features), dtype=torch.float32), - ) - - # Use higher tolerance since quantization introduces some error - self.lower_module_and_test_output( - quantized_linear_module_gemm, sample_inputs_gemm, atol=1e-2, rtol=1e-2 - ) - - def test_vulkan_backend_xnnpack_pt2e_quantized_linear_sequence(self): - """ - Test a sequence of linear layers quantized with XNNPACK quantization config. - This test creates a module with multiple linear layers in sequence and applies - XNNPACK symmetric quantization to test the quantized model execution. 
- """ - - import executorch.backends.vulkan.test.utils as test_utils - - class LinearSequenceModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear1 = torch.nn.Linear(128, 64, bias=False) - self.linear2 = torch.nn.Linear(64, 32, bias=False) - self.linear3 = torch.nn.Linear(32, 16, bias=False) - - MAX = 0.75 - MIN = -0.25 - self.linear1.weight.data = test_utils.random_uniform_tensor( - self.linear1.weight.shape, MIN, MAX - ) - self.linear2.weight.data = test_utils.random_uniform_tensor( - self.linear2.weight.shape, MIN, MAX - ) - self.linear3.weight.data = test_utils.random_uniform_tensor( - self.linear3.weight.shape, MIN, MAX - ) - - def forward(self, x): - x = self.linear1(x) - x = self.linear2(x) - x = self.linear3(x) - return x - - # Create the module - linear_sequence_module = LinearSequenceModule() - - M = 32 - # Create sample inputs - sample_inputs = ( - ( - test_utils.random_uniform_tensor( - (M, linear_sequence_module.linear1.in_features), - -0.25, - 0.75, - ) - ), - ) - - # Create XNNPACK quantizer with symmetric quantization config - quantizer = XNNPACKQuantizer() - operator_config = get_symmetric_quantization_config( - is_per_channel=True, - is_dynamic=False, - ) - quantizer.set_global(operator_config) - - # Test the quantized module using the existing quantize_and_lower_module function - # Use higher tolerance since quantization introduces some error - edge_program = quantize_and_lower_module( - linear_sequence_module, sample_inputs, quantizer - ) - - et_program = edge_program.to_executorch() - self.check_vk_delegation(et_program) - - self.run_delegated_model_and_check_output( - et_program, - linear_sequence_module, - sample_inputs, - atol=1e-2, - rtol=1e-1, - ) - - def test_vulkan_backend_xnnpack_pt2e_quantized_conv_sequence(self): - """ - Test a sequence of convolution layers quantized with PT2E quantization. - This test creates a module with multiple Conv2d layers in sequence and applies - XNNPACK symmetric quantization to test the quantized model execution. - Similar to the linear sequence test but using convolution layers. 
- """ - - import executorch.backends.vulkan.test.utils as test_utils - - class ConvSequenceModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv1 = torch.nn.Conv2d( - in_channels=3, - out_channels=16, - kernel_size=3, - padding=1, - bias=False, - ) - self.conv2 = torch.nn.Conv2d( - in_channels=16, - out_channels=32, - kernel_size=3, - padding=1, - bias=False, - ) - self.conv3 = torch.nn.Conv2d( - in_channels=32, - out_channels=64, - kernel_size=3, - padding=1, - bias=False, - ) - - MAX = 0.75 - MIN = -0.25 - self.conv1.weight.data = test_utils.random_uniform_tensor( - self.conv1.weight.shape, MIN, MAX - ) - self.conv2.weight.data = test_utils.random_uniform_tensor( - self.conv2.weight.shape, MIN, MAX - ) - self.conv3.weight.data = test_utils.random_uniform_tensor( - self.conv3.weight.shape, MIN, MAX - ) - - def forward(self, x): - x = self.conv1(x) - x = self.conv2(x) - x = self.conv3(x) - return x - - # Create the module - conv_sequence_module = ConvSequenceModule() - - input_tensor = test_utils.random_uniform_tensor( - (1, 3, 32, 32), - -0.25, - 0.75, - ) - - # Create sample inputs - sample_inputs = (input_tensor,) - - # Create XNNPACK quantizer with symmetric quantization config - quantizer = XNNPACKQuantizer() - operator_config = get_symmetric_quantization_config( - is_per_channel=True, - is_dynamic=False, - ) - quantizer.set_global(operator_config) - - # Test the quantized module using the existing quantize_and_lower_module function - # Use higher tolerance since quantization introduces some error - edge_program = quantize_and_lower_module( - conv_sequence_module, sample_inputs, quantizer - ) - - et_program = edge_program.to_executorch() - self.check_vk_delegation(et_program) - - self.run_delegated_model_and_check_output( - et_program, - conv_sequence_module, - sample_inputs, - atol=1e-2, - rtol=1e-1, - ) - - def test_vulkan_backend_xnnpack_pt2e_quantized_conv_sequence_all_reduced(self): - """ - Test a sequence of convolution layers quantized with PT2E quantization. - This test creates a module with multiple Conv2d layers in sequence and applies - XNNPACK symmetric quantization to test the quantized model execution. - Similar to the linear sequence test but using convolution layers. 
- """ - - import executorch.backends.vulkan.test.utils as test_utils - - class ConvSequenceModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv1 = torch.nn.Conv2d( - in_channels=3, - out_channels=32, - kernel_size=3, - padding=1, - bias=False, - ) - self.conv2 = torch.nn.Conv2d( - in_channels=32, - out_channels=1, - kernel_size=3, - padding=1, - bias=False, - ) - - MAX = 0.75 - MIN = -0.25 - self.conv1.weight.data = test_utils.random_uniform_tensor( - self.conv1.weight.shape, MIN, MAX - ) - self.conv2.weight.data = test_utils.random_uniform_tensor( - self.conv2.weight.shape, MIN, MAX - ) - - def forward(self, x): - x = self.conv1(x) - x = self.conv2(x) - return x - - # Create the module - conv_sequence_module = ConvSequenceModule() - - input_tensor = test_utils.random_uniform_tensor( - (1, 3, 32, 32), - -0.25, - 0.75, - ) - - # Create sample inputs - sample_inputs = (input_tensor,) - - # Create XNNPACK quantizer with symmetric quantization config - quantizer = XNNPACKQuantizer() - operator_config = get_symmetric_quantization_config( - is_per_channel=True, - is_dynamic=False, - ) - quantizer.set_global(operator_config) - - # Test the quantized module using the existing quantize_and_lower_module function - # Use higher tolerance since quantization introduces some error - edge_program = quantize_and_lower_module( - conv_sequence_module, sample_inputs, quantizer - ) - - et_program = edge_program.to_executorch() - self.check_vk_delegation(et_program) - - self.run_delegated_model_and_check_output( - et_program, - conv_sequence_module, - sample_inputs, - atol=1e-2, - rtol=1e-1, - ) diff --git a/backends/vulkan/test/test_vulkan_delegate_header.py b/backends/vulkan/test/test_vulkan_delegate_header.py deleted file mode 100644 index bf8b59fc49d..00000000000 --- a/backends/vulkan/test/test_vulkan_delegate_header.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -import unittest - -from executorch.backends.vulkan.serialization.vulkan_graph_serialize import ( - VulkanDelegateHeader, -) - -EXAMPLE_FLATBUFFER_OFFSET: int = 0x11223344 -EXAMPLE_FLATBUFFER_SIZE: int = 0x55667788 -EXAMPLE_BYTES_OFFSET: int = EXAMPLE_FLATBUFFER_OFFSET + EXAMPLE_FLATBUFFER_SIZE -EXAMPLE_BYTES_SIZE: int = 0x99AABBCC99AABBCC - -# If header layout or magic changes, this test must change too. 
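(Editorial aside, not part of the deleted file: the 30-byte layout encoded by the constants above and by `EXAMPLE_HEADER_DATA` just below can be reproduced with `struct`. The field order here is inferred from the byte string in the test and should be treated as illustrative.)

```python
import struct

FLATBUFFER_OFFSET = 0x11223344
FLATBUFFER_SIZE = 0x55667788
BYTES_OFFSET = FLATBUFFER_OFFSET + FLATBUFFER_SIZE  # 0x6688AACC
BYTES_SIZE = 0x99AABBCC99AABBCC

header = (
    b"\x00\x00\x00\x00"  # 4 padding bytes
    + b"VH00"            # magic
    # little-endian: u16 header length (30), u32 flatbuffer offset,
    # u32 flatbuffer size, u32 bytes offset, u64 bytes size
    + struct.pack("<HIIIQ", 30, FLATBUFFER_OFFSET, FLATBUFFER_SIZE, BYTES_OFFSET, BYTES_SIZE)
)
assert len(header) == 30  # should match the EXAMPLE_HEADER_DATA bytes defined below
```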
-# The layout of the header is a contract, not an implementation detail -EXAMPLE_HEADER_DATA: bytes = ( - # zeros - b"\x00\x00\x00\x00" - # magic - + b"VH00" - # All Values below are littl Endian - # header length - + b"\x1E\x00" - # Flatbuffer Offset - + b"\x44\x33\x22\x11" - # Flatbuffer Size - + b"\x88\x77\x66\x55" - # Bytes Data Offset - + b"\xCC\xAA\x88\x66" - # Bytes Data Size - + b"\xCC\xBB\xAA\x99\xCC\xBB\xAA\x99" -) - - -class TestVulkanDelegateHeader(unittest.TestCase): - def test_to_bytes(self) -> None: - header = VulkanDelegateHeader( - EXAMPLE_FLATBUFFER_OFFSET, - EXAMPLE_FLATBUFFER_SIZE, - EXAMPLE_BYTES_OFFSET, - EXAMPLE_BYTES_SIZE, - ) - self.assertEqual(header.to_bytes(), EXAMPLE_HEADER_DATA) - self.assertTrue(header.is_valid()) - - def test_from_bytes(self) -> None: - header = VulkanDelegateHeader.from_bytes(EXAMPLE_HEADER_DATA) - self.assertEqual(header.flatbuffer_offset, EXAMPLE_FLATBUFFER_OFFSET) - self.assertEqual(header.flatbuffer_size, EXAMPLE_FLATBUFFER_SIZE) - self.assertEqual(header.bytes_offset, EXAMPLE_BYTES_OFFSET) - self.assertEqual(header.bytes_size, EXAMPLE_BYTES_SIZE) - - def test_invalid_metadata(self) -> None: - WRONG_MAGIC_DATA = EXAMPLE_HEADER_DATA[0:4] + b"YT01" + EXAMPLE_HEADER_DATA[8:] - with self.assertRaisesRegex( - ValueError, - "Expected magic bytes to be b'VH00', but got b'YT01'", - ): - VulkanDelegateHeader.from_bytes(WRONG_MAGIC_DATA) - - WRONG_LENGTH_DATA = ( - EXAMPLE_HEADER_DATA[0:8] + b"\x1D\x00" + EXAMPLE_HEADER_DATA[10:] - ) - with self.assertRaisesRegex( - ValueError, "Expected header to be 30 bytes, but got 29 bytes." - ): - VulkanDelegateHeader.from_bytes(WRONG_LENGTH_DATA) - - with self.assertRaisesRegex( - ValueError, "Expected header to be 30 bytes, but got 31 bytes." - ): - VulkanDelegateHeader.from_bytes(EXAMPLE_HEADER_DATA + b"\x00") - - def test_invalid_flatbuffer_size(self) -> None: - header = VulkanDelegateHeader( - EXAMPLE_FLATBUFFER_OFFSET, - 0, - EXAMPLE_BYTES_OFFSET, - EXAMPLE_BYTES_SIZE, - ) - - with self.assertRaises(ValueError): - header.to_bytes() - - def test_invalid_constants_offset(self) -> None: - header = VulkanDelegateHeader( - EXAMPLE_FLATBUFFER_OFFSET, - EXAMPLE_FLATBUFFER_SIZE, - EXAMPLE_FLATBUFFER_OFFSET + EXAMPLE_FLATBUFFER_SIZE - 1, - EXAMPLE_BYTES_SIZE, - ) - - with self.assertRaises(ValueError): - header.to_bytes() - - def test_to_bytes_same_as_from_bytes(self) -> None: - header = VulkanDelegateHeader.from_bytes(EXAMPLE_HEADER_DATA) - - to_bytes = header.to_bytes() - self.assertEqual(EXAMPLE_HEADER_DATA, to_bytes) diff --git a/backends/vulkan/test/test_vulkan_passes.py b/backends/vulkan/test/test_vulkan_passes.py deleted file mode 100644 index b277dff2a76..00000000000 --- a/backends/vulkan/test/test_vulkan_passes.py +++ /dev/null @@ -1,317 +0,0 @@ -import unittest -from typing import Optional, Tuple - -import torch - -from executorch.backends.transforms.addmm_mm_to_linear import AddmmToLinearTransform -from executorch.backends.vulkan._passes import FuseQuantizedOpsTransform -from executorch.backends.vulkan._passes.fuse_patterns import FusePatternsPass - -from executorch.backends.vulkan.quantizer.vulkan_quantizer import ( - get_symmetric_quantization_config, - VulkanQuantizer, -) - -from executorch.exir import EdgeCompileConfig, EdgeProgramManager, to_edge - -from executorch.exir.backend.canonical_partitioners.config_partitioner import ( - format_target_name, -) -from torchao.quantization.linear_quant_modules import Int8DynActInt4WeightQuantizer - -from torchao.quantization.pt2e.quantize_pt2e import 
convert_pt2e, prepare_pt2e -from torchao.quantization.pt2e.quantizer import Quantizer - -################### -## Common Models ## -################### - - -class SingleLinearModule(torch.nn.Module): - def __init__(self, K=256, N=128): - super().__init__() - self.K = K - self.N = N - self.linear = torch.nn.Linear(K, N, bias=False) - - def forward(self, x): - return self.linear(x) - - def get_sample_inputs(self): - sample_inputs = (torch.rand(size=(32, self.K), dtype=torch.float32),) - return sample_inputs - - -########### -## Tests ## -########### - - -def quantize_and_lower_module( - model: torch.nn.Module, - sample_inputs: Tuple[torch.Tensor], - quantizer: Quantizer, - dynamic_shapes=None, -) -> EdgeProgramManager: - edge_compile_config = EdgeCompileConfig( - _skip_dim_order=False, # TODO(T182928844): Delegate dim order op to backend. - _check_ir_validity=False, - ) - - program = torch.export.export_for_training( - model, sample_inputs, dynamic_shapes=dynamic_shapes, strict=True - ).module() - - program = prepare_pt2e(program, quantizer) # pyre-ignore - # Calibrate - program(*sample_inputs) - - program = convert_pt2e(program) - - program = torch.export.export(program, sample_inputs, dynamic_shapes=dynamic_shapes) - - edge_program = to_edge( - program, - compile_config=edge_compile_config, - ) - - return edge_program - - -def get_target_canonical_name(node: torch.fx.Node) -> Optional[str]: - if node.op != "call_function": - return None - node_name = format_target_name(node.target.__name__) # pyre-ignore - return node_name - - -def op_node_count(graph_module: torch.fx.GraphModule, canonical_op_name: str) -> int: - count = 0 - for node in graph_module.graph.nodes: - canonical_name = get_target_canonical_name(node) - if canonical_name is not None and canonical_name == canonical_op_name: - count += 1 - return count - - -class TestVulkanPasses(unittest.TestCase): - - def test_fuse_int8pack_mm(self): - K = 256 - N = 256 - model = SingleLinearModule(K, N) - sample_inputs = model.get_sample_inputs() - - quantizer = VulkanQuantizer() - quantizer.set_global( - get_symmetric_quantization_config(is_dynamic=False, weight_bits=8) - ) - - edge_manager = quantize_and_lower_module( - model, - sample_inputs, - quantizer, - ) - - ep = edge_manager._edge_programs["forward"] - edge_manager.transform( - [ - AddmmToLinearTransform(), - FuseQuantizedOpsTransform(ep), - ] - ) - - gm = ep.graph_module - - self.assertEqual(op_node_count(gm, "_weight_int8pack_mm.default"), 1) - self.assertEqual(op_node_count(gm, "dequantize_per_channel.default"), 0) - - def test_fuse_linear_qcs4w(self): - K = 256 - N = 256 - model = SingleLinearModule(K, N) - sample_inputs = model.get_sample_inputs() - - quantizer = VulkanQuantizer() - quantizer.set_global( - get_symmetric_quantization_config(is_dynamic=False, weight_bits=4) - ) - - edge_manager = quantize_and_lower_module( - model, - sample_inputs, - quantizer, - ) - - ep = edge_manager._edge_programs["forward"] - edge_manager.transform( - [ - AddmmToLinearTransform(), - FuseQuantizedOpsTransform(ep), - ] - ) - - gm = ep.graph_module - - self.assertEqual(op_node_count(gm, "linear_qcs4w.default"), 1) - self.assertEqual(op_node_count(gm, "dequantize_per_channel.default"), 0) - - @unittest.skip( - "linear_qta8a_qga4w currently does not support E2E dynamic quantization" - ) - def test_fuse_linear_qta8a_qga4w(self): - """Test fusion of dynamic activation + grouped weight quantized linear (QTA8A_QGA4W).""" - K = 256 - N = 256 - model = SingleLinearModule(K, N) - sample_inputs = 
model.get_sample_inputs() - - # Use source transform quantizer for dynamic activation + grouped weight quantization - quantizer = Int8DynActInt4WeightQuantizer( - groupsize=128, # Group size for 4-bit weights - padding_allowed=False, - precision=torch.float32, - scales_precision=torch.float32, - device=torch.device("cpu"), - ) - - # Apply source transform quantization - quantized_model = quantizer.quantize(model) - - # Export the quantized model - edge_compile_config = EdgeCompileConfig( - _skip_dim_order=False, - _check_ir_validity=False, - ) - - program = torch.export.export_for_training( - quantized_model, sample_inputs, strict=True - ).module() - - program = torch.export.export(program, sample_inputs) - - edge_manager = to_edge( - program, - compile_config=edge_compile_config, - ) - - ep = edge_manager._edge_programs["forward"] - edge_manager.transform( - [ - AddmmToLinearTransform(), - FuseQuantizedOpsTransform(ep), - ] - ) - - gm = ep.graph_module - - # Check that the linear_qta8a_qga4w operator was created - self.assertEqual(op_node_count(gm, "linear_qta8a_qga4w.default"), 1) - # Check that the original quantization/dequantization nodes were removed - self.assertEqual(op_node_count(gm, "quantize_per_token.default"), 0) - self.assertEqual(op_node_count(gm, "dequantize_per_channel.default"), 0) - self.assertEqual(op_node_count(gm, "linear.default"), 0) - - def test_fuse_rotary_emb(self): - """Test conversion of rotary embedding pattern to et_vk.apply_rotary_emb custom op.""" - - class RotaryEmbeddingModel(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward( - self, - xq: torch.Tensor, - xk: torch.Tensor, - freqs_cos: torch.Tensor, - freqs_sin: torch.Tensor, - ): - # This implementation matches the apply_rotary_emb function in rope.py - # Split into real and imaginary parts - xq_r, xq_i = xq.float().reshape(xq.shape[:-1] + (-1, 2)).unbind(-1) - xk_r, xk_i = xk.float().reshape(xk.shape[:-1] + (-1, 2)).unbind(-1) - - # Reshape frequencies for broadcasting - freqs_cos = self._reshape_for_broadcast(freqs_cos, xq_r) - freqs_sin = self._reshape_for_broadcast(freqs_sin, xq_r) - - # Apply rotary embedding - xq_out_r = xq_r * freqs_cos - xq_i * freqs_sin - xq_out_i = xq_r * freqs_sin + xq_i * freqs_cos - xk_out_r = xk_r * freqs_cos - xk_i * freqs_sin - xk_out_i = xk_r * freqs_sin + xk_i * freqs_cos - - # Recombine real and imaginary parts - xq_out = torch.stack([xq_out_r, xq_out_i], dim=-1).flatten(3) - xk_out = torch.stack([xk_out_r, xk_out_i], dim=-1).flatten(3) - - return xq_out.type_as(xq), xk_out.type_as(xk) - - def _reshape_for_broadcast(self, freqs_cis: torch.Tensor, x: torch.Tensor): - """Helper function to reshape frequencies for broadcasting""" - ndim = x.ndim - freqs_cis_ndim = freqs_cis.ndim - if freqs_cis_ndim == 3: - # freqs_cis: (seq_len, n_heads, head_dim // 2) - shape = [ - d if (i == ndim - 3 or i == ndim - 2 or i == ndim - 1) else 1 - for i, d in enumerate(x.shape) - ] - else: - # freqs_cis: (seq_len, head_dim // 2) - shape = [ - d if i == 1 or i == ndim - 1 else 1 - for i, d in enumerate(x.shape) - ] - return freqs_cis.view(shape) - - # Create sample inputs based on the test file - batch_size = 1 - seq_len = 5 - n_heads = 32 - n_kv_heads = 8 - head_dim = 2048 - - xq = torch.randn(batch_size, seq_len, n_heads, head_dim, dtype=torch.float) - xk = torch.randn(batch_size, seq_len, n_kv_heads, head_dim, dtype=torch.float) - freqs_cos = torch.randn(seq_len, head_dim // 2, dtype=torch.float) - freqs_sin = torch.randn(seq_len, head_dim // 2, 
dtype=torch.float) - - sample_inputs = (xq, xk, freqs_cos, freqs_sin) - - model = RotaryEmbeddingModel() - - # Export the model - edge_compile_config = EdgeCompileConfig( - _skip_dim_order=False, - _check_ir_validity=False, - ) - - program = torch.export.export(model, sample_inputs, strict=True) - - edge_manager = to_edge( - program, - compile_config=edge_compile_config, - ) - - # Apply the rotary embedding pass - ep = edge_manager._edge_programs["forward"] - rotary_pass = FusePatternsPass(ep) - result = rotary_pass.call(ep.graph_module) - - # Verify that the pass was successful - self.assertTrue(result.modified) - - # Check that the custom op was created - gm = ep.graph_module - custom_op_count = 0 - for node in gm.graph.nodes: - if ( - node.op == "call_function" - and hasattr(node.target, "__name__") - and "apply_rotary_emb" in str(node.target) - ): - custom_op_count += 1 - - # We expect at least one custom op to be created - self.assertGreater(custom_op_count, 0) diff --git a/backends/vulkan/test/tester.py b/backends/vulkan/test/tester.py deleted file mode 100644 index b2066a06ec0..00000000000 --- a/backends/vulkan/test/tester.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Any, List, Optional, Sequence, Tuple - -import executorch -import executorch.backends.test.harness.stages as BaseStages - -import torch -from executorch.backends.test.harness import Tester as TesterBase -from executorch.backends.test.harness.stages import StageType -from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner -from executorch.backends.vulkan.quantizer.vulkan_quantizer import ( - get_symmetric_quantization_config as get_symmetric_quantization_config_vulkan, - VulkanQuantizer, -) -from executorch.exir import EdgeCompileConfig -from executorch.exir.backend.partitioner import Partitioner -from torchao.quantization.pt2e.quantizer import Quantizer - - -class Quantize(BaseStages.Quantize): - def __init__( - self, - quantizer: Optional[Quantizer] = None, - quantization_config: Any | None = None, - calibrate: bool = True, - calibration_samples: Optional[Sequence[Any]] = None, - is_qat: Optional[bool] = False, - ): - super().__init__( - quantizer=quantizer or VulkanQuantizer(), - quantization_config=( - quantization_config or get_symmetric_quantization_config_vulkan() - ), - calibrate=calibrate, - calibration_samples=calibration_samples, - is_qat=is_qat, - ) - - -class Partition(BaseStages.Partition): - def __init__(self, partitioner: Optional[Partitioner] = None): - super().__init__( - partitioner=partitioner or VulkanPartitioner(), - ) - - -class ToEdgeTransformAndLower(BaseStages.ToEdgeTransformAndLower): - def __init__( - self, - partitioners: Optional[List[Partitioner]] = None, - edge_compile_config: Optional[EdgeCompileConfig] = None, - ): - super().__init__( - default_partitioner_cls=VulkanPartitioner, - partitioners=partitioners, - edge_compile_config=edge_compile_config - or EdgeCompileConfig(_check_ir_validity=False), - ) - - -class VulkanTester(TesterBase): - def __init__( - self, - module: torch.nn.Module, - example_inputs: Tuple[torch.Tensor], - dynamic_shapes: Optional[Tuple[Any]] = None, - ): - stage_classes = ( - executorch.backends.test.harness.Tester.default_stage_classes() - | { - StageType.PARTITION: Partition, - 
StageType.TO_EDGE_TRANSFORM_AND_LOWER: ToEdgeTransformAndLower, - } - ) - - super().__init__( - module=module, - stage_classes=stage_classes, - example_inputs=example_inputs, - dynamic_shapes=dynamic_shapes, - ) diff --git a/backends/vulkan/test/utils.py b/backends/vulkan/test/utils.py deleted file mode 100644 index 363ee37058d..00000000000 --- a/backends/vulkan/test/utils.py +++ /dev/null @@ -1,787 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - - -import logging -from collections import OrderedDict -from copy import deepcopy - -from enum import auto, Enum -from typing import Any, List, Optional, Tuple - -import executorch.backends.vulkan.utils as utils - -import torch - -from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner -from executorch.backends.vulkan.vulkan_preprocess import VulkanBackend -from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner - -from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import ( - get_symmetric_quantization_config, - XNNPACKQuantizer, -) -from executorch.backends.xnnpack.xnnpack_preprocess import XnnpackBackend -from executorch.devtools import BundledProgram -from executorch.devtools.bundled_program.config import MethodTestCase, MethodTestSuite -from executorch.devtools.bundled_program.serialize import ( - serialize_from_bundled_program_to_flatbuffer, -) -from executorch.exir import ExecutorchProgramManager, to_edge_transform_and_lower -from executorch.extension.pybindings.portable_lib import ( # @manual - _load_for_executorch_from_buffer, -) -from executorch.extension.pytree import tree_flatten -from torch.export import export, export_for_training - -from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e - - -class QuantizationMode(Enum): - """Enum to describe how a model should be quantized.""" - - NONE = auto() - INT8_STATIC_PER_CHANNEL = auto() - - -def get_exported_graph( - model, - sample_inputs, - dynamic_shapes=None, - qmode=QuantizationMode.NONE, -) -> torch.fx.GraphModule: - export_training_graph = export_for_training( - model, sample_inputs, dynamic_shapes=dynamic_shapes, strict=True - ).module() - - if qmode == QuantizationMode.NONE: - return export_training_graph - - quantizer = XNNPACKQuantizer() - - operator_config = get_symmetric_quantization_config(is_per_channel=True) - quantizer.set_global(operator_config) - - prepared_graph = prepare_pt2e(export_training_graph, quantizer) - prepared_graph(*sample_inputs) - converted_graph = convert_pt2e(prepared_graph) - - return converted_graph - - -def random_uniform_tensor(shape, low=0.0, high=1.0, device=None, dtype=None): - if dtype is None: - dtype = torch.float32 - - return torch.empty(shape, device=device, dtype=dtype).uniform_(low, high) - - -def export_model_to_vulkan( - model, - sample_inputs, - dynamic_shapes=None, - operator_blocklist=None, - operator_allowlist=None, - nn_module_blocklist=None, - nn_module_allowlist=None, - qmode=QuantizationMode.NONE, -): - compile_options = {} - exported_graph = get_exported_graph(model, sample_inputs, qmode=qmode) - program = export( - exported_graph, - sample_inputs, - dynamic_shapes=dynamic_shapes, - strict=True, - ) - - edge_program = to_edge_transform_and_lower( - program, - partitioner=[ - VulkanPartitioner( - compile_options, - operator_blocklist=operator_blocklist, - 
operator_allowlist=operator_allowlist, - nn_module_blocklist=nn_module_blocklist, - nn_module_allowlist=nn_module_allowlist, - ) - ], - transform_passes=None, - compile_config=None, - ) - - executorch_program = edge_program.to_executorch() - - # Check if the delegate ID matches VulkanBackend - if ( - executorch_program.executorch_program.execution_plan[0].delegates[0].id - != VulkanBackend.__name__ - ): - raise RuntimeError( - f"Expected delegate ID {VulkanBackend.__name__}, but got {executorch_program.executorch_program.execution_plan[0].delegates[0].id}" - ) - - return executorch_program - - -def export_model_to_xnnpack( - model, - sample_inputs, - dynamic_shapes=None, - operator_blocklist=None, - operator_allowlist=None, - nn_module_blocklist=None, - nn_module_allowlist=None, - qmode=QuantizationMode.NONE, -): - compile_options = {} - exported_graph = get_exported_graph(model, sample_inputs, qmode=qmode) - program = export( - exported_graph, - sample_inputs, - dynamic_shapes=dynamic_shapes, - strict=True, - ) - - edge_program = to_edge_transform_and_lower( - program, - partitioner=[XnnpackPartitioner(compile_options)], - transform_passes=None, - compile_config=None, - ) - - executorch_program = edge_program.to_executorch() - - # Check if the delegate ID matches XnnpackBackend - if ( - executorch_program.executorch_program.execution_plan[0].delegates[0].id - != XnnpackBackend.__name__ - ): - raise RuntimeError( - f"Expected delegate ID {XnnpackBackend.__name__}, but got {executorch_program.executorch_program.execution_plan[0].delegates[0].id}" - ) - - return executorch_program - - -def print_tensor_comparison_errors( - tensor1, tensor2, atol=1e-03, rtol=1e-03, max_errors=10 -): - """ - Print the first max_errors tensor indexes that exceed the absolute/relative tolerance - and the error at each of those locations. 
- - Args: - tensor1: First tensor to compare - tensor2: Second tensor to compare - atol: Absolute tolerance - rtol: Relative tolerance - max_errors: Maximum number of errors to print (default: 10) - """ - # Handle lists/tuples of tensors - if isinstance(tensor1, (list, tuple)) and isinstance(tensor2, (list, tuple)): - if len(tensor1) != len(tensor2): - print(f"Tensor count mismatch: {len(tensor1)} vs {len(tensor2)}") - return - - for i, (t1, t2) in enumerate(zip(tensor1, tensor2)): - print(f"\n=== Tensor {i} comparison ===") - print_tensor_comparison_errors(t1, t2, atol, rtol, max_errors) - return - - # Handle single tensor comparison - if not isinstance(tensor1, torch.Tensor) or not isinstance(tensor2, torch.Tensor): - print("Error: Both inputs must be torch.Tensor objects") - return - - if tensor1.shape != tensor2.shape: - print(f"Shape mismatch: {tensor1.shape} vs {tensor2.shape}") - return - - # Calculate absolute and relative errors - abs_diff = torch.abs(tensor1 - tensor2) - rel_diff = abs_diff / ( - torch.abs(tensor2) + 1e-8 - ) # Add small epsilon to avoid division by zero - - # Find locations where tolerance is exceeded - tolerance_mask = (abs_diff > atol) & (rel_diff > rtol) - - if not tolerance_mask.any(): - print("All values are within tolerance") - return - - # Get indices where tolerance is exceeded - error_indices = torch.nonzero(tolerance_mask, as_tuple=False) - total_errors = error_indices.shape[0] - - print(f"Found {total_errors} values exceeding tolerance (atol={atol}, rtol={rtol})") - print(f"Showing first {min(max_errors, total_errors)} errors:") - print("Index -> tensor1_value, tensor2_value, abs_error, rel_error") - - # Print first max_errors locations - for i in range(min(max_errors, total_errors)): - idx = tuple(error_indices[i].tolist()) - val1 = tensor1[idx].item() - val2 = tensor2[idx].item() - abs_err = abs_diff[idx].item() - rel_err = rel_diff[idx].item() - - print( - f"{idx} -> {val1:.6f}, {val2:.6f}, abs_err={abs_err:.6f}, rel_err={rel_err:.6f}" - ) - - -def check_outputs_equal( - model_output, ref_output, atol=1e-03, rtol=1e-03, first_output_only=False -): - """ - Helper function that checks if model output and reference output are equal with some tolerance. - Returns True if equal, False otherwise. 
- """ - # Convert OrderedDict to list if needed - if isinstance(ref_output, OrderedDict): - ref_output = list(ref_output.values()) - - # Compare the result from executor and eager mode directly - if isinstance(ref_output, tuple) or isinstance(ref_output, list): - # Multiple outputs executor always returns tuple, even if there is one output - if len(ref_output) != len(model_output): - print_tensor_comparison_errors(model_output, ref_output, atol, rtol) - return False - if first_output_only: - result = torch.allclose( - model_output[0], ref_output[0], atol=atol, rtol=rtol - ) - if not result: - print_tensor_comparison_errors( - model_output[0], ref_output[0], atol, rtol - ) - return result - else: - for i in range(len(ref_output)): - if not torch.allclose( - model_output[i], ref_output[i], atol=atol, rtol=rtol - ): - print(f"\n=== Output {i} comparison failed ===") - print_tensor_comparison_errors( - model_output[i], ref_output[i], atol, rtol - ) - return False - return True - else: - # If one output, eager returns tensor while executor tuple of size 1 - result = torch.allclose(model_output[0], ref_output, atol=atol, rtol=rtol) - if not result: - print_tensor_comparison_errors(model_output[0], ref_output, atol, rtol) - return result - - -def run_and_check_output( - reference_model: torch.nn.Module, - executorch_program: ExecutorchProgramManager, - sample_inputs: Tuple[torch.Tensor], - atol=1e-03, - rtol=1e-01, - first_output_only=False, -) -> bool: - """ - Utility function that accepts an already lowered ExecuTorch program, executes it with - the provided sample input, and checks the output for correctness. - - Args: - executorch_program: Already lowered ExecutorchProgramManager - sample_inputs: Sample inputs to run the program with - reference_model: Reference model to generate reference outputs for comparison - atol: Absolute tolerance for output comparison - rtol: Relative tolerance for output comparison - first_output_only: Whether to compare only the first output - - Returns: - bool: True if outputs match within tolerance, False otherwise - """ - # Load the ExecutorTorch program - executorch_module = _load_for_executorch_from_buffer(executorch_program.buffer) - - # Flatten inputs for execution - inputs_flattened, _ = tree_flatten(sample_inputs) - - # Run the ExecutorTorch program - model_output = executorch_module.run_method("forward", tuple(inputs_flattened)) - - # Generate reference outputs using the reference model - ref_output = reference_model(*sample_inputs) - - # Check if outputs are equal - return check_outputs_equal( - model_output, - ref_output, - atol=atol, - rtol=rtol, - first_output_only=first_output_only, - ) - - -def make_copy_of_inputs(sample_inputs: Tuple[Any]) -> Tuple[Any]: - sample_inputs_copy = [] - for input_val in sample_inputs: - if isinstance(input_val, torch.Tensor): - sample_inputs_copy.append(input_val.clone()) - else: - sample_inputs_copy.append(deepcopy(input_val)) - return tuple(sample_inputs_copy) - - -def lower_module_and_test_output( - model: torch.nn.Module, - sample_inputs: Tuple[torch.Tensor], - atol=1e-03, - rtol=1e-01, - dynamic_shapes=None, - test_inputs=None, - first_output_only=False, - operator_blocklist=None, - operator_allowlist=None, - nn_module_allowlist=None, - nn_module_blocklist=None, - xnnpack=False, -) -> bool: - """ - Helper testing function that takes a torch.nn.Module and lowers it to Vulkan with - the given sample inputs. It then runs the lowered module and compares its - outputs with the outputs of the eager module. 
- - Returns: - bool: True if all comparisons pass, False otherwise. - """ - # Export model to Vulkan using the helper function - if xnnpack: - executorch_program = export_model_to_xnnpack( - model, - make_copy_of_inputs(sample_inputs), - dynamic_shapes, - operator_blocklist, - operator_allowlist, - nn_module_blocklist, - nn_module_allowlist, - ) - else: - executorch_program = export_model_to_vulkan( - model, - make_copy_of_inputs(sample_inputs), - dynamic_shapes, - operator_blocklist=operator_blocklist, - operator_allowlist=operator_allowlist, - nn_module_blocklist=nn_module_blocklist, - nn_module_allowlist=nn_module_allowlist, - ) - - executorch_module = _load_for_executorch_from_buffer(executorch_program.buffer) - - inputs_flattened, _ = tree_flatten(sample_inputs) - - model_output = executorch_module.run_method("forward", tuple(inputs_flattened)) - ref_output = model(*make_copy_of_inputs(sample_inputs)) - - if not check_outputs_equal( - model_output, - ref_output, - atol=atol, - rtol=rtol, - first_output_only=first_output_only, - ): - return False - - if test_inputs is not None: - for test_input in test_inputs: - test_inputs_flattened, _ = tree_flatten(test_input) - model_output = executorch_module.run_method( - "forward", tuple(test_inputs_flattened) - ) - ref_output = model(*test_input) - - if not check_outputs_equal( - model_output, - ref_output, - atol=atol, - rtol=rtol, - first_output_only=first_output_only, - ): - return False - - return True - - -def save_bundled_program( - model: torch.nn.Module, - sample_inputs: Tuple[torch.Tensor], - output_path: str, - method_name: str = "forward", - et_program: Optional[ExecutorchProgramManager] = None, - dynamic_shapes=None, -) -> str: - """ - Export a bundled .pte file containing the model and test cases. - - Args: - model: The PyTorch model to export - sample_inputs: Sample inputs for the model - output_path: Path where the bundled .pte file should be saved (should end with .bpte) - method_name: Name of the method to test (default: "forward") - et_program: Optional pre-exported ExecutorchProgramManager. If None, will export to Vulkan - dynamic_shapes: Optional dynamic shapes for export - - Returns: - str: Path to the saved bundled program file - """ - # If no ExecutorchProgramManager provided, export to Vulkan - if et_program is None: - et_program = export_model_to_vulkan(model, sample_inputs, dynamic_shapes) - - # Generate expected outputs by running the model - expected_outputs = [getattr(model, method_name)(*sample_inputs)] - - # Flatten sample inputs to match expected format - inputs_flattened, _ = tree_flatten(sample_inputs) - - # Create test suite with the sample inputs and expected outputs - test_suites = [ - MethodTestSuite( - method_name=method_name, - test_cases=[ - MethodTestCase( - inputs=inputs_flattened, - expected_outputs=expected_outputs, - ) - ], - ) - ] - - # Create bundled program - bp = BundledProgram(et_program, test_suites) - - # Serialize to flatbuffer - bp_buffer = serialize_from_bundled_program_to_flatbuffer(bp) - - # Ensure output path has correct extension - if not output_path.endswith(".bpte"): - output_path = output_path + ".bpte" - - # Write to file - with open(output_path, "wb") as file: - file.write(bp_buffer) - return output_path - - -def save_executorch_program( - executorch_program: ExecutorchProgramManager, - output_path: str, -) -> str: - """ - Save an ExecutorchProgramManager as a .pte file. 
- - Args: - executorch_program: The ExecutorchProgramManager to save - output_path: Path where the .pte file should be saved (should end with .pte) - - Returns: - str: Path to the saved .pte file - """ - # Ensure output path has correct extension - if not output_path.endswith(".pte"): - output_path = output_path + ".pte" - - # Write to file - with open(output_path, "wb") as file: - executorch_program.write_to_file(file) - - return output_path - - -def print_occurrences(edge_program, operator_list: List): - """ - Print the input/output information for all occurrences of specified operators in the edge program. - - Args: - edge_program: The edge program created by to_edge_transform_and_lower - operator_list: List of operators to search for in the graph - """ - logger = logging.getLogger("") - logger.setLevel(logging.INFO) - - logger.info( - f"Searching for occurrences of {len(operator_list)} operators in the graph..." - ) - - occurrence_count = 0 - - for node in edge_program.exported_program().graph.nodes: - if utils.is_torch_op_node(node): - target = node.target - # Handle auto_functionalized nodes - if node.target == torch.ops.higher_order.auto_functionalized: - first_arg = node.args[0] - if hasattr(first_arg, "name"): - target = first_arg.name() - elif hasattr(first_arg, "__name__"): - target = first_arg.__name__ - - # Check if this operator is in our list - if target in operator_list: - occurrence_count += 1 - logger.info(f"Occurrence {occurrence_count}: {node.format_node()}") - - # Get the node I/O string using the utils function - try: - io_str = utils.node_io_str(node) - logger.info(f" {io_str}") - except Exception as e: - logger.info(f" Error getting I/O string: {e}") - - if occurrence_count == 0: - logger.info("No occurrences of the specified operators found in the graph.") - else: - logger.info( - f"Found {occurrence_count} total occurrences of the specified operators." - ) - - -def op_ablation_test( # noqa: C901 - model: torch.nn.Module, - sample_inputs: Tuple[torch.Tensor], - atol=1e-03, - rtol=1e-01, - dynamic_shapes=None, - test_inputs=None, - first_output_only=False, -) -> dict: - """ - Fast binary search utility function to determine which operators work correctly when delegated to Vulkan. - - This function uses a binary search approach to efficiently find bad operators: - 1. Split operators into two halves (least frequent first, most frequent second) - 2. Test each half to see if it produces correct output - 3. Add good halves to known_good_ops and recursively search bad halves - 4. 
Continue until all operators are classified - - Args: - model: The PyTorch model to test - sample_inputs: Sample inputs for the model - atol: Absolute tolerance for output comparison - rtol: Relative tolerance for output comparison - dynamic_shapes: Optional dynamic shapes for export - test_inputs: Optional additional test inputs - first_output_only: Whether to compare only the first output - - Returns: - dict: Dictionary with keys: - - 'good_operators': List of operators that work correctly - - 'bad_operators': List of operators that cause failures - - 'operator_frequencies': Dictionary mapping operators to their occurrence count - - 'all_operators': List of all unique operators found in the graph - - 'test_count': Number of tests performed - """ - logger = logging.getLogger("") - logger.setLevel(logging.INFO) - - logger.info("Starting fast binary search operator ablation test...") - - # Step 1: Export model to get edge_program and extract operators - export_training_graph = export_for_training( - model, sample_inputs, strict=True - ).module() - program = export( - export_training_graph, - sample_inputs, - dynamic_shapes=dynamic_shapes, - strict=True, - ) - edge_program = to_edge_transform_and_lower( - program, - partitioner=[], # No partitioner to get the full graph - transform_passes=None, - compile_config=None, - ) - - # Step 2: Scan edge_program.graph_module to obtain unique operators and their frequencies - operator_frequencies = {} - for node in edge_program.exported_program().graph.nodes: - if utils.is_torch_op_node(node): - target = node.target - # Handle auto_functionalized nodes - if node.target == torch.ops.higher_order.auto_functionalized: - first_arg = node.args[0] - if hasattr(first_arg, "name"): - target = first_arg.name() - elif hasattr(first_arg, "__name__"): - target = first_arg.__name__ - - if target in operator_frequencies: - operator_frequencies[target] += 1 - else: - operator_frequencies[target] = 1 - - all_operators = list(operator_frequencies.keys()) - logger.info(f"Found {len(all_operators)} unique operators in the graph") - - # Sort operators by frequency (most frequent first for binary search) - operators_by_frequency = sorted( - all_operators, key=lambda op: operator_frequencies[op], reverse=True - ) - - logger.info("Operator frequencies (sorted by occurrence, most frequent first):") - for op in operators_by_frequency: - logger.info(f" {op.name()}: {operator_frequencies[op]} occurrences") - - # Global test counter - test_count = 0 - - def test_operator_set(ops_to_test: List, known_good_ops: List) -> bool: - """Test if a set of operators works correctly when combined with known good operators.""" - nonlocal test_count - test_count += 1 - - test_allowlist = known_good_ops + ops_to_test - logger.info( - f"Test {test_count}: Testing {len(ops_to_test)} operators with {len(known_good_ops)} known good" - ) - - try: - success = lower_module_and_test_output( - model=model, - sample_inputs=sample_inputs, - atol=atol, - rtol=rtol, - dynamic_shapes=dynamic_shapes, - test_inputs=test_inputs, - first_output_only=first_output_only, - operator_allowlist=test_allowlist, - ) - logger.info(f" {'✓ PASS' if success else '✗ FAIL'}") - - # Log known good ops - logger.info(" Known good:") - for op in known_good_ops: - logger.info(f" * {op.name()}") - - # Log tested ops - logger.info(" Tested ops:") - for op in ops_to_test: - logger.info(f" * {op.name()}") - - return success - except Exception as e: - logger.info(f" ! 
Error: {e}") - return False - - def find_bad_operators( - ops_to_test: List, known_good_ops: List - ) -> Tuple[List, List]: - """ - Recursively find bad operators using binary search. - - Returns: - Tuple of (good_operators, bad_operators) from ops_to_test - """ - if not ops_to_test: - return [], [] - - if len(ops_to_test) == 1: - # Base case: single operator - op = ops_to_test[0] - if test_operator_set([op], known_good_ops): - logger.info(f" Single operator {op.name()} is GOOD") - return [op], [] - else: - logger.info(f" Single operator {op.name()} is BAD") - return [], [op] - - # Split ops_to_test into two halves - mid = len(ops_to_test) // 2 - first_half = ops_to_test[:mid] # Least frequent operators - second_half = ops_to_test[mid:] # Most frequent operators - - logger.info( - f"Splitting {len(ops_to_test)} operators: {len(first_half)} + {len(second_half)}" - ) - - # Log known good ops - logger.info(" Known good:") - for op in known_good_ops: - logger.info(f" * {op.name()}") - - # Log first half ops - logger.info(" First half ops:") - for op in first_half: - logger.info(f" * {op.name()}") - - # Log second half ops - logger.info(" Second half ops:") - for op in second_half: - logger.info(f" * {op.name()}") - - good_ops = [] - bad_ops = [] - - first_half_good = test_operator_set(first_half, known_good_ops) - if first_half_good: - logger.info( - f"First half ({len(first_half)} ops) is good - adding to known good" - ) - good_ops.extend(first_half) - known_good_ops.extend(first_half) - - second_half_good = test_operator_set(second_half, known_good_ops) - if second_half_good: - logger.info( - f"Second half ({len(second_half)} ops) is good - adding to known good" - ) - good_ops.extend(second_half) - - if not first_half_good: - logger.info(f"First half ({len(first_half)} ops) is bad - recursing") - sub_good, sub_bad = find_bad_operators(first_half, known_good_ops) - good_ops.extend(sub_good) - bad_ops.extend(sub_bad) - known_good_ops.extend(sub_good) - if not second_half_good: - logger.info(f"Second half ({len(second_half)} ops) is bad - recursing") - sub_good, sub_bad = find_bad_operators(second_half, known_good_ops) - good_ops.extend(sub_good) - bad_ops.extend(sub_bad) - - return good_ops, bad_ops - - # Start the binary search - logger.info( - f"\n=== Starting binary search on {len(operators_by_frequency)} operators ===" - ) - good_operators, bad_operators = find_bad_operators(operators_by_frequency, []) - - # Summary of results - logger.info(f"\n=== Binary search complete after {test_count} tests ===") - logger.info(f"Good operators ({len(good_operators)}):") - for op in good_operators: - logger.info(f" ✓ {op.name()} (frequency: {operator_frequencies[op]})") - - logger.info(f"Bad operators ({len(bad_operators)}):") - for op in bad_operators: - logger.info(f" ✗ {op.name()} (frequency: {operator_frequencies[op]})") - - print_occurrences(edge_program, bad_operators) - - efficiency_gain = len(all_operators) - test_count - logger.info( - f"Efficiency: {test_count} tests instead of {len(all_operators)} (saved {efficiency_gain} tests)" - ) - - return { - "good_operators": good_operators, - "bad_operators": bad_operators, - "operator_frequencies": operator_frequencies, - "all_operators": all_operators, - "test_count": test_count, - } diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp deleted file mode 100644 index 07d28229221..00000000000 --- a/backends/vulkan/test/utils/test_utils.cpp +++ /dev/null @@ -1,626 +0,0 @@ -/* - * Copyright (c) Meta 
Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include - -#include - -#include -#include -#include - -using namespace vkcompute; - -bool is_bitw8(vkapi::ScalarType dtype) { - return dtype == vkapi::kByte || dtype == vkapi::kChar || - dtype == vkapi::kQInt8 || dtype == vkapi::kQUInt8; -} - -vkapi::ShaderInfo get_nchw_to_tensor_shader( - const api::vTensor& v_dst, - bool int8_buffer_enabled, - bool push_constant_variant) { - std::string kernel_name; - kernel_name.reserve(kShaderNameReserve); - - if (is_bitw8(v_dst.dtype()) && v_dst.storage_type() != utils::kBuffer && - !int8_buffer_enabled) { - kernel_name = "nchw_to_bitw8_image_nobitw8buffer"; - if (!push_constant_variant) { - kernel_name += "_no_pc"; - } - add_storage_type_suffix(kernel_name, v_dst.storage_type()); - add_dtype_suffix(kernel_name, v_dst.dtype()); - return VK_KERNEL_FROM_STR(kernel_name); - } - - if (v_dst.storage_type() == utils::kBuffer) { - kernel_name = "nchw_to_buffer"; - add_dtype_suffix(kernel_name, v_dst.dtype()); - return VK_KERNEL_FROM_STR(kernel_name); - } - - kernel_name = "nchw_to_image"; - if (!push_constant_variant) { - kernel_name += "_no_pc"; - } - add_storage_type_suffix(kernel_name, v_dst.storage_type()); - add_dtype_suffix(kernel_name, v_dst.dtype()); - - return VK_KERNEL_FROM_STR(kernel_name); -} - -vkapi::ShaderInfo get_tensor_to_nchw_shader( - const api::vTensor& v_src, - bool int8_buffer_enabled, - bool push_constant_variant) { - std::string kernel_name; - kernel_name.reserve(kShaderNameReserve); - - if (is_bitw8(v_src.dtype()) && v_src.storage_type() != utils::kBuffer && - !int8_buffer_enabled) { - kernel_name = "bitw8_image_to_nchw_nobitw8buffer"; - if (!push_constant_variant) { - kernel_name += "_no_pc"; - } - add_storage_type_suffix(kernel_name, v_src.storage_type()); - add_dtype_suffix(kernel_name, v_src.dtype()); - return VK_KERNEL_FROM_STR(kernel_name); - } - - if (v_src.storage_type() == utils::kBuffer) { - kernel_name = "buffer_to_nchw"; - add_dtype_suffix(kernel_name, v_src.dtype()); - return VK_KERNEL_FROM_STR(kernel_name); - } - - kernel_name = "image_to_nchw"; - if (!push_constant_variant) { - kernel_name += "_no_pc"; - } - add_storage_type_suffix(kernel_name, v_src.storage_type()); - add_dtype_suffix(kernel_name, v_src.dtype()); - - return VK_KERNEL_FROM_STR(kernel_name); -} -// -// Operator Recording Functions -// - -void record_nchw_to_buffer_op( - api::Context* const context, - vkapi::VulkanBuffer& src_buffer, - api::vTensor& v_dst) { - vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = {v_dst.hashed_layout()}; - - context->submit_compute_job( - get_nchw_to_tensor_shader(v_dst, true, false), - pipeline_barrier, - {uint32_t(v_dst.numel()), 1, 1}, - {64, 1, 1}, - specialization_constants, - VK_NULL_HANDLE, - 0, - v_dst.buffer( - pipeline_barrier, - vkapi::PipelineStage::COMPUTE, - vkapi::MemoryAccessType::WRITE), - src_buffer, - v_dst.buffer_meta_ubo()); -} - -void record_buffer_to_nchw_op( - api::Context* const context, - api::vTensor& v_src, - vkapi::VulkanBuffer& dst_buffer) { - vkapi::PipelineBarrier pipeline_barrier{}; - context->submit_compute_job( - get_tensor_to_nchw_shader(v_src, true, false), - pipeline_barrier, - {uint32_t(v_src.numel()), 1, 1}, - {64, 1, 1}, - {}, - VK_NULL_HANDLE, - 0, - dst_buffer, - v_src.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - 
v_src.buffer_meta_ubo()); -} - -void record_nchw_to_image_op( - api::Context* const context, - vkapi::VulkanBuffer& src_buffer, - api::vTensor& v_dst) { - vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = {v_dst.hashed_layout()}; - - context->submit_compute_job( - get_nchw_to_tensor_shader( - v_dst, - context->adapter_ptr()->has_full_int8_buffers_support(), - false), - pipeline_barrier, - v_dst.logical_limits(), - adaptive_work_group_size(v_dst.logical_limits()), - specialization_constants, - VK_NULL_HANDLE, - 0, - v_dst.image( - pipeline_barrier, - vkapi::PipelineStage::COMPUTE, - vkapi::MemoryAccessType::WRITE), - src_buffer, - v_dst.sizes_ubo()); -} - -void record_image_to_nchw_op( - api::Context* const context, - api::vTensor& v_src, - vkapi::VulkanBuffer& dst_buffer) { - vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = {v_src.hashed_layout()}; - - context->submit_compute_job( - get_tensor_to_nchw_shader(v_src, true, false), - pipeline_barrier, - v_src.logical_limits(), - adaptive_work_group_size(v_src.logical_limits()), - specialization_constants, - VK_NULL_HANDLE, - 0, - dst_buffer, - v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - v_src.sizes_ubo()); -} - -void record_bitw8_image_to_nchw_nobitw8buffer_op( - api::Context* const context, - api::vTensor& v_src, - api::StagingBuffer& dst_buffer) { - vkapi::PipelineBarrier pipeline_barrier{}; - uint32_t buffer_len = utils::safe_downcast(dst_buffer.numel() / 4); - utils::uvec3 global_wg_size = {buffer_len, 1, 1}; - - std::string kernel_name = "bitw8_image_to_nchw_nobitw8buffer_no_pc"; - add_storage_type_suffix(kernel_name, v_src.storage_type()); - add_dtype_suffix(kernel_name, v_src.dtype()); - - context->submit_compute_job( - VK_KERNEL_FROM_STR(kernel_name), - pipeline_barrier, - global_wg_size, - adaptive_work_group_size(global_wg_size), - {v_src.hashed_layout()}, - VK_NULL_HANDLE, - 0, - dst_buffer.buffer(), - v_src.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - v_src.sizes_ubo(), - v_src.numel_ubo()); -} - -void record_binary_op( - api::Context* const context, - const std::string& op_name, - api::vTensor& v_in1, - api::vTensor& v_in2, - api::vTensor& v_dst) { - std::string kernel_name = "binary_" + op_name + "_nobroadcast__test"; - add_dtype_suffix(kernel_name, v_dst.dtype()); - - vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = {}; - context->submit_compute_job( - VK_KERNEL_FROM_STR(kernel_name), - pipeline_barrier, - v_dst.logical_limits(), - adaptive_work_group_size(v_dst.logical_limits()), - specialization_constants, - VK_NULL_HANDLE, - 0, - v_dst.image( - pipeline_barrier, - vkapi::PipelineStage::COMPUTE, - vkapi::MemoryAccessType::WRITE), - v_in1.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - v_in2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - v_dst.sizes_ubo()); -} - -void execute_and_check_add( - api::vTensor& a, - api::vTensor& b, - api::vTensor& c, - float a_val, - float b_val) { - // Fill input tensors - fill_vtensor(a, a_val); - fill_vtensor(b, b_val); - - // a + b = c - record_binary_op(api::context(), "add", a, b, c); - - // Extract output tensor - std::vector data_out = extract_vtensor(c); - - // Check output - for (size_t i = 0; i < data_out.size(); ++i) { - CHECK_VALUE(data_out, i, (a_val + b_val)); - } -} - -void record_index_fill_buffer(api::Context* context, api::vTensor& v_ten) { - std::string kernel_name("idx_fill_buffer"); - switch 
(v_ten.dtype()) { - case vkapi::kFloat: - kernel_name += "_float"; - break; - case vkapi::kHalf: - kernel_name += "_half"; - break; - case vkapi::kQInt8: - kernel_name += "_int8"; - break; - case vkapi::kQUInt8: - kernel_name += "_uint8"; - break; - default: - throw std::runtime_error("Unsupported dtype"); - break; - } - - api::ParamsBuffer params(api::context(), int32_t(v_ten.numel())); - - { - vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = {}; - api::context()->submit_compute_job( - VK_KERNEL_FROM_STR(kernel_name), - pipeline_barrier, - {uint32_t(v_ten.numel()), 1, 1}, - {64, 1, 1}, - specialization_constants, - VK_NULL_HANDLE, - 0, - v_ten.buffer( - pipeline_barrier, - vkapi::PipelineStage::COMPUTE, - vkapi::MemoryAccessType::READ), - params.buffer()); - } -} - -void record_scalar_add_buffer( - api::Context* context, - api::vTensor& v_ten, - float offset) { - vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = {SV(offset)}; - std::string kernel = "scalar_add_buffer"; - add_dtype_suffix(kernel, v_ten.dtype()); - api::context()->submit_compute_job( - VK_KERNEL_FROM_STR(kernel), - pipeline_barrier, - {uint32_t(v_ten.numel()), 1, 1}, - {64, 1, 1}, - specialization_constants, - VK_NULL_HANDLE, - 0, - v_ten.buffer( - pipeline_barrier, - vkapi::PipelineStage::COMPUTE, - vkapi::MemoryAccessType::READ | vkapi::MemoryAccessType::WRITE), - v_ten.numel_ubo()); -} - -void record_reference_matmul( - api::Context* context, - api::vTensor& out, - api::vTensor& mat1, - api::vTensor& mat2) { - vkapi::PipelineBarrier pipeline_barrier{}; - api::context()->submit_compute_job( - VK_KERNEL(reference_matmul), - pipeline_barrier, - {uint32_t(out.size(1)), uint32_t(out.size(0)), 1}, - {64, 1, 1}, - {}, - VK_NULL_HANDLE, - 0, - out.buffer( - pipeline_barrier, - vkapi::PipelineStage::COMPUTE, - vkapi::MemoryAccessType::WRITE), - mat1.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - mat2.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - out.sizes_ubo(), - out.strides_ubo(), - mat1.sizes_ubo(), - mat1.strides_ubo(), - mat2.sizes_ubo(), - mat2.strides_ubo()); -} - -void record_matmul_texture3d( - api::Context* context, - api::vTensor& out, - api::vTensor& mat1, - api::vTensor& mat2) { - std::string kernel_name = "matmul_naive"; - kernel_name.reserve(kShaderNameReserve); - add_storage_type_suffix(kernel_name, out.storage_type()); - add_dtype_suffix(kernel_name, out.dtype()); - - utils::uvec3 global_wg_size = out.logical_limits(); - - vkapi::PipelineBarrier pipeline_barrier{}; - api::context()->submit_compute_job( - VK_KERNEL_FROM_STR(kernel_name), - pipeline_barrier, - global_wg_size, - {8, 8, 1}, - {out.hashed_layout(), mat1.hashed_layout(), mat2.hashed_layout()}, - VK_NULL_HANDLE, - 0, - out.image( - pipeline_barrier, - vkapi::PipelineStage::COMPUTE, - vkapi::MemoryAccessType::WRITE), - mat1.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - mat2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - out.sizes_ubo(), - out.logical_limits_ubo(), - mat1.sizes_ubo(), - mat2.sizes_ubo()); -} - -// -// Input & Output Utilities -// - -#define FORALL_SUPPORTED_TYPES(_) \ - _(uint8_t, Byte) \ - _(int8_t, Char) \ - _(int32_t, Int) \ - _(executorch::aten::Half, Half) \ - _(float, Float) \ - _(int8_t, QInt8) - -void fill_vtensor(api::vTensor& vten, std::vector& data) { - api::StagingBuffer staging_buffer(api::context(), vten.dtype(), data.size()); - -#define CASE(ctype, name) \ - case vkapi::ScalarType::name: { \ - 
std::vector data_converted; \ - data_converted.resize(data.size()); \ - for (int i = 0; i < data.size(); ++i) { \ - data_converted[i] = ctype(data[i]); \ - } \ - staging_buffer.copy_from( \ - data_converted.data(), vten.staging_buffer_nbytes()); \ - } break; - - switch (vten.dtype()) { - FORALL_SUPPORTED_TYPES(CASE) - default: - VK_THROW("Unsupported dtype"); - } - -#undef CASE - - if (vten.storage_type() == utils::StorageType::BUFFER) { - record_nchw_to_buffer_op(api::context(), staging_buffer.buffer(), vten); - } else { - record_nchw_to_image_op(api::context(), staging_buffer.buffer(), vten); - } -} - -void fill_vtensor(api::vTensor& vten, float val, bool iota) { - std::vector vten_data(vten.staging_buffer_numel()); - if (iota) { - std::iota(vten_data.begin(), vten_data.end(), val); - } else { - std::fill(vten_data.begin(), vten_data.end(), val); - } - - fill_vtensor(vten, vten_data); -} - -std::vector create_random_float_buffer( - const size_t numel, - const float min, - const float max) { - std::vector data(numel); - std::default_random_engine rng; - std::uniform_real_distribution dist(min, max); - - for (size_t i = 0; i < data.size(); ++i) { - data[i] = dist(rng); - } - return data; -} - -std::vector create_random_uint8_buffer( - const size_t numel, - const uint8_t min, - const uint8_t max) { - std::vector data(numel); - std::default_random_engine rng; - std::uniform_real_distribution dist(min, max); - - for (size_t i = 0; i < data.size(); ++i) { - data[i] = (uint8_t)dist(rng); - } - return data; -} - -void fill_vtensor( - ComputeGraph& graph, - const IOValueRef idx, - float val, - bool iota) { - std::vector data(graph.numel_of(idx.value)); - if (graph.storage_type_of(idx.value) != utils::kBuffer) { - data.resize(graph.staging_buffer_numel_of(idx.value)); - } - if (iota) { - std::iota(data.begin(), data.end(), val); - } else { - std::fill(data.begin(), data.end(), val); - } - - graph.copy_into_staging(idx.staging, data.data(), data.size()); -} - -void extract_vtensor(api::vTensor& vten, std::vector& data) { - api::StagingBuffer staging_buffer( - api::context(), vten.dtype(), vten.staging_buffer_numel()); - - if (vten.storage_type() == utils::StorageType::BUFFER) { - record_buffer_to_nchw_op(api::context(), vten, staging_buffer.buffer()); - } else { - record_image_to_nchw_op(api::context(), vten, staging_buffer.buffer()); - } - - vkapi::VulkanFence fence = api::context()->fences().get_fence(); - api::context()->submit_cmd_to_gpu(fence.get_submit_handle()); - fence.wait(); - -#define CASE(ctype, name) \ - case vkapi::ScalarType::name: { \ - std::vector data_converted(data.size()); \ - staging_buffer.copy_to( \ - data_converted.data(), vten.staging_buffer_nbytes()); \ - for (int i = 0; i < data.size(); ++i) { \ - data[i] = float(data_converted[i]); \ - } \ - } break; - - switch (vten.dtype()) { - FORALL_SUPPORTED_TYPES(CASE) - default: - VK_THROW("Unsupported dtype"); - } - -#undef CASE -} - -// -// Context Management -// - -void submit_to_gpu() { - vkapi::VulkanFence fence = api::context()->fences().get_fence(); - api::context()->submit_cmd_to_gpu(fence.get_submit_handle()); - fence.wait(); -} - -vkapi::Allocation allocate_memory_for(const api::vTensor& vten) { - VmaAllocationCreateInfo alloc_create_info = - api::context()->adapter_ptr()->vma().gpuonly_resource_create_info(); - return api::context()->adapter_ptr()->vma().create_allocation( - vten.get_memory_requirements(), alloc_create_info); -} - -VmaTotalStatistics get_vma_stats() { - return 
api::context()->adapter_ptr()->vma().get_memory_statistics(); -} - -size_t get_vma_allocation_count() { - return get_vma_stats().total.statistics.allocationCount; -} - -// -// Graph Test Utilities -// - -void execute_graph_and_check_output( - ComputeGraph& graph, - std::vector input_vals, - std::vector expected_outputs) { - assert(input_vals.size() == graph.inputs().size()); - assert(expected_outputs.size() == graph.outputs().size()); - - for (size_t i = 0; i < graph.inputs().size(); ++i) { - fill_vtensor(graph, graph.inputs().at(i), input_vals.at(i)); - } - - graph.execute(); - - for (size_t i = 0; i < graph.outputs().size(); ++i) { - IOValueRef out_ioval = graph.outputs().at(i); - std::vector output_data( - graph.staging_buffer_numel_of(out_ioval.value)); - graph.copy_from_staging( - out_ioval.staging, output_data.data(), output_data.size()); - - for (size_t j = 0; j < graph.numel_of(out_ioval.value); ++j) { - CHECK_VALUE(output_data, j, expected_outputs.at(i)); - } - } -} - -vkcompute::ComputeGraph build_mm_graph( - int B, - int M, - int K, - int N, - vkcompute::vkapi::ScalarType dtype, - vkcompute::utils::StorageType in_out_stype, - vkcompute::utils::GPUMemoryLayout memory_layout, - const std::vector& mat2_data, - const bool prepack_mat2) { - using namespace vkcompute; - GraphConfig config; - config.expect_dynamic_shapes = true; - ComputeGraph graph(config); - - std::vector mat1_size = {M, K}; - std::vector mat2_size = {K, N}; - std::vector out_size = {M, N}; - if (B > 1) { - mat1_size.resize(3); - mat1_size = {B, M, K}; - mat2_size.resize(3); - mat2_size = {B, K, N}; - out_size.resize(3); - out_size = {B, M, N}; - } - - IOValueRef mat1 = - graph.add_input_tensor(mat1_size, dtype, in_out_stype, memory_layout); - IOValueRef mat2{}; - - ValueRef mat2_w = graph.add_tensorref(mat2_size, dtype, mat2_data.data()); - - if (prepack_mat2) { - mat2.value = mat2_w; - } else { - mat2.value = - graph.add_tensor(mat2_size, dtype, in_out_stype, memory_layout); - mat2.staging = graph.set_input_tensor(mat2.value); - } - - IOValueRef out; - out.value = graph.add_tensor(out_size, dtype, in_out_stype, memory_layout); - - VK_GET_OP_FN("aten.mm.default")(graph, {mat1.value, mat2.value, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - return graph; -} - -bool check_close(float a, float b, float atol, float rtol) { - float max = std::max(std::abs(a), std::abs(b)); - float diff = std::abs(a - b); - return diff <= (atol + rtol * max); -} diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h deleted file mode 100644 index 1fd40b6f815..00000000000 --- a/backends/vulkan/test/utils/test_utils.h +++ /dev/null @@ -1,304 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include - -#include - -#include - -#include -#include -#include - -#include - -#define CREATE_FLOAT_TEXTURE(sizes, allocate_memory) \ - vkcompute::api::vTensor( \ - vkcompute::api::context(), \ - sizes, \ - vkapi::kFloat, \ - utils::StorageType::TEXTURE_3D, \ - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, \ - allocate_memory); - -#define CREATE_FLOAT_BUFFER(sizes, allocate_memory) \ - vkcompute::api::vTensor( \ - vkcompute::api::context(), \ - sizes, \ - vkapi::kFloat, \ - utils::StorageType::BUFFER, \ - utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, \ - allocate_memory); - -#define DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(tensor) \ - vkcompute::api::StagingBuffer staging_buffer_##tensor( \ - vkcompute::api::context(), \ - vkapi::kFloat, \ - tensor.staging_buffer_numel()); \ - record_nchw_to_image_op( \ - vkcompute::api::context(), staging_buffer_##tensor.buffer(), tensor); - -#define DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(tensor) \ - vkcompute::api::StagingBuffer staging_buffer_##tensor( \ - vkcompute::api::context(), \ - vkapi::kFloat, \ - tensor.staging_buffer_numel()); \ - record_image_to_nchw_op( \ - vkcompute::api::context(), tensor, staging_buffer_##tensor.buffer()); - -#define CHECK_VALUE(data, idx, expected) \ - do { \ - if (data[idx] != expected) { \ - std::cout << "Output at [" << idx << "] = " << data[idx] \ - << ", does not match expected value " << expected \ - << std::endl; \ - } \ - ASSERT_TRUE(data[idx] == expected); \ - } while (false) - -// -// Operator Recording -// - -void record_nchw_to_buffer_op( - vkcompute::api::Context* const context, - vkcompute::vkapi::VulkanBuffer& src_buffer, - vkcompute::api::vTensor& v_dst); - -void record_buffer_to_nchw_op( - vkcompute::api::Context* const context, - vkcompute::api::vTensor& v_src, - vkcompute::vkapi::VulkanBuffer& dst_buffer); - -void record_nchw_to_image_op( - vkcompute::api::Context* const context, - vkcompute::vkapi::VulkanBuffer& src_buffer, - vkcompute::api::vTensor& v_dst); - -void record_image_to_nchw_op( - vkcompute::api::Context* const context, - vkcompute::api::vTensor& v_src, - vkcompute::vkapi::VulkanBuffer& dst_buffer); - -void record_bitw8_image_to_nchw_nobitw8buffer_op( - vkcompute::api::Context* const context, - vkcompute::api::vTensor& v_src, - vkcompute::api::StagingBuffer& dst_buffer); - -void record_conv2d_prepack_weights_op( - vkcompute::api::Context* const context, - vkcompute::vkapi::VulkanBuffer& src_buffer, - vkcompute::api::vTensor& v_dst, - const std::vector& original_sizes, - const bool transposed); - -void record_binary_op( - vkcompute::api::Context* const context, - const std::string& op_name, - vkcompute::api::vTensor& v_in1, - vkcompute::api::vTensor& v_in2, - vkcompute::api::vTensor& v_dst); - -void execute_and_check_add( - vkcompute::api::vTensor& a, - vkcompute::api::vTensor& b, - vkcompute::api::vTensor& c, - float a_val, - float b_val); - -void record_index_fill_buffer( - vkcompute::api::Context* const context, - vkcompute::api::vTensor& v_ten); - -void record_scalar_add_buffer( - vkcompute::api::Context* context, - vkcompute::api::vTensor& v_ten, - float offset); - -void record_reference_matmul( - vkcompute::api::Context* context, - vkcompute::api::vTensor& out, - vkcompute::api::vTensor& mat1, - vkcompute::api::vTensor& mat2); - -void record_matmul_texture3d( - vkcompute::api::Context* context, - vkcompute::api::vTensor& out, - vkcompute::api::vTensor& mat1, - vkcompute::api::vTensor& mat2); - -// -// Input & Output Utilities -// - -inline 
std::vector create_random_float_vector( - const size_t numel, - const float min = 0.0f, - const float max = 1.0f) { - std::vector result(numel); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dis(min, max); - - for (size_t i = 0; i < numel; ++i) { - result[i] = dis(gen); - } - - return result; -} - -inline void fill_staging( - vkcompute::api::StagingBuffer& staging, - float val, - int numel = -1) { - if (numel < 0) { - numel = staging.numel(); - } - std::vector data(numel); - std::fill(data.begin(), data.end(), val); - staging.copy_from(data.data(), sizeof(float) * numel); -} - -void fill_vtensor(vkcompute::api::vTensor& vten, std::vector& data); - -void fill_vtensor(vkcompute::api::vTensor& vten, float val, bool iota = false); - -std::vector create_random_float_buffer( - const size_t numel, - const float min = 0, - const float max = 1); - -std::vector create_random_uint8_buffer( - const size_t numel, - const uint8_t min = 0, - const uint8_t max = 255); - -void fill_vtensor( - vkcompute::ComputeGraph& graph, - const vkcompute::IOValueRef idx, - float val, - bool iota = false); - -void extract_vtensor(vkcompute::api::vTensor& vten, std::vector& data); - -inline std::vector extract_vtensor(vkcompute::api::vTensor& vten) { - std::vector data_out(vten.staging_buffer_numel()); - extract_vtensor(vten, data_out); - return data_out; -} - -inline void check_staging_buffer( - vkcompute::api::StagingBuffer& staging, - float val, - int numel = -1) { - if (numel < 0) { - numel = staging.numel(); - } - std::vector data(numel); - staging.copy_to(data.data(), sizeof(float) * numel); - - for (size_t i = 0; i < data.size(); ++i) { - CHECK_VALUE(data, i, val); - } -} - -inline int64_t get_buf_idx( - vkcompute::ComputeGraph& graph, - vkcompute::IOValueRef ref, - const std::vector& tensor_coor) { - const std::vector& sizes = graph.sizes_of(ref.value); - - int64_t c = vkcompute::dim_at(sizes); - int64_t h = vkcompute::dim_at(sizes); - int64_t w = vkcompute::dim_at(sizes); - - int64_t ni = vkcompute::dim_at(tensor_coor); - int64_t ci = vkcompute::dim_at(tensor_coor); - int64_t hi = vkcompute::dim_at(tensor_coor); - int64_t wi = vkcompute::dim_at(tensor_coor); - - return (ni * c * h * w + ci * h * w + hi * w + wi); -} - -// -// Context Management -// - -void submit_to_gpu(); - -vkcompute::vkapi::Allocation allocate_memory_for( - const vkcompute::api::vTensor& vten); - -VmaTotalStatistics get_vma_stats(); - -size_t get_vma_allocation_count(); - -// -// Graph Test Utilities -// - -void execute_graph_and_check_output( - vkcompute::ComputeGraph& graph, - std::vector input_vals, - std::vector expected_outputs); - -#define CREATE_RAND_WEIGHT_TENSOR(name, sizes, dtype) \ - std::vector data_##name = \ - create_random_float_buffer(utils::multiply_integers(sizes)); \ - ValueRef name = graph.add_tensorref(sizes, dtype, data_##name.data()); - -vkcompute::ComputeGraph build_mm_graph( - int B, - int M, - int K, - int N, - vkcompute::vkapi::ScalarType dtype, - vkcompute::utils::StorageType in_out_stype, - vkcompute::utils::GPUMemoryLayout memory_layout, - const std::vector& mat2_data, - const bool prepack_mat2 = false); - -// -// Debugging Utilities -// - -#define PRINT_DATA(vec) \ - do { \ - std::cout << #vec << ": "; \ - print_vector(vec); \ - } while (false); - -#define PRINT_DATA_RANGE(vec, start, range) \ - do { \ - std::cout << #vec << "[" << start << ", " << (start + range) << "]: "; \ - print_vector(vec, start, range); \ - } while (false); - -template -void print_vector( - const 
std::vector& data, - size_t start = 0, - size_t range = 20) { - size_t end = data.size(); - if (range >= 1) { - end = std::min(data.size(), start + range); - } - for (size_t i = start; i < end; ++i) { - std::cout << data.at(i) << ", "; - } - std::cout << std::endl; -} - -// -// Misc. Utilities -// - -bool check_close(float a, float b, float atol = 1e-4, float rtol = 1e-5); diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp deleted file mode 100644 index a193d02da88..00000000000 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ /dev/null @@ -1,3232 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include - -#include -#include -#include -#include - -#include - -#include - -#include - -#include - -#include - -#include - -#include - -#include - -using namespace vkcompute; -using namespace vkcompute::api; - -std::vector -transpose_matrix(std::vector& mat, const int H, const int W) { - std::vector out(W * H); - for (int out_y = 0; out_y < H; ++out_y) { - for (int out_x = 0; out_x < W; ++out_x) { - out[out_x * H + out_y] = mat[out_y * W + out_x]; - } - } - return out; -} - -std::vector compute_reference_matmul( - std::vector& mat1, - std::vector& mat2, - const int M, - const int K, - const int N) { - std::vector out(M * N); - for (int out_y = 0; out_y < M; ++out_y) { - for (int out_x = 0; out_x < N; ++out_x) { - out[out_y * N + out_x] = 0; - for (int k = 0; k < K; ++k) { - out[out_y * N + out_x] += mat1[out_y * K + k] * mat2[k * N + out_x]; - } - } - } - return out; -} - -std::vector> standard_sizes_to_test = { - // 2D - {7, 11}, - {13, 6}, - // 3D - {2, 9, 7}, - {9, 15, 19}, - {7, 11, 24}, - {13, 8, 11}, - {12, 11, 19}, - // 4D - {2, 2, 3, 5}, - {9, 13, 11, 17}, - {17, 14, 18, 20}, - {7, 13, 12, 21}, - {3, 8, 13, 17}, -}; - -// -// Compute API Tests -// - -class VulkanComputeAPITest : public ::testing::Test { - public: - void SetUp() override { - // Make sure we are starting with a clean slate - EXPECT_TRUE(get_vma_allocation_count() == 0); - } - - void TearDown() override { - context()->flush(); - - // Make sure we are ending with a clean slate - EXPECT_TRUE(get_vma_allocation_count() == 0); - } -}; - -TEST_F(VulkanComputeAPITest, print_adapter) { - std::cout << *(context()->adapter_ptr()) << std::endl; -} - -#if defined(VULKAN_DEBUG) && defined(VK_KHR_pipeline_executable_properties) - -TEST_F(VulkanComputeAPITest, print_shader_executable_properties) { - context()->print_shader_executable_properties( - VK_KERNEL(binary_add_nobroadcast__test_half), {0}); -} - -#endif // VULKAN_DEBUG && VK_KHR_pipeline_executable_properties - -std::vector get_reference_strides( - const std::vector& sizes, - const utils::GPUMemoryLayout layout, - const bool flip_unsqueezed = false) { - int64_t C = utils::val_at(-3, sizes); - int64_t H = utils::val_at(-2, sizes); - int64_t W = utils::val_at(-1, sizes); - - int64_t numel = utils::multiply_integers(sizes); - - switch (layout) { - case utils::kWidthPacked: - switch (sizes.size()) { - case 1: - if (flip_unsqueezed) - return {1, numel, numel, numel}; - return {1}; - case 2: - if (flip_unsqueezed) - return {1, W, numel, numel}; - return {W, 1}; - case 3: - if (flip_unsqueezed) - return {1, W, H * W, numel}; - return {H * W, W, 1}; - case 4: - if (flip_unsqueezed) - return {1, W, H * W, C * H * W}; - return {C * H * 
W, H * W, W, 1}; - default: - return {}; - } - break; - case utils::kHeightPacked: - switch (sizes.size()) { - case 1: - if (flip_unsqueezed) - return {1, numel, numel, numel}; - return {1}; - case 2: - if (flip_unsqueezed) - return {H, 1, numel, numel}; - return {1, H}; - return {1, H}; - case 3: - if (flip_unsqueezed) - return {H, 1, H * W, numel}; - return {W * H, 1, H}; - case 4: - if (flip_unsqueezed) - return {H, 1, W * H, C * W * H}; - return {C * W * H, W * H, 1, H}; - default: - return {}; - } - case utils::kChannelsPacked: - switch (sizes.size()) { - case 1: - if (flip_unsqueezed) - return {1, numel, numel, numel}; - return {1}; - case 2: - if (flip_unsqueezed) - return {1, W, numel, numel}; - return {W, 1}; - case 3: - if (flip_unsqueezed) - return {C, W * C, 1, numel}; - return {1, W * C, C}; - case 4: - if (flip_unsqueezed) - return {C, W * C, 1, H * W * C}; - return {H * W * C, 1, W * C, C}; - default: - return {}; - } - } - return {}; -} - -/* - * Applies the following transformations to a tensor's dim_order vector: - * 1. Reverse the order of elements so that the fastest moving dimensions are - * first. - * 2. Convert NCHW dimension indices to WHCN indices, so that 0 represents the - * width dimension, 1 represents the height dimension, and 2 represents the - * channels dimension. - * 3. Unsqueeze the dim_order vector to the next multiple of 4. - */ -std::vector create_whcn_dim_order( - const std::vector& dim_order) { - size_t ndim = dim_order.size(); - std::vector whcn_order(ndim); - - // Convert from NCHW to WHCN index, and flip the dim order so that the fastest - // moving dimension is first. - // example: { 1, 2, 0} -> { 2, 0, 1} - // {height, width, channels} -> {channels, width, height} - for (size_t whcn_i = 0, nchw_i = (ndim - 1); whcn_i < ndim; - ++whcn_i, --nchw_i) { - whcn_order.at(whcn_i) = ndim - 1 - dim_order.at(nchw_i); - } - - // Unsqueeze to the next multiple of 4 - size_t ndim_up4 = utils::align_up_4(ndim); - whcn_order.resize(ndim_up4); - - // Append unsqueezed dimensions - for (size_t i = ndim; i < ndim_up4; ++i) { - whcn_order.at(i) = i; - } - - return whcn_order; -} - -TEST_F(VulkanComputeAPITest, empty_init_shader_info_test) { - vkapi::ShaderInfo empty_shader_info; - EXPECT_FALSE(empty_shader_info); - EXPECT_TRUE(empty_shader_info.src_code.bin == nullptr); - EXPECT_TRUE(empty_shader_info.src_code.size == 0u); -} - -bool compare_vectors( - const std::vector& v32, - const std::vector& v64) { - if (v32.size() != v64.size()) { - return false; - } - for (size_t i = 0; i < v32.size(); ++i) { - if (static_cast(v32[i]) != v64[i]) { - return false; - } - } - return true; -} - -TEST_F(VulkanComputeAPITest, calculate_dim_order_test) { - // ndim, GPUMemoryLayout, expected dim order pairs - std::vector>> test_cases = { - {1, WHCN::kWidthDim, {0}}, - {1, WHCN::kHeightDim, {0}}, - {1, WHCN::kChannelsDim, {0}}, - {2, WHCN::kWidthDim, {0, 1}}, - {2, WHCN::kHeightDim, {1, 0}}, - {2, WHCN::kChannelsDim, {0, 1}}, - {3, WHCN::kWidthDim, {0, 1, 2}}, - {3, WHCN::kHeightDim, {0, 2, 1}}, - {3, WHCN::kChannelsDim, {1, 2, 0}}, - {4, WHCN::kWidthDim, {0, 1, 2, 3}}, - {4, WHCN::kHeightDim, {0, 1, 3, 2}}, - {4, WHCN::kChannelsDim, {0, 2, 3, 1}}, - }; - - for (const auto& test_case : test_cases) { - const size_t& ndim = std::get<0>(test_case); - const int32_t packed_dim = std::get<1>(test_case); - const auto& expected_dim_order = std::get<2>(test_case); - std::vector dim_order = calculate_dim_order(ndim, packed_dim); - - ASSERT_TRUE(dim_order == expected_dim_order); - } -} - 
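
The stride reference values above and the stride tests that follow both rely on the convention that a width-packed (contiguous, NCHW-ordered) tensor with sizes `{C, H, W}` has strides `{H*W, W, 1}`. As a minimal standalone sketch of that convention (illustrative only and not part of the deleted test file; `calc_width_packed_strides` is a hypothetical helper name used here for clarity):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative sketch: contiguous (width-packed) strides are built by walking
// the sizes from the innermost dimension outward, multiplying a running
// product. For sizes {C, H, W} this produces {H*W, W, 1}.
std::vector<int64_t> calc_width_packed_strides(const std::vector<int64_t>& sizes) {
  std::vector<int64_t> strides(sizes.size());
  int64_t running = 1;
  for (size_t i = sizes.size(); i > 0; --i) {
    strides[i - 1] = running;
    running *= sizes[i - 1];
  }
  return strides;
}

int main() {
  // One of the 3D entries from standard_sizes_to_test: {9, 15, 19}.
  for (int64_t s : calc_width_packed_strides({9, 15, 19})) {
    std::cout << s << " "; // prints: 285 19 1
  }
  std::cout << std::endl;
  return 0;
}
```

For the 3D test size `{9, 15, 19}` this yields `{285, 19, 1}`, which is the same value the `kWidthPacked` branch of `get_reference_strides` returns and that the stride test below asserts against.
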
-TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { - vTensor v_tensor_to_resize( - context(), - {25, 25, 25, 25}, - vkapi::kFloat, - utils::kBuffer, - utils::kWidthPacked, - /*allocate_memory = */ false); - - for (const auto& sizes : standard_sizes_to_test) { - if (sizes.size() < 3) { - continue; - } - for (const auto& layout : - {utils::kWidthPacked, utils::kHeightPacked, utils::kChannelsPacked}) { - { - const int32_t packed_dim = static_cast(layout); - std::vector dim_order = - calculate_dim_order(sizes.size(), packed_dim); - std::vector strides = calculate_strides(sizes, dim_order); - int64_t numel = utils::multiply_integers(sizes); - - std::vector ref_strides = get_reference_strides(sizes, layout); - ASSERT_TRUE(strides == ref_strides); - - std::vector unsqueezed_strides = - flip_and_unsqueeze(strides, kTensorStrides, numel); - - std::vector ref_unsqueezed_strides = - get_reference_strides(sizes, layout, true); - - ASSERT_TRUE(unsqueezed_strides == ref_unsqueezed_strides); - - std::vector whcn_dim_order = - flip_and_unsqueeze(dim_order, kTensorDimOrder, numel); - - std::vector ref_whcn_dim_order = - create_whcn_dim_order(dim_order); - - ASSERT_TRUE(whcn_dim_order == ref_whcn_dim_order); - - // Create new vTensor and check that the strides are correct - vTensor new_v_tensor( - context(), - sizes, - vkapi::kFloat, - utils::kBuffer, - layout, - /*allocate_memory = */ false); - - ASSERT_TRUE(new_v_tensor.strides() == ref_strides); - - // Resize vtensor and check that updated metadata is correct - v_tensor_to_resize.virtual_reconfigure(sizes, dim_order); - ASSERT_TRUE(v_tensor_to_resize.strides() == ref_strides); - } - } - } -} - -TEST_F(VulkanComputeAPITest, virtual_transpose_test) { - std::vector sizes = {7, 9, 11, 13}; - // (dim0, dim1), new_sizes, new_dim_order, new_axis_map, new_packed_dim_idx - std::vector>> test_cases = { - {{2, 3}, {7, 9, 13, 11}, {0, 1, 3, 2}, {1, 0, 2, 2}, {1}}, - {{2, 1}, {7, 11, 9, 13}, {0, 2, 1, 3}, {0, 2, 1, 1}, {0}}, - {{1, 3}, {7, 13, 11, 9}, {0, 3, 2, 1}, {2, 1, 0, 0}, {2}}, - }; - - for (const auto& test_case : test_cases) { - const int dim0 = test_case.at(0).at(0); - const int dim1 = test_case.at(0).at(1); - - const auto& expected_sizes = test_case.at(1); - const auto& expected_dim_order = test_case.at(2); - const auto& expected_axis_map = test_case.at(3); - const int expected_packed_dim = test_case.at(4).at(0); - - { - vTensor a_buffer = vTensor( - context(), sizes, vkapi::kFloat, utils::kBuffer, utils::kWidthPacked); - - a_buffer.virtual_transpose(dim0, dim1); - EXPECT_TRUE(a_buffer.sizes() == expected_sizes); - EXPECT_TRUE(a_buffer.dim_order() == expected_dim_order); - } - - { - vTensor a_texture = vTensor( - context(), - sizes, - vkapi::kFloat, - utils::kTexture3D, - utils::kWidthPacked); - a_texture.virtual_transpose(dim0, dim1); - EXPECT_TRUE(a_texture.sizes() == expected_sizes); - EXPECT_TRUE(a_texture.axis_map() == expected_axis_map); - EXPECT_TRUE(a_texture.packed_dim() == expected_packed_dim); - } - } -} - -TEST_F(VulkanComputeAPITest, view_of_view_test) { - constexpr int N = 3; - constexpr int C = 5; - constexpr int H = 17; - constexpr int W = 19; - - std::vector sizes = {N, C, H, W}; - - vTensor t1 = vTensor( - context(), sizes, vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked); - - vTensor t2 = vTensor(t1); - EXPECT_TRUE(t2.sizes() == sizes); - vTensor t3 = vTensor(t2); - EXPECT_TRUE(t2.sizes() == sizes); - - t2.virtual_transpose(1, 2); - std::vector expected_t2_sizes = {N, H, C, W}; - EXPECT_TRUE(t2.sizes() == 
expected_t2_sizes); - - // Because t3 was created before t2's metadata was updated, we need to first - // update t3's metadata to match t2's metadata. Then the transpose will yield - // the correct metadata. - t3.virtual_clone(t2); - t3.virtual_transpose(2, 3); - std::vector expected_t3_sizes = {N, H, W, C}; - EXPECT_TRUE(t3.sizes() == expected_t3_sizes); -} - -utils::ivec3 make_temp_ivec3(int x, int y, int z) { - return utils::ivec3{x, y, z}; -} - -TEST_F(VulkanComputeAPITest, vec_test) { - { - utils::vec3 v3({1, 2, 3}); - ASSERT_TRUE(v3[0] == 1); - ASSERT_TRUE(v3[1] == 2); - ASSERT_TRUE(v3[2] == 3); - v3 = {4, 5, 6}; - ASSERT_TRUE(v3[0] == 4); - ASSERT_TRUE(v3[1] == 5); - ASSERT_TRUE(v3[2] == 6); - } - - { - utils::uvec4 uv4({4, 3, 2, 1}); - ASSERT_TRUE(uv4[0] == 4); - ASSERT_TRUE(uv4[1] == 3); - ASSERT_TRUE(uv4[2] == 2); - ASSERT_TRUE(uv4[3] == 1); - uv4 = {11, 13, 12, 88}; - ASSERT_TRUE(uv4[0] == 11); - ASSERT_TRUE(uv4[1] == 13); - ASSERT_TRUE(uv4[2] == 12); - ASSERT_TRUE(uv4[3] == 88); - } - - // Test copy from same type - { - utils::ivec3 v{5, 6, 8}; - utils::ivec3 v2 = v; - - ASSERT_TRUE(v2[0] == 5); - ASSERT_TRUE(v2[1] == 6); - ASSERT_TRUE(v2[2] == 8); - } - - // Test copy from different type - { - utils::uvec3 v{5, 6, 8}; - utils::ivec3 v2 = v; - - ASSERT_TRUE(v2[0] == 5); - ASSERT_TRUE(v2[1] == 6); - ASSERT_TRUE(v2[2] == 8); - } - - // Test construction from temporary vec - { - utils::uvec3 v{make_temp_ivec3(4, 5, 10)}; - ASSERT_TRUE(v[0] == 4); - ASSERT_TRUE(v[1] == 5); - ASSERT_TRUE(v[2] == 10); - } - - // Test initalization from temporary vec - { - utils::uvec3 v = make_temp_ivec3(4, 5, 10); - ASSERT_TRUE(v[0] == 4); - ASSERT_TRUE(v[1] == 5); - ASSERT_TRUE(v[2] == 10); - } -} - -TEST_F(VulkanComputeAPITest, retrieve_custom_shader_test) { - // Try to get shader from custom shader library - const vkapi::ShaderInfo& kernel = VK_KERNEL(test_shader); - - ASSERT_TRUE(kernel.kernel_name == "test_shader"); -} - -TEST_F(VulkanComputeAPITest, spec_var_classes_test) { - // Check equality operator - ASSERT_TRUE(SV(1.5f) == SV(1.5f)); - ASSERT_FALSE(SV(15.0f) == SV(15)); - ASSERT_FALSE(SV(1u) == SV(true)); - - size_t sv_size = sizeof(vkapi::SpecVar); - - vkapi::SpecVarList spec_vars = {}; - ASSERT_TRUE(spec_vars.size() == 0); - spec_vars = {SV(1.1f), SV(32), SV(45)}; - ASSERT_TRUE(spec_vars.size() == 3); - vkapi::SpecVarList spec_vars_other = {SV(2.6f), SV(true), SV(78u), SV(5.5f)}; - spec_vars.append(spec_vars_other); - ASSERT_TRUE(spec_vars.size() == 7); - - // Check validity of the data - const vkapi::SpecVar* data = spec_vars.data(); - ASSERT_TRUE(*(reinterpret_cast(data + 3)) == 2.6f); - ASSERT_TRUE(*(reinterpret_cast(data + 1)) == 32); - ASSERT_TRUE(*(reinterpret_cast(data + 5)) == 78u); - - // Check validity of the map entries - std::vector entries = - spec_vars.generate_map_entries(); - - for (size_t i = 0; i < spec_vars.size(); ++i) { - ASSERT_TRUE(entries[i].constantID == i); - ASSERT_TRUE(entries[i].offset == sv_size * i); - if (i != 4) { - ASSERT_TRUE(entries[i].size == 4); - } else { - ASSERT_TRUE(entries[i].size == 1); - } - } - - // Check copy - vkapi::SpecVarList spec_vars_copy(spec_vars); - ASSERT_TRUE(spec_vars_copy.size() == 7); - - // Check validity of the copied data - const vkapi::SpecVar* copy_data = spec_vars_copy.data(); - ASSERT_TRUE(*(reinterpret_cast(copy_data + 4)) == true); - ASSERT_TRUE(*(reinterpret_cast(copy_data + 2)) == 45); - ASSERT_TRUE(*(reinterpret_cast(copy_data + 6)) == 5.5f); -} - -TEST_F(VulkanComputeAPITest, spec_var_shader_test) { - size_t len = 
16; - StagingBuffer buffer(context(), vkapi::kFloat, len); - - float scale = 3.0f; - float offset = 1.5f; - - { - ParamsBuffer params(context(), int32_t(len)); - uint32_t len_div4 = utils::div_up(uint32_t(len), uint32_t(4)); - vkapi::PipelineBarrier pipeline_barrier{}; - context()->submit_compute_job( - VK_KERNEL(fill_buffer), - pipeline_barrier, - {64, 1, 1}, - {len_div4, 1, 1}, - {SV(scale), SV(offset)}, - VK_NULL_HANDLE, - 0, - buffer.buffer(), - params.buffer()); - } - - submit_to_gpu(); - - std::vector data(len); - buffer.copy_to(data.data(), buffer.nbytes()); - - for (size_t i = 0; i < len; ++i) { - CHECK_VALUE(data, i, scale * i + offset); - } -} - -TEST_F(VulkanComputeAPITest, update_params_between_submit) { - context()->set_cmd(/*reusable = */ true); - std::vector sizes = {4, 4, 2}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - - std::string kernel_name("fill_texture__test"); - add_dtype_suffix(kernel_name, a.dtype()); - - struct Params final { - utils::ivec3 size; - int32_t fill; - utils::vec4 values; - }; - - Params block{ - {2, 4, 1}, - 0, - {5.0, 5.0, 5.0, 5.0}, - }; - - ParamsBuffer params(context(), block); - - { - vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = {}; - context()->submit_compute_job( - VK_KERNEL_FROM_STR(kernel_name), - pipeline_barrier, - {4, 4, 4}, - {4, 4, 4}, - specialization_constants, - VK_NULL_HANDLE, - 0, - a.image( - pipeline_barrier, - vkapi::PipelineStage::COMPUTE, - vkapi::MemoryAccessType::WRITE), - params.buffer()); - } - - StagingBuffer staging_buffer( - context(), vkapi::kFloat, a.staging_buffer_numel()); - record_image_to_nchw_op(context(), a, staging_buffer.buffer()); - - submit_to_gpu(); - check_staging_buffer(staging_buffer, 5.0f); - - Params new_block{ - {2, 4, 1}, - 0, - {4.0, 4.0, 4.0, 4.0}, - }; - - params.update(new_block); - - submit_to_gpu(); - check_staging_buffer(staging_buffer, 4.0f); -} - -template -void test_storage_buffer_type(const size_t len) { - StagingBuffer buffer(context(), dtype, len); - - std::string kernel_name("idx_fill_buffer"); - switch (dtype) { - case vkapi::kFloat: - kernel_name += "_float"; - break; - case vkapi::kHalf: - kernel_name += "_half"; - break; - case vkapi::kQInt8: - kernel_name += "_int8"; - break; - case vkapi::kQUInt8: - kernel_name += "_uint8"; - break; - default: - throw std::runtime_error("Unsupported dtype"); - break; - } - - ParamsBuffer params(context(), int32_t(len)); - - { - uint32_t len_div4 = utils::div_up(uint32_t(len), uint32_t(4)); - vkapi::PipelineBarrier pipeline_barrier{}; - vkapi::SpecVarList specialization_constants = {}; - context()->submit_compute_job( - VK_KERNEL_FROM_STR(kernel_name), - pipeline_barrier, - {64, 1, 1}, - {len_div4, 1, 1}, - specialization_constants, - VK_NULL_HANDLE, - 0, - buffer.buffer(), - params.buffer()); - } - - submit_to_gpu(); - - std::vector data(len); - buffer.copy_to(data.data(), buffer.nbytes()); - - for (size_t i = 0; i < len; ++i) { - CHECK_VALUE(data, i, T(i)); - } -} - -TEST_F(VulkanComputeAPITest, test_buffer_float) { - test_storage_buffer_type(16); -} - -TEST_F(VulkanComputeAPITest, test_buffer_float16) { - if (!context()->adapter_ptr()->has_full_float16_buffers_support()) { - GTEST_SKIP(); - } - test_storage_buffer_type(16); -} - -TEST_F(VulkanComputeAPITest, test_buffer_int8) { - if (!context()->adapter_ptr()->has_full_int8_buffers_support()) { - GTEST_SKIP(); - } - test_storage_buffer_type(16); -} - -TEST_F(VulkanComputeAPITest, test_zero_size_tensor) { - // Simple test 
that performs a + b -> c - - std::vector sizes = {0, 5, 7}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - - // Fill input tensors - fill_vtensor(a, 2.5f); - fill_vtensor(b, 1.5f); - - // a + b -> c - record_binary_op(context(), "add", a, b, c); - - // Extract output tensor - std::vector data_out = extract_vtensor(c); - - // Assert all tensors are empty - ASSERT_TRUE(a.numel() == 0); - ASSERT_TRUE(b.numel() == 0); - ASSERT_TRUE(c.numel() == 0); - ASSERT_TRUE(a.nbytes() == 0); - ASSERT_TRUE(b.nbytes() == 0); - ASSERT_TRUE(c.nbytes() == 0); - - // Check output - for (size_t i = 0; i < data_out.size(); ++i) { - CHECK_VALUE(data_out, i, 4.0f); - } -} - -template -void run_buffer_tensor_sanity_check(vTensor& tensor) { - fill_vtensor(tensor, 0.0f, true); - - record_scalar_add_buffer(context(), tensor, 2.0f); - std::vector data_out = extract_vtensor(tensor); - - // Check output - for (size_t i = 0; i < tensor.numel(); ++i) { - CHECK_VALUE(data_out, i, i + 2.0f); - } -} - -TEST_F(VulkanComputeAPITest, buffer_tensor_sanity_check) { - for (const auto& sizes : standard_sizes_to_test) { - for (const auto& dtype : {vkapi::kFloat, vkapi::kHalf, vkapi::kChar}) { - if (dtype == vkapi::kHalf && - !context()->adapter_ptr()->has_full_float16_buffers_support()) { - continue; - } - if (dtype == vkapi::kHalf && utils::multiply_integers(sizes) >= 2048) { - continue; - } - if (dtype == vkapi::kChar && - !context()->adapter_ptr()->has_full_int8_buffers_support()) { - continue; - } - if (dtype == vkapi::kChar && utils::multiply_integers(sizes) >= 128) { - continue; - } - for (const auto& layout : - {utils::kWidthPacked, - utils::kHeightPacked, - utils::kChannelsPacked}) { - vTensor a = vTensor(context(), sizes, dtype, utils::kBuffer, layout); - switch (dtype) { - case vkapi::kFloat: - run_buffer_tensor_sanity_check(a); - break; - case vkapi::kHalf: - run_buffer_tensor_sanity_check(a); - break; - case vkapi::kChar: - run_buffer_tensor_sanity_check(a); - break; - default: - VK_THROW("Unsupported dtype"); - } - } - } - } -} - -TEST_F(VulkanComputeAPITest, texture_add_sanity_check) { - // Simple test that performs a + b -> c - - std::vector sizes = {4, 4, 1}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - - // Fill input tensors - fill_vtensor(a, 2.5f); - fill_vtensor(b, 1.5f); - - // a + b -> c - record_binary_op(context(), "add", a, b, c); - - // Extract output tensor - std::vector data_out = extract_vtensor(c); - - // Check output - for (size_t i = 0; i < data_out.size(); ++i) { - CHECK_VALUE(data_out, i, 4.0f); - } -} - -TEST_F(VulkanComputeAPITest, tensor_alias_test) { - for (utils::StorageType storage_type : {utils::kTexture3D, utils::kBuffer}) { - std::vector sizes = {9, 9}; - - const size_t alloc_count_before = get_vma_allocation_count(); - - vTensor original = vTensor(context(), sizes, vkapi::kFloat, storage_type); - - vTensor copy = vTensor(original); - - // Two tensors but only one additional allocation. 
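  // (The vTensor copy constructor creates a non-owning view over `original`'s
  // storage rather than allocating new memory, which is why the VMA allocation
  // count below only increases by one, i.e. for `original` itself.)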
- EXPECT_TRUE(get_vma_allocation_count() == alloc_count_before + 1); - EXPECT_TRUE(copy.is_view_of(original)); - - // Fill original tensor with some data - fill_vtensor(original, 2.5f, true); - - std::vector data_out(copy.staging_buffer_numel()); - // Extract the copy tensor; should contain the data of the original tensor - extract_vtensor(copy, data_out); - - for (size_t i = 0; i < original.numel(); ++i) { - CHECK_VALUE(data_out, i, 2.5f + i); - } - } -} - -TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) { - constexpr int M = 11; - constexpr int K = 23; - constexpr int N = 17; - std::vector mat1_sizes = {M, K}; - std::vector mat2_sizes = {N, K}; - std::vector out_sizes = {M, N}; - - for (const auto storage_type : {utils::kTexture3D, utils::kBuffer}) { - vTensor mat1 = vTensor( - context(), - mat1_sizes, - vkapi::kFloat, - storage_type, - utils::kWidthPacked); - vTensor mat2 = vTensor( - context(), - mat2_sizes, - vkapi::kFloat, - storage_type, - utils::kWidthPacked); - vTensor out = vTensor( - context(), out_sizes, vkapi::kFloat, storage_type, utils::kWidthPacked); - - // Generate data - std::vector mat1_data = - create_random_float_buffer(mat1.staging_buffer_numel()); - std::vector mat2_data = - create_random_float_buffer(mat2.staging_buffer_numel()); - - // Create direct view and modify sizes and strides later - vTensor mat2_t = vTensor(mat2); - // Update sizes and strides of mat2_t to be that of a transposed tensor - mat2_t.virtual_transpose(0, 1); - - EXPECT_TRUE(mat2_t.packed_dim() == WHCN::kHeightDim); - - std::vector mat2_t_data = transpose_matrix(mat2_data, N, K); - std::vector ref_out = - compute_reference_matmul(mat1_data, mat2_t_data, M, K, N); - - // Fill original tensor with some data - fill_vtensor(mat1, mat1_data); - fill_vtensor(mat2, mat2_data); - - if (storage_type == utils::kTexture3D) { - record_matmul_texture3d(context(), out, mat1, mat2_t); - } else { - record_reference_matmul(context(), out, mat1, mat2_t); - } - - std::vector data_out(out.staging_buffer_numel()); - // Extract the copy tensor; should contain the data of the original tensor - extract_vtensor(out, data_out); - - for (size_t i = 0; i < ref_out.size(); ++i) { - EXPECT_TRUE(check_close(data_out[i], ref_out[i])); - } - } -} - -TEST_F(VulkanComputeAPITest, texture_deferred_allocation_test) { - // This test is the same as texture_add_sanity_check, except that the tensor - // memory is allocated in a deferred fashion - - std::vector sizes = {4, 4, 1}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - - // No allocations made so far - EXPECT_TRUE(get_vma_allocation_count() == 0); - - std::vector data_a(a.staging_buffer_numel()); - std::fill(data_a.begin(), data_a.end(), 2.5f); - std::vector data_b(b.staging_buffer_numel()); - std::fill(data_b.begin(), data_b.end(), 1.5f); - - // Allocate memory at the last possible opportunity - vkapi::Allocation a_mem = allocate_memory_for(a); - a.image().bind_allocation(a_mem); - vkapi::Allocation b_mem = allocate_memory_for(b); - b.image().bind_allocation(b_mem); - vkapi::Allocation c_mem = allocate_memory_for(c); - c.image().bind_allocation(c_mem); - - // One allocation for each tensor - EXPECT_TRUE(get_vma_allocation_count() == 3); - - fill_vtensor(a, data_a); - fill_vtensor(b, data_b); - - record_binary_op(context(), "add", a, b, c); - - std::vector data_c(c.staging_buffer_numel()); - 
extract_vtensor(c, data_c); - - for (size_t i = 0; i < data_c.size(); ++i) { - CHECK_VALUE(data_c, i, 4.0f); - } -} - -TEST_F(VulkanComputeAPITest, texture_resource_aliasing_test) { - // This test performs the following operations: - // 1. a + b -> c - // 2. c + d -> e - // and share memory between tensors whenever possible. - - std::vector sizes = {4, 4, 1}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - vTensor d = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - vTensor e = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - - // No allocations made so far - EXPECT_TRUE(get_vma_allocation_count() == 0); - - // a and d can share the same memory allocation - vkapi::Allocation a_d_mem = allocate_memory_for(a); - a.image().bind_allocation(a_d_mem); - d.image().bind_allocation(a_d_mem); - // b and e can share the same memory allocation - vkapi::Allocation b_e_mem = allocate_memory_for(b); - b.image().bind_allocation(b_e_mem); - e.image().bind_allocation(b_e_mem); - // c must have its own memory allocation - vkapi::Allocation c_mem = allocate_memory_for(c); - c.image().bind_allocation(c_mem); - - // 3 allocations should be made - EXPECT_TRUE(get_vma_allocation_count() == 3); - - // Specify input data - std::vector data_a(a.staging_buffer_numel()); - std::fill(data_a.begin(), data_a.end(), 2.5f); - std::vector data_b(b.staging_buffer_numel()); - std::fill(data_b.begin(), data_b.end(), 1.5f); - std::vector data_d(b.staging_buffer_numel()); - std::fill(data_d.begin(), data_d.end(), 1.0f); - - // First, fill a and b with data - fill_vtensor(a, data_a); - fill_vtensor(b, data_b); - - // a + b -> c - record_binary_op(context(), "add", a, b, c); - - // Now d can be filled with data - fill_vtensor(d, data_d); - - // c + d -> e - record_binary_op(context(), "add", c, d, e); - - // Extract data from e - std::vector data_e(e.staging_buffer_numel()); - extract_vtensor(e, data_e); - - // Sanity check that the values are correct - for (size_t i = 0; i < data_e.size(); ++i) { - CHECK_VALUE(data_e, i, 5.0f); - } -} - -TEST_F(VulkanComputeAPITest, resource_bind_twice_fails) { - // Check that binding a resource that already has memory associated with it - // fails - - std::vector sizes = {4, 4, 1}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - - // Try to double bind a resource, which should fail - vkapi::Allocation a_mem = allocate_memory_for(a); - EXPECT_THROW(a.image().bind_allocation(a_mem), vkapi::Error); -} - -TEST_F(VulkanComputeAPITest, resource_destructor_non_owning_memory) { - // Check that the destructor of a vTensor that does not own its memory - // does not free the memory - - vkapi::Allocation memory; - - // Default Allocation constructor should not allocate memory - EXPECT_TRUE(get_vma_allocation_count() == 0); - - std::vector sizes = {4, 4, 1}; - { - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ false); - - memory = allocate_memory_for(a); - EXPECT_TRUE(get_vma_allocation_count() == 1); - a.image().bind_allocation(memory); - } - - // Check that the memory is still allocated - EXPECT_TRUE(get_vma_allocation_count() == 1); -} - -TEST_F(VulkanComputeAPITest, use_non_bound_textures_fails) { - // Try to encode a command buffer with a vTensor that does not have - // memory - - std::vector sizes = {4, 4, 1}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, 
/*allocate_memory = */ false); - - // No allocations yet - EXPECT_TRUE(get_vma_allocation_count() == 0); - - std::vector data_a(a.staging_buffer_numel()); - std::fill(data_a.begin(), data_a.end(), 2.5f); - - // Encoding a command buffer with a vTensor without memory should throw - EXPECT_THROW(fill_vtensor(a, data_a), vkapi::Error); -} - -TEST_F(VulkanComputeAPITest, texture_virtual_resize) { - context()->set_cmd(/*reusable = */ true); - std::vector sizes = {8, 12, 12}; - vTensor a = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - vTensor b = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - vTensor c = CREATE_FLOAT_TEXTURE(sizes, /*allocate_memory = */ true); - - DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(a) - DEFINE_STAGING_BUFFER_AND_RECORD_TO_GPU_FOR(b) - - fill_staging(staging_buffer_a, 11.5f); - fill_staging(staging_buffer_b, 12.5f); - - record_binary_op(context(), "add", a, b, c); - - DEFINE_STAGING_BUFFER_AND_RECORD_FROM_GPU_FOR(c) - - submit_to_gpu(); - check_staging_buffer(staging_buffer_c, 24.0f); - - std::vector> new_sizes_list = { - {4, 2, 4}, {4, 3, 6}, {8, 12, 12}, {8, 1, 1}, {8, 11, 10}}; - - for (auto& new_sizes : new_sizes_list) { - a.virtual_resize(new_sizes); - b.virtual_resize(new_sizes); - c.virtual_resize(new_sizes); - - fill_staging(staging_buffer_a, float(new_sizes[1] + 1.5f), a.numel()); - fill_staging(staging_buffer_b, float(new_sizes[2] + 55.0f), b.numel()); - - submit_to_gpu(); - check_staging_buffer( - staging_buffer_c, - float(new_sizes[1] + new_sizes[2] + 56.5f), - c.numel()); - } -} - -// -// Compute Graph Tests -// - -#define EXTRACT_TENSOR(name) \ - std::vector data_##name(graph.staging_buffer_numel_of(name.value)); \ - graph.copy_from_staging(name.staging, data_##name.data(), data_##name.size()); - -// The purpose of this test is simply to track the size of various classes over -// time, in the interest of making sure that they doesn't grow too large. -TEST_F(VulkanComputeAPITest, print_object_sizes) { -#define PRINT_SIZE(name) \ - std::cout << #name << " size: " << sizeof(name) << " B" << std::endl - PRINT_SIZE(vTensor); - PRINT_SIZE(Value); - PRINT_SIZE(StagingBuffer); - PRINT_SIZE(ComputeGraph); - PRINT_SIZE(DispatchNode); -#undef PRINT_SIZE - - // The actual sizes of each object is dependent on the platform. However, we - // can alert ourselves to any significant changes in the sizes of these - // objects by checking the `sizeof()` the class against some loose thresholds. 
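  // (The thresholds below are intentionally loose: the exact sizes depend on
  // the compiler, standard library, and pointer width, so these checks are
  // only meant to catch large accidental growth rather than small
  // platform-to-platform fluctuations.)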
- - // Current known size on 64 bit system: 1040 B - EXPECT_TRUE(sizeof(vTensor) < 1200); - // Current known size on 64 bit system: 80 B - EXPECT_TRUE(sizeof(Value) < 100); - // Current known size on 64 bit system: 120 B - EXPECT_TRUE(sizeof(StagingBuffer) < 500); - // Current known size on 64 bit system: 608 B - EXPECT_TRUE(sizeof(ComputeGraph) < 700); - // Current known size on 64 bit system: 248 B - EXPECT_TRUE(sizeof(DispatchNode) < 500); -} - -TEST_F(VulkanComputeAPITest, test_tensor_creation_from_vulkan_image) { - const auto w = 16; - const auto h = 12; - const auto d = 1; - const utils::uvec3 image_extents = {w, h, d}; - - vkapi::Adapter* adapter_ptr = context()->adapter_ptr(); - - vkapi::ImageSampler::Properties sampler_props{ - VK_FILTER_NEAREST, - VK_SAMPLER_MIPMAP_MODE_NEAREST, - VK_SAMPLER_ADDRESS_MODE_REPEAT, - VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK, - }; - - VkFormat image_format = VK_FORMAT_R32G32B32A32_SFLOAT; - VkImageType image_type = VK_IMAGE_TYPE_3D; - VkImageViewType image_view_type = VK_IMAGE_VIEW_TYPE_3D; - - VkSampler sampler = adapter_ptr->sampler_cache().retrieve(sampler_props); - - auto image = adapter_ptr->vma().create_image( - context()->device(), - vkapi::create_extent3d(image_extents), - image_format, - image_type, - context()->preferred_image_tiling(), - image_view_type, - sampler_props, - sampler, - /*allow_transfer = */ true, - /*allocate_memory = */ true); - - auto tensor = vTensor(context(), image); - - const auto exp_sizes = std::vector{w, h, d * 4}; - EXPECT_TRUE(tensor.sizes() == exp_sizes); - EXPECT_TRUE(tensor.packed_dim() == 2); - - const auto exp_numel = w * h * d * 4; - EXPECT_TRUE(tensor.numel() == exp_numel); -} - -TEST(VulkanComputeGraphTest, test_values_scalars) { - GraphConfig config; - ComputeGraph graph(config); - - ValueRef idx; - - idx = graph.add_scalar(4); - EXPECT_TRUE(graph.get_int(idx) == 4); - - idx = graph.add_scalar(5.5f); - EXPECT_TRUE(graph.get_double(idx) == 5.5f); -} - -TEST(VulkanComputeGraphTest, test_values_scalar_list_inplace_constructed) { - GraphConfig config; - ComputeGraph graph(config); - - ValueRef idx = graph.add_scalar_list({1, 2, 3, 4}); - const auto arr = graph.get_int_list(idx); - EXPECT_TRUE(arr->size() == 4); - for (int i = 0; i < 4; i++) { - EXPECT_TRUE(arr->at(i) == i + 1); - } -} - -TEST(VulkanComputeGraphTest, test_values_scalar_list_outside_constructed) { - GraphConfig config; - ComputeGraph graph(config); - - ValueRef idx; - { - std::vector data = {5.0, 4.0, 3.0, 2.0, 1.0}; - idx = graph.add_scalar_list(std::move(data)); - } - const auto& arr = graph.get_double_list(idx); - EXPECT_TRUE(arr->size() == 5); - for (int i = 0; i < 5; i++) { - EXPECT_TRUE(arr->at(i) == (5 - i)); - } -} - -TEST(VulkanComputeGraphTest, test_values_string) { - GraphConfig config; - ComputeGraph graph(config); - - ValueRef idx; - { - std::string data = "hello, world"; - idx = graph.add_string(std::move(data)); - } - std::string stored = graph.get_string(idx); - EXPECT_TRUE(stored == "hello, world"); -} - -TEST(VulkanComputeGraphTest, empty_init_graphnode_test) { - ExecuteNode node(nullptr, {}); - - GraphConfig config; - ComputeGraph graph(config); - - // Encode an empty ExecuteNode and check that command buffer encoding does not - // crash. 
- graph.execute_nodes().emplace_back(new ExecuteNode(nullptr, {})); -} - -TEST(VulkanComputeGraphTest, test_zero_dim_tensor) { - GraphConfig config; - ComputeGraph graph(config); - - std::vector size_big = {7, 3, 5}; - std::vector size_small = {}; - - // Build graph - - IOValueRef a = graph.add_input_tensor(size_big, vkapi::kFloat); - IOValueRef b = graph.add_input_tensor(size_small, vkapi::kFloat); - - IOValueRef out = {}; - - out.value = graph.add_tensor(size_big, vkapi::kFloat); - - auto addFn = VK_GET_OP_FN("aten.add.Tensor"); - addFn(graph, {a.value, b.value, kDummyValueRef, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - // Run graph - - for (float i = 5.0f; i < 30.0f; i += 10.0f) { - float val_a = i + 2.0f; - float val_b = i + 1.5f; - float val_c = val_a + val_b; - - fill_vtensor(graph, a, val_a); - fill_vtensor(graph, b, val_b); - - graph.execute(); - - EXTRACT_TENSOR(out); - - // Sanity check that the values are correct - for (size_t i = 0; i < graph.numel_of(out.value); ++i) { - CHECK_VALUE(data_out, i, val_c); - } - } -} - -TEST(VulkanComputeGraphTest, test_simple_graph_with_buffer) { - GraphConfig config; - ComputeGraph graph(config); - - std::vector sizes = {7, 13, 19}; - - // Build graph - - IOValueRef a = graph.add_input_tensor(sizes, vkapi::kFloat, utils::kBuffer); - - IOValueRef out = {}; - - out.value = graph.add_tensor(sizes, vkapi::kFloat, utils::kBuffer); - - auto addFn = VK_GET_OP_FN("aten.abs.default"); - addFn(graph, {a.value, out.value, kDummyValueRef, kDummyValueRef}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - // Run graph - - for (float i = 5.0f; i < 30.0f; i += 10.0f) { - float val = -i + 2.0f; - float expected_val = std::abs(val); - - fill_vtensor(graph, a, val); - - graph.execute(); - - EXTRACT_TENSOR(out); - - // Sanity check that the values are correct - for (size_t i = 0; i < graph.numel_of(out.value); ++i) { - CHECK_VALUE(data_out, i, expected_val); - } - } -} - -TEST(VulkanComputeGraphTest, test_graph_view_of_view) { - GraphConfig config; - config.set_storage_type_override(utils::kTexture3D); - ComputeGraph graph(config); - - constexpr int N = 3; - constexpr int C = 5; - constexpr int H = 17; - constexpr int W = 19; - - std::vector orig_sizes = {N, C, H, W}; - - // Test a common view of view usage pattern. In delegate execution, the values - // of the graph are created first; then operators are added. As a result, - // creating views of views is a bit tricky because metadata updates to a view - // does not update the metadata of the view's views. Nonetheless, view - // operators have an implicit assumption that the metadata of the output is - // equivalent to the metadata of the input. Therefore, view operators must - // account for unseen updates to the input view by first calling - // `virtual_clone()` to make the output equivalent to the input before. - // modifying metadata. 
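  // A rough sketch of the pattern described above, using the vTensor member
  // functions exercised earlier in this file (the tensor names here are
  // placeholders):
  //
  //   out_tensor.virtual_clone(in_tensor);       // sync metadata with the input
  //   out_tensor.virtual_transpose(dim0, dim1);  // then apply the view update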
- - ValueRef t1 = graph.add_tensor(orig_sizes, vkapi::kFloat); - ValueRef t2 = graph.add_tensor_view(t1); - ValueRef t3 = graph.add_tensor_view(t2); - - ValueRef channels = graph.add_scalar(1); - ValueRef height = graph.add_scalar(2); - ValueRef width = graph.add_scalar(3); - - auto opFn = VK_GET_OP_FN("aten.transpose.int"); - - opFn(graph, {t1, channels, height, t2}); - std::vector t2_sizes = graph.sizes_of(t2); - std::vector expected_t2_sizes = {N, H, C, W}; - EXPECT_TRUE(t2_sizes == expected_t2_sizes); - - opFn(graph, {t2, height, width, t3}); - std::vector t3_sizes = graph.sizes_of(t3); - std::vector expected_t3_sizes = {N, H, W, C}; - EXPECT_TRUE(t3_sizes == expected_t3_sizes); -} - -TEST(VulkanComputeGraphTest, test_simple_graph) { - GraphConfig config; - ComputeGraph graph(config); - - std::vector size_big = {1, 8, 8}; - std::vector size_small = {1, 1, 8}; - - // Build graph - - IOValueRef a = graph.add_input_tensor(size_big, vkapi::kFloat); - IOValueRef b = graph.add_input_tensor(size_small, vkapi::kFloat); - - IOValueRef out = {}; - - out.value = graph.add_tensor(size_big, vkapi::kFloat); - - auto addFn = VK_GET_OP_FN("aten.add.Tensor"); - addFn(graph, {a.value, b.value, kDummyValueRef, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - // Run graph - - for (float i = 5.0f; i < 30.0f; i += 10.0f) { - float val_a = i + 2.0f; - float val_b = i + 1.5f; - float val_c = val_a + val_b; - - fill_vtensor(graph, a, val_a); - fill_vtensor(graph, b, val_b); - - graph.execute(); - - EXTRACT_TENSOR(out); - - // Sanity check that the values are correct - for (size_t i = 0; i < graph.numel_of(out.value); ++i) { - CHECK_VALUE(data_out, i, val_c); - } - } -} - -TEST(VulkanComputeGraphTest, test_simple_graph_with_symint) { - GraphConfig config; - config.set_storage_type_override(utils::kTexture3D); - ComputeGraph graph(config); - - std::vector sizes = {8, 64, 124}; - - // Build graph - - ValueRef scalar = graph.add_symint(1); - IOValueRef a = graph.add_input_tensor(sizes, vkapi::kFloat); - - IOValueRef out = {}; - out.value = a.value; - - graph.execute_nodes().emplace_back(new DispatchNode( - graph, - VK_KERNEL_FROM_STR("scalar_add_texture"), - graph.create_global_wg_size(a.value), - graph.create_local_wg_size(a.value), - // Inputs and Outputs - {{out.value, vkapi::MemoryAccessType::WRITE}}, - // Shader params buffers - {graph.logical_limits_ubo(a.value), - graph.get_or_create_int_param_buffer(scalar)}, - // Push constants - {}, - // Specialization Constants - {}, - // Resizing Logic - {}, - nullptr)); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - // Run graph - - for (float i = 5.0f; i < 30.0f; i += 10.0f) { - int scalar_val = i - 3.0f; - graph.set_symint(scalar, scalar_val); - - int32_t scalar_val_read = graph.read_symint(scalar); - EXPECT_TRUE(scalar_val_read == scalar_val); - - float val_a = i + 2.0f; - float val_out = val_a + scalar_val; - - fill_vtensor(graph, a, val_a); - - graph.execute(); - - EXTRACT_TENSOR(out); - - // Sanity check that the values are correct - for (size_t i = 0; i < graph.numel_of(out.value); i++) { - CHECK_VALUE(data_out, i, val_out); - } - } -} - -#define CREATE_WEIGHT_TENSOR(name, sizes, dtype, val) \ - std::vector data_##name(utils::multiply_integers(sizes)); \ - std::fill(data_##name.begin(), data_##name.end(), val); \ - ValueRef name = graph.add_tensorref(sizes, dtype, data_##name.data()); - -TEST(VulkanComputeGraphTest, test_simple_prepacked_graph) { - 
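  // This test prepacks two constant weight tensors (w1 and w2) with the
  // et_vk.prepack.default op and then computes e = (a + w1) * w2. The query
  // pool is enabled so that per-shader timings can be extracted and printed
  // after each execution.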
GraphConfig config; - config.enable_querypool = true; - ComputeGraph graph(config); - - std::vector size_big = {8, 73, 62}; - std::vector size_small = {8, 73, 1}; - - CREATE_WEIGHT_TENSOR(w1, size_small, vkapi::kFloat, 3.5f); - CREATE_WEIGHT_TENSOR(w2, size_small, vkapi::kFloat, 3.0f); - - // Build graph - - IOValueRef a = graph.add_input_tensor(size_big, vkapi::kFloat); - - ValueRef c = graph.add_tensor(size_big, vkapi::kFloat); - ValueRef e = graph.add_tensor(size_big, vkapi::kFloat); - - ValueRef w1_packed = graph.add_tensor(size_small, vkapi::kFloat); - ValueRef w2_packed = graph.add_tensor(size_small, vkapi::kFloat); - - auto prepackFn = VK_GET_OP_FN("et_vk.prepack.default"); - prepackFn(graph, {w1, w1_packed}); - prepackFn(graph, {w2, w2_packed}); - - auto addFn = VK_GET_OP_FN("aten.add.Tensor"); - addFn(graph, {a.value, w1_packed, kDummyValueRef, c}); - - auto mulFn = VK_GET_OP_FN("aten.mul.Tensor"); - mulFn(graph, {c, w2_packed, e}); - - IOValueRef out = {}; - out.value = e; - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - - graph.prepack(); - - // Run graph - - for (float i = 5.0f; i < 30.0f; i += 10.0f) { - float val_out = (i + 3.5f) * 3.0f; - - fill_vtensor(graph, a, i); - - // Execute graph - graph.execute(); - - EXTRACT_TENSOR(out); - - // Sanity check that the values are correct - for (size_t i = 0; i < graph.numel_of(out.value); ++i) { - CHECK_VALUE(data_out, i, val_out); - } - - if (graph.context()->querypool()) { - graph.context()->querypool().extract_results(); - graph.context()->querypool().print_results(); - } - } -} - -TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) { - GraphConfig config; - config.expect_dynamic_shapes = true; - ComputeGraph graph(config); - size_t expected_vma_allocation_count = 0; - - std::vector size_big = {12, 64, 64}; - std::vector size_small = {12, 64, 64}; - - // Build graph and regularly check allocation counts - - IOValueRef a = graph.add_input_tensor( - size_big, - vkapi::kFloat, - /*shared_object_idx = */ 2); - IOValueRef b = graph.add_input_tensor( - size_small, - vkapi::kFloat, - /*shared_object_idx = */ 4); - - // +2: t.sizes_ubo() for each staging shader - expected_vma_allocation_count += 2; - EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); - - ValueRef c = graph.add_tensor( - size_big, - vkapi::kFloat, - /*shared_object_idx = */ 6); - - auto addFn = VK_GET_OP_FN("aten.add.Tensor"); - addFn(graph, {a.value, b.value, kDummyValueRef, c}); - - // no new allocations if binary op uses push constants - EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); - - IOValueRef d = graph.add_input_tensor( - size_small, - vkapi::kFloat, - /*shared_object_idx = */ 2); - - // +1: t.sizes_ubo() uniform buffer for staging shader - expected_vma_allocation_count += 1; - EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); - - ValueRef e = graph.add_tensor( - size_big, - vkapi::kFloat, - /*shared_object_idx = */ 4); - - auto mulFn = VK_GET_OP_FN("aten.mul.Tensor"); - mulFn(graph, {c, d.value, e}); - - // no new allocations if binary op uses push constants - EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); - - IOValueRef out = {}; - out.value = e; - out.staging = graph.set_output_tensor(out.value); - - // +1: staging buffer input tensor - expected_vma_allocation_count += 1; - EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); - - graph.prepare(); - graph.prepack(); - - // +3: shared memory allocations for tensors - 
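  // (One allocation per distinct shared object index used above, i.e. 2, 4,
  // and 6; tensors d and e reuse the shared objects already created for a and
  // b, so they do not add further allocations.)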
expected_vma_allocation_count += 3; - EXPECT_EQ(get_vma_allocation_count(), expected_vma_allocation_count); - - // Run graph - - std::vector> new_sizes_list = { - {8, 44, 34}, {4, 13, 56}, {8, 12, 64}, {12, 55, 33}, {4, 54, 10}}; - - for (auto& new_sizes : new_sizes_list) { - graph.virtual_resize(a.value, new_sizes); - graph.virtual_resize(b.value, new_sizes); - graph.virtual_resize(d.value, new_sizes); - graph.propagate_resize(); - - float val_a = new_sizes[1] + 4.0f; - float val_b = new_sizes[2] + 1.5f; - float val_d = new_sizes[0] + 2.0f; - float val_out = (val_a + val_b) * val_d; - - fill_vtensor(graph, a, val_a); - fill_vtensor(graph, b, val_b); - fill_vtensor(graph, d, val_d); - - // Execute graph - graph.execute(); - - EXTRACT_TENSOR(out); - - // Sanity check that the values are correct - for (size_t i = 0; i < graph.numel_of(out.value); i++) { - CHECK_VALUE(data_out, i, val_out); - } - } - - std::vector> new_sizes_list_2 = { - {8, 44, 34}, {4, 13, 56}, {8, 12, 64}, {12, 55, 33}, {4, 54, 10}}; - - for (auto& new_sizes : new_sizes_list_2) { - graph.resize_input(0, new_sizes); - graph.resize_input(1, new_sizes); - graph.resize_input(2, new_sizes); - graph.propagate_resize(); - - // Check output shape - EXPECT_TRUE(graph.sizes_of(out.value) == new_sizes); - - float val_a = new_sizes[1] + 6.0f; - float val_b = new_sizes[2] + 2.5f; - float val_d = new_sizes[0] + 4.0f; - float val_out = (val_a + val_b) * val_d; - - fill_vtensor(graph, a, val_a); - fill_vtensor(graph, b, val_b); - fill_vtensor(graph, d, val_d); - - // Execute graph - graph.execute(); - - EXTRACT_TENSOR(out); - - // Sanity check that the values are correct - for (size_t i = 0; i < graph.numel_of(out.value); i++) { - CHECK_VALUE(data_out, i, val_out); - } - } -} - -TEST(VulkanComputeGraphTest, test_simple_graph_with_tmp_tensors) { - GraphConfig config; - ComputeGraph graph(config); - - std::vector size_big = {8, 64, 124}; - std::vector size_small = {8, 1, 124}; - - // Build graph - - IOValueRef a = graph.add_input_tensor( - size_big, vkapi::kFloat, /*shared_object_idx = */ 0); - IOValueRef b = graph.add_input_tensor( - size_small, vkapi::kFloat, /*shared_object_idx = */ 1); - - IOValueRef out = {}; - - out.value = - graph.add_tensor(size_big, vkapi::kFloat, /*shared_object_idx = */ 2); - - // Perform the following compute - // - // a, b, out; - // { - // inter; - // { - // tmp = a + b - // tmp2 = tmp + a - // inter = tmp2 + b - // } - // { - // tmp = inter + b; - // tmp2 = tmp + a - // out = tmp2 + b; - // } - // } - { - TmpTensor inter(&graph, size_big, vkapi::kFloat); - EXPECT_TRUE(inter.sobj_idx == 3); - { - TmpTensor tmp(&graph, size_big, vkapi::kFloat); - EXPECT_TRUE(tmp.sobj_idx == 4); - VK_GET_OP_FN("aten.add.Tensor") - (graph, {a, b, kDummyValueRef, tmp}); - - TmpTensor tmp2(&graph, size_big, vkapi::kFloat); - EXPECT_TRUE(tmp2.sobj_idx == 5); - VK_GET_OP_FN("aten.add.Tensor") - (graph, {tmp, a, kDummyValueRef, tmp2}); - - VK_GET_OP_FN("aten.add.Tensor") - (graph, {tmp2, b, kDummyValueRef, inter}); - } - { - TmpTensor tmp(&graph, size_big, vkapi::kFloat); - EXPECT_TRUE(tmp.sobj_idx == 4); - VK_GET_OP_FN("aten.add.Tensor") - (graph, {inter, b, kDummyValueRef, tmp}); - - TmpTensor tmp2(&graph, size_big, vkapi::kFloat); - EXPECT_TRUE(tmp2.sobj_idx == 5); - VK_GET_OP_FN("aten.add.Tensor") - (graph, {tmp, a, kDummyValueRef, tmp2}); - - VK_GET_OP_FN("aten.add.Tensor") - (graph, {tmp2, b, kDummyValueRef, out}); - } - } - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - // Run 
graph - - for (float i = 5.0f; i < 30.0f; i += 10.0f) { - float val_a = i + 2.0f; - float val_b = i + 1.5f; - float val_tmp = val_a + val_b; - float val_tmp2 = val_tmp + val_a; - float val_inter = val_tmp2 + val_b; - float val_tmp_2 = val_inter + val_b; - float val_tmp2_2 = val_tmp_2 + val_a; - float val_out = val_tmp2_2 + val_b; - - fill_vtensor(graph, a, val_a); - fill_vtensor(graph, b, val_b); - - graph.execute(); - - EXTRACT_TENSOR(out); - - // Sanity check that the values are correct - for (size_t i = 0; i < graph.numel_of(out.value); ++i) { - CHECK_VALUE(data_out, i, val_out); - } - } -} - -TEST(VulkanComputeGraphTest, test_large_graph) { - auto build_start_time = std::chrono::system_clock::now(); - GraphConfig config; - config.expect_dynamic_shapes = true; - ComputeGraph graph(config); - - int64_t input_w = 256; - int64_t input_h = 256; - int64_t input_c = 8; - - std::vector size_big = {input_c, input_h, input_w}; - std::vector size_small = {input_c, input_h, 1}; - - std::vector size_big_alt = {input_c / 2, input_h / 2, input_w / 2}; - std::vector size_small_alt = {input_c / 2, input_h / 2, 1}; - - // Build graph - - IOValueRef a = graph.add_input_tensor(size_big, vkapi::kFloat, 2); - IOValueRef b = graph.add_input_tensor(size_small, vkapi::kFloat, 4); - - ValueRef c = graph.add_tensor(size_big, vkapi::kFloat, 6); - - auto addFn = VK_GET_OP_FN("aten.add.Tensor"); - addFn(graph, {a.value, b.value, kDummyValueRef, c}); - - int n = 100; - - for (int i = 0; i < n; i++) { - addFn(graph, {c, b.value, kDummyValueRef, a.value}); - - addFn(graph, {a.value, b.value, kDummyValueRef, c}); - } - - IOValueRef out = {}; - out.value = c; - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - auto build_end_time = std::chrono::system_clock::now(); - - auto build_time = std::chrono::duration_cast( - build_end_time - build_start_time); - - std::stringstream ss; - for (int i = 0; i < 10; i++) { - auto resize_start_time = std::chrono::system_clock::now(); - if (i % 2 == 0) { - graph.resize_input(0, size_big_alt); - graph.resize_input(1, size_small_alt); - } else { - graph.resize_input(0, size_big); - graph.resize_input(1, size_small); - } - graph.propagate_resize(); - auto resize_end_time = std::chrono::system_clock::now(); - - auto resize_time = std::chrono::duration_cast( - resize_end_time - resize_start_time); - - float val_a = 1.0f; - float val_b = 2.0f; - - float val_e = val_a + val_b * (2 * n + 1); - - auto inference_start_time = std::chrono::system_clock::now(); - - fill_vtensor(graph, a, val_a); - fill_vtensor(graph, b, val_b); - - graph.execute(); - - EXTRACT_TENSOR(out); - - auto inference_end_time = std::chrono::system_clock::now(); - - auto inference_time = std::chrono::duration_cast( - inference_end_time - inference_start_time); - - for (int i = 0; i < graph.numel_of(out.value); i++) { - CHECK_VALUE(data_out, i, val_e); - } - - ss << "[ ] Resize: " << std::setw(10) << std::right - << resize_time.count() << " us" << std::endl; - ss << "[ ] Inference: " << std::setw(10) << std::right - << inference_time.count() << " us" << std::endl; - } - ss << "[ ] Model Load:" << std::setw(10) << std::right - << build_time.count() << " us" << std::endl; - std::cout << ss.str(); -} - -void test_clone( - std::vector sizes, - utils::StorageType src_storage, - utils::GPUMemoryLayout src_layout, - utils::StorageType dst_storage, - utils::GPUMemoryLayout dst_layout) { - GraphConfig config; - ComputeGraph graph(config); - - IOValueRef a = - graph.add_input_tensor(sizes, 
vkapi::kFloat, src_storage, src_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(sizes, vkapi::kFloat, dst_storage, dst_layout); - - auto copyFn = VK_GET_OP_FN("aten.clone.default"); - copyFn(graph, {a.value, kDummyValueRef, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0.0f, /*iota = */ true); - - graph.propagate_resize(); - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - for (int i = 0; i < graph.numel_of(a.value); ++i) { - EXPECT_TRUE(data_out[i] == data_a[i]); - } -} - -TEST(VulkanComputeGraphTest, test_clone) { - std::vector> cases{ - {utils::kWidthPacked, utils::kWidthPacked}, - {utils::kWidthPacked, utils::kChannelsPacked}, - {utils::kChannelsPacked, utils::kChannelsPacked}, - }; - - for (std::vector sizes : standard_sizes_to_test) { - for (auto& [src_layout, dst_layout] : cases) { - test_clone( - sizes, utils::kTexture3D, src_layout, utils::kBuffer, dst_layout); - test_clone( - sizes, utils::kBuffer, src_layout, utils::kTexture3D, dst_layout); - test_clone( - sizes, utils::kTexture3D, src_layout, utils::kTexture3D, dst_layout); - } - } -} - -TEST(VulkanComputeGraphTest, test_etvk_copy_offset_node) { - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 6; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout); - - // Notice that copy_node operates on in texture's x, y, z dimension. In the - // comment, we provide the cooresponding coordinate in nchw. - - // src_offset is (n=0, c=4, h=1, w=1) - ValueRef src_offset_ref = graph.add_scalar_list({1, 1, 1}); - - // dst_offset is (n=1, c=8, h=2, w=0) in nchw coordinate - // Argument is {x, y, z}. - // x = 0 since w = 0 - // y = 2 since h = 2 - // z = c / 4 + 2 since - // 1. there c/4 planes per batch, n=1 means we are on the first batch; - // 2. +2 because c = 8, with channel packing it means two texels. - ValueRef dst_offset_ref = graph.add_scalar_list({0, 2, c / 4 + 2}); - - // range is (n=1, c=8, h=2, w=4) - // Argument is {x, y, z}. - // x = 4 since w = 4 - // y = 2 since h = 2 - // z = 2 since we are only copying 8 channels, hence 2 texel. n = 1 can be a - // bit misleading here, since it gives the impression that we are copying the - // entire channel. However, remember when we copy, we are trying to - // dst[dst_offset:dst_offset + range] = src[src_offset:src_offset + range], - // range must be non zero. - ValueRef range_ref = graph.add_scalar_list({4, 2, 2}); - - auto copyFn = VK_GET_OP_FN("etvk.copy_offset"); - copyFn( - graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0.0f, /*iota = */ true); - - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - // We will examine the results in the dst_range - // The value in the cooresponding coordinate should match between the source - // and destination tensor. We loop thru the range, calculate both the src and - // dst index using the offsets, and compare the values in the extracted - // vector. They should match. 
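  // Illustrative check of the mapping described above, using the values chosen
  // for this test (c = 12 and channels packing, so each batch spans c / 4 = 3
  // texel planes along z):
  //   src (n=0, c=4, h=1, w=1) -> {x=1, y=1, z=0 * (c / 4) + 4 / 4 = 1}
  //   dst (n=1, c=8, h=2, w=0) -> {x=0, y=2, z=1 * (c / 4) + 8 / 4 = 5}
  {
    const int64_t src_z = 0 * (c / 4) + 4 / 4;
    const int64_t dst_z = 1 * (c / 4) + 8 / 4;
    EXPECT_TRUE(src_z == 1);
    EXPECT_TRUE(dst_z == c / 4 + 2);
  }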
- int n_idx = 0; - // at each nested loop, index range from dst_offset to dst_offset + range - - for (int c_idx = 0; c_idx < 8; c_idx++) { - for (int h_idx = 0; h_idx < 2; h_idx++) { - for (int w_idx = 0; w_idx < 4; w_idx++) { - auto dst_idx = - get_buf_idx(graph, out, {n_idx + 1, c_idx + 8, h_idx + 2, w_idx}); - auto src_idx = - get_buf_idx(graph, a, {n_idx, c_idx + 4, h_idx + 1, w_idx + 1}); - - EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); - } - } - } -} - -TEST(VulkanComputeGraphTest, DISABLED_test_etvk_copy_channel_offset_node) { - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 2; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout); - - int64_t src_offset = 2; - int64_t dst_offset = 3; - int64_t range = 7; - - ValueRef src_offset_ref = graph.add_scalar(src_offset); - ValueRef dst_offset_ref = graph.add_scalar(dst_offset); - ValueRef range_ref = graph.add_scalar(range); - - auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset"); - copyFn( - graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0.0f, true); - - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = 0; c_idx < range; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto src_idx = - get_buf_idx(graph, a, {n_idx, c_idx + src_offset, h_idx, w_idx}); - auto dst_idx = get_buf_idx( - graph, out, {n_idx, c_idx + dst_offset, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); - } - } - } - } -} - -TEST( - VulkanComputeGraphTest, - DISABLED_test_etvk_copy_channel_offset_node_clean_boundary) { - // Tricky part for channel copy is handling the boundary across multiple copy. - // For example, when we concat two [3, 1, 1] nchw-tensors along the channel - // dimension, due to channel packing, elements from different source texel - // will be packed into same destination texel at the boundaries. - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 2; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef zero = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - IOValueRef b = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout); - - auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset"); - - // Make sure entire out tensor is zeroed. The zero tensor will be filled with - // zero later. 
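  // (Zero-filling the whole output first gives the boundary texels a known
  // value, so the checks at the end of this test can verify that channels
  // outside the two copied ranges are left untouched, i.e. still zero.)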
- copyFn( - graph, - {zero.value, - graph.add_scalar(c), - graph.add_scalar(0), - graph.add_scalar(0), - out.value}); - - int64_t a_src_offset = 0; - int64_t a_dst_offset = 2; - int64_t a_range = 5; - // a will write to channge [2, 7) - copyFn( - graph, - {a.value, - graph.add_scalar(a_range), - graph.add_scalar(a_src_offset), - graph.add_scalar(a_dst_offset), - out.value}); - - // b will write to channel [6, 11) - // Intentional for b to override channel=6 - int64_t b_src_offset = 0; - int64_t b_dst_offset = 6; - int64_t b_range = 5; - - copyFn( - graph, - {b.value, - graph.add_scalar(b_range), - graph.add_scalar(b_src_offset), - graph.add_scalar(b_dst_offset), - out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - float a_value = 1.0f; - float b_value = 2.0f; - float zero_value = 0.0f; - fill_vtensor(graph, a, a_value); - fill_vtensor(graph, b, b_value); - fill_vtensor(graph, zero, zero_value); - - graph.execute(); - - EXTRACT_TENSOR(out); - - for (int n_idx = 0; n_idx < n; n_idx++) { - // c_idx only up to a_range-1 because the expected overwrite by b - for (int c_idx = a_dst_offset; c_idx < a_dst_offset + a_range - 1; - c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == a_value); - } - } - } - } - - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = b_dst_offset; c_idx < b_dst_offset + b_range; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == b_value); - } - } - } - } - - // Also verify that data before a_dst_offset and after b_dst_offset + b_range - // are untouched. - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = 0; c_idx < a_dst_offset; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == zero_value); - } - } - } - } - - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = b_dst_offset + b_range; c_idx < c; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto dst_idx = get_buf_idx(graph, out, {n_idx, c_idx, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == zero_value); - } - } - } - } -} - -TEST(VulkanComputeGraphTest, test_etvk_copy_offset_int_node) { - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 6; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef a = graph.add_input_tensor(size, vkapi::kInt, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kInt, memory_layout); - - // Notice that copy_node operates on in texture's x, y, z dimension. In the - // comment, we provide the cooresponding coordinate in nchw. - - // src_offset is (n=0, c=4, h=1, w=1) - ValueRef src_offset_ref = graph.add_scalar_list({1, 1, 1}); - - // dst_offset is (n=1, c=8, h=2, w=0) in nchw coordinate - // Argument is {x, y, z}. - // x = 0 since w = 0 - // y = 2 since h = 2 - // z = c / 4 + 2 since - // 1. there c/4 planes per batch, n=1 means we are on the first batch; - // 2. 
+2 because c = 8, with channel packing it means two texels. - ValueRef dst_offset_ref = graph.add_scalar_list({0, 2, c / 4 + 2}); - - // range is (n=1, c=8, h=2, w=4) - // Argument is {x, y, z}. - // x = 4 since w = 4 - // y = 2 since h = 2 - // z = 2 since we are only copying 8 channels, hence 2 texel. n = 1 can be a - // bit misleading here, since it gives the impression that we are copying the - // entire channel. However, remember when we copy, we are trying to - // dst[dst_offset:dst_offset + range] = src[src_offset:src_offset + range], - // range must be non zero. - ValueRef range_ref = graph.add_scalar_list({4, 2, 2}); - - auto copyFn = VK_GET_OP_FN("etvk.copy_offset"); - copyFn( - graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0, /*iota = */ true); - - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - // We will examine the results in the dst_range - // The value in the cooresponding coordinate should match between the source - // and destination tensor. We loop thru the range, calculate both the src and - // dst index using the offsets, and compare the values in the extracted - // vector. They should match. - int n_idx = 0; - // at each nested loop, index range from dst_offset to dst_offset + range - - for (int c_idx = 0; c_idx < 8; c_idx++) { - for (int h_idx = 0; h_idx < 2; h_idx++) { - for (int w_idx = 0; w_idx < 4; w_idx++) { - auto dst_idx = - get_buf_idx(graph, out, {n_idx + 1, c_idx + 8, h_idx + 2, w_idx}); - auto src_idx = - get_buf_idx(graph, a, {n_idx, c_idx + 4, h_idx + 1, w_idx + 1}); - - EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); - } - } - } -} - -TEST(VulkanComputeGraphTest, DISABLED_test_etvk_copy_channel_offset_int_node) { - GraphConfig config; - ComputeGraph graph(config); - - int64_t n = 2; - int64_t c = 12; - int64_t h = 4; - int64_t w = 8; - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED; - - std::vector size = {n, c, h, w}; - - IOValueRef a = graph.add_input_tensor(size, vkapi::kFloat, memory_layout); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, memory_layout); - - int64_t src_offset = 2; - int64_t dst_offset = 3; - int64_t range = 7; - - ValueRef src_offset_ref = graph.add_scalar(src_offset); - ValueRef dst_offset_ref = graph.add_scalar(dst_offset); - ValueRef range_ref = graph.add_scalar(range); - - auto copyFn = VK_GET_OP_FN("etvk.copy_channel_offset"); - copyFn( - graph, {a.value, range_ref, src_offset_ref, dst_offset_ref, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, a, 0.0f, true); - - graph.execute(); - - EXTRACT_TENSOR(out); - EXTRACT_TENSOR(a); - - for (int n_idx = 0; n_idx < n; n_idx++) { - for (int c_idx = 0; c_idx < range; c_idx++) { - for (int h_idx = 0; h_idx < h; h_idx++) { - for (int w_idx = 0; w_idx < w; w_idx++) { - auto src_idx = - get_buf_idx(graph, a, {n_idx, c_idx + src_offset, h_idx, w_idx}); - auto dst_idx = get_buf_idx( - graph, out, {n_idx, c_idx + dst_offset, h_idx, w_idx}); - EXPECT_TRUE(data_out[dst_idx] == data_a[src_idx]); - } - } - } - } -} - -TEST(VulkanComputeGraphTest, test_view_change_packing) { - std::vector> - layout_pairs = { - {utils::kWidthPacked, utils::kChannelsPacked}, - {utils::kWidthPacked, utils::kHeightPacked}, - {utils::kWidthPacked, utils::kWidthPacked}, - {utils::kHeightPacked, 
utils::kChannelsPacked}, - {utils::kHeightPacked, utils::kHeightPacked}, - {utils::kHeightPacked, utils::kHeightPacked}, - {utils::kChannelsPacked, utils::kChannelsPacked}, - {utils::kChannelsPacked, utils::kHeightPacked}, - {utils::kChannelsPacked, utils::kHeightPacked}, - }; - - int64_t n = 3; - int64_t c = 2; - int64_t h = 2; - int64_t w = 5; - std::vector size = {n, c, h, w}; - - for (auto layout_pair : layout_pairs) { - GraphConfig config; - ComputeGraph graph(config); - - IOValueRef in = - graph.add_input_tensor(size, vkapi::kFloat, layout_pair.first); - - IOValueRef out = {}; - out.value = graph.add_tensor(size, vkapi::kFloat, layout_pair.second); - - auto viewFn = VK_GET_OP_FN("aten.view_copy.default"); - viewFn(graph, {in.value, graph.add_none(), out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - graph.prepack(); - - fill_vtensor(graph, in, 0.0, true); - - graph.execute(); - - EXTRACT_TENSOR(out); - - // The extracted data is a flattened nchw buffer. Hence, should expect the - // all elements inside the out array to match the index. - for (int i = 0; i < graph.numel_of(out.value); i++) { - CHECK_VALUE(data_out, i, i); - } - } -} - -class VulkanToFromGPUShaderTest : public ::testing::Test { - public: - void SetUp() override { - // Make sure we are starting with a clean slate - EXPECT_TRUE(get_vma_allocation_count() == 0); - } - - void TearDown() override { - context()->flush(); - - // Make sure we are ending with a clean slate - EXPECT_TRUE(get_vma_allocation_count() == 0); - } -}; - -template -void run_from_gpu_test( - std::vector& sizes, - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, - vkapi::ScalarType dtype = vkapi::kFloat, - utils::StorageType storage_type = utils::StorageType::TEXTURE_3D) { - if (dtype == vkapi::kHalf && - !context()->adapter_ptr()->supports_16bit_storage_buffers()) { - return; - } - vTensor vten = vTensor(context(), sizes, dtype, storage_type, memory_layout); - - std::string kernel_name("idx_fill_texture"); - add_dtype_suffix(kernel_name, vten.dtype()); - - int32_t offset = -50; - - { - vkapi::PipelineBarrier pipeline_barrier{}; - context()->submit_compute_job( - VK_KERNEL_FROM_STR(kernel_name), - pipeline_barrier, - vten.logical_limits(), - {4, 4, 4}, - {vten.packed_dim(), offset}, - VK_NULL_HANDLE, - 0, - vten.image( - pipeline_barrier, - vkapi::PipelineStage::COMPUTE, - vkapi::MemoryAccessType::WRITE), - vten.sizes_ubo()); - } - - StagingBuffer staging_buffer(context(), dtype, vten.staging_buffer_numel()); - - if (dtype == vkapi::kChar && - !context()->adapter_ptr()->has_full_int8_buffers_support()) { - record_bitw8_image_to_nchw_nobitw8buffer_op( - context(), vten, staging_buffer); - } else { - record_image_to_nchw_op(context(), vten, staging_buffer.buffer()); - } - - submit_to_gpu(); - - std::vector data_out(staging_buffer.numel()); - staging_buffer.copy_to(data_out.data(), staging_buffer.nbytes()); - - for (int i = 0; i < vten.numel(); i++) { - CHECK_VALUE(data_out, i, i + offset); - } -} - -template -void round_trip_test( - std::vector& sizes, - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, - vkapi::ScalarType dtype = vkapi::kFloat, - utils::StorageType storage_type = utils::StorageType::TEXTURE_3D) { - if (dtype == vkapi::kHalf && - !context()->adapter_ptr()->supports_16bit_storage_buffers()) { - return; - } - - vTensor vten = vTensor(context(), sizes, dtype, storage_type, memory_layout); - - // Create and fill input staging 
buffer - StagingBuffer staging_buffer_in( - context(), dtype, vten.staging_buffer_numel()); - - std::vector data_in(staging_buffer_in.numel()); - for (int i = 0; i < staging_buffer_in.numel(); i++) { - data_in[i] = T(i * -1); - } - staging_buffer_in.copy_from(data_in.data(), vten.staging_buffer_nbytes()); - - // Output staging buffer - StagingBuffer staging_buffer_out( - context(), dtype, vten.staging_buffer_numel()); - - record_nchw_to_image_op(context(), staging_buffer_in.buffer(), vten); - - // Copy data in and out of the tensor - if (dtype == vkapi::kChar && - !context()->adapter_ptr()->has_full_int8_buffers_support()) { - record_bitw8_image_to_nchw_nobitw8buffer_op( - context(), vten, staging_buffer_out); - } else { - record_image_to_nchw_op(context(), vten, staging_buffer_out.buffer()); - } - - // Execute command buffer - submit_to_gpu(); - - // Extract data from output staging buffer - std::vector data_out(staging_buffer_out.numel()); - staging_buffer_out.copy_to(data_out.data(), staging_buffer_out.nbytes()); - - // All indices should be equal to the input data - for (int i = 0; i < vten.numel(); i++) { - CHECK_VALUE(data_out, i, data_in[i]); - } -} - -template -void compute_graph_round_trip_test( - std::vector& sizes, - utils::GPUMemoryLayout memory_layout = - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, - vkapi::ScalarType dtype = vkapi::kFloat, - utils::StorageType storage_type = utils::StorageType::TEXTURE_3D) { - if (dtype == vkapi::kHalf && - !context()->adapter_ptr()->supports_16bit_storage_buffers()) { - return; - } - - GraphConfig config; - ComputeGraph graph(config); - - ValueRef r_tensor = - graph.add_tensor(sizes, dtype, storage_type, memory_layout); - ValueRef r_staging_in = graph.set_input_tensor(r_tensor); - ValueRef r_staging_out = graph.set_output_tensor(r_tensor); - - graph.prepare(); - graph.prepack(); - - std::vector data_in(graph.numel_of(r_tensor)); - for (int i = 0; i < data_in.size(); i++) { - data_in[i] = T(i * -1); - } - graph.copy_into_staging(r_staging_in, data_in.data(), data_in.size()); - - graph.execute(); - - std::vector data_out(graph.staging_buffer_numel_of(r_tensor)); - graph.copy_from_staging(r_staging_out, data_out.data(), data_out.size()); - - for (int i = 0; i < data_in.size(); i++) { - CHECK_VALUE(data_out, i, data_in[i]); - } -} - -TEST(VulkanToFromGPUShaderTest, round_trip_tests) { - // The below tests will fill each texel element with the value of the linear - // buffer index that corresponds to it. The texel at position (0, 0, 0) will - // be filled with the values [0, 1, 2, 3], the texel at position (1, 0, 0) - // will be filled with the values [4, 5, 6, 7], and so forth. The contents of - // the texture are then written back to the CPU, and to check that the - // transfer has ben performed correctly the value at each index of the CPU - // data buffer should be equal to the index. - // - // The below test cases should ensure that the total number of elements does - // not exceed 2048, or else the tests will fail for FP16 textures due to - // precision issues. Half precision floating point formats can only represent - // integers from 2048 to 4096 using intervals of 2. - std::vector> to_test = { - // 2D sizes - {17, 21}, - {67, 23}, - {55, 33}, - // 3D sizes - {7, 9, 13}, - {21, 2, 19}, - {17, 17, 5}, - // 4D sizes - {7, 3, 13, 7}, - {11, 9, 9, 1}, - {3, 3, 3, 3}, - {3, 1, 7, 13}, - }; - - // These sizes are set such that the total number of elements is less than - // 128 which is the maximum representable value for int8. 
- std::vector> to_test_int8 = { - // 2D sizes - {14, 7}, - // 3D sizes - {3, 7, 5}, - {4, 2, 11}, - // 4D sizes - {3, 3, 3, 3}, - {7, 1, 6, 3}, - }; - -#define RUN_TESTS(ctype, dtype) \ - round_trip_test( \ - sizes, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, dtype); \ - round_trip_test( \ - sizes, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, dtype); \ - round_trip_test( \ - sizes, utils::GPUMemoryLayout::TENSOR_HEIGHT_PACKED, dtype); \ - compute_graph_round_trip_test( \ - sizes, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, dtype); \ - compute_graph_round_trip_test( \ - sizes, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, dtype); \ - compute_graph_round_trip_test( \ - sizes, utils::GPUMemoryLayout::TENSOR_HEIGHT_PACKED, dtype); - - for (auto& sizes : to_test) { - RUN_TESTS(float, vkapi::kFloat) - RUN_TESTS(executorch::aten::Half, vkapi::kHalf) - } - - for (auto& sizes : to_test_int8) { - RUN_TESTS(int8_t, vkapi::kChar); - } - -#undef RUN_TESTS -} - -// -// Operator Smoke Tests -// - -void test_binary_op( - std::string op_name, - std::vector sizes_big, - std::vector sizes_small, - vkapi::ScalarType dtype, - utils::GPUMemoryLayout memory_layout) { - GraphConfig config; - ComputeGraph graph(config); - - IOValueRef arg2{}; - - // Build graph - - IOValueRef arg1 = graph.add_input_tensor(sizes_big, dtype, memory_layout); - arg2 = graph.add_input_tensor(sizes_small, dtype, memory_layout); - - IOValueRef out; - out.value = graph.add_tensor(sizes_big, dtype, memory_layout); - - std::stringstream ss; - ss << "aten."; - ss << op_name; - ss << ".Tensor"; - VK_GET_OP_FN(ss.str()) - (graph, {arg1.value, arg2.value, kDummyValueRef, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - - graph.prepack(); - - for (int i = 1; i < 4; i++) { - float val_arg1 = i + 1.5; - float val_arg2 = i - 3.5; - - float val_out = val_arg1 + val_arg2; - if (op_name == "sub") { - val_out = val_arg1 - val_arg2; - } - if (op_name == "mul") { - val_out = val_arg1 * val_arg2; - } - if (op_name == "div") { - val_out = val_arg1 / val_arg2; - } - - execute_graph_and_check_output(graph, {val_arg1, val_arg2}, {val_out}); - } -} - -#define CALL_TEST_FN_FORALL_CONDITIONS(_) \ - _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked) \ - _(vkapi::kFloat, utils::kTexture3D, utils::kHeightPacked) \ - _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked) - -#define CALL_TEST_FN_FOR_W_PACKED(_) \ - _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, false) \ - _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, true) \ - _(vkapi::kFloat, utils::kBuffer, utils::kWidthPacked, false) \ - _(vkapi::kFloat, utils::kBuffer, utils::kWidthPacked, true) - -#define CALL_TEST_FN_FOR_C_PACKED(_) \ - _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, false) \ - _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, true) \ - _(vkapi::kFloat, utils::kBuffer, utils::kChannelsPacked, false) \ - _(vkapi::kFloat, utils::kBuffer, utils::kChannelsPacked, true) - -TEST(VulkanComputeGraphOpsTest, add_smoke_test) { -#define RUN_TESTS(dtype, storage, layout) \ - test_binary_op("add", {17, 21}, {17, 21}, dtype, layout); \ - test_binary_op("add", {17, 21}, {1, 1}, dtype, layout); \ - test_binary_op("sub", {11, 22}, {11, 22}, dtype, layout); \ - test_binary_op("sub", {11, 22}, {11, 1}, dtype, layout); \ - test_binary_op("add", {7, 17, 17}, {7, 17, 17}, dtype, layout); \ - test_binary_op("add", {7, 17, 17}, {7, 1, 17}, dtype, layout); \ - test_binary_op("sub", {9, 9, 7}, {9, 9, 7}, dtype, layout); 
\ - test_binary_op("sub", {9, 9, 7}, {9, 1, 1}, dtype, layout); - - CALL_TEST_FN_FORALL_CONDITIONS(RUN_TESTS); - -#undef RUN_TESTS -} - -void test_mm( - int B, - int M, - int K, - int N, - vkapi::ScalarType dtype, - utils::StorageType storage_type, - utils::GPUMemoryLayout memory_layout, - bool prepack = true) { - std::vector mat2_size = {B, K, N}; - - std::vector mat2_data(utils::multiply_integers(mat2_size)); - std::fill(mat2_data.begin(), mat2_data.end(), 2.0f); - ComputeGraph graph = build_mm_graph( - B, M, K, N, dtype, storage_type, memory_layout, mat2_data, prepack); - - graph.prepare(); - graph.prepack(); - - for (int i = 1; i < 4; i++) { - if (prepack) { - float val_mat1 = i; - float val_out = K * (val_mat1 * 2.0f); - execute_graph_and_check_output(graph, {val_mat1}, {val_out}); - } else { - float val_mat1 = i; - float val_mat2 = i + 1; - float val_out = K * (val_mat1 * val_mat2); - execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out}); - } - } -} - -TEST(VulkanComputeGraphOpsTest, mm_smoke_test) { -#define RUN_TESTS(dtype, storage_type, layout, prepack) \ - test_mm( \ - /*B = */ 1, \ - /*M = */ 31, \ - /*K = */ 127, \ - /*N = */ 23, \ - dtype, \ - storage_type, \ - layout, \ - prepack); \ - test_mm( \ - /*B = */ 5, \ - /*M = */ 31, \ - /*K = */ 127, \ - /*N = */ 23, \ - dtype, \ - storage_type, \ - layout, \ - prepack); \ - test_mm( \ - /*B = */ 7, \ - /*M = */ 13, \ - /*K = */ 89, \ - /*N = */ 17, \ - dtype, \ - storage_type, \ - layout, \ - prepack); \ - test_mm( \ - /*B = */ 1, \ - /*M = */ 13, \ - /*K = */ 89, \ - /*N = */ 17, \ - dtype, \ - storage_type, \ - layout, \ - prepack); - - CALL_TEST_FN_FOR_W_PACKED(RUN_TESTS); - CALL_TEST_FN_FOR_C_PACKED(RUN_TESTS); - -#undef RUN_TESTS -} - -void test_mm_with_resize_reencode( - int B, - int M, - int K, - int N, - vkapi::ScalarType dtype, - utils::StorageType storage_type, - utils::GPUMemoryLayout memory_layout) { - ASSERT_TRUE(M > 1); - - std::vector mat2_size = {B, K, N}; - std::vector mat2_data(utils::multiply_integers(mat2_size)); - std::fill(mat2_data.begin(), mat2_data.end(), 2.0f); - - ComputeGraph graph = build_mm_graph( - B, M, K, N, dtype, storage_type, memory_layout, mat2_data, false); - - graph.prepare(); - graph.prepack(); - - for (int i = 1; i < 4; i++) { - float val_mat1 = i; - float val_mat2 = i + 1; - float val_out = K * (val_mat1 * val_mat2); - execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out}); - } - - // Switch to GEMV mode - int new_K = K / 2; - std::vector new_mat1_size = {1, new_K}; - std::vector new_mat2_size = {new_K, N}; - graph.resize_input(0, new_mat1_size); - graph.resize_input(1, new_mat2_size); - graph.propagate_resize(); - - for (int i = 1; i < 4; i++) { - float val_mat1 = i; - float val_mat2 = i + 1; - float val_out = new_K * (val_mat1 * val_mat2); - execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out}); - } -} - -TEST(VulkanComputeGraphOpsTest, test_graph_resize_reencode) { - test_mm_with_resize_reencode( - /*B = */ 1, - /*M = */ 31, - /*K = */ 127, - /*N = */ 23, - vkapi::kFloat, - utils::kTexture3D, - utils::kWidthPacked); -} - -void test_grid_priors( - std::vector input_sizes, - std::vector output_sizes, - int stride, - double offset, - const std::vector& data_out_expected) { - GraphConfig config; - ComputeGraph graph(config); - - // Build graph - IOValueRef in = graph.add_input_tensor( - input_sizes, - vkapi::kFloat, - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED); - IOValueRef out; - out.value = graph.add_tensor( - output_sizes, - 
vkapi::kFloat, - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED); - - VK_GET_OP_FN("et_vk.grid_priors.default") - (graph, - {in.value, - graph.add_scalar(stride), - graph.add_scalar(offset), - out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - - graph.prepack(); - - // Resize input - graph.propagate_resize(); - - // run graph - graph.execute(); - - std::vector output_data(graph.staging_buffer_numel_of(out.value)); - graph.copy_from_staging(out.staging, output_data.data(), output_data.size()); - - // check results - std::vector out_sizes = graph.sizes_of(out.value); - int h_out = utils::val_at(-2, out_sizes); - int w_out = utils::val_at(-1, out_sizes); - for (size_t i = 0; i < h_out; ++i) { - for (size_t j = 0; j < w_out; ++j) { - size_t idx_out = i * w_out + j; - CHECK_VALUE(output_data, idx_out, data_out_expected[idx_out]); - } - } -} - -TEST(VulkanComputeGraphOpsTest, grid_priors_test) { - test_grid_priors( - /*input size = */ {1, 5, 2, 3}, - /*output size = */ {6, 2}, - /*stride = */ 1, - /*offset = */ 0.0, - /*data_out_expected = */ {0, 0, 1, 0, 2, 0, 0, 1, 1, 1, 2, 1}); - - test_grid_priors( - /*input size = */ {1, 5, 2, 3}, - /*output size = */ {6, 2}, - /*stride = */ 8, - /*offset = */ 0.5, - /*data_out_expected = */ {4, 4, 12, 4, 20, 4, 4, 12, 12, 12, 20, 12}); -} - -void test_transpose_view_mm( - const int B, - const int M, - const int K, - const int N, - utils::StorageType storage_type) { - GraphConfig config; - config.expect_dynamic_shapes = true; - config.set_storage_type_override(storage_type); - ComputeGraph graph(config); - - std::vector mat1_size = {M, K}; - std::vector mat2_t_size = {N, K}; - std::vector out_size = {M, N}; - - std::vector mat1_small_size = {M - 4, K - 3}; - std::vector mat2_t_small_size = {N - 1, K - 3}; - - if (B > 1) { - mat1_size.resize(3); - mat1_size = {B, M, K}; - mat2_t_size.resize(3); - mat2_t_size = {B, N, K}; - out_size.resize(3); - out_size = {B, M, N}; - - mat1_small_size.resize(3); - mat1_small_size = {B, M - 4, K - 3}; - mat2_t_small_size.resize(3); - mat2_t_small_size = {B, N - 1, K - 3}; - } - - // Build graph; use shared objects to test views of shared objects - - IOValueRef mat1 = - graph.add_input_tensor(mat1_size, vkapi::kFloat, utils::kWidthPacked, 0); - IOValueRef mat2_transpose = graph.add_input_tensor( - mat2_t_size, vkapi::kFloat, utils::kWidthPacked, 1); - - ValueRef mat2 = graph.add_tensor_view(mat2_transpose.value); - - ValueRef dim0; - ValueRef dim1; - - if (B > 1) { - dim0 = graph.add_scalar(1); - dim1 = graph.add_scalar(2); - } else { - dim0 = graph.add_scalar(0); - dim1 = graph.add_scalar(1); - } - - IOValueRef out; - out.value = graph.add_tensor(out_size, vkapi::kFloat, utils::kWidthPacked, 2); - - VK_GET_OP_FN("aten.transpose.int") - (graph, {mat2_transpose.value, dim0, dim1, mat2}); - VK_GET_OP_FN("aten.mm.default")(graph, {mat1.value, mat2, out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - - graph.prepack(); - - for (int i = 1; i < 4; i++) { - float val_mat1 = i; - float val_mat2 = i + 1; - float val_out = K * (val_mat1 * val_mat2); - - // Try at full size - graph.resize_input(0, mat1_size); - graph.resize_input(1, mat2_t_size); - graph.propagate_resize(); - execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out}); - - // Try at reduced sizes - val_out = (K - 3) * (val_mat1 * val_mat2); - graph.resize_input(0, mat1_small_size); - graph.resize_input(1, mat2_t_small_size); - graph.propagate_resize(); - 
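- // Both inputs are filled with constants, so each output element is a dot
- // product over the reduction dimension: a sum of K' terms of
- // (val_mat1 * val_mat2), i.e. K' * val_mat1 * val_mat2. After the resize
- // above, the reduction dimension shrinks to K' = K - 3, which is why
- // val_out was recomputed before re-running the graph.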
execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out}); - } -} - -TEST(VulkanComputeGraphOpsTest, test_transpose_with_mm) { - for (auto storage_type : {utils::kBuffer, utils::kTexture3D}) { - test_transpose_view_mm(2, 7, 17, 5, storage_type); - } -} - -void test_to_copy() { - GraphConfig config; - config.set_storage_type_override(utils::kTexture3D); - ComputeGraph graph(config); - int M = 8; - int N = 8; - int K = 8; - // Build graph - IOValueRef in = graph.add_input_tensor( - {1, M, N, K}, - vkapi::kFloat, - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED); - - std::vector data_in = - create_random_float_buffer(M * N * K, -1024, 1024); - graph.copy_into_staging(in.staging, data_in.data(), data_in.size()); - - IOValueRef out; - out.value = graph.add_tensor( - {1, M, N, K}, - vkapi::kHalf, - utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED); - - auto op = VK_GET_OP_FN("aten._to_copy.default"); - op(graph, - {in.value, - graph.add_none(), - graph.add_none(), - graph.add_none(), - graph.add_none(), - graph.add_none(), - graph.add_none(), - out.value}); - - out.staging = graph.set_output_tensor(out.value); - - graph.prepare(); - - graph.prepack(); - graph.propagate_resize(); - graph.execute(); - - std::vector output_data(graph.numel_of(out.value)); - graph.copy_from_staging(out.staging, output_data.data(), output_data.size()); - - EXPECT_EQ(data_in.size(), output_data.size()); - -#ifdef VULKAN_DEBUG - float mse_ex = 0.0f; - float mse_vk = 0.0f; -#endif - - // check results - for (size_t i = 0; i < output_data.size(); ++i) { - float input = data_in[i]; - torch::executor::Half expected_output = - static_cast(input); - uint16_t* expected_bits = reinterpret_cast(&expected_output); - torch::executor::Half output = output_data[i]; - uint16_t* output_bits = reinterpret_cast(&output); - -#ifdef VULKAN_DEBUG - std::string msg; - msg.reserve(64); - msg = "input = " + std::to_string(input) + "(0b" + - std::bitset<32>(*reinterpret_cast(&input)).to_string() + - "), expected output = " + std::to_string(expected_output) + "(0b" + - std::bitset<16>(*expected_bits).to_string() + - "), recieved output = " + std::to_string(output) + "(0b" + - std::bitset<16>(*output_bits).to_string() + ")"; - - std::cout << msg << std::endl; - - mse_ex += std::pow(expected_output - input, 2); - mse_vk += std::pow(output - input, 2); -#endif - - // Note: Torch executor half "rounds up" when converting to fp16 whereas - // most driver implementations of Vulkan's opFConvert() just truncates the - // extra bits for performance (rounding introduces conditional). - // Example: - // INPUT F32 = 25.248 (sign{0b0}, exp{0b10000011}, - // mantissa{0b10010011111101111100111}), - // TORCH HALF OUTPUT F16 = 25.25 (sign{0b0}, exp{0b10011}, - // mantissa{0b1001010000}), - // VULKAN OUTPUT F16 = 25.2344 (sign{0b0}, exp{0b10011}, - // mantissa{0b1001001111}) - // Note: - // The vulkan mantissa exactly matches the first 10 - // bits of the input 23 bit mantissa. But since the 11th bit is 1, the - // torch half output is rounded up (essentially adding a 1). 
- // Vulkan mantissa{0b1001001111} + 1 = Torch half mantissa{0b1001010000} - - EXPECT_TRUE( - (*output_bits == *expected_bits) || - /*rounding error*/ ((*output_bits + 1u) == *expected_bits)); - } - -#ifdef VULKAN_DEBUG - mse_ex /= output_data.size(); - mse_vk /= output_data.size(); - - std::cout << "=========================================================" - << std::endl; - std::cout << "mse_ex = " << mse_ex << ", mse_vk = " << mse_vk << std::endl; -#endif -} - -TEST(VulkanComputeGraphOpsTest, test_to_copy) { - if (context()->adapter_ptr()->supports_16bit_storage_buffers()) { - test_to_copy(); - } -} - -vkapi::ShaderInfo pick_dynamic_dispatch_shader( - ComputeGraph* graph, - const std::vector& args, - const std::vector& additional_args) { - const ValueRef mat1 = args[1].refs[0]; - - std::string kernel_name = "dynamic_dispatch_test"; - if (graph->size_at(-2, mat1) == 1) { - kernel_name += "_var1"; - } else { - kernel_name += "_var2"; - } - return VK_KERNEL_FROM_STR(kernel_name); -} - -utils::uvec3 pick_dynamic_dispatch_global_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const std::vector& args, - const std::vector& resize_args) { - (void)shader; - const ValueRef out = args[0].refs[0]; - return graph->logical_limits_of(out); -} - -utils::uvec3 pick_dynamic_dispatch_local_wg_size( - ComputeGraph* graph, - const vkapi::ShaderInfo& shader, - const utils::uvec3& global_workgroup_size, - const std::vector& args, - const std::vector& resize_args) { - (void)graph; - (void)shader; - (void)global_workgroup_size; - return {64, 1, 1}; -} - -void resize_dynamic_dispatch_node( - ComputeGraph* graph, - const std::vector& args, - const std::vector& additional_args) { - const ValueRef out = args[0].refs[0]; - const ValueRef mat1 = args[1].refs[0]; - - std::vector out_sizes = graph->sizes_of(mat1); - out_sizes.at(out_sizes.size() - 2) = 1; - - graph->virtual_resize(out, out_sizes); -} - -void add_dynamic_dispatch_test_node( - ComputeGraph& graph, - const ValueRef mat1, - const ValueRef mat2, - const ValueRef out) { - graph.execute_nodes().emplace_back(new DynamicDispatchNode( - graph, - pick_dynamic_dispatch_shader, - pick_dynamic_dispatch_global_wg_size, - pick_dynamic_dispatch_local_wg_size, - // Inputs and Outputs - {{out, vkapi::kWrite}, {{mat1, mat2}, vkapi::kRead}}, - // Shader params buffers - {}, - // Push Constants - {graph.sizes_pc_of(out), - graph.sizes_pc_of(mat1), - graph.sizes_pc_of(mat2)}, - // Specialization constants - {}, - // Resize Logic - {}, - resize_dynamic_dispatch_node)); -} - -vkcompute::ComputeGraph build_dynamic_dispatch_test_graph(int M, int N) { - using namespace vkcompute; - GraphConfig config; - config.expect_dynamic_shapes = true; - ComputeGraph graph(config); - - vkapi::ScalarType dtype = vkapi::kFloat; - utils::StorageType in_out_stype = utils::kTexture3D; - utils::GPUMemoryLayout memory_layout = utils::kWidthPacked; - - std::vector mat1_size = {M, N}; - std::vector mat2_size = {M, N}; - std::vector out_size = {1, N}; - - IOValueRef mat1 = - graph.add_input_tensor(mat1_size, dtype, in_out_stype, memory_layout); - IOValueRef mat2{}; - - mat2.value = graph.add_tensor(mat2_size, dtype, in_out_stype, memory_layout); - mat2.staging = graph.set_input_tensor(mat2.value); - - IOValueRef out; - out.value = graph.add_tensor(out_size, dtype, in_out_stype, memory_layout); - - add_dynamic_dispatch_test_node(graph, mat1, mat2, out); - - out.staging = graph.set_output_tensor(out.value); - - return graph; -} - -void test_dynamic_dispatch(int M, int N) { - 
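- // Exercises DynamicDispatchNode: pick_dynamic_dispatch_shader re-selects
- // the shader variant on every dispatch, choosing "dynamic_dispatch_test_var1"
- // when the second-to-last dim of mat1 is 1 and "_var2" otherwise. Resizing
- // the inputs to a GEMV-like shape below therefore swaps shader variants
- // without rebuilding the graph; the expected outputs account for the
- // constant offset each shader variant adds.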
ComputeGraph graph = build_dynamic_dispatch_test_graph(M, N); - - graph.prepare(); - graph.prepack(); - - for (int i = 1; i < 4; i++) { - float val_mat1 = i; - float val_mat2 = i + 1; - // 5.3 is a hardcoded offset in the compute shader - float val_out = M * (val_mat1 * val_mat2) + 5.5; - execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out}); - } - - // Switch to GEMV mode - int new_N = N / 2; - std::vector new_mat1_size = {1, new_N}; - std::vector new_mat2_size = {1, new_N}; - graph.resize_input(0, new_mat1_size); - graph.resize_input(1, new_mat2_size); - graph.propagate_resize(); - - for (int i = 1; i < 4; i++) { - float val_mat1 = i; - float val_mat2 = i + 1; - float val_out = (val_mat1 * val_mat2) + 2.25; - execute_graph_and_check_output(graph, {val_mat1, val_mat2}, {val_out}); - } -} - -TEST(VulkanComputeGraphOpsTest, test_dynamic_dispatch_graph) { - test_dynamic_dispatch(128, 128); -} diff --git a/backends/vulkan/tools b/backends/vulkan/tools new file mode 120000 index 00000000000..1049695e9e7 --- /dev/null +++ b/backends/vulkan/tools @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/tools \ No newline at end of file diff --git a/backends/vulkan/tools/gpuinfo/TARGETS b/backends/vulkan/tools/gpuinfo/TARGETS deleted file mode 100644 index 10e3acb4b8c..00000000000 --- a/backends/vulkan/tools/gpuinfo/TARGETS +++ /dev/null @@ -1,50 +0,0 @@ -load("@fbcode_macros//build_defs:native_rules.bzl", "buck_filegroup") -load("@fbsource//tools/build_defs:fb_xplat_cxx_binary.bzl", "fb_xplat_cxx_binary") -load( - "@fbsource//tools/build_defs:platform_defs.bzl", - "ANDROID", -) -load( - "@fbsource//xplat/executorch/backends/vulkan:targets.bzl", - "vulkan_spv_shader_lib", -) - -oncall("executorch") - -buck_filegroup( - name = "gpuinfo_shaders", - srcs = glob([ - "glsl/*", - ]), - visibility = [ - "PUBLIC", - ], -) - -vulkan_spv_shader_lib( - name = "gpuinfo_shader_lib", - is_fbcode = True, - spv_filegroups = { - ":gpuinfo_shaders": "glsl", - }, -) - -fb_xplat_cxx_binary( - name = "vulkan_gpuinfo", - srcs = glob([ - "**/*.cpp", - ]), - headers = glob([ - "**/*.h", - ]), - header_namespace = "/include", - include_directories = ["/include"], - platforms = ANDROID, - raw_headers = glob([ - "**/*.h", - ]), - deps = [ - ":gpuinfo_shader_lib", - "//executorch/backends/vulkan:vulkan_graph_runtime", - ], -) diff --git a/backends/vulkan/tools/gpuinfo/config.json b/backends/vulkan/tools/gpuinfo/config.json deleted file mode 100644 index afb5cbc6c59..00000000000 --- a/backends/vulkan/tools/gpuinfo/config.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - "reg_count": { - "enabled": true, - "threshold": 3, - "compensate": 0.1 - }, - "buf_cacheline_size": { - "enabled": true, - "threshold": 10, - "compensate": 0.1 - }, - "buffer_bandwidth": { - "enabled": true, - "range": 134217728, - "nflush": 4, - "nunroll": 16, - "niter": 10 - }, - "ubo_bandwidth": { - "enabled": true, - "range": 134217728, - "nflush": 4, - "nunroll": 16, - "niter": 10 - }, - "shared_bandwidth": { - "enabled": true, - "nflush": 4, - "nunroll": 16, - "niter": 10 - }, - "warp_size": { - "enabled": true, - "threshold": 3, - "compensate": 0.1 - }, - "tex_bandwidth": { - "enabled": true, - "nflush": 4, - "nunroll": 16, - "niter": 10 - }, - "tex_cacheline_concurr": { - "enabled": true, - "threshold": 3, - "compensate": 0.1 - } -} diff --git a/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl b/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl deleted file mode 100644 index 38c9befec6f..00000000000 --- 
a/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.glsl +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -layout(std430) buffer; - -$if MEMTYPE == "ubo": - ${layout_declare_ubo(0, "vec4", "A")} -$elif MEMTYPE == "buffer": - ${layout_declare_buffer(0, "r", "A", DTYPE, "PRECISION", False)} -$else: - ${layout_declare_buffer(0, "r", "_", DTYPE, "PRECISION", False)} - -${layout_declare_buffer(1, "w", "B", DTYPE, "PRECISION", False)} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int niter = 1; -layout(constant_id = 4) const int nvec = 1; -layout(constant_id = 5) const int local_group_size = 1; -// The address mask works as a modulo because x % 2^n == x & (2^n - 1). -// This will help us limit address accessing to a specific set of unique -// addresses depending on the access size we want to measure. -layout(constant_id = 6) const int addr_mask = 1; -layout(constant_id = 7) const int workgroup_width = 1; - -$if MEMTYPE == "shared": - shared vec4 A[nvec]; - -void main() { - - $if MEMTYPE == "shared": - A[gl_LocalInvocationID[0]][0] = gl_LocalInvocationID[0]; - memoryBarrierShared(); - - vec4 sum = vec4(0); - uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask; - - int i = 0; - for (; i < niter; ++i){ - $for j in range(int(NUNROLL)): - sum *= A[offset]; - - // On each unroll, a new unique address will be accessed through the offset, - // limited by the address mask to a specific set of unique addresses - offset = (offset + local_group_size) & addr_mask; - } - - // This is to ensure no compiler optimizations occur - vec4 zero = vec4(i>>31); - - B[gl_LocalInvocationID[0]] = sum + zero; -} diff --git a/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.yaml b/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.yaml deleted file mode 100644 index b47e6ba2a3d..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/buf_bandwidth.yaml +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -buf_bandwidth: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - NUNROLL: "16" - generate_variant_forall: - MEMTYPE: - - VALUE: ubo - - VALUE: buffer - - VALUE: shared - shader_variants: - - NAME: buf_bandwidth diff --git a/backends/vulkan/tools/gpuinfo/glsl/buf_cacheline_size.glsl b/backends/vulkan/tools/gpuinfo/glsl/buf_cacheline_size.glsl deleted file mode 100644 index d9e36376909..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/buf_cacheline_size.glsl +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} - -layout(std430) buffer; - - -${layout_declare_buffer(0, "r", "source", DTYPE)} -${layout_declare_buffer(1, "w", "destination", DTYPE)} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int niter = 1; -layout(constant_id = 4) const int stride = 1; -layout(constant_id = 5) const int pitch = 1; - -void main() { - float c = 0; - for (int i = 0; i < niter; ++i) { - const int zero = i >> 31; - c += source[zero + pitch * gl_GlobalInvocationID[0]]; - c += source[zero + stride + pitch * gl_GlobalInvocationID[0]]; - } - destination[0] = c; -} diff --git a/backends/vulkan/tools/gpuinfo/glsl/buf_cacheline_size.yaml b/backends/vulkan/tools/gpuinfo/glsl/buf_cacheline_size.yaml deleted file mode 100644 index 8570e14ea1b..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/buf_cacheline_size.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -buf_cacheline_size: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - shader_variants: - - NAME: buf_cacheline_size diff --git a/backends/vulkan/tools/gpuinfo/glsl/reg_count.glsl b/backends/vulkan/tools/gpuinfo/glsl/reg_count.glsl deleted file mode 100644 index cc63ae80c52..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/reg_count.glsl +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -layout(std430) buffer; - -${layout_declare_buffer(0, "w", "out_buff", DTYPE)} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int NITER = 1; - -void main() { - - $for k in range(int(NREG)): - float reg_data${k} = float(NITER) + ${k}; - - int i = 0; - for (; i < NITER; ++i) { - reg_data0 *= reg_data${int(NREG)-1}; - $for k in range(1, int(NREG)): - reg_data${k} *= reg_data${k-1}; - } - i = i >> 31; - - $for k in range(int(NREG)): - out_buff[${k} * i] = reg_data${k}; -} diff --git a/backends/vulkan/tools/gpuinfo/glsl/reg_count.yaml b/backends/vulkan/tools/gpuinfo/glsl/reg_count.yaml deleted file mode 100644 index ecdf87d362e..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/reg_count.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -reg_count: - parameter_names_with_default_values: - DTYPE: float - STORAGE: buffer - generate_variant_forall: - NREG: - - RANGE: [1, 512] - - shader_variants: - - NAME: reg_count diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl deleted file mode 100644 index 7ab67bd2d0a..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.glsl +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_sampler(0, "r", "A", DTYPE)} -${layout_declare_buffer(1, "w", "B", DTYPE, "PRECISION", False)} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int niter = 1; -layout(constant_id = 4) const int nvec = 1; -layout(constant_id = 5) const int local_group_size = 1; -// The address mask works as a modulo because x % 2^n == x & (2^n - 1). -// This will help us limit address accessing to a specific set of unique -// addresses depending on the access size we want to measure. -layout(constant_id = 6) const int addr_mask = 1; -layout(constant_id = 7) const int workgroup_width = 1; - -void main() { - vec4 sum = vec4(0); - uint offset = (gl_WorkGroupID[0] * workgroup_width + gl_LocalInvocationID[0]) & addr_mask; - - int i = 0; - for (; i < niter; ++i){ - VEC4_T in_texel; - $for j in range(int(NUNROLL)): - $if DIM == 0: - in_texel = texelFetch(A, ivec3(offset, 0, 0), 0); - $elif DIM == 1: - in_texel = texelFetch(A, ivec3(0, offset, 0), 0); - $elif DIM == 2: - in_texel = texelFetch(A, ivec3(0, 0, offset), 0); - - sum *= in_texel; - - // On each unroll, a new unique address will be accessed through the offset, - // limited by the address mask to a specific set of unique addresses - offset = (offset + local_group_size) & addr_mask; - } - - // This is to ensure no compiler optimizations occur - vec4 zero = vec4(i>>31); - - B[gl_LocalInvocationID[0]] = sum + zero; -} diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml b/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml deleted file mode 100644 index 84da6938fd4..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/tex_bandwidth.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -tex_bandwidth: - parameter_names_with_default_values: - DTYPE: float - NUNROLL: "16" - generate_variant_forall: - DIM: - - RANGE: [0, 2] - shader_variants: - - NAME: tex_bandwidth diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.glsl b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.glsl deleted file mode 100644 index 62659c7bb88..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.glsl +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#version 450 core - -#define PRECISION ${PRECISION} -#define VEC4_T ${texel_type(DTYPE)} - -layout(std430) buffer; - -${layout_declare_sampler(0, "r", "in_tex", DTYPE)} -${layout_declare_buffer(1, "w", "out_buf", DTYPE, "PRECISION", False)} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -layout(constant_id = 3) const int niter = 1; - -void main() { - vec4 sum = vec4(0); - int i = 0; - for (; i < niter; ++i){ - $if DIM == 0: - sum += texelFetch(in_tex, ivec3(gl_GlobalInvocationID[0], 0, 0), 0); - $elif DIM == 1: - sum += texelFetch(in_tex, ivec3(0, gl_GlobalInvocationID[0], 0), 0); - $elif DIM == 2: - sum += texelFetch(in_tex, ivec3(0, 0, gl_GlobalInvocationID[0]), 0); - } - - // This is to ensure no compiler optimizations occur - vec4 zero = vec4(i>>31); - - out_buf[0] = sum + zero; -} diff --git a/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.yaml b/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.yaml deleted file mode 100644 index 6b557c9f66e..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/tex_cacheline_concurr.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -tex_cacheline_concurr: - parameter_names_with_default_values: - DTYPE: float - generate_variant_forall: - DIM: - - RANGE: [0, 2] - shader_variants: - - NAME: tex_cacheline_concurr diff --git a/backends/vulkan/tools/gpuinfo/glsl/warp_size.glsl b/backends/vulkan/tools/gpuinfo/glsl/warp_size.glsl deleted file mode 100644 index 352ce04a5c9..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/warp_size.glsl +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#version 450 core - -#define PRECISION ${PRECISION} - -layout(std430) buffer; - -${layout_declare_buffer(0, "w", "out_buff", DTYPE)} - -layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; - -$if METHOD == "scheduler": - shared int shared_counter; -$elif METHOD == "physical": - layout(constant_id = 3) const int NITER = 1; -$else: - $raise Exception("Unsupported value for warp_size") - -void main() { - - $if METHOD == "scheduler": - shared_counter = 0; - memoryBarrierShared(); - int i = atomicAdd(shared_counter, 1); - memoryBarrierShared(); - out_buff[gl_GlobalInvocationID[0]] = i; - $else: - int sum = 0; - for (int j = 0; j < NITER; ++j) { - // Integer division is an exemplary multi-cycle instruction that can - // hardly be optimized, thus reducing the impact of latency hiding. - sum += j / 3; - barrier(); - } - out_buff[gl_GlobalInvocationID[0]] = sum; -} diff --git a/backends/vulkan/tools/gpuinfo/glsl/warp_size.yaml b/backends/vulkan/tools/gpuinfo/glsl/warp_size.yaml deleted file mode 100644 index 69587bd38d0..00000000000 --- a/backends/vulkan/tools/gpuinfo/glsl/warp_size.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
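- 
- # The METHOD axis below generates two shader variants, warp_size_scheduler
- # and warp_size_physical; the warp_size() probe in include/architecture.h
- # dispatches them by these names (naming pattern inferred from the other
- # gpuinfo shaders, e.g. buf_bandwidth_<memtype>).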
- -warp_size: - parameter_names_with_default_values: - DTYPE: int32 - STORAGE: buffer - generate_variant_forall: - METHOD: - - VALUE: scheduler - - VALUE: physical - shader_variants: - - NAME: warp_size diff --git a/backends/vulkan/tools/gpuinfo/include/app.h b/backends/vulkan/tools/gpuinfo/include/app.h deleted file mode 100644 index a46e9e6b9ae..00000000000 --- a/backends/vulkan/tools/gpuinfo/include/app.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include -#include - -#include "utils.h" - -namespace gpuinfo { - -class App { - private: - folly::dynamic config_; - - public: - size_t buf_cache_size; - uint32_t max_shared_mem_size; - uint32_t sm_count; - uint32_t nthread_logic; - uint32_t subgroup_size; - uint32_t max_tex_width; - uint32_t max_tex_height; - uint32_t max_tex_depth; - - App() { - context()->initialize_querypool(); - - std::cout << context()->adapter_ptr()->stringize() << std::endl - << std::endl; - - auto cl_device = get_cl_device(); - - sm_count = cl_device.getInfo(); - nthread_logic = cl_device.getInfo(); - buf_cache_size = cl_device.getInfo(); - max_shared_mem_size = cl_device.getInfo(); - max_tex_width = cl_device.getInfo(); - max_tex_height = cl_device.getInfo(); - max_tex_depth = cl_device.getInfo(); - - VkPhysicalDeviceSubgroupProperties subgroup_props{}; - VkPhysicalDeviceProperties2 props2{}; - - props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; - props2.pNext = &subgroup_props; - subgroup_props.sType = - VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES; - vkGetPhysicalDeviceProperties2( - context()->adapter_ptr()->physical_handle(), &props2); - subgroup_size = subgroup_props.subgroupSize; - - std::cout << std::endl; - std::cout << "SM count," << sm_count << std::endl; - std::cout << "Logic Thread Count," << nthread_logic << std::endl; - std::cout << "Cache Size," << buf_cache_size << std::endl; - std::cout << "Shared Memory Size," << max_shared_mem_size << std::endl; - std::cout << "SubGroup Size," << subgroup_size << std::endl; - std::cout << "MaxTexWidth," << max_tex_width << std::endl; - std::cout << "MaxTexHeight," << max_tex_height << std::endl; - std::cout << "MaxTexDepth," << max_tex_depth << std::endl; - } - - float get_config(const std::string& test, const std::string& key) const { - if (config_[test].empty()) { - throw std::runtime_error("Missing config for " + test); - } - - if (!config_[test][key].isNumber()) { - throw std::runtime_error( - "Config for " + test + "." + key + " is not a number"); - } - - float value; - if (config_[test][key].isDouble()) { - value = config_[test][key].getDouble(); - } else { - value = config_[test][key].getInt(); - } - - std::cout << "Read value for " << test << "." 
<< key << " = " << value - << std::endl; - return value; - } - - bool enabled(const std::string& test) const { - if (config_.empty() || config_[test].empty() || - !config_[test]["enabled"].isBool()) { - return true; - } - return config_[test]["enabled"].getBool(); - } - - void load_config(std::string file_path) { - std::ifstream file(file_path); - std::stringstream buffer; - buffer << file.rdbuf(); - const std::string json_str = buffer.str(); - if (json_str.empty()) { - throw std::runtime_error( - "Failed to read config file from " + file_path + "."); - } - config_ = folly::parseJson(json_str); - } -}; -} // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/architecture.h b/backends/vulkan/tools/gpuinfo/include/architecture.h deleted file mode 100644 index 9af908eb170..00000000000 --- a/backends/vulkan/tools/gpuinfo/include/architecture.h +++ /dev/null @@ -1,285 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include "app.h" -#include "stats.h" -#include "utils.h" - -using namespace vkapi; - -namespace gpuinfo { - -void reg_count(const App& app) { - if (!app.enabled("reg_count")) { - std::cout << "Skipped Register Count" << std::endl; - return; - } - - std::cout << std::endl; - std::cout << "------ Register Count ------" << std::endl; - const uint32_t NREG_MIN = 1; - const uint32_t NREG_MAX = 512; - const uint32_t NREG_STEP = 1; - - const double COMPENSATE = app.get_config("reg_count", "compensate"); - const double THRESHOLD = app.get_config("reg_count", "threshold"); - - const uint32_t NGRP_MIN = 1; - const uint32_t NGRP_MAX = 64; - const uint32_t NGRP_STEP = 1; - - uint32_t NITER; - - auto bench = [&](uint32_t ngrp, uint32_t nreg) { - StagingBuffer buffer(context(), vkapi::kFloat, 1); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "reg_count_" + std::to_string(nreg); - - auto time = benchmark_on_gpu(shader_name, 30, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {1, ngrp, 1}, - {1, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - buffer.buffer()); - }); - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1, NREG_MIN); }); - - uint32_t nreg_max; - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t nreg = NREG_MIN; - for (; nreg <= NREG_MAX; nreg += NREG_STEP) { - double time = bench(1, nreg); - std::cout << "Testing nreg=\t" << nreg << "\tTime=\t" << time << "\tus" - << std::endl; - if (dj.push(time)) { - nreg -= NREG_STEP; - nreg_max = nreg; - break; - } - } - if (nreg >= NREG_MAX) { - std::cout << "Unable to conclude a maximal register count" << std::endl; - nreg_max = NREG_STEP; - } else { - std::cout << nreg_max << " registers are available at most" << std::endl; - } - - auto find_ngrp_by_nreg = [&](const uint32_t nreg) { - DtJumpFinder<3> dj(COMPENSATE, THRESHOLD); - for (auto ngrp = NGRP_MIN; ngrp <= NGRP_MAX; ngrp += NGRP_STEP) { - auto time = bench(ngrp, nreg); - std::cout << "Testing occupation (nreg=\t" << nreg << "\t); ngrp=\t" - << ngrp << "\t, time=\t" << time << "\tus" << std::endl; - - if (dj.push(time)) { - ngrp -= NGRP_STEP; - std::cout << "Using " << nreg << " registers can have " << ngrp - << " concurrent single-thread workgroups" << std::endl; - return ngrp; - } - } - std::cout - << "Unable to conclude a maximum number of concurrent 
single-thread workgroups when " - << nreg << " registers are occupied" << std::endl; - return (uint32_t)1; - }; - - uint32_t ngrp_full, ngrp_half; - ngrp_full = find_ngrp_by_nreg(nreg_max); - ngrp_half = find_ngrp_by_nreg(nreg_max / 2); - - std::string reg_ty; - - if (ngrp_full * 1.5 < ngrp_half) { - std::cout << "All physical threads in an sm share " << nreg_max - << " registers" << std::endl; - reg_ty = "Pooled"; - - } else { - std::cout << "Each physical thread has " << nreg_max << " registers" - << std::endl; - reg_ty = "Dedicated"; - } - - std::cout << std::endl << std::endl; - std::cout << "MaxRegisters," << nreg_max << std::endl; - std::cout << "ConcurrentWorkgroupsFullReg," << ngrp_full << std::endl; - std::cout << "ConcurrentWorkgroupsHalfReg," << ngrp_half << std::endl; - std::cout << "RegisterType," << reg_ty << std::endl; -} - -// Warp size is a difficult metric to obtain because the hardware limitations -// do not always coincide with the way the SM divides the workload. For -// instance, the hardware can have a warp size of 64 threads, but an SM might -// be able to simulate concurrency of 128 threads with a single scheduler. - -// Because of this, it is important to measure the warp size different ways, -// that can evidence both the physical limitations of the hardware, and the -// actual behavior of the driver. - -// Additionally,the SM can behave in two different ways when the assigned -// workload is smaller than the warp size. - -// In Case 1, like ARM Mali, the SM can assign dummy workloads to fill empty -// threads and maintain a uniform workload. - -// In Case 2, like in Adreno, the driver might decide to pack multiple works -// together and dispatch them at once. -void warp_size(const App& app, const bool verbose = false) { - if (!app.enabled("warp_size")) { - std::cout << "Skipped Warp Size" << std::endl; - return; - } - - std::cout << "\n------ Warp Size ------" << std::endl; - - // Method A: Stress test with a kernel that uses complex ALU operations like - // integer division to avoid latency hiding. Increase the number of threads - // until a jump in latency is detected. - - // This timing-based method helps us identify physical warp sizes. It also - // helps with Case 2, when threads of multiple warps are managed by the same - // scheduler at the same time. - const double COMPENSATE = app.get_config("warp_size", "compensate"); - const double THRESHOLD = app.get_config("warp_size", "threshold"); - - uint32_t NITER; - - auto bench = [&](uint32_t nthread) { - StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "warp_size_physical"; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - // Large number of work groups selected to potentially saturate all - // ALUs and thus have a better baseline for comparison. - {nthread, 1024, 1}, - {nthread, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - out_buf.buffer()); - }); - - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - uint32_t warp_size = app.subgroup_size; - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - - // We increase the number of threads until we hit a jump in the data. 
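- //
- // Illustrative (hypothetical) numbers: on a device with a 16-wide warp,
- // nthread = 1..16 should take roughly the same time since they fit in one
- // warp, while nthread = 17 spills into a second warp and latency jumps.
- // DtJumpFinder (see stats.h) flags the jump when the latest delta-time
- // deviates from its running average by more than THRESHOLD times that
- // average (COMPENSATE is a small additive fudge so sequences of identical
- // timings are handled smoothly); the warp size is then taken as nthread - 1.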
- uint32_t nthread = 1; - for (; nthread <= app.nthread_logic; ++nthread) { - double time = bench(nthread); - std::cout << "nthread=\t" << nthread << "\t(\t" << time << "\tus)" - << std::endl; - if (dj.push(time)) { - warp_size = nthread - 1; - break; - } - } - if (nthread >= app.nthread_logic) { - std::cout - << "Unable to conclude a physical warp size. Assuming warp_size == subgroup_size" - << std::endl; - } - - // Method B: Let all the threads in a warp race and atomically fetch-add - // a counter, then store the counter values to the output buffer in the - // scheduling order of these threads. If all the order numbers follow an - // ascending order, then the threads are likely executing within a warp. - // Threads in different warps are not managed by the same scheduler, so they - // would race for a same ID out of order, unaware of each other. - - // This method evidences the actual driver behavior when running - // concurrency, regardless of the physical limitations of the hardware. - - // Likewise, this method helps us identify warp sizes when the SM - // sub-divides its ALUs into independent groups, like the three execution - // engines in a Mali G76 core. It helps warp-probing in Case 1 because it - // doesn't depend on kernel timing, so the extra wait time doesn't lead to - // inaccuracy. - auto bench_sm = [&](uint32_t nthread) { - StagingBuffer out_buf(context(), vkapi::kInt, app.nthread_logic); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "warp_size_scheduler"; - - benchmark_on_gpu(shader_name, 1, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {nthread, 1, 1}, - {nthread, 1, 1}, - {}, - VK_NULL_HANDLE, - 0, - out_buf.buffer()); - }); - - std::vector data(app.nthread_logic); - out_buf.copy_to(data.data(), out_buf.nbytes()); - - if (verbose) { - std::stringstream ss; - for (auto j = 0; j < nthread; ++j) { - ss << data[j] << " "; - } - std::cout << ss.str() << std::endl; - } - - // Check until which point is the data in ascending order. - int32_t last = -1; - int32_t j = 0; - for (; j < nthread; ++j) { - if (last >= data[j]) { - break; - } - last = data[j]; - } - - return j; - }; - - // Test increasing sizes until the data is no longer in ascending order. - uint32_t warp_size_scheduler = warp_size; - int i = 1; - for (; i <= app.nthread_logic; ++i) { - uint32_t nascend = bench_sm(i); - if (nascend != i) { - warp_size_scheduler = nascend; - break; - } - } - if (i > app.nthread_logic) { - std::cout << "Unable to conclude an SM Warp Size." << std::endl; - } - - std::cout << "PhysicalWarpSize," << warp_size << std::endl; - std::cout << "SMWarpSize," << warp_size_scheduler << std::endl; -} -}; // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/buffers.h b/backends/vulkan/tools/gpuinfo/include/buffers.h deleted file mode 100644 index 31137b11eea..00000000000 --- a/backends/vulkan/tools/gpuinfo/include/buffers.h +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include "app.h" -#include "stats.h" -#include "utils.h" - -using namespace vkapi; - -namespace gpuinfo { - -void buf_cacheline_size(const App& app) { - if (!app.enabled("buf_cacheline_size")) { - std::cout << "Skipped Buffer Cacheline Size" << std::endl; - return; - } - - std::cout << std::endl; - std::cout << "------ Buffer Cacheline Size ------" << std::endl; - - const double COMPENSATE = app.get_config("buf_cacheline_size", "compensate"); - const double THRESHOLD = app.get_config("buf_cacheline_size", "threshold"); - - const uint32_t PITCH = app.buf_cache_size / app.nthread_logic; - const uint32_t BUF_SIZE = app.buf_cache_size; - const uint32_t MAX_STRIDE = PITCH; - - uint32_t NITER; - - auto bench = [&](int stride) { - StagingBuffer in_buf(context(), vkapi::kFloat, BUF_SIZE); - StagingBuffer out_buf(context(), vkapi::kFloat, 1); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "buf_cacheline_size"; - - auto time = benchmark_on_gpu(shader_name, 100, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {app.nthread_logic, 1, 1}, - {app.nthread_logic, 1, 1}, - {SV(NITER), SV(stride), SV(PITCH)}, - VK_NULL_HANDLE, - 0, - in_buf.buffer(), - out_buf.buffer()); - }); - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - uint32_t cacheline_size; - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t stride = 1; - for (; stride <= MAX_STRIDE; ++stride) { - double time = bench(stride); - std::cout << "Testing stride=\t" << stride << "\t, time=\t" << time - << std::endl; - - if (dj.push(time)) { - cacheline_size = stride * sizeof(float); - break; - } - } - if (stride >= MAX_STRIDE) { - std::cout << "Unable to conclude a top level buffer cacheline size." - << std::endl; - cacheline_size = MAX_STRIDE * sizeof(float); - } - - std::cout << "BufTopLevelCachelineSize," << cacheline_size << std::endl; -} - -void _bandwidth( - const App& app, - const std::string memtype, - const uint32_t range) { - auto memtype_lower = memtype; - std::transform( - memtype_lower.begin(), - memtype_lower.end(), - memtype_lower.begin(), - [](unsigned char c) { return std::tolower(c); }); - - auto test_name = memtype_lower + "_bandwidth"; - - // Cache lines flushed - const uint32_t NFLUSH = app.get_config(test_name, "nflush"); - // Number of loop unrolls. Changing this value requires an equal change in - // buf_bandwidth.yaml - const uint32_t NUNROLL = app.get_config(test_name, "nunroll"); - // Number of iterations. Increasing this value reduces noise in exchange for - // higher latency. - const uint32_t NITER = app.get_config(test_name, "niter"); - // Vector dimensions (vec4) - const uint32_t VEC_WIDTH = 4; - const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); - // Number of vectors that fit in the selected memory space - const uint32_t NVEC = range / VEC_SIZE; - // Number of memory reads per thread - const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; - // Number of threads needed to read al l vectors - // The thread count doesn't divide by thread workload in shared memory - // because of the limited memory size. - const uint32_t NTHREAD = memtype == "Shared" ? 
NVEC : NVEC / NREAD_PER_THREAD; - // Occupy all threads - const uint32_t local_x = app.nthread_logic; - // Ensure that global is a multiple of local, and distribute across all SMs - const uint32_t global_x = - (NTHREAD / local_x * local_x) * app.sm_count * NFLUSH; - - auto bench = [&](uint32_t access_size) { - // Number of vectors that fit in this iteration - const uint32_t nvec_access = access_size / VEC_SIZE; - - // The address mask works as a modulo because x % 2^n == x & (2^n - 1). - // This will help us limit address accessing to a specific set of unique - // addresses depending on the access size we want to measure. - const uint32_t addr_mask = nvec_access - 1; - - // This is to distribute the accesses to unique addresses across the - // workgroups, once the size of the access excedes the workgroup width. - const uint32_t workgroup_width = local_x * NITER * NUNROLL; - - StagingBuffer in_buf(context(), vkapi::kFloat, range / sizeof(float)); - StagingBuffer out_buf( - context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "buf_bandwidth_" + memtype_lower; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {global_x, 1, 1}, - {local_x, 1, 1}, - {SV(NITER), - SV(nvec_access), - SV(local_x), - SV(addr_mask), - SV(workgroup_width)}, - VK_NULL_HANDLE, - 0, - in_buf.buffer(), - out_buf.buffer()); - }); - - const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; - auto gbps = SIZE_TRANS * 1e-3 / time; - std::cout << memtype << " bandwidth accessing \t" << access_size - << "\tB unique data is \t" << gbps << " \tgbps (\t" << time - << "\tus)" << std::endl; - return gbps; - }; - - double max_bandwidth = 0; - double min_bandwidth = DBL_MAX; - for (uint32_t access_size = VEC_SIZE; access_size < range; access_size *= 2) { - double gbps = bench(access_size); - max_bandwidth = std::max(gbps, max_bandwidth); - min_bandwidth = std::min(gbps, min_bandwidth); - } - - std::cout << "Max" << memtype << "Bandwidth (GB/s)," << max_bandwidth - << std::endl; - std::cout << "Min" << memtype << "Bandwidth (GB/s)," << min_bandwidth - << std::endl; -} - -void buf_bandwidth(const App& app) { - if (!app.enabled("buffer_bandwidth")) { - std::cout << "Skipped Memory Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ Memory Bandwidth ------" << std::endl; - // Maximum memory space read - 128MB - // For regular devices, bandwidth plateaus at less memory than this, so more - // is not needed. 
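- // The shipped config.json sets "range" for this test to 134217728 bytes
- // (128 MiB). _bandwidth() then sweeps the unique working set from a single
- // vec4 (16 B) up to this range, doubling each step, and reports the maximum
- // and minimum bandwidth observed across the sweep.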
- const uint32_t RANGE = app.get_config("buffer_bandwidth", "range"); - _bandwidth(app, "Buffer", RANGE); -} - -void ubo_bandwidth(const App& app) { - if (!app.enabled("ubo_bandwidth")) { - std::cout << "Skipped UBO Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ UBO Bandwidth ------" << std::endl; - const uint32_t RANGE = app.get_config("ubo_bandwidth", "range"); - _bandwidth(app, "UBO", RANGE); -} - -void shared_mem_bandwidth(const App& app) { - if (!app.enabled("shared_bandwidth")) { - std::cout << "Skipped Shared Memory Bandwidth" << std::endl; - return; - } - - std::cout << "\n------ Shared Bandwidth ------" << std::endl; - const uint32_t RANGE = app.max_shared_mem_size; - _bandwidth(app, "Shared", RANGE); -} -} // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/stats.h b/backends/vulkan/tools/gpuinfo/include/stats.h deleted file mode 100644 index 123ed0d8bcb..00000000000 --- a/backends/vulkan/tools/gpuinfo/include/stats.h +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Portions (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - * Code sourced from - * https://github.com/microsoft/ArchProbe/blob/main/include/stats.hpp with the - * following MIT license - * - * MIT License - * - * Copyright (c) Microsoft Corporation. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE - */ - -#pragma once -#include -#include - -template -class AvgStats { - T sum_ = 0; - uint64_t n_ = 0; - - public: - typedef T value_t; - - void push(T value) { - sum_ += value; - n_ += 1; - } - inline bool has_value() const { - return n_ != 0; - } - operator T() const { - return sum_ / n_; - } -}; - -template -class NTapAvgStats { - std::array hist_; - size_t cur_idx_; - bool ready_; - - public: - typedef T value_t; - - void push(T value) { - hist_[cur_idx_++] = value; - if (cur_idx_ >= NTap) { - cur_idx_ = 0; - ready_ = true; - } - } - inline bool has_value() const { - return ready_; - } - operator T() const { - double out = 0.0; - for (double x : hist_) { - out += x; - } - out /= NTap; - return out; - } -}; - -template -struct DtJumpFinder { - private: - NTapAvgStats time_avg_; - AvgStats dtime_avg_; - double compensation_; - double threshold_; - - public: - // Compensation is a tiny additive to give on delta time so that the algorithm - // works smoothly when a sequence of identical timing is ingested, which is - // pretty common in our tests. Threshold is simply how many times the new - // delta has to be to be recognized as a deviation. - DtJumpFinder(double compensation = 0.01, double threshold = 10) - : time_avg_(), - dtime_avg_(), - compensation_(compensation), - threshold_(threshold) {} - - // Returns true if the delta time regarding to the last data point seems - // normal; returns false if it seems the new data point is too much away from - // the historical records. - bool push(double time) { - if (time_avg_.has_value()) { - double dtime = std::abs(time - time_avg_) + (compensation_ * time_avg_); - if (dtime_avg_.has_value()) { - double ddtime = std::abs(dtime - dtime_avg_); - if (ddtime > threshold_ * dtime_avg_) { - return true; - } - } - dtime_avg_.push(dtime); - } - time_avg_.push(time); - return false; - } - - double dtime_avg() const { - return dtime_avg_; - } - double compensate_time() const { - return compensation_ * time_avg_; - } -}; diff --git a/backends/vulkan/tools/gpuinfo/include/textures.h b/backends/vulkan/tools/gpuinfo/include/textures.h deleted file mode 100644 index c9ff133f1ec..00000000000 --- a/backends/vulkan/tools/gpuinfo/include/textures.h +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include "app.h" -#include "stats.h" -#include "utils.h" - -namespace gpuinfo { - -// Textures are drastically different from buffers in terms of data layout. -// While buffers are a contiguous range of memory, textures are opaque objects -// defined by the vendor and it is possible that nearby points of data are not -// neighboring in memory. Likewise, data points are accessed in -// multi-dimensional patches instead of simple lines. This makes the stride -// method for figuring out the cache line size not applicable. To go around -// this, this experiment runs an increasing amount of threads accessing -// different datapoints in the texture and measures latency. 
If the cache line -// is big enough to contain all requested data for the amount of threads, -// latency will be low. When there are more threads and hence more data than -// what a single cache line can handle, a second line must be fetched, -// increasing latency in a measurable way. -void tex_cacheline_concurr(const App& app) { - if (!app.enabled("tex_cacheline_concurr")) { - std::cout << "Skipped Texture Cacheline Optimal Concurrency" << std::endl; - return; - } - - const uint32_t TEXEL_WIDTH = 4; - const uint32_t TEXEL_SIZE = sizeof(float) * TEXEL_WIDTH; - - const double COMPENSATE = - app.get_config("tex_cacheline_concurr", "compensate"); - const double THRESHOLD = app.get_config("tex_cacheline_concurr", "threshold"); - - for (int dim = 0; dim < 3; ++dim) { - std::cout << std::endl; - std::cout << "------ Texture Cacheline Optimal Concurrency (dim = " << dim - << ") ------" << std::endl; - - uint32_t NITER; - - const uint32_t IMG_OTHER_EDGE = dim == 0 ? app.max_tex_width - : dim == 1 ? app.max_tex_height - : app.max_tex_depth; - - const uint32_t MAX_NTHREAD = std::min(app.nthread_logic, IMG_OTHER_EDGE); - - auto bench = [&](uint32_t nthread) { - std::vector sizes_whd = { - app.max_tex_width, app.max_tex_height, app.max_tex_depth}; - - auto sizes_nchw = whd_to_nchw(sizes_whd); - - vTensor in_tensor = - api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); - - StagingBuffer out_buf(context(), vkapi::kFloat, TEXEL_WIDTH); - - vkapi::PipelineBarrier pipeline_barrier{}; - - auto shader_name = "tex_cacheline_concurr_" + std::to_string(dim); - - auto time = benchmark_on_gpu(shader_name, 100, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {nthread, 1, 1}, - {nthread, 1, 1}, - {SV(NITER)}, - VK_NULL_HANDLE, - 0, - in_tensor.image(), - out_buf.buffer()); - }); - return time; - }; - - ensure_min_niter(1000, NITER, [&]() { return bench(1); }); - - DtJumpFinder<5> dj(COMPENSATE, THRESHOLD); - uint32_t nthread = 1; - for (; nthread <= MAX_NTHREAD; ++nthread) { - double time = bench(nthread); - std::cout << "Testing nthread=\t" << nthread << "\t, time=\t" << time - << std::endl; - - if (dj.push(time)) { - auto max_concurrency = nthread - 1; - std::cout << "TextureCachelineConcurrencyDim" << dim << " (B)," - << max_concurrency * TEXEL_SIZE << std::endl; - break; - } - } - if (nthread >= MAX_NTHREAD) { - std::cout - << "Unable to conclude an optimal texture cacheline concurrency for dim " - << dim << std::endl; - }; - } - - // TODO: Use concurrency information to obtain the cache line size for - // textures as done in https://fburl.com/98xiou3g -} - -void tex_bandwidth(const App& app) { - if (!app.enabled("tex_bandwidth")) { - std::cout << "Skipped Texture Bandwidth" << std::endl; - return; - } - - for (int dim = 0; dim < 3; dim++) { - std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------" - << std::endl; - const uint32_t MAX_SIZE = dim == 0 ? app.max_tex_width - : dim == 1 ? app.max_tex_height - : app.max_tex_depth; - - // rgba, float - const uint32_t VEC_WIDTH = 4; - const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float); - const uint32_t NVEC = MAX_SIZE; - - const uint32_t RANGE = NVEC * VEC_SIZE; - - // Cache lines flushed - const uint32_t NFLUSH = app.get_config("tex_bandwidth", "nflush"); - // Number of loop unrolls. Changing this value requires an equal change in - // tex_bandwidth.yaml - const uint32_t NUNROLL = app.get_config("tex_bandwidth", "nunroll"); - // Number of iterations. 
Increasing this value reduces noise in exchange - // for higher latency. - const uint32_t NITER = app.get_config("tex_bandwidth", "niter"); - // Number of memory reads per thread - const uint32_t NREAD_PER_THREAD = NUNROLL * NITER; - // Number of threads needed to read all texells - const uint32_t NTHREAD = NVEC; - // Occupy all threads - const uint32_t local_x = app.nthread_logic; - // Ensure that global is a multiple of local, and distribute across all - // SMs - const uint32_t global_x = - (NTHREAD / local_x * local_x) * app.sm_count * NFLUSH; - - auto shader_name = "tex_bandwidth_" + std::to_string(dim); - - std::vector sizes_whd = {MAX_SIZE, 1, 1}; - if (dim == 1) { - sizes_whd = {1, MAX_SIZE, 1}; - } else if (dim == 2) { - sizes_whd = {1, 1, MAX_SIZE}; - } - auto sizes_nchw = whd_to_nchw(sizes_whd); - - vTensor in_tensor = api::vTensor(api::context(), sizes_nchw, vkapi::kFloat); - - auto bench = [&](uint32_t access_size, uint32_t dim) { - // Number of texels that fit in this iteration - const uint32_t ntexel_access = access_size / VEC_SIZE; - - // The address mask works as a modulo because x % 2^n == x & (2^n - 1). - // This will help us limit address accessing to a specific set of unique - // addresses depending on the access size we want to measure. - const uint32_t addr_mask = ntexel_access - 1; - - // This is to distribute the accesses to unique addresses across the - // workgroups, once the size of the access excedes the workgroup width. - const uint32_t workgroup_width = local_x * NITER * NUNROLL; - - StagingBuffer out_buf( - context(), vkapi::kFloat, VEC_WIDTH * app.nthread_logic); - vkapi::PipelineBarrier pipeline_barrier{}; - - auto time = benchmark_on_gpu(shader_name, 10, [&]() { - context()->submit_compute_job( - VK_KERNEL_FROM_STR(shader_name), - pipeline_barrier, - {global_x, 1, 1}, - {local_x, 1, 1}, - {SV(NITER), - SV(ntexel_access), - SV(local_x), - SV(addr_mask), - SV(workgroup_width)}, - VK_NULL_HANDLE, - 0, - in_tensor.image(), - out_buf.buffer()); - }); - - const uint32_t SIZE_TRANS = global_x * NREAD_PER_THREAD * VEC_SIZE; - double gbps = SIZE_TRANS * 1e-3 / time; - std::cout << "Texture bandwidth accessing \t" << access_size - << "\tB unique data is \t" << gbps << " \tgbps (\t" << time - << "\tus)" << std::endl; - return gbps; - }; - - double max_bandwidth = 0; - double min_bandwidth = DBL_MAX; - for (uint32_t access_size = VEC_SIZE; access_size < RANGE; - access_size *= 2) { - double gbps = bench(access_size, dim); - max_bandwidth = std::max(gbps, max_bandwidth); - min_bandwidth = std::min(gbps, min_bandwidth); - } - - std::cout << "MaxTextureBandwidthDim" << dim << "(GB/s)," << max_bandwidth - << std::endl; - std::cout << "MinTextureBandwidthDim" << dim << "(GB/s)," << min_bandwidth - << std::endl; - } -} -} // namespace gpuinfo diff --git a/backends/vulkan/tools/gpuinfo/include/utils.h b/backends/vulkan/tools/gpuinfo/include/utils.h deleted file mode 100644 index 887cb443ef4..00000000000 --- a/backends/vulkan/tools/gpuinfo/include/utils.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include - -#define CL_TARGET_OPENCL_VERSION 200 -#define CL_HPP_TARGET_OPENCL_VERSION CL_TARGET_OPENCL_VERSION -#include - -using namespace vkcompute; -using namespace api; - -#define QP context()->querypool() - -auto benchmark_on_gpu( - std::string shader_id, - uint32_t niter, - std::function encode_kernel) { - auto fence = context()->fences().get_fence(); - - for (int i = 0; i < niter; ++i) { - encode_kernel(); - }; - - context()->submit_cmd_to_gpu(fence.get_submit_handle()); - fence.wait(); - QP.extract_results(); - uint64_t count = QP.get_mean_shader_ns(shader_id); - QP.reset_state(); - context()->flush(); - - return count / 1000.f; -} - -void ensure_min_niter( - double min_time_us, - uint32_t& niter, - std::function run) { - const uint32_t DEFAULT_NITER = 100; - niter = DEFAULT_NITER; - for (uint32_t i = 0; i < 100; ++i) { - double t = run(); - if (t > min_time_us * 0.99) { - return; - } - niter = uint32_t(niter * min_time_us / t); - } -} - -std::vector whd_to_nchw(std::vector sizes) { - const int64_t W = sizes[0]; - const int64_t H = sizes[1]; - const int64_t D = sizes[2]; - - // Channels-packed: {W, H, D} = {W, H, (C / 4) * N} - return {1, D * 4, H, W}; -} - -cl_platform_id get_cl_platform_id() { - cl_uint nplatform_id; - clGetPlatformIDs(0, nullptr, &nplatform_id); - std::vector platform_ids; - platform_ids.resize(nplatform_id); - clGetPlatformIDs(nplatform_id, platform_ids.data(), nullptr); - return platform_ids[0]; -} - -cl_device_id get_cl_dev_id(cl_platform_id platform_id) { - cl_uint ndev_id; - clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 0, nullptr, &ndev_id); - std::vector dev_ids; - dev_ids.resize(ndev_id); - clGetDeviceIDs( - platform_id, CL_DEVICE_TYPE_ALL, ndev_id, dev_ids.data(), nullptr); - return dev_ids[0]; -} - -cl::Device get_cl_device() { - auto platform_id = get_cl_platform_id(); - auto dev_id = get_cl_dev_id(platform_id); - cl::Device dev(dev_id); - return dev; -} diff --git a/backends/vulkan/tools/gpuinfo/src/main.cpp b/backends/vulkan/tools/gpuinfo/src/main.cpp deleted file mode 100644 index f0e29aaf1ae..00000000000 --- a/backends/vulkan/tools/gpuinfo/src/main.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include "app.h" -#include "architecture.h" -#include "buffers.h" -#include "textures.h" - -using namespace vkapi; - -int main(int argc, const char** argv) { - gpuinfo::App app; - - std::string file_path = "config.json"; - if (argc > 1) { - file_path = argv[1]; - }; - app.load_config(file_path); - - // Architecture - gpuinfo::reg_count(app); - gpuinfo::warp_size(app); - - // Buffers - gpuinfo::buf_cacheline_size(app); - gpuinfo::buf_bandwidth(app); - gpuinfo::ubo_bandwidth(app); - gpuinfo::shared_mem_bandwidth(app); - - // Textures - gpuinfo::tex_bandwidth(app); - gpuinfo::tex_cacheline_concurr(app); - - return 0; -} diff --git a/backends/vulkan/utils.py b/backends/vulkan/utils.py deleted file mode 100644 index 1291eb62936..00000000000 --- a/backends/vulkan/utils.py +++ /dev/null @@ -1,1305 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
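To make the arithmetic used by the probes and helpers above concrete, the power-of-two address mask, the GB/s conversion, and the `whd_to_nchw` mapping can be checked in a few lines of Python; every number here is made up for illustration and nothing below is part of the deleted sources:

```
# Address mask: for a power-of-two texel count n, x % n == x & (n - 1).
VEC_SIZE = 4 * 4                        # rgba float texel, in bytes
ntexel_access = 1024 // VEC_SIZE        # texels touched for a 1 KiB access size
addr_mask = ntexel_access - 1
assert all(i % ntexel_access == i & addr_mask for i in range(4096))

# Bandwidth: bytes moved divided by time. With time in microseconds,
# bytes * 1e-3 / us yields GB/s, since 1 GB/s == 1000 bytes per microsecond.
size_trans = 64 * 1024 * 1024           # total bytes read by the dispatch
time_us = 2000.0
print(size_trans * 1e-3 / time_us)      # ~33.55 GB/s for these made-up numbers

# whd_to_nchw: a channels-packed (W, H, D) image corresponds to an NCHW tensor
# of size [1, 4 * D, H, W], because every texel packs four channel values.
def whd_to_nchw(sizes):
    w, h, d = sizes
    return [1, d * 4, h, w]

assert whd_to_nchw([64, 32, 8]) == [1, 32, 32, 64]
```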
- -import operator -from typing import Any, List, Optional, Set, Tuple, Union - -import torch - -from executorch.backends.vulkan.serialization.vulkan_graph_schema import ( - VkMemoryLayout, - VkStorageType, -) - -from executorch.exir.backend.canonical_partitioners.config_partitioner import ( - format_target_name, -) - -from executorch.exir.dialects.edge._ops import EdgeOpOverload - -from executorch.exir.tensor import TensorSpec - -from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param - -from torch._subclasses.fake_tensor import FakeTensor, FakeTensorConverter - -from torch.export import ExportedProgram - -from torch.export.exported_program import InputKind -from torch.export.graph_signature import TensorArgument - -TorchOpType = Union[EdgeOpOverload, torch._ops.OpOverload, str] - -_DQ_OPS = { - "dequantize_per_tensor.tensor", - "dequantize_per_tensor.default", - "dequantize_per_channel.default", - "dequantize_per_channel_group.default", - "dequantize_per_token.default", - "dequantize_affine.default", -} - -_Q_OPS = { - "quantize_per_tensor.tensor", - "quantize_per_tensor.default", - "quantize_per_channel.default", - "quantize_per_token.default", - "quantize_affine.default", -} - -## -## Node type determination -## - -# Convenience type -MaybeNodeList = Union[torch.fx.Node, List[torch.fx.Node], Tuple[torch.fx.Node]] - - -def is_torch_op_node(node: torch.fx.Node) -> bool: - if node.op != "call_function": - return False - - if isinstance(node.target, EdgeOpOverload): - return True - if isinstance(node.target, torch._ops.OpOverload): - return True - - return False - - -def is_dequant_node(node: torch.fx.Node) -> bool: - if node.op != "call_function": - return False - node_name = format_target_name(node.target.__name__) # pyre-ignore - return node_name in _DQ_OPS - - -def is_quant_node(node: torch.fx.Node) -> bool: - if node.op != "call_function": - return False - node_name = format_target_name(node.target.__name__) # pyre-ignore - return node_name in _Q_OPS - - -def is_dequant_per_channel_node(node: torch.fx.Node) -> bool: - if node.op != "call_function": - return False - node_name = format_target_name(node.target.__name__) # pyre-ignore - return node_name == "dequantize_per_channel.default" - - -def is_linear_node(node: torch.fx.Node) -> bool: - if node.op != "call_function": - return False - node_name = format_target_name(node.target.__name__) # pyre-ignore - return node_name == "linear.default" - - -def is_get_attr_node(node: torch.fx.Node) -> bool: - return isinstance(node, torch.fx.Node) and node.op == "get_attr" - - -def is_constant(program: ExportedProgram, node: torch.fx.Node) -> bool: - return node.name in program.graph_signature.inputs_to_lifted_tensor_constants - - -def is_param_node(program: ExportedProgram, node: torch.fx.Node) -> bool: - """ - Check if the given node is a parameter within the exported program - """ - return ( - is_get_attr_node(node) - or is_param(program, node) - or is_buffer(program, node) - or is_constant(program, node) - ) - - -def is_mutable_buffer_node( - node: torch.fx.Node, exported_program: ExportedProgram -) -> bool: - if node.target not in exported_program.graph_signature.inputs_to_buffers: - return False - buf = exported_program.graph_signature.inputs_to_buffers[node.target] - return buf in exported_program.graph_signature.buffers_to_mutate.values() - - -def is_symint_node(node: torch.fx.Node) -> bool: - """ - Returns true if the given node produces a SymInt value - """ - if "val" not in node.meta: - return False - - if 
isinstance(node.meta["val"], torch.SymInt): - return True - - return False - - -def is_single_tensor_node(node: torch.fx.Node) -> bool: - """ - Returns true if the given node produces a single tensor value - """ - if "val" not in node.meta: - return False - - if isinstance(node.meta["val"], FakeTensor): - return True - - return False - - -def is_tensor_collection_node(node: Any) -> bool: - """ - Returns true if the given node produces a collection of tensor values - """ - if not isinstance(node, torch.fx.Node): - return False - - if "val" not in node.meta: - return False - - if isinstance(node.meta["val"], list) or isinstance(node.meta["val"], tuple): - return all(isinstance(x, FakeTensor) for x in node.meta["val"]) - - return False - - -def is_tensor_node(node: Any) -> bool: - """ - Returns true if the given node produces a tensor value, or a collection of tensor values - """ - if not isinstance(node, torch.fx.Node): - return False - - if "val" not in node.meta: - return False - - if isinstance(node.meta["val"], FakeTensor): - return True - - if isinstance(node.meta["val"], list) or isinstance(node.meta["val"], tuple): - return all(isinstance(x, FakeTensor) for x in node.meta["val"]) - - return False - - -def is_tensor_arg_node(node: Any) -> bool: - if isinstance(node, torch.fx.Node): - return is_tensor_node(node) - elif isinstance(node, (list, tuple)): - return all(is_tensor_node(n) for n in node) - - return False - - -def num_tensor_arg_nodes(node: torch.fx.Node) -> int: - """ - For a given node, return the number of argument nodes that are associated with - tensors. - """ - count = 0 - for arg_node in node.args: - if not isinstance(arg_node, torch.fx.Node): - continue - if is_tensor_node(arg_node): - count += 1 - - return count - - -def num_tensors_in_node(node: torch.fx.Node) -> int: - """ - Returns the number of tensors associated a given node - """ - if "val" not in node.meta: - return 0 - - if isinstance(node.meta["val"], FakeTensor): - return 1 - - if isinstance(node.meta["val"], list) or isinstance(node.meta["val"], tuple): - if all(isinstance(x, FakeTensor) for x in node.meta["val"]): - return len(node.meta["val"]) - - return 0 - - -def tensor_node_is_bool(node: torch.fx.Node) -> bool: - """ - Returns true if a given node contains a tensor with bool dtype - """ - if isinstance(node.meta["val"], FakeTensor): - return node.meta["val"].dtype == torch.bool - if isinstance(node.meta["val"], list) or isinstance(node.meta["val"], tuple): - for fake_tensor in node.meta["val"]: - if isinstance(fake_tensor, FakeTensor): - if fake_tensor.dtype == torch.bool: - return True - return False - - -def get_primary_arg_idx(self, node: torch.fx.Node) -> Optional[int]: - primary_arg_idx: Optional[int] = None - for i, arg_node in enumerate(node.args): - if self.is_non_constant_tensor_node(arg_node): - return i - - return primary_arg_idx - - -def node_comes_from_any_nn_module_in_set( - node, - nn_module_typenames: Set[str], -) -> bool: - if isinstance(node, (list, tuple)): - return all( - node_comes_from_any_nn_module_in_set(n, nn_module_typenames) for n in node - ) - - if not isinstance(node, torch.fx.Node): - return False - - nn_module_stack = node.meta.get("nn_module_stack", None) - if nn_module_stack is None: - return False - - for _, packed in nn_module_stack.items(): - _, typename = packed - for partial_name in nn_module_typenames: - if partial_name in typename: - return True - - return False - - -def get_tensor_name(exp_prog: ExportedProgram, node: torch.fx.Node) -> str: - if node is None: - 
return "" - if is_param(exp_prog, node): - return exp_prog.graph_signature.inputs_to_parameters[node.name] - elif is_buffer(exp_prog, node): - return exp_prog.graph_signature.inputs_to_buffers[node.name] - elif is_lifted_tensor_constant(exp_prog, node): - return exp_prog.graph_signature.inputs_to_lifted_tensor_constants[node.name] - else: - assert isinstance(node.target, str) - return node.target - - return "" - - -def find_dequant_user(node: torch.fx.Node) -> Optional[torch.fx.Node]: - """ - Search the direct users of the given node and return the first one that is a - dequantization op. Returns None if no dequantization op is found. - """ - for user in node.users: - if is_dequant_node(user): - return user - return None - - -def find_quant_user(node: torch.fx.Node) -> Optional[torch.fx.Node]: - """ - Search the direct users of the given node and return the first one that is a - quantization op. Returns None if no quantization op is found. - """ - for user in node.users: - if is_quant_node(user): - return user - - return None - - -## -## Memory Layout, Storage Type Determination -## - -ImageExtents = Tuple[int, int, int] - -DEFAULT_TEXTURE_LIMITS = (16384, 16384, 2048) -DEFAULT_BUFFER_LIMIT = 128 * (1024 * 1024) - -all_storage_types: Set[VkStorageType] = { - VkStorageType.BUFFER, - VkStorageType.TEXTURE_3D, -} - -all_memory_layouts: Set[VkMemoryLayout] = { - VkMemoryLayout.TENSOR_WIDTH_PACKED, - VkMemoryLayout.TENSOR_HEIGHT_PACKED, - VkMemoryLayout.TENSOR_CHANNELS_PACKED, -} - -MemoryLayoutSet = Set[VkMemoryLayout] -MemoryLayoutSetList = Union[MemoryLayoutSet, List[MemoryLayoutSet]] - - -def within_buffer_limit(node: torch.fx.Node, buffer_limit: int) -> int: - """ - Checks whether the tensors produced by the given node can fit within the device's - GPU buffer limit, which represents the maximum number of elements that can be stored - in a GPU buffer. - """ - assert is_tensor_node(node) - - if isinstance(node.meta["val"], FakeTensor): - return node.meta["val"].numel() < buffer_limit - elif isinstance(node.meta["val"], list) or isinstance(node.meta["val"], tuple): - return all(x.numel() < buffer_limit for x in node.meta["val"]) - else: - raise RuntimeError(f"Cannot get numel for val of type {type(node.meta['val'])}") - - -def tensor_node_is_high_dim(node: torch.fx.Node) -> bool: - """ - Returns true if a given node contains a tensor with more than 4 dimensions - """ - if isinstance(node.meta["val"], FakeTensor): - return len(node.meta["val"].shape) > 4 - if isinstance(node.meta["val"], list) or isinstance(node.meta["val"], tuple): - for fake_tensor in node.meta["val"]: - if isinstance(fake_tensor, FakeTensor): - if len(fake_tensor.shape) > 4: - return True - return False - - -def required_image_extents(sizes: torch.Size, layout: VkMemoryLayout) -> ImageExtents: - """ - Calculate the image extents that will be used to represent a tensor with the given sizes - and memory layout in the Vulkan Delegate. 
- """ - width = sizes[-1] if len(sizes) >= 1 else 1 - height = sizes[-2] if len(sizes) >= 2 else 1 - channels = sizes[-3] if len(sizes) >= 3 else 1 - batch = sizes[0] if len(sizes) >= 4 else 1 - - if layout == VkMemoryLayout.TENSOR_WIDTH_PACKED: - width = (width + 3) // 4 - elif layout == VkMemoryLayout.TENSOR_HEIGHT_PACKED: - height = (height + 3) // 4 - elif layout == VkMemoryLayout.TENSOR_CHANNELS_PACKED: - channels = (channels + 3) // 4 - else: - raise RuntimeError(f"Unsupported memory layout {layout}") - - return width, height, channels * batch - - -def extents_are_valid(extents: ImageExtents, limits: ImageExtents) -> bool: - return all(extents[i] <= limits[i] for i in range(len(extents))) - - -def valid_texture_memory_layouts( - tensor_sizes: torch.Size, texture_limits: ImageExtents -) -> Set[VkMemoryLayout]: - """ - Given tensor sizes, determine the set of memory layouts which will prodice a texture - that can fit within the specified device limits. - """ - valid_layouts = set() - for layout in list(all_memory_layouts): - extents = required_image_extents(tensor_sizes, layout) - if extents_are_valid(extents, texture_limits): - valid_layouts.add(layout) - - return valid_layouts - - -class TensorRepr: - """ - This class is a wrapper around a pair of VkStorageType and VkMemoryLayout which - describes how a tensor should be represented in the Vulkan Delegate. - """ - - def __init__(self, storage_type: VkStorageType, memory_layout: VkMemoryLayout): - self.storage_type = storage_type - self.memory_layout = memory_layout - - def __str__(self) -> str: - return f"TensorRepr({self.storage_type}, {self.memory_layout})" - - def __eq__(self, other: object) -> bool: - if not isinstance(other, TensorRepr): - return NotImplemented - return ( - self.storage_type == other.storage_type - and self.memory_layout == other.memory_layout - ) - - def __ne__(self, other: object) -> bool: - return not self.__eq__(other) - - -class TensorReprList: - """ - This class is a wrapper around a list of TensorRepr instances that automatically - applies a "broadcasting" mechanism. The broadcasting mechanism allows for a single - underlying TensorRepr to be used to represent multiple tensors. - """ - - def __init__(self, tensor_reprs: Union[TensorRepr, List[TensorRepr]]): - self.vals: List[TensorRepr] = ( - tensor_reprs if isinstance(tensor_reprs, list) else [tensor_reprs] - ) - - def __len__(self): - return len(self.vals) - - def __getitem__(self, idx: int) -> TensorRepr: - if idx > 0 and len(self) == 1: - return self.vals[0] - else: - return self.vals[idx] - - def __setitem__(self, idx: int, val: TensorRepr) -> None: - if idx > 0 and len(self) == 1: - self.vals[0] = val - else: - self.vals[idx] = val - - def __str__(self) -> str: - return f"[{', '.join(str(ts) for ts in self.vals)}]" - - def __eq__(self, other: object) -> bool: - if not isinstance(other, TensorReprList): - return NotImplemented - - if len(self) == len(other): - for self_val, other_val in zip(self.vals, other.vals): - if self_val != other_val: - return False - - return True - - return False - - def __ne__(self, other: object) -> bool: - return not self.__eq__(other) - - def append(self, val: TensorRepr) -> None: - self.vals.append(val) - - def storage_type(self, idx: int = 0) -> VkStorageType: - return self.vals[idx].storage_type - - def memory_layout(self, idx: int = 0) -> VkMemoryLayout: - return self.vals[idx].memory_layout - - -class TensorRepSet: - """ - This class describes the possible set of representations (i.e. 
TensorRepr) that may - be used to represent a tensor. This set is determined by the implementation of the - operator that the tensor participates in as well as the texture extents of the GPU. - """ - - def __init__( - self, - buffer_memory_layouts: Set[VkMemoryLayout], - texture_memory_layouts: Set[VkMemoryLayout], - ): - self.valid_buffer_layouts = buffer_memory_layouts - self.valid_texture_layouts = texture_memory_layouts - - def __str__(self) -> str: - buffer_layouts = ", ".join(layout.name for layout in self.valid_buffer_layouts) - texture_layouts = ", ".join( - layout.name for layout in self.valid_texture_layouts - ) - return f"TensorRepSet(Buffer Layouts: [{buffer_layouts}], Texture Layouts: [{texture_layouts}])" - - def __eq__(self, other: object) -> bool: - if not isinstance(other, TensorRepSet): - return NotImplemented - return ( - self.valid_buffer_layouts == other.valid_buffer_layouts - and self.valid_texture_layouts == other.valid_texture_layouts - ) - - def __ne__(self, other: object) -> bool: - return not self.__eq__(other) - - def is_empty(self) -> bool: - """ - A TensorRepSet is "empty" if there are no valid representations of the tensor. - """ - return ( - len(self.valid_buffer_layouts) == 0 and len(self.valid_texture_layouts) == 0 - ) - - def make_intersect(self, other: "TensorRepSet") -> "TensorRepSet": - """ - Merge this TensorRepr with another TensorRepr, returning a new TensorRepr - with the intersection of the two. - """ - return TensorRepSet( - self.valid_buffer_layouts & other.valid_buffer_layouts, - self.valid_texture_layouts & other.valid_texture_layouts, - ) - - def is_compatible(self, storage: TensorRepr) -> bool: - """ - Check if this TensorRepr is compatible with the given TensorRepSet. - """ - if storage.storage_type == VkStorageType.BUFFER: - return storage.memory_layout in self.valid_buffer_layouts - elif storage.storage_type == VkStorageType.TEXTURE_3D: - return storage.memory_layout in self.valid_texture_layouts - else: - raise RuntimeError(f"Unsupported storage type {storage.storage_type}") - - def any_in_common(self, other: "TensorRepSet") -> bool: - """ - Check if this TensorRepr has any representations in common with another - TensorRepr. - """ - return ( - len(self.valid_buffer_layouts & other.valid_buffer_layouts) > 0 - or len(self.valid_texture_layouts & other.valid_texture_layouts) > 0 - ) - - def texture_is_valid(self): - return len(self.valid_texture_layouts) > 0 - - def buffer_is_valid(self): - return len(self.valid_buffer_layouts) > 0 - - def first_valid_buffer_layout(self): - return list(self.valid_buffer_layouts)[0] - - def first_valid_texture_layout(self): - return list(self.valid_texture_layouts)[0] - - def make_tensor_repr(self) -> TensorRepr: - """ - Pick a representation (i.e. TensorRepr) from the set of possible representations. - If there are multiple valid representations, then: - 1. Prefer texture storage over buffer storage - 2. Pick the first available memory layout. - """ - if self.is_empty(): - # An empty repset typically means that it is associated with a weight tensor - # or non tensor argument. In this case, just return default storage and - # layout as placeholder. 
- return TensorRepr( - VkStorageType.DEFAULT_STORAGE, VkMemoryLayout.DEFAULT_LAYOUT - ) - - if self.texture_is_valid(): - return TensorRepr( - VkStorageType.TEXTURE_3D, self.first_valid_texture_layout() - ) - - else: - return TensorRepr(VkStorageType.BUFFER, self.first_valid_buffer_layout()) - - def is_constrained(self) -> bool: - """ - A "constrained" RepSet is one that has either: - 1. A single valid texture memory layout, and no valid buffer memory layouts - 2. No valid texture memory layouts, and a single valid buffer memory layout - 3. Is empty - - In this case, it is unambiguous which representation should be used for the - tensor. - """ - if self.is_empty(): - return True - elif ( - len(self.valid_texture_layouts) == 1 and len(self.valid_buffer_layouts) == 0 - ): - return True - elif ( - len(self.valid_texture_layouts) == 0 and len(self.valid_buffer_layouts) == 1 - ): - return True - else: - return False - - def is_ambiguous(self) -> bool: - """ - An "ambiguous" RepSet is one that is not constrained. - """ - return not self.is_constrained() - - -def make_tensor_repset(tensor_repr: TensorRepr) -> TensorRepSet: - """ - Given a TensorRepr, return a TensorRepSet that contains only that TensorRepr - """ - if tensor_repr.storage_type == VkStorageType.BUFFER: - return TensorRepSet({tensor_repr.memory_layout}, set()) - elif tensor_repr.storage_type == VkStorageType.TEXTURE_3D: - return TensorRepSet(set(), {tensor_repr.memory_layout}) - else: - raise RuntimeError(f"Unsupported storage type {tensor_repr.storage_type}") - - -def make_filtered_tensor_repset( - tensor_val: FakeTensor, - tensor_repset: TensorRepSet, - texture_limits: ImageExtents, -) -> TensorRepSet: - """ - `tensor_val` represents an actual tensor participating in some operator computation. - - `tensor_repset` represents the set of valid tensor representations that may be used - for that tensor that is supported by the op implementation. - - `texture_limits` represents the maximum texture sizes that is supported by the GPU. - - Given the above, return a new TensorRepSet that contains only texture layouts that - can be used to produce a valid image texture for the given tensor (i.e. fits within - texture limits). 
- """ - valid_texture_layouts = set() - for memory_layout in tensor_repset.valid_texture_layouts: - extents = required_image_extents(tensor_val.shape, memory_layout) - if extents_are_valid(extents, texture_limits): - valid_texture_layouts.add(memory_layout) - - # High dimensional tensors require buffer storage - if len(tensor_val.shape) > 4: - return TensorRepSet(tensor_repset.valid_buffer_layouts, set()) - - # Bool tensors are currently not supported - if tensor_val.dtype == torch.bool: - return NO_STORAGE - - return TensorRepSet(tensor_repset.valid_buffer_layouts, valid_texture_layouts) - - -## Convenience TensorRepSet definitions - -CONTIGUOUS_ANY = TensorRepSet( - {VkMemoryLayout.TENSOR_WIDTH_PACKED}, {VkMemoryLayout.TENSOR_WIDTH_PACKED} -) -CONTIGUOUS_BUFFER = TensorRepSet({VkMemoryLayout.TENSOR_WIDTH_PACKED}, set()) - -WIDTH_PACKED_TEXTURE = TensorRepSet(set(), {VkMemoryLayout.TENSOR_WIDTH_PACKED}) -CHANNELS_PACKED_TEXTURE = TensorRepSet(set(), {VkMemoryLayout.TENSOR_CHANNELS_PACKED}) - -ANY_TEXTURE = TensorRepSet(set(), all_memory_layouts) -ANY_BUFFER = TensorRepSet(all_memory_layouts, set()) - -ANY_STORAGE = TensorRepSet(all_memory_layouts, all_memory_layouts) -NO_STORAGE = TensorRepSet(set(), set()) - - -class TensorRepSetList: - """ - This class is a wrapper around a list of TensorRepSet instances that automatically - applies a "broadcasting" mechanism. The broadcasting mechanism allows for a single - underlying TensorRepSet to be used for multiple tensors. - """ - - def __init__( - self, - tensor_repsets: Union[TensorRepSet, List[TensorRepSet]], - ): - self.vals: List[TensorRepSet] = ( - tensor_repsets if isinstance(tensor_repsets, list) else [tensor_repsets] - ) - - def __len__(self): - return len(self.vals) - - def __getitem__(self, idx: int) -> TensorRepSet: - if idx > 0 and len(self) == 1: - return self.vals[0] - else: - return self.vals[idx] - - def __setitem__(self, idx: int, val: TensorRepSet) -> None: - if idx > 0 and len(self.vals) == 1: - self.vals[0] = val - else: - self.vals[idx] = val - - def __str__(self) -> str: - return f"[{', '.join(str(ts) for ts in self.vals)}]" - - def append(self, val: TensorRepSet) -> None: - return self.vals.append(val) - - def any_is_empty(self) -> bool: - if len(self.vals) == 0: - return True - - return any(tensor_repr.is_empty() for tensor_repr in self.vals) - - -class OpRepSets: - """ - This class is responsible for representing and managing the set of valid tensor - representations that may be used for all input and output tensors of an operator. - It is also responsible for maintaining synchronization rules between tensors - participating in the computation. - - Currently, three synchronization rules exist: - 1. All input tensors must use the same representation (e.g. binary ops) - 2. The "primary" input and output tensors must use the same representation - (e.g. group norm; the output is a tuple of out, mean, rstd; out must be the same - representation as the first input x, but mean and rstd may use different - representations as out) - 3. All output tensors must use the same representation (e.g. choose qparams) - - Note that "primary" input and output tensor refers to the first non-weight input - tensor and the first output tensor. Note that Some operators (such as arange) do not - have any tensor inputs. - - Currently, the above three synchronization rules are sufficient to describe the - representation requirements of all ET-VK operators. 
- - This class also provides utilities to constrain the repsets; when applying the - constraints, the synchronization rules will be maintained. - """ - - def __init__( # noqa: C901 - self, - inputs_repsets: TensorRepSetList, - outputs_repsets: TensorRepSetList, - op_node: torch.fx.Node, - texture_limits: ImageExtents, - ): - self.op_node = op_node - - # inputs_repset_list is received from the operator registration. If a different - # repset is defined for each input tensor, then assume that the input tensor - # representations do not need to be synchronized. - if len(inputs_repsets) > 1: - self.sync_args_repr = False - # Otherwise, default to True - else: - self.sync_args_repr = True - - # outputs_repset_list is received from the operator registration. If a different - # repset is defined for each output tensor, then assume that the output tensor - # representations do not need to be synchronized. - if len(outputs_repsets) > 1: - self.sync_outs_repr = False - else: - self.sync_outs_repr = True - - # Try to determine the index of the "primary" argument, i.e. the first non - # constant tensor argument. For the vast majority of operators with tensor - # arguments, this will be the first argument. - self.primary_arg_idx: Optional[int] = None - for i, arg_node in enumerate(self.op_node.args): - arg_node_repset = inputs_repsets[i] - if not is_tensor_arg_node(arg_node): - continue - if arg_node_repset is None: - continue - if arg_node_repset.is_empty(): - continue - - self.primary_arg_idx = i - break - - # If the repset of the primary input and the primary output are the same, then - # assume they need to be the same. - self.sync_primary_io_repr = self.primary_arg_idx is not None - if self.primary_arg_idx is not None: - if inputs_repsets[self.primary_arg_idx] != outputs_repsets[0]: - self.sync_primary_io_repr = False - - # Now, go through the arguments of the operator and create a filtered repset - # for each based on the actual tensor value. - args_repset_list = TensorRepSetList([]) - common_arg_repset = ANY_STORAGE - for i, arg_node in enumerate(op_node.args): - arg_repset = inputs_repsets[i] - - # Use ANY_STORAGE for non-tensor nodes so they don't cause the op repsets to - # appear empty - if not is_tensor_arg_node(arg_node): - args_repset_list.append(ANY_STORAGE) - # NO_STORAGE is used to denote that an input is either a non tensor arg or - # a weight tensor that is not prepacked. Similar to the above, use - # ANY_STORAGE in this case. - elif arg_repset.is_empty(): - args_repset_list.append(ANY_STORAGE) - else: - assert not arg_repset.is_empty() - - arg_repset = self.make_valid_tensor_repset_for_arg( - arg_repset, arg_node, texture_limits - ) - - args_repset_list.append(arg_repset) - common_arg_repset = common_arg_repset.make_intersect(arg_repset) - - # Repeat for output tensors. 
- outs_repset_list = TensorRepSetList([]) - common_out_repset = ANY_STORAGE - if num_tensors_in_node(op_node) == 1: - common_out_repset = make_filtered_tensor_repset( - op_node.meta["val"], outputs_repsets[0], texture_limits - ) - outs_repset_list.append(common_out_repset) - # Multiple output tensors - else: - for i, val in enumerate(op_node.meta["val"]): - assert isinstance(val, FakeTensor) - out_repset = make_filtered_tensor_repset( - val, outputs_repsets[i], texture_limits - ) - - outs_repset_list.append(out_repset) - common_out_repset = common_out_repset.make_intersect(out_repset) - - # Apply synchronization rules; if either all inputs/outputs must use the same - # representation, then only use a single underlying repset. - if self.sync_args_repr: - args_repset_list = TensorRepSetList([common_arg_repset]) - - if self.sync_outs_repr: - outs_repset_list = TensorRepSetList([common_out_repset]) - - # Finally, apply synchronization rules that sync inputs and outputs. If input - # or output repsets are updated, then maintain synchronization rules. - if self.sync_primary_io_repr: - assert self.primary_arg_idx is not None - - primary_in_repset = args_repset_list[self.primary_arg_idx] - primary_out_repset = outs_repset_list[0] - - primary_repset = primary_in_repset.make_intersect(primary_out_repset) - - if self.sync_args_repr: - args_repset_list = TensorRepSetList([primary_repset]) - else: - assert self.primary_arg_idx is not None - args_repset_list[self.primary_arg_idx] = primary_repset - - if self.sync_outs_repr: - outs_repset_list = TensorRepSetList([primary_repset]) - else: - assert self.primary_arg_idx is not None - outs_repset_list[0] = primary_repset - - # Save the resulting repsets - self.args_repset_list = args_repset_list - self.outs_repset_list = outs_repset_list - - # Check that synchronization rules are respected. - self.assert_sync_contraints() - - def __str__(self) -> str: - return f"OpRepSets(ins={self.args_repset_list}, outs={self.outs_repset_list})" - - def make_valid_tensor_repset_for_node_list_arg( - self, - arg_repsets: TensorRepSet, - arg_node: List[torch.fx.Node], - texture_limits: ImageExtents, - ) -> TensorRepSet: - """ - Wrapper around make_filtered_tensor_repset for a list of nodes. This will happen - for the cat operator, where the first argument is a list of nodes. - """ - # For variable length args, assume that they all need to use the same representation - # only one repset should be defined - common_tensor_repsets = arg_repsets - - for n in arg_node: - assert isinstance(n, torch.fx.Node) - common_tensor_repsets = common_tensor_repsets.make_intersect( - make_filtered_tensor_repset( - n.meta["val"], common_tensor_repsets, texture_limits - ) - ) - - return common_tensor_repsets - - def make_valid_tensor_repset_for_arg( - self, arg_repsets: TensorRepSet, arg_node: Any, texture_limits: ImageExtents - ) -> TensorRepSet: - """ - Helper function to call make_filtered_tensor_repset - """ - if isinstance(arg_node, torch.fx.Node) and is_single_tensor_node(arg_node): - return make_filtered_tensor_repset( - arg_node.meta["val"], arg_repsets, texture_limits - ) - elif isinstance(arg_node, list) and all( - is_single_tensor_node(n) for n in arg_node - ): - return self.make_valid_tensor_repset_for_node_list_arg( - arg_repsets, arg_node, texture_limits - ) - # Special case for getitem; return the repset of the particular val in the - # list of tensors that is being extracted. 
- elif ( - self.op_node.target == operator.getitem and arg_node == self.op_node.args[0] - ): - idx = self.op_node.args[1] - assert isinstance(idx, int) - return make_filtered_tensor_repset( - arg_node.meta["val"][idx], arg_repsets, texture_limits - ) - - raise NotImplementedError(f"Unhandled node type {arg_node}") - - def assert_sync_contraints(self) -> None: - if self.sync_args_repr: - assert len(self.args_repset_list) == 1 - - if self.sync_outs_repr: - assert len(self.outs_repset_list) == 1 - - if self.sync_primary_io_repr: - assert ( - self.args_repset_list[self.primary_arg_idx] == self.outs_repset_list[0] - ) - - def any_is_empty(self) -> bool: - return ( - self.args_repset_list.any_is_empty() or self.outs_repset_list.any_is_empty() - ) - - def get_arg_repset(self, i: int): - return self.args_repset_list[i] - - def get_out_repset(self, i: int): - return self.outs_repset_list[i] - - def try_constrain_with_arg_repset( - self, arg_i: int, source_repset: TensorRepSet - ) -> bool: - """ - Attempt to constrain the repsets of the tensors participating in this operator - based on an "existing" repset of an argument. The existing repset can have two - sources: - * A representation may have been determined for the argument already from a - prior operator - * The output repset of the operator which produces the argument - - If the existing repset of the argument is compatible with the current operator, - then constrain the repsets of this operator and apply synchronization rules. - - This process tries to minimize the number of transition nodes that will need to - be inserted by tag_memory_meta_pass.py by maintaining existing representations - for as long as possible. - """ - arg_current_repset = self.args_repset_list[arg_i] - - if arg_current_repset == source_repset: - return False - - if not arg_current_repset.any_in_common(source_repset): - return False - - if self.sync_primary_io_repr: - if not self.get_out_repset(0).any_in_common(source_repset): - return False - - # If this point is reached, then it is possible to constrain - self.args_repset_list[arg_i] = arg_current_repset.make_intersect(source_repset) - if self.sync_primary_io_repr and ( - arg_i == self.primary_arg_idx or self.sync_args_repr - ): - self.outs_repset_list[0] = arg_current_repset.make_intersect(source_repset) - - self.assert_sync_contraints() - return True - - def pick_representations(self) -> Tuple[TensorReprList, TensorReprList]: - """ - For each tensor participating in the op, pick a representation for it among the - possible represetntation sets. - """ - args_repr_list = TensorReprList([]) - outs_repr_list = TensorReprList([]) - - for i in range(len(self.op_node.args)): - arg_repset = self.args_repset_list[i] - args_repr_list.append(arg_repset.make_tensor_repr()) - - for i in range(num_tensors_in_node(self.op_node)): - out_repset = self.outs_repset_list[i] - outs_repr_list.append(out_repset.make_tensor_repr()) - - return args_repr_list, outs_repr_list - - -## -## TensorSpec Utils -## - - -def has_node_spec_attr(node: torch.fx.Node, attr: str) -> bool: - return "spec" in node.meta and hasattr(node.meta["spec"], attr) - - -def set_node_spec_attr(node: torch.fx.Node, attr: str, value): - assert "spec" in node.meta - spec = node.meta["spec"] - if isinstance(spec, TensorSpec): - setattr(spec, attr, value) - elif isinstance(spec, (list, tuple)): - # Special case if value is a list/tuple of the same length as the - # collection of tensors in the node. 
In this case, treat the value list - # as a list of values to set indivudually for each tensor in the node - if isinstance(value, (list, tuple)) and len(spec) == len(value): - assert len(spec) == len(value) - for s, v in zip(spec, value): - assert isinstance(s, TensorSpec) - setattr(s, attr, v) - # Otherwise, set the attribute to value for all tensors in the list - else: - for s in spec: - assert isinstance(s, TensorSpec) - setattr(s, attr, value) - else: - raise RuntimeError(f"Cannot set attr for spec of type {type(spec)}") - - -def get_node_spec_attr(node: torch.fx.Node, attr: str, return_first: bool = True): - assert "spec" in node.meta - spec = node.meta["spec"] - if isinstance(spec, TensorSpec): - return getattr(spec, attr) if hasattr(spec, attr) else None - elif isinstance(spec, (list, tuple)): - if return_first: - return getattr(spec[0], attr) if hasattr(spec[0], attr) else None - else: - return [getattr(s, attr) if hasattr(s, attr) else None for s in spec] - else: - raise RuntimeError(f"Cannot get attr for spec of type {type(spec)}") - - -def get_node_storage_type(node: torch.fx.Node) -> Optional[VkStorageType]: - return get_node_spec_attr(node, "vk_storage_type") - - -def get_node_memory_layout(node: torch.fx.Node) -> Optional[VkMemoryLayout]: - return get_node_spec_attr(node, "vk_memory_layout") - - -def has_node_repr(node) -> bool: - if isinstance(node, (list, tuple)): - return all(has_node_spec_attr(n, "etvk_node_repr") for n in node) - else: - return has_node_spec_attr(node, "etvk_node_repr") - - -def set_node_repr(node: torch.fx.Node, node_repr: Union[TensorRepr, TensorReprList]): - if isinstance(node_repr, TensorReprList): - # Convert to a regular list so taht `set_node_spec_attr` can attach each entry - # to a separate TensorSpec - node_repr_list = [node_repr[i] for i in range(num_tensors_in_node(node))] - set_node_spec_attr(node, "etvk_node_repr", node_repr_list) - else: - set_node_spec_attr(node, "etvk_node_repr", node_repr) - - -def get_node_repr(node) -> Union[TensorRepr, TensorReprList]: - if isinstance(node, (list, tuple)): - raise NotImplementedError("get_node_repr not implemented for list of nodes") - else: - return get_node_spec_attr(node, "etvk_node_repr", False) - - -## -## Graph Pattern Matching -## - - -def maybe_skip_q_dq_arg_chain( - arg: torch.fx.node.Argument, -) -> Tuple[Optional[torch.fx.Node], Optional[torch.fx.Node], Optional[torch.fx.Node]]: - """ - Check if the given node argument is part of a Quantize/Dequantize chain produced by - the quant workflow. If so, return the source tensor that is the input to the Q/DQ - chain and the quantize/dequantize nodes in the chain. Otherwise, return the argument - as is and None, None - """ - if not isinstance(arg, torch.fx.Node): - return None, None, None - - if is_dequant_node(arg): - dequant_node = arg - quant_node = dequant_node.args[0] - assert isinstance(quant_node, torch.fx.Node) - source_arg = quant_node.args[0] - assert isinstance(source_arg, torch.fx.Node) - return source_arg, quant_node, dequant_node - else: - return arg, None, None - - -def trace_args_until_placeholder( - node: torch.fx.node.Argument, max_search_depth: int = 4 -) -> Tuple[Optional[torch.fx.Node], List[torch.fx.Node]]: - """ - Trace through node.args[0] of a given initial node until a placeholder node is found - then return it and the list of nodes traversed. If no placeholder node is found, - returns None and an empty list. 
- """ - cur_node = node - search_depth = 0 - - if not isinstance(cur_node, torch.fx.Node): - return None, [] - - traversed = [cur_node] - while cur_node.op != "placeholder" and search_depth < max_search_depth: - # Break if cur_node has no args - if len(cur_node.args) == 0: - break - - cur_node = cur_node.args[0] - if not isinstance(cur_node, torch.fx.Node): - break - traversed.append(cur_node) - search_depth += 1 - - if not isinstance(cur_node, torch.fx.Node): - return None, [] - if cur_node.op != "placeholder": - return None, [] - - assert isinstance(cur_node, torch.fx.Node) - return cur_node, traversed - - -def is_in_4bit_range(tensor: torch.Tensor) -> bool: - """ - Check if the given tensor is in the range of 4-bit quantization and is of integer type. - """ - if tensor.dtype not in (torch.int8, torch.uint8): - return False - - return tensor.min().item() >= -8 and tensor.max().item() <= 7 - - -def is_in_8bit_range(tensor: torch.Tensor) -> bool: - """ - Check if the given tensor is in the range of 4-bit quantization and is of integer type. - """ - if tensor.dtype not in (torch.int8, torch.uint8): - return False - - return tensor.min().item() >= -128 and tensor.max().item() <= 127 - - -## -## Misc -## - - -def get_tensor_val_str(tensor_val: FakeTensor) -> str: - return f"{tensor_val.dtype}: {tensor_val.shape}" - - -def get_node_val_str(node: torch.fx.Node) -> str: - if is_single_tensor_node(node): - assert isinstance(node.meta["val"], FakeTensor) - return get_tensor_val_str(node.meta["val"]) - elif is_tensor_collection_node(node): - assert isinstance(node.meta["val"], (list, tuple)) - return f"[{', '.join(get_tensor_val_str(t) for t in node.meta['val'])}]" - else: - if "val" not in node.meta: - return str(node) - return str(node.meta["val"]) - - -def get_arg_node_val_str(arg_node: Any) -> str: - if isinstance(arg_node, torch.fx.Node): - return get_node_val_str(arg_node) - elif isinstance(arg_node, (list, tuple)): - return f"[{', '.join(get_arg_node_val_str(n) for n in arg_node)}]" - else: - return str(arg_node) - - -def node_io_str(node: torch.fx.Node) -> str: - target = node.target - if isinstance(target, EdgeOpOverload): - assert isinstance(target, EdgeOpOverload) - target_name = target.__name__ - elif isinstance(target, torch._ops.OpOverload): - assert isinstance(target, torch._ops.OpOverload) - target_name = target.name() - else: - target_name = str(target) - - out_str = f"{get_node_val_str(node)} = {target_name}(" - for arg in node.args: - out_str += get_arg_node_val_str(arg) + ", " - - out_str += " ...)" - return out_str - - -def update_program_state_dict( - program: ExportedProgram, - buffer_name: str, - updated_tensor: torch.Tensor, -) -> None: - target_name = None - # Iterate over all the tensors in the graph signature, and find - # the one corresponding to the parameter/buffer name - for input_ in program.graph_signature.input_specs: - if ( - input_.kind in (InputKind.BUFFER, InputKind.PARAMETER) - and isinstance(input_.arg, TensorArgument) - and input_.arg.name == buffer_name - ): - target_name = input_.target - break - - # Assert that we found the parameter/buffer - assert ( - target_name is not None - ), f"could not find {buffer_name} in source program signature" - assert target_name in program.state_dict, f"could not find {target_name}" - - # Finally, overwrite the current tensor with updated tensor - program.state_dict[target_name] = updated_tensor - - -def align_width_and_update_state_dict( - ep: ExportedProgram, - node: torch.fx.Node, - cur_tensor: torch.Tensor, - 
align_to: int = 4, - force_update: bool = False, -) -> torch.Tensor: - """ - Align the width of the given tensor to the given alignment value and update the - state dict of the program with the aligned tensor. - """ - added_padding = False - cur_width = cur_tensor.shape[-1] - # Only align the width of the tensor if it is not already aligned - if cur_width % align_to != 0: - num_padding = align_to - (cur_width % align_to) - # Align the width of the tensor to the given alignment value - aligned_tensor = torch.nn.functional.pad( - cur_tensor, (0, num_padding) - ).contiguous() - added_padding = True - else: - aligned_tensor = cur_tensor - - if added_padding or force_update: - update_program_state_dict(ep, node.name, aligned_tensor) - # FakeTensor needs to match updated tensor - cur_fake_tensor = node.meta["val"] - node.meta["val"] = FakeTensorConverter().from_real_tensor( - cur_fake_tensor.fake_mode, - aligned_tensor, - ) - - return aligned_tensor diff --git a/backends/vulkan/utils.py b/backends/vulkan/utils.py new file mode 120000 index 00000000000..78678ae8191 --- /dev/null +++ b/backends/vulkan/utils.py @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/utils.py \ No newline at end of file diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py deleted file mode 100644 index 69d3cdef75d..00000000000 --- a/backends/vulkan/vulkan_preprocess.py +++ /dev/null @@ -1,233 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# pyre-strict - -from functools import partial - -from typing import Any, Dict, final, List - -import executorch.backends.vulkan.utils as utils - -from executorch.backends.transforms.addmm_mm_to_linear import AddmmToLinearTransform -from executorch.backends.transforms.fuse_conv_with_clamp import FuseClampPass -from executorch.backends.transforms.fuse_view_copy import FuseViewCopyTransform -from executorch.backends.transforms.view_copy_to_squeeze_unsqueeze import ( - ViewCopyToSqueezeUnsqueezePass, -) -from executorch.backends.vulkan._passes import ( - FoldQDQPass, - FuseQuantizedOpsTransform, - insert_prepack_nodes, - RemoveLocalScalarDenseOpsTransform, - RemoveRedundantOpsTransform, - SqueezeUnsqueezeInputs, - TagMemoryMetaPass, -) -from executorch.backends.vulkan._passes.fuse_patterns import FusePatternsPass -from executorch.backends.vulkan._passes.remove_asserts import RemoveAssertsTransform - -from executorch.backends.vulkan.serialization.vulkan_graph_builder import VkGraphBuilder -from executorch.backends.vulkan.serialization.vulkan_graph_schema import ( - VkMemoryLayout, - VkStorageType, -) -from executorch.backends.vulkan.serialization.vulkan_graph_serialize import ( - serialize_vulkan_graph, -) -from executorch.backends.xnnpack._passes import FuseBatchNormPass - -from executorch.exir.backend.backend_details import ( - BackendDetails, - CompileSpec, - ExportedProgram, - PreprocessResult, -) -from executorch.exir.backend.utils import DelegateMappingBuilder - -from executorch.exir.memory_planning import greedy, MemoryPlanningAlgorithmSuite -from executorch.exir.pass_base import ExportPass, PassBase - -from executorch.exir.passes import MemoryPlanningPass, SpecPropPass - -from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass - -from executorch.exir.program._program import _copy_module - -from 
torch.export._remove_auto_functionalized_pass import ( - unsafe_remove_auto_functionalized_pass, -) - -DEFAULT_DEBUG_HANDLE = 65535 - - -# pyre-ignore -def apply_passes(program: ExportedProgram, passes) -> ExportedProgram: - for p in passes: - if issubclass(type(p), ExportPass) or issubclass(type(p), PassBase): - new_gm = program.graph_module - # This is a workaround to allow the memory planning pass to work without - # having to first apply ToOutVarPass(). See the `greedy()` function in - # `exir.memory_planning`; if this attribute isn't set, assertions in - # `collect_spec_from_nodes()` will fail. - if isinstance(p, MemoryPlanningPass): - new_gm.encounter_to_out_var_failure = True - - new_gm_res = p(new_gm) - assert new_gm_res is not None - new_gm = new_gm_res.graph_module - - # See the application of this function in exir/program/_program.py for more - # details on why this step is necessary. - if isinstance(p, SpecPropPass): - p.update_placeholder_tensor_specs(program, new_gm) - - _copy_module(program.graph_module, new_gm) - else: - program = p(program) - - return program - - -def parse_compile_spec(compile_specs: List[CompileSpec]) -> Dict[str, Any]: - options = {} - for spec in compile_specs: - if spec.key == "storage_type_override": - options[spec.key] = VkStorageType( - int.from_bytes(spec.value, byteorder="little") - ) - if spec.key == "memory_layout_override": - options[spec.key] = VkMemoryLayout( - int.from_bytes(spec.value, byteorder="little") - ) - if spec.key in {"texture_limits_x", "texture_limits_y", "texture_limits_z"}: - options[spec.key] = int.from_bytes(spec.value, byteorder="little") - - if spec.key == "skip_tag_memory_metadata": - options[spec.key] = bool.from_bytes(spec.value, byteorder="little") - - if spec.key == "downcast_64_bit": - options[spec.key] = bool.from_bytes(spec.value, byteorder="little") - - # Unhandled options are ignored - - return options - - -@final -class VulkanBackend(BackendDetails): - @classmethod - # pyre-ignore - def preprocess( # noqa: C901 - cls, - program: ExportedProgram, - module_compile_spec: List[CompileSpec], - ) -> PreprocessResult: - compile_options = parse_compile_spec(module_compile_spec) - limits_x = compile_options.get( - "texture_limits_x", utils.DEFAULT_TEXTURE_LIMITS[0] - ) - limits_y = compile_options.get( - "texture_limits_y", utils.DEFAULT_TEXTURE_LIMITS[1] - ) - limits_z = compile_options.get( - "texture_limits_z", utils.DEFAULT_TEXTURE_LIMITS[2] - ) - texture_limits = (limits_x, limits_y, limits_z) - - default_storage_type = compile_options.get( - "storage_type_override", VkStorageType.TEXTURE_3D - ) - default_memory_layout = compile_options.get( - "memory_layout_override", VkMemoryLayout.TENSOR_WIDTH_PACKED - ) - downcast_64_bit = compile_options.get("downcast_64_bit", True) - - program = unsafe_remove_auto_functionalized_pass(program) - - # First, apply passes that fuse/remove operators to consolidate the graph - # structure but still preserve an "ATen-compliant" graph structure (i.e. all - # arguments to ATen operators must match the ATen function schema). - program = apply_passes( - program, - [ - FusePatternsPass(program), - RemoveRedundantOpsTransform(), - AddmmToLinearTransform(), - FuseQuantizedOpsTransform(program), - FoldQDQPass(program), - SqueezeUnsqueezeInputs(), - FuseViewCopyTransform(), - ViewCopyToSqueezeUnsqueezePass(), - FuseBatchNormPass(program), - FuseClampPass(), - ], - ) - - # Next annotate tensor nodes with TensorSpec structs which is needed for dynamic - # shapes and memory planning. 
Until this point, the graph must be ATen compliant - # because SpecPropPass will be calling the underlying ATen operators during its - # execution. - program = apply_passes(program, [SpecPropPass()]) - - # Apply graph transforms which either require `TensorSpec`s to have been created - # or would create an non ATen compliant graph structure. - program = apply_passes( - program, - [ - RemoveAssertsTransform(), - # Since this pass may replace a scalar argument with a tensor argument, - # this pass may result in a non ATen compliant graph structure. - RemoveLocalScalarDenseOpsTransform(), - insert_prepack_nodes, - ], - ) - - # Optionally apply the memory metadata tagging pass, which will insert storage - # type and memory layout transition nodes to ensure that all tensor arguments - # to an operator is in a supported or optimal configuration. If this pass is not - # applied, there will be a risk that some operators recieve arguments with - # memory settings that are not supported by the implementation. - if not compile_options.get("skip_tag_memory_metadata", False): - program = apply_passes( - program, - [ - TagMemoryMetaPass( - texture_limits, - default_storage_type=default_storage_type, - default_memory_layout=default_memory_layout, - ), - ], - ) - - # Finally, apply dynamic shape passes and memory planning pass. These passes - # must be applied only when the graph structure is finalized. - greedy_memory_planning = partial(greedy, allow_overlapping_allocations=False) - mem_planning_suite = MemoryPlanningAlgorithmSuite( - algo_list=[greedy_memory_planning] - ) - program = apply_passes( - program, - [ - ConstraintBasedSymShapeEvalPass(), - MemoryPlanningPass(memory_planning_algo=mem_planning_suite), - ], - ) - - graph_builder = VkGraphBuilder( - program, - DelegateMappingBuilder(generated_identifiers=True), - downcast_64_bit=downcast_64_bit, - ) - vk_graph = graph_builder.build_graph() - - return PreprocessResult( - processed_bytes=serialize_vulkan_graph( - vk_graph, graph_builder.const_tensors, [] - ), - debug_handle_map=graph_builder.delegate_mapping_builder.get_delegate_mapping(), - data_store_output=graph_builder.named_data_store.get_named_data_store_output(), - ) diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py new file mode 120000 index 00000000000..a52006818c0 --- /dev/null +++ b/backends/vulkan/vulkan_preprocess.py @@ -0,0 +1 @@ +/home/ssjia/fbsource/xplat/executorch/backends/vulkan/vulkan_preprocess.py \ No newline at end of file
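For reference, the options decoded by `parse_compile_spec` above arrive as little-endian byte strings. A minimal sketch of constructing matching compile specs on the export side is shown below; the diff does not show how these specs are actually built, so the helper and the chosen values are illustrative assumptions:

```
from executorch.backends.vulkan.serialization.vulkan_graph_schema import (
    VkMemoryLayout,
    VkStorageType,
)
from executorch.exir.backend.backend_details import CompileSpec


def u32_le(value: int) -> bytes:
    # parse_compile_spec decodes with int.from_bytes(..., byteorder="little")
    return int(value).to_bytes(4, byteorder="little")


compile_specs = [
    # Override the TEXTURE_3D / width-packed defaults used by preprocess().
    CompileSpec("storage_type_override", u32_le(VkStorageType.BUFFER.value)),
    CompileSpec("memory_layout_override", u32_le(VkMemoryLayout.TENSOR_WIDTH_PACKED.value)),
    # Optionally tighten the texture limits (defaults come from utils.py).
    CompileSpec("texture_limits_x", u32_le(4096)),
    CompileSpec("texture_limits_y", u32_le(4096)),
    CompileSpec("texture_limits_z", u32_le(2048)),
    # Booleans are decoded with bool.from_bytes, so a single byte suffices.
    CompileSpec("downcast_64_bit", (True).to_bytes(1, byteorder="little")),
]
```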