From 167c063261833bbda0772d27674c1fea44ab0908 Mon Sep 17 00:00:00 2001
From: Stephen Jia <ssjia@meta.com>
Date: Mon, 2 Jun 2025 15:30:49 -0700
Subject: [PATCH 1/3] [ET-VK][ez] Enable dynamic shape support when using push
 constants

Pull Request resolved: https://github.com/pytorch/executorch/pull/11253

## Changes

* Call `encode_execute()` upon resize in `propagate_resize()`
* Minor update to `DispatchNode` to store push constant data array as a persistent member of the class

## Motivation

Passing in tensor metadata (i.e. sizes, strides) via push constants is typically more performant than passing them via a UBO (uniform buffer object). However, currently dynamic shapes do not work when push constants are used as I realized that the tensor metadata contained in the push constants do not get updated.

It appears that that `vkCmdPushConstants` sets the push constants when encoding the command buffer, however the push constants will not be updated if the command buffer is submitted for execution multiple times. Therefore, to update push constant values **the command buffer needs to be re-encoded**.

## Performance Impact

This may add a small performance overhead (i.e. re-encoding the command buffer) when executing models with dynamic shapes. Models that do not trigger tensor resizing will not be impacted.

However, I measured the impact on a llama 3.2 1B model and the impact of re-encoding a command buffer appears to be negligible.

In any case, re-encoding the command buffer is a "necessary evil" when working with dynamic shapes, otherwise the tensor metadata seen by shaders may never get updated. Furthermore, re-encoding the command buffer can allow an opportunity to adjust global work group sizing to match current tensor sizes, which may have a huge performance impact when maximum tensor sizes far exceeds what tensor sizes will realistically be during inference (one instance of this is for transformer models when the max sequence length is very long).
ghstack-source-id: 287711101
@exported-using-ghexport

Differential Revision: [D75686051](https://our.internmc.facebook.com/intern/diff/D75686051/)
---
 backends/vulkan/runtime/VulkanBackend.cpp     |  7 +++++
 .../vulkan/runtime/graph/ComputeGraph.cpp     |  4 ++-
 backends/vulkan/runtime/graph/ComputeGraph.h  |  7 ++++-
 .../vulkan/runtime/graph/ops/DispatchNode.cpp | 26 ++++++++++---------
 .../vulkan/runtime/graph/ops/DispatchNode.h   |  6 +++++
 .../vulkan/runtime/graph/ops/ExecuteNode.h    |  2 +-
 .../vulkan/test/vulkan_compute_api_test.cpp   |  3 +--
 7 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp
index b32f4eb4308..02df85c33e8 100644
--- a/backends/vulkan/runtime/VulkanBackend.cpp
+++ b/backends/vulkan/runtime/VulkanBackend.cpp
@@ -499,6 +499,8 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
     compute_graph->encode_prepack();
     compute_graph->prepack();
 
+    // TODO(ssjia): remove this once we can batch compile compute pipelines
+    // during prepare().
     compute_graph->encode_execute();
 
     return Error::Ok;
@@ -567,9 +569,14 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
       }
     }
 
+    // propagate_resize() will re-encode the command buffer so that push
+    // constants are updated and DynamicDispatchNode can update the compute
+    // shader, global workgroup size, and local workgroup size to perform the
+    // model inference.
     if (should_propagate_resize) {
       compute_graph->propagate_resize();
     }
+
     compute_graph->execute();
 
     for (size_t i = 0; i < compute_graph->outputs().size(); i++) {
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index a4a6abdd63f..be9eae352ec 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -678,11 +678,12 @@ void ComputeGraph::encode_execute() {
   }
 }
 
-void ComputeGraph::execute() const {
+void ComputeGraph::execute() {
   vkapi::VulkanFence fence = context_->fences().get_fence();
   context_->submit_cmd_to_gpu(fence.get_submit_handle());
   fence.wait();
   context_->fences().return_fence(fence);
+  execute_count_++;
 }
 
 void ComputeGraph::resize_input(
@@ -696,6 +697,7 @@ void ComputeGraph::propagate_resize() {
   for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {
     node->trigger_resize(this);
   }
+  encode_execute();
 }
 
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
index 9f4bab3ac04..9f56941b184 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.h
+++ b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -187,6 +187,7 @@ class ComputeGraph final {
 
  protected:
   size_t values_in_use_ = 0;
+  size_t execute_count_ = 0;
 
  public:
   //
@@ -745,7 +746,7 @@ class ComputeGraph final {
   //
 
   void encode_execute();
-  void execute() const;
+  void execute();
 
   //
   // Dynamic Shape support
@@ -762,6 +763,10 @@ class ComputeGraph final {
     return context_->adapter_ptr()->supports_int16_shader_types();
   }
 
+  inline size_t execute_count() const {
+    return execute_count_;
+  }
+
   /*
    * Check whether the GPU supports 8 bit buffers.
    */
diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.cpp b/backends/vulkan/runtime/graph/ops/DispatchNode.cpp
index 51ff0c122b0..a0d3a4c2e5c 100644
--- a/backends/vulkan/runtime/graph/ops/DispatchNode.cpp
+++ b/backends/vulkan/runtime/graph/ops/DispatchNode.cpp
@@ -46,15 +46,7 @@ void DispatchNode::encode(ComputeGraph* graph) {
 
   std::unique_lock<std::mutex> cmd_lock = context->dispatch_lock();
 
-  std::array<uint8_t, kMaxPushConstantSize> push_constants_data;
-  uint32_t push_constants_offset = 0;
-
-  for (const auto& push_constant : push_constants_) {
-    push_constants_offset += push_constant.write(
-        push_constants_data.data(),
-        push_constants_offset,
-        kMaxPushConstantSize);
-  }
+  write_push_constant_data();
 
   context->report_shader_dispatch_start(
       shader_.kernel_name,
@@ -63,7 +55,7 @@ void DispatchNode::encode(ComputeGraph* graph) {
       node_id_);
 
   vkapi::DescriptorSet descriptor_set = context->get_descriptor_set(
-      shader_, local_workgroup_size_, spec_vars_, push_constants_offset);
+      shader_, local_workgroup_size_, spec_vars_, push_constants_offset_);
 
   uint32_t idx = 0;
   idx = bind_values_to_descriptor_set(
@@ -76,10 +68,20 @@ void DispatchNode::encode(ComputeGraph* graph) {
       pipeline_barrier,
       shader_,
       global_workgroup_size_,
-      push_constants_data.data(),
-      push_constants_offset);
+      push_constants_data_.data(),
+      push_constants_offset_);
 
   context->report_shader_dispatch_end();
 }
 
+void DispatchNode::write_push_constant_data() {
+  push_constants_offset_ = 0;
+  for (const auto& push_constant : push_constants_) {
+    push_constants_offset_ += push_constant.write(
+        push_constants_data_.data(),
+        push_constants_offset_,
+        kMaxPushConstantSize);
+  }
+}
+
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.h b/backends/vulkan/runtime/graph/ops/DispatchNode.h
index c45f0a741fd..db95adfee39 100644
--- a/backends/vulkan/runtime/graph/ops/DispatchNode.h
+++ b/backends/vulkan/runtime/graph/ops/DispatchNode.h
@@ -50,6 +50,12 @@ class DispatchNode : public ExecuteNode {
   const vkapi::SpecVarList spec_vars_;
   const std::vector<PushConstantDataInfo> push_constants_;
 
+  // For push constants
+  std::array<uint8_t, kMaxPushConstantSize> push_constants_data_{};
+  uint32_t push_constants_offset_ = 0;
+
+  void write_push_constant_data();
+
  public:
   operator bool() const {
     return shader_;
diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h
index 7563fc63c71..0731722e13a 100644
--- a/backends/vulkan/runtime/graph/ops/ExecuteNode.h
+++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.h
@@ -65,7 +65,7 @@ class ExecuteNode {
     (void)graph;
   }
 
-  inline void trigger_resize(ComputeGraph* graph) {
+  virtual inline void trigger_resize(ComputeGraph* graph) {
     if (resize_fn_ != nullptr) {
       resize_fn_(graph, args_, resize_args_);
     }
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index a6475d95d07..f014cc79f56 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -1660,9 +1660,8 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
   for (auto& new_sizes : new_sizes_list) {
     graph.get_tensor(a.value)->virtual_resize(new_sizes);
     graph.get_tensor(b.value)->virtual_resize(new_sizes);
-    graph.get_tensor(c)->virtual_resize(new_sizes);
     graph.get_tensor(d.value)->virtual_resize(new_sizes);
-    graph.get_tensor(e)->virtual_resize(new_sizes);
+    graph.propagate_resize();
 
     float val_a = new_sizes[1] + 4.0f;
     float val_b = new_sizes[2] + 1.5f;

From f95e6cb5aafcd595dc209865925ce2c333f9612f Mon Sep 17 00:00:00 2001
From: Stephen Jia <ssjia@meta.com>
Date: Mon, 2 Jun 2025 15:30:51 -0700
Subject: [PATCH 2/3] [ET-VK][ez] Updates to DynamicDispatchNode

Pull Request resolved: https://github.com/pytorch/executorch/pull/11254

## Changes

For `DynamicDispatchNode`:
* Pass in global work group size to the local work group size determination function
* Add additional constructor for which the shader is not dynamic
* During `encode`, check that pick functions are not `nullptr`

## Motivation

Oftentimes it is useful to know what the global work group size is when determining what the local group group size should be.

## Performance Impact

None.
ghstack-source-id: 287711100
@exported-using-ghexport

Differential Revision: [D75686047](https://our.internmc.facebook.com/intern/diff/D75686047/)
---
 .../runtime/graph/ops/DynamicDispatchNode.cpp | 58 ++++++++++++++++---
 .../runtime/graph/ops/DynamicDispatchNode.h   | 15 +++++
 .../vulkan/test/vulkan_compute_api_test.cpp   | 13 ++++-
 3 files changed, 76 insertions(+), 10 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp b/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp
index ac84916c6fa..a8d2fe2e99d 100644
--- a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp
+++ b/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp
@@ -25,9 +25,9 @@ DynamicDispatchNode::DynamicDispatchNode(
     const ResizeFunction& resize_fn)
     : DispatchNode(
           graph,
-          pick_shader_fn(&graph, args, resize_args),
-          pick_global_wg_fn(&graph, args, resize_args),
-          pick_local_wg_fn(&graph, args, resize_args),
+          vkapi::ShaderInfo(),
+          {1u, 1u, 1u},
+          {1u, 1u, 1u},
           args,
           params,
           push_constants,
@@ -36,13 +36,57 @@ DynamicDispatchNode::DynamicDispatchNode(
           resize_fn),
       pick_shader_fn_(pick_shader_fn),
       pick_global_wg_fn_(pick_global_wg_fn),
+      pick_local_wg_fn_(pick_local_wg_fn) {
+  shader_ = pick_shader_fn(&graph, args, resize_args);
+  global_workgroup_size_ =
+      pick_global_wg_fn(&graph, shader_, args, resize_args);
+  local_workgroup_size_ = utils::WorkgroupSize(pick_local_wg_fn(
+      &graph, shader_, global_workgroup_size_, args, resize_args));
+}
+
+DynamicDispatchNode::DynamicDispatchNode(
+    ComputeGraph& graph,
+    const vkapi::ShaderInfo& shader,
+    const PickGlobalFn& pick_global_wg_fn,
+    const PickLocalFn& pick_local_wg_fn,
+    const std::vector<ArgGroup>& args,
+    const vkapi::ParamsBindList& params,
+    const std::vector<PushConstantDataInfo>& push_constants,
+    const vkapi::SpecVarList& spec_vars,
+    const std::vector<ValueRef>& resize_args,
+    const ResizeFunction& resize_fn)
+    : DispatchNode(
+          graph,
+          shader,
+          pick_global_wg_fn(&graph, shader, args, resize_args),
+          pick_local_wg_fn(
+              &graph,
+              shader,
+              pick_global_wg_fn(&graph, shader, args, resize_args),
+              args,
+              resize_args),
+          args,
+          params,
+          push_constants,
+          spec_vars,
+          resize_args,
+          resize_fn),
+      pick_shader_fn_{nullptr},
+      pick_global_wg_fn_(pick_global_wg_fn),
       pick_local_wg_fn_(pick_local_wg_fn) {}
 
 void DynamicDispatchNode::encode(ComputeGraph* graph) {
-  shader_ = pick_shader_fn_(graph, args_, resize_args_);
-  global_workgroup_size_ = pick_global_wg_fn_(graph, args_, resize_args_);
-  local_workgroup_size_ =
-      utils::WorkgroupSize(pick_local_wg_fn_(graph, args_, resize_args_));
+  if (pick_shader_fn_) {
+    shader_ = pick_shader_fn_(graph, args_, resize_args_);
+  }
+  if (pick_global_wg_fn_) {
+    global_workgroup_size_ =
+        pick_global_wg_fn_(graph, shader_, args_, resize_args_);
+  }
+  if (pick_local_wg_fn_) {
+    local_workgroup_size_ = utils::WorkgroupSize(pick_local_wg_fn_(
+        graph, shader_, global_workgroup_size_, args_, resize_args_));
+  }
   DispatchNode::encode(graph);
 }
 
diff --git a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h b/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h
index ede50941415..005151272c3 100644
--- a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h
+++ b/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h
@@ -32,10 +32,13 @@ class DynamicDispatchNode final : public DispatchNode {
       const std::vector<ValueRef>&)>;
   using PickGlobalFn = const std::function<utils::uvec3(
       ComputeGraph*,
+      const vkapi::ShaderInfo& shader,
       const std::vector<ArgGroup>&,
       const std::vector<ValueRef>&)>;
   using PickLocalFn = const std::function<utils::uvec3(
       ComputeGraph*,
+      const vkapi::ShaderInfo& shader,
+      const utils::uvec3& global_workgroup_size,
       const std::vector<ArgGroup>&,
       const std::vector<ValueRef>&)>;
 
@@ -51,6 +54,18 @@ class DynamicDispatchNode final : public DispatchNode {
       const std::vector<ValueRef>& resize_args,
       const ResizeFunction& resize_fn = nullptr);
 
+  explicit DynamicDispatchNode(
+      ComputeGraph& graph,
+      const vkapi::ShaderInfo& shader,
+      const PickGlobalFn& pick_global_wg_fn,
+      const PickLocalFn& pick_local_wg_fn,
+      const std::vector<ArgGroup>& args,
+      const vkapi::ParamsBindList& params,
+      const std::vector<PushConstantDataInfo>& push_constants,
+      const vkapi::SpecVarList& spec_vars,
+      const std::vector<ValueRef>& resize_args,
+      const ResizeFunction& resize_fn = nullptr);
+
   ~DynamicDispatchNode() override = default;
 
   void encode(ComputeGraph* graph) override;
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index f014cc79f56..60dfb3b8606 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -9,6 +9,7 @@
 #include <gtest/gtest.h>
 
 #include <bitset>
+#include <iomanip>
 #include <utility>
 #include <vector>
 
@@ -3314,17 +3315,23 @@ vkapi::ShaderInfo pick_dynamic_dispatch_shader(
 
 utils::uvec3 pick_dynamic_dispatch_global_wg_size(
     ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
     const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& additional_args) {
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
   const ValueRef out = args[0].refs[0];
-
   return graph->logical_limits_of(out);
 }
 
 utils::uvec3 pick_dynamic_dispatch_local_wg_size(
     ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const utils::uvec3& global_workgroup_size,
     const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& additional_args) {
+    const std::vector<ValueRef>& resize_args) {
+  (void)graph;
+  (void)shader;
+  (void)global_workgroup_size;
   return {64, 1, 1};
 }
 

From c0efc7618b1cfdc2b51608e390b2d1b57368b522 Mon Sep 17 00:00:00 2001
From: Stephen Jia <ssjia@meta.com>
Date: Mon, 2 Jun 2025 15:30:54 -0700
Subject: [PATCH 3/3] [ET-VK] Introduce generalized shaders for transfer ops
 and use it for select and slice

Pull Request resolved: https://github.com/pytorch/executorch/pull/11255

## Changes

* Introduce `transfer_buffer.glsl` and `transfer_texture.glsl`, and `Transfer.cpp` which generalizes shaders where each element of the output is copied from a unique element of the input.
* Update `Slice.cpp` and `Select.cpp` to use `Transfer.cpp`
* Remove old implementations of slice and select

## Motivation

With this new implementation, the op can now support both buffers and textures of any packing. There are also benefits of code consolidation.
ghstack-source-id: 287711104
@exported-using-ghexport

Differential Revision: [D75686050](https://our.internmc.facebook.com/intern/diff/D75686050/)
---
 .../vulkan/runtime/graph/ComputeGraph.cpp     |  18 +-
 backends/vulkan/runtime/graph/ComputeGraph.h  |  20 ++
 .../runtime/graph/ops/glsl/select.glslh       |  74 ++++++
 .../graph/ops/glsl/select_batch_4d.glsl       |  52 ----
 .../graph/ops/glsl/select_channel_3d.glsl     |  50 ----
 .../graph/ops/glsl/select_channel_3d.yaml     |  10 -
 .../graph/ops/glsl/select_channel_4d.glsl     |  65 -----
 .../graph/ops/glsl/select_height_3d.glsl      |  62 -----
 .../graph/ops/glsl/select_height_3d.yaml      |  10 -
 .../graph/ops/glsl/select_height_4d.glsl      |  64 -----
 .../graph/ops/glsl/select_height_4d.yaml      |  10 -
 .../graph/ops/glsl/select_width_3d.glsl       |  63 -----
 .../graph/ops/glsl/select_width_3d.yaml       |  10 -
 .../graph/ops/glsl/select_width_4d.glsl       |  67 -----
 .../graph/ops/glsl/select_width_4d.yaml       |  10 -
 .../vulkan/runtime/graph/ops/glsl/slice.glslh |  53 ++++
 .../graph/ops/glsl/slice_packed_dim.glsl      |  67 -----
 .../graph/ops/glsl/slice_packed_dim.yaml      |  11 -
 .../graph/ops/glsl/slice_unpacked_dim.glsl    |  68 ------
 .../graph/ops/glsl/slice_unpacked_dim.yaml    |  10 -
 .../graph/ops/glsl/transfer_buffer.glsl       |  58 +++++
 ...t_channel_4d.yaml => transfer_buffer.yaml} |   9 +-
 .../graph/ops/glsl/transfer_texture.glsl      |  83 +++++++
 ...ct_batch_4d.yaml => transfer_texture.yaml} |   9 +-
 .../vulkan/runtime/graph/ops/impl/Common.cpp  |  33 +++
 .../vulkan/runtime/graph/ops/impl/Common.h    |  47 ++++
 .../vulkan/runtime/graph/ops/impl/Select.cpp  | 193 ++++++++-------
 .../vulkan/runtime/graph/ops/impl/Slice.cpp   | 230 +++++++-----------
 .../runtime/graph/ops/impl/Transfer.cpp       | 114 +++++++++
 .../vulkan/runtime/graph/ops/impl/Transfer.h  |  40 +++
 backends/vulkan/test/op_tests/cases.py        |   8 +-
 backends/vulkan/test/test_vulkan_delegate.py  |  47 ++++
 32 files changed, 792 insertions(+), 873 deletions(-)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/select.glslh
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.glsl
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.glsl
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.yaml
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.glsl
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_height_3d.glsl
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_height_3d.yaml
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_height_4d.glsl
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_height_4d.yaml
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_width_3d.glsl
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_width_3d.yaml
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_width_4d.glsl
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/select_width_4d.yaml
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/slice.glslh
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.glsl
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.yaml
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.glsl
 delete mode 100644 backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.yaml
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl
 rename backends/vulkan/runtime/graph/ops/glsl/{select_channel_4d.yaml => transfer_buffer.yaml} (54%)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl
 rename backends/vulkan/runtime/graph/ops/glsl/{select_batch_4d.yaml => transfer_texture.yaml} (52%)
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/Common.cpp
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/Common.h
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/Transfer.cpp
 create mode 100644 backends/vulkan/runtime/graph/ops/impl/Transfer.h

diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index be9eae352ec..1222a9fc641 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -492,7 +492,7 @@ vkapi::BufferBindInfo ComputeGraph::get_or_create_int_param_buffer(
     const ValueRef idx) {
   if (values_.at(idx).isInt()) {
     const int32_t val = extract_scalar<int32_t>(idx);
-    create_params_buffer(val);
+    return create_params_buffer(val);
   } else if (values_.at(idx).isSymInt()) {
     SymIntPtr symint = get_symint(idx);
     return vkapi::BufferBindInfo(symint->gpu_buffer.buffer());
@@ -500,6 +500,16 @@ vkapi::BufferBindInfo ComputeGraph::get_or_create_int_param_buffer(
   VK_THROW("Cannot create a int param buffer for the given value");
 }
 
+vkapi::BufferBindInfo ComputeGraph::get_or_create_int_param_buffer(
+    const ValueRef idx,
+    const int32_t default_val) {
+  if (values_.at(idx).isNone()) {
+    return create_params_buffer(default_val);
+  } else {
+    return get_or_create_int_param_buffer(idx);
+  }
+}
+
 void ComputeGraph::set_symint(const ValueRef idx, const int32_t val) {
   get_symint(idx)->set(val);
 }
@@ -693,6 +703,12 @@ void ComputeGraph::resize_input(
   get_tensor(io_val.value)->virtual_resize(new_sizes);
 }
 
+void ComputeGraph::virtual_resize(
+    const ValueRef idx,
+    const std::vector<int64_t>& new_sizes) {
+  get_tensor(idx)->virtual_resize(new_sizes);
+}
+
 void ComputeGraph::propagate_resize() {
   for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {
     node->trigger_resize(this);
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
index 9f56941b184..fe546f26477 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.h
+++ b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -398,6 +398,19 @@ class ComputeGraph final {
   std::optional<T> extract_optional_scalar(const ValueRef idx) {
     if (val_is_none(idx)) {
       return ::std::nullopt;
+    } else if (val_is_symint(idx)) {
+      return utils::safe_downcast<T>(read_symint(idx));
+    } else {
+      return extract_scalar<T>(idx);
+    }
+  }
+
+  template <typename T>
+  T extract_optional_scalar(const ValueRef idx, const T default_val) {
+    if (val_is_none(idx)) {
+      return default_val;
+    } else if (val_is_symint(idx)) {
+      return utils::safe_downcast<T>(read_symint(idx));
     } else {
       return extract_scalar<T>(idx);
     }
@@ -609,6 +622,10 @@ class ComputeGraph final {
    */
   vkapi::BufferBindInfo get_or_create_int_param_buffer(const ValueRef idx);
 
+  vkapi::BufferBindInfo get_or_create_int_param_buffer(
+      const ValueRef idx,
+      const int32_t default_value);
+
   void set_symint(const ValueRef idx, const int32_t val);
 
   int32_t read_symint(const ValueRef idx);
@@ -753,6 +770,9 @@ class ComputeGraph final {
   //
 
   void resize_input(const int64_t idx, const std::vector<int64_t>& new_sizes);
+  void virtual_resize(
+      const ValueRef idx,
+      const std::vector<int64_t>& new_sizes);
   void propagate_resize();
 
   //
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select.glslh b/backends/vulkan/runtime/graph/ops/glsl/select.glslh
new file mode 100644
index 00000000000..3bcbf04a3ba
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/select.glslh
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef SELECT_GLSLH
+#define SELECT_GLSLH
+
+/*
+ * Enable the fast path if a texel loaded from the input texture can be used as
+ * is to store to the output texture. The following conditions must be met:
+ *
+ * 1. The input and output textures have the same packed dimension.
+ * 2. The selected_dim must not be the packed dimension of the input.
+ * 3. The packed dimension of the input must "map" to the packed dimension of
+ *    the output. This occurs if selected_dim is greater than the packed dimension
+ *    of the input.
+ */
+bool can_use_fast_path() {
+  if (out_packed_dim != in_packed_dim) {
+    return false;
+  }
+  if (selected_dim <= in_packed_dim) {
+    return false;
+  }
+  return true;
+}
+
+/*
+ * Given an output tensor index, return the corresponding input tensor index for
+ * the select operator. This is done by "inserting" the select index at the
+ * selected_dim in the input tensor index.
+ *
+ * A simple example is (note all tensor index are in WHCN order):
+ *   out_tidx = [7, 5, 9]
+ *   selected_dim = 2
+ *   index = 3
+ *   in_tidx = [7, 3, 5, 9]
+ *
+ * This function assumes that the following variables are defined in the layout:
+ * - in_sizes
+ * - selected_dim
+ * - index
+ */
+ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) {
+  ivec4 in_tidx = ivec4(0);
+
+  int adjusted_index = index;
+  if (index < 0) {
+    adjusted_index = index + in_sizes[selected_dim];
+  }
+
+  // Handle different dimensions for selection
+  if (selected_dim == 0) {
+    // Select from width dimension
+    in_tidx = ivec4(adjusted_index, out_tidx.x, out_tidx.y, out_tidx.z);
+  } else if (selected_dim == 1) {
+    // Select from height dimension
+    in_tidx = ivec4(out_tidx.x, adjusted_index, out_tidx.y, out_tidx.z);
+  } else if (selected_dim == 2) {
+    // Select from channel dimension
+    in_tidx = ivec4(out_tidx.x, out_tidx.y, adjusted_index, out_tidx.z);
+  } else if (selected_dim == 3) {
+    // Select from batch dimension
+    in_tidx = ivec4(out_tidx.x, out_tidx.y, out_tidx.z, adjusted_index);
+  }
+
+  return in_tidx;
+}
+
+#endif // SELECT_GLSLH
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.glsl
deleted file mode 100644
index f94e1120492..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.glsl
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
-
-layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
-  ivec4 sizes;
-};
-
-layout(set = 0, binding = 4) uniform PRECISION restrict SelectVal {
-  // data.x: index along batch dim to select
-  // data.y: number of batches
-  // data.z: number of texels per batch
-  // data.w: unused
-  ivec4 select_info;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-void main() {
-  const int num_batches = select_info.y;
-  const int num_texel_per_batch = select_info.z;
-  const int index = select_info.x;
-
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-
-  if (any(greaterThanEqual(pos, out_limits))) {
-    return;
-  }
-
-  const uint src_pos_z = (num_texel_per_batch * index) + pos.z;
-  imageStore(
-      image_out, pos, texelFetch(image_in, ivec3(pos.x, pos.y, src_pos_z), 0));
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.glsl
deleted file mode 100644
index 0bbec798484..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.glsl
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_type(DTYPE)}
-#define T ${texel_component_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
-
-layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
-  ivec4 sizes;
-};
-
-// index to select
-layout(set = 0, binding = 4) uniform PRECISION restrict IndexVal {
-  int index;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-
-  if (any(greaterThanEqual(pos, out_limits))) {
-    return;
-  }
-
-  const int tex = index / 4;
-  const int ind = index % 4;
-  const T v = VEC4_T(texelFetch(image_in, ivec3(pos.x, pos.y, tex), 0))[ind];
-
-  imageStore(image_out, ivec3(pos.x, pos.y, 0), VEC4_T(v, 0, 0, 0));
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.yaml
deleted file mode 100644
index 1c5c4e34b06..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-select_channel_3d:
-  parameter_names_with_default_values:
-    DTYPE: float
-    NDIM: 3
-  generate_variant_forall:
-    DTYPE:
-      - VALUE: half
-      - VALUE: float
-  shader_variants:
-    - NAME: select_channel_3d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.glsl
deleted file mode 100644
index 517362f76ea..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.glsl
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
-
-layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
-  ivec4 sizes;
-};
-
-layout(set = 0, binding = 4) uniform PRECISION restrict SelectVal {
-  // data.x: index along channel dim to select
-  // data.y: number of batches
-  // data.z: number of texels per batch
-  // data.w: unused
-  ivec4 select_info;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-
-  if (any(greaterThanEqual(pos, out_limits))) {
-    return;
-  }
-
-  const int num_batches = select_info.y;
-  const int num_texel_per_batch = select_info.z;
-  const int index = select_info.x;
-
-  // read in the same channel from 4 separate batches
-  VEC4_T out_texel = VEC4_T(0, 0, 0, 0);
-  for (int k = 0; k < 4; k++) {
-    if ((k + pos.z * 4) >=
-        num_batches) {
-      break;
-    }
-    const uint src_pos_z = (4 * num_texel_per_batch * pos.z) +
-        (k * num_texel_per_batch) + (index / 4);
-    const uint src_pos_t = index % 4;
-    out_texel[k] =
-        VEC4_T(texelFetch(image_in, ivec3(pos.x, pos.y, src_pos_z), 0))[src_pos_t];
-  }
-
-  imageStore(image_out, pos, out_texel);
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.glsl
deleted file mode 100644
index 87409fb35fd..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.glsl
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
-
-layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
-  ivec4 sizes;
-};
-
-// index to select
-layout(set = 0, binding = 4) uniform PRECISION restrict IndexVal {
-  int index;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-
-  if (any(greaterThanEqual(pos, out_limits))) {
-    return;
-  }
-
-  // w
-  const int src_x = pos.x;
-  // h
-  const int src_y = index;
-  // c
-  const int src_z = pos.y;
-
-  const VEC4_T v = VEC4_T(texelFetch(image_in, ivec3(src_x, src_y, src_z), 0));
-
-  for (int i = 0; i < 4; i++) {
-    ivec3 new_pos = ivec3(pos.x, pos.y * 4 + i, 0);
-
-    // When the C-channel exceeds original block size, exit early
-    if (new_pos.y >= sizes.y) {
-      return;
-    }
-
-    imageStore(image_out, new_pos, VEC4_T(v[i], 0, 0, 0));
-  }
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.yaml
deleted file mode 100644
index a373f1decd9..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-select_height_3d:
-  parameter_names_with_default_values:
-    DTYPE: float
-    NDIM: 3
-  generate_variant_forall:
-    DTYPE:
-      - VALUE: half
-      - VALUE: float
-  shader_variants:
-    - NAME: select_height_3d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.glsl
deleted file mode 100644
index 2e4e2afb2db..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.glsl
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
-
-layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
-  ivec4 sizes;
-};
-
-// index to select
-layout(set = 0, binding = 4) uniform PRECISION restrict IndexVal {
-  // data.x: index along height dim to select
-  // data.y: number of batches
-  // data.z: number of texels per batch
-  // data.w: unused
-  ivec4 select_info;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-
-  if (any(greaterThanEqual(pos, out_limits))) {
-    return;
-  }
-
-  const int num_batches = select_info.y;
-  const int num_texel_per_batch = select_info.z;
-  const int index = select_info.x;
-
-  VEC4_T out_texel = VEC4_T(0, 0, 0, 0);
-  // read in the same channel from 4 separate batches
-  for (int k = 0; k < 4; k++) {
-    if ((k + pos.z * 4) >= num_batches
-        ) { // < 4 batches for this texel, exit early
-      break;
-    }
-    const uint src_pos_z = (pos.z * num_texel_per_batch * 4) +
-        k * num_texel_per_batch + (pos.y / 4);
-    out_texel[k] = VEC4_T(texelFetch(
-        image_in, ivec3(pos.x, index, src_pos_z), 0))[pos.y % 4];
-  }
-  imageStore(image_out, pos, out_texel);
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.yaml
deleted file mode 100644
index c3724f1157a..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-select_height_4d:
-  parameter_names_with_default_values:
-    DTYPE: float
-    NDIM: 3
-  generate_variant_forall:
-    DTYPE:
-      - VALUE: half
-      - VALUE: float
-  shader_variants:
-    - NAME: select_height_4d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.glsl
deleted file mode 100644
index 1e12d15ab21..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.glsl
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
-
-layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
-  ivec4 sizes;
-};
-
-// index to select
-layout(set = 0, binding = 4) uniform PRECISION restrict IndexVal {
-  int index;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-
-  if (any(greaterThanEqual(pos, out_limits))) {
-    return;
-  }
-
-  // w
-  const int src_x = index;
-  // h
-  const int src_y = pos.x;
-  // c
-  const int src_z = pos.y;
-
-  const VEC4_T v = VEC4_T(texelFetch(image_in, ivec3(src_x, src_y, src_z), 0));
-
-  for (int i = 0; i < 4; i++) {
-    ivec3 new_pos = ivec3(pos.x, pos.y * 4 + i, 0);
-
-    // When the C-channel exceeds original block size, exit early
-    if (new_pos.y >= sizes.y) {
-      return;
-    }
-
-    imageStore(image_out, new_pos, VEC4_T(v[i], 0, 0, 0));
-  }
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.yaml
deleted file mode 100644
index a3070bf6ca3..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-select_width_3d:
-  parameter_names_with_default_values:
-    DTYPE: float
-    NDIM: 3
-  generate_variant_forall:
-    DTYPE:
-      - VALUE: half
-      - VALUE: float
-  shader_variants:
-    - NAME: select_width_3d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.glsl
deleted file mode 100644
index ffbd8afbda0..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.glsl
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
-
-layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
-  ivec4 sizes;
-};
-
-// index to select
-layout(set = 0, binding = 4) uniform PRECISION restrict SelectVal {
-  // data.x: index along width dim to select
-  // data.y: number of batches
-  // data.z: number of texels per batch
-  // data.w: unused
-  ivec4 select_info;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-
-  if (any(greaterThanEqual(pos, out_limits))) {
-    return;
-  }
-
-  const int num_batches = select_info.y;
-  const int num_texel_per_batch = select_info.z;
-  const int index = select_info.x;
-
-  //vec4 out_texel = vec4(0, 0, 0, 0);
-  VEC4_T out_texel = VEC4_T(0, 0, 0, 0);
-  // read in the same channel from 4 separate batches
-  for (int k = 0; k < 4; k++) {
-    if ((k + pos.z * 4) >=
-        num_batches) { // < 4 batches for this texel, exit early
-      break;
-    }
-    const uint src_pos_z = (pos.z * num_texel_per_batch * 4) +
-        k * num_texel_per_batch + (pos.y / 4);
-
-    out_texel[k] = VEC4_T(texelFetch(
-        image_in, ivec3(index, pos.x, src_pos_z), 0))[pos.y % 4];
-  }
-  imageStore(image_out, pos, out_texel);
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.yaml
deleted file mode 100644
index f1131d77395..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-select_width_4d:
-  parameter_names_with_default_values:
-    DTYPE: float
-    NDIM: 3
-  generate_variant_forall:
-    DTYPE:
-      - VALUE: half
-      - VALUE: float
-  shader_variants:
-    - NAME: select_width_4d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh
new file mode 100644
index 00000000000..5d4cc70fdc1
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef SLICE_GLSLH
+#define SLICE_GLSLH
+
+/**
+ * Enable the fast path if a texel loaded from the input texture can be used as
+ * is to store to the output texture. The following conditions must be met:
+ *
+ * 1. The input and output textures have the same packed dimension.
+ * 2. The select_dim must not be the packed dimension of the input.
+ */
+bool can_use_fast_path() {
+  if (out_packed_dim != in_packed_dim) {
+    return false;
+  }
+  if (in_packed_dim == selected_dim) {
+    return false;
+  }
+  return true;
+}
+
+/*
+ * Converts output tensor indices to input tensor indices for the slice operation.
+ * This function maps the output indices to the corresponding input indices based on
+ * the slice parameters (start, step, selected_dim).
+ *
+ * Parameters assumed to be defined in the layout specifier:
+ * - in_sizes
+ * - selected_dim
+ * - start
+ * - step
+ */
+ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) {
+  ivec4 in_tidx = out_tidx;
+
+  int adjusted_start = start;
+  if (start < 0) {
+    adjusted_start = start + in_sizes[selected_dim];
+  }
+
+  in_tidx[selected_dim] = adjusted_start + out_tidx[selected_dim] * step;
+
+  return in_tidx;
+}
+
+#endif // SLICE_GLSLH
diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.glsl b/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.glsl
deleted file mode 100644
index 0a6fa31a65f..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.glsl
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
-${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
-${layout_declare_ubo(2, "ivec4", "out_sizes")}
-${layout_declare_ubo(3, "ivec4", "in_sizes")}
-
-layout(set = 0, binding = 4) uniform PRECISION restrict SliceArg {
-  int offset;
-  int step;
-}
-slice_arg;
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
-void main() {
-  const ivec3 out_pos = ivec3(gl_GlobalInvocationID);
-  const ivec4 idx = to_tensor_idx(out_pos, out_sizes, packed_dim);
-
-  if (any(greaterThanEqual(idx, out_sizes))) {
-    return;
-  }
-
-  // We map the output pos using the buffer index.  For each index in the texel,
-  // we calculate the source whcn-coordinate amended with offset-ed channel
-  // value.  Then we calculate the actual texture position from the
-  // whcn-coordinate.
-  const ivec4 buf_indices = tidx_to_nchwi(idx, out_sizes, packed_dim);
-
-  vec4 outex;
-  for (int i=0;i<4;i++) {
-      ivec4 user_coor = nchwi_to_tidx(buf_indices[i], out_sizes);
-
-      int in_dim = user_coor[packed_dim];
-
-      ivec4 in_user_coor = user_coor;
-      in_user_coor[packed_dim] = slice_arg.offset + in_dim * slice_arg.step;
-
-      ivec4 in_pow_elem = to_texture_elem_pos(
-        in_user_coor,
-        in_sizes,
-        packed_dim);
-
-      vec4 v = texelFetch(t_in, in_pow_elem.xyz, 0);
-
-      outex[i] = v[in_pow_elem.w];
-  }
-  imageStore(t_out, out_pos, outex);
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.yaml b/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.yaml
deleted file mode 100644
index 718e7316824..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-slice_packed_dim:
-  parameter_names_with_default_values:
-    DTYPE: float
-    NDIM: 3
-    STORAGE: texture3d
-  generate_variant_forall:
-    DTYPE:
-      - VALUE: half
-      - VALUE: float
-  shader_variants:
-    - NAME: slice_packed_dim
diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.glsl b/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.glsl
deleted file mode 100644
index 54f0bd0b78c..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.glsl
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
-
-layout(set = 0, binding = 2) uniform PRECISION restrict Sizes {
-  ivec4 sizes;
-};
-
-layout(set = 0, binding = 3) uniform PRECISION restrict SliceArg {
-  int dim;
-  int offset;
-  int step;
-  int image_in_channel_size;
-}
-slice_arg;
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-
-  if (pos_out_of_bounds(pos, sizes, packed_dim)) {
-    return;
-  }
-
-  ivec3 in_pos = pos;
-
-  // slice along batch axis
-  if (slice_arg.dim == 3) {
-    // index of the channel inside a batch
-    const int chanl_index = pos.z % slice_arg.image_in_channel_size;
-    // index of batch
-    const int batch_index = pos.z / slice_arg.image_in_channel_size;
-    in_pos.z = (slice_arg.offset + batch_index * slice_arg.step) * slice_arg.image_in_channel_size + chanl_index;
-  } else if (slice_arg.dim == C_DIM) {
-    // index of the channel inside a batch
-    const int chanl_index = pos.z % sizes.z;
-    // index of batch
-    const int batch_index = pos.z / sizes.z;
-    in_pos.z = slice_arg.offset + batch_index * slice_arg.image_in_channel_size + chanl_index * slice_arg.step;
-  } else if (slice_arg.dim == H_DIM) {
-    in_pos.y = slice_arg.offset + pos.y * slice_arg.step;
-  } else {
-    in_pos.x = slice_arg.offset + pos.x * slice_arg.step;
-  }
-
-  imageStore(image_out, pos, texelFetch(image_in, in_pos, 0));
-
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.yaml b/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.yaml
deleted file mode 100644
index 0453bb707b1..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-slice_unpacked_dim:
-  parameter_names_with_default_values:
-    DTYPE: float
-    NDIM: 3
-  generate_variant_forall:
-    DTYPE:
-      - VALUE: half
-      - VALUE: float
-  shader_variants:
-    - NAME: slice_unpacked_dim
diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl
new file mode 100644
index 00000000000..3ca854e0526
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_type(DTYPE)}
+#define T ${buffer_scalar_type(DTYPE)}
+
+${define_active_storage_type("buffer")}
+${define_required_extensions(DTYPE)}
+
+layout(std430) buffer;
+
+#include "indexing_utils.h"
+${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")}
+
+$if OP_NAME == "slice":
+  ${layout_declare_ubo(B, "int", "start")}
+  ${layout_declare_ubo(B, "int", "step")}
+
+$if OP_NAME == "select":
+  ${layout_declare_ubo(B, "int", "index")}
+
+layout(push_constant) uniform restrict Block {
+  ivec4 in_sizes;
+  ivec4 out_strides;
+  ivec4 in_strides;
+  int out_numel;
+  int selected_dim;
+};
+
+${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")}
+${layout_declare_spec_const(C, "int", "in_packed_dim", "DEFAULT_LAYOUT")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+#include "${OP_NAME}.glslh"
+
+void main() {
+  const int out_bufi = ivec3(gl_GlobalInvocationID).x;
+  if (out_bufi >= out_numel) {
+    return;
+  }
+
+  const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim);
+  ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
+
+  const int in_bufi = tidx_to_bufi(in_tidx, in_strides);
+  t_out[out_bufi] = t_in[in_bufi];
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.yaml b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.yaml
similarity index 54%
rename from backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.yaml
rename to backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.yaml
index 6236555f5dd..bdde613c8ce 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.yaml
@@ -1,10 +1,13 @@
-select_channel_4d:
+transfer_buffer:
   parameter_names_with_default_values:
     DTYPE: float
-    NDIM: 3
+    OP_NAME: select
   generate_variant_forall:
     DTYPE:
       - VALUE: half
       - VALUE: float
   shader_variants:
-    - NAME: select_channel_4d
+    - NAME: select_buffer
+      OP_NAME: select
+    - NAME: slice_buffer
+      OP_NAME: slice
diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl
new file mode 100644
index 00000000000..d3e25436c04
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_type(DTYPE)}
+#define T ${buffer_scalar_type(DTYPE)}
+
+${define_active_storage_type("texture3d")}
+${define_required_extensions(DTYPE)}
+
+layout(std430) buffer;
+
+#include "indexing_utils.h"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")}
+
+$if OP_NAME == "slice":
+  ${layout_declare_ubo(B, "int", "start")}
+  ${layout_declare_ubo(B, "int", "step")}
+
+$if OP_NAME == "select":
+  ${layout_declare_ubo(B, "int", "index")}
+
+layout(push_constant) uniform restrict Block {
+  ivec4 out_sizes;
+  ivec4 in_sizes;
+  int selected_dim;
+};
+
+${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
+const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);
+const lowp int out_packed_dim = unhash_packed_dim(out_layout);
+
+${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
+const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
+const lowp int in_packed_dim = unhash_packed_dim(in_layout);
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+#include "${OP_NAME}.glslh"
+
+void main() {
+  const ivec3 lpos = ivec3(gl_GlobalInvocationID);
+  ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim);
+
+  if (any(greaterThanEqual(out_tidx, out_sizes))) {
+    return;
+  }
+
+  if (can_use_fast_path()) {
+    ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
+    ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim);
+    VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos));
+
+    write_texel_lpos(t_out, lpos, in_texel, out_axis_map);
+  }
+  else {
+    VEC4_T out_texel = VEC4_T(0);
+    for (int texel_i = 0; texel_i < 4; ++texel_i) {
+      ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
+      ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim);
+      int element_idx = in_tidx[in_packed_dim] % 4;
+
+      VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos));
+      T selected_value = T(in_texel[element_idx]);
+
+      out_texel[texel_i] = selected_value;
+
+      out_tidx[out_packed_dim]++;
+    }
+
+    write_texel_lpos(t_out, lpos, out_texel, out_axis_map);
+  }
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.yaml b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.yaml
similarity index 52%
rename from backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.yaml
rename to backends/vulkan/runtime/graph/ops/glsl/transfer_texture.yaml
index 9c7d54c8f69..f877ee036e4 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.yaml
@@ -1,10 +1,13 @@
-select_batch_4d:
+transfer_texture:
   parameter_names_with_default_values:
     DTYPE: float
-    NDIM: 3
+    OP_NAME: select
   generate_variant_forall:
     DTYPE:
       - VALUE: half
       - VALUE: float
   shader_variants:
-    - NAME: select_batch_4d
+    - NAME: select_texture3d
+      OP_NAME: select
+    - NAME: slice_texture3d
+      OP_NAME: slice
diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.cpp b/backends/vulkan/runtime/graph/ops/impl/Common.cpp
new file mode 100644
index 00000000000..4de099231d3
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Common.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+
+namespace vkcompute {
+
+utils::uvec3 default_pick_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& additional_args) {
+  (void)shader;
+  const ValueRef out = args.at(0).refs.at(0);
+  return graph->create_global_wg_size(out);
+}
+
+utils::uvec3 default_pick_local_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const utils::uvec3& global_workgroup_size,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& additional_args) {
+  (void)shader;
+  return graph->create_local_wg_size(global_workgroup_size);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.h b/backends/vulkan/runtime/graph/ops/impl/Common.h
new file mode 100644
index 00000000000..d5ff455ae41
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Common.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/ExecuteNode.h>
+
+namespace vkcompute {
+
+/**
+ * Creates a global workgroup size based on the first output tensor in the args.
+ * This is a utility function that extracts the output tensor from
+ * args.at(0).refs.at(0) and calls graph->create_global_wg_size(out) on it.
+ *
+ * @param graph The ComputeGraph instance
+ * @param args Vector of ArgGroup containing the output tensor reference
+ * @return utils::uvec3 The global workgroup size
+ */
+utils::uvec3 default_pick_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& additional_args);
+
+/**
+ * Creates a local workgroup size based on the first output tensor in the args.
+ * This is a utility function that extracts the output tensor from
+ * args.at(0).refs.at(0) and calls graph->create_local_wg_size(out) on it.
+ *
+ * @param graph The ComputeGraph instance
+ * @param args Vector of ArgGroup containing the output tensor reference
+ * @return utils::uvec3 The local workgroup size
+ */
+utils::uvec3 default_pick_local_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const utils::uvec3& global_workgroup_size,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& additional_args);
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Select.cpp b/backends/vulkan/runtime/graph/ops/impl/Select.cpp
index a83e986e414..69d49e8283b 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Select.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Select.cpp
@@ -8,129 +8,136 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
-#include <executorch/backends/vulkan/runtime/api/api.h>
-#include <executorch/backends/vulkan/runtime/graph/Logging.h>
-
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Transfer.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
 
-void check_args(
-    const api::vTensor& t_in,
-    int64_t dim,
-    int64_t index,
-    const api::vTensor& t_out) {
-  VK_CHECK_COND(check_packed_dim_is(t_in, WHCN::kChannelsDim));
-  VK_CHECK_COND(check_packed_dim_is(t_out, WHCN::kChannelsDim));
+void resize_select_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  ValueRef out = args.at(0).refs.at(0);
+  ValueRef in = args.at(1).refs.at(0);
+  int64_t dim = graph->extract_scalar<int64_t>(extra_args.at(0));
 
-  const int64_t in_dim = t_in.dim();
-  VK_CHECK_COND(
-      in_dim == 3 || in_dim == 4,
-      "Vulkan select only support 3d or 4d tensors!");
-
-  const int64_t in_size = t_in.size(dim);
-
-  if (index < -in_size || index >= in_size) {
-    VK_CHECK_COND(
-        false,
-        "select(): index ",
-        index,
-        " t_outof range for tensor of size ",
-        in_size,
-        " at dimension ",
-        dim);
+  int64_t in_ndim = graph->dim_of(in);
+
+  if (dim < 0) {
+    dim += in_ndim;
+  }
+
+  std::vector<int64_t> new_out_sizes;
+  for (int64_t i = 0; i < in_ndim; ++i) {
+    if (i != dim) {
+      new_out_sizes.push_back(graph->size_at<int64_t>(i, in));
+    }
   }
+
+  graph->virtual_resize(out, new_out_sizes);
 }
 
-void add_select_int_node(
+void check_select_args(
     ComputeGraph& graph,
     const ValueRef in,
     const ValueRef dim_ref,
     const ValueRef index_ref,
     const ValueRef out) {
-  vTensorPtr t_in = graph.get_tensor(in);
-  vTensorPtr t_out = graph.get_tensor(out);
   int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
-  int64_t index = graph.extract_scalar<int64_t>(index_ref);
+  int64_t index = graph.extract_optional_scalar<int64_t>(index_ref, 0);
+  int64_t in_ndim = graph.dim_of(in);
 
-  check_args(*t_in, dim, index, *t_out);
+  if (dim < 0) {
+    dim += in_ndim;
+  }
 
-  const int64_t in_size = t_in->size(dim);
+  VK_CHECK_COND(
+      dim >= 0 && dim < in_ndim,
+      "Dimension out of range (expected to be in range of [",
+      -in_ndim,
+      ", ",
+      in_ndim - 1,
+      "], but got ",
+      dim,
+      ")");
+
+  const int64_t in_size_at_dim = graph.size_at<int64_t>(dim, in);
 
   if (index < 0) {
-    index += in_size;
+    index += in_size_at_dim;
   }
 
-  std::string kernel_name;
-
-  // for 3d tensors, these values are not used by the shader.
-  int32_t num_texel_per_batch = 1;
-  int32_t num_batches = 1;
-
-  int64_t in_dim = t_in->dim();
-  if (in_dim == 3) {
-    if (dim == 0) {
-      kernel_name = "select_channel_3d";
-    } else if (dim == 1) {
-      kernel_name = "select_height_3d";
-    } else if (dim == 2) {
-      kernel_name = "select_width_3d";
-    } else {
-      VK_CHECK_COND(
-          false, "Unexpected dim value=", dim, "for the input 3d tensor");
-    }
-  } else { // self.dim() == 4
-    num_texel_per_batch =
-        static_cast<int32_t>(std::ceil(static_cast<float>(t_in->size(1)) / 4));
-    num_batches = t_in->size(0);
-    if (dim == 0) {
-      kernel_name = "select_batch_4d";
-    } else if (dim == 1) {
-      kernel_name = "select_channel_4d";
-    } else if (dim == 2) {
-      kernel_name = "select_height_4d";
-    } else if (dim == 3) {
-      kernel_name = "select_width_4d";
-    } else {
+  VK_CHECK_COND(
+      index >= 0 && index < in_size_at_dim,
+      "select(): index ",
+      index,
+      " out of range for tensor of size ",
+      in_size_at_dim,
+      " at dimension ",
+      dim);
+
+  // Check that output tensor has correct dimensions
+  int64_t out_dim = graph.dim_of(out);
+  VK_CHECK_COND(
+      out_dim == in_ndim - 1,
+      "Output tensor dimension mismatch (expected ",
+      in_size_at_dim - 1,
+      ", but got ",
+      out_dim,
+      ")");
+
+  // Check that output tensor has correct sizes
+  int64_t out_idx = 0;
+  for (int64_t i = 0; i < in_size_at_dim; ++i) {
+    if (i != dim) {
       VK_CHECK_COND(
-          false, "Unexpected dim value=", dim, "for the input 4d tensor");
+          graph.size_at<int64_t>(out_idx, out) == graph.size_at<int64_t>(i, in),
+          "Output size mismatch at dimension ",
+          out_idx,
+          " (expected ",
+          graph.size_at<int16_t>(i, in),
+          ", but got ",
+          graph.size_at<int64_t>(out_idx, out),
+          ")");
+      out_idx++;
     }
   }
+}
 
-  kernel_name.reserve(kShaderNameReserve);
-  add_dtype_suffix(kernel_name, *t_out);
+/**
+ * Adds a select operation node to the compute graph.
+ *
+ * The select operator extracts a slice from a tensor along a specified
+ * dimension at a given index. It effectively reduces the dimensionality of the
+ * input tensor by one, by selecting a single slice at the specified index along
+ * the given dimension. For example, if input is a 3D tensor with shape [2,3,4]
+ * and we select dimension 1, index 2, the output will be a 2D tensor with shape
+ * [2,4].
+ */
+void add_select_copy_node(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef dim_ref,
+    const ValueRef index_ref,
+    const ValueRef out) {
+  check_select_args(graph, in, dim_ref, index_ref, out);
 
-  // TODO: add resizing to support dynamic shapes.
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  add_transfer_copy_node(
       graph,
-      VK_KERNEL_FROM_STR(kernel_name),
-      graph.create_global_wg_size(out),
-      graph.create_local_wg_size(out),
-      // Inputs and Outputs
-      {{out, vkapi::MemoryAccessType::WRITE},
-       {in, vkapi::MemoryAccessType::READ}},
-      // Parameter buffers
-      {t_out->logical_limits_ubo(),
-       t_out->sizes_ubo(),
-       // TODO: num_batches and num_texel_per_batch are provided by
-       // t_out->sizes. Can change the following to reduce params
-       // created.
-       graph.create_params_buffer(
-           utils::make_ivec4({index, num_batches, num_texel_per_batch, 0}))},
-      // Push Constants
-      {},
-      // Specialization Constants
-      {},
-      // Resize Args
-      {},
-      // Resizing Logic
-      nullptr));
+      TransferType::SELECT,
+      in,
+      dim_ref,
+      index_ref,
+      kDummyValueRef,
+      kDummyValueRef,
+      out,
+      {dim_ref, index_ref},
+      resize_select_node);
 }
 
 void select_int(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  return add_select_int_node(graph, args[0], args[1], args[2], args[3]);
+  return add_select_copy_node(graph, args[0], args[1], args[2], args[3]);
 }
 
 REGISTER_OPERATORS {
diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp
index c40e16f7c0a..67d714d10aa 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp
@@ -8,12 +8,10 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
-#include <executorch/backends/vulkan/runtime/graph/Logging.h>
-
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Slice.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Transfer.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
@@ -33,127 +31,73 @@ inline int64_t normalize_idx(
   return normalize(index, max);
 }
 
-void add_slice_tensor_copy_node(
-    ComputeGraph& graph,
-    ValueRef in,
-    ValueRef dim_ref,
-    ValueRef opt_start_ref,
-    ValueRef opt_end_ref,
-    ValueRef step_ref,
-    ValueRef out) {
-  vTensorPtr t_in = graph.get_tensor(in);
-  vTensorPtr t_out = graph.get_tensor(out);
-
-  VK_CHECK_COND(check_same_packed_dim(*t_in, *t_out));
-
-  // Need normalize the dim
-  int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
-
-  VK_CHECK_COND(
-      -t_in->dim() <= dim && dim < t_in->dim(),
-      "dim must be in range of [-self.dim(), self.dim()), but current dim's value is ",
-      dim,
-      " and self.dim() = ",
-      t_in->dim());
-
-  dim = normalize(dim, t_in->dim());
-
-  DimIndex dim_index = normalize_to_dim_index(*t_in, dim);
+void resize_slice_copy_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  ValueRef out_ref = args.at(0).refs.at(0);
+  ValueRef in_ref = args.at(1).refs.at(0);
 
+  int64_t dim = graph->extract_scalar<int64_t>(extra_args.at(0));
   std::optional<int64_t> opt_start =
-      graph.extract_optional_scalar<int64_t>(opt_start_ref);
+      graph->extract_optional_scalar<int64_t>(extra_args.at(1));
   std::optional<int64_t> opt_end =
-      graph.extract_optional_scalar<int64_t>(opt_end_ref);
-  int64_t step = graph.extract_scalar<int64_t>(step_ref);
-
-  const auto in_sizes = t_in->sizes();
-  const auto out_sizes = t_out->sizes();
-
-  int64_t start = opt_start.value_or(0);
-  int64_t end = opt_end.value_or(in_sizes[dim]);
+      graph->extract_optional_scalar<int64_t>(extra_args.at(2));
+  int64_t step = graph->extract_scalar<int64_t>(extra_args.at(3));
 
-  start = normalize_idx(start, in_sizes[dim], 0);
-  end = normalize_idx(end, in_sizes[dim], in_sizes[dim]);
+  // Normalize dim
+  if (dim < 0) {
+    dim += graph->dim_of(in_ref);
+  }
 
-  const vkapi::SpecVarList spec_vars = {t_in->packed_dim()};
+  const std::vector<int64_t> in_sizes = graph->sizes_of(in_ref);
+  int64_t dim_size = in_sizes.at(dim);
 
-  const auto packed_dim_idx =
-      static_cast<DimIndex>(DimIndex::DIM_LAST - t_in->packed_dim());
+  int64_t start = opt_start.value_or(0);
+  int64_t end = opt_end.value_or(dim_size);
 
-  // if slice dim is the same as the packed dim, we can use the channel slice
-  if (dim_index == packed_dim_idx) {
-    // slice by channel
-    std::string kernel_name = "slice_packed_dim";
-    kernel_name.reserve(kShaderNameReserve);
-    add_dtype_suffix(kernel_name, *t_out);
+  // Normalize start and end indices
+  start = normalize_idx(start, dim_size, 0);
+  end = normalize_idx(end, dim_size, dim_size);
 
-    const struct Block final {
-      int offset;
-      int step;
-    } params{
-        static_cast<int32_t>(start),
-        static_cast<int32_t>(step),
-    };
+  // Calculate output size
+  std::vector<int64_t> new_out_sizes = in_sizes;
+  new_out_sizes.at(dim) = (end - start + step - 1) / step; // Ceiling division
 
-    graph.execute_nodes().emplace_back(new DispatchNode(
-        graph,
-        VK_KERNEL_FROM_STR(kernel_name),
-        graph.create_global_wg_size(out),
-        graph.create_local_wg_size(out),
-        {{out, vkapi::MemoryAccessType::WRITE},
-         {in, vkapi::MemoryAccessType::READ}},
-        {t_out->sizes_ubo(),
-         t_in->sizes_ubo(),
-         graph.create_params_buffer(params)},
-        {},
-        spec_vars,
-        {},
-        nullptr));
-
-  } else {
-    // GPU's coordinate is in x = 0, y = 1, z = 2, w = 3
-    const int64_t gpu_dim = -(dim_index + 1);
-    // stride of input tensor's channel dimension
-    int64_t in_channel_stride = dim_at(in_sizes, kChannel4D);
-    VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step));
-
-    // Due to channel packing, each batch value is span over stride planes
-    if (dim_index == kBatch4D && packed_dim_idx == kChannel4D) {
-      in_channel_stride = utils::div_up_4(in_channel_stride);
-    }
+  graph->virtual_resize(out_ref, new_out_sizes);
+}
 
-    std::string kernel_name = "slice_unpacked_dim";
-    kernel_name.reserve(kShaderNameReserve);
-    add_dtype_suffix(kernel_name, *t_out);
-
-    utils::uvec3 global_size = t_out->logical_limits();
-    utils::uvec3 local_size = graph.create_local_wg_size(global_size);
-
-    const struct Block final {
-      int dim;
-      int offset;
-      int step;
-      int stride;
-    } params{
-        static_cast<int32_t>(gpu_dim),
-        static_cast<int32_t>(start),
-        static_cast<int32_t>(step),
-        static_cast<int32_t>(in_channel_stride),
-    };
-
-    graph.execute_nodes().emplace_back(new DispatchNode(
-        graph,
-        VK_KERNEL_FROM_STR(kernel_name),
-        global_size,
-        local_size,
-        {{out, vkapi::MemoryAccessType::WRITE},
-         {in, vkapi::MemoryAccessType::READ}},
-        {t_out->sizes_ubo(), graph.create_params_buffer(params)},
-        {},
-        spec_vars,
-        {},
-        nullptr));
-  }
+/**
+ * Adds a slice_copy operation node to the compute graph.
+ *
+ * The slice operator extracts a portion of a tensor along a specified
+ * dimension. It creates a new tensor that contains a subset of the input
+ * tensor's data, defined by start, end, and step parameters along the given
+ * dimension.
+ *
+ * For example, if input is a tensor with shape [4,5,6] and we slice along
+ * dimension 1 with start=1, end=4, step=2, the output will have shape [4,2,6],
+ * containing elements from the input at positions 1 and 3 along dimension 1.
+ */
+void add_slice_copy_node(
+    ComputeGraph& graph,
+    ValueRef in,
+    ValueRef dim_ref,
+    ValueRef opt_start_ref,
+    ValueRef opt_end_ref,
+    ValueRef step_ref,
+    ValueRef out) {
+  add_transfer_copy_node(
+      graph,
+      TransferType::SLICE,
+      in,
+      dim_ref,
+      opt_start_ref,
+      opt_end_ref,
+      step_ref,
+      out,
+      {dim_ref, opt_start_ref, opt_end_ref, step_ref},
+      resize_slice_copy_node);
 }
 
 std::vector<int64_t> get_slice_sizes(
@@ -186,16 +130,16 @@ void resize_slice_view_node(
     const std::vector<ArgGroup>& args,
     const std::vector<ValueRef>& extra_args) {
   (void)args;
-  vTensorPtr out = graph->get_tensor(extra_args[0]);
+  ValueRef out_ref = extra_args.at(0);
 
   std::vector<int64_t> new_out_sizes = get_slice_sizes(
       *graph,
-      extra_args[1], // input
-      extra_args[2], // dim
-      extra_args[3], // optional start
-      extra_args[4]); // optional end
+      extra_args.at(1), // input
+      extra_args.at(2), // dim
+      extra_args.at(3), // optional start
+      extra_args.at(4)); // optional end
 
-  out->virtual_resize(new_out_sizes);
+  graph->virtual_resize(out_ref, new_out_sizes);
 }
 
 void check_slice_view_args(
@@ -267,54 +211,54 @@ void add_slice_view_node(
   std::vector<int64_t> new_out_sizes =
       get_slice_sizes(graph, in_ref, dim_ref, opt_start_ref, opt_end_ref);
 
-  graph.get_tensor(out_ref)->virtual_resize(new_out_sizes);
+  graph.virtual_resize(out_ref, new_out_sizes);
 
   graph.execute_nodes().emplace_back(new ExecuteNode(
       resize_slice_view_node,
       {out_ref, in_ref, dim_ref, opt_start_ref, opt_end_ref, opt_step_ref}));
 }
 
-void slice_tensor_copy(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  return add_slice_tensor_copy_node(
+void slice_copy(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  return add_slice_copy_node(
       graph,
-      args[0],
-      args[1], // dim
-      args[2], // optional start
-      args[3], // optional end
-      args[4], // step
-      args[5]);
+      args.at(0),
+      args.at(1), // dim
+      args.at(2), // optional start
+      args.at(3), // optional end
+      args.at(4), // step
+      args.at(5));
 }
 
-void slice_tensor(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  ValueRef in = args[0];
-  ValueRef out = args[5];
+void slice(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  ValueRef in = args.at(0);
+  ValueRef out = args.at(5);
 
   // Special case if out is a view of in
   if (graph.val_is_view_of(out, in)) {
     add_slice_view_node(
         graph,
         in,
-        args[1], // dim
-        args[2], // optional start
-        args[3], // optional end
-        args[4], // step
+        args.at(1), // dim
+        args.at(2), // optional start
+        args.at(3), // optional end
+        args.at(4), // step
         out);
     return;
   }
 
-  add_slice_tensor_copy_node(
+  add_slice_copy_node(
       graph,
       in,
-      args[1], // dim
-      args[2], // optional start
-      args[3], // optional end
-      args[4], // step
+      args.at(1), // dim
+      args.at(2), // optional start
+      args.at(3), // optional end
+      args.at(4), // step
       out);
 }
 
 REGISTER_OPERATORS {
-  VK_REGISTER_OP(aten.slice_copy.Tensor, slice_tensor_copy);
-  VK_REGISTER_OP(aten.slice.Tensor, slice_tensor);
+  VK_REGISTER_OP(aten.slice_copy.Tensor, slice_copy);
+  VK_REGISTER_OP(aten.slice.Tensor, slice);
 }
 
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp
new file mode 100644
index 00000000000..423c9789d67
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Transfer.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+/**
+ * Adds a transfer copy operation node to the compute graph.
+ * This function handles both SELECT and SLICE operations based on the
+ * transfer_type parameter.
+ */
+void add_transfer_copy_node(
+    ComputeGraph& graph,
+    TransferType transfer_type,
+    const ValueRef in,
+    const ValueRef dim_ref,
+    const ValueRef index_or_start_ref,
+    const ValueRef end_ref,
+    const ValueRef step_ref,
+    const ValueRef out,
+    const std::vector<ValueRef>& resize_args,
+    const ExecuteNode::ResizeFunction& resize_fn) {
+  int64_t ndim = graph.dim_of(in);
+  int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
+
+  if (dim < 0) {
+    dim += ndim;
+  }
+
+  int64_t dim_whcn = nchw_dim_to_whcn_dim(dim, ndim);
+
+  vkapi::ParamsBindList param_buffers;
+  if (transfer_type == TransferType::SELECT) {
+    param_buffers = {
+        graph.get_or_create_int_param_buffer(index_or_start_ref, 0)};
+  } else { // TransferType::SLICE
+    param_buffers = {
+        graph.get_or_create_int_param_buffer(index_or_start_ref, 0),
+        graph.get_or_create_int_param_buffer(step_ref, 1)};
+  }
+
+  const struct TransferParams {
+    const int32_t dim;
+  } transfer_params{static_cast<int32_t>(dim_whcn)};
+
+  std::vector<PushConstantDataInfo> push_constants;
+  vkapi::SpecVarList spec_vars;
+
+  if (graph.is_buffer_storage(out)) {
+    push_constants = {
+        graph.sizes_pc_of(in),
+        graph.strides_pc_of(out),
+        graph.strides_pc_of(in),
+        graph.numel_pc_of(out),
+        PushConstantDataInfo(&transfer_params, sizeof(transfer_params))};
+
+    spec_vars = {
+        graph.packed_dim_of(out),
+        graph.packed_dim_of(in),
+    };
+  } else {
+    push_constants = {
+        graph.sizes_pc_of(out),
+        graph.sizes_pc_of(in),
+        PushConstantDataInfo(&transfer_params, sizeof(transfer_params))};
+
+    spec_vars = {
+        graph.hashed_layout_of(out),
+        graph.hashed_layout_of(in),
+    };
+  }
+
+  // Determine the shader directly
+  std::string kernel_name;
+  if (transfer_type == TransferType::SELECT) {
+    kernel_name = "select";
+  } else { // TransferType::SLICE
+    kernel_name = "slice";
+  }
+  add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
+  add_dtype_suffix(kernel_name, graph.dtype_of(out));
+
+  // Create and add the dispatch node
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      default_pick_global_wg_size,
+      default_pick_local_wg_size,
+      // Inputs and Outputs
+      {{out, vkapi::kWrite}, {in, vkapi::kRead}},
+      // Parameter buffers
+      param_buffers,
+      // Push Constants
+      push_constants,
+      // Specialization Constants
+      spec_vars,
+      // Resize Args
+      resize_args,
+      // Resizing Logic
+      resize_fn));
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Transfer.h b/backends/vulkan/runtime/graph/ops/impl/Transfer.h
new file mode 100644
index 00000000000..09aae144994
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Transfer.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/vulkan/runtime/api/api.h>
+#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/ExecuteNode.h>
+
+namespace vkcompute {
+
+enum class TransferType { SELECT, SLICE };
+
+/**
+ * Adds a transfer copy operation node to the compute graph, which implements
+ * operators for which each element of the output tensor maps to a unique
+ * element of the input tensor.
+ *
+ * This function currently handles the following operations:
+ * - select
+ * - slice
+ */
+void add_transfer_copy_node(
+    ComputeGraph& graph,
+    TransferType transfer_type,
+    const ValueRef in,
+    const ValueRef dim_ref,
+    const ValueRef index_or_start_ref,
+    const ValueRef end_ref,
+    const ValueRef step_ref,
+    const ValueRef out,
+    const std::vector<ValueRef>& resize_args,
+    const ExecuteNode::ResizeFunction& resize_fn = nullptr);
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index 40e86a13b42..bf6e9683ef7 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -499,7 +499,9 @@ def get_ones_inputs():
 def get_select_int_inputs():
     test_suite = VkTestSuite(
         [
-            ((6, 2, 7), 0, 3),
+            ((8, 8, 8), 0, -2),
+            ((8, 8, 8), 1, -3),
+            ((8, 8, 8), 2, -4),
             ((6, 2, 7), 1, 0),
             ((6, 2, 7), 2, 3),
             ((6, 10, 7), 0, 3),
@@ -515,6 +517,10 @@ def get_select_int_inputs():
             ((8, 6, 1, 1), 1, 4),
         ]
     )
+    test_suite.layouts = ["utils::kWidthPacked", "utils::kChannelsPacked"]
+    test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"]
+    test_suite.dtypes = ["at::kFloat"]
+    test_suite.data_gen = "make_seq_tensor"
     return test_suite
 
 
diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py
index 80ead02de9f..447e5d039f4 100644
--- a/backends/vulkan/test/test_vulkan_delegate.py
+++ b/backends/vulkan/test/test_vulkan_delegate.py
@@ -1842,3 +1842,50 @@ def forward(self, x):
             dynamic_shapes=dynamic_shapes,
             test_inputs=test_inputs,
         )
+
+    def test_select_last_height_dynamic_shapes(self):
+        """
+        Test selecting the last element along the height dimension with dynamic shapes.
+        The height dimension (dim=1) is variable.
+        """
+
+        class SelectLastHeightModule(torch.nn.Module):
+            """
+            Module that selects the last element along the height dimension (dim=1) of a 3D tensor.
+            This is equivalent to the operation: x[:, -1, :]
+            """
+
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x):
+                # Select the last element along dimension 1 (height)
+                return x[:, -1, :]
+
+        # Create the module
+        module = SelectLastHeightModule()
+
+        # Create sample inputs with a specific shape
+        # Shape: [batch_size, height, width]
+        sample_inputs = (torch.arange(1, 61).reshape(2, 10, 3).float(),)
+
+        # Define dynamic shapes for the height dimension
+        height = Dim("height", min=1, max=10)
+        dynamic_shapes = {"x": {1: height}}
+
+        # Create test inputs with different heights
+        test_inputs = [
+            (torch.arange(1, 7).reshape(2, 1, 3).float(),),  # Minimum height
+            (torch.arange(1, 19).reshape(2, 3, 3).float(),),  # Small height
+            (torch.arange(1, 43).reshape(2, 7, 3).float(),),  # Medium height
+            (torch.arange(1, 31).reshape(2, 5, 3).float(),),  # Maximum height
+        ]
+
+        # Use the testing infrastructure from TestVulkanBackend
+        test_backend = TestVulkanBackend()
+        test_backend.lower_module_and_test_output(
+            module,
+            sample_inputs,
+            dynamic_shapes=dynamic_shapes,
+            test_inputs=test_inputs,
+        )