diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp
index b32f4eb4308..02df85c33e8 100644
--- a/backends/vulkan/runtime/VulkanBackend.cpp
+++ b/backends/vulkan/runtime/VulkanBackend.cpp
@@ -499,6 +499,8 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
     compute_graph->encode_prepack();
     compute_graph->prepack();
 
+    // TODO(ssjia): remove this once we can batch compile compute pipelines
+    // during prepare().
     compute_graph->encode_execute();
 
     return Error::Ok;
@@ -567,9 +569,14 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
       }
     }
 
+    // propagate_resize() will re-encode the command buffer so that push
+    // constants are updated and DynamicDispatchNode can update the compute
+    // shader, global workgroup size, and local workgroup size to perform the
+    // model inference.
     if (should_propagate_resize) {
       compute_graph->propagate_resize();
     }
+
     compute_graph->execute();
 
     for (size_t i = 0; i < compute_graph->outputs().size(); i++) {
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index a4a6abdd63f..1222a9fc641 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -492,7 +492,7 @@ vkapi::BufferBindInfo ComputeGraph::get_or_create_int_param_buffer(
     const ValueRef idx) {
   if (values_.at(idx).isInt()) {
     const int32_t val = extract_scalar<int32_t>(idx);
-    create_params_buffer(val);
+    return create_params_buffer(val);
   } else if (values_.at(idx).isSymInt()) {
     SymIntPtr symint = get_symint(idx);
     return vkapi::BufferBindInfo(symint->gpu_buffer.buffer());
@@ -500,6 +500,16 @@ vkapi::BufferBindInfo ComputeGraph::get_or_create_int_param_buffer(
   VK_THROW("Cannot create a int param buffer for the given value");
 }
 
+vkapi::BufferBindInfo ComputeGraph::get_or_create_int_param_buffer(
+    const ValueRef idx,
+    const int32_t default_val) {
+  if (values_.at(idx).isNone()) {
+    return create_params_buffer(default_val);
+  } else {
+    return get_or_create_int_param_buffer(idx);
+  }
+}
+
 void ComputeGraph::set_symint(const ValueRef idx, const int32_t val) {
   get_symint(idx)->set(val);
 }
@@ -678,11 +688,12 @@ void ComputeGraph::encode_execute() {
   }
 }
 
-void ComputeGraph::execute() const {
+void ComputeGraph::execute() {
   vkapi::VulkanFence fence = context_->fences().get_fence();
   context_->submit_cmd_to_gpu(fence.get_submit_handle());
   fence.wait();
   context_->fences().return_fence(fence);
+  execute_count_++;
 }
 
 void ComputeGraph::resize_input(
@@ -692,10 +703,17 @@ void ComputeGraph::resize_input(
   get_tensor(io_val.value)->virtual_resize(new_sizes);
 }
 
+void ComputeGraph::virtual_resize(
+    const ValueRef idx,
+    const std::vector<int64_t>& new_sizes) {
+  get_tensor(idx)->virtual_resize(new_sizes);
+}
+
 void ComputeGraph::propagate_resize() {
   for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {
     node->trigger_resize(this);
   }
+  encode_execute();
 }
 
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
index 9f4bab3ac04..fe546f26477 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.h
+++ b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -187,6 +187,7 @@ class ComputeGraph final {
 
  protected:
   size_t values_in_use_ = 0;
+  size_t execute_count_ = 0;
 
  public:
   //
@@ -397,6 +398,19 @@ class ComputeGraph final {
   std::optional<T> extract_optional_scalar(const ValueRef idx) {
     if (val_is_none(idx)) {
       return ::std::nullopt;
+    } else if (val_is_symint(idx)) {
+      return utils::safe_downcast<T>(read_symint(idx));
+    } else {
+      return extract_scalar<T>(idx);
+    }
+  }
+
+  template <typename T>
+  T extract_optional_scalar(const ValueRef idx, const T default_val) {
+    if (val_is_none(idx)) {
+      return default_val;
+    } else if (val_is_symint(idx)) {
+      return utils::safe_downcast<T>(read_symint(idx));
     } else {
       return extract_scalar<T>(idx);
     }
@@ -608,6 +622,10 @@ class ComputeGraph final {
    */
   vkapi::BufferBindInfo get_or_create_int_param_buffer(const ValueRef idx);
 
+  vkapi::BufferBindInfo get_or_create_int_param_buffer(
+      const ValueRef idx,
+      const int32_t default_value);
+
   void set_symint(const ValueRef idx, const int32_t val);
 
   int32_t read_symint(const ValueRef idx);
@@ -745,13 +763,16 @@ class ComputeGraph final {
   //
 
   void encode_execute();
-  void execute() const;
+  void execute();
 
   //
   // Dynamic Shape support
   //
 
   void resize_input(const int64_t idx, const std::vector<int64_t>& new_sizes);
+  void virtual_resize(
+      const ValueRef idx,
+      const std::vector<int64_t>& new_sizes);
   void propagate_resize();
 
   //
@@ -762,6 +783,10 @@ class ComputeGraph final {
     return context_->adapter_ptr()->supports_int16_shader_types();
   }
 
+  inline size_t execute_count() const {
+    return execute_count_;
+  }
+
   /*
    * Check whether the GPU supports 8 bit buffers.
    */
diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.cpp b/backends/vulkan/runtime/graph/ops/DispatchNode.cpp
index 51ff0c122b0..a0d3a4c2e5c 100644
--- a/backends/vulkan/runtime/graph/ops/DispatchNode.cpp
+++ b/backends/vulkan/runtime/graph/ops/DispatchNode.cpp
@@ -46,15 +46,7 @@ void DispatchNode::encode(ComputeGraph* graph) {
 
   std::unique_lock<std::mutex> cmd_lock = context->dispatch_lock();
 
-  std::array<uint8_t, kMaxPushConstantSize> push_constants_data;
-  uint32_t push_constants_offset = 0;
-
-  for (const auto& push_constant : push_constants_) {
-    push_constants_offset += push_constant.write(
-        push_constants_data.data(),
-        push_constants_offset,
-        kMaxPushConstantSize);
-  }
+  write_push_constant_data();
 
   context->report_shader_dispatch_start(
       shader_.kernel_name,
@@ -63,7 +55,7 @@ void DispatchNode::encode(ComputeGraph* graph) {
       node_id_);
 
   vkapi::DescriptorSet descriptor_set = context->get_descriptor_set(
-      shader_, local_workgroup_size_, spec_vars_, push_constants_offset);
+      shader_, local_workgroup_size_, spec_vars_, push_constants_offset_);
 
   uint32_t idx = 0;
   idx = bind_values_to_descriptor_set(
@@ -76,10 +68,20 @@ void DispatchNode::encode(ComputeGraph* graph) {
       pipeline_barrier,
       shader_,
       global_workgroup_size_,
-      push_constants_data.data(),
-      push_constants_offset);
+      push_constants_data_.data(),
+      push_constants_offset_);
 
   context->report_shader_dispatch_end();
 }
 
+void DispatchNode::write_push_constant_data() {
+  push_constants_offset_ = 0;
+  for (const auto& push_constant : push_constants_) {
+    push_constants_offset_ += push_constant.write(
+        push_constants_data_.data(),
+        push_constants_offset_,
+        kMaxPushConstantSize);
+  }
+}
+
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.h b/backends/vulkan/runtime/graph/ops/DispatchNode.h
index c45f0a741fd..db95adfee39 100644
--- a/backends/vulkan/runtime/graph/ops/DispatchNode.h
+++ b/backends/vulkan/runtime/graph/ops/DispatchNode.h
@@ -50,6 +50,12 @@ class DispatchNode : public ExecuteNode {
   const vkapi::SpecVarList spec_vars_;
   const std::vector<PushConstantDataInfo> push_constants_;
 
+  // For push constants
+  std::array<uint8_t, kMaxPushConstantSize> push_constants_data_{};
+  uint32_t push_constants_offset_ = 0;
+
+  void write_push_constant_data();
+
  public:
   operator bool() const {
     return shader_;
diff --git a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp b/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp
index ac84916c6fa..a8d2fe2e99d 100644
--- a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp
+++ b/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp
@@ -25,9 +25,9 @@ DynamicDispatchNode::DynamicDispatchNode(
     const ResizeFunction& resize_fn)
     : DispatchNode(
           graph,
-          pick_shader_fn(&graph, args, resize_args),
-          pick_global_wg_fn(&graph, args, resize_args),
-          pick_local_wg_fn(&graph, args, resize_args),
+          vkapi::ShaderInfo(),
+          {1u, 1u, 1u},
+          {1u, 1u, 1u},
           args,
           params,
           push_constants,
@@ -36,13 +36,57 @@ DynamicDispatchNode::DynamicDispatchNode(
           resize_fn),
       pick_shader_fn_(pick_shader_fn),
       pick_global_wg_fn_(pick_global_wg_fn),
+      pick_local_wg_fn_(pick_local_wg_fn) {
+  shader_ = pick_shader_fn(&graph, args, resize_args);
+  global_workgroup_size_ =
+      pick_global_wg_fn(&graph, shader_, args, resize_args);
+  local_workgroup_size_ = utils::WorkgroupSize(pick_local_wg_fn(
+      &graph, shader_, global_workgroup_size_, args, resize_args));
+}
+
+DynamicDispatchNode::DynamicDispatchNode(
+    ComputeGraph& graph,
+    const vkapi::ShaderInfo& shader,
+    const PickGlobalFn& pick_global_wg_fn,
+    const PickLocalFn& pick_local_wg_fn,
+    const std::vector<ArgGroup>& args,
+    const vkapi::ParamsBindList& params,
+    const std::vector<PushConstantDataInfo>& push_constants,
+    const vkapi::SpecVarList& spec_vars,
+    const std::vector<ValueRef>& resize_args,
+    const ResizeFunction& resize_fn)
+    : DispatchNode(
+          graph,
+          shader,
+          pick_global_wg_fn(&graph, shader, args, resize_args),
+          pick_local_wg_fn(
+              &graph,
+              shader,
+              pick_global_wg_fn(&graph, shader, args, resize_args),
+              args,
+              resize_args),
+          args,
+          params,
+          push_constants,
+          spec_vars,
+          resize_args,
+          resize_fn),
+      pick_shader_fn_{nullptr},
+      pick_global_wg_fn_(pick_global_wg_fn),
       pick_local_wg_fn_(pick_local_wg_fn) {}
 
 void DynamicDispatchNode::encode(ComputeGraph* graph) {
-  shader_ = pick_shader_fn_(graph, args_, resize_args_);
-  global_workgroup_size_ = pick_global_wg_fn_(graph, args_, resize_args_);
-  local_workgroup_size_ =
-      utils::WorkgroupSize(pick_local_wg_fn_(graph, args_, resize_args_));
+  if (pick_shader_fn_) {
+    shader_ = pick_shader_fn_(graph, args_, resize_args_);
+  }
+  if (pick_global_wg_fn_) {
+    global_workgroup_size_ =
+        pick_global_wg_fn_(graph, shader_, args_, resize_args_);
+  }
+  if (pick_local_wg_fn_) {
+    local_workgroup_size_ = utils::WorkgroupSize(pick_local_wg_fn_(
+        graph, shader_, global_workgroup_size_, args_, resize_args_));
+  }
   DispatchNode::encode(graph);
 }
 
diff --git a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h b/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h
index ede50941415..005151272c3 100644
--- a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h
+++ b/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.h
@@ -32,10 +32,13 @@ class DynamicDispatchNode final : public DispatchNode {
       const std::vector<ValueRef>&)>;
   using PickGlobalFn = const std::function<utils::uvec3(
       ComputeGraph*,
+      const vkapi::ShaderInfo& shader,
       const std::vector<ArgGroup>&,
       const std::vector<ValueRef>&)>;
   using PickLocalFn = const std::function<utils::uvec3(
       ComputeGraph*,
+      const vkapi::ShaderInfo& shader,
+      const utils::uvec3& global_workgroup_size,
       const std::vector<ArgGroup>&,
       const std::vector<ValueRef>&)>;
 
@@ -51,6 +54,18 @@ class DynamicDispatchNode final : public DispatchNode {
       const std::vector<ValueRef>& resize_args,
       const ResizeFunction& resize_fn = nullptr);
 
+  explicit DynamicDispatchNode(
+      ComputeGraph& graph,
+      const vkapi::ShaderInfo& shader,
+      const PickGlobalFn& pick_global_wg_fn,
+      const PickLocalFn& pick_local_wg_fn,
+      const std::vector<ArgGroup>& args,
+      const vkapi::ParamsBindList& params,
+      const std::vector<PushConstantDataInfo>& push_constants,
+      const vkapi::SpecVarList& spec_vars,
+      const std::vector<ValueRef>& resize_args,
+      const ResizeFunction& resize_fn = nullptr);
+
   ~DynamicDispatchNode() override = default;
 
   void encode(ComputeGraph* graph) override;
diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h
index 7563fc63c71..0731722e13a 100644
--- a/backends/vulkan/runtime/graph/ops/ExecuteNode.h
+++ b/backends/vulkan/runtime/graph/ops/ExecuteNode.h
@@ -65,7 +65,7 @@ class ExecuteNode {
     (void)graph;
   }
 
-  inline void trigger_resize(ComputeGraph* graph) {
+  virtual inline void trigger_resize(ComputeGraph* graph) {
     if (resize_fn_ != nullptr) {
       resize_fn_(graph, args_, resize_args_);
     }
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select.glslh b/backends/vulkan/runtime/graph/ops/glsl/select.glslh
new file mode 100644
index 00000000000..3bcbf04a3ba
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/select.glslh
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef SELECT_GLSLH
+#define SELECT_GLSLH
+
+/*
+ * Enable the fast path if a texel loaded from the input texture can be used as
+ * is to store to the output texture. The following conditions must be met:
+ *
+ * 1. The input and output textures have the same packed dimension.
+ * 2. The selected_dim must not be the packed dimension of the input.
+ * 3. The packed dimension of the input must "map" to the packed dimension of
+ *    the output. This occurs if selected_dim is greater than the packed dimension
+ *    of the input.
+ */
+bool can_use_fast_path() {
+  if (out_packed_dim != in_packed_dim) {
+    return false;
+  }
+  if (selected_dim <= in_packed_dim) {
+    return false;
+  }
+  return true;
+}
+
+/*
+ * Given an output tensor index, return the corresponding input tensor index for
+ * the select operator. This is done by "inserting" the select index at the
+ * selected_dim in the input tensor index.
+ *
+ * A simple example is (note all tensor index are in WHCN order):
+ *   out_tidx = [7, 5, 9]
+ *   selected_dim = 2
+ *   index = 3
+ *   in_tidx = [7, 3, 5, 9]
+ *
+ * This function assumes that the following variables are defined in the layout:
+ * - in_sizes
+ * - selected_dim
+ * - index
+ */
+ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) {
+  ivec4 in_tidx = ivec4(0);
+
+  int adjusted_index = index;
+  if (index < 0) {
+    adjusted_index = index + in_sizes[selected_dim];
+  }
+
+  // Handle different dimensions for selection
+  if (selected_dim == 0) {
+    // Select from width dimension
+    in_tidx = ivec4(adjusted_index, out_tidx.x, out_tidx.y, out_tidx.z);
+  } else if (selected_dim == 1) {
+    // Select from height dimension
+    in_tidx = ivec4(out_tidx.x, adjusted_index, out_tidx.y, out_tidx.z);
+  } else if (selected_dim == 2) {
+    // Select from channel dimension
+    in_tidx = ivec4(out_tidx.x, out_tidx.y, adjusted_index, out_tidx.z);
+  } else if (selected_dim == 3) {
+    // Select from batch dimension
+    in_tidx = ivec4(out_tidx.x, out_tidx.y, out_tidx.z, adjusted_index);
+  }
+
+  return in_tidx;
+}
+
+#endif // SELECT_GLSLH
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.glsl
deleted file mode 100644
index f94e1120492..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.glsl
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
-
-layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
-  ivec4 sizes;
-};
-
-layout(set = 0, binding = 4) uniform PRECISION restrict SelectVal {
-  // data.x: index along batch dim to select
-  // data.y: number of batches
-  // data.z: number of texels per batch
-  // data.w: unused
-  ivec4 select_info;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-void main() {
-  const int num_batches = select_info.y;
-  const int num_texel_per_batch = select_info.z;
-  const int index = select_info.x;
-
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-
-  if (any(greaterThanEqual(pos, out_limits))) {
-    return;
-  }
-
-  const uint src_pos_z = (num_texel_per_batch * index) + pos.z;
-  imageStore(
-      image_out, pos, texelFetch(image_in, ivec3(pos.x, pos.y, src_pos_z), 0));
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.glsl
deleted file mode 100644
index 0bbec798484..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.glsl
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_type(DTYPE)}
-#define T ${texel_component_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
-
-layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
-  ivec4 sizes;
-};
-
-// index to select
-layout(set = 0, binding = 4) uniform PRECISION restrict IndexVal {
-  int index;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-
-  if (any(greaterThanEqual(pos, out_limits))) {
-    return;
-  }
-
-  const int tex = index / 4;
-  const int ind = index % 4;
-  const T v = VEC4_T(texelFetch(image_in, ivec3(pos.x, pos.y, tex), 0))[ind];
-
-  imageStore(image_out, ivec3(pos.x, pos.y, 0), VEC4_T(v, 0, 0, 0));
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.yaml
deleted file mode 100644
index 1c5c4e34b06..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_channel_3d.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-select_channel_3d:
-  parameter_names_with_default_values:
-    DTYPE: float
-    NDIM: 3
-  generate_variant_forall:
-    DTYPE:
-      - VALUE: half
-      - VALUE: float
-  shader_variants:
-    - NAME: select_channel_3d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.glsl
deleted file mode 100644
index 517362f76ea..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.glsl
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
-
-layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
-  ivec4 sizes;
-};
-
-layout(set = 0, binding = 4) uniform PRECISION restrict SelectVal {
-  // data.x: index along channel dim to select
-  // data.y: number of batches
-  // data.z: number of texels per batch
-  // data.w: unused
-  ivec4 select_info;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-
-  if (any(greaterThanEqual(pos, out_limits))) {
-    return;
-  }
-
-  const int num_batches = select_info.y;
-  const int num_texel_per_batch = select_info.z;
-  const int index = select_info.x;
-
-  // read in the same channel from 4 separate batches
-  VEC4_T out_texel = VEC4_T(0, 0, 0, 0);
-  for (int k = 0; k < 4; k++) {
-    if ((k + pos.z * 4) >=
-        num_batches) {
-      break;
-    }
-    const uint src_pos_z = (4 * num_texel_per_batch * pos.z) +
-        (k * num_texel_per_batch) + (index / 4);
-    const uint src_pos_t = index % 4;
-    out_texel[k] =
-        VEC4_T(texelFetch(image_in, ivec3(pos.x, pos.y, src_pos_z), 0))[src_pos_t];
-  }
-
-  imageStore(image_out, pos, out_texel);
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.glsl
deleted file mode 100644
index 87409fb35fd..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.glsl
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
-
-layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
-  ivec4 sizes;
-};
-
-// index to select
-layout(set = 0, binding = 4) uniform PRECISION restrict IndexVal {
-  int index;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-
-  if (any(greaterThanEqual(pos, out_limits))) {
-    return;
-  }
-
-  // w
-  const int src_x = pos.x;
-  // h
-  const int src_y = index;
-  // c
-  const int src_z = pos.y;
-
-  const VEC4_T v = VEC4_T(texelFetch(image_in, ivec3(src_x, src_y, src_z), 0));
-
-  for (int i = 0; i < 4; i++) {
-    ivec3 new_pos = ivec3(pos.x, pos.y * 4 + i, 0);
-
-    // When the C-channel exceeds original block size, exit early
-    if (new_pos.y >= sizes.y) {
-      return;
-    }
-
-    imageStore(image_out, new_pos, VEC4_T(v[i], 0, 0, 0));
-  }
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.yaml
deleted file mode 100644
index a373f1decd9..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_height_3d.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-select_height_3d:
-  parameter_names_with_default_values:
-    DTYPE: float
-    NDIM: 3
-  generate_variant_forall:
-    DTYPE:
-      - VALUE: half
-      - VALUE: float
-  shader_variants:
-    - NAME: select_height_3d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.glsl
deleted file mode 100644
index 2e4e2afb2db..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.glsl
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
-
-layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
-  ivec4 sizes;
-};
-
-// index to select
-layout(set = 0, binding = 4) uniform PRECISION restrict IndexVal {
-  // data.x: index along height dim to select
-  // data.y: number of batches
-  // data.z: number of texels per batch
-  // data.w: unused
-  ivec4 select_info;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-
-  if (any(greaterThanEqual(pos, out_limits))) {
-    return;
-  }
-
-  const int num_batches = select_info.y;
-  const int num_texel_per_batch = select_info.z;
-  const int index = select_info.x;
-
-  VEC4_T out_texel = VEC4_T(0, 0, 0, 0);
-  // read in the same channel from 4 separate batches
-  for (int k = 0; k < 4; k++) {
-    if ((k + pos.z * 4) >= num_batches
-        ) { // < 4 batches for this texel, exit early
-      break;
-    }
-    const uint src_pos_z = (pos.z * num_texel_per_batch * 4) +
-        k * num_texel_per_batch + (pos.y / 4);
-    out_texel[k] = VEC4_T(texelFetch(
-        image_in, ivec3(pos.x, index, src_pos_z), 0))[pos.y % 4];
-  }
-  imageStore(image_out, pos, out_texel);
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.yaml
deleted file mode 100644
index c3724f1157a..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_height_4d.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-select_height_4d:
-  parameter_names_with_default_values:
-    DTYPE: float
-    NDIM: 3
-  generate_variant_forall:
-    DTYPE:
-      - VALUE: half
-      - VALUE: float
-  shader_variants:
-    - NAME: select_height_4d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.glsl
deleted file mode 100644
index 1e12d15ab21..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.glsl
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
-
-layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
-  ivec4 sizes;
-};
-
-// index to select
-layout(set = 0, binding = 4) uniform PRECISION restrict IndexVal {
-  int index;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-
-  if (any(greaterThanEqual(pos, out_limits))) {
-    return;
-  }
-
-  // w
-  const int src_x = index;
-  // h
-  const int src_y = pos.x;
-  // c
-  const int src_z = pos.y;
-
-  const VEC4_T v = VEC4_T(texelFetch(image_in, ivec3(src_x, src_y, src_z), 0));
-
-  for (int i = 0; i < 4; i++) {
-    ivec3 new_pos = ivec3(pos.x, pos.y * 4 + i, 0);
-
-    // When the C-channel exceeds original block size, exit early
-    if (new_pos.y >= sizes.y) {
-      return;
-    }
-
-    imageStore(image_out, new_pos, VEC4_T(v[i], 0, 0, 0));
-  }
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.yaml
deleted file mode 100644
index a3070bf6ca3..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_width_3d.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-select_width_3d:
-  parameter_names_with_default_values:
-    DTYPE: float
-    NDIM: 3
-  generate_variant_forall:
-    DTYPE:
-      - VALUE: half
-      - VALUE: float
-  shader_variants:
-    - NAME: select_width_3d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.glsl b/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.glsl
deleted file mode 100644
index ffbd8afbda0..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.glsl
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
-
-layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 3) uniform PRECISION restrict Sizes {
-  ivec4 sizes;
-};
-
-// index to select
-layout(set = 0, binding = 4) uniform PRECISION restrict SelectVal {
-  // data.x: index along width dim to select
-  // data.y: number of batches
-  // data.z: number of texels per batch
-  // data.w: unused
-  ivec4 select_info;
-};
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-
-  if (any(greaterThanEqual(pos, out_limits))) {
-    return;
-  }
-
-  const int num_batches = select_info.y;
-  const int num_texel_per_batch = select_info.z;
-  const int index = select_info.x;
-
-  //vec4 out_texel = vec4(0, 0, 0, 0);
-  VEC4_T out_texel = VEC4_T(0, 0, 0, 0);
-  // read in the same channel from 4 separate batches
-  for (int k = 0; k < 4; k++) {
-    if ((k + pos.z * 4) >=
-        num_batches) { // < 4 batches for this texel, exit early
-      break;
-    }
-    const uint src_pos_z = (pos.z * num_texel_per_batch * 4) +
-        k * num_texel_per_batch + (pos.y / 4);
-
-    out_texel[k] = VEC4_T(texelFetch(
-        image_in, ivec3(index, pos.x, src_pos_z), 0))[pos.y % 4];
-  }
-  imageStore(image_out, pos, out_texel);
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.yaml b/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.yaml
deleted file mode 100644
index f1131d77395..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/select_width_4d.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-select_width_4d:
-  parameter_names_with_default_values:
-    DTYPE: float
-    NDIM: 3
-  generate_variant_forall:
-    DTYPE:
-      - VALUE: half
-      - VALUE: float
-  shader_variants:
-    - NAME: select_width_4d
diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh
new file mode 100644
index 00000000000..5d4cc70fdc1
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef SLICE_GLSLH
+#define SLICE_GLSLH
+
+/**
+ * Enable the fast path if a texel loaded from the input texture can be used as
+ * is to store to the output texture. The following conditions must be met:
+ *
+ * 1. The input and output textures have the same packed dimension.
+ * 2. The select_dim must not be the packed dimension of the input.
+ */
+bool can_use_fast_path() {
+  if (out_packed_dim != in_packed_dim) {
+    return false;
+  }
+  if (in_packed_dim == selected_dim) {
+    return false;
+  }
+  return true;
+}
+
+/*
+ * Converts output tensor indices to input tensor indices for the slice operation.
+ * This function maps the output indices to the corresponding input indices based on
+ * the slice parameters (start, step, selected_dim).
+ *
+ * Parameters assumed to be defined in the layout specifier:
+ * - in_sizes
+ * - selected_dim
+ * - start
+ * - step
+ */
+ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) {
+  ivec4 in_tidx = out_tidx;
+
+  int adjusted_start = start;
+  if (start < 0) {
+    adjusted_start = start + in_sizes[selected_dim];
+  }
+
+  in_tidx[selected_dim] = adjusted_start + out_tidx[selected_dim] * step;
+
+  return in_tidx;
+}
+
+#endif // SLICE_GLSLH
diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.glsl b/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.glsl
deleted file mode 100644
index 0a6fa31a65f..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.glsl
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
-${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
-${layout_declare_ubo(2, "ivec4", "out_sizes")}
-${layout_declare_ubo(3, "ivec4", "in_sizes")}
-
-layout(set = 0, binding = 4) uniform PRECISION restrict SliceArg {
-  int offset;
-  int step;
-}
-slice_arg;
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
-void main() {
-  const ivec3 out_pos = ivec3(gl_GlobalInvocationID);
-  const ivec4 idx = to_tensor_idx(out_pos, out_sizes, packed_dim);
-
-  if (any(greaterThanEqual(idx, out_sizes))) {
-    return;
-  }
-
-  // We map the output pos using the buffer index.  For each index in the texel,
-  // we calculate the source whcn-coordinate amended with offset-ed channel
-  // value.  Then we calculate the actual texture position from the
-  // whcn-coordinate.
-  const ivec4 buf_indices = tidx_to_nchwi(idx, out_sizes, packed_dim);
-
-  vec4 outex;
-  for (int i=0;i<4;i++) {
-      ivec4 user_coor = nchwi_to_tidx(buf_indices[i], out_sizes);
-
-      int in_dim = user_coor[packed_dim];
-
-      ivec4 in_user_coor = user_coor;
-      in_user_coor[packed_dim] = slice_arg.offset + in_dim * slice_arg.step;
-
-      ivec4 in_pow_elem = to_texture_elem_pos(
-        in_user_coor,
-        in_sizes,
-        packed_dim);
-
-      vec4 v = texelFetch(t_in, in_pow_elem.xyz, 0);
-
-      outex[i] = v[in_pow_elem.w];
-  }
-  imageStore(t_out, out_pos, outex);
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.yaml b/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.yaml
deleted file mode 100644
index 718e7316824..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/slice_packed_dim.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-slice_packed_dim:
-  parameter_names_with_default_values:
-    DTYPE: float
-    NDIM: 3
-    STORAGE: texture3d
-  generate_variant_forall:
-    DTYPE:
-      - VALUE: half
-      - VALUE: float
-  shader_variants:
-    - NAME: slice_packed_dim
diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.glsl b/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.glsl
deleted file mode 100644
index 54f0bd0b78c..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.glsl
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#version 450 core
-
-#define PRECISION ${PRECISION}
-
-#define VEC4_T ${texel_type(DTYPE)}
-
-layout(std430) buffer;
-
-#include "indexing_utils.h"
-
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
-
-layout(set = 0, binding = 2) uniform PRECISION restrict Sizes {
-  ivec4 sizes;
-};
-
-layout(set = 0, binding = 3) uniform PRECISION restrict SliceArg {
-  int dim;
-  int offset;
-  int step;
-  int image_in_channel_size;
-}
-slice_arg;
-
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
-
-layout(constant_id = 3) const int packed_dim = C_DIM;
-
-void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
-
-  if (pos_out_of_bounds(pos, sizes, packed_dim)) {
-    return;
-  }
-
-  ivec3 in_pos = pos;
-
-  // slice along batch axis
-  if (slice_arg.dim == 3) {
-    // index of the channel inside a batch
-    const int chanl_index = pos.z % slice_arg.image_in_channel_size;
-    // index of batch
-    const int batch_index = pos.z / slice_arg.image_in_channel_size;
-    in_pos.z = (slice_arg.offset + batch_index * slice_arg.step) * slice_arg.image_in_channel_size + chanl_index;
-  } else if (slice_arg.dim == C_DIM) {
-    // index of the channel inside a batch
-    const int chanl_index = pos.z % sizes.z;
-    // index of batch
-    const int batch_index = pos.z / sizes.z;
-    in_pos.z = slice_arg.offset + batch_index * slice_arg.image_in_channel_size + chanl_index * slice_arg.step;
-  } else if (slice_arg.dim == H_DIM) {
-    in_pos.y = slice_arg.offset + pos.y * slice_arg.step;
-  } else {
-    in_pos.x = slice_arg.offset + pos.x * slice_arg.step;
-  }
-
-  imageStore(image_out, pos, texelFetch(image_in, in_pos, 0));
-
-}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.yaml b/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.yaml
deleted file mode 100644
index 0453bb707b1..00000000000
--- a/backends/vulkan/runtime/graph/ops/glsl/slice_unpacked_dim.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-slice_unpacked_dim:
-  parameter_names_with_default_values:
-    DTYPE: float
-    NDIM: 3
-  generate_variant_forall:
-    DTYPE:
-      - VALUE: half
-      - VALUE: float
-  shader_variants:
-    - NAME: slice_unpacked_dim
diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl
new file mode 100644
index 00000000000..3ca854e0526
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_type(DTYPE)}
+#define T ${buffer_scalar_type(DTYPE)}
+
+${define_active_storage_type("buffer")}
+${define_required_extensions(DTYPE)}
+
+layout(std430) buffer;
+
+#include "indexing_utils.h"
+${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, "buffer")}
+
+$if OP_NAME == "slice":
+  ${layout_declare_ubo(B, "int", "start")}
+  ${layout_declare_ubo(B, "int", "step")}
+
+$if OP_NAME == "select":
+  ${layout_declare_ubo(B, "int", "index")}
+
+layout(push_constant) uniform restrict Block {
+  ivec4 in_sizes;
+  ivec4 out_strides;
+  ivec4 in_strides;
+  int out_numel;
+  int selected_dim;
+};
+
+${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")}
+${layout_declare_spec_const(C, "int", "in_packed_dim", "DEFAULT_LAYOUT")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+#include "${OP_NAME}.glslh"
+
+void main() {
+  const int out_bufi = ivec3(gl_GlobalInvocationID).x;
+  if (out_bufi >= out_numel) {
+    return;
+  }
+
+  const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim);
+  ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
+
+  const int in_bufi = tidx_to_bufi(in_tidx, in_strides);
+  t_out[out_bufi] = t_in[in_bufi];
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.yaml b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.yaml
similarity index 54%
rename from backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.yaml
rename to backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.yaml
index 6236555f5dd..bdde613c8ce 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/select_channel_4d.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.yaml
@@ -1,10 +1,13 @@
-select_channel_4d:
+transfer_buffer:
   parameter_names_with_default_values:
     DTYPE: float
-    NDIM: 3
+    OP_NAME: select
   generate_variant_forall:
     DTYPE:
       - VALUE: half
       - VALUE: float
   shader_variants:
-    - NAME: select_channel_4d
+    - NAME: select_buffer
+      OP_NAME: select
+    - NAME: slice_buffer
+      OP_NAME: slice
diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl
new file mode 100644
index 00000000000..d3e25436c04
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.glsl
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_type(DTYPE)}
+#define T ${buffer_scalar_type(DTYPE)}
+
+${define_active_storage_type("texture3d")}
+${define_required_extensions(DTYPE)}
+
+layout(std430) buffer;
+
+#include "indexing_utils.h"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")}
+
+$if OP_NAME == "slice":
+  ${layout_declare_ubo(B, "int", "start")}
+  ${layout_declare_ubo(B, "int", "step")}
+
+$if OP_NAME == "select":
+  ${layout_declare_ubo(B, "int", "index")}
+
+layout(push_constant) uniform restrict Block {
+  ivec4 out_sizes;
+  ivec4 in_sizes;
+  int selected_dim;
+};
+
+${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
+const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);
+const lowp int out_packed_dim = unhash_packed_dim(out_layout);
+
+${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
+const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
+const lowp int in_packed_dim = unhash_packed_dim(in_layout);
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+#include "${OP_NAME}.glslh"
+
+void main() {
+  const ivec3 lpos = ivec3(gl_GlobalInvocationID);
+  ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim);
+
+  if (any(greaterThanEqual(out_tidx, out_sizes))) {
+    return;
+  }
+
+  if (can_use_fast_path()) {
+    ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
+    ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim);
+    VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos));
+
+    write_texel_lpos(t_out, lpos, in_texel, out_axis_map);
+  }
+  else {
+    VEC4_T out_texel = VEC4_T(0);
+    for (int texel_i = 0; texel_i < 4; ++texel_i) {
+      ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
+      ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim);
+      int element_idx = in_tidx[in_packed_dim] % 4;
+
+      VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos));
+      T selected_value = T(in_texel[element_idx]);
+
+      out_texel[texel_i] = selected_value;
+
+      out_tidx[out_packed_dim]++;
+    }
+
+    write_texel_lpos(t_out, lpos, out_texel, out_axis_map);
+  }
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.yaml b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.yaml
similarity index 52%
rename from backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.yaml
rename to backends/vulkan/runtime/graph/ops/glsl/transfer_texture.yaml
index 9c7d54c8f69..f877ee036e4 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/select_batch_4d.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_texture.yaml
@@ -1,10 +1,13 @@
-select_batch_4d:
+transfer_texture:
   parameter_names_with_default_values:
     DTYPE: float
-    NDIM: 3
+    OP_NAME: select
   generate_variant_forall:
     DTYPE:
       - VALUE: half
       - VALUE: float
   shader_variants:
-    - NAME: select_batch_4d
+    - NAME: select_texture3d
+      OP_NAME: select
+    - NAME: slice_texture3d
+      OP_NAME: slice
diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.cpp b/backends/vulkan/runtime/graph/ops/impl/Common.cpp
new file mode 100644
index 00000000000..4de099231d3
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Common.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+
+namespace vkcompute {
+
+utils::uvec3 default_pick_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& additional_args) {
+  (void)shader;
+  const ValueRef out = args.at(0).refs.at(0);
+  return graph->create_global_wg_size(out);
+}
+
+utils::uvec3 default_pick_local_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const utils::uvec3& global_workgroup_size,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& additional_args) {
+  (void)shader;
+  return graph->create_local_wg_size(global_workgroup_size);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.h b/backends/vulkan/runtime/graph/ops/impl/Common.h
new file mode 100644
index 00000000000..d5ff455ae41
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Common.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/ExecuteNode.h>
+
+namespace vkcompute {
+
+/**
+ * Creates a global workgroup size based on the first output tensor in the args.
+ * This is a utility function that extracts the output tensor from
+ * args.at(0).refs.at(0) and calls graph->create_global_wg_size(out) on it.
+ *
+ * @param graph The ComputeGraph instance
+ * @param args Vector of ArgGroup containing the output tensor reference
+ * @return utils::uvec3 The global workgroup size
+ */
+utils::uvec3 default_pick_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& additional_args);
+
+/**
+ * Creates a local workgroup size based on the first output tensor in the args.
+ * This is a utility function that extracts the output tensor from
+ * args.at(0).refs.at(0) and calls graph->create_local_wg_size(out) on it.
+ *
+ * @param graph The ComputeGraph instance
+ * @param args Vector of ArgGroup containing the output tensor reference
+ * @return utils::uvec3 The local workgroup size
+ */
+utils::uvec3 default_pick_local_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const utils::uvec3& global_workgroup_size,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& additional_args);
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Select.cpp b/backends/vulkan/runtime/graph/ops/impl/Select.cpp
index a83e986e414..69d49e8283b 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Select.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Select.cpp
@@ -8,129 +8,136 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
-#include <executorch/backends/vulkan/runtime/api/api.h>
-#include <executorch/backends/vulkan/runtime/graph/Logging.h>
-
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Transfer.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
 namespace vkcompute {
 
-void check_args(
-    const api::vTensor& t_in,
-    int64_t dim,
-    int64_t index,
-    const api::vTensor& t_out) {
-  VK_CHECK_COND(check_packed_dim_is(t_in, WHCN::kChannelsDim));
-  VK_CHECK_COND(check_packed_dim_is(t_out, WHCN::kChannelsDim));
+void resize_select_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  ValueRef out = args.at(0).refs.at(0);
+  ValueRef in = args.at(1).refs.at(0);
+  int64_t dim = graph->extract_scalar<int64_t>(extra_args.at(0));
 
-  const int64_t in_dim = t_in.dim();
-  VK_CHECK_COND(
-      in_dim == 3 || in_dim == 4,
-      "Vulkan select only support 3d or 4d tensors!");
-
-  const int64_t in_size = t_in.size(dim);
-
-  if (index < -in_size || index >= in_size) {
-    VK_CHECK_COND(
-        false,
-        "select(): index ",
-        index,
-        " t_outof range for tensor of size ",
-        in_size,
-        " at dimension ",
-        dim);
+  int64_t in_ndim = graph->dim_of(in);
+
+  if (dim < 0) {
+    dim += in_ndim;
+  }
+
+  std::vector<int64_t> new_out_sizes;
+  for (int64_t i = 0; i < in_ndim; ++i) {
+    if (i != dim) {
+      new_out_sizes.push_back(graph->size_at<int64_t>(i, in));
+    }
   }
+
+  graph->virtual_resize(out, new_out_sizes);
 }
 
-void add_select_int_node(
+void check_select_args(
     ComputeGraph& graph,
     const ValueRef in,
     const ValueRef dim_ref,
     const ValueRef index_ref,
     const ValueRef out) {
-  vTensorPtr t_in = graph.get_tensor(in);
-  vTensorPtr t_out = graph.get_tensor(out);
   int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
-  int64_t index = graph.extract_scalar<int64_t>(index_ref);
+  int64_t index = graph.extract_optional_scalar<int64_t>(index_ref, 0);
+  int64_t in_ndim = graph.dim_of(in);
 
-  check_args(*t_in, dim, index, *t_out);
+  if (dim < 0) {
+    dim += in_ndim;
+  }
 
-  const int64_t in_size = t_in->size(dim);
+  VK_CHECK_COND(
+      dim >= 0 && dim < in_ndim,
+      "Dimension out of range (expected to be in range of [",
+      -in_ndim,
+      ", ",
+      in_ndim - 1,
+      "], but got ",
+      dim,
+      ")");
+
+  const int64_t in_size_at_dim = graph.size_at<int64_t>(dim, in);
 
   if (index < 0) {
-    index += in_size;
+    index += in_size_at_dim;
   }
 
-  std::string kernel_name;
-
-  // for 3d tensors, these values are not used by the shader.
-  int32_t num_texel_per_batch = 1;
-  int32_t num_batches = 1;
-
-  int64_t in_dim = t_in->dim();
-  if (in_dim == 3) {
-    if (dim == 0) {
-      kernel_name = "select_channel_3d";
-    } else if (dim == 1) {
-      kernel_name = "select_height_3d";
-    } else if (dim == 2) {
-      kernel_name = "select_width_3d";
-    } else {
-      VK_CHECK_COND(
-          false, "Unexpected dim value=", dim, "for the input 3d tensor");
-    }
-  } else { // self.dim() == 4
-    num_texel_per_batch =
-        static_cast<int32_t>(std::ceil(static_cast<float>(t_in->size(1)) / 4));
-    num_batches = t_in->size(0);
-    if (dim == 0) {
-      kernel_name = "select_batch_4d";
-    } else if (dim == 1) {
-      kernel_name = "select_channel_4d";
-    } else if (dim == 2) {
-      kernel_name = "select_height_4d";
-    } else if (dim == 3) {
-      kernel_name = "select_width_4d";
-    } else {
+  VK_CHECK_COND(
+      index >= 0 && index < in_size_at_dim,
+      "select(): index ",
+      index,
+      " out of range for tensor of size ",
+      in_size_at_dim,
+      " at dimension ",
+      dim);
+
+  // Check that output tensor has correct dimensions
+  int64_t out_dim = graph.dim_of(out);
+  VK_CHECK_COND(
+      out_dim == in_ndim - 1,
+      "Output tensor dimension mismatch (expected ",
+      in_size_at_dim - 1,
+      ", but got ",
+      out_dim,
+      ")");
+
+  // Check that output tensor has correct sizes
+  int64_t out_idx = 0;
+  for (int64_t i = 0; i < in_size_at_dim; ++i) {
+    if (i != dim) {
       VK_CHECK_COND(
-          false, "Unexpected dim value=", dim, "for the input 4d tensor");
+          graph.size_at<int64_t>(out_idx, out) == graph.size_at<int64_t>(i, in),
+          "Output size mismatch at dimension ",
+          out_idx,
+          " (expected ",
+          graph.size_at<int16_t>(i, in),
+          ", but got ",
+          graph.size_at<int64_t>(out_idx, out),
+          ")");
+      out_idx++;
     }
   }
+}
 
-  kernel_name.reserve(kShaderNameReserve);
-  add_dtype_suffix(kernel_name, *t_out);
+/**
+ * Adds a select operation node to the compute graph.
+ *
+ * The select operator extracts a slice from a tensor along a specified
+ * dimension at a given index. It effectively reduces the dimensionality of the
+ * input tensor by one, by selecting a single slice at the specified index along
+ * the given dimension. For example, if input is a 3D tensor with shape [2,3,4]
+ * and we select dimension 1, index 2, the output will be a 2D tensor with shape
+ * [2,4].
+ */
+void add_select_copy_node(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef dim_ref,
+    const ValueRef index_ref,
+    const ValueRef out) {
+  check_select_args(graph, in, dim_ref, index_ref, out);
 
-  // TODO: add resizing to support dynamic shapes.
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  add_transfer_copy_node(
       graph,
-      VK_KERNEL_FROM_STR(kernel_name),
-      graph.create_global_wg_size(out),
-      graph.create_local_wg_size(out),
-      // Inputs and Outputs
-      {{out, vkapi::MemoryAccessType::WRITE},
-       {in, vkapi::MemoryAccessType::READ}},
-      // Parameter buffers
-      {t_out->logical_limits_ubo(),
-       t_out->sizes_ubo(),
-       // TODO: num_batches and num_texel_per_batch are provided by
-       // t_out->sizes. Can change the following to reduce params
-       // created.
-       graph.create_params_buffer(
-           utils::make_ivec4({index, num_batches, num_texel_per_batch, 0}))},
-      // Push Constants
-      {},
-      // Specialization Constants
-      {},
-      // Resize Args
-      {},
-      // Resizing Logic
-      nullptr));
+      TransferType::SELECT,
+      in,
+      dim_ref,
+      index_ref,
+      kDummyValueRef,
+      kDummyValueRef,
+      out,
+      {dim_ref, index_ref},
+      resize_select_node);
 }
 
 void select_int(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  return add_select_int_node(graph, args[0], args[1], args[2], args[3]);
+  return add_select_copy_node(graph, args[0], args[1], args[2], args[3]);
 }
 
 REGISTER_OPERATORS {
diff --git a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp
index c40e16f7c0a..67d714d10aa 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Slice.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Slice.cpp
@@ -8,12 +8,10 @@
 
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
-#include <executorch/backends/vulkan/runtime/graph/Logging.h>
-
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Slice.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Transfer.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
-#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
 #include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
 
@@ -33,127 +31,73 @@ inline int64_t normalize_idx(
   return normalize(index, max);
 }
 
-void add_slice_tensor_copy_node(
-    ComputeGraph& graph,
-    ValueRef in,
-    ValueRef dim_ref,
-    ValueRef opt_start_ref,
-    ValueRef opt_end_ref,
-    ValueRef step_ref,
-    ValueRef out) {
-  vTensorPtr t_in = graph.get_tensor(in);
-  vTensorPtr t_out = graph.get_tensor(out);
-
-  VK_CHECK_COND(check_same_packed_dim(*t_in, *t_out));
-
-  // Need normalize the dim
-  int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
-
-  VK_CHECK_COND(
-      -t_in->dim() <= dim && dim < t_in->dim(),
-      "dim must be in range of [-self.dim(), self.dim()), but current dim's value is ",
-      dim,
-      " and self.dim() = ",
-      t_in->dim());
-
-  dim = normalize(dim, t_in->dim());
-
-  DimIndex dim_index = normalize_to_dim_index(*t_in, dim);
+void resize_slice_copy_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  ValueRef out_ref = args.at(0).refs.at(0);
+  ValueRef in_ref = args.at(1).refs.at(0);
 
+  int64_t dim = graph->extract_scalar<int64_t>(extra_args.at(0));
   std::optional<int64_t> opt_start =
-      graph.extract_optional_scalar<int64_t>(opt_start_ref);
+      graph->extract_optional_scalar<int64_t>(extra_args.at(1));
   std::optional<int64_t> opt_end =
-      graph.extract_optional_scalar<int64_t>(opt_end_ref);
-  int64_t step = graph.extract_scalar<int64_t>(step_ref);
-
-  const auto in_sizes = t_in->sizes();
-  const auto out_sizes = t_out->sizes();
-
-  int64_t start = opt_start.value_or(0);
-  int64_t end = opt_end.value_or(in_sizes[dim]);
+      graph->extract_optional_scalar<int64_t>(extra_args.at(2));
+  int64_t step = graph->extract_scalar<int64_t>(extra_args.at(3));
 
-  start = normalize_idx(start, in_sizes[dim], 0);
-  end = normalize_idx(end, in_sizes[dim], in_sizes[dim]);
+  // Normalize dim
+  if (dim < 0) {
+    dim += graph->dim_of(in_ref);
+  }
 
-  const vkapi::SpecVarList spec_vars = {t_in->packed_dim()};
+  const std::vector<int64_t> in_sizes = graph->sizes_of(in_ref);
+  int64_t dim_size = in_sizes.at(dim);
 
-  const auto packed_dim_idx =
-      static_cast<DimIndex>(DimIndex::DIM_LAST - t_in->packed_dim());
+  int64_t start = opt_start.value_or(0);
+  int64_t end = opt_end.value_or(dim_size);
 
-  // if slice dim is the same as the packed dim, we can use the channel slice
-  if (dim_index == packed_dim_idx) {
-    // slice by channel
-    std::string kernel_name = "slice_packed_dim";
-    kernel_name.reserve(kShaderNameReserve);
-    add_dtype_suffix(kernel_name, *t_out);
+  // Normalize start and end indices
+  start = normalize_idx(start, dim_size, 0);
+  end = normalize_idx(end, dim_size, dim_size);
 
-    const struct Block final {
-      int offset;
-      int step;
-    } params{
-        static_cast<int32_t>(start),
-        static_cast<int32_t>(step),
-    };
+  // Calculate output size
+  std::vector<int64_t> new_out_sizes = in_sizes;
+  new_out_sizes.at(dim) = (end - start + step - 1) / step; // Ceiling division
 
-    graph.execute_nodes().emplace_back(new DispatchNode(
-        graph,
-        VK_KERNEL_FROM_STR(kernel_name),
-        graph.create_global_wg_size(out),
-        graph.create_local_wg_size(out),
-        {{out, vkapi::MemoryAccessType::WRITE},
-         {in, vkapi::MemoryAccessType::READ}},
-        {t_out->sizes_ubo(),
-         t_in->sizes_ubo(),
-         graph.create_params_buffer(params)},
-        {},
-        spec_vars,
-        {},
-        nullptr));
-
-  } else {
-    // GPU's coordinate is in x = 0, y = 1, z = 2, w = 3
-    const int64_t gpu_dim = -(dim_index + 1);
-    // stride of input tensor's channel dimension
-    int64_t in_channel_stride = dim_at(in_sizes, kChannel4D);
-    VK_CHECK_COND(out_sizes[dim] == (1 + (end - start - 1) / step));
-
-    // Due to channel packing, each batch value is span over stride planes
-    if (dim_index == kBatch4D && packed_dim_idx == kChannel4D) {
-      in_channel_stride = utils::div_up_4(in_channel_stride);
-    }
+  graph->virtual_resize(out_ref, new_out_sizes);
+}
 
-    std::string kernel_name = "slice_unpacked_dim";
-    kernel_name.reserve(kShaderNameReserve);
-    add_dtype_suffix(kernel_name, *t_out);
-
-    utils::uvec3 global_size = t_out->logical_limits();
-    utils::uvec3 local_size = graph.create_local_wg_size(global_size);
-
-    const struct Block final {
-      int dim;
-      int offset;
-      int step;
-      int stride;
-    } params{
-        static_cast<int32_t>(gpu_dim),
-        static_cast<int32_t>(start),
-        static_cast<int32_t>(step),
-        static_cast<int32_t>(in_channel_stride),
-    };
-
-    graph.execute_nodes().emplace_back(new DispatchNode(
-        graph,
-        VK_KERNEL_FROM_STR(kernel_name),
-        global_size,
-        local_size,
-        {{out, vkapi::MemoryAccessType::WRITE},
-         {in, vkapi::MemoryAccessType::READ}},
-        {t_out->sizes_ubo(), graph.create_params_buffer(params)},
-        {},
-        spec_vars,
-        {},
-        nullptr));
-  }
+/**
+ * Adds a slice_copy operation node to the compute graph.
+ *
+ * The slice operator extracts a portion of a tensor along a specified
+ * dimension. It creates a new tensor that contains a subset of the input
+ * tensor's data, defined by start, end, and step parameters along the given
+ * dimension.
+ *
+ * For example, if input is a tensor with shape [4,5,6] and we slice along
+ * dimension 1 with start=1, end=4, step=2, the output will have shape [4,2,6],
+ * containing elements from the input at positions 1 and 3 along dimension 1.
+ */
+void add_slice_copy_node(
+    ComputeGraph& graph,
+    ValueRef in,
+    ValueRef dim_ref,
+    ValueRef opt_start_ref,
+    ValueRef opt_end_ref,
+    ValueRef step_ref,
+    ValueRef out) {
+  add_transfer_copy_node(
+      graph,
+      TransferType::SLICE,
+      in,
+      dim_ref,
+      opt_start_ref,
+      opt_end_ref,
+      step_ref,
+      out,
+      {dim_ref, opt_start_ref, opt_end_ref, step_ref},
+      resize_slice_copy_node);
 }
 
 std::vector<int64_t> get_slice_sizes(
@@ -186,16 +130,16 @@ void resize_slice_view_node(
     const std::vector<ArgGroup>& args,
     const std::vector<ValueRef>& extra_args) {
   (void)args;
-  vTensorPtr out = graph->get_tensor(extra_args[0]);
+  ValueRef out_ref = extra_args.at(0);
 
   std::vector<int64_t> new_out_sizes = get_slice_sizes(
       *graph,
-      extra_args[1], // input
-      extra_args[2], // dim
-      extra_args[3], // optional start
-      extra_args[4]); // optional end
+      extra_args.at(1), // input
+      extra_args.at(2), // dim
+      extra_args.at(3), // optional start
+      extra_args.at(4)); // optional end
 
-  out->virtual_resize(new_out_sizes);
+  graph->virtual_resize(out_ref, new_out_sizes);
 }
 
 void check_slice_view_args(
@@ -267,54 +211,54 @@ void add_slice_view_node(
   std::vector<int64_t> new_out_sizes =
       get_slice_sizes(graph, in_ref, dim_ref, opt_start_ref, opt_end_ref);
 
-  graph.get_tensor(out_ref)->virtual_resize(new_out_sizes);
+  graph.virtual_resize(out_ref, new_out_sizes);
 
   graph.execute_nodes().emplace_back(new ExecuteNode(
       resize_slice_view_node,
       {out_ref, in_ref, dim_ref, opt_start_ref, opt_end_ref, opt_step_ref}));
 }
 
-void slice_tensor_copy(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  return add_slice_tensor_copy_node(
+void slice_copy(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  return add_slice_copy_node(
       graph,
-      args[0],
-      args[1], // dim
-      args[2], // optional start
-      args[3], // optional end
-      args[4], // step
-      args[5]);
+      args.at(0),
+      args.at(1), // dim
+      args.at(2), // optional start
+      args.at(3), // optional end
+      args.at(4), // step
+      args.at(5));
 }
 
-void slice_tensor(ComputeGraph& graph, const std::vector<ValueRef>& args) {
-  ValueRef in = args[0];
-  ValueRef out = args[5];
+void slice(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  ValueRef in = args.at(0);
+  ValueRef out = args.at(5);
 
   // Special case if out is a view of in
   if (graph.val_is_view_of(out, in)) {
     add_slice_view_node(
         graph,
         in,
-        args[1], // dim
-        args[2], // optional start
-        args[3], // optional end
-        args[4], // step
+        args.at(1), // dim
+        args.at(2), // optional start
+        args.at(3), // optional end
+        args.at(4), // step
         out);
     return;
   }
 
-  add_slice_tensor_copy_node(
+  add_slice_copy_node(
       graph,
       in,
-      args[1], // dim
-      args[2], // optional start
-      args[3], // optional end
-      args[4], // step
+      args.at(1), // dim
+      args.at(2), // optional start
+      args.at(3), // optional end
+      args.at(4), // step
       out);
 }
 
 REGISTER_OPERATORS {
-  VK_REGISTER_OP(aten.slice_copy.Tensor, slice_tensor_copy);
-  VK_REGISTER_OP(aten.slice.Tensor, slice_tensor);
+  VK_REGISTER_OP(aten.slice_copy.Tensor, slice_copy);
+  VK_REGISTER_OP(aten.slice.Tensor, slice);
 }
 
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp
new file mode 100644
index 00000000000..423c9789d67
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Transfer.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+/**
+ * Adds a transfer copy operation node to the compute graph.
+ * This function handles both SELECT and SLICE operations based on the
+ * transfer_type parameter.
+ */
+void add_transfer_copy_node(
+    ComputeGraph& graph,
+    TransferType transfer_type,
+    const ValueRef in,
+    const ValueRef dim_ref,
+    const ValueRef index_or_start_ref,
+    const ValueRef end_ref,
+    const ValueRef step_ref,
+    const ValueRef out,
+    const std::vector<ValueRef>& resize_args,
+    const ExecuteNode::ResizeFunction& resize_fn) {
+  int64_t ndim = graph.dim_of(in);
+  int64_t dim = graph.extract_scalar<int64_t>(dim_ref);
+
+  if (dim < 0) {
+    dim += ndim;
+  }
+
+  int64_t dim_whcn = nchw_dim_to_whcn_dim(dim, ndim);
+
+  vkapi::ParamsBindList param_buffers;
+  if (transfer_type == TransferType::SELECT) {
+    param_buffers = {
+        graph.get_or_create_int_param_buffer(index_or_start_ref, 0)};
+  } else { // TransferType::SLICE
+    param_buffers = {
+        graph.get_or_create_int_param_buffer(index_or_start_ref, 0),
+        graph.get_or_create_int_param_buffer(step_ref, 1)};
+  }
+
+  const struct TransferParams {
+    const int32_t dim;
+  } transfer_params{static_cast<int32_t>(dim_whcn)};
+
+  std::vector<PushConstantDataInfo> push_constants;
+  vkapi::SpecVarList spec_vars;
+
+  if (graph.is_buffer_storage(out)) {
+    push_constants = {
+        graph.sizes_pc_of(in),
+        graph.strides_pc_of(out),
+        graph.strides_pc_of(in),
+        graph.numel_pc_of(out),
+        PushConstantDataInfo(&transfer_params, sizeof(transfer_params))};
+
+    spec_vars = {
+        graph.packed_dim_of(out),
+        graph.packed_dim_of(in),
+    };
+  } else {
+    push_constants = {
+        graph.sizes_pc_of(out),
+        graph.sizes_pc_of(in),
+        PushConstantDataInfo(&transfer_params, sizeof(transfer_params))};
+
+    spec_vars = {
+        graph.hashed_layout_of(out),
+        graph.hashed_layout_of(in),
+    };
+  }
+
+  // Determine the shader directly
+  std::string kernel_name;
+  if (transfer_type == TransferType::SELECT) {
+    kernel_name = "select";
+  } else { // TransferType::SLICE
+    kernel_name = "slice";
+  }
+  add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
+  add_dtype_suffix(kernel_name, graph.dtype_of(out));
+
+  // Create and add the dispatch node
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      default_pick_global_wg_size,
+      default_pick_local_wg_size,
+      // Inputs and Outputs
+      {{out, vkapi::kWrite}, {in, vkapi::kRead}},
+      // Parameter buffers
+      param_buffers,
+      // Push Constants
+      push_constants,
+      // Specialization Constants
+      spec_vars,
+      // Resize Args
+      resize_args,
+      // Resizing Logic
+      resize_fn));
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Transfer.h b/backends/vulkan/runtime/graph/ops/impl/Transfer.h
new file mode 100644
index 00000000000..09aae144994
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Transfer.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/vulkan/runtime/api/api.h>
+#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/ExecuteNode.h>
+
+namespace vkcompute {
+
+enum class TransferType { SELECT, SLICE };
+
+/**
+ * Adds a transfer copy operation node to the compute graph, which implements
+ * operators for which each element of the output tensor maps to a unique
+ * element of the input tensor.
+ *
+ * This function currently handles the following operations:
+ * - select
+ * - slice
+ */
+void add_transfer_copy_node(
+    ComputeGraph& graph,
+    TransferType transfer_type,
+    const ValueRef in,
+    const ValueRef dim_ref,
+    const ValueRef index_or_start_ref,
+    const ValueRef end_ref,
+    const ValueRef step_ref,
+    const ValueRef out,
+    const std::vector<ValueRef>& resize_args,
+    const ExecuteNode::ResizeFunction& resize_fn = nullptr);
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index 40e86a13b42..bf6e9683ef7 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -499,7 +499,9 @@ def get_ones_inputs():
 def get_select_int_inputs():
     test_suite = VkTestSuite(
         [
-            ((6, 2, 7), 0, 3),
+            ((8, 8, 8), 0, -2),
+            ((8, 8, 8), 1, -3),
+            ((8, 8, 8), 2, -4),
             ((6, 2, 7), 1, 0),
             ((6, 2, 7), 2, 3),
             ((6, 10, 7), 0, 3),
@@ -515,6 +517,10 @@ def get_select_int_inputs():
             ((8, 6, 1, 1), 1, 4),
         ]
     )
+    test_suite.layouts = ["utils::kWidthPacked", "utils::kChannelsPacked"]
+    test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"]
+    test_suite.dtypes = ["at::kFloat"]
+    test_suite.data_gen = "make_seq_tensor"
     return test_suite
 
 
diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py
index 80ead02de9f..447e5d039f4 100644
--- a/backends/vulkan/test/test_vulkan_delegate.py
+++ b/backends/vulkan/test/test_vulkan_delegate.py
@@ -1842,3 +1842,50 @@ def forward(self, x):
             dynamic_shapes=dynamic_shapes,
             test_inputs=test_inputs,
         )
+
+    def test_select_last_height_dynamic_shapes(self):
+        """
+        Test selecting the last element along the height dimension with dynamic shapes.
+        The height dimension (dim=1) is variable.
+        """
+
+        class SelectLastHeightModule(torch.nn.Module):
+            """
+            Module that selects the last element along the height dimension (dim=1) of a 3D tensor.
+            This is equivalent to the operation: x[:, -1, :]
+            """
+
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x):
+                # Select the last element along dimension 1 (height)
+                return x[:, -1, :]
+
+        # Create the module
+        module = SelectLastHeightModule()
+
+        # Create sample inputs with a specific shape
+        # Shape: [batch_size, height, width]
+        sample_inputs = (torch.arange(1, 61).reshape(2, 10, 3).float(),)
+
+        # Define dynamic shapes for the height dimension
+        height = Dim("height", min=1, max=10)
+        dynamic_shapes = {"x": {1: height}}
+
+        # Create test inputs with different heights
+        test_inputs = [
+            (torch.arange(1, 7).reshape(2, 1, 3).float(),),  # Minimum height
+            (torch.arange(1, 19).reshape(2, 3, 3).float(),),  # Small height
+            (torch.arange(1, 43).reshape(2, 7, 3).float(),),  # Medium height
+            (torch.arange(1, 31).reshape(2, 5, 3).float(),),  # Maximum height
+        ]
+
+        # Use the testing infrastructure from TestVulkanBackend
+        test_backend = TestVulkanBackend()
+        test_backend.lower_module_and_test_output(
+            module,
+            sample_inputs,
+            dynamic_shapes=dynamic_shapes,
+            test_inputs=test_inputs,
+        )
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index a6475d95d07..60dfb3b8606 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -9,6 +9,7 @@
 #include <gtest/gtest.h>
 
 #include <bitset>
+#include <iomanip>
 #include <utility>
 #include <vector>
 
@@ -1660,9 +1661,8 @@ TEST(VulkanComputeGraphTest, test_simple_shared_objects_with_resize) {
   for (auto& new_sizes : new_sizes_list) {
     graph.get_tensor(a.value)->virtual_resize(new_sizes);
     graph.get_tensor(b.value)->virtual_resize(new_sizes);
-    graph.get_tensor(c)->virtual_resize(new_sizes);
     graph.get_tensor(d.value)->virtual_resize(new_sizes);
-    graph.get_tensor(e)->virtual_resize(new_sizes);
+    graph.propagate_resize();
 
     float val_a = new_sizes[1] + 4.0f;
     float val_b = new_sizes[2] + 1.5f;
@@ -3315,17 +3315,23 @@ vkapi::ShaderInfo pick_dynamic_dispatch_shader(
 
 utils::uvec3 pick_dynamic_dispatch_global_wg_size(
     ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
     const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& additional_args) {
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
   const ValueRef out = args[0].refs[0];
-
   return graph->logical_limits_of(out);
 }
 
 utils::uvec3 pick_dynamic_dispatch_local_wg_size(
     ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const utils::uvec3& global_workgroup_size,
     const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& additional_args) {
+    const std::vector<ValueRef>& resize_args) {
+  (void)graph;
+  (void)shader;
+  (void)global_workgroup_size;
   return {64, 1, 1};
 }