diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index 1222a9fc641..68935e63123 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -449,6 +449,15 @@ ValueRef ComputeGraph::add_symint(const int32_t val) {
   return idx;
 }
 
+ValueRef ComputeGraph::get_or_add_value_for_int(const int64_t val) {
+  for (int i = 0; i < values_.size(); ++i) {
+    if (values_.at(i).isInt() && values_.at(i).toInt() == val) {
+      return i;
+    }
+  }
+  return add_scalar(val);
+}
+
 ValueRef ComputeGraph::set_input_tensor(
     const ValueRef idx,
     const bool use_staging) {
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
index fe546f26477..0e8a5eba51f 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.h
+++ b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -604,6 +604,13 @@ class ComputeGraph final {
 
   ValueRef add_symint(const int32_t val);
 
+  /*
+   * Searches the graph's value list for an Int value with the specified
+   * value. If one is found, returns the index of that value. Otherwise, adds
+   * a new value and returns its index.
+   */
+  ValueRef get_or_add_value_for_int(const int64_t val);
+
   ValueRef set_input_tensor(const ValueRef idx, const bool use_staging = true);
 
   ValueRef set_output_tensor(const ValueRef idx, const bool use_staging = true);
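Note: a minimal usage sketch, not part of the diff. The surrounding op code and the `reduce_dim` variable are hypothetical; the point is how the new helper pairs with extract_scalar when constants are threaded through a node's resize args, as the Reduce and Softmax changes below do:

  // While building the node: dedupe the constant into the graph's value list.
  const ValueRef reduce_dim_ref = graph.get_or_add_value_for_int(reduce_dim);

  // Later, inside a resize or workgroup-pick function, read it back:
  const int32_t reduce_dim = graph->extract_scalar<int32_t>(resize_args.at(0));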
diff --git a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp b/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp
index a8d2fe2e99d..b8c0fcbbf79 100644
--- a/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp
+++ b/backends/vulkan/runtime/graph/ops/DynamicDispatchNode.cpp
@@ -25,9 +25,9 @@ DynamicDispatchNode::DynamicDispatchNode(
     const ResizeFunction& resize_fn)
     : DispatchNode(
           graph,
-          vkapi::ShaderInfo(),
-          {1u, 1u, 1u},
+          pick_shader_fn(&graph, args, resize_args),
           {1u, 1u, 1u},
+          {8u, 8u, 1u},
           args,
           params,
           push_constants,
@@ -37,7 +37,6 @@ DynamicDispatchNode::DynamicDispatchNode(
       pick_shader_fn_(pick_shader_fn),
       pick_global_wg_fn_(pick_global_wg_fn),
       pick_local_wg_fn_(pick_local_wg_fn) {
-  shader_ = pick_shader_fn(&graph, args, resize_args);
   global_workgroup_size_ =
       pick_global_wg_fn(&graph, shader_, args, resize_args);
   local_workgroup_size_ = utils::WorkgroupSize(pick_local_wg_fn(
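Note: for reference, a sketch of a custom workgroup-pick callback, assuming the signatures declared in Common.h below. The function itself is hypothetical (it halves the x-axis, as several ops in this diff do when one invocation covers two texels):

  utils::uvec3 my_pick_global_wg_size(
      ComputeGraph* graph,
      const vkapi::ShaderInfo& shader,
      const std::vector<ArgGroup>& args,
      const std::vector<ValueRef>& resize_args) {
    (void)shader;
    (void)resize_args;
    const ValueRef out = args.at(0).refs.at(0);
    utils::uvec3 wg = graph->create_global_wg_size(out);
    wg[0] = utils::div_up(wg[0], uint32_t(2));  // 2 texels per invocation
    return wg;
  }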
diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp
index ff6b54c5289..d260ed767d0 100644
--- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp
@@ -8,6 +8,7 @@
 
 #include
 
+#include
 #include
 
@@ -30,8 +31,8 @@ void check_binary_op_args(
 void resize_binary_op_node(
     ComputeGraph* graph,
     const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& extra_args) {
-  (void)extra_args;
+    const std::vector<ValueRef>& resize_args) {
+  (void)resize_args;
   vTensorPtr out = graph->get_tensor(args[0].refs[0]);
 
   // TODO(T183442143): Verify tensors are broadcastable.
@@ -78,11 +79,11 @@ void add_binary_op_texture_node(
   add_storage_type_suffix(kernel_name, *t_out);
   add_dtype_suffix(kernel_name, *t_out);
 
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      graph.create_global_wg_size(out),
-      graph.create_local_wg_size(out),
+      default_pick_global_wg_size,
+      default_pick_local_wg_size,
       // Inputs and Outputs
       {{out, vkapi::kWrite}, {{arg1, arg2}, vkapi::kRead}},
       // Shader params buffers
@@ -122,11 +123,11 @@ void add_binary_op_buffer_node(
   add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
 
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      graph.create_global_wg_size(out),
-      graph.create_local_wg_size(out),
+      default_pick_global_wg_size,
+      default_pick_local_wg_size,
       // Inputs and Outputs
       {{out, vkapi::kWrite}, {{in1, in2}, vkapi::kRead}},
       // Shader params buffers
diff --git a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp
index d0276b1783b..da06223cd12 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Clone.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Clone.cpp
@@ -10,6 +10,7 @@
 
 #include
 
+#include
 #include
 
@@ -21,8 +22,8 @@ namespace vkcompute {
 void resize_clone_node(
     ComputeGraph* graph,
     const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& extra_args) {
-  (void)extra_args;
+    const std::vector<ValueRef>& resize_args) {
+  (void)resize_args;
   vTensorPtr out = graph->get_tensor(args[0].refs[0]);
   vTensorPtr in = graph->get_tensor(args[1].refs[0]);
   // TODO: support for when dimensionality doesn't match, i.e. clone is used to
@@ -41,11 +42,11 @@ void add_clone_node(
   std::string kernel_name = "clone";
   add_dtype_suffix(kernel_name, *t_out);
 
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      graph.create_global_wg_size(out),
-      graph.create_local_wg_size(out),
+      default_pick_global_wg_size,
+      default_pick_local_wg_size,
       // Inputs and Outputs
       {{out, vkapi::kWrite}, {in, vkapi::kRead}},
       // Parameter Buffers
@@ -60,6 +61,17 @@ void add_clone_node(
       resize_clone_node));
 }
 
+utils::uvec3 clone_image_to_buffer_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
+  (void)resize_args;
+  const ValueRef image = args.at(1).refs.at(0);
+  return graph->create_global_wg_size(image);
+}
+
 void add_image_to_buffer_node(
     ComputeGraph& graph,
     const ValueRef image,
@@ -68,12 +80,11 @@ void add_image_to_buffer_node(
   add_dtype_suffix(kernel_name, graph.dtype_of(image));
   vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);
 
-  utils::uvec3 global_wg_size = graph.create_global_wg_size(image);
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       shader,
-      global_wg_size,
-      graph.create_local_wg_size(global_wg_size),
+      clone_image_to_buffer_global_wg_size,
+      default_pick_local_wg_size,
       // Input and Outputs
       {{buffer, vkapi::kWrite}, {image, vkapi::kRead}},
       // Parameter Buffers
@@ -96,12 +107,11 @@ void add_buffer_to_image_node(
   add_dtype_suffix(kernel_name, graph.dtype_of(image));
   vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);
 
-  utils::uvec3 global_wg_size = graph.create_global_wg_size(image);
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       shader,
-      global_wg_size,
-      graph.create_local_wg_size(global_wg_size),
+      default_pick_global_wg_size,
+      default_pick_local_wg_size,
       // Input and Outputs
       {{image, vkapi::kWrite}, {buffer, vkapi::kRead}},
       // Parameter Buffers
diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.cpp b/backends/vulkan/runtime/graph/ops/impl/Common.cpp
index 4de099231d3..4c3c16417b5 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Common.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Common.cpp
@@ -14,8 +14,9 @@ utils::uvec3 default_pick_global_wg_size(
     ComputeGraph* graph,
     const vkapi::ShaderInfo& shader,
     const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& additional_args) {
+    const std::vector<ValueRef>& resize_args) {
   (void)shader;
+  (void)resize_args;
   const ValueRef out = args.at(0).refs.at(0);
   return graph->create_global_wg_size(out);
 }
@@ -25,8 +26,10 @@ utils::uvec3 default_pick_local_wg_size(
     const vkapi::ShaderInfo& shader,
     const utils::uvec3& global_workgroup_size,
     const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& additional_args) {
+    const std::vector<ValueRef>& resize_args) {
   (void)shader;
+  (void)args;
+  (void)resize_args;
   return graph->create_local_wg_size(global_workgroup_size);
 }
diff --git a/backends/vulkan/runtime/graph/ops/impl/Common.h b/backends/vulkan/runtime/graph/ops/impl/Common.h
index d5ff455ae41..662fb07095a 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Common.h
+++ b/backends/vulkan/runtime/graph/ops/impl/Common.h
@@ -17,31 +17,23 @@ namespace vkcompute {
  * Creates a global workgroup size based on the first output tensor in the args.
  * This is a utility function that extracts the output tensor from
  * args.at(0).refs.at(0) and calls graph->create_global_wg_size(out) on it.
- *
- * @param graph The ComputeGraph instance
- * @param args Vector of ArgGroup containing the output tensor reference
- * @return utils::uvec3 The global workgroup size
  */
 utils::uvec3 default_pick_global_wg_size(
     ComputeGraph* graph,
     const vkapi::ShaderInfo& shader,
     const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& additional_args);
+    const std::vector<ValueRef>& resize_args);
 
 /**
  * Creates a local workgroup size based on the first output tensor in the args.
  * This is a utility function that extracts the output tensor from
  * args.at(0).refs.at(0) and calls graph->create_local_wg_size(out) on it.
- *
- * @param graph The ComputeGraph instance
- * @param args Vector of ArgGroup containing the output tensor reference
- * @return utils::uvec3 The local workgroup size
 */
 utils::uvec3 default_pick_local_wg_size(
     ComputeGraph* graph,
     const vkapi::ShaderInfo& shader,
     const utils::uvec3& global_workgroup_size,
     const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& additional_args);
+    const std::vector<ValueRef>& resize_args);
 
 } // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp
index 724f4630264..73a625f3adf 100644
--- a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp
@@ -8,6 +8,7 @@
 
 #include
 
+#include
 #include
 #include
 
@@ -37,12 +38,12 @@ void check_matmul_args(
 void resize_matmul_node(
     ComputeGraph* graph,
     const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& extra_args) {
+    const std::vector<ValueRef>& resize_args) {
   vTensorPtr out = graph->get_tensor(args[0].refs[0]);
   vTensorPtr mat1 = graph->get_tensor(args[1].refs[0]);
   vTensorPtr mat2 = graph->get_tensor(args[1].refs[1]);
 
-  bool mat2_is_transposed = graph->get_bool(extra_args[0]);
+  bool mat2_is_transposed = graph->get_bool(resize_args[0]);
 
   const int out_cols = utils::val_at(-2, mat1->sizes());
   const int out_rows = mat2_is_transposed ? utils::val_at(-2, mat2->sizes())
@@ -56,6 +57,23 @@ void resize_matmul_node(
   out->virtual_resize(new_out_sizes);
 }
 
+/**
+ * Custom global workgroup size function for naive buffer matmul operations.
+ */
+utils::uvec3 matmul_naive_buffer_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
+  (void)resize_args;
+  const ValueRef out = args.at(0).refs.at(0);
+  return {
+      graph->size_at<uint32_t>(-1, out),
+      graph->size_at<uint32_t>(-2, out),
+      graph->size_at<uint32_t>(-3, out) * graph->size_at<uint32_t>(-4, out)};
+}
+
 void add_matmul_naive_buffer_node(
     ComputeGraph& graph,
     const ValueRef mat1,
@@ -72,21 +90,16 @@ void add_matmul_naive_buffer_node(
   std::string kernel_name = "matmul_naive_buffer";
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
 
-  utils::uvec3 global_size = {
-      graph.size_at<uint32_t>(-1, out),
-      graph.size_at<uint32_t>(-2, out),
-      graph.size_at<uint32_t>(-3, out) * graph.size_at<uint32_t>(-4, out)};
-
   int mat2_is_transposed_val = (mat2_is_transposed != kDummyValueRef &&
                                 graph.get_bool(mat2_is_transposed))
       ? 1
       : 0;
 
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      graph.create_local_wg_size(global_size),
+      matmul_naive_buffer_global_wg_size,
+      default_pick_local_wg_size,
       // Inputs and Outputs
       {{out, vkapi::kWrite}, {{mat1, mat2}, vkapi::kRead}},
       // Shader params buffers
@@ -109,6 +122,22 @@ void add_matmul_naive_buffer_node(
       resize_matmul_node));
 }
 
+vkapi::ShaderInfo pick_matmul_naive_texture3d_shader(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  const ValueRef out = args.at(0).refs.at(0);
+  const bool is_transposed = graph->get_bool(resize_args.at(0));
+
+  std::string kernel_name =
+      is_transposed ? "matmul_transposed_naive" : "matmul_naive";
+  kernel_name.reserve(kShaderNameReserve);
+  add_storage_type_suffix(kernel_name, graph->storage_type_of(out));
+  add_dtype_suffix(kernel_name, graph->dtype_of(out));
+
+  return VK_KERNEL_FROM_STR(kernel_name);
+}
+
 void add_matmul_naive_texture3d_node(
     ComputeGraph& graph,
     const ValueRef mat1,
@@ -122,19 +151,11 @@ void add_matmul_naive_texture3d_node(
       utils::kHeightPacked,
       /*passthrough = */ true);
 
-  std::string kernel_name = graph.get_bool(mat2_is_transposed)
-      ? "matmul_transposed_naive"
-      : "matmul_naive";
-  kernel_name.reserve(kShaderNameReserve);
-  add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
-  add_dtype_suffix(kernel_name, graph.dtype_of(out));
-
-  utils::uvec3 global_wg_size = graph.logical_limits_of(out);
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
-      VK_KERNEL_FROM_STR(kernel_name),
-      global_wg_size,
-      graph.create_local_wg_size(global_wg_size),
+      pick_matmul_naive_texture3d_shader,
+      default_pick_global_wg_size,
+      default_pick_local_wg_size,
       // Inputs and Outputs
       {{out, vkapi::kWrite}, {{mat1, mat2}, vkapi::kRead}},
       // Shader params buffers
@@ -156,6 +177,59 @@ void add_matmul_naive_texture3d_node(
       resize_matmul_node));
 }
 
+vkapi::ShaderInfo pick_matmul_optimized_shader(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  const ValueRef out = args.at(0).refs.at(0);
+  const ValueRef mat1_W_packed = resize_args.at(1);
+  const bool mat2_is_transposed_val = graph->get_bool(resize_args.at(0));
+
+  std::string kernel_name = mat2_is_transposed_val
+      ? "matmul_transposed_optimized"
+      : "matmul_optimized";
+
+  std::vector<int64_t> mat1_sizes = graph->sizes_of(mat1_W_packed);
+  size_t mat1_dims = mat1_sizes.size();
+  if (mat1_dims == 3) {
+    kernel_name = "batch_" + kernel_name;
+  }
+  if (mat1_sizes.at(mat1_dims - 2) < 8) {
+    kernel_name += "_tile_row_2";
+  } else {
+    kernel_name += "_tile_row_4";
+  }
+
+  add_dtype_suffix(kernel_name, graph->dtype_of(out));
+
+  return VK_KERNEL_FROM_STR(kernel_name);
+}
+
+utils::uvec3 matmul_optimized_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
+
+  const ValueRef out = args.at(0).refs.at(0);
+  const ValueRef mat1_W_packed = resize_args.at(1);
+
+  const std::vector<int64_t> mat1_sizes = graph->sizes_of(mat1_W_packed);
+  const size_t mat1_dims = mat1_sizes.size();
+
+  utils::uvec3 global_size = graph->logical_limits_of(out);
+  if (mat1_sizes.at(mat1_dims - 2) < 8) {
+    // Use `logical_extents` instead of `image_extents` because the workgroup
+    // axes need to correspond to tensor dimensions.
+    global_size = utils::divup_vec(global_size, {4, 2, 1});
+  } else {
+    global_size = utils::divup_vec(global_size, {4, 4, 1});
+  }
+
+  return global_size;
+}
+
 void add_matmul_optimized_node(
     ComputeGraph& graph,
     const ValueRef mat1,
@@ -192,45 +266,11 @@ void add_matmul_optimized_node(
     viewFn(graph, {mat2, graph.add_none(), mat2_packed});
   }
 
-  std::string kernel_name = mat2_is_transposed_val
-      ? "matmul_transposed_optimized"
-      : "matmul_optimized";
-
-  std::vector<int64_t> mat1_sizes = graph.sizes_of(mat1_W_packed);
-  int mat1_dims = mat1_sizes.size();
-  if (mat1_dims == 3) {
-    kernel_name = "batch_" + kernel_name;
-  }
-  if (mat1_sizes.at(mat1_dims - 2) < 8) {
-    kernel_name += "_tile_row_2";
-  } else {
-    kernel_name += "_tile_row_4";
-  }
-
-  add_dtype_suffix(kernel_name, graph.dtype_of(out));
-
-  // Each thread computes a W=(2/4) x H=4 x C=(1/4) output tile. Therefore, the
-  // total number of threads is W/(2 or 4) x H/4 x C/1. Since the out tensor is
-  // channels packed, C does not need to be divided by 4. The "identity" of each
-  // thread is the (x, y, z) coordinate of the output tile it is computing, and
-  // this identity can be used to compute the tensor index of the top left
-  // element in the tile, which will be [W=x*(2 or 4), H=y*4, C=z*(1 or 4), N=0]
-  utils::uvec3 global_size = graph.logical_limits_of(out);
-  if (mat1_sizes.at(mat1_dims - 2) < 8) {
-    // Use `logical_extents` instead of `image_extents` because the workgroup
-    // axes need to correspond to tensor dimensions.
-    global_size = utils::divup_vec(global_size, {4, 2, 1});
-  } else {
-    global_size = utils::divup_vec(global_size, {4, 4, 1});
-  }
-
-  utils::uvec3 local_size = adaptive_work_group_size(global_size);
-
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
-      VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
+      pick_matmul_optimized_shader,
+      matmul_optimized_global_wg_size,
+      default_pick_local_wg_size,
       // Inputs and Outputs
       {{out, vkapi::kWrite}, {{mat1_W_packed, mat2_packed}, vkapi::kRead}},
       // Shader params buffers
@@ -246,7 +286,7 @@ void add_matmul_optimized_node(
       graph.hashed_layout_of(mat1_W_packed),
       graph.hashed_layout_of(mat2_packed)},
       // Resize Args
-      {mat2_is_transposed},
+      {mat2_is_transposed, mat1_W_packed},
       // Resizing Logic
       resize_matmul_node));
 }
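Note: a worked example of the tiling arithmetic above, with hypothetical shapes for illustration. If mat1 is [M=16, K=32], then mat1_sizes.at(mat1_dims - 2) == 16 >= 8 and the _tile_row_4 shader variant is picked; with output logical limits {W=64, H=16, C=1}:

  utils::uvec3 global_size = {64, 16, 1};
  global_size = utils::divup_vec(global_size, {4, 4, 1});
  // -> {16, 4, 1}: one invocation per W=4 x H=4 output tile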
diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQGANW.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQGANW.cpp
index 8c5cb0093d9..d9425b8b62f 100644
--- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQGANW.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQGANW.cpp
@@ -8,6 +8,7 @@
 
 #include
 
+#include
 #include
 #include
 
@@ -70,30 +71,26 @@ void resize_linear_qga4w_node(
   out->virtual_resize(new_out_sizes);
 }
 
-void add_linear_qga4w_node(
-    ComputeGraph& graph,
-    const ValueRef mat1,
-    const ValueRef mat2_data,
-    const ValueRef group_size,
-    const ValueRef scales_and_zeros_data,
-    const ValueRef out) {
-  check_linear_qga4w_args(
-      graph, mat1, mat2_data, group_size, scales_and_zeros_data, out);
-
-  const uint32_t group_size_val = graph.extract_scalar<uint32_t>(group_size);
+/**
+ * Determines whether the cooperative algorithm should be used based on input
+ * tensor dimensions. The coop algorithm is applied for gemv cases, i.e. when
+ * mat1 is a vector as opposed to a matrix.
+ */
+bool should_use_coop_algorithm(ComputeGraph* graph, const ValueRef& mat1) {
+  return graph->size_at<uint32_t>(-2, mat1) == 1;
+}
 
-  bool use_coop_algorithm = false;
-  // Apply the coop algorithm for gemv cases, i.e. mat1 is a vector as opposed
-  // to a matrix.
-  if (graph.size_at<uint32_t>(-2, mat1) == 1) {
-    use_coop_algorithm = true;
-  }
+vkapi::ShaderInfo pick_linear_qga4w_shader(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)resize_args;
 
-  ValueRef mat2 =
-      prepack_int4_linear_weight_transposed_interleaved(graph, mat2_data);
+  const ValueRef out = args.at(0).refs.at(0);
+  const ValueRef mat1 = args.at(1).refs.at(0);
+  const ValueRef mat2 = args.at(1).refs.at(1);
 
-  ValueRef scales_and_zeros = prepack_standard_hw_transposed(
-      graph, scales_and_zeros_data, utils::kBuffer, utils::kWidthPacked);
+  const bool use_coop_algorithm = should_use_coop_algorithm(graph, mat1);
 
   std::string kernel_name = "linear_qga4w";
   if (use_coop_algorithm) {
@@ -101,26 +98,75 @@ void add_linear_qga4w_node(
   } else {
     kernel_name += "_tiled";
   }
-  add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
-  add_storage_type_suffix(kernel_name, graph.storage_type_of(mat1));
-  add_storage_type_suffix(kernel_name, graph.storage_type_of(mat2));
-  add_dtype_suffix(kernel_name, graph.dtype_of(out));
+  add_storage_type_suffix(kernel_name, graph->storage_type_of(out));
+  add_storage_type_suffix(kernel_name, graph->storage_type_of(mat1));
+  add_storage_type_suffix(kernel_name, graph->storage_type_of(mat2));
+  add_dtype_suffix(kernel_name, graph->dtype_of(out));
 
-  utils::uvec3 global_wg_size = graph.logical_limits_of(out);
+  return VK_KERNEL_FROM_STR(kernel_name);
+}
+
+utils::uvec3 linear_qga4w_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)resize_args;
+  const ValueRef out = args.at(0).refs.at(0);
+
+  const bool use_coop_algorithm =
+      shader.kernel_name.find("_coop") != std::string::npos;
+
+  utils::uvec3 global_wg_size = graph->logical_limits_of(out);
   global_wg_size[0] = utils::div_up(global_wg_size[0], uint32_t(2));
-  utils::uvec3 local_wg_size = graph.create_local_wg_size(global_wg_size);
+
+  if (!use_coop_algorithm) {
+    global_wg_size[1] = utils::div_up(global_wg_size[1], uint32_t(3));
+  }
+
+  return global_wg_size;
+}
+
+utils::uvec3 linear_qga4w_local_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const utils::uvec3& global_workgroup_size,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)args;
+  (void)resize_args;
+  const bool use_coop_algorithm =
+      shader.kernel_name.find("_coop") != std::string::npos;
 
   if (use_coop_algorithm) {
-    local_wg_size = {8, 1, 8};
+    return {8, 1, 8};
   } else {
-    global_wg_size[1] = utils::div_up(global_wg_size[1], uint32_t(3));
+    return graph->create_local_wg_size(global_workgroup_size);
   }
+}
+
+void add_linear_qga4w_node(
+    ComputeGraph& graph,
+    const ValueRef mat1,
+    const ValueRef mat2_data,
+    const ValueRef group_size,
+    const ValueRef scales_and_zeros_data,
+    const ValueRef out) {
+  check_linear_qga4w_args(
+      graph, mat1, mat2_data, group_size, scales_and_zeros_data, out);
 
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  const uint32_t group_size_val = graph.extract_scalar<uint32_t>(group_size);
+
+  ValueRef mat2 =
+      prepack_int4_linear_weight_transposed_interleaved(graph, mat2_data);
+
+  ValueRef scales_and_zeros = prepack_standard_hw_transposed(
+      graph, scales_and_zeros_data, utils::kBuffer, utils::kWidthPacked);
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
-      VK_KERNEL_FROM_STR(kernel_name),
-      global_wg_size,
-      local_wg_size,
+      pick_linear_qga4w_shader,
+      linear_qga4w_global_wg_size,
+      linear_qga4w_local_wg_size,
       // Inputs and Outputs
       {{out, vkapi::kWrite}, {{mat1, mat2, scales_and_zeros}, vkapi::kRead}},
       // Shader params buffers
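Note: a sketch of the dispatch sizes the functions above produce, with a hypothetical output width, assuming the gemv (coop) case where mat1 is [1, K] and out's logical limits are {W, 1, 1}:

  const uint32_t out_W = 256;  // hypothetical output width
  utils::uvec3 global_wg = {utils::div_up(out_W, uint32_t(2)), 1u, 1u};
  // -> {128, 1, 1}; local size is the fixed cooperative layout {8, 1, 8}.
  // In the tiled (gemm) case, the y-axis is additionally divided by 3 and
  // the local size falls back to create_local_wg_size.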
diff --git a/backends/vulkan/runtime/graph/ops/impl/Reduce.cpp b/backends/vulkan/runtime/graph/ops/impl/Reduce.cpp
index 8fcd4a0609c..c0fd442ec50 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Reduce.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Reduce.cpp
@@ -8,6 +8,7 @@
 
 #include
 
+#include
 #include
 #include
 
@@ -20,23 +21,66 @@ using namespace utils;
 void resize_reduce_node(
     ComputeGraph* graph,
     const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& extra_args) {
-  (void)extra_args;
+    const std::vector<ValueRef>& resize_args) {
   vTensorPtr out = graph->get_tensor(args[0].refs[0]);
   vTensorPtr in = graph->get_tensor(args[1].refs[0]);
 
-  int dim = extra_args[0];
+  int32_t reduce_dim_nchw = graph->extract_scalar<int32_t>(resize_args.at(0));
 
   std::vector<int64_t> new_sizes = in->sizes();
-  new_sizes[normalize(dim, new_sizes.size())] = 1;
+  new_sizes.at(normalize(reduce_dim_nchw, new_sizes.size())) = 1;
   out->virtual_resize(new_sizes);
 }
 
+utils::uvec3 reduce_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
+  const ValueRef out = args.at(0).refs.at(0);
+  const int32_t reduce_dim_whcn =
+      graph->extract_scalar<int32_t>(resize_args.at(1));
+
+  utils::uvec3 global_wg_size = graph->logical_limits_of(out);
+  global_wg_size[reduce_dim_whcn] = 1;
+  return global_wg_size;
+}
+
+utils::uvec3 reduce_local_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const utils::uvec3& global_workgroup_size,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
+  (void)args;
+  (void)global_workgroup_size;
+
+  const int32_t reduce_dim_whcn =
+      graph->extract_scalar<int32_t>(resize_args.at(1));
+  const int64_t group_dim_whcn =
+      graph->extract_scalar<int64_t>(resize_args.at(2));
+
+  // This should match the value of MAX_NTHREADS in the reduce shader.
+  constexpr uint32_t max_nthreads = 16;
+
+  const uint32_t nworkers_per_group = 4;
+  const uint32_t ngroups = 4;
+  VK_CHECK_COND(nworkers_per_group * ngroups <= max_nthreads);
+
+  utils::uvec3 local_wg_size{1, 1, 1};
+  local_wg_size[reduce_dim_whcn] = nworkers_per_group;
+  local_wg_size[group_dim_whcn] = ngroups;
+
+  return local_wg_size;
+}
+
 void add_reduce_node(
     ComputeGraph& graph,
-    ValueRef in,
-    const int dim,
-    ValueRef out,
+    const ValueRef in,
+    const ValueRef dim_ref,
+    const ValueRef out,
     const std::string& op_name) {
   VK_CHECK_COND(
       !graph.is_buffer_storage(in) && !graph.is_buffer_storage(out),
@@ -44,7 +88,7 @@ void add_reduce_node(
 
   const int64_t ndim = graph.dim_of(in);
 
-  int32_t reduce_dim = dim;
+  int32_t reduce_dim = graph.extract_scalar<int32_t>(dim_ref);
   reduce_dim = normalize(reduce_dim, ndim);
   reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim);
 
@@ -55,40 +99,30 @@ void add_reduce_node(
     VK_CHECK_COND(graph.concat_dim_of(out) != reduce_dim);
   }
 
-  vkapi::ShaderInfo shader_descriptor;
   std::string kernel_name = op_name;
   kernel_name.reserve(kShaderNameReserve);
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
 
-  // This should match the value of MAX_NTHREADS in the softmax shader.
-  constexpr uint32_t max_nthreads = 16;
-
-  const uint32_t nworkers_per_group = 4;
-  const uint32_t ngroups = 4;
-  VK_CHECK_COND(nworkers_per_group * ngroups <= max_nthreads);
-
-  utils::uvec3 global_wg_size = graph.logical_limits_of(out);
-  global_wg_size[reduce_dim] = 1;
-
-  utils::uvec3 local_wg_size{1, 1, 1};
-  local_wg_size[reduce_dim] = nworkers_per_group;
+  // Calculate group_dim for specialization constants
   const int other_dim_1 = (reduce_dim + 1) % 3;
   const int other_dim_2 = (reduce_dim + 2) % 3;
   int32_t group_dim;
-  if (global_wg_size[other_dim_1] > global_wg_size[other_dim_2]) {
-    local_wg_size[other_dim_1] = ngroups;
+  utils::uvec3 limits = graph.logical_limits_of(out);
+  if (limits[other_dim_1] > limits[other_dim_2]) {
     group_dim = other_dim_1;
   } else {
-    local_wg_size[other_dim_2] = ngroups;
     group_dim = other_dim_2;
   }
 
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  const ValueRef reduce_dim_whcn_ref =
+      graph.get_or_add_value_for_int(reduce_dim);
+  const ValueRef group_dim_whcn_ref = graph.get_or_add_value_for_int(group_dim);
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
-      // shader_descriptor,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_wg_size,
-      local_wg_size,
+      reduce_global_wg_size,
+      reduce_local_wg_size,
       // Inputs and Outputs
       {{out, vkapi::kWrite}, {in, vkapi::kRead}},
       // Shader params buffers
@@ -98,17 +132,20 @@ void add_reduce_node(
       // Specialization Constants
       {graph.packed_dim_of(out), reduce_dim, group_dim},
       // Resize Args
-      {dim},
+      {dim_ref, reduce_dim_whcn_ref, group_dim_whcn_ref},
       // Resizing Logic
       resize_reduce_node));
 }
 
 #define DEFINE_REDUCE_FN(op_name, out_arg_idx)                            \
   void op_name(ComputeGraph& graph, const std::vector<ValueRef>& args) {  \
-    const IntListPtr dims_list = graph.get_int_list(args[1]);             \
-    VK_CHECK_COND(dims_list->size() == 1);                                \
+    const std::vector<int64_t> dims_list =                                \
+        graph.extract_int_or_symint_list(args[1]);                        \
+    VK_CHECK_COND(dims_list.size() == 1);                                 \
+    const int64_t dim_val = dims_list.at(0);                              \
+    const ValueRef dim_ref = graph.get_or_add_value_for_int(dim_val);     \
     return add_reduce_node(                                               \
-        graph, args[0], dims_list->at(0), args[out_arg_idx], #op_name);   \
+        graph, args[0], dim_ref, args[out_arg_idx], #op_name);            \
  }
 
 DEFINE_REDUCE_FN(sum, 4)
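Note: a worked example of the workgroup layout computed above, with hypothetical limits. Suppose out's logical limits are {W=8, H=64, C=4} and the reduction runs over W, so reduce_dim_whcn = 0; of the other two axes, H (index 1) is larger than C (index 2), so group_dim = 1:

  utils::uvec3 global_wg_size = {1, 64, 4};  // reduced axis collapsed to 1
  utils::uvec3 local_wg_size = {4, 4, 1};    // 4 workers per group x 4 groups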
diff --git a/backends/vulkan/runtime/graph/ops/impl/RotaryEmbedding.cpp b/backends/vulkan/runtime/graph/ops/impl/RotaryEmbedding.cpp
index 31bab144d8a..fcc8fe4b265 100644
--- a/backends/vulkan/runtime/graph/ops/impl/RotaryEmbedding.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/RotaryEmbedding.cpp
@@ -8,6 +8,8 @@
 
 #include
 
+#include
+
 #include
 
 namespace vkcompute {
@@ -31,6 +33,22 @@ void resize_rotary_embedding_node(
   graph->virtual_resize(xk_out, xk_sizes);
 }
 
+utils::uvec3 rotary_embedding_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
+  (void)resize_args;
+
+  const ValueRef xq_out = args.at(0).refs.at(0);
+
+  utils::uvec3 global_wg_size = graph->logical_limits_of(xq_out);
+  global_wg_size[0] /= 2;
+
+  return global_wg_size;
+}
+
 void add_rotary_embedding_node(
     ComputeGraph& graph,
     const ValueRef xq,
@@ -57,17 +75,11 @@ void add_rotary_embedding_node(
   std::string kernel_name = "rotary_embedding";
   add_dtype_suffix(kernel_name, graph.dtype_of(xq_out));
 
-  utils::uvec3 global_wg_size = graph.logical_limits_of(xq_out);
-  global_wg_size[0] /= 2;
-  const utils::uvec3 local_wg_size = graph.create_local_wg_size(global_wg_size);
-
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
-      // Shader
       VK_KERNEL_FROM_STR(kernel_name),
-      // Workgroup sizes
-      global_wg_size,
-      local_wg_size,
+      rotary_embedding_global_wg_size,
+      default_pick_local_wg_size,
       // Inputs and Outputs
       {{{xq_out, xk_out}, vkapi::kWrite},
        {{xq, xk, freqs_cos, freqs_sin}, vkapi::kRead}},
diff --git a/backends/vulkan/runtime/graph/ops/impl/SDPA.cpp b/backends/vulkan/runtime/graph/ops/impl/SDPA.cpp
index 5ef84347181..5ac8077d95f 100644
--- a/backends/vulkan/runtime/graph/ops/impl/SDPA.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/SDPA.cpp
@@ -8,6 +8,7 @@
 
 #include
 
+#include
 #include
 #include
 
@@ -22,6 +23,24 @@
 
 namespace vkcompute {
 
+utils::uvec3 kv_cache_update_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
+  (void)resize_args;
+
+  const ValueRef cache = args.at(0).refs.at(0);
+  const ValueRef projected = args.at(1).refs.at(0);
+
+  if (graph->is_buffer_storage(cache)) {
+    return graph->create_global_wg_size(projected);
+  } else {
+    return graph->logical_limits_of(projected);
+  }
+}
+
 void add_kv_cache_update_node(
     ComputeGraph& graph,
     const ValueRef input_pos_symint,
@@ -31,30 +50,24 @@ void add_kv_cache_update_node(
   add_storage_type_suffix(kernel_name, graph.storage_type_of(projected));
   add_dtype_suffix(kernel_name, graph.dtype_of(projected));
 
-  utils::uvec3 global_size;
   vkapi::ParamsBindList param_ubos;
 
   if (graph.is_buffer_storage(cache)) {
-    global_size = graph.create_global_wg_size(projected);
-
     param_ubos = {
         graph.numel_ubo(projected),
         graph.strides_ubo(cache),
         graph.get_or_create_int_param_buffer(input_pos_symint)};
   } else {
-    global_size = graph.logical_limits_of(projected);
-
     param_ubos = {
         graph.logical_limits_ubo(projected),
         graph.get_or_create_int_param_buffer(input_pos_symint)};
   }
-  const utils::uvec3 local_size = graph.create_local_wg_size(global_size);
 
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
+      kv_cache_update_global_wg_size,
+      default_pick_local_wg_size,
       // Inputs and Outputs
       {{cache, vkapi::kWrite}, {projected, vkapi::kRead}},
       // Shader param buffers
@@ -69,6 +82,27 @@ void add_kv_cache_update_node(
       nullptr));
 }
 
+utils::uvec3 attn_weight_scale_and_mask_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
+  (void)resize_args;
+
+  const ValueRef attn_weight = args.at(0).refs.at(0);
+
+  if (graph->is_buffer_storage(attn_weight)) {
+    return {
+        graph->size_at<uint32_t>(-1, attn_weight),
+        graph->size_at<uint32_t>(-2, attn_weight),
+        graph->size_at<uint32_t>(-3, attn_weight),
+    };
+  } else {
+    return graph->logical_limits_of(attn_weight);
+  }
+}
+
 void add_attn_weight_scale_and_mask_node(
     ComputeGraph& graph,
     const ValueRef input_pos_symint,
@@ -81,37 +115,25 @@ void add_attn_weight_scale_and_mask_node(
   const int32_t head_dim_size = graph.size_at<int32_t>(-1, q_projected);
   const float scale_val = 1.0f / std::sqrt(static_cast<float>(head_dim_size));
 
-  utils::uvec3 global_size;
-  utils::uvec3 local_size;
   vkapi::ParamsBindList param_ubos;
 
   if (graph.is_buffer_storage(attn_weight)) {
-    global_size = {
-        graph.size_at<uint32_t>(-1, attn_weight),
-        graph.size_at<uint32_t>(-2, attn_weight),
-        graph.size_at<uint32_t>(-3, attn_weight),
-    };
-
     param_ubos = {
         graph.sizes_ubo(attn_weight),
         graph.strides_ubo(attn_weight),
         graph.create_params_buffer(scale_val)};
   } else {
-    global_size = graph.logical_limits_of(attn_weight);
-
     param_ubos = {
         graph.logical_limits_ubo(attn_weight),
         graph.get_or_create_int_param_buffer(input_pos_symint),
         graph.create_params_buffer(scale_val)};
   }
 
-  local_size = graph.create_local_wg_size(global_size);
-
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_size,
-      local_size,
+      attn_weight_scale_and_mask_global_wg_size,
+      default_pick_local_wg_size,
       // Inputs and Outputs
       {{attn_weight, vkapi::kReadWrite}},
       // Shader param buffers
diff --git a/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp b/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp
index 7469cbb0eb2..e37ef66434b 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Softmax.cpp
@@ -8,6 +8,7 @@
 
 #include
 
+#include
 #include
 #include
 
@@ -17,11 +18,55 @@ namespace vkcompute {
 
 using namespace utils;
 
+utils::uvec3 pick_softmax_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
+
+  const ValueRef out = args.at(0).refs.at(0);
+  const int32_t reduce_dim_xyz =
+      graph->extract_scalar<int32_t>(resize_args.at(1));
+
+  utils::uvec3 global_size = graph->logical_limits_of(out);
+  global_size[reduce_dim_xyz] = 1;
+  return global_size;
+}
+
+utils::uvec3 pick_softmax_local_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const utils::uvec3& global_workgroup_size,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
+  (void)global_workgroup_size;
+  (void)args;
+
+  const int64_t group_dim_xyz =
+      graph->extract_scalar<int64_t>(resize_args.at(2));
+
+  const int32_t reduce_dim_xyz =
+      graph->extract_scalar<int32_t>(resize_args.at(1));
+
+  // These values are hardcoded in add_softmax_node
+  const uint32_t nworkers_per_group = 4;
+  const uint32_t ngroups = 4;
+
+  utils::uvec3 local_wg_size{1, 1, 1};
+  local_wg_size[reduce_dim_xyz] = nworkers_per_group;
+  local_wg_size[group_dim_xyz] = ngroups;
+
+  return local_wg_size;
+}
+
 void resize_softmax_node(
     ComputeGraph* graph,
     const std::vector<ArgGroup>& args,
-    const std::vector<ValueRef>& extra_args) {
-  (void)extra_args;
+    const std::vector<ValueRef>& resize_args) {
+  (void)resize_args;
   vTensorPtr out = graph->get_tensor(args[0].refs[0]);
   vTensorPtr in = graph->get_tensor(args[1].refs[0]);
 
@@ -31,9 +76,9 @@ void resize_softmax_node(
 
 void add_softmax_node(
     ComputeGraph& graph,
-    ValueRef in,
-    ValueRef dim,
-    ValueRef out,
+    const ValueRef in,
+    const ValueRef dim_ref,
+    const ValueRef out,
     bool log_softmax) {
   VK_CHECK_COND(
       !graph.is_buffer_storage(in) && !graph.is_buffer_storage(out),
@@ -41,18 +86,18 @@ void add_softmax_node(
 
   const int64_t ndim = graph.dim_of(in);
 
-  int32_t reduce_dim = graph.extract_scalar<int32_t>(dim);
-  reduce_dim = normalize(reduce_dim, ndim);
-  reduce_dim = nchw_dim_to_whcn_dim(reduce_dim, ndim);
+  int32_t reduce_dim_nchw = graph.extract_scalar<int32_t>(dim_ref);
+  reduce_dim_nchw = normalize(reduce_dim_nchw, ndim);
+  const int32_t reduce_dim_xyz = nchw_dim_to_whcn_dim(reduce_dim_nchw, ndim);
 
   // Check that the concat dim is not the reduction dim, if the tensor has a
   // batch dim greater than 1.
   if (graph.dim_of(in) == 4 && graph.size_at<int32_t>(0, in) > 1) {
     VK_CHECK_COND(
-        graph.concat_dim_of(in) != reduce_dim,
+        graph.concat_dim_of(in) != reduce_dim_xyz,
         "Softmax shader currently does not support concat dim == reduce dim");
     VK_CHECK_COND(
-        graph.concat_dim_of(out) != reduce_dim,
+        graph.concat_dim_of(out) != reduce_dim_xyz,
         "Softmax shader currently does not support concat dim == reduce dim");
   }
 
@@ -71,39 +116,36 @@ void add_softmax_node(
   const uint32_t ngroups = 4;
   VK_CHECK_COND(nworkers_per_group * ngroups <= max_nthreads);
 
-  utils::uvec3 global_wg_size = graph.logical_limits_of(out);
-  global_wg_size[reduce_dim] = 1;
-
-  utils::uvec3 local_wg_size{1, 1, 1};
-  local_wg_size[reduce_dim] = nworkers_per_group;
-  const int other_dim_1 = (reduce_dim + 1) % 3;
-  const int other_dim_2 = (reduce_dim + 2) % 3;
+  // Determine the group dimension
+  const int other_dim_1 = (reduce_dim_xyz + 1) % 3;
+  const int other_dim_2 = (reduce_dim_xyz + 2) % 3;
   int32_t group_dim;
+  utils::uvec3 global_wg_size = graph.logical_limits_of(out);
   if (global_wg_size[other_dim_1] > global_wg_size[other_dim_2]) {
-    local_wg_size[other_dim_1] = ngroups;
     group_dim = other_dim_1;
   } else {
-    local_wg_size[other_dim_2] = ngroups;
     group_dim = other_dim_2;
   }
 
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  const ValueRef reduce_dim_xyz_ref =
+      graph.get_or_add_value_for_int(reduce_dim_xyz);
+  const ValueRef group_dim_xyz_ref = graph.get_or_add_value_for_int(group_dim);
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
      graph,
-      // shader_descriptor,
       VK_KERNEL_FROM_STR(kernel_name),
-      global_wg_size,
-      local_wg_size,
+      pick_softmax_global_wg_size,
+      pick_softmax_local_wg_size,
       // Inputs and Outputs
-      {{out, vkapi::MemoryAccessType::WRITE},
-       {in, vkapi::MemoryAccessType::READ}},
+      {{out, vkapi::kWrite}, {in, vkapi::kRead}},
       // Shader params buffers
       {graph.logical_limits_ubo(out), graph.sizes_ubo(in)},
       // Push Constants
       {},
       // Specialization Constants
-      {graph.packed_dim_of(out), reduce_dim, group_dim},
+      {graph.packed_dim_of(out), reduce_dim_xyz, group_dim},
       // Resize Args
-      {},
+      {dim_ref, reduce_dim_xyz_ref, group_dim_xyz_ref},
       // Resizing Logic
       resize_softmax_node));
 }
diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
index 8c060a9da4b..4c46596c206 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp
@@ -10,7 +10,8 @@
 
 #include
 
-#include
+#include
+#include
 
 #include
 #include
 
@@ -38,11 +39,11 @@ void add_staging_to_tensor_node(
     pcs = {graph.sizes_pc_of(out_tensor)};
   }
 
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       shader,
-      graph.create_global_wg_size(out_tensor),
-      graph.create_local_wg_size(out_tensor),
+      default_pick_global_wg_size,
+      default_pick_local_wg_size,
       // Input and Outputs
       {{out_tensor, vkapi::kWrite}, {in_staging, vkapi::kRead}},
       // Parameter Buffers
@@ -65,6 +66,44 @@ bool is_bitw8_shader(const vkapi::ShaderInfo& shader) {
   return shader_prefix_str == kBitw8PrefixStr;
 }
 
+vkapi::ShaderInfo get_tensor_to_staging_shader(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)resize_args;
+  const ValueRef in_tensor = args.at(1).refs.at(0);
+  return get_tensor_to_nchw_shader(
+      *graph->get_tensor(in_tensor), graph->int8_buffers_enabled());
+}
+
+utils::uvec3 tensor_to_staging_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)resize_args;
+  const ValueRef in_tensor = args.at(1).refs.at(0);
+  const ValueRef out_staging = args.at(0).refs.at(0);
+
+  utils::uvec3 global_wg_size = graph->create_global_wg_size(in_tensor);
+
+  // Normally, the image_to_nchw shader is structured so that each thread reads
+  // one texel from the input texture and writes each component of the texel
+  // into the corresponding location in the output buffer. However, this shader
+  // is structured slightly differently in that each thread writes out a
+  // complete 32 bit integer (containing 4 packed 8-bit integers) into the
+  // output buffer. Therefore, the global work group size for this shader will
+  // be the number of elements in the output buffer divided by 4, as opposed to
+  // the extents of the input texture.
+  if (is_bitw8_shader(shader)) {
+    const uint32_t buffer_len = utils::safe_downcast<uint32_t>(
+        graph->get_staging(out_staging)->numel() / 4);
+    global_wg_size = {buffer_len, 1, 1};
+  }
+
+  return global_wg_size;
+}
+
 void add_tensor_to_staging_node(
     ComputeGraph& graph,
     const ValueRef in_tensor,
@@ -74,8 +113,6 @@ void add_tensor_to_staging_node(
   vkapi::ShaderInfo shader = get_tensor_to_nchw_shader(
       *graph.get_tensor(in_tensor), graph.int8_buffers_enabled());
 
-  utils::uvec3 global_wg_size = graph.create_global_wg_size(in_tensor);
-
   vkapi::ParamsBindList ubos;
   if (graph.is_buffer_storage(in_tensor)) {
     ubos.append(
@@ -86,25 +123,15 @@ void add_tensor_to_staging_node(
     ubos.append({graph.sizes_ubo(in_tensor)});
   }
 
-  // Normally, the image_to_nchw shader is structured so that each thread reads
-  // one texel from the input texture and writes each component of the texel
-  // into the corresponding location in the output buffer. However, this shader
-  // is structured slightly differently in that each thread writes out a
-  // complete 32 bit integer (containing 4 packed 8-bit integers) into the
-  // output buffer. Therefore, the global work group size for this shader will
-  // be the number of elements in the output buffer divided by 4, as opposed to
-  // the extents of the input texture.
   if (is_bitw8_shader(shader)) {
-    uint32_t buffer_len = graph.get_staging(out_staging)->numel() / 4;
-    global_wg_size = {buffer_len, 1, 1};
     ubos.append({graph.numel_ubo(in_tensor)});
   }
 
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       shader,
-      global_wg_size,
-      graph.create_local_wg_size(global_wg_size),
+      tensor_to_staging_global_wg_size,
+      default_pick_local_wg_size,
       // Input and Outputs
       {{out_staging, vkapi::kWrite}, {in_tensor, vkapi::kRead}},
       // Parameter Buffers
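Note: a worked example of the bitw8 arithmetic above, with a hypothetical tensor size. For an int8 tensor with 4096 elements, numel() / 4 == 1024, since each invocation writes one packed 32-bit word:

  const uint32_t buffer_len = 4096 / 4;
  utils::uvec3 global_wg_size = {buffer_len, 1u, 1u};  // {1024, 1, 1}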
diff --git a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp
index bffa8e2a181..518148f12eb 100644
--- a/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/UnaryOp.cpp
@@ -8,6 +8,7 @@
 
 #include
 
+#include
 #include
 #include
 
@@ -51,14 +52,13 @@ void add_unary_op_node(
   ubos.append(
       {graph.create_params_buffer(min), graph.create_params_buffer(max)});
 
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      graph.create_global_wg_size(out),
-      graph.create_local_wg_size(out),
+      default_pick_global_wg_size,
+      default_pick_local_wg_size,
       // Inputs and Outputs
-      {{out, vkapi::MemoryAccessType::WRITE},
-       {in, vkapi::MemoryAccessType::READ}},
+      {{out, vkapi::kWrite}, {in, vkapi::kRead}},
       // Shader params buffers
       ubos,
       // Push Constants
diff --git a/backends/vulkan/runtime/graph/ops/impl/View.cpp b/backends/vulkan/runtime/graph/ops/impl/View.cpp
index 710ba0d576f..9dbe79faebb 100644
--- a/backends/vulkan/runtime/graph/ops/impl/View.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/View.cpp
@@ -8,6 +8,7 @@
 
 #include
 
+#include
 #include
 #include
 
@@ -67,11 +68,11 @@ void add_view_node(
   kernel_name.reserve(kShaderNameReserve);
   add_dtype_suffix(kernel_name, *t_out);
 
-  graph.execute_nodes().emplace_back(new DispatchNode(
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
       VK_KERNEL_FROM_STR(kernel_name),
-      graph.create_global_wg_size(out),
-      graph.create_local_wg_size(out),
+      default_pick_global_wg_size,
+      default_pick_local_wg_size,
       // Inputs and Outputs
       {{out, vkapi::MemoryAccessType::WRITE},
        {in, vkapi::MemoryAccessType::READ}},
diff --git a/backends/vulkan/runtime/utils/VecUtils.h b/backends/vulkan/runtime/utils/VecUtils.h
index c084a563544..6d2e8c63bb9 100644
--- a/backends/vulkan/runtime/utils/VecUtils.h
+++ b/backends/vulkan/runtime/utils/VecUtils.h
@@ -260,12 +260,18 @@ struct vec final {
     }
   }
 
-  const Type& operator[](const uint32_t& i) const {
+  template <
+      typename IndexType,
+      typename = std::enable_if_t<std::is_integral<IndexType>::value>>
+  const Type& operator[](const IndexType& i) const {
     VK_CHECK_COND(i >= 0 && i < N, "Index out of bounds!");
     return data[i];
  }
 
-  Type& operator[](const uint32_t& i) {
+  template <
+      typename IndexType,
+      typename = std::enable_if_t<std::is_integral<IndexType>::value>>
+  Type& operator[](const IndexType& i) {
    VK_CHECK_COND(i >= 0 && i < N, "Index out of bounds!");
     return data[i];
   }
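Note: the templated operator[] exists so the pick functions above can index a uvec3 directly with the int32_t/int64_t dims they read back from resize args. A minimal sketch (values hypothetical):

  utils::uvec3 local_wg_size{1u, 1u, 1u};
  const int32_t reduce_dim_whcn = 2;   // e.g. read back via extract_scalar
  local_wg_size[reduce_dim_whcn] = 4;  // previously required a cast to uint32_t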
diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp
index dcd8c425d62..99ee1c0fa0b 100644
--- a/backends/vulkan/test/utils/test_utils.cpp
+++ b/backends/vulkan/test/utils/test_utils.cpp
@@ -547,8 +547,8 @@ vkcompute::ComputeGraph build_mm_graph(
     vkcompute::vkapi::ScalarType dtype,
     vkcompute::utils::StorageType in_out_stype,
     vkcompute::utils::GPUMemoryLayout memory_layout,
-    const bool prepack_mat2,
-    const float mat2_val) {
+    const std::vector<float>& mat2_data,
+    const bool prepack_mat2) {
   using namespace vkcompute;
   GraphConfig config;
   ComputeGraph graph(config);
@@ -569,10 +569,7 @@ vkcompute::ComputeGraph build_mm_graph(
       graph.add_input_tensor(mat1_size, dtype, in_out_stype, memory_layout);
 
   IOValueRef mat2{};
-  CREATE_RAND_WEIGHT_TENSOR(mat2_w, mat2_size, dtype);
-  if (mat2_val != 0.0f) {
-    std::fill(data_mat2_w.begin(), data_mat2_w.end(), mat2_val);
-  }
+  ValueRef mat2_w = graph.add_tensorref(mat2_size, dtype, mat2_data.data());
 
   if (prepack_mat2) {
     mat2.value = mat2_w;
diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h
index 71d6d0bc0de..0f0d2647792 100644
--- a/backends/vulkan/test/utils/test_utils.h
+++ b/backends/vulkan/test/utils/test_utils.h
@@ -265,8 +265,8 @@ vkcompute::ComputeGraph build_mm_graph(
     vkcompute::vkapi::ScalarType dtype,
     vkcompute::utils::StorageType in_out_stype,
     vkcompute::utils::GPUMemoryLayout memory_layout,
-    const bool prepack_mat2 = false,
-    const float mat2_val = 0.0f);
+    const std::vector<float>& mat2_data,
+    const bool prepack_mat2 = false);
 
 //
 // Debugging Utilities
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index 85811aaaf11..9887f3c7ffb 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -2751,15 +2751,19 @@ void test_mm(
     utils::StorageType storage_type,
     utils::GPUMemoryLayout memory_layout,
     bool prepack = true) {
+  std::vector<int64_t> mat2_size = {B, K, N};
+
+  std::vector<float> mat2_data(utils::multiply_integers(mat2_size));
+  std::fill(mat2_data.begin(), mat2_data.end(), 2.0f);
+
   ComputeGraph graph = build_mm_graph(
-      B, M, K, N, dtype, storage_type, memory_layout, prepack, 2.0f);
+      B, M, K, N, dtype, storage_type, memory_layout, mat2_data, prepack);
 
   graph.prepare();
   graph.encode_prepack();
   graph.prepack();
-  graph.encode_execute();
 
   for (int i = 1; i < 4; i++) {
+    graph.encode_execute();
     if (prepack) {
       float val_mat1 = i;
       float val_out = K * (val_mat1 * 2.0f);
@@ -2828,8 +2832,12 @@ void test_mm_with_resize_reencode(
     utils::GPUMemoryLayout memory_layout) {
   ASSERT_TRUE(M > 1);
 
+  std::vector<int64_t> mat2_size = {B, K, N};
+  std::vector<float> mat2_data(utils::multiply_integers(mat2_size));
+  std::fill(mat2_data.begin(), mat2_data.end(), 2.0f);
+
   ComputeGraph graph = build_mm_graph(
-      B, M, K, N, dtype, storage_type, memory_layout, false, 2.0f);
+      B, M, K, N, dtype, storage_type, memory_layout, mat2_data, false);
 
   graph.prepare();
   graph.encode_prepack();
@@ -2851,8 +2859,6 @@ void test_mm_with_resize_reencode(
   graph.resize_input(1, new_mat2_size);
   graph.propagate_resize();
 
-  graph.encode_execute();
-
   for (int i = 1; i < 4; i++) {
     float val_mat1 = i;
     float val_mat2 = i + 1;