diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
index 0ee19206f59..19250419baf 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -60,18 +60,18 @@ void main() {
   const uint div_by_x = gl_GlobalInvocationID.x / out_limits_xy_scaled.x;
   ivec3 pos = ivec3(
     gl_GlobalInvocationID.x % out_limits_xy_scaled.x,
-    div_by_x % out_limits_xy_scaled.y,
-    div_by_x / out_limits_xy_scaled.y);
-
-  // scale pos.xy by batch sizes, because that's the top pixel to be processed
-  pos.x *= BATCH_SIZE_X;
-  pos.y *= BATCH_SIZE_Y;
+    div_by_x,
+    gl_GlobalInvocationID.y);
 
   // do not process if top pixel does not fit within the output range
-  if (pos.z >= out_limits.z) {
+  if (pos.y >= out_limits_xy_scaled.y || pos.z >= out_limits.z) {
     return;
   }
 
+  // scale pos.xy by batch sizes, because that's the top pixel to be processed
+  pos.x *= BATCH_SIZE_X;
+  pos.y *= BATCH_SIZE_Y;
+
   // Compute the index of the top-left element of the overlay region. Negative
   // indices indicate that the top-left element is in a region added by padding.
   const ivec2 ipos = pos.xy * stride - padding;
@@ -79,7 +79,6 @@ void main() {
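-  // Compute the start and end of the input indices to load. Padding is assumed
-  // to be constant 0 padding, so any reads from the padding region is skipped.
+  // Compute the start of the input indices to load. Padding is assumed to be
+  // constant 0 padding, so any reads from the padding region are skipped.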
   const ivec2 start = ipos;
-  const ivec2 end = ipos + overlay_region.xy;
 
   // sum outputs
   VEC4_T sum[BATCH_SIZE_Y * BATCH_SIZE_X];
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl
index ceadc35779e..f161c1ba460 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl
@@ -50,10 +50,11 @@ void main() {
   const uint div_by_x = gl_GlobalInvocationID.x / out_limits.x;
   const ivec3 pos = ivec3(
     gl_GlobalInvocationID.x % out_limits.x,
-    div_by_x % out_limits.y,
-    div_by_x / out_limits.y);
+    div_by_x,
+    gl_GlobalInvocationID.y);
 
-  if (pos.z >= out_limits.z) {
+  // do not process if top pixel does not fit within the output range
+  if (pos.y >= out_limits.y || pos.z >= out_limits.z) {
     return;
   }
 
@@ -64,7 +65,6 @@ void main() {
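-  // Compute the start and end of the input indices to load. Padding is assumed
-  // to be constant 0 padding, so any reads from the padding region is skipped.
+  // Compute the start of the input indices to load. Padding is assumed to be
+  // constant 0 padding, so any reads from the padding region are skipped.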
   const ivec2 start = ipos;
-  const ivec2 end = ipos + overlay_region.xy;
 
   VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0);
   int kx = 0;
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index ff375fba89c..d85bd9d841e 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -407,13 +407,11 @@ void add_conv2d_node(
   utils::uvec3 wg_size = create_conv2d_global_wg_size(
       graph, method, out, weight_data, stride_equals_dilation);
 
-  if (method == Conv2dMethod::Depthwise) {
-    wg_size = {wg_size[0] * wg_size[1] * wg_size[2], 1, 1};
-  } else if (method == Conv2dMethod::Pointwise) {
+  utils::uvec3 local_wg_size;
+  if (method == Conv2dMethod::Depthwise || method == Conv2dMethod::Pointwise) {
     wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1};
   }
 
-  utils::uvec3 local_wg_size;
   if (method == Conv2dMethod::Pointwise) {
     uint32_t local_wg_size_y = 1;
     if (wg_size[1] % 8 == 0) {
@@ -424,6 +422,8 @@ void add_conv2d_node(
       local_wg_size_y = 2;
     }
     local_wg_size = {64 / local_wg_size_y, local_wg_size_y, 1};
+  } else if (method == Conv2dMethod::Depthwise) {
+    local_wg_size = {64, 1, 1};
   } else {
     local_wg_size = graph.create_local_wg_size(wg_size);
   }