diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl index ce986d4e12f..a0a235154a0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl @@ -48,19 +48,18 @@ $else: layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} +${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")} + $if STORAGE == "buffer": - ${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "in_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "other_packed_dim", "DEFAULT_LAYOUT")} + const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); $else: - ${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); const lowp int packed_dim = unhash_packed_dim(out_layout); - ${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - ${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")} const lowp ivec4 other_axis_map = unhash_axis_map(other_layout); #ifdef USING_BUFFER @@ -77,7 +76,7 @@ void main() { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); const ivec4 in_tidx = min(out_tidx, in_sizes - 1); const ivec4 other_tidx = min(out_tidx, other_sizes - 1); diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h index 2b41d2b7e1a..0cfd7f2f119 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ 
b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h @@ -68,21 +68,6 @@ */ #define mod4(x) ((x) & 3) -/* - * Find the packed dimension of a tensor given its strides. The packed dimension - * is the "fastest moving" dimension which will have a stride of 1. - */ -int find_packed_dim(const ivec4 strides) { - int packed_dim = 0; - for (int i = 0; i <= 3; i++) { - if (strides[i] == 1) { - packed_dim = i; - break; - } - } - return packed_dim; -} - /* * Get the staging buffer indices that contain the data of the texel that * corresponds to the provided tensor index. Since the texel have 4 elements, @@ -129,27 +114,26 @@ int tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes) { tidx.x; } -// TODO(ssjia): make this function use dim order so that it can work with any -// dim order. Currently it assumes that the dim order is contiguous, except for -// the packed dim. -ivec4 bufi_to_tidx(int bufi, const ivec4 strides, const int packed_dim) { +ivec4 bufi_to_tidx(int bufi, const ivec4 strides, const ivec4 dim_order) { ivec4 idx; for (int i = 3; i >= 0; i--) { - if (i != packed_dim) { - idx[i] = bufi / strides[i]; - bufi %= strides[i]; - } + int dim = dim_order[i]; + idx[dim] = bufi / strides[dim]; + bufi %= strides[dim]; } - idx[packed_dim] = bufi; return idx; } -// Convenience overload of the above function, which will determine the packed -// dim from the strides automatically so it doesn't have to be passed in as a -// function argument. -ivec4 bufi_to_tidx(const int bufi, const ivec4 strides) { - int packed_dim = find_packed_dim(strides); - return bufi_to_tidx(bufi, strides, packed_dim); +/* + * bufi_to_tidx but assumes that the tensor is contiguous + */ +ivec4 contiguous_bufi_to_tidx(int bufi, const ivec4 strides) { + ivec4 idx; + for (int i = 3; i >= 0; i--) { + idx[i] = bufi / strides[i]; + bufi %= strides[i]; + } + return idx; } int tidx_to_bufi(const ivec4 tidx, ivec4 strides) { @@ -269,12 +253,22 @@ ivec3 lpos_to_pos(const ivec3 lpos, const ivec4 axis_map) { * e.g. 
0x11021, 1 -> ivec4(1, 2, 0, 1) */ #define unhash_axis_map(hash) \ - ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf)) + (ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf))) + +/* + * Unhash a tensor's dim order from its hashed layout (one dim index per nibble). + */ +#define unhash_dim_order(hash) \ + (ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf))) #define unhash_packed_dim(hash) int(hash >> 16 & 0xf) #define DEFAULT_LAYOUT 0x02210 +#define DEFAULT_DIM_ORDER 0x03210 + +#define DEFAULT_DIM_ORDER_IVEC4 ivec4(0, 1, 2, 3) + /************************ * Deprecated Functions * ************************/ diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl index dfb5f1f2f9c..4dd83f0d4ed 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl @@ -62,7 +62,7 @@ void main() { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, 0); + const ivec4 out_tidx = contiguous_bufi_to_tidx(out_bufi, out_strides); const FLOAT_T scale = t_scales[out_tidx.x]; diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl index ba4e4dd9dd9..62cd0610ffb 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl @@ -10,8 +10,8 @@ ${define_required_extensions(DTYPE)} layout(std430) buffer; -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "nchw_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "nchw_in", DTYPE, STORAGE)} $if USE_PUSH_CONST: layout(push_constant) uniform restrict Block { @@ -20,15 +20,14 @@ $if USE_PUSH_CONST: int numel; }; $else: - ${layout_declare_ubo(2, "ivec4", "out_sizes")} - ${layout_declare_ubo(3, "ivec4", "out_strides")} - ${layout_declare_ubo(4, 
"int", "numel")} + ${layout_declare_ubo(B, "ivec4", "out_sizes")} + ${layout_declare_ubo(B, "ivec4", "out_strides")} + ${layout_declare_ubo(B, "int", "numel")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -// This constant is unused in this shader but is kept so that the signature is -// consistent with nchw_to_image. -${layout_declare_spec_const(C, "int", "UNUSED_layout", "0")} +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_DIM_ORDER")} +const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); ${layout_declare_spec_const(C, "int", "transpose_hw", "0")} void main() { @@ -37,7 +36,7 @@ void main() { return; } - ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides); + ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); ivec4 sizes = out_sizes; if (transpose_hw == 1) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/select.glslh b/backends/vulkan/runtime/graph/ops/glsl/select.glslh index 3bcbf04a3ba..6509015b4b6 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/select.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/select.glslh @@ -9,6 +9,8 @@ #ifndef SELECT_GLSLH #define SELECT_GLSLH +#ifndef USING_BUFFER + /* * Enable the fast path if a texel loaded from the input texture can be used as * is to store to the output texture. The following conditions must be met: @@ -29,6 +31,8 @@ bool can_use_fast_path() { return true; } +#endif // USING_BUFFER + /* * Given an output tensor index, return the corresponding input tensor index for * the select operator. 
This is done by "inserting" the select index at the diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh index 5d4cc70fdc1..87325754f4d 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh @@ -9,6 +9,8 @@ #ifndef SLICE_GLSLH #define SLICE_GLSLH +#ifndef USING_BUFFER + /** * Enable the fast path if a texel loaded from the input texture can be used as * is to store to the output texture. The following conditions must be met: @@ -26,6 +28,8 @@ bool can_use_fast_path() { return true; } +#endif // USING_BUFFER + /* * Converts output tensor indices to input tensor indices for the slice operation. * This function maps the output indices to the corresponding input indices based on diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl index 3ca854e0526..7e95b52d8f4 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl @@ -37,8 +37,10 @@ layout(push_constant) uniform restrict Block { int selected_dim; }; -${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "in_packed_dim", "DEFAULT_LAYOUT")} +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} + +const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -50,7 +52,7 @@ void main() { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); const int in_bufi = tidx_to_bufi(in_tidx, in_strides); diff --git 
a/backends/vulkan/runtime/graph/ops/glsl/where.glsl b/backends/vulkan/runtime/graph/ops/glsl/where.glsl index 5df813d1241..fe6304c0fa0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/where.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/where.glsl @@ -37,40 +37,28 @@ $if STORAGE == "buffer": ${layout_declare_ubo(B, "ivec4", "cond_strides")} ${layout_declare_ubo(B, "ivec4", "self_strides")} ${layout_declare_ubo(B, "ivec4", "other_strides")} - - ${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "cond_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "self_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "other_packed_dim", "DEFAULT_LAYOUT")} $else: ${layout_declare_ubo(B, "ivec3", "out_limits")} +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_DIM_ORDER")} + +const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); + layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; #ifdef USING_BUFFER void main() { int out_bufi = int(gl_GlobalInvocationID.x); - // ivec4 tidx = ivec4(gl_GlobalInvocationID, 0); - // int out_bufi = tidx_to_bufi(tidx, out_strides); - // int cond_bufi = tidx_to_bufi(tidx, cond_strides); - // int self_bufi = tidx_to_bufi(tidx, self_strides); - // int other_bufi = tidx_to_bufi(tidx, other_strides); if (out_bufi >= out_numl) { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); - out_bufi = tidx_to_bufi(out_tidx, out_strides); - - const ivec4 cond_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); - const int cond_bufi = tidx_to_bufi(cond_tidx, cond_strides); - - const ivec4 self_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); - const int self_bufi = tidx_to_bufi(self_tidx, self_strides); + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); - const ivec4 other_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); - 
const int other_bufi = tidx_to_bufi(other_tidx, other_strides); + const int cond_bufi = tidx_to_bufi(out_tidx, cond_strides); + const int self_bufi = tidx_to_bufi(out_tidx, self_strides); + const int other_bufi = tidx_to_bufi(out_tidx, other_strides); COND_T cond = t_condition[cond_bufi] ; T v_self = t_self[self_bufi]; diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp index d260ed767d0..28279c196c0 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp @@ -143,9 +143,9 @@ void add_binary_op_buffer_node( PushConstantDataInfo(&alpha_val, sizeof(float)), }}, // Specialization Constants - {graph.packed_dim_of(out), - graph.packed_dim_of(in1), - graph.packed_dim_of(in2)}, + {graph.hashed_layout_of(out), + graph.hashed_layout_of(in1), + graph.hashed_layout_of(in2)}, // Resize Args {}, // Resizing Logic diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp index 6e101195e3f..07502a7a107 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp @@ -43,6 +43,10 @@ void check_linear_qcsnw_args( VK_CHECK_COND( utils::val_at(-1, scales_sizes) == utils::val_at(-2, qmat2_sizes)); } + + if (graph.is_buffer_storage(out)) { + VK_CHECK_COND(graph.is_contiguous(out)); + } } void resize_linear_qcsnw_node( diff --git a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp index 423c9789d67..7b5fad57483 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp @@ -55,7 +55,6 @@ void add_transfer_copy_node( } transfer_params{static_cast<int32_t>(dim_whcn)}; std::vector<PushConstantDataInfo> push_constants; - vkapi::SpecVarList spec_vars; if (graph.is_buffer_storage(out)) { push_constants = { @@ 
-64,23 +63,18 @@ void add_transfer_copy_node( graph.strides_pc_of(in), graph.numel_pc_of(out), PushConstantDataInfo(&transfer_params, sizeof(transfer_params))}; - - spec_vars = { - graph.packed_dim_of(out), - graph.packed_dim_of(in), - }; } else { push_constants = { graph.sizes_pc_of(out), graph.sizes_pc_of(in), PushConstantDataInfo(&transfer_params, sizeof(transfer_params))}; - - spec_vars = { - graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - }; } + vkapi::SpecVarList spec_vars = { + graph.hashed_layout_of(out), + graph.hashed_layout_of(in), + }; + // Determine the shader directly std::string kernel_name; if (transfer_type == TransferType::SELECT) { diff --git a/backends/vulkan/runtime/graph/ops/impl/Where.cpp b/backends/vulkan/runtime/graph/ops/impl/Where.cpp index a3be34830d3..ea610b1fe74 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Where.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Where.cpp @@ -54,7 +54,7 @@ void add_where_texture_node( // Push Constants {}, // Specialization Constants - {graph.packed_dim_of(out)}, + {graph.hashed_layout_of(out)}, // Resize Arguments {}, // Resizing Logic @@ -96,10 +96,7 @@ void add_where_buffer_node( // Push Constants {}, // Specialization Constants - {graph.packed_dim_of(out), - graph.packed_dim_of(cond), - graph.packed_dim_of(self), - graph.packed_dim_of(other)}, + {graph.hashed_layout_of(out)}, // Resize Arguments {}, // Resizing Logic diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index bd67933dc93..4ea61cd7ef3 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -52,13 +52,17 @@ def get_binary_elementwise_inputs(): ((S, S1, S2), (S, S1, 1), 2.0), ((S, S1, S2), (S, 1, S2), 2.0), ((XS, S, S1, S2), (XS, S, 1, 1), 2.0), + ((3, 64, 1), (1, 64, 1)), ] ) test_suite.layouts = [ "utils::kWidthPacked", "utils::kChannelsPacked", ] - test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] + 
test_suite.storage_types = [ + "utils::kBuffer", + "utils::kTexture3D", + ] return test_suite diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index 3f5dba9e277..faa0e7d0c47 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -26,13 +26,14 @@ void record_nchw_to_buffer_op( vkapi::VulkanBuffer& src_buffer, api::vTensor& v_dst) { vkapi::PipelineBarrier pipeline_barrier{}; + vkapi::SpecVarList specialization_constants = {v_dst.hashed_layout()}; context->submit_compute_job( get_nchw_to_tensor_shader(v_dst, true, false), pipeline_barrier, {uint32_t(v_dst.numel()), 1, 1}, {64, 1, 1}, - {}, + specialization_constants, VK_NULL_HANDLE, 0, v_dst.buffer(