diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
index 6f7167c54fb..e9437e3bd09 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.cpp
+++ b/backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -14,6 +14,10 @@ namespace vkcompute {
 namespace api {
 
+/*
+ * Used to infer the sizes of a tensor that would correspond to a given
+ * VulkanImage.
+ */
 std::vector<int64_t> calculate_sizes(
     const vkapi::VulkanImage& image,
     const utils::GPUMemoryLayout memory_layout) {
@@ -143,58 +147,19 @@ bool dim_order_is_valid(const std::vector<int64_t>& dim_order) {
   return sum == n * (n + 1) / 2;
 }
 
-/*
- * Applies the following transformations to a tensor's dim_order vector:
- * 1. Reverse the order of elements so that the fastest moving dimensions are
- *    first.
- * 2. Convert NCHW dimension indices to WHCN indices, so that 0 represents the
- *    width dimension, 1 represents the height dimension, and 2 represents the
- *    channels dimension.
- * 3. Unsqueeze the dim_order vector to the next multiple of 4.
-
- * These transformations make it easier to use the dim order in a compute shader
- */
-std::vector<int64_t> create_whcn_dim_order(
-    const std::vector<int64_t>& dim_order) {
-  size_t ndim = dim_order.size();
-  std::vector<int64_t> whcn_order(ndim);
-
-  // Convert from NCHW to WHCN index, and flip the dim order so that the fastest
-  // moving dimension is first.
-  // example: { 1, 2, 0} -> { 2, 0, 1}
-  //          {height, width, channels} -> {channels, width, height}
-  for (size_t whcn_i = 0, nchw_i = (ndim - 1); whcn_i < ndim;
-       ++whcn_i, --nchw_i) {
-    whcn_order.at(whcn_i) = ndim - 1 - dim_order.at(nchw_i);
-  }
-
-  // Unsqueeze to the next multiple of 4
-  size_t ndim_up4 = utils::align_up_4(ndim);
-  whcn_order.resize(ndim_up4);
-
-  // Append unsqueezed dimensions
-  for (size_t i = ndim; i < ndim_up4; ++i) {
-    whcn_order.at(i) = i;
-  }
-
-  return whcn_order;
-}
-
-std::vector<int64_t> unsqueeze_strides(
-    const std::vector<int64_t>& strides,
-    const int64_t numel) {
-  const size_t ndim = strides.size();
-  const size_t ndim_up4 = utils::align_up_4(strides.size());
-  std::vector<int64_t> unsqueezed_strides(ndim_up4);
-  for (int32_t i = 1; i <= ndim; ++i) {
-    int64_t dim_stride = strides.at(ndim - i);
-    unsqueezed_strides.at(ndim_up4 - i) = dim_stride;
-  }
-
-  for (int32_t i = ndim + 1; i <= ndim_up4; ++i) {
-    unsqueezed_strides.at(ndim_up4 - i) = numel;
-  }
-  return unsqueezed_strides;
+utils::ivec4 flip_and_unsqueeze_ivec4(
+    const std::vector<int64_t>& tensor_metadata,
+    const vTensor::Attribute metadata_type,
+    const size_t numel) {
+  VK_CHECK_COND(tensor_metadata.size() <= 4);
+  std::vector<int32_t> flipped_metadata =
+      flip_and_unsqueeze<int32_t>(tensor_metadata, metadata_type, numel);
+  return {
+      flipped_metadata.at(0),
+      flipped_metadata.at(1),
+      flipped_metadata.at(2),
+      flipped_metadata.at(3),
+  };
 }
 
 std::vector<int64_t> calculate_padded_sizes(
@@ -309,7 +274,8 @@ int64_t calculate_gpu_buffer_numel(
   return numel;
 }
 
-int32_t pack_into_int32(const std::vector<int64_t>& vec, const int32_t extra) {
+template <typename T, typename = std::enable_if_t<std::is_integral<T>::value>>
+int32_t pack_into_int32(const std::vector<T>& vec, const int32_t extra) {
   int32_t packed = static_cast<int32_t>(
       vec.at(0) + (vec.at(1) << 4) + (vec.at(2) << 8) + (vec.at(3) << 12) +
       (extra << 16));
@@ -322,22 +288,24 @@ int32_t create_hashed_layout(
     const int32_t packed_dim,
     const utils::StorageType storage_type) {
   if (storage_type == utils::kBuffer) {
-    return pack_into_int32(create_whcn_dim_order(dim_order), 0);
+    return pack_into_int32(
+        flip_and_unsqueeze<int32_t>(dim_order, kTensorDimOrder, 0), 0);
   }
   return pack_into_int32(axis_map, packed_dim);
 }
 
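To make the new metadata encoding concrete, here is a minimal standalone sketch of what the flip-and-unsqueeze transformation produces for a contiguous 3-D tensor's dim order, and how the resulting WHCN order folds into the packed int32 used for the hashed layout. The helper name `flip_and_unsqueeze_demo` and the `main` driver are illustrative only and are not part of this patch; the real implementation is the `flip_and_unsqueeze` template added to Tensor.h further down in this diff.

```cpp
// Hypothetical standalone demo (names are illustrative, not part of the diff):
// shows the dim-order case of the flip-and-unsqueeze transformation and how
// pack_into_int32() would fold the resulting WHCN dim order into one int32.
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int32_t> flip_and_unsqueeze_demo(
    const std::vector<int64_t>& dim_order) {
  const size_t ndim = dim_order.size();
  const size_t ndim_up4 = (ndim + 3) / 4 * 4;
  std::vector<int32_t> out(ndim_up4);
  // Reverse so the fastest moving dim comes first, and remap NCHW -> WHCN
  // indices (0 = width, 1 = height, 2 = channels, ...).
  for (size_t i = 0; i < ndim; ++i) {
    out[i] = static_cast<int32_t>(ndim - 1 - dim_order[ndim - 1 - i]);
  }
  // Unsqueezed (padding) dims keep their own position in the order.
  for (size_t i = ndim; i < ndim_up4; ++i) {
    out[i] = static_cast<int32_t>(i);
  }
  return out;
}

int main() {
  // Contiguous NCHW dim order for a 3-D tensor: {0, 1, 2} = {C, H, W}.
  std::vector<int32_t> whcn = flip_and_unsqueeze_demo({0, 1, 2});
  assert((whcn == std::vector<int32_t>{0, 1, 2, 3}));  // {W, H, C, pad}

  // Nibble-packing as in pack_into_int32(): 4 bits per entry plus an extra.
  int32_t packed =
      whcn[0] + (whcn[1] << 4) + (whcn[2] << 8) + (whcn[3] << 12) + (0 << 16);
  assert(packed == 0x3210);
  return 0;
}
```

Flipping the order this way means element 0 of the packed vector always names the fastest-moving dimension in WHCN terms, which is the convention the buffer-indexing shaders consume.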
 size_t calculate_max_ubo_nbytes(
-    const size_t nbytes_per_ubo,
+    const size_t min_nbytes_per_ubo,
     const utils::StorageType storage_type) {
-  // For texture backed tensors, the metadata fields needed are:
-  // sizes, logical limits
-  size_t max_metadata_field_count = 2u;
+  size_t ivec4_ubo_nbytes = utils::align_up(size_t(16), min_nbytes_per_ubo);
+  size_t uvec3_ubo_nbytes = utils::align_up(size_t(12), min_nbytes_per_ubo);
+  size_t int32_ubo_nbytes = utils::align_up(size_t(4), min_nbytes_per_ubo);
   if (storage_type == utils::kBuffer) {
     // sizes, strides, dim order, numel
-    max_metadata_field_count = 4u;
+    return 3 * ivec4_ubo_nbytes + int32_ubo_nbytes;
   }
-  return max_metadata_field_count * nbytes_per_ubo;
+  // sizes, logical limits
+  return ivec4_ubo_nbytes + uvec3_ubo_nbytes;
 }
 
 //
@@ -595,8 +563,9 @@ vTensor::vTensor(
           packed_dim_,
           storage_type)),
       // Related to tensor metadata UBOs
-      nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()},
-      max_ubo_nbytes_{calculate_max_ubo_nbytes(nbytes_per_ubo_, storage_type)},
+      min_nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()},
+      max_ubo_nbytes_{
+          calculate_max_ubo_nbytes(min_nbytes_per_ubo_, storage_type)},
       uniforms_(),
       // Construct Tensor storage
       storage_(std::make_shared<vTensorStorage>(
@@ -607,23 +576,13 @@ vTensor::vTensor(
           sizes,
           dtype_,
           allocate_memory)) {
-  // Derived metadata
-  std::vector<int64_t> whcn_dim_order(4, 0);
-  std::vector<int64_t> unsqueezed_strides(4, 0);
-  // Only calculate derived metadata if needed for the desired storage type.
-  // Note that logical limits may be used by buffer storage as well in order to
-  // set global work group sizes for some compute shaders.
-  if (storage_type == utils::kBuffer) {
-    whcn_dim_order = create_whcn_dim_order(dim_order_);
-    unsqueezed_strides = unsqueeze_strides(strides_, numel_);
-  }
-
   uniform_data_ = std::make_shared<UniformData>(UniformData{
+      numel_,
       sizes_,
-      whcn_dim_order,
-      unsqueezed_strides,
-      calculate_logical_limits(storage_->image_extents_, axis_map_),
-      numel_});
+      dim_order_,
+      strides_,
+      calculate_logical_limits(storage_->image_extents_, axis_map_)});
+
   VK_CHECK_COND(
       dim_order_is_valid(dim_order_), "computed dim order is invalid");
 }
@@ -648,18 +607,18 @@ vTensor::vTensor(
           packed_dim_,
           utils::kTexture3D)),
       // Related to tensor metadata UBOs
-      nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()},
+      min_nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()},
       max_ubo_nbytes_{
-          calculate_max_ubo_nbytes(nbytes_per_ubo_, utils::kTexture3D)},
+          calculate_max_ubo_nbytes(min_nbytes_per_ubo_, utils::kTexture3D)},
       uniforms_(),
       // Construct Tensor storage
       storage_(std::make_shared<vTensorStorage>(context, image)) {
   uniform_data_ = std::make_shared<UniformData>(UniformData{
+      numel_,
       sizes_,
       {0, 0, 0, 0},
       {0, 0, 0, 0},
-      calculate_logical_limits(storage_->image_extents_, axis_map_),
-      numel_});
+      calculate_logical_limits(storage_->image_extents_, axis_map_)});
 }
 
 vTensor::vTensor(vTensor& other)
@@ -672,7 +631,7 @@ vTensor::vTensor(vTensor& other)
       strides_(other.strides_.begin(), other.strides_.end()),
       numel_(other.numel_),
       hashed_layout_(other.hashed_layout_),
-      nbytes_per_ubo_{other.nbytes_per_ubo_},
+      min_nbytes_per_ubo_{other.min_nbytes_per_ubo_},
       max_ubo_nbytes_{other.max_ubo_nbytes_},
       uniforms_(),
       // Copy Tensor storage
@@ -697,22 +656,35 @@ vTensor::vTensor(
           axis_map_,
           packed_dim_,
           other.storage_type())),
-      nbytes_per_ubo_{other.nbytes_per_ubo_},
+      min_nbytes_per_ubo_{other.min_nbytes_per_ubo_},
       max_ubo_nbytes_{other.max_ubo_nbytes_},
       uniforms_(),
       // Copy Tensor storage
       storage_(other.storage_) {
   uniform_data_ =
       std::make_shared<UniformData>(UniformData{
+          static_cast<size_t>(utils::multiply_integers(sizes_)),
           sizes_,
-          create_whcn_dim_order(dim_order_),
-          unsqueeze_strides(strides_, numel_),
-          other.logical_limits(),
-          static_cast<size_t>(utils::multiply_integers(sizes_))});
+          dim_order_,
+          strides_,
+          other.logical_limits()});
 
   VK_CHECK_COND(
       dim_order_is_valid(dim_order_), "new dim order provided is invalid");
 }
 
+vTensor::UniformData::UniformData(
+    const size_t numel_ll,
+    const std::vector<int64_t>& sizes,
+    const std::vector<int64_t>& dim_order,
+    const std::vector<int64_t>& strides,
+    const utils::uvec3& limits)
+    : numel(utils::safe_downcast<int32_t>(numel_ll)),
+      sizes_v(flip_and_unsqueeze_ivec4(sizes, kTensorSizes, numel_ll)),
+      dim_order_v(
+          flip_and_unsqueeze_ivec4(dim_order, kTensorDimOrder, numel_ll)),
+      strides_v(flip_and_unsqueeze_ivec4(strides, kTensorStrides, numel_ll)),
+      logical_limits(limits) {}
+
 uint32_t vTensor::UniformData::write_attribute(
     void* dst,
     const uint32_t dst_offset,
@@ -727,11 +699,11 @@ uint32_t vTensor::UniformData::write_attribute(
     return sizeof(member_name);                                   \
   }
   switch (attr) {
+    WRITE_ATTRIBUTE_CASE(NUMEL, numel);
     WRITE_ATTRIBUTE_CASE(SIZES, sizes_v);
-    WRITE_ATTRIBUTE_CASE(WHCN_DIM_ORDER, whcn_dim_order_v);
+    WRITE_ATTRIBUTE_CASE(WHCN_DIM_ORDER, dim_order_v);
     WRITE_ATTRIBUTE_CASE(STRIDES, strides_v);
     WRITE_ATTRIBUTE_CASE(LOGICAL_LIMITS, logical_limits);
-    WRITE_ATTRIBUTE_CASE(NUMEL, numel);
     default:
       VK_THROW("Invalid Attribute");
   }
@@ -806,84 +778,25 @@ size_t vTensor::get_max_ubo_nbytes(const size_t nbytes_per_ubo) const {
 }
 
 const vkapi::BufferBindInfo vTensor::sizes_ubo() {
-  if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
-  }
-  if (sizes_uniform_offset_ == kUniformOffsetUnset) {
-    VK_CHECK_COND(
-        (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_,
-        "Uniform data allocation has exceeded Tensor uniform buffer size");
-    sizes_uniform_offset_ = uniforms_size_;
-    uniforms_size_ += nbytes_per_ubo_;
-    uniforms_.update(utils::make_whcn_ivec4(sizes_), sizes_uniform_offset_);
-  }
-  return vkapi::BufferBindInfo(
-      uniforms_.buffer(), sizes_uniform_offset_, nbytes_per_ubo_);
+  return metadata_ubo_impl(&sizes_uniform_offset_, uniform_data_->sizes_v);
 }
 
 const vkapi::BufferBindInfo vTensor::dim_order_ubo() {
-  if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
-  }
-  if (dim_order_uniform_offset_ == kUniformOffsetUnset) {
-    VK_CHECK_COND(
-        (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_,
-        "Uniform data allocation has exceeded Tensor uniform buffer size");
-    dim_order_uniform_offset_ = uniforms_size_;
-    uniforms_size_ += nbytes_per_ubo_;
-    uniforms_.update(
-        uniform_data_->whcn_dim_order_v, dim_order_uniform_offset_);
-  }
-  return vkapi::BufferBindInfo(
-      uniforms_.buffer(), dim_order_uniform_offset_, nbytes_per_ubo_);
+  return metadata_ubo_impl(
+      &dim_order_uniform_offset_, uniform_data_->dim_order_v);
 }
 
 const vkapi::BufferBindInfo vTensor::strides_ubo() {
-  if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
-  }
-  if (strides_uniform_offset == kUniformOffsetUnset) {
-    VK_CHECK_COND(
-        (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_,
-        "Uniform data allocation has exceeded Tensor uniform buffer size");
-    strides_uniform_offset = uniforms_size_;
-    uniforms_size_ += nbytes_per_ubo_;
-    uniforms_.update(uniform_data_->strides_v, strides_uniform_offset);
-  }
-  return vkapi::BufferBindInfo(
-      uniforms_.buffer(), strides_uniform_offset, nbytes_per_ubo_);
+  return metadata_ubo_impl(&strides_uniform_offset, uniform_data_->strides_v);
 }
 
 const vkapi::BufferBindInfo vTensor::logical_limits_ubo() {
-  if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
-  }
-  if (logical_limits_uniform_offset_ == kUniformOffsetUnset) {
-    VK_CHECK_COND(
-        (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_,
-        "Uniform data allocation has exceeded Tensor uniform buffer size");
-    logical_limits_uniform_offset_ = uniforms_size_;
-    uniforms_size_ += nbytes_per_ubo_;
-    uniforms_.update(logical_limits(), logical_limits_uniform_offset_);
-  }
-  return vkapi::BufferBindInfo(
-      uniforms_.buffer(), logical_limits_uniform_offset_, nbytes_per_ubo_);
+  return metadata_ubo_impl(
+      &logical_limits_uniform_offset_, uniform_data_->logical_limits);
 }
 
 const vkapi::BufferBindInfo vTensor::numel_ubo() {
-  if (!uniforms_.buffer()) {
-    uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
-  }
-  if (numel_uniform_offset_ == kUniformOffsetUnset) {
-    VK_CHECK_COND(
-        (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_,
-        "Uniform data allocation has exceeded Tensor uniform buffer size");
-    numel_uniform_offset_ = uniforms_size_;
-    uniforms_size_ += nbytes_per_ubo_;
-    uniforms_.update(numel(), numel_uniform_offset_);
-  }
-  return vkapi::BufferBindInfo(
-      uniforms_.buffer(), numel_uniform_offset_, nbytes_per_ubo_);
+  return metadata_ubo_impl(&numel_uniform_offset_, uniform_data_->numel);
 }
 
 VkMemoryRequirements vTensor::get_memory_requirements() const {
@@ -936,13 +849,13 @@ void vTensor::update_metadata() {
   strides_ = calculate_strides(sizes_, dim_order_);
 
   // Update uniform data if it has been modified
-  uniform_data_->numel = numel_;
-  uniform_data_->sizes_v = utils::make_whcn_ivec4(sizes_);
-  uniform_data_->whcn_dim_order_v =
-      utils::make_ivec4(create_whcn_dim_order(dim_order_));
-  uniform_data_->strides_v =
-      utils::make_whcn_ivec4(unsqueeze_strides(strides_, numel_));
   uniform_data_->numel = utils::safe_downcast<int32_t>(numel_);
+  uniform_data_->sizes_v =
+      flip_and_unsqueeze_ivec4(sizes_, kTensorSizes, numel_);
+  uniform_data_->dim_order_v =
+      flip_and_unsqueeze_ivec4(dim_order_, kTensorDimOrder, numel_);
+  uniform_data_->strides_v =
+      flip_and_unsqueeze_ivec4(strides_, kTensorStrides, numel_);
   uniform_data_->logical_limits.limits =
       calculate_logical_limits(sizes_, axis_map_, packed_dim_);
 
@@ -950,8 +863,7 @@ void vTensor::update_metadata() {
     uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_);
   }
   if (dim_order_uniform_offset_ != kUniformOffsetUnset) {
-    uniforms_.update(
-        uniform_data_->whcn_dim_order_v, dim_order_uniform_offset_);
+    uniforms_.update(uniform_data_->dim_order_v, dim_order_uniform_offset_);
   }
   if (strides_uniform_offset != kUniformOffsetUnset) {
     uniforms_.update(uniform_data_->strides_v, strides_uniform_offset);
diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h
index bcca956e5ea..fefbd2aa71a 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.h
+++ b/backends/vulkan/runtime/api/containers/Tensor.h
@@ -36,10 +36,6 @@ std::vector<int64_t> calculate_strides(
     const std::vector<int64_t>& sizes,
     const std::vector<int64_t>& dim_order);
 
-std::vector<int64_t> unsqueeze_strides(
-    const std::vector<int64_t>& strides,
-    const int64_t numel);
-
 /*
  * When stored on the GPU, tensor data is stored using texels (i.e. a vector of
  * 4 scalar values) in order to take advantage of the GPU's native vectorization
@@ -236,28 +232,23 @@ class vTensor final {
   };
 
   class UniformData {
+    // Contains the number of elements in the tensor according to the canonical
+    // sizes.
+    int32_t numel;
     utils::ivec4 sizes_v;
-    utils::ivec4 whcn_dim_order_v;
+    utils::ivec4 dim_order_v;
     utils::ivec4 strides_v;
     // See the comments documenting logical_limits() for more context.
     TextureLimits logical_limits;
-    // Contains the number of elements in the tensor according to the canonical
-    // sizes.
-    int32_t numel;
 
     friend class vTensor;
 
     UniformData(
+        const size_t numel_ll,
         const std::vector<int64_t>& sizes,
-        const std::vector<int64_t>& whcn_dim_order,
+        const std::vector<int64_t>& dim_order,
         const std::vector<int64_t>& strides,
-        const utils::uvec3& logical_limits,
-        const size_t numel_ll)
-        : sizes_v(utils::make_whcn_ivec4(sizes)),
-          whcn_dim_order_v(utils::make_ivec4(whcn_dim_order)),
-          strides_v(utils::make_whcn_ivec4(strides)),
-          logical_limits(logical_limits),
-          numel(utils::safe_downcast<int32_t>(numel_ll)) {}
+        const utils::uvec3& limits);
 
    public:
     /*
@@ -326,7 +317,7 @@ class vTensor final {
   int32_t hashed_layout_;
 
   // Pre-compute these quantities to avoid frequent re-computation
-  size_t nbytes_per_ubo_;
+  size_t min_nbytes_per_ubo_;
   size_t max_ubo_nbytes_;
 
   /*
@@ -523,6 +514,26 @@ class vTensor final {
 
   size_t get_max_ubo_nbytes(const size_t nbytes_per_ubo) const;
 
+  template <typename T>
+  const vkapi::BufferBindInfo metadata_ubo_impl(
+      uint32_t* param_buffer_offset,
+      const T& data) {
+    if (!uniforms_.buffer()) {
+      uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true);
+    }
+    size_t ubo_nbytes = utils::align_up(sizeof(data), min_nbytes_per_ubo_);
+    if (*param_buffer_offset == kUniformOffsetUnset) {
+      VK_CHECK_COND(
+          (uniforms_size_ + ubo_nbytes) <= max_ubo_nbytes_,
+          "Uniform data allocation has exceeded Tensor uniform buffer size");
+      *param_buffer_offset = uniforms_size_;
+      uniforms_size_ += ubo_nbytes;
+      uniforms_.update(data, *param_buffer_offset);
+    }
+    return vkapi::BufferBindInfo(
+        uniforms_.buffer(), *param_buffer_offset, ubo_nbytes);
+  }
+
  public:
   /*
    * The functions below return the buffer binding info for a UBO that contains
@@ -649,5 +660,70 @@ static constexpr vTensor::Attribute kTensorLogicalLimits =
     vTensor::Attribute::LOGICAL_LIMITS;
 static constexpr vTensor::Attribute kTensorNumel = vTensor::Attribute::NUMEL;
 
+/*
+ * Prepare a tensor metadata vector for consumption on the GPU:
+ * 1. Convert NCHW dim order and indices to WHCN dim order and indices
+ * 2. Unsqueeze to the next multiple of 4 dims
+ * 3. Convert to requested output dtype
+ */
+template <
+    typename T,
+    typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
+std::vector<T> flip_and_unsqueeze(
+    const std::vector<int64_t>& tensor_metadata,
+    const vTensor::Attribute metadata_type,
+    const size_t numel,
+    const int32_t fixed_ndim = -1) {
+  const size_t ndim = tensor_metadata.size();
+  size_t ndim_up4 =
+      std::max(utils::align_up_4(tensor_metadata.size()), size_t(4));
+
+  if (fixed_ndim > 0) {
+    VK_CHECK_COND(fixed_ndim >= ndim);
+    ndim_up4 = static_cast<size_t>(fixed_ndim);
+  }
+
+  std::vector<T> flipped_metadata(ndim_up4);
+
+  for (int flipped_i = 0; flipped_i < ndim; ++flipped_i) {
+    T val_at_dim =
+        utils::safe_downcast<T>(tensor_metadata.at(ndim - 1 - flipped_i));
+    if (metadata_type == kTensorDimOrder) {
+      val_at_dim = utils::safe_downcast<T>(ndim - 1 - val_at_dim);
+    }
+    flipped_metadata.at(flipped_i) = val_at_dim;
+  }
+
+  switch (metadata_type) {
+    case kTensorStrides:
+      for (int unsqueezed_i = ndim; unsqueezed_i < ndim_up4; ++unsqueezed_i) {
+        flipped_metadata.at(unsqueezed_i) = utils::safe_downcast<T>(numel);
+      }
+      break;
+    case kTensorDimOrder:
+      for (int unsqueezed_i = ndim; unsqueezed_i < ndim_up4; ++unsqueezed_i) {
+        flipped_metadata.at(unsqueezed_i) =
+            utils::safe_downcast<T>(unsqueezed_i);
+      }
+      break;
+    // Default: unsqueeze with ones
+    default:
+      for (int unsqueezed_i = ndim; unsqueezed_i < ndim_up4; ++unsqueezed_i) {
+        flipped_metadata.at(unsqueezed_i) = utils::safe_downcast<T>(1);
+      }
+      break;
+  }
+
+  return flipped_metadata;
+}
+
+/*
+ * Same as flip and unsqueeze, but returns the metadata as an `ivec4`.
+ */
+utils::ivec4 flip_and_unsqueeze_ivec4(
+    const std::vector<int64_t>& tensor_metadata,
+    const vTensor::Attribute metadata_type,
+    const size_t numel);
+
 } // namespace api
 } // namespace vkcompute
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index 9a857f41fde..a193d02da88 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -114,7 +114,7 @@ TEST_F(VulkanComputeAPITest, print_shader_executable_properties) {
 std::vector<int64_t> get_reference_strides(
     const std::vector<int64_t>& sizes,
     const utils::GPUMemoryLayout layout,
-    const bool unsqueezed = false) {
+    const bool flip_unsqueezed = false) {
   int64_t C = utils::val_at(-3, sizes);
   int64_t H = utils::val_at(-2, sizes);
   int64_t W = utils::val_at(-1, sizes);
@@ -125,18 +125,20 @@ std::vector<int64_t> get_reference_strides(
     case utils::kWidthPacked:
       switch (sizes.size()) {
         case 1:
-          if (unsqueezed)
-            return {numel, numel, numel, 1};
+          if (flip_unsqueezed)
+            return {1, numel, numel, numel};
           return {1};
         case 2:
-          if (unsqueezed)
-            return {numel, numel, W, 1};
+          if (flip_unsqueezed)
+            return {1, W, numel, numel};
           return {W, 1};
         case 3:
-          if (unsqueezed)
-            return {numel, H * W, W, 1};
+          if (flip_unsqueezed)
+            return {1, W, H * W, numel};
           return {H * W, W, 1};
         case 4:
+          if (flip_unsqueezed)
+            return {1, W, H * W, C * H * W};
           return {C * H * W, H * W, W, 1};
         default:
           return {};
@@ -145,18 +147,21 @@ std::vector<int64_t> get_reference_strides(
     case utils::kHeightPacked:
       switch (sizes.size()) {
         case 1:
-          if (unsqueezed)
-            return {numel, numel, numel, 1};
+          if (flip_unsqueezed)
+            return {1, numel, numel, numel};
           return {1};
         case 2:
-          if (unsqueezed)
-            return {numel, numel, 1, H};
+          if (flip_unsqueezed)
+            return {H, 1, numel, numel};
+          return {1, H};
           return {1, H};
         case 3:
-          if (unsqueezed)
-            return {numel, H * W, 1, H};
+          if (flip_unsqueezed)
+            return {H, 1, H * W, numel};
           return {W * H, 1, H};
         case 4:
+          if (flip_unsqueezed)
+            return {H, 1, W * H, C * W * H};
           return {C * W * H, W * H, 1, H};
         default:
           return {};
@@ -164,18 +169,20 @@ std::vector<int64_t> get_reference_strides(
     case utils::kChannelsPacked:
       switch (sizes.size()) {
         case 1:
-          if (unsqueezed)
-            return {numel, numel, numel, 1};
+          if (flip_unsqueezed)
+            return {1, numel, numel, numel};
           return {1};
         case 2:
-          if (unsqueezed)
-            return {numel, numel, W, 1};
+          if (flip_unsqueezed)
+            return {1, W, numel, numel};
           return {W, 1};
         case 3:
-          if (unsqueezed)
-            return {numel, 1, W * C, C};
+          if (flip_unsqueezed)
+            return {C, W * C, 1, numel};
           return {1, W * C, C};
         case 4:
+          if (flip_unsqueezed)
+            return {C, W * C, 1, H * W * C};
           return {H * W * C, 1, W * C, C};
         default:
           return {};
@@ -184,6 +191,41 @@ std::vector<int64_t> get_reference_strides(
   return {};
 }
 
+/*
+ * Applies the following transformations to a tensor's dim_order vector:
+ * 1. Reverse the order of elements so that the fastest moving dimensions are
+ *    first.
+ * 2. Convert NCHW dimension indices to WHCN indices, so that 0 represents the
+ *    width dimension, 1 represents the height dimension, and 2 represents the
+ *    channels dimension.
+ * 3. Unsqueeze the dim_order vector to the next multiple of 4.
+ */
+std::vector<int64_t> create_whcn_dim_order(
+    const std::vector<int64_t>& dim_order) {
+  size_t ndim = dim_order.size();
+  std::vector<int64_t> whcn_order(ndim);
+
+  // Convert from NCHW to WHCN index, and flip the dim order so that the fastest
+  // moving dimension is first.
+  // example: { 1, 2, 0} -> { 2, 0, 1}
+  //          {height, width, channels} -> {channels, width, height}
+  for (size_t whcn_i = 0, nchw_i = (ndim - 1); whcn_i < ndim;
+       ++whcn_i, --nchw_i) {
+    whcn_order.at(whcn_i) = ndim - 1 - dim_order.at(nchw_i);
+  }
+
+  // Unsqueeze to the next multiple of 4
+  size_t ndim_up4 = utils::align_up_4(ndim);
+  whcn_order.resize(ndim_up4);
+
+  // Append unsqueezed dimensions
+  for (size_t i = ndim; i < ndim_up4; ++i) {
+    whcn_order.at(i) = i;
+  }
+
+  return whcn_order;
+}
+
 TEST_F(VulkanComputeAPITest, empty_init_shader_info_test) {
   vkapi::ShaderInfo empty_shader_info;
   EXPECT_FALSE(empty_shader_info);
@@ -191,6 +233,20 @@ TEST_F(VulkanComputeAPITest, empty_init_shader_info_test) {
   EXPECT_TRUE(empty_shader_info.src_code.size == 0u);
 }
 
+bool compare_vectors(
+    const std::vector<int32_t>& v32,
+    const std::vector<int64_t>& v64) {
+  if (v32.size() != v64.size()) {
+    return false;
+  }
+  for (size_t i = 0; i < v32.size(); ++i) {
+    if (static_cast<int64_t>(v32[i]) != v64[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
 TEST_F(VulkanComputeAPITest, calculate_dim_order_test) {
   // ndim, GPUMemoryLayout, expected dim order pairs
   std::vector>> test_cases = {
@@ -238,17 +294,27 @@
       std::vector<int64_t> dim_order =
           calculate_dim_order(sizes.size(), packed_dim);
       std::vector<int64_t> strides = calculate_strides(sizes, dim_order);
+      int64_t numel = utils::multiply_integers(sizes);
+
       std::vector<int64_t> ref_strides = get_reference_strides(sizes, layout);
       ASSERT_TRUE(strides == ref_strides);
 
-      int64_t numel = utils::multiply_integers(sizes);
       std::vector<int64_t> unsqueezed_strides =
-          unsqueeze_strides(strides, numel);
+          flip_and_unsqueeze<int64_t>(strides, kTensorStrides, numel);
+
       std::vector<int64_t> ref_unsqueezed_strides =
          get_reference_strides(sizes, layout, true);
       ASSERT_TRUE(unsqueezed_strides == ref_unsqueezed_strides);
 
+      std::vector<int64_t> whcn_dim_order =
+          flip_and_unsqueeze<int64_t>(dim_order, kTensorDimOrder, numel);
+
+      std::vector<int64_t> ref_whcn_dim_order =
+          create_whcn_dim_order(dim_order);
+
+      ASSERT_TRUE(whcn_dim_order == ref_whcn_dim_order);
+
       // Create new vTensor and check that the strides are correct
       vTensor new_v_tensor(
           context(),