diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
index a85229b2b86..43ebbfecbc6 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.cpp
+++ b/backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -143,6 +143,43 @@ bool dim_order_is_valid(const std::vector<int64_t>& dim_order) {
   return sum == n * (n + 1) / 2;
 }
 
+/*
+ * Applies the following transformations to a tensor's dim_order vector:
+ * 1. Reverse the order of elements so that the fastest moving dimensions are
+ *    first.
+ * 2. Convert NCHW dimension indices to WHCN indices, so that 0 represents the
+ *    width dimension, 1 represents the height dimension, and 2 represents the
+ *    channels dimension.
+ * 3. Unsqueeze the dim_order vector to the next multiple of 4.
+ *
+ * These transformations make it easier to use the dim order in a compute shader.
+ */
+std::vector<int64_t> create_whcn_dim_order(
+    const std::vector<int64_t>& dim_order) {
+  size_t ndim = dim_order.size();
+  std::vector<int64_t> whcn_order(ndim);
+
+  // Convert from NCHW to WHCN index, and flip the dim order so that the
+  // fastest moving dimension is first.
+  // example: {     1,     2,        0} -> {       2,     0,      1}
+  //          {height, width, channels} -> {channels, width, height}
+  for (size_t whcn_i = 0, nchw_i = (ndim - 1); whcn_i < ndim;
+       ++whcn_i, --nchw_i) {
+    whcn_order.at(whcn_i) = ndim - 1 - dim_order.at(nchw_i);
+  }
+
+  // Unsqueeze to the next multiple of 4
+  size_t ndim_up4 = utils::align_up_4(ndim);
+  whcn_order.resize(ndim_up4);
+
+  // Append unsqueezed dimensions
+  for (size_t i = ndim; i < ndim_up4; ++i) {
+    whcn_order.at(i) = i;
+  }
+
+  return whcn_order;
+}
+
 std::vector<int64_t> unsqueeze_strides(
     const std::vector<int64_t>& strides,
     const int64_t numel) {
@@ -212,6 +249,97 @@ utils::uvec3 calculate_image_extents(
   return extents;
 }
 
+/*
+ * The physical image extents describe the size of an allocated texture
+ * resource, i.e. how many texels are in the width, height, and depth axes of
+ * the image. However, the axis map allows a tensor's logical dimensions to map
+ * to different physical texture axes; in essence, it describes a permutation
+ * between the logical width, height, channels, etc. dimensions of a tensor and
+ * the width, height, depth axes of a texture.
+ *
+ * The "logical extents" are simply the physical image extents permuted by the
+ * axis mapping. The logical extents are useful for constructing global work
+ * group sizes, so that it is easier to convert the global thread ID to a
+ * tensor index.
+ */
+utils::uvec3 calculate_logical_limits(
+    const utils::uvec3& image_extents,
+    const std::vector<int64_t>& axis_map) {
+  return {
+      image_extents[axis_map.at(0)],
+      image_extents[axis_map.at(1)],
+      image_extents[axis_map.at(2)],
+  };
+}
+
+/*
+ * Convenience overload of the above function to calculate logical limits
+ * directly from tensor sizes.
+ */
+utils::uvec3 calculate_logical_limits(
+    const std::vector<int64_t>& sizes,
+    const std::vector<int64_t>& axis_map,
+    const int32_t packed_dim) {
+  return calculate_logical_limits(
+      calculate_image_extents(
+          calculate_padded_sizes(sizes, packed_dim), axis_map, packed_dim),
+      axis_map);
+}
+
+int64_t calculate_gpu_buffer_numel(
+    Context* const context,
+    const std::vector<int64_t>& sizes,
+    const utils::uvec3 image_extents,
+    const utils::StorageType storage_type,
+    const vkapi::ScalarType dtype) {
+  // For texture backed tensors, simply multiply the total number of texels by 4
+  if (storage_type != utils::kBuffer) {
+    return image_extents[0] * image_extents[1] * image_extents[2] * 4;
+  }
+  const bool is_int8 = dtype == vkapi::kChar;
+  const bool int8_supported =
+      context->adapter_ptr()->has_full_int8_buffers_support();
+  const size_t numel = utils::multiply_integers(sizes);
+  // For int8 tensors, if the device does not support int8 buffers, then int32
+  // is used instead to represent the buffer data. Therefore the number of
+  // elements in the buffer is aligned to the next multiple of 4.
+  if (is_int8 && !int8_supported) {
+    return utils::align_up_4(numel);
+  }
+  return numel;
+}
+
+int32_t pack_into_int32(const std::vector<int64_t>& vec, const int32_t extra) {
+  int32_t packed = static_cast<int32_t>(
+      vec.at(0) + (vec.at(1) << 4) + (vec.at(2) << 8) + (vec.at(3) << 12) +
+      (extra << 16));
+  return packed;
+}
+
+int32_t create_hashed_layout(
+    const std::vector<int64_t>& dim_order,
+    const std::vector<int64_t>& axis_map,
+    const int32_t packed_dim,
+    const utils::StorageType storage_type) {
+  if (storage_type == utils::kBuffer) {
+    return pack_into_int32(create_whcn_dim_order(dim_order), 0);
+  }
+  return pack_into_int32(axis_map, packed_dim);
+}
+
+size_t calculate_max_ubo_nbytes(
+    const size_t nbytes_per_ubo,
+    const utils::StorageType storage_type) {
+  // For texture backed tensors, the metadata fields needed are:
+  // sizes, logical limits
+  size_t max_metadata_field_count = 2u;
+  if (storage_type == utils::kBuffer) {
+    // sizes, strides, dim order, numel
+    max_metadata_field_count = 4u;
+  }
+  return max_metadata_field_count * nbytes_per_ubo;
+}
+
 //
 // vTensorStorage
 //
@@ -322,14 +450,21 @@ vTensorStorage::vTensorStorage(
     const utils::StorageType storage_type,
     const std::vector<int64_t>& axis_map,
     const int32_t packed_dim,
-    const std::vector<int64_t>& padded_sizes,
+    const std::vector<int64_t>& sizes,
     const vkapi::ScalarType dtype,
     const bool allocate_memory)
     : context_(context),
      storage_type_{storage_type},
-      image_extents_(
-          calculate_image_extents(padded_sizes, axis_map, packed_dim)),
-      buffer_length_{utils::multiply_integers(padded_sizes)},
+      image_extents_(calculate_image_extents(
+          calculate_padded_sizes(sizes, packed_dim),
+          axis_map,
+          packed_dim)),
+      buffer_length_{calculate_gpu_buffer_numel(
+          context_,
+          sizes,
+          image_extents_,
+          storage_type,
+          dtype)},
       buffer_offset_{0},
       image_(allocate_image(
          context_,
@@ -446,35 +581,45 @@ vTensor::vTensor(
       dim_order_(calculate_dim_order(sizes_.size(), packed_dim_)),
       axis_map_(calculate_axis_map(sizes_, axis_map_layout)),
       strides_(calculate_strides(sizes, dim_order_)),
-      padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)},
-      unsqueezed_strides_{
-          unsqueeze_strides(strides_, utils::multiply_integers(sizes_))},
-      padded_numel_(utils::multiply_integers(padded_sizes_)),
+      numel_(utils::multiply_integers(sizes_)),
+      hashed_layout_(create_hashed_layout(
+          dim_order_,
+          axis_map_,
+          packed_dim_,
+          storage_type)),
+      // Related to tensor metadata UBOs
+
nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, + max_ubo_nbytes_{calculate_max_ubo_nbytes(nbytes_per_ubo_, storage_type)}, uniforms_(), - // Utility Uniform Buffers that can be passed to shaders as arguments - uniforms_size_(0), - sizes_uniform_offset_(kUniformOffsetUnset), - unsqueezed_strides_offset_(kUniformOffsetUnset), - numel_uniform_offset_(kUniformOffsetUnset), - logical_limits_uniform_offset_(kUniformOffsetUnset), // Construct Tensor storage storage_(std::make_shared( context, storage_type, axis_map_, packed_dim_, - padded_sizes_, + sizes, dtype_, allocate_memory)) { + // Derived metadata + std::vector whcn_dim_order(4, 0); + std::vector unsqueezed_strides(4, 0); + // Only calculate derived metadata if needed for the desired storage type. + // Note that logical limits may be used by buffer storage as well in order to + // set global work group sizes for some compute shaders. + if (storage_type == utils::kBuffer) { + whcn_dim_order = create_whcn_dim_order(dim_order_); + unsqueezed_strides = unsqueeze_strides(strides_, numel_); + } + uniform_data_ = std::make_shared(UniformData{ sizes_, - unsqueezed_strides_, - {{0, 0, 0}}, - static_cast(utils::multiply_integers(sizes_))}); + whcn_dim_order, + unsqueezed_strides, + TextureLimits( + calculate_logical_limits(storage_->image_extents_, axis_map_)), + numel_}); VK_CHECK_COND( dim_order_is_valid(dim_order_), "computed dim order is invalid"); - - set_logical_limits(storage_->image_extents_); } // NOLINTNEXTLINE @@ -490,24 +635,23 @@ vTensor::vTensor( dim_order_(), axis_map_(calculate_axis_map(sizes_, axis_map_layout)), strides_(), - padded_sizes_(calculate_padded_sizes(sizes_, packed_dim_)), - unsqueezed_strides_(), - padded_numel_(utils::multiply_integers(padded_sizes_)), + numel_(utils::multiply_integers(sizes_)), + hashed_layout_(create_hashed_layout( + dim_order_, + axis_map_, + packed_dim_, + utils::kTexture3D)), + // Related to tensor metadata UBOs + nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, + max_ubo_nbytes_{ + calculate_max_ubo_nbytes(nbytes_per_ubo_, utils::kTexture3D)}, uniforms_(), - // Utility Uniform Buffers that can be passed to shaders as arguments - uniforms_size_(0), - sizes_uniform_offset_(kUniformOffsetUnset), - unsqueezed_strides_offset_(kUniformOffsetUnset), - numel_uniform_offset_(kUniformOffsetUnset), - logical_limits_uniform_offset_(kUniformOffsetUnset), // Construct Tensor storage storage_(std::make_shared(context, image)) { - uniform_data_ = std::make_shared(UniformData{ - sizes_, - {0, 0, 0, 0}, - {{0, 0, 0}}, - static_cast(utils::multiply_integers(sizes_))}); - set_logical_limits(storage_->image_extents_); + TextureLimits logical_limits( + calculate_logical_limits(storage_->image_extents_, axis_map_)); + uniform_data_ = std::make_shared( + UniformData{sizes_, {0, 0, 0, 0}, {0, 0, 0, 0}, logical_limits, numel_}); } vTensor::vTensor(vTensor& other) @@ -518,18 +662,11 @@ vTensor::vTensor(vTensor& other) dim_order_(other.dim_order_.begin(), other.dim_order_.end()), axis_map_(other.axis_map_.begin(), other.axis_map_.end()), strides_(other.strides_.begin(), other.strides_.end()), - padded_sizes_{other.padded_sizes_.begin(), other.padded_sizes_.end()}, - unsqueezed_strides_{ - other.unsqueezed_strides_.begin(), - other.unsqueezed_strides_.end()}, - padded_numel_(other.padded_numel_), + numel_(other.numel_), + hashed_layout_(other.hashed_layout_), + nbytes_per_ubo_{other.nbytes_per_ubo_}, + max_ubo_nbytes_{other.max_ubo_nbytes_}, uniforms_(), - // Empty initialize Utility Uniform 
Buffers - uniforms_size_(0), - sizes_uniform_offset_(kUniformOffsetUnset), - unsqueezed_strides_offset_(kUniformOffsetUnset), - numel_uniform_offset_(kUniformOffsetUnset), - logical_limits_uniform_offset_(kUniformOffsetUnset), // Copy Tensor storage storage_(other.storage_) { uniform_data_ = std::make_shared(*other.get_uniform_data()); @@ -546,22 +683,21 @@ vTensor::vTensor( dim_order_(dim_order.begin(), dim_order.end()), axis_map_(calculate_axis_map(sizes_, utils::kDefaultAxisMap)), strides_(calculate_strides(sizes_, dim_order_)), - padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)}, - unsqueezed_strides_{ - unsqueeze_strides(strides_, utils::multiply_integers(sizes_))}, - padded_numel_(utils::multiply_integers(padded_sizes_)), + numel_(other.numel_), + hashed_layout_(create_hashed_layout( + dim_order_, + axis_map_, + packed_dim_, + other.storage_type())), + nbytes_per_ubo_{other.nbytes_per_ubo_}, + max_ubo_nbytes_{other.max_ubo_nbytes_}, uniforms_(), - // Empty initialize Utility Uniform Buffers - uniforms_size_(0), - sizes_uniform_offset_(kUniformOffsetUnset), - unsqueezed_strides_offset_(kUniformOffsetUnset), - numel_uniform_offset_(kUniformOffsetUnset), - logical_limits_uniform_offset_(kUniformOffsetUnset), // Copy Tensor storage storage_(other.storage_) { uniform_data_ = std::make_shared(UniformData{ sizes_, - unsqueezed_strides_, + create_whcn_dim_order(dim_order_), + unsqueeze_strides(strides_, numel_), {other.logical_limits()}, static_cast(utils::multiply_integers(sizes_))}); @@ -584,6 +720,7 @@ uint32_t vTensor::UniformData::write_attribute( } switch (attr) { WRITE_ATTRIBUTE_CASE(SIZES, sizes_v); + WRITE_ATTRIBUTE_CASE(WHCN_DIM_ORDER, whcn_dim_order_v); WRITE_ATTRIBUTE_CASE(STRIDES, strides_v); WRITE_ATTRIBUTE_CASE(LOGICAL_LIMITS, logical_limits); WRITE_ATTRIBUTE_CASE(NUMEL, numel); @@ -624,12 +761,6 @@ vkapi::VulkanBuffer& vTensor::buffer( return storage_->buffer_; } -void vTensor::set_logical_limits(const utils::uvec3& image_extents) { - uniform_data_->logical_limits.limits[0] = image_extents[axis_map_.at(0)]; - uniform_data_->logical_limits.limits[1] = image_extents[axis_map_.at(1)]; - uniform_data_->logical_limits.limits[2] = image_extents[axis_map_.at(2)]; -} - utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { switch (packed_dim_) { case WHCN::kWidthDim: @@ -643,95 +774,108 @@ utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { } } +bool vTensor::is_contiguous() const { + if (storage_type() != utils::kBuffer) { + return false; + } + for (size_t i = 0; i < dim_order_.size(); ++i) { + if (dim_order_.at(i) != i) { + return false; + } + } + return true; +} + +size_t vTensor::get_max_ubo_nbytes(const size_t nbytes_per_ubo) const { + // For texture backed tensors, the metadata fields needed are: + // sizes, logical limits + size_t max_metadata_field_count = 2u; + if (storage_type() == utils::kBuffer) { + // sizes, strides, dim order, numel + max_metadata_field_count = 4u; + } + return max_metadata_field_count * nbytes_per_ubo; +} + const vkapi::BufferBindInfo vTensor::sizes_ubo() { - const size_t size_per_ubo = - storage_->context_->adapter_ptr()->min_ubo_alignment(); - const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_->context_, max_ubo_size, true); + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); } if (sizes_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + size_per_ubo) <= max_ubo_size, + (uniforms_size_ + 
nbytes_per_ubo_) <= max_ubo_nbytes_, "Uniform data allocation has exceeded Tensor uniform buffer size"); sizes_uniform_offset_ = uniforms_size_; - uniforms_size_ += size_per_ubo; + uniforms_size_ += nbytes_per_ubo_; uniforms_.update(utils::make_whcn_ivec4(sizes_), sizes_uniform_offset_); } return vkapi::BufferBindInfo( - uniforms_.buffer(), sizes_uniform_offset_, size_per_ubo); + uniforms_.buffer(), sizes_uniform_offset_, nbytes_per_ubo_); } -const vkapi::BufferBindInfo vTensor::strides_ubo() { - const size_t size_per_ubo = - storage_->context_->adapter_ptr()->min_ubo_alignment(); - const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; +const vkapi::BufferBindInfo vTensor::dim_order_ubo() { if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_->context_, max_ubo_size, true); + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); } - if (unsqueezed_strides_offset_ == kUniformOffsetUnset) { + if (dim_order_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + size_per_ubo) <= max_ubo_size, + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, "Uniform data allocation has exceeded Tensor uniform buffer size"); - unsqueezed_strides_offset_ = uniforms_size_; - uniforms_size_ += size_per_ubo; + dim_order_uniform_offset_ = uniforms_size_; + uniforms_size_ += nbytes_per_ubo_; uniforms_.update( - utils::make_whcn_ivec4(unsqueezed_strides_), - unsqueezed_strides_offset_); + uniform_data_->whcn_dim_order_v, dim_order_uniform_offset_); + } + return vkapi::BufferBindInfo( + uniforms_.buffer(), dim_order_uniform_offset_, nbytes_per_ubo_); +} + +const vkapi::BufferBindInfo vTensor::strides_ubo() { + if (!uniforms_.buffer()) { + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); + } + if (strides_uniform_offset == kUniformOffsetUnset) { + VK_CHECK_COND( + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, + "Uniform data allocation has exceeded Tensor uniform buffer size"); + strides_uniform_offset = uniforms_size_; + uniforms_size_ += nbytes_per_ubo_; + uniforms_.update(uniform_data_->strides_v, strides_uniform_offset); } return vkapi::BufferBindInfo( - uniforms_.buffer(), unsqueezed_strides_offset_, size_per_ubo); + uniforms_.buffer(), strides_uniform_offset, nbytes_per_ubo_); } const vkapi::BufferBindInfo vTensor::logical_limits_ubo() { - const size_t size_per_ubo = - storage_->context_->adapter_ptr()->min_ubo_alignment(); - const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_->context_, max_ubo_size, true); + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); } if (logical_limits_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + size_per_ubo) <= max_ubo_size, + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, "Uniform data allocation has exceeded Tensor uniform buffer size"); logical_limits_uniform_offset_ = uniforms_size_; - uniforms_size_ += size_per_ubo; + uniforms_size_ += nbytes_per_ubo_; uniforms_.update(logical_limits(), logical_limits_uniform_offset_); } return vkapi::BufferBindInfo( - uniforms_.buffer(), logical_limits_uniform_offset_, size_per_ubo); + uniforms_.buffer(), logical_limits_uniform_offset_, nbytes_per_ubo_); } const vkapi::BufferBindInfo vTensor::numel_ubo() { - const size_t size_per_ubo = - storage_->context_->adapter_ptr()->min_ubo_alignment(); - const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { - uniforms_ = 
ParamsBuffer(storage_->context_, max_ubo_size, true); + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); } if (numel_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + size_per_ubo) <= max_ubo_size, + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, "Uniform data allocation has exceeded Tensor uniform buffer size"); numel_uniform_offset_ = uniforms_size_; - uniforms_size_ += size_per_ubo; + uniforms_size_ += nbytes_per_ubo_; uniforms_.update(numel(), numel_uniform_offset_); } return vkapi::BufferBindInfo( - uniforms_.buffer(), numel_uniform_offset_, size_per_ubo); -} - -size_t vTensor::staging_buffer_numel() const { - const bool is_int8 = dtype_ == vkapi::kChar; - const bool int8_supported = - storage_->context_->adapter_ptr()->has_full_int8_buffers_support(); - if (is_int8 && !int8_supported) { - return utils::align_up_4(numel()); - } - if (storage_type() == utils::kBuffer) { - return numel(); - } - return padded_numel_; + uniforms_.buffer(), numel_uniform_offset_, nbytes_per_ubo_); } VkMemoryRequirements vTensor::get_memory_requirements() const { @@ -758,33 +902,36 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) { } void vTensor::update_metadata() { + numel_ = utils::multiply_integers(sizes_); strides_ = calculate_strides(sizes_, dim_order_); - uniform_data_->numel = utils::multiply_integers(sizes_); - - padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_); - unsqueezed_strides_ = unsqueeze_strides(strides_, numel()); - padded_numel_ = utils::multiply_integers(padded_sizes_); // Update uniform data if it has been modified + uniform_data_->numel = numel_; uniform_data_->sizes_v = utils::make_whcn_ivec4(sizes_); - uniform_data_->strides_v = utils::make_whcn_ivec4(unsqueezed_strides_); - - // Calculate the image extents that would have been used to allocate a texture - // withthe current sizes, and use that to set the logical limits. - set_logical_limits( - calculate_image_extents(padded_sizes_, axis_map_, packed_dim_)); + uniform_data_->whcn_dim_order_v = + utils::make_ivec4(create_whcn_dim_order(dim_order_)); + uniform_data_->strides_v = + utils::make_whcn_ivec4(unsqueeze_strides(strides_, numel_)); + uniform_data_->numel = utils::safe_downcast(numel_); + uniform_data_->logical_limits.limits = + calculate_logical_limits(sizes_, axis_map_, packed_dim_); if (sizes_uniform_offset_ != kUniformOffsetUnset) { uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_); } - if (unsqueezed_strides_offset_ != kUniformOffsetUnset) { - uniforms_.update(uniform_data_->strides_v, unsqueezed_strides_offset_); + if (dim_order_uniform_offset_ != kUniformOffsetUnset) { + uniforms_.update( + uniform_data_->whcn_dim_order_v, dim_order_uniform_offset_); + } + if (strides_uniform_offset != kUniformOffsetUnset) { + uniforms_.update(uniform_data_->strides_v, strides_uniform_offset); } if (numel_uniform_offset_ != kUniformOffsetUnset) { - uniforms_.update(numel(), numel_uniform_offset_); + uniforms_.update(numel_, numel_uniform_offset_); } if (logical_limits_uniform_offset_ != kUniformOffsetUnset) { - uniforms_.update(logical_limits(), logical_limits_uniform_offset_); + uniforms_.update( + uniform_data_->logical_limits.limits, logical_limits_uniform_offset_); } } @@ -792,8 +939,8 @@ void vTensor::check_sizes(const std::vector& sizes) const { if (storage_type() != utils::kBuffer) { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. 
- utils::uvec3 virtual_extents = - calculate_image_extents(padded_sizes_, axis_map_, packed_dim_); + utils::uvec3 virtual_extents = calculate_image_extents( + calculate_padded_sizes(sizes_, packed_dim_), axis_map_, packed_dim_); bool valid_resize = virtual_extents[0] <= storage_->image_extents_[0]; valid_resize = @@ -828,6 +975,11 @@ void vTensor::virtual_reconfigure( check_sizes(new_sizes); sizes_ = new_sizes; dim_order_ = new_dim_order; + + // Update the hashed layout because dim order is updated + hashed_layout_ = + create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); + update_metadata(); } @@ -837,6 +989,7 @@ void vTensor::virtual_clone(const vTensor& other) { dim_order_ = other.dim_order_; axis_map_ = other.axis_map_; packed_dim_ = other.packed_dim_; + hashed_layout_ = other.hashed_layout_; *uniform_data_ = *other.get_uniform_data(); } @@ -895,6 +1048,11 @@ void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) { axis_map_.at(3) = dim0_whcn; } } + + // Update the hashed layout because dim order / axis mpa is updated + hashed_layout_ = + create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); + update_metadata(); } diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 850dc2d7fab..78a24d87e77 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -81,6 +81,18 @@ struct LastAccess { : stage{stage_flags}, access{access_flags} {} }; +/* + * Calculate the number of elements that a GPU buffer would require to store the + * contents of a tensor. This will depend on the storage type and dtype of the + * tensor, as well as the features available on the device. + */ +int64_t calculate_gpu_buffer_numel( + Context* const context, + const std::vector& sizes, + const utils::uvec3 image_extents, + const utils::StorageType storage_type, + const vkapi::ScalarType dtype); + class vTensorStorage final { public: // Do not allow empty vTensorStorage construction @@ -91,7 +103,7 @@ class vTensorStorage final { const utils::StorageType storage_type, const std::vector& axis_map, const int32_t packed_dim, - const std::vector& padded_sizes, + const std::vector& sizes, const vkapi::ScalarType dtype, const bool allocate_memory = true); @@ -140,6 +152,10 @@ class vTensorStorage final { void verify() const; public: + inline size_t buffer_len() const { + return utils::safe_downcast(buffer_length_); + } + inline VkFormat texture_format() { return image_.format(); } @@ -207,8 +223,11 @@ class vTensor final { vTensor(vTensor&& other) = default; vTensor& operator=(vTensor&& other) = default; + ~vTensor() = default; + enum class Attribute : uint8_t { SIZES, + WHCN_DIM_ORDER, STRIDES, LOGICAL_LIMITS, NUMEL, @@ -216,6 +235,7 @@ class vTensor final { class UniformData { utils::ivec4 sizes_v; + utils::ivec4 whcn_dim_order_v; utils::ivec4 strides_v; // See the comments documenting logical_limits() for more context. 
TextureLimits logical_limits; @@ -227,10 +247,12 @@ class vTensor final { UniformData( const std::vector& sizes, + const std::vector& whcn_dim_order, const std::vector& strides, const TextureLimits& logical_limits, const size_t numel_ll) : sizes_v(utils::make_whcn_ivec4(sizes)), + whcn_dim_order_v(utils::make_ivec4(whcn_dim_order)), strides_v(utils::make_whcn_ivec4(strides)), logical_limits(logical_limits), numel(utils::safe_downcast(numel_ll)) {} @@ -293,21 +315,17 @@ class vTensor final { // strides of the tensor in NCHW dimension order std::vector strides_; - /* - * The below metadata members are derived from the above, and are typically - * to i.e. pass tensor metadata to compute shaders. - */ + // number of elements based on the canonical sizes + size_t numel_; + + // For texture backed tensors, this int32 contains the axis map data packed + // into a single int32. For buffer backed tensors, this int32 contains the + // wchn dim order data packed into a single int32. + int32_t hashed_layout_; - // padded sizes of the tensor in NCHW dimension order. See the - // calculate_padded_sizes() function for more context. Note that padded sizes - // are only used for texture storage, and not for buffer storage. - std::vector padded_sizes_; - // Contains the strides of the tensor, with the dimensionality padded to the - // nearest multiple of 4. Unsqueezed dims will have a stride of int32_t max. - std::vector unsqueezed_strides_; - // Contains the number of elements in the tensor according to the padded - // sizes. - size_t padded_numel_; + // Pre-compute these quantities to avoid frequent re-computation + size_t nbytes_per_ubo_; + size_t max_ubo_nbytes_; /* * Utility GPU buffer that can be passed to shaders in order to convey tensor @@ -320,15 +338,13 @@ class vTensor final { * context about the data contained in each buffer. */ ParamsBuffer uniforms_; - uint32_t uniforms_size_; - uint32_t sizes_uniform_offset_; - uint32_t unsqueezed_strides_offset_; - uint32_t numel_uniform_offset_; - uint32_t logical_limits_uniform_offset_; - // Maximum number of metadata fields that can be stored in the metadata UBO. - // This is used to calculate the size of the UBO that should be allocated. - constexpr static size_t kMaxMetadataFieldCount = 4; + uint32_t uniforms_size_ = 0u; + uint32_t sizes_uniform_offset_ = kUniformOffsetUnset; + uint32_t dim_order_uniform_offset_ = kUniformOffsetUnset; + uint32_t strides_uniform_offset = kUniformOffsetUnset; + uint32_t numel_uniform_offset_ = kUniformOffsetUnset; + uint32_t logical_limits_uniform_offset_ = kUniformOffsetUnset; // Initial value of uniform buffer offsets. 1 is selected as it is essentially // impossible for a ubo to have an offset of 1. @@ -381,9 +397,6 @@ class vTensor final { return storage_->storage_type_ == utils::kBuffer; } - private: - void set_logical_limits(const utils::uvec3& image_extents); - public: /* * The logical limits of the tensor are derived from the image extents of the @@ -451,21 +464,37 @@ class vTensor final { return dim_order_; } + inline const std::vector& strides() const { + return strides_; + } + + inline size_t numel() const { + return numel_; + } + + inline size_t nbytes() const { + return element_size(dtype()) * numel(); + } + inline const std::vector& axis_map() const { return axis_map_; } /* - * Returns a single int32_t that contains the values of the axis map and the - * packed dimension packed into a single int32_t, such that it can be used as - * a specialization constant in a compute shader. 
This allows for the SPIR-V - * to bytecode compilation to perform compile-time unfolding on the axis map. - * Each element of the axis map and the value of the packed dimension take up - * 4 bits in the packed int32_t. + * For texture backed tensors, this function return a int32_t that contains + * the axis map + packed dimension. Each element of the axis map occupies 4 + * bits of the int32. + * + * For buffer backed tensors, the int32_t contains the WHCN dim order, where + * each element of the dim order array occupies 4 bits of the int32. + * + * This int32 is typically consumed as a specialization constant in compute + * shaders where it is subsequently unpacked. The layout data of a vTensor + * instance is typically static once created, which is why this method is + * appropriate. */ inline int32_t hashed_layout() const { - return axis_map_.at(0) + (axis_map_.at(1) << 4) + (axis_map_.at(2) << 8) + - (axis_map_.at(3) << 12) + (packed_dim_ << 16); + return hashed_layout_; } /* @@ -478,57 +507,48 @@ class vTensor final { return axis_map_.at(0) == 0 && axis_map_.at(1) == 1 && axis_map_.at(2) == 2; } - inline const std::vector& strides() const { - return strides_; - } + /* + * Return true if a buffer backed tensor's dim order matches that of a + * contiguous tensor, i.e. the dim order will be {0, 1, 2, ... }. + * Returns false for texture backed tensors. + */ + bool is_contiguous() const; - inline const std::vector& unsqueezed_strides() const { - return unsqueezed_strides_; + private: + inline size_t nbytes_per_ubo() const { + return storage_->context_->adapter_ptr()->min_ubo_alignment(); } + size_t get_max_ubo_nbytes(const size_t nbytes_per_ubo) const; + + public: /* - * Returns a GPU buffer containing the sizes of the tensor in WHCN order. - * Note that dimensions that are not present in the tensor's sizes are set to - * a size of 1. + * The functions below return the buffer binding info for a UBO that contains + * some metadata of the tensor, which can be used to pass in tensor metadata + * to a compute shader. The other method of passing in tensor metadata is via + * push constants. The trade-off between each is that push constants may be + * slightly more performant and memory efficient; however, to update the + * values in a push constant due to i.e. a tensor resize between inferences, + * the command buffer must be re-encoded. On the other hand, UBOs can update + * their data by writing to their mapped memory without requiring a command + * buffer re-encode. */ + const vkapi::BufferBindInfo sizes_ubo(); - /* - * Returns a GPU buffer containing the strides of the tensor in WHCN order. - * Note that the strides are extended to a dimensionality that is a multiple - * of 4, thus dimensions that are not present in the tensor's sizes are set to - * have a stride equal to the stride of the "slowest moving" dimension. - */ + const vkapi::BufferBindInfo dim_order_ubo(); + const vkapi::BufferBindInfo strides_ubo(); - /* - * Returns a GPU buffer containing the logical limits of the tensor. See the - * comments for logical_limits() for more context. - */ const vkapi::BufferBindInfo logical_limits_ubo(); - /* - * Returns the number of elements in the buffer used to store the tensor. 
- */ const vkapi::BufferBindInfo numel_ubo(); - inline size_t numel() const { - return uniform_data_->numel; - } - - inline size_t nbytes() const { - return element_size(dtype()) * numel(); - } - - /* - * Returns numel but based on padded_sizes_ instead of sizes_ - */ - inline size_t padded_numel() const { - return padded_numel_; + public: + inline size_t staging_buffer_numel() const { + return storage_->buffer_len(); } - size_t staging_buffer_numel() const; - inline size_t staging_buffer_nbytes() const { return element_size(dtype()) * staging_buffer_numel(); } @@ -608,6 +628,8 @@ class vTensor final { }; static constexpr vTensor::Attribute kTensorSizes = vTensor::Attribute::SIZES; +static constexpr vTensor::Attribute kTensorDimOrder = + vTensor::Attribute::WHCN_DIM_ORDER; static constexpr vTensor::Attribute kTensorStrides = vTensor::Attribute::STRIDES; static constexpr vTensor::Attribute kTensorLogicalLimits = diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 31514989dfc..21d80d5843f 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -346,6 +346,10 @@ class ComputeGraph final { return values_.at(idx).toTensor().strides_ubo(); } + inline vkapi::BufferBindInfo dim_order_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().dim_order_ubo(); + } + inline vkapi::BufferBindInfo numel_ubo(const ValueRef idx) { return values_.at(idx).toTensor().numel_ubo(); } @@ -354,6 +358,10 @@ class ComputeGraph final { return values_.at(idx).toTensor().has_standard_axis_map(); } + inline bool is_contiguous(const ValueRef idx) const { + return values_.at(idx).toTensor().is_contiguous(); + } + inline vkapi::BufferBindInfo logical_limits_ubo(const ValueRef idx) { return values_.at(idx).toTensor().logical_limits_ubo(); } @@ -363,6 +371,12 @@ class ComputeGraph final { values_.at(idx).toConstTensor().get_uniform_data(), api::kTensorSizes); } + inline PushConstantDataInfo dim_order_pc_of(const ValueRef idx) const { + return PushConstantDataInfo( + values_.at(idx).toConstTensor().get_uniform_data(), + api::kTensorDimOrder); + } + inline PushConstantDataInfo strides_pc_of(const ValueRef idx) const { return PushConstantDataInfo( values_.at(idx).toConstTensor().get_uniform_data(), diff --git a/backends/vulkan/runtime/vk_api/Descriptor.cpp b/backends/vulkan/runtime/vk_api/Descriptor.cpp index 938666802ef..9e8394ffa9c 100644 --- a/backends/vulkan/runtime/vk_api/Descriptor.cpp +++ b/backends/vulkan/runtime/vk_api/Descriptor.cpp @@ -32,8 +32,8 @@ BufferBindInfo::BufferBindInfo( BufferBindInfo::BufferBindInfo( const VulkanBuffer& buffer_p, - const uint32_t offset_p, - const uint32_t range_p) + const size_t offset_p, + const size_t range_p) : handle(buffer_p.handle()), offset(buffer_p.mem_offset() + offset_p), range(range_p) { diff --git a/backends/vulkan/runtime/vk_api/Descriptor.h b/backends/vulkan/runtime/vk_api/Descriptor.h index 60d66a22619..15ea5e23e33 100644 --- a/backends/vulkan/runtime/vk_api/Descriptor.h +++ b/backends/vulkan/runtime/vk_api/Descriptor.h @@ -36,8 +36,8 @@ struct BufferBindInfo final { BufferBindInfo(const VulkanBuffer& buffer_p, const uint32_t offset_p = 0u); BufferBindInfo( const VulkanBuffer& buffer_p, - const uint32_t offset_p, - const uint32_t range_p); + const size_t offset_p, + const size_t range_p); }; struct ParamsBindList final { diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 
c4ccc860bc2..17f197dfdeb 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -259,14 +259,10 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { /*allocate_memory = */ false); ASSERT_TRUE(new_v_tensor.strides() == ref_strides); - ASSERT_TRUE( - new_v_tensor.unsqueezed_strides() == ref_unsqueezed_strides); // Resize vtensor and check that updated metadata is correct v_tensor_to_resize.virtual_reconfigure(sizes, dim_order); ASSERT_TRUE(v_tensor_to_resize.strides() == ref_strides); - ASSERT_TRUE( - v_tensor_to_resize.unsqueezed_strides() == ref_unsqueezed_strides); } } } @@ -1003,18 +999,14 @@ TEST_F(VulkanComputeAPITest, texture_virtual_resize) { b.virtual_resize(new_sizes); c.virtual_resize(new_sizes); - fill_staging( - staging_buffer_a, float(new_sizes[1] + 1.5f), a.staging_buffer_numel()); - fill_staging( - staging_buffer_b, - float(new_sizes[2] + 55.0f), - b.staging_buffer_numel()); + fill_staging(staging_buffer_a, float(new_sizes[1] + 1.5f), a.numel()); + fill_staging(staging_buffer_b, float(new_sizes[2] + 55.0f), b.numel()); submit_to_gpu(); check_staging_buffer( staging_buffer_c, float(new_sizes[1] + new_sizes[2] + 56.5f), - c.staging_buffer_numel()); + c.numel()); } } @@ -1096,7 +1088,6 @@ TEST_F(VulkanComputeAPITest, test_tensor_creation_from_vulkan_image) { const auto exp_numel = w * h * d * 4; EXPECT_TRUE(tensor.numel() == exp_numel); - EXPECT_TRUE(tensor.padded_numel() == exp_numel); } TEST(VulkanComputeGraphTest, test_values_scalars) {
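
Notes on the new tensor metadata helpers follow, each with a small standalone sketch. The sketches are not part of the patch: they use only the standard library, and every name in them is illustrative. First, the WHCN dim-order transform performed by create_whcn_dim_order(): the order is flipped so the fastest moving dimension comes first, each NCHW index i becomes the WHCN index ndim - 1 - i, and the result is padded to the next multiple of 4 with identity entries.

#include <cstdint>
#include <iostream>
#include <vector>

// Standalone sketch: reverse an NCHW dim_order, convert each index to WHCN,
// and pad the result up to the next multiple of 4 with identity entries.
std::vector<int64_t> whcn_dim_order_sketch(
    const std::vector<int64_t>& dim_order) {
  const size_t ndim = dim_order.size();
  std::vector<int64_t> whcn_order(ndim);
  // Flip the traversal order and convert each NCHW index i to the WHCN index
  // (ndim - 1 - i), so that 0 means width, 1 means height, 2 means channels.
  for (size_t whcn_i = 0, nchw_i = ndim - 1; whcn_i < ndim;
       ++whcn_i, --nchw_i) {
    whcn_order.at(whcn_i) = ndim - 1 - dim_order.at(nchw_i);
  }
  // Unsqueeze to the next multiple of 4; appended dims map to themselves.
  const size_t ndim_up4 = (ndim + 3) & ~size_t(3);
  whcn_order.resize(ndim_up4);
  for (size_t i = ndim; i < ndim_up4; ++i) {
    whcn_order.at(i) = static_cast<int64_t>(i);
  }
  return whcn_order;
}

int main() {
  // {height, width, channels} in NCHW dim-order terms...
  const std::vector<int64_t> nchw_dim_order = {1, 2, 0};
  // ...becomes {channels, width, height} in WHCN terms, padded to 4 entries.
  for (int64_t d : whcn_dim_order_sketch(nchw_dim_order)) {
    std::cout << d << " "; // prints: 2 0 1 3
  }
  std::cout << std::endl;
  return 0;
}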
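
hashed_layout_ packs either the axis map plus packed dim (texture storage) or the WHCN dim order (buffer storage) into the low nibbles of a single int32 so it can be consumed as a specialization constant. Below is a sketch of the packing and of the shift-and-mask unpacking a shader would perform, assuming the default axis map of {0, 1, 2, 2} with width (0) as the packed dim; each value is assumed to fit in 4 bits.

#include <cassert>
#include <cstdint>
#include <vector>

// Pack four small values plus one extra field into a single int32, mirroring
// the nibble layout used for hashed_layout_.
int32_t pack_nibbles(const std::vector<int64_t>& vec, const int32_t extra) {
  return static_cast<int32_t>(
      vec.at(0) | (vec.at(1) << 4) | (vec.at(2) << 8) | (vec.at(3) << 12) |
      (static_cast<int64_t>(extra) << 16));
}

// Unpack one 4-bit field, the way a compute shader would with shifts and masks.
int32_t extract_nibble(const int32_t packed, const int32_t pos) {
  return (packed >> (4 * pos)) & 0xF;
}

int main() {
  // Example: a texture-backed tensor with the default axis map {0, 1, 2, 2}
  // and width (0) as the packed dim.
  const std::vector<int64_t> axis_map = {0, 1, 2, 2};
  const int32_t packed_dim = 0;
  const int32_t hashed = pack_nibbles(axis_map, packed_dim);

  // Round trip: each nibble recovers the original axis map entry, and the
  // fifth nibble recovers the packed dim.
  for (int32_t i = 0; i < 4; ++i) {
    assert(extract_nibble(hashed, i) == axis_map.at(i));
  }
  assert(extract_nibble(hashed, 4) == packed_dim);
  return 0;
}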
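
calculate_gpu_buffer_numel() sizes the backing buffer as described in its comments: texture storage allocates 4 elements per texel, and buffer-backed int8 tensors on devices without full int8 buffer support round numel up to a multiple of 4 because the data is then represented with int32. A standalone sketch that replaces the storage type, dtype, and adapter query with plain booleans:

#include <array>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Standalone sketch of the buffer sizing rules: texture-backed tensors get 4
// elements per texel; buffer-backed int8 tensors on devices without full int8
// buffer support are rounded up to a multiple of 4 elements.
int64_t gpu_buffer_numel_sketch(
    const std::vector<int64_t>& sizes,
    const std::array<uint32_t, 3>& image_extents,
    const bool is_buffer_storage,
    const bool is_int8,
    const bool device_has_int8_buffers) {
  if (!is_buffer_storage) {
    return static_cast<int64_t>(image_extents[0]) * image_extents[1] *
        image_extents[2] * 4;
  }
  const int64_t numel = std::accumulate(
      sizes.begin(), sizes.end(), int64_t(1), std::multiplies<int64_t>());
  if (is_int8 && !device_has_int8_buffers) {
    // Data is stored as int32 on these devices, so align up to a multiple of 4.
    return (numel + 3) & ~int64_t(3);
  }
  return numel;
}

int main() {
  // Texture storage: a {4, 3, 2} image holds 4 * 3 * 2 texels, 4 elements each.
  const int64_t texel_numel =
      gpu_buffer_numel_sketch({1, 5, 7, 11}, {4, 3, 2}, false, false, true);
  // Buffer storage, int8 without device support: 1 * 3 * 7 = 21 -> 24.
  const int64_t int8_numel =
      gpu_buffer_numel_sketch({1, 3, 7}, {0, 0, 0}, true, true, false);
  return (texel_numel == 96 && int8_numel == 24) ? 0 : 1;
}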
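
sizes_ubo(), dim_order_ubo(), strides_ubo(), logical_limits_ubo(), and numel_ubo() all share one lazily created parameter buffer of max_ubo_nbytes_ and assign each metadata field an nbytes_per_ubo_-sized slot the first time it is requested; repeated requests reuse the same offset. A sketch of that sub-allocation pattern with the Vulkan details stripped away:

#include <cassert>
#include <cstdint>
#include <stdexcept>

// Standalone sketch of the lazy sub-allocation used by the *_ubo() accessors:
// a fixed-capacity arena hands out one aligned slot per metadata field on
// first request, and an offset keeps its assigned value on later requests.
class MetadataUboArena {
 public:
  // Sentinel for "offset not yet assigned"; 1 can never be a real offset
  // because slots are aligned to nbytes_per_field.
  static constexpr uint32_t kUnset = 1u;

  MetadataUboArena(uint32_t nbytes_per_field, uint32_t max_field_count)
      : nbytes_per_field_(nbytes_per_field),
        capacity_(nbytes_per_field * max_field_count) {}

  // Returns the byte offset for a field, assigning a new slot on first request.
  uint32_t offset_for(uint32_t& field_offset) {
    if (field_offset == kUnset) {
      if (used_ + nbytes_per_field_ > capacity_) {
        throw std::runtime_error("exceeded tensor uniform buffer size");
      }
      field_offset = used_;
      used_ += nbytes_per_field_;
    }
    return field_offset;
  }

 private:
  uint32_t nbytes_per_field_;
  uint32_t capacity_;
  uint32_t used_ = 0;
};

int main() {
  // e.g. a min UBO alignment of 256 bytes and texture storage (2 fields max).
  MetadataUboArena arena(256, 2);
  uint32_t sizes_offset = MetadataUboArena::kUnset;
  uint32_t limits_offset = MetadataUboArena::kUnset;
  assert(arena.offset_for(sizes_offset) == 0);    // first request allocates
  assert(arena.offset_for(limits_offset) == 256); // next field gets a new slot
  assert(arena.offset_for(sizes_offset) == 0);    // repeated request reuses it
  return 0;
}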
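
Finally, the logical limits are nothing more than the physical image extents permuted through the axis map, so limits[i] is the extent of the physical axis that logical axis i maps to. A one-function sketch, using a transposed axis map as the example:

#include <array>
#include <cstdint>
#include <vector>

// Permute physical image extents by the axis map to obtain the logical limits.
std::array<uint32_t, 3> logical_limits_sketch(
    const std::array<uint32_t, 3>& image_extents,
    const std::vector<int64_t>& axis_map) {
  return {
      image_extents[axis_map.at(0)],
      image_extents[axis_map.at(1)],
      image_extents[axis_map.at(2)],
  };
}

int main() {
  // A texture allocated as {W=8, H=4, D=2} viewed through axis map {1, 0, 2}
  // (width and height swapped) has logical limits {4, 8, 2}.
  const std::array<uint32_t, 3> limits =
      logical_limits_sketch({8, 4, 2}, {1, 0, 2});
  return (limits[0] == 4 && limits[1] == 8 && limits[2] == 2) ? 0 : 1;
}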