From d695359308bf221fe354c7262ccf1dfccabf4fe6 Mon Sep 17 00:00:00 2001
From: Stephen Jia
Date: Thu, 12 Jun 2025 12:34:26 -0700
Subject: [PATCH] [ET-VK] Clean up `vTensor` member variables and expose `dim order` UBO and push constant

Pull Request resolved: https://github.com/pytorch/executorch/pull/11599

## Changes

* Add dim order to the list of tensor metadata that can be ingested by compute shaders
* Do not persistently store derived metadata (e.g. padded sizes, padded numel, unsqueezed strides) as members of `vTensor`; instead store these in `uniform_data_` and use `uniform_data_` as the source of truth

## Motivation

> Add dim order to the list of tensor metadata that can be ingested by compute shaders

Knowing the dim order is necessary to convert a linear buffer index to an N-dimensional tensor index using a tensor's strides. Technically, the dim order can be inferred from the strides by performing an index sort on the strides array; however, to avoid requiring compute shaders to perform this operation frequently, it is more efficient to pass the dim order to the compute shader directly.

Currently, ET-VK compute shaders make strong assumptions about the dim order of buffer backed tensors so as to avoid having to dynamically generate the dim order from the strides array. However, these assumptions are not enforced, so it is more correct to account for the dim order explicitly rather than rely on assumptions. This will be addressed in the next diff.

> Do not persistently store derived metadata (e.g. padded sizes, padded numel, unsqueezed strides) as members of `vTensor`; instead store these in `uniform_data_` and use `uniform_data_` as the source of truth

I realized that the purpose of this "derived metadata" is simply to convert default tensor metadata such as sizes and strides into a form that can be used in a compute shader. There is no need to store the derived metadata persistently, since it is only useful in the final `ivec4` form it takes inside `UniformData`. So, to simplify `vTensor` and reduce the size of the class, I elected to remove these superfluous data members.

## Performance Impact

* Potential memory footprint improvement from reducing the size of `vTensor`.

ghstack-source-id: 290022826
@exported-using-ghexport

Differential Revision: [D76393427](https://our.internmc.facebook.com/intern/diff/D76393427/)
---
 .../vulkan/runtime/api/containers/Tensor.cpp  | 410 ++++++++++++------
 .../vulkan/runtime/api/containers/Tensor.h    | 162 ++++---
 backends/vulkan/runtime/graph/ComputeGraph.h  |  14 +
 backends/vulkan/runtime/vk_api/Descriptor.cpp |   4 +-
 backends/vulkan/runtime/vk_api/Descriptor.h   |   4 +-
 .../vulkan/test/vulkan_compute_api_test.cpp   |  15 +-
 6 files changed, 397 insertions(+), 212 deletions(-)

diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
index a85229b2b86..43ebbfecbc6 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.cpp
+++ b/backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -143,6 +143,43 @@ bool dim_order_is_valid(const std::vector& dim_order) {
   return sum == n * (n + 1) / 2;
 }
 
+/*
+ * Applies the following transformations to a tensor's dim_order vector:
+ * 1. Reverse the order of elements so that the fastest moving dimensions are
+ *    first.
+ * 2. Convert NCHW dimension indices to WHCN indices, so that 0 represents the
+ *    width dimension, 1 represents the height dimension, and 2 represents the
+ *    channels dimension.
+ * 3. 
Unsqueeze the dim_order vector to the next multiple of 4. + + * These transformations make it easier to use the dim order in a compute shader + */ +std::vector create_whcn_dim_order( + const std::vector& dim_order) { + size_t ndim = dim_order.size(); + std::vector whcn_order(ndim); + + // Convert from NCHW to WHCN index, and flip the dim order so that the fastest + // moving dimension is first. + // example: { 1, 2, 0} -> { 2, 0, 1} + // {height, width, channels} -> {channels, width, height} + for (size_t whcn_i = 0, nchw_i = (ndim - 1); whcn_i < ndim; + ++whcn_i, --nchw_i) { + whcn_order.at(whcn_i) = ndim - 1 - dim_order.at(nchw_i); + } + + // Unsqueeze to the next multiple of 4 + size_t ndim_up4 = utils::align_up_4(ndim); + whcn_order.resize(ndim_up4); + + // Append unsqueezed dimensions + for (size_t i = ndim; i < ndim_up4; ++i) { + whcn_order.at(i) = i; + } + + return whcn_order; +} + std::vector unsqueeze_strides( const std::vector& strides, const int64_t numel) { @@ -212,6 +249,97 @@ utils::uvec3 calculate_image_extents( return extents; } +/* + * The physical image extents describe the size of an allocated texture resource + * i.e. how many texels in the width, height and depth axis of the image. + * However, the axis map allows a tensor logical dimension to map to a different + * physical texture axis; in essence, it describes a permutation between the + * logical width, height, channels, etc. dimensions of a tensor and the width, + * height, depth axis of a texture. + * + * The "logical extents" is simply the physical image extents permuted by the + * axis mapping. The logical extents is useful for constructing global work + * group sizes, so that it is easier to convert the global thread ID to a + * tensor index. + */ +utils::uvec3 calculate_logical_limits( + const utils::uvec3& image_extents, + const std::vector& axis_map) { + return { + image_extents[axis_map.at(0)], + image_extents[axis_map.at(1)], + image_extents[axis_map.at(2)], + }; +} + +/* + * Convenience overload of the above function to calculate logical limits + * directly from tensor sizes. + */ +utils::uvec3 calculate_logical_limits( + const std::vector& sizes, + const std::vector& axis_map, + const int32_t packed_dim) { + return calculate_logical_limits( + calculate_image_extents( + calculate_padded_sizes(sizes, packed_dim), axis_map, packed_dim), + axis_map); +} + +int64_t calculate_gpu_buffer_numel( + Context* const context, + const std::vector& sizes, + const utils::uvec3 image_extents, + const utils::StorageType storage_type, + const vkapi::ScalarType dtype) { + // For texture backed tensors, simply multiply the total number of texels by 4 + if (storage_type != utils::kBuffer) { + return image_extents[0] * image_extents[1] * image_extents[2] * 4; + } + const bool is_int8 = dtype == vkapi::kChar; + const bool int8_supported = + context->adapter_ptr()->has_full_int8_buffers_support(); + const size_t numel = utils::multiply_integers(sizes); + // For int8 tensors, if the device does not support int8 buffers, then int32 + // is used instead to represent the buffer data. Therefore the number of + // elements in the buffer is aligned to the next multiple of 4. 
+ if (is_int8 && int8_supported) { + return utils::align_up_4(numel); + } + return numel; +} + +int32_t pack_into_int32(const std::vector& vec, const int32_t extra) { + int32_t packed = static_cast( + vec.at(0) + (vec.at(1) << 4) + (vec.at(2) << 8) + (vec.at(3) << 12) + + (extra << 16)); + return packed; +} + +int32_t create_hashed_layout( + const std::vector& dim_order, + const std::vector& axis_map, + const int32_t packed_dim, + const utils::StorageType storage_type) { + if (storage_type == utils::kBuffer) { + return pack_into_int32(create_whcn_dim_order(dim_order), 0); + } + return pack_into_int32(axis_map, packed_dim); +} + +size_t calculate_max_ubo_nbytes( + const size_t nbytes_per_ubo, + const utils::StorageType storage_type) { + // For texture backed tensors, the metadata fields needed are: + // sizes, logical limits + size_t max_metadata_field_count = 2u; + if (storage_type == utils::kBuffer) { + // sizes, strides, dim order, numel + max_metadata_field_count = 4u; + } + return max_metadata_field_count * nbytes_per_ubo; +} + // // vTensorStorage // @@ -322,14 +450,21 @@ vTensorStorage::vTensorStorage( const utils::StorageType storage_type, const std::vector& axis_map, const int32_t packed_dim, - const std::vector& padded_sizes, + const std::vector& sizes, const vkapi::ScalarType dtype, const bool allocate_memory) : context_(context), storage_type_{storage_type}, - image_extents_( - calculate_image_extents(padded_sizes, axis_map, packed_dim)), - buffer_length_{utils::multiply_integers(padded_sizes)}, + image_extents_(calculate_image_extents( + calculate_padded_sizes(sizes, packed_dim), + axis_map, + packed_dim)), + buffer_length_{calculate_gpu_buffer_numel( + context_, + sizes, + image_extents_, + storage_type, + dtype)}, buffer_offset_{0}, image_(allocate_image( context_, @@ -446,35 +581,45 @@ vTensor::vTensor( dim_order_(calculate_dim_order(sizes_.size(), packed_dim_)), axis_map_(calculate_axis_map(sizes_, axis_map_layout)), strides_(calculate_strides(sizes, dim_order_)), - padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)}, - unsqueezed_strides_{ - unsqueeze_strides(strides_, utils::multiply_integers(sizes_))}, - padded_numel_(utils::multiply_integers(padded_sizes_)), + numel_(utils::multiply_integers(sizes_)), + hashed_layout_(create_hashed_layout( + dim_order_, + axis_map_, + packed_dim_, + storage_type)), + // Related to tensor metadata UBOs + nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, + max_ubo_nbytes_{calculate_max_ubo_nbytes(nbytes_per_ubo_, storage_type)}, uniforms_(), - // Utility Uniform Buffers that can be passed to shaders as arguments - uniforms_size_(0), - sizes_uniform_offset_(kUniformOffsetUnset), - unsqueezed_strides_offset_(kUniformOffsetUnset), - numel_uniform_offset_(kUniformOffsetUnset), - logical_limits_uniform_offset_(kUniformOffsetUnset), // Construct Tensor storage storage_(std::make_shared( context, storage_type, axis_map_, packed_dim_, - padded_sizes_, + sizes, dtype_, allocate_memory)) { + // Derived metadata + std::vector whcn_dim_order(4, 0); + std::vector unsqueezed_strides(4, 0); + // Only calculate derived metadata if needed for the desired storage type. + // Note that logical limits may be used by buffer storage as well in order to + // set global work group sizes for some compute shaders. 
+ if (storage_type == utils::kBuffer) { + whcn_dim_order = create_whcn_dim_order(dim_order_); + unsqueezed_strides = unsqueeze_strides(strides_, numel_); + } + uniform_data_ = std::make_shared(UniformData{ sizes_, - unsqueezed_strides_, - {{0, 0, 0}}, - static_cast(utils::multiply_integers(sizes_))}); + whcn_dim_order, + unsqueezed_strides, + TextureLimits( + calculate_logical_limits(storage_->image_extents_, axis_map_)), + numel_}); VK_CHECK_COND( dim_order_is_valid(dim_order_), "computed dim order is invalid"); - - set_logical_limits(storage_->image_extents_); } // NOLINTNEXTLINE @@ -490,24 +635,23 @@ vTensor::vTensor( dim_order_(), axis_map_(calculate_axis_map(sizes_, axis_map_layout)), strides_(), - padded_sizes_(calculate_padded_sizes(sizes_, packed_dim_)), - unsqueezed_strides_(), - padded_numel_(utils::multiply_integers(padded_sizes_)), + numel_(utils::multiply_integers(sizes_)), + hashed_layout_(create_hashed_layout( + dim_order_, + axis_map_, + packed_dim_, + utils::kTexture3D)), + // Related to tensor metadata UBOs + nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, + max_ubo_nbytes_{ + calculate_max_ubo_nbytes(nbytes_per_ubo_, utils::kTexture3D)}, uniforms_(), - // Utility Uniform Buffers that can be passed to shaders as arguments - uniforms_size_(0), - sizes_uniform_offset_(kUniformOffsetUnset), - unsqueezed_strides_offset_(kUniformOffsetUnset), - numel_uniform_offset_(kUniformOffsetUnset), - logical_limits_uniform_offset_(kUniformOffsetUnset), // Construct Tensor storage storage_(std::make_shared(context, image)) { - uniform_data_ = std::make_shared(UniformData{ - sizes_, - {0, 0, 0, 0}, - {{0, 0, 0}}, - static_cast(utils::multiply_integers(sizes_))}); - set_logical_limits(storage_->image_extents_); + TextureLimits logical_limits( + calculate_logical_limits(storage_->image_extents_, axis_map_)); + uniform_data_ = std::make_shared( + UniformData{sizes_, {0, 0, 0, 0}, {0, 0, 0, 0}, logical_limits, numel_}); } vTensor::vTensor(vTensor& other) @@ -518,18 +662,11 @@ vTensor::vTensor(vTensor& other) dim_order_(other.dim_order_.begin(), other.dim_order_.end()), axis_map_(other.axis_map_.begin(), other.axis_map_.end()), strides_(other.strides_.begin(), other.strides_.end()), - padded_sizes_{other.padded_sizes_.begin(), other.padded_sizes_.end()}, - unsqueezed_strides_{ - other.unsqueezed_strides_.begin(), - other.unsqueezed_strides_.end()}, - padded_numel_(other.padded_numel_), + numel_(other.numel_), + hashed_layout_(other.hashed_layout_), + nbytes_per_ubo_{other.nbytes_per_ubo_}, + max_ubo_nbytes_{other.max_ubo_nbytes_}, uniforms_(), - // Empty initialize Utility Uniform Buffers - uniforms_size_(0), - sizes_uniform_offset_(kUniformOffsetUnset), - unsqueezed_strides_offset_(kUniformOffsetUnset), - numel_uniform_offset_(kUniformOffsetUnset), - logical_limits_uniform_offset_(kUniformOffsetUnset), // Copy Tensor storage storage_(other.storage_) { uniform_data_ = std::make_shared(*other.get_uniform_data()); @@ -546,22 +683,21 @@ vTensor::vTensor( dim_order_(dim_order.begin(), dim_order.end()), axis_map_(calculate_axis_map(sizes_, utils::kDefaultAxisMap)), strides_(calculate_strides(sizes_, dim_order_)), - padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)}, - unsqueezed_strides_{ - unsqueeze_strides(strides_, utils::multiply_integers(sizes_))}, - padded_numel_(utils::multiply_integers(padded_sizes_)), + numel_(other.numel_), + hashed_layout_(create_hashed_layout( + dim_order_, + axis_map_, + packed_dim_, + other.storage_type())), + 
nbytes_per_ubo_{other.nbytes_per_ubo_}, + max_ubo_nbytes_{other.max_ubo_nbytes_}, uniforms_(), - // Empty initialize Utility Uniform Buffers - uniforms_size_(0), - sizes_uniform_offset_(kUniformOffsetUnset), - unsqueezed_strides_offset_(kUniformOffsetUnset), - numel_uniform_offset_(kUniformOffsetUnset), - logical_limits_uniform_offset_(kUniformOffsetUnset), // Copy Tensor storage storage_(other.storage_) { uniform_data_ = std::make_shared(UniformData{ sizes_, - unsqueezed_strides_, + create_whcn_dim_order(dim_order_), + unsqueeze_strides(strides_, numel_), {other.logical_limits()}, static_cast(utils::multiply_integers(sizes_))}); @@ -584,6 +720,7 @@ uint32_t vTensor::UniformData::write_attribute( } switch (attr) { WRITE_ATTRIBUTE_CASE(SIZES, sizes_v); + WRITE_ATTRIBUTE_CASE(WHCN_DIM_ORDER, whcn_dim_order_v); WRITE_ATTRIBUTE_CASE(STRIDES, strides_v); WRITE_ATTRIBUTE_CASE(LOGICAL_LIMITS, logical_limits); WRITE_ATTRIBUTE_CASE(NUMEL, numel); @@ -624,12 +761,6 @@ vkapi::VulkanBuffer& vTensor::buffer( return storage_->buffer_; } -void vTensor::set_logical_limits(const utils::uvec3& image_extents) { - uniform_data_->logical_limits.limits[0] = image_extents[axis_map_.at(0)]; - uniform_data_->logical_limits.limits[1] = image_extents[axis_map_.at(1)]; - uniform_data_->logical_limits.limits[2] = image_extents[axis_map_.at(2)]; -} - utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { switch (packed_dim_) { case WHCN::kWidthDim: @@ -643,95 +774,108 @@ utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { } } +bool vTensor::is_contiguous() const { + if (storage_type() != utils::kBuffer) { + return false; + } + for (size_t i = 0; i < dim_order_.size(); ++i) { + if (dim_order_.at(i) != i) { + return false; + } + } + return true; +} + +size_t vTensor::get_max_ubo_nbytes(const size_t nbytes_per_ubo) const { + // For texture backed tensors, the metadata fields needed are: + // sizes, logical limits + size_t max_metadata_field_count = 2u; + if (storage_type() == utils::kBuffer) { + // sizes, strides, dim order, numel + max_metadata_field_count = 4u; + } + return max_metadata_field_count * nbytes_per_ubo; +} + const vkapi::BufferBindInfo vTensor::sizes_ubo() { - const size_t size_per_ubo = - storage_->context_->adapter_ptr()->min_ubo_alignment(); - const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_->context_, max_ubo_size, true); + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); } if (sizes_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + size_per_ubo) <= max_ubo_size, + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, "Uniform data allocation has exceeded Tensor uniform buffer size"); sizes_uniform_offset_ = uniforms_size_; - uniforms_size_ += size_per_ubo; + uniforms_size_ += nbytes_per_ubo_; uniforms_.update(utils::make_whcn_ivec4(sizes_), sizes_uniform_offset_); } return vkapi::BufferBindInfo( - uniforms_.buffer(), sizes_uniform_offset_, size_per_ubo); + uniforms_.buffer(), sizes_uniform_offset_, nbytes_per_ubo_); } -const vkapi::BufferBindInfo vTensor::strides_ubo() { - const size_t size_per_ubo = - storage_->context_->adapter_ptr()->min_ubo_alignment(); - const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; +const vkapi::BufferBindInfo vTensor::dim_order_ubo() { if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_->context_, max_ubo_size, true); + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); } - if 
(unsqueezed_strides_offset_ == kUniformOffsetUnset) { + if (dim_order_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + size_per_ubo) <= max_ubo_size, + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, "Uniform data allocation has exceeded Tensor uniform buffer size"); - unsqueezed_strides_offset_ = uniforms_size_; - uniforms_size_ += size_per_ubo; + dim_order_uniform_offset_ = uniforms_size_; + uniforms_size_ += nbytes_per_ubo_; uniforms_.update( - utils::make_whcn_ivec4(unsqueezed_strides_), - unsqueezed_strides_offset_); + uniform_data_->whcn_dim_order_v, dim_order_uniform_offset_); + } + return vkapi::BufferBindInfo( + uniforms_.buffer(), dim_order_uniform_offset_, nbytes_per_ubo_); +} + +const vkapi::BufferBindInfo vTensor::strides_ubo() { + if (!uniforms_.buffer()) { + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); + } + if (strides_uniform_offset == kUniformOffsetUnset) { + VK_CHECK_COND( + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, + "Uniform data allocation has exceeded Tensor uniform buffer size"); + strides_uniform_offset = uniforms_size_; + uniforms_size_ += nbytes_per_ubo_; + uniforms_.update(uniform_data_->strides_v, strides_uniform_offset); } return vkapi::BufferBindInfo( - uniforms_.buffer(), unsqueezed_strides_offset_, size_per_ubo); + uniforms_.buffer(), strides_uniform_offset, nbytes_per_ubo_); } const vkapi::BufferBindInfo vTensor::logical_limits_ubo() { - const size_t size_per_ubo = - storage_->context_->adapter_ptr()->min_ubo_alignment(); - const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_->context_, max_ubo_size, true); + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); } if (logical_limits_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + size_per_ubo) <= max_ubo_size, + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, "Uniform data allocation has exceeded Tensor uniform buffer size"); logical_limits_uniform_offset_ = uniforms_size_; - uniforms_size_ += size_per_ubo; + uniforms_size_ += nbytes_per_ubo_; uniforms_.update(logical_limits(), logical_limits_uniform_offset_); } return vkapi::BufferBindInfo( - uniforms_.buffer(), logical_limits_uniform_offset_, size_per_ubo); + uniforms_.buffer(), logical_limits_uniform_offset_, nbytes_per_ubo_); } const vkapi::BufferBindInfo vTensor::numel_ubo() { - const size_t size_per_ubo = - storage_->context_->adapter_ptr()->min_ubo_alignment(); - const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_->context_, max_ubo_size, true); + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); } if (numel_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + size_per_ubo) <= max_ubo_size, + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, "Uniform data allocation has exceeded Tensor uniform buffer size"); numel_uniform_offset_ = uniforms_size_; - uniforms_size_ += size_per_ubo; + uniforms_size_ += nbytes_per_ubo_; uniforms_.update(numel(), numel_uniform_offset_); } return vkapi::BufferBindInfo( - uniforms_.buffer(), numel_uniform_offset_, size_per_ubo); -} - -size_t vTensor::staging_buffer_numel() const { - const bool is_int8 = dtype_ == vkapi::kChar; - const bool int8_supported = - storage_->context_->adapter_ptr()->has_full_int8_buffers_support(); - if (is_int8 && !int8_supported) { - return utils::align_up_4(numel()); - } - 
if (storage_type() == utils::kBuffer) { - return numel(); - } - return padded_numel_; + uniforms_.buffer(), numel_uniform_offset_, nbytes_per_ubo_); } VkMemoryRequirements vTensor::get_memory_requirements() const { @@ -758,33 +902,36 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) { } void vTensor::update_metadata() { + numel_ = utils::multiply_integers(sizes_); strides_ = calculate_strides(sizes_, dim_order_); - uniform_data_->numel = utils::multiply_integers(sizes_); - - padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_); - unsqueezed_strides_ = unsqueeze_strides(strides_, numel()); - padded_numel_ = utils::multiply_integers(padded_sizes_); // Update uniform data if it has been modified + uniform_data_->numel = numel_; uniform_data_->sizes_v = utils::make_whcn_ivec4(sizes_); - uniform_data_->strides_v = utils::make_whcn_ivec4(unsqueezed_strides_); - - // Calculate the image extents that would have been used to allocate a texture - // withthe current sizes, and use that to set the logical limits. - set_logical_limits( - calculate_image_extents(padded_sizes_, axis_map_, packed_dim_)); + uniform_data_->whcn_dim_order_v = + utils::make_ivec4(create_whcn_dim_order(dim_order_)); + uniform_data_->strides_v = + utils::make_whcn_ivec4(unsqueeze_strides(strides_, numel_)); + uniform_data_->numel = utils::safe_downcast(numel_); + uniform_data_->logical_limits.limits = + calculate_logical_limits(sizes_, axis_map_, packed_dim_); if (sizes_uniform_offset_ != kUniformOffsetUnset) { uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_); } - if (unsqueezed_strides_offset_ != kUniformOffsetUnset) { - uniforms_.update(uniform_data_->strides_v, unsqueezed_strides_offset_); + if (dim_order_uniform_offset_ != kUniformOffsetUnset) { + uniforms_.update( + uniform_data_->whcn_dim_order_v, dim_order_uniform_offset_); + } + if (strides_uniform_offset != kUniformOffsetUnset) { + uniforms_.update(uniform_data_->strides_v, strides_uniform_offset); } if (numel_uniform_offset_ != kUniformOffsetUnset) { - uniforms_.update(numel(), numel_uniform_offset_); + uniforms_.update(numel_, numel_uniform_offset_); } if (logical_limits_uniform_offset_ != kUniformOffsetUnset) { - uniforms_.update(logical_limits(), logical_limits_uniform_offset_); + uniforms_.update( + uniform_data_->logical_limits.limits, logical_limits_uniform_offset_); } } @@ -792,8 +939,8 @@ void vTensor::check_sizes(const std::vector& sizes) const { if (storage_type() != utils::kBuffer) { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. 
- utils::uvec3 virtual_extents = - calculate_image_extents(padded_sizes_, axis_map_, packed_dim_); + utils::uvec3 virtual_extents = calculate_image_extents( + calculate_padded_sizes(sizes_, packed_dim_), axis_map_, packed_dim_); bool valid_resize = virtual_extents[0] <= storage_->image_extents_[0]; valid_resize = @@ -828,6 +975,11 @@ void vTensor::virtual_reconfigure( check_sizes(new_sizes); sizes_ = new_sizes; dim_order_ = new_dim_order; + + // Update the hashed layout because dim order is updated + hashed_layout_ = + create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); + update_metadata(); } @@ -837,6 +989,7 @@ void vTensor::virtual_clone(const vTensor& other) { dim_order_ = other.dim_order_; axis_map_ = other.axis_map_; packed_dim_ = other.packed_dim_; + hashed_layout_ = other.hashed_layout_; *uniform_data_ = *other.get_uniform_data(); } @@ -895,6 +1048,11 @@ void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) { axis_map_.at(3) = dim0_whcn; } } + + // Update the hashed layout because dim order / axis mpa is updated + hashed_layout_ = + create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); + update_metadata(); } diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 850dc2d7fab..78a24d87e77 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -81,6 +81,18 @@ struct LastAccess { : stage{stage_flags}, access{access_flags} {} }; +/* + * Calculate the number of elements that a GPU buffer would require to store the + * contents of a tensor. This will depend on the storage type and dtype of the + * tensor, as well as the features available on the device. + */ +int64_t calculate_gpu_buffer_numel( + Context* const context, + const std::vector& sizes, + const utils::uvec3 image_extents, + const utils::StorageType storage_type, + const vkapi::ScalarType dtype); + class vTensorStorage final { public: // Do not allow empty vTensorStorage construction @@ -91,7 +103,7 @@ class vTensorStorage final { const utils::StorageType storage_type, const std::vector& axis_map, const int32_t packed_dim, - const std::vector& padded_sizes, + const std::vector& sizes, const vkapi::ScalarType dtype, const bool allocate_memory = true); @@ -140,6 +152,10 @@ class vTensorStorage final { void verify() const; public: + inline size_t buffer_len() const { + return utils::safe_downcast(buffer_length_); + } + inline VkFormat texture_format() { return image_.format(); } @@ -207,8 +223,11 @@ class vTensor final { vTensor(vTensor&& other) = default; vTensor& operator=(vTensor&& other) = default; + ~vTensor() = default; + enum class Attribute : uint8_t { SIZES, + WHCN_DIM_ORDER, STRIDES, LOGICAL_LIMITS, NUMEL, @@ -216,6 +235,7 @@ class vTensor final { class UniformData { utils::ivec4 sizes_v; + utils::ivec4 whcn_dim_order_v; utils::ivec4 strides_v; // See the comments documenting logical_limits() for more context. 
TextureLimits logical_limits; @@ -227,10 +247,12 @@ class vTensor final { UniformData( const std::vector& sizes, + const std::vector& whcn_dim_order, const std::vector& strides, const TextureLimits& logical_limits, const size_t numel_ll) : sizes_v(utils::make_whcn_ivec4(sizes)), + whcn_dim_order_v(utils::make_ivec4(whcn_dim_order)), strides_v(utils::make_whcn_ivec4(strides)), logical_limits(logical_limits), numel(utils::safe_downcast(numel_ll)) {} @@ -293,21 +315,17 @@ class vTensor final { // strides of the tensor in NCHW dimension order std::vector strides_; - /* - * The below metadata members are derived from the above, and are typically - * to i.e. pass tensor metadata to compute shaders. - */ + // number of elements based on the canonical sizes + size_t numel_; + + // For texture backed tensors, this int32 contains the axis map data packed + // into a single int32. For buffer backed tensors, this int32 contains the + // wchn dim order data packed into a single int32. + int32_t hashed_layout_; - // padded sizes of the tensor in NCHW dimension order. See the - // calculate_padded_sizes() function for more context. Note that padded sizes - // are only used for texture storage, and not for buffer storage. - std::vector padded_sizes_; - // Contains the strides of the tensor, with the dimensionality padded to the - // nearest multiple of 4. Unsqueezed dims will have a stride of int32_t max. - std::vector unsqueezed_strides_; - // Contains the number of elements in the tensor according to the padded - // sizes. - size_t padded_numel_; + // Pre-compute these quantities to avoid frequent re-computation + size_t nbytes_per_ubo_; + size_t max_ubo_nbytes_; /* * Utility GPU buffer that can be passed to shaders in order to convey tensor @@ -320,15 +338,13 @@ class vTensor final { * context about the data contained in each buffer. */ ParamsBuffer uniforms_; - uint32_t uniforms_size_; - uint32_t sizes_uniform_offset_; - uint32_t unsqueezed_strides_offset_; - uint32_t numel_uniform_offset_; - uint32_t logical_limits_uniform_offset_; - // Maximum number of metadata fields that can be stored in the metadata UBO. - // This is used to calculate the size of the UBO that should be allocated. - constexpr static size_t kMaxMetadataFieldCount = 4; + uint32_t uniforms_size_ = 0u; + uint32_t sizes_uniform_offset_ = kUniformOffsetUnset; + uint32_t dim_order_uniform_offset_ = kUniformOffsetUnset; + uint32_t strides_uniform_offset = kUniformOffsetUnset; + uint32_t numel_uniform_offset_ = kUniformOffsetUnset; + uint32_t logical_limits_uniform_offset_ = kUniformOffsetUnset; // Initial value of uniform buffer offsets. 1 is selected as it is essentially // impossible for a ubo to have an offset of 1. @@ -381,9 +397,6 @@ class vTensor final { return storage_->storage_type_ == utils::kBuffer; } - private: - void set_logical_limits(const utils::uvec3& image_extents); - public: /* * The logical limits of the tensor are derived from the image extents of the @@ -451,21 +464,37 @@ class vTensor final { return dim_order_; } + inline const std::vector& strides() const { + return strides_; + } + + inline size_t numel() const { + return numel_; + } + + inline size_t nbytes() const { + return element_size(dtype()) * numel(); + } + inline const std::vector& axis_map() const { return axis_map_; } /* - * Returns a single int32_t that contains the values of the axis map and the - * packed dimension packed into a single int32_t, such that it can be used as - * a specialization constant in a compute shader. 
This allows for the SPIR-V - * to bytecode compilation to perform compile-time unfolding on the axis map. - * Each element of the axis map and the value of the packed dimension take up - * 4 bits in the packed int32_t. + * For texture backed tensors, this function return a int32_t that contains + * the axis map + packed dimension. Each element of the axis map occupies 4 + * bits of the int32. + * + * For buffer backed tensors, the int32_t contains the WHCN dim order, where + * each element of the dim order array occupies 4 bits of the int32. + * + * This int32 is typically consumed as a specialization constant in compute + * shaders where it is subsequently unpacked. The layout data of a vTensor + * instance is typically static once created, which is why this method is + * appropriate. */ inline int32_t hashed_layout() const { - return axis_map_.at(0) + (axis_map_.at(1) << 4) + (axis_map_.at(2) << 8) + - (axis_map_.at(3) << 12) + (packed_dim_ << 16); + return hashed_layout_; } /* @@ -478,57 +507,48 @@ class vTensor final { return axis_map_.at(0) == 0 && axis_map_.at(1) == 1 && axis_map_.at(2) == 2; } - inline const std::vector& strides() const { - return strides_; - } + /* + * Return true if a buffer backed tensor's dim order matches that of a + * contiguous tensor, i.e. the dim order will be {0, 1, 2, ... }. + * Returns false for texture backed tensors. + */ + bool is_contiguous() const; - inline const std::vector& unsqueezed_strides() const { - return unsqueezed_strides_; + private: + inline size_t nbytes_per_ubo() const { + return storage_->context_->adapter_ptr()->min_ubo_alignment(); } + size_t get_max_ubo_nbytes(const size_t nbytes_per_ubo) const; + + public: /* - * Returns a GPU buffer containing the sizes of the tensor in WHCN order. - * Note that dimensions that are not present in the tensor's sizes are set to - * a size of 1. + * The functions below return the buffer binding info for a UBO that contains + * some metadata of the tensor, which can be used to pass in tensor metadata + * to a compute shader. The other method of passing in tensor metadata is via + * push constants. The trade-off between each is that push constants may be + * slightly more performant and memory efficient; however, to update the + * values in a push constant due to i.e. a tensor resize between inferences, + * the command buffer must be re-encoded. On the other hand, UBOs can update + * their data by writing to their mapped memory without requiring a command + * buffer re-encode. */ + const vkapi::BufferBindInfo sizes_ubo(); - /* - * Returns a GPU buffer containing the strides of the tensor in WHCN order. - * Note that the strides are extended to a dimensionality that is a multiple - * of 4, thus dimensions that are not present in the tensor's sizes are set to - * have a stride equal to the stride of the "slowest moving" dimension. - */ + const vkapi::BufferBindInfo dim_order_ubo(); + const vkapi::BufferBindInfo strides_ubo(); - /* - * Returns a GPU buffer containing the logical limits of the tensor. See the - * comments for logical_limits() for more context. - */ const vkapi::BufferBindInfo logical_limits_ubo(); - /* - * Returns the number of elements in the buffer used to store the tensor. 
- */ const vkapi::BufferBindInfo numel_ubo(); - inline size_t numel() const { - return uniform_data_->numel; - } - - inline size_t nbytes() const { - return element_size(dtype()) * numel(); - } - - /* - * Returns numel but based on padded_sizes_ instead of sizes_ - */ - inline size_t padded_numel() const { - return padded_numel_; + public: + inline size_t staging_buffer_numel() const { + return storage_->buffer_len(); } - size_t staging_buffer_numel() const; - inline size_t staging_buffer_nbytes() const { return element_size(dtype()) * staging_buffer_numel(); } @@ -608,6 +628,8 @@ class vTensor final { }; static constexpr vTensor::Attribute kTensorSizes = vTensor::Attribute::SIZES; +static constexpr vTensor::Attribute kTensorDimOrder = + vTensor::Attribute::WHCN_DIM_ORDER; static constexpr vTensor::Attribute kTensorStrides = vTensor::Attribute::STRIDES; static constexpr vTensor::Attribute kTensorLogicalLimits = diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 31514989dfc..21d80d5843f 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -346,6 +346,10 @@ class ComputeGraph final { return values_.at(idx).toTensor().strides_ubo(); } + inline vkapi::BufferBindInfo dim_order_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().dim_order_ubo(); + } + inline vkapi::BufferBindInfo numel_ubo(const ValueRef idx) { return values_.at(idx).toTensor().numel_ubo(); } @@ -354,6 +358,10 @@ class ComputeGraph final { return values_.at(idx).toTensor().has_standard_axis_map(); } + inline bool is_contiguous(const ValueRef idx) const { + return values_.at(idx).toTensor().is_contiguous(); + } + inline vkapi::BufferBindInfo logical_limits_ubo(const ValueRef idx) { return values_.at(idx).toTensor().logical_limits_ubo(); } @@ -363,6 +371,12 @@ class ComputeGraph final { values_.at(idx).toConstTensor().get_uniform_data(), api::kTensorSizes); } + inline PushConstantDataInfo dim_order_pc_of(const ValueRef idx) const { + return PushConstantDataInfo( + values_.at(idx).toConstTensor().get_uniform_data(), + api::kTensorDimOrder); + } + inline PushConstantDataInfo strides_pc_of(const ValueRef idx) const { return PushConstantDataInfo( values_.at(idx).toConstTensor().get_uniform_data(), diff --git a/backends/vulkan/runtime/vk_api/Descriptor.cpp b/backends/vulkan/runtime/vk_api/Descriptor.cpp index 938666802ef..9e8394ffa9c 100644 --- a/backends/vulkan/runtime/vk_api/Descriptor.cpp +++ b/backends/vulkan/runtime/vk_api/Descriptor.cpp @@ -32,8 +32,8 @@ BufferBindInfo::BufferBindInfo( BufferBindInfo::BufferBindInfo( const VulkanBuffer& buffer_p, - const uint32_t offset_p, - const uint32_t range_p) + const size_t offset_p, + const size_t range_p) : handle(buffer_p.handle()), offset(buffer_p.mem_offset() + offset_p), range(range_p) { diff --git a/backends/vulkan/runtime/vk_api/Descriptor.h b/backends/vulkan/runtime/vk_api/Descriptor.h index 60d66a22619..15ea5e23e33 100644 --- a/backends/vulkan/runtime/vk_api/Descriptor.h +++ b/backends/vulkan/runtime/vk_api/Descriptor.h @@ -36,8 +36,8 @@ struct BufferBindInfo final { BufferBindInfo(const VulkanBuffer& buffer_p, const uint32_t offset_p = 0u); BufferBindInfo( const VulkanBuffer& buffer_p, - const uint32_t offset_p, - const uint32_t range_p); + const size_t offset_p, + const size_t range_p); }; struct ParamsBindList final { diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 
c4ccc860bc2..17f197dfdeb 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -259,14 +259,10 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { /*allocate_memory = */ false); ASSERT_TRUE(new_v_tensor.strides() == ref_strides); - ASSERT_TRUE( - new_v_tensor.unsqueezed_strides() == ref_unsqueezed_strides); // Resize vtensor and check that updated metadata is correct v_tensor_to_resize.virtual_reconfigure(sizes, dim_order); ASSERT_TRUE(v_tensor_to_resize.strides() == ref_strides); - ASSERT_TRUE( - v_tensor_to_resize.unsqueezed_strides() == ref_unsqueezed_strides); } } } @@ -1003,18 +999,14 @@ TEST_F(VulkanComputeAPITest, texture_virtual_resize) { b.virtual_resize(new_sizes); c.virtual_resize(new_sizes); - fill_staging( - staging_buffer_a, float(new_sizes[1] + 1.5f), a.staging_buffer_numel()); - fill_staging( - staging_buffer_b, - float(new_sizes[2] + 55.0f), - b.staging_buffer_numel()); + fill_staging(staging_buffer_a, float(new_sizes[1] + 1.5f), a.numel()); + fill_staging(staging_buffer_b, float(new_sizes[2] + 55.0f), b.numel()); submit_to_gpu(); check_staging_buffer( staging_buffer_c, float(new_sizes[1] + new_sizes[2] + 56.5f), - c.staging_buffer_numel()); + c.numel()); } } @@ -1096,7 +1088,6 @@ TEST_F(VulkanComputeAPITest, test_tensor_creation_from_vulkan_image) { const auto exp_numel = w * h * d * 4; EXPECT_TRUE(tensor.numel() == exp_numel); - EXPECT_TRUE(tensor.padded_numel() == exp_numel); } TEST(VulkanComputeGraphTest, test_values_scalars) {
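
## Example: consuming the dim order metadata

For reference, the sketch below walks through the metadata transformations introduced by this diff: converting an NCHW dim order to its padded WHCN form (mirroring `create_whcn_dim_order()`), packing it 4 bits per entry into a single int32 the way `pack_into_int32()` builds `hashed_layout_`, and then using the unpacked dim order together with WHCN strides to turn a linear buffer index into a tensor index, which is the shader-side use case motivating the change. This is a minimal host-side sketch only; the helper names, the `std::array` stand-ins for `ivec4`, and the example sizes/strides are illustrative and are not part of the patch.

```cpp
#include <array>
#include <cstdint>
#include <cstdio>
#include <vector>

// NCHW dim order (slowest -> fastest) to WHCN dim order (fastest -> slowest),
// padded to a multiple of 4; padded entries simply index themselves.
std::vector<int64_t> to_whcn_dim_order(const std::vector<int64_t>& dim_order) {
  const size_t ndim = dim_order.size();
  std::vector<int64_t> whcn(ndim);
  for (size_t whcn_i = 0, nchw_i = ndim - 1; whcn_i < ndim; ++whcn_i, --nchw_i) {
    whcn[whcn_i] = static_cast<int64_t>(ndim) - 1 - dim_order[nchw_i];
  }
  const size_t ndim_up4 = (ndim + 3) & ~static_cast<size_t>(3);
  for (size_t i = ndim; i < ndim_up4; ++i) {
    whcn.push_back(static_cast<int64_t>(i));
  }
  return whcn;
}

// Pack four 4-bit fields plus an extra value into one int32, in the same
// spirit as the hashed layout construction.
int32_t pack_layout(const std::vector<int64_t>& vec, int32_t extra) {
  return static_cast<int32_t>(
      vec[0] + (vec[1] << 4) + (vec[2] << 8) + (vec[3] << 12) +
      (static_cast<int64_t>(extra) << 16));
}

// Recover the i-th 4-bit field; this is the unpacking a compute shader would
// perform on the hashed layout specialization constant.
int32_t unpack_field(int32_t packed, int32_t i) {
  return (packed >> (4 * i)) & 0xF;
}

// Convert a linear buffer index into a WHCN tensor index using WHCN strides
// and the WHCN dim order (fastest moving dimension first). Padded dims resolve
// to 0 as long as their stride exceeds any valid buffer index.
std::array<int32_t, 4> linear_to_tensor_idx(
    int32_t bufi,
    const std::array<int32_t, 4>& strides,
    const std::array<int32_t, 4>& dim_order) {
  std::array<int32_t, 4> tidx = {0, 0, 0, 0};
  for (int i = 3; i >= 0; --i) { // slowest moving dimension first
    const int32_t dim = dim_order[i];
    tidx[dim] = bufi / strides[dim];
    bufi %= strides[dim];
  }
  return tidx;
}

int main() {
  // Contiguous 3-dim tensor with sizes (C=2, H=3, W=4): NCHW dim order is
  // {0, 1, 2}, NCHW strides are {12, 4, 1}.
  const std::vector<int64_t> whcn = to_whcn_dim_order({0, 1, 2}); // {0, 1, 2, 3}
  const int32_t hashed = pack_layout(whcn, /*extra=*/0);

  // Shader-side view: unpack the dim order back out of the packed int32.
  std::array<int32_t, 4> dim_order{};
  for (int32_t i = 0; i < 4; ++i) {
    dim_order[i] = unpack_field(hashed, i);
  }

  // WHCN strides, unsqueezed to 4 entries (padded stride >= numel).
  const std::array<int32_t, 4> strides = {1, 4, 12, 24};
  const std::array<int32_t, 4> tidx = linear_to_tensor_idx(17, strides, dim_order);
  // Prints w=1 h=1 c=1 n=0, i.e. element (c=1, h=1, w=1) of the tensor.
  std::printf("w=%d h=%d c=%d n=%d\n", tidx[0], tidx[1], tidx[2], tidx[3]);
  return 0;
}
```

The same index math is what buffer-backed shaders would perform with the new `dim_order_ubo()` / `dim_order_pc_of()` metadata, rather than assuming a particular dim order.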