From d695359308bf221fe354c7262ccf1dfccabf4fe6 Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 12 Jun 2025 12:34:26 -0700 Subject: [PATCH 1/3] [ET-VK] Clean up `vTensor` member variables and expose `dim order` UBO and push constant Pull Request resolved: https://github.com/pytorch/executorch/pull/11599 ## Changes * Add dim order to the list of tensor metadata that can be ingested by compute shaders * Do not persistently store derivative metadata (i.e. padded sizes, padded numel, unsqueezed strides, etc.) as members of vTensor; instead store these in `uniform_data_` and use `uniform_data_` as the source of truth ## Motivation > Add dim order to the list of tensor metadata that can be ingested by compute shaders Knowing the dim order is necessary to convert between a linear buffer index to N-dimensional tensor index using a tensor's strides. Technically, the dim order can be inferred from the strides by performing an index sort on the strides array; however to prevent compute shaders from having to do this operation frequently, it is more efficient to pass in the dim order directly to the compute shader. Currently, ET-VK compute shaders make strong assumptions about the dim order of buffer backed tensors so as to avoid having to dynamically generate the dim order from the strides array. However, these assumptions are not enforced and it is more correct to just account for the dim order rather than make assumptions. This will be addressed in the next diff. > Do not persistently store derivative metadata (i.e. padded sizes, padded numel, unsqueezed strides, etc.) as members of vTensor; instead store these in `uniform_data_` and use `uniform_data_` as the source of truth I realized that the purpose of these "derived metadata" is to simply convert default tensor metadata such sizes, strides, etc. to a form where they can be used in a compute shader. There is no need to store these derived metadata persistently, since they are pretty much only useful in the final `ivec4` form they exist as inside `UniformData`. So to simplify `vTensor` and to reduce the size of the class, I elected to remove these superfluous data members. ## Performance Impact * Potential memory footprint improvement from reducing the size of `vTensor`. ghstack-source-id: 290022826 @exported-using-ghexport Differential Revision: [D76393427](https://our.internmc.facebook.com/intern/diff/D76393427/) --- .../vulkan/runtime/api/containers/Tensor.cpp | 410 ++++++++++++------ .../vulkan/runtime/api/containers/Tensor.h | 162 ++++--- backends/vulkan/runtime/graph/ComputeGraph.h | 14 + backends/vulkan/runtime/vk_api/Descriptor.cpp | 4 +- backends/vulkan/runtime/vk_api/Descriptor.h | 4 +- .../vulkan/test/vulkan_compute_api_test.cpp | 15 +- 6 files changed, 397 insertions(+), 212 deletions(-) diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index a85229b2b86..43ebbfecbc6 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -143,6 +143,43 @@ bool dim_order_is_valid(const std::vector& dim_order) { return sum == n * (n + 1) / 2; } +/* + * Applies the following transformations to a tensor's dim_order vector: + * 1. Reverse the order of elements so that the fastest moving dimensions are + * first. + * 2. Convert NCHW dimension indices to WHCN indices, so that 0 represents the + * width dimension, 1 represents the height dimension, and 2 represents the + * channels dimension. + * 3. 
Unsqueeze the dim_order vector to the next multiple of 4. + + * These transformations make it easier to use the dim order in a compute shader + */ +std::vector create_whcn_dim_order( + const std::vector& dim_order) { + size_t ndim = dim_order.size(); + std::vector whcn_order(ndim); + + // Convert from NCHW to WHCN index, and flip the dim order so that the fastest + // moving dimension is first. + // example: { 1, 2, 0} -> { 2, 0, 1} + // {height, width, channels} -> {channels, width, height} + for (size_t whcn_i = 0, nchw_i = (ndim - 1); whcn_i < ndim; + ++whcn_i, --nchw_i) { + whcn_order.at(whcn_i) = ndim - 1 - dim_order.at(nchw_i); + } + + // Unsqueeze to the next multiple of 4 + size_t ndim_up4 = utils::align_up_4(ndim); + whcn_order.resize(ndim_up4); + + // Append unsqueezed dimensions + for (size_t i = ndim; i < ndim_up4; ++i) { + whcn_order.at(i) = i; + } + + return whcn_order; +} + std::vector unsqueeze_strides( const std::vector& strides, const int64_t numel) { @@ -212,6 +249,97 @@ utils::uvec3 calculate_image_extents( return extents; } +/* + * The physical image extents describe the size of an allocated texture resource + * i.e. how many texels in the width, height and depth axis of the image. + * However, the axis map allows a tensor logical dimension to map to a different + * physical texture axis; in essence, it describes a permutation between the + * logical width, height, channels, etc. dimensions of a tensor and the width, + * height, depth axis of a texture. + * + * The "logical extents" is simply the physical image extents permuted by the + * axis mapping. The logical extents is useful for constructing global work + * group sizes, so that it is easier to convert the global thread ID to a + * tensor index. + */ +utils::uvec3 calculate_logical_limits( + const utils::uvec3& image_extents, + const std::vector& axis_map) { + return { + image_extents[axis_map.at(0)], + image_extents[axis_map.at(1)], + image_extents[axis_map.at(2)], + }; +} + +/* + * Convenience overload of the above function to calculate logical limits + * directly from tensor sizes. + */ +utils::uvec3 calculate_logical_limits( + const std::vector& sizes, + const std::vector& axis_map, + const int32_t packed_dim) { + return calculate_logical_limits( + calculate_image_extents( + calculate_padded_sizes(sizes, packed_dim), axis_map, packed_dim), + axis_map); +} + +int64_t calculate_gpu_buffer_numel( + Context* const context, + const std::vector& sizes, + const utils::uvec3 image_extents, + const utils::StorageType storage_type, + const vkapi::ScalarType dtype) { + // For texture backed tensors, simply multiply the total number of texels by 4 + if (storage_type != utils::kBuffer) { + return image_extents[0] * image_extents[1] * image_extents[2] * 4; + } + const bool is_int8 = dtype == vkapi::kChar; + const bool int8_supported = + context->adapter_ptr()->has_full_int8_buffers_support(); + const size_t numel = utils::multiply_integers(sizes); + // For int8 tensors, if the device does not support int8 buffers, then int32 + // is used instead to represent the buffer data. Therefore the number of + // elements in the buffer is aligned to the next multiple of 4. 
+ if (is_int8 && int8_supported) { + return utils::align_up_4(numel); + } + return numel; +} + +int32_t pack_into_int32(const std::vector& vec, const int32_t extra) { + int32_t packed = static_cast( + vec.at(0) + (vec.at(1) << 4) + (vec.at(2) << 8) + (vec.at(3) << 12) + + (extra << 16)); + return packed; +} + +int32_t create_hashed_layout( + const std::vector& dim_order, + const std::vector& axis_map, + const int32_t packed_dim, + const utils::StorageType storage_type) { + if (storage_type == utils::kBuffer) { + return pack_into_int32(create_whcn_dim_order(dim_order), 0); + } + return pack_into_int32(axis_map, packed_dim); +} + +size_t calculate_max_ubo_nbytes( + const size_t nbytes_per_ubo, + const utils::StorageType storage_type) { + // For texture backed tensors, the metadata fields needed are: + // sizes, logical limits + size_t max_metadata_field_count = 2u; + if (storage_type == utils::kBuffer) { + // sizes, strides, dim order, numel + max_metadata_field_count = 4u; + } + return max_metadata_field_count * nbytes_per_ubo; +} + // // vTensorStorage // @@ -322,14 +450,21 @@ vTensorStorage::vTensorStorage( const utils::StorageType storage_type, const std::vector& axis_map, const int32_t packed_dim, - const std::vector& padded_sizes, + const std::vector& sizes, const vkapi::ScalarType dtype, const bool allocate_memory) : context_(context), storage_type_{storage_type}, - image_extents_( - calculate_image_extents(padded_sizes, axis_map, packed_dim)), - buffer_length_{utils::multiply_integers(padded_sizes)}, + image_extents_(calculate_image_extents( + calculate_padded_sizes(sizes, packed_dim), + axis_map, + packed_dim)), + buffer_length_{calculate_gpu_buffer_numel( + context_, + sizes, + image_extents_, + storage_type, + dtype)}, buffer_offset_{0}, image_(allocate_image( context_, @@ -446,35 +581,45 @@ vTensor::vTensor( dim_order_(calculate_dim_order(sizes_.size(), packed_dim_)), axis_map_(calculate_axis_map(sizes_, axis_map_layout)), strides_(calculate_strides(sizes, dim_order_)), - padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)}, - unsqueezed_strides_{ - unsqueeze_strides(strides_, utils::multiply_integers(sizes_))}, - padded_numel_(utils::multiply_integers(padded_sizes_)), + numel_(utils::multiply_integers(sizes_)), + hashed_layout_(create_hashed_layout( + dim_order_, + axis_map_, + packed_dim_, + storage_type)), + // Related to tensor metadata UBOs + nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, + max_ubo_nbytes_{calculate_max_ubo_nbytes(nbytes_per_ubo_, storage_type)}, uniforms_(), - // Utility Uniform Buffers that can be passed to shaders as arguments - uniforms_size_(0), - sizes_uniform_offset_(kUniformOffsetUnset), - unsqueezed_strides_offset_(kUniformOffsetUnset), - numel_uniform_offset_(kUniformOffsetUnset), - logical_limits_uniform_offset_(kUniformOffsetUnset), // Construct Tensor storage storage_(std::make_shared( context, storage_type, axis_map_, packed_dim_, - padded_sizes_, + sizes, dtype_, allocate_memory)) { + // Derived metadata + std::vector whcn_dim_order(4, 0); + std::vector unsqueezed_strides(4, 0); + // Only calculate derived metadata if needed for the desired storage type. + // Note that logical limits may be used by buffer storage as well in order to + // set global work group sizes for some compute shaders. 
+ if (storage_type == utils::kBuffer) { + whcn_dim_order = create_whcn_dim_order(dim_order_); + unsqueezed_strides = unsqueeze_strides(strides_, numel_); + } + uniform_data_ = std::make_shared(UniformData{ sizes_, - unsqueezed_strides_, - {{0, 0, 0}}, - static_cast(utils::multiply_integers(sizes_))}); + whcn_dim_order, + unsqueezed_strides, + TextureLimits( + calculate_logical_limits(storage_->image_extents_, axis_map_)), + numel_}); VK_CHECK_COND( dim_order_is_valid(dim_order_), "computed dim order is invalid"); - - set_logical_limits(storage_->image_extents_); } // NOLINTNEXTLINE @@ -490,24 +635,23 @@ vTensor::vTensor( dim_order_(), axis_map_(calculate_axis_map(sizes_, axis_map_layout)), strides_(), - padded_sizes_(calculate_padded_sizes(sizes_, packed_dim_)), - unsqueezed_strides_(), - padded_numel_(utils::multiply_integers(padded_sizes_)), + numel_(utils::multiply_integers(sizes_)), + hashed_layout_(create_hashed_layout( + dim_order_, + axis_map_, + packed_dim_, + utils::kTexture3D)), + // Related to tensor metadata UBOs + nbytes_per_ubo_{context->adapter_ptr()->min_ubo_alignment()}, + max_ubo_nbytes_{ + calculate_max_ubo_nbytes(nbytes_per_ubo_, utils::kTexture3D)}, uniforms_(), - // Utility Uniform Buffers that can be passed to shaders as arguments - uniforms_size_(0), - sizes_uniform_offset_(kUniformOffsetUnset), - unsqueezed_strides_offset_(kUniformOffsetUnset), - numel_uniform_offset_(kUniformOffsetUnset), - logical_limits_uniform_offset_(kUniformOffsetUnset), // Construct Tensor storage storage_(std::make_shared(context, image)) { - uniform_data_ = std::make_shared(UniformData{ - sizes_, - {0, 0, 0, 0}, - {{0, 0, 0}}, - static_cast(utils::multiply_integers(sizes_))}); - set_logical_limits(storage_->image_extents_); + TextureLimits logical_limits( + calculate_logical_limits(storage_->image_extents_, axis_map_)); + uniform_data_ = std::make_shared( + UniformData{sizes_, {0, 0, 0, 0}, {0, 0, 0, 0}, logical_limits, numel_}); } vTensor::vTensor(vTensor& other) @@ -518,18 +662,11 @@ vTensor::vTensor(vTensor& other) dim_order_(other.dim_order_.begin(), other.dim_order_.end()), axis_map_(other.axis_map_.begin(), other.axis_map_.end()), strides_(other.strides_.begin(), other.strides_.end()), - padded_sizes_{other.padded_sizes_.begin(), other.padded_sizes_.end()}, - unsqueezed_strides_{ - other.unsqueezed_strides_.begin(), - other.unsqueezed_strides_.end()}, - padded_numel_(other.padded_numel_), + numel_(other.numel_), + hashed_layout_(other.hashed_layout_), + nbytes_per_ubo_{other.nbytes_per_ubo_}, + max_ubo_nbytes_{other.max_ubo_nbytes_}, uniforms_(), - // Empty initialize Utility Uniform Buffers - uniforms_size_(0), - sizes_uniform_offset_(kUniformOffsetUnset), - unsqueezed_strides_offset_(kUniformOffsetUnset), - numel_uniform_offset_(kUniformOffsetUnset), - logical_limits_uniform_offset_(kUniformOffsetUnset), // Copy Tensor storage storage_(other.storage_) { uniform_data_ = std::make_shared(*other.get_uniform_data()); @@ -546,22 +683,21 @@ vTensor::vTensor( dim_order_(dim_order.begin(), dim_order.end()), axis_map_(calculate_axis_map(sizes_, utils::kDefaultAxisMap)), strides_(calculate_strides(sizes_, dim_order_)), - padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)}, - unsqueezed_strides_{ - unsqueeze_strides(strides_, utils::multiply_integers(sizes_))}, - padded_numel_(utils::multiply_integers(padded_sizes_)), + numel_(other.numel_), + hashed_layout_(create_hashed_layout( + dim_order_, + axis_map_, + packed_dim_, + other.storage_type())), + 
nbytes_per_ubo_{other.nbytes_per_ubo_}, + max_ubo_nbytes_{other.max_ubo_nbytes_}, uniforms_(), - // Empty initialize Utility Uniform Buffers - uniforms_size_(0), - sizes_uniform_offset_(kUniformOffsetUnset), - unsqueezed_strides_offset_(kUniformOffsetUnset), - numel_uniform_offset_(kUniformOffsetUnset), - logical_limits_uniform_offset_(kUniformOffsetUnset), // Copy Tensor storage storage_(other.storage_) { uniform_data_ = std::make_shared(UniformData{ sizes_, - unsqueezed_strides_, + create_whcn_dim_order(dim_order_), + unsqueeze_strides(strides_, numel_), {other.logical_limits()}, static_cast(utils::multiply_integers(sizes_))}); @@ -584,6 +720,7 @@ uint32_t vTensor::UniformData::write_attribute( } switch (attr) { WRITE_ATTRIBUTE_CASE(SIZES, sizes_v); + WRITE_ATTRIBUTE_CASE(WHCN_DIM_ORDER, whcn_dim_order_v); WRITE_ATTRIBUTE_CASE(STRIDES, strides_v); WRITE_ATTRIBUTE_CASE(LOGICAL_LIMITS, logical_limits); WRITE_ATTRIBUTE_CASE(NUMEL, numel); @@ -624,12 +761,6 @@ vkapi::VulkanBuffer& vTensor::buffer( return storage_->buffer_; } -void vTensor::set_logical_limits(const utils::uvec3& image_extents) { - uniform_data_->logical_limits.limits[0] = image_extents[axis_map_.at(0)]; - uniform_data_->logical_limits.limits[1] = image_extents[axis_map_.at(1)]; - uniform_data_->logical_limits.limits[2] = image_extents[axis_map_.at(2)]; -} - utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { switch (packed_dim_) { case WHCN::kWidthDim: @@ -643,95 +774,108 @@ utils::GPUMemoryLayout vTensor::estimate_memory_layout() const { } } +bool vTensor::is_contiguous() const { + if (storage_type() != utils::kBuffer) { + return false; + } + for (size_t i = 0; i < dim_order_.size(); ++i) { + if (dim_order_.at(i) != i) { + return false; + } + } + return true; +} + +size_t vTensor::get_max_ubo_nbytes(const size_t nbytes_per_ubo) const { + // For texture backed tensors, the metadata fields needed are: + // sizes, logical limits + size_t max_metadata_field_count = 2u; + if (storage_type() == utils::kBuffer) { + // sizes, strides, dim order, numel + max_metadata_field_count = 4u; + } + return max_metadata_field_count * nbytes_per_ubo; +} + const vkapi::BufferBindInfo vTensor::sizes_ubo() { - const size_t size_per_ubo = - storage_->context_->adapter_ptr()->min_ubo_alignment(); - const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_->context_, max_ubo_size, true); + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); } if (sizes_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + size_per_ubo) <= max_ubo_size, + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, "Uniform data allocation has exceeded Tensor uniform buffer size"); sizes_uniform_offset_ = uniforms_size_; - uniforms_size_ += size_per_ubo; + uniforms_size_ += nbytes_per_ubo_; uniforms_.update(utils::make_whcn_ivec4(sizes_), sizes_uniform_offset_); } return vkapi::BufferBindInfo( - uniforms_.buffer(), sizes_uniform_offset_, size_per_ubo); + uniforms_.buffer(), sizes_uniform_offset_, nbytes_per_ubo_); } -const vkapi::BufferBindInfo vTensor::strides_ubo() { - const size_t size_per_ubo = - storage_->context_->adapter_ptr()->min_ubo_alignment(); - const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; +const vkapi::BufferBindInfo vTensor::dim_order_ubo() { if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_->context_, max_ubo_size, true); + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); } - if 
(unsqueezed_strides_offset_ == kUniformOffsetUnset) { + if (dim_order_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + size_per_ubo) <= max_ubo_size, + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, "Uniform data allocation has exceeded Tensor uniform buffer size"); - unsqueezed_strides_offset_ = uniforms_size_; - uniforms_size_ += size_per_ubo; + dim_order_uniform_offset_ = uniforms_size_; + uniforms_size_ += nbytes_per_ubo_; uniforms_.update( - utils::make_whcn_ivec4(unsqueezed_strides_), - unsqueezed_strides_offset_); + uniform_data_->whcn_dim_order_v, dim_order_uniform_offset_); + } + return vkapi::BufferBindInfo( + uniforms_.buffer(), dim_order_uniform_offset_, nbytes_per_ubo_); +} + +const vkapi::BufferBindInfo vTensor::strides_ubo() { + if (!uniforms_.buffer()) { + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); + } + if (strides_uniform_offset == kUniformOffsetUnset) { + VK_CHECK_COND( + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, + "Uniform data allocation has exceeded Tensor uniform buffer size"); + strides_uniform_offset = uniforms_size_; + uniforms_size_ += nbytes_per_ubo_; + uniforms_.update(uniform_data_->strides_v, strides_uniform_offset); } return vkapi::BufferBindInfo( - uniforms_.buffer(), unsqueezed_strides_offset_, size_per_ubo); + uniforms_.buffer(), strides_uniform_offset, nbytes_per_ubo_); } const vkapi::BufferBindInfo vTensor::logical_limits_ubo() { - const size_t size_per_ubo = - storage_->context_->adapter_ptr()->min_ubo_alignment(); - const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_->context_, max_ubo_size, true); + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); } if (logical_limits_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + size_per_ubo) <= max_ubo_size, + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, "Uniform data allocation has exceeded Tensor uniform buffer size"); logical_limits_uniform_offset_ = uniforms_size_; - uniforms_size_ += size_per_ubo; + uniforms_size_ += nbytes_per_ubo_; uniforms_.update(logical_limits(), logical_limits_uniform_offset_); } return vkapi::BufferBindInfo( - uniforms_.buffer(), logical_limits_uniform_offset_, size_per_ubo); + uniforms_.buffer(), logical_limits_uniform_offset_, nbytes_per_ubo_); } const vkapi::BufferBindInfo vTensor::numel_ubo() { - const size_t size_per_ubo = - storage_->context_->adapter_ptr()->min_ubo_alignment(); - const size_t max_ubo_size = kMaxMetadataFieldCount * size_per_ubo; if (!uniforms_.buffer()) { - uniforms_ = ParamsBuffer(storage_->context_, max_ubo_size, true); + uniforms_ = ParamsBuffer(storage_->context_, max_ubo_nbytes_, true); } if (numel_uniform_offset_ == kUniformOffsetUnset) { VK_CHECK_COND( - (uniforms_size_ + size_per_ubo) <= max_ubo_size, + (uniforms_size_ + nbytes_per_ubo_) <= max_ubo_nbytes_, "Uniform data allocation has exceeded Tensor uniform buffer size"); numel_uniform_offset_ = uniforms_size_; - uniforms_size_ += size_per_ubo; + uniforms_size_ += nbytes_per_ubo_; uniforms_.update(numel(), numel_uniform_offset_); } return vkapi::BufferBindInfo( - uniforms_.buffer(), numel_uniform_offset_, size_per_ubo); -} - -size_t vTensor::staging_buffer_numel() const { - const bool is_int8 = dtype_ == vkapi::kChar; - const bool int8_supported = - storage_->context_->adapter_ptr()->has_full_int8_buffers_support(); - if (is_int8 && !int8_supported) { - return utils::align_up_4(numel()); - } - 
if (storage_type() == utils::kBuffer) { - return numel(); - } - return padded_numel_; + uniforms_.buffer(), numel_uniform_offset_, nbytes_per_ubo_); } VkMemoryRequirements vTensor::get_memory_requirements() const { @@ -758,33 +902,36 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) { } void vTensor::update_metadata() { + numel_ = utils::multiply_integers(sizes_); strides_ = calculate_strides(sizes_, dim_order_); - uniform_data_->numel = utils::multiply_integers(sizes_); - - padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_); - unsqueezed_strides_ = unsqueeze_strides(strides_, numel()); - padded_numel_ = utils::multiply_integers(padded_sizes_); // Update uniform data if it has been modified + uniform_data_->numel = numel_; uniform_data_->sizes_v = utils::make_whcn_ivec4(sizes_); - uniform_data_->strides_v = utils::make_whcn_ivec4(unsqueezed_strides_); - - // Calculate the image extents that would have been used to allocate a texture - // withthe current sizes, and use that to set the logical limits. - set_logical_limits( - calculate_image_extents(padded_sizes_, axis_map_, packed_dim_)); + uniform_data_->whcn_dim_order_v = + utils::make_ivec4(create_whcn_dim_order(dim_order_)); + uniform_data_->strides_v = + utils::make_whcn_ivec4(unsqueeze_strides(strides_, numel_)); + uniform_data_->numel = utils::safe_downcast(numel_); + uniform_data_->logical_limits.limits = + calculate_logical_limits(sizes_, axis_map_, packed_dim_); if (sizes_uniform_offset_ != kUniformOffsetUnset) { uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_); } - if (unsqueezed_strides_offset_ != kUniformOffsetUnset) { - uniforms_.update(uniform_data_->strides_v, unsqueezed_strides_offset_); + if (dim_order_uniform_offset_ != kUniformOffsetUnset) { + uniforms_.update( + uniform_data_->whcn_dim_order_v, dim_order_uniform_offset_); + } + if (strides_uniform_offset != kUniformOffsetUnset) { + uniforms_.update(uniform_data_->strides_v, strides_uniform_offset); } if (numel_uniform_offset_ != kUniformOffsetUnset) { - uniforms_.update(numel(), numel_uniform_offset_); + uniforms_.update(numel_, numel_uniform_offset_); } if (logical_limits_uniform_offset_ != kUniformOffsetUnset) { - uniforms_.update(logical_limits(), logical_limits_uniform_offset_); + uniforms_.update( + uniform_data_->logical_limits.limits, logical_limits_uniform_offset_); } } @@ -792,8 +939,8 @@ void vTensor::check_sizes(const std::vector& sizes) const { if (storage_type() != utils::kBuffer) { // For texture storage check that the current texture is large enough for // the new sizes of the tensor. 
- utils::uvec3 virtual_extents = - calculate_image_extents(padded_sizes_, axis_map_, packed_dim_); + utils::uvec3 virtual_extents = calculate_image_extents( + calculate_padded_sizes(sizes_, packed_dim_), axis_map_, packed_dim_); bool valid_resize = virtual_extents[0] <= storage_->image_extents_[0]; valid_resize = @@ -828,6 +975,11 @@ void vTensor::virtual_reconfigure( check_sizes(new_sizes); sizes_ = new_sizes; dim_order_ = new_dim_order; + + // Update the hashed layout because dim order is updated + hashed_layout_ = + create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); + update_metadata(); } @@ -837,6 +989,7 @@ void vTensor::virtual_clone(const vTensor& other) { dim_order_ = other.dim_order_; axis_map_ = other.axis_map_; packed_dim_ = other.packed_dim_; + hashed_layout_ = other.hashed_layout_; *uniform_data_ = *other.get_uniform_data(); } @@ -895,6 +1048,11 @@ void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) { axis_map_.at(3) = dim0_whcn; } } + + // Update the hashed layout because dim order / axis mpa is updated + hashed_layout_ = + create_hashed_layout(dim_order_, axis_map_, packed_dim_, storage_type()); + update_metadata(); } diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index 850dc2d7fab..78a24d87e77 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -81,6 +81,18 @@ struct LastAccess { : stage{stage_flags}, access{access_flags} {} }; +/* + * Calculate the number of elements that a GPU buffer would require to store the + * contents of a tensor. This will depend on the storage type and dtype of the + * tensor, as well as the features available on the device. + */ +int64_t calculate_gpu_buffer_numel( + Context* const context, + const std::vector& sizes, + const utils::uvec3 image_extents, + const utils::StorageType storage_type, + const vkapi::ScalarType dtype); + class vTensorStorage final { public: // Do not allow empty vTensorStorage construction @@ -91,7 +103,7 @@ class vTensorStorage final { const utils::StorageType storage_type, const std::vector& axis_map, const int32_t packed_dim, - const std::vector& padded_sizes, + const std::vector& sizes, const vkapi::ScalarType dtype, const bool allocate_memory = true); @@ -140,6 +152,10 @@ class vTensorStorage final { void verify() const; public: + inline size_t buffer_len() const { + return utils::safe_downcast(buffer_length_); + } + inline VkFormat texture_format() { return image_.format(); } @@ -207,8 +223,11 @@ class vTensor final { vTensor(vTensor&& other) = default; vTensor& operator=(vTensor&& other) = default; + ~vTensor() = default; + enum class Attribute : uint8_t { SIZES, + WHCN_DIM_ORDER, STRIDES, LOGICAL_LIMITS, NUMEL, @@ -216,6 +235,7 @@ class vTensor final { class UniformData { utils::ivec4 sizes_v; + utils::ivec4 whcn_dim_order_v; utils::ivec4 strides_v; // See the comments documenting logical_limits() for more context. 
TextureLimits logical_limits; @@ -227,10 +247,12 @@ class vTensor final { UniformData( const std::vector& sizes, + const std::vector& whcn_dim_order, const std::vector& strides, const TextureLimits& logical_limits, const size_t numel_ll) : sizes_v(utils::make_whcn_ivec4(sizes)), + whcn_dim_order_v(utils::make_ivec4(whcn_dim_order)), strides_v(utils::make_whcn_ivec4(strides)), logical_limits(logical_limits), numel(utils::safe_downcast(numel_ll)) {} @@ -293,21 +315,17 @@ class vTensor final { // strides of the tensor in NCHW dimension order std::vector strides_; - /* - * The below metadata members are derived from the above, and are typically - * to i.e. pass tensor metadata to compute shaders. - */ + // number of elements based on the canonical sizes + size_t numel_; + + // For texture backed tensors, this int32 contains the axis map data packed + // into a single int32. For buffer backed tensors, this int32 contains the + // wchn dim order data packed into a single int32. + int32_t hashed_layout_; - // padded sizes of the tensor in NCHW dimension order. See the - // calculate_padded_sizes() function for more context. Note that padded sizes - // are only used for texture storage, and not for buffer storage. - std::vector padded_sizes_; - // Contains the strides of the tensor, with the dimensionality padded to the - // nearest multiple of 4. Unsqueezed dims will have a stride of int32_t max. - std::vector unsqueezed_strides_; - // Contains the number of elements in the tensor according to the padded - // sizes. - size_t padded_numel_; + // Pre-compute these quantities to avoid frequent re-computation + size_t nbytes_per_ubo_; + size_t max_ubo_nbytes_; /* * Utility GPU buffer that can be passed to shaders in order to convey tensor @@ -320,15 +338,13 @@ class vTensor final { * context about the data contained in each buffer. */ ParamsBuffer uniforms_; - uint32_t uniforms_size_; - uint32_t sizes_uniform_offset_; - uint32_t unsqueezed_strides_offset_; - uint32_t numel_uniform_offset_; - uint32_t logical_limits_uniform_offset_; - // Maximum number of metadata fields that can be stored in the metadata UBO. - // This is used to calculate the size of the UBO that should be allocated. - constexpr static size_t kMaxMetadataFieldCount = 4; + uint32_t uniforms_size_ = 0u; + uint32_t sizes_uniform_offset_ = kUniformOffsetUnset; + uint32_t dim_order_uniform_offset_ = kUniformOffsetUnset; + uint32_t strides_uniform_offset = kUniformOffsetUnset; + uint32_t numel_uniform_offset_ = kUniformOffsetUnset; + uint32_t logical_limits_uniform_offset_ = kUniformOffsetUnset; // Initial value of uniform buffer offsets. 1 is selected as it is essentially // impossible for a ubo to have an offset of 1. @@ -381,9 +397,6 @@ class vTensor final { return storage_->storage_type_ == utils::kBuffer; } - private: - void set_logical_limits(const utils::uvec3& image_extents); - public: /* * The logical limits of the tensor are derived from the image extents of the @@ -451,21 +464,37 @@ class vTensor final { return dim_order_; } + inline const std::vector& strides() const { + return strides_; + } + + inline size_t numel() const { + return numel_; + } + + inline size_t nbytes() const { + return element_size(dtype()) * numel(); + } + inline const std::vector& axis_map() const { return axis_map_; } /* - * Returns a single int32_t that contains the values of the axis map and the - * packed dimension packed into a single int32_t, such that it can be used as - * a specialization constant in a compute shader. 
This allows for the SPIR-V - * to bytecode compilation to perform compile-time unfolding on the axis map. - * Each element of the axis map and the value of the packed dimension take up - * 4 bits in the packed int32_t. + * For texture backed tensors, this function return a int32_t that contains + * the axis map + packed dimension. Each element of the axis map occupies 4 + * bits of the int32. + * + * For buffer backed tensors, the int32_t contains the WHCN dim order, where + * each element of the dim order array occupies 4 bits of the int32. + * + * This int32 is typically consumed as a specialization constant in compute + * shaders where it is subsequently unpacked. The layout data of a vTensor + * instance is typically static once created, which is why this method is + * appropriate. */ inline int32_t hashed_layout() const { - return axis_map_.at(0) + (axis_map_.at(1) << 4) + (axis_map_.at(2) << 8) + - (axis_map_.at(3) << 12) + (packed_dim_ << 16); + return hashed_layout_; } /* @@ -478,57 +507,48 @@ class vTensor final { return axis_map_.at(0) == 0 && axis_map_.at(1) == 1 && axis_map_.at(2) == 2; } - inline const std::vector& strides() const { - return strides_; - } + /* + * Return true if a buffer backed tensor's dim order matches that of a + * contiguous tensor, i.e. the dim order will be {0, 1, 2, ... }. + * Returns false for texture backed tensors. + */ + bool is_contiguous() const; - inline const std::vector& unsqueezed_strides() const { - return unsqueezed_strides_; + private: + inline size_t nbytes_per_ubo() const { + return storage_->context_->adapter_ptr()->min_ubo_alignment(); } + size_t get_max_ubo_nbytes(const size_t nbytes_per_ubo) const; + + public: /* - * Returns a GPU buffer containing the sizes of the tensor in WHCN order. - * Note that dimensions that are not present in the tensor's sizes are set to - * a size of 1. + * The functions below return the buffer binding info for a UBO that contains + * some metadata of the tensor, which can be used to pass in tensor metadata + * to a compute shader. The other method of passing in tensor metadata is via + * push constants. The trade-off between each is that push constants may be + * slightly more performant and memory efficient; however, to update the + * values in a push constant due to i.e. a tensor resize between inferences, + * the command buffer must be re-encoded. On the other hand, UBOs can update + * their data by writing to their mapped memory without requiring a command + * buffer re-encode. */ + const vkapi::BufferBindInfo sizes_ubo(); - /* - * Returns a GPU buffer containing the strides of the tensor in WHCN order. - * Note that the strides are extended to a dimensionality that is a multiple - * of 4, thus dimensions that are not present in the tensor's sizes are set to - * have a stride equal to the stride of the "slowest moving" dimension. - */ + const vkapi::BufferBindInfo dim_order_ubo(); + const vkapi::BufferBindInfo strides_ubo(); - /* - * Returns a GPU buffer containing the logical limits of the tensor. See the - * comments for logical_limits() for more context. - */ const vkapi::BufferBindInfo logical_limits_ubo(); - /* - * Returns the number of elements in the buffer used to store the tensor. 
- */ const vkapi::BufferBindInfo numel_ubo(); - inline size_t numel() const { - return uniform_data_->numel; - } - - inline size_t nbytes() const { - return element_size(dtype()) * numel(); - } - - /* - * Returns numel but based on padded_sizes_ instead of sizes_ - */ - inline size_t padded_numel() const { - return padded_numel_; + public: + inline size_t staging_buffer_numel() const { + return storage_->buffer_len(); } - size_t staging_buffer_numel() const; - inline size_t staging_buffer_nbytes() const { return element_size(dtype()) * staging_buffer_numel(); } @@ -608,6 +628,8 @@ class vTensor final { }; static constexpr vTensor::Attribute kTensorSizes = vTensor::Attribute::SIZES; +static constexpr vTensor::Attribute kTensorDimOrder = + vTensor::Attribute::WHCN_DIM_ORDER; static constexpr vTensor::Attribute kTensorStrides = vTensor::Attribute::STRIDES; static constexpr vTensor::Attribute kTensorLogicalLimits = diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 31514989dfc..21d80d5843f 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -346,6 +346,10 @@ class ComputeGraph final { return values_.at(idx).toTensor().strides_ubo(); } + inline vkapi::BufferBindInfo dim_order_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().dim_order_ubo(); + } + inline vkapi::BufferBindInfo numel_ubo(const ValueRef idx) { return values_.at(idx).toTensor().numel_ubo(); } @@ -354,6 +358,10 @@ class ComputeGraph final { return values_.at(idx).toTensor().has_standard_axis_map(); } + inline bool is_contiguous(const ValueRef idx) const { + return values_.at(idx).toTensor().is_contiguous(); + } + inline vkapi::BufferBindInfo logical_limits_ubo(const ValueRef idx) { return values_.at(idx).toTensor().logical_limits_ubo(); } @@ -363,6 +371,12 @@ class ComputeGraph final { values_.at(idx).toConstTensor().get_uniform_data(), api::kTensorSizes); } + inline PushConstantDataInfo dim_order_pc_of(const ValueRef idx) const { + return PushConstantDataInfo( + values_.at(idx).toConstTensor().get_uniform_data(), + api::kTensorDimOrder); + } + inline PushConstantDataInfo strides_pc_of(const ValueRef idx) const { return PushConstantDataInfo( values_.at(idx).toConstTensor().get_uniform_data(), diff --git a/backends/vulkan/runtime/vk_api/Descriptor.cpp b/backends/vulkan/runtime/vk_api/Descriptor.cpp index 938666802ef..9e8394ffa9c 100644 --- a/backends/vulkan/runtime/vk_api/Descriptor.cpp +++ b/backends/vulkan/runtime/vk_api/Descriptor.cpp @@ -32,8 +32,8 @@ BufferBindInfo::BufferBindInfo( BufferBindInfo::BufferBindInfo( const VulkanBuffer& buffer_p, - const uint32_t offset_p, - const uint32_t range_p) + const size_t offset_p, + const size_t range_p) : handle(buffer_p.handle()), offset(buffer_p.mem_offset() + offset_p), range(range_p) { diff --git a/backends/vulkan/runtime/vk_api/Descriptor.h b/backends/vulkan/runtime/vk_api/Descriptor.h index 60d66a22619..15ea5e23e33 100644 --- a/backends/vulkan/runtime/vk_api/Descriptor.h +++ b/backends/vulkan/runtime/vk_api/Descriptor.h @@ -36,8 +36,8 @@ struct BufferBindInfo final { BufferBindInfo(const VulkanBuffer& buffer_p, const uint32_t offset_p = 0u); BufferBindInfo( const VulkanBuffer& buffer_p, - const uint32_t offset_p, - const uint32_t range_p); + const size_t offset_p, + const size_t range_p); }; struct ParamsBindList final { diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp index 
c4ccc860bc2..17f197dfdeb 100644 --- a/backends/vulkan/test/vulkan_compute_api_test.cpp +++ b/backends/vulkan/test/vulkan_compute_api_test.cpp @@ -259,14 +259,10 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) { /*allocate_memory = */ false); ASSERT_TRUE(new_v_tensor.strides() == ref_strides); - ASSERT_TRUE( - new_v_tensor.unsqueezed_strides() == ref_unsqueezed_strides); // Resize vtensor and check that updated metadata is correct v_tensor_to_resize.virtual_reconfigure(sizes, dim_order); ASSERT_TRUE(v_tensor_to_resize.strides() == ref_strides); - ASSERT_TRUE( - v_tensor_to_resize.unsqueezed_strides() == ref_unsqueezed_strides); } } } @@ -1003,18 +999,14 @@ TEST_F(VulkanComputeAPITest, texture_virtual_resize) { b.virtual_resize(new_sizes); c.virtual_resize(new_sizes); - fill_staging( - staging_buffer_a, float(new_sizes[1] + 1.5f), a.staging_buffer_numel()); - fill_staging( - staging_buffer_b, - float(new_sizes[2] + 55.0f), - b.staging_buffer_numel()); + fill_staging(staging_buffer_a, float(new_sizes[1] + 1.5f), a.numel()); + fill_staging(staging_buffer_b, float(new_sizes[2] + 55.0f), b.numel()); submit_to_gpu(); check_staging_buffer( staging_buffer_c, float(new_sizes[1] + new_sizes[2] + 56.5f), - c.staging_buffer_numel()); + c.numel()); } } @@ -1096,7 +1088,6 @@ TEST_F(VulkanComputeAPITest, test_tensor_creation_from_vulkan_image) { const auto exp_numel = w * h * d * 4; EXPECT_TRUE(tensor.numel() == exp_numel); - EXPECT_TRUE(tensor.padded_numel() == exp_numel); } TEST(VulkanComputeGraphTest, test_values_scalars) { From ea7963113daba1173609bc0ae6b4ceb4e2656c22 Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 12 Jun 2025 12:34:28 -0700 Subject: [PATCH 2/3] [ET-VK] Use dim order when converting buffer index to tensor index Pull Request resolved: https://github.com/pytorch/executorch/pull/11600 ## Changes * Update callsites to `bufi_to_tidx` to account for the tensor dim order * Remove existing functions which do not accept dim order as argument. ## Motivation > Update callsites to `bufi_to_tidx` to account for the tensor dim order > Remove existing functions which do not accept dim order as argument. As mentioned in the below diff, dim order is required to properly convert from a linear buffer index to N-dimension tensor index using a tensor's strides. Technically the dim order can be inferred from the strides array by performing an index sort. However, for the sake of efficiency it is better to just pass the dim order directly into the compute shader. Currently the `bufi_to_tidx` function which performs the conversion between buffer index and tensor index assumes that the dim order follows a specific pattern using the packed dim as an input. However, it is not guaranteed that the dim order is the same as what is assumed. Furthermore, there is an existing bug when calling `bufi_to_tidx` without providing `packed_dim` as an input. In this case, the function will infer the packed dim by finding the first dim with a stride of 1. However, this causes issues when multiple dims may have a stride of 1, which may occur when there are dims with a size of 1. In this case the wrong packed dim may be inferred and therefore the assumed dim order is completely wrong. To address these issues, make it standard to either account for the packed dim when converting bufi to tidx, or to explicitly call out an assumption about the tensor's dim order. 
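To make the conversion concrete, below is a minimal C++ sketch that mirrors the dim-order-aware `bufi_to_tidx` helper added to `indexing_utils.h`; the standalone function signature and fixed 4-element arrays are illustrative only, not the actual shader or runtime API. It peels off the slowest moving dims first, as listed by the dim order (fastest moving dim at index 0), which stays correct even when several dims have a stride of 1.

```cpp
#include <array>
#include <cstdint>

// Sketch (assumed names/types): convert a linear buffer index into a 4-D
// tensor index by walking dims from slowest to fastest moving, as given by
// dim_order. Relying on strides alone to guess the packed dim can pick the
// wrong dim when multiple dims have stride 1.
std::array<int32_t, 4> bufi_to_tidx(
    int32_t bufi,
    const std::array<int32_t, 4>& strides,
    const std::array<int32_t, 4>& dim_order) {
  std::array<int32_t, 4> tidx{};
  for (int i = 3; i >= 0; --i) {
    const int32_t dim = dim_order[i];
    tidx[dim] = bufi / strides[dim];
    bufi %= strides[dim];
  }
  return tidx;
}
```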
## Performance Impact * None expected ghstack-source-id: 290022827 @exported-using-ghexport Differential Revision: [D76393428](https://our.internmc.facebook.com/intern/diff/D76393428/) --- .../runtime/graph/ops/glsl/binary_op.glsl | 13 ++--- .../runtime/graph/ops/glsl/indexing_utils.h | 56 +++++++++---------- .../runtime/graph/ops/glsl/linear_qcsnw.glsl | 2 +- .../graph/ops/glsl/nchw_to_buffer.glsl | 17 +++--- .../runtime/graph/ops/glsl/select.glslh | 4 ++ .../vulkan/runtime/graph/ops/glsl/slice.glslh | 4 ++ .../graph/ops/glsl/transfer_buffer.glsl | 8 ++- .../vulkan/runtime/graph/ops/glsl/where.glsl | 28 +++------- .../runtime/graph/ops/impl/BinaryOp.cpp | 6 +- .../graph/ops/impl/QuantizedLinearQCSNW.cpp | 4 ++ .../runtime/graph/ops/impl/Transfer.cpp | 16 ++---- .../vulkan/runtime/graph/ops/impl/Where.cpp | 7 +-- backends/vulkan/test/op_tests/cases.py | 6 +- backends/vulkan/test/utils/test_utils.cpp | 3 +- 14 files changed, 82 insertions(+), 92 deletions(-) diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl index ce986d4e12f..a0a235154a0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl @@ -48,19 +48,18 @@ $else: layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} +${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")} + $if STORAGE == "buffer": - ${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "in_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "other_packed_dim", "DEFAULT_LAYOUT")} + const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); $else: - ${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); const lowp int packed_dim = unhash_packed_dim(out_layout); - ${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} const lowp ivec4 in_axis_map = unhash_axis_map(in_layout); - ${layout_declare_spec_const(C, "int", "other_layout", "DEFAULT_LAYOUT")} const lowp ivec4 other_axis_map = unhash_axis_map(other_layout); #ifdef USING_BUFFER @@ -77,7 +76,7 @@ void main() { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); const ivec4 in_tidx = min(out_tidx, in_sizes - 1); const ivec4 other_tidx = min(out_tidx, other_sizes - 1); diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h index 2b41d2b7e1a..0cfd7f2f119 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h @@ -68,21 +68,6 @@ */ #define mod4(x) ((x) & 3) -/* - * Find the packed dimension of a tensor given its strides. The packed dimension - * is the "fastest moving" dimension which will have a stride of 1. - */ -int find_packed_dim(const ivec4 strides) { - int packed_dim = 0; - for (int i = 0; i <= 3; i++) { - if (strides[i] == 1) { - packed_dim = i; - break; - } - } - return packed_dim; -} - /* * Get the staging buffer indices that contain the data of the texel that * corresponds to the provided tensor index. 
Since the texel have 4 elements, @@ -129,27 +114,26 @@ int tidx_to_nchwi(const ivec4 tidx, const ivec4 sizes) { tidx.x; } -// TODO(ssjia): make this function use dim order so that it can work with any -// dim order. Currently it assumes that the dim order is contiguous, except for -// the packed dim. -ivec4 bufi_to_tidx(int bufi, const ivec4 strides, const int packed_dim) { +ivec4 bufi_to_tidx(int bufi, const ivec4 strides, const ivec4 dim_order) { ivec4 idx; for (int i = 3; i >= 0; i--) { - if (i != packed_dim) { - idx[i] = bufi / strides[i]; - bufi %= strides[i]; - } + int dim = dim_order[i]; + idx[dim] = bufi / strides[dim]; + bufi %= strides[dim]; } - idx[packed_dim] = bufi; return idx; } -// Convenience overload of the above function, which will determine the packed -// dim from the strides automatically so it doesn't have to be passed in as a -// function argument. -ivec4 bufi_to_tidx(const int bufi, const ivec4 strides) { - int packed_dim = find_packed_dim(strides); - return bufi_to_tidx(bufi, strides, packed_dim); +/* + * bufi_to_tidx but assumes that the tensor is contiguous + */ +ivec4 contiguous_bufi_to_tidx(int bufi, const ivec4 strides) { + ivec4 idx; + for (int i = 3; i >= 0; i--) { + idx[i] = bufi / strides[i]; + bufi %= strides[i]; + } + return idx; } int tidx_to_bufi(const ivec4 tidx, ivec4 strides) { @@ -269,12 +253,22 @@ ivec3 lpos_to_pos(const ivec3 lpos, const ivec4 axis_map) { * e.g. 0x11021, 1 -> ivec4(1, 2, 0, 1) */ #define unhash_axis_map(hash) \ - ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf)) + (ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf))) + +/* + * + */ +#define unhash_dim_order(hash) \ + (ivec4(hash & 0xf, (hash >> 4) & 0xf, (hash >> 8 & 0xf), (hash >> 12 & 0xf))) #define unhash_packed_dim(hash) int(hash >> 16 & 0xf) #define DEFAULT_LAYOUT 0x02210 +#define DEFAULT_DIM_ORDER 0x03210 + +#define DEFAULT_DIM_ORDER_IVEC4 ivec4(0, 1, 2, 3) + /************************ * Deprecated Functions * ************************/ diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl index dfb5f1f2f9c..4dd83f0d4ed 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_qcsnw.glsl @@ -62,7 +62,7 @@ void main() { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, 0); + const ivec4 out_tidx = contiguous_bufi_to_tidx(out_bufi, out_strides); const FLOAT_T scale = t_scales[out_tidx.x]; diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl index ba4e4dd9dd9..62cd0610ffb 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl @@ -10,8 +10,8 @@ ${define_required_extensions(DTYPE)} layout(std430) buffer; -${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "nchw_in", DTYPE, STORAGE)} +${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "nchw_in", DTYPE, STORAGE)} $if USE_PUSH_CONST: layout(push_constant) uniform restrict Block { @@ -20,15 +20,14 @@ $if USE_PUSH_CONST: int numel; }; $else: - ${layout_declare_ubo(2, "ivec4", "out_sizes")} - ${layout_declare_ubo(3, "ivec4", "out_strides")} - ${layout_declare_ubo(4, "int", "numel")} + ${layout_declare_ubo(B, "ivec4", "out_sizes")} + ${layout_declare_ubo(B, "ivec4", "out_strides")} + 
${layout_declare_ubo(B, "int", "numel")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -// This constant is unused in this shader but is kept so that the signature is -// consistent with nchw_to_image. -${layout_declare_spec_const(C, "int", "UNUSED_layout", "0")} +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_DIM_ORDER")} +const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); ${layout_declare_spec_const(C, "int", "transpose_hw", "0")} void main() { @@ -37,7 +36,7 @@ void main() { return; } - ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides); + ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); ivec4 sizes = out_sizes; if (transpose_hw == 1) { diff --git a/backends/vulkan/runtime/graph/ops/glsl/select.glslh b/backends/vulkan/runtime/graph/ops/glsl/select.glslh index 3bcbf04a3ba..6509015b4b6 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/select.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/select.glslh @@ -9,6 +9,8 @@ #ifndef SELECT_GLSLH #define SELECT_GLSLH +#ifndef USING_BUFFER + /* * Enable the fast path if a texel loaded from the input texture can be used as * is to store to the output texture. The following conditions must be met: @@ -29,6 +31,8 @@ bool can_use_fast_path() { return true; } +#endif // USING_BUFFER + /* * Given an output tensor index, return the corresponding input tensor index for * the select operator. This is done by "inserting" the select index at the diff --git a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh index 5d4cc70fdc1..87325754f4d 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/slice.glslh +++ b/backends/vulkan/runtime/graph/ops/glsl/slice.glslh @@ -9,6 +9,8 @@ #ifndef SLICE_GLSLH #define SLICE_GLSLH +#ifndef USING_BUFFER + /** * Enable the fast path if a texel loaded from the input texture can be used as * is to store to the output texture. The following conditions must be met: @@ -26,6 +28,8 @@ bool can_use_fast_path() { return true; } +#endif // USING_BUFFER + /* * Converts output tensor indices to input tensor indices for the slice operation. 
* This function maps the output indices to the corresponding input indices based on diff --git a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl index 3ca854e0526..7e95b52d8f4 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/transfer_buffer.glsl @@ -37,8 +37,10 @@ layout(push_constant) uniform restrict Block { int selected_dim; }; -${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")} -${layout_declare_spec_const(C, "int", "in_packed_dim", "DEFAULT_LAYOUT")} +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")} + +const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -50,7 +52,7 @@ void main() { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx); const int in_bufi = tidx_to_bufi(in_tidx, in_strides); diff --git a/backends/vulkan/runtime/graph/ops/glsl/where.glsl b/backends/vulkan/runtime/graph/ops/glsl/where.glsl index 5df813d1241..fe6304c0fa0 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/where.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/where.glsl @@ -37,40 +37,28 @@ $if STORAGE == "buffer": ${layout_declare_ubo(B, "ivec4", "cond_strides")} ${layout_declare_ubo(B, "ivec4", "self_strides")} ${layout_declare_ubo(B, "ivec4", "other_strides")} - - ${layout_declare_spec_const(C, "int", "out_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "cond_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "self_packed_dim", "DEFAULT_LAYOUT")} - ${layout_declare_spec_const(C, "int", "other_packed_dim", "DEFAULT_LAYOUT")} $else: ${layout_declare_ubo(B, "ivec3", "out_limits")} +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_DIM_ORDER")} + +const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); + layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; #ifdef USING_BUFFER void main() { int out_bufi = int(gl_GlobalInvocationID.x); - // ivec4 tidx = ivec4(gl_GlobalInvocationID, 0); - // int out_bufi = tidx_to_bufi(tidx, out_strides); - // int cond_bufi = tidx_to_bufi(tidx, cond_strides); - // int self_bufi = tidx_to_bufi(tidx, self_strides); - // int other_bufi = tidx_to_bufi(tidx, other_strides); if (out_bufi >= out_numl) { return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); - out_bufi = tidx_to_bufi(out_tidx, out_strides); - - const ivec4 cond_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); - const int cond_bufi = tidx_to_bufi(cond_tidx, cond_strides); - - const ivec4 self_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); - const int self_bufi = tidx_to_bufi(self_tidx, self_strides); + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); - const ivec4 other_tidx = bufi_to_tidx(out_bufi, out_strides, out_packed_dim); - const int other_bufi = tidx_to_bufi(other_tidx, other_strides); + const int cond_bufi = tidx_to_bufi(out_tidx, cond_strides); + const int self_bufi = tidx_to_bufi(out_tidx, self_strides); + const int other_bufi = tidx_to_bufi(out_tidx, other_strides); COND_T cond = t_condition[cond_bufi] ; T v_self = t_self[self_bufi]; diff --git 
a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp index d260ed767d0..28279c196c0 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp @@ -143,9 +143,9 @@ void add_binary_op_buffer_node( PushConstantDataInfo(&alpha_val, sizeof(float)), }}, // Specialization Constants - {graph.packed_dim_of(out), - graph.packed_dim_of(in1), - graph.packed_dim_of(in2)}, + {graph.hashed_layout_of(out), + graph.hashed_layout_of(in1), + graph.hashed_layout_of(in2)}, // Resize Args {}, // Resizing Logic diff --git a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp index 6e101195e3f..07502a7a107 100644 --- a/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/QuantizedLinearQCSNW.cpp @@ -43,6 +43,10 @@ void check_linear_qcsnw_args( VK_CHECK_COND( utils::val_at(-1, scales_sizes) == utils::val_at(-2, qmat2_sizes)); } + + if (graph.is_buffer_storage(out)) { + VK_CHECK_COND(graph.is_contiguous(out)); + } } void resize_linear_qcsnw_node( diff --git a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp index 423c9789d67..7b5fad57483 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Transfer.cpp @@ -55,7 +55,6 @@ void add_transfer_copy_node( } transfer_params{static_cast(dim_whcn)}; std::vector push_constants; - vkapi::SpecVarList spec_vars; if (graph.is_buffer_storage(out)) { push_constants = { @@ -64,23 +63,18 @@ void add_transfer_copy_node( graph.strides_pc_of(in), graph.numel_pc_of(out), PushConstantDataInfo(&transfer_params, sizeof(transfer_params))}; - - spec_vars = { - graph.packed_dim_of(out), - graph.packed_dim_of(in), - }; } else { push_constants = { graph.sizes_pc_of(out), graph.sizes_pc_of(in), PushConstantDataInfo(&transfer_params, sizeof(transfer_params))}; - - spec_vars = { - graph.hashed_layout_of(out), - graph.hashed_layout_of(in), - }; } + vkapi::SpecVarList spec_vars = { + graph.hashed_layout_of(out), + graph.hashed_layout_of(in), + }; + // Determine the shader directly std::string kernel_name; if (transfer_type == TransferType::SELECT) { diff --git a/backends/vulkan/runtime/graph/ops/impl/Where.cpp b/backends/vulkan/runtime/graph/ops/impl/Where.cpp index a3be34830d3..ea610b1fe74 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Where.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Where.cpp @@ -54,7 +54,7 @@ void add_where_texture_node( // Push Constants {}, // Specialization Constants - {graph.packed_dim_of(out)}, + {graph.hashed_layout_of(out)}, // Resize Arguments {}, // Resizing Logic @@ -96,10 +96,7 @@ void add_where_buffer_node( // Push Constants {}, // Specialization Constants - {graph.packed_dim_of(out), - graph.packed_dim_of(cond), - graph.packed_dim_of(self), - graph.packed_dim_of(other)}, + {graph.hashed_layout_of(out)}, // Resize Arguments {}, // Resizing Logic diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index bd67933dc93..4ea61cd7ef3 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -52,13 +52,17 @@ def get_binary_elementwise_inputs(): ((S, S1, S2), (S, S1, 1), 2.0), ((S, S1, S2), (S, 1, S2), 2.0), ((XS, S, S1, S2), (XS, S, 1, 1), 2.0), + ((3, 64, 1), (1, 64, 1)), ] ) test_suite.layouts = [ "utils::kWidthPacked", 
"utils::kChannelsPacked", ] - test_suite.storage_types = ["utils::kBuffer", "utils::kTexture3D"] + test_suite.storage_types = [ + "utils::kBuffer", + "utils::kTexture3D", + ] return test_suite diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index 3f5dba9e277..faa0e7d0c47 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -26,13 +26,14 @@ void record_nchw_to_buffer_op( vkapi::VulkanBuffer& src_buffer, api::vTensor& v_dst) { vkapi::PipelineBarrier pipeline_barrier{}; + vkapi::SpecVarList specialization_constants = {v_dst.hashed_layout()}; context->submit_compute_job( get_nchw_to_tensor_shader(v_dst, true, false), pipeline_barrier, {uint32_t(v_dst.numel()), 1, 1}, {64, 1, 1}, - {}, + specialization_constants, VK_NULL_HANDLE, 0, v_dst.buffer( From 0995d76031dcf1ec4b082483c0c9e587ab7e136e Mon Sep 17 00:00:00 2001 From: Stephen Jia Date: Thu, 12 Jun 2025 12:34:29 -0700 Subject: [PATCH 3/3] [ET-VK] New implementation of `cat` operator Pull Request resolved: https://github.com/pytorch/executorch/pull/11508 ## Changes * Introduce `concat_texture.glsl` and `concat_buffer.glsl` to implement the `torch.cat` operator * Introduce `Concat.cpp` to replace `Cat.cpp` * Fix a bug with channels-packed buffer tensors where input data would be copied incorrectly with multiple dims have a stride of 1 ## Motivation > * Introduce `concat_texture.glsl` and `concat_buffer.glsl` to implement the `torch.cat` operator > * Introduce `Concat.cpp` to replace `Cat.cpp` The existing implementation of `torch.cat` uses the copy_channel_offset` shaders. However, these shaders have a critical bug where the output tensor is passed in separately with difference access types, i.e. ``` graph.execute_nodes().emplace_back(new DispatchNode( graph, VK_KERNEL_FROM_STR(kernel_name), global_size, local_size, // Inputs and Outputs { {out, vkapi::kWrite}, {out, vkapi::kRead}, {in, vkapi::kRead}, }, ``` This creates many validation layer errors because the memory barriers for the resource cannot be formed properly. The shader essentially relies on undefined behaviour to work correctly. The result is that the `cat` operator produces incorrect result on many platforms. Rather than fix the `copy_offset` shaders, I decided to just introduce new shaders to perform the concat operation. The new implementation handles both buffer and texture inputs and is agnostic to memory layout. 
ghstack-source-id: 290022825 Differential Revision: [D76305343](https://our.internmc.facebook.com/intern/diff/D76305343/) --- backends/vulkan/op_registry.py | 45 ++++- .../runtime/graph/ops/glsl/concat_buffer.glsl | 69 +++++++ .../runtime/graph/ops/glsl/concat_buffer.yaml | 14 ++ .../graph/ops/glsl/concat_texture.glsl | 129 ++++++++++++++ .../graph/ops/glsl/concat_texture.yaml | 14 ++ .../vulkan/runtime/graph/ops/impl/Cat.cpp | 98 ---------- .../vulkan/runtime/graph/ops/impl/Concat.cpp | 168 ++++++++++++++++++ backends/vulkan/test/op_tests/cases.py | 5 +- .../test/op_tests/utils/gen_correctness_vk.py | 3 +- backends/vulkan/test/test_vulkan_delegate.py | 9 +- 10 files changed, 448 insertions(+), 106 deletions(-) create mode 100644 backends/vulkan/runtime/graph/ops/glsl/concat_buffer.glsl create mode 100644 backends/vulkan/runtime/graph/ops/glsl/concat_buffer.yaml create mode 100644 backends/vulkan/runtime/graph/ops/glsl/concat_texture.glsl create mode 100644 backends/vulkan/runtime/graph/ops/glsl/concat_texture.yaml delete mode 100644 backends/vulkan/runtime/graph/ops/impl/Cat.cpp create mode 100644 backends/vulkan/runtime/graph/ops/impl/Concat.cpp diff --git a/backends/vulkan/op_registry.py b/backends/vulkan/op_registry.py index 90fea61318c..9333f34430e 100644 --- a/backends/vulkan/op_registry.py +++ b/backends/vulkan/op_registry.py @@ -538,8 +538,6 @@ def register_rotary_emb_op(features: OpFeatures): exir_ops.edge.aten.clone.default, exir_ops.edge.aten.permute.default, exir_ops.edge.aten.permute_copy.default, - exir_ops.edge.aten.select_copy.int, - exir_ops.edge.aten.slice_copy.Tensor, exir_ops.edge.aten.view_copy.default, ] ) @@ -551,6 +549,48 @@ def register_view_ops(features: OpFeatures): return features +# Fully featured transfer operators (i.e. operators that copy data from the input +# tensor(s) to the output tensor(s)), which have memory layout agnostic implementations +# for both texture and buffer storage types. +@update_features(exir_ops.edge.aten.cat.default) +def register_cat_op(features: OpFeatures): + features.texture_impl = TextureImplFeatures( + valid_packed_dims=all_packed_dims, + ) + features.buffer_impl = True + features.resize_fn = True + + def check_cat_node(node: torch.fx.Node) -> bool: + inputs = node.args[0] + if isinstance(inputs, (list, tuple)) and len(inputs) <= 3: + return True + + return False + + features.check_node_fn = check_cat_node + + return features + + +# Fully featured transfer operators (i.e. operators that copy data from the input +# tensor(s) to the output tensor(s)), which have memory layout agnostic implementations +# for both texture and buffer storage types. +@update_features( + [ + exir_ops.edge.aten.select_copy.int, + exir_ops.edge.aten.slice_copy.Tensor, + ] +) +def register_transfer_ops(features: OpFeatures): + features.texture_impl = TextureImplFeatures( + valid_packed_dims=all_packed_dims, + ) + features.buffer_impl = True + features.resize_fn = True + + return features + + # Ops ported from PyTorch Vulkan backend. These ops commonly support channels # packed tensors only and do not have a resize function. 
@update_features( @@ -588,7 +628,6 @@ def register_ported_op(features: OpFeatures): exir_ops.edge.aten.squeeze_copy.dims, exir_ops.edge.aten.unsqueeze_copy.default, # Tensor combination - exir_ops.edge.aten.cat.default, exir_ops.edge.aten.repeat.default, exir_ops.edge.aten.split_with_sizes_copy.default, exir_ops.edge.aten.split.Tensor, diff --git a/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.glsl new file mode 100644 index 00000000000..895cecb413a --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.glsl @@ -0,0 +1,69 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} +#define T ${buffer_scalar_type(DTYPE)} + +${define_active_storage_type("buffer")} +${define_required_extensions(DTYPE)} + +layout(std430) buffer; + +#include "indexing_utils.h" + +${layout_declare_tensor(B, "w", "t_out", DTYPE, "buffer")} + +$for i in range(NUM_INPUTS): + ${layout_declare_tensor(B, "r", "t_in" + str(i + 1), DTYPE, "buffer")} + +${layout_declare_ubo(B, "int", "concat_dim")} + +${layout_declare_ubo(B, "ivec4", "out_sizes")} +${layout_declare_ubo(B, "ivec4", "out_strides")} + +$for i in range(NUM_INPUTS): + ${layout_declare_ubo(B, "ivec4", "in" + str(i+1) + "_sizes")} + ${layout_declare_ubo(B, "ivec4", "in" + str(i+1) + "_strides")} + +${layout_declare_ubo(B, "int", "out_numel")} + +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} + +const lowp ivec4 out_dim_order = unhash_dim_order(out_layout); + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const int out_bufi = ivec3(gl_GlobalInvocationID).x; + if (out_bufi >= out_numel) { + return; + } + + // Convert buffer linear index to 4-D tensor index for output + const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); + + // Determine which input tensor to read from + ivec4 in_tidx = out_tidx; + + $for i in range(NUM_INPUTS): + // Check if the index at the concat dim is within bounds of the input tensor + // If so, read from that input tensor and write to output + if (in_tidx[concat_dim] < in${i+1}_sizes[concat_dim]) { + int in_bufi = tidx_to_bufi(in_tidx, in${i+1}_strides); + t_out[out_bufi] = t_in${i+1}[in_bufi]; + return; + } + // otherwise, decrement the index at the concat dim + else { + in_tidx[concat_dim] -= in${i+1}_sizes[concat_dim]; + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.yaml new file mode 100644 index 00000000000..39f96df5e90 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/concat_buffer.yaml @@ -0,0 +1,14 @@ +concat_buffer: + parameter_names_with_default_values: + DTYPE: float + NUM_INPUTS: 2 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: concat_1_buffer + NUM_INPUTS: 1 + - NAME: concat_2_buffer + - NAME: concat_3_buffer + NUM_INPUTS: 3 diff --git a/backends/vulkan/runtime/graph/ops/glsl/concat_texture.glsl b/backends/vulkan/runtime/graph/ops/glsl/concat_texture.glsl new file mode 100644 index 00000000000..dac6266bf67 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/concat_texture.glsl @@ -0,0 +1,129 @@ +/* + * Copyright (c) Meta Platforms, Inc. 
and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#version 450 core + +#define PRECISION ${PRECISION} + +#define VEC4_T ${texel_type(DTYPE)} +#define T ${buffer_scalar_type(DTYPE)} + +#define USING_TEXTURE3D + +layout(std430) buffer; + +#include "indexing_utils.h" + +${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")} + +$for i in range(NUM_INPUTS): + ${layout_declare_tensor(B, "r", "t_in" + str(i + 1), DTYPE, "texture3d")} + +${layout_declare_ubo(B, "int", "concat_dim")} + +$in_metadata = "" +$for i in range(NUM_INPUTS): + $in_metadata += "ivec4 in" + str(i + 1) + "_sizes;\n" + +layout(push_constant) uniform restrict Block { + ivec4 out_sizes; + ${in_metadata} +}; + +${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")} +const lowp ivec4 out_axis_map = unhash_axis_map(out_layout); +const lowp int out_packed_dim = unhash_packed_dim(out_layout); + +$for i in range(NUM_INPUTS): + ${layout_declare_spec_const(C, "int", "in" + str(i+1) + "_layout", "DEFAULT_LAYOUT")} + const lowp ivec4 in${i+1}_axis_map = unhash_axis_map(in${i+1}_layout); + const lowp int in${i+1}_packed_dim = unhash_packed_dim(in${i+1}_layout); + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// Check if we can use the fast path (no texel merging required) +bool can_use_fast_path() { + // Fast path is possible when: + // 1. The concat dimension is not the packed dimension, or + // 2. The concat dimension is the packed dimension but both input tensors have dimensions + // that are multiples of 4 along the packed dimension + if (concat_dim != out_packed_dim) { + return true; + } + + // Check if all input tensors have dimensions that are multiples of 4 along the packed dimension + bool all_concat_dim_size_multiple_of_4 = true; + $for i in range(NUM_INPUTS): + all_concat_dim_size_multiple_of_4 = + all_concat_dim_size_multiple_of_4 && + (in${i+1}_sizes[concat_dim] % 4 == 0); + + return all_concat_dim_size_multiple_of_4; +} + +void main() { + const ivec3 lpos = ivec3(gl_GlobalInvocationID); + ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim); + + if (any(greaterThanEqual(out_tidx, out_sizes))) { + return; + } + + if (can_use_fast_path()) { + // Fast path: No texel merging required + ivec4 in_tidx = out_tidx; + + $for i in range(NUM_INPUTS): + // For each input tensor, check if the tensor index is within bounds. If + // so, read the texel from the input tensor and write it to the output + if (in_tidx[concat_dim] < in${i+1}_sizes[concat_dim]) { + const ivec3 in_pos = tidx_to_pos(in_tidx, in${i+1}_sizes, in${i+1}_axis_map, in${i+1}_packed_dim); + const VEC4_T in_texel = load_texel(t_in${i+1}, in_pos); + write_texel_lpos(t_out, lpos, in_texel, out_axis_map); + return; + } + // Otherwise, adjust the index along the concat dimension and try the next + // input tensor. 
+ else { + in_tidx[concat_dim] -= in${i+1}_sizes[concat_dim]; + } + } + else { + // Slow path: Texel merging required + VEC4_T out_texel = VEC4_T(0); + + // Process each element in the output texel individually + for (int texel_i = 0; texel_i < 4; ++texel_i) { + ivec4 curr_out_tidx = out_tidx; + curr_out_tidx[out_packed_dim] += texel_i; + + // Skip if we're out of bounds + if (curr_out_tidx[out_packed_dim] >= out_sizes[out_packed_dim]) { + continue; + } + + ivec4 in_tidx = curr_out_tidx; + $for i in range(NUM_INPUTS): + // For each input tensor, check if the tensor index is within bounds. If + // so, read the corresponding texel element from the input tensor and + // write it to the output texel. + if (in_tidx[concat_dim] < in${i+1}_sizes[concat_dim]) { + const ivec4 in_posi = tidx_to_posi(in_tidx, in${i+1}_sizes, in${i+1}_axis_map, in${i+1}_packed_dim); + out_texel[texel_i] = load_texel(t_in${i+1}, in_posi.xyz)[in_posi.w]; + continue; + } + // Otherwise, adjust the index along the concat dimension and try the + // next input tensor. + else { + in_tidx[concat_dim] -= in${i+1}_sizes[concat_dim]; + } + } + + write_texel_lpos(t_out, lpos, out_texel, out_axis_map); + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/concat_texture.yaml b/backends/vulkan/runtime/graph/ops/glsl/concat_texture.yaml new file mode 100644 index 00000000000..ed5003382a1 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/concat_texture.yaml @@ -0,0 +1,14 @@ +concat_texture: + parameter_names_with_default_values: + DTYPE: float + NUM_INPUTS: 2 + generate_variant_forall: + DTYPE: + - VALUE: half + - VALUE: float + shader_variants: + - NAME: concat_1_texture3d + NUM_INPUTS: 1 + - NAME: concat_2_texture3d + - NAME: concat_3_texture3d + NUM_INPUTS: 3 diff --git a/backends/vulkan/runtime/graph/ops/impl/Cat.cpp b/backends/vulkan/runtime/graph/ops/impl/Cat.cpp deleted file mode 100644 index 25a0ff9a7f5..00000000000 --- a/backends/vulkan/runtime/graph/ops/impl/Cat.cpp +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - -#include -#include -#include -#include -#include - -namespace vkcompute { - -void add_cat_default_node( - ComputeGraph& graph, - ValueRef in_list_ref, - ValueRef dim_ref, - ValueRef out) { - ValueListPtr input_list = graph.get_value_list(in_list_ref); - int64_t dim = graph.extract_scalar(dim_ref); - vTensorPtr t_out = graph.get_tensor(out); - - const auto packed_dim = t_out->packed_dim(); - const auto packed_dim_index = static_cast(kWidth4D - packed_dim); - - DimIndex dim_index = normalize_to_dim_index(*t_out, dim); - // Index of dimension to be concatenated in (w, h, c * b) coordinate system - const auto dim_xyz_index = std::min(2, -dim_index - 1); - - if (dim_index > kWidth4D || dim_index < kBatch4D) { - VK_THROW("Unexpected value of dim_index=", dim_index); - } - - utils::ivec4 src_offset = utils::make_ivec4({0, 0, 0, 0}, false); - utils::ivec4 dst_offset = utils::make_ivec4({0, 0, 0, 0}, false); - - const bool is_concat_channel = (dim_index == kChannel4D); - - // if concatenating channels - if (is_concat_channel) { - // set destination offset w as channel size of the output tensor - dst_offset[3] = dim_at(t_out->sizes(), kChannel4D); - } - - for (ValueRef input_ref : *input_list) { - const vTensorPtr t_in = graph.get_tensor(input_ref); - const utils::ivec3 range = t_in->logical_limits(); - const auto in_channel_size = dim_at(t_in->sizes(), kChannel4D); - // if concatenating same dimension as the packed dimension - if (dim_index == packed_dim_index) { - // if concatenating channels, use add_copy_channel_offset_node function as - // add_copy_packed_dim_offset_node does not support channel packing - if (is_concat_channel) { - add_copy_channel_offset_node( - graph, - input_ref, - in_channel_size, - src_offset[2], - dst_offset[2], - out); - dst_offset[dim_xyz_index] += in_channel_size; - } else { - // src_offset[3] is not used now but will be used in the future when - // add_copy_packed_dim_offset_node will support channel packing - // - // set source offset w as channel size of the output tensor if - // concatenating channels - src_offset[3] = is_concat_channel ? in_channel_size : 0; - add_copy_packed_dim_offset_node( - graph, input_ref, range, src_offset, dst_offset, out); - dst_offset[dim_xyz_index] += dim_at(t_in->sizes(), packed_dim_index); - } - } else { - // set source offset w as channel size of the output tensor if - // concatenating channels - src_offset[3] = is_concat_channel ? in_channel_size : 0; - add_copy_offset_node( - graph, input_ref, range, src_offset, dst_offset, out, true, false); - dst_offset[dim_xyz_index] += - is_concat_channel ? in_channel_size : range[dim_xyz_index]; - } - } -} - -void cat_default(ComputeGraph& graph, const std::vector& args) { - add_cat_default_node(graph, args[0], args[1], args[2]); -} - -REGISTER_OPERATORS { - VK_REGISTER_OP(aten.cat.default, cat_default); -} - -} // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Concat.cpp b/backends/vulkan/runtime/graph/ops/impl/Concat.cpp new file mode 100644 index 00000000000..315dabdb1d5 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/impl/Concat.cpp @@ -0,0 +1,168 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include + +namespace vkcompute { + +std::vector get_concat_sizes( + ComputeGraph& graph, + const std::vector& in_value_refs, + const int64_t dim) { + // Get the sizes of the first input tensor as a starting point + std::vector new_out_sizes = graph.sizes_of(in_value_refs.at(0)); + + // Sum up the sizes along the concatenation dimension + for (size_t i = 1; i < in_value_refs.size(); ++i) { + const std::vector in_sizes = graph.sizes_of(in_value_refs.at(i)); + new_out_sizes.at(dim) += in_sizes.at(dim); + } + + return new_out_sizes; +} + +void resize_concat_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& extra_args) { + // Extract relevant ValueRefs + const ValueRef out_ref = args.at(0).refs.at(0); + const std::vector& in_value_refs = args.at(1).refs; + + int64_t dim = graph->extract_scalar(extra_args.at(0)); + + // Normalize dim if negative + const int64_t ndim = graph->dim_of(out_ref); + if (dim < 0) { + dim += ndim; + } + + // Calculate the new sizes + std::vector new_out_sizes = + get_concat_sizes(*graph, in_value_refs, dim); + + // Resize the output tensor + graph->virtual_resize(out_ref, new_out_sizes); +} + +void add_concat_node( + ComputeGraph& graph, + const ValueRef tensors_ref, + const ValueRef dim_ref, + const ValueRef out) { + std::vector in_value_refs; + + { + const ValueListPtr tensors = graph.get_value_list(tensors_ref); + + VK_CHECK_COND( + tensors->size() <= 3, + "Currently only concatenation of <= 3 tensors is supported"); + + for (const ValueRef in : *tensors) { + in_value_refs.push_back(in); + } + } + + const int64_t dim = graph.extract_scalar(dim_ref); + + const int64_t ndim = graph.dim_of(in_value_refs.at(0)); + int64_t normalized_dim = dim; + if (normalized_dim < 0) { + normalized_dim += ndim; + } + + const int64_t dim_whcn = nchw_dim_to_whcn_dim(normalized_dim, ndim); + const ValueRef dim_whcn_ref = graph.get_or_add_value_for_int(dim_whcn); + + vkapi::ParamsBindList param_buffers = { + graph.get_or_create_int_param_buffer(dim_whcn_ref, 0)}; + + std::vector push_constants; + vkapi::SpecVarList spec_vars; + + if (graph.is_buffer_storage(out)) { + param_buffers.append(graph.sizes_ubo(out)); + param_buffers.append(graph.strides_ubo(out)); + + for (const ValueRef in_ref : in_value_refs) { + param_buffers.append(graph.sizes_ubo(in_ref)); + param_buffers.append(graph.strides_ubo(in_ref)); + } + + param_buffers.append(graph.numel_ubo(out)); + + spec_vars = {graph.hashed_layout_of(out)}; + } else { + push_constants = {graph.sizes_pc_of(out)}; + + spec_vars = {graph.hashed_layout_of(out)}; + + for (const ValueRef in_ref : in_value_refs) { + push_constants.push_back(graph.sizes_pc_of(in_ref)); + spec_vars.append(graph.hashed_layout_of(in_ref)); + } + } + + std::string kernel_name = "concat"; + if (in_value_refs.size() == 1) { + kernel_name += "_1"; + } else if (in_value_refs.size() == 2) { + kernel_name += "_2"; + } else if (in_value_refs.size() == 3) { + kernel_name += "_3"; + } + if (graph.is_buffer_storage(out)) { + kernel_name += "_buffer"; + } else { + kernel_name += "_texture3d"; + } + + add_dtype_suffix(kernel_name, graph.dtype_of(out)); + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + VK_KERNEL_FROM_STR(kernel_name), + default_pick_global_wg_size, + default_pick_local_wg_size, + // Inputs and Outputs + {{out, vkapi::kWrite}, {in_value_refs, vkapi::kRead}}, + // Parameter buffers + param_buffers, + // Push Constants + push_constants, + // 
Specialization Constants + spec_vars, + // Resize Args + {dim_ref}, + // Resizing Logic + resize_concat_node)); +} + +void cat_tensor(ComputeGraph& graph, const std::vector& args) { + // Extract arguments + const ValueRef tensors_ref = args.at(0); + const ValueRef dim_ref = args.at(1); + const ValueRef out = args.at(2); + + // Add concat node + add_concat_node(graph, tensors_ref, dim_ref, out); +} + +REGISTER_OPERATORS { + VK_REGISTER_OP(aten.cat.default, cat_tensor); +} + +} // namespace vkcompute diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py index 4ea61cd7ef3..813807445f0 100644 --- a/backends/vulkan/test/op_tests/cases.py +++ b/backends/vulkan/test/op_tests/cases.py @@ -1196,9 +1196,12 @@ def get_cat_inputs(): ) test_suite.layouts = [ "utils::kWidthPacked", - "utils::kHeightPacked", "utils::kChannelsPacked", ] + test_suite.storage_types = [ + "utils::kTexture3D", + "utils::kBuffer", + ] test_suite.data_gen = "make_seq_tensor" test_suite.dtypes = ["at::kFloat"] return test_suite diff --git a/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py b/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py index ce6ab32ce60..4f0d2ff11ef 100644 --- a/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py +++ b/backends/vulkan/test/op_tests/utils/gen_correctness_vk.py @@ -29,6 +29,7 @@ class GeneratedOpsTest_{op_name} : public ::testing::TestWithParam< ::std::tuple void SetUp() override {{ GraphConfig config; + config.expect_dynamic_shapes = true; utils::StorageType default_storage_type; utils::GPUMemoryLayout default_memory_layout; std::tie(test_dtype, default_storage_type, default_memory_layout) = GetParam(); @@ -119,7 +120,7 @@ def gen_parameterization(self) -> str: return vkapi::kInt; case c10::kChar: return vkapi::kChar; - case c10::kBool: + case c10::kBool: return vkapi::kBool; default: VK_THROW("Unsupported at::ScalarType!"); diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py index dfd22198363..0096834f3c6 100644 --- a/backends/vulkan/test/test_vulkan_delegate.py +++ b/backends/vulkan/test/test_vulkan_delegate.py @@ -733,6 +733,10 @@ def forward(self, x): self.lower_module_and_test_output(model, sample_inputs) + @unittest.skip( + "Currently this test is failing due to weird partitioning because the eq scalar" + "operator is not supported yet. Re-enable when the operator is supported." + ) def test_vulkan_backend_partial_dynamic_shapes(self): class SimpleModel(torch.nn.Module): def __init__(self): @@ -1286,14 +1290,13 @@ class TestModule(torch.nn.Module): def __init__(self): super().__init__() - def forward(self, x, y, z, w): - return torch.cat([x, y, z, w], dim=1) + def forward(self, x, y, z): + return torch.cat([x, y, z], dim=1) sample_inputs = ( torch.randn(size=(3, 6, 2, 7), dtype=torch.float32), torch.randn(size=(3, 1, 2, 7), dtype=torch.float32), torch.randn(size=(3, 9, 2, 7), dtype=torch.float32), - torch.randn(size=(3, 3, 2, 7), dtype=torch.float32), ) self.lower_module_and_test_output(