From 5d2cd16142208e84638808733f06ba0f33526ea8 Mon Sep 17 00:00:00 2001
From: ssjia
Date: Sat, 23 Aug 2025 10:05:37 -0400
Subject: [PATCH] [ET-VK] Introduce `BufferMetadata` GLSL struct to abstract tensor layout

Pull Request resolved: https://github.com/pytorch/executorch/pull/13595

As title; introduce a consolidated metadata UBO for buffer storage that can be
used to abstract tensor indexing operations for buffer-backed tensors. This new
metadata UBO is capable of representing tensors of up to 8 dimensions. This
upper limit is hardcoded, but it can be increased later without needing to
update callsites, since everything is abstracted by the BufferMetadata struct.

Update the following ops to use this new metadata UBO:

* staging shaders (nchw_to_buffer and buffer_to_nchw)
* binary op

An illustrative usage sketch of the new struct and helpers is appended after
the diff.

@imported-using-ghimport

Differential Revision: [D80800082](https://our.internmc.facebook.com/intern/diff/D80800082/)

ghstack-source-id: 305143836
---
 .../vulkan/runtime/api/containers/Tensor.cpp  |  50 +++++
 .../vulkan/runtime/api/containers/Tensor.h    |  29 +++
 backends/vulkan/runtime/graph/ComputeGraph.h  |   8 +
 .../runtime/graph/ops/glsl/binary_op.glsl     |  35 +--
 .../graph/ops/glsl/buffer_to_nchw.glsl        |  33 ++-
 .../graph/ops/glsl/buffer_to_nchw.yaml        |   2 -
 .../runtime/graph/ops/glsl/indexing.glslh     | 207 ++++++++++++++++++
 .../graph/ops/glsl/nchw_to_buffer.glsl        |  45 ++--
 .../graph/ops/glsl/nchw_to_buffer.yaml        |   2 -
 .../runtime/graph/ops/impl/BinaryOp.cpp       |  10 +-
 .../vulkan/runtime/graph/ops/impl/Staging.cpp |  35 +--
 .../runtime/graph/ops/utils/StagingUtils.cpp  |   6 -
 backends/vulkan/test/utils/test_utils.cpp     |  14 +-
 13 files changed, 374 insertions(+), 102 deletions(-)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/indexing.glslh

diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
index e9437e3bd09..fedb0d7f173 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.cpp
+++ b/backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -567,6 +567,7 @@ vTensor::vTensor(
       max_ubo_nbytes_{
           calculate_max_ubo_nbytes(min_nbytes_per_ubo_, storage_type)},
       uniforms_(),
+      buffer_meta_(),
       // Construct Tensor storage
       storage_(std::make_shared<vTensorStorage>(
           context,
@@ -611,6 +612,7 @@ vTensor::vTensor(
       max_ubo_nbytes_{
           calculate_max_ubo_nbytes(min_nbytes_per_ubo_, utils::kTexture3D)},
       uniforms_(),
+      buffer_meta_(),
       // Construct Tensor storage
       storage_(std::make_shared<vTensorStorage>(context, image)) {
   uniform_data_ = std::make_shared<UniformData>(UniformData{
@@ -634,6 +636,7 @@ vTensor::vTensor(vTensor& other)
       min_nbytes_per_ubo_{other.min_nbytes_per_ubo_},
       max_ubo_nbytes_{other.max_ubo_nbytes_},
       uniforms_(),
+      buffer_meta_(),
       // Copy Tensor storage
       storage_(other.storage_) {
   uniform_data_ = std::make_shared<UniformData>(*other.get_uniform_data());
@@ -659,6 +662,7 @@ vTensor::vTensor(
       min_nbytes_per_ubo_{other.min_nbytes_per_ubo_},
       max_ubo_nbytes_{other.max_ubo_nbytes_},
       uniforms_(),
+      buffer_meta_(),
       // Copy Tensor storage
       storage_(other.storage_) {
   uniform_data_ = std::make_shared<UniformData>(UniformData{
@@ -711,6 +715,38 @@ uint32_t vTensor::UniformData::write_attribute(
   return 0;
 }
 
+vTensor::BufferMetadata::BufferMetadata(
+    std::vector<int64_t>& src_sizes,
+    std::vector<int64_t>& src_dim_order,
+    std::vector<int64_t>& src_strides,
+    size_t src_numel) {
+  update(src_sizes, src_dim_order, src_strides, src_numel);
+}
+
+void vTensor::BufferMetadata::update(
+    std::vector<int64_t>& src_sizes,
+    std::vector<int64_t>& src_dim_order,
+    std::vector<int64_t>& src_strides,
+    size_t src_numel) {
+  int32_t fixed_ndim = utils::safe_downcast<int32_t>(kTensorDimLimit);
+
+  std::vector fu_sizes =
flip_and_unsqueeze( + src_sizes, kTensorSizes, src_numel, fixed_ndim); + std::vector fu_dim_order = flip_and_unsqueeze( + src_dim_order, kTensorDimOrder, src_numel, fixed_ndim); + std::vector fu_strides = flip_and_unsqueeze( + src_strides, kTensorStrides, src_numel, fixed_ndim); + + for (int i = 0; i < fixed_ndim; ++i) { + sizes[i] = fu_sizes.at(i); + dim_order[i] = fu_dim_order.at(i); + strides[i] = fu_strides.at(i); + } + + ndim = utils::safe_downcast(src_sizes.size()); + numel = utils::safe_downcast(src_numel); +} + vkapi::VulkanImage& vTensor::image( vkapi::PipelineBarrier& pipeline_barrier, const vkapi::PipelineStageFlags stage) & { @@ -799,6 +835,15 @@ const vkapi::BufferBindInfo vTensor::numel_ubo() { return metadata_ubo_impl(&numel_uniform_offset_, uniform_data_->numel); } +const vkapi::BufferBindInfo vTensor::buffer_meta_ubo() { + size_t ubo_nbytes = sizeof(BufferMetadata); + if (!buffer_meta_.buffer()) { + BufferMetadata data(sizes_, dim_order_, strides_, numel_); + buffer_meta_ = ParamsBuffer(storage_->context_, data); + } + return vkapi::BufferBindInfo(buffer_meta_.buffer(), 0, ubo_nbytes); +} + VkMemoryRequirements vTensor::get_memory_requirements() const { switch (storage_type()) { case utils::kBuffer: @@ -875,6 +920,11 @@ void vTensor::update_metadata() { uniforms_.update( uniform_data_->logical_limits.limits, logical_limits_uniform_offset_); } + + if (buffer_meta_.buffer()) { + BufferMetadata data(sizes_, dim_order_, strides_, numel_); + buffer_meta_.update(data); + } } void vTensor::check_sizes(const std::vector& sizes) const { diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h index fefbd2aa71a..eb0e09dbd81 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.h +++ b/backends/vulkan/runtime/api/containers/Tensor.h @@ -19,6 +19,8 @@ namespace vkcompute { namespace api { +static constexpr size_t kTensorDimLimit = 8; + /* * Given a GPUMemoryLayout value, produce a dim order vector that matches the * given memory layout. The produced dim order vector will be in the NCHW @@ -262,6 +264,26 @@ class vTensor final { const Attribute attr); }; + struct BufferMetadata { + uint32_t sizes[kTensorDimLimit]; + uint32_t dim_order[kTensorDimLimit]; + uint32_t strides[kTensorDimLimit]; + uint32_t ndim; + uint32_t numel; + + BufferMetadata( + std::vector& sizes, + std::vector& dim_order, + std::vector& strides, + size_t numel); + + void update( + std::vector& sizes, + std::vector& dim_order, + std::vector& strides, + size_t numel); + }; + private: /* * "Core" tensor metadata. 
They are the minimum amount of information required @@ -332,6 +354,11 @@ class vTensor final { */ ParamsBuffer uniforms_; + /* + * Used to store data for BufferMetadata to pass to shaders as buffer_meta_ubo + */ + ParamsBuffer buffer_meta_; + uint32_t uniforms_size_ = 0u; uint32_t sizes_uniform_offset_ = kUniformOffsetUnset; uint32_t dim_order_uniform_offset_ = kUniformOffsetUnset; @@ -557,6 +584,8 @@ class vTensor final { const vkapi::BufferBindInfo numel_ubo(); + const vkapi::BufferBindInfo buffer_meta_ubo(); + public: inline size_t staging_buffer_numel() const { return storage_->buffer_len(); diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 7686aa65025..4257f63fab6 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -357,6 +357,10 @@ class ComputeGraph final { return values_.at(idx).toConstTensor().has_buffer_storage(); } + inline bool is_texture_storage(const ValueRef idx) const { + return !is_buffer_storage(idx); + } + /* * Checks that the following is true: * 1. The value at `idx` is a tensor @@ -411,6 +415,10 @@ class ComputeGraph final { return values_.at(idx).toTensor().sizes_ubo(); } + inline vkapi::BufferBindInfo buffer_meta_ubo(const ValueRef idx) { + return values_.at(idx).toTensor().buffer_meta_ubo(); + } + inline vkapi::BufferBindInfo strides_ubo(const ValueRef idx) { return values_.at(idx).toTensor().strides_ubo(); } diff --git a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl index f2a9e9cfdac..6f2a93667ea 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/binary_op.glsl @@ -34,6 +34,8 @@ $if IS_COMPARISON_OP: layout(std430) buffer; +#include "indexing.glslh" + $if IS_COMPARISON_OP: ${layout_declare_tensor(B, "w", "t_out", "uint8", STORAGE)} $else: @@ -43,13 +45,11 @@ ${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)} ${layout_declare_tensor(B, "r", "t_other", DTYPE, STORAGE)} $if STORAGE == "buffer": + ${layout_declare_ubo(B, "BufferMetadata", "outp")} + ${layout_declare_ubo(B, "BufferMetadata", "inp")} + ${layout_declare_ubo(B, "BufferMetadata", "other")} + layout(push_constant) uniform restrict Block { - ivec4 in_sizes; - ivec4 other_sizes; - ivec4 out_strides; - ivec4 in_strides; - ivec4 other_strides; - int out_numel; float alpha; }; $else: @@ -83,25 +83,30 @@ $else: #ifdef USING_BUFFER void main() { - const int out_bufi = ivec3(gl_GlobalInvocationID).x; - if (out_bufi >= out_numel) { + const uint out_bufi = gl_GlobalInvocationID.x; + if (out_bufi >= numel(outp)) { return; } // Simple case; no broadcasting - if (in_sizes == other_sizes) { + if (are_equal(inp, other)) { t_out[out_bufi] = T(op(t_in[out_bufi], t_other[out_bufi], T(alpha))); return; } - const ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); - const ivec4 in_tidx = min(out_tidx, in_sizes - 1); - const ivec4 other_tidx = min(out_tidx, other_sizes - 1); + TensorIndex outp_tidx; + linear_idx_to_tensor_idx(outp, out_bufi, outp_tidx); + + TensorIndex inp_tidx = outp_tidx; + clamp_tensor_idx(inp, inp_tidx); + + TensorIndex other_tidx = outp_tidx; + clamp_tensor_idx(other, other_tidx); - const int in_bufi = tidx_to_bufi(in_tidx, in_strides); - const int other_bufi = tidx_to_bufi(other_tidx, other_strides); + uint inp_bufi = tensor_idx_to_linear_idx(inp, inp_tidx); + uint other_bufi = tensor_idx_to_linear_idx(other, other_tidx); - t_out[out_bufi] = 
T(op(t_in[in_bufi], t_other[other_bufi], T(alpha))); + t_out[out_bufi] = T(op(t_in[inp_bufi], t_other[other_bufi], T(alpha))); } #else // USING_TEXTURE diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl index 423c4df2679..6d164ae2645 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.glsl @@ -4,40 +4,33 @@ #define T ${buffer_scalar_type(DTYPE)} -#include "indexing_utils.h" - ${define_required_extensions(DTYPE)} layout(std430) buffer; -${layout_declare_tensor(0, "w", "nchw_buf", DTYPE, STORAGE)} -${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)} +#include "indexing.glslh" + +${layout_declare_tensor(B, "w", "nchw_buf", DTYPE, STORAGE)} +${layout_declare_tensor(B, "r", "t_inp", DTYPE, STORAGE)} -$if USE_PUSH_CONST: - layout(push_constant) uniform restrict Block { - ivec4 in_sizes; - ivec4 in_strides; - int numel; - }; -$else: - ${layout_declare_ubo(2, "ivec4", "in_sizes")} - ${layout_declare_ubo(3, "ivec4", "in_strides")} - ${layout_declare_ubo(4, "int", "numel")} +${layout_declare_ubo(B, "BufferMetadata", "inp")} layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; // This constant is unused in this shader but is kept so that the signature is // consistent with image_to_nchw. -layout(constant_id = 3) const int UNUSED_packed_dim = W_DIM; +${layout_declare_spec_const(C, "int", "unused", "0")} void main() { - int nchwi = int(gl_GlobalInvocationID.x); - if (nchwi >= numel) { + uint inp_bufi = gl_GlobalInvocationID.x; + if (inp_bufi>= numel(inp)) { return; } - ivec4 in_tidx = nchwi_to_tidx(nchwi, in_sizes); - const int in_bufi = tidx_to_bufi(in_tidx, in_strides); + TensorIndex inp_tidx; + linear_idx_to_tensor_idx(inp, inp_bufi, inp_tidx); + + uint nchwi = tensor_idx_to_contiguous_idx(inp, inp_tidx); - nchw_buf[nchwi] = t_in[in_bufi]; + nchw_buf[nchwi] = t_inp[inp_bufi]; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml index 679e686dc2f..929108cca5e 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/buffer_to_nchw.yaml @@ -19,5 +19,3 @@ buffer_to_nchw: - VALUE: int32 shader_variants: - NAME: buffer_to_nchw - - NAME: buffer_to_nchw_no_pc - USE_PUSH_CONST: False diff --git a/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh new file mode 100644 index 00000000000..7155b4616e3 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/indexing.glslh @@ -0,0 +1,207 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#ifndef INDEXING_GLSLH +#define INDEXING_GLSLH + +#define DIMLIMIT 8 +#define DIMLIMIT_DIV4 2 + +#define mul_4(x) ((x) << 2) +#define div_4(x) ((x) >> 2) + +#define mod_4(x) ((x) & 3) + +// +// BufferMetadata +// + +struct BufferMetadata { + uvec4 sizes[DIMLIMIT_DIV4]; + uvec4 dim_order[DIMLIMIT_DIV4]; + uvec4 strides[DIMLIMIT_DIV4]; + uvec2 ndim_numel; +}; + +uint ndim(const BufferMetadata meta) { + return meta.ndim_numel[0]; +} + +int int_ndim(const BufferMetadata meta) { + return int(meta.ndim_numel[0]); +} + +uint numel(const BufferMetadata meta) { + return meta.ndim_numel[1]; +} + +uint dim_order_at(const BufferMetadata meta, const int dim) { + return meta.dim_order[div_4(dim)][mod_4(dim)]; +} + +uint dim_order_at(const BufferMetadata meta, const uint dim) { + return meta.dim_order[div_4(dim)][mod_4(dim)]; +} + +uint stride_at(const BufferMetadata meta, const int dim) { + return meta.strides[div_4(dim)][mod_4(dim)]; +} + +uint stride_at(const BufferMetadata meta, const uint dim) { + return meta.strides[div_4(dim)][mod_4(dim)]; +} + +uint size_at(const BufferMetadata meta, const int dim) { + return meta.sizes[div_4(dim)][mod_4(dim)]; +} + +uint size_at(const BufferMetadata meta, const uint dim) { + return meta.sizes[div_4(dim)][mod_4(dim)]; +} + +bool are_equal(const BufferMetadata meta1, const BufferMetadata meta2) { + // sizes and strides must be the same to be considered equal + if (meta1.sizes[0] != meta2.sizes[0]) { + return false; + } + if (meta1.sizes[1] != meta2.sizes[1]) { + return false; + } + if (meta1.strides[0] != meta2.strides[0]) { + return false; + } + if (meta1.strides[1] != meta2.strides[1]) { + return false; + } + return true; +} + +// +// TensorIndex +// + +struct TensorIndex { + uvec4 data[DIMLIMIT_DIV4]; +}; + +void initialize(out TensorIndex tidx) { + tidx.data[0] = uvec4(0); + tidx.data[1] = uvec4(0); +} + +uint idx_at(const TensorIndex tidx, const int dim) { + return tidx.data[div_4(dim)][mod_4(dim)]; +} + +// +// Index Conversions +// + +void contiguous_idx_to_tensor_idx( + const BufferMetadata meta, + uint contiguous_idx, + out TensorIndex tidx) { + initialize(tidx); + int dim = int_ndim(meta); + int i = 0; + + uint contiguous_strides[DIMLIMIT]; + contiguous_strides[0] = 1; + for (int d = 1; d < DIMLIMIT; ++d) { + contiguous_strides[d] = size_at(meta, d - 1) * contiguous_strides[d - 1]; + } + + for (int d = max(dim - 1, 0); d >= 0; d--) { + uint dim_stride = contiguous_strides[d]; + + tidx.data[div_4(d)][mod_4(d)] = contiguous_idx / dim_stride; + contiguous_idx = contiguous_idx % dim_stride; + } +} + +uint tensor_idx_to_contiguous_idx( + const BufferMetadata meta, + const TensorIndex tidx) { + uint contiguous_strides[DIMLIMIT]; + contiguous_strides[0] = 1; + for (int d = 1; d < DIMLIMIT; ++d) { + contiguous_strides[d] = size_at(meta, d - 1) * contiguous_strides[d - 1]; + } + + uint contig_idx = 0; + for (int d = 0; d < ndim(meta); ++d) { + contig_idx += contiguous_strides[d] * idx_at(tidx, d); + } + return contig_idx; +} + +void linear_idx_to_tensor_idx( + const BufferMetadata meta, + uint linear_idx, + out TensorIndex tidx) { + initialize(tidx); + int dim = int_ndim(meta); + int i = 0; + for (int d = max(dim - 1, 0); d >= 0; d--) { + uint dim_idx = dim_order_at(meta, d); + uint dim_stride = stride_at(meta, dim_idx); + + tidx.data[div_4(dim_idx)][mod_4(dim_idx)] = linear_idx / dim_stride; + linear_idx = linear_idx % dim_stride; + } +} + +uint tensor_idx_to_linear_idx( + const BufferMetadata meta, + const TensorIndex tidx) { + uint lin_idx = 0; + for 
(int d = 0; d < ndim(meta); ++d) {
+    lin_idx += stride_at(meta, d) * idx_at(tidx, d);
+  }
+  return lin_idx;
+}
+
+void clamp_tensor_idx(const BufferMetadata meta, inout TensorIndex tidx) {
+  tidx.data[0] = min(tidx.data[0], meta.sizes[0] - 1);
+  tidx.data[1] = min(tidx.data[1], meta.sizes[1] - 1);
+}
+
+//
+// Debug utilities
+//
+
+#ifdef DEBUG_MODE
+
+void printTensorIndex(const TensorIndex tidx) {
+  debugPrintfEXT(
+      "TensorIndex: tidx=[%u %u %u %u %u %u %u %u]\\n",
+      tidx.data[0][0], tidx.data[0][1], tidx.data[0][2], tidx.data[0][3],
+      tidx.data[1][0], tidx.data[1][1], tidx.data[1][2], tidx.data[1][3]
+  );
+}
+
+void printBufferMetadata(const BufferMetadata meta) {
+  debugPrintfEXT(
+      "BufferMetadata: ndim=%u numel=%u\\n sizes=[%u %u %u %u %u %u %u %u]\\n dim_order=[%u %u %u %u %u %u %u %u]\\n strides=[%u %u %u %u %u %u %u %u]\\n",
+      meta.ndim_numel[0], meta.ndim_numel[1],
+      meta.sizes[0][0], meta.sizes[0][1], meta.sizes[0][2], meta.sizes[0][3],
+      meta.sizes[1][0], meta.sizes[1][1], meta.sizes[1][2], meta.sizes[1][3],
+      meta.dim_order[0][0], meta.dim_order[0][1],
+      meta.dim_order[0][2], meta.dim_order[0][3],
+      meta.dim_order[1][0], meta.dim_order[1][1],
+      meta.dim_order[1][2], meta.dim_order[1][3],
+      meta.strides[0][0], meta.strides[0][1],
+      meta.strides[0][2], meta.strides[0][3],
+      meta.strides[1][0], meta.strides[1][1],
+      meta.strides[1][2], meta.strides[1][3]
+  );
+}
+
+#endif
+
+#endif // INDEXING_GLSLH
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl
index 62cd0610ffb..074624dc37e 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.glsl
@@ -4,46 +4,45 @@
 
 #define T ${buffer_scalar_type(DTYPE)}
 
-#include "indexing_utils.h"
-
 ${define_required_extensions(DTYPE)}
 
 layout(std430) buffer;
 
-${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
+#include "indexing.glslh"
+
+${layout_declare_tensor(B, "w", "t_outp", DTYPE, STORAGE)}
 ${layout_declare_tensor(B, "r", "nchw_in", DTYPE, STORAGE)}
 
-$if USE_PUSH_CONST:
-  layout(push_constant) uniform restrict Block {
-    ivec4 out_sizes;
-    ivec4 out_strides;
-    int numel;
-  };
-$else:
-  ${layout_declare_ubo(B, "ivec4", "out_sizes")}
-  ${layout_declare_ubo(B, "ivec4", "out_strides")}
-  ${layout_declare_ubo(B, "int", "numel")}
+${layout_declare_ubo(B, "BufferMetadata", "outp")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_DIM_ORDER")}
-const lowp ivec4 out_dim_order = unhash_dim_order(out_layout);
+// This constant is unused in this shader but is kept so that the signature is
+// consistent with nchw_to_image.
+${layout_declare_spec_const(C, "int", "unused", "0")} ${layout_declare_spec_const(C, "int", "transpose_hw", "0")} void main() { - int out_bufi = int(gl_GlobalInvocationID.x); - if (out_bufi >= numel) { + const uint outp_bufi = int(gl_GlobalInvocationID.x); + if (outp_bufi >= numel(outp)) { return; } - ivec4 out_tidx = bufi_to_tidx(out_bufi, out_strides, out_dim_order); + TensorIndex outp_tidx; + uint nchwi; + + linear_idx_to_tensor_idx(outp, outp_bufi, outp_tidx); - ivec4 sizes = out_sizes; if (transpose_hw == 1) { - sizes.xy = sizes.yx; - out_tidx.xy = out_tidx.yx; + BufferMetadata transposed_meta = outp; + transposed_meta.sizes[0].xy = transposed_meta.sizes[0].yx; + outp_tidx.data[0].xy = outp_tidx.data[0].yx; + nchwi = tensor_idx_to_contiguous_idx(transposed_meta, outp_tidx); + } + // Normal case + else { + nchwi = tensor_idx_to_contiguous_idx(outp, outp_tidx); } - const int in_nchwi = tidx_to_nchwi(out_tidx, sizes); - t_out[out_bufi] = nchw_in[in_nchwi]; + t_outp[outp_bufi] = nchw_in[nchwi]; } diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml index 99e41a0ab6f..9d6c3aa76a9 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_buffer.yaml @@ -19,5 +19,3 @@ nchw_to_buffer: - VALUE: int32 shader_variants: - NAME: nchw_to_buffer - - NAME: nchw_to_buffer_no_pc - USE_PUSH_CONST: False diff --git a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp index 6e9baafd45f..025b483eab7 100644 --- a/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/BinaryOp.cpp @@ -139,15 +139,11 @@ void add_binary_op_buffer_node( // Inputs and Outputs {{out, vkapi::kWrite}, {{in1, in2}, vkapi::kRead}}, // Shader params buffers - {}, + {graph.buffer_meta_ubo(out), + graph.buffer_meta_ubo(in1), + graph.buffer_meta_ubo(in2)}, // Push Constants {{ - graph.sizes_pc_of(in1), - graph.sizes_pc_of(in2), - graph.strides_pc_of(out), - graph.strides_pc_of(in1), - graph.strides_pc_of(in2), - graph.numel_pc_of(out), PushConstantDataInfo(&alpha_val, sizeof(float)), }}, // Specialization Constants diff --git a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp index 5faeae3e21b..6cd5115563a 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Staging.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Staging.cpp @@ -29,13 +29,13 @@ void add_staging_to_tensor_node( vkapi::ShaderInfo shader = get_nchw_to_tensor_shader( graph, out_tensor, graph.int8_buffers_enabled()); - std::vector pcs; + vkapi::ParamsBindList param_buffers = {}; if (graph.is_buffer_storage(out_tensor)) { - pcs = { - graph.sizes_pc_of(out_tensor), - graph.strides_pc_of(out_tensor), - graph.numel_pc_of(out_tensor)}; - } else { + param_buffers.append(graph.buffer_meta_ubo(out_tensor)); + } + + std::vector pcs; + if (graph.is_texture_storage(out_tensor)) { pcs = {graph.sizes_pc_of(out_tensor)}; } @@ -47,7 +47,7 @@ void add_staging_to_tensor_node( // Input and Outputs {{out_tensor, vkapi::kWrite}, {in_staging, vkapi::kRead}}, // Parameter Buffers - {}, + param_buffers, // Push Constants pcs, // Specialization Constants @@ -113,13 +113,13 @@ void add_tensor_to_staging_node( vkapi::ShaderInfo shader = get_tensor_to_nchw_shader(graph, in_tensor, graph.int8_buffers_enabled()); - std::vector pcs; + vkapi::ParamsBindList param_buffers = {}; if 
(graph.is_buffer_storage(in_tensor)) { - pcs = { - graph.sizes_pc_of(in_tensor), - graph.strides_pc_of(in_tensor), - graph.numel_pc_of(in_tensor)}; - } else { + param_buffers.append(graph.buffer_meta_ubo(in_tensor)); + } + + std::vector pcs; + if (graph.is_texture_storage(in_tensor)) { pcs = {graph.sizes_pc_of(in_tensor)}; } @@ -135,7 +135,7 @@ void add_tensor_to_staging_node( // Input and Outputs {{out_staging, vkapi::kWrite}, {in_tensor, vkapi::kRead}}, // Parameter Buffers - {}, + param_buffers, // Push Constants pcs, // Specialization Constants @@ -154,6 +154,11 @@ void add_prepack_standard_node( vkapi::ShaderInfo shader = get_nchw_to_tensor_shader(graph, tensor, graph.int8_buffers_enabled()); + vkapi::ParamsBindList param_buffers = {}; + if (graph.is_buffer_storage(tensor)) { + param_buffers.append(graph.buffer_meta_ubo(tensor)); + } + std::vector pcs; if (graph.is_buffer_storage(tensor)) { pcs = { @@ -175,7 +180,7 @@ void add_prepack_standard_node( tensor_data, tensor, // Parameter Buffers - {}, + param_buffers, // Specialization Constants {graph.hashed_layout_of(tensor), transpose_hw_spec}, pcs)); diff --git a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp index 904b91965d6..c90bfa402bb 100644 --- a/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp +++ b/backends/vulkan/runtime/graph/ops/utils/StagingUtils.cpp @@ -44,9 +44,6 @@ vkapi::ShaderInfo get_nchw_to_tensor_shader( if (dst_storage_type == utils::kBuffer) { kernel_name = "nchw_to_buffer"; - if (!push_constant_variant) { - kernel_name += "_no_pc"; - } add_dtype_suffix(kernel_name, dst_dtype); return VK_KERNEL_FROM_STR(kernel_name); } @@ -85,9 +82,6 @@ vkapi::ShaderInfo get_tensor_to_nchw_shader( if (src_storage_type == utils::kBuffer) { kernel_name = "buffer_to_nchw"; - if (!push_constant_variant) { - kernel_name += "_no_pc"; - } add_dtype_suffix(kernel_name, src_dtype); return VK_KERNEL_FROM_STR(kernel_name); } diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp index c026c1364fa..07d28229221 100644 --- a/backends/vulkan/test/utils/test_utils.cpp +++ b/backends/vulkan/test/utils/test_utils.cpp @@ -43,9 +43,6 @@ vkapi::ShaderInfo get_nchw_to_tensor_shader( if (v_dst.storage_type() == utils::kBuffer) { kernel_name = "nchw_to_buffer"; - if (!push_constant_variant) { - kernel_name += "_no_pc"; - } add_dtype_suffix(kernel_name, v_dst.dtype()); return VK_KERNEL_FROM_STR(kernel_name); } @@ -80,9 +77,6 @@ vkapi::ShaderInfo get_tensor_to_nchw_shader( if (v_src.storage_type() == utils::kBuffer) { kernel_name = "buffer_to_nchw"; - if (!push_constant_variant) { - kernel_name += "_no_pc"; - } add_dtype_suffix(kernel_name, v_src.dtype()); return VK_KERNEL_FROM_STR(kernel_name); } @@ -120,9 +114,7 @@ void record_nchw_to_buffer_op( vkapi::PipelineStage::COMPUTE, vkapi::MemoryAccessType::WRITE), src_buffer, - v_dst.sizes_ubo(), - v_dst.strides_ubo(), - v_dst.numel_ubo()); + v_dst.buffer_meta_ubo()); } void record_buffer_to_nchw_op( @@ -140,9 +132,7 @@ void record_buffer_to_nchw_op( 0, dst_buffer, v_src.buffer(pipeline_barrier, vkapi::PipelineStage::COMPUTE), - v_src.sizes_ubo(), - v_src.strides_ubo(), - v_src.numel_ubo()); + v_src.buffer_meta_ubo()); } void record_nchw_to_image_op(
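
---

Appendix (not part of the diff above): the shader below is an illustrative sketch of how a buffer-storage op can consume the new `BufferMetadata` UBOs together with the helpers from `indexing.glslh`. It is hand-written rather than generated code: the explicit `layout(...)` declarations, binding indices, workgroup size, and the names `t_dst`/`t_src`/`DstMeta`/`SrcMeta` stand in for what the `layout_declare_tensor` / `layout_declare_ubo` codegen macros would normally emit, and the scale-by-two body is arbitrary.

```glsl
#version 450
#extension GL_GOOGLE_include_directive : require

#include "indexing.glslh"

layout(std430) buffer;

// Hypothetical bindings; real ET-VK shaders declare these via codegen macros.
layout(set = 0, binding = 0) writeonly buffer DstBuf { float t_dst[]; };
layout(set = 0, binding = 1) readonly buffer SrcBuf { float t_src[]; };

// One BufferMetadata UBO per buffer-backed tensor (sizes, dim order, strides,
// ndim and numel for up to 8 dims), bound from graph.buffer_meta_ubo(...).
layout(set = 0, binding = 2) uniform DstMeta { BufferMetadata dst; };
layout(set = 0, binding = 3) uniform SrcMeta { BufferMetadata src; };

layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;

void main() {
  const uint dst_bufi = gl_GlobalInvocationID.x;
  if (dst_bufi >= numel(dst)) {
    return;
  }

  // Fast path: identical sizes and strides, so the linear indices coincide.
  if (are_equal(dst, src)) {
    t_dst[dst_bufi] = 2.0 * t_src[dst_bufi];
    return;
  }

  // General path: map the destination's linear index to a logical
  // TensorIndex, then re-linearize it with the source's dim order and
  // strides. Nothing here depends on the tensors' dimensionality or layout.
  TensorIndex tidx;
  linear_idx_to_tensor_idx(dst, dst_bufi, tidx);
  const uint src_bufi = tensor_idx_to_linear_idx(src, tidx);

  t_dst[dst_bufi] = 2.0 * t_src[src_bufi];
}
```

Broadcasting ops additionally clamp the converted index against each input's sizes (`clamp_tensor_idx`), as `binary_op.glsl` does above. On the C++ side, an op only needs to bind one `graph.buffer_meta_ubo(tensor)` per buffer-backed tensor in the node's parameter buffers (see `BinaryOp.cpp` and `Staging.cpp`); the UBO contents are refreshed by `vTensor::update_metadata()` when the tensor is resized.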