Skip to content

Commit 425989f

Browse files
committed
Update on "[ET-VK] Replace Uniform buffers with push constants for view op"
This diff replaces uniform buffers with push constants for the view op in the Vulkan backend of ExecuTorch. The changes include updating the GLSL code to use push constants instead of uniform buffers, and updating the C++ code to pass the sizes as push constants to the shader. Differential Revision: [D66733658](https://our.internmc.facebook.com/intern/diff/D66733658/) [ghstack-poisoned]
2 parents 8d94305 + ad01ffa commit 425989f

File tree

21 files changed

+379
-103
lines changed

21 files changed

+379
-103
lines changed

.ci/scripts/setup-vulkan-linux-deps.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ install_swiftshader() {
2727

2828
install_vulkan_sdk() {
2929
VULKAN_SDK_VERSION=$1
30-
_vulkan_sdk_url="https://sdk.lunarg.com/sdk/download/${VULKAN_SDK_VERSION}/linux/vulkansdk-linux-x86_64-${VULKAN_SDK_VERSION}.tar.gz"
30+
_vulkan_sdk_url="https://sdk.lunarg.com/sdk/download/${VULKAN_SDK_VERSION}/linux/vulkansdk-linux-x86_64-${VULKAN_SDK_VERSION}.tar.xz"
3131

3232
_vulkan_sdk_dir=/tmp/vulkansdk
3333
mkdir -p $_vulkan_sdk_dir
@@ -37,12 +37,12 @@ install_vulkan_sdk() {
3737
curl --silent --show-error --location --fail --retry 3 \
3838
--output "${_tmp_archive}" "${_vulkan_sdk_url}"
3939

40-
tar -C "${_vulkan_sdk_dir}" -xzf "${_tmp_archive}"
40+
tar -C "${_vulkan_sdk_dir}" -xJf "${_tmp_archive}"
4141

4242
export PATH="${PATH}:${_vulkan_sdk_dir}/${VULKAN_SDK_VERSION}/x86_64/bin/"
4343
}
4444

45-
VULKAN_SDK_VERSION="1.2.198.1"
45+
VULKAN_SDK_VERSION="1.3.296.0"
4646

4747
install_swiftshader
4848
install_vulkan_sdk "${VULKAN_SDK_VERSION}"

backends/arm/quantizer/quantization_annotation/generic_annotator.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
torch.ops.aten.tile.default,
5454
torch.ops.aten.flip.default,
5555
torch.ops.aten.cat.default,
56+
torch.ops.aten.concatenate.default,
5657
torch.ops.aten.stack.default,
5758
torch.ops.aten.chunk.default,
5859
torch.ops.aten.contiguous.default,

backends/arm/test/quantizer/test_generic_annotater.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,3 +86,10 @@ def test_flip(self):
8686
self.check_annotation(
8787
SingleOpModel(torch.flip, (torch.randn(2, 4),), dims=(0, 1)),
8888
)
89+
90+
def test_concat(self):
91+
self.check_annotation(
92+
SingleOpModel(
93+
torch.concatenate, ((torch.randn(2, 3), torch.randn(2, 3)),), dim=0
94+
),
95+
)

backends/cadence/fusion_g3/operators/op_quantize.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -570,7 +570,7 @@ Tensor& quantize_per_tensor_out(
570570
err == torch::executor::Error::Ok,
571571
"Failed to resize out Tensor in quantize_per_tensor_out");
572572

573-
check_quantize_per_tensor_args(input, quant_min, quant_max, dtype, out);
573+
// check_quantize_per_tensor_args(input, quant_min, quant_max, dtype, out);
574574

575575
float scale_data = (float)scale;
576576
int zero_point_data = (int)zero_point;
@@ -696,7 +696,7 @@ Tensor& quantize_per_channel_out(
696696
zero_point.numel(),
697697
input.size(axis));
698698

699-
check_quantize_per_tensor_args(input, quant_min, quant_max, dtype, out);
699+
// check_quantize_per_tensor_args(input, quant_min, quant_max, dtype, out);
700700

701701
const double* scale_dt = scale.const_data_ptr<double>();
702702
const int64_t* zero_point_dt = zero_point.const_data_ptr<int64_t>();

backends/vulkan/runtime/api/Context.cpp

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,9 @@ void Context::register_shader_dispatch(
119119
const vkapi::DescriptorSet& descriptors,
120120
vkapi::PipelineBarrier& pipeline_barrier,
121121
const vkapi::ShaderInfo& shader_descriptor,
122-
const utils::uvec3& global_workgroup_size) {
122+
const utils::uvec3& global_workgroup_size,
123+
const void* push_constants_data,
124+
const uint32_t push_constants_size) {
123125
// Adjust the global workgroup size based on the output tile size
124126
uint32_t global_wg_w = utils::div_up(
125127
global_workgroup_size[0u], shader_descriptor.out_tile_size[0u]);
@@ -145,6 +147,15 @@ void Context::register_shader_dispatch(
145147
cmd_.bind_descriptors(descriptors.get_bind_handle());
146148
cmd_.insert_barrier(pipeline_barrier);
147149

150+
if (push_constants_size > 0 && push_constants_data != nullptr) {
151+
const VkDescriptorSetLayout shader_layout =
152+
shader_layout_cache().retrieve(shader_descriptor.kernel_layout);
153+
const VkPipelineLayout pipeline_layout =
154+
pipeline_layout_cache().retrieve(shader_layout);
155+
cmd_.set_push_constants(
156+
pipeline_layout, push_constants_data, push_constants_size);
157+
}
158+
148159
cmd_.dispatch(effective_global_wg);
149160
}
150161

backends/vulkan/runtime/api/Context.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,9 @@ class Context final {
200200
const vkapi::DescriptorSet&,
201201
vkapi::PipelineBarrier&,
202202
const vkapi::ShaderInfo&,
203-
const utils::uvec3&);
203+
const utils::uvec3&,
204+
const void* = nullptr,
205+
const uint32_t = 0);
204206

205207
void register_blit(
206208
vkapi::PipelineBarrier&,

backends/vulkan/runtime/api/containers/StagingBuffer.h

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,6 @@ class StagingBuffer final {
2323
private:
2424
Context* context_p_;
2525
vkapi::ScalarType dtype_;
26-
size_t numel_;
27-
size_t nbytes_;
2826
vkapi::VulkanBuffer vulkan_buffer_;
2927

3028
void* mapped_data_;
@@ -36,10 +34,8 @@ class StagingBuffer final {
3634
const size_t numel)
3735
: context_p_(context_p),
3836
dtype_(dtype),
39-
numel_(numel),
40-
nbytes_(element_size(dtype_) * numel_),
41-
vulkan_buffer_(
42-
context_p_->adapter_ptr()->vma().create_staging_buffer(nbytes_)),
37+
vulkan_buffer_(context_p_->adapter_ptr()->vma().create_staging_buffer(
38+
element_size(dtype_) * numel)),
4339
mapped_data_(nullptr) {}
4440

4541
StagingBuffer(const StagingBuffer&) = delete;
@@ -68,15 +64,15 @@ class StagingBuffer final {
6864
}
6965

7066
inline size_t numel() {
71-
return numel_;
67+
return nbytes() / element_size(dtype_);
7268
}
7369

7470
inline size_t nbytes() {
75-
return nbytes_;
71+
return vulkan_buffer_.mem_size();
7672
}
7773

7874
inline void copy_from(const void* src, const size_t nbytes) {
79-
VK_CHECK_COND(nbytes <= nbytes_);
75+
VK_CHECK_COND(nbytes <= this->nbytes());
8076
memcpy(data(), src, nbytes);
8177
vmaFlushAllocation(
8278
vulkan_buffer_.vma_allocator(),
@@ -86,7 +82,7 @@ class StagingBuffer final {
8682
}
8783

8884
inline void copy_to(void* dst, const size_t nbytes) {
89-
VK_CHECK_COND(nbytes <= nbytes_);
85+
VK_CHECK_COND(nbytes <= this->nbytes());
9086
vmaInvalidateAllocation(
9187
vulkan_buffer_.vma_allocator(),
9288
vulkan_buffer_.allocation(),
@@ -96,7 +92,7 @@ class StagingBuffer final {
9692
}
9793

9894
inline void set_staging_zeros() {
99-
memset(data(), 0, nbytes_);
95+
memset(data(), 0, nbytes());
10096
}
10197
};
10298

backends/vulkan/runtime/api/containers/Tensor.cpp

Lines changed: 69 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
*/
88

99
#include <executorch/backends/vulkan/runtime/api/containers/Tensor.h>
10+
#include <cstring>
1011

1112
namespace vkcompute {
1213
namespace api {
@@ -446,11 +447,10 @@ vTensor::vTensor(
446447
dim_order_(calculate_dim_order(sizes_.size(), packed_dim_)),
447448
axis_map_(default_axis_map()),
448449
strides_(calculate_strides(sizes, dim_order_)),
449-
numel_(utils::multiply_integers(sizes_)),
450450
padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)},
451-
unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
451+
unsqueezed_strides_{
452+
unsqueeze_strides(strides_, utils::multiply_integers(sizes_))},
452453
padded_numel_(utils::multiply_integers(padded_sizes_)),
453-
logical_limits_{{0, 0, 0}},
454454
uniforms_(),
455455
// Utility Uniform Buffers that can be passed to shaders as arguments
456456
uniforms_size_(0),
@@ -467,6 +467,11 @@ vTensor::vTensor(
467467
padded_sizes_,
468468
dtype_,
469469
allocate_memory) {
470+
uniform_data_ = std::make_shared<UniformData>(UniformData{
471+
sizes_,
472+
unsqueezed_strides_,
473+
{{0, 0, 0}},
474+
static_cast<size_t>(utils::multiply_integers(sizes_))});
470475
VK_CHECK_COND(
471476
dim_order_is_valid(dim_order_), "computed dim order is invalid");
472477

@@ -494,11 +499,9 @@ vTensor::vTensor(
494499
dim_order_(),
495500
axis_map_(default_axis_map()),
496501
strides_(),
497-
numel_(utils::multiply_integers(sizes_)),
498502
padded_sizes_(calculate_padded_sizes(sizes_, packed_dim_)),
499503
unsqueezed_strides_(),
500504
padded_numel_(utils::multiply_integers(padded_sizes_)),
501-
logical_limits_(),
502505
uniforms_(),
503506
// Utility Uniform Buffers that can be passed to shaders as arguments
504507
uniforms_size_(0),
@@ -508,6 +511,11 @@ vTensor::vTensor(
508511
logical_limits_uniform_offset_(kUniformOffsetUnset),
509512
// Construct Tensor storage
510513
storage_(context, image) {
514+
uniform_data_ = std::make_shared<UniformData>(UniformData{
515+
sizes_,
516+
{0, 0, 0, 0},
517+
{{0, 0, 0}},
518+
static_cast<size_t>(utils::multiply_integers(sizes_))});
511519
set_logical_limits(storage_.image_extents_);
512520
}
513521

@@ -519,13 +527,11 @@ vTensor::vTensor(vTensor& other)
519527
dim_order_(other.dim_order_.begin(), other.dim_order_.end()),
520528
axis_map_(other.axis_map_.begin(), other.axis_map_.end()),
521529
strides_(other.strides_.begin(), other.strides_.end()),
522-
numel_(other.numel_),
523530
padded_sizes_{other.padded_sizes_.begin(), other.padded_sizes_.end()},
524531
unsqueezed_strides_{
525532
other.unsqueezed_strides_.begin(),
526533
other.unsqueezed_strides_.end()},
527534
padded_numel_(other.padded_numel_),
528-
logical_limits_{other.logical_limits_},
529535
uniforms_(),
530536
// Empty initialize Utility Uniform Buffers
531537
uniforms_size_(0),
@@ -534,7 +540,9 @@ vTensor::vTensor(vTensor& other)
534540
numel_uniform_offset_(kUniformOffsetUnset),
535541
logical_limits_uniform_offset_(kUniformOffsetUnset),
536542
// Copy Tensor storage
537-
storage_(other.storage_) {}
543+
storage_(other.storage_) {
544+
uniform_data_ = std::make_shared<UniformData>(*other.get_uniform_data());
545+
}
538546

539547
vTensor::vTensor(
540548
vTensor& other,
@@ -548,11 +556,10 @@ vTensor::vTensor(
548556
dim_order_(dim_order.begin(), dim_order.end()),
549557
axis_map_(default_axis_map()),
550558
strides_(calculate_strides(sizes_, dim_order_)),
551-
numel_(utils::multiply_integers(sizes_)),
552559
padded_sizes_{calculate_padded_sizes(sizes, packed_dim_)},
553-
unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
560+
unsqueezed_strides_{
561+
unsqueeze_strides(strides_, utils::multiply_integers(sizes_))},
554562
padded_numel_(utils::multiply_integers(padded_sizes_)),
555-
logical_limits_(other.logical_limits_),
556563
uniforms_(),
557564
// Empty initialize Utility Uniform Buffers
558565
uniforms_size_(0),
@@ -562,14 +569,45 @@ vTensor::vTensor(
562569
logical_limits_uniform_offset_(kUniformOffsetUnset),
563570
// Copy Tensor storage
564571
storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) {
572+
uniform_data_ = std::make_shared<UniformData>(UniformData{
573+
sizes_,
574+
unsqueezed_strides_,
575+
{other.logical_limits()},
576+
static_cast<size_t>(utils::multiply_integers(sizes_))});
577+
565578
VK_CHECK_COND(
566579
dim_order_is_valid(dim_order_), "new dim order provided is invalid");
567580
VK_CHECK_COND(
568-
offset_numel + numel_ <= other.numel(),
581+
offset_numel + numel() <= other.numel(),
569582
"Tensor alias cannot access more elements than available in the original"
570583
"tensor");
571584
}
572585

586+
uint32_t vTensor::UniformData::write_attribute(
587+
void* dst,
588+
const uint32_t dst_offset,
589+
const uint32_t max_dst_size,
590+
const Attribute attr) {
591+
#define WRITE_ATTRIBUTE_CASE(enum_name, member_name) \
592+
case vTensor::Attribute::enum_name: { \
593+
VK_CHECK_COND( \
594+
(dst_offset + sizeof(member_name)) <= max_dst_size, \
595+
"Attempting to write tensor attribute outside data boundary."); \
596+
memcpy((uint8_t*)dst + dst_offset, &member_name, sizeof(member_name)); \
597+
return sizeof(member_name); \
598+
}
599+
switch (attr) {
600+
WRITE_ATTRIBUTE_CASE(SIZES, sizes_v);
601+
WRITE_ATTRIBUTE_CASE(STRIDES, strides_v);
602+
WRITE_ATTRIBUTE_CASE(LOGICAL_LIMITS, logical_limits);
603+
WRITE_ATTRIBUTE_CASE(NUMEL, numel);
604+
default:
605+
VK_THROW("Invalid Attribute");
606+
}
607+
#undef WRITE_ATTRIBUTE_CASE
608+
return 0;
609+
}
610+
573611
vkapi::VulkanImage& vTensor::image(
574612
vkapi::PipelineBarrier& pipeline_barrier,
575613
const vkapi::PipelineStageFlags stage) & {
@@ -601,9 +639,9 @@ vkapi::VulkanBuffer& vTensor::buffer(
601639
}
602640

603641
void vTensor::set_logical_limits(const utils::uvec3& image_extents) {
604-
logical_limits_.limits[0] = image_extents[axis_map_.at(0)];
605-
logical_limits_.limits[1] = image_extents[axis_map_.at(1)];
606-
logical_limits_.limits[2] = image_extents[axis_map_.at(2)];
642+
uniform_data_->logical_limits.limits[0] = image_extents[axis_map_.at(0)];
643+
uniform_data_->logical_limits.limits[1] = image_extents[axis_map_.at(1)];
644+
uniform_data_->logical_limits.limits[2] = image_extents[axis_map_.at(2)];
607645
}
608646

609647
utils::GPUMemoryLayout vTensor::estimate_memory_layout() const {
@@ -661,7 +699,7 @@ const vkapi::BufferBindInfo vTensor::logical_limits_ubo() {
661699
"Uniform data allocation has exceeded Tensor uniform buffer size");
662700
logical_limits_uniform_offset_ = uniforms_size_;
663701
uniforms_size_ += kSizePerUniform;
664-
uniforms_.update(logical_limits_, logical_limits_uniform_offset_);
702+
uniforms_.update(logical_limits(), logical_limits_uniform_offset_);
665703
}
666704
return vkapi::BufferBindInfo(
667705
uniforms_.buffer(), logical_limits_uniform_offset_);
@@ -677,7 +715,7 @@ const vkapi::BufferBindInfo vTensor::numel_ubo() {
677715
"Uniform data allocation has exceeded Tensor uniform buffer size");
678716
numel_uniform_offset_ = uniforms_size_;
679717
uniforms_size_ += kSizePerUniform;
680-
uniforms_.update(numel_, numel_uniform_offset_);
718+
uniforms_.update(numel(), numel_uniform_offset_);
681719
}
682720
return vkapi::BufferBindInfo(uniforms_.buffer(), numel_uniform_offset_);
683721
}
@@ -687,10 +725,10 @@ size_t vTensor::staging_buffer_numel() const {
687725
const bool int8_supported =
688726
storage_.context_->adapter_ptr()->has_full_int8_buffers_support();
689727
if (is_int8 && !int8_supported) {
690-
return utils::align_up_4(numel_);
728+
return utils::align_up_4(numel());
691729
}
692730
if (storage_type() == utils::kBuffer) {
693-
return numel_;
731+
return numel();
694732
}
695733
return padded_numel_;
696734
}
@@ -720,30 +758,32 @@ void vTensor::bind_allocation(const vkapi::Allocation& allocation) {
720758

721759
void vTensor::update_metadata() {
722760
strides_ = calculate_strides(sizes_, dim_order_);
723-
numel_ = utils::multiply_integers(sizes_);
761+
uniform_data_->numel = utils::multiply_integers(sizes_);
724762

725763
padded_sizes_ = calculate_padded_sizes(sizes_, packed_dim_);
726-
unsqueezed_strides_ = unsqueeze_strides(strides_, numel_);
764+
unsqueezed_strides_ = unsqueeze_strides(strides_, numel());
727765
padded_numel_ = utils::multiply_integers(padded_sizes_);
728766

767+
// Update uniform data if it has been modified
768+
uniform_data_->sizes_v = utils::make_whcn_ivec4(sizes_);
769+
uniform_data_->strides_v = utils::make_whcn_ivec4(unsqueezed_strides_);
770+
729771
// Calculate the image extents that would have been used to allocate a texture
730772
// withthe current sizes, and use that to set the logical limits.
731773
set_logical_limits(
732774
calculate_image_extents(padded_sizes_, axis_map_, packed_dim_));
733775

734776
if (sizes_uniform_offset_ != kUniformOffsetUnset) {
735-
uniforms_.update(utils::make_whcn_ivec4(sizes_), sizes_uniform_offset_);
777+
uniforms_.update(uniform_data_->sizes_v, sizes_uniform_offset_);
736778
}
737779
if (unsqueezed_strides_offset_ != kUniformOffsetUnset) {
738-
uniforms_.update(
739-
utils::make_whcn_ivec4(unsqueezed_strides_),
740-
unsqueezed_strides_offset_);
780+
uniforms_.update(uniform_data_->strides_v, unsqueezed_strides_offset_);
741781
}
742782
if (numel_uniform_offset_ != kUniformOffsetUnset) {
743-
uniforms_.update(numel_, numel_uniform_offset_);
783+
uniforms_.update(numel(), numel_uniform_offset_);
744784
}
745785
if (logical_limits_uniform_offset_ != kUniformOffsetUnset) {
746-
uniforms_.update(logical_limits_, logical_limits_uniform_offset_);
786+
uniforms_.update(logical_limits(), logical_limits_uniform_offset_);
747787
}
748788
}
749789

@@ -796,6 +836,8 @@ void vTensor::virtual_clone(const vTensor& other) {
796836
dim_order_ = other.dim_order_;
797837
axis_map_ = other.axis_map_;
798838
packed_dim_ = other.packed_dim_;
839+
840+
*uniform_data_ = *other.get_uniform_data();
799841
}
800842

801843
void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {

0 commit comments

Comments
 (0)