pytorch
diff --git a/‎CMakeLists.txt‎
Lines changed: 6 additions & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 2 additions & 0 deletions b/‎README.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/vulkan/runtime/api/containers/Tensor.cpp‎
Lines changed: 14 additions & 49 deletions b/‎backends/vulkan/runtime/api/containers/Tensor.cpp‎
Lines changed: 14 additions & 49 deletions
diff --git a/‎backends/vulkan/runtime/api/containers/Tensor.h‎
Lines changed: 22 additions & 37 deletions b/‎backends/vulkan/runtime/api/containers/Tensor.h‎
Lines changed: 22 additions & 37 deletions
diff --git a/‎backends/vulkan/runtime/graph/ComputeGraph.cpp‎
Lines changed: 1 addition & 1 deletion b/‎backends/vulkan/runtime/graph/ComputeGraph.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/vulkan/runtime/graph/ComputeGraph.h‎
Lines changed: 2 additions & 10 deletions b/‎backends/vulkan/runtime/graph/ComputeGraph.h‎
Lines changed: 2 additions & 10 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/glsl/activations.h‎
Lines changed: 1 addition & 1 deletion b/‎backends/vulkan/runtime/graph/ops/glsl/activations.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp‎
Lines changed: 1 addition & 1 deletion b/‎backends/vulkan/runtime/graph/ops/impl/BatchNorm.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/vulkan/runtime/graph/ops/impl/Cat.cpp‎
Lines changed: 3 additions & 3 deletions b/‎backends/vulkan/runtime/graph/ops/impl/Cat.cpp‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎backends/vulkan/runtime/graph/ops/impl/Clone.cpp‎
Lines changed: 1 addition & 1 deletion b/‎backends/vulkan/runtime/graph/ops/impl/Clone.cpp‎
Lines changed: 1 addition & 1 deletion
@@ -183,6 +183,8 @@ option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL "Build the Runner Util extension"
 
 option(EXECUTORCH_BUILD_EXTENSION_TENSOR "Build the Tensor extension" OFF)
 
+option(EXECUTORCH_BUILD_EXTENSION_TRAINING "Build the training extension" OFF)
+
 option(EXECUTORCH_BUILD_GTESTS "Build googletest based test binaries" OFF)
 
 option(EXECUTORCH_BUILD_MPS "Build the MPS backend" OFF)
@@ -636,6 +638,10 @@ if(EXECUTORCH_BUILD_EXTENSION_MODULE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
 endif()
 
+if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/training)
+endif()
+
 if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/runner_util)
 endif()
 
@@ -22,6 +22,8 @@ please visit our documentation website [for the latest release](https://pytorch.
 
 Check out the [Getting Started](https://pytorch.org/executorch/stable/getting-started-setup.html#quick-setup-colab-jupyter-notebook-prototype) page for a quick spin.
 
+Check out the examples of [Llama](./examples/models/llama2/README.md), [Llava](./examples/models/llava/README.md) and [other models](./examples/README.md) running on edge devices using ExecuTorch.
+
 ## Feedback
 
 We welcome any feedback, suggestions, and bug reports from the community to help
 
@@ -418,14 +418,12 @@ vTensor::vTensor(
       padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)},
       unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
       padded_numel_(utils::multiply_integers(padded_sizes_)),
-      texture_limits_{{0, 0, 0}},
       logical_limits_{{0, 0, 0}},
       // Utility Uniform Buffers that can be passed to shaders as arguments
       sizes_uniform_(),
       strides_uniform_(),
       numel_uniform_(),
       axis_map_uniform_(),
-      texture_limits_uniform_(),
       logical_limits_uniform_(),
       // Construct Tensor storage
       storage_(
@@ -440,12 +438,7 @@ vTensor::vTensor(
       dim_order_is_valid(dim_order_), "computed dim order is invalid");
 
   if (storage_type != utils::kBuffer) {
-    texture_limits_.limits = utils::ivec3{
-        utils::safe_downcast<int32_t>(storage_.image_extents_[0]),
-        utils::safe_downcast<int32_t>(storage_.image_extents_[1]),
-        utils::safe_downcast<int32_t>(storage_.image_extents_[2])};
-
-    update_logical_limits();
+    set_logical_limits(storage_.image_extents_);
   }
 
   if (dtype == vkapi::kHalf) {
@@ -470,14 +463,12 @@ vTensor::vTensor(const vTensor& other)
           other.unsqueezed_strides_.begin(),
           other.unsqueezed_strides_.end()},
       padded_numel_(other.padded_numel_),
-      texture_limits_{other.texture_limits_},
       logical_limits_{other.logical_limits_},
       // Empty initialize Utility Uniform Buffers
       sizes_uniform_(),
       strides_uniform_(),
       numel_uniform_(),
       axis_map_uniform_(),
-      texture_limits_uniform_(),
       logical_limits_uniform_(),
       // Copy Tensor storage
       storage_(other.storage_) {}
@@ -498,14 +489,12 @@ vTensor::vTensor(
       padded_sizes_{calculate_padded_sizes(sizes, memory_layout_)},
       unsqueezed_strides_{unsqueeze_strides(strides_, numel_)},
       padded_numel_(utils::multiply_integers(padded_sizes_)),
-      texture_limits_{other.texture_limits_},
       logical_limits_(other.logical_limits_),
       // Empty initialize Utility Uniform Buffers
       sizes_uniform_(),
       strides_uniform_(),
       numel_uniform_(),
       axis_map_uniform_(),
-      texture_limits_uniform_(),
       logical_limits_uniform_(),
       // Copy Tensor storage
       storage_(other.storage_, vkapi::element_size(dtype_) * offset_numel) {
@@ -547,18 +536,10 @@ vkapi::VulkanBuffer& vTensor::buffer(
   return storage_.buffer_;
 }
 
-void vTensor::update_logical_limits() {
-  logical_limits_.limits[0] = texture_limits_.limits[axis_map_.at(0)];
-  logical_limits_.limits[1] = texture_limits_.limits[axis_map_.at(1)];
-  logical_limits_.limits[2] = texture_limits_.limits[axis_map_.at(2)];
-}
-
-utils::uvec3 vTensor::logical_extents() const {
-  utils::uvec3 logical_extents(
-      {utils::safe_downcast<uint32_t>(logical_limits_.limits[0]),
-       utils::safe_downcast<uint32_t>(logical_limits_.limits[1]),
-       utils::safe_downcast<uint32_t>(logical_limits_.limits[2])});
-  return logical_extents;
+/*
+ * Sets the logical limits from the given image extents by permuting the
+ * extents through the axis map: element i of the logical limits is taken from
+ * the extent at index axis_map_.at(i).
+ *
+ * The extents are unsigned while the limits are stored as signed 32-bit
+ * integers, so each value goes through utils::safe_downcast (as the previous
+ * texture limits code did) rather than being narrowed implicitly.
+ */
+void vTensor::set_logical_limits(const utils::uvec3& image_extents) {
+  logical_limits_.limits[0] =
+      utils::safe_downcast<int32_t>(image_extents[axis_map_.at(0)]);
+  logical_limits_.limits[1] =
+      utils::safe_downcast<int32_t>(image_extents[axis_map_.at(1)]);
+  logical_limits_.limits[2] =
+      utils::safe_downcast<int32_t>(image_extents[axis_map_.at(2)]);
 }
 
 const vkapi::BufferBindInfo vTensor::sizes_ubo() {
@@ -585,13 +566,6 @@ const vkapi::BufferBindInfo vTensor::axis_map_ubo() {
   return vkapi::BufferBindInfo(axis_map_uniform_.buffer());
 }
 
-const vkapi::BufferBindInfo vTensor::texture_limits_ubo() {
-  if (!texture_limits_uniform_.buffer()) {
-    texture_limits_uniform_ = ParamsBuffer(storage_.context_, texture_limits_);
-  }
-  return vkapi::BufferBindInfo(texture_limits_uniform_.buffer());
-}
-
 const vkapi::BufferBindInfo vTensor::logical_limits_ubo() {
   if (!logical_limits_uniform_.buffer()) {
     logical_limits_uniform_ = ParamsBuffer(storage_.context_, logical_limits_);
@@ -655,18 +629,10 @@ void vTensor::update_metadata() {
   unsqueezed_strides_ = unsqueeze_strides(strides_, numel_);
   padded_numel_ = utils::multiply_integers(padded_sizes_);
 
-  // Calculate the extents of the image texture that would have been required
-  // for a tensor of the new sizes.
-  utils::uvec3 virtual_extents =
-      calculate_image_extents(padded_sizes_, axis_map_, memory_layout_);
-
-  // Update the texture limits to reflect the new virtual extents.
-  texture_limits_.limits = utils::ivec3{
-      utils::safe_downcast<int32_t>(virtual_extents[0]),
-      utils::safe_downcast<int32_t>(virtual_extents[1]),
-      utils::safe_downcast<int32_t>(virtual_extents[2])};
-
-  update_logical_limits();
+  // Calculate the image extents that would have been used to allocate a texture
+  // with the current sizes, and use that to set the logical limits.
+  set_logical_limits(
+      calculate_image_extents(padded_sizes_, axis_map_, memory_layout_));
 
   if (sizes_uniform_.buffer()) {
     sizes_uniform_.update(utils::make_whcn_ivec4(sizes_));
@@ -680,9 +646,6 @@ void vTensor::update_metadata() {
   if (axis_map_uniform_.buffer()) {
     axis_map_uniform_.update(utils::make_ivec4(axis_map_));
   }
-  if (texture_limits_uniform_.buffer()) {
-    texture_limits_uniform_.update(texture_limits_);
-  }
   if (logical_limits_uniform_.buffer()) {
     logical_limits_uniform_.update(logical_limits_);
   }
@@ -695,9 +658,11 @@ void vTensor::check_sizes(const std::vector<int64_t>& sizes) const {
     utils::uvec3 virtual_extents =
         calculate_image_extents(padded_sizes_, axis_map_, memory_layout_);
 
-    bool valid_resize = virtual_extents[0] <= image_extents()[0];
-    valid_resize = valid_resize && virtual_extents[1] <= image_extents()[1];
-    valid_resize = valid_resize && virtual_extents[2] <= image_extents()[2];
+    bool valid_resize = virtual_extents[0] <= storage_.image_extents_[0];
+    valid_resize =
+        valid_resize && virtual_extents[1] <= storage_.image_extents_[1];
+    valid_resize =
+        valid_resize && virtual_extents[2] <= storage_.image_extents_[2];
 
     VK_CHECK_COND(
         valid_resize,
 
@@ -276,9 +276,7 @@ class vTensor final {
   // Contains the number of elements in the tensor according to the padded
   // sizes.
   size_t padded_numel_;
-  // See the comments documenting image_extents() for more context.
-  TextureLimits texture_limits_;
-  // See the comments documenting logical_extents() for more context.
+  // See the comments documenting logical_limits() for more context.
   TextureLimits logical_limits_;
 
   /*
@@ -294,7 +292,6 @@ class vTensor final {
   ParamsBuffer strides_uniform_;
   ParamsBuffer numel_uniform_;
   ParamsBuffer axis_map_uniform_;
-  ParamsBuffer texture_limits_uniform_;
   ParamsBuffer logical_limits_uniform_;
 
   vTensorStorage storage_;
@@ -342,28 +339,30 @@ class vTensor final {
     return storage_.storage_type_ == utils::kBuffer;
   }
 
-  /*
-   * Returns the raw image extents of the underlying image texture used to store
-   * the tensor's data. Note that due to axis mapping, the X, Y, and Z extents
-   * may not correspond to the width, height, or channels dimension of the
-   * tensor.
-   */
-  inline const utils::uvec3& image_extents() const {
-    return storage_.image_extents_;
-  }
-
  private:
-  void update_logical_limits();
+  void set_logical_limits(const utils::uvec3& image_extents);
 
  public:
   /*
-   * Returns the image extents of the underlying image texture, but re-ordered
-   * such that the first element is the extent of the axis used to represent the
-   * tensor's width dimension, the second element is the extent of the axis used
-   * to represent the tensor's height dimension, and the third element is the
-   * extent of the axis used to represent the tensor's channels dimension.
+   * The logical limits of the tensor are derived from the image extents of the
+   * image texture used to store the tensor, but with two key differences.
+   *
+   * First, the image extents are permuted according to the axis map. This
+   * makes it so that the first element of the logical limit is the limit of the
+   * texture axis corresponding to the width dimension of the tensor, the next
+   * element is the limit of the texture axis corresponding to the height
+   * dimension and the last element is the limit of the texture axis that
+   * corresponds to the channels dimension of the tensor.
+   *
+   * Second, the logical limits may use smaller extents than the actual image
+   * extents of the image texture. This is due to dynamic shape; if the tensor's
+   * `virtual_resize()` function is called, then the logical limits will reflect
+   * the extents that would be needed to support a tensor with the updated sizes
+   * instead of the original sizes.
    */
-  utils::uvec3 logical_extents() const;
+  inline const utils::ivec3& logical_limits() const {
+    return logical_limits_.limits;
+  }
 
   /*
    * Extract an `vkapi::ScalarType` from the TensorOptions member
@@ -430,18 +429,8 @@ class vTensor final {
   const vkapi::BufferBindInfo axis_map_ubo();
 
   /*
-   * Returns a GPU buffer containing the virtual image extents of the tensor.
-   * Since a tensor can be resized with the virtual_resize() function, this
-   * GPU buffer contains the image extents of the tensor calculated using the
-   * virtual_resize() function. This allows shaders to exit early if they are
-   * working outside the limits of the texture.
-   */
-  const vkapi::BufferBindInfo texture_limits_ubo();
-
-  /*
-   * Returns a GPU buffer containing the logical image extents of the tensor.
-   * It contains the same data as texture_limits_ubo(), but with the data
-   * re-ordered. See the comments for logical_extents() for more context.
+   * Returns a GPU buffer containing the logical limits of the tensor. See the
+   * comments for logical_limits() for more context.
    */
   const vkapi::BufferBindInfo logical_limits_ubo();
 
@@ -450,10 +439,6 @@ class vTensor final {
    */
   const vkapi::BufferBindInfo numel_ubo();
 
-  inline const utils::ivec3 texture_limits() const {
-    return texture_limits_.limits;
-  }
-
   inline size_t numel() const {
     return numel_;
   }
 
@@ -428,7 +428,7 @@ utils::uvec3 ComputeGraph::create_global_wg_size(const ValueRef idx) {
   if (is_buffer_storage(idx)) {
     return {uint32_t(numel_of(idx)), 1u, 1u};
   }
-  return image_extents_of(idx);
+  return logical_limits_of(idx);
 }
 
 utils::uvec3 ComputeGraph::create_local_wg_size(
 
@@ -284,12 +284,8 @@ class ComputeGraph final {
 
   vkapi::ScalarType dtype_of(const ValueRef idx) const;
 
-  inline utils::uvec3 image_extents_of(const ValueRef idx) const {
-    return values_.at(idx).toConstTensor().image_extents();
-  }
-
-  inline utils::uvec3 logical_extents_of(const ValueRef idx) const {
-    return values_.at(idx).toConstTensor().logical_extents();
+  inline const utils::ivec3& logical_limits_of(const ValueRef idx) const {
+    return values_.at(idx).toConstTensor().logical_limits();
   }
 
   inline int32_t numel_of(const ValueRef idx) const {
@@ -335,10 +331,6 @@ class ComputeGraph final {
     return values_.at(idx).toTensor().axis_map_ubo();
   }
 
-  inline vkapi::BufferBindInfo texture_limits_ubo(const ValueRef idx) {
-    return values_.at(idx).toTensor().texture_limits_ubo();
-  }
-
   inline vkapi::BufferBindInfo logical_limits_ubo(const ValueRef idx) {
     return values_.at(idx).toTensor().logical_limits_ubo();
   }
 
@@ -18,7 +18,7 @@ float hardswish(float x) {
 
 vec4 hardswish(vec4 tex) {
   return vec4(
-      hardswish(tex.x), hardswish(tex.y), hardswish(tex.z), hardswish(tex.z));
+      hardswish(tex.x), hardswish(tex.y), hardswish(tex.z), hardswish(tex.w));
 }
 
 float hardshrink(float x, float lambda, float neg_lambda) {
 
@@ -88,7 +88,7 @@ void add_native_batch_norm_node(
       {{out_ref, vkapi::MemoryAccessType::WRITE},
        {{in_ref, arg_weight, arg_bias, arg_mean, arg_var},
         vkapi::MemoryAccessType::READ}},
-      {t_out->texture_limits_ubo(),
+      {t_out->logical_limits_ubo(),
        graph.create_params_buffer(epsilon),
        graph.create_params_buffer(num_texel_per_batch)}));
 }
 
@@ -40,7 +40,7 @@ void add_cat_default_node(
 
     for (ValueRef input_ref : *input_list) {
       vTensorPtr t_in = graph.get_tensor(input_ref);
-      utils::ivec3 range = t_in->texture_limits();
+      utils::ivec3 range = t_in->logical_limits();
       add_copy_offset_node(
           graph, input_ref, range, src_offset, dst_offset, out);
       dst_offset[0] += range[0];
@@ -52,7 +52,7 @@ void add_cat_default_node(
 
     for (ValueRef input_ref : *input_list) {
       vTensorPtr t_in = graph.get_tensor(input_ref);
-      utils::ivec3 range = t_in->texture_limits();
+      utils::ivec3 range = t_in->logical_limits();
       add_copy_offset_node(
           graph, input_ref, range, src_offset, dst_offset, out);
       dst_offset[1] += range[1];
@@ -63,7 +63,7 @@ void add_cat_default_node(
 
     for (ValueRef input_ref : *input_list) {
       vTensorPtr t_in = graph.get_tensor(input_ref);
-      utils::ivec3 range = t_in->texture_limits();
+      utils::ivec3 range = t_in->logical_limits();
       add_copy_offset_node(
           graph, input_ref, range, src_offset, dst_offset, out);
       dst_offset[2] += range[2];
 
@@ -32,7 +32,7 @@ void add_clone_node(
       graph.create_local_wg_size(out),
       {{out, vkapi::MemoryAccessType::WRITE},
        {in, vkapi::MemoryAccessType::READ}},
-      {t_out->texture_limits_ubo()}));
+      {t_out->logical_limits_ubo()}));
 }
 
 void clone(ComputeGraph& graph, const std::vector<ValueRef>& args) {
Original file line number	Diff line number	Diff line change
`@@ -428,7 +428,7 @@ utils::uvec3 ComputeGraph::create_global_wg_size(const ValueRef idx) {`
`428`	`428`	`if (is_buffer_storage(idx)) {`
`429`	`429`	`return {uint32_t(numel_of(idx)), 1u, 1u};`
`430`	`430`	`}`
`431`		`- return image_extents_of(idx);`
	`431`	`+ return logical_limits_of(idx);`
`432`	`432`	`}`
`433`	`433`
`434`	`434`	`utils::uvec3 ComputeGraph::create_local_wg_size(`
Original file line number	Diff line number	Diff line change
`@@ -18,7 +18,7 @@ float hardswish(float x) {`
`18`	`18`
`19`	`19`	`vec4 hardswish(vec4 tex) {`
`20`	`20`	`return vec4(`
`21`		`- hardswish(tex.x), hardswish(tex.y), hardswish(tex.z), hardswish(tex.z));`
	`21`	`+ hardswish(tex.x), hardswish(tex.y), hardswish(tex.z), hardswish(tex.w));`
`22`	`22`	`}`
`23`	`23`
`24`	`24`	`float hardshrink(float x, float lambda, float neg_lambda) {`
Original file line number	Diff line number	Diff line change
`@@ -88,7 +88,7 @@ void add_native_batch_norm_node(`
`88`	`88`	`{{out_ref, vkapi::MemoryAccessType::WRITE},`
`89`	`89`	`{{in_ref, arg_weight, arg_bias, arg_mean, arg_var},`
`90`	`90`	`vkapi::MemoryAccessType::READ}},`
`91`		`- {t_out->texture_limits_ubo(),`
	`91`	`+ {t_out->logical_limits_ubo(),`
`92`	`92`	`graph.create_params_buffer(epsilon),`
`93`	`93`	`graph.create_params_buffer(num_texel_per_batch)}));`
`94`	`94`	`}`
Original file line number	Diff line number	Diff line change
`@@ -32,7 +32,7 @@ void add_clone_node(`
`32`	`32`	`graph.create_local_wg_size(out),`
`33`	`33`	`{{out, vkapi::MemoryAccessType::WRITE},`
`34`	`34`	`{in, vkapi::MemoryAccessType::READ}},`
`35`		`- {t_out->texture_limits_ubo()}));`
	`35`	`+ {t_out->logical_limits_ubo()}));`
`36`	`36`	`}`
`37`	`37`
`38`	`38`	`void clone(ComputeGraph& graph, const std::vector<ValueRef>& args) {`