
Commit 356277b

Update on "[ET-VK][ez] Add support for buffer backed qparams in int4 linear + add checks for physical limits when allocating"
## Context

Currently, the groupwise quantized int4 linear op implementation forces the scales and zeros tensor to be a `Texture3D`. However, for models such as transformers that have a logit linear layer, the image extents required may exceed the maximum image extents available on the device.

## Changes

* Add support for the scales and zeros tensor being a `Buffer` instead of a `Texture3D`.
* Add checks when allocating buffers or images for tensors to verify that the requested resource fits within the physical device limits.

Differential Revision: [D72662176](https://our.internmc.facebook.com/intern/diff/D72662176/)

[ghstack-poisoned]
2 parents 39e52d6 + da3c415 commit 356277b
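To make the sizing problem in the Context section concrete, the sketch below (not part of this commit) estimates the texel width that a width-packed qparams texture would need for a transformer logit linear layer and compares it against a typical 3D image limit. The vocabulary size, hidden dimension, group size, and device limit are all assumed values for illustration.

```cpp
// Illustration only: the dimensions and the device limit are assumptions,
// not values taken from this commit or from any particular device.
#include <cstdint>
#include <iostream>

int main() {
  const int64_t N = 128000;      // output features (vocab size), assumed
  const int64_t K = 4096;        // input features, assumed
  const int64_t group_size = 64; // quantization group size, assumed

  // Scales and zeros are stored per quantization group, giving a tensor of
  // roughly (K / group_size) x N values.
  const int64_t num_groups = K / group_size;

  // A width-packed texture holds 4 values per texel, so the texel width
  // needed along the N dimension is about N / 4.
  const int64_t required_width = N / 4;

  // maxImageDimension3D is frequently much smaller than maxImageDimension2D;
  // 2048 is used here purely as an assumed example value.
  const int64_t assumed_max_image_dim_3d = 2048;

  std::cout << "qparams rows (groups): " << num_groups
            << ", required texel width: " << required_width
            << ", assumed maxImageDimension3D: " << assumed_max_image_dim_3d
            << '\n';
  // 32000 far exceeds 2048, so a Buffer is the safer storage choice for the
  // qparams in this scenario.
  return 0;
}
```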

File tree

30 files changed: +145 -77 lines changed


backends/vulkan/runtime/api/containers/Tensor.cpp

Lines changed: 18 additions & 3 deletions
@@ -260,10 +260,25 @@ vkapi::VulkanImage allocate_image(
     return vkapi::VulkanImage();
   }
 
-  utils::uvec3 max_extents = adapter_ptr->max_texture_extents();
+  // TODO(ssjia): change to always check that the image extents do not exceed
+  // physical limits. Adding the check now based on `maxImageDimension3D` will
+  // cause some existing models to break. Anecdotally, on Adreno and
+  // SwiftShader devices, using 3D textures that exceed `maxImageDimension3D`
+  // appears to be ok. So we need to figure out if it is undefined behaviour
+  // or if there's a better way to figure out what the limit is. For now, only
+  // check during debug build so that we can detect when exceeding physical
+  // limits could be a potential cause for model outputs to be wrong. In the
+  // meantime, the threshold for using texture storage can be configured at
+  // export time.
+#ifdef VULKAN_DEBUG
+  uint32_t max_extent = storage_type == utils::kTexture3D
+      ? adapter_ptr->max_texture3d_dim()
+      : adapter_ptr->max_texture2d_dim();
+
   VK_CHECK_COND(
-      image_extents[0] <= max_extents[0] &&
-      image_extents[1] <= max_extents[1] && image_extents[2] <= max_extents[2]);
+      image_extents[0] <= max_extent && image_extents[1] <= max_extent &&
+      image_extents[2] <= max_extent);
+#endif
 
   VkSampler sampler = adapter_ptr->sampler_cache().retrieve(sampler_props);
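The `max_texture2d_dim()` / `max_texture3d_dim()` values checked above come from the Vulkan physical-device limits. As a minimal standalone sketch of where those numbers originate (plain Vulkan C++, independent of the ExecuTorch Adapter class; picking the first enumerated device is an arbitrary assumption):

```cpp
// Minimal sketch: query maxImageDimension2D / maxImageDimension3D from the
// first physical device. Error handling is mostly omitted for brevity.
#include <vulkan/vulkan.h>
#include <cstdio>
#include <vector>

int main() {
  VkApplicationInfo app_info{VK_STRUCTURE_TYPE_APPLICATION_INFO};
  app_info.apiVersion = VK_API_VERSION_1_1;

  VkInstanceCreateInfo instance_info{VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO};
  instance_info.pApplicationInfo = &app_info;

  VkInstance instance = VK_NULL_HANDLE;
  if (vkCreateInstance(&instance_info, nullptr, &instance) != VK_SUCCESS) {
    return 1;
  }

  uint32_t device_count = 0;
  vkEnumeratePhysicalDevices(instance, &device_count, nullptr);
  std::vector<VkPhysicalDevice> devices(device_count);
  vkEnumeratePhysicalDevices(instance, &device_count, devices.data());

  if (!devices.empty()) {
    VkPhysicalDeviceProperties props{};
    vkGetPhysicalDeviceProperties(devices[0], &props);
    std::printf(
        "maxImageDimension2D = %u, maxImageDimension3D = %u\n",
        props.limits.maxImageDimension2D,
        props.limits.maxImageDimension3D);
  }

  vkDestroyInstance(instance, nullptr);
  return 0;
}
```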

backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.glsl

Lines changed: 2 additions & 2 deletions
@@ -131,6 +131,6 @@ void main() {
     t_qmat2[packed_pos.y * stride + packed_pos.x] = out_tex_1;
     t_qmat2[(packed_pos.y + 1) * stride + packed_pos.x] = out_tex_2;
   $else:
-    imageStore(t_qmat2, ivec3(packed_pos.xy, 0), out_tex_1);
-    imageStore(t_qmat2, ivec3(packed_pos.x, packed_pos.y + 1, 0), out_tex_2);
+    imageStore(t_qmat2, packed_pos.xy, out_tex_1);
+    imageStore(t_qmat2, ivec2(packed_pos.x, packed_pos.y + 1), out_tex_2);
 }

backends/vulkan/runtime/graph/ops/glsl/pack_int4_linear_weight_transposed_interleaved.yaml

Lines changed: 6 additions & 4 deletions
@@ -6,8 +6,10 @@
 
 pack_int4_linear_weight_transposed_interleaved:
   parameter_names_with_default_values:
-    STORAGE: texture3d
+    STORAGE: texture2d
+  generate_variant_forall:
+    STORAGE:
+      - VALUE: texture2d
+      - VALUE: buffer
   shader_variants:
-    - NAME: pack_int4_linear_weight_transposed_interleaved_texture3d
-    - NAME: pack_int4_linear_weight_transposed_interleaved_buffer
-      STORAGE: buffer
+    - NAME: pack_int4_linear_weight_transposed_interleaved

backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl

Lines changed: 2 additions & 2 deletions
@@ -21,7 +21,7 @@ layout(std430) buffer;
 ${layout_declare_tensor(B, "w", "t_out", DTYPE, OUT_STORAGE, is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_mat1", DTYPE, IN_STORAGE, is_scalar_array=False)}
 ${layout_declare_tensor(B, "r", "t_qmat2", "uint8", WEIGHT_STORAGE, is_scalar_array=False)}
-${layout_declare_tensor(B, "r", "t_qparams", DTYPE, PARAMS_STORAGE, is_scalar_array=False)}
+${layout_declare_tensor(B, "r", "t_qparams", DTYPE, "buffer", is_scalar_array=False)}
 
 layout(push_constant) uniform restrict Block {
   ivec4 out_sizes;
@@ -111,7 +111,7 @@ void main() {
   $else:
     const uvec4 packed_weight_tex = texelFetch(
         t_qmat2,
-        ivec3(gl_GlobalInvocationID.x, k + comp, 0),
+        ivec2(gl_GlobalInvocationID.x, k + comp),
         0);
 
     const uvec4 weight_tex_1 = (packed_weight_tex & 0xF0) >> 4;

backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.yaml

Lines changed: 6 additions & 6 deletions
@@ -9,14 +9,14 @@ q_4w_linear:
     DTYPE: float
     OUT_STORAGE: texture3d
     IN_STORAGE: texture3d
-    WEIGHT_STORAGE: texture3d
-    PARAMS_STORAGE: texture3d
+    WEIGHT_STORAGE: texture2d
+    PARAMS_STORAGE: buffer
   shader_variants:
-    - NAME: q_4w_linear_texture3d_texture3d_texture3d_texture3d_float
-    - NAME: q_4w_linear_buffer_buffer_texture3d_texture3d_float
+    - NAME: q_4w_linear_texture3d_texture3d_texture2d_float
+    - NAME: q_4w_linear_buffer_buffer_texture2d_float
       OUT_STORAGE: buffer
       IN_STORAGE: buffer
-    - NAME: q_4w_linear_buffer_buffer_texture3d_buffer_float
+    - NAME: q_4w_linear_buffer_buffer_buffer_float
       OUT_STORAGE: buffer
       IN_STORAGE: buffer
-      PARAMS_STORAGE: buffer
+      WEIGHT_STORAGE: buffer

backends/vulkan/runtime/graph/ops/impl/QuantizedLinearGroupwiseInt4.cpp

Lines changed: 4 additions & 14 deletions
@@ -83,10 +83,9 @@ ValueRef prepack_int4_linear_weight_transposed_interleaved(
   const int64_t N = qmat2_orig_sizes.at(ndim - 2);
   const int64_t N_div2 = N / int64_t(2);
 
-  utils::StorageType storage_type = utils::kTexture3D;
-  utils::uvec3 max_extents =
-      graph.context()->adapter_ptr()->max_texture_extents();
-  if (N_div2 > max_extents[0] * 4 || K > max_extents[1]) {
+  utils::StorageType storage_type = utils::kTexture2D;
+  uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim();
+  if (N_div2 > max_extent * 4 || K > max_extent) {
     storage_type = utils::kBuffer;
   }
 
@@ -132,22 +131,13 @@ void add_q_4w_linear_node(
   ValueRef mat2 =
       prepack_int4_linear_weight_transposed_interleaved(graph, mat2_data);
 
-  utils::StorageType qparams_storage_type = utils::kTexture3D;
-  utils::uvec3 max_extents =
-      graph.context()->adapter_ptr()->max_texture_extents();
-  if (graph.size_at<uint32_t>(-2, scales_and_zeros_data) > max_extents[0] * 4 ||
-      graph.size_at<uint32_t>(-3, scales_and_zeros_data) > max_extents[2]) {
-    qparams_storage_type = utils::kBuffer;
-  }
-
   ValueRef scales_and_zeros = prepack_standard_hw_transposed(
-      graph, scales_and_zeros_data, qparams_storage_type, utils::kWidthPacked);
+      graph, scales_and_zeros_data, utils::kBuffer, utils::kWidthPacked);
 
   std::string kernel_name = "q_4w_linear";
   add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
   add_storage_type_suffix(kernel_name, graph.storage_type_of(mat1));
   add_storage_type_suffix(kernel_name, graph.storage_type_of(mat2));
-  add_storage_type_suffix(kernel_name, qparams_storage_type);
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
 
   const uint32_t group_size_val = graph.extract_scalar<uint32_t>(group_size);
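The `add_storage_type_suffix` / `add_dtype_suffix` calls above pick one of the variant names declared in `q_4w_linear.yaml` (for example `q_4w_linear_texture3d_texture3d_texture2d_float`). Below is a rough sketch of that naming convention, using a stand-in enum and helper rather than the actual ExecuTorch types, which may differ in detail:

```cpp
// Simplified illustration of assembling a shader variant name from storage
// types and dtype. The enum and helper are stand-ins, not ExecuTorch APIs.
#include <iostream>
#include <string>

enum class StorageType { kBuffer, kTexture2D, kTexture3D };

std::string storage_suffix(StorageType s) {
  switch (s) {
    case StorageType::kBuffer:
      return "_buffer";
    case StorageType::kTexture2D:
      return "_texture2d";
    case StorageType::kTexture3D:
      return "_texture3d";
  }
  return "";
}

int main() {
  std::string kernel_name = "q_4w_linear";
  kernel_name += storage_suffix(StorageType::kTexture3D); // output storage
  kernel_name += storage_suffix(StorageType::kTexture3D); // input storage
  kernel_name += storage_suffix(StorageType::kTexture2D); // weight storage
  kernel_name += "_float";                                // dtype suffix
  // With this change the qparams storage no longer contributes a suffix,
  // since the scales and zeros are always buffer-backed.
  std::cout << kernel_name << '\n';
  // Prints: q_4w_linear_texture3d_texture3d_texture2d_float
  return 0;
}
```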

backends/vulkan/runtime/vk_api/Adapter.h

Lines changed: 6 additions & 5 deletions
@@ -211,11 +211,12 @@ class Adapter final {
     return physical_device_.min_ubo_alignment;
   }
 
-  inline utils::uvec3 max_texture_extents() const {
-    return {
-        physical_device_.properties.limits.maxImageDimension1D,
-        physical_device_.properties.limits.maxImageDimension2D,
-        physical_device_.properties.limits.maxImageDimension3D};
+  inline uint32_t max_texture2d_dim() const {
+    return physical_device_.properties.limits.maxImageDimension2D;
+  }
+
+  inline uint32_t max_texture3d_dim() const {
+    return physical_device_.properties.limits.maxImageDimension3D;
   }
 
   inline uint32_t max_buffer_numel() const {

backends/xnnpack/test/ops/test_check_quant_params.py

Lines changed: 1 addition & 2 deletions
@@ -52,7 +52,7 @@ def _test_check_quant_message(self, ep_modifier, expected_message):
         torch._dynamo.reset()
         mod = torch.nn.Linear(10, 10)
         quantizer = XNNPACKQuantizer()
-        captured = export_for_training(mod, (torch.randn(1, 10),)).module()
+        captured = export_for_training(mod, (torch.randn(1, 10),), strict=True).module()
         quantizer.set_global(get_symmetric_quantization_config(is_per_channel=True))
         prepared = prepare_pt2e(captured, quantizer)
 
@@ -68,7 +68,6 @@ def _test_check_quant_message(self, ep_modifier, expected_message):
         self.assertEquals(str(context.exception), expected_message)
 
     def test_in_per_tensor_quant(self):
-
         for invalid_scale in [
             float("nan"),
             float("inf"),

docs/source/intro-how-it-works.md

Lines changed: 3 additions & 3 deletions
@@ -17,10 +17,10 @@ ExecuTorch provides the following benefits to engineers who need to deploy machi
 
 * **Export that is robust and powerful.** Export uses [`torch.export()`](https://pytorch.org/docs/main/export.html), which uses the same technology used in PyTorch 2.x to capture PyTorch programs for fast execution. While eager mode is flexible and allows experimentation in Python, it may not work well if Python isn't available or cannot deliver efficient execution. The _Export Intermediate Representation (Export IR)_ that export flow generates can describe a wide range of dynamism in PyTorch models, including control flow and dynamic shapes, which makes it a powerful tool for fully capturing existing PyTorch models with little effort.
 * **Operator standardization.** During the graph export process, the nodes in the graph represent operators such as addition, multiplication, or convolution. These operators are part of a small standardized list called the [Core ATen Op set](https://pytorch.org/docs/main/torch.compiler_ir.html#core-aten-ir). Most PyTorch programs can be decomposed into a graph using this small set of operators during export. Small list of standardized operators reduces the surface, needed to be covered, by third-party operator libraries as well as accelerator backends, in order to run models exported for ExecuTorch. ExecuTorch runtime ships with one such library, called portable operator library, that implements core ATen opset.
-* **Standardization for compiler interfaces (aka delegates) and the OSS ecosystem.** In addition to the _Operator standardization_ above, ExecuTorch has a standardized interface for delegation to compilers. This allows third-party vendors and compilers to implement interfaces and API entry points for compilation and execution of (either partial or full) graphs targeting their specialized hardware. This provides greater flexibility in terms of hardware support and performance optimization, as well as easier integration with the PyTorch open source ecosystem for on-device AI.
-* **First-party SDK and toolchain.** Due to the above standardization efforts, it was possible to build a unified first-party SDK for ExecuTorch, where developers can export, compile, and deploy to a wide range of target devices--such as iOS, Android, and microcontrollers--using the same SDK, streamlining the process and gaining productivity. Additionally, the SDK provides profiling and debugging functionality to easily inspect intermediate states, which are core parts of most developer workflows.
+* **Standardization for compiler interfaces (aka delegates) and the OSS ecosystem.** In addition to the _Operator standardization_ above, ExecuTorch has a [standardized interface](./compiler-delegate-and-partitioner.md) for delegation to compilers. This allows third-party vendors and compilers to implement interfaces and API entry points for compilation and execution of (either partial or full) graphs targeting their specialized hardware. This provides greater flexibility in terms of hardware support and performance optimization, as well as easier integration with the PyTorch open source ecosystem for on-device AI.
+* **First-party Developer Tools.** Due to the above standardization efforts, it was possible to build unified first-party [developer tools](./devtools-overview.md) for ExecuTorch, where developers can export, compile, and deploy to a wide range of target devices--such as iOS, Android, and microcontrollers--using the same APIs, streamlining the process and increasing productivity. Additionally, ExecuTorch provides profiling and debugging functionality to easily inspect intermediate states, which are core parts of most developer workflows.
 * **No intermediate conversions necessary.** ExecuTorch's main design principle is to allow developers to run their models on target devices without the need for converting to third-party intermediate representations. This eliminates a number of problems that on-device developers typically face when working with these conversion steps, such as lack of debuggability and profiling, the need to familiarize themselves with hardware-specific tools, and models not being able to run due to conversion steps failing.
-* **Ease of customization.** Developers can optimize their deployment for even better performance gains on the target architecture by applying custom techniques, such as linking with high-performance operator implementations or customizing memory planning based on storage and latency trade-offs. This level of customization is made possible through the standardization of the compiler pass interface and registration APIs on exported graphs.
+* **Ease of customization.** Developers can optimize their deployment for even better performance gains on the target architecture by applying custom techniques, such as [linking with high-performance operator implementations](./kernel-library-custom-aten-kernel.md) or [customizing memory planning](./compiler-memory-planning.md) based on storage and latency trade-offs. This level of customization is made possible through the standardization of the [compiler pass interface](./compiler-custom-compiler-passes.md) and registration APIs on exported graphs.
 * **Low overhead runtime and execution.** The ExecuTorch runtime, written in C++, is highly efficient and can run on a wide range of architectures, including Linux, iOS, Android, embedded systems, and bare metal hardware, with little additional setup or configuration. It is capable of linking in only those operators needed for the model, resulting in a minimal runtime binary size. It is also able to run at low latency because of ahead-of-time compilation and memory planning stages, with the runtime responsible only for execution (e.g., call operator `conv` and save the result in memory location X).
 
 The above highlights the key advantages of ExecuTorch across three main categories: portability, productivity, and performance. We consider it to be an ideal choice for enabling on-device AI across mobile and edge computing platforms.

examples/llm_manual/export_nanogpt.py

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@
 # The torch.no_grad() call tells PyTorch to exclude training-specific logic.
 with sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
     m = export_for_training(
-        model, example_inputs, dynamic_shapes=dynamic_shape
+        model, example_inputs, dynamic_shapes=dynamic_shape, strict=True
     ).module()
     traced_model = export(m, example_inputs, dynamic_shapes=dynamic_shape, strict=True)
 
