Commit d3c6a60

Merge branch 'main' into fold_batch_norm
2 parents: 7ee9bc2 + c5fea7e

File tree: 9 files changed, +133 additions, -111 deletions


backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py

Lines changed: 0 additions & 15 deletions
@@ -105,21 +105,6 @@ def fold_and_annotate_arg(
     for arg in arg_list:
         if not isinstance(arg, Node):
             return
-        """
-        Make sure arg has requires_grad set to False
-        For parameters that are not quantized, sometimes (i.e. convolution)
-        the Parameter(FakeTensor(...)) has requires_grad set to True, which
-        causes the retracing of the graph to fail with:
-
-        E       RuntimeError: isDifferentiableType(variable.scalar_type()) INTERNAL ASSERT FAILED at "/Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/functions/utils.h":74, please report a bug to PyTorch.
-        E
-        E       While executing %aten_convolution_default : [num_users=1] = call_function[target=executorch.exir.dialects.edge._ops.aten.convolution.default](args = (%quantized_decomposed_quantize_per_tensor_default, %b__frozen_param0, %p__param_constant1, [1, 1], [0, 0], [1, 1], False, [0, 0], 1), kwargs = {})
-        E       Original traceback:
-        E         File "/Users/perast01/src/executorch/backends/arm/test/ops/test_conv2d.py", line 110, in forward
-        E             x = conv(x)
-        """
-        if arg.op == "placeholder":
-            arg.meta["val"].requires_grad = False
 
         arg_quant_params = None
         if arg.target == dq_op:

backends/arm/test/ops/test_cat.py

Lines changed: 26 additions & 8 deletions
@@ -1,5 +1,5 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -33,6 +33,8 @@ class Cat(torch.nn.Module):
                 ),
                 -1,
             ),
+            ((torch.randn(1, 2, 4, 4), torch.randn(1, 2, 4, 1)), 3),
+            ((torch.randn(1, 2, 4, 4), torch.randn(1, 2, 4, 4)), 0),
            ((torch.randn(2, 2, 4, 4), torch.randn(2, 2, 4, 1)), 3),
            (
                (
@@ -47,8 +49,8 @@ class Cat(torch.nn.Module):
        def __init__(self):
            super().__init__()
 
-        def forward(self, tensors: tuple[torch.Tensor, ...], dim: int) -> torch.Tensor:
-            return torch.cat(tensors, dim=dim)
+        def forward(self, t: tuple[torch.Tensor, ...], dim: int) -> torch.Tensor:
+            return torch.cat(t, dim=dim)
 
    def _test_cat_tosa_MI_pipeline(
        self, module: torch.nn.Module, test_data: Tuple[tuple[torch.Tensor, ...], int]
@@ -134,22 +136,38 @@ def test_cat_tosa_BI(self, operands: tuple[torch.Tensor, ...], dim: int):
        test_data = (operands, dim)
        self._test_cat_tosa_BI_pipeline(self.Cat(), test_data)
 
-    # Mismatch in provided number of inputs and model signature, MLETORCH 519
-    @parameterized.expand(Cat.test_parameters)
+    @parameterized.expand(Cat.test_parameters[:-3])
     @pytest.mark.corstone_fvp
-    @conftest.expectedFailureOnFVP
    def test_cat_u55_BI(self, operands: tuple[torch.Tensor, ...], dim: int):
        test_data = (operands, dim)
        self._test_cat_ethosu_BI_pipeline(
            self.Cat(), common.get_u55_compile_spec(), test_data
        )
 
-    # Mismatch in provided number of inputs and model signature, MLETORCH 519
-    @parameterized.expand(Cat.test_parameters)
+    # MLETORCH-630 Cat does not work on FVP with batch>1
+    @parameterized.expand(Cat.test_parameters[-3:])
    @pytest.mark.corstone_fvp
    @conftest.expectedFailureOnFVP
+    def test_cat_u55_BI_xfails(self, operands: tuple[torch.Tensor, ...], dim: int):
+        test_data = (operands, dim)
+        self._test_cat_ethosu_BI_pipeline(
+            self.Cat(), common.get_u55_compile_spec(), test_data
+        )
+
+    @parameterized.expand(Cat.test_parameters[:-3])
+    @pytest.mark.corstone_fvp
    def test_cat_u85_BI(self, operands: tuple[torch.Tensor, ...], dim: int):
        test_data = (operands, dim)
        self._test_cat_ethosu_BI_pipeline(
            self.Cat(), common.get_u85_compile_spec(), test_data
        )
+
+    # MLETORCH-630 Cat does not work on FVP with batch>1
+    @parameterized.expand(Cat.test_parameters[-3:])
+    @pytest.mark.corstone_fvp
+    @conftest.expectedFailureOnFVP
+    def test_cat_u85_BI_xfails(self, operands: tuple[torch.Tensor, ...], dim: int):
+        test_data = (operands, dim)
+        self._test_cat_ethosu_BI_pipeline(
+            self.Cat(), common.get_u85_compile_spec(), test_data
+        )
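
The change above keeps the cases known to pass on the FVP and the known-failing batch>1 cases (MLETORCH-630) in separate parameter slices, so the suite stays green while the failures remain tracked as expected failures. A minimal sketch of the same slice-and-xfail idea, using plain pytest parametrization and xfail in place of parameterized.expand and conftest.expectedFailureOnFVP; the case names and the run_on_fvp helper are made up for illustration:

# Hypothetical sketch of the slice-and-xfail pattern used in test_cat.py above.
import pytest

# The last three entries stand in for the batch>1 cases that currently fail on the FVP.
test_parameters = ["batch1_a", "batch1_b", "batch2_a", "batch2_b", "batch2_c"]


def run_on_fvp(case: str) -> None:
    """Stand-in for the Corstone FVP pipeline; pretends batch>1 still fails."""
    if case.startswith("batch2"):
        raise RuntimeError("batch>1 not yet supported on FVP")


@pytest.mark.parametrize("case", test_parameters[:-3])
def test_cat_fvp(case):
    # These cases are expected to pass.
    run_on_fvp(case)


@pytest.mark.parametrize("case", test_parameters[-3:])
@pytest.mark.xfail(reason="MLETORCH-630: batch>1 not yet supported on FVP")
def test_cat_fvp_xfails(case):
    # Tracked as expected failures until the FVP limitation is resolved.
    run_on_fvp(case)

The same slicing convention is applied in test_expand.py below, with MLETORCH-629 as the tracking ticket.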

backends/arm/test/ops/test_expand.py

Lines changed: 25 additions & 9 deletions
@@ -37,15 +37,17 @@ class Expand(torch.nn.Module):
        test_parameters = [
            (torch.rand(1), (2,)),
            (torch.randn(1, 4), (1, -1)),
-            (torch.rand(1, 1, 2, 2), (4, 3, -1, 2)),
            (torch.randn(1), (2, 2, 4)),
-            (torch.rand(3, 2, 4, 1), (-1, -1, -1, 3)),
+            (torch.randn(1, 1, 1, 5), (1, 4, -1, -1)),
            (torch.randn(1, 1, 192), (1, -1, -1)),
+            (torch.randn(1, 1), (1, 2, 2, 4)),
+            (torch.randn(1, 1), (2, 2, 2, 4)),
            (torch.randn(10, 1, 1, 97), (-1, 4, -1, -1)),
+            (torch.rand(1, 1, 2, 2), (4, 3, -1, 2)),
        ]
 
-        def forward(self, x: torch.Tensor, multiples: Sequence):
-            return x.expand(multiples)
+        def forward(self, x: torch.Tensor, m: Sequence):
+            return x.expand(m)
 
    def _test_expand_tosa_MI_pipeline(self, module: torch.nn.Module, test_data: Tuple):
        (
@@ -113,20 +115,34 @@ def test_expand_tosa_MI(self, test_input, multiples):
    def test_expand_tosa_BI(self, test_input, multiples):
        self._test_expand_tosa_BI_pipeline(self.Expand(), (test_input, multiples))
 
-    # Mismatch in provided number of inputs and model signature, MLETORCH 519
-    @parameterized.expand(Expand.test_parameters)
+    @parameterized.expand(Expand.test_parameters[:-3])
    @pytest.mark.corstone_fvp
-    @conftest.expectedFailureOnFVP
    def test_expand_u55_BI(self, test_input, multiples):
        self._test_expand_ethosu_BI_pipeline(
            common.get_u55_compile_spec(), self.Expand(), (test_input, multiples)
        )
 
-    # Mismatch in provided number of inputs and model signature, MLETORCH 519
-    @parameterized.expand(Expand.test_parameters)
+    # MLETORCH-629: Expand does not work on FVP with batch>1
+    @parameterized.expand(Expand.test_parameters[-3:])
    @pytest.mark.corstone_fvp
    @conftest.expectedFailureOnFVP
+    def test_expand_u55_BI_xfails(self, test_input, multiples):
+        self._test_expand_ethosu_BI_pipeline(
+            common.get_u55_compile_spec(), self.Expand(), (test_input, multiples)
+        )
+
+    @parameterized.expand(Expand.test_parameters[:-3])
+    @pytest.mark.corstone_fvp
    def test_expand_u85_BI(self, test_input, multiples):
        self._test_expand_ethosu_BI_pipeline(
            common.get_u85_compile_spec(), self.Expand(), (test_input, multiples)
        )
+
+    # MLETORCH-629: Expand does not work on FVP with batch>1
+    @parameterized.expand(Expand.test_parameters[-3:])
+    @pytest.mark.corstone_fvp
+    @conftest.expectedFailureOnFVP
+    def test_expand_u85_BI_xfails(self, test_input, multiples):
+        self._test_expand_ethosu_BI_pipeline(
+            common.get_u85_compile_spec(), self.Expand(), (test_input, multiples)
+        )

backends/arm/test/ops/test_full.py

Lines changed: 0 additions & 4 deletions
@@ -143,20 +143,16 @@ def test_full_tosa_MI(self, test_tensor: Tuple):
    def test_full_tosa_BI(self, test_tensor: Tuple):
        self._test_full_tosa_BI_pipeline(self.AddVariableFull(), test_tensor)
 
-    # Mismatch in provided number of inputs and model signature, MLETORCH 519
    @parameterized.expand(AddVariableFull.test_parameters)
    @pytest.mark.corstone_fvp
-    @conftest.expectedFailureOnFVP
    def test_full_u55_BI(self, test_tensor: Tuple):
        self._test_full_tosa_u55_pipeline(
            self.AddVariableFull(),
            test_tensor,
        )
 
-    # Mismatch in provided number of inputs and model signature, MLETORCH 519
    @parameterized.expand(AddVariableFull.test_parameters)
    @pytest.mark.corstone_fvp
-    @conftest.expectedFailureOnFVP
    def test_full_u85_BI(self, test_tensor: Tuple):
        self._test_full_tosa_u85_pipeline(
            self.AddVariableFull(),

backends/arm/test/runner_utils.py

Lines changed: 8 additions & 14 deletions
@@ -65,16 +65,7 @@ def get_input_names(program: ExportedProgram) -> list[str]:
    Returns:
        A list of strings with the names of the model input.
    """
-    input_names = []
-
-    # E.g. bias and weights are 'placeholders' as well. This is used to
-    # get only the use inputs.
-    usr_inputs = program.graph_signature.user_inputs
-    for node in program.graph.nodes:
-        if node.op == "placeholder" and node.name in usr_inputs:
-            input_names.append(node.name)
-
-    return input_names
+    return [spec.arg.name for spec in program.graph_signature.input_specs]
 
 
 def get_input_quantization_params(
@@ -334,13 +325,16 @@ def run_corstone(
 
 
 def prep_data_for_save(
-    data: torch.Tensor,
+    data,
    input_name: str,
    quant_param: Optional[QuantizationParams] = None,
 ):
-    data_np = np.array(data.detach(), order="C").astype(
-        torch_to_numpy_dtype_dict[data.dtype]
-    )
+    if isinstance(data, torch.Tensor):
+        data_np = np.array(data.detach(), order="C").astype(
+            torch_to_numpy_dtype_dict[data.dtype]
+        )
+    else:
+        data_np = np.array(data)
    if quant_param is not None:
        assert quant_param.node_name in input_name, (
            f"The quantization params name '{quant_param.node_name}' does not "

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl

Lines changed: 15 additions & 6 deletions
@@ -32,11 +32,20 @@ ${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")}
 ${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")}
 ${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")}
 ${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")}
-${layout_declare_ubo(4, "ivec3", "out_limits")}
-${layout_declare_ubo(5, "ivec4", "in_sizes")}
-${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")}
-${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")}
-${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
+
+layout(push_constant) uniform restrict Block {
+  ivec4 out_limits;
+  ivec4 in_sizes;
+  ivec2 kernel_size;
+  ivec2 stride;
+  ivec2 padding;
+  ivec2 dilation;
+  ivec2 overlay_region;
+  int in_group_size;
+  int dummy_padding;
+  float out_min;
+  float out_max;
+};
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
@@ -127,7 +136,7 @@ void main() {
  const ivec3 out_pos = pos_shared[offset_pos_index(gl_LocalInvocationIndex)];
  for (int y = 0; y < BATCH_SIZE_Y; y++) {
    for (int x = 0; x < BATCH_SIZE_X; x++) {
-      if (any(greaterThanEqual(ivec3(out_pos.x + x, out_pos.y + y, out_pos.z), out_limits))) {
+      if (any(greaterThanEqual(ivec3(out_pos.x + x, out_pos.y + y, out_pos.z), out_limits.xyz))) {
        continue;
      }
      imageStore(t_out, ivec3(out_pos.x + x, out_pos.y + y, out_pos.z), op(sum[y][x], out_min, out_max));

backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl

Lines changed: 14 additions & 5 deletions
@@ -24,11 +24,20 @@ ${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")}
 ${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")}
 ${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")}
 ${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")}
-${layout_declare_ubo(4, "ivec3", "out_limits")}
-${layout_declare_ubo(5, "ivec4", "in_sizes")}
-${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")}
-${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")}
-${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
+
+layout(push_constant) uniform restrict Block {
+  ivec4 out_limits;
+  ivec4 in_sizes;
+  ivec2 kernel_size;
+  ivec2 stride;
+  ivec2 padding;
+  ivec2 dilation;
+  ivec2 overlay_region;
+  int in_group_size;
+  int dummy_padding;
+  float out_min;
+  float out_max;
+};
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

backends/vulkan/runtime/graph/ops/impl/Convolution.cpp

Lines changed: 38 additions & 48 deletions
@@ -407,7 +407,9 @@ void add_conv2d_node(
    wg_size = {wg_size[0] * wg_size[1] * wg_size[2], 1, 1};
  }
 
-  if (method == Conv2dMethod::Pointwise) {
+  vkapi::ParamsBindList param_buffers;
+  std::vector<PushConstantDataInfo> push_constants;
+  if (method == Conv2dMethod::Pointwise || method == Conv2dMethod::Depthwise) {
    const utils::ivec4 kernel_param_size_stride = {
        kernel_params.kernel_size[0],
        kernel_params.kernel_size[1],
@@ -420,55 +422,43 @@ void add_conv2d_node(
        kernel_params.dilation[0],
        kernel_params.dilation[1]};
 
-    graph.execute_nodes().emplace_back(new DispatchNode(
-        graph,
-        shader,
-        wg_size,
-        graph.create_local_wg_size(wg_size),
-        // Inputs and Outputs
-        {{out, vkapi::MemoryAccessType::WRITE},
-         {{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
-        // Shader params buffers
-        {},
-        // Specialization Constants
-        {},
-        // Resizing Logic
-        resize_conv2d_node,
-        {weight_data, stride, padding, dilation, transposed, output_padding},
-        {
-            graph.logical_limits_pc_of(out),
-            graph.sizes_pc_of(in),
-            PushConstantDataInfo(
-                &kernel_param_size_stride, sizeof(kernel_param_size_stride)),
-            PushConstantDataInfo(
-                &kernel_param_pad_dial, sizeof(kernel_param_pad_dial)),
-            PushConstantDataInfo(
-                &extra_params, sizeof(extra_params), sizeof(utils::ivec4)),
-            PushConstantDataInfo(&out_params, sizeof(out_params)),
-        }));
+    push_constants = {
+        graph.logical_limits_pc_of(out),
+        graph.sizes_pc_of(in),
+        PushConstantDataInfo(
+            &kernel_param_size_stride, sizeof(kernel_param_size_stride)),
+        PushConstantDataInfo(
+            &kernel_param_pad_dial, sizeof(kernel_param_pad_dial)),
+        PushConstantDataInfo(
+            &extra_params, sizeof(extra_params), sizeof(utils::ivec4)),
+        PushConstantDataInfo(&out_params, sizeof(out_params)),
+    };
  } else {
-    graph.execute_nodes().emplace_back(new DispatchNode(
-        graph,
-        shader,
-        wg_size,
-        graph.create_local_wg_size(wg_size),
-        // Inputs and Outputs
-        {{out, vkapi::MemoryAccessType::WRITE},
-         {{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
-        // Shader params buffers
-        {
-            t_out->logical_limits_ubo(),
-            t_in->sizes_ubo(),
-            graph.create_params_buffer(kernel_params),
-            graph.create_params_buffer(extra_params),
-            graph.create_params_buffer(out_params),
-        },
-        // Specialization Constants
-        {},
-        // Resizing Logic
-        resize_conv2d_node,
-        {weight_data, stride, padding, dilation, transposed, output_padding}));
+    param_buffers = {
+        t_out->logical_limits_ubo(),
+        t_in->sizes_ubo(),
+        graph.create_params_buffer(kernel_params),
+        graph.create_params_buffer(extra_params),
+        graph.create_params_buffer(out_params),
+    };
  }
+
+  graph.execute_nodes().emplace_back(new DispatchNode(
+      graph,
+      shader,
+      wg_size,
+      graph.create_local_wg_size(wg_size),
+      // Inputs and Outputs
+      {{out, vkapi::MemoryAccessType::WRITE},
+       {{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
+      // Shader params buffers
+      param_buffers,
+      // Specialization Constants
+      {},
+      // Resizing Logic
+      resize_conv2d_node,
+      {weight_data, stride, padding, dilation, transposed, output_padding},
+      push_constants));
 }
 
 void add_conv1d_node(

backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp

Lines changed: 7 additions & 2 deletions
@@ -73,13 +73,18 @@ void add_q_8w_linear_node(
  auto viewFn = VK_GET_OP_FN("aten.view_copy.default");
  ValueRef mat1_W_packed = mat1;
  ValueRef out_W_packed = out;
+  // Create temporary tensors to store the width packed versions of mat1 and out
+  TmpTensor mat1_tmp(
+      &graph, graph.sizes_of(mat1), graph.dtype_of(mat1), utils::kWidthPacked);
+  TmpTensor out_tmp(
+      &graph, graph.sizes_of(out), graph.dtype_of(out), utils::kWidthPacked);
  if (!graph.is_buffer_storage(out) &&
      graph.packed_dim_of(mat1) != WHCN::kWidthDim) {
    // Ensure mat1 is width packed
-    mat1_W_packed = graph.add_tensor_like(mat1, utils::kWidthPacked);
+    mat1_W_packed = mat1_tmp;
    viewFn(graph, {mat1, graph.add_none(), mat1_W_packed});
    // Ensure out is packed correctly
-    out_W_packed = graph.add_tensor_like(out, utils::kWidthPacked);
+    out_W_packed = out_tmp;
  }
  ValueRef q_mat2 = prepack_standard(
      graph, q_mat2_data, graph.storage_type_of(out), utils::kWidthPacked);
