pytorch
diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh
Lines changed: 23 additions & 2 deletions
diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
Lines changed: 3 additions & 3 deletions
diff --git a/backends/arm/test/test_arm_baremetal.sh b/backends/arm/test/test_arm_baremetal.sh
Lines changed: 20 additions & 6 deletions
diff --git a/backends/arm/test/test_model.py b/backends/arm/test/test_model.py
Lines changed: 26 additions & 4 deletions
diff --git a/backends/cadence/aot/fuse_ops.py b/backends/cadence/aot/fuse_ops.py
Lines changed: 36 additions & 28 deletions
diff --git a/backends/cadence/aot/tests/test_fusion_ops_passes.py b/backends/cadence/aot/tests/test_fusion_ops_passes.py
Lines changed: 14 additions & 10 deletions
diff --git a/backends/qualcomm/builders/op_slice_copy.py b/backends/qualcomm/builders/op_slice_copy.py
Lines changed: 1 addition & 1 deletion
@@ -188,6 +188,22 @@ test_model_with_qnn() {
     EXPORT_SCRIPT=edsr
     # Additional deps for edsr
     pip install piq
+  elif [[ "${MODEL_NAME}" == "cvt" ]]; then
+    EXPORT_SCRIPT=cvt
+  elif [[ "${MODEL_NAME}" == "dit" ]]; then
+    EXPORT_SCRIPT=dit
+  elif [[ "${MODEL_NAME}" == "efficientnet" ]]; then
+    EXPORT_SCRIPT=efficientnet
+  elif [[ "${MODEL_NAME}" == "focalnet" ]]; then
+    EXPORT_SCRIPT=focalnet
+  elif [[ "${MODEL_NAME}" == "mobilevit_v1" ]]; then
+    EXPORT_SCRIPT=mobilevit_v1
+  elif [[ "${MODEL_NAME}" == "mobilevit_v2" ]]; then
+    EXPORT_SCRIPT=mobilevit_v2
+  elif [[ "${MODEL_NAME}" == "pvt" ]]; then
+    EXPORT_SCRIPT=pvt
+  elif [[ "${MODEL_NAME}" == "swin" ]]; then
+    EXPORT_SCRIPT=swin_transformer
   elif [[ "${MODEL_NAME}" == "albert" ]]; then
     EXPORT_SCRIPT=albert
   elif [[ "${MODEL_NAME}" == "bert" ]]; then
@@ -196,6 +212,8 @@ test_model_with_qnn() {
     EXPORT_SCRIPT=distilbert
   elif [[ "${MODEL_NAME}" == "eurobert" ]]; then
     EXPORT_SCRIPT=eurobert
+  elif [[ "${MODEL_NAME}" == "roberta" ]]; then
+    EXPORT_SCRIPT=roberta
   else
     echo "Unsupported model $MODEL_NAME"
     exit 1
@@ -210,10 +228,13 @@ test_model_with_qnn() {
     "dl3"|"mv3"|"mv2"|"ic4"|"ic3"|"vit"|"mb"|"w2l")
         SCRIPT_FOLDER=scripts
         ;;
-    "albert"|"bert"|"distilbert")
+    "cvt"|"dit"|"focalnet"|"mobilevit_v2"|"pvt"|"swin")
+        SCRIPT_FOLDER=oss_scripts
+        ;;
+    "albert"|"bert"|"distilbert"|"roberta"|"efficientnet"|"mobilevit_v1")
         pip install evaluate
         SCRIPT_FOLDER=oss_scripts
-        # Bert models running in 16bit will encounter op validation fail on some operations,
+        # 16-bit models will hit op validation failures on some operations,
         # which requires CHIPSET >= SM8550.
         QNN_CHIPSET=SM8550
         ;;
 
@@ -39,7 +39,7 @@ jobs:
         id: set_version
         shell: bash
         run: |
-          VERSION="0.7.0.$(TZ='PST8PDT' date +%Y%m%d)"
+          VERSION="0.8.0.$(TZ='PST8PDT' date +%Y%m%d)"
           echo "version=$VERSION" >> "$GITHUB_OUTPUT"
 
   build-demo-ios:
 
@@ -470,7 +470,7 @@ jobs:
       docker-image: executorch-ubuntu-22.04-qnn-sdk
       submodules: 'recursive'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 900
+      timeout: 90
       script: |
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
@@ -489,14 +489,14 @@ jobs:
     strategy:
       matrix:
         dtype: [fp32]
-        model: [albert, bert, distilbert] # eurobert requires transfomer >= 4.48.0, skip for now
+        model: [cvt, dit, efficientnet, focalnet, mobilevit_v1, mobilevit_v2, pvt, swin, albert, bert, distilbert, roberta] # eurobert requires transformers >= 4.48.0, skip for now
       fail-fast: false
     with:
       runner: linux.2xlarge
       docker-image: executorch-ubuntu-22.04-qnn-sdk
       submodules: 'recursive'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 900
+      timeout: 90
       script: |
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
 
@@ -73,22 +73,32 @@ all() { # Run all tests
 test_pytest_ops() { # Test ops and other things
     echo "${TEST_SUITE_NAME}: Run pytest"
 
+    # Make sure these tests do not run on the FVP by removing the ELF builds,
+    # as the unit tests detect them and use them if they exist
+    rm -Rf arm_test/arm_semihosting_executor_runner_corstone-300
+    rm -Rf arm_test/arm_semihosting_executor_runner_corstone-320
+
     # Prepare for pytest
     backends/arm/scripts/build_executorch.sh
 
     # Run arm baremetal pytest tests without FVP
-    pytest  --verbose --color=yes --numprocesses=auto backends/arm/test/ --ignore=backends/arm/test/models
+    pytest  --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ --ignore=backends/arm/test/models
     echo "${TEST_SUITE_NAME}: PASS"
 }
 
 test_pytest_models() { # Test ops and other things
     echo "${TEST_SUITE_NAME}: Run pytest"
 
+    # Make sure these tests do not run on the FVP by removing the ELF builds,
+    # as the unit tests detect them and use them if they exist
+    rm -Rf arm_test/arm_semihosting_executor_runner_corstone-300
+    rm -Rf arm_test/arm_semihosting_executor_runner_corstone-320
+
     # Prepare for pytest
     backends/arm/scripts/build_executorch.sh
 
     # Run arm baremetal pytest tests without FVP
-    pytest  --verbose --color=yes backends/arm/test/models
+    pytest  --verbose --color=yes --durations=0 backends/arm/test/models
     echo "${TEST_SUITE_NAME}: PASS"
 }
 
@@ -105,11 +115,13 @@ test_pytest_ops_ethosu_fvp() { # Same as test_pytest but also sometime verify us
     # Prepare Corstone-3x0 FVP for pytest
     backends/arm/scripts/build_executorch.sh
     backends/arm/scripts/build_portable_kernels.sh
-    # Build semihosting version of the runner used by pytest testing when
+    # Build semihosting version of the runner used by pytest testing. This builds:
+    # arm_test/arm_semihosting_executor_runner_corstone-300
+    # arm_test/arm_semihosting_executor_runner_corstone-320
     backends/arm/test/setup_testing.sh
 
     # Run arm baremetal pytest tests with FVP
-    pytest  --verbose --color=yes --numprocesses=auto backends/arm/test/ --ignore=backends/arm/test/models
+    pytest  --verbose --color=yes --numprocesses=auto --durations=10  backends/arm/test/ --ignore=backends/arm/test/models
     echo "${TEST_SUITE_NAME}: PASS"
 }
 
@@ -119,11 +131,13 @@ test_pytest_models_ethosu_fvp() { # Same as test_pytest but also sometime verify
     # Prepare Corstone-3x0 FVP for pytest
     backends/arm/scripts/build_executorch.sh
     backends/arm/scripts/build_portable_kernels.sh
-    # Build semihosting version of the runner used by pytest testing
+    # Build semihosting version of the runner used by pytest testing. This builds:
+    # arm_test/arm_semihosting_executor_runner_corstone-300
+    # arm_test/arm_semihosting_executor_runner_corstone-320
     backends/arm/test/setup_testing.sh
 
     # Run arm baremetal pytest tests with FVP
-    pytest  --verbose --color=yes backends/arm/test/models
+    pytest  --verbose --color=yes --durations=0 backends/arm/test/models
     echo "${TEST_SUITE_NAME}: PASS"
 }
 
 
@@ -7,6 +7,7 @@
 import os
 import subprocess
 import sys
+import time
 
 
 def get_args():
@@ -199,12 +200,17 @@ def run_elf_with_fvp(script_path: str, elf_file: str, target: str, timeout: int)
 
 
 if __name__ == "__main__":
-
+    total_start_time = time.perf_counter()
     args = get_args()
     script_path = os.path.join("backends", "arm", "scripts")
 
     if args.build_libs:
+        start_time = time.perf_counter()
         build_libs(args.test_output, script_path)
+        end_time = time.perf_counter()
+        print(
+            f"[Test model: {end_time - start_time:.2f} s] Build needed executorch libs"
+        )
 
     if args.model:
         model_name = args.model.split(" ")[0].split(";")[0]
@@ -217,6 +223,7 @@ def run_elf_with_fvp(script_path: str, elf_file: str, target: str, timeout: int)
             args.test_output, f"{model_name}_arm_delegate_{args.target}"
         )
 
+        start_time = time.perf_counter()
         pte_file = build_pte(
             args.test_output,
             model_name,
@@ -226,13 +233,17 @@ def run_elf_with_fvp(script_path: str, elf_file: str, target: str, timeout: int)
             output,
             args.no_intermediate,
         )
-        print(f"PTE file created: {pte_file} ")
+        end_time = time.perf_counter()
+        print(
+            f"[Test model: {end_time - start_time:.2f} s] PTE file created: {pte_file}"
+        )
 
         if "ethos-u" in args.target:
             elf_build_path = os.path.join(
                 output, f"{model_name}_arm_delegate_{args.target}"
             )
 
+            start_time = time.perf_counter()
             elf_file = build_ethosu_runtime(
                 args.test_output,
                 script_path,
@@ -243,7 +254,18 @@ def run_elf_with_fvp(script_path: str, elf_file: str, target: str, timeout: int)
                 args.extra_flags,
                 elf_build_path,
             )
-            print(f"ELF file created: {elf_file} ")
+            end_time = time.perf_counter()
+            print(
+                f"[Test model: {end_time - start_time:.2f} s] ELF file created: {elf_file}"
+            )
 
+            start_time = time.perf_counter()
             run_elf_with_fvp(script_path, elf_file, args.target, args.timeout)
-        print(f"Model: {model_name} on {args.target} -> PASS")
+            end_time = time.perf_counter()
+            print(
+                f"[Test model: {end_time - start_time:.2f} s] Tested elf on FVP {elf_file}"
+            )
+        total_end_time = time.perf_counter()
+        print(
+            f"[Test model: {total_end_time - total_start_time:.2f} s total] Model: {model_name} on {args.target} -> PASS"
+        )
@@ -856,19 +856,32 @@ class FuseMulTensorIntoQuantPass(ExportPass):
     def attempt_fusion(
         self, graph_module: torch.fx.GraphModule, mul_node: torch.fx.Node
     ) -> None:
-        full_nodes = [
-            arg
-            for arg in mul_node.args
-            if isinstance(arg, torch.fx.Node)
-            and arg.target == exir_ops.edge.aten.full.default
-        ]
+        if len(mul_node.args) != 2 or len(mul_node.users) != 1:
+            return
+
+        first_arg = cast(torch.fx.Node, mul_node.args[0])
+        second_arg = cast(torch.fx.Node, mul_node.args[1])
+
+        input_node = first_arg
+        full_node = second_arg
+        if second_arg.target == exir_ops.edge.aten.full.default:
+            # Most common case, nothing to change.
+            pass
+        elif first_arg.target == exir_ops.edge.aten.full.default:
+            # Input and full nodes are swapped.
+            full_node = first_arg
+            input_node = second_arg
+        else:
+            # Full node is not found, skip.
+            return
 
-        if len(full_nodes) != 1 or len(mul_node.users) != 1:
+        # Ensure that the mul op does not do any broadcasting.
+        if input_node.meta["val"].shape != mul_node.meta["val"].shape:
             return
 
-        full_node = full_nodes[0]
         mul_user = list(mul_node.users.keys())[0]
 
+        # Ensure only the expected quant ops are using the current mul op.
         if mul_user.target not in {
             exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
             exir_ops.edge.cadence.quantize_per_tensor.default,
@@ -878,33 +891,28 @@ def attempt_fusion(
         quant_node = mul_user
 
         # Calculate the new scale value.
-        prev_scale = quant_node.args[1]
-        assert isinstance(prev_scale, (int, float))
+        old_scale = quant_node.args[1]
+        assert isinstance(old_scale, (int, float))
         mul_scalar = full_node.args[1]
         assert isinstance(mul_scalar, (int, float))
-        new_scale = float(prev_scale) * float(mul_scalar)
+        """ The reason why we divide old scale by the mul value to get a new scale:
+            y = x * mul_scalar
+            q = zp + y / old_scale
+            q = zp + x * mul_scalar / old_scale
+            new_scale = old_scale / mul_scalar
+            q = zp + x / new_scale
+        """
+        new_scale = float(old_scale) / float(mul_scalar)
 
         logging.debug(
             f"Fused {mul_node} and {full_node} into {quant_node}. Updated scale from {quant_node.args[1]} to {new_scale}"
         )
 
-        # Replace the input first
-        quant_node.replace_input_with(
-            cast(torch.fx.Node, quant_node.args[0]),
-            cast(torch.fx.Node, mul_node.args[0]),
-        )
-
-        # Now update the scale in the args
-        new_quant_args = list(quant_node.args)
-        new_quant_args[1] = new_scale
-        quant_node.args = tuple(new_quant_args)
-
-        # Clean up the mul_node
-        mul_node.args = ()
-        mul_node.users = {}
-
-        graph_module.graph.erase_node(mul_node)
-        graph_module.graph.erase_node(full_node)
+        # Update quant node input and scale.
+        old_quant_input = cast(torch.fx.Node, quant_node.args[0])
+        new_quant_input = cast(torch.fx.Node, mul_node.args[0])
+        quant_node.replace_input_with(old_quant_input, new_quant_input)
+        quant_node.update_arg(1, new_scale)
 
     def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
         for node in graph_module.graph.find_nodes(
 
@@ -598,7 +598,7 @@ def test_fuse_mul_scalar_into_dequant(self) -> None:
         self.assertEqual(deq_scale, dequant_scale * mul_value)
 
     def test_fuse_mul_into_quant(self) -> None:
-        quant_scale = 1.5
+        quant_scale = 5
         mul_value = 10
 
         builder = GraphBuilder()
@@ -613,7 +613,7 @@ def test_fuse_mul_into_quant(self) -> None:
         )
         quant = builder.call_operator(
             op=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
-            args=(mul, quant_scale, 0, 0, 255, torch.uint8),
+            args=(mul, quant_scale, 7, 0, 255, torch.uint8),
         )
         builder.output([quant])
         original_graph = builder.get_graph_module()
@@ -631,14 +631,18 @@ def test_fuse_mul_into_quant(self) -> None:
         )
 
         # verify that the quant scale value was updated correctly
-        deq_scale = -1
-        for node in converted_graph.graph.nodes:
-            if (
-                node.target
-                == exir_ops.edge.quantized_decomposed.quantize_per_tensor.default
-            ):
-                deq_scale = node.args[1]
-        self.assertEqual(deq_scale, quant_scale * mul_value)
+        for node in converted_graph.graph.find_nodes(
+            op="call_function",
+            target=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+        ):
+            new_quant_scale = node.args[1]
+            self.assertEqual(new_quant_scale, quant_scale / mul_value)
+
+        # verify the math is correct
+        inp = torch.randn(4, 32, dtype=torch.float32)
+        original_out = original_graph(inp)[0]
+        new_out = converted_graph(inp)[0]
+        assert torch.equal(original_out, new_out)
 
     def test_fuse_then_transpose_pass(self) -> None:
         # Create a graph with full -> transpose.
 
@@ -56,7 +56,7 @@ def define_node(
         if start < 0:
             start = start % input_tensor.shape[dim]
 
-        if len(node.args) > 3:
+        if len(node.args) > 3 and node.args[3] is not None:
             end = min(cast(int, node.args[3]), input_tensor.shape[dim])
             if end < 0:
                 end = end % input_tensor.shape[dim]