Skip to content

Commit 083701d

Browse files
committed
Update
[ghstack-poisoned]
2 parents 07b3463 + b8b2ecb commit 083701d

File tree

105 files changed

+4998
-746
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

105 files changed

+4998
-746
lines changed

.github/workflows/android-release-artifacts.yml

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ on:
1616
options:
1717
- "xnnpack"
1818
- "vulkan+xnnpack"
19+
- "qnn"
1920
schedule:
2021
- cron: 0 10 * * *
2122

@@ -83,17 +84,29 @@ jobs:
8384
8485
echo -n "$SECRET_EXECUTORCH_MAVEN_SIGNING_GPG_KEY_CONTENTS" | base64 -d > /tmp/secring.gpg
8586
87+
GRADLE_ARGS=""
88+
8689
# Update the version name in build.gradle in case of maven publish
8790
VERSION="${{ inputs.version }}"
8891
if [ ! -z "$VERSION" ]; then
89-
sed -i "s/\(coordinates(\"org.pytorch\", \"executorch-android\", \"\)\([0-9]\+.[0-9]\+.[0-9]\+\)\(\")\)/\1$VERSION\3/" extension/android/executorch_android/build.gradle
92+
GRADLE_ARGS+=" -DexecuTorchVersion=${VERSION}"
9093
fi
9194
9295
FLAVOR="${{ inputs.flavor }}"
9396
if [[ "$FLAVOR" == "vulkan+xnnpack" || -z "$FLAVOR" ]]; then
9497
export EXECUTORCH_BUILD_VULKAN=ON
9598
fi
9699
100+
if [[ "$FLAVOR" == "qnn" ]]; then
101+
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-qnn-deps.sh
102+
PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
103+
export EXECUTORCH_BUILD_QNN=ON
104+
source backends/qualcomm/scripts/qnn_config.sh
105+
export QNN_SDK_ROOT="/tmp/qnn/${QNN_VERSION}"
106+
export ANDROID_ABIS=arm64-v8a
107+
GRADLE_ARGS+=" -DqnnVersion=2.28.0"
108+
fi
109+
97110
# Build AAR Package
98111
mkdir aar-out
99112
export BUILD_AAR_DIR=aar-out
@@ -106,7 +119,7 @@ jobs:
106119
# Publish to maven staging
107120
UPLOAD_TO_MAVEN="${{ inputs.upload_to_maven }}"
108121
if [[ "$UPLOAD_TO_MAVEN" == "true" ]]; then
109-
(cd extension/android; ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:publishToMavenCentral)
122+
(cd extension/android; ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew ${GRADLE_ARGS} :executorch_android:publishToMavenCentral)
110123
fi
111124
112125
upload-release-aar:

.github/workflows/pull.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -855,7 +855,8 @@ jobs:
855855
.ci/scripts/setup-linux.sh --build-tool "cmake"
856856
857857
# Install test requirements
858-
pip install -r backends/nxp/requirements-tests.txt
858+
pip install -r backends/nxp/requirements-tests-pypi.txt
859+
pip install -r backends/nxp/requirements-tests-eiq.txt
859860
860861
# Run pytest
861862
PYTHON_EXECUTABLE=python bash backends/nxp/run_unittests.sh

.github/workflows/trunk.yml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -940,3 +940,42 @@ jobs:
940940
build-mode: Release
941941
build-tool: cmake
942942
docker-image: ci-image:executorch-ubuntu-22.04-clang12
943+
944+
test-mcu-models:
945+
name: test-mcu-models
946+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
947+
strategy:
948+
matrix:
949+
include:
950+
- build-tool: cmake
951+
fail-fast: false
952+
permissions:
953+
id-token: write
954+
contents: read
955+
with:
956+
runner: linux.2xlarge
957+
docker-image: ci-image:executorch-ubuntu-22.04-arm-sdk
958+
submodules: 'recursive'
959+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
960+
timeout: 90
961+
script: |
962+
BUILD_TOOL=${{ matrix.build-tool }}
963+
964+
# The generic Linux job chooses to use base env, not the one setup by the image
965+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
966+
conda activate "${CONDA_ENV}"
967+
968+
# Try to mirror these as closely as possible
969+
source .ci/scripts/utils.sh
970+
install_executorch "--use-pt-pinned-commit"
971+
972+
.ci/scripts/setup-arm-baremetal-tools.sh
973+
source examples/arm/ethos-u-scratch/setup_path.sh
974+
975+
# Run selective Build
976+
chmod +x examples/selective_build/test_selective_build.sh
977+
examples/selective_build/test_selective_build.sh "${BUILD_TOOL}"
978+
979+
# Run MCU models
980+
chmod +x examples/arm/run_mcu_models_fvp.sh
981+
examples/arm/run_mcu_models_fvp.sh --target=cortex-m55

CMakeLists.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -582,7 +582,6 @@ endif()
582582

583583
if(EXECUTORCH_BUILD_EXTENSION_APPLE)
584584
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/apple)
585-
list(APPEND _executorch_extensions apple_extension)
586585
endif()
587586

588587
if(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER)
@@ -649,7 +648,6 @@ endif()
649648

650649
if(EXECUTORCH_BUILD_EXTENSION_LLM_APPLE)
651650
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/apple)
652-
list(APPEND _executorch_extensions extension_llm_apple)
653651
endif()
654652

655653
if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL)
@@ -968,6 +966,10 @@ else()
968966
endif()
969967
target_link_libraries(executorch_kernels INTERFACE ${_executorch_kernels})
970968

969+
install(TARGETS executorch_backends executorch_extensions executorch_kernels
970+
EXPORT ExecuTorchTargets
971+
)
972+
971973
if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
972974
# Baseline libraries that executor_runner will link against.
973975
set(_executor_runner_libs executorch extension_evalue_util

backends/arm/_passes/decompose_linear_pass.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,13 @@ def call(self, graph_module):
9090
kwargs={},
9191
from_node=node,
9292
)
93+
# Quantization parameters are inherited from original linear node, but
94+
# output reshape should use the linear node's output qparams for both input
95+
# and output.
96+
if "input_qparams" in output.meta:
97+
output.meta["input_qparams"] = output.meta.get(
98+
"output_qparams", None
99+
)
93100

94101
node.replace_all_uses_with(output)
95102
graph_module.graph.erase_node(node)

backends/arm/operator_support/tosa_supported_operators.py

Lines changed: 83 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ def tosa_support_factory(
116116

117117
# Negative checks: Remove nodes from partitioning
118118
negative_checks: list[OperatorSupportBase] = [
119-
CheckInt64Inputs(exported_program, reporter),
119+
CheckInt64InputsAndOutputs(exported_program, reporter),
120120
CheckFloat64Inputs(exported_program, reporter),
121121
RankCheck(reporter, max_rank=5),
122122
*[
@@ -454,7 +454,18 @@ def is_node_supported(
454454
return True
455455

456456

457-
class CheckInt64Inputs(OperatorSupportBase):
457+
class CheckInt64InputsAndOutputs(OperatorSupportBase):
458+
"""TOSA does not support int64 tensors so in general, ops with int64 inputs or outputs should not be partitioned.
459+
There are however some exceptions:
460+
- Nodes with int64 output can be partitioned if they are constant, within int32,
461+
and all users cast to something else. In this case, the int64 tensor can safely be cast to int32 AOT.
462+
- Nodes with int64 output can be partitioned if all users are getitem with non-int64 output.
463+
In this case, there are multiple outputs and the int64 ones are not used.
464+
- Nodes with int64 inputs can be partitioned if the inputs are constant placeholders, or constant
465+
ops fulfilling the criteria above.
466+
Note that we don't check placeholders here, they are partitioned based on whether their users are partitioned
467+
or not.
468+
"""
458469

459470
def __init__(
460471
self, exported_program: ExportedProgram, reporter: WhyNoPartitionReporter
@@ -465,27 +476,85 @@ def __init__(
465476
if spec.kind == InputKind.USER_INPUT
466477
]
467478
self.reporter = reporter
479+
self.int32_min = torch.iinfo(torch.int32).min
480+
self.int32_max = torch.iinfo(torch.int32).max
468481
super().__init__()
469482

483+
def inside_int32_bounds(self, node: torch.fx.Node) -> bool:
484+
"""Node is assumed to be call_function with int64 output."""
485+
if isinstance(node.target, str):
486+
return False
487+
data = node.target(*node.args, **node.kwargs)
488+
min_val, max_val = int(torch.min(data)), int(torch.max(data))
489+
return min_val >= self.int32_min and max_val <= self.int32_max
490+
470491
def is_node_supported(
471492
self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node
472493
) -> bool:
473494

495+
vals = node.meta["val"]
496+
tensor_list = vals if isinstance(vals, (list, tuple)) else [vals]
497+
498+
any_int64 = any(tensor.dtype == torch.int64 for tensor in tensor_list)
499+
# Don't partition nodes with int64 output...
500+
if any_int64:
501+
# ... Except for constant ops that are directly cast to something non-int64.
502+
# This could be an explicit cast, or something like a less than that outputs a different dtype than the input.
503+
users_output_non_int64 = all(
504+
get_first_fake_tensor(output_node).dtype != torch.int64
505+
for output_node in node.users
506+
)
507+
if (
508+
node.target in ComputeConstantOpsAOT.targeted_ops
509+
and users_output_non_int64
510+
):
511+
if not self.inside_int32_bounds(node):
512+
self.reporter.report_reject(
513+
node, "Constant node outside int32 range."
514+
)
515+
return False
516+
# Will never have input nodes, safe to return True
517+
return True
518+
519+
# ... Or ops with multiple outputs where only non-int64 are used.
520+
users_are_getitem = all(
521+
user.target == operator.getitem for user in node.users
522+
)
523+
if users_are_getitem and users_output_non_int64:
524+
# Passed output check, go to input check.
525+
pass
526+
else:
527+
self.reporter.report_reject(
528+
node, "Non-constant node with int64 output."
529+
)
530+
return False
531+
532+
# Ops with int64 inputs are only partitioned if input nodes are constant and will be partitioned.
533+
# If it is not partitioned, the partition will get an int64 input and fail.
474534
for input_node in node.all_input_nodes:
475-
# We can cast constant placeholders and constant ops AOT, such int64 are ok.
476-
# Otherwise, don't partition if one or more inputs are int64.
535+
tensor_in = get_first_fake_tensor(input_node)
536+
if tensor_in.dtype != torch.int64:
537+
continue
538+
# Constant placeholder
477539
if (
478-
input_node.name in self.input_names
479-
or not input_node.op == "placeholder"
540+
input_node.op != "call_function"
541+
and input_node.name not in self.input_names
480542
):
481-
tensor = get_first_fake_tensor(input_node)
482-
if tensor.dtype == torch.int64:
483-
if input_node.target not in ComputeConstantOpsAOT.targeted_ops:
484-
self.reporter.report_reject(
485-
node,
486-
f"Had int64 input {input_node.name} that couldn't be handled.",
487-
)
488-
return False
543+
continue
544+
# Constant operator
545+
if input_node.op == "call_function":
546+
if input_node.target in ComputeConstantOpsAOT.targeted_ops:
547+
# This is not perfect since the input_node can still be rejected by other checks but
548+
# this should cover the majority of cases.
549+
if self.is_node_supported(
550+
None, input_node # type: ignore[arg-type] #(we don't use 'submodules')
551+
):
552+
continue
553+
self.reporter.report_reject(
554+
node, f"Non-constant int64 input {input_node.name}"
555+
)
556+
return False
557+
489558
return True
490559

491560

0 commit comments

Comments (0)