pytorch
diff --git a/‎.ci/scripts/build-qnn-sdk.sh‎
Lines changed: 1 addition & 1 deletion b/‎.ci/scripts/build-qnn-sdk.sh‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/_unittest.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/_unittest.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/cuda.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/cuda.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/workflows/metal.yml‎
Lines changed: 3 additions & 1 deletion b/‎.github/workflows/metal.yml‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎.github/workflows/pull.yml‎
Lines changed: 4 additions & 2 deletions b/‎.github/workflows/pull.yml‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎.github/workflows/trunk.yml‎
Lines changed: 4 additions & 43 deletions b/‎.github/workflows/trunk.yml‎
Lines changed: 4 additions & 43 deletions
diff --git a/‎CODEOWNERS‎
Lines changed: 12 additions & 12 deletions b/‎CODEOWNERS‎
Lines changed: 12 additions & 12 deletions
diff --git a/‎backends/aoti/aoti_partitioner.py‎
Lines changed: 95 additions & 0 deletions b/‎backends/aoti/aoti_partitioner.py‎
Lines changed: 95 additions & 0 deletions
diff --git a/‎backends/aoti/common_shims.cpp‎
Lines changed: 4 additions & 0 deletions b/‎backends/aoti/common_shims.cpp‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎backends/aoti/common_shims.h‎
Lines changed: 1 addition & 0 deletions b/‎backends/aoti/common_shims.h‎
Lines changed: 1 addition & 0 deletions
@@ -18,7 +18,7 @@ build_qnn_backend() {
   export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
 
   parallelism=$(( $(nproc) - 1 ))
-  bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number ${parallelism} --release
+  bash backends/qualcomm/scripts/build.sh --skip_linux_android --skip_linux_embedded --job_number ${parallelism} --release
 }
 
 set_up_aot() {
 
@@ -32,7 +32,7 @@ jobs:
       id-token: write
       contents: read
     with:
-      runner: linux.2xlarge
+      runner: linux.2xlarge.memory
       docker-image: ${{ inputs.docker-image }}
       submodules: 'recursive'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
 
@@ -128,7 +128,7 @@ jobs:
         echo "::endgroup::"
 
         echo "::group::Setup Huggingface"
-        pip install -U "huggingface_hub[cli]" accelerate
+        pip install -U "huggingface_hub[cli]<1.0" accelerate
         huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
         OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
         pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
@@ -208,7 +208,7 @@ jobs:
         echo "::endgroup::"
 
         echo "::group::Setup Huggingface"
-        pip install -U "huggingface_hub[cli]" accelerate
+        pip install -U "huggingface_hub[cli]<1.0" accelerate
         huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
         OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
         pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
 
@@ -30,6 +30,8 @@ jobs:
 
   export-voxtral-metal-artifact:
     name: export-voxtral-metal-artifact
+    # Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
+    if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     secrets: inherit
     with:
@@ -44,7 +46,7 @@ jobs:
         set -eux
 
         echo "::group::Setup Huggingface"
-        ${CONDA_RUN} pip install -U "huggingface_hub[cli]" accelerate
+        ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0" accelerate
         ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
         echo "::endgroup::"
 
 
@@ -315,7 +315,7 @@ jobs:
         echo "::endgroup::"
 
         echo "::group::Setup Huggingface"
-        pip install -U "huggingface_hub[cli]" accelerate
+        pip install -U "huggingface_hub[cli]<1.0" accelerate
         huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
         OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
         pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
@@ -636,7 +636,7 @@ jobs:
         echo "::group::Setup ExecuTorch"
         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake"
         echo "::endgroup::"
-                
+
         echo "::group::Setup requirements"
         # install phi-3-mini requirements
         bash examples/models/phi-3-mini/install_requirements.sh
@@ -909,6 +909,8 @@ jobs:
 
   test-samsung-models-linux:
     name: test-samsung-models-linux
+    # Skip this job if the pull request is from a fork (secrets are not available)
+    if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
 
@@ -626,7 +626,7 @@ jobs:
           BUILD_TORCHAO_EXPERIMENTAL=1 TORCHAO_BUILD_CPU_AARCH64=1 TORCHAO_BUILD_KLEIDIAI=1 TORCHAO_ENABLE_ARM_NEON_DOT=1 TORCHAO_PARALLEL_BACKEND=OPENMP pip install third-party/ao
         fi
 
-        pip install -U "huggingface_hub[cli]"
+        pip install -U "huggingface_hub[cli]<1.0"
 
         bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.model != 'phi_4_mini' && '--test_with_runner' || '' }}  ${{ matrix.backend == 'torchao' && '--use_torchao_kernels' || '' }}
 
@@ -659,7 +659,7 @@ jobs:
         echo "::endgroup::"
 
         echo "::group::Set up Huggingface"
-        ${CONDA_RUN} pip install -U "huggingface_hub[cli]" accelerate
+        ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0" accelerate
         ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
         OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
         ${CONDA_RUN} pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
@@ -834,7 +834,7 @@ jobs:
         echo "::endgroup::"
 
         echo "::group::Setup Huggingface"
-        pip install -U "huggingface_hub[cli]" accelerate
+        pip install -U "huggingface_hub[cli]<1.0" accelerate
         huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
         OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
         pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
@@ -930,7 +930,7 @@ jobs:
         echo "::endgroup::"
 
         echo "::group::Set up Huggingface"
-        ${CONDA_RUN} pip install -U "huggingface_hub[cli]" accelerate
+        ${CONDA_RUN} pip install -U "huggingface_hub[cli]<1.0" accelerate
         ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
         OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
         ${CONDA_RUN} pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
@@ -1043,45 +1043,6 @@ jobs:
       build-tool: cmake
       docker-image: ci-image:executorch-ubuntu-22.04-clang12
 
-  test-mcu-models:
-    name: test-mcu-models
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
-    strategy:
-      matrix:
-        include:
-          - build-tool: cmake
-      fail-fast: false
-    permissions:
-      id-token: write
-      contents: read
-    with:
-      runner: linux.2xlarge
-      docker-image: ci-image:executorch-ubuntu-22.04-arm-sdk
-      submodules: 'recursive'
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 90
-      script: |
-        BUILD_TOOL=${{ matrix.build-tool }}
-
-        # The generic Linux job chooses to use base env, not the one setup by the image
-        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
-        conda activate "${CONDA_ENV}"
-
-        # Try to mirror these as closely as possible
-        source .ci/scripts/utils.sh
-        install_executorch "--use-pt-pinned-commit"
-
-        .ci/scripts/setup-arm-baremetal-tools.sh
-        source examples/arm/ethos-u-scratch/setup_path.sh
-
-        # Run selective Build
-        chmod +x examples/selective_build/test_selective_build.sh
-        examples/selective_build/test_selective_build.sh "${BUILD_TOOL}"
-
-        # Run MCU models
-        chmod +x examples/arm/run_mcu_models_fvp.sh
-        examples/arm/run_mcu_models_fvp.sh --target=cortex-m55
-
   test-models-windows:
     uses: pytorch/test-infra/.github/workflows/windows_job.yml@main
     strategy:
 
@@ -49,31 +49,31 @@
 /extension/export_util @kimishpatel
 /extension/flat_tensor @lucylq
 /extension/gguf_util @larryliu0820
-/extension/kernel_util @kimishpatel @manuelcandales @swolchok
-/extension/llm @jackzhxng @larryliu0820 @swolchok @mergennachin
-/extension/memory_allocator @JacobSzwejbka @swolchok
+/extension/kernel_util @kimishpatel @manuelcandales
+/extension/llm @jackzhxng @larryliu0820 @mergennachin
+/extension/memory_allocator @JacobSzwejbka
 /extension/module @shoumikhin
-/extension/parallel @kimishpatel @swolchok
+/extension/parallel @kimishpatel
 /extension/pybindings @JacobSzwejbka @larryliu0820
-/extension/pytree @JacobSzwejbka @swolchok
-/extension/runner_util @swolchok
+/extension/pytree @JacobSzwejbka
+/extension/runner_util
 /extension/tensor @shoumikhin
-/extension/testing_util @swolchok
-/extension/threadpool @kimishpatel @swolchok
+/extension/testing_util
+/extension/threadpool @kimishpatel
 /extension/training @JacobSzwejbka
 
-/kernels @manuelcandales @swolchok
+/kernels @manuelcandales
 
 /profiler @Gasoonjia
 
-/runtime @JacobSzwejbka @lucylq @swolchok
+/runtime @JacobSzwejbka @lucylq
 /runtime/backend @cccclai
 
 /schema @JacobSzwejbka @lucylq
 
-/scripts @GregoryComer @swolchok
+/scripts @GregoryComer
 
-/shim @larryliu0820 @GregoryComer @swolchok
+/shim @larryliu0820 @GregoryComer
 
 /third-party @GregoryComer
 
 
@@ -0,0 +1,95 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Callable, Dict, List, Optional, Tuple
+
+import torch
+from executorch.exir._warnings import experimental
+from executorch.exir.backend.compile_spec_schema import CompileSpec
+from executorch.exir.backend.partitioner import (
+    DelegationSpec,
+    Partitioner,
+    PartitionResult,
+)
+from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer
+from torch._export.utils import is_buffer, is_lifted_tensor_constant, is_param
+from torch.export.exported_program import ExportedProgram
+
+
+@experimental(
+    "This API and all of cuda backend related functionality are experimental."
+)
+class AotiPartitioner(Partitioner):
+    """
+    Base partitioner for AOTInductor-driven backend integration.
+
+    This partitioner creates a single partition containing all operators from the input graph.
+    It skips core ATen decomposition, allowing the backend to handle decomposition using
+    AOTInductor's backend-specific decomposition table.
+
+    Operators the aoti library cannot handle are intended to fall back to ExecuTorch's default
+    or custom handling; note that partition() currently tags every call_function node.
+    """
+
+    def __init__(self, backend_name: str, compile_spec: List[CompileSpec]) -> None:
+        """
+        Initialize the AOTI partitioner.
+
+        Args:
+            backend_name: The name of the backend (e.g., "CudaBackend", "MetalBackend")
+            compile_spec: List of compilation specifications
+        """
+        self.delegation_spec = DelegationSpec(backend_name, compile_spec)
+
+    def partition(self, exported_program: ExportedProgram) -> PartitionResult:
+        """
+        Fully delegate the graph to AOTInductor by tagging all nodes as a single partition.
+        """
+
+        partition_tags: Dict[str, DelegationSpec] = {}
+        tag = "tag0"
+
+        for node in exported_program.graph.nodes:
+            if node.op != "call_function":
+                continue
+            node.meta["delegation_tag"] = tag
+
+        partition_tags[tag] = self.delegation_spec
+
+        tag_constant_data(exported_program)
+        tag_mutated_buffer(exported_program)
+
+        # Tag constant placeholders that have no users
+        # tag_constant_data only tags constants that have users with delegation_tag
+        # but we need to tag all constants for this partition
+        for node in exported_program.graph.nodes:
+            if node.op == "placeholder" and (
+                is_param(exported_program, node)
+                or is_buffer(exported_program, node)
+                or is_lifted_tensor_constant(exported_program, node)
+            ):
+                if "delegation_tag" not in node.meta:
+                    node.meta["delegation_tag"] = tag
+
+        return PartitionResult(
+            tagged_exported_program=exported_program, partition_tags=partition_tags
+        )
+
+    def ops_to_not_decompose(
+        self, ep: ExportedProgram
+    ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
+        """
+        Return a list of operations that should not be decomposed and let the AOT compiler handle them.
+        Currently we skip ATen decomposition for all ops, and let the backend handle them.
+        """
+        do_not_decompose = set()
+
+        for node in ep.graph.nodes:
+            if node.op == "call_function" and isinstance(
+                node.target, torch._ops.OpOverload
+            ):
+                do_not_decompose.add(node.target)
+        return list(do_not_decompose), None
@@ -184,6 +184,10 @@ int32_t aoti_torch_dtype_int32() {
   return 3; // PyTorch's int32 dtype code
 }
 
+int32_t aoti_torch_dtype_bool() {
+  return 11; // PyTorch's bool dtype code
+}
+
 int32_t aoti_torch_dtype_int64() {
   return 4; // PyTorch's int64 dtype code
 }
 
@@ -63,6 +63,7 @@ int32_t aoti_torch_dtype_int8();
 int32_t aoti_torch_dtype_int16();
 int32_t aoti_torch_dtype_int32();
 int32_t aoti_torch_dtype_int64();
+int32_t aoti_torch_dtype_bool();
 
 // Dtype utility function needed by Metal backend
 size_t aoti_torch_dtype_element_size(int32_t dtype);
Original file line number	Diff line number	Diff line change
`@@ -18,7 +18,7 @@ build_qnn_backend() {`
`18`	`18`	`export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"`
`19`	`19`
`20`	`20`	`parallelism=$(( $(nproc) - 1 ))`
`21`		`- bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number ${parallelism} --release`
	`21`	`+ bash backends/qualcomm/scripts/build.sh --skip_linux_android --skip_linux_embedded --job_number ${parallelism} --release`
`22`	`22`	`}`
`23`	`23`
`24`	`24`	`set_up_aot() {`
Original file line number	Diff line number	Diff line change
`@@ -184,6 +184,10 @@ int32_t aoti_torch_dtype_int32() {`
`184`	`184`	`return 3; // PyTorch's int32 dtype code`
`185`	`185`	`}`
`186`	`186`
	`187`	`+int32_t aoti_torch_dtype_bool() {`
	`188`	`+ return 11; // PyTorch's bool dtype code`
	`189`	`+}`
	`190`	`+`
`187`	`191`	`int32_t aoti_torch_dtype_int64() {`
`188`	`192`	`return 4; // PyTorch's int64 dtype code`
`189`	`193`	`}`