 
 
 import unittest
-from typing import Tuple
+from typing import Final, List, Tuple
 
 import executorch.backends.cadence.aot.ops_registrations  # noqa
 import torch
@@ -281,25 +281,23 @@ def forward(self, x):
         )
 
     def test_no_replace_quant_permute_dequant_with_requantize(self):
-        class M(torch.nn.Module):
-            def __init__(self):
-                super().__init__()
-
-            def forward(self, x):
-                x = torch.ops.quantized_decomposed.quantize_per_tensor(
-                    x, 1.2, 3, 0, 127, torch.int8
-                )
-                x = torch.permute(x, [2, 0, 1, 3])
-                x = torch.ops.quantized_decomposed.dequantize_per_tensor(
-                    x, 4.5, 6, 0, 127, torch.int8
-                )
-                return x
-
-        inputs = torch.randn(2, 12, 1, 6)
-        model = M()
-        graph_module = export_to_edge(model, (inputs,)).exported_program().graph_module
-
-        graph_module = FuseQuantDequantToRequantizePass()(graph_module).graph_module
+        builder = GraphBuilder()
+        x = builder.placeholder("x", torch.randn(2, 12, 1, 6, dtype=torch.float32))
+        quant = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
+            args=(x, 1.2, 3, 0, 127, torch.int8),
+        )
+        permute = builder.call_operator(
+            op=exir_ops.edge.aten.permute_copy.default, args=(quant, [2, 0, 1, 3])
+        )
+        dequant = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+            args=(permute, 4.5, 6, 0, 127, torch.int8),
+        )
+        builder.output(dequant)
+        graph_module = FuseQuantDequantToRequantizePass(
+            force_quant_dequant_fusion=False
+        )(builder.get_graph_module()).graph_module
         self.check_op_counts(
             graph_module,
             expected_op_counts={
@@ -436,18 +434,28 @@ def forward(self, x):
         )
 
     def test_fuse_mul_into_dequant(self):
-        class M(torch.nn.Module):
-            def forward(self, x):
-                x0 = torch.ops.quantized_decomposed.dequantize_per_tensor(
-                    x, 1.5, 0, 0, 255, torch.uint8
-                )
-                x1 = torch.full([4, 32], 3, dtype=torch.float32)
-                x2 = x0 * x1
-                return x2
+        INPUT_SHAPE: Final[List[int]] = [4, 32]
+        DEQUANT_SCALE: Final[float] = 1.5
+        FULL_VALUE: Final[float] = 3
 
-        inputs = (torch.randint(0, 255, [4, 32], dtype=torch.uint8),)
-        graph_module = export_to_edge(M(), inputs).exported_program().graph_module
-        graph_module = FuseMulTensorIntoDequantPass()(graph_module).graph_module
+        builder = GraphBuilder()
+        x = builder.placeholder("x", torch.randn(*INPUT_SHAPE, dtype=torch.float32))
+        dequant = builder.call_operator(
+            op=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
+            args=(x, DEQUANT_SCALE, 0, 0, 255, torch.uint8),
+        )
+        full = builder.call_operator(
+            op=exir_ops.edge.aten.full.default,
+            args=(INPUT_SHAPE, FULL_VALUE),
+        )
+        mul = builder.call_operator(
+            op=exir_ops.edge.aten.mul.Tensor,
+            args=(dequant, full),
+        )
+        builder.output(mul)
+        graph_module = FuseMulTensorIntoDequantPass()(
+            builder.get_graph_module()
+        ).graph_module
 
         # verify that the mul and full ops were removed
         self.check_op_counts(
@@ -466,7 +474,7 @@ def forward(self, x):
                 == exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default
             ):
                 deq_scale = node.args[1]
-                self.assertEqual(deq_scale, 4.5)
+                self.assertEqual(deq_scale, DEQUANT_SCALE * FULL_VALUE)
 
     def test_fuse_mul_scalar_into_dequant(self):
         dequant_scale = 0.006
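Note: the updated tests build the graph directly with GraphBuilder and run the pass on the resulting graph module, instead of exporting an nn.Module through export_to_edge. Below is a minimal standalone sketch of that flow; the import paths are assumptions based on the usual ExecuTorch layout (the test file already has the equivalent imports), and it mirrors the calls used in the hunks above rather than defining the passes' API.

# Sketch only: illustrates the GraphBuilder-based pattern adopted in this diff.
# Import locations are assumed; adjust to match the test file's actual imports.
import torch
from executorch.backends.cadence.aot.fuse_ops import FuseQuantDequantToRequantizePass
from executorch.backends.cadence.aot.graph_builder import GraphBuilder
from executorch.exir.dialects._ops import ops as exir_ops

# Build a small quantize -> dequantize graph directly, no nn.Module export.
builder = GraphBuilder()
x = builder.placeholder("x", torch.randn(2, 12, 1, 6, dtype=torch.float32))
quant = builder.call_operator(
    op=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
    args=(x, 1.2, 3, 0, 127, torch.int8),
)
dequant = builder.call_operator(
    op=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
    args=(quant, 4.5, 6, 0, 127, torch.int8),
)
builder.output(dequant)  # same call shape as in the tests above

# Run the fusion pass on the constructed graph module and inspect which
# call_function nodes remain afterwards.
graph_module = FuseQuantDequantToRequantizePass()(
    builder.get_graph_module()
).graph_module
for node in graph_module.graph.nodes:
    if node.op == "call_function":
        print(node.target)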