Commit e62191a

Update
[ghstack-poisoned]
2 parents: bc3fd39 + 6440d3d

File tree: 16 files changed (+236, -42 lines)

.ci/docker/common/install_base.sh

Lines changed: 5 additions & 0 deletions

@@ -26,6 +26,11 @@ install_ubuntu() {
     libssl-dev \
     zip

+  # These libraries are needed by TorchVision
+  apt-get install -y --no-install-recommends \
+    libjpeg-dev \
+    libpng-dev
+
   # Cleanup package manager
   apt-get autoclean && apt-get clean
   rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

.ci/docker/common/install_conda.sh

Lines changed: 10 additions & 2 deletions

@@ -31,8 +31,16 @@ install_miniconda() {

 install_python() {
   pushd /opt/conda
-  # Install the correct Python version
+  # Install the selected Python version for CI jobs
   as_ci_user conda create -n "py_${PYTHON_VERSION}" -y --file /opt/conda/conda-env-ci.txt python="${PYTHON_VERSION}"
+
+  # From https://github.com/pytorch/pytorch/blob/main/.ci/docker/common/install_conda.sh
+  if [[ $(uname -m) == "aarch64" ]]; then
+    conda_install "openblas==0.3.28=*openmp*"
+  else
+    conda_install mkl=2022.1.0 mkl-include=2022.1.0
+  fi
+
   popd
 }

@@ -53,7 +61,7 @@ fix_conda_ubuntu_libstdcxx() {
   # PyTorch sev: https://github.com/pytorch/pytorch/issues/105248
   # Ref: https://github.com/pytorch/pytorch/blob/main/.ci/docker/common/install_conda.sh
   if grep -e "2[02].04." /etc/issue >/dev/null; then
-    rm "/opt/conda/envs/py_${PYTHON_VERSION}/lib/libstdc++.so.6"
+    rm /opt/conda/envs/py_${PYTHON_VERSION}/lib/libstdc++.so*
   fi
 }

.github/workflows/lint.yml

Lines changed: 2 additions & 1 deletion

@@ -31,7 +31,7 @@ jobs:
       # The generic Linux job chooses to use base env, not the one setup by the image
       CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
       conda activate "${CONDA_ENV}"
-
+
       # For mypy linting, we need to first install executorch first so that
       # it builds the python package information.
       BUILD_TOOL="cmake"

@@ -74,6 +74,7 @@ jobs:
       docker-image: executorch-ubuntu-22.04-linter
       fetch-depth: 0
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
       script: |
         FILES_NEEDS_FORMAT=$(/opt/google-java-format -n extension/android/src/main/java/org/pytorch/executorch/*.java \
           examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/*.java \

backends/arm/_passes/decompose_select.py

Lines changed: 1 addition & 2 deletions

@@ -37,14 +37,13 @@ def call(self, graph_module: torch.fx.GraphModule):
         rank = len(input_node.meta["val"].size())
         dim = dim % rank if dim < 0 else dim
         index = index % rank if index < 0 else index
-        dim_list = list(range(rank))

         with graph_module.graph.inserting_before(node):
             slice_node = create_node(
                 graph_module.graph, slice_op, (input_node, dim, index, index + 1)
             )
             squeeze_node = create_node(
-                graph_module.graph, squeeze_op, (slice_node, dim_list)
+                graph_module.graph, squeeze_op, (slice_node, [dim])
             )

             node.replace_all_uses_with(squeeze_node)
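
The fix squeezes only the selected dimension instead of every dimension of the sliced tensor. A minimal eager-mode sketch of the intended semantics (illustrative only, not code from this commit):

```python
import torch

x = torch.randn(1, 3, 4)
dim, index = 1, 2

# aten.select removes exactly one dimension.
selected = x.select(dim, index)                    # shape: (1, 4)

# The pass decomposes select into a width-1 slice plus a squeeze of
# that single dimension, matching select's semantics.
decomposed = x.narrow(dim, index, 1).squeeze(dim)  # shape: (1, 4)
assert torch.equal(selected, decomposed)

# The old dim_list = list(range(rank)) squeezed every size-1 dim, so the
# leading dimension of size 1 was dropped too: shape (4,) instead of (1, 4).
assert x.narrow(dim, index, 1).squeeze().shape == (4,)
```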

build/cmake_deps.toml

Lines changed: 1 addition & 4 deletions

@@ -365,10 +365,7 @@ buck_targets = [
   "//extension/llm/custom_ops:custom_ops",
 ]
 filters = [
-  # Second clause is to pick up fht_neon.c/fht_avx.c from FFHT. TODO:
-  # remove filters and patch extract_sources.py's Buck query to fetch
-  # srcs; presumably filters is here to remove .h files.
-  "(.cpp$)|(fht.*\\.c$)",
+  ".cpp$",
 ]
 excludes = [
   "^codegen",

codegen/tools/gen_oplist.py

Lines changed: 22 additions & 1 deletion

@@ -189,6 +189,23 @@ def _dump_yaml(
     )


+def create_kernel_key(maybe_kernel_key: str) -> str:
+    # It is a kernel key.
+    if maybe_kernel_key.lstrip().startswith("v1"):
+        return maybe_kernel_key
+    # It is a dtype.
+    else:
+        # Generate a kernel key based on the dtype provided.
+        # Note: no dim order is included in this kernel key.
+        # For a description of the kernel key format, see
+        # executorch/blob/main/runtime/kernel/operator_registry.h#L97-L123
+        try:
+            dtype = ScalarType[maybe_kernel_key]
+            return "v1/" + str(dtype.value) + ";"
+        except KeyError:
+            raise Exception(f"Unknown dtype: {maybe_kernel_key}")
+
+
 def gen_oplist(
     output_path: str,
     model_file_path: Optional[str] = None,

@@ -223,7 +240,11 @@
         ops_and_metadata = json.loads(ops_dict)
         for op, metadata in ops_and_metadata.items():
             op_set.update({op})
-            op_metadata = metadata if len(metadata) > 0 else ["default"]
+            op_metadata = (
+                [create_kernel_key(x) for x in metadata]
+                if len(metadata) > 0
+                else ["default"]
+            )
             et_kernel_metadata = merge_et_kernel_metadata(
                 et_kernel_metadata, {op: op_metadata}
             )
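
In effect, each metadata entry is now normalized: strings that already look like kernel keys pass through, while bare dtype names are expanded to a kernel key with no dim order. A small illustration (the enum values match the `v1/3` Int and `v1/6` Float keys used in the tests below):

```python
from executorch.codegen.tools.gen_oplist import create_kernel_key

# Already a kernel key: returned unchanged.
assert create_kernel_key("v1/3;0,1|3;0,1") == "v1/3;0,1|3;0,1"

# A bare dtype name becomes a dim-order-free kernel key;
# ScalarType.Float has enum value 6.
assert create_kernel_key("Float") == "v1/6;"

# Anything else raises.
try:
    create_kernel_key("NotADtype")
except Exception as e:
    print(e)  # Unknown dtype: NotADtype
```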

codegen/tools/test/test_gen_oplist.py

Lines changed: 3 additions & 2 deletions

@@ -13,6 +13,7 @@

 import executorch.codegen.tools.gen_oplist as gen_oplist
 import yaml
+from executorch.codegen.tools.gen_oplist import ScalarType


 class TestGenOpList(unittest.TestCase):

@@ -89,7 +90,7 @@ def test_gen_op_list_with_root_ops_and_dtypes(
     ) -> None:
         output_path = os.path.join(self.temp_dir.name, "output.yaml")
         ops_dict = {
-            "aten::add": ["v1/3;0,1|3;0,1|3;0,1|3;0,1", "v1/6;0,1|6;0,1|6;0,1|6;0,1"],
+            "aten::add": ["v1/3;0,1|3;0,1|3;0,1|3;0,1", ScalarType.Float.name],
             "aten::mul": [],
         }
         args = [

@@ -104,7 +105,7 @@
             {
                 "aten::add": [
                     "v1/3;0,1|3;0,1|3;0,1|3;0,1",
-                    "v1/6;0,1|6;0,1|6;0,1|6;0,1",
+                    "v1/6;",
                 ],
                 "aten::mul": ["default"],
             },

codegen/tools/test/test_gen_selected_op_variants.py

Lines changed: 74 additions & 2 deletions

@@ -12,7 +12,7 @@
 import expecttest


-class TestGenSelectedMobileOpsHeader(expecttest.TestCase):
+class TestGenSelectedOpVariants(expecttest.TestCase):
     def setUp(self):
         self.temp_dir = tempfile.TemporaryDirectory()
         self.addCleanup(self.temp_dir.cleanup)

@@ -84,7 +84,79 @@ def test_generates_correct_header(self) -> None:
         )


-class TestGenSelectedMobileOpsHeader_Empty(expecttest.TestCase):
+class TestGenSelectedOpVariants_UsingDtypeString(expecttest.TestCase):
+    def setUp(self):
+        self.temp_dir = tempfile.TemporaryDirectory()
+        self.addCleanup(self.temp_dir.cleanup)
+        self.selected_ops_yaml = os.path.join(
+            self.temp_dir.name, "selected_operators.yaml"
+        )
+        with open(self.selected_ops_yaml, "w") as f:
+            f.write(
+                """
+include_all_non_op_selectives: False
+include_all_operators: False
+debug_info:
+  - model1@v100
+  - model2@v50
+operators:
+  aten::add:
+    is_root_operator: Yes
+    is_used_for_training: Yes
+    include_all_overloads: No
+  aten::add.int:
+    is_root_operator: No
+    is_used_for_training: No
+    include_all_overloads: Yes
+kernel_metadata: {}
+et_kernel_metadata:
+  aten::add.out:
+    # A list of different kernel keys (tensors with dtype-enum/dim-order) combinations used in model
+    - v1/6; # Float
+    - v1/3; # Int
+  aten::mul.out:
+    - v1/6; # Float
+  aten::sub.out:
+    - default
+build_features: []
+custom_classes: []
+"""
+            )
+
+    def tearDown(self):
+        self.temp_dir.cleanup()
+
+    def test_generates_correct_header(self) -> None:
+        gen_selected_op_variants.write_selected_op_variants(
+            os.path.join(self.temp_dir.name, "selected_operators.yaml"),
+            self.temp_dir.name,
+        )
+        with open(
+            os.path.join(self.temp_dir.name, "selected_op_variants.h"), "r"
+        ) as result:
+            self.assertExpectedInline(
+                result.read(),
+                """#pragma once
+/**
+ * Generated by executorch/codegen/tools/gen_selected_op_variants.py
+ */
+
+inline constexpr bool should_include_kernel_dtype(
+  const char *operator_name,
+  executorch::aten::ScalarType scalar_type
+) {
+  return ((executorch::aten::string_view(operator_name).compare("add.out") == 0)
+   && (scalar_type == executorch::aten::ScalarType::Float || scalar_type == executorch::aten::ScalarType::Int))
+   || ((executorch::aten::string_view(operator_name).compare("mul.out") == 0)
+   && (scalar_type == executorch::aten::ScalarType::Float))
+   || ((executorch::aten::string_view(operator_name).compare("sub.out") == 0)
+   && (true));
+}
+""",
+            )
+
+
+class TestGenSelectedOpVariants_Empty(expecttest.TestCase):
     def setUp(self):
         self.temp_dir = tempfile.TemporaryDirectory()
         self.addCleanup(self.temp_dir.cleanup)
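
For orientation, the kernel keys in these fixtures follow the format documented in runtime/kernel/operator_registry.h: a `v1/` version prefix, then one `dtype;dim_order` entry per tensor, joined by `|`. A hedged parsing sketch (the helper name is hypothetical, not part of the commit):

```python
def parse_kernel_key(key: str) -> list[tuple[int, list[int]]]:
    """Split "v1/6;0,1|3;0,1" into [(dtype_enum, dim_order), ...]."""
    version, _, tensors = key.partition("/")
    assert version == "v1", f"unexpected kernel key version: {version}"
    parsed = []
    for entry in filter(None, tensors.split("|")):
        dtype, _, dims = entry.partition(";")
        parsed.append((int(dtype), [int(d) for d in dims.split(",")] if dims else []))
    return parsed

print(parse_kernel_key("v1/6;0,1|3;0,1"))  # [(6, [0, 1]), (3, [0, 1])]
print(parse_kernel_key("v1/6;"))           # [(6, [])] - dtype-only key from create_kernel_key
```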

docs/source/llm/getting-started.md

Lines changed: 22 additions & 24 deletions

@@ -434,18 +434,18 @@ to the backend(s) targeted at export. To support multiple devices, such as
 XNNPACK acceleration for Android and Core ML for iOS, export a separate PTE file
 for each backend.

-To delegate to a backend at export time, ExecuTorch provides the `to_backend()`
-function in the `EdgeProgramManager` object, which takes a backend-specific
-partitioner object. The partitioner is responsible for finding parts of the
-computation graph that can be accelerated by the target backend,and
-`to_backend()` function will delegate matched part to given backend for
-acceleration and optimization. Any portions of the computation graph not
-delegated will be executed by the ExecuTorch operator implementations.
+To delegate a model to a specific backend during export, ExecuTorch uses the
+`to_edge_transform_and_lower()` function. This function takes the exported program
+from `torch.export` and a backend-specific partitioner object. The partitioner
+identifies parts of the computation graph that can be optimized by the target
+backend. Within `to_edge_transform_and_lower()`, the exported program is
+converted to an edge dialect program. The partitioner then delegates compatible
+graph sections to the backend for acceleration and optimization. Any graph parts
+not delegated are executed by ExecuTorch's default operator implementations.

 To delegate the exported model to a specific backend, we need to import its
 partitioner as well as edge compile config from ExecuTorch codebase first, then
-call `to_backend` with an instance of partitioner on the `EdgeProgramManager`
-object `to_edge` function created.
+call `to_edge_transform_and_lower`.

 Here's an example of how to delegate nanoGPT to XNNPACK (if you're deploying to an Android phone for instance):

@@ -457,7 +457,7 @@ from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPar

 # Model to be delegated to specific backend should use specific edge compile config
 from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config
-from executorch.exir import EdgeCompileConfig, to_edge
+from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower

 import torch
 from torch.export import export

@@ -495,17 +495,14 @@ with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
 # Convert the model into a runnable ExecuTorch program.
 # To be further lowered to Xnnpack backend, `traced_model` needs xnnpack-specific edge compile config
 edge_config = get_xnnpack_edge_compile_config()
-edge_manager = to_edge(traced_model, compile_config=edge_config)
-
-# Delegate exported model to Xnnpack backend by invoking `to_backend` function with Xnnpack partitioner.
-edge_manager = edge_manager.to_backend(XnnpackPartitioner())
+# Converted to edge program and then delegate exported model to Xnnpack backend
+# by invoking `to` function with Xnnpack partitioner.
+edge_manager = to_edge_transform_and_lower(traced_model, partitioner = [XnnpackPartitioner()], compile_config = edge_config)
 et_program = edge_manager.to_executorch()

 # Save the Xnnpack-delegated ExecuTorch program to a file.
 with open("nanogpt.pte", "wb") as file:
     file.write(et_program.buffer)
-
-
 ```

 Additionally, update CMakeLists.txt to build and link the XNNPACK backend to

@@ -651,8 +648,8 @@ DuplicateDynamicQuantChainPass()(m)
 traced_model = export(m, example_inputs)
 ```

-Additionally, add or update the `to_backend()` call to use `XnnpackPartitioner`. This instructs ExecuTorch to
-optimize the model for CPU execution via the XNNPACK backend.
+Additionally, add or update the `to_edge_transform_and_lower()` call to use `XnnpackPartitioner`. This
+instructs ExecuTorch to optimize the model for CPU execution via the XNNPACK backend.

 ```python
 from executorch.backends.xnnpack.partition.xnnpack_partitioner import (

@@ -661,8 +658,9 @@ from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
 )

 ```python
-edge_manager = to_edge(traced_model, compile_config=edge_config)
-edge_manager = edge_manager.to_backend(XnnpackPartitioner()) # Lower to XNNPACK.
+edge_config = get_xnnpack_edge_compile_config()
+# Convert to edge dialect and lower to XNNPack.
+edge_manager = to_edge_transform_and_lower(traced_model, partitioner = [XnnpackPartitioner()], compile_config = edge_config)
 et_program = edge_manager.to_executorch()
 ```

@@ -682,20 +680,20 @@ target_link_libraries(
 For more information, see [Quantization in ExecuTorch](../quantization-overview.md).

 ## Profiling and Debugging
-After lowering a model by calling `to_backend()`, you may want to see what got delegated and what didn’t. ExecuTorch
+After lowering a model by calling `to_edge_transform_and_lower()`, you may want to see what got delegated and what didn’t. ExecuTorch
 provides utility methods to give insight on the delegation. You can use this information to gain visibility into
 the underlying computation and diagnose potential performance issues. Model authors can use this information to
 structure the model in a way that is compatible with the target backend.

 ### Visualizing the Delegation

-The `get_delegation_info()` method provides a summary of what happened to the model after the `to_backend()` call:
+The `get_delegation_info()` method provides a summary of what happened to the model after the `to_edge_transform_and_lower()` call:

 ```python
 from executorch.devtools.backend_debug import get_delegation_info
 from tabulate import tabulate

-# ... After call to to_backend(), but before to_executorch()
+# ... After call to to_edge_transform_and_lower(), but before to_executorch()
 graph_module = edge_manager.exported_program().graph_module
 delegation_info = get_delegation_info(graph_module)
 print(delegation_info.get_summary())

@@ -762,7 +760,7 @@ Through the ExecuTorch Developer Tools, users are able to profile model executio
 An ETRecord is an artifact generated at the time of export that contains model graphs and source-level metadata linking the ExecuTorch program to the original PyTorch model. You can view all profiling events without an ETRecord, though with an ETRecord, you will also be able to link each event to the types of operators being executed, module hierarchy, and stack traces of the original PyTorch source code. For more information, see [the ETRecord docs](../etrecord.md).


-In your export script, after calling `to_edge()` and `to_executorch()`, call `generate_etrecord()` with the `EdgeProgramManager` from `to_edge()` and the `ExecuTorchProgramManager` from `to_executorch()`. Make sure to copy the `EdgeProgramManager`, as the call to `to_backend()` mutates the graph in-place.
+In your export script, after calling `to_edge()` and `to_executorch()`, call `generate_etrecord()` with the `EdgeProgramManager` from `to_edge()` and the `ExecuTorchProgramManager` from `to_executorch()`. Make sure to copy the `EdgeProgramManager`, as the call to `to_edge_transform_and_lower()` mutates the graph in-place.

 ```
 # export_nanogpt.py
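
The doc snippet above is truncated by the diff context. As a hedged sketch of the export-script addition it describes (the `etrecord.bin` path and deepcopy placement are illustrative assumptions, following the `generate_etrecord(path, edge_program, executorch_program)` call the docs refer to):

```python
import copy

from executorch.devtools import generate_etrecord

# Copy the edge program before further lowering/mutation, then finalize.
edge_manager_copy = copy.deepcopy(edge_manager)
et_program = edge_manager.to_executorch()

# Link the edge-dialect graph to the final program for profiling/debugging.
generate_etrecord("etrecord.bin", edge_manager_copy, et_program)
```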

examples/qualcomm/oss_scripts/llama/TARGETS

Lines changed: 2 additions & 1 deletion

@@ -22,12 +22,12 @@ python_library(
         "//caffe2:torch",
         "//executorch/backends/qualcomm/partition:partition",
         "//executorch/backends/qualcomm/quantizer:quantizer",
+        "//executorch/devtools/backend_debug:delegation_info",
         "//executorch/devtools:lib",
         "//executorch/examples/models:models",
         "//executorch/examples/qualcomm/oss_scripts/llama:static_llama",
         "//executorch/examples/qualcomm:utils",
         "//executorch/extension/export_util:export_util",
-        "//executorch/extension/llm/custom_ops:model_sharding_py",
         "//executorch/extension/llm/export:export_lib",
         "//executorch/extension/pybindings:aten_lib",
     ],

@@ -46,6 +46,7 @@ python_binary(
         "//executorch/extension/pybindings:aten_lib",
         "//executorch/backends/qualcomm/partition:partition",
         "//executorch/backends/qualcomm/quantizer:quantizer",
+        "//executorch/devtools/backend_debug:delegation_info",
         "//executorch/devtools:lib",
         "//executorch/examples/models:models",
         "//executorch/examples/qualcomm:utils",
