Closed

Commits (67), changes from all commits
0f4caa1
[flamingo] Update preproc imports (#5160)
lucylq Sep 9, 2024
2dee34e
Refactor namespace usage in module tests.
shoumikhin Sep 9, 2024
647bfd4
Add an overload to skip dtype and sizes.
shoumikhin Sep 9, 2024
b52d4b6
Enable Llama3 Multi-turn conversation
cmodi-meta Sep 9, 2024
cd9d536
Make convert to linear an export pass
mcr229 Sep 9, 2024
b69ae0c
Hide and simplify operator registry internals
dbort Sep 9, 2024
6b1e328
[ExecuTorch] Support BFloat16 in CPUBlas gemm
swolchok Sep 9, 2024
c634f14
FFHT enhancements to fast hadamard transform kernels
swolchok Sep 9, 2024
eca9ed5
q to s start ops | add dim order sanity check
Gasoonjia Sep 9, 2024
85410e4
Qualcomm AI Engine Direct - Optimization and fix mutable buffer issue…
shewu-quic Sep 9, 2024
3858dca
Update base for Update on "FFHT enhancements to fast hadamard transfo…
swolchok Sep 9, 2024
ed45a66
Update on "FFHT enhancements to fast hadamard transform kernels"
swolchok Sep 9, 2024
d2014e3
Add a target rule for ops_registrations (#5083)
LeeOHzzZ Sep 9, 2024
b23ee01
Register LLM prefill native method in JNI
kirklandsign Sep 9, 2024
28beeff
Clean up devtools/etdump
dbort Sep 9, 2024
dc4f9fc
Update base for Update on "FFHT enhancements to fast hadamard transfo…
swolchok Sep 9, 2024
0c95f18
Update on "FFHT enhancements to fast hadamard transform kernels"
swolchok Sep 9, 2024
6ce9f52
t to z start ops | add dim order sanity check
Gasoonjia Sep 9, 2024
542ecb5
Add Echo parameter to multimodal runner (llava) and jni layer (#5181)
cmodi-meta Sep 9, 2024
59d9bad
Use c++17 for size test
lucylq Sep 9, 2024
7650667
Add a default delegate time scale converter
Olivia-liu Sep 10, 2024
f412630
Qualcomm AI Engine Direct - Uplevel QNN version for ci test (#5174)
shewu-quic Sep 10, 2024
c5a385e
Update schema to include infinity for double values
lucylq Sep 10, 2024
f471556
Partition Mutable Buffer as Core ML State (#5165)
YifanShenSZ Sep 10, 2024
67ae762
Qualcomm AI Engine Direct - Add the argument to specify soc model (#5…
shewu-quic Sep 10, 2024
63e794a
Add pass to convert special case of mean.dim to averagepool2d
per Sep 10, 2024
370f304
Add slice_scatter test: large end value
manuelcandales Sep 10, 2024
083b9e6
[ET-VK] Fix gpuinfo CI
junpi3 Sep 10, 2024
1eeded1
Let the app check "aatp/data" subdir for AWS.
shoumikhin Sep 10, 2024
126abb5
Update the API of registering fake kernels to new standard (#5084)
LeeOHzzZ Sep 10, 2024
657789e
Qualcomm AI Engine Direct - Apply spin quant R1 and R2 (#5175)
shewu-quic Sep 10, 2024
549f14b
Restore constant segment
lucylq Sep 10, 2024
e826de3
Add Half/BFloat16 tests for op_mul
manuelcandales Sep 10, 2024
43e2f2d
Qualcomm AI Engine Direct - support skip quantization (#5070)
haowhsu-quic Sep 10, 2024
30acae5
Switch over backend tests to export_for_training
tarun292 Sep 10, 2024
db34239
[LLava] Fix stats for C++ runner
digantdesai Sep 10, 2024
02304d7
Update bundled_program to use new namespace
dbort Sep 10, 2024
c76b22f
Qualcomm AI Engine Direct - Fixed the order of the transforms for lla…
shewu-quic Sep 10, 2024
d38ca81
Android refactor cmake build
kirklandsign Sep 10, 2024
a4d67e2
Android: Leverage prefillPrompt and prefillImage on Llava
Riandy Sep 10, 2024
b54206d
Update the minimum C++ version to C++17
dbort Sep 10, 2024
4ce0f9d
Introduce PlatformMemoryAllocator
manuelcandales Sep 10, 2024
2b50c76
Use dynamic bound by default.
shoumikhin Sep 10, 2024
ced40f4
Fix models in benchinfra (#5226)
guangy10 Sep 10, 2024
e245590
App side change
kirklandsign Sep 10, 2024
4cce620
Minor fix: Create root dir when it doesn't exist. (#5075)
freddan80 Sep 10, 2024
ab6d91c
Fix internal executorch_llama_jni
kirklandsign Sep 10, 2024
f07e4d5
Update setup-with-qnn.sh with runner util flag (#5210)
WuhanMonkey Sep 10, 2024
cac2c05
[ET-VK] Integrate axis mapping into optimized matrix multiplication s…
SS-JIA Sep 10, 2024
cba5bee
fbshipit-source-id: f63634ba171da01328849d84552b125b829403e8
facebook-github-bot Sep 11, 2024
ca889fb
Minibench use model_dir instead (#5250)
kirklandsign Sep 11, 2024
e4d72ce
Update setup.sh for LlamaDemo (#5235)
kirklandsign Sep 11, 2024
d423131
Android app UI/flow improvements (#5241)
Riandy Sep 11, 2024
7942d2c
Allow core aten op exception list (#5237)
larryliu0820 Sep 11, 2024
69aed24
link whole quantized_ops_lib (#5253)
kirklandsign Sep 11, 2024
41bc1ce
spinquant in eager mode (#5125)
Sep 11, 2024
d7a7ec6
Updated the workflow to upload models to S3 (#5232)
Sep 11, 2024
7e374d7
Add model execution scripts and runner (#5217)
neuropilot-captain Sep 11, 2024
af80804
Debug event populates event name (#5142)
Olivia-liu Sep 11, 2024
68397af
Optimized op_mm using CPUBlas gemm (#5242)
swolchok Sep 11, 2024
d73a653
Add optimized op_linear (#5243)
swolchok Sep 11, 2024
3171ede
Add scalar tensor tests. (#5260)
shoumikhin Sep 11, 2024
4da3c5d
Add CoreML Quantize (#5228)
Sep 11, 2024
d6b800b
Add helper function to create empty, full, ones and zeros tensors. (#…
shoumikhin Sep 11, 2024
75a56a2
Add helper function to create random tensors. (#5266)
shoumikhin Sep 11, 2024
750625e
Update base for Update on "FFHT enhancements to fast hadamard transfo…
swolchok Sep 11, 2024
5229d4e
Update on "FFHT enhancements to fast hadamard transform kernels"
swolchok Sep 11, 2024
2 changes: 1 addition & 1 deletion .ci/scripts/build-qnn-sdk.sh
@@ -11,7 +11,7 @@ set -o xtrace
build_qnn_backend() {
echo "Start building qnn backend."
export ANDROID_NDK_ROOT=/opt/ndk
export QNN_SDK_ROOT=/tmp/qnn/2.23.0.240531
export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728
export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"

bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release
26 changes: 24 additions & 2 deletions .ci/scripts/setup-qnn-deps.sh
@@ -7,14 +7,18 @@

set -ex

verify_pkg_installed() {
echo $(dpkg-query -W --showformat='${Status}\n' $1|grep "install ok installed")
}

install_qnn() {
echo "Start installing qnn."
QNN_INSTALLATION_DIR=/tmp/qnn
mkdir -p "${QNN_INSTALLATION_DIR}"

curl -Lo /tmp/v2.23.0.24.06.24.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.23.0.24.06.24.zip"
curl -Lo /tmp/v2.25.0.24.07.28.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.25.0.240728.zip"
echo "Finishing downloading qnn sdk."
unzip -qo /tmp/v2.23.0.24.06.24.zip -d /tmp
unzip -qo /tmp/v2.25.0.24.07.28.zip -d /tmp
echo "Finishing unzip qnn sdk."


@@ -26,4 +30,22 @@ install_qnn() {
ls -lah "${QNN_INSTALLATION_DIR}"
}

setup_libc++() {
sudo apt-get update
pkgs_to_check=('libc++-dev')
j=0
while [ $j -lt ${#pkgs_to_check[*]} ]; do
install_status=$(verify_pkg_installed ${pkgs_to_check[$j]})
if [ "$install_status" == "" ]; then
sudo apt-get install -y ${pkgs_to_check[$j]}
if [[ $? -ne 0 ]]; then
echo "ERROR: Failed to install required packages for libc++"
exit 1
fi
fi
j=$(( $j +1));
done
}

setup_libc++
install_qnn
2 changes: 1 addition & 1 deletion .ci/scripts/test_llama.sh
@@ -75,7 +75,7 @@ echo "COREML option ${COREML}"
if [[ "${MODE}" =~ .*qnn.* ]]; then
QNN=ON
export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
export QNN_SDK_ROOT=/tmp/qnn/2.23.0.240531
export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728
export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang"
export PYTHONPATH=".."
cp schema/program.fbs exir/_serialize/program.fbs
1 change: 1 addition & 0 deletions .ci/scripts/test_llava.sh
@@ -33,6 +33,7 @@ if hash nproc &> /dev/null; then NPROC=$(nproc); fi
EXECUTORCH_COMMON_CMAKE_ARGS=" \
-DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-DEXECUTORCH_ENABLE_LOGGING=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
8 changes: 7 additions & 1 deletion .ci/scripts/test_model.sh
@@ -209,7 +209,13 @@ elif [[ "${BACKEND}" == "coreml" ]]; then
fi
elif [[ "${BACKEND}" == "xnnpack" ]]; then
echo "Testing ${MODEL_NAME} with xnnpack..."
test_model_with_xnnpack true true
WITH_QUANTIZATION=true
WITH_DELEGATION=true
if [[ "$MODEL_NAME" == "mobilebert" ]]; then
# TODO(T197452682)
WITH_QUANTIZATION=false
fi
test_model_with_xnnpack "${WITH_QUANTIZATION}" "${WITH_DELEGATION}"
if [[ $? -eq 0 ]]; then
prepare_artifacts_upload
fi
1 change: 1 addition & 0 deletions .github/workflows/android-perf.yml
@@ -178,6 +178,7 @@ jobs:
upload-models:
needs: export-models
runs-on: linux.2xlarge
if: always() # Continue this job regardless of previous job outcome
steps:
- name: Download the models from GitHub
uses: actions/download-artifact@v3
3 changes: 3 additions & 0 deletions .github/workflows/apple-perf.yml
@@ -165,6 +165,8 @@ jobs:
# Test llama2
if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then
DELEGATE_CONFIG="xnnpack+custom+qe"
elif [[ ${{ matrix.delegate }} == "coreml" ]]; then
DELEGATE_CONFIG="coreml"
fi
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
bash .ci/scripts/test_llama.sh "${{ matrix.model }}" "${BUILD_MODE}" "${DTYPE}" "${DELEGATE_CONFIG}" "${ARTIFACTS_DIR_NAME}"
@@ -177,6 +179,7 @@
upload-models:
needs: export-models
runs-on: linux.2xlarge
if: always() # Continue this job regardless of previous job outcome
steps:
- name: Download the models from GitHub
uses: actions/download-artifact@v3
4 changes: 4 additions & 0 deletions .lintrunner.toml
@@ -74,6 +74,8 @@ exclude_patterns = [
# NB: Objective-C is not supported
'examples/apple/**',
'examples/demo-apps/apple_ios/**',
# File contains @generated
'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h',
]
command = [
'python',
@@ -177,6 +179,8 @@ exclude_patterns = [
'**/*.bat',
'**/*.jpg',
'**/*.jar',
# File contains @generated
'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h',
]
command = [
'python',
4 changes: 1 addition & 3 deletions CONTRIBUTING.md
@@ -131,9 +131,7 @@ for detailed advice.

#### C++ language version

**C++11.**

NOTE: The code does not yet fully conform to this, and some files require C++17.
**C++17.**

Rationale: This is a compromise between being compatible with older, proprietary
toolchains, and having access to relatively modern C++ features.
61 changes: 58 additions & 3 deletions backends/apple/coreml/compiler/coreml_preprocess.py
@@ -3,6 +3,7 @@
# CoreML backend for delegating a EdgeProgram to CoreML.

import json
import logging

import shutil
import uuid
@@ -14,6 +15,7 @@
from typing import Any, Dict, final, List, Optional, Tuple

import coremltools as ct
import coremltools.optimize as cto
import executorchcoreml

from executorch.exir.backend.backend_details import (
@@ -23,12 +25,16 @@
)
from executorch.exir.backend.compile_spec_schema import CompileSpec

logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)


class COMPILE_SPEC_KEYS(Enum):
COMPUTE_UNITS = "compute_units"
MODEL_TYPE = "model_type"
MIN_DEPLOYMENT_TARGET = "min_deployment_target"
MODEL_COMPUTE_PRECISION = "model_compute_precision"
OP_LINEAR_QUANTIZER_CONFIG = "op_linear_quantizer_config"


class MODEL_PATHS(Enum):
@@ -169,12 +175,44 @@ def generate_compute_unit_compile_spec(
compute_unit.name.lower().encode("utf-8"),
)

@staticmethod
def generate_op_linear_quantizer_config_compile_spec(
op_linear_quantizer_config: Dict,
) -> CompileSpec:
"""
Returns the compile spec representing the model post conversion quantization,
which is a dict that will construct cto.coreml.OpLinearQuantizerConfig
"""
str_representation = json.dumps(op_linear_quantizer_config)
byte_representation = str_representation.encode("utf-8")
return CompileSpec(
COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value,
byte_representation,
)

@staticmethod
def op_linear_quantizer_config_from_compile_specs(
compile_specs: List[CompileSpec],
) -> cto.coreml.OpLinearQuantizerConfig:
"""
Returns the model's post conversion quantization by parsing the list of compile specs.
"""
for compile_spec in compile_specs:
if compile_spec.key == COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value:
config_dict_str = compile_spec.value.decode("utf-8")
config_dict = json.loads(config_dict_str)
config = cto.coreml.OpLinearQuantizerConfig._from_dict(config_dict)
return config

return None

@staticmethod
def generate_compile_specs(
compute_unit: ct.ComputeUnit = ct.ComputeUnit.ALL,
minimum_deployment_target: ct.target = ct.target.iOS15,
compute_precision: ct.precision = ct.precision.FLOAT16,
model_type: MODEL_TYPE = MODEL_TYPE.MODEL,
op_linear_quantizer_config: Optional[Dict] = None,
) -> List[CompileSpec]:
"""
Returns the list of compile specs that's used by CoreMLBackend to lower the module.
@@ -192,6 +230,12 @@ def generate_compile_specs(
CoreMLBackend.generate_compute_precision_compile_spec(compute_precision)
)
compile_specs.append(CoreMLBackend.generate_model_type_compile_spec(model_type))
if op_linear_quantizer_config is not None:
compile_specs.append(
CoreMLBackend.generate_op_linear_quantizer_config_compile_spec(
op_linear_quantizer_config
)
)

return compile_specs

@@ -368,18 +412,18 @@ def preprocess(
compile_specs,
)
)

model_compute_precision: ct.precision = (
CoreMLBackend.model_compute_precision_from_compile_specs(compile_specs)
)

minimum_deployment_target: ct.target = (
CoreMLBackend.min_deployment_target_from_compile_specs(compile_specs)
)

compute_units: ct.ComputeUnit = CoreMLBackend.compute_unit_from_compile_specs(
compile_specs
)
op_linear_quantizer_config = (
CoreMLBackend.op_linear_quantizer_config_from_compile_specs(compile_specs)
)

mlmodel = ct.convert(
model=edge_program,
Expand All @@ -392,4 +436,15 @@ def preprocess(
compute_units=compute_units,
)

if op_linear_quantizer_config is not None:
logger.warning(
"Core ML Backend op_linear_quantizer_config API is experimental"
)
config = cto.coreml.OptimizationConfig(
global_config=op_linear_quantizer_config,
# skip embedding
op_type_configs={"gather": None},
)
mlmodel = cto.coreml.linear_quantize_weights(mlmodel, config=config)

return CoreMLBackend.preprocess_model(mlmodel, model_type=model_type)
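A hedged usage sketch of the new op_linear_quantizer_config compile spec added above (Python). The generate_compile_specs and CoreMLPartitioner calls come from this diff; the dict keys and values (mode, dtype, granularity, block_size) are illustrative assumptions based on coremltools' OpLinearQuantizerConfig, not values taken from this PR.

import coremltools as ct

from executorch.backends.apple.coreml.compiler import CoreMLBackend
from executorch.backends.apple.coreml.partition import CoreMLPartitioner

# Build compile specs that request post-conversion weight quantization.
# The dict is serialized into the OP_LINEAR_QUANTIZER_CONFIG compile spec and
# rebuilt via cto.coreml.OpLinearQuantizerConfig._from_dict in preprocess().
compile_specs = CoreMLBackend.generate_compile_specs(
    minimum_deployment_target=ct.target.iOS17,
    compute_precision=ct.precision.FLOAT16,
    op_linear_quantizer_config={
        "mode": "linear_symmetric",  # assumed OpLinearQuantizerConfig field
        "dtype": "int4",             # assumed
        "granularity": "per_block",  # assumed
        "block_size": 32,            # assumed
    },
)
partitioner = CoreMLPartitioner(compile_specs=compile_specs)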
13 changes: 12 additions & 1 deletion backends/apple/coreml/partition/coreml_partitioner.py
@@ -17,7 +17,7 @@
Partitioner,
PartitionResult,
)
from executorch.exir.backend.utils import tag_constant_data
from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer
from torch.export.exported_program import ExportedProgram
from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
from torch.fx.passes.operator_support import OperatorSupportBase
@@ -61,6 +61,7 @@ def __init__(
self,
skip_ops_for_coreml_delegation: Optional[List[str]] = None,
compile_specs: Optional[List[CompileSpec]] = None,
take_over_mutable_buffer: Optional[bool] = True,
) -> None:
if skip_ops_for_coreml_delegation is None:
skip_ops_for_coreml_delegation = []
@@ -69,6 +70,7 @@ def __init__(
backend_id=CoreMLBackend.__name__,
compile_specs=compile_specs if compile_specs is not None else [],
)
self.take_over_mutable_buffer = take_over_mutable_buffer

def partition(self, exported_program: ExportedProgram) -> PartitionResult:
# Run the CapabilityBasedPartitioner to return the largest possible
@@ -89,6 +91,15 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
partition_tags[tag] = self.delegation_spec

tag_constant_data(exported_program)
if self.take_over_mutable_buffer:
logger.info(
"Core ML partitioner will take over torch mutable buffer as Core ML state, "
"so if your model contains mutable buffer, "
"then you will need MacOS15+/iOS18+ to execute. "
"If you want your mutable buffer model to be compatible with older OS, "
"then please set `take_over_mutable_buffer=False`"
)
tag_mutated_buffer(exported_program)

return PartitionResult(
tagged_exported_program=exported_program, partition_tags=partition_tags
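A minimal sketch of the new take_over_mutable_buffer flag (Python). TinyModel and the tensor shape are made-up placeholders; the flag and the to_edge/to_backend flow come from this diff, where the default of True lowers mutable buffers to Core ML state as described in the warning above.

import torch

import executorch.exir

from executorch.backends.apple.coreml.partition import CoreMLPartitioner


class TinyModel(torch.nn.Module):
    def forward(self, x):
        return x + 1.0


exported = torch.export.export(TinyModel().eval(), (torch.randn(4),))

# Keep mutable buffers out of Core ML state so the lowered model also runs on
# macOS < 15 / iOS < 18, per the warning added in partition() above.
partitioner = CoreMLPartitioner(take_over_mutable_buffer=False)
delegated = executorch.exir.to_edge(exported).to_backend(partitioner)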
7 changes: 6 additions & 1 deletion backends/apple/coreml/scripts/install_requirements.sh
@@ -24,7 +24,7 @@ rm -rf "$COREML_DIR_PATH/third-party"
mkdir "$COREML_DIR_PATH/third-party"

echo "${green}ExecuTorch: Cloning coremltools."
git clone --depth 1 --branch 8.0b1 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH
git clone --depth 1 --branch 8.0b2 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH
cd $COREMLTOOLS_DIR_PATH

STATUS=$?
@@ -47,6 +47,11 @@ cmake --build "$COREMLTOOLS_DIR_PATH/build" --parallel

echo "${green}ExecuTorch: Installing coremltools."
pip install "$COREMLTOOLS_DIR_PATH"
# CoreMLTools have started supporting numpy 2.0,
# but ExecuTorch example model test env is still using older transformers,
# so for now we will need to downgrade numpy to 1.x
# TODO: Remove this numpy downgrade once later transformers starts to be used
pip install numpy==1.26.4
STATUS=$?
if [ $STATUS -ne 0 ]; then
echo "${red}ExecuTorch: Failed to install coremltools."
49 changes: 49 additions & 0 deletions backends/apple/coreml/test/test_coreml_partitioner.py
@@ -4,11 +4,14 @@

import unittest

import coremltools as ct

import executorch.exir

import torch
import torchvision

from executorch.backends.apple.coreml.compiler import CoreMLBackend
from executorch.backends.apple.coreml.partition import CoreMLPartitioner


@@ -86,8 +89,54 @@ def test_vit_skip_conv(self):
if node.op == "call_function"
] == total

def test_buffer(self):
embedding_dim = 3
max_seq_len = 2

class Model(torch.nn.Module):
def __init__(self):
super().__init__()
self.register_buffer(
"cache",
torch.zeros((max_seq_len, embedding_dim), dtype=torch.float32),
)

def forward(self, q, k_val, input_pos):
q_T = q.transpose(0, 1)
k = torch.ops.aten.index_put_(self.cache, [input_pos, None], k_val)
attn = k.mm(q_T)
return attn

model = Model()
model.eval()

q = torch.randn((1, embedding_dim))
k_val = torch.randn((1, embedding_dim))
input_pos = torch.tensor([0])
example_inputs = (q, k_val, input_pos)
exir_program_aten = torch.export.export(model, example_inputs)

compile_specs = CoreMLBackend.generate_compile_specs(
minimum_deployment_target=ct.target.iOS18
)
partitioner = CoreMLPartitioner(compile_specs=compile_specs)
edge_program_manager = executorch.exir.to_edge(
exir_program_aten, compile_config=self.edge_compile_config
)
delegated_program_manager = edge_program_manager.to_backend(partitioner)

assert [
node.target.__name__
for node in delegated_program_manager.exported_program().graph.nodes
if node.op == "call_function"
] == [
"executorch_call_delegate",
"getitem",
]


if __name__ == "__main__":
test_runner = TestCoreMLPartitioner()
test_runner.test_add_sub_skip_mm()
test_runner.test_vit_skip_conv()
test_runner.test_buffer()