This PR:

* Renames `EXECUTORCH_BUILD_TORCHAO` to `EXECUTORCH_BUILD_KERNELS_TORCHAO` to be more in line with other kernel options (e.g., `EXECUTORCH_BUILD_KERNELS_OPTIMIZED`)
* Fixes torchao lowbit kernel dependencies in xcframeworks
* Adds torchao lowbit kernels to the Swift package
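With the rename, the option is enabled at configure time the same way as the other kernel flags. A hypothetical CMake invocation is sketched below; the build directory and the combination of flags are placeholders, not a verified recipe from this PR:

```shell
# Sketch: enable the renamed torchao kernels option alongside the other
# kernel libraries. Build dir and flag combination are illustrative only.
cmake -S . -B cmake-out \
  -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
  -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
  -DEXECUTORCH_BUILD_KERNELS_TORCHAO=ON
```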
**`docs/source/using-executorch-ios.md`** (1 addition, 0 deletions)
```diff
@@ -14,6 +14,7 @@ The ExecuTorch Runtime for iOS and macOS (ARM64) is distributed as a collection
 * `kernels_llm` - Custom kernels for LLMs
 * `kernels_optimized` - Accelerated generic CPU kernels
 * `kernels_quantized` - Quantized kernels
+* `kernels_torchao` - Quantized CPU kernels from torchao
 
 Link your binary with the ExecuTorch runtime and any backends or kernels used by the exported ML model. It is recommended to link the core runtime to the components that use ExecuTorch directly, and link kernels and backends against the main app target.
```
**`examples/models/llama/README.md`** (8 additions, 8 deletions)
```diff
@@ -340,11 +340,13 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de
 
 ## Running with low-bit kernels
 
-We now give instructions for quantizating and running your model with low-bit kernels. These are still experimental, and require you do development on an Arm-based Mac, and install executorch from source with the environment variable EXECUTORCH_BUILD_TORCHAO=1 defined:
+We now give instructions for quantizating and running your model with low-bit kernels. These are still experimental, and require you do development on an Arm-based Mac, and install executorch from source with the environment variable EXECUTORCH_BUILD_KERNELS_TORCHAO=1 defined:
+(If you'd like lowbit to use KleidiAI when available, you can instead install with `EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_KLEIDIAI=1 python install_executorch.py`.)
 
 Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results.
 
 First export your model for lowbit quantization (step 2 above):
```
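Putting the updated README text together, a from-source install on an Arm-based Mac that enables the low-bit kernels would run roughly as follows. This is a sketch assuming a checkout of the executorch repository; only the environment variables and the `install_executorch.py` entry point come from the diff above:

```shell
# From a checkout of the executorch repo on an Arm-based Mac:
# enable the experimental torchao low-bit kernels at install time.
EXECUTORCH_BUILD_KERNELS_TORCHAO=1 python install_executorch.py

# Or, to let lowbit use KleidiAI when available:
EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_KLEIDIAI=1 python install_executorch.py
```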