15 changes: 5 additions & 10 deletions .ci/scripts/test_llama_torchao_lowbit.sh
@@ -29,27 +29,22 @@ cmake -DPYTHON_EXECUTABLE=python \
     -DEXECUTORCH_ENABLE_LOGGING=1 \
     -DCMAKE_BUILD_TYPE=Release \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_BUILD_XNNPACK=OFF \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+    -DEXECUTORCH_BUILD_KERNELS_TORCHAO=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
     -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
     -Bcmake-out .
-cmake --build cmake-out -j16 --target install --config Release
+cmake --build cmake-out -j16 --config Release --target install
 
 # Install llama runner with torchao
 cmake -DPYTHON_EXECUTABLE=python \
-    -DBUILD_TESTING=OFF \
     -DCMAKE_BUILD_TYPE=Release \
-    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_XNNPACK=OFF \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -DEXECUTORCH_BUILD_TORCHAO=ON \
     -Bcmake-out/examples/models/llama \
     examples/models/llama
 cmake --build cmake-out/examples/models/llama -j16 --config Release
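With the torchao kernels now configured in the top-level ExecuTorch build, the runner configure below no longer repeats the kernel flags. A minimal sketch for reproducing this CI step locally, assuming a source checkout on an Arm-based Mac (the constraint the README states for these kernels):

```bash
# Sketch: run the updated CI script from the repository root.
bash .ci/scripts/test_llama_torchao_lowbit.sh
```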
2 changes: 1 addition & 1 deletion .github/workflows/trunk.yml
@@ -485,7 +485,7 @@ jobs:
       eval "$(conda shell.bash hook)"
       # Install requirements
-      ${CONDA_RUN} EXECUTORCH_BUILD_TORCHAO=1 python install_executorch.py
+      ${CONDA_RUN} EXECUTORCH_BUILD_KERNELS_TORCHAO=1 python install_executorch.py
       ${CONDA_RUN} sh examples/models/llama/install_requirements.sh
       # Run test
72 changes: 49 additions & 23 deletions CMakeLists.txt
@@ -278,29 +278,6 @@ if(EXECUTORCH_BUILD_PTHREADPOOL)
   )
 endif()
 
-if(EXECUTORCH_BUILD_KERNELS_TORCHAO)
-  set(TORCHAO_BUILD_ATEN_OPS OFF)
-  set(TORCHAO_BUILD_EXECUTORCH_OPS ON)
-  set(TORCHAO_BUILD_CPU_AARCH64 ON)
-  set(TORCHAO_ENABLE_ARM_NEON_DOT ON)
-
-  list(
-    APPEND
-    TORCHAO_INCLUDE_DIRS
-    ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
-    ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
-    ${EXECUTORCH_ROOT}/third-party/ao
-  )
-
-  set(EXECUTORCH_INCLUDE_DIRS ${TORCHAO_INCLUDE_DIRS})
-
-  add_subdirectory(
-    ${CMAKE_CURRENT_SOURCE_DIR}/third-party/ao/torchao/experimental
-  )
-  executorch_target_link_options_shared_lib(torchao_ops_executorch)
-  list(APPEND _executorch_kernels torchao_ops_executorch)
-endif()
-
 if(EXECUTORCH_BUILD_TESTS)
   set(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
   include(CTest)
@@ -705,6 +682,55 @@ if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool)
 endif()
 
+if(EXECUTORCH_BUILD_KERNELS_TORCHAO)
+  if(NOT TARGET cpuinfo)
+    message(FATAL_ERROR "EXECUTORCH_BUILD_KERNELS_TORCHAO requires EXECUTORCH_BUILD_CPUINFO to be set ON")
+  endif()
+  if(NOT TARGET pthreadpool)
+    message(FATAL_ERROR "EXECUTORCH_BUILD_KERNELS_TORCHAO requires EXECUTORCH_BUILD_PTHREADPOOL to be set ON")
+  endif()
+
+  # Configure TorchAO kernels
+  set(TORCHAO_BUILD_ATEN_OPS OFF)
+  set(TORCHAO_BUILD_EXECUTORCH_OPS ON)
+  set(TORCHAO_BUILD_CPU_AARCH64 ON)
+  set(TORCHAO_ENABLE_ARM_NEON_DOT ON)
+  set(TORCHAO_BUILD_KLEIDIAI ON)
+
+  # TorchAO kernels look for EXECUTORCH_INCLUDE_DIRS
+  if(DEFINED EXECUTORCH_INCLUDE_DIRS)
+    message(FATAL_ERROR "EXECUTORCH_INCLUDE_DIRS is already defined")
+  endif()
+  set(EXECUTORCH_INCLUDE_DIRS
+      ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
+      ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
+  )
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/ao/torchao/experimental)
+  unset(EXECUTORCH_INCLUDE_DIRS)
+
+  executorch_target_link_options_shared_lib(torchao_ops_executorch)
+  list(APPEND _executorch_kernels torchao_ops_executorch)
+
+  install(
+    TARGETS torchao_ops_executorch torchao_kernels_aarch64
+    EXPORT ExecuTorchTargets
+    DESTINATION lib
+    INCLUDES
+    DESTINATION ${_common_include_directories}
+  )
+  # If using KleidiAI and XNNPACK has not installed it already, install it
+  if(TORCHAO_BUILD_KLEIDIAI AND NOT (EXECUTORCH_BUILD_XNNPACK AND EXECUTORCH_XNNPACK_ENABLE_KLEIDI))
+    install(
+      TARGETS kleidiai
+      EXPORT ExecuTorchTargets
+      DESTINATION lib
+      INCLUDES
+      DESTINATION ${_common_include_directories}
+    )
+  endif()
+endif()
 
 if(EXECUTORCH_BUILD_PYBIND)
 
   # Add codegen tools subdirectory for selective_build pybind module
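Because the relocated block now fails configuration unless the cpuinfo and pthreadpool targets exist, a consumer enabling the kernels must enable those options as well. A minimal configure sketch under that assumption; the companion flags may already default to ON in some presets:

```bash
# Sketch: enable the torchao kernels with their declared prerequisites.
# The new block hard-fails if the cpuinfo/pthreadpool targets are missing,
# so the corresponding options are spelled out explicitly here.
cmake -DEXECUTORCH_BUILD_KERNELS_TORCHAO=ON \
      -DEXECUTORCH_BUILD_CPUINFO=ON \
      -DEXECUTORCH_BUILD_PTHREADPOOL=ON \
      -DCMAKE_BUILD_TYPE=Release \
      -Bcmake-out .
cmake --build cmake-out -j16 --config Release --target install
```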
5 changes: 5 additions & 0 deletions Package.swift
@@ -84,6 +84,11 @@ let products = deliverables([
     ],
   ],
   "kernels_quantized": [:],
+  "kernels_torchao": [
+    "targets": [
+      "threadpool",
+    ],
+  ],
 ])
 
 let targets = deliverables([
24 changes: 9 additions & 15 deletions examples/models/llama/CMakeLists.txt
@@ -37,7 +37,7 @@ cmake_dependent_option(
"NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF
)

option(EXECUTORCH_BUILD_TORCHAO "Build the torchao kernels" OFF)
option(EXECUTORCH_BUILD_KERNELS_TORCHAO_MPS "Build the torchao MPS kernels" OFF)

if(NOT PYTHON_EXECUTABLE)
set(PYTHON_EXECUTABLE python3)
@@ -115,21 +115,15 @@ if(TARGET custom_ops)
   list(APPEND link_libraries custom_ops)
 endif()
 
-if(EXECUTORCH_BUILD_TORCHAO)
+if (TARGET torchao_ops_executorch)
+  executorch_target_link_options_shared_lib(torchao_ops_executorch)
+  list(APPEND link_libraries torchao_ops_executorch)
+endif()
+
+
+if(EXECUTORCH_BUILD_KERNELS_TORCHAO_MPS)
   # Currently only enable this on Arm-based Macs
-  if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL
-     "arm64"
-  )
-    set(TORCHAO_BUILD_ATEN_OPS OFF)
-    set(TORCHAO_BUILD_EXECUTORCH_OPS ON)
-    set(TORCHAO_BUILD_CPU_AARCH64 ON)
-    set(TORCHAO_ENABLE_ARM_NEON_DOT ON)
-    add_subdirectory(
-      ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental
-      ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental
-    )
-    executorch_target_link_options_shared_lib(torchao_ops_executorch)
-    list(APPEND link_libraries torchao_ops_executorch)
+  if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
     if(EXECUTORCH_BUILD_MPS)
       add_subdirectory(
         ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental/ops/mps
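This file no longer defines its own torchao option: linking keys off the torchao_ops_executorch target created by the top-level build, and the renamed EXECUTORCH_BUILD_KERNELS_TORCHAO_MPS option only takes effect on Arm-based Macs. A hypothetical configure line for the MPS path; passing EXECUTORCH_BUILD_MPS is an assumption inferred from the if(EXECUTORCH_BUILD_MPS) guard above:

```bash
# Sketch: configure the llama example with torchao MPS kernels on Apple Silicon.
cmake -DPYTHON_EXECUTABLE=python \
      -DCMAKE_BUILD_TYPE=Release \
      -DEXECUTORCH_BUILD_MPS=ON \
      -DEXECUTORCH_BUILD_KERNELS_TORCHAO_MPS=ON \
      -Bcmake-out/examples/models/llama \
      examples/models/llama
```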
16 changes: 8 additions & 8 deletions examples/models/llama/README.md
@@ -340,11 +340,13 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de

 ## Running with low-bit kernels
 
-We now give instructions for quantizating and running your model with low-bit kernels. These are still experimental, and require you do development on an Arm-based Mac, and install executorch from source with the environment variable EXECUTORCH_BUILD_TORCHAO=1 defined:
+We now give instructions for quantizing and running your model with low-bit kernels. These kernels are still experimental; they require development on an Arm-based Mac and an ExecuTorch install from source with the environment variable EXECUTORCH_BUILD_KERNELS_TORCHAO=1 defined:
 ```
-EXECUTORCH_BUILD_TORCHAO=1 python install_executorch.py
+EXECUTORCH_BUILD_KERNELS_TORCHAO=1 python install_executorch.py
 ```
 
+(If you'd like the low-bit kernels to use KleidiAI when available, install instead with `EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_KLEIDIAI=1 python install_executorch.py`.)
+
 Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results.
 
 First export your model for lowbit quantization (step 2 above):
@@ -394,21 +396,19 @@ cmake -DPYTHON_EXECUTABLE=python \
     -DEXECUTORCH_BUILD_XNNPACK=OFF \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+    -DEXECUTORCH_BUILD_KERNELS_TORCHAO=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
     -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
     -Bcmake-out .
-cmake --build cmake-out -j16 --target install --config Release
+cmake --build cmake-out -j16 --config Release --target install
 ```
 
 Next install the llama runner with torchao kernels enabled (similar to step 3.2 above):
 
 ```
 cmake -DPYTHON_EXECUTABLE=python \
     -DCMAKE_BUILD_TYPE=Release \
-    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_XNNPACK=OFF \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -DEXECUTORCH_BUILD_TORCHAO=ON \
     -Bcmake-out/examples/models/llama \
     examples/models/llama
 cmake --build cmake-out/examples/models/llama -j16 --config Release
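The hunk ends at the build step; running the resulting binary follows the usual llama runner pattern. A sketch assuming the standard llama_main flags, with placeholder paths for the exported model and tokenizer:

```bash
# Sketch: invoke the built runner; the .pte and tokenizer paths are placeholders.
cmake-out/examples/models/llama/llama_main \
    --model_path=llama_lowbit.pte \
    --tokenizer_path=tokenizer.model \
    --prompt="Once upon a time"
```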
6 changes: 3 additions & 3 deletions install_requirements.py
@@ -118,12 +118,12 @@ def install_requirements(use_pytorch_nightly):
     # Install packages directly from local copy instead of pypi.
     # This is usually not recommended.
     new_env = os.environ.copy()
-    if ("EXECUTORCH_BUILD_TORCHAO" not in new_env) or (
-        new_env["EXECUTORCH_BUILD_TORCHAO"] == "0"
+    if ("EXECUTORCH_BUILD_KERNELS_TORCHAO" not in new_env) or (
+        new_env["EXECUTORCH_BUILD_KERNELS_TORCHAO"] == "0"
     ):
         new_env["USE_CPP"] = "0"
     else:
-        assert new_env["EXECUTORCH_BUILD_TORCHAO"] == "1"
+        assert new_env["EXECUTORCH_BUILD_KERNELS_TORCHAO"] == "1"
         new_env["USE_CPP"] = "1"
     new_env["CMAKE_POLICY_VERSION_MINIMUM"] = "3.5"
     subprocess.run(
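The renamed variable keeps the same gating role: unset or "0" installs torchao without its C++ ops (USE_CPP=0), while "1" builds them. In shell terms:

```bash
# Default install: torchao Python only, no C++ kernels (USE_CPP=0).
python install_executorch.py

# Opt in to the experimental low-bit C++ kernels (USE_CPP=1).
EXECUTORCH_BUILD_KERNELS_TORCHAO=1 python install_executorch.py
```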
2 changes: 1 addition & 1 deletion third-party/ao
Submodule ao updated 58 files
+3 −3 .github/workflows/1xH100_tests.yml
+1 −1 .github/workflows/1xL4_tests.yml
+4 −3 benchmarks/_models/eval_hf_models.py
+0 −0 benchmarks/float8/training/llama3.sh
+41 −0 benchmarks/float8/training/llama4.sh
+54 −22 benchmarks/mx_formats/cast_bench.py
+15 −14 benchmarks/prototype/moe_training/benchmark_kernels.py
+179 −0 benchmarks/prototype/moe_training/benchmark_moe_layer.py
+190 −0 benchmarks/prototype/moe_training/benchmark_per_group_scaling_kernels.py
+151 −0 benchmarks/prototype/moe_training/benchmark_rowwise_3d_quant_kernels.py
+0 −0 benchmarks/prototype/moe_training/benchmark_scaled_grouped_mm.py
+15 −11 test/core/test_config.py
+53 −24 test/dtypes/test_affine_quantized_float.py
+3 −0 test/dtypes/test_affine_quantized_tensor_parallel.py
+0 −153 test/dtypes/test_fbgemm_fp8.py
+4 −0 test/dtypes/test_nf4.py
+3 −3 test/float8/test_base.py
+1 −0 test/integration/test_integration.py
+70 −0 test/integration/test_loading_deprecated_checkpoint.py
+43 −4 test/prototype/moe_training/test_kernels.py
+1 −0 test/prototype/moe_training/test_scaled_grouped_mm.py
+118 −7 test/prototype/moe_training/test_training.py
+1 −1 test/prototype/test_dynamic_activation_lut.py
+4 −0 test/prototype/test_quantized_training.py
+216 −137 test/prototype/test_smoothquant.py
+563 −0 test/quantization/quantize_/workflows/float8/test_float8_tensor.py
+40 −0 test/quantization/test_qat.py
+4 −0 test/test_low_bit_optim.py
+31 −34 torchao/core/config.py
+4 −0 torchao/dtypes/floatx/float8_layout.py
+32 −14 torchao/experimental/CMakeLists.txt
+0 −17 torchao/experimental/kernels/cpu/aarch64/CMakeLists.txt
+21 −0 torchao/experimental/ops/tests/CMakeLists.txt
+6 −2 torchao/experimental/tests/test_quant_passes.py
+1 −0 torchao/float8/config.py
+14 −3 torchao/prototype/moe_training/conversion_utils.py
+3 −0 torchao/prototype/moe_training/kernels/__init__.py
+255 −0 torchao/prototype/moe_training/kernels/float8_rowwise.py
+47 −35 torchao/prototype/moe_training/kernels/jagged_float8_scales.py
+36 −38 torchao/prototype/moe_training/scaled_grouped_mm.py
+27 −8 torchao/prototype/moe_training/tensor.py
+38 −2 torchao/prototype/moe_training/utils.py
+10 −0 torchao/prototype/mx_formats/kernels.py
+1 −1 torchao/prototype/mx_formats/utils.py
+220 −0 torchao/prototype/quantization/codebook_groupwise/codebook_quantized_tensor.py
+2 −0 torchao/quantization/__init__.py
+2 −0 torchao/quantization/pt2e/utils.py
+15 −7 torchao/quantization/qat/api.py
+17 −1 torchao/quantization/qat/fake_quantize_config.py
+32 −0 torchao/quantization/qat/utils.py
+84 −35 torchao/quantization/quant_api.py
+11 −0 torchao/quantization/quantize_/common/__init__.py
+37 −0 torchao/quantization/quantize_/common/kernel_preference.py
+56 −0 torchao/quantization/quantize_/common/quantize_tensor_kwargs.py
+6 −0 torchao/quantization/quantize_/workflows/__init__.py
+0 −0 torchao/quantization/quantize_/workflows/float8/__init__.py
+613 −0 torchao/quantization/quantize_/workflows/float8/float8_tensor.py
+16 −8 torchao/testing/training/roofline_utils.py
2 changes: 1 addition & 1 deletion tools/cmake/preset/llm.cmake
@@ -20,7 +20,7 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
   set_overridable_option(EXECUTORCH_BUILD_COREML ON)
   set_overridable_option(EXECUTORCH_BUILD_MPS ON)
   if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
-    set_overridable_option(EXECUTORCH_BUILD_TORCHAO ON)
+    set_overridable_option(EXECUTORCH_BUILD_KERNELS_TORCHAO ON)
   endif()
 elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
   # Linux-specific code here
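With this preset change, Darwin arm64 builds of the LLM configuration pick up the torchao kernels automatically. A sketch, assuming this file is exposed as an llm CMake preset (the preset name is an assumption):

```bash
# Sketch (assumption: an "llm" preset wires in tools/cmake/preset/llm.cmake).
cmake --preset llm -Bcmake-out .
cmake --build cmake-out -j16 --config Release
```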