15 changes: 5 additions & 10 deletions .ci/scripts/test_llama_torchao_lowbit.sh
@@ -29,27 +29,22 @@ cmake -DPYTHON_EXECUTABLE=python \
     -DEXECUTORCH_ENABLE_LOGGING=1 \
     -DCMAKE_BUILD_TYPE=Release \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_BUILD_XNNPACK=OFF \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+    -DEXECUTORCH_BUILD_KERNELS_TORCHAO=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
     -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
     -Bcmake-out .
-cmake --build cmake-out -j16 --target install --config Release
+cmake --build cmake-out -j16 --config Release --target install
 
 # Install llama runner with torchao
 cmake -DPYTHON_EXECUTABLE=python \
-    -DBUILD_TESTING=OFF \
     -DCMAKE_BUILD_TYPE=Release \
-    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_XNNPACK=OFF \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -DEXECUTORCH_BUILD_TORCHAO=ON \
     -Bcmake-out/examples/models/llama \
     examples/models/llama
 cmake --build cmake-out/examples/models/llama -j16 --config Release
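With the torchao kernels now configured in the top-level ExecuTorch build, the runner configure below no longer repeats the kernel flags. A minimal sketch for reproducing this CI step locally, assuming a source checkout on an Arm-based Mac (the constraint the README states for these kernels):

```bash
# Sketch: run the updated CI script from the repository root.
bash .ci/scripts/test_llama_torchao_lowbit.sh
```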
2 changes: 1 addition & 1 deletion .github/workflows/trunk.yml
@@ -485,7 +485,7 @@ jobs:
       eval "$(conda shell.bash hook)"
       # Install requirements
-      ${CONDA_RUN} EXECUTORCH_BUILD_TORCHAO=1 python install_executorch.py
+      ${CONDA_RUN} EXECUTORCH_BUILD_KERNELS_TORCHAO=1 python install_executorch.py
       ${CONDA_RUN} sh examples/models/llama/install_requirements.sh
       # Run test
72 changes: 49 additions & 23 deletions CMakeLists.txt
@@ -278,29 +278,6 @@ if(EXECUTORCH_BUILD_PTHREADPOOL)
   )
 endif()
 
-if(EXECUTORCH_BUILD_KERNELS_TORCHAO)
-  set(TORCHAO_BUILD_ATEN_OPS OFF)
-  set(TORCHAO_BUILD_EXECUTORCH_OPS ON)
-  set(TORCHAO_BUILD_CPU_AARCH64 ON)
-  set(TORCHAO_ENABLE_ARM_NEON_DOT ON)
-
-  list(
-    APPEND
-    TORCHAO_INCLUDE_DIRS
-    ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
-    ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
-    ${EXECUTORCH_ROOT}/third-party/ao
-  )
-
-  set(EXECUTORCH_INCLUDE_DIRS ${TORCHAO_INCLUDE_DIRS})
-
-  add_subdirectory(
-    ${CMAKE_CURRENT_SOURCE_DIR}/third-party/ao/torchao/experimental
-  )
-  executorch_target_link_options_shared_lib(torchao_ops_executorch)
-  list(APPEND _executorch_kernels torchao_ops_executorch)
-endif()
-
 if(EXECUTORCH_BUILD_TESTS)
   set(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
   include(CTest)
@@ -705,6 +682,55 @@ if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool)
 endif()
 
+if(EXECUTORCH_BUILD_KERNELS_TORCHAO)
+  if(NOT TARGET cpuinfo)
+    message(FATAL_ERROR "EXECUTORCH_BUILD_KERNELS_TORCHAO requires EXECUTORCH_BUILD_CPUINFO to be set ON")
+  endif()
+  if(NOT TARGET pthreadpool)
+    message(FATAL_ERROR "EXECUTORCH_BUILD_KERNELS_TORCHAO requires EXECUTORCH_BUILD_PTHREADPOOL to be set ON")
+  endif()
+
+  # Configure TorchAO kernels
+  set(TORCHAO_BUILD_ATEN_OPS OFF)
+  set(TORCHAO_BUILD_EXECUTORCH_OPS ON)
+  set(TORCHAO_BUILD_CPU_AARCH64 ON)
+  set(TORCHAO_ENABLE_ARM_NEON_DOT ON)
+  set(TORCHAO_BUILD_KLEIDIAI ON)
+
+  # TorchAO kernels look for EXECUTORCH_INCLUDE_DIRS
+  if(DEFINED EXECUTORCH_INCLUDE_DIRS)
+    message(FATAL_ERROR "EXECUTORCH_INCLUDE_DIRS is already defined")
+  endif()
+  set(EXECUTORCH_INCLUDE_DIRS
+      ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
+      ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
+  )
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/ao/torchao/experimental)
+  unset(EXECUTORCH_INCLUDE_DIRS)
+
+  executorch_target_link_options_shared_lib(torchao_ops_executorch)
+  list(APPEND _executorch_kernels torchao_ops_executorch)
+
+  install(
+    TARGETS torchao_ops_executorch torchao_kernels_aarch64
+    EXPORT ExecuTorchTargets
+    DESTINATION lib
+    INCLUDES
+    DESTINATION ${_common_include_directories}
+  )
+  # If using KleidiAI and XNNPACK has not installed it already, install it
+  if(TORCHAO_BUILD_KLEIDIAI AND NOT (EXECUTORCH_BUILD_XNNPACK AND EXECUTORCH_XNNPACK_ENABLE_KLEIDI))
+    install(
+      TARGETS kleidiai
+      EXPORT ExecuTorchTargets
+      DESTINATION lib
+      INCLUDES
+      DESTINATION ${_common_include_directories}
+    )
+  endif()
+endif()
 
 if(EXECUTORCH_BUILD_PYBIND)
 
   # Add codegen tools subdirectory for selective_build pybind module
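Because the relocated block now fails configuration unless the cpuinfo and pthreadpool targets exist, a consumer enabling the kernels must enable those options as well. A minimal configure sketch under that assumption; the companion flags may already default to ON in some presets:

```bash
# Sketch: enable the torchao kernels with their declared prerequisites.
# The new block hard-fails if the cpuinfo/pthreadpool targets are missing,
# so the corresponding options are spelled out explicitly here.
cmake -DEXECUTORCH_BUILD_KERNELS_TORCHAO=ON \
      -DEXECUTORCH_BUILD_CPUINFO=ON \
      -DEXECUTORCH_BUILD_PTHREADPOOL=ON \
      -DCMAKE_BUILD_TYPE=Release \
      -Bcmake-out .
cmake --build cmake-out -j16 --config Release --target install
```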
5 changes: 5 additions & 0 deletions Package.swift
@@ -84,6 +84,11 @@ let products = deliverables([
     ],
   ],
   "kernels_quantized": [:],
+  "kernels_torchao": [
+    "targets": [
+      "threadpool",
+    ],
+  ],
 ])
 
 let targets = deliverables([
24 changes: 9 additions & 15 deletions examples/models/llama/CMakeLists.txt
@@ -37,7 +37,7 @@ cmake_dependent_option(
"NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF
)

option(EXECUTORCH_BUILD_TORCHAO "Build the torchao kernels" OFF)
option(EXECUTORCH_BUILD_KERNELS_TORCHAO_MPS "Build the torchao MPS kernels" OFF)

if(NOT PYTHON_EXECUTABLE)
set(PYTHON_EXECUTABLE python3)
@@ -115,21 +115,15 @@ if(TARGET custom_ops)
   list(APPEND link_libraries custom_ops)
 endif()
 
-if(EXECUTORCH_BUILD_TORCHAO)
+if (TARGET torchao_ops_executorch)
+  executorch_target_link_options_shared_lib(torchao_ops_executorch)
+  list(APPEND link_libraries torchao_ops_executorch)
+endif()
+
+
+if(EXECUTORCH_BUILD_KERNELS_TORCHAO_MPS)
   # Currently only enable this on Arm-based Macs
-  if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL
-     "arm64"
-  )
-    set(TORCHAO_BUILD_ATEN_OPS OFF)
-    set(TORCHAO_BUILD_EXECUTORCH_OPS ON)
-    set(TORCHAO_BUILD_CPU_AARCH64 ON)
-    set(TORCHAO_ENABLE_ARM_NEON_DOT ON)
-    add_subdirectory(
-      ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental
-      ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental
-    )
-    executorch_target_link_options_shared_lib(torchao_ops_executorch)
-    list(APPEND link_libraries torchao_ops_executorch)
+  if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
     if(EXECUTORCH_BUILD_MPS)
       add_subdirectory(
         ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental/ops/mps
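This file no longer defines its own torchao option: linking keys off the torchao_ops_executorch target created by the top-level build, and the renamed EXECUTORCH_BUILD_KERNELS_TORCHAO_MPS option only takes effect on Arm-based Macs. A hypothetical configure line for the MPS path; passing EXECUTORCH_BUILD_MPS is an assumption inferred from the if(EXECUTORCH_BUILD_MPS) guard above:

```bash
# Sketch: configure the llama example with torchao MPS kernels on Apple Silicon.
cmake -DPYTHON_EXECUTABLE=python \
      -DCMAKE_BUILD_TYPE=Release \
      -DEXECUTORCH_BUILD_MPS=ON \
      -DEXECUTORCH_BUILD_KERNELS_TORCHAO_MPS=ON \
      -Bcmake-out/examples/models/llama \
      examples/models/llama
```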
16 changes: 8 additions & 8 deletions examples/models/llama/README.md
@@ -340,11 +340,13 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de

 ## Running with low-bit kernels
 
-We now give instructions for quantizating and running your model with low-bit kernels. These are still experimental, and require you do development on an Arm-based Mac, and install executorch from source with the environment variable EXECUTORCH_BUILD_TORCHAO=1 defined:
+We now give instructions for quantizing and running your model with low-bit kernels. These kernels are still experimental; they require development on an Arm-based Mac and an ExecuTorch install from source with the environment variable EXECUTORCH_BUILD_KERNELS_TORCHAO=1 defined:
 ```
-EXECUTORCH_BUILD_TORCHAO=1 python install_executorch.py
+EXECUTORCH_BUILD_KERNELS_TORCHAO=1 python install_executorch.py
 ```
 
+(If you'd like the low-bit kernels to use KleidiAI when available, install instead with `EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_KLEIDIAI=1 python install_executorch.py`.)
+
 Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results.
 
 First export your model for lowbit quantization (step 2 above):
@@ -394,21 +396,19 @@ cmake -DPYTHON_EXECUTABLE=python \
     -DEXECUTORCH_BUILD_XNNPACK=OFF \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+    -DEXECUTORCH_BUILD_KERNELS_TORCHAO=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
     -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
     -Bcmake-out .
-cmake --build cmake-out -j16 --target install --config Release
+cmake --build cmake-out -j16 --config Release --target install
 ```
 
 Next install the llama runner with torchao kernels enabled (similar to step 3.2 above):
 
 ```
 cmake -DPYTHON_EXECUTABLE=python \
     -DCMAKE_BUILD_TYPE=Release \
-    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_XNNPACK=OFF \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -DEXECUTORCH_BUILD_TORCHAO=ON \
     -Bcmake-out/examples/models/llama \
     examples/models/llama
 cmake --build cmake-out/examples/models/llama -j16 --config Release
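The hunk ends at the build step; running the resulting binary follows the usual llama runner pattern. A sketch assuming the standard llama_main flags, with placeholder paths for the exported model and tokenizer:

```bash
# Sketch: invoke the built runner; the .pte and tokenizer paths are placeholders.
cmake-out/examples/models/llama/llama_main \
    --model_path=llama_lowbit.pte \
    --tokenizer_path=tokenizer.model \
    --prompt="Once upon a time"
```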
6 changes: 3 additions & 3 deletions install_requirements.py
@@ -118,12 +118,12 @@ def install_requirements(use_pytorch_nightly):
     # Install packages directly from local copy instead of pypi.
     # This is usually not recommended.
     new_env = os.environ.copy()
-    if ("EXECUTORCH_BUILD_TORCHAO" not in new_env) or (
-        new_env["EXECUTORCH_BUILD_TORCHAO"] == "0"
+    if ("EXECUTORCH_BUILD_KERNELS_TORCHAO" not in new_env) or (
+        new_env["EXECUTORCH_BUILD_KERNELS_TORCHAO"] == "0"
     ):
         new_env["USE_CPP"] = "0"
     else:
-        assert new_env["EXECUTORCH_BUILD_TORCHAO"] == "1"
+        assert new_env["EXECUTORCH_BUILD_KERNELS_TORCHAO"] == "1"
         new_env["USE_CPP"] = "1"
     new_env["CMAKE_POLICY_VERSION_MINIMUM"] = "3.5"
     subprocess.run(
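The renamed variable keeps the same gating role: unset or "0" installs torchao without its C++ ops (USE_CPP=0), while "1" builds them. In shell terms:

```bash
# Default install: torchao Python only, no C++ kernels (USE_CPP=0).
python install_executorch.py

# Opt in to the experimental low-bit C++ kernels (USE_CPP=1).
EXECUTORCH_BUILD_KERNELS_TORCHAO=1 python install_executorch.py
```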
2 changes: 1 addition & 1 deletion third-party/ao
Submodule ao updated 58 files
+3 −3 .github/workflows/1xH100_tests.yml
+1 −1 .github/workflows/1xL4_tests.yml
+4 −3 benchmarks/_models/eval_hf_models.py
+0 −0 benchmarks/float8/training/llama3.sh
+41 −0 benchmarks/float8/training/llama4.sh
+54 −22 benchmarks/mx_formats/cast_bench.py
+15 −14 benchmarks/prototype/moe_training/benchmark_kernels.py
+179 −0 benchmarks/prototype/moe_training/benchmark_moe_layer.py
+190 −0 benchmarks/prototype/moe_training/benchmark_per_group_scaling_kernels.py
+151 −0 benchmarks/prototype/moe_training/benchmark_rowwise_3d_quant_kernels.py
+0 −0 benchmarks/prototype/moe_training/benchmark_scaled_grouped_mm.py
+15 −11 test/core/test_config.py
+53 −24 test/dtypes/test_affine_quantized_float.py
+3 −0 test/dtypes/test_affine_quantized_tensor_parallel.py
+0 −153 test/dtypes/test_fbgemm_fp8.py
+4 −0 test/dtypes/test_nf4.py
+3 −3 test/float8/test_base.py
+1 −0 test/integration/test_integration.py
+70 −0 test/integration/test_loading_deprecated_checkpoint.py
+43 −4 test/prototype/moe_training/test_kernels.py
+1 −0 test/prototype/moe_training/test_scaled_grouped_mm.py
+118 −7 test/prototype/moe_training/test_training.py
+1 −1 test/prototype/test_dynamic_activation_lut.py
+4 −0 test/prototype/test_quantized_training.py
+216 −137 test/prototype/test_smoothquant.py
+563 −0 test/quantization/quantize_/workflows/float8/test_float8_tensor.py
+40 −0 test/quantization/test_qat.py
+4 −0 test/test_low_bit_optim.py
+31 −34 torchao/core/config.py
+4 −0 torchao/dtypes/floatx/float8_layout.py
+32 −14 torchao/experimental/CMakeLists.txt
+0 −17 torchao/experimental/kernels/cpu/aarch64/CMakeLists.txt
+21 −0 torchao/experimental/ops/tests/CMakeLists.txt
+6 −2 torchao/experimental/tests/test_quant_passes.py
+1 −0 torchao/float8/config.py
+14 −3 torchao/prototype/moe_training/conversion_utils.py
+3 −0 torchao/prototype/moe_training/kernels/__init__.py
+255 −0 torchao/prototype/moe_training/kernels/float8_rowwise.py
+47 −35 torchao/prototype/moe_training/kernels/jagged_float8_scales.py
+36 −38 torchao/prototype/moe_training/scaled_grouped_mm.py
+27 −8 torchao/prototype/moe_training/tensor.py
+38 −2 torchao/prototype/moe_training/utils.py
+10 −0 torchao/prototype/mx_formats/kernels.py
+1 −1 torchao/prototype/mx_formats/utils.py
+220 −0 torchao/prototype/quantization/codebook_groupwise/codebook_quantized_tensor.py
+2 −0 torchao/quantization/__init__.py
+2 −0 torchao/quantization/pt2e/utils.py
+15 −7 torchao/quantization/qat/api.py
+17 −1 torchao/quantization/qat/fake_quantize_config.py
+32 −0 torchao/quantization/qat/utils.py
+84 −35 torchao/quantization/quant_api.py
+11 −0 torchao/quantization/quantize_/common/__init__.py
+37 −0 torchao/quantization/quantize_/common/kernel_preference.py
+56 −0 torchao/quantization/quantize_/common/quantize_tensor_kwargs.py
+6 −0 torchao/quantization/quantize_/workflows/__init__.py
+0 −0 torchao/quantization/quantize_/workflows/float8/__init__.py
+613 −0 torchao/quantization/quantize_/workflows/float8/float8_tensor.py
+16 −8 torchao/testing/training/roofline_utils.py
2 changes: 1 addition & 1 deletion tools/cmake/preset/llm.cmake
@@ -20,7 +20,7 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
   set_overridable_option(EXECUTORCH_BUILD_COREML ON)
   set_overridable_option(EXECUTORCH_BUILD_MPS ON)
   if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
-    set_overridable_option(EXECUTORCH_BUILD_TORCHAO ON)
+    set_overridable_option(EXECUTORCH_BUILD_KERNELS_TORCHAO ON)
   endif()
 elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux")
   # Linux-specific code here
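With this preset change, Darwin arm64 builds of the LLM configuration pick up the torchao kernels automatically. A sketch, assuming this file is exposed as an llm CMake preset (the preset name is an assumption):

```bash
# Sketch (assumption: an "llm" preset wires in tools/cmake/preset/llm.cmake).
cmake --preset llm -Bcmake-out .
cmake --build cmake-out -j16 --config Release
```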