diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 753033d73a7..f38e73deb2d 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -387,7 +387,7 @@ jobs: eval "$(conda shell.bash hook)" # Install requirements - ${CONDA_RUN} python install_executorch.py + ${CONDA_RUN} EXECUTORCH_BUILD_TORCHAO=1 python install_executorch.py ${CONDA_RUN} sh examples/models/llama/install_requirements.sh # Run test diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index 77a058f9506..275fbd31467 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -339,7 +339,12 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de ## Running with low-bit kernels -We now give instructions for quantizating and running your model with low-bit kernels. These are still experimental, and require you do development on an Arm-based Mac. Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results. +We now give instructions for quantizing and running your model with low-bit kernels. These are still experimental, and require that you do development on an Arm-based Mac, and install executorch from source with the environment variable EXECUTORCH_BUILD_TORCHAO=1 defined: +``` +EXECUTORCH_BUILD_TORCHAO=1 python install_executorch.py +``` + +Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results. First export your model for lowbit quantization (step 2 above): diff --git a/install_requirements.py b/install_requirements.py index 368e7cd079d..edd799b3bdd 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -118,7 +118,14 @@ def install_requirements(use_pytorch_nightly): # Install packages directly from local copy instead of pypi. # This is usually not recommended. 
new_env = os.environ.copy() - new_env["USE_CPP"] = "1" # install torchao kernels + if ("EXECUTORCH_BUILD_TORCHAO" not in new_env) or ( + new_env["EXECUTORCH_BUILD_TORCHAO"] == "0" + ): + new_env["USE_CPP"] = "0" + else: + assert new_env["EXECUTORCH_BUILD_TORCHAO"] == "1" + new_env["USE_CPP"] = "1" + new_env["CMAKE_POLICY_VERSION_MINIMUM"] = "3.5" subprocess.run( [ sys.executable, diff --git a/tools/cmake/preset/llm.cmake b/tools/cmake/preset/llm.cmake index 9589e1d8ce2..3180c338acc 100644 --- a/tools/cmake/preset/llm.cmake +++ b/tools/cmake/preset/llm.cmake @@ -17,6 +17,9 @@ set_overridable_option(EXECUTORCH_BUILD_XNNPACK ON) if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") set_overridable_option(EXECUTORCH_BUILD_COREML ON) set_overridable_option(EXECUTORCH_BUILD_MPS ON) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") + set_overridable_option(EXECUTORCH_BUILD_TORCHAO ON) + endif() elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux") # Linux-specific code here elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows" OR CMAKE_SYSTEM_NAME STREQUAL "WIN32")