diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 753033d73a7..f38e73deb2d 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -387,7 +387,7 @@ jobs: eval "$(conda shell.bash hook)" # Install requirements - ${CONDA_RUN} python install_executorch.py + ${CONDA_RUN} EXECUTORCH_BUILD_TORCHAO=1 python install_executorch.py ${CONDA_RUN} sh examples/models/llama/install_requirements.sh # Run test diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index 77a058f9506..275fbd31467 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -339,7 +339,12 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de ## Running with low-bit kernels -We now give instructions for quantizating and running your model with low-bit kernels. These are still experimental, and require you do development on an Arm-based Mac. Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results. +We now give instructions for quantizing and running your model with low-bit kernels. These are still experimental, and require that you do development on an Arm-based Mac, and install executorch from source with the environment variable EXECUTORCH_BUILD_TORCHAO=1 defined: +``` +EXECUTORCH_BUILD_TORCHAO=1 python install_executorch.py +``` + +Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results. First export your model for lowbit quantization (step 2 above): diff --git a/install_requirements.py b/install_requirements.py index 368e7cd079d..edd799b3bdd 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -118,7 +118,14 @@ def install_requirements(use_pytorch_nightly): # Install packages directly from local copy instead of pypi. # This is usually not recommended. 
new_env = os.environ.copy() - new_env["USE_CPP"] = "1" # install torchao kernels + if ("EXECUTORCH_BUILD_TORCHAO" not in new_env) or ( + new_env["EXECUTORCH_BUILD_TORCHAO"] == "0" + ): + new_env["USE_CPP"] = "0" + else: + assert new_env["EXECUTORCH_BUILD_TORCHAO"] == "1" + new_env["USE_CPP"] = "1" + new_env["CMAKE_POLICY_VERSION_MINIMUM"] = "3.5" subprocess.run( [ sys.executable, diff --git a/tools/cmake/preset/llm.cmake b/tools/cmake/preset/llm.cmake index 9589e1d8ce2..3180c338acc 100644 --- a/tools/cmake/preset/llm.cmake +++ b/tools/cmake/preset/llm.cmake @@ -17,6 +17,9 @@ set_overridable_option(EXECUTORCH_BUILD_XNNPACK ON) if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") set_overridable_option(EXECUTORCH_BUILD_COREML ON) set_overridable_option(EXECUTORCH_BUILD_MPS ON) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") + set_overridable_option(EXECUTORCH_BUILD_TORCHAO ON) + endif() elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux") # Linux-specific code here elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows" OR CMAKE_SYSTEM_NAME STREQUAL "WIN32")