diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 11446030cef..e07cbc1dafd 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -325,7 +325,7 @@ jobs: eval "$(conda shell.bash hook)" # Install requirements - ${CONDA_RUN} python install_executorch.py + ${CONDA_RUN} EXECUTORCH_BUILD_TORCHAO=1 python install_executorch.py ${CONDA_RUN} sh examples/models/llama/install_requirements.sh # Run test diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index bbd2107ad74..8605ed3efc7 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -338,7 +338,12 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de ## Running with low-bit kernels -We now give instructions for quantizating and running your model with low-bit kernels. These are still experimental, and require you do development on an Arm-based Mac. Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results. +We now give instructions for quantizing and running your model with low-bit kernels. These are still experimental, and require that you do development on an Arm-based Mac, and install executorch from source with the environment variable EXECUTORCH_BUILD_TORCHAO=1 defined: +``` +EXECUTORCH_BUILD_TORCHAO=1 python install_executorch.py +``` + +Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results. First export your model for lowbit quantization (step 2 above): diff --git a/install_requirements.py b/install_requirements.py index 70781b5445a..0600c123dab 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -106,7 +106,14 @@ def install_requirements(use_pytorch_nightly): # Install packages directly from local copy instead of pypi. # This is usually not recommended. 
new_env = os.environ.copy() - new_env["USE_CPP"] = "1" # install torchao kernels + if ("EXECUTORCH_BUILD_TORCHAO" not in new_env) or ( + new_env["EXECUTORCH_BUILD_TORCHAO"] == "0" + ): + new_env["USE_CPP"] = "0" + else: + assert new_env["EXECUTORCH_BUILD_TORCHAO"] == "1" + new_env["USE_CPP"] = "1" + new_env["CMAKE_POLICY_VERSION_MINIMUM"] = "3.5" subprocess.run( [ sys.executable, diff --git a/tools/cmake/preset/llm.cmake b/tools/cmake/preset/llm.cmake index da1364eb2ad..88148ee6cfd 100644 --- a/tools/cmake/preset/llm.cmake +++ b/tools/cmake/preset/llm.cmake @@ -20,6 +20,9 @@ set_overridable_option(EXECUTORCH_BUILD_XNNPACK ON) if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") set_overridable_option(EXECUTORCH_BUILD_COREML ON) set_overridable_option(EXECUTORCH_BUILD_MPS ON) + if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") + set_overridable_option(EXECUTORCH_BUILD_TORCHAO ON) + endif() elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux") # Linux-specific code here elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows" OR CMAKE_SYSTEM_NAME STREQUAL "WIN32")