From 9ab5bf97a06511c69170d383c10b39298e088a75 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Tue, 18 Mar 2025 21:09:38 -0700 Subject: [PATCH 1/5] up --- examples/models/llama/CMakeLists.txt | 20 ++++++---- .../llama/source_transformation/quantize.py | 39 +++++++++++++------ run.sh | 3 ++ 3 files changed, 43 insertions(+), 19 deletions(-) create mode 100644 run.sh diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index 959002892c6..297f0d26db7 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -116,16 +116,20 @@ endif() if(EXECUTORCH_BUILD_TORCHAO) set(TORCHAO_BUILD_EXECUTORCH_OPS ON) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental) + set(TORCHAO_BUILD_CPU_AARCH64 ON) + add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental + ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental + ) target_link_options_shared_lib(torchao_ops_executorch) list(APPEND link_libraries torchao_ops_executorch) - if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental/ops/mps - ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental/ops/mps) - target_link_options_shared_lib(torchao_ops_mps_executorch) - list(APPEND link_libraries torchao_ops_mps_executorch) - endif() + # if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") + # add_subdirectory( + # ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental/ops/mps + # ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental/ops/mps) + # target_link_options_shared_lib(torchao_ops_mps_executorch) + # list(APPEND link_libraries torchao_ops_mps_executorch) + # endif() endif() set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack) diff --git a/examples/models/llama/source_transformation/quantize.py b/examples/models/llama/source_transformation/quantize.py index d81c0849e62..59108cf8396 100644 --- a/examples/models/llama/source_transformation/quantize.py +++ b/examples/models/llama/source_transformation/quantize.py @@ -98,21 +98,38 @@ def quantize( # noqa C901 matches = re.findall(pattern, qmode) assert len(matches) == 1, f"Expected 1 match for pattern but got {len(matches)}" bitwidth = int(matches[0][0]) - _load_torchao_aten_lib(libname="libtorchao_ops_aten") - from torchao.experimental.quant_api import Int8DynActIntxWeightLinearQuantizer + # _load_torchao_aten_lib(libname="libtorchao_ops_aten") + # from torchao.experimental.quant_api import Int8DynActIntxWeightLinearQuantizer + from torchao.experimental.quant_api import int8_dynamic_activation_intx_weight, Int8DynActIntxWeightLinearQuantizer + from torchao.quantization.quant_api import quantize_ + from torchao.utils import unwrap_tensor_subclass + from torchao.quantization.granularity import PerRow, PerGroup with torch.no_grad(): - model = Int8DynActIntxWeightLinearQuantizer( - device="cpu", - precision=torch.float32, - groupsize=group_size, - bitwidth=bitwidth, - has_weight_zeros=False, - ).quantize(model) - + # model = Int8DynActIntxWeightLinearQuantizer( + # device="cpu", + # precision=torch.float32, + # groupsize=group_size, + # bitwidth=bitwidth, + # has_weight_zeros=False, + # ).quantize(model) + + 
quantize_(model, + int8_dynamic_activation_intx_weight( + # group_size=group_size, + # nbit=bitwidth, + # has_weight_zeros=False, + weight_dtype=getattr(torch, f"int{bitwidth}"), + granularity=PerRow() if group_size == 0 else PerGroup(group_size), + has_weight_zeros=False, + ), + ) + model = unwrap_tensor_subclass(model) if verbose: print("quantized model:", model) return model + + return model elif qmode == "8da4w": # Check for required args if group_size is None: @@ -752,7 +769,7 @@ def get_quant_embedding_transform(args): bitwidth, group_size = args.embedding_quantize.split(":")[1].split(",") group_size = int(group_size) bitwidth = int(bitwidth) - _load_torchao_aten_lib(libname="libtorchao_ops_aten") + # _load_torchao_aten_lib(libname="libtorchao_ops_aten") from torchao.experimental.quant_api import IntxWeightEmbeddingQuantizer def _torchao_embedding_quantizer(model): diff --git a/run.sh b/run.sh new file mode 100644 index 00000000000..bb810e8c16a --- /dev/null +++ b/run.sh @@ -0,0 +1,3 @@ +for i in {1..5}; do + ./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path=$TOKENIZER --prompt="Once upon a time," +done From 4030e773b32296165f1e3ea8cd9beec882f7149e Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Tue, 18 Mar 2025 21:10:27 -0700 Subject: [PATCH 2/5] up --- third-party/ao | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third-party/ao b/third-party/ao index 7d8794622f3..ddb7f83dad9 160000 --- a/third-party/ao +++ b/third-party/ao @@ -1 +1 @@ -Subproject commit 7d8794622f3ac7ffa98761314019a20fba06edef +Subproject commit ddb7f83dad97f918bae0e84ae27c5cf47d8c64fe From 716944f7467e017457a53e0e35b0c4c1451eda99 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Wed, 19 Mar 2025 10:25:17 -0700 Subject: [PATCH 3/5] up --- install_requirements.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/install_requirements.py b/install_requirements.py index 9353dad180e..31fb3a700e6 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -10,6 +10,7 @@ import re import subprocess import sys +import os def python_is_compatible(): @@ -117,6 +118,8 @@ def install_requirements(use_pytorch_nightly): # Install packages directly from local copy instead of pypi. # This is usually not recommended. 
+ new_env = os.environ.copy() + new_env["USE_CPP"] = "1" # build torchao kernels subprocess.run( [ sys.executable, @@ -127,6 +130,7 @@ def install_requirements(use_pytorch_nightly): "--no-build-isolation", *LOCAL_REQUIREMENTS, ], + env=new_env, check=True, ) From d5e46c81235be50f9b3021e8ad457acfef4c505f Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Wed, 19 Mar 2025 17:32:06 -0700 Subject: [PATCH 4/5] up --- .ci/scripts/test_llama_torchao_lowbit.sh | 85 +++++++++++++++++++ .github/workflows/trunk.yml | 24 +++++- examples/models/llama/CMakeLists.txt | 33 +++---- examples/models/llama/README.md | 73 ++++++++++++++++ .../llama/source_transformation/quantize.py | 30 ++----- install_requirements.py | 6 +- run.sh | 3 - third-party/ao | 2 +- 8 files changed, 210 insertions(+), 46 deletions(-) create mode 100644 .ci/scripts/test_llama_torchao_lowbit.sh delete mode 100644 run.sh diff --git a/.ci/scripts/test_llama_torchao_lowbit.sh b/.ci/scripts/test_llama_torchao_lowbit.sh new file mode 100644 index 00000000000..76fabb04250 --- /dev/null +++ b/.ci/scripts/test_llama_torchao_lowbit.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -exu + +source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" + +export EXECUTORCH_ROOT="$(dirname "${BASH_SOURCE[0]}")/../.." + +if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then + PYTHON_EXECUTABLE=python3 +fi + +which "${PYTHON_EXECUTABLE}" + +# Update tokenizers submodule +pushd $EXECUTORCH_ROOT/extension/llm/tokenizers +echo "Update tokenizers submodule" +git submodule update --init +popd + +# Install ET with CMake +cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DEXECUTORCH_ENABLE_LOGGING=1 \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -Bcmake-out . 
+cmake --build cmake-out -j16 --target install --config Release + +# Install llama runner with torchao +cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_PREFIX_PATH=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_TORCHAO=ON \ + -Bcmake-out/examples/models/llama \ + examples/models/llama +cmake --build cmake-out/examples/models/llama -j16 --config Release + +# Download stories llama110m artifacts +download_stories_model_artifacts + +echo "Creating tokenizer.bin" +$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin + +# Export model +LLAMA_CHECKPOINT=stories110M.pt +LLAMA_PARAMS=params.json +MODEL_OUT=model.pte +TOKENIZER=tokenizer.bin + +# Set low-bit quantization parameters +QLINEAR_BITWIDTH=3 # Can be 1-8 +QLINEAR_GROUP_SIZE=128 # Must be multiple of 16 +QEMBEDDING_BITWIDTH=4 # Can be 1-8 +QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16 + +${PYTHON_EXECUTABLE} -m examples.models.llama.export_llama \ + --checkpoint "${LLAMA_CHECKPOINT:?}" \ + --params "${LLAMA_PARAMS:?}" \ + -kv \ + --use_sdpa_with_kv_cache \ + --output_name=${MODEL_OUT} \ + -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \ + --group_size ${QLINEAR_GROUP_SIZE} \ + -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \ + --disable_dynamic_shape \ + -d fp32 + +# Test run +./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path=$TOKENIZER --prompt="Once upon a time," diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 7ba3e5fda75..097a272d0fe 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -23,8 +23,8 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main strategy: matrix: - # Mac runners are expensive and limited, and non reliable. - # Do some basic testing for macos jobs, and rely mostly on + # Mac runners are expensive and limited, and non reliable. + # Do some basic testing for macos jobs, and rely mostly on # test-models-linux-aarch64 job instead. 
model: [emformer_join, ic4, llama2, mobilebert, mv3, resnet50, vit, w2l] backend: [xnnpack-quantization-delegation] @@ -288,6 +288,26 @@ jobs: # Test ANE llama ${CONDA_RUN} sh .ci/scripts/test_ane_static_llama.sh + test-llama-torchao-lowbit: + name: test-llama-torchao-lowbit + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + runner: macos-m1-stable + python-version: '3.11' + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + bash .ci/scripts/setup-conda.sh + eval "$(conda shell.bash hook)" + + # Install requirements + ${CONDA_RUN} python install_executorch.py + ${CONDA_RUN} sh examples/models/llama/install_requirements.sh + + # Run test + ${CONDA_RUN} sh .ci/scripts/test_llama_torchao_lowbit.sh + test-llama-runner-linux: # Test Both linux x86 and linux aarch64 name: test-llama-runner-linux diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index 297f0d26db7..e6d45424bd4 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -115,21 +115,24 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM) endif() if(EXECUTORCH_BUILD_TORCHAO) - set(TORCHAO_BUILD_EXECUTORCH_OPS ON) - set(TORCHAO_BUILD_CPU_AARCH64 ON) - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental - ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental - ) - target_link_options_shared_lib(torchao_ops_executorch) - list(APPEND link_libraries torchao_ops_executorch) - # if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") - # add_subdirectory( - # ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental/ops/mps - # ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental/ops/mps) - # target_link_options_shared_lib(torchao_ops_mps_executorch) - # list(APPEND link_libraries torchao_ops_mps_executorch) - # endif() + # Currently only enable this on Arm-based Macs + if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") + set(TORCHAO_BUILD_EXECUTORCH_OPS ON) + set(TORCHAO_BUILD_CPU_AARCH64 ON) + add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental + ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental + ) + target_link_options_shared_lib(torchao_ops_executorch) + list(APPEND link_libraries torchao_ops_executorch) + if(EXECUTORCH_BUILD_MPS) + add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental/ops/mps + ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental/ops/mps) + target_link_options_shared_lib(torchao_ops_mps_executorch) + list(APPEND link_libraries torchao_ops_mps_executorch) + endif() + endif() endif() set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack) diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index 4c0cce4dd9e..ab435513643 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -380,6 +380,79 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de ### Android Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-android.html) to for full instructions on building the Android LLAMA Demo App. +## Running with low-bit kernels + +We now give instructions for quantizating and running your model with low-bit kernels. 
These are still experimental, and require you do development on an Arm-based Mac. Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results. + +First export your model for lowbit quantization (step 2 above): + +``` +# Set these paths to point to the downloaded files +LLAMA_CHECKPOINT=path/to/checkpoint.pth +LLAMA_PARAMS=path/to/params.json + +# Set low-bit quantization parameters +QLINEAR_BITWIDTH=3 # Can be 1-8 +QLINEAR_GROUP_SIZE=128 # Must be multiple of 16 +QEMBEDDING_BITWIDTH=4 # Can be 1-8 +QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16 + +python -m examples.models.llama.export_llama \ + --model "llama3_2" \ + --checkpoint "${LLAMA_CHECKPOINT:?}" \ + --params "${LLAMA_PARAMS:?}" \ + -kv \ + --use_sdpa_with_kv_cache \ + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ + --output_name="llama3_2.pte" \ + -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \ + --group_size ${QLINEAR_GROUP_SIZE} \ + -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \ + --disable_dynamic_shape \ + -d fp32 +``` + +Once the model is exported, we need to build ExecuTorch and the runner with the low-bit kernels. + +The first step is to install ExecuTorch (the same as step 3.1 above): + +``` +cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DEXECUTORCH_ENABLE_LOGGING=1 \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -Bcmake-out . +cmake --build cmake-out -j16 --target install --config Release +``` + +Next install the llama runner with torchao kernels enabled (similar to step 3.2 above): + +``` +cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_PREFIX_PATH=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_TORCHAO=ON \ + -Bcmake-out/examples/models/llama \ + examples/models/llama +cmake --build cmake-out/examples/models/llama -j16 --config Release +``` + +Finally run your model (similar to step 3.3 above): + +``` +cmake-out/examples/models/llama/llama_main --model_path= --tokenizer_path= --prompt= +``` ## Utility tools for Llama enablement diff --git a/examples/models/llama/source_transformation/quantize.py b/examples/models/llama/source_transformation/quantize.py index 59108cf8396..d147e40b082 100644 --- a/examples/models/llama/source_transformation/quantize.py +++ b/examples/models/llama/source_transformation/quantize.py @@ -98,29 +98,20 @@ def quantize( # noqa C901 matches = re.findall(pattern, qmode) assert len(matches) == 1, f"Expected 1 match for pattern but got {len(matches)}" bitwidth = int(matches[0][0]) - # _load_torchao_aten_lib(libname="libtorchao_ops_aten") - # from torchao.experimental.quant_api import Int8DynActIntxWeightLinearQuantizer - from torchao.experimental.quant_api import int8_dynamic_activation_intx_weight, Int8DynActIntxWeightLinearQuantizer + + from torchao.experimental.quant_api import Int8DynamicActivationIntxWeightConfig + from torchao.quantization.granularity import PerGroup, PerRow from torchao.quantization.quant_api import quantize_ from torchao.utils 
import unwrap_tensor_subclass - from torchao.quantization.granularity import PerRow, PerGroup with torch.no_grad(): - # model = Int8DynActIntxWeightLinearQuantizer( - # device="cpu", - # precision=torch.float32, - # groupsize=group_size, - # bitwidth=bitwidth, - # has_weight_zeros=False, - # ).quantize(model) - - quantize_(model, - int8_dynamic_activation_intx_weight( - # group_size=group_size, - # nbit=bitwidth, - # has_weight_zeros=False, + quantize_( + model, + Int8DynamicActivationIntxWeightConfig( weight_dtype=getattr(torch, f"int{bitwidth}"), - granularity=PerRow() if group_size == 0 else PerGroup(group_size), + granularity=( + PerRow() if group_size in [0, -1] else PerGroup(group_size) + ), has_weight_zeros=False, ), ) @@ -128,8 +119,6 @@ def quantize( # noqa C901 if verbose: print("quantized model:", model) return model - - return model elif qmode == "8da4w": # Check for required args if group_size is None: @@ -769,7 +758,6 @@ def get_quant_embedding_transform(args): bitwidth, group_size = args.embedding_quantize.split(":")[1].split(",") group_size = int(group_size) bitwidth = int(bitwidth) - # _load_torchao_aten_lib(libname="libtorchao_ops_aten") from torchao.experimental.quant_api import IntxWeightEmbeddingQuantizer def _torchao_embedding_quantizer(model): diff --git a/install_requirements.py b/install_requirements.py index 31fb3a700e6..d0a56d784d7 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -6,11 +6,11 @@ # LICENSE file in the root directory of this source tree. import argparse +import os import platform import re import subprocess import sys -import os def python_is_compatible(): @@ -119,7 +119,7 @@ def install_requirements(use_pytorch_nightly): # Install packages directly from local copy instead of pypi. # This is usually not recommended. new_env = os.environ.copy() - new_env["USE_CPP"] = "1" # build torchao kernels + new_env["USE_CPP"] = "1" subprocess.run( [ sys.executable, @@ -147,8 +147,6 @@ def main(args): if __name__ == "__main__": - import os - # Before doing anything, cd to the directory containing this script. os.chdir(os.path.dirname(os.path.abspath(__file__))) if not python_is_compatible(): diff --git a/run.sh b/run.sh deleted file mode 100644 index bb810e8c16a..00000000000 --- a/run.sh +++ /dev/null @@ -1,3 +0,0 @@ -for i in {1..5}; do - ./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path=$TOKENIZER --prompt="Once upon a time," -done diff --git a/third-party/ao b/third-party/ao index ddb7f83dad9..64bcf4c2575 160000 --- a/third-party/ao +++ b/third-party/ao @@ -1 +1 @@ -Subproject commit ddb7f83dad97f918bae0e84ae27c5cf47d8c64fe +Subproject commit 64bcf4c25755a783685ba7383000b3bf722523c1 From ad9532158ab5f7b72de15aefe17385c1d5ea213f Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Thu, 20 Mar 2025 11:34:56 -0700 Subject: [PATCH 5/5] up --- examples/models/llama/README.md | 2 +- install_requirements.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index ab435513643..07c90505237 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -382,7 +382,7 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de ## Running with low-bit kernels -We now give instructions for quantizating and running your model with low-bit kernels. These are still experimental, and require you do development on an Arm-based Mac. 
Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results.
+We now give instructions for quantizing and running your model with low-bit kernels. These are still experimental, and require you to do development on an Arm-based Mac. Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results. Currently dynamic shapes must be disabled when exporting a model with these kernels.
 
 First export your model for lowbit quantization (step 2 above):
 
diff --git a/install_requirements.py b/install_requirements.py
index d0a56d784d7..6770f8f98a1 100644
--- a/install_requirements.py
+++ b/install_requirements.py
@@ -119,7 +119,7 @@ def install_requirements(use_pytorch_nightly):
     # Install packages directly from local copy instead of pypi.
     # This is usually not recommended.
     new_env = os.environ.copy()
-    new_env["USE_CPP"] = "1"
+    new_env["USE_CPP"] = "1"  # install torchao kernels
     subprocess.run(
         [
             sys.executable,
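
Editor's note: the quantize.py change in PATCH 4/5 above replaces the old Int8DynActIntxWeightLinearQuantizer flow with torchao's config-based quantize_ API. Below is a minimal standalone sketch of that flow for reference, not part of the patch. It assumes a torchao build matching the pinned third-party/ao submodule; the toy model, bitwidth, and group size are illustrative values only.

# Sketch of the low-bit quantization flow adopted by the patch (assumptions noted above).
import torch
import torch.nn as nn

from torchao.experimental.quant_api import Int8DynamicActivationIntxWeightConfig
from torchao.quantization.granularity import PerGroup, PerRow
from torchao.quantization.quant_api import quantize_
from torchao.utils import unwrap_tensor_subclass

bitwidth = 3      # illustrative value; the patch allows 1-8
group_size = 128  # illustrative value; must be a multiple of 16

# Toy stand-in for the exported llama model.
model = nn.Sequential(nn.Linear(256, 256))

with torch.no_grad():
    quantize_(
        model,
        Int8DynamicActivationIntxWeightConfig(
            # Map the integer bitwidth to the matching torch dtype (e.g. torch.int3).
            weight_dtype=getattr(torch, f"int{bitwidth}"),
            # Per-row (channelwise) when no group size is given, otherwise groupwise.
            granularity=PerRow() if group_size in (0, -1) else PerGroup(group_size),
            has_weight_zeros=False,
        ),
    )
    # Materialize the quantized weights out of tensor subclasses before export.
    model = unwrap_tensor_subclass(model)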