From 9ab5bf97a06511c69170d383c10b39298e088a75 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Tue, 18 Mar 2025 21:09:38 -0700 Subject: [PATCH 1/5] up --- examples/models/llama/CMakeLists.txt | 20 ++++++---- .../llama/source_transformation/quantize.py | 39 +++++++++++++------ run.sh | 3 ++ 3 files changed, 43 insertions(+), 19 deletions(-) create mode 100644 run.sh diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index 959002892c6..297f0d26db7 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -116,16 +116,20 @@ endif() if(EXECUTORCH_BUILD_TORCHAO) set(TORCHAO_BUILD_EXECUTORCH_OPS ON) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental) + set(TORCHAO_BUILD_CPU_AARCH64 ON) + add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental + ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental + ) target_link_options_shared_lib(torchao_ops_executorch) list(APPEND link_libraries torchao_ops_executorch) - if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental/ops/mps - ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental/ops/mps) - target_link_options_shared_lib(torchao_ops_mps_executorch) - list(APPEND link_libraries torchao_ops_mps_executorch) - endif() + # if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") + # add_subdirectory( + # ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental/ops/mps + # ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental/ops/mps) + # target_link_options_shared_lib(torchao_ops_mps_executorch) + # list(APPEND link_libraries torchao_ops_mps_executorch) + # endif() endif() set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack) diff --git a/examples/models/llama/source_transformation/quantize.py b/examples/models/llama/source_transformation/quantize.py index d81c0849e62..59108cf8396 100644 --- a/examples/models/llama/source_transformation/quantize.py +++ b/examples/models/llama/source_transformation/quantize.py @@ -98,21 +98,38 @@ def quantize( # noqa C901 matches = re.findall(pattern, qmode) assert len(matches) == 1, f"Expected 1 match for pattern but got {len(matches)}" bitwidth = int(matches[0][0]) - _load_torchao_aten_lib(libname="libtorchao_ops_aten") - from torchao.experimental.quant_api import Int8DynActIntxWeightLinearQuantizer + # _load_torchao_aten_lib(libname="libtorchao_ops_aten") + # from torchao.experimental.quant_api import Int8DynActIntxWeightLinearQuantizer + from torchao.experimental.quant_api import int8_dynamic_activation_intx_weight, Int8DynActIntxWeightLinearQuantizer + from torchao.quantization.quant_api import quantize_ + from torchao.utils import unwrap_tensor_subclass + from torchao.quantization.granularity import PerRow, PerGroup with torch.no_grad(): - model = Int8DynActIntxWeightLinearQuantizer( - device="cpu", - precision=torch.float32, - groupsize=group_size, - bitwidth=bitwidth, - has_weight_zeros=False, - ).quantize(model) - + # model = Int8DynActIntxWeightLinearQuantizer( + # device="cpu", + # precision=torch.float32, + # groupsize=group_size, + # bitwidth=bitwidth, + # has_weight_zeros=False, + # ).quantize(model) + + 
quantize_(model, + int8_dynamic_activation_intx_weight( + # group_size=group_size, + # nbit=bitwidth, + # has_weight_zeros=False, + weight_dtype=getattr(torch, f"int{bitwidth}"), + granularity=PerRow() if group_size == 0 else PerGroup(group_size), + has_weight_zeros=False, + ), + ) + model = unwrap_tensor_subclass(model) if verbose: print("quantized model:", model) return model + + return model elif qmode == "8da4w": # Check for required args if group_size is None: @@ -752,7 +769,7 @@ def get_quant_embedding_transform(args): bitwidth, group_size = args.embedding_quantize.split(":")[1].split(",") group_size = int(group_size) bitwidth = int(bitwidth) - _load_torchao_aten_lib(libname="libtorchao_ops_aten") + # _load_torchao_aten_lib(libname="libtorchao_ops_aten") from torchao.experimental.quant_api import IntxWeightEmbeddingQuantizer def _torchao_embedding_quantizer(model): diff --git a/run.sh b/run.sh new file mode 100644 index 00000000000..bb810e8c16a --- /dev/null +++ b/run.sh @@ -0,0 +1,3 @@ +for i in {1..5}; do + ./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path=$TOKENIZER --prompt="Once upon a time," +done From 4030e773b32296165f1e3ea8cd9beec882f7149e Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Tue, 18 Mar 2025 21:10:27 -0700 Subject: [PATCH 2/5] up --- third-party/ao | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third-party/ao b/third-party/ao index 7d8794622f3..ddb7f83dad9 160000 --- a/third-party/ao +++ b/third-party/ao @@ -1 +1 @@ -Subproject commit 7d8794622f3ac7ffa98761314019a20fba06edef +Subproject commit ddb7f83dad97f918bae0e84ae27c5cf47d8c64fe From 716944f7467e017457a53e0e35b0c4c1451eda99 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Wed, 19 Mar 2025 10:25:17 -0700 Subject: [PATCH 3/5] up --- install_requirements.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/install_requirements.py b/install_requirements.py index 9353dad180e..31fb3a700e6 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -10,6 +10,7 @@ import re import subprocess import sys +import os def python_is_compatible(): @@ -117,6 +118,8 @@ def install_requirements(use_pytorch_nightly): # Install packages directly from local copy instead of pypi. # This is usually not recommended. 
+ new_env = os.environ.copy() + new_env["USE_CPP"] = "1" # build torchao kernels subprocess.run( [ sys.executable, @@ -127,6 +130,7 @@ def install_requirements(use_pytorch_nightly): "--no-build-isolation", *LOCAL_REQUIREMENTS, ], + env=new_env, check=True, ) From d5e46c81235be50f9b3021e8ad457acfef4c505f Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Wed, 19 Mar 2025 17:32:06 -0700 Subject: [PATCH 4/5] up --- .ci/scripts/test_llama_torchao_lowbit.sh | 85 +++++++++++++++++++ .github/workflows/trunk.yml | 24 +++++- examples/models/llama/CMakeLists.txt | 33 +++---- examples/models/llama/README.md | 73 ++++++++++++++++ .../llama/source_transformation/quantize.py | 30 ++----- install_requirements.py | 6 +- run.sh | 3 - third-party/ao | 2 +- 8 files changed, 210 insertions(+), 46 deletions(-) create mode 100644 .ci/scripts/test_llama_torchao_lowbit.sh delete mode 100644 run.sh diff --git a/.ci/scripts/test_llama_torchao_lowbit.sh b/.ci/scripts/test_llama_torchao_lowbit.sh new file mode 100644 index 00000000000..76fabb04250 --- /dev/null +++ b/.ci/scripts/test_llama_torchao_lowbit.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -exu + +source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" + +export EXECUTORCH_ROOT="$(dirname "${BASH_SOURCE[0]}")/../.." + +if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then + PYTHON_EXECUTABLE=python3 +fi + +which "${PYTHON_EXECUTABLE}" + +# Update tokenizers submodule +pushd $EXECUTORCH_ROOT/extension/llm/tokenizers +echo "Update tokenizers submodule" +git submodule update --init +popd + +# Install ET with CMake +cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DEXECUTORCH_ENABLE_LOGGING=1 \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -Bcmake-out . 
+cmake --build cmake-out -j16 --target install --config Release + +# Install llama runner with torchao +cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_PREFIX_PATH=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_TORCHAO=ON \ + -Bcmake-out/examples/models/llama \ + examples/models/llama +cmake --build cmake-out/examples/models/llama -j16 --config Release + +# Download stories llama110m artifacts +download_stories_model_artifacts + +echo "Creating tokenizer.bin" +$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin + +# Export model +LLAMA_CHECKPOINT=stories110M.pt +LLAMA_PARAMS=params.json +MODEL_OUT=model.pte +TOKENIZER=tokenizer.bin + +# Set low-bit quantization parameters +QLINEAR_BITWIDTH=3 # Can be 1-8 +QLINEAR_GROUP_SIZE=128 # Must be multiple of 16 +QEMBEDDING_BITWIDTH=4 # Can be 1-8 +QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16 + +${PYTHON_EXECUTABLE} -m examples.models.llama.export_llama \ + --checkpoint "${LLAMA_CHECKPOINT:?}" \ + --params "${LLAMA_PARAMS:?}" \ + -kv \ + --use_sdpa_with_kv_cache \ + --output_name=${MODEL_OUT} \ + -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \ + --group_size ${QLINEAR_GROUP_SIZE} \ + -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \ + --disable_dynamic_shape \ + -d fp32 + +# Test run +./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path=$TOKENIZER --prompt="Once upon a time," diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 7ba3e5fda75..097a272d0fe 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -23,8 +23,8 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main strategy: matrix: - # Mac runners are expensive and limited, and non reliable. - # Do some basic testing for macos jobs, and rely mostly on + # Mac runners are expensive and limited, and non reliable. + # Do some basic testing for macos jobs, and rely mostly on # test-models-linux-aarch64 job instead. 
model: [emformer_join, ic4, llama2, mobilebert, mv3, resnet50, vit, w2l] backend: [xnnpack-quantization-delegation] @@ -288,6 +288,26 @@ jobs: # Test ANE llama ${CONDA_RUN} sh .ci/scripts/test_ane_static_llama.sh + test-llama-torchao-lowbit: + name: test-llama-torchao-lowbit + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + runner: macos-m1-stable + python-version: '3.11' + submodules: 'true' + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + bash .ci/scripts/setup-conda.sh + eval "$(conda shell.bash hook)" + + # Install requirements + ${CONDA_RUN} python install_executorch.py + ${CONDA_RUN} sh examples/models/llama/install_requirements.sh + + # Run test + ${CONDA_RUN} sh .ci/scripts/test_llama_torchao_lowbit.sh + test-llama-runner-linux: # Test Both linux x86 and linux aarch64 name: test-llama-runner-linux diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index 297f0d26db7..e6d45424bd4 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -115,21 +115,24 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM) endif() if(EXECUTORCH_BUILD_TORCHAO) - set(TORCHAO_BUILD_EXECUTORCH_OPS ON) - set(TORCHAO_BUILD_CPU_AARCH64 ON) - add_subdirectory( - ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental - ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental - ) - target_link_options_shared_lib(torchao_ops_executorch) - list(APPEND link_libraries torchao_ops_executorch) - # if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") - # add_subdirectory( - # ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental/ops/mps - # ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental/ops/mps) - # target_link_options_shared_lib(torchao_ops_mps_executorch) - # list(APPEND link_libraries torchao_ops_mps_executorch) - # endif() + # Currently only enable this on Arm-based Macs + if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") + set(TORCHAO_BUILD_EXECUTORCH_OPS ON) + set(TORCHAO_BUILD_CPU_AARCH64 ON) + add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental + ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental + ) + target_link_options_shared_lib(torchao_ops_executorch) + list(APPEND link_libraries torchao_ops_executorch) + if(EXECUTORCH_BUILD_MPS) + add_subdirectory( + ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental/ops/mps + ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental/ops/mps) + target_link_options_shared_lib(torchao_ops_mps_executorch) + list(APPEND link_libraries torchao_ops_mps_executorch) + endif() + endif() endif() set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack) diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index 4c0cce4dd9e..ab435513643 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -380,6 +380,79 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de ### Android Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-demo-android.html) to for full instructions on building the Android LLAMA Demo App. +## Running with low-bit kernels + +We now give instructions for quantizating and running your model with low-bit kernels. 
These are still experimental, and require you do development on an Arm-based Mac. Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results. + +First export your model for lowbit quantization (step 2 above): + +``` +# Set these paths to point to the downloaded files +LLAMA_CHECKPOINT=path/to/checkpoint.pth +LLAMA_PARAMS=path/to/params.json + +# Set low-bit quantization parameters +QLINEAR_BITWIDTH=3 # Can be 1-8 +QLINEAR_GROUP_SIZE=128 # Must be multiple of 16 +QEMBEDDING_BITWIDTH=4 # Can be 1-8 +QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16 + +python -m examples.models.llama.export_llama \ + --model "llama3_2" \ + --checkpoint "${LLAMA_CHECKPOINT:?}" \ + --params "${LLAMA_PARAMS:?}" \ + -kv \ + --use_sdpa_with_kv_cache \ + --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \ + --output_name="llama3_2.pte" \ + -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \ + --group_size ${QLINEAR_GROUP_SIZE} \ + -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \ + --disable_dynamic_shape \ + -d fp32 +``` + +Once the model is exported, we need to build ExecuTorch and the runner with the low-bit kernels. + +The first step is to install ExecuTorch (the same as step 3.1 above): + +``` +cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DEXECUTORCH_ENABLE_LOGGING=1 \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_XNNPACK=ON \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -Bcmake-out . +cmake --build cmake-out -j16 --target install --config Release +``` + +Next install the llama runner with torchao kernels enabled (similar to step 3.2 above): + +``` +cmake -DPYTHON_EXECUTABLE=python \ + -DCMAKE_PREFIX_PATH=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') \ + -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DEXECUTORCH_BUILD_XNNPACK=OFF \ + -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ + -DEXECUTORCH_BUILD_TORCHAO=ON \ + -Bcmake-out/examples/models/llama \ + examples/models/llama +cmake --build cmake-out/examples/models/llama -j16 --config Release +``` + +Finally run your model (similar to step 3.3 above): + +``` +cmake-out/examples/models/llama/llama_main --model_path= --tokenizer_path= --prompt= +``` ## Utility tools for Llama enablement diff --git a/examples/models/llama/source_transformation/quantize.py b/examples/models/llama/source_transformation/quantize.py index 59108cf8396..d147e40b082 100644 --- a/examples/models/llama/source_transformation/quantize.py +++ b/examples/models/llama/source_transformation/quantize.py @@ -98,29 +98,20 @@ def quantize( # noqa C901 matches = re.findall(pattern, qmode) assert len(matches) == 1, f"Expected 1 match for pattern but got {len(matches)}" bitwidth = int(matches[0][0]) - # _load_torchao_aten_lib(libname="libtorchao_ops_aten") - # from torchao.experimental.quant_api import Int8DynActIntxWeightLinearQuantizer - from torchao.experimental.quant_api import int8_dynamic_activation_intx_weight, Int8DynActIntxWeightLinearQuantizer + + from torchao.experimental.quant_api import Int8DynamicActivationIntxWeightConfig + from torchao.quantization.granularity import PerGroup, PerRow from torchao.quantization.quant_api import quantize_ from torchao.utils 
import unwrap_tensor_subclass - from torchao.quantization.granularity import PerRow, PerGroup with torch.no_grad(): - # model = Int8DynActIntxWeightLinearQuantizer( - # device="cpu", - # precision=torch.float32, - # groupsize=group_size, - # bitwidth=bitwidth, - # has_weight_zeros=False, - # ).quantize(model) - - quantize_(model, - int8_dynamic_activation_intx_weight( - # group_size=group_size, - # nbit=bitwidth, - # has_weight_zeros=False, + quantize_( + model, + Int8DynamicActivationIntxWeightConfig( weight_dtype=getattr(torch, f"int{bitwidth}"), - granularity=PerRow() if group_size == 0 else PerGroup(group_size), + granularity=( + PerRow() if group_size in [0, -1] else PerGroup(group_size) + ), has_weight_zeros=False, ), ) @@ -128,8 +119,6 @@ def quantize( # noqa C901 if verbose: print("quantized model:", model) return model - - return model elif qmode == "8da4w": # Check for required args if group_size is None: @@ -769,7 +758,6 @@ def get_quant_embedding_transform(args): bitwidth, group_size = args.embedding_quantize.split(":")[1].split(",") group_size = int(group_size) bitwidth = int(bitwidth) - # _load_torchao_aten_lib(libname="libtorchao_ops_aten") from torchao.experimental.quant_api import IntxWeightEmbeddingQuantizer def _torchao_embedding_quantizer(model): diff --git a/install_requirements.py b/install_requirements.py index 31fb3a700e6..d0a56d784d7 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -6,11 +6,11 @@ # LICENSE file in the root directory of this source tree. import argparse +import os import platform import re import subprocess import sys -import os def python_is_compatible(): @@ -119,7 +119,7 @@ def install_requirements(use_pytorch_nightly): # Install packages directly from local copy instead of pypi. # This is usually not recommended. new_env = os.environ.copy() - new_env["USE_CPP"] = "1" # build torchao kernels + new_env["USE_CPP"] = "1" subprocess.run( [ sys.executable, @@ -147,8 +147,6 @@ def main(args): if __name__ == "__main__": - import os - # Before doing anything, cd to the directory containing this script. os.chdir(os.path.dirname(os.path.abspath(__file__))) if not python_is_compatible(): diff --git a/run.sh b/run.sh deleted file mode 100644 index bb810e8c16a..00000000000 --- a/run.sh +++ /dev/null @@ -1,3 +0,0 @@ -for i in {1..5}; do - ./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path=$TOKENIZER --prompt="Once upon a time," -done diff --git a/third-party/ao b/third-party/ao index ddb7f83dad9..64bcf4c2575 160000 --- a/third-party/ao +++ b/third-party/ao @@ -1 +1 @@ -Subproject commit ddb7f83dad97f918bae0e84ae27c5cf47d8c64fe +Subproject commit 64bcf4c25755a783685ba7383000b3bf722523c1 From ad9532158ab5f7b72de15aefe17385c1d5ea213f Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Thu, 20 Mar 2025 11:34:56 -0700 Subject: [PATCH 5/5] up --- examples/models/llama/README.md | 2 +- install_requirements.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md index ab435513643..07c90505237 100644 --- a/examples/models/llama/README.md +++ b/examples/models/llama/README.md @@ -382,7 +382,7 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de ## Running with low-bit kernels -We now give instructions for quantizating and running your model with low-bit kernels. These are still experimental, and require you do development on an Arm-based Mac. 
Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results.
+We now give instructions for quantizing and running your model with low-bit kernels. These are still experimental, and require you to do development on an Arm-based Mac. Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results. Currently dynamic shapes must be disabled when exporting a model with these kernels.
 
 First export your model for lowbit quantization (step 2 above):
 
diff --git a/install_requirements.py b/install_requirements.py
index d0a56d784d7..6770f8f98a1 100644
--- a/install_requirements.py
+++ b/install_requirements.py
@@ -119,7 +119,7 @@ def install_requirements(use_pytorch_nightly):
     # Install packages directly from local copy instead of pypi.
     # This is usually not recommended.
     new_env = os.environ.copy()
-    new_env["USE_CPP"] = "1"
+    new_env["USE_CPP"] = "1"  # install torchao kernels
     subprocess.run(
         [
             sys.executable,
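
Editor's note: the quantize.py change in PATCH 4/5 above replaces the old Int8DynActIntxWeightLinearQuantizer flow with torchao's config-based quantize_ API. Below is a minimal standalone sketch of that flow for reference, not part of the patch. It assumes a torchao build matching the pinned third-party/ao submodule; the toy model, bitwidth, and group size are illustrative values only.

# Sketch of the low-bit quantization flow adopted by the patch (assumptions noted above).
import torch
import torch.nn as nn

from torchao.experimental.quant_api import Int8DynamicActivationIntxWeightConfig
from torchao.quantization.granularity import PerGroup, PerRow
from torchao.quantization.quant_api import quantize_
from torchao.utils import unwrap_tensor_subclass

bitwidth = 3      # illustrative value; the patch allows 1-8
group_size = 128  # illustrative value; must be a multiple of 16

# Toy stand-in for the exported llama model.
model = nn.Sequential(nn.Linear(256, 256))

with torch.no_grad():
    quantize_(
        model,
        Int8DynamicActivationIntxWeightConfig(
            # Map the integer bitwidth to the matching torch dtype (e.g. torch.int3).
            weight_dtype=getattr(torch, f"int{bitwidth}"),
            # Per-row (channelwise) when no group size is given, otherwise groupwise.
            granularity=PerRow() if group_size in (0, -1) else PerGroup(group_size),
            has_weight_zeros=False,
        ),
    )
    # Materialize the quantized weights out of tensor subclasses before export.
    model = unwrap_tensor_subclass(model)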