124 changes: 102 additions & 22 deletions .github/workflows/pull.yml
@@ -434,7 +434,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v2
with:
python-version: 3.10.11
python-version: '3.10.11'
- name: Setup Xcode
if: runner.os == 'macOS'
uses: maxim-lobanov/setup-xcode@v1
@@ -577,7 +577,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v2
with:
python-version: 3.10.11
python-version: '3.10.11'
- name: Print machine info
run: |
uname -a
@@ -625,6 +625,7 @@ jobs:
with:
runner: macos-m1-stable # needs MPS, was macos-m1-stable
script: |
export PYTHON_VERSION="3.10"
set -x
# NS/MC: Remove previous installation of torch and torchao first
# as this script does not install anything into conda env but rather as system dep
@@ -737,6 +738,7 @@ jobs:
with:
runner: macos-m1-stable # needs MPS, was macos-m1-stable
script: |
export PYTHON_VERSION="3.10"
set -x
# NS/MC: Remove previous installation of torch and torchao first
# as this script does not install anything into conda env but rather as system dep
@@ -914,31 +916,19 @@ jobs:
continue-on-error: true
run: |
echo "Installing ExecuTorch"
bash torchchat/utils/scripts/build_native.sh et
- name: Install ET pip
bash torchchat/utils/scripts/install_et.sh
- name: Install ExecuTorch python
run: |
echo "ET build directory"
ls et-build | cat

echo "Install ExecuTorch python"
pushd et-build/src/executorch
if [ $(git rev-parse HEAD) != ${{env.et-git-hash}} ]; then
echo "Mismatched hash. Make sure branch install_et.sh matches branch from Github cache."
echo "On commit $(git rev-parse HEAD)"
echo "Expected commit ${{env.et-git-hash}}"
exit 1
fi
pip install .
chmod +x ./install_requirements.sh
chmod +x ./install_requirements.py
./install_requirements.sh
popd
- name: Install runner
run: |
# Pull submodules (re2, abseil) for Tiktoken
git submodule sync
git submodule update --init

export TORCHCHAT_ROOT=${PWD}
cmake -S . -B ./cmake-out -G Ninja
cmake --build ./cmake-out --target et_run

echo "Installing runner"
bash torchchat/utils/scripts/build_native.sh et
- name: Run inference
run: |
python torchchat.py download stories15M
@@ -1035,3 +1025,93 @@ jobs:
git submodule update --init
./runner/build_android.sh
echo "Tests complete."

test-torchao-experimental:
strategy:
matrix:
runner: [macos-14-xlarge]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout repo
uses: actions/checkout@v3
with:
submodules: true
- name: Setup Python
uses: actions/setup-python@v2
with:
python-version: '3.10.11'
- name: Setup Xcode
if: runner.os == 'macOS'
uses: maxim-lobanov/setup-xcode@v1
with:
xcode-version: '15.3'
- name: Print machine info
run: |
uname -a
if [ $(uname -s) == Darwin ]; then
sysctl machdep.cpu.brand_string
sysctl machdep.cpu.core_count
fi
- name: Install torchchat
run: |
echo "Intalling pip3 packages"
./install/install_requirements.sh
pip3 list
python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
- name: Install torchao-experimental
id: install-torchao-experimental
run: |
bash torchchat/utils/scripts/build_torchao_experimental.sh
- name: Set git shas
id: setup-hash
run: |
export TORCHCHAT_ROOT=${PWD}
echo "et-git-hash=$(cat ${TORCHCHAT_ROOT}/install/.pins/et-pin.txt)" >> "$GITHUB_ENV"
- name: Load or install ET
id: install-et
uses: actions/cache@v3
env:
cache-key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}
with:
path: ./et-build
key: ${{env.cache-key}}
restore-keys: |
${{env.cache-key}}
- if: ${{ steps.install-et.outputs.cache-hit != 'true' }}
continue-on-error: true
run: |
echo "Installing ExecuTorch"
bash torchchat/utils/scripts/install_et.sh
- name: Install runner
run: |
echo "Installing runner"
bash torchchat/utils/scripts/build_native.sh et link_torchao
- name: Install runner AOTI
id: install-runner-aoti
run: |
bash torchchat/utils/scripts/build_native.sh aoti link_torchao
- name: Run inference
run: |
python torchchat.py download stories110M
wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model

export PRMT="Once upon a time in a land far away"

echo "Generate eager"
python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'

echo "Generate compile"
python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --compile

echo "Export and run ET (C++ runner)"
python torchchat.py export stories110M --output-pte-path ./model.pte --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"

echo "Export and run AOTI (C++ runner)"
python torchchat.py export stories110M --output-dso-path ./model.so --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
./cmake-out/aoti_run ./model.so -z ./tokenizer.model -t 0 -i "${PRMT}"

echo "Generate AOTI"
python torchchat.py generate stories110M --dso-path ./model.so --prompt "${PRMT}"

echo "Tests complete."
1 change: 1 addition & 0 deletions .gitignore
@@ -14,6 +14,7 @@ __pycache__/
# Build directories
build/android/*
et-build/*
torchao-build/*
runner-et/cmake-out/*
runner-aoti/cmake-out/*
cmake-out/
61 changes: 61 additions & 0 deletions docs/quantization.md
@@ -118,6 +118,67 @@ python3 torchchat.py export llama3 --quantize '{"embedding": {"bitwidth": 4, "gr
python3 torchchat.py generate llama3 --pte-path llama3.pte --prompt "Hello my name is"
```

## Experimental TorchAO lowbit kernels

### Use
The a8wxdq quantization scheme dynamically quantizes activations to 8 bits and quantizes the weights groupwise with a specified bitwidth and groupsize.
It takes the arguments bitwidth (2, 3, 4, 5, 6, or 7), groupsize, and has_weight_zeros (true or false).
The argument has_weight_zeros indicates whether the weights are quantized with scales only (has_weight_zeros: false) or with both scales and zeros (has_weight_zeros: true).
Roughly speaking, {bitwidth: 4, groupsize: 256, has_weight_zeros: false} is similar to GGML's Q4_0 quantization scheme.
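
In torchchat these arguments are passed to the `--quantize` flag as a JSON payload under the `linear:a8wxdq` key; the payload below is the configuration used throughout the examples in this section:

```
{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}
```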

You should expect high performance on ARM CPUs if bitwidth is 2, 3, 4, or 5 and groupsize is divisible by 16. On other platforms and with other argument choices, a slow fallback kernel is used; you will see warnings about this during quantization.

### Setup
To use a8wxdq, you must set up the torchao experimental kernels. These will only work on devices with ARM CPUs, for example on Mac computers with Apple Silicon.

From the torchchat root directory, run
```
sh torchchat/utils/scripts/build_torchao_experimental.sh
```

This should take about 10 seconds to complete. Once finished, you can use a8wxdq in torchchat.

Note: if you want to use the new kernels in the AOTI and ExecuTorch C++ runners, you must pass the `link_torchao` flag when running the scripts that build the runners:

```
sh torchchat/utils/scripts/build_native.sh aoti link_torchao
```

```
sh torchchat/utils/scripts/build_native.sh et link_torchao
```
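
If the link step fails, a quick sanity check is to list the torchao build output that the runner CMake files (`runner/aoti.cmake` and `runner/et.cmake`) link against. A minimal sketch; library suffixes depend on your platform:

```
ls torchao-build/cmake-out/lib/
# Expect liblinear_a8wxdq_ATEN.<so|dylib> (linked by the AOTI runner) and
# libtorchao_ops_linear_EXECUTORCH.a / libtorchao_kernels_aarch64.a (linked by the ET runner).
```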

### Examples

#### Eager mode
```
python3 torchchat.py generate llama3 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
```

#### torch.compile
```
python3 torchchat.py generate llama3 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --compile
```

As with PyTorch in general, you can experiment with performance across a different number of threads by setting OMP_NUM_THREADS. For example:

```
OMP_NUM_THREADS=6 python3 torchchat.py generate llama3 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --compile
```

#### AOTI
```
python3 torchchat.py export llama3 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --output-dso-path llama3.so
python3 torchchat.py generate llama3 --dso-path llama3.so --prompt "Hello my name is"
```
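
If the AOTI runner was built with `link_torchao` (see Setup), the exported DSO can also be run from C++. For reference, the CI job in this PR invokes it as follows for the stories110M model; the tokenizer path and runner flags depend on the model you exported:

```
./cmake-out/aoti_run ./model.so -z ./tokenizer.model -t 0 -i "Once upon a time in a land far away"
```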

#### ExecuTorch
```
python3 torchchat.py export llama3 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --output-pte-path llama3.pte
```

Note: the exported *.pte file can only be run with torchchat's ExecuTorch C++ runner, built as described in the Setup section above (with `link_torchao`).
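
For reference, the CI job in this PR runs an exported `.pte` through that runner as follows for the stories110M model; the tokenizer path and runner flags depend on the model you exported:

```
./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "Once upon a time in a land far away"
```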

## Quantization Profiles

Four [sample profiles](https://github.com/pytorch/torchchat/tree/main/torchchat/quant_config/) are included with the torchchat distribution: `cuda.json`, `desktop.json`, `mobile.json`, `pi5.json`
2 changes: 1 addition & 1 deletion install/.pins/et-pin.txt
@@ -1 +1 @@
91298923a0076c1b41059efb6dad2876426e4b03
c75711cb329cab3df91fb9083a18373f9a568377
1 change: 1 addition & 0 deletions install/.pins/torchao-experimental-pin.txt
@@ -0,0 +1 @@
3fa38aaf1276e36845a82fb399e5054718a441c4
Contributor Author (PR comment): TODO: update to commit hash that contains D62394341 after it lands.

2 changes: 1 addition & 1 deletion install/requirements.txt
@@ -12,7 +12,7 @@ tiktoken
# Miscellaneous
snakeviz
sentencepiece
numpy < 2.0
numpy>=1.23.5,<2.0
gguf
lm-eval==0.4.2
blobfile
4 changes: 4 additions & 0 deletions runner/aoti.cmake
@@ -28,3 +28,7 @@ if(Torch_FOUND)
target_link_libraries(aoti_run "${TORCH_LIBRARIES}" m)
set_property(TARGET aoti_run PROPERTY CXX_STANDARD 17)
endif()

if (LINK_TORCHAO_CUSTOM_OPS)
target_link_libraries(aoti_run "${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/liblinear_a8wxdq_ATEN${CMAKE_SHARED_LIBRARY_SUFFIX}")
endif()
4 changes: 1 addition & 3 deletions runner/build_android.sh
@@ -24,8 +24,6 @@ export CMAKE_OUT_DIR="cmake-out-android"
export EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT="OFF"
export EXECUTORCH_BUILD_KERNELS_CUSTOM="ON"
export CMAKE_OUT_DIR="cmake-out-android"
# export DCMAKE_INSTALL_PREFIX=cmake-out-android
#

build_runner_et() {
rm -rf cmake-out-android
@@ -43,5 +41,5 @@ install_executorch_python_libs $ENABLE_ET_PYBIND
export CMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake
export ANDROID_ABI=arm64-v8a
export ANDROID_PLATFORM=android-23
install_executorch
install_executorch_cpp_libs
build_runner_et
12 changes: 11 additions & 1 deletion runner/et.cmake
@@ -62,7 +62,6 @@ if(executorch_FOUND)

set(EXECUTORCH_SRC_ROOT ${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/src/executorch)
set(XNNPACK_ROOT ${EXECUTORCH_SRC_ROOT}/backends/xnnpack)
list(APPEND _srcs ${XNNPACK_ROOT}/threadpool/cpuinfo_utils.cpp)
list(APPEND _common_include_directories
${XNNPACK_ROOT}/third-party/cpuinfo/include)

@@ -80,7 +79,9 @@ if(executorch_FOUND)
et_run PRIVATE
executorch
extension_module
extension_tensor
extension_data_loader
extension_threadpool
optimized_kernels
quantized_kernels
portable_kernels
@@ -111,6 +112,15 @@ if(executorch_FOUND)
target_link_libraries(et_run PRIVATE log)
endif()

if(LINK_TORCHAO_CUSTOM_OPS)
# target_link_libraries(et_run PRIVATE "${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/liblinear_a8wxdq_EXECUTORCH${CMAKE_SHARED_LIBRARY_SUFFIX}")
Contributor Author (PR comment): Uncomment this line to test with dylib (poor perf vs. static lib).
target_link_libraries(et_run PRIVATE "$<LINK_LIBRARY:WHOLE_ARCHIVE,${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/liblinear_a8wxdq_EXECUTORCH.a>")
target_link_libraries(et_run PRIVATE
"${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/libtorchao_ops_linear_EXECUTORCH.a"
"${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/libtorchao_kernels_aarch64.a"
)
endif()

# Adding target_link_options_shared_lib as commented out below leads to this:
#
# CMake Error at Utils.cmake:22 (target_link_options):
17 changes: 9 additions & 8 deletions runner/run.cpp
@@ -39,19 +39,20 @@ torch::Device aoti_device(torch::kCPU);

#else // __ET_MODEL__
#include <executorch/extension/module/module.h>
#include <executorch/extension/runner_util/managed_tensor.h>
#include <executorch/extension/tensor/tensor_ptr.h>
#include <executorch/runtime/core/evalue.h>
#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>

#if defined(ET_USE_ADAPTIVE_THREADS)
#include <executorch/backends/xnnpack/threadpool/cpuinfo_utils.h>
#include <executorch/backends/xnnpack/threadpool/threadpool.h>
#include <executorch/extension/threadpool/cpuinfo_utils.h>
#include <executorch/extension/threadpool/threadpool.h>
#endif

using exec_aten::ScalarType;
using torch::executor::EValue;
using torch::executor::ManagedTensor;
using executorch::extension::TensorPtr;
using executorch::extension::make_tensor_ptr;
using torch::executor::Module;
using torch::executor::Result;
#endif
@@ -212,11 +213,11 @@ float* forward(Transformer* transformer, int token, int pos) {
.to(torch::kCPU);
auto logits = result[0].data_ptr();
#else // __ET_MODEL__
ManagedTensor pos_managed(pos_buffer, {1}, ScalarType::Long);
ManagedTensor tokens_managed(token_buffer, {1, 1}, ScalarType::Long);
TensorPtr pos_managed = make_tensor_ptr(ScalarType::Long, {1}, pos_buffer);
TensorPtr tokens_managed = make_tensor_ptr(ScalarType::Long, {1, 1}, token_buffer);
std::vector<EValue> inputs;
auto tmp1 = EValue(tokens_managed.get_aliasing_tensor());
auto tmp2 = EValue(pos_managed.get_aliasing_tensor());
auto tmp1 = EValue(tokens_managed);
auto tmp2 = EValue(pos_managed);

inputs.push_back(tmp1);
inputs.push_back(tmp2);
3 changes: 1 addition & 2 deletions torchchat/export.py
@@ -194,7 +194,7 @@ def forward(self, x, freqs_cis, mask, input_pos=None):
return self.wo(output)

def replace_attention_with_custom_sdpa_attention(module: nn.Module):
from executorch.examples.models.llama2.custom_ops import ( # noqa
from executorch.extension.llm.custom_ops import ( # noqa
sdpa_with_kv_cache,
)

@@ -304,7 +304,6 @@ def export_for_et(model, device, output_path) -> str:
edge_manager = edge_manager.to_backend(XnnpackDynamicallyQuantizedPartitioner())
export_program = edge_manager.to_executorch(
ExecutorchBackendConfig(
extract_constant_segment=True,
extract_delegate_segments=True,
passes=[
QuantFusionPass(),
2 changes: 1 addition & 1 deletion torchchat/model.py
@@ -961,7 +961,7 @@ def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor:
from executorch.extension.pybindings import portable_lib as exec_lib

# ET changed the way it's loading the custom ops so it's not included in portable_lib but has to be loaded separately.
from executorch.examples.models.llama2.custom_ops import sdpa_with_kv_cache # no-qa
from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # no-qa

class PTEModel(nn.Module):
def __init__(self, config, path) -> None: