diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 9d3ad63e3..55fe8f11d 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -1103,7 +1103,7 @@ jobs:
         with:
           path: |
             ./et-build
-            ./torchchat/utils/scripts
+            ./torchchat/utils/scripts/install_et.sh
           key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}-${{ hashFiles('**/install_et.sh') }}
       - if: ${{ steps.install-et.outputs.cache-hit != 'true' }}
         continue-on-error: true
diff --git a/docs/quantization.md b/docs/quantization.md
index 2fac20fc1..24a12fcbd 100644
--- a/docs/quantization.md
+++ b/docs/quantization.md
@@ -122,11 +122,11 @@ python3 torchchat.py generate llama3 --pte-path llama3.pte --prompt "Hello my n
 
 ### Use
 The quantization scheme a8wxdq dynamically quantizes activations to 8 bits, and quantizes the weights in a groupwise manner with a specified bitwidth and groupsize.
-It takes arguments bitwidth (2, 3, 4, 5, 6, 7), groupsize, and has_weight_zeros (true, false).
+It takes arguments bitwidth (1, 2, 3, 4, 5, 6, 7), groupsize, and has_weight_zeros (true, false).
 The argument has_weight_zeros indicates whether the weights are quantized with scales only (has_weight_zeros: false) or with both scales and zeros (has_weight_zeros: true).
 Roughly speaking, {bitwidth: 4, groupsize: 256, has_weight_zeros: false} is similar to GGML's Q4_0 quantization scheme.
 
-You should expect high performance on ARM CPU if bitwidth is 2, 3, 4, or 5 and groupsize is divisible by 16. With other platforms and argument choices, a slow fallback kernel will be used. You will see warnings about this during quantization.
+You should expect high performance on ARM CPU if bitwidth is 1, 2, 3, 4, or 5 and groupsize is divisible by 16. With other platforms and argument choices, a slow fallback kernel will be used. You will see warnings about this during quantization.
 
 ### Setup
 To use a8wxdq, you must set up the torchao experimental kernels. These will only work on devices with ARM CPUs, for example on Mac computers with Apple Silicon.
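
Note on the docs hunk above: a8wxdq is driven through torchchat's --quantize JSON config. Below is a minimal sketch of exercising the newly allowed bitwidth 1; the "linear:a8wxdq" config key and the export flags are assumed from other examples in docs/quantization.md, not from this diff.

    # Sketch under the assumptions above; bitwidth 1 is the value this PR enables.
    # groupsize 32 is divisible by 16, per the fast-path condition in the docs.
    python3 torchchat.py export llama3 \
      --quantize '{"linear:a8wxdq": {"bitwidth": 1, "groupsize": 32, "has_weight_zeros": false}}' \
      --output-pte-path llama3.pte
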
diff --git a/install/.pins/torchao-pin.txt b/install/.pins/torchao-pin.txt
index b28bd09cd..a6a965960 100644
--- a/install/.pins/torchao-pin.txt
+++ b/install/.pins/torchao-pin.txt
@@ -1 +1 @@
-63cb7a9857654784f726fec75c0dc36167094d8a
+ae3e7c68eae7085e13241cb3d6b39481868dd162
diff --git a/runner/aoti.cmake b/runner/aoti.cmake
index 082a6f5ce..ae907b391 100644
--- a/runner/aoti.cmake
+++ b/runner/aoti.cmake
@@ -30,5 +30,5 @@ if(Torch_FOUND)
 endif()
 
 if (LINK_TORCHAO_OPS)
-  target_link_libraries(aoti_run "${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/liblinear_a8wxdq_ATEN${CMAKE_SHARED_LIBRARY_SUFFIX}")
+  target_link_libraries(aoti_run "${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/libtorchao_ops_aten${CMAKE_SHARED_LIBRARY_SUFFIX}")
 endif()
diff --git a/runner/et.cmake b/runner/et.cmake
index c788ead56..916ce9ea8 100644
--- a/runner/et.cmake
+++ b/runner/et.cmake
@@ -117,10 +117,9 @@ if(executorch_FOUND)
   endif()
 
   if(LINK_TORCHAO_OPS)
-    target_link_libraries(et_run PRIVATE "$<LINK_LIBRARY:WHOLE_ARCHIVE,${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/libtorchao_ops_linear_EXECUTORCH.a>")
+    target_link_libraries(et_run PRIVATE "$<LINK_LIBRARY:WHOLE_ARCHIVE,${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/libtorchao_ops_executorch.a>")
     target_link_libraries(et_run PRIVATE
       "${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/libtorchao_kernels_aarch64.a"
-      "${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/libtorchao_ops_linear_EXECUTORCH.a"
     )
   endif()
 
diff --git a/torchchat/utils/quantize.py b/torchchat/utils/quantize.py
index abca48d25..1be34a1c5 100644
--- a/torchchat/utils/quantize.py
+++ b/torchchat/utils/quantize.py
@@ -898,7 +898,7 @@ def quantized_model(self) -> nn.Module:
     # Try loading custom op
     try:
         import glob
-        libs = glob.glob(f"{torchao_build_path}/cmake-out/lib/liblinear_a8wxdq_ATEN.*")
+        libs = glob.glob(f"{torchao_build_path}/cmake-out/lib/libtorchao_ops_aten.*")
         libs = list(filter(lambda l: (l.endswith("so") or l.endswith("dylib")), libs))
         torch.ops.load_library(libs[0])
     except Exception as e:
diff --git a/torchchat/utils/scripts/install_utils.sh b/torchchat/utils/scripts/install_utils.sh
index f5698237f..10405382e 100644
--- a/torchchat/utils/scripts/install_utils.sh
+++ b/torchchat/utils/scripts/install_utils.sh
@@ -191,7 +191,7 @@ install_torchao_aten_ops() {
   cmake -DCMAKE_PREFIX_PATH=${MY_CMAKE_PREFIX_PATH} \
     -DCMAKE_INSTALL_PREFIX=${CMAKE_OUT_DIR} \
     -DCMAKE_BUILD_TYPE="Release" \
-    -DTORCHAO_OP_TARGET="ATEN" \
+    -DTORCHAO_OP_TARGET="aten" \
     -S . \
     -B ${CMAKE_OUT_DIR} -G Ninja
   cmake --build ${CMAKE_OUT_DIR} --target install --config Release
@@ -207,7 +207,7 @@ install_torchao_executorch_ops() {
   cmake -DCMAKE_PREFIX_PATH=${MY_CMAKE_PREFIX_PATH} \
     -DCMAKE_INSTALL_PREFIX=${CMAKE_OUT_DIR} \
     -DCMAKE_BUILD_TYPE="Release" \
-    -DTORCHAO_OP_TARGET="EXECUTORCH" \
+    -DTORCHAO_OP_TARGET="executorch" \
    -DEXECUTORCH_INCLUDE_DIRS="${EXECUTORCH_INCLUDE_DIRS}" \
    -DEXECUTORCH_LIBRARIES="${EXECUTORCH_LIBRARIES}" \
    -S . \
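
Note on the renames above: quantize.py now globs for libtorchao_ops_aten.*, so the library name produced by install_torchao_aten_ops must match. A sketch of an end-to-end check, assuming the script's environment (e.g. TORCHCHAT_ROOT and the torchao checkout) is already set up and install_torchao_aten_ops takes no arguments; none of this is part of the PR:

    # Rebuild the ATen ops with the lowercase TORCHAO_OP_TARGET and confirm
    # the library lands where torchchat/utils/quantize.py now globs for it.
    source torchchat/utils/scripts/install_utils.sh
    install_torchao_aten_ops
    ls torchao-build/cmake-out/lib/libtorchao_ops_aten.*   # expect a .so or .dylib
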