124 changes: 102 additions & 22 deletions .github/workflows/pull.yml
@@ -434,7 +434,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v2
with:
python-version: 3.10.11
python-version: '3.10.11'
- name: Setup Xcode
if: runner.os == 'macOS'
uses: maxim-lobanov/setup-xcode@v1
@@ -577,7 +577,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v2
with:
python-version: 3.10.11
python-version: '3.10.11'
- name: Print machine info
run: |
uname -a
@@ -625,6 +625,7 @@ jobs:
with:
runner: macos-m1-stable # needs MPS, was macos-m1-stable
script: |
export PYTHON_VERSION="3.10"
set -x
# NS/MC: Remove previous installation of torch and torchao first
# as this script does not install anything into conda env but rather as system dep
@@ -737,6 +738,7 @@ jobs:
with:
runner: macos-m1-stable # needs MPS, was macos-m1-stable
script: |
export PYTHON_VERSION="3.10"
set -x
# NS/MC: Remove previous installation of torch and torchao first
# as this script does not install anything into conda env but rather as system dep
@@ -914,31 +916,19 @@ jobs:
continue-on-error: true
run: |
echo "Installing ExecuTorch"
bash torchchat/utils/scripts/build_native.sh et
- name: Install ET pip
bash torchchat/utils/scripts/install_et.sh
- name: Install ExecuTorch python
run: |
echo "ET build directory"
ls et-build | cat

echo "Install ExecuTorch python"
pushd et-build/src/executorch
if [ $(git rev-parse HEAD) != ${{env.et-git-hash}} ]; then
echo "Mismatched hash. Make sure branch install_et.sh matches branch from Github cache."
echo "On commit $(git rev-parse HEAD)"
echo "Expected commit ${{env.et-git-hash}}"
exit 1
fi
pip install .
chmod +x ./install_requirements.sh
chmod +x ./install_requirements.py
./install_requirements.sh
popd
- name: Install runner
run: |
# Pull submodules (re2, abseil) for Tiktoken
git submodule sync
git submodule update --init

export TORCHCHAT_ROOT=${PWD}
cmake -S . -B ./cmake-out -G Ninja
cmake --build ./cmake-out --target et_run

echo "Installing runner"
bash torchchat/utils/scripts/build_native.sh et
- name: Run inference
run: |
python torchchat.py download stories15M
@@ -1035,3 +1025,93 @@ jobs:
git submodule update --init
./runner/build_android.sh
echo "Tests complete."

test-torchao-experimental:
strategy:
matrix:
runner: [macos-14-xlarge]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout repo
uses: actions/checkout@v3
with:
submodules: true
- name: Setup Python
uses: actions/setup-python@v2
with:
python-version: '3.10.11'
- name: Setup Xcode
if: runner.os == 'macOS'
uses: maxim-lobanov/setup-xcode@v1
with:
xcode-version: '15.3'
- name: Print machine info
run: |
uname -a
if [ $(uname -s) == Darwin ]; then
sysctl machdep.cpu.brand_string
sysctl machdep.cpu.core_count
fi
- name: Install torchchat
run: |
echo "Intalling pip3 packages"
./install/install_requirements.sh
pip3 list
python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
- name: Install torchao-experimental
id: install-torchao-experimental
run: |
bash torchchat/utils/scripts/build_torchao_experimental.sh
- name: Set git shas
id: setup-hash
run: |
export TORCHCHAT_ROOT=${PWD}
echo "et-git-hash=$(cat ${TORCHCHAT_ROOT}/install/.pins/et-pin.txt)" >> "$GITHUB_ENV"
- name: Load or install ET
id: install-et
uses: actions/cache@v3
env:
cache-key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}
with:
path: ./et-build
key: ${{env.cache-key}}
restore-keys: |
${{env.cache-key}}
- if: ${{ steps.install-et.outputs.cache-hit != 'true' }}
continue-on-error: true
run: |
echo "Installing ExecuTorch"
bash torchchat/utils/scripts/install_et.sh
- name: Install runner
run: |
echo "Installing runner"
bash torchchat/utils/scripts/build_native.sh et link_torchao
- name: Install runner AOTI
id: install-runner-aoti
run: |
bash torchchat/utils/scripts/build_native.sh aoti link_torchao
- name: Run inference
run: |
python torchchat.py download stories110M
wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model

export PRMT="Once upon a time in a land far away"

echo "Generate eager"
python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'

echo "Generate compile"
python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --compile

echo "Export and run ET (C++ runner)"
python torchchat.py export stories110M --output-pte-path ./model.pte --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"

echo "Export and run AOTI (C++ runner)"
python torchchat.py export stories110M --output-dso-path ./model.so --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
./cmake-out/aoti_run ./model.so -z ./tokenizer.model -t 0 -i "${PRMT}"

echo "Generate AOTI"
python torchchat.py generate stories110M --dso-path ./model.so --prompt "${PRMT}"

echo "Tests complete."
1 change: 1 addition & 0 deletions .gitignore
@@ -14,6 +14,7 @@ __pycache__/
# Build directories
build/android/*
et-build/*
torchao-build/*
runner-et/cmake-out/*
runner-aoti/cmake-out/*
cmake-out/
61 changes: 61 additions & 0 deletions docs/quantization.md
@@ -118,6 +118,67 @@ python3 torchchat.py export llama3 --quantize '{"embedding": {"bitwidth": 4, "gr
python3 torchchat.py generate llama3 --pte-path llama3.pte --prompt "Hello my name is"
```

## Experimental TorchAO lowbit kernels

### Use
The a8wxdq quantization scheme dynamically quantizes activations to 8 bits and quantizes the weights groupwise with a specified bitwidth and groupsize.
It takes the arguments bitwidth (2, 3, 4, 5, 6, or 7), groupsize, and has_weight_zeros (true or false).
The argument has_weight_zeros indicates whether the weights are quantized with scales only (has_weight_zeros: false) or with both scales and zeros (has_weight_zeros: true).
Roughly speaking, {bitwidth: 4, groupsize: 256, has_weight_zeros: false} is similar to GGML's Q4_0 quantization scheme.
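
In torchchat these arguments are passed to the `--quantize` flag as a JSON payload under the `linear:a8wxdq` key; the payload below is the configuration used throughout the examples in this section:

```
{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}
```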

You should expect high performance on ARM CPUs if bitwidth is 2, 3, 4, or 5 and groupsize is divisible by 16. On other platforms and with other argument choices, a slow fallback kernel is used; you will see warnings about this during quantization.

### Setup
To use a8wxdq, you must set up the torchao experimental kernels. These will only work on devices with ARM CPUs, for example on Mac computers with Apple Silicon.

From the torchchat root directory, run
```
sh torchchat/utils/scripts/build_torchao_experimental.sh
```

This should take about 10 seconds to complete. Once finished, you can use a8wxdq in torchchat.

Note: if you want to use the new kernels in the AOTI and ExecuTorch C++ runners, you must pass the `link_torchao` flag when running the scripts that build the runners:

```
sh torchchat/utils/scripts/build_native.sh aoti link_torchao
```

```
sh torchchat/utils/scripts/build_native.sh et link_torchao
```
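
If the link step fails, a quick sanity check is to list the torchao build output that the runner CMake files (`runner/aoti.cmake` and `runner/et.cmake`) link against. A minimal sketch; library suffixes depend on your platform:

```
ls torchao-build/cmake-out/lib/
# Expect liblinear_a8wxdq_ATEN.<so|dylib> (linked by the AOTI runner) and
# libtorchao_ops_linear_EXECUTORCH.a / libtorchao_kernels_aarch64.a (linked by the ET runner).
```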

### Examples

#### Eager mode
```
python3 torchchat.py generate llama3 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
```

#### torch.compile
```
python3 torchchat.py generate llama3 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --compile
```

As with PyTorch in general, you can experiment with performance across a different number of threads by setting OMP_NUM_THREADS. For example:

```
OMP_NUM_THREADS=6 python3 torchchat.py generate llama3 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --compile
```

#### AOTI
```
python3 torchchat.py export llama3 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --output-dso-path llama3.so
python3 torchchat.py generate llama3 --dso-path llama3.so --prompt "Hello my name is"
```
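
If the AOTI runner was built with `link_torchao` (see Setup), the exported DSO can also be run from C++. For reference, the CI job in this PR invokes it as follows for the stories110M model; the tokenizer path and runner flags depend on the model you exported:

```
./cmake-out/aoti_run ./model.so -z ./tokenizer.model -t 0 -i "Once upon a time in a land far away"
```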

#### ExecuTorch
```
python3 torchchat.py export llama3 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --output-pte-path llama3.pte
```

Note: the exported *.pte file can only be run with torchchat's ExecuTorch C++ runner, built as described in the Setup section above (with `link_torchao`).
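
For reference, the CI job in this PR runs an exported `.pte` through that runner as follows for the stories110M model; the tokenizer path and runner flags depend on the model you exported:

```
./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "Once upon a time in a land far away"
```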

## Quantization Profiles

Four [sample profiles](https://github.com/pytorch/torchchat/tree/main/torchchat/quant_config/) are included with the torchchat distribution: `cuda.json`, `desktop.json`, `mobile.json`, `pi5.json`
2 changes: 1 addition & 1 deletion install/.pins/et-pin.txt
@@ -1 +1 @@
91298923a0076c1b41059efb6dad2876426e4b03
c75711cb329cab3df91fb9083a18373f9a568377
1 change: 1 addition & 0 deletions install/.pins/torchao-experimental-pin.txt
@@ -0,0 +1 @@
3fa38aaf1276e36845a82fb399e5054718a441c4
Contributor Author (PR comment): TODO: update to commit hash that contains D62394341 after it lands.

2 changes: 1 addition & 1 deletion install/requirements.txt
@@ -12,7 +12,7 @@ tiktoken
# Miscellaneous
snakeviz
sentencepiece
numpy < 2.0
numpy>=1.23.5,<2.0
gguf
lm-eval==0.4.2
blobfile
4 changes: 4 additions & 0 deletions runner/aoti.cmake
@@ -28,3 +28,7 @@ if(Torch_FOUND)
target_link_libraries(aoti_run "${TORCH_LIBRARIES}" m)
set_property(TARGET aoti_run PROPERTY CXX_STANDARD 17)
endif()

if (LINK_TORCHAO_CUSTOM_OPS)
target_link_libraries(aoti_run "${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/liblinear_a8wxdq_ATEN${CMAKE_SHARED_LIBRARY_SUFFIX}")
endif()
4 changes: 1 addition & 3 deletions runner/build_android.sh
@@ -24,8 +24,6 @@ export CMAKE_OUT_DIR="cmake-out-android"
export EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT="OFF"
export EXECUTORCH_BUILD_KERNELS_CUSTOM="ON"
export CMAKE_OUT_DIR="cmake-out-android"
# export DCMAKE_INSTALL_PREFIX=cmake-out-android
#

build_runner_et() {
rm -rf cmake-out-android
@@ -43,5 +41,5 @@ install_executorch_python_libs $ENABLE_ET_PYBIND
export CMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake
export ANDROID_ABI=arm64-v8a
export ANDROID_PLATFORM=android-23
install_executorch
install_executorch_cpp_libs
build_runner_et
12 changes: 11 additions & 1 deletion runner/et.cmake
@@ -62,7 +62,6 @@ if(executorch_FOUND)

set(EXECUTORCH_SRC_ROOT ${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/src/executorch)
set(XNNPACK_ROOT ${EXECUTORCH_SRC_ROOT}/backends/xnnpack)
list(APPEND _srcs ${XNNPACK_ROOT}/threadpool/cpuinfo_utils.cpp)
list(APPEND _common_include_directories
${XNNPACK_ROOT}/third-party/cpuinfo/include)

@@ -80,7 +79,9 @@ if(executorch_FOUND)
et_run PRIVATE
executorch
extension_module
extension_tensor
extension_data_loader
extension_threadpool
optimized_kernels
quantized_kernels
portable_kernels
@@ -111,6 +112,15 @@ if(executorch_FOUND)
target_link_libraries(et_run PRIVATE log)
endif()

if(LINK_TORCHAO_CUSTOM_OPS)
# target_link_libraries(et_run PRIVATE "${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/liblinear_a8wxdq_EXECUTORCH${CMAKE_SHARED_LIBRARY_SUFFIX}")
Contributor Author (PR comment): Uncomment this line to test with dylib (poor perf vs. static lib).
target_link_libraries(et_run PRIVATE "$<LINK_LIBRARY:WHOLE_ARCHIVE,${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/liblinear_a8wxdq_EXECUTORCH.a>")
target_link_libraries(et_run PRIVATE
"${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/libtorchao_ops_linear_EXECUTORCH.a"
"${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/libtorchao_kernels_aarch64.a"
)
endif()

# Adding target_link_options_shared_lib as commented out below leads to this:
#
# CMake Error at Utils.cmake:22 (target_link_options):
17 changes: 9 additions & 8 deletions runner/run.cpp
@@ -39,19 +39,20 @@ torch::Device aoti_device(torch::kCPU);

#else // __ET_MODEL__
#include <executorch/extension/module/module.h>
#include <executorch/extension/runner_util/managed_tensor.h>
#include <executorch/extension/tensor/tensor_ptr.h>
#include <executorch/runtime/core/evalue.h>
#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>

#if defined(ET_USE_ADAPTIVE_THREADS)
#include <executorch/backends/xnnpack/threadpool/cpuinfo_utils.h>
#include <executorch/backends/xnnpack/threadpool/threadpool.h>
#include <executorch/extension/threadpool/cpuinfo_utils.h>
#include <executorch/extension/threadpool/threadpool.h>
#endif

using exec_aten::ScalarType;
using torch::executor::EValue;
using torch::executor::ManagedTensor;
using executorch::extension::TensorPtr;
using executorch::extension::make_tensor_ptr;
using torch::executor::Module;
using torch::executor::Result;
#endif
@@ -212,11 +213,11 @@ float* forward(Transformer* transformer, int token, int pos) {
.to(torch::kCPU);
auto logits = result[0].data_ptr();
#else // __ET_MODEL__
ManagedTensor pos_managed(pos_buffer, {1}, ScalarType::Long);
ManagedTensor tokens_managed(token_buffer, {1, 1}, ScalarType::Long);
TensorPtr pos_managed = make_tensor_ptr(ScalarType::Long, {1}, pos_buffer);
TensorPtr tokens_managed = make_tensor_ptr(ScalarType::Long, {1, 1}, token_buffer);
std::vector<EValue> inputs;
auto tmp1 = EValue(tokens_managed.get_aliasing_tensor());
auto tmp2 = EValue(pos_managed.get_aliasing_tensor());
auto tmp1 = EValue(tokens_managed);
auto tmp2 = EValue(pos_managed);

inputs.push_back(tmp1);
inputs.push_back(tmp2);
3 changes: 1 addition & 2 deletions torchchat/export.py
@@ -194,7 +194,7 @@ def forward(self, x, freqs_cis, mask, input_pos=None):
return self.wo(output)

def replace_attention_with_custom_sdpa_attention(module: nn.Module):
from executorch.examples.models.llama2.custom_ops import ( # noqa
from executorch.extension.llm.custom_ops import ( # noqa
sdpa_with_kv_cache,
)

@@ -304,7 +304,6 @@ def export_for_et(model, device, output_path) -> str:
edge_manager = edge_manager.to_backend(XnnpackDynamicallyQuantizedPartitioner())
export_program = edge_manager.to_executorch(
ExecutorchBackendConfig(
extract_constant_segment=True,
extract_delegate_segments=True,
passes=[
QuantFusionPass(),
2 changes: 1 addition & 1 deletion torchchat/model.py
@@ -961,7 +961,7 @@ def apply_rotary_emb(x: Tensor, freqs_cis: Tensor) -> Tensor:
from executorch.extension.pybindings import portable_lib as exec_lib

# ET changed the way it's loading the custom ops so it's not included in portable_lib but has to be loaded separately.
from executorch.examples.models.llama2.custom_ops import sdpa_with_kv_cache # no-qa
from executorch.extension.llm.custom_ops import sdpa_with_kv_cache # no-qa

class PTEModel(nn.Module):
def __init__(self, config, path) -> None: