init

metascroy · metascroy · commit cc1697093035 · 2024-09-19T15:06:08.000-07:00
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
@@ -1025,93 +1025,3 @@ jobs:
           git submodule update --init
           ./runner/build_android.sh
           echo "Tests complete."
-
-  test-torchao-experimental:
-    strategy:
-      matrix:
-        runner: [macos-14-xlarge]
-    runs-on: ${{matrix.runner}}
-    steps:
-      - name: Checkout repo
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-      - name: Setup Python
-        uses: actions/setup-python@v2
-        with:
-          python-version: 3.10.11
-      - name: Setup Xcode
-        if: runner.os == 'macOS'
-        uses: maxim-lobanov/setup-xcode@v1
-        with:
-          xcode-version: '15.3'
-      - name: Print machine info
-        run: |
-          uname -a
-          if [ $(uname -s) == Darwin ]; then
-            sysctl machdep.cpu.brand_string
-            sysctl machdep.cpu.core_count
-          fi
-      - name: Install torchchat
-        run: |
-          echo "Intalling pip3 packages"
-          ./install/install_requirements.sh
-          pip3 list
-          python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
-      - name: Install torchao-experimental
-        id: install-torchao-experimental
-        run: |
-          bash torchchat/utils/scripts/build_torchao_experimental.sh
-      - name: Set git shas
-        id: setup-hash
-        run: |
-          export TORCHCHAT_ROOT=${PWD}
-          echo "et-git-hash=$(cat ${TORCHCHAT_ROOT}/install/.pins/et-pin.txt)" >> "$GITHUB_ENV"
-      - name: Load or install ET
-        id: install-et
-        uses: actions/cache@v3
-        env:
-          cache-key: et-build-${{runner.os}}-${{runner.arch}}-${{env.et-git-hash}}
-        with:
-          path: ./et-build
-          key: ${{env.cache-key}}
-          restore-keys: |
-            ${{env.cache-key}}
-      - if: ${{ steps.install-et.outputs.cache-hit != 'true' }}
-        continue-on-error: true
-        run: |
-          echo "Installing ExecuTorch"
-          bash torchchat/utils/scripts/install_et.sh
-      - name: Install runner
-        run: |
-          echo "Installing runner"
-          bash torchchat/utils/scripts/build_native.sh et link_torchao
-      - name: Install runner AOTI
-        id: install-runner-aoti
-        run: |
-          bash torchchat/utils/scripts/build_native.sh aoti link_torchao
-      - name: Run inference
-        run: |
-          python torchchat.py download stories110M
-          wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
-
-          export PRMT="Once upon a time in a land far away"
-
-          echo "Generate eager"
-          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}"  --device cpu --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
-
-          echo "Generate compile"
-          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}"  --device cpu --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --compile
-
-          echo "Export and run ET (C++ runner)"
-          python torchchat.py export stories110M --output-pte-path ./model.pte --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
-          ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
-
-          echo "Export and run AOTI (C++ runner)"
-          python torchchat.py export stories110M --output-dso-path ./model.so --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
-          ./cmake-out/aoti_run ./model.so -z ./tokenizer.model -t 0 -i "${PRMT}"
-
-          echo "Generate AOTI"
-          python torchchat.py generate stories110M --dso-path ./model.so --prompt "${PRMT}"
-
-          echo "Tests complete."
diff --git a/docs/quantization.md b/docs/quantization.md
@@ -118,66 +118,6 @@ python3 torchchat.py export llama3 --quantize '{"embedding": {"bitwidth": 4, "gr
 python3 torchchat.py generate llama3 --pte-path llama3.pte  --prompt "Hello my name is"
 ```
 
-## Experimental TorchAO lowbit kernels
-
-### Use
-The quantization scheme a8wxdq dynamically quantizes activations to 8 bits, and quantizes the weights in a groupwise manner with a specified bitwidth and groupsize.
-It takes arguments bitwidth (2, 3, 4, 5, 6, 7), groupsize, and has_weight_zeros (true, false).
-The argument has_weight_zeros indicates whether the weights are quantized with scales only (has_weight_zeros: false) or with both scales and zeros (has_weight_zeros: true).
-Roughly speaking, {bitwidth: 4, groupsize: 256, has_weight_zeros: false} is similar to GGML's Q40 quantization scheme.
-
-You should expect high performance on ARM CPU if bitwidth is 2, 3, 4, or 5 and groupsize is divisible by 16.  With other platforms and argument choices, a slow fallback kernel will be used.  You will see warnings about this during quantization.
-
-### Setup
-To use a8wxdq, you must set up the torchao experimental kernels.  These will only work on devices with ARM CPUs, for example on Mac computers with Apple Silicon.
-
-From the torchchat root directory, run
-```
-sh torchchat/utils/scripts/build_torchao_experimental.sh
-```
-
-This should take about 10 seconds to complete.  Once finished, you can use a8wxdq in torchchat.
-
-Note: if you want to use the new kernels in the AOTI and C++ runners, you must pass the flag link_torchao when running the scripts the build the runners.
-
-```
-sh torchchat/utils/scripts/build_native.sh aoti link_torchao
-```
-
-```
-sh torchchat/utils/scripts/build_native.sh et link_torchao
-```
-
-### Examples
-
-#### Eager mode
-```
-python3 torchchat.py generate llama3 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}'
-```
-
-#### torch.compile
-```
-python3 torchchat.py generate llama3 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --compile
-```
-
-As with PyTorch in general, you can experiment with performance on a difference number of threads by defining OMP_NUM_THREADS.  For example,
-
-```
-OMP_NUM_THREADS=6 python3 torchchat.py generate llama3 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --compile
-```
-
-#### AOTI
-```
-python torchchat.py export llama3 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --output-dso llama3.so
-python3 torchchat.py generate llama3 --dso-path llama3_1.so --prompt "Hello my name is"
-```
-
-#### ExecuTorch
-```
-python torchchat.py export llama3 --device cpu --dtype float32 --quantize '{"linear:a8wxdq": {"bitwidth": 4, "groupsize": 256, "has_weight_zeros": false}}' --output-pte llama3.pte
-```
-
-Note: only the ExecuTorch C++ runner in torchchat when built using the instructions in the setup can run the exported *.pte file.
 
 ## Quantization Profiles
 
diff --git a/install/.pins/torchao-experimental-pin.txt b/install/.pins/torchao-experimental-pin.txt
diff --git a/runner/aoti.cmake b/runner/aoti.cmake
@@ -28,7 +28,3 @@ if(Torch_FOUND)
     target_link_libraries(aoti_run "${TORCH_LIBRARIES}" m)
     set_property(TARGET aoti_run PROPERTY CXX_STANDARD 17)
 endif()
-
-if (LINK_TORCHAO_CUSTOM_OPS)
-    target_link_libraries(aoti_run "${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/liblinear_a8wxdq_ATEN${CMAKE_SHARED_LIBRARY_SUFFIX}")
-endif()
diff --git a/runner/et.cmake b/runner/et.cmake
@@ -112,30 +112,6 @@ if(executorch_FOUND)
     target_link_libraries(et_run PRIVATE log)
   endif()
 
-  if(LINK_TORCHAO_CUSTOM_OPS)
-    # target_link_libraries(et_run PRIVATE "${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/liblinear_a8wxdq_EXECUTORCH${CMAKE_SHARED_LIBRARY_SUFFIX}")
-    target_link_libraries(et_run PRIVATE "$<LINK_LIBRARY:WHOLE_ARCHIVE,${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/liblinear_a8wxdq_EXECUTORCH.a>")
-    target_link_libraries(et_run PRIVATE
-      "${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/libtorchao_ops_linear_EXECUTORCH.a"
-      "${TORCHCHAT_ROOT}/torchao-build/cmake-out/lib/libtorchao_kernels_aarch64.a"
-    )
-  endif()
-
-  # Adding target_link_options_shared_lib as commented out below leads to this:
-  #
-  # CMake Error at Utils.cmake:22 (target_link_options):
-  #   Cannot specify link options for target
-  #   "/Users/scroy/etorch/torchchat/et-build/src/executorch/${CMAKE_OUT_DIR}/examples/models/llama2/custom_ops/libcustom_ops_lib.a"
-  #   which is not built by this project.
-  # Call Stack (most recent call first):
-  #   Utils.cmake:30 (macos_kernel_link_options)
-  #   CMakeLists.txt:41 (target_link_options_shared_lib)
-  #
-  #target_link_options_shared_lib("${TORCHCHAT_ROOT}/et-build/src/executorch/${CMAKE_OUT_DIR}/examples/models/llama2/custom_ops/libcustom_ops_lib.a") # This one does not get installed by ExecuTorch
-
-  # This works on mac, but appears to run into issues on linux
-  # It is needed to solve:
-  # E 00:00:00.055965 executorch:method.cpp:536] Missing operator: [8] llama::sdpa_with_kv_cache.out
 else()
   MESSAGE(WARNING "ExecuTorch package not found")
 endif()
diff --git a/torchchat/utils/quantize.py b/torchchat/utils/quantize.py
@@ -96,19 +96,10 @@ def quantize_model(
                 precision = get_precision()
 
             try:
-                if quantizer == "linear:a8wxdq":
-                    quant_handler = ao_quantizer_class_dict[quantizer](
-                        device=device,
-                        precision=precision,
-                        bitwidth=q_kwargs.get("bitwidth", 4),
-                        groupsize=q_kwargs.get("groupsize", 128),
-                        has_weight_zeros=q_kwargs.get("has_weight_zeros", False),
-                    )
-                else:
-                    # Easier to ask forgiveness than permission
-                    quant_handler = ao_quantizer_class_dict[quantizer](
-                        groupsize=q_kwargs["groupsize"], device=device, precision=precision
-                    )
+                # Easier to ask forgiveness than permission
+                quant_handler = ao_quantizer_class_dict[quantizer](
+                    groupsize=q_kwargs["groupsize"], device=device, precision=precision
+                )
             except TypeError as e:
                 if "unexpected keyword argument 'device'" in str(e):
                     quant_handler = ao_quantizer_class_dict[quantizer](
@@ -870,33 +861,3 @@ def quantized_model(self) -> nn.Module:
     "linear:int4": Int4WeightOnlyQuantizer,
     "linear:a8w4dq": Int8DynActInt4WeightQuantizer,
 }
-
-try:
-    import importlib.util
-    import sys
-    import os
-    torchao_build_path = f"{os.getcwd()}/torchao-build"
-
-    # Try loading quantizer
-    torchao_experimental_quant_api_spec = importlib.util.spec_from_file_location(
-        "torchao_experimental_quant_api",
-        f"{torchao_build_path}/src/ao/torchao/experimental/quant_api.py",
-    )
-    torchao_experimental_quant_api = importlib.util.module_from_spec(torchao_experimental_quant_api_spec)
-    sys.modules["torchao_experimental_quant_api"] = torchao_experimental_quant_api
-    torchao_experimental_quant_api_spec.loader.exec_module(torchao_experimental_quant_api)
-    from torchao_experimental_quant_api import Int8DynActIntxWeightQuantizer
-    ao_quantizer_class_dict["linear:a8wxdq"] = Int8DynActIntxWeightQuantizer
-
-    # Try loading custom op
-    try:
-        import glob
-        libs = glob.glob(f"{torchao_build_path}/cmake-out/lib/liblinear_a8wxdq_ATEN.*")
-        libs = list(filter(lambda l: (l.endswith("so") or l.endswith("dylib")), libs))
-        torch.ops.load_library(libs[0])
-    except Exception as e:
-        print("Failed to torchao custom op library with error: ", e)
-        print("Slow fallback kernels will be used.")
-
-except Exception as e:
-    print(f"Failed to load torchao experimental a8wxdq quantizer with error: {e}")
diff --git a/torchchat/utils/scripts/build_native.sh b/torchchat/utils/scripts/build_native.sh
@@ -26,7 +26,6 @@ if [ $# -eq 0 ]; then
     exit 1
 fi
 
-LINK_TORCHAO=OFF
 while (( "$#" )); do
   case "$1" in
     -h|--help)
@@ -43,11 +42,6 @@ while (( "$#" )); do
       TARGET="et"
       shift
       ;;
-    link_torchao)
-      echo "Linking with torchao custom ops..."
-      LINK_TORCHAO=ON
-      shift
-      ;;
     *)
       echo "Invalid option: $1"
       show_help
@@ -72,26 +66,14 @@ if [[ "$TARGET" == "et" ]]; then
     echo "Make sure you run install_executorch_libs"
     exit 1
   fi
-
-  if [[ "$LINK_TORCHAO" == "ON" ]]; then
-    if [ ! -d "${TORCHCHAT_ROOT}/torchao-build" ]; then
-      echo "Directory ${TORCHCHAT_ROOT}/torchao-build does not exist."
-      echo "Make sure you run clone_torchao"
-      exit 1
-    fi
-    find_cmake_prefix_path
-    EXECUTORCH_INCLUDE_DIRS="${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/include;${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/src"
-    EXECUTORCH_LIBRARIES="${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/lib/libexecutorch_no_prim_ops.a;${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/lib/libextension_threadpool.a;${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/lib/libcpuinfo.a;${TORCHCHAT_ROOT}/${ET_BUILD_DIR}/install/lib/libpthreadpool.a"
-    install_torchao_custom_executorch_ops
-  fi
 fi
 popd
 
 # CMake commands
 if [[ "$TARGET" == "et" ]]; then
-    cmake -S . -B ./cmake-out -DCMAKE_PREFIX_PATH=`python3 -c 'import torch;print(torch.utils.cmake_prefix_path)'` -DLINK_TORCHAO_CUSTOM_OPS="${LINK_TORCHAO}" -DET_USE_ADAPTIVE_THREADS=ON -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=1" -G Ninja
+    cmake -S . -B ./cmake-out -DCMAKE_PREFIX_PATH=`python3 -c 'import torch;print(torch.utils.cmake_prefix_path)'` -DET_USE_ADAPTIVE_THREADS=ON -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=1" -G Ninja
 else
-    cmake -S . -B ./cmake-out -DCMAKE_PREFIX_PATH=`python3 -c 'import torch;print(torch.utils.cmake_prefix_path)'` -DLINK_TORCHAO_CUSTOM_OPS="${LINK_TORCHAO}" -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" -G Ninja
+    cmake -S . -B ./cmake-out -DCMAKE_PREFIX_PATH=`python3 -c 'import torch;print(torch.utils.cmake_prefix_path)'` -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" -G Ninja
 fi
 cmake --build ./cmake-out --target "${TARGET}"_run
 
diff --git a/torchchat/utils/scripts/build_torchao_experimental.sh b/torchchat/utils/scripts/build_torchao_experimental.sh
diff --git a/torchchat/utils/scripts/install_utils.sh b/torchchat/utils/scripts/install_utils.sh
@@ -159,52 +159,3 @@ install_executorch_libs() {
   install_executorch_cpp_libs
   install_executorch_python_libs $1
 }
-
-clone_torchao() {
-  echo "Cloning torchao to ${TORCHCHAT_ROOT}/torchao-build/src"
-  rm -rf ${TORCHCHAT_ROOT}/torchao-build/src
-  mkdir -p ${TORCHCHAT_ROOT}/torchao-build/src
-  pushd ${TORCHCHAT_ROOT}/torchao-build/src
-  echo $pwd
-
-  cp -R ${HOME}/fbsource/fbcode/pytorch/ao .
-  # git clone https://github.com/pytorch/ao.git
-  # cd ao
-  # git checkout $(cat ${TORCHCHAT_ROOT}/intstall/.pins/torchao-experimental-pin.txt)
-
-  popd
-}
-
-install_torchao_custom_aten_ops() {
-  echo "Building torchao custom ops for ATen"
-  pushd ${TORCHCHAT_ROOT}/torchao-build/src/ao/torchao/experimental
-
-  CMAKE_OUT_DIR=${TORCHCHAT_ROOT}/torchao-build/cmake-out
-  cmake -DCMAKE_PREFIX_PATH=${MY_CMAKE_PREFIX_PATH} \
-    -DCMAKE_INSTALL_PREFIX=${CMAKE_OUT_DIR} \
-    -DCMAKE_BUILD_TYPE="Release" \
-    -DTORCHAO_OP_TARGET="ATEN" \
-    -S . \
-    -B ${CMAKE_OUT_DIR} -G Ninja
-  cmake --build  ${CMAKE_OUT_DIR} --target install --config Release
-
-  popd
-}
-
-install_torchao_custom_executorch_ops() {
-  echo "Building torchao custom ops for ExecuTorch"
-  pushd ${TORCHCHAT_ROOT}/torchao-build/src/ao/torchao/experimental
-
-  CMAKE_OUT_DIR="${TORCHCHAT_ROOT}/torchao-build/cmake-out"
-  cmake -DCMAKE_PREFIX_PATH=${MY_CMAKE_PREFIX_PATH} \
-    -DCMAKE_INSTALL_PREFIX=${CMAKE_OUT_DIR} \
-    -DCMAKE_BUILD_TYPE="Release" \
-    -DTORCHAO_OP_TARGET="EXECUTORCH" \
-    -DEXECUTORCH_INCLUDE_DIRS="${EXECUTORCH_INCLUDE_DIRS}" \
-    -DEXECUTORCH_LIBRARIES="${EXECUTORCH_LIBRARIES}" \
-    -S . \
-    -B ${CMAKE_OUT_DIR} -G Ninja
-  cmake --build  ${CMAKE_OUT_DIR} --target install --config Release
-
-  popd
-}