This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit bdac616

update experimental kernels in torchchat
1 parent 5684175 commit bdac616

4 files changed: 116 additions & 30 deletions


.github/workflows/pull.yml

Lines changed: 48 additions & 7 deletions
@@ -1055,7 +1055,54 @@ jobs:
         ./runner/build_android.sh
         echo "Tests complete."
 
-  test-torchao-experimental:
+  test-torchao-experimental-python:
+    strategy:
+      matrix:
+        runner: [macos-14-xlarge]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.10.11
+      - name: Setup Xcode
+        if: runner.os == 'macOS'
+        uses: maxim-lobanov/setup-xcode@v1
+        with:
+          xcode-version: '15.3'
+      - name: Print machine info
+        run: |
+          uname -a
+          if [ $(uname -s) == Darwin ]; then
+            sysctl machdep.cpu.brand_string
+            sysctl machdep.cpu.core_count
+          fi
+      - name: Install torchchat
+        run: |
+          echo "Installing pip3 packages"
+          ./install/install_requirements.sh
+          pip3 list
+          python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
+      - name: Run inference
+        run: |
+          python torchchat.py download stories110M
+          wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
+          export PRMT="Once upon a time in a land far away"
+          echo "Generate eager"
+          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
+          echo "Generate compile"
+          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --compile
+          echo "Export AOTI"
+          python torchchat.py export stories110M --output-aoti-package-path ./model.pt2 --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
+          echo "Generate AOTI"
+          python torchchat.py generate stories110M --aoti-package-path ./model.pt2 --prompt "${PRMT}"
+          echo "Tests complete."
+
+  test-torchao-experimental-cpp:
     strategy:
       matrix:
         runner: [macos-14-xlarge]
@@ -1109,18 +1156,12 @@ jobs:
           python torchchat.py download stories110M
           wget -O ./tokenizer.model https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
           export PRMT="Once upon a time in a land far away"
-          echo "Generate eager"
-          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
-          echo "Generate compile"
-          python torchchat.py generate stories110M --temperature 0 --prompt "${PRMT}" --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --compile
           echo "Export and run ET (C++ runner)"
           python torchchat.py export stories110M --output-pte-path ./model.pte --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
           ./cmake-out/et_run ./model.pte -z ./tokenizer.model -t 0 -i "${PRMT}"
           echo "Export and run AOTI (C++ runner)"
           python torchchat.py export stories110M --output-aoti-package-path ./model.pt2 --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}'
           ./cmake-out/aoti_run ./model.pt2 -z ./tokenizer.model -t 0 -i "${PRMT}"
-          echo "Generate AOTI"
-          python torchchat.py generate stories110M --aoti-package-path ./model.pt2 --prompt "${PRMT}"
           echo "Tests complete."
 
   test-torchao-experimental-mps:

docs/quantization.md

Lines changed: 10 additions & 6 deletions
@@ -120,13 +120,15 @@ python3 torchchat.py generate llama3 --pte-path llama3.pte --prompt "Hello my n
 
 ## Experimental TorchAO lowbit kernels
 
-WARNING: These kernels only work on devices with ARM CPUs, for example on Mac computers with Apple Silicon.
+If you are on a Mac with Apple Silicon, we have 1- to 8-bit quantization available for embedding and linear layers, backed by CPU and MPS kernels.
+
+The CPU kernels are installed automatically by the torchchat install script and can be used out of the box. To use the MPS kernels, follow the setup instructions below.
 
 ### Use
 
 #### linear:a8wxdq
 The quantization scheme linear:a8wxdq dynamically quantizes activations to 8 bits, and quantizes the weights in a groupwise manner with a specified bitwidth and groupsize.
-It takes arguments bitwidth (1, 2, 3, 4, 5, 6, 7), groupsize, and has_weight_zeros (true, false).
+It takes arguments bitwidth (1, 2, 3, 4, 5, 6, 7, 8), groupsize (-1 if channelwise desired), and has_weight_zeros (true, false).
 The argument has_weight_zeros indicates whether the weights are quantized with scales only (has_weight_zeros: false) or with both scales and zeros (has_weight_zeros: true).
 Roughly speaking, {bitwidth: 4, groupsize: 32, has_weight_zeros: false} is similar to GGML's Q4_0 quantization scheme.
 
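The linear:a8wxdq arguments above map directly onto torchao's quantize_ API. Below is a minimal sketch, assuming the experimental imports introduced by this commit (see the quantize.py diff later in this page) are available; the signatures are experimental and illustrative rather than a stable public API:

```
import torch
from torchao.quantization.quant_api import quantize_
from torchao.quantization.granularity import PerGroup, PerRow
from torchao.experimental.quant_api import int8_dynamic_activation_intx_weight
from torchao.experimental.packed_linear_int8_dynamic_activation_intx_weight_layout import (
    PackedLinearInt8DynamicActivationIntxWeightLayout,
)

# The same arguments the --quantize JSON exposes.
bitwidth, groupsize, has_weight_zeros = 3, 128, False

# groupsize == -1 selects channelwise (per-row) scales; otherwise groupwise.
granularity = PerRow() if groupsize == -1 else PerGroup(groupsize)
weight_dtype = getattr(torch, f"int{bitwidth}")  # torch.int3 for bitwidth 3

# Toy float32 model standing in for a transformer's linear layers.
model = torch.nn.Sequential(torch.nn.Linear(256, 256, dtype=torch.float32))

quantize_(
    model,
    int8_dynamic_activation_intx_weight(
        weight_dtype=weight_dtype,
        granularity=granularity,
        has_weight_zeros=has_weight_zeros,
        layout=PackedLinearInt8DynamicActivationIntxWeightLayout(),
    ),
)
```

When the packed layout is unsupported on the host, the quantize.py change in this commit falls back to PlainLayout with the same arguments.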
@@ -138,7 +140,9 @@ The quantization scheme embedding:wx quantizes embeddings in a groupwise manner
 You should expect high performance on ARM CPU if groupsize is divisible by 32. With other platforms and argument choices, a slow fallback kernel will be used. You will see warnings about this during quantization.
 
 ### Setup
-To use linear:a8wxdq and embedding:wx, you must set up the torchao experimental kernels. These will only work on devices with ARM CPUs, for example on Mac computers with Apple Silicon.
+If you are using the torchao ops from Python, they are available out of the box on a Mac with Apple Silicon, and you can skip these setup steps.
+
+If you plan to use the kernels from the AOTI/ExecuTorch C++ runners, follow the setup steps below.
 
 From the torchchat root directory, run
 ```
@@ -147,7 +151,7 @@ bash torchchat/utils/scripts/build_torchao_ops.sh
 ```
 This should take about 10 seconds to complete.
 
-Note: if you want to use the new kernels in the AOTI and C++ runners, you must pass the flag link_torchao_ops when running the scripts the build the runners.
+When building the AOTI and C++ runners, you must pass the flag link_torchao_ops to the scripts that build the runners.
 
 ```
 bash torchchat/utils/scripts/build_native.sh aoti link_torchao_ops
@@ -175,8 +179,8 @@ OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --device cpu --dtype fl
 
 #### AOTI
 ```
-OMP_NUM_THREADS=6 python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --output-dso llama3_1.so
-OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --dso-path llama3_1.so --prompt "Once upon a time," --num-samples 5
+OMP_NUM_THREADS=6 python torchchat.py export llama3.1 --device cpu --dtype float32 --quantize '{"embedding:wx": {"bitwidth": 2, "groupsize": 32}, "linear:a8wxdq": {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}}' --output-aoti-package-path llama3_1.pt2
+OMP_NUM_THREADS=6 python3 torchchat.py generate llama3.1 --aoti-package-path llama3_1.pt2 --prompt "Once upon a time," --num-samples 5
 ```
 
 If you built the AOTI runner with link_torchao_ops as discussed in the setup section, you can also use the C++ runner:

install/install_requirements.sh

Lines changed: 3 additions & 1 deletion
@@ -117,9 +117,11 @@ fi
 
 # For torchao need to install from github since nightly build doesn't have macos build.
 # TODO: Remove this and install nightly build, once it supports macos
+# USE_CPP=1 indicates that the torchao experimental aten kernels will be built and loaded
+# if on Mac with Apple Silicon
 (
   set -x
-  $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@2f97b0955953fa1a46594a27f0df2bc48d93e79d
+  USE_CPP=1 $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@11333ba2cb5c4e792bc4f5c0d70c12991f972008
 )
 
 if [[ -x "$(command -v nvidia-smi)" ]]; then

torchchat/utils/quantize.py

Lines changed: 55 additions & 16 deletions
@@ -50,6 +50,18 @@
     state_dict_device,
     use_et_backend,
 )
+from torchao.experimental.packed_linear_int8_dynamic_activation_intx_weight_layout import (
+    PackedLinearInt8DynamicActivationIntxWeightLayout,
+)
+from torchao.experimental.quant_api import (
+    int8_dynamic_activation_intx_weight,
+    IntxWeightEmbeddingQuantizer,
+)
+from torchao.quantization.granularity import (
+    PerGroup,
+    PerRow,
+)
+from torchao.dtypes import PlainLayout
 
 
 # Flag for whether the a8wxdq quantizer is available.
@@ -117,7 +129,47 @@ def quantize_model(
             unwrap_tensor_subclass(model)
             continue
 
-        if quantizer in ["linear:a8wxdq", "embedding:wx"]:
+        if quantizer == "linear:a8wxdq":
+            if get_precision() != torch.float32:
+                print(f"Quantizer {quantizer} requires float32 inputs, but received {get_precision()}. Changing dtype to float32. Note that after quantization, the weights will be lowbit integers, not float32.")
+                set_precision(torch.float32)
+
+            group_size = q_kwargs["groupsize"]
+            bit_width = q_kwargs["bitwidth"]
+            has_weight_zeros = q_kwargs["has_weight_zeros"]
+            granularity = PerRow()
+            if group_size != -1:
+                granularity = PerGroup(group_size)
+            weight_dtype = getattr(torch, f"int{bit_width}")
+
+            try:
+                quantize_(
+                    model,
+                    int8_dynamic_activation_intx_weight(
+                        weight_dtype=weight_dtype,
+                        granularity=granularity,
+                        has_weight_zeros=has_weight_zeros,
+                        layout=PackedLinearInt8DynamicActivationIntxWeightLayout(),
+                    ),
+                )
+            except Exception as e:
+                print(f"Encountered error during quantization: {e}")
+                print("Trying with PlainLayout")
+                quantize_(
+                    model,
+                    int8_dynamic_activation_intx_weight(
+                        weight_dtype=weight_dtype,
+                        granularity=granularity,
+                        has_weight_zeros=has_weight_zeros,
+                        layout=PlainLayout(),
+                    ),
+                )
+
+            if not support_tensor_subclass:
+                unwrap_tensor_subclass(model)
+            continue
+
+        if quantizer == "embedding:wx":
             # These quantizers require float32 input weights. Note that after quantization,
             # the weights will no longer be float32, but lowbit integers
             if get_precision() != torch.float32:
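A worked instance of the argument mapping in the branch above, using the config from CI; the printed reprs are what torchao's granularity dataclasses are assumed to produce:

```
import torch
from torchao.quantization.granularity import PerGroup, PerRow

# CI config: {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": false}
q_kwargs = {"bitwidth": 3, "groupsize": 128, "has_weight_zeros": False}

# groupsize of -1 would select channelwise (PerRow) granularity.
granularity = (
    PerRow() if q_kwargs["groupsize"] == -1 else PerGroup(q_kwargs["groupsize"])
)
weight_dtype = getattr(torch, f"int{q_kwargs['bitwidth']}")

print(granularity)   # e.g. PerGroup(group_size=128)
print(weight_dtype)  # torch.int3
```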
@@ -889,10 +941,12 @@ def quantized_model(self) -> nn.Module:
 # class references
 quantizer_class_dict = {
     "embedding": EmbeddingOnlyQuantHandler,
+    "embedding:wx": IntxWeightEmbeddingQuantizer,
     "linear:int8": WeightOnlyInt8QuantHandler,
     "precision": PrecisionHandler,
     "executor": ExecutorHandler,
     "linear:int4": Int4WeightOnlyQuantizer,
+    "linear:a8wxdq": None,  # uses quantize_ API
     "linear:a8w4dq": Int8DynActInt4WeightQuantizer,
 }
 
@@ -916,26 +970,11 @@ def quantized_model(self) -> nn.Module:
         torchao_experimental_quant_api
     )
     from torchao_experimental_quant_api import (
-        Int8DynActIntxWeightLinearQuantizer,
-        IntxWeightEmbeddingQuantizer,
         UIntxWeightOnlyLinearQuantizer,
     )
-
-    quantizer_class_dict["linear:a8wxdq"] = Int8DynActIntxWeightLinearQuantizer
-    quantizer_class_dict["embedding:wx"] = IntxWeightEmbeddingQuantizer
     quantizer_class_dict["linear:afpwx"] = UIntxWeightOnlyLinearQuantizer
 
     # Try loading custom op
-    try:
-        import glob
-
-        libs = glob.glob(f"{torchao_build_path}/cmake-out/lib/libtorchao_ops_aten.*")
-        libs = list(filter(lambda l: (l.endswith("so") or l.endswith("dylib")), libs))
-        torch.ops.load_library(libs[0])
-        print("Loaded torchao cpu ops.")
-    except Exception as e:
-        print("Unable to load torchao cpu ops library. Slow fallback kernels will be used.")
-
     try:
         libname = "libtorchao_ops_mps_aten.dylib"
         libpath = f"{torchao_build_path}/cmake-out/lib/{libname}"
