Commit 853be03
updates
1 parent fbf5b6e commit 853be03

4 files changed: +66 -33 lines changed

CMakeLists.txt

Lines changed: 0 additions & 8 deletions
@@ -833,13 +833,5 @@ if(EXECUTORCH_BUILD_VULKAN)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/vulkan)
 endif()
 
-# if(EXECUTORCH_BUILD_TORCHAO)
-# add_compile_options("-frtti")
-# set(EXECUTORCH_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/..)
-# set(EXECUTORCH_LIBRARIES executorch extension_threadpool) # cpuinfo pthreadpool)
-# set(TORCHAO_BUILD_EXECUTORCH_OPS ON)
-# add_subdirectory(third-party/ao/torchao/experimental)
-# endif()
-
 # Print all summary
 executorch_print_configuration_summary()

examples/models/llama2/CMakeLists.txt

Lines changed: 3 additions & 11 deletions
@@ -37,6 +37,8 @@ cmake_dependent_option(
   "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF
 )
 
+option(EXECUTORCH_BUILD_TORCHAO "Build the torchao kernels" OFF)
+
 if(NOT PYTHON_EXECUTABLE)
   set(PYTHON_EXECUTABLE python3)
 endif()
@@ -122,17 +124,7 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
 endif()
 
 if(EXECUTORCH_BUILD_TORCHAO)
-  # Method1: torchao has a config
-  # set(torchao_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../lib/cmake/torchao)
-  # find_package(torchao REQUIRED)
-  # target_link_options_shared_lib(torchao::torchao_ops_executorch)
-  # list(APPEND link_libraries torchao::torchao_ops_executorch)
-
-  # Method2: torchao is built at top-level CMakeLists.txt
-  # list(APPEND link_libraries "$<LINK_LIBRARY:WHOLE_ARCHIVE,${CMAKE_CURRENT_BINARY_DIR}/../../../lib/libtorchao_ops_executorch.a>")
-  # list(APPEND link_libraries "${CMAKE_CURRENT_BINARY_DIR}/../../../lib/libtorchao_kernels_aarch64.a")
-
-  # Method3: submodule
+  set(TORCHAO_BUILD_EXECUTORCH_OPS ON)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental)
   target_link_options_shared_lib(torchao_ops_executorch)
   list(APPEND link_libraries torchao_ops_executorch)

examples/models/llama2/export_llama_lib.py

Lines changed: 32 additions & 3 deletions
@@ -12,14 +12,14 @@
 import copy
 import json
 import logging
+import re
 import shlex
 from enum import Enum
 from json import JSONDecodeError
 from pathlib import Path
 from typing import Callable, List, Optional, Union
 
 import pkg_resources
-
 import torch
 
 from executorch.devtools.etrecord import generate_etrecord
@@ -152,12 +152,41 @@ def build_args_parser() -> argparse.ArgumentParser:
         ],
         help="Use PT2E quantization. Comma separated options. e.g. xnnpack_dynamic (for per channel 8 bit weight), xnnpack_dynamic_qc4 (for per channel 4 bit weight), embedding.",
     )
+
+    def _is_valid_torchao_qmode_type(value):
+        if not value.startswith("torchao:"):
+            return False
+
+        linear_pattern = r"lin.8da(\d+)b(\d+)gw"
+        linear_matches = re.findall(linear_pattern, value)
+        print("LINEAR MATCHES", linear_matches)
+
+        if len(linear_matches) > 1:
+            return False
+
+        embedding_pattern = r"emb.(\d+)b(\d+)gw"
+        embedding_matches = re.findall(embedding_pattern, value)
+        print("EMBEDDING MATCHES", embedding_matches)
+        if len(embedding_matches) > 1:
+            return False
+        if len(linear_matches) + len(embedding_matches) == 0:
+            return False
+        return True
+
+    def _qmode_type(value):
+        choices = ["int8", "8da4w", "8da4w-gptq"]
+        if not (value in choices or _is_valid_torchao_qmode_type(value)):
+            raise argparse.ArgumentTypeError(
+                f"Value must be one of: {choices} or a valid torchao regex"
+            )
+        return value
+
     parser.add_argument(
         "-qmode",
         "--quantization_mode",
-        type=str,
+        type=_qmode_type,
         default=None,
-        choices=["int8", "8da4w", "8da4w-gptq"],
+        # choices=["int8", "8da4w", "8da4w-gptq"] + [f"torchao:8da{x}w" for x in range(1, 9)],
         help="type of quantization",
     )
 
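For context, a minimal sketch of which strings the new _qmode_type validator accepts. The helper below is a condensed restatement of the committed logic, and the qmode values are hypothetical examples constructed to match the lin.8da(\d+)b(\d+)gw and emb.(\d+)b(\d+)gw patterns above, not strings taken from the commit:

import re

def is_valid_torchao_qmode(value):
    # Condensed restatement of _is_valid_torchao_qmode_type above:
    # require the "torchao:" prefix and at most one linear spec and
    # at most one embedding spec, with at least one of the two present.
    if not value.startswith("torchao:"):
        return False
    linear_matches = re.findall(r"lin.8da(\d+)b(\d+)gw", value)
    embedding_matches = re.findall(r"emb.(\d+)b(\d+)gw", value)
    if len(linear_matches) > 1 or len(embedding_matches) > 1:
        return False
    return len(linear_matches) + len(embedding_matches) > 0

# Hypothetical qmode strings:
assert is_valid_torchao_qmode("torchao:lin.8da4b32gw")  # 4-bit linear weights, group size 32
assert is_valid_torchao_qmode("torchao:emb.4b32gw")     # 4-bit embedding weights, group size 32
assert not is_valid_torchao_qmode("8da4w")              # missing the "torchao:" prefix
assert not is_valid_torchao_qmode("torchao:unknown")    # no recognized pattern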

examples/models/llama2/source_transformation/quantize.py

Lines changed: 31 additions & 11 deletions
@@ -4,6 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import logging
+import re
 from functools import partial
 from pathlib import Path
 from typing import Any, Dict, Optional
@@ -31,7 +33,7 @@
 fsLinear = nn.Linear
 
 
-def quantize(
+def quantize(  # noqa: C901
     model: torch.nn.Module,
     qmode: str,
     activation_dtype: Optional[DType],
@@ -68,25 +70,43 @@ def quantize(
     if qmode == "int8":
         # Add quantization mode options here: group size, bit width, etc.
         return WeightOnlyInt8QuantHandler(model).quantized_model()
-    elif qmode.startswith("torchao"):
-        # format is torchao:8daxw
-        bitwidth = int(qmode[len("torchao:8da")])
-        if group_size is None:
-            raise Exception(f"For {qmode} quantization, group size must be specified.")
-        from torchao.experimental.quant_api import Int8DynActIntxWeightQuantizer
-        model = Int8DynActIntxWeightQuantizer(
-            device="cpu",
-            precision=torch_dtype, groupsize=group_size, bitwidth=bitwidth, has_weight_zeros=False).quantize(model)
+    elif qmode.startswith("torchao:"):
+        logging.warning(
+            "When qmode is torchao, the groupsize is obtained from the qmode string with regex parse; blocksize is ignored."
+        )
+        linear_pattern = r"lin.8da(\d+)b(\d+)gw"
+        linear_matches = re.findall(linear_pattern, qmode)
+        if linear_matches:
+            bitwidth = int(linear_matches[0][0])
+            group_size = int(linear_matches[0][1])
+            from torchao.experimental.quant_api import Int8DynActIntxWeightQuantizer
+
+            model = Int8DynActIntxWeightQuantizer(
+                device="cpu",
+                precision=torch_dtype,
+                groupsize=group_size,
+                bitwidth=bitwidth,
+                has_weight_zeros=False,
+            ).quantize(model)
+
+        embedding_pattern = r"emb.(\d+)b(\d+)gw"
+        embedding_matches = re.findall(embedding_pattern, qmode)
+        if embedding_matches:
+            pass  # TODO: add when embedding PR lands in torchao
+
         if verbose:
             print("quantized model:", model)
+
         return model
     elif qmode == "8da4w":
         # Check for required args
         if group_size is None:
             raise Exception("For 8da4w quantization, group size must be specified.")
         from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer
 
-        model = Int8DynActInt4WeightQuantizer(precision=torch_dtype, groupsize=group_size, bitwidth=4).quantize(model)
+        model = Int8DynActInt4WeightQuantizer(
+            precision=torch_dtype, groupsize=group_size, bitwidth=4
+        ).quantize(model)
 
         if verbose:
             print("quantized model:", model)
