Add torchao mps lowbit ops to llama runner

manuelcandales · manuelcandales · commit e1b15640c79b · 2024-12-05T14:01:31.000-05:00
diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt
@@ -38,6 +38,7 @@ cmake_dependent_option(
 )
 
 option(EXECUTORCH_BUILD_TORCHAO "Build the torchao kernels" OFF)
+option(EXECUTORCH_BUILD_TORCHAO_MPS "Build the torchao mps kernels" OFF)
 
 if(NOT PYTHON_EXECUTABLE)
   set(PYTHON_EXECUTABLE python3)
@@ -130,6 +131,20 @@ if(EXECUTORCH_BUILD_TORCHAO)
   list(APPEND link_libraries torchao_ops_executorch)
 endif()
 
+# When EXECUTORCH_BUILD_TORCHAO_MPS is ON, build the torchao MPS kernels from
+# the third-party/ao tree and append their target to `link_libraries`.
+if(EXECUTORCH_BUILD_TORCHAO_MPS)
+  set(TORCHAO_BUILD_EXECUTORCH_OPS ON)
+  # The source dir lies outside this directory's tree, so add_subdirectory
+  # needs an explicit binary dir. Both paths are quoted so that each one is
+  # passed as a single argument even if it contains a list separator.
+  add_subdirectory(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/ao/torchao/experimental/ops/mps"
+    "${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/ao/torchao/experimental/ops/mps"
+  )
+  target_link_options_shared_lib(torchao_ops_mps_executorch)
+  list(APPEND link_libraries torchao_ops_mps_executorch)
+endif()
+
 set(XNNPACK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack)
 # Extra compile option and include dir for pthreadpool
 if(EXECUTORCH_BUILD_PTHREADPOOL)
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
@@ -600,7 +600,7 @@ def get_quantizer_and_quant_params(args):
 
 def _qmode_type(value):
     choices = ["int8", "8da4w", "8da4w-gptq", "vulkan_4w"]
-    patterns = [r"torchao:8da(\d+)w"]
+    patterns = [r"torchao:8da(\d+)w", r"torchao:fpa(\d+)w"]
 
     if value in choices:
         return value
diff --git a/examples/models/llama/source_transformation/quantize.py b/examples/models/llama/source_transformation/quantize.py
@@ -72,12 +72,31 @@ def quantize(  # noqa C901
     if qmode == "int8":
         # Add quantization mode options here: group size, bit width, etc.
         return WeightOnlyInt8QuantHandler(model).quantized_model()
-    elif qmode.startswith("torchao:"):
+    elif qmode.startswith("torchao:fpa"):
+        pattern = r"torchao:fpa(\d+)w"
+        matches = re.findall(pattern, qmode)
+        assert len(matches) == 1, f"Expected 1 match for pattern but got {len(matches)}"
+        bitwidth = int(matches[0])  # whole captured digit run, not just its first char
+        _load_torchao_aten_lib(libname="libtorchao_ops_mps_linear_fp_act_xbit_weight_aten")
+        from torchao.experimental.quant_api import UIntxWeightOnlyLinearQuantizer
+
+        with torch.no_grad():
+            model = UIntxWeightOnlyLinearQuantizer(
+                device="mps",
+                precision=torch.float32,
+                groupsize=group_size,
+                bitwidth=bitwidth,
+            ).quantize(model)
+
+        if verbose:
+            print("quantized model:", model)
+        return model
+    elif qmode.startswith("torchao:8da"):
         pattern = r"torchao:8da(\d+)w"
         matches = re.findall(pattern, qmode)
         assert len(matches) == 1, f"Expected 1 match for pattern but got {len(matches)}"
         bitwidth = int(matches[0][0])
-        _load_torchao_ops_aten()
+        _load_torchao_aten_lib(libname="libtorchao_ops_aten")
         from torchao.experimental.quant_api import Int8DynActIntxWeightLinearQuantizer
 
         with torch.no_grad():
@@ -729,7 +748,7 @@ def get_quant_embedding_transform(args):
         bitwidth, group_size = args.embedding_quantize.split(":")[1].split(",")
         group_size = int(group_size)
         bitwidth = int(bitwidth)
-        _load_torchao_ops_aten()
+        _load_torchao_aten_lib(libname="libtorchao_ops_aten")
         from torchao.experimental.quant_api import IntxWeightEmbeddingQuantizer
 
         def _torchao_embedding_quantizer(model):
@@ -785,15 +804,15 @@ def get_quant_weight_transform(args, dtype_override, verbose):
     )
 
 
-def _load_torchao_ops_aten():
+def _load_torchao_aten_lib(libname):
     import glob
     import os
 
     libs = glob.glob(
         os.path.abspath(
             os.path.join(
                 os.environ.get("CMAKE_INSTALL_PREFIX", ""),
-                "lib/libtorchao_ops_aten.*",
+                f"lib/{libname}.*",
             )
         )
     )