@@ -73,69 +73,24 @@ def quantize(  # noqa C901
         # Add quantization mode options here: group size, bit width, etc.
         return WeightOnlyInt8QuantHandler(model).quantized_model()
     elif qmode.startswith("torchao:"):
-        import glob
-        import os
-
-        libs = glob.glob(
-            os.path.abspath(
-                os.path.join(
-                    os.environ.get("CMAKE_INSTALL_PREFIX", ""),
-                    "lib/libtorchao_ops_aten.*",
-                )
-            )
-        )
-        assert (
-            len(libs) == 1
-        ), f"Expected 1 library but got {len(libs)}. If you installed the torchao ops in a non-standard location, please set CMAKE_INSTALL_PREFIX correctly."
-        logging.info(f"Loading custom ops library: {libs[0]}")
-        torch.ops.load_library(libs[0])
-
-        logging.warning(
-            "When qmode is torchao, the groupsize is obtained from the qmode string with regex parse; blocksize is ignored."
-        )
-        embedding_pattern = r"emb.(\d+),(\d+)"
-        linear_pattern = r"lin8da.(\d+),(\d+)"
-
-        matches = re.findall(linear_pattern, qmode)
-        if matches:
-            assert (
-                len(matches) == 1
-            ), f"Expected 1 match for linear_pattern but got {len(matches)}"
-            bitwidth = int(matches[0][0])
-            groupsize = int(matches[0][1])
-            from torchao.experimental.quant_api import (
-                Int8DynActIntxWeightLinearQuantizer,
-            )
-
-            with torch.no_grad():
-                model = Int8DynActIntxWeightLinearQuantizer(
-                    device="cpu",
-                    precision=torch_dtype,
-                    groupsize=groupsize,
-                    bitwidth=bitwidth,
-                    has_weight_zeros=False,
-                ).quantize(model)
-
-        matches = re.findall(embedding_pattern, qmode)
-        if matches:
-            assert (
-                len(matches) == 1
-            ), f"Expected 1 match for embedding_pattern but got {len(matches)}"
-            bitwidth = int(matches[0][0])
-            groupsize = int(matches[0][1])
-            from torchao.experimental.quant_api import IntxWeightEmbeddingQuantizer
-
-            with torch.no_grad():
-                model = IntxWeightEmbeddingQuantizer(
-                    device="cpu",
-                    precision=torch_dtype,
-                    bitwidth=bitwidth,
-                    groupsize=groupsize,
-                ).quantize(model)
+        pattern = r"torchao:8da(\d+)w"
+        matches = re.findall(pattern, qmode)
+        assert len(matches) == 1, f"Expected 1 match for pattern but got {len(matches)}"
+        bitwidth = int(matches[0][0])
+        _load_torchao_ops_aten()
+        from torchao.experimental.quant_api import Int8DynActIntxWeightLinearQuantizer
+
+        with torch.no_grad():
+            model = Int8DynActIntxWeightLinearQuantizer(
+                device="cpu",
+                precision=torch.float32,
+                groupsize=group_size,
+                bitwidth=bitwidth,
+                has_weight_zeros=False,
+            ).quantize(model)

         if verbose:
             print("quantized model:", model)
-
         return model
     elif qmode == "8da4w":
         # Check for required args
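
For reference, a minimal sketch, outside the diff, of how the new torchao qmode string is parsed, assuming a hypothetical value "torchao:8da6w" (8-bit dynamically quantized activations, 6-bit weights):

import re

qmode = "torchao:8da6w"  # hypothetical qmode value
matches = re.findall(r"torchao:8da(\d+)w", qmode)
assert len(matches) == 1, f"Expected 1 match for pattern but got {len(matches)}"
# With a single capture group, re.findall returns plain strings, so the
# captured bitwidth is the string "6" here.
bitwidth = int(matches[0])
print(bitwidth)  # -> 6

Note that the hunk above indexes matches[0][0]; since findall returns plain strings for a single capture group, that picks the first character of the capture, which is equivalent only while bitwidths stay single-digit.
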
@@ -760,6 +715,25 @@ def forward(self, indices: torch.Tensor) -> torch.Tensor:


 def get_quant_embedding_transform(args):
+    if args.embedding_quantize.startswith("torchao:"):
+        bitwidth, group_size = args.embedding_quantize.split(":")[1].split(",")
+        group_size = int(group_size)
+        bitwidth = int(bitwidth)
+        _load_torchao_ops_aten()
+        from torchao.experimental.quant_api import IntxWeightEmbeddingQuantizer
+
+        def _torchao_embedding_quantizer(model):
+            with torch.no_grad():
+                model = IntxWeightEmbeddingQuantizer(
+                    device="cpu",
+                    precision=torch.float32,
+                    bitwidth=bitwidth,
+                    groupsize=group_size,
+                ).quantize(model)
+            return model
+
+        return _torchao_embedding_quantizer
+
     bitwidth, group_size = args.embedding_quantize.split(",")
     if group_size == "none" or group_size == "None" or group_size == "0":
         group_size = None
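
Similarly, a hedged sketch of the new torchao embedding spec parsing added above; "torchao:4,32" is a hypothetical --embedding_quantize value (4-bit weights, group size 32):

spec = "torchao:4,32"  # hypothetical argument value
bitwidth, group_size = spec.split(":")[1].split(",")
bitwidth, group_size = int(bitwidth), int(group_size)
print(bitwidth, group_size)  # -> 4 32
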
@@ -801,4 +775,23 @@ def get_quant_weight_transform(args, dtype_override, verbose):
     )


+def _load_torchao_ops_aten():
+    import glob
+    import os
+
+    libs = glob.glob(
+        os.path.abspath(
+            os.path.join(
+                os.environ.get("CMAKE_INSTALL_PREFIX", ""),
+                "lib/libtorchao_ops_aten.*",
+            )
+        )
+    )
+    assert (
+        len(libs) == 1
+    ), f"Expected 1 library but got {len(libs)}. If you installed the torchao ops in a non-standard location, please set CMAKE_INSTALL_PREFIX correctly."
+    logging.info(f"Loading custom ops library: {libs[0]}")
+    torch.ops.load_library(libs[0])
+
+
 ############################ Source Transform End #######################
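
The new _load_torchao_ops_aten helper resolves the custom ops library relative to CMAKE_INSTALL_PREFIX. A small sketch of the lookup it performs, assuming a hypothetical install prefix /opt/torchao:

import glob
import os

prefix = "/opt/torchao"  # hypothetical CMAKE_INSTALL_PREFIX
pattern = os.path.abspath(os.path.join(prefix, "lib/libtorchao_ops_aten.*"))
libs = glob.glob(pattern)
# Expect exactly one match, e.g. libtorchao_ops_aten.so on Linux or
# libtorchao_ops_aten.dylib on macOS; the helper asserts on this before
# handing the path to torch.ops.load_library.
print(libs)
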