
Commit a943958

jerryzh168 authored and facebook-github-bot committed
Add int8 per token dynamic activation quant and int4 weight quant for llama2 in executorch (pytorch#1904)
Summary:
Pull Request resolved: pytorch#1904

Representation we are getting now: https://www.internalfb.com/intern/everpaste/?handle=GEIHRRnpyYOEAIUBAFtHZapvTH5xbsIXAAAB

Reviewed By: kimishpatel

Differential Revision: D53211239

fbshipit-source-id: 255f87e44079877fa70afe65fa6f0c512f06d213
1 parent 5d4d0ca commit a943958

File tree: 4 files changed, +663 −70 lines changed

examples/models/llama2/export_llama_lib.py

Lines changed: 48 additions & 16 deletions
```diff
@@ -38,7 +38,12 @@
 from ...portable.utils import export_to_edge, save_pte_program
 from ..model_factory import EagerModelFactory
 from .model import ModelArgs
-from .quantize import EmbeddingOnlyInt8QuantHandler, WeightOnlyInt8QuantHandler
+from .quantize import (
+    EmbeddingOnlyInt8QuantHandler,
+    Int8DynActInt4WeightQuantHandler,
+    WeightOnlyInt8QuantHandler,
+)
+
 
 IS_FBCODE = True  # os.environ.get("FBCODE_PLATFORM", False)
 FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
@@ -104,19 +109,30 @@ def apply_pt2e_quantization(
     return m
 
 
-def quantize(model) -> torch.nn.Module:
+def quantize(model: torch.nn.Module, qmode: str) -> torch.nn.Module:
     """
     Quantizes a model by converting all weights to int8.
     Args:
         model: A model to quantize.
+        qmode: quantization mode, e.g. int8, int4
     Returns:
         A quantized model.
     """
-    model_int8 = WeightOnlyInt8QuantHandler(model)
-    model_int8_state_dict = model_int8.create_quantized_state_dict()
-    model_int8 = model_int8.convert_for_runtime()
-    model_int8.load_state_dict(model_int8_state_dict)
-    return model_int8
+    if qmode == "int8":
+        model_int8 = WeightOnlyInt8QuantHandler(model)
+        model_int8_state_dict = model_int8.create_quantized_state_dict()
+        model_int8 = model_int8.convert_for_runtime()
+        model_int8.load_state_dict(model_int8_state_dict)
+        return model_int8
+    elif qmode == "int4":
+        model_int4 = Int8DynActInt4WeightQuantHandler(model)
+        model_int4_state_dict = model_int4.create_quantized_state_dict()
+        model_int4 = model_int4.convert_for_runtime()
+        print("quantized model:", model_int4)
+        model_int4.load_state_dict(model_int4_state_dict)
+        return model_int4
+    else:
+        raise Exception(f"Unrecognized quantize mode: {qmode}")
 
 
 def build_model(
@@ -145,13 +161,20 @@ def build_args_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "-q", "--quantized_ckpt", default=None, help="quantized checkpoint file"
     )
-    parser.add_argument("-Q", "--quantize", default=None, action="store_true")
     parser.add_argument("-E", "--embedding-quantize", default=None, action="store_true")
     parser.add_argument(
-        "--pt2_quantize",
+        "--pt2e_quantize",
        default=None,
        help="Use PT2E quantization. Comma separated options. e.g. xnnpack_dynamic, embedding.",
     )
+    parser.add_argument(
+        "-qmode",
+        "--quantization_mode",
+        type=str,
+        default=None,
+        choices=["int8", "int4"],
+        help="type of quantization",
+    )
 
     parser.add_argument(
         "-c",
@@ -181,6 +204,7 @@ def build_args_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "-s",
         "--so_library",
+        default=None,
         required=False,
         help="shared library for quantized operators",
     )
@@ -253,12 +277,12 @@ def get_metadata(params: ModelArgs, args: argparse.Namespace) -> Dict[str, Any]:
 
 
 def _get_quantization_options(args):
-    if args.pt2_quantize is None:
+    if args.pt2e_quantize is None:
         return []
-    if args.quantize:
-        raise ValueError("Cannot specify both --quantize and --pt2_quantize")
+    if args.quantization_mode:
+        raise ValueError("Cannot specify both --quantization_mode and --pt2e_quantize")
 
-    quantization_options = args.pt2_quantize.split(",")
+    quantization_options = args.pt2e_quantize.split(",")
     quantization_options = [option.strip() for option in quantization_options]
     return quantization_options
 
@@ -312,16 +336,18 @@ def _export_llama(modelname, args) -> str:  # noqa: C901
     dim = torch.export.Dim("token_dim", max=model.params.max_seq_len - 1)
     dynamic_shapes = {"tokens": {1: dim}}
 
-    if args.quantized_ckpt or args.quantize:
+    if args.quantized_ckpt or args.quantization_mode:
         modelname = f"{modelname}_q"
-        model = quantize(model)
+        model = quantize(model, args.quantization_mode)
 
     if args.verbose:
         print(f"{modelname}:")
         print(f"{model}")
 
     if args.dtype_override is not None:
-        if args.dtype_override == "fp16" and metadata["get_dtype"] != 5:
+        if (
+            args.dtype_override == "fp16" and metadata["get_dtype"] != 5
+        ) or args.quantization_mode == "int4":
             model.to(dtype=torch.float16)
             metadata["get_dtype"] = 5
         elif args.dtype_override == "fp32" and metadata["get_dtype"] != 6:
@@ -361,6 +387,12 @@ def _export_llama(modelname, args) -> str:  # noqa: C901
         edge_manager = edge_manager.to_backend(XnnpackPartitioner())
         modelname = f"xnnpack_{modelname}"
 
+    # TODO: remove this after xnnpack delegation is ready
+    if args.quantization_mode == "int4":
+        raise Exception(
+            "some quantized ops should be lowered to xnnpack, but xnnpack delegate is not ready yet"
+        )
+
     export_program = edge_manager.to_executorch(
         ExecutorchBackendConfig(
             extract_constant_segment=True,
```
examples/models/llama2/llama_test.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -13,7 +13,7 @@ class LlamaTest(unittest.TestCase):
     def test_quantized_llama(self):
         _ = build_model(
             modelname="model",
-            extra_opts="--fairseq2 -Q",
+            extra_opts="--fairseq2 -qmode int8",
             par_local_output=True,
             resource_pkg_name=__name__,
         )
```

examples/models/llama2/model.py

Lines changed: 12 additions & 1 deletion
```diff
@@ -174,7 +174,12 @@ def __init__(self, args: ModelArgs):
         self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
         self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False)
 
-        mask = torch.full((1, 1, args.max_seq_len, args.max_seq_len), float("-inf"))
+        mask = torch.full(
+            (1, 1, args.max_seq_len, args.max_seq_len),
+            float("-inf"),
+            dtype=torch.float16,
+        )
+
         mask = torch.triu(mask, diagonal=1)
         self.register_buffer("mask", mask)
 
@@ -546,6 +551,12 @@ def __init__(self, **kwargs):
 
             simple_quantizer = WeightOnlyInt8QuantHandler(self.model_)
             self.model_ = simple_quantizer.convert_for_runtime()
+        elif "int4" in str(checkpoint_path):
+            print("Using int4 weight-only quantization!")
+            from .quantize import Int8DynActInt4WeightQuantHandler
+
+            simple_quantizer = Int8DynActInt4WeightQuantHandler(self.model_)
+            self.model_ = simple_quantizer.convert_for_runtime()
 
         self.model_.load_state_dict(
             checkpoint, strict=False
```
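
The model.py change selects the int4 handler whenever the checkpoint path contains "int4", so the loaded graph contains linears whose weights were quantized offline while activation scales are computed per token at call time. The sketch below shows how such a converted linear could evaluate in a fake-quantized reference form, reusing the illustrative group-wise layout from the earlier sketch; it is an assumption-based illustration, not the code the handler emits.

```python
import torch


def int8da_int4w_linear(x, w_int4, w_scales, group_size: int = 32):
    """Reference forward: dequantize int4 weights, dynamically quantize/dequantize
    activations per token, then do a float matmul (illustrative only)."""
    out_features, in_features = w_int4.shape
    # Dequantize the group-wise int4 weights back to float.
    w_deq = (
        w_int4.reshape(out_features, in_features // group_size, group_size).float()
        * w_scales.unsqueeze(-1)
    ).reshape(out_features, in_features)
    # Per-token dynamic int8 quantize/dequantize of the activations.
    x_scales = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-6) / 127.0
    x_deq = torch.clamp(torch.round(x / x_scales), -128, 127) * x_scales
    return x_deq @ w_deq.t()
```

A real kernel would keep the int8/int4 tensors and accumulate in integer arithmetic; the float round trip here only mirrors the numerics.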
