
Commit 82d748d

Cover case where some weights are unquantized
1 parent a73f107 commit 82d748d

4 files changed: +83 -61 lines changed


examples/models/checkpoint.py

Lines changed: 3 additions & 1 deletion
@@ -9,6 +9,8 @@
 from pathlib import Path
 from typing import Any, Dict, Optional
 
+import torch
+
 
 def get_default_model_resource_dir(model_file_path: str) -> Path:
     """
@@ -52,7 +54,7 @@ def get_default_model_resource_dir(model_file_path: str) -> Path:
     return resource_dir
 
 
-def get_checkpoint_dtype(checkpoint: Dict[str, Any]) -> Optional[str]:
+def get_checkpoint_dtype(checkpoint: Dict[str, Any]) -> Optional[torch.dtype]:
     """
     Get the dtype of the checkpoint, returning "None" if the checkpoint is empty.
     """

examples/models/llama/export_llama_lib.py

Lines changed: 30 additions & 23 deletions
@@ -15,6 +15,7 @@
 import re
 import shlex
 from enum import Enum
+from functools import partial
 from json import JSONDecodeError
 from pathlib import Path
 from typing import Callable, List, Optional, Union
@@ -597,20 +598,31 @@ def _prepare_for_llama_export(args) -> LLMEdgeManager:
         args=args,
     )
 
-    # Assumes the checkpoint has uniform dtype.
-    checkpoint_dtype = next(edge_manager.model.parameters()).dtype
-    logging.info(f"checkpoint dtype: {checkpoint_dtype}")
-    # We want to quantize the weights of the model in the checkpoint dtype.
+    # At this point, the model is loaded in the default fp32.
+
+    # Convert the non-weights of the model (the buffers) to the dtype_override.
+    # Need to do this before source transform quantization since the quantized
+    # parameters become buffers.
+    for buf in edge_manager.model.buffers():
+        buf.data = buf.data.to(dtype=dtype_override.to_torch_dtype())
+
+    # We want to quantize (in the source transforms) the weights of the model
+    # in the checkpoint dtype.
+    logging.info(f"Checkpoint dtype: {edge_manager.model.checkpoint_dtype}")
     edge_manager = edge_manager.set_output_dir(output_dir_path).source_transform(
         _get_source_transforms(
-            args.model, DType.from_torch_dtype(checkpoint_dtype), args
+            args.model,
+            DType.from_torch_dtype(edge_manager.model.checkpoint_dtype),
+            args,
         )
     )
 
-    _set_quantized_computation_dtype(
-        edge_manager.model,
-        dtype_override.to_torch_dtype(),  # pyre-ignore[16]
-    )
+    # Convert the parameters to the dtype_override.
+    # If source transform quantization has already happened at this point (-qmode),
+    # the quantized weights will become buffers and not be returned by .parameters(),
+    # so we don't convert them to the dtype_override.
+    for param in edge_manager.model.parameters():
+        param.data = param.data.to(dtype=dtype_override.to_torch_dtype())
 
     return edge_manager
 
@@ -785,8 +797,6 @@ def _export_llama(args) -> LLMEdgeManager:  # noqa: C901
             shares=args.num_sharding,
         )
 
-        from functools import partial
-
         # pyre-ignore
         from executorch.backends.qualcomm.quantizer.custom_annotation import (
             get_custom_quant_ios_dtype,
@@ -989,21 +999,11 @@ def _load_llama_model(
         )
     )
 
-    if dtype_override:
-        assert isinstance(
-            dtype_override, DType
-        ), "Override dtype needs to be of type <DType>"
-        dtype = dtype_override
-    else:
-        checkpoint_dtype = next(model.parameters()).dtype
-        dtype = DType.from_torch_dtype(checkpoint_dtype)
-    logging.info(f"Loaded model with dtype={dtype}")
-
     return LLMEdgeManager(
         model=model,
         modelname=modelname,
         max_seq_len=model.max_seq_len,
-        dtype=dtype,
+        dtype=dtype_override,
         use_kv_cache=use_kv_cache,
         generate_full_logits=generate_full_logits,
         example_inputs=example_inputs,
@@ -1088,7 +1088,14 @@ def _get_source_transforms(  # noqa
        this wil be a no-op.
        """
        modelname = f"{modelname}_e"
-        transforms.append(get_quant_embedding_transform(args))
+        transforms.append(get_quant_embedding_transform(args, dtype_override))
+
+    if args.quantization_mode or args.embedding_quantize:
+        transforms.append(
+            partial(
+                _set_quantized_computation_dtype, dtype=dtype_override.to_torch_dtype()
+            )
+        )
 
     if args.expand_rope_table:
         transforms.append(materialze_broadcast_of_rope_freq_cis)
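The ordering above matters because source-transform quantization re-registers quantized weights as buffers, which .parameters() no longer returns. The toy script below mirrors the three steps with invented names (this is not ExecuTorch code, just a sketch of the mechanism): cast buffers before quantization, quantize, then cast only the remaining float parameters, leaving the int8 data untouched.

import torch
import torch.nn as nn


class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(8, 8)                      # will be "quantized"
        self.head = nn.Linear(8, 8)                        # stays unquantized
        self.register_buffer("rope_freqs", torch.ones(8))  # a non-weight buffer


model = ToyModel()
override = torch.bfloat16

# 1) Convert the buffers first, while the only buffers are the fp "non-weights".
for buf in model.buffers():
    buf.data = buf.data.to(dtype=override)

# 2) Stand-in for source-transform quantization: the float parameter is removed
#    and the quantized weight comes back as an int8 buffer.
int8_weight = torch.randint(-128, 127, model.linear.weight.shape, dtype=torch.int8)
del model.linear.weight
model.linear.register_buffer("weight", int8_weight)

# 3) Convert whatever .parameters() still returns; the int8 buffer is skipped.
for param in model.parameters():
    param.data = param.data.to(dtype=override)

print(model.linear.weight.dtype)  # torch.int8 (untouched)
print(model.head.weight.dtype)    # torch.bfloat16
print(model.rope_freqs.dtype)     # torch.bfloat16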

examples/models/llama/model.py

Lines changed: 5 additions & 12 deletions
@@ -54,7 +54,6 @@ def __init__(self, **kwargs):
         self.output_prune_map_path = kwargs.get("output_prune_map_path", None)
         self.max_seq_len = kwargs.get("max_seq_len", 128)
         self.max_context_len = kwargs.get("max_context_len", 128)
-        self.dtype = kwargs.get("dtype", None)
         self.args = kwargs.get("args", None)
 
         assert (
@@ -123,9 +122,6 @@ def __init__(self, **kwargs):
                """
            )
 
-        # Get checkpoint dtype.
-        self.checkpoint_dtype = get_checkpoint_dtype(checkpoint)
-
         with open(params_path, "r") as f:
             params = json.loads(f.read())
         output_prune_map = None
@@ -174,14 +170,7 @@ def __init__(self, **kwargs):
         with torch.device("meta"):
             # Model itself is loaded in default dtype, fp32.
             self.model_ = Transformer(model_args)
-            if self.dtype:
-                self.model_.to(dtype=self.dtype)
-
-        # Convert the model's weights only to the checkpoint's dtype, so that
-        # the checkpoint can be loaded into the model's state dict in its
-        # own dtype w/o potential precision loss.
-        for param in self.model_.parameters():
-            param.data = param.data.to(dtype=self.checkpoint_dtype)
+            self.model_.checkpoint_dtype = get_checkpoint_dtype(checkpoint)
 
         if "int8" in str(checkpoint_path):
             print("Using int8 weight-only quantization!")
@@ -251,6 +240,10 @@ def __init__(self, **kwargs):
         # assign=True: load params/buffers by assignment instead of performing an in-place copy.
         # Because we are using device="meta", tensors do not have memory associated with them
         # and an in-place copy is a no-op. Use assign=True in load_state_dict for this scenario.
+
+        # Also, the checkpoint is loaded and dtype promoted to the transformer's dtype, which is
+        # by default initialized to fp32. This is fine because every other supported type
+        # losslessly converts to fp32, so we don't lose precision here.
         missing, unexpected = self.model_.load_state_dict(
             checkpoint,
             strict=False,
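The new comment relies on fp32 being a strict superset of the other supported floating-point checkpoint dtypes. A quick standalone check of that claim (an illustration, not part of the commit):

import torch

# Widening fp16/bf16 values to fp32 and narrowing back reproduces the original
# values exactly, so loading a low-precision checkpoint into an fp32 module
# does not lose information.
for dtype in (torch.float16, torch.bfloat16):
    original = torch.randn(1024).to(dtype)
    widened = original.to(torch.float32)
    assert torch.equal(widened.to(dtype), original), f"lossy round-trip for {dtype}"
print("fp16/bf16 -> fp32 promotion is lossless")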

examples/models/llama/source_transformation/quantize.py

Lines changed: 45 additions & 25 deletions
@@ -72,7 +72,9 @@ def quantize(  # noqa C901
 
     if qmode == "int8":
         # Add quantization mode options here: group size, bit width, etc.
-        return WeightOnlyInt8QuantHandler(model).quantized_model()
+        return WeightOnlyInt8QuantHandler(
+            model, precision=torch_dtype
+        ).quantized_model()
     elif qmode.startswith("torchao:fpa"):
         pattern = r"torchao:fpa(\d+)w"
         matches = re.findall(pattern, qmode)
@@ -85,7 +87,7 @@ def quantize(  # noqa C901
         model = (
             UIntxWeightOnlyLinearQuantizer(
                 device="mps",
-                precision=torch.float32,
+                precision=torch_dtype,
                 groupsize=group_size,
                 bitwidth=bitwidth,
             )
@@ -107,7 +109,7 @@ def quantize(  # noqa C901
         with torch.no_grad():
             model = Int8DynActIntxWeightLinearQuantizer(
                 device="cpu",
-                precision=torch.float32,
+                precision=torch_dtype,
                 groupsize=group_size,
                 bitwidth=bitwidth,
                 has_weight_zeros=False,
@@ -346,6 +348,7 @@ def __init__(
         node_type: str = "*",
         bitwidth: Optional[int] = None,
         group_size: Optional[int] = None,
+        precision: torch.dtype = torch.float32,
     ):
         self.mod = mod
         self.group_size = group_size
@@ -354,6 +357,7 @@ def __init__(
             self.bitwidth = 8
         else:
             self.bitwidth = bitwidth
+        self.precision = precision
 
     @torch.no_grad()
     def create_quantized_state_dict(self) -> Dict:
@@ -389,7 +393,7 @@ def create_quantized_state_dict(self) -> Dict:
 
                 # print(f"expanded weight shape {input_weight.shape}")
                 weight, scales, _ = dynamically_quantize_per_channel(
-                    input_weight,
+                    input_weight.to(dtype=self.precision),
                     range_min,
                     range_max,
                     torch.int8,
@@ -574,6 +578,7 @@ def __init__(
         bitwidth: int = 8,
         group_size: Optional[int] = None,
         packed=False,
+        precision: Optional[torch.dtype] = None,
     ):
         if isinstance(packed, str):
             packed = packed == "True"
@@ -582,6 +587,8 @@ def __init__(
         self.group_size = group_size
         self.bitwidth = bitwidth
         self.packed = packed
+        # Dtype of the weights right before quantization.
+        self.precision = precision
         if (bitwidth not in [2, 4]) and packed:
             raise RuntimeError("pack only works with bitsize 2, 4")
 
@@ -612,7 +619,11 @@ def create_quantized_state_dict(self, packed=False) -> Dict:
                     f"quantize {fqn, mod} with group_size {self.group_size}, bitwidth {self.bitwidth}"
                 )
                 weight, scales, _ = dynamically_quantize_per_channel(
-                    mod.weight,
+                    (
+                        mod.weight.to(dtype=self.precision)
+                        if self.precision
+                        else mod.weight
+                    ),
                     range_min,
                     range_max,
                     torch.int8,
@@ -748,7 +759,7 @@ def forward(self, indices: torch.Tensor) -> torch.Tensor:
 ############################ Source Transform Start #######################
 
 
-def get_quant_embedding_transform(args):
+def get_quant_embedding_transform(args, dtype_override: Optional[DType] = None):
     if args.embedding_quantize.startswith("torchao:"):
         bitwidth, group_size = args.embedding_quantize.split(":")[1].split(",")
         group_size = int(group_size)
@@ -774,11 +785,13 @@ def _torchao_embedding_quantizer(model):
     else:
         group_size = int(group_size)
         bitwidth = int(bitwidth)
+        torch_dtype = dtype_override.to_torch_dtype() if dtype_override else None
         return lambda model: EmbeddingQuantHandler(
             model,
             bitwidth=bitwidth,
             group_size=group_size,
             packed=(bitwidth in [2, 4]),
+            precision=torch_dtype,
         ).quantized_model()
 
 
@@ -831,25 +844,32 @@ def _load_torchao_aten_lib(libname):
 # We want to do compute the actual ops in the dtype of the dtype_override,
 # since the precision of the quantized linear will initially be the dtype of the
 # checkpoint, not the dtype_override.
-# TODO(#8652): this is a temporary solution for until we can support the new ao,
-# quantize_ api, which apparently can support different dtypes at quantization and
-# computation.
-def _set_quantized_computation_dtype(module: nn.Module, dtype: torch.dtype):
-    """
-    Recursively iterate through the module and set the dtype/precision attributes
-    of all Int8DynActInt4WeightLinear and QuantizedGroupEmbedding submodules to 'fp32'.
-    """
-    for name, child in module.named_children():
-        if isinstance(child, Int8DynActInt4WeightLinear):
-            # Change the precision attribute to 'fp32'
-            child.precision = dtype
-            print(f"Changed precision of {name} to {dtype}")
-        elif isinstance(child, QuantizedGroupEmbedding):
-            child.dtype = dtype
-            print(f"Changed precision of {name} to {dtype}")
-        else:
-            # Recursively apply to child modules
-            _set_quantized_computation_dtype(child, dtype)
+def _set_quantized_computation_dtype(
+    module: nn.Module, dtype: torch.dtype
+) -> nn.Module:
+    def _set_quantized_computation_dtype_rec(
+        module: nn.Module, dtype: torch.dtype
+    ) -> None:
+        """
+        Recursively iterate through the module and set the dtype/precision attributes
+        of all Int8DynActInt4WeightLinear and QuantizedGroupEmbedding submodules to 'fp32'.
+        """
+        for name, child in module.named_children():
+            if isinstance(child, Int8DynActInt4WeightLinear):
+                # Change the precision attribute to 'fp32'
+                child.precision = dtype
+                print(f"Changed precision of {name} to {dtype}")
+            elif isinstance(child, QuantizedGroupEmbedding):
+                child.dtype = dtype
+                print(f"Changed precision of {name} to {dtype}")
+            elif isinstance(child, WeightOnlyInt8Linear):
+                child.dtype = dtype
+            else:
+                # Recursively apply to child modules
+                _set_quantized_computation_dtype_rec(child, dtype)
+
+    _set_quantized_computation_dtype_rec(module, dtype)
+    return module
 
 
 ############################ Source Transform End #######################