
Commit 68ead1b

Fix logic + separate weight and activation dtypes
1 parent 5e0f501 commit 68ead1b

File tree

examples/models/llama/export_llama_lib.py
examples/models/llama/model.py

2 files changed: +56 −13 lines
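
The commit title, "separate weight and activation dtypes", refers to keeping the dtype the weights are stored and quantized in (the checkpoint dtype) independent of the dtype the ops actually compute in (the dtype_override, fp32 here). Below is a minimal, illustrative sketch of that separation; `MixedDtypeLinear` and its names are hypothetical and not part of ExecuTorch or torchao.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class MixedDtypeLinear(nn.Module):
    """Toy linear layer: weight stored in one dtype, matmul run in another."""

    def __init__(self, weight: torch.Tensor, compute_dtype: torch.dtype = torch.float32):
        super().__init__()
        self.weight = nn.Parameter(weight, requires_grad=False)  # storage (checkpoint) dtype
        self.compute_dtype = compute_dtype  # activation/compute dtype

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Cast the weight up at call time; activations stay in the compute dtype.
        return F.linear(x.to(self.compute_dtype), self.weight.to(self.compute_dtype))

layer = MixedDtypeLinear(torch.randn(8, 8, dtype=torch.bfloat16))
out = layer(torch.randn(2, 8))
print(layer.weight.dtype, out.dtype)  # torch.bfloat16 torch.float32
```

Storing weights in the checkpoint dtype avoids a lossy round trip during quantization, while computing in fp32 keeps the activation math in the precision requested by dtype_override.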

examples/models/llama/export_llama_lib.py

Lines changed: 40 additions & 9 deletions
@@ -46,6 +46,7 @@
     get_vulkan_quantizer,
 )
 from executorch.util.activation_memory_profiler import generate_memory_trace
+from torchao.quantization.GPTQ import Int8DynActInt4WeightLinear
 
 from ..model_factory import EagerModelFactory
 from .source_transformation.apply_spin_quant_r1_r2 import (
@@ -57,6 +58,7 @@
 from .source_transformation.quantize import (
     get_quant_embedding_transform,
     get_quant_weight_transform,
+    QuantizedGroupEmbedding,
 )
 from .source_transformation.quantized_kv_cache import (
     replace_kv_cache_with_custom_kv_cache,
@@ -593,24 +595,53 @@ def _prepare_for_llama_export(args) -> LLMEdgeManager:
         dtype_override=dtype_override,
         args=args,
     )
+
+    # # Override dtype of the model as specified by the user args.
+    # if dtype_override:
+    #     assert isinstance(
+    #         dtype_override, DType
+    #     ), "Override dtype needs to be of type <DType>"
+    #     torch_dtype = dtype_override.to_torch_dtype()
+    #     logging.info(f"model.to {torch_dtype}")
+    #     edge_manager.model = edge_manager.model.to(dtype=torch_dtype)
+    #     metadata_str=args.metadata,
+    #     dtype_override=dtype_override,
+    #     args=args,
+    # )
+
     # Assumes the checkpoint has uniform dtype.
     checkpoint_dtype = next(edge_manager.model.parameters()).dtype
     print(f"checkpoint dtype: {checkpoint_dtype}")
-    # We want to quantize with the model in the checkpoint dtype before casting to dtype_override.
+    # We want to quantize the weights of the model in the checkpoint dtype.
     edge_manager = edge_manager.set_output_dir(output_dir_path).source_transform(
         _get_source_transforms(
             args.model, DType.from_torch_dtype(checkpoint_dtype), args
         )
     )
 
-    # Override dtype of the model as specified by the user args.
-    if dtype_override:
-        assert isinstance(
-            dtype_override, DType
-        ), "Override dtype needs to be of type <DType>"
-        torch_dtype = dtype_override.to_torch_dtype()
-        logging.info(f"model.to {torch_dtype}")
-        edge_manager.model = edge_manager.model.to(dtype=torch_dtype)
+    quantized = torch.load("/home/jackzhxng/torchrepos/executorch/fake_quantized_weights.pt")
+    breakpoint()
+    # torch.testing.assert_close()
+
+    # We want to compute the actual ops in the precision of the dtype_override.
+    def _set_precision_to_fp32(module):
+        """
+        Recursively iterate through the module and set the precision attribute
+        of all Int8DynActInt4WeightLinear submodules to 'fp32'.
+        """
+        for name, child in module.named_children():
+            if isinstance(child, Int8DynActInt4WeightLinear):
+                # Change the precision attribute to 'fp32'
+                child.precision = torch.float32
+                print(f"Changed precision of {name} to torch.float32")
+            elif isinstance(child, QuantizedGroupEmbedding):
+                child.dtype = torch.float32
+                print(f"Changed precision of {name} to torch.float32")
+            else:
+                # Recursively apply to child modules
+                _set_precision_to_fp32(child)
+
+    _set_precision_to_fp32(edge_manager.model)
 
     return edge_manager
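
The `_set_precision_to_fp32` helper added above walks the module tree and flips the compute precision of quantized submodules after quantization has already happened in the checkpoint dtype. Here is a self-contained sketch of the same traversal pattern; `FakeQuantLinear` is a hypothetical stand-in for torchao's `Int8DynActInt4WeightLinear`, which the diff assumes exposes a `precision` attribute.

```python
import torch
import torch.nn as nn

class FakeQuantLinear(nn.Linear):
    """Stand-in for a quantized linear that carries a `precision` attribute."""

    precision: torch.dtype = torch.float16

def set_compute_precision(module: nn.Module, dtype: torch.dtype = torch.float32) -> None:
    """Recursively set the compute precision of all FakeQuantLinear submodules."""
    for name, child in module.named_children():
        if isinstance(child, FakeQuantLinear):
            child.precision = dtype
            print(f"Changed precision of {name} to {dtype}")
        else:
            # Recurse only into children that are not themselves quantized linears.
            set_compute_precision(child, dtype)

model = nn.Sequential(nn.Embedding(10, 8), FakeQuantLinear(8, 8))
set_compute_precision(model)
print(model[1].precision)  # torch.float32
```

As in the diff, matching submodules are not recursed into; only non-matching children are traversed further.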

examples/models/llama/model.py

Lines changed: 16 additions & 4 deletions
@@ -54,6 +54,7 @@ def __init__(self, **kwargs):
         self.output_prune_map_path = kwargs.get("output_prune_map_path", None)
         self.max_seq_len = kwargs.get("max_seq_len", 128)
         self.max_context_len = kwargs.get("max_context_len", 128)
+        self.dtype = kwargs.get("dtype_override", None)
         self.args = kwargs.get("args", None)
 
         assert (
@@ -123,7 +124,7 @@ def __init__(self, **kwargs):
         )
 
         # Get checkpoint dtype.
-        self.dtype = get_checkpoint_dtype(checkpoint)
+        self.checkpoint_dtype = get_checkpoint_dtype(checkpoint)
 
         with open(params_path, "r") as f:
             params = json.loads(f.read())
@@ -171,7 +172,16 @@ def __init__(self, **kwargs):
         # Within the device="meta" context, tensors that are created do not carry data.
         # They possess all other metadata a tensor carries such as size, stride, requires_grad.
         with torch.device("meta"):
-            self.model_ = Transformer(model_args).to(dtype=self.dtype)
+            # Model itself is loaded in default dtype, fp32.
+            self.model_ = Transformer(model_args)
+            if self.dtype:
+                self.model_.to(dtype=self.dtype)
+
+            # Convert the model's weights only to the checkpoint's dtype, so that
+            # the checkpoint can be loaded into the model's state dict in its
+            # own dtype w/o potential precision loss.
+            for param in self.model_.parameters():
+                param.data = param.data.to(dtype=self.checkpoint_dtype)
 
         if "int8" in str(checkpoint_path):
             print("Using int8 weight-only quantization!")
@@ -265,10 +275,12 @@ def __init__(self, **kwargs):
             self.model_ = prune_output_vocab(self.model_, output_prune_map)
 
     def get_eager_model(self) -> torch.nn.Module:
-        if self.dtype:
+        return self.model_
+
+        if self.checkpoint_dtype:
             # convert to the type of the provided checkpoint
             # input and output are torch.long, so signature unchanged
-            return self.model_.to(self.dtype)
+            return self.model_.to(self.checkpoint_dtype)
         else:
             # int8 quantization code has some bf16,
             # switch all to FP32
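
The model.py change builds the Transformer on the meta device (default fp32, optionally dtype_override) and then casts only the parameters to the checkpoint dtype, so the checkpoint tensors can be attached without an implicit precision-changing cast. Below is a minimal sketch of that pattern under stated assumptions: `TinyModel`, the fabricated bf16 checkpoint, and the use of `load_state_dict(..., assign=True)` are illustrative, not the ExecuTorch loading code.

```python
import torch
import torch.nn as nn

# Pretend this came from disk; assume a uniform dtype, as the export code does.
checkpoint = {"linear.weight": torch.randn(4, 4, dtype=torch.bfloat16)}
checkpoint_dtype = next(iter(checkpoint.values())).dtype

class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 4, bias=False)

    def forward(self, x):
        return self.linear(x)

with torch.device("meta"):
    # Parameters carry shape/dtype metadata only, no storage.
    model = TinyModel()

# Mirror the diff: cast the parameters (weights only) to the checkpoint dtype.
for param in model.parameters():
    param.data = param.data.to(dtype=checkpoint_dtype)

# assign=True swaps the meta parameters for the real checkpoint tensors.
model.load_state_dict(checkpoint, strict=False, assign=True)
print(next(model.parameters()).dtype)  # torch.bfloat16
```

Keeping the module parameters in the checkpoint's dtype means any parameter that is not overwritten by the checkpoint still ends up in a dtype consistent with the loaded weights.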
