@@ -56,9 +56,9 @@
 
 from .source_transformation.attention import replace_attention_to_attention_sha
 from .source_transformation.quantize import (
-    set_quantized_computation_dtype,
     get_quant_embedding_transform,
     get_quant_weight_transform,
+    set_quantized_computation_dtype,
 )
 from .source_transformation.quantized_kv_cache import (
     replace_kv_cache_with_custom_kv_cache,
@@ -596,31 +596,24 @@ def _prepare_for_llama_export(args) -> LLMEdgeManager:
 
     # At this point, the model is loaded in the default fp32.
 
-    # Convert the non-weights of the model (the buffers) to the dtype_override.
-    # Need to do this before source transform quantization since the quantized
-    # parameters become buffers.
-    for buf in edge_manager.model.buffers():
-        buf.data = buf.data.to(dtype=dtype_override.to_torch_dtype())
+    # TODO: validate the combination of checkpoint dtype and dtype_override.
+
+    edge_manager.model = edge_manager.model.to(dtype=dtype_override.to_torch_dtype())
 
     # We want to quantize (in the source transforms) the weights of the model
     # in the checkpoint dtype.
     logging.info(f"Checkpoint dtype: {edge_manager.model.checkpoint_dtype}")
     edge_manager = edge_manager.set_output_dir(output_dir_path).source_transform(
         _get_source_transforms(
-            args.model,
-            dtype_override,
-            DType.from_torch_dtype(edge_manager.model.checkpoint_dtype),
-            args,
+            modelname=args.model,
+            dtype_override=dtype_override,
+            checkpoint_dtype=DType.from_torch_dtype(
+                edge_manager.model.checkpoint_dtype
+            ),
+            args=args,
         )
     )
 
-    # Convert the parameters to the dtype_override.
-    # If source transform quantization has already happened at this point (-qmode),
-    # the quantized weights will become buffers and not be returned by .parameters(),
-    # so we don't convert them to the dtype_override.
-    for param in edge_manager.model.parameters():
-        param.data = param.data.to(dtype=dtype_override.to_torch_dtype())
-
     return edge_manager
 
|