
Commit 91c0d0c

jackzhxng authored and facebook-github-bot committed
Fix xnnpack quantization discrepancy for non-fp32 (#8488)
Summary:

Perform quantization on the weights in their original dtype (from the checkpoint) by running the source transformations before the dtype cast. Previously the model was converted to the `dtype_override` arg's dtype and then quantized, which introduced quantization noise; this change eliminates it.

Note: no need to worry about https://github.com/pytorch/ao/blob/main/torchao/quantization/GPTQ.py#L1168, since precision is passed in with the checkpoint dtype.

### Comparison of an arbitrary q_proj tensor from a sample Llama checkpoint

Before:

```
Mismatched elements: 3260378 / 4194304 (77.7%)
Greatest absolute difference: 0.08802086114883423 at index (1129, 604) (up to 1e-05 allowed)
Greatest relative difference: 1.0 at index (0, 1350) (up to 1.3e-06 allowed)
Signal-to-noise: 32.8974 dB
```

After: no difference.

Test Plan:

### Manual testing

```
python -m examples.models.llama.export_llama \
  -v -c xl_consolidated/consolidated_renamed.pth \
  -p xl_consolidated/et_params.json -kv -d fp32 \
  -qmode 8da4w --group_size 32 -X \
  --use_sdpa_with_kv_cache \
  --output_name quantized_baseline.pte \
  --max_context_length 4096 -E 4,32
```

With the following inserted after the quantization:

```
edge_manager.model(
    torch.tensor([[2, 3, 4]], dtype=torch.long),
    {"input_pos": torch.tensor([0], dtype=torch.long)},
)
```

And the modifications to GPTQ.py in torchao from pytorch/ao#1756 for testing.

### Automated testing

Existing CI tests.

### Regression testing

TBD

Differential Revision: D70184325

Pulled By: jackzhxng
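The discrepancy can be reproduced outside the exporter with a toy group-wise quantizer. The sketch below is hand-rolled for illustration only (it is not the torchao 8da4w implementation, and the tensor shape, group size, and names are arbitrary): quantizing weights that were first cast to a lower-precision override dtype yields different scales and codes than quantizing the checkpoint-dtype weights directly and casting afterwards.

```python
import torch


def int4_groupwise_dequant(w: torch.Tensor, group_size: int = 32) -> torch.Tensor:
    """Toy symmetric group-wise int4 quantize + dequantize (illustration only)."""
    groups = w.to(torch.float32).reshape(-1, group_size)
    scales = groups.abs().amax(dim=1, keepdim=True) / 7.0  # map the per-group max to int4 code 7
    scales = torch.where(scales == 0, torch.ones_like(scales), scales)  # avoid divide-by-zero
    q = torch.clamp(torch.round(groups / scales), -8, 7)
    return (q * scales).reshape(w.shape)


# Pretend these are checkpoint weights (e.g. a q_proj slice) stored in fp32.
w_checkpoint = torch.randn(4096, 64)

# Old flow: cast to the dtype_override (fp16 here) first, then quantize.
old = int4_groupwise_dequant(w_checkpoint.to(torch.float16))

# New flow: quantize the checkpoint-dtype weights directly; the cast happens afterwards.
new = int4_groupwise_dequant(w_checkpoint)

print((old - new).abs().max())  # nonzero: the fp16 round trip shifts scales and rounding
```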
1 parent dedfdaf commit 91c0d0c

File tree

6 files changed: +147 -84 lines


examples/models/checkpoint.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -9,6 +9,8 @@
 from pathlib import Path
 from typing import Any, Dict, Optional
 
+import torch
+
 
 def get_default_model_resource_dir(model_file_path: str) -> Path:
     """
@@ -52,7 +54,7 @@ def get_default_model_resource_dir(model_file_path: str) -> Path:
     return resource_dir
 
 
-def get_checkpoint_dtype(checkpoint: Dict[str, Any]) -> Optional[str]:
+def get_checkpoint_dtype(checkpoint: Dict[str, Any]) -> Optional[torch.dtype]:
    """
    Get the dtype of the checkpoint, returning "None" if the checkpoint is empty.
    """
```

examples/models/llama/export_llama_lib.py

Lines changed: 71 additions & 64 deletions
```diff
@@ -15,6 +15,7 @@
 import re
 import shlex
 from enum import Enum
+from functools import partial
 from json import JSONDecodeError
 from pathlib import Path
 from typing import Callable, List, Optional, Union
@@ -55,6 +56,7 @@
 
 from .source_transformation.attention import replace_attention_to_attention_sha
 from .source_transformation.quantize import (
+    set_quantized_computation_dtype,
     get_quant_embedding_transform,
     get_quant_weight_transform,
 )
@@ -563,43 +565,63 @@ def _prepare_for_llama_export(args) -> LLMEdgeManager:
     output_dir_path = canonical_path(args.output_dir, dir=True)
     weight_type = WeightType.FAIRSEQ2 if args.fairseq2 else WeightType.LLAMA
 
-    # dtype override
-    if args.dtype_override is not None:
-        dtype_override = DType[args.dtype_override]
-    elif args.quantization_mode in ["8da4w", "8da4w-gptq"]:
-        dtype_override = DType["fp16"]
-    else:
-        dtype_override = None
+    # Convert dtype override string arg to actual type.
+    dtype_override = DType[args.dtype_override]
+
+    edge_manager = _load_llama_model(
+        args.model,
+        checkpoint=checkpoint_path,
+        checkpoint_dir=checkpoint_dir,
+        params_path=params_path,
+        use_kv_cache=args.use_kv_cache,
+        use_sdpa_with_kv_cache=args.use_sdpa_with_kv_cache,
+        generate_full_logits=args.generate_full_logits,
+        weight_type=weight_type,
+        enable_dynamic_shape=args.enable_dynamic_shape,
+        calibration_tasks=args.calibration_tasks,
+        calibration_limit=args.calibration_limit,
+        calibration_seq_length=args.calibration_seq_length,
+        calibration_data=args.calibration_data,
+        tokenizer_path=args.tokenizer_path,
+        verbose=args.verbose,
+        max_seq_len=args.max_seq_length,
+        max_context_len=args.max_context_length,
+        input_prune_map_path=args.input_prune_map,
+        output_prune_map_path=args.output_prune_map,
+        metadata_str=args.metadata,
+        dtype_override=dtype_override,
+        args=args,
+    )
+
+    # At this point, the model is loaded in the default fp32.
+
+    # Convert the non-weights of the model (the buffers) to the dtype_override.
+    # Need to do this before source transform quantization since the quantized
+    # parameters become buffers.
+    for buf in edge_manager.model.buffers():
+        buf.data = buf.data.to(dtype=dtype_override.to_torch_dtype())
 
-    return (
-        _load_llama_model(
+    # We want to quantize (in the source transforms) the weights of the model
+    # in the checkpoint dtype.
+    logging.info(f"Checkpoint dtype: {edge_manager.model.checkpoint_dtype}")
+    edge_manager = edge_manager.set_output_dir(output_dir_path).source_transform(
+        _get_source_transforms(
             args.model,
-            checkpoint=checkpoint_path,
-            checkpoint_dir=checkpoint_dir,
-            params_path=params_path,
-            use_kv_cache=args.use_kv_cache,
-            use_sdpa_with_kv_cache=args.use_sdpa_with_kv_cache,
-            generate_full_logits=args.generate_full_logits,
-            weight_type=weight_type,
-            enable_dynamic_shape=args.enable_dynamic_shape,
-            calibration_tasks=args.calibration_tasks,
-            calibration_limit=args.calibration_limit,
-            calibration_seq_length=args.calibration_seq_length,
-            calibration_data=args.calibration_data,
-            tokenizer_path=args.tokenizer_path,
-            verbose=args.verbose,
-            max_seq_len=args.max_seq_length,
-            max_context_len=args.max_context_length,
-            input_prune_map_path=args.input_prune_map,
-            output_prune_map_path=args.output_prune_map,
-            metadata_str=args.metadata,
-            dtype_override=dtype_override,
-            args=args,
+            dtype_override,
+            DType.from_torch_dtype(edge_manager.model.checkpoint_dtype),
+            args,
         )
-        .set_output_dir(output_dir_path)
-        .source_transform(_get_source_transforms(args.model, dtype_override, args))
     )
 
+    # Convert the parameters to the dtype_override.
+    # If source transform quantization has already happened at this point (-qmode),
+    # the quantized weights will become buffers and not be returned by .parameters(),
+    # so we don't convert them to the dtype_override.
+    for param in edge_manager.model.parameters():
+        param.data = param.data.to(dtype=dtype_override.to_torch_dtype())
+
+    return edge_manager
+
 
 def get_quantizer_and_quant_params(args):
     pt2e_quant_params = get_pt2e_quantization_params(
@@ -783,8 +805,6 @@ def _to_edge_and_lower_llama( # noqa: C901
             shares=args.num_sharding,
         )
 
-        from functools import partial
-
         # pyre-ignore
         from executorch.backends.qualcomm.quantizer.custom_annotation import (
            get_custom_quant_ios_dtype,
@@ -1004,6 +1024,8 @@ def _load_llama_model(
     else:
         raise ValueError(f"{modelname} is not a valid Llama model.")
 
+    torch_dtype = dtype_override.to_torch_dtype() if dtype_override else None
+
     model, example_inputs, example_kwarg_inputs, dynamic_shapes = (
         EagerModelFactory.create_model(
             module_name,
@@ -1020,41 +1042,16 @@ def _load_llama_model(
             enable_dynamic_shape=enable_dynamic_shape,
             input_prune_map_path=input_prune_map_path,
             output_prune_map_path=output_prune_map_path,
+            dtype=torch_dtype,
             args=args,
         )
     )
-    if dtype_override:
-        assert isinstance(
-            dtype_override, DType
-        ), "Override dtype needs to be of type <DType>"
-        torch_dtype = dtype_override.to_torch_dtype()
-        logging.info(f"model.to {torch_dtype}")
-        model = model.to(dtype=torch_dtype)
-        dtype = dtype_override
-    else:
-        state_dict = model.state_dict()
-        dtype = state_dict[next(iter(state_dict))].dtype
-        assert dtype in [
-            torch.bfloat16,
-            torch.float16,
-            torch.float32,
-        ], f"Only support bfloat16, fp16 or fp32 got {dtype}"
-        logging.info(f"Loaded model with dtype={dtype}")
-
-        if dtype == torch.bfloat16:
-            dtype = DType.bf16
-        elif dtype == torch.float16:
-            dtype = DType.fp16
-        elif dtype == torch.float32:
-            dtype = DType.fp32
-        else:
-            raise ValueError(f"Unsupported dtype {dtype}")
 
     return LLMEdgeManager(
         model=model,
         modelname=modelname,
         max_seq_len=model.max_seq_len,
-        dtype=dtype,
+        dtype=dtype_override,
         use_kv_cache=use_kv_cache,
         generate_full_logits=generate_full_logits,
         example_inputs=example_inputs,
@@ -1091,7 +1088,10 @@ def _load_llama_model(
 
 
 def _get_source_transforms( # noqa
-    modelname: str, dtype_override: Optional[DType], args
+    modelname: str,
+    dtype_override: DType,
+    checkpoint_dtype: Optional[DType],
+    args,
 ) -> List[Callable[[torch.nn.Module], torch.nn.Module]]:
     transforms = []
 
@@ -1125,7 +1125,7 @@ def _get_source_transforms( # noqa
        """
        modelname = f"{modelname}_q"
        transforms.append(
-           get_quant_weight_transform(args, dtype_override, verbose_export())
+           get_quant_weight_transform(args, checkpoint_dtype, verbose_export())
        )
 
    if args.embedding_quantize:
@@ -1139,7 +1139,14 @@ def _get_source_transforms( # noqa
        this wil be a no-op.
        """
        modelname = f"{modelname}_e"
-       transforms.append(get_quant_embedding_transform(args))
+       transforms.append(get_quant_embedding_transform(args, checkpoint_dtype))
+
+   if args.quantization_mode or args.embedding_quantize:
+       transforms.append(
+           partial(
+               set_quantized_computation_dtype, dtype=dtype_override.to_torch_dtype()
+           )
+       )
 
    if args.expand_rope_table:
        transforms.append(materialze_broadcast_of_rope_freq_cis)
```
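The comments in this diff rely on a PyTorch detail worth spelling out: tensors registered as buffers are returned by `Module.buffers()` but not by `Module.parameters()`, so once source-transform quantization replaces float weights with int buffers, the later parameter-only cast leaves them untouched. A minimal stand-in module (a toy with a similar buffer layout, not the torchao 8da4w linear) shows the effect:

```python
import torch
from torch import nn


class ToyQuantizedLinear(nn.Module):
    """Stand-in for a quantized linear: the int weight and scales are buffers, not parameters."""

    def __init__(self, out_features: int, in_features: int):
        super().__init__()
        self.register_buffer("weight_int8", torch.zeros(out_features, in_features, dtype=torch.int8))
        self.register_buffer("scales", torch.ones(out_features, 1, dtype=torch.float32))


model = nn.Sequential(ToyQuantizedLinear(8, 8), nn.Linear(8, 8))

# Casting parameters only touches the remaining float weights (here the plain nn.Linear);
# the int8 buffer is skipped, which is why the dtype cast can safely run after quantization.
for param in model.parameters():
    param.data = param.data.to(dtype=torch.float16)

print({name: b.dtype for name, b in model.named_buffers()})     # buffers keep int8 / fp32
print({name: p.dtype for name, p in model.named_parameters()})  # parameters are now fp16
```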

examples/models/llama/model.py

Lines changed: 7 additions & 11 deletions
```diff
@@ -122,9 +122,6 @@ def __init__(self, **kwargs):
                """
            )
 
-        # Get checkpoint dtype.
-        self.dtype = get_checkpoint_dtype(checkpoint)
-
         with open(params_path, "r") as f:
             params = json.loads(f.read())
         output_prune_map = None
@@ -171,7 +168,9 @@ def __init__(self, **kwargs):
         # Within the device="meta" context, tensors that are created do not carry data.
         # They possess all other metadata a tensor carries such as size, stride, requires_grad.
         with torch.device("meta"):
+            # Model itself is loaded in default dtype, fp32.
             self.model_ = Transformer(model_args)
+            self.model_.checkpoint_dtype = get_checkpoint_dtype(checkpoint)
 
         if "int8" in str(checkpoint_path):
             print("Using int8 weight-only quantization!")
@@ -241,6 +240,10 @@ def __init__(self, **kwargs):
         # assign=True: load params/buffers by assignment instead of performing an in-place copy.
         # Because we are using device="meta", tensors do not have memory associated with them
         # and an in-place copy is a no-op. Use assign=True in load_state_dict for this scenario.
+
+        # Also, the checkpoint is loaded and dtype promoted to the transformer's dtype, which is
+        # by default initialized to fp32. This is fine because every other supported type
+        # losslessly converts to fp32, so we don't lose precision here.
         missing, unexpected = self.model_.load_state_dict(
             checkpoint,
             strict=False,
@@ -277,14 +280,7 @@ def __init__(self, **kwargs):
             self.model_ = prune_output_vocab(self.model_, output_prune_map)
 
     def get_eager_model(self) -> torch.nn.Module:
-        if self.dtype:
-            # convert to the type of the provided checkpoint
-            # input and output are torch.long, so signature unchanged
-            return self.model_.to(self.dtype)
-        else:
-            # int8 quantization code has some bf16,
-            # switch all to FP32
-            return self.model_.to(torch.float32)
+        return self.model_
 
     def get_example_inputs(self):
         if self.use_kv_cache:
```
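The new comment about loading the checkpoint into the default-fp32 `Transformer` being lossless follows from fp32 being a superset of the other supported dtypes: every bf16 and fp16 value is exactly representable in fp32. A quick check (assuming CPU tensors with values from `randn`, purely for illustration):

```python
import torch

w_bf16 = torch.randn(1024).to(torch.bfloat16)
w_fp16 = torch.randn(1024).to(torch.float16)

# Upcasting to fp32 and back reproduces the original values exactly, so no precision is lost.
assert torch.equal(w_bf16.to(torch.float32).to(torch.bfloat16), w_bf16)
assert torch.equal(w_fp16.to(torch.float32).to(torch.float16), w_fp16)
```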
