
Commit e68b028

initial
1 parent 643c381 commit e68b028

File tree: 3 files changed, +42 -124 lines

examples/models/llama/export_llama_lib.py

Lines changed: 40 additions & 59 deletions
```diff
@@ -561,42 +561,49 @@ def _prepare_for_llama_export(args) -> LLMEdgeManager:
     output_dir_path = canonical_path(args.output_dir, dir=True)
     weight_type = WeightType.FAIRSEQ2 if args.fairseq2 else WeightType.LLAMA
 
-    # dtype override
-    if args.dtype_override is not None:
-        dtype_override = DType[args.dtype_override]
-    elif args.quantization_mode in ["8da4w", "8da4w-gptq"]:
+    # Convert dtype override string to actual type.
+    if args.quantization_mode in ["8da4w", "8da4w-gptq"]:
         dtype_override = DType["fp16"]
     else:
-        dtype_override = None
+        dtype_override = DType[args.dtype_override]
 
-    return (
-        _load_llama_model(
-            args.model,
-            checkpoint=checkpoint_path,
-            checkpoint_dir=checkpoint_dir,
-            params_path=params_path,
-            use_kv_cache=args.use_kv_cache,
-            use_sdpa_with_kv_cache=args.use_sdpa_with_kv_cache,
-            generate_full_logits=args.generate_full_logits,
-            weight_type=weight_type,
-            enable_dynamic_shape=args.enable_dynamic_shape,
-            calibration_tasks=args.calibration_tasks,
-            calibration_limit=args.calibration_limit,
-            calibration_seq_length=args.calibration_seq_length,
-            calibration_data=args.calibration_data,
-            tokenizer_path=args.tokenizer_path,
-            verbose=args.verbose,
-            max_seq_len=args.max_seq_length,
-            max_context_len=args.max_context_length,
-            input_prune_map_path=args.input_prune_map,
-            output_prune_map_path=args.output_prune_map,
-            metadata_str=args.metadata,
-            dtype_override=dtype_override,
-            args=args,
-        )
-        .set_output_dir(output_dir_path)
-        .source_transform(_get_source_transforms(args.model, dtype_override, args))
+    edge_manager = _load_llama_model(
+        args.model,
+        checkpoint=checkpoint_path,
+        checkpoint_dir=checkpoint_dir,
+        params_path=params_path,
+        use_kv_cache=args.use_kv_cache,
+        use_sdpa_with_kv_cache=args.use_sdpa_with_kv_cache,
+        generate_full_logits=args.generate_full_logits,
+        weight_type=weight_type,
+        enable_dynamic_shape=args.enable_dynamic_shape,
+        calibration_tasks=args.calibration_tasks,
+        calibration_limit=args.calibration_limit,
+        calibration_seq_length=args.calibration_seq_length,
+        calibration_data=args.calibration_data,
+        tokenizer_path=args.tokenizer_path,
+        verbose=args.verbose,
+        max_seq_len=args.max_seq_length,
+        max_context_len=args.max_context_length,
+        input_prune_map_path=args.input_prune_map,
+        output_prune_map_path=args.output_prune_map,
+        metadata_str=args.metadata,
+        dtype_override=dtype_override,
+        args=args,
     )
+    .set_output_dir(output_dir_path)
+    .source_transform(_get_source_transforms(args.model, dtype_override, args))
+
+    # Override dtype of the model as specified by the user args.
+    if dtype_override:
+        assert isinstance(
+            dtype_override, DType
+        ), "Override dtype needs to be of type <DType>"
+        torch_dtype = dtype_override.to_torch_dtype()
+        logging.info(f"model.to {torch_dtype}")
+        edge_manager.model = edge_manager.model.to(dtype=torch_dtype)
+
+    return edge_manager
 
 
 def get_quantizer_and_quant_params(args):
```
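In short, the override string is now always resolved to a `DType` up front (fp16 is forced for the 8da4w modes, as in the hunk above) and the cast is applied to the already-loaded model, instead of passing an optional `dtype_override` down into `_load_llama_model`. A minimal sketch of that resolve-then-cast flow, using a plain string-to-dtype mapping in place of the real `DType` enum; the `_DTYPE_MAP` and `resolve_and_apply_dtype` names are illustrative, not part of the codebase:

```python
import logging
import torch

# Illustrative stand-in for the DType helper used in export_llama_lib.py.
_DTYPE_MAP = {
    "fp32": torch.float32,
    "fp16": torch.float16,
    "bf16": torch.bfloat16,
}


def resolve_and_apply_dtype(
    model: torch.nn.Module, dtype_override: str, quantization_mode: str | None = None
) -> torch.nn.Module:
    """Mirror the new flow: resolve the dtype string first, then cast the loaded model."""
    if quantization_mode in ["8da4w", "8da4w-gptq"]:
        dtype_override = "fp16"  # forced to fp16 for these quant modes, as in the diff above
    torch_dtype = _DTYPE_MAP[dtype_override]
    logging.info(f"model.to {torch_dtype}")
    return model.to(dtype=torch_dtype)
```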
```diff
@@ -971,38 +978,12 @@ def _load_llama_model(
             args=args,
         )
     )
-    if dtype_override:
-        assert isinstance(
-            dtype_override, DType
-        ), "Override dtype needs to be of type <DType>"
-        torch_dtype = dtype_override.to_torch_dtype()
-        logging.info(f"model.to {torch_dtype}")
-        model = model.to(dtype=torch_dtype)
-        dtype = dtype_override
-    else:
-        state_dict = model.state_dict()
-        dtype = state_dict[next(iter(state_dict))].dtype
-        assert dtype in [
-            torch.bfloat16,
-            torch.float16,
-            torch.float32,
-        ], f"Only support bfloat16, fp16 or fp32 got {dtype}"
-    logging.info(f"Loaded model with dtype={dtype}")
-
-    if dtype == torch.bfloat16:
-        dtype = DType.bf16
-    elif dtype == torch.float16:
-        dtype = DType.fp16
-    elif dtype == torch.float32:
-        dtype = DType.fp32
-    else:
-        raise ValueError(f"Unsupported dtype {dtype}")
 
     return LLMEdgeManager(
         model=model,
         modelname=modelname,
         max_seq_len=model.max_seq_len,
-        dtype=dtype,
+        dtype=dtype_override,
         use_kv_cache=use_kv_cache,
         generate_full_logits=generate_full_logits,
         example_inputs=example_inputs,
```
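With this change `_load_llama_model` no longer guesses the dtype: the caller passes the resolved `dtype_override` straight into `LLMEdgeManager`. For context, the deleted fallback peeked at the checkpoint to discover its floating-point dtype; a self-contained sketch of that inference (the helper name is illustrative):

```python
import torch


def infer_checkpoint_dtype(model: torch.nn.Module) -> torch.dtype:
    """Roughly the deleted fallback: read the dtype of the first state-dict entry
    and insist on a supported floating-point type."""
    state_dict = model.state_dict()
    dtype = state_dict[next(iter(state_dict))].dtype
    assert dtype in [
        torch.bfloat16,
        torch.float16,
        torch.float32,
    ], f"Only support bfloat16, fp16 or fp32 got {dtype}"
    return dtype


model = torch.nn.Linear(4, 4, dtype=torch.bfloat16)
print(infer_checkpoint_dtype(model))  # torch.bfloat16
```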

examples/models/llama/model.py

Lines changed: 1 addition & 64 deletions
```diff
@@ -171,70 +171,7 @@ def __init__(self, **kwargs):
         # Within the device="meta" context, tensors that are created do not carry data.
         # They possess all other metadata a tensor carries such as size, stride, requires_grad.
         with torch.device("meta"):
-            self.model_ = Transformer(model_args)
-
-        if "int8" in str(checkpoint_path):
-            print("Using int8 weight-only quantization!")
-            # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.examples.models.source_transformation.quantize`
-            from ..source_transformation.quantize import WeightOnlyInt8QuantHandler
-
-            simple_quantizer = WeightOnlyInt8QuantHandler(self.model_)
-            self.model_ = simple_quantizer.convert_for_runtime()
-        elif "8da4w" in str(checkpoint_path):
-            print("Using int4 weight and int8 dynamic activation quantization!")
-            from torchao.quantization.quant_api import Int8DynActInt4WeightQuantizer
-
-            self.model_ = Int8DynActInt4WeightQuantizer()._convert_for_runtime(
-                self.model_
-            )
-        elif hasattr(self.args, "use_spin_quant") and self.args.use_spin_quant:
-            print("Using SPIN quantization.")
-            self._transform_for_pre_quantization(checkpoint, model_args)
-
-            from .source_transformation.pre_quantization import (
-                sanitize_checkpoint_from_pre_quantization,
-            )
-
-            sanitize_checkpoint_from_pre_quantization(checkpoint)
-        elif hasattr(self.args, "use_qat") and self.args.use_qat:
-            print("Using QAT quantization.")
-            self._transform_for_pre_quantization(checkpoint, model_args)
-            if hasattr(self.args, "use_lora") and self.args.use_lora:
-                assert model_args.lora_args["rank"] == self.args.use_lora
-                from .source_transformation.lora import (
-                    transform_linear_for_lora_after_quantization,
-                )
-
-                self.model_ = transform_linear_for_lora_after_quantization(
-                    self.model_,
-                    checkpoint,
-                    self.args.use_lora,
-                )
-
-            from .source_transformation.pre_quantization import (
-                sanitize_checkpoint_from_pre_quantization,
-            )
-
-            sanitize_checkpoint_from_pre_quantization(checkpoint)
-
-        if hasattr(self.args, "use_attention_sink") and self.args.use_attention_sink:
-            from .source_transformation.attention_sink import enable_attention_sink
-
-            attention_sink_params = self.args.use_attention_sink.split(",")
-            assert len(attention_sink_params) == 3
-            sink_size = int(attention_sink_params[0])
-            window_size = int(attention_sink_params[1])
-            eviction_batch_size = int(attention_sink_params[2])
-
-            assert self.args.max_context_length == sink_size + window_size
-
-            self.model_ = enable_attention_sink(
-                module=self.model_,
-                params=model_args,
-                sink_size=sink_size,
-                window_size=window_size,
-                eviction_batch_size=eviction_batch_size,
-            )
+            self.model_ = Transformer(model_args).to(dtype=self.dtype)
 
         # assign=True: load params/buffers by assignment instead of performing an in-place copy.
         # Because we are using device="meta", tensors do not have memory associated with them
```
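The constructor now only builds the `Transformer` on the meta device and casts it to `self.dtype`; the surrounding comments refer to the `assign=True` loading step that attaches real storage afterwards. A small, self-contained sketch of that meta-device pattern, assuming a toy module and a synthetic checkpoint rather than the actual llama `Transformer`:

```python
import torch
import torch.nn as nn


class TinyModel(nn.Module):
    """Toy stand-in for the Transformer; only the loading pattern matters here."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 4)

    def forward(self, x):
        return self.linear(x)


# Under device="meta", parameters carry shape/dtype metadata but no data.
with torch.device("meta"):
    model = TinyModel().to(dtype=torch.bfloat16)

# Synthetic checkpoint with real (CPU) storage and matching shapes/dtypes.
checkpoint = {k: torch.zeros_like(v, device="cpu") for k, v in model.state_dict().items()}

# assign=True swaps the meta tensors for the checkpoint tensors instead of copying
# into them; an in-place copy would fail because meta tensors have no storage.
model.load_state_dict(checkpoint, assign=True)
print(next(model.parameters()).device, next(model.parameters()).dtype)  # cpu torch.bfloat16
```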

examples/models/llama/source_transformation/quantize.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -611,7 +611,7 @@ def create_quantized_state_dict(self, packed=False) -> Dict:
                     f"quantize {fqn, mod} with group_size {self.group_size}, bitwidth {self.bitwidth}"
                 )
                 weight, scales, _ = dynamically_quantize_per_channel(
-                    mod.weight.float(),
+                    mod.weight,
                     range_min,
                     range_max,
                     torch.int8,
```
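The only change here is dropping the `.float()` upcast, so `dynamically_quantize_per_channel` now sees the weight in whatever dtype the model was cast to earlier (e.g. fp16 or bf16). For intuition, a rough sketch of symmetric per-channel int8 quantization; this is illustrative only, not the actual `dynamically_quantize_per_channel` implementation:

```python
import torch


def per_channel_int8_sketch(weight: torch.Tensor, qmin: int = -128, qmax: int = 127):
    """Symmetric per-channel int8 quantization: one scale per output channel (dim 0),
    computed in the weight's own dtype."""
    max_abs = weight.abs().amax(dim=1, keepdim=True)  # per-row dynamic range
    scales = (max_abs / qmax).clamp(min=1e-5)         # guard tiny/zero rows
    q = torch.clamp(torch.round(weight / scales), qmin, qmax).to(torch.int8)
    return q, scales.squeeze(1)


w = torch.randn(8, 16, dtype=torch.float16)
q, scales = per_channel_int8_sketch(w)
print(q.dtype, scales.dtype)  # torch.int8 torch.float16
```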
