
Commit 49ed26d

Fix bug
1 parent 4a96bbc commit 49ed26d

File tree

2 files changed: +10 -5 lines changed


examples/models/llama/export_llama_lib.py

Lines changed: 7 additions & 3 deletions
@@ -612,6 +612,7 @@ def _prepare_for_llama_export(args) -> LLMEdgeManager:
     edge_manager = edge_manager.set_output_dir(output_dir_path).source_transform(
         _get_source_transforms(
             args.model,
+            dtype_override,
             DType.from_torch_dtype(edge_manager.model.checkpoint_dtype),
             args,
         )
@@ -1040,7 +1041,10 @@ def _load_llama_model(


 def _get_source_transforms( # noqa
-    modelname: str, dtype_override: Optional[DType], args
+    modelname: str,
+    dtype_override: DType,
+    checkpoint_dtype: Optional[DType],
+    args,
 ) -> List[Callable[[torch.nn.Module], torch.nn.Module]]:
     transforms = []

@@ -1074,7 +1078,7 @@ def _get_source_transforms( # noqa
         """
         modelname = f"{modelname}_q"
         transforms.append(
-            get_quant_weight_transform(args, dtype_override, verbose_export())
+            get_quant_weight_transform(args, checkpoint_dtype, verbose_export())
         )

     if args.embedding_quantize:
@@ -1088,7 +1092,7 @@ def _get_source_transforms( # noqa
         this wil be a no-op.
         """
         modelname = f"{modelname}_e"
-        transforms.append(get_quant_embedding_transform(args, dtype_override))
+        transforms.append(get_quant_embedding_transform(args, checkpoint_dtype))

     if args.quantization_mode or args.embedding_quantize:
         transforms.append(
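
For context, a standalone sketch of the distinction this change relies on: the dtype the checkpoint was saved in is derived from the loaded model and passed through separately from the export-time dtype override, and the quantization transforms now key off the checkpoint dtype. The toy module and values below are illustrative only, not the real Llama export path.

import torch

# Hypothetical toy module standing in for the loaded Llama checkpoint.
model = torch.nn.Linear(8, 8).to(torch.float16)

# The dtype the checkpoint weights are stored in (fp16 here) ...
checkpoint_dtype = next(model.parameters()).dtype
# ... is now threaded through _get_source_transforms separately from the
# dtype the exported model should run in.
dtype_override = torch.float32

# Quantization transforms receive checkpoint_dtype, while the export
# itself still honors the override:
model = model.to(dtype_override)
print(f"checkpoint dtype: {checkpoint_dtype}, export dtype: {dtype_override}")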

examples/models/llama/source_transformation/quantize.py

Lines changed: 3 additions & 2 deletions
@@ -859,12 +859,13 @@ def _set_quantized_computation_dtype_rec(
         if isinstance(child, Int8DynActInt4WeightLinear):
             # Change the precision attribute to 'fp32'
             child.precision = dtype
-            print(f"Changed precision of {name} to {dtype}")
+            logging.info(f"Changed precision of {name} to {dtype}")
         elif isinstance(child, QuantizedGroupEmbedding):
             child.dtype = dtype
-            print(f"Changed precision of {name} to {dtype}")
+            logging.info(f"Changed precision of {name} to {dtype}")
         elif isinstance(child, WeightOnlyInt8Linear):
             child.dtype = dtype
+            logging.info(f"Changed precision of {name} to {dtype}")
         else:
             # Recursively apply to child modules
             _set_quantized_computation_dtype_rec(child, dtype)
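
One behavioral note on moving from print to logging.info, shown with a minimal standalone snippet (the module name and dtype below are made-up values): the root logger only emits WARNING and above by default, so whoever drives the export needs logging configured for these messages to appear where the old print output did.

import logging

# At the default WARNING level, logging.info() is silent; configure the
# root logger to actually see the precision-change messages.
logging.basicConfig(level=logging.INFO)

name, dtype = "layers.0.attention.wq", "torch.float32"  # illustrative only
logging.info(f"Changed precision of {name} to {dtype}")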
