Commit da089e9

enable model python files saving (#802)
1 parent ab55a97 commit da089e9

Showing 10 changed files with 97 additions and 6 deletions.

auto_round/autoround.py

Lines changed: 7 additions & 1 deletion
@@ -58,6 +58,7 @@
     convert_dtype_str2torch,
     convert_fp8_layer_to_linear,
     convert_fp8_model_to_16b_model,
+    copy_python_files_from_model_cache,
     detect_device,
     estimate_tuning_block_mem,
     find_matching_blocks,
@@ -850,7 +851,8 @@ def remove_duplicates(lst):
         elif format == "llm_compressor":
             from auto_round.export.export_to_llmcompressor import check_compressed_tensors_supported
 
-            if check_compressed_tensors_supported() and (is_nv_fp(self.data_type) or is_mx_fp(self.data_type)):
+            if is_nv_fp(self.data_type) or is_mx_fp(self.data_type):
+                check_compressed_tensors_supported()
                 format = format.replace("llm_compressor", f"llm_compressor:{self.data_type}")
                 formats[index] = format
         elif not is_wfp8afp8(self):
@@ -3036,6 +3038,10 @@ def save_quantized(
             processor = kwargs.get("processor", None)
             if processor is not None:
                 processor.save_pretrained(output_dir)
+            try:
+                copy_python_files_from_model_cache(self.model, output_dir)
+            except Exception as e:
+                logger.warning("Skipping source model Python file copy due to error: %s", e)
             return
         if self.act_bits <= 8 and format == "qdq":
             logger.warning(
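
For context, a minimal end-to-end sketch of what this enables, assuming the standard AutoRound quantization flow and a hypothetical remote-code checkpoint name: after quantization, save_quantized now also tries to copy the source checkpoint's Python files next to the exported weights, and merely warns if the copy fails.

# a sketch, not part of the diff: quantize and save a trust_remote_code model
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round import AutoRound

model_name = "some-org/remote-code-model"  # hypothetical placeholder
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

autoround = AutoRound(model, tokenizer, bits=4, group_size=128)
autoround.quantize()
# custom modeling_*.py / tokenization_*.py files are copied into the output dir
autoround.save_quantized("./tmp_autoround", format="auto_round")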

auto_round/export/export_to_autogptq/export.py

Lines changed: 6 additions & 0 deletions
@@ -50,6 +50,7 @@
 from auto_round.utils import (
     SUPPORTED_LAYER_TYPES,
     check_to_quantized,
+    copy_python_files_from_model_cache,
     filter_quantization_config,
     get_autogptq_packing_qlinear,
     get_block_names,
@@ -259,3 +260,8 @@ def save(
     if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
         with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f:
             json.dump(model.config.quantization_config, f, indent=2)
+
+    try:
+        copy_python_files_from_model_cache(model, save_dir)
+    except Exception as e:
+        logger.warning("Skipping source model Python file copy due to error: %s", e)

auto_round/export/export_to_autoround/export.py

Lines changed: 6 additions & 0 deletions
@@ -31,6 +31,7 @@
     SUPPORTED_LAYER_TYPES,
     check_start_with_block_name,
     check_to_quantized,
+    copy_python_files_from_model_cache,
     filter_quantization_config,
     get_autogptq_packing_qlinear,
     get_module,
@@ -399,3 +400,8 @@ def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_seri
     if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
         with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f:
             json.dump(model.config.quantization_config, f, indent=2)
+
+    try:
+        copy_python_files_from_model_cache(model, save_dir)
+    except Exception as e:
+        logger.warning("Skipping source model Python file copy due to error: %s", e)

auto_round/export/export_to_autoround/export_to_fp8.py

Lines changed: 6 additions & 0 deletions
@@ -29,6 +29,7 @@
     _get_packing_device,
     check_start_with_block_name,
     check_to_quantized,
+    copy_python_files_from_model_cache,
     filter_quantization_config,
     get_module,
     logger,
@@ -270,3 +271,8 @@ def save(
     if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
         with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f:
             json.dump(model.config.quantization_config, f, indent=2)
+
+    try:
+        copy_python_files_from_model_cache(model, save_dir)
+    except Exception as e:
+        logger.warning("Skipping source model Python file copy due to error: %s", e)

auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py

Lines changed: 6 additions & 0 deletions
@@ -30,6 +30,7 @@
     _get_packing_device,
     check_start_with_block_name,
     check_to_quantized,
+    copy_python_files_from_model_cache,
     filter_quantization_config,
     get_module,
     is_mx_fp,
@@ -282,3 +283,8 @@ def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_seri
     if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
         with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f:
             json.dump(model.config.quantization_config, f, indent=2)
+
+    try:
+        copy_python_files_from_model_cache(model, save_dir)
+    except Exception as e:
+        logger.warning("Skipping source model Python file copy due to error: %s", e)

auto_round/export/export_to_awq/export.py

Lines changed: 6 additions & 0 deletions
@@ -34,6 +34,7 @@
 from auto_round.utils import (
     SUPPORTED_LAYER_TYPES,
     check_to_quantized,
+    copy_python_files_from_model_cache,
     extract_block_names_to_str,
     filter_quantization_config,
     get_block_names,
@@ -197,3 +198,8 @@ def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_seri
     if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
         with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f:
             json.dump(model.config.quantization_config, f, indent=2)
+
+    try:
+        copy_python_files_from_model_cache(model, save_dir)
+    except Exception as e:
+        logger.warning("Skipping source model Python file copy due to error: %s", e)

auto_round/export/export_to_llmcompressor/config.py

Lines changed: 2 additions & 2 deletions
@@ -76,10 +76,10 @@ def check_compressed_tensors_supported():  # pragma: no cover
 
         return True
     except ImportError:
-        logger.warning(
+        logger.error(
             "Please install compressed-tensors via 'pip install compressed-tensors'" " to save as llm-compressor format"
         )
-        return False
+        exit(-1)
 
 
 if check_compressed_tensors_supported():
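
A behavior sketch of the fail-fast path, based on the diff above: the helper no longer returns False for callers to branch on, which is why the caller in autoround.py now invokes it purely for its side effect.

# sketch: with compressed-tensors missing, importing the config module (which
# runs the check at import time) or calling the check directly logs an error
# and terminates via exit(-1) instead of returning False
from auto_round.export.export_to_llmcompressor.config import check_compressed_tensors_supported

check_compressed_tensors_supported()  # True if importable; otherwise the process exits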

auto_round/export/export_to_llmcompressor/export.py

Lines changed: 15 additions & 1 deletion
@@ -17,7 +17,16 @@
 import torch
 
 from auto_round.export.export_to_llmcompressor.config import quantization_config
-from auto_round.utils import detect_device, get_module, is_mx_fp, is_nv_fp, is_standard_fp, logger, set_module
+from auto_round.utils import (
+    copy_python_files_from_model_cache,
+    detect_device,
+    get_module,
+    is_mx_fp,
+    is_nv_fp,
+    is_standard_fp,
+    logger,
+    set_module,
+)
 from auto_round.wrapper import WrapperWALayer
 
 from .export_to_fp import save_quantized_as_fp
@@ -111,3 +120,8 @@ def save_quantized_as_llmcompressor(output_dir, **kwargs):
     if hasattr(model, "generation_config"):
         setattr(model.generation_config, "do_sample", True)
     model.save_pretrained(output_dir, safe_serialization=safe_serialization)
+
+    try:
+        copy_python_files_from_model_cache(model, output_dir)
+    except Exception as e:
+        logger.warning("Skipping source model Python file copy due to error: %s", e)

auto_round/export/export_to_llmcompressor/export_to_fp.py

Lines changed: 6 additions & 0 deletions
@@ -29,6 +29,7 @@
     SUPPORTED_LAYER_TYPES,
     check_start_with_block_name,
     check_to_quantized,
+    copy_python_files_from_model_cache,
     filter_quantization_config,
     get_block_names,
     get_module,
@@ -274,3 +275,8 @@ def save(model: nn.Module, save_dir: str, max_shard_size: str = "5GB", safe_seri
     if hasattr(model, "config") and hasattr(model.config, "quantization_config"):
         with open(os.path.join(save_dir, config_file), "w", encoding="utf-8") as f:
             json.dump(model.config.quantization_config, f, indent=2)
+
+    try:
+        copy_python_files_from_model_cache(model, save_dir)
+    except Exception as e:
+        logger.warning("Skipping source model Python file copy due to error: %s", e)

auto_round/utils.py

Lines changed: 37 additions & 2 deletions
@@ -2550,8 +2550,11 @@ def is_nv_fp(backend):
 
 
 def is_wfp8afp8(ar):
-    if ("fp8" in ar.act_data_type or ("fp" in ar.act_data_type and ar.act_bits == 8)) and (
-        "fp8" in ar.data_type or ("fp" in ar.data_type and ar.bits == 8)
+    if (
+        ("fp8" in ar.act_data_type or ("fp" in ar.act_data_type and ar.act_bits == 8))
+        and ("fp8" in ar.data_type or ("fp" in ar.data_type and ar.bits == 8))
+        and is_standard_fp(ar.act_data_type)
+        and is_standard_fp(ar.data_type)
     ):
         return True
     else:
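
The new is_standard_fp guards matter for the MX/NV float formats, whose data-type strings would otherwise satisfy the plain "fp"/bits == 8 test. An illustrative check, assuming "fp8" and "mx_fp" as representative data-type strings and using stand-in objects that carry only the four attributes is_wfp8afp8 reads:

from types import SimpleNamespace

from auto_round.utils import is_wfp8afp8

# hypothetical stand-ins for an AutoRound instance
plain_fp8 = SimpleNamespace(data_type="fp8", bits=8, act_data_type="fp8", act_bits=8)
mx_fp8 = SimpleNamespace(data_type="mx_fp", bits=8, act_data_type="mx_fp", act_bits=8)

print(is_wfp8afp8(plain_fp8))  # True: standard fp8 weights and activations
print(is_wfp8afp8(mx_fp8))     # False: is_standard_fp now excludes mx_fp types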
@@ -2677,3 +2680,35 @@ def _get_packing_device(device: str | torch.device | None = "auto") -> torch.dev
             raise ValueError(f"Invalid device string: {device}") from e
 
     raise TypeError(f"Unsupported device type: {type(device)} ({device})")
+
+
+# Adapted from https://github.com/vllm-project/llm-compressor/blob/
+# 5b3ddff74cae9651f24bef15d3255c4ee053fc60/src/llmcompressor/pytorch/model_load/helpers.py#L144
+def copy_python_files_from_model_cache(model, save_path: str):
+    config = model.config
+    cache_path = None
+    if hasattr(config, "_name_or_path"):
+        import os
+        import shutil
+
+        from huggingface_hub import hf_hub_download
+        from transformers import TRANSFORMERS_CACHE
+        from transformers.utils import http_user_agent
+
+        cache_path = config._name_or_path
+        if not os.path.exists(cache_path):
+            user_agent = http_user_agent()
+            config_file_path = hf_hub_download(
+                repo_id=cache_path,
+                filename="config.json",
+                cache_dir=TRANSFORMERS_CACHE,
+                force_download=False,
+                user_agent=user_agent,
+            )
+            cache_path = os.path.sep.join(config_file_path.split(os.path.sep)[:-1])
+
+        for file in os.listdir(cache_path):
+            full_file_name = os.path.join(cache_path, file)
+            if file.endswith(".py") and os.path.isfile(full_file_name):
+                logger.debug(f"Transferring {full_file_name} to {save_path}")
+                shutil.copy(full_file_name, save_path)
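
The helper resolves the source checkpoint directory from config._name_or_path, falling back to locating the Hugging Face cache entry via hf_hub_download on config.json, then copies every top-level *.py file. A minimal usage sketch with a placeholder repo id:

from transformers import AutoModelForCausalLM

from auto_round.utils import copy_python_files_from_model_cache

# hypothetical placeholder; any checkpoint shipping custom modeling/tokenization
# .py files (trust_remote_code) behaves the same way
model = AutoModelForCausalLM.from_pretrained("some-org/remote-code-model", trust_remote_code=True)
copy_python_files_from_model_cache(model, "./quantized-model")  # copies the cached *.py files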
