     is_fp8_model,
     is_hpex_available,
     llm_load_model,
+    memory_monitor,
     mv_module_from_gpu,
     normalize_input,
     set_amax_for_all_moe_layers,
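
Note: the hunks below call memory_monitor.update(), memory_monitor.log_summary(), and memory_monitor.get_summary(). The real object is imported from the library's own utils; what follows is only a minimal sketch of the interface these calls appear to assume (names, units, and behavior are illustrative, not the project's implementation):

    # Hypothetical sketch of the memory_monitor interface assumed by this diff;
    # the real object is imported from the library's utils and may differ.
    import torch

    class _MemoryMonitor:
        """Tracks peak accelerator memory across quantization steps."""

        def __init__(self) -> None:
            self.peak_gpu_gb = 0.0

        def update(self) -> None:
            # Record the current CUDA peak; a fuller version would also
            # sample host RSS (e.g., via psutil) and other accelerators.
            if torch.cuda.is_available():
                peak = torch.cuda.max_memory_allocated() / 1024**3
                self.peak_gpu_gb = max(self.peak_gpu_gb, peak)

        def get_summary(self) -> str:
            self.update()
            return f"peak gpu memory: {self.peak_gpu_gb:.2f} GiB"

        def log_summary(self) -> None:
            print(self.get_summary())

    memory_monitor = _MemoryMonitor()
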
@@ -1025,6 +1026,7 @@ def quantize_and_save(
             self.save_quantized(save_folder, format=format, inplace=inplace, **kwargs)

             folders.append(save_folder)
+        memory_monitor.log_summary()

         return model, folders

@@ -1513,6 +1515,7 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str])
                     all_to_quantized_module_names.remove(m.tmp_name)
             if not self.immediate_saving:
                 mv_module_from_gpu(block)
+            memory_monitor.log_summary()
             pbar.update(1)

         pbar.close()
@@ -1752,6 +1755,8 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None:
                 layer.cpu()
                 layer_names.remove(layer_name)
         if len(layer_names) == 0:
+            memory_monitor.update()
+            memory_monitor.log_summary()
             return
         q_layer_inputs = None
         enable_quanted_input = self.enable_quanted_input
@@ -1770,7 +1775,7 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None:
         if hasattr(self.model, "hf_device_map") and len(self.model.hf_device_map) > 1:
             accelerate.hooks.remove_hook_from_submodules(
                 self.model
-            )  ## self.model.hf_device_map has not been changed
+            )  # self.model.hf_device_map has not been changed
         if not self.immediate_saving:
             self.model = mv_module_from_gpu(self.model)
         clear_memory(device_list=self.device_list)
@@ -1789,13 +1794,14 @@ def _quantize_layers(self, layer_names: list, layer_inputs: dict) -> None:
                 immediate_saving(self, m, name=layer_name, last_group=True)
         del layer_input
         clear_memory(q_layer_input, device_list=self.device_list)
+        memory_monitor.log_summary()

     @torch.no_grad()
     def _get_block_outputs(
         self,
         block: torch.nn.Module,
-        input_ids: torch.Tensor,
-        input_others: torch.Tensor,
+        input_ids: torch.Tensor | list[torch.Tensor],
+        input_others: torch.Tensor | dict,
         bs: int,
         device: Union[str, torch.device],
         cache_device: Union[str, torch.device],
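
The widened annotations above reflect that block inputs may arrive either as one stacked tensor or as a list of per-sample tensors (and input_others as a dict of shared kwargs). A hedged sketch of the kind of batch slicing this implies; the helper name is illustrative, not the library's code:

    # Illustrative only: slicing size-bs batches from either accepted form.
    import torch

    def iter_batches(input_ids: torch.Tensor | list[torch.Tensor], bs: int):
        """Yield size-bs slices from a stacked tensor or a list of tensors."""
        # Both forms support identical slicing; only the length lookup differs.
        n = input_ids.shape[0] if isinstance(input_ids, torch.Tensor) else len(input_ids)
        for start in range(0, n, bs):
            yield input_ids[start : start + bs]
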
@@ -2805,7 +2811,7 @@ def _quantize_block(
             f"quantized {len(quantized_layer_names)}/{(len(quantized_layer_names) + len(unquantized_layer_names))} "
             f"layers in the block, loss iter 0: {init_loss:.6f} -> iter {best_iter}: {last_loss:.6f}"
         )
-        logger.info(dump_info)
+
         if self.low_gpu_mem_usage:
             clear_memory(device_list=self.device_list)  # clear cached memory during training
         if len(unquantized_layer_names) != 0:
@@ -2833,6 +2839,8 @@ def _quantize_block(
                 mv_module_from_gpu(block)

             clear_memory(input_ids)
+            memory_info_summary = memory_monitor.get_summary()
+            logger.infoclean(dump_info + "," + memory_info_summary)

             return q_outputs, output
         else:
@@ -2841,6 +2849,8 @@ def _quantize_block(
             if auto_offload:
                 mv_module_from_gpu(block)
             clear_memory(input_ids)
+            memory_info_summary = memory_monitor.get_summary()
+            logger.infoclean(dump_info + "," + memory_info_summary)

             return None, output

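Both branches above defer the dump_info message (formerly logged right after it was built) and append the memory summary, so each block emits one combined record. logger.infoclean is a project-specific method; a minimal generic sketch of this deferred-logging pattern, with the standard logging module standing in as an assumption:

    # Generic sketch of the deferred-logging pattern used above; the standard
    # logging module stands in for the project's logger.infoclean helper.
    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("quantize")

    def finish_block(dump_info: str, memory_summary: str) -> None:
        # One combined record per block instead of two separate lines.
        logger.info(dump_info + "," + memory_summary)
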
@@ -3174,7 +3184,7 @@ def _sampling_inputs(
         cls,
         input_ids: Union[list[torch.Tensor], dict],
         input_others: dict,
-        indices: list[int],
+        indices: list[int] | torch.Tensor,
         seqlen: int,
         batch_dim: int = 0,
         share_cache_keys: tuple = (),
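
The indices widening mirrors the earlier input changes: callers may now pass a tensor of sample indices (e.g., from torch.randperm) instead of a Python list. A hedged sketch of how a caller might normalize both forms; the helper name is illustrative:

    # Illustrative normalization for the widened indices annotation above.
    import torch

    def to_index_list(indices: list[int] | torch.Tensor) -> list[int]:
        # Tensors (e.g., from torch.randperm) are converted to plain ints so
        # downstream list-based indexing works unchanged.
        if isinstance(indices, torch.Tensor):
            return [int(i) for i in indices.tolist()]
        return list(indices)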