Add cache_config info to Prometheus metrics. (#3100)

AllenDou · web-flow · commit 9289e577ec18 · 2024-02-29T06:15:18.000Z
diff --git a/vllm/config.py b/vllm/config.py
@@ -308,6 +308,10 @@ def __init__(
         self.num_gpu_blocks = None
         self.num_cpu_blocks = None
 
+    def metrics_info(self):
+        # Stringify every instance attribute into a dict of str -> str for the prometheus Info metric.
+        return {key: str(value) for key, value in self.__dict__.items()}
+
     def _verify_args(self) -> None:
         if self.gpu_memory_utilization > 1.0:
             raise ValueError(
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
@@ -138,6 +138,7 @@ def __init__(
             self.stat_logger = StatLogger(
                 local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
                 labels=dict(model_name=model_config.model))
+            self.stat_logger.info("cache_config", self.cache_config)
 
         self.forward_dag = None
         if USE_RAY_COMPILED_DAG:
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
@@ -1,5 +1,5 @@
 from vllm.logger import init_logger
-from prometheus_client import Counter, Gauge, Histogram, REGISTRY, disable_created_metrics
+from prometheus_client import Counter, Gauge, Histogram, Info, REGISTRY, disable_created_metrics
 
 import time
 import numpy as np
@@ -23,6 +23,10 @@ def __init__(self, labelnames: List[str]):
             if hasattr(collector, "_name") and "vllm" in collector._name:
                 REGISTRY.unregister(collector)
 
+        self.info_cache_config = Info(
+            name='vllm:cache_config',
+            documentation='information of cache_config')
+
         # System stats
         self.gauge_scheduler_running = Gauge(
             name="vllm:num_requests_running",
@@ -128,6 +132,10 @@ def __init__(self, local_interval: float, labels: Dict[str, str]) -> None:
         self.labels = labels
         self.metrics = Metrics(labelnames=list(labels.keys()))
 
+    def info(self, type: str, obj: object) -> None:  # NOTE: `type` shadows the builtin inside this method
+        if type == "cache_config":  # the only handled type; any other value is ignored without error
+            self.metrics.info_cache_config.info(obj.metrics_info())  # obj must provide metrics_info()
+
     def _get_throughput(self, tracked_stats: List[int], now: float) -> float:
         return float(np.sum(tracked_stats) / (now - self.last_local_log))