refactor: configure logger at module level and introduce normalize_name in utils.metrics module

Shayan-Ghani · Shayan-Ghani · commit 03dd66d042a0 · 2025-05-31T17:24:38.000+03:30
diff --git a/container_exporter.py b/container_exporter.py
@@ -6,8 +6,8 @@
 from fastapi import FastAPI
 from fastapi.responses import PlainTextResponse
 from contextlib import asynccontextmanager
-from utils.metrics import PromMetric, prune_stale_metrics, flush_metric_labels
-from logging import basicConfig, error, ERROR
+from utils.metrics import PromMetric, prune_stale_metrics, normalize_name
+import logging
 from settings.settings import settings
 
 docker_client: Docker
@@ -23,7 +23,16 @@ async def lifespan(app: FastAPI):
 
 app = FastAPI(lifespan=lifespan)
 
-gauge_container_status = Gauge('cxp_container_status', 'Docker container status (0 = not running, 1 = running, 2 = restarting/unhealthy)', ['container_name'])
+logging.basicConfig(
+    level=logging.ERROR,  # only ERROR and above are emitted
+    format='%(asctime)s %(levelname)s %(message)s',  # real level name, so CRITICAL is not mislabelled as ERROR
+    datefmt='%Y-%m-%d %H:%M:%S',
+)
+# Module-level logger for this file; configured once at import instead of inside the error handler.
+logger = logging.getLogger(__name__)
+
+
+gauge_container_status = Gauge('cxp_container_status', 'Docker container status (0 = not running, 1 = running, 2 = restarting/unhealthy/paused)', ['container_name'])
 gauge_cpu_percentage = Gauge('cxp_cpu_percentage', 'Docker container CPU usage', ['container_name'])
 gauge_memory_percentage = Gauge('cxp_memory_percentage', 'Docker container memory usage in percent', ['container_name'])
 gauge_memory_bytes = Gauge('cxp_memory_bytes_total', 'Docker container memory usage in bytes', ['container_name'])
@@ -33,24 +42,26 @@ async def lifespan(app: FastAPI):
 counter_net_rx = Gauge("cxp_network_rx_bytes_total", "Total bytes received over network", ['container_name'])
 counter_net_tx = Gauge("cxp_network_tx_bytes_total", "Total bytes sent over network", ['container_name'])
 
-
-metrics_to_clear: list[PromMetric] = [gauge_cpu_percentage, gauge_memory_percentage, gauge_memory_bytes, counter_disk_read, counter_disk_write, counter_net_rx, counter_net_tx]
-
-
-
 async def get_containers(all=False) -> list[DockerContainer]:
     return await docker_client.containers.list(all=all)
 
 def update_container_status(running_containers:list[DockerContainer]):
     for c in running_containers:
-        gauge_container_status.labels(container_name=c._container.get("Names")[0][1:]).set(1 if c._container.get('State') == 'running' else 2)
+        # Publish 1 when the container's State is "running" (case-insensitive), otherwise 2.
+        details = c._container
+        label = normalize_name(details.get("Names", []), details.get("Id", ""))
+        is_running = details.get("State", "").lower() == "running"
+        gauge_container_status.labels(container_name=label).set(
+            1 if is_running else 2
+        )
 
 # Async metrics gathering
 async def container_stats( running_containers: list[DockerContainer]):
     all_stats = await stat.get_containers_stats(running_containers)
     
     for stats in all_stats:
-        name = stats[0]['name'][1:]
+        name = stats[0].get('name', stats[0].get('id', 'Unkown').lstrip("/")).lstrip("/")
+        
         gauge_cpu_percentage.labels(container_name=name).set(stat.calculate_cpu_percentage(stats[0]))
         gauge_memory_percentage.labels(container_name=name).set(stat.calculate_memory_percentage(stats[0]))
         gauge_memory_bytes.labels(container_name=name).set(stat.calculate_memory_bytes(stats[0]))
@@ -69,29 +80,39 @@ async def container_stats( running_containers: list[DockerContainer]):
 ]
 
 # Metrics we want to always keep, and set to 0 instead
-persistent_metrics: list[PromMetric] = [gauge_container_status]
+persistent_metrics: list[Gauge] = [gauge_container_status]
 
 
 @app.get("/")
 def root():
     return {"message": "Welcome to CXP, Container Exporter for Prometheus."}
 
+@app.get("/healthz")
+async def healthz():
+    # Health probe: report OK only if a simple, cheap call to Docker (list at most one container) succeeds.
+    try:
+        await docker_client.containers.list(limit=1)
+        return PlainTextResponse("OK", status_code=200)
+    except Exception:  # not a bare except: CancelledError, KeyboardInterrupt and SystemExit must propagate
+        return PlainTextResponse("NOT OK", status_code=500)
+    
 @app.get("/metrics")
 async def metrics():
     try:
         running_containers = await get_containers()
         update_container_status(running_containers)
-        prune_stale_metrics([c._container.get("Names")[0][1:] for c in running_containers], prunable_metrics, persistent_metrics)
+        # Label values of the containers just listed; passed to prune_stale_metrics as the active names.
+        c_names = [
+            normalize_name(c._container.get("Names", []), c._container.get("Id", ""))
+            for c in running_containers
+        ]
+        prune_stale_metrics(c_names, prunable_metrics, persistent_metrics)
+        # Update the per-container stat gauges for the listed containers.
         await container_stats(running_containers)
         return PlainTextResponse(
             content=generate_latest(),
             media_type=CONTENT_TYPE_LATEST 
         )
     except Exception as e:
-        basicConfig(    
-            level=ERROR,
-            format='%(asctime)s ERROR %(message)s',
-            datefmt='%Y-%m-%d %H:%M:%S'
-        )
-        error(str(e))
+        logger.error("Error running metrics collection: %s", e, exc_info=settings.CONTAINER_EXPORTER_DEBUG)  # traceback only when the debug setting is truthy
         return PlainTextResponse(f"Error running metrics collection: {str(e)}", status_code=500)
diff --git a/stats/get_docker_stats.py b/stats/get_docker_stats.py
@@ -33,7 +33,7 @@ def calculate_memory_percentage(stats: dict) -> float:
     return (usage / limit) * 100.0
 
 
-def calculate_memory_bytes(stats) -> bytes:
+def calculate_memory_bytes(stats) -> float:
     mem_stats = stats.get('memory_stats', {}) or {}
     memory_usage_bytes = mem_stats.get('usage')
     
diff --git a/utils/metrics.py b/utils/metrics.py
@@ -4,7 +4,7 @@
 from settings.settings import settings
 PromMetric = Union[Gauge, Counter]
 
-def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[PromMetric], persistent_metrics : list[PromMetric]):
+def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[PromMetric], persistent_metrics : list[Gauge]):
     """
     Removes time series for inactive containers from selected metrics
     while preserving container status metrics by setting them to 0.
@@ -30,9 +30,11 @@ def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[Prom
             if name not in active_set:
                 metric.labels(container_name=name).set(0)
 
-
-def flush_metric_labels(containers:list[DockerContainer], metrics_to_clear: list[PromMetric]):
-    for container in containers:
-        if container._container.get("State") != "running":
-            for metric in metrics_to_clear:
-                metric.labels(container_name=container._container.get("Names")[0][1:]).set(0)
+def normalize_name(raw_names: list[str], fallback_id: str) -> str:
+    """
+    Return the first entry of Docker's 'Names' array (e.g. ['/my-container']) with leading '/' stripped.
+    If no usable name exists (missing, empty, not a list, or only '/'), return the first 12 chars of fallback_id ('' if None).
+    """
+    first = raw_names[0] if isinstance(raw_names, list) and raw_names else ""
+    name = first.lstrip("/") if isinstance(first, str) else ""
+    return name or (fallback_id or "")[:12]