Merge pull request #35 from Shayan-Ghani/refinement

Shayan-Ghani · web-flow · commit 0841803a8b46 · 2025-05-31T19:12:11.000+03:30
Fix!: wrong calculation for disk and network metrics
diff --git a/container_exporter.py b/container_exporter.py
@@ -6,8 +6,8 @@
 from fastapi import FastAPI
 from fastapi.responses import PlainTextResponse
 from contextlib import asynccontextmanager
-from utils.metrics import PromMetric, prune_stale_metrics, flush_metric_labels
-from logging import basicConfig, error, ERROR
+from utils.metrics import PromMetric, prune_stale_metrics, normalize_name
+import logging
 from settings.settings import settings
 
 docker_client: Docker
@@ -23,44 +23,55 @@ async def lifespan(app: FastAPI):
 
 app = FastAPI(lifespan=lifespan)
 
-gauge_container_status = Gauge('cxp_container_status', 'Docker container status (0 = not running, 1 = running, 2 = restarting/unhealthy)', ['container_name'])
-gauge_cpu_percentage = Gauge('cxp_cpu_percentage', 'Docker container CPU usage', ['container_name'])
-gauge_memory_percentage = Gauge('cxp_memory_percentage', 'Docker container memory usage in percent', ['container_name'])
-gauge_memory_bytes = Gauge('cxp_memory_bytes_total', 'Docker container memory usage in bytes', ['container_name'])
-
-counter_disk_read = Counter("cxp_disk_io_read_bytes_total", "Total bytes read from disk", ['container_name'])
-counter_disk_write = Counter("cxp_disk_io_write_bytes_total", "Total bytes written to disk", ['container_name'])
-counter_net_rx = Counter("cxp_network_rx_bytes_total", "Total bytes received over network", ['container_name'])
-counter_net_tx = Counter("cxp_network_tx_bytes_total", "Total bytes sent over network", ['container_name'])
+logging.basicConfig(
+    level=logging.ERROR,
+    format='%(asctime)s ERROR %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S',
+)
 
+logger = logging.getLogger(__name__)
 
-metrics_to_clear: list[PromMetric] = [gauge_cpu_percentage, gauge_memory_percentage, gauge_memory_bytes, counter_disk_read, counter_disk_write, counter_net_rx, counter_net_tx]
 
+gauge_container_status = Gauge('cxp_container_status', 'Docker container status (0 = not running, 1 = running, 2 = restarting/unhealthy/paused)', ['container_name'])
+gauge_cpu_percentage = Gauge('cxp_cpu_percentage', 'Docker container CPU usage', ['container_name'])
+gauge_memory_percentage = Gauge('cxp_memory_percentage', 'Docker container memory usage in percent', ['container_name'])
+gauge_memory_bytes = Gauge('cxp_memory_bytes_total', 'Docker container memory usage in bytes', ['container_name'])
 
+counter_disk_read = Gauge("cxp_disk_io_read_bytes_total", "Total bytes read from disk", ['container_name'])
+counter_disk_write = Gauge("cxp_disk_io_write_bytes_total", "Total bytes written to disk", ['container_name'])
+counter_net_rx = Gauge("cxp_network_rx_bytes_total", "Total bytes received over network", ['container_name'])
+counter_net_tx = Gauge("cxp_network_tx_bytes_total", "Total bytes sent over network", ['container_name'])
 
 async def get_containers(all=False) -> list[DockerContainer]:
     return await docker_client.containers.list(all=all)
 
 def update_container_status(running_containers:list[DockerContainer]):
     for c in running_containers:
-        gauge_container_status.labels(container_name=c._container.get("Names")[0][1:]).set(1 if c._container.get('State') == 'running' else 2)
+        info = c._container 
+        name = normalize_name(info.get("Names", []), info.get("Id", ""))
+        state = info.get("State", "").lower()
+        if state == "running":
+            gauge_container_status.labels(container_name=name).set(1)
+        else:
+            gauge_container_status.labels(container_name=name).set(2)
 
 # Async metrics gathering
 async def container_stats( running_containers: list[DockerContainer]):
     all_stats = await stat.get_containers_stats(running_containers)
     
     for stats in all_stats:
-        name = stats[0]['name'][1:]
+        name = stats[0].get('name', stats[0].get('id', 'Unkown').lstrip("/")).lstrip("/")
+        
         gauge_cpu_percentage.labels(container_name=name).set(stat.calculate_cpu_percentage(stats[0]))
         gauge_memory_percentage.labels(container_name=name).set(stat.calculate_memory_percentage(stats[0]))
         gauge_memory_bytes.labels(container_name=name).set(stat.calculate_memory_bytes(stats[0]))
         disk_read, disk_write = stat.calculate_disk_io(stats[0])
         net_rx, net_tx = stat.calculate_network_io(stats[0])
 
-        counter_disk_read.labels(container_name=name).inc(disk_read)
-        counter_disk_write.labels(container_name=name).inc(disk_write)
-        counter_net_rx.labels(container_name=name).inc(net_rx)
-        counter_net_tx.labels(container_name=name).inc(net_tx)
+        counter_disk_read.labels(container_name=name).set(disk_read)
+        counter_disk_write.labels(container_name=name).set(disk_write)
+        counter_net_rx.labels(container_name=name).set(net_rx)
+        counter_net_tx.labels(container_name=name).set(net_tx)
 
 # List of metrics we want to prune (performance counters)
 prunable_metrics: list[PromMetric] = [
@@ -69,29 +80,39 @@ async def container_stats( running_containers: list[DockerContainer]):
 ]
 
 # Metrics we want to always keep, and set to 0 instead
-persistent_metrics: list[PromMetric] = [gauge_container_status]
+persistent_metrics: list[Gauge] = [gauge_container_status]
 
 
 @app.get("/")
 def root():
     return {"message": "Welcome to CXP, Container Exporter for Prometheus."}
 
+@app.get("/healthz")
+async def healthz():
+    try:
+        # A simple, cheap call to Docker, e.g. list one container
+        await docker_client.containers.list(limit=1)
+        return PlainTextResponse("OK", status_code=200)
+    except:
+        return PlainTextResponse("NOT OK", status_code=500)
+    
 @app.get("/metrics")
 async def metrics():
     try:
         running_containers = await get_containers()
         update_container_status(running_containers)
-        prune_stale_metrics([c._container.get("Names")[0][1:] for c in running_containers], prunable_metrics, persistent_metrics)
+
+        c_names = [
+            normalize_name(c._container.get("Names", []), c._container.get("Id", ""))
+            for c in running_containers
+        ]
+        prune_stale_metrics(c_names, prunable_metrics, persistent_metrics)
+
         await container_stats(running_containers)
         return PlainTextResponse(
             content=generate_latest(),
             media_type=CONTENT_TYPE_LATEST 
         )
     except Exception as e:
-        basicConfig(    
-            level=ERROR,
-            format='%(asctime)s ERROR %(message)s',
-            datefmt='%Y-%m-%d %H:%M:%S'
-        )
-        error(str(e))
+        logger.error("Error running metrics collection: %s", e, exc_info=settings.CONTAINER_EXPORTER_DEBUG)
         return PlainTextResponse(f"Error running metrics collection: {str(e)}", status_code=500)
diff --git a/dashboards/container_status.json b/dashboards/container_status.json
@@ -0,0 +1,183 @@
+{
+    "annotations": {
+      "list": [
+        {
+          "builtIn": 1,
+          "datasource": {
+            "type": "grafana",
+            "uid": "-- Grafana --"
+          },
+          "enable": true,
+          "hide": true,
+          "iconColor": "rgba(0, 211, 255, 1)",
+          "name": "Annotations & Alerts",
+          "type": "dashboard"
+        }
+      ]
+    },
+    "editable": true,
+    "fiscalYearStartMonth": 0,
+    "graphTooltip": 0,
+    "id": 2,
+    "links": [],
+    "panels": [
+      {
+        "datasource": {
+          "type": "prometheus",
+          "uid": "Your prometheus data source uid"
+        },
+        "description": "",
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "axisBorderShow": false,
+              "axisCenteredZero": false,
+              "axisColorMode": "text",
+              "axisLabel": "usage in percentage",
+              "axisPlacement": "auto",
+              "barAlignment": 0,
+              "barWidthFactor": 0.6,
+              "drawStyle": "line",
+              "fillOpacity": 0,
+              "gradientMode": "none",
+              "hideFrom": {
+                "legend": false,
+                "tooltip": false,
+                "viz": false
+              },
+              "insertNulls": false,
+              "lineInterpolation": "linear",
+              "lineStyle": {
+                "fill": "solid"
+              },
+              "lineWidth": 1,
+              "pointSize": 4,
+              "scaleDistribution": {
+                "type": "linear"
+              },
+              "showPoints": "always",
+              "spanNulls": false,
+              "stacking": {
+                "group": "A",
+                "mode": "none"
+              },
+              "thresholdsStyle": {
+                "mode": "dashed+area"
+              }
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green"
+                },
+                {
+                  "color": "yellow",
+                  "value": 85
+                },
+                {
+                  "color": "red",
+                  "value": 150
+                }
+              ]
+            }
+          },
+          "overrides": [
+            {
+              "__systemRef": "hideSeriesFrom",
+              "matcher": {
+                "id": "byNames",
+                "options": {
+                  "mode": "exclude",
+                  "names": [
+                    "{container_name=\"hopeful_dewdney\"}"
+                  ],
+                  "prefix": "All except:",
+                  "readOnly": true
+                }
+              },
+              "properties": [
+                {
+                  "id": "custom.hideFrom",
+                  "value": {
+                    "legend": false,
+                    "tooltip": false,
+                    "viz": true
+                  }
+                }
+              ]
+            }
+          ]
+        },
+        "gridPos": {
+          "h": 17,
+          "w": 24,
+          "x": 0,
+          "y": 0
+        },
+        "id": 1,
+        "options": {
+          "legend": {
+            "calcs": [
+              "min",
+              "max",
+              "mean"
+            ],
+            "displayMode": "table",
+            "placement": "right",
+            "showLegend": true,
+            "sortBy": "Mean",
+            "sortDesc": true
+          },
+          "tooltip": {
+            "hideZeros": false,
+            "mode": "multi",
+            "sort": "desc"
+          }
+        },
+        "pluginVersion": "12.0.1",
+        "targets": [
+          {
+            "datasource": {
+              "type": "prometheus",
+              "uid": "fenjtj2603wn4f"
+            },
+            "editorMode": "code",
+            "expr": "sum(cxp_container_status) by (container_name)",
+            "instant": false,
+            "range": true,
+            "refId": "A"
+          }
+        ],
+        "title": "Containers Status",
+        "transparent": true,
+        "type": "timeseries"
+      }
+    ],
+    "preload": false,
+    "refresh": "5s",
+    "schemaVersion": 41,
+    "tags": [
+      "docker",
+      "CPU",
+      "Containers",
+      "container-exporter"
+    ],
+    "templating": {
+      "list": []
+    },
+    "time": {
+      "from": "now-5m",
+      "to": "now"
+    },
+    "timepicker": {},
+    "timezone": "",
+    "title": "Container Status",
+    "uid": "e7333381-1beb-4bad-bb6f-3203d46da0a9",
+    "version": 6,
+    "weekStart": "saturday"
+  }
diff --git a/dashboards/cpu_usage.json b/dashboards/cpu_usage.json
@@ -117,10 +117,10 @@
           {
             "datasource": {
               "type": "prometheus",
-              "uid": "Your prometheus data source uid"
+              "uid": "fenjtj2603wn4f"
             },
             "editorMode": "code",
-            "expr": "sum(docker_container_cpu_percentage) by (container_name)",
+            "expr": "sum(cxp_cpu_percentage) by (container_name)",
             "instant": false,
             "range": true,
             "refId": "A"
diff --git a/dashboards/memory_usage.json b/dashboards/memory_usage.json
@@ -116,7 +116,7 @@
               "uid": "Your prometheus data source uid"
             },
             "editorMode": "code",
-            "expr": "sum(docker_container_memory_percentage) by (container_name)",
+            "expr": "sum(cxp_memory_percentage) by (container_name)",
             "instant": false,
             "range": true,
             "refId": "A"
diff --git a/stats/get_docker_stats.py b/stats/get_docker_stats.py
@@ -33,7 +33,7 @@ def calculate_memory_percentage(stats: dict) -> float:
     return (usage / limit) * 100.0
 
 
-def calculate_memory_bytes(stats) -> bytes:
+def calculate_memory_bytes(stats) -> float:
     mem_stats = stats.get('memory_stats', {}) or {}
     memory_usage_bytes = mem_stats.get('usage')
     
diff --git a/utils/metrics.py b/utils/metrics.py
@@ -4,10 +4,12 @@
 from settings.settings import settings
 PromMetric = Union[Gauge, Counter]
 
-def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[PromMetric], persistent_metrics : list[PromMetric]):
+def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[PromMetric], persistent_metrics : list[Gauge]):
     """
     Removes time series for inactive containers from selected metrics
     while preserving container status metrics by setting them to 0.
+    When CONTAINER_EXPORTER_CLEAR_METRICS is set to False, only Counter metrics are cleared;
+    Gauge metrics are set to 0 instead.
     """
     active_set = set(active_names)
 
@@ -28,9 +30,11 @@ def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[Prom
             if name not in active_set:
                 metric.labels(container_name=name).set(0)
 
-
-def flush_metric_labels(containers:list[DockerContainer], metrics_to_clear: list[PromMetric]):
-    for container in containers:
-        if container._container.get("State") != "running":
-            for metric in metrics_to_clear:
-                metric.labels(container_name=container._container.get("Names")[0][1:]).set(0)
+def normalize_name(raw_names: list[str], fallback_id: str) -> str:
+    """
+    Given Docker's 'Names' array (e.g. ['/my-container']), pick the first entry and strip any leading '/'.
+    If 'Names' is missing, empty, or not a list, return the first 12 characters of the container ID instead.
+    """
+    if raw_names and isinstance(raw_names, list) and raw_names[0]:
+        return raw_names[0].lstrip("/")
+    return fallback_id[:12]