diff --git a/container_exporter.py b/container_exporter.py index 626307b..25fb2c4 100755 --- a/container_exporter.py +++ b/container_exporter.py @@ -6,8 +6,8 @@ from fastapi import FastAPI from fastapi.responses import PlainTextResponse from contextlib import asynccontextmanager -from utils.metrics import PromMetric, prune_stale_metrics, flush_metric_labels -from logging import basicConfig, error, ERROR +from utils.metrics import PromMetric, prune_stale_metrics, normalize_name +import logging from settings.settings import settings docker_client: Docker @@ -23,44 +23,55 @@ async def lifespan(app: FastAPI): app = FastAPI(lifespan=lifespan) -gauge_container_status = Gauge('cxp_container_status', 'Docker container status (0 = not running, 1 = running, 2 = restarting/unhealthy)', ['container_name']) -gauge_cpu_percentage = Gauge('cxp_cpu_percentage', 'Docker container CPU usage', ['container_name']) -gauge_memory_percentage = Gauge('cxp_memory_percentage', 'Docker container memory usage in percent', ['container_name']) -gauge_memory_bytes = Gauge('cxp_memory_bytes_total', 'Docker container memory usage in bytes', ['container_name']) - -counter_disk_read = Counter("cxp_disk_io_read_bytes_total", "Total bytes read from disk", ['container_name']) -counter_disk_write = Counter("cxp_disk_io_write_bytes_total", "Total bytes written to disk", ['container_name']) -counter_net_rx = Counter("cxp_network_rx_bytes_total", "Total bytes received over network", ['container_name']) -counter_net_tx = Counter("cxp_network_tx_bytes_total", "Total bytes sent over network", ['container_name']) +logging.basicConfig( + level=logging.ERROR, + format='%(asctime)s ERROR %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', +) +logger = logging.getLogger(__name__) -metrics_to_clear: list[PromMetric] = [gauge_cpu_percentage, gauge_memory_percentage, gauge_memory_bytes, counter_disk_read, counter_disk_write, counter_net_rx, counter_net_tx] +gauge_container_status = Gauge('cxp_container_status', 'Docker 
container status (0 = not running, 1 = running, 2 = restarting/unhealthy/paused)', ['container_name']) +gauge_cpu_percentage = Gauge('cxp_cpu_percentage', 'Docker container CPU usage', ['container_name']) +gauge_memory_percentage = Gauge('cxp_memory_percentage', 'Docker container memory usage in percent', ['container_name']) +gauge_memory_bytes = Gauge('cxp_memory_bytes_total', 'Docker container memory usage in bytes', ['container_name']) +counter_disk_read = Gauge("cxp_disk_io_read_bytes_total", "Total bytes read from disk", ['container_name']) +counter_disk_write = Gauge("cxp_disk_io_write_bytes_total", "Total bytes written to disk", ['container_name']) +counter_net_rx = Gauge("cxp_network_rx_bytes_total", "Total bytes received over network", ['container_name']) +counter_net_tx = Gauge("cxp_network_tx_bytes_total", "Total bytes sent over network", ['container_name']) async def get_containers(all=False) -> list[DockerContainer]: return await docker_client.containers.list(all=all) def update_container_status(running_containers:list[DockerContainer]): for c in running_containers: - gauge_container_status.labels(container_name=c._container.get("Names")[0][1:]).set(1 if c._container.get('State') == 'running' else 2) + info = c._container + name = normalize_name(info.get("Names", []), info.get("Id", "")) + state = info.get("State", "").lower() + if state == "running": + gauge_container_status.labels(container_name=name).set(1) + else: + gauge_container_status.labels(container_name=name).set(2) # Async metrics gathering async def container_stats( running_containers: list[DockerContainer]): all_stats = await stat.get_containers_stats(running_containers) for stats in all_stats: - name = stats[0]['name'][1:] + name = (stats[0].get('name') or stats[0].get('id') or 'Unknown').lstrip("/") + gauge_cpu_percentage.labels(container_name=name).set(stat.calculate_cpu_percentage(stats[0])) 
gauge_memory_percentage.labels(container_name=name).set(stat.calculate_memory_percentage(stats[0])) gauge_memory_bytes.labels(container_name=name).set(stat.calculate_memory_bytes(stats[0])) disk_read, disk_write = stat.calculate_disk_io(stats[0]) net_rx, net_tx = stat.calculate_network_io(stats[0]) - counter_disk_read.labels(container_name=name).inc(disk_read) - counter_disk_write.labels(container_name=name).inc(disk_write) - counter_net_rx.labels(container_name=name).inc(net_rx) - counter_net_tx.labels(container_name=name).inc(net_tx) + counter_disk_read.labels(container_name=name).set(disk_read) + counter_disk_write.labels(container_name=name).set(disk_write) + counter_net_rx.labels(container_name=name).set(net_rx) + counter_net_tx.labels(container_name=name).set(net_tx) # List of metrics we want to prune (performance counters) prunable_metrics: list[PromMetric] = [ @@ -69,29 +80,39 @@ async def container_stats( running_containers: list[DockerContainer]): ] # Metrics we want to always keep, and set to 0 instead -persistent_metrics: list[PromMetric] = [gauge_container_status] +persistent_metrics: list[Gauge] = [gauge_container_status] @app.get("/") def root(): return {"message": "Welcome to CXP, Container Exporter for Prometheus."} +@app.get("/healthz") +async def healthz(): + try: + # A simple, cheap call to Docker, e.g. 
list one container + await docker_client.containers.list(limit=1) + return PlainTextResponse("OK", status_code=200) + except Exception: + return PlainTextResponse("NOT OK", status_code=500) + @app.get("/metrics") async def metrics(): try: running_containers = await get_containers() update_container_status(running_containers) - prune_stale_metrics([c._container.get("Names")[0][1:] for c in running_containers], prunable_metrics, persistent_metrics) + + c_names = [ + normalize_name(c._container.get("Names", []), c._container.get("Id", "")) + for c in running_containers + ] + prune_stale_metrics(c_names, prunable_metrics, persistent_metrics) + await container_stats(running_containers) return PlainTextResponse( content=generate_latest(), media_type=CONTENT_TYPE_LATEST ) except Exception as e: - basicConfig( - level=ERROR, - format='%(asctime)s ERROR %(message)s', - datefmt='%Y-%m-%d %H:%M:%S' - ) - error(str(e)) + logger.error("Error running metrics collection: %s", e, exc_info=settings.CONTAINER_EXPORTER_DEBUG) return PlainTextResponse(f"Error running metrics collection: {str(e)}", status_code=500) \ No newline at end of file diff --git a/dashboards/container_status.json b/dashboards/container_status.json new file mode 100644 index 0000000..197f1b2 --- /dev/null +++ b/dashboards/container_status.json @@ -0,0 +1,183 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 2, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "Your prometheus data source uid" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", 
"axisLabel": "container status", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "always", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "dashed+area" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 85 + }, + { + "color": "red", + "value": 150 + } + ] + } + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "{container_name=\"hopeful_dewdney\"}" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 17, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.0.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "fenjtj2603wn4f" + }, + "editorMode": "code", + "expr": "sum(cxp_container_status) by (container_name)", + "instant": false, + "range": true, + "refId": "A" + } + ], + "title": "Container Status", + "transparent": true, + "type": "timeseries" + } + ], + "preload": false, + "refresh": "5s", + "schemaVersion": 41, + "tags": [ + "docker", + "CPU", + "Containers", 
"container-exporter" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Container Status", + "uid": "e7333381-1beb-4bad-bb6f-3203d46da0a9", + "version": 6, + "weekStart": "saturday" + } \ No newline at end of file diff --git a/dashboards/cpu_usage.json b/dashboards/cpu_usage.json index 50c16ee..29f2a5a 100644 --- a/dashboards/cpu_usage.json +++ b/dashboards/cpu_usage.json @@ -117,10 +117,10 @@ { "datasource": { "type": "prometheus", - "uid": "Your prometheus data source uid" + "uid": "fenjtj2603wn4f" }, "editorMode": "code", - "expr": "sum(docker_container_cpu_percentage) by (container_name)", + "expr": "sum(cxp_cpu_percentage) by (container_name)", "instant": false, "range": true, "refId": "A" diff --git a/dashboards/memory_usage.json b/dashboards/memory_usage.json index 436e3f9..4cef976 100644 --- a/dashboards/memory_usage.json +++ b/dashboards/memory_usage.json @@ -116,7 +116,7 @@ "uid": "Your prometheus data source uid" }, "editorMode": "code", - "expr": "sum(docker_container_memory_percentage) by (container_name)", + "expr": "sum(cxp_memory_percentage) by (container_name)", "instant": false, "range": true, "refId": "A" diff --git a/stats/get_docker_stats.py b/stats/get_docker_stats.py index 67a0e14..1addca5 100644 --- a/stats/get_docker_stats.py +++ b/stats/get_docker_stats.py @@ -33,7 +33,7 @@ def calculate_memory_percentage(stats: dict) -> float: return (usage / limit) * 100.0 -def calculate_memory_bytes(stats) -> bytes: +def calculate_memory_bytes(stats) -> float: mem_stats = stats.get('memory_stats', {}) or {} memory_usage_bytes = mem_stats.get('usage') diff --git a/utils/metrics.py b/utils/metrics.py index ddaad3e..2c8c47b 100644 --- a/utils/metrics.py +++ b/utils/metrics.py @@ -4,10 +4,12 @@ from settings.settings import settings PromMetric = Union[Gauge, Counter] -def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[PromMetric], 
persistent_metrics : list[PromMetric]): +def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[PromMetric], persistent_metrics : list[Gauge]): """ Removes time series for inactive containers from selected metrics while preserving container status metrics by setting them to 0. + When CONTAINER_EXPORTER_CLEAR_METRICS is set to False, only Counter metrics are cleared; + Gauge metrics are set to 0 instead. """ active_set = set(active_names) @@ -28,9 +30,11 @@ def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[Prom if name not in active_set: metric.labels(container_name=name).set(0) - -def flush_metric_labels(containers:list[DockerContainer], metrics_to_clear: list[PromMetric]): - for container in containers: - if container._container.get("State") != "running": - for metric in metrics_to_clear: - metric.labels(container_name=container._container.get("Names")[0][1:]).set(0) \ No newline at end of file +def normalize_name(raw_names: list[str], fallback_id: str) -> str: + """ + Given Docker's 'Names' array (e.g. ['/my-container']), pick the first entry and strip any leading '/'. + If the array is missing or empty, return the first 12 characters of the container ID instead. + """ + if raw_names and isinstance(raw_names, list) and raw_names[0]: + return raw_names[0].lstrip("/") + return fallback_id[:12] \ No newline at end of file