Skip to content

Commit 0841803

Browse files
authored
Merge pull request #35 from Shayan-Ghani/refinement
Fix!: wrong calculation for disk and network metrics
2 parents 52c3c03 + e9477e7 commit 0841803

File tree

6 files changed

+245
-37
lines changed

6 files changed

+245
-37
lines changed

container_exporter.py

Lines changed: 47 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66
from fastapi import FastAPI
77
from fastapi.responses import PlainTextResponse
88
from contextlib import asynccontextmanager
9-
from utils.metrics import PromMetric, prune_stale_metrics, flush_metric_labels
10-
from logging import basicConfig, error, ERROR
9+
from utils.metrics import PromMetric, prune_stale_metrics, normalize_name
10+
import logging
1111
from settings.settings import settings
1212

1313
docker_client: Docker
@@ -23,44 +23,55 @@ async def lifespan(app: FastAPI):
2323

2424
app = FastAPI(lifespan=lifespan)
2525

26-
gauge_container_status = Gauge('cxp_container_status', 'Docker container status (0 = not running, 1 = running, 2 = restarting/unhealthy)', ['container_name'])
27-
gauge_cpu_percentage = Gauge('cxp_cpu_percentage', 'Docker container CPU usage', ['container_name'])
28-
gauge_memory_percentage = Gauge('cxp_memory_percentage', 'Docker container memory usage in percent', ['container_name'])
29-
gauge_memory_bytes = Gauge('cxp_memory_bytes_total', 'Docker container memory usage in bytes', ['container_name'])
30-
31-
counter_disk_read = Counter("cxp_disk_io_read_bytes_total", "Total bytes read from disk", ['container_name'])
32-
counter_disk_write = Counter("cxp_disk_io_write_bytes_total", "Total bytes written to disk", ['container_name'])
33-
counter_net_rx = Counter("cxp_network_rx_bytes_total", "Total bytes received over network", ['container_name'])
34-
counter_net_tx = Counter("cxp_network_tx_bytes_total", "Total bytes sent over network", ['container_name'])
26+
logging.basicConfig(
27+
level=logging.ERROR,
28+
format='%(asctime)s ERROR %(message)s',
29+
datefmt='%Y-%m-%d %H:%M:%S',
30+
)
3531

32+
logger = logging.getLogger(__name__)
3633

37-
metrics_to_clear: list[PromMetric] = [gauge_cpu_percentage, gauge_memory_percentage, gauge_memory_bytes, counter_disk_read, counter_disk_write, counter_net_rx, counter_net_tx]
3834

35+
gauge_container_status = Gauge('cxp_container_status', 'Docker container status (0 = not running, 1 = running, 2 = restarting/unhealthy/paused)', ['container_name'])
36+
gauge_cpu_percentage = Gauge('cxp_cpu_percentage', 'Docker container CPU usage', ['container_name'])
37+
gauge_memory_percentage = Gauge('cxp_memory_percentage', 'Docker container memory usage in percent', ['container_name'])
38+
gauge_memory_bytes = Gauge('cxp_memory_bytes_total', 'Docker container memory usage in bytes', ['container_name'])
3939

40+
counter_disk_read = Gauge("cxp_disk_io_read_bytes_total", "Total bytes read from disk", ['container_name'])
41+
counter_disk_write = Gauge("cxp_disk_io_write_bytes_total", "Total bytes written to disk", ['container_name'])
42+
counter_net_rx = Gauge("cxp_network_rx_bytes_total", "Total bytes received over network", ['container_name'])
43+
counter_net_tx = Gauge("cxp_network_tx_bytes_total", "Total bytes sent over network", ['container_name'])
4044

4145
async def get_containers(all=False) -> list[DockerContainer]:
4246
return await docker_client.containers.list(all=all)
4347

4448
def update_container_status(running_containers:list[DockerContainer]):
4549
for c in running_containers:
46-
gauge_container_status.labels(container_name=c._container.get("Names")[0][1:]).set(1 if c._container.get('State') == 'running' else 2)
50+
info = c._container
51+
name = normalize_name(info.get("Names", []), info.get("Id", ""))
52+
state = info.get("State", "").lower()
53+
if state == "running":
54+
gauge_container_status.labels(container_name=name).set(1)
55+
else:
56+
gauge_container_status.labels(container_name=name).set(2)
4757

4858
# Async metrics gathering
4959
async def container_stats( running_containers: list[DockerContainer]):
5060
all_stats = await stat.get_containers_stats(running_containers)
5161

5262
for stats in all_stats:
53-
name = stats[0]['name'][1:]
63+
name = stats[0].get('name', stats[0].get('id', 'Unkown').lstrip("/")).lstrip("/")
64+
5465
gauge_cpu_percentage.labels(container_name=name).set(stat.calculate_cpu_percentage(stats[0]))
5566
gauge_memory_percentage.labels(container_name=name).set(stat.calculate_memory_percentage(stats[0]))
5667
gauge_memory_bytes.labels(container_name=name).set(stat.calculate_memory_bytes(stats[0]))
5768
disk_read, disk_write = stat.calculate_disk_io(stats[0])
5869
net_rx, net_tx = stat.calculate_network_io(stats[0])
5970

60-
counter_disk_read.labels(container_name=name).inc(disk_read)
61-
counter_disk_write.labels(container_name=name).inc(disk_write)
62-
counter_net_rx.labels(container_name=name).inc(net_rx)
63-
counter_net_tx.labels(container_name=name).inc(net_tx)
71+
counter_disk_read.labels(container_name=name).set(disk_read)
72+
counter_disk_write.labels(container_name=name).set(disk_write)
73+
counter_net_rx.labels(container_name=name).set(net_rx)
74+
counter_net_tx.labels(container_name=name).set(net_tx)
6475

6576
# List of metrics we want to prune (performance counters)
6677
prunable_metrics: list[PromMetric] = [
@@ -69,29 +80,39 @@ async def container_stats( running_containers: list[DockerContainer]):
6980
]
7081

7182
# Metrics we want to always keep, and set to 0 instead
72-
persistent_metrics: list[PromMetric] = [gauge_container_status]
83+
persistent_metrics: list[Gauge] = [gauge_container_status]
7384

7485

7586
@app.get("/")
7687
def root():
7788
return {"message": "Welcome to CXP, Container Exporter for Prometheus."}
7889

90+
@app.get("/healthz")
91+
async def healthz():
92+
try:
93+
# A simple, cheap call to Docker, e.g. list one container
94+
await docker_client.containers.list(limit=1)
95+
return PlainTextResponse("OK", status_code=200)
96+
except:
97+
return PlainTextResponse("NOT OK", status_code=500)
98+
7999
@app.get("/metrics")
80100
async def metrics():
81101
try:
82102
running_containers = await get_containers()
83103
update_container_status(running_containers)
84-
prune_stale_metrics([c._container.get("Names")[0][1:] for c in running_containers], prunable_metrics, persistent_metrics)
104+
105+
c_names = [
106+
normalize_name(c._container.get("Names", []), c._container.get("Id", ""))
107+
for c in running_containers
108+
]
109+
prune_stale_metrics(c_names, prunable_metrics, persistent_metrics)
110+
85111
await container_stats(running_containers)
86112
return PlainTextResponse(
87113
content=generate_latest(),
88114
media_type=CONTENT_TYPE_LATEST
89115
)
90116
except Exception as e:
91-
basicConfig(
92-
level=ERROR,
93-
format='%(asctime)s ERROR %(message)s',
94-
datefmt='%Y-%m-%d %H:%M:%S'
95-
)
96-
error(str(e))
117+
logger.error("Error running metrics collection: %s", e, exc_info=settings.CONTAINER_EXPORTER_DEBUG)
97118
return PlainTextResponse(f"Error running metrics collection: {str(e)}", status_code=500)

dashboards/container_status.json

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
{
2+
"annotations": {
3+
"list": [
4+
{
5+
"builtIn": 1,
6+
"datasource": {
7+
"type": "grafana",
8+
"uid": "-- Grafana --"
9+
},
10+
"enable": true,
11+
"hide": true,
12+
"iconColor": "rgba(0, 211, 255, 1)",
13+
"name": "Annotations & Alerts",
14+
"type": "dashboard"
15+
}
16+
]
17+
},
18+
"editable": true,
19+
"fiscalYearStartMonth": 0,
20+
"graphTooltip": 0,
21+
"id": 2,
22+
"links": [],
23+
"panels": [
24+
{
25+
"datasource": {
26+
"type": "prometheus",
27+
"uid": "Your prometheus data source uid"
28+
},
29+
"description": "",
30+
"fieldConfig": {
31+
"defaults": {
32+
"color": {
33+
"mode": "palette-classic"
34+
},
35+
"custom": {
36+
"axisBorderShow": false,
37+
"axisCenteredZero": false,
38+
"axisColorMode": "text",
39+
"axisLabel": "usage in percentage",
40+
"axisPlacement": "auto",
41+
"barAlignment": 0,
42+
"barWidthFactor": 0.6,
43+
"drawStyle": "line",
44+
"fillOpacity": 0,
45+
"gradientMode": "none",
46+
"hideFrom": {
47+
"legend": false,
48+
"tooltip": false,
49+
"viz": false
50+
},
51+
"insertNulls": false,
52+
"lineInterpolation": "linear",
53+
"lineStyle": {
54+
"fill": "solid"
55+
},
56+
"lineWidth": 1,
57+
"pointSize": 4,
58+
"scaleDistribution": {
59+
"type": "linear"
60+
},
61+
"showPoints": "always",
62+
"spanNulls": false,
63+
"stacking": {
64+
"group": "A",
65+
"mode": "none"
66+
},
67+
"thresholdsStyle": {
68+
"mode": "dashed+area"
69+
}
70+
},
71+
"mappings": [],
72+
"thresholds": {
73+
"mode": "absolute",
74+
"steps": [
75+
{
76+
"color": "green"
77+
},
78+
{
79+
"color": "yellow",
80+
"value": 85
81+
},
82+
{
83+
"color": "red",
84+
"value": 150
85+
}
86+
]
87+
}
88+
},
89+
"overrides": [
90+
{
91+
"__systemRef": "hideSeriesFrom",
92+
"matcher": {
93+
"id": "byNames",
94+
"options": {
95+
"mode": "exclude",
96+
"names": [
97+
"{container_name=\"hopeful_dewdney\"}"
98+
],
99+
"prefix": "All except:",
100+
"readOnly": true
101+
}
102+
},
103+
"properties": [
104+
{
105+
"id": "custom.hideFrom",
106+
"value": {
107+
"legend": false,
108+
"tooltip": false,
109+
"viz": true
110+
}
111+
}
112+
]
113+
}
114+
]
115+
},
116+
"gridPos": {
117+
"h": 17,
118+
"w": 24,
119+
"x": 0,
120+
"y": 0
121+
},
122+
"id": 1,
123+
"options": {
124+
"legend": {
125+
"calcs": [
126+
"min",
127+
"max",
128+
"mean"
129+
],
130+
"displayMode": "table",
131+
"placement": "right",
132+
"showLegend": true,
133+
"sortBy": "Mean",
134+
"sortDesc": true
135+
},
136+
"tooltip": {
137+
"hideZeros": false,
138+
"mode": "multi",
139+
"sort": "desc"
140+
}
141+
},
142+
"pluginVersion": "12.0.1",
143+
"targets": [
144+
{
145+
"datasource": {
146+
"type": "prometheus",
147+
"uid": "fenjtj2603wn4f"
148+
},
149+
"editorMode": "code",
150+
"expr": "sum(cxp_container_status) by (container_name)",
151+
"instant": false,
152+
"range": true,
153+
"refId": "A"
154+
}
155+
],
156+
"title": "Containers CPU Usage ",
157+
"transparent": true,
158+
"type": "timeseries"
159+
}
160+
],
161+
"preload": false,
162+
"refresh": "5s",
163+
"schemaVersion": 41,
164+
"tags": [
165+
"docker",
166+
"CPU",
167+
"Containers",
168+
"container-exporter"
169+
],
170+
"templating": {
171+
"list": []
172+
},
173+
"time": {
174+
"from": "now-5m",
175+
"to": "now"
176+
},
177+
"timepicker": {},
178+
"timezone": "",
179+
"title": "Container Status",
180+
"uid": "e7333381-1beb-4bad-bb6f-3203d46da0a9",
181+
"version": 6,
182+
"weekStart": "saturday"
183+
}

dashboards/cpu_usage.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,10 +117,10 @@
117117
{
118118
"datasource": {
119119
"type": "prometheus",
120-
"uid": "Your prometheus data source uid"
120+
"uid": "fenjtj2603wn4f"
121121
},
122122
"editorMode": "code",
123-
"expr": "sum(docker_container_cpu_percentage) by (container_name)",
123+
"expr": "sum(cxp_cpu_percentage) by (container_name)",
124124
"instant": false,
125125
"range": true,
126126
"refId": "A"

dashboards/memory_usage.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@
116116
"uid": "Your prometheus data source uid"
117117
},
118118
"editorMode": "code",
119-
"expr": "sum(docker_container_memory_percentage) by (container_name)",
119+
"expr": "sum(cxp_memory_percentage) by (container_name)",
120120
"instant": false,
121121
"range": true,
122122
"refId": "A"

stats/get_docker_stats.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def calculate_memory_percentage(stats: dict) -> float:
3333
return (usage / limit) * 100.0
3434

3535

36-
def calculate_memory_bytes(stats) -> bytes:
36+
def calculate_memory_bytes(stats) -> float:
3737
mem_stats = stats.get('memory_stats', {}) or {}
3838
memory_usage_bytes = mem_stats.get('usage')
3939

utils/metrics.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@
44
from settings.settings import settings
55
PromMetric = Union[Gauge, Counter]
66

7-
def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[PromMetric], persistent_metrics : list[PromMetric]):
7+
def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[PromMetric], persistent_metrics : list[Gauge]):
88
"""
99
Removes time series for inactive containers from selected metrics
1010
while preserving container status metrics by setting them to 0.
11+
When CONTAINER_EXPORTER_CLEAR_METRICS is set to False, only Counter metrics are cleared;
12+
Gauge metrics are set to 0 instead.
1113
"""
1214
active_set = set(active_names)
1315

@@ -28,9 +30,11 @@ def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[Prom
2830
if name not in active_set:
2931
metric.labels(container_name=name).set(0)
3032

31-
32-
def flush_metric_labels(containers:list[DockerContainer], metrics_to_clear: list[PromMetric]):
33-
for container in containers:
34-
if container._container.get("State") != "running":
35-
for metric in metrics_to_clear:
36-
metric.labels(container_name=container._container.get("Names")[0][1:]).set(0)
33+
def normalize_name(raw_names: list[str], fallback_id: str) -> str:
34+
"""
35+
Given Docker’s 'Names' array (e.g. ['/my-container']), pick the first one and strip the leading '/'.
36+
If it’s missing or empty, return a short version of the container ID (its first 12 characters).
37+
"""
38+
if raw_names and isinstance(raw_names, list) and raw_names[0]:
39+
return raw_names[0].lstrip("/")
40+
return fallback_id[:12]

0 commit comments

Comments
 (0)