66from fastapi import FastAPI
77from fastapi .responses import PlainTextResponse
88from contextlib import asynccontextmanager
9- from utils .metrics import PromMetric , prune_stale_metrics , flush_metric_labels
10- from logging import basicConfig , error , ERROR
9+ from utils .metrics import PromMetric , prune_stale_metrics , normalize_name
10+ import logging
1111from settings .settings import settings
1212
1313docker_client : Docker
@@ -23,44 +23,55 @@ async def lifespan(app: FastAPI):
2323
2424app = FastAPI (lifespan = lifespan )
2525
26- gauge_container_status = Gauge ('cxp_container_status' , 'Docker container status (0 = not running, 1 = running, 2 = restarting/unhealthy)' , ['container_name' ])
27- gauge_cpu_percentage = Gauge ('cxp_cpu_percentage' , 'Docker container CPU usage' , ['container_name' ])
28- gauge_memory_percentage = Gauge ('cxp_memory_percentage' , 'Docker container memory usage in percent' , ['container_name' ])
29- gauge_memory_bytes = Gauge ('cxp_memory_bytes_total' , 'Docker container memory usage in bytes' , ['container_name' ])
30-
31- counter_disk_read = Counter ("cxp_disk_io_read_bytes_total" , "Total bytes read from disk" , ['container_name' ])
32- counter_disk_write = Counter ("cxp_disk_io_write_bytes_total" , "Total bytes written to disk" , ['container_name' ])
33- counter_net_rx = Counter ("cxp_network_rx_bytes_total" , "Total bytes received over network" , ['container_name' ])
34- counter_net_tx = Counter ("cxp_network_tx_bytes_total" , "Total bytes sent over network" , ['container_name' ])
# Configure logging once, at import time. Only ERROR and above are emitted.
# %(levelname)s is used instead of a hard-coded "ERROR" so that a CRITICAL
# record is labelled correctly; ERROR records render exactly as before.
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s %(levelname)s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# Module-level logger for the handlers defined in this file.
logger = logging.getLogger(__name__)
3633
37- metrics_to_clear : list [PromMetric ] = [gauge_cpu_percentage , gauge_memory_percentage , gauge_memory_bytes , counter_disk_read , counter_disk_write , counter_net_rx , counter_net_tx ]
3834
# Per-container metrics. Every metric carries a single `container_name` label.

# Status and resource-usage gauges.
gauge_container_status = Gauge(
    'cxp_container_status',
    'Docker container status (0 = not running, 1 = running, 2 = restarting/unhealthy/paused)',
    ['container_name'],
)
gauge_cpu_percentage = Gauge(
    'cxp_cpu_percentage',
    'Docker container CPU usage',
    ['container_name'],
)
gauge_memory_percentage = Gauge(
    'cxp_memory_percentage',
    'Docker container memory usage in percent',
    ['container_name'],
)
gauge_memory_bytes = Gauge(
    'cxp_memory_bytes_total',
    'Docker container memory usage in bytes',
    ['container_name'],
)

# Disk and network byte totals. These are Gauges (despite the `counter_`
# variable names) because container_stats() writes them with .set().
counter_disk_read = Gauge(
    "cxp_disk_io_read_bytes_total",
    "Total bytes read from disk",
    ['container_name'],
)
counter_disk_write = Gauge(
    "cxp_disk_io_write_bytes_total",
    "Total bytes written to disk",
    ['container_name'],
)
counter_net_rx = Gauge(
    "cxp_network_rx_bytes_total",
    "Total bytes received over network",
    ['container_name'],
)
counter_net_tx = Gauge(
    "cxp_network_tx_bytes_total",
    "Total bytes sent over network",
    ['container_name'],
)
4044
async def get_containers(all=False) -> list[DockerContainer]:
    """Fetch containers through the module-level Docker client.

    Args:
        all: Forwarded unchanged as the ``all`` keyword of
            ``docker_client.containers.list``.

    Returns:
        The result of awaiting ``docker_client.containers.list``.
    """
    containers = await docker_client.containers.list(all=all)
    return containers
4347
# Docker states that mean the container is not running at all. These map to
# 0, matching the help text of cxp_container_status ("0 = not running").
_STOPPED_STATES = frozenset({"created", "exited", "dead"})


def update_container_status(running_containers: list[DockerContainer]):
    """Set the status gauge for every container in *running_containers*.

    The value written follows the gauge's documented contract:

    * ``1`` when the container state is ``"running"``;
    * ``0`` when the state is ``"created"``, ``"exited"`` or ``"dead"``;
    * ``2`` for every other state (e.g. restarting or paused).

    Args:
        running_containers: Containers whose ``_container`` mapping provides
            the ``Names``, ``Id`` and ``State`` entries read here.
    """
    for container in running_containers:
        info = container._container
        name = normalize_name(info.get("Names", []), info.get("Id", ""))
        # "State" may be missing or None; treat both as an empty state
        # instead of failing on None.lower().
        state = (info.get("State") or "").lower()

        if state == "running":
            value = 1
        elif state in _STOPPED_STATES:
            value = 0
        else:
            value = 2
        gauge_container_status.labels(container_name=name).set(value)
4757
4858# Async metrics gathering
async def container_stats(running_containers: list[DockerContainer]):
    """Collect stats for *running_containers* and publish them as metrics.

    For each entry returned by ``stat.get_containers_stats`` the first sample
    is used to set the CPU, memory, disk I/O and network I/O metrics. The
    disk and network values are written with ``.set()``, i.e. the values
    returned by ``stat.calculate_disk_io`` / ``stat.calculate_network_io``
    are exposed as-is rather than accumulated.

    Args:
        running_containers: Containers to gather stats for.
    """
    all_stats = await stat.get_containers_stats(running_containers)

    for stats in all_stats:
        if not stats:
            # No sample for this container: skip it rather than let an
            # IndexError abort the whole scrape.
            continue
        sample = stats[0]

        # Prefer the container name, fall back to its id, and only then to a
        # placeholder. A leading "/" on the name or id is stripped.
        name = (sample.get('name') or sample.get('id') or 'Unknown').lstrip("/")

        gauge_cpu_percentage.labels(container_name=name).set(stat.calculate_cpu_percentage(sample))
        gauge_memory_percentage.labels(container_name=name).set(stat.calculate_memory_percentage(sample))
        gauge_memory_bytes.labels(container_name=name).set(stat.calculate_memory_bytes(sample))

        disk_read, disk_write = stat.calculate_disk_io(sample)
        net_rx, net_tx = stat.calculate_network_io(sample)

        counter_disk_read.labels(container_name=name).set(disk_read)
        counter_disk_write.labels(container_name=name).set(disk_write)
        counter_net_rx.labels(container_name=name).set(net_rx)
        counter_net_tx.labels(container_name=name).set(net_tx)
6475
6576# List of metrics we want to prune (performance counters)
6677prunable_metrics : list [PromMetric ] = [
@@ -69,29 +80,39 @@ async def container_stats( running_containers: list[DockerContainer]):
6980]
7081
# Metrics that are always kept: instead of being pruned they are set to 0.
# This list is passed to prune_stale_metrics() by the /metrics handler.
persistent_metrics: list[Gauge] = [
    gauge_container_status,
]
7384
7485
@app.get("/")
def root():
    """Return the static welcome payload served at the application root."""
    welcome = {"message": "Welcome to CXP, Container Exporter for Prometheus."}
    return welcome
7889
@app.get("/healthz")
async def healthz():
    """Health probe: check that the Docker client can list containers.

    Returns:
        A plain-text ``OK`` response with status 200 when a minimal container
        listing succeeds, or ``NOT OK`` with status 500 when it raises.
    """
    try:
        # A simple, cheap call to Docker: list at most one container.
        await docker_client.containers.list(limit=1)
    except Exception as exc:
        # Catch Exception rather than using a bare except so that task
        # cancellation and interpreter-exit signals still propagate.
        logger.error("Health check failed: %s", exc, exc_info=settings.CONTAINER_EXPORTER_DEBUG)
        return PlainTextResponse("NOT OK", status_code=500)
    return PlainTextResponse("OK", status_code=200)
98+
@app.get("/metrics")
async def metrics():
    """Refresh all container metrics and return them in Prometheus format.

    Steps, in order: list containers, update the status gauge, prune metrics
    for containers that are no longer listed, gather fresh stats, then
    serialise the registry with ``generate_latest``.

    Returns:
        The exposition payload on success, or a plain-text error message
        with status 500 if any step raises.
    """
    try:
        containers = await get_containers()
        update_container_status(containers)

        # Names of the containers seen in this scrape; passed to
        # prune_stale_metrics along with the prunable and persistent lists.
        active_names = []
        for container in containers:
            details = container._container
            active_names.append(
                normalize_name(details.get("Names", []), details.get("Id", ""))
            )
        prune_stale_metrics(active_names, prunable_metrics, persistent_metrics)

        await container_stats(containers)

        payload = generate_latest()
        return PlainTextResponse(content=payload, media_type=CONTENT_TYPE_LATEST)
    except Exception as e:
        # The traceback is attached only when the debug setting is enabled.
        logger.error("Error running metrics collection: %s", e, exc_info=settings.CONTAINER_EXPORTER_DEBUG)
        return PlainTextResponse(f"Error running metrics collection: {str(e)}", status_code=500)
0 commit comments