66from fastapi import FastAPI
77from fastapi .responses import PlainTextResponse
88from contextlib import asynccontextmanager
9- from utils .metrics import PromMetric , prune_stale_metrics , flush_metric_labels
10- from logging import basicConfig , error , ERROR
9+ from utils .metrics import PromMetric , prune_stale_metrics , normalize_name
10+ import logging
1111from settings .settings import settings
1212
1313docker_client : Docker
@@ -23,7 +23,16 @@ async def lifespan(app: FastAPI):
2323
# FastAPI application instance; the `lifespan` handler is passed in here
# (presumably it sets up and tears down the Docker client — confirm in the
# part of the file that defines it).
app = FastAPI(lifespan=lifespan)
2525
# Log line layout. %(levelname)s is used instead of a hard-coded "ERROR" so
# that records above ERROR (e.g. CRITICAL) are labelled with their real level;
# ERROR records render exactly as before.
LOG_FORMAT = '%(asctime)s %(levelname)s %(message)s'
LOG_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

# Configure the root logger once, at import time, rather than inside a request
# handler. Only records at ERROR level and above are emitted.
logging.basicConfig(
    level=logging.ERROR,
    format=LOG_FORMAT,
    datefmt=LOG_DATE_FORMAT,
)

# Module-level logger used by the request handlers below.
logger = logging.getLogger(__name__)
33+
34+
# Per-container gauges; every metric carries a single `container_name` label.
# Status encoding is described in the metric help text below.
gauge_container_status = Gauge('cxp_container_status', 'Docker container status (0 = not running, 1 = running, 2 = restarting/unhealthy/paused)', ['container_name'])
# CPU usage, set from stat.calculate_cpu_percentage.
gauge_cpu_percentage = Gauge('cxp_cpu_percentage', 'Docker container CPU usage', ['container_name'])
# Memory usage as a percentage, set from stat.calculate_memory_percentage.
gauge_memory_percentage = Gauge('cxp_memory_percentage', 'Docker container memory usage in percent', ['container_name'])
# Memory usage in bytes, set from stat.calculate_memory_bytes.
gauge_memory_bytes = Gauge('cxp_memory_bytes_total', 'Docker container memory usage in bytes', ['container_name'])
@@ -33,24 +42,26 @@ async def lifespan(app: FastAPI):
# Network traffic totals per container. NOTE: these are declared as Gauge
# objects even though the variable names (`counter_`) and metric names
# (`_total`) follow counter conventions.
counter_net_rx = Gauge("cxp_network_rx_bytes_total", "Total bytes received over network", ['container_name'])
counter_net_tx = Gauge("cxp_network_tx_bytes_total", "Total bytes sent over network", ['container_name'])
3544
36-
37- metrics_to_clear : list [PromMetric ] = [gauge_cpu_percentage , gauge_memory_percentage , gauge_memory_bytes , counter_disk_read , counter_disk_write , counter_net_rx , counter_net_tx ]
38-
39-
40-
async def get_containers(all=False) -> list[DockerContainer]:
    """Return the containers reported by the module-level Docker client.

    Args:
        all: Forwarded unchanged as the ``all`` keyword of
            ``docker_client.containers.list``.

    Returns:
        The list produced by ``docker_client.containers.list``.
    """
    listed = await docker_client.containers.list(all=all)
    return listed
4347
def update_container_status(running_containers: list[DockerContainer]):
    """Set the container-status gauge for every container in the list.

    A container whose ``State`` is "running" (compared case-insensitively)
    is recorded as 1; any other state is recorded as 2.

    Args:
        running_containers: Containers whose raw info dict is read through
            ``_container`` ("Names", "Id" and "State" keys).
    """
    for container in running_containers:
        details = container._container
        label = normalize_name(details.get("Names", []), details.get("Id", ""))
        is_running = details.get("State", "").lower() == "running"
        gauge_container_status.labels(container_name=label).set(1 if is_running else 2)
4757
4858# Async metrics gathering
4959async def container_stats ( running_containers : list [DockerContainer ]):
5060 all_stats = await stat .get_containers_stats (running_containers )
5161
5262 for stats in all_stats :
53- name = stats [0 ]['name' ][1 :]
63+ name = stats [0 ].get ('name' , stats [0 ].get ('id' , 'Unkown' ).lstrip ("/" )).lstrip ("/" )
64+
5465 gauge_cpu_percentage .labels (container_name = name ).set (stat .calculate_cpu_percentage (stats [0 ]))
5566 gauge_memory_percentage .labels (container_name = name ).set (stat .calculate_memory_percentage (stats [0 ]))
5667 gauge_memory_bytes .labels (container_name = name ).set (stat .calculate_memory_bytes (stats [0 ]))
@@ -69,29 +80,39 @@ async def container_stats( running_containers: list[DockerContainer]):
6980]
7081
# Metrics that are always kept. This list is passed to prune_stale_metrics
# separately from the prunable metrics; per the original note, stale entries
# are set to 0 instead of being removed (TODO confirm in utils.metrics).
persistent_metrics: list[Gauge] = [gauge_container_status]
7384
7485
@app.get("/")
def root():
    """Return the static welcome message served at the application root."""
    welcome = "Welcome to CXP, Container Exporter for Prometheus."
    return {"message": welcome}
7889
@app.get("/healthz")
async def healthz():
    """Health probe: report whether the Docker client answers a cheap request.

    Returns:
        PlainTextResponse: "OK" with status 200 when listing containers
        succeeds, "NOT OK" with status 500 when the call raises.
    """
    try:
        # A simple, cheap call to Docker: list at most one container.
        await docker_client.containers.list(limit=1)
    except Exception as e:
        # `except Exception` rather than a bare `except`, so that
        # asyncio.CancelledError, KeyboardInterrupt and SystemExit (all
        # BaseException subclasses) propagate instead of being reported as
        # an unhealthy daemon. The failure is logged the same way as in the
        # /metrics handler so it is not swallowed silently.
        logger.error("Health check failed: %s", e, exc_info=settings.CONTAINER_EXPORTER_DEBUG)
        return PlainTextResponse("NOT OK", status_code=500)
    return PlainTextResponse("OK", status_code=200)
98+
@app.get("/metrics")
async def metrics():
    """Refresh all container metrics and serve them in Prometheus text format.

    Steps: list containers, update the status gauge, prune label sets for
    containers that are no longer listed, gather per-container stats, then
    render the registry with ``generate_latest``.

    Returns:
        PlainTextResponse: the rendered metrics on success, or a 500 response
        carrying the error message when any step raises.
    """
    try:
        containers = await get_containers()
        update_container_status(containers)

        # Names of the containers currently listed; anything else is stale.
        active_names = []
        for container in containers:
            active_names.append(
                normalize_name(
                    container._container.get("Names", []),
                    container._container.get("Id", ""),
                )
            )
        prune_stale_metrics(active_names, prunable_metrics, persistent_metrics)

        await container_stats(containers)
        payload = generate_latest()
        return PlainTextResponse(content=payload, media_type=CONTENT_TYPE_LATEST)
    except Exception as err:
        # The traceback is attached only when the debug setting is enabled.
        logger.error(
            "Error running metrics collection: %s",
            err,
            exc_info=settings.CONTAINER_EXPORTER_DEBUG,
        )
        return PlainTextResponse(f"Error running metrics collection: {str(err)}", status_code=500)
0 commit comments