Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 47 additions & 26 deletions container_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
from fastapi import FastAPI
from fastapi.responses import PlainTextResponse
from contextlib import asynccontextmanager
from utils.metrics import PromMetric, prune_stale_metrics, flush_metric_labels
from logging import basicConfig, error, ERROR
from utils.metrics import PromMetric, prune_stale_metrics, normalize_name
import logging
from settings.settings import settings

docker_client: Docker
Expand All @@ -23,44 +23,55 @@ async def lifespan(app: FastAPI):

app = FastAPI(lifespan=lifespan)

gauge_container_status = Gauge('cxp_container_status', 'Docker container status (0 = not running, 1 = running, 2 = restarting/unhealthy)', ['container_name'])
gauge_cpu_percentage = Gauge('cxp_cpu_percentage', 'Docker container CPU usage', ['container_name'])
gauge_memory_percentage = Gauge('cxp_memory_percentage', 'Docker container memory usage in percent', ['container_name'])
gauge_memory_bytes = Gauge('cxp_memory_bytes_total', 'Docker container memory usage in bytes', ['container_name'])

counter_disk_read = Counter("cxp_disk_io_read_bytes_total", "Total bytes read from disk", ['container_name'])
counter_disk_write = Counter("cxp_disk_io_write_bytes_total", "Total bytes written to disk", ['container_name'])
counter_net_rx = Counter("cxp_network_rx_bytes_total", "Total bytes received over network", ['container_name'])
counter_net_tx = Counter("cxp_network_tx_bytes_total", "Total bytes sent over network", ['container_name'])
# Configure logging once at import time; only ERROR and above are emitted.
# NOTE(review): the format hard-codes the literal "ERROR" instead of using
# %(levelname)s, so a CRITICAL record would also be labelled ERROR - confirm
# this is intended.
logging.basicConfig(
level=logging.ERROR,
format='%(asctime)s ERROR %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
)

# Module-level logger used by the /metrics error path.
logger = logging.getLogger(__name__)

metrics_to_clear: list[PromMetric] = [gauge_cpu_percentage, gauge_memory_percentage, gauge_memory_bytes, counter_disk_read, counter_disk_write, counter_net_rx, counter_net_tx]

# Per-container gauges; every series carries a single 'container_name' label.
gauge_container_status = Gauge('cxp_container_status', 'Docker container status (0 = not running, 1 = running, 2 = restarting/unhealthy/paused)', ['container_name'])
gauge_cpu_percentage = Gauge('cxp_cpu_percentage', 'Docker container CPU usage', ['container_name'])
gauge_memory_percentage = Gauge('cxp_memory_percentage', 'Docker container memory usage in percent', ['container_name'])
gauge_memory_bytes = Gauge('cxp_memory_bytes_total', 'Docker container memory usage in bytes', ['container_name'])

# Disk and network I/O are exposed as Gauge objects whose value is overwritten
# with .set() on every scrape; the historical counter_* variable names are kept.
# NOTE(review): Prometheus naming conventions reserve the '_total' suffix for
# counters - confirm that keeping it on gauges is intentional.
counter_disk_read = Gauge("cxp_disk_io_read_bytes_total", "Total bytes read from disk", ['container_name'])
counter_disk_write = Gauge("cxp_disk_io_write_bytes_total", "Total bytes written to disk", ['container_name'])
counter_net_rx = Gauge("cxp_network_rx_bytes_total", "Total bytes received over network", ['container_name'])
counter_net_tx = Gauge("cxp_network_tx_bytes_total", "Total bytes sent over network", ['container_name'])

async def get_containers(all=False) -> list[DockerContainer]:
    """Return the containers reported by the shared Docker client.

    The ``all`` flag is forwarded unchanged to ``containers.list``.
    """
    listing = await docker_client.containers.list(all=all)
    return listing

def update_container_status(running_containers: list[DockerContainer]):
    """Publish the status gauge for every listed container.

    For each container the ``cxp_container_status`` series is set to 1 when
    its ``State`` equals ``running`` (compared case-insensitively) and to 2
    for any other state.

    The label is derived only through ``normalize_name``. The superseded
    ``Names[0][1:]`` lookup is dropped: it raised when ``Names`` was missing
    or empty and wrote the same gauge a second time.
    """
    for c in running_containers:
        info = c._container
        name = normalize_name(info.get("Names", []), info.get("Id", ""))
        # `or ""` guards against a State key that is present but None.
        state = (info.get("State") or "").lower()
        gauge_container_status.labels(container_name=name).set(1 if state == "running" else 2)

# Async metrics gathering
async def container_stats(running_containers: list[DockerContainer]):
    """Fetch one stats sample per container and publish it to the gauges.

    CPU, memory, disk I/O and network I/O values are computed by the ``stat``
    helpers from the first sample of each entry and written with ``set``.

    The label goes through ``normalize_name`` so it matches the names passed
    to ``prune_stale_metrics``: the leading "/" is stripped and a missing
    name falls back to the 12-character id. If neither is available the
    label is "Unknown".
    """
    all_stats = await stat.get_containers_stats(running_containers)

    for stats in all_stats:
        if not stats:
            # No sample came back for this container; skip it rather than
            # failing the whole scrape on stats[0].
            continue
        sample = stats[0]

        name = normalize_name([sample.get('name')], sample.get('id') or '') or 'Unknown'

        gauge_cpu_percentage.labels(container_name=name).set(stat.calculate_cpu_percentage(sample))
        gauge_memory_percentage.labels(container_name=name).set(stat.calculate_memory_percentage(sample))
        gauge_memory_bytes.labels(container_name=name).set(stat.calculate_memory_bytes(sample))
        disk_read, disk_write = stat.calculate_disk_io(sample)
        net_rx, net_tx = stat.calculate_network_io(sample)

        # The I/O metrics are gauges, so the latest value is stored directly;
        # incrementing by it as well would be redundant.
        counter_disk_read.labels(container_name=name).set(disk_read)
        counter_disk_write.labels(container_name=name).set(disk_write)
        counter_net_rx.labels(container_name=name).set(net_rx)
        counter_net_tx.labels(container_name=name).set(net_tx)

# List of metrics we want to prune (performance counters)
prunable_metrics: list[PromMetric] = [
Expand All @@ -69,29 +80,39 @@ async def container_stats( running_containers: list[DockerContainer]):
]

# Metrics we want to always keep, and set to 0 instead
persistent_metrics: list[PromMetric] = [gauge_container_status]
persistent_metrics: list[Gauge] = [gauge_container_status]


@app.get("/")
def root():
    """Serve the static welcome payload at the application root."""
    greeting = {"message": "Welcome to CXP, Container Exporter for Prometheus."}
    return greeting

@app.get("/healthz")
async def healthz():
    """Report whether the Docker daemon can be reached.

    Issues a single cheap ``containers.list(limit=1)`` call. Returns a
    plain-text "OK" with status 200 on success, or "NOT OK" with status 500
    when the call raises.
    """
    try:
        # A simple, cheap call to Docker, e.g. list one container
        await docker_client.containers.list(limit=1)
    except Exception as e:
        # Catch Exception rather than everything, so task cancellation and
        # interpreter shutdown still propagate; log why the probe failed.
        logger.error("Docker health check failed: %s", e, exc_info=settings.CONTAINER_EXPORTER_DEBUG)
        return PlainTextResponse("NOT OK", status_code=500)
    return PlainTextResponse("OK", status_code=200)

@app.get("/metrics")
async def metrics():
    """Collect container metrics and return them in Prometheus text format.

    Steps: list containers, update the status gauge, prune series whose
    container is no longer listed, gather fresh stats, then render the
    registry with ``generate_latest``. Any exception is logged through the
    module logger and reported to the caller as a 500 plain-text response.
    A traceback is included in the log when ``CONTAINER_EXPORTER_DEBUG`` is
    set.
    """
    try:
        running_containers = await get_containers()
        update_container_status(running_containers)

        # Build the active-name list through normalize_name so pruning is
        # safe for containers whose "Names" entry is missing or empty.
        c_names = [
            normalize_name(c._container.get("Names", []), c._container.get("Id", ""))
            for c in running_containers
        ]
        prune_stale_metrics(c_names, prunable_metrics, persistent_metrics)

        await container_stats(running_containers)
        return PlainTextResponse(
            content=generate_latest(),
            media_type=CONTENT_TYPE_LATEST,
        )
    except Exception as e:
        # Logging is configured once at import time, so it is not
        # reconfigured here on each failure.
        logger.error("Error running metrics collection: %s", e, exc_info=settings.CONTAINER_EXPORTER_DEBUG)
        return PlainTextResponse(f"Error running metrics collection: {str(e)}", status_code=500)
183 changes: 183 additions & 0 deletions dashboards/container_status.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 2,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "Your prometheus data source uid"
},
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "usage in percentage",
"axisPlacement": "auto",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineStyle": {
"fill": "solid"
},
"lineWidth": 1,
"pointSize": 4,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "always",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "dashed+area"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green"
},
{
"color": "yellow",
"value": 85
},
{
"color": "red",
"value": 150
}
]
}
},
"overrides": [
{
"__systemRef": "hideSeriesFrom",
"matcher": {
"id": "byNames",
"options": {
"mode": "exclude",
"names": [
"{container_name=\"hopeful_dewdney\"}"
],
"prefix": "All except:",
"readOnly": true
}
},
"properties": [
{
"id": "custom.hideFrom",
"value": {
"legend": false,
"tooltip": false,
"viz": true
}
}
]
}
]
},
"gridPos": {
"h": 17,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"calcs": [
"min",
"max",
"mean"
],
"displayMode": "table",
"placement": "right",
"showLegend": true,
"sortBy": "Mean",
"sortDesc": true
},
"tooltip": {
"hideZeros": false,
"mode": "multi",
"sort": "desc"
}
},
"pluginVersion": "12.0.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "fenjtj2603wn4f"
},
"editorMode": "code",
"expr": "sum(cxp_container_status) by (container_name)",
"instant": false,
"range": true,
"refId": "A"
}
],
"title": "Containers CPU Usage ",
"transparent": true,
"type": "timeseries"
}
],
"preload": false,
"refresh": "5s",
"schemaVersion": 41,
"tags": [
"docker",
"CPU",
"Containers",
"container-exporter"
],
"templating": {
"list": []
},
"time": {
"from": "now-5m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Container Status",
"uid": "e7333381-1beb-4bad-bb6f-3203d46da0a9",
"version": 6,
"weekStart": "saturday"
}
4 changes: 2 additions & 2 deletions dashboards/cpu_usage.json
Original file line number Diff line number Diff line change
Expand Up @@ -117,10 +117,10 @@
{
"datasource": {
"type": "prometheus",
"uid": "Your prometheus data source uid"
"uid": "fenjtj2603wn4f"
},
"editorMode": "code",
"expr": "sum(docker_container_cpu_percentage) by (container_name)",
"expr": "sum(cxp_cpu_percentage) by (container_name)",
"instant": false,
"range": true,
"refId": "A"
Expand Down
2 changes: 1 addition & 1 deletion dashboards/memory_usage.json
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@
"uid": "Your prometheus data source uid"
},
"editorMode": "code",
"expr": "sum(docker_container_memory_percentage) by (container_name)",
"expr": "sum(cxp_memory_percentage) by (container_name)",
"instant": false,
"range": true,
"refId": "A"
Expand Down
2 changes: 1 addition & 1 deletion stats/get_docker_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def calculate_memory_percentage(stats: dict) -> float:
return (usage / limit) * 100.0


def calculate_memory_bytes(stats) -> bytes:
def calculate_memory_bytes(stats) -> float:
mem_stats = stats.get('memory_stats', {}) or {}
memory_usage_bytes = mem_stats.get('usage')

Expand Down
18 changes: 11 additions & 7 deletions utils/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
from settings.settings import settings
PromMetric = Union[Gauge, Counter]

def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[PromMetric], persistent_metrics : list[PromMetric]):
def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[PromMetric], persistent_metrics : list[Gauge]):
"""
Removes time series for inactive containers from selected metrics
while preserving container status metrics by setting them to 0.
when CONTAINER_EXPORTER_CLEAR_METRICS is set False it only clears Counter metrics
Gauge metrics are set to 0.
"""
active_set = set(active_names)

Expand All @@ -28,9 +30,11 @@ def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[Prom
if name not in active_set:
metric.labels(container_name=name).set(0)


def flush_metric_labels(containers:list[DockerContainer], metrics_to_clear: list[PromMetric]):
for container in containers:
if container._container.get("State") != "running":
for metric in metrics_to_clear:
metric.labels(container_name=container._container.get("Names")[0][1:]).set(0)
def normalize_name(raw_names: list[str], fallback_id: str) -> str:
    """Return a label-friendly container name.

    Takes Docker's ``Names`` array (e.g. ``['/my-container']``), picks the
    first entry and strips any leading '/'.

    Falls back to the first 12 characters of ``fallback_id`` when the array
    is missing, empty, not a list/tuple, its first entry is not a string, or
    the entry is empty once the slashes are stripped (e.g. ``'/'``). A
    ``None`` id is treated as an empty string, so the function never raises.
    """
    if isinstance(raw_names, (list, tuple)) and raw_names:
        first = raw_names[0]
        if isinstance(first, str):
            name = first.lstrip("/")
            if name:
                return name
    return (fallback_id or "")[:12]