diff --git a/CHANGELOG.md b/CHANGELOG.md
index aef18bf..75ab654 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,40 @@
 # ChangeLog for CXP
+---
+## [1.2.0] - 2025-05-30
+
+### Changed
+- **Internal framework refactor:** Migrated from **Flask** to **FastAPI** for improved asynchronous handling and performance.
+- Updated operational dependencies:
+  - Added `fastapi`, `uvicorn`
+  - Removed `flask`, `gunicorn`
+
+### Notes
+- **No changes** to Prometheus metrics endpoints, names, labels, or scrape behavior.
+- Existing Prometheus scrapers, dashboards, and alerting rules will continue to work as-is.
+- The internal implementation is now fully asynchronous with FastAPI, potentially improving concurrent scrape handling under heavy load.
+- Logging and startup messages will differ due to the new framework and ASGI server (`uvicorn`).
+- Adjust the following settings for `uvicorn` as environment variables:
+  - HOST
+  - PORT
+  - WORKERS (Default: 3)
+  - LOG_LEVEL (Default: warning)
+
+⚠️ **Breaking operational change:** if your deployment or runtime environment specifically depends on Flask or Gunicorn, you'll need to adjust service definitions accordingly.
+
+---
+
+## [1.1.2-1.1.4] - 2025-05-05
+
+## Key points
+- added Github actions deployment option
+- this version makes the code more flexible against dependency vulnerability risks.
+
+**check out README.MD, Deploy with Github Actions to make use of the new changes.**
+
+
+---
+
 Version : 1.1.1
 
 ## Key points
@@ -21,12 +56,3 @@ Version : 1.1.1
 - `cxp_network_rx_bytes_total`: Total number of bytes received over the network
 - `cxp_network_tx_bytes_total`: Total number of bytes transmitted over the network
 
-# ChangeLog for CXP
-
-Version : 1.1.2-1.1.4
-
-## Key points
-- added Github actions deployment option
-- this version makes the code more flexible against vulnerability dependency risks PRs.
-
-**check out README.MD, Deploy with Github Actions to make use of the new changes.**
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 90791a2..238fbc3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,10 @@ FROM python:3.10-slim-buster
 
 LABEL maintainer="Shayan Ghani "
 
-ENV CONTAINER_EXPORTER_ENV=production CONTAINER_EXPORTER_DEBUG=0 CONTAINER_EXPORTER_PORT=8000
+ENV CONTAINER_EXPORTER_ENV=production \
+    CONTAINER_EXPORTER_DEBUG=0 \
+    CONTAINER_EXPORTER_PORT=8000 \
+    PYTHONUNBUFFERED=1
 
 EXPOSE 8000
 
diff --git a/README.md b/README.md
index 1ec5adf..eed1e64 100644
--- a/README.md
+++ b/README.md
@@ -97,13 +97,19 @@ kill -9 <pid>
 ```
 Replace `<pid>` with the pid of ./start.sh script.
 
-#### 🚢 Run With A Custom Port:
+#### 🚢 Run With Custom Parameters:
+
+- Adjust the following settings for `uvicorn` as environment variables:
+  - HOST (Default: 0.0.0.0)
+  - PORT (Default: 8000)
+  - WORKERS (Default: 3)
+  - LOG_LEVEL (Default: warning)
+
+Example:
 
 ```bash
-./start.sh &
+PORT="8000" ./start.sh &
 ```
 
-Change `` with a port of your choice.
-
 ### 🔥 Add CXP to Prometheus
 
 - Edit your `prometheus.yml` file and add the address of container-exporter in scrape_configs:
diff --git a/configs/__init__.py b/configs/__init__.py
deleted file mode 100644
index d782e9b..0000000
--- a/configs/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from . import config
\ No newline at end of file
diff --git a/configs/config.py b/configs/config.py
deleted file mode 100644
index 4944c81..0000000
--- a/configs/config.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from os import environ
-
-class Config:
-    ENV = environ.get("CONTAINER_EXPORTER_ENV", "production")
-    DEBUG = bool(int(environ.get("CONTAINER_EXPORTER_DEBUG", "0")))
-    TESTING = DEBUG
\ No newline at end of file
diff --git a/container_exporter.py b/container_exporter.py
index 4337306..02ea88a 100755
--- a/container_exporter.py
+++ b/container_exporter.py
@@ -1,95 +1,100 @@
-from asyncio import gather, new_event_loop, wait
-from aiodocker import Docker
-from docker import from_env as docker_env
+from asyncio import gather
+from aiodocker import Docker
+from aiodocker.containers import DockerContainer
 from stats import get_docker_stats as stat
-from prometheus_client import Gauge, Counter
+from prometheus_client import Gauge, Counter, CONTENT_TYPE_LATEST
 from prometheus_client.exposition import generate_latest
-from flask import Flask, Response, request
-from configs import config
+from fastapi import FastAPI
+from fastapi.responses import PlainTextResponse
+from contextlib import asynccontextmanager
+from utils.metrics import PromMetric, prune_stale_metrics, flush_metric_labels
+from logging import basicConfig, error, ERROR
+
+docker_client: Docker
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    global docker_client
+    docker_client = Docker()
+
+    yield
-app = Flask(__name__)
+    await docker_client.close()
+app = FastAPI(lifespan=lifespan)
 
-# Create Prometheus gauge metrics
-container_status = Gauge('cxp_container_status', 'Docker container status (1 = running, 0 = not running)', ['container_name'])
-container_cpu_percentage = Gauge('cxp_cpu_percentage', 'Docker container cpu usage', ['container_name'])
-container_memory_percentage = Gauge('cxp_memory_percentage', 'Docker container memory usage in percent', ['container_name'])
-container_memory_bytes_total = Gauge('cxp_memory_bytes_total', 'Docker container memory usage in bytes', ['container_name'])
+gauge_container_status = Gauge('cxp_container_status', 'Docker container status (1 = running, 0 = not running)', ['container_name'])
+gauge_cpu_percentage = Gauge('cxp_cpu_percentage', 'Docker container CPU usage', ['container_name'])
+gauge_memory_percentage = Gauge('cxp_memory_percentage', 'Docker container memory usage in percent', ['container_name'])
+gauge_memory_bytes = Gauge('cxp_memory_bytes_total', 'Docker container memory usage in bytes', ['container_name'])
 
-disk_io_read_counter = Counter("cxp_disk_io_read_bytes_total", "Total number of bytes read from disk", ['container_name'])
-disk_io_write_counter = Counter("cxp_disk_io_write_bytes_total", "Total number of bytes written to disk", ['container_name'])
+counter_disk_read = Counter("cxp_disk_io_read_bytes_total", "Total bytes read from disk", ['container_name'])
+counter_disk_write = Counter("cxp_disk_io_write_bytes_total", "Total bytes written to disk", ['container_name'])
+counter_net_rx = Counter("cxp_network_rx_bytes_total", "Total bytes received over network", ['container_name'])
+counter_net_tx = Counter("cxp_network_tx_bytes_total", "Total bytes sent over network", ['container_name'])
 
-network_rx_counter = Counter("cxp_network_rx_bytes_total", "Total number of bytes received over the network", ['container_name'])
-network_tx_counter = Counter("cxp_network_tx_bytes_total", "Total number of bytes transmitted over the network", ['container_name'])
-
-# get the data for running or not running(unhealthy) containers
-def get_containers(all=False):
-    client = docker_env()
-    return client.containers.list(all)
-
-init_containers_names = [c.name for c in get_containers()]
-
-# update container status whether they are running.
-def update_container_status(containers):
-    for container in containers:
-        if container.name in init_containers_names:
-            container_status.labels(container_name=container.name).set(1 if container.status == "running" else 0)
-        elif container.status == "running":
-            container_status.labels(container_name=container.name).set(1)
-            init_containers_names.append(container.name)
-
-    for container_name in init_containers_names:
-        if container_name not in [c.name for c in containers]:
-            container_status.labels(container_name=container_name).set(0)
-
-
-async def container_stats():
-    docker = Docker()
-    try:
-        containers = await docker.containers.list()
-        tasks = [stat.get_container_stats(container) for container in containers]
-        all_stats = await gather(*tasks)
-        for stats in all_stats:
-            container_cpu_percentage.labels(container_name=stats[0]['name'][1:]).set(stat.calculate_cpu_percentage(stats[0]))
-            container_memory_percentage.labels(container_name=stats[0]['name'][1:]).set(stat.calculate_memory_percentage(stats[0]))
-            container_memory_bytes_total.labels(container_name=stats[0]['name'][1:]).set(stat.calculate_memory_bytes(stats[0]))
-            disk_io_read_counter.labels(container_name=stats[0]['name'][1:]).inc(stat.calculate_disk_io(stats[0])[0])
-            disk_io_write_counter.labels(container_name=stats[0]['name'][1:]).inc(stat.calculate_disk_io(stats[0])[1])
-            network_rx_counter.labels(container_name=stats[0]['name'][1:]).inc(stat.calculate_network_io(stats[0])[0])
-            network_tx_counter.labels(container_name=stats[0]['name'][1:]).inc(stat.calculate_network_io(stats[0])[1])
-    finally:
-        await docker.close()
-
-metrics_names = [container_cpu_percentage, container_memory_percentage , container_memory_bytes_total , disk_io_read_counter , disk_io_write_counter , network_rx_counter , network_tx_counter ]
-
-def flush_metric_labels(c):
-    for container in c:
-        if container.status != "running":
-            for m in metrics_names:
-                m.clear()
-
-@app.route('/')
-def index():
-    return "Welcome To CXP, Contianer Exporter For Prometheus."
-
-@app.route('/metrics')
-def metrics():
-    try:
-        all_containers = get_containers(all=True)
-        update_container_status(all_containers)
-        flush_metric_labels(all_containers)
-        loop = new_event_loop()
-        t = [loop.create_task(container_stats())]
-        loop.run_until_complete(wait(t))
-    except Exception as e:
-        return f"Error running script: {str(e)}"
+metrics_to_clear: list[PromMetric] = [gauge_cpu_percentage, gauge_memory_percentage, gauge_memory_bytes, counter_disk_read, counter_disk_write, counter_net_rx, counter_net_tx]
 
-    return Response(generate_latest(), mimetype='text/plain')
-def create_app():
-    app.config.from_object(config.Config)
-    return app
-if __name__ == "__main__":
-    app.run('0.0.0.0', 8000)
\ No newline at end of file
+
+
+async def get_containers(all=False) -> list[DockerContainer]:
+    return await docker_client.containers.list(all=all)
+
+def update_container_status(running_containers:list[DockerContainer]):
+
+    current_names = [c._container.get("Names")[0][1:] for c in running_containers]
+    for name in current_names:
+        gauge_container_status.labels(container_name=name).set(1)
+
+# Async metrics gathering
+async def container_stats( running_containers: list[DockerContainer]):
+    tasks = [stat.get_container_stats(container) for container in running_containers]
+    all_stats = await gather(*tasks)
+
+    for stats in all_stats:
+        name = stats[0]['name'][1:]
+        gauge_cpu_percentage.labels(container_name=name).set(stat.calculate_cpu_percentage(stats[0]))
+        gauge_memory_percentage.labels(container_name=name).set(stat.calculate_memory_percentage(stats[0]))
+        gauge_memory_bytes.labels(container_name=name).set(stat.calculate_memory_bytes(stats[0]))
+        disk_read, disk_write = stat.calculate_disk_io(stats[0])
+        net_rx, net_tx = stat.calculate_network_io(stats[0])
+
+        counter_disk_read.labels(container_name=name).inc(disk_read)
+        counter_disk_write.labels(container_name=name).inc(disk_write)
+        counter_net_rx.labels(container_name=name).inc(net_rx)
+        counter_net_tx.labels(container_name=name).inc(net_tx)
+
+# List of metrics we want to prune (performance counters)
+prunable_metrics: list[PromMetric] = [
+    gauge_cpu_percentage, gauge_memory_percentage, gauge_memory_bytes,
+    counter_disk_read, counter_disk_write, counter_net_rx, counter_net_tx
+]
+
+# Metrics we want to always keep, and set to 0 instead
+persistent_metrics: list[PromMetric] = [gauge_container_status]
+
+
+@app.get("/")
+def root():
+    return {"message": "Welcome to CXP, Container Exporter for Prometheus."}
+
+@app.get("/metrics")
+async def metrics():
+    try:
+        running_containers = await get_containers()
+        update_container_status(running_containers)
+        prune_stale_metrics([c._container.get("Names")[0][1:] for c in running_containers], prunable_metrics, persistent_metrics)
+        await container_stats(running_containers)
+        return PlainTextResponse(
+            content=generate_latest(),
+            media_type=CONTENT_TYPE_LATEST
+        )
+    except Exception as e:
+        basicConfig(
+            level=ERROR,
+            format='%(asctime)s ERROR %(message)s',
+            datefmt='%Y-%m-%d %H:%M:%S'
+        )
+        error(str(e))
+        return PlainTextResponse(f"Error running metrics collection: {str(e)}", status_code=500)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 6ef3d50..0b256ea 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,29 +1,28 @@
 aiodocker==0.21.0
-aiohttp>=3.9.0
-aiosignal==1.3.1
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.4
+aiosignal==1.3.2
+annotated-types==0.7.0
+anyio==4.9.0
 async-timeout==4.0.3
 attrs==23.1.0
-blinker==1.6.2
 certifi==2024.7.4
 charset-normalizer==3.2.0
-click==8.1.7
-docker==6.1.3
-Flask==2.3.3
-frozenlist==1.4.0
-gunicorn==23.0.0
+click==8.2.1
+exceptiongroup==1.3.0
+fastapi==0.115.12
+frozenlist==1.6.0
+h11==0.16.0
 idna==3.7
-importlib-metadata==6.8.0
-itsdangerous==2.1.2
-jinja2>=3.1.3
-MarkupSafe==2.1.3
-multidict==6.0.4
-packaging==23.1
+multidict==6.4.4
 prometheus-client==0.17.1
-requests==2.31.0
-typing_extensions==4.8.0
-urllib3>=2.0.7
-websocket-client==1.6.2
-werkzeug>=2.3.8
-yarl==1.9.2
-zipp==3.19.1
-pip==23.3
+propcache==0.3.1
+pydantic==2.11.5
+pydantic_core==2.33.2
+sniffio==1.3.1
+starlette==0.46.2
+typing-inspection==0.4.1
+typing_extensions==4.13.2
+urllib3==2.4.0
+uvicorn==0.30.0
+yarl==1.20.0
diff --git a/start.sh b/start.sh
index c38c468..5178ebe 100755
--- a/start.sh
+++ b/start.sh
@@ -1,6 +1,32 @@
 #!/bin/sh
 
-port="${1:-8000}"
+# Configurable variables (overridable via environment)
+HOST=${HOST:-0.0.0.0}
+PORT=${PORT:-8000}
+WORKERS=${WORKERS:-3}
+LOG_LEVEL=${LOG_LEVEL:-warning}
 
-gunicorn -b 0.0.0.0:$port -w 3 --access-logfile - --error-logfile - --reload "container_exporter:create_app()"
+echo "Starting Container Exporter..."
+echo "Host: $HOST, Port: $PORT, Workers: $WORKERS, Log Level: $LOG_LEVEL"
+
+# Trap signals to shut down gracefully. POSIX sh expects signal names
+# without the SIG prefix, so trap TERM/INT rather than SIGTERM.
+term_handler() {
+  echo "SIGTERM received, shutting down..."
+  kill -TERM "$child" 2>/dev/null
+  wait "$child"
+  exit 0
+}
+trap term_handler TERM INT
+
+while true; do
+  # Run uvicorn in the background and wait on it: a foreground command
+  # would defer the trap (and $child would never be set), so graceful
+  # shutdown only works with the & / wait pattern.
+  uvicorn "container_exporter:app" \
+    --host "$HOST" \
+    --port "$PORT" \
+    --workers "$WORKERS" \
+    --log-level "$LOG_LEVEL" &
+  child=$!
+  wait "$child"
+
+  echo "Uvicorn crashed with exit code $?. Restarting in 3 seconds..."
+  sleep 3
+done
diff --git a/stats/get_docker_stats.py b/stats/get_docker_stats.py
index 1a2e49a..7b54ffd 100644
--- a/stats/get_docker_stats.py
+++ b/stats/get_docker_stats.py
@@ -1,8 +1,13 @@
-def calculate_cpu_percentage(stats) -> float:
+# DockerContainer is declared in aiodocker.containers; importing it from
+# aiodocker.docker raises ImportError at module load time.
+from aiodocker.containers import DockerContainer
+
+def calculate_cpu_percentage(stats:dict) -> float:
     cpu_percent = 0
     cpu_delta = stats['cpu_stats']['cpu_usage']['total_usage'] - stats['precpu_stats']['cpu_usage']['total_usage']
+
     system_delta = stats['cpu_stats']['system_cpu_usage'] - stats['precpu_stats']['system_cpu_usage']
+
     number_cpus = stats['cpu_stats']['online_cpus']
 
-    if cpu_delta is not None and system_delta is not None and number_cpus is not None:
+    # Guard against a zero system_delta (first sample / paused container) to avoid ZeroDivisionError.
+    if system_delta > 0 and number_cpus:
         cpu_percent = (cpu_delta / system_delta) * number_cpus * 100.0
@@ -56,6 +60,6 @@ def calculate_network_io(stats) -> bytes:
 
     return network_rx_bytes, network_tx_bytes
 
-async def get_container_stats(container):
+async def get_container_stats(container:DockerContainer):
     stats = await container.stats(stream=False)
     return stats
\ No newline at end of file
diff --git a/utils/metrics.py b/utils/metrics.py
new file mode 100644
index 0000000..84aa709
--- /dev/null
+++ b/utils/metrics.py
@@ -0,0 +1,31 @@
+from aiodocker.containers import DockerContainer
+from typing import Union, Iterable
+from prometheus_client import Gauge, Counter
+
+PromMetric = Union[Gauge, Counter]
+
+def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[PromMetric], persistent_metrics : list[PromMetric]):
+    """
+    Removes time series for inactive containers from selected metrics
+    while preserving container status metrics by setting them to 0.
+    """
+    active_set = set(active_names)
+
+    # Snapshot the label map before mutating it, and drop only the stale
+    # series: metric.clear() would wipe EVERY container's series as soon as
+    # one stale name was found (and mutate _metrics during iteration).
+    for metric in prunable_metrics:
+        for labels in list(metric._metrics):
+            name = labels[0]
+            if name not in active_set:
+                metric.remove(name)
+
+    for metric in persistent_metrics:
+        for labels in list(metric._metrics):
+            name = labels[0]
+            if name not in active_set:
+                metric.labels(container_name=name).set(0)
+
+# NOTE(review): .set(0) assumes Gauge; Counters passed in metrics_to_clear would raise AttributeError — confirm callers.
+def flush_metric_labels(containers:list[DockerContainer], metrics_to_clear: list[PromMetric]):
+    for container in containers:
+        if container._container.get("State") != "running":
+            for metric in metrics_to_clear:
+                metric.labels(container_name=container._container.get("Names")[0][1:]).set(0)
\ No newline at end of file