Shayan-Ghani · Shayan-Ghani · May 30, 2025 · May 30, 2025 · May 30, 2025 · May 30, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,40 @@
 # ChangeLog for CXP
 
+---
+## [1.2.0] - 2025-05-30
+
+### Changed
+- **Internal framework refactor:** Migrated from **Flask** to **FastAPI** for improved asynchronous handling and performance.
+- Updated operational dependencies:
+  - Added `fastapi`, `uvicorn`
+  - Removed `flask`, `gunicorn`
+
+### Notes
+- **No changes** to Prometheus metrics endpoints, names, labels, or scrape behavior.
+- Existing Prometheus scrapers, dashboards, and alerting rules will continue to work as-is.
+- The internal implementation is now fully asynchronous with FastAPI, potentially improving concurrent scrape handling under heavy load.
+- Logging and startup messages will differ due to the new framework and ASGI server (`uvicorn`).
+- Adjust the following `uvicorn` settings via environment variables:
+  - HOST (Default: 0.0.0.0)
+  - PORT (Default: 8000)
+  - WORKERS (Default: 3)
+  - LOG_LEVEL (Default: warning)
+
+⚠️ **Breaking operational change:** if your deployment or runtime environment specifically depends on Flask or Gunicorn, you'll need to adjust service definitions accordingly.
+
+---
+
+## [1.1.2-1.1.4] - 2025-05-05
+
+## Key points
+- Added GitHub Actions deployment option.
+- This version makes the code more flexible in handling dependency-vulnerability PRs.
+
+**Check out the "Deploy with GitHub Actions" section of README.md to make use of the new changes.**
+
+
+---
+
 Version : 1.1.1
 
 ## Key points
@@ -21,12 +56,3 @@ Version : 1.1.1
 - `cxp_network_rx_bytes_total`: Total number of bytes received over the network
 - `cxp_network_tx_bytes_total`: Total number of bytes transmitted over the network
 
-# ChangeLog for CXP
-
-Version : 1.1.2-1.1.4
-
-## Key points
-- added Github actions deployment option
-- this version makes the code more flexible against vulnerability dependency risks PRs.
-
-**check out README.MD, Deploy with Github Actions to make use of the new changes.**
diff --git a/Dockerfile b/Dockerfile
@@ -2,7 +2,10 @@ FROM python:3.10-slim-buster
 
 LABEL maintainer="Shayan Ghani <[email protected]>"
 
-ENV CONTAINER_EXPORTER_ENV=production CONTAINER_EXPORTER_DEBUG=0 CONTAINER_EXPORTER_PORT=8000
+ENV CONTAINER_EXPORTER_ENV=production \
+ CONTAINER_EXPORTER_DEBUG=0 \
+ CONTAINER_EXPORTER_PORT=8000 \
+ PYTHONUNBUFFERED=1
 
 EXPOSE 8000
 

diff --git a/README.md b/README.md
@@ -97,13 +97,19 @@ kill -9 <PID>
 ```
 Replace `<PID>` with the pid of ./start.sh script.
 
-#### 🚢 Run With A Custom Port:
+#### 🚢 Run With Custom Parameters:
+
+- Adjust the following `uvicorn` settings via environment variables:
+  - HOST (Default: 0.0.0.0)
+  - PORT (Default: 8000)
+  - WORKERS (Default: 3)
+  - LOG_LEVEL (Default: warning)
+
+Example:
 ```bash
-./start.sh <your custome port> &
+PORT="8000" ./start.sh &
 ```
 
-Change `<your custom port>` with a port of your choice.
-
 ### 🔥 Add CXP to Prometheus
 - Edit your `prometheus.yml` file and add the address of container-exporter in scrape_configs:
 

diff --git a/configs/__init__.py b/configs/__init__.py
diff --git a/configs/config.py b/configs/config.py
diff --git a/container_exporter.py b/container_exporter.py
@@ -1,95 +1,100 @@
-from asyncio import gather, new_event_loop, wait
-from aiodocker import Docker 
-from docker import from_env as docker_env
+from asyncio import gather
+from aiodocker import Docker
+from aiodocker.containers import DockerContainer
 from stats import get_docker_stats as stat
-from prometheus_client import Gauge, Counter
+from prometheus_client import Gauge, Counter, CONTENT_TYPE_LATEST
 from prometheus_client.exposition import generate_latest
-from flask import Flask, Response, request
-from configs import config
+from fastapi import FastAPI
+from fastapi.responses import PlainTextResponse
+from contextlib import asynccontextmanager
+from utils.metrics import PromMetric, prune_stale_metrics, flush_metric_labels
+from logging import basicConfig, error, ERROR
+
+docker_client: Docker
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    global docker_client
+    docker_client = Docker()
+
+    yield
 
-app = Flask(__name__)
+    await docker_client.close()
 
+app = FastAPI(lifespan=lifespan)
 
-# Create Prometheus gauge metrics
-container_status = Gauge('cxp_container_status', 'Docker container status (1 = running, 0 = not running)', ['container_name'])
-container_cpu_percentage = Gauge('cxp_cpu_percentage', 'Docker container cpu usage', ['container_name'])
-container_memory_percentage = Gauge('cxp_memory_percentage', 'Docker container memory usage in percent', ['container_name'])
-container_memory_bytes_total = Gauge('cxp_memory_bytes_total', 'Docker container memory usage in bytes', ['container_name'])
+gauge_container_status = Gauge('cxp_container_status', 'Docker container status (1 = running, 0 = not running)', ['container_name'])
+gauge_cpu_percentage = Gauge('cxp_cpu_percentage', 'Docker container CPU usage', ['container_name'])
+gauge_memory_percentage = Gauge('cxp_memory_percentage', 'Docker container memory usage in percent', ['container_name'])
+gauge_memory_bytes = Gauge('cxp_memory_bytes_total', 'Docker container memory usage in bytes', ['container_name'])
 
-disk_io_read_counter = Counter("cxp_disk_io_read_bytes_total", "Total number of bytes read from disk", ['container_name'])
-disk_io_write_counter = Counter("cxp_disk_io_write_bytes_total", "Total number of bytes written to disk", ['container_name'])
+counter_disk_read = Counter("cxp_disk_io_read_bytes_total", "Total bytes read from disk", ['container_name'])
+counter_disk_write = Counter("cxp_disk_io_write_bytes_total", "Total bytes written to disk", ['container_name'])
+counter_net_rx = Counter("cxp_network_rx_bytes_total", "Total bytes received over network", ['container_name'])
+counter_net_tx = Counter("cxp_network_tx_bytes_total", "Total bytes sent over network", ['container_name'])
 
-network_rx_counter = Counter("cxp_network_rx_bytes_total", "Total number of bytes received over the network", ['container_name'])
-network_tx_counter = Counter("cxp_network_tx_bytes_total", "Total number of bytes transmitted over the network", ['container_name'])
-
 
-# get the data for running or not running(unhealthy) containers
-def get_containers(all=False):
-    client = docker_env()
-    return client.containers.list(all)
-
-init_containers_names = [c.name for c in get_containers()]
-
-# update container status whether they are running.
-def update_container_status(containers):
-    for container in containers:
-        if container.name in init_containers_names:
-            container_status.labels(container_name=container.name).set(1 if container.status == "running" else 0)
-        elif container.status == "running":
-            container_status.labels(container_name=container.name).set(1)
-            init_containers_names.append(container.name)
-
-    for container_name in init_containers_names:
-        if container_name not in [c.name for c in containers]:
-            container_status.labels(container_name=container_name).set(0)    
-
-
-async def container_stats():
-    docker = Docker()
-    try:
-        containers = await docker.containers.list()
-        tasks = [stat.get_container_stats(container) for container in containers]
-        all_stats = await gather(*tasks)
-        for stats in all_stats:
-            container_cpu_percentage.labels(container_name=stats[0]['name'][1:]).set(stat.calculate_cpu_percentage(stats[0]))
-            container_memory_percentage.labels(container_name=stats[0]['name'][1:]).set(stat.calculate_memory_percentage(stats[0]))        
-            container_memory_bytes_total.labels(container_name=stats[0]['name'][1:]).set(stat.calculate_memory_bytes(stats[0]))       
-            disk_io_read_counter.labels(container_name=stats[0]['name'][1:]).inc(stat.calculate_disk_io(stats[0])[0])
-            disk_io_write_counter.labels(container_name=stats[0]['name'][1:]).inc(stat.calculate_disk_io(stats[0])[1])
-            network_rx_counter.labels(container_name=stats[0]['name'][1:]).inc(stat.calculate_network_io(stats[0])[0])
-            network_tx_counter.labels(container_name=stats[0]['name'][1:]).inc(stat.calculate_network_io(stats[0])[1])
-    finally:
-        await docker.close()
-
-metrics_names = [container_cpu_percentage,  container_memory_percentage ,  container_memory_bytes_total , disk_io_read_counter , disk_io_write_counter , network_rx_counter ,  network_tx_counter ] 
-
-def flush_metric_labels(c):
-    for container in c:
-        if container.status != "running":
-            for m in metrics_names:
-                m.clear()
-
-@app.route('/')
-def index():
-    return "Welcome To CXP, Contianer Exporter For Prometheus."
-
-@app.route('/metrics')
-def metrics():    
-    try:
-        all_containers = get_containers(all=True)
-        update_container_status(all_containers)
-        flush_metric_labels(all_containers)
-        loop = new_event_loop()
-        t = [loop.create_task(container_stats())]
-        loop.run_until_complete(wait(t))
-    except Exception as e:
-        return f"Error running script: {str(e)}"
+metrics_to_clear: list[PromMetric] = [gauge_cpu_percentage, gauge_memory_percentage, gauge_memory_bytes, counter_disk_read, counter_disk_write, counter_net_rx, counter_net_tx]
 
-    return Response(generate_latest(), mimetype='text/plain')
 
-def create_app():
-    app.config.from_object(config.Config)
-    return app
 
-if __name__ == "__main__":
-    app.run('0.0.0.0', 8000)
+async def get_containers(all=False) -> list[DockerContainer]:
+    return await docker_client.containers.list(all=all)
+
+def update_container_status(running_containers:list[DockerContainer]):
+
+    current_names = [c._container.get("Names")[0][1:] for c in running_containers]
+    for name in current_names:            
+        gauge_container_status.labels(container_name=name).set(1)
+
+# Async metrics gathering
+async def container_stats( running_containers: list[DockerContainer]):
+    tasks = [stat.get_container_stats(container) for container in running_containers]
+    all_stats = await gather(*tasks)
+
+    for stats in all_stats:
+        name = stats[0]['name'][1:]
+        gauge_cpu_percentage.labels(container_name=name).set(stat.calculate_cpu_percentage(stats[0]))
+        gauge_memory_percentage.labels(container_name=name).set(stat.calculate_memory_percentage(stats[0]))
+        gauge_memory_bytes.labels(container_name=name).set(stat.calculate_memory_bytes(stats[0]))
+        disk_read, disk_write = stat.calculate_disk_io(stats[0])
+        net_rx, net_tx = stat.calculate_network_io(stats[0])
+
+        counter_disk_read.labels(container_name=name).inc(disk_read)
+        counter_disk_write.labels(container_name=name).inc(disk_write)
+        counter_net_rx.labels(container_name=name).inc(net_rx)
+        counter_net_tx.labels(container_name=name).inc(net_tx)
+
+# List of metrics we want to prune (performance counters)
+prunable_metrics: list[PromMetric] = [
+    gauge_cpu_percentage, gauge_memory_percentage, gauge_memory_bytes,
+    counter_disk_read, counter_disk_write, counter_net_rx, counter_net_tx
+]
+
+# Metrics we want to always keep, and set to 0 instead
+persistent_metrics: list[PromMetric] = [gauge_container_status]
+
+
+@app.get("/")
+def root():
+    return {"message": "Welcome to CXP, Container Exporter for Prometheus."}
+
+@app.get("/metrics")
+async def metrics():
+    try:
+        running_containers = await get_containers()
+        update_container_status(running_containers)
+        prune_stale_metrics([c._container.get("Names")[0][1:] for c in running_containers], prunable_metrics, persistent_metrics)
+        await container_stats(running_containers)
+        return PlainTextResponse(
+            content=generate_latest(),
+            media_type=CONTENT_TYPE_LATEST 
+        )
+    except Exception as e:
+        basicConfig(    
+            level=ERROR,
+            format='%(asctime)s ERROR %(message)s',
+            datefmt='%Y-%m-%d %H:%M:%S'
+        )
+        error(str(e))
+        return PlainTextResponse(f"Error running metrics collection: {str(e)}", status_code=500)
diff --git a/requirements.txt b/requirements.txt
@@ -1,29 +1,28 @@
 aiodocker==0.21.0
-aiohttp>=3.9.0
-aiosignal==1.3.1
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.4
+aiosignal==1.3.2
+annotated-types==0.7.0
+anyio==4.9.0
 async-timeout==4.0.3
 attrs==23.1.0
-blinker==1.6.2
 certifi==2024.7.4
 charset-normalizer==3.2.0
-click==8.1.7
-docker==6.1.3
-Flask==2.3.3
-frozenlist==1.4.0
-gunicorn==23.0.0
+click==8.2.1
+exceptiongroup==1.3.0
+fastapi==0.115.12
+frozenlist==1.6.0
+h11==0.16.0
 idna==3.7
-importlib-metadata==6.8.0
-itsdangerous==2.1.2
-jinja2>=3.1.3
-MarkupSafe==2.1.3
-multidict==6.0.4
-packaging==23.1
+multidict==6.4.4
 prometheus-client==0.17.1
-requests==2.31.0
-typing_extensions==4.8.0
-urllib3>=2.0.7
-websocket-client==1.6.2
-werkzeug>=2.3.8
-yarl==1.9.2
-zipp==3.19.1
-pip==23.3
+propcache==0.3.1
+pydantic==2.11.5
+pydantic_core==2.33.2
+sniffio==1.3.1
+starlette==0.46.2
+typing-inspection==0.4.1
+typing_extensions==4.13.2
+urllib3==2.4.0
+uvicorn==0.30.0
+yarl==1.20.0
diff --git a/start.sh b/start.sh
@@ -1,6 +1,30 @@
 #!/bin/sh
 
-port="${1:-8000}"
+# Configurable variables
+HOST=${HOST:-0.0.0.0}
+PORT=${PORT:-8000}
+WORKERS=${WORKERS:-3}
+LOG_LEVEL=${LOG_LEVEL:-warning}
 
-gunicorn -b 0.0.0.0:$port -w 3 --access-logfile - --error-logfile - --reload "container_exporter:create_app()"
+echo "Starting Container Exporter..."
+echo "Host: $HOST, Port: $PORT, Workers: $WORKERS, Log Level: $LOG_LEVEL"
 
+# Trap signals to shut down gracefully
+term_handler() {
+  echo "SIGTERM received, shutting down..."
+  kill -TERM "$child" 2>/dev/null
+  wait "$child"
+  exit 0
+}
+trap term_handler SIGTERM
+
+while true; do
+  uvicorn "container_exporter:app" \
+    --host "$HOST" \
+    --port "$PORT" \
+    --workers "$WORKERS" \
+    --log-level "$LOG_LEVEL"
+
+  echo "Uvicorn crashed with exit code $?. Restarting in 3 seconds..."
+  sleep 3
+done
diff --git a/stats/get_docker_stats.py b/stats/get_docker_stats.py
@@ -1,8 +1,12 @@
-def calculate_cpu_percentage(stats) -> float:
+from aiodocker.docker import DockerContainer
+
+def calculate_cpu_percentage(stats:dict) -> float:
     cpu_percent = 0
 
     cpu_delta = stats['cpu_stats']['cpu_usage']['total_usage'] - stats['precpu_stats']['cpu_usage']['total_usage']
+
     system_delta = stats['cpu_stats']['system_cpu_usage'] - stats['precpu_stats']['system_cpu_usage']
+
     number_cpus = stats['cpu_stats']['online_cpus'] 
     if cpu_delta is not None and system_delta is not None and number_cpus is not None:
         cpu_percent = (cpu_delta / system_delta) * number_cpus * 100.0 
@@ -56,6 +60,6 @@ def calculate_network_io(stats) -> bytes:
 
     return network_rx_bytes, network_tx_bytes
 
-async def get_container_stats(container):
+async def get_container_stats(container:DockerContainer):
+    """Return one stats reading for ``container``, requested with ``stream=False``."""
     stats = await container.stats(stream=False)
     return stats