Skip to content

Commit 52c3c03

Browse files
authored
Merge pull request #33 from Shayan-Ghani/settings
Feat: added settings for clear_metrics functionality and handled corner case, restarting/unhealthy container
2 parents 266c559 + e4a6369 commit 52c3c03

File tree

10 files changed

+139
-49
lines changed

10 files changed

+139
-49
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
- adjust the following settings for `uvicorn` as environment variables:
1818
- HOST
1919
- PORT
20-
- WORKERS (Default : 3)
20+
- WORKERS (Default : 1)
2121
- LOG_LEVEL=(Default : warning)
2222

2323
⚠️ **Breaking operational change:** if your deployment or runtime environment specifically depends on Flask or Gunicorn, you'll need to adjust service definitions accordingly.

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ Replace `<PID>` with the pid of ./start.sh script.
104104
- adjust the following settings for `uvicorn` as environment variables:
105105
- HOST (Default: 0.0.0.0)
106106
- PORT (Default: 8000)
107-
- WORKERS (Default : 3)
107+
- WORKERS (Default : 1)
108108
- LOG_LEVEL (Default : warning)
109109

110110
Example:
@@ -145,9 +145,9 @@ Check out [dashboards](./dashboards) directory for Json files. including CPU & M
145145
- [x] Check and Unregister *stat* metrics for containers that are not running
146146
- [x] Design and develop a static website to showcase Documentation, new features, etc.
147147
- [x] Enable functionality and smoke testing in ci
148+
- [x] Add a `clear_metrics` setting that lets the user choose between clearing the labels of stale metrics and setting them to 0, so that time series data can be preserved on demand.
148149
- [ ] Design grafana dashboards and share them on grafana cloud
149150
- [ ] Add unit tests
150-
- [ ] Add `clear_metrics` functionality to switch on clearing the labels or setting them to 0 to maintain time series data, on user's demand.
151151

152152
## Contributions
153153
Welcome to CXP! This project is production-ready now, and we encourage contributions to enhance its functionality, optimize code, and add new features

alerting/sample_rules.yml

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
groups:
2+
- name: container-status
3+
rules:
4+
- alert: ContainerDown
5+
expr: cxp_container_status == 0
6+
for: 5m
7+
labels:
8+
severity: critical
9+
annotations:
10+
summary: "Container {{ $labels.container_name }} is down"
11+
description: "Container '{{ $labels.container_name }}' has reported status 0 (not running) for more than 5 minutes."
12+
13+
- name: container-resource-usage
14+
rules:
15+
- alert: HighCPUUsage
16+
expr: avg_over_time(cxp_cpu_percentage[5m]) > 80.0
17+
for: 5m
18+
labels:
19+
severity: warning
20+
annotations:
21+
summary: "High CPU usage on {{ $labels.container_name }}"
22+
description: "CPU usage has averaged >80% for more than 5 minutes."
23+
24+
- alert: CriticalCPUUsage
25+
expr: avg_over_time(cxp_cpu_percentage[5m]) > 90.0
26+
for: 5m
27+
labels:
28+
severity: critical
29+
annotations:
30+
summary: "Critical CPU usage on {{ $labels.container_name }}"
31+
description: "CPU usage has averaged >90% for more than 5 minutes."
32+
33+
- alert: HighMemoryUsage
34+
expr: avg_over_time(cxp_memory_percentage[5m]) > 80.0
35+
for: 5m
36+
labels:
37+
severity: warning
38+
annotations:
39+
summary: "High memory usage on {{ $labels.container_name }}"
40+
description: "Memory usage has averaged >80% for more than 5 minutes."
41+
42+
- alert: CriticalMemoryUsage
43+
expr: avg_over_time(cxp_memory_percentage[5m]) > 90.0
44+
for: 5m
45+
labels:
46+
severity: critical
47+
annotations:
48+
summary: "Critical memory usage on {{ $labels.container_name }}"
49+
description: "Memory usage has averaged >90% for more than 5 minutes."
50+
51+
- name: exporter-health
52+
rules:
53+
- alert: ExporterDown
54+
expr: absent(cxp_container_status)
55+
for: 5m
56+
labels:
57+
severity: critical
58+
annotations:
59+
summary: "Container exporter metrics missing"
60+
description: "No cxp_container_status metric scraped for more than 5 minutes; exporter may be down or unreachable."

container_exporter.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
from asyncio import gather
21
from aiodocker import Docker
32
from aiodocker.containers import DockerContainer
43
from stats import get_docker_stats as stat
@@ -9,21 +8,22 @@
98
from contextlib import asynccontextmanager
109
from utils.metrics import PromMetric, prune_stale_metrics, flush_metric_labels
1110
from logging import basicConfig, error, ERROR
11+
from settings.settings import settings
1212

1313
docker_client: Docker
1414

1515
@asynccontextmanager
1616
async def lifespan(app: FastAPI):
1717
global docker_client
1818
docker_client = Docker()
19-
19+
2020
yield
2121

2222
await docker_client.close()
2323

2424
app = FastAPI(lifespan=lifespan)
2525

26-
gauge_container_status = Gauge('cxp_container_status', 'Docker container status (1 = running, 0 = not running)', ['container_name'])
26+
gauge_container_status = Gauge('cxp_container_status', 'Docker container status (0 = not running, 1 = running, 2 = restarting/unhealthy)', ['container_name'])
2727
gauge_cpu_percentage = Gauge('cxp_cpu_percentage', 'Docker container CPU usage', ['container_name'])
2828
gauge_memory_percentage = Gauge('cxp_memory_percentage', 'Docker container memory usage in percent', ['container_name'])
2929
gauge_memory_bytes = Gauge('cxp_memory_bytes_total', 'Docker container memory usage in bytes', ['container_name'])
@@ -42,15 +42,12 @@ async def get_containers(all=False) -> list[DockerContainer]:
4242
return await docker_client.containers.list(all=all)
4343

4444
def update_container_status(running_containers:list[DockerContainer]):
45-
46-
current_names = [c._container.get("Names")[0][1:] for c in running_containers]
47-
for name in current_names:
48-
gauge_container_status.labels(container_name=name).set(1)
45+
for c in running_containers:
46+
gauge_container_status.labels(container_name=c._container.get("Names")[0][1:]).set(1 if c._container.get('State') == 'running' else 2)
4947

5048
# Async metrics gathering
5149
async def container_stats( running_containers: list[DockerContainer]):
52-
tasks = [stat.get_container_stats(container) for container in running_containers]
53-
all_stats = await gather(*tasks)
50+
all_stats = await stat.get_containers_stats(running_containers)
5451

5552
for stats in all_stats:
5653
name = stats[0]['name'][1:]

requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@ multidict==6.4.4
1818
prometheus-client==0.17.1
1919
propcache==0.3.1
2020
pydantic==2.11.5
21+
pydantic-settings==2.9.1
2122
pydantic_core==2.33.2
23+
python-dotenv==1.1.0
2224
sniffio==1.3.1
2325
starlette==0.46.2
2426
typing-inspection==0.4.1

scripts/healthcheck-ci.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ log "Starting CI Healthcheck..."
3131
log "Spinning up test container: $container_name"
3232
docker run -d --name "$container_name" alpine sleep 60 >/dev/null || fail "Failed to start container"
3333

34+
sleep 3
35+
3436
log "Checking root endpoint..."
3537
if curl --silent --fail http://localhost:8000/ > "${log_dir}/index.txt"; then
3638
pass "Root endpoint responded successfully."

settings/settings.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# settings.py
2+
from pydantic_settings import BaseSettings
3+
4+
class Settings(BaseSettings):
5+
CONTAINER_EXPORTER_ENV: str = "production"
6+
CONTAINER_EXPORTER_DEBUG: bool = False
7+
CONTAINER_EXPORTER_CLEAR_METRICS: bool = True
8+
class Config:
9+
env_file = ".env"
10+
env_file_encoding = "utf-8"
11+
12+
settings = Settings()

start.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# Configurable variables
44
HOST=${HOST:-0.0.0.0}
55
PORT=${PORT:-8000}
6-
WORKERS=${WORKERS:-3}
6+
WORKERS=${WORKERS:-1}
77
LOG_LEVEL=${LOG_LEVEL:-warning}
88

99
echo "Starting Container Exporter..."

stats/get_docker_stats.py

Lines changed: 45 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,61 @@
11
from aiodocker.docker import DockerContainer
2+
from asyncio import gather
23

3-
def calculate_cpu_percentage(stats:dict) -> float:
4-
cpu_percent = 0
5-
6-
cpu_delta = stats['cpu_stats']['cpu_usage']['total_usage'] - stats['precpu_stats']['cpu_usage']['total_usage']
4+
def calculate_cpu_percentage(stats: dict) -> float:
5+
cpu_stats = stats.get('cpu_stats', {})
6+
precpu_stats = stats.get('precpu_stats', {})
7+
total = cpu_stats.get('cpu_usage', {}).get('total_usage')
8+
prev_total = precpu_stats.get('cpu_usage', {}).get('total_usage')
9+
system = cpu_stats.get('system_cpu_usage')
10+
prev_system = precpu_stats.get('system_cpu_usage')
11+
n_cpus = cpu_stats.get('online_cpus')
712

8-
system_delta = stats['cpu_stats']['system_cpu_usage'] - stats['precpu_stats']['system_cpu_usage']
13+
if None in (total, prev_total, system, prev_system, n_cpus):
14+
return 0.0
915

10-
number_cpus = stats['cpu_stats']['online_cpus']
11-
if cpu_delta is not None and system_delta is not None and number_cpus is not None:
12-
cpu_percent = (cpu_delta / system_delta) * number_cpus * 100.0
16+
cpu_delta = total - prev_total
17+
system_delta = system - prev_system
1318

14-
return cpu_percent
19+
if system_delta <= 0:
20+
return 0.0
1521

16-
def calculate_memory_percentage(stats) -> float:
17-
memory_percent = 0
18-
memory_usage_bytes = 0
19-
20-
memory_usage_bytes = stats['memory_stats']['usage']
21-
memory_limit = stats['memory_stats']['limit']
22-
if memory_usage_bytes is not None and memory_limit is not None:
23-
memory_percent = (memory_usage_bytes / memory_limit) * 100.0
22+
return (cpu_delta / system_delta) * n_cpus * 100.0
23+
24+
25+
def calculate_memory_percentage(stats: dict) -> float:
26+
mem_stats = stats.get('memory_stats', {})
27+
usage = mem_stats.get('usage')
28+
limit = mem_stats.get('limit')
29+
30+
if usage is None or limit is None or limit == 0:
31+
return 0.0
32+
33+
return (usage / limit) * 100.0
2434

25-
return memory_percent
2635

2736
def calculate_memory_bytes(stats) -> bytes:
28-
memory_usage_bytes = stats['memory_stats']['usage']
37+
mem_stats = stats.get('memory_stats', {}) or {}
38+
memory_usage_bytes = mem_stats.get('usage')
39+
2940
if memory_usage_bytes is not None:
3041
return memory_usage_bytes
31-
return 0
42+
return 0.0
3243

33-
def calculate_disk_io(stats) -> bytes:
44+
def calculate_disk_io(stats: dict) -> bytes:
3445
disk_io_read = 0
3546
disk_io_write = 0
3647

37-
if "blkio_stats" in stats and "io_service_bytes_recursive" in stats["blkio_stats"]:
38-
io_service_bytes_recursive = stats["blkio_stats"]["io_service_bytes_recursive"]
48+
io_list = stats.get("blkio_stats", {}) \
49+
.get("io_service_bytes_recursive") or []
50+
51+
for io_stat in io_list:
52+
op = io_stat.get("op")
53+
value = io_stat.get("value", 0)
54+
if op == "read":
55+
disk_io_read += value
56+
elif op == "write":
57+
disk_io_write += value
3958

40-
if io_service_bytes_recursive is not None:
41-
for io_stat in io_service_bytes_recursive:
42-
if "op" in io_stat and "value" in io_stat:
43-
if io_stat["op"] == "read":
44-
disk_io_read += io_stat["value"]
45-
elif io_stat["op"] == "write":
46-
disk_io_write += io_stat["value"]
4759

4860
return disk_io_read, disk_io_write
4961

@@ -60,6 +72,6 @@ def calculate_network_io(stats) -> bytes:
6072

6173
return network_rx_bytes, network_tx_bytes
6274

63-
async def get_container_stats(container:DockerContainer):
64-
stats = await container.stats(stream=False)
65-
return stats
75+
async def get_containers_stats(containers:list[DockerContainer]):
76+
tasks = [container.stats(stream=False) for container in containers]
77+
return await gather(*tasks)

utils/metrics.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from aiodocker.containers import DockerContainer
22
from typing import Union, Iterable
33
from prometheus_client import Gauge, Counter
4-
4+
from settings.settings import settings
55
PromMetric = Union[Gauge, Counter]
66

77
def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[PromMetric], persistent_metrics : list[PromMetric]):
@@ -15,8 +15,13 @@ def prune_stale_metrics(active_names: Iterable[str], prunable_metrics: list[Prom
1515
for labels in metric._metrics:
1616
name = labels[0]
1717
if name not in active_set:
18-
metric.clear()
19-
18+
if settings.CONTAINER_EXPORTER_CLEAR_METRICS:
19+
metric.clear()
20+
elif isinstance(metric, Gauge):
21+
metric.labels(container_name=name).set(0)
22+
else:
23+
metric.clear()
24+
2025
for metric in persistent_metrics:
2126
for labels in list(metric._metrics):
2227
name = labels[0]

0 commit comments

Comments
 (0)