Skip to content
This repository was archived by the owner on Jan 13, 2025. It is now read-only.

Commit 4374aa2

Browse files
authored
feat(api): Add /health endpoint (#28)
* fix(validations): move Prometheus /runtimeinfo API call under validations Signed-off-by: hayk96 <[email protected]> * feat(api): Add /health endpoint Signed-off-by: hayk96 <[email protected]> * docs: Update CHANGELOG.md Signed-off-by: hayk96 <[email protected]> * chore(api): Bump app version #patch Signed-off-by: hayk96 <[email protected]> --------- Signed-off-by: hayk96 <[email protected]>
1 parent 4daa8bb commit 4374aa2

File tree

7 files changed

+95
-16
lines changed

7 files changed

+95
-16
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
# Changelog
22

3+
## 0.3.2 / 2024-06-08
4+
5+
* [ENHANCEMENT] Added a new endpoint: `/health` for retrieving system health. #28
6+
* [ENHANCEMENT] Added a new function that repeatedly checks the connection to Prometheus at startup (up to 600 checks at 3-second intervals) until it is established.
7+
* [BUGFIX] The Prometheus `/runtimeinfo` API call has been moved into the validation function.
8+
* [BUGFIX] Added proper exception handling while checking the status of the reload API of Prometheus at runtime.
9+
310
## 0.3.1 / 2024-06-01
411

512
* [ENHANCEMENT] Added a new webpage, Metrics Management, based on the `/metrics-lifecycle-policies` API. This feature allows

main.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@
1515
prom_addr, rule_path = args.get("prom.addr"), args.get("rule.path")
1616
host, port = args.get("web.listen_address").split(":")
1717

18-
if not all([settings.check_prom_http_connection(prom_addr),
19-
settings.check_reload_api_status(prom_addr),
20-
settings.check_rules_directory(rule_path),
21-
settings.check_fs_permissions(rule_path)]):
18+
if not all([settings.check_rules_directory(rule_path),
19+
settings.check_fs_permissions(rule_path),
20+
settings.establish_prom_connection(prom_addr),
21+
settings.check_reload_api_status(prom_addr)]):
2222
sys.exit()
2323

2424

src/api/v1/api.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1-
from .. v1.endpoints import reverse_proxy, rules, policies, web
1+
from .. v1.endpoints import reverse_proxy, rules, policies, web, health
22
from fastapi import APIRouter
33

44
api_router = APIRouter()
55
api_router.include_router(rules.router, prefix="/api/v1")
66
api_router.include_router(policies.router, prefix="/api/v1")
77
api_router.include_router(web.router, prefix="")
8+
api_router.include_router(health.router, prefix="")
89
api_router.add_route("/{path:path}", reverse_proxy._reverse_proxy, ["GET", "POST", "PUT"])

src/api/v1/endpoints/health.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
from src.utils.settings import check_prom_readiness
2+
from fastapi import APIRouter, Response, status
3+
from src.utils.arguments import arg_parser
4+
5+
router = APIRouter()
6+
rule_path = arg_parser().get("rule.path")
7+
prom_addr = arg_parser().get("prom.addr")
8+
9+
10+
@router.get("/health",
11+
name="Get system health",
12+
description="Returns a 200 status when the prometheus-api is able to connect to the Prometheus server",
13+
status_code=status.HTTP_200_OK,
14+
tags=["health"],
15+
responses={
16+
200: {
17+
"description": "OK",
18+
"content": {
19+
"application/json": {
20+
"example": [
21+
{
22+
"status": "success",
23+
"message": "Service is up and running"
24+
}
25+
]
26+
}
27+
}
28+
},
29+
503: {
30+
"description": "Service Unavailable",
31+
"content": {
32+
"application/json": {
33+
"example": [
34+
{
35+
"status": "error",
36+
"message": "Service is unavailable due to a health-check failure"
37+
}
38+
]
39+
}
40+
}
41+
}
42+
})
43+
async def health(response: Response):
44+
global prom_addr
45+
if not check_prom_readiness(prom_addr):
46+
response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
47+
return {"status": "error",
48+
"message": "Service is unavailable due to a health-check failure"}
49+
return {"status": "success",
50+
"message": "Service is up and running"}

src/core/policies.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@
88
rule_path = arg_parser().get("rule.path")
99
prom_addr = arg_parser().get("prom.addr")
1010
policies_data_file = ".policies.json"
11-
prom_storage_retention_human = prom_info(
12-
prom_addr, "/runtimeinfo")["data"]["storageRetention"]
1311

1412

1513
def sync_to_file(data) -> None:
@@ -105,6 +103,8 @@ def validate_duration(val) -> tuple[bool, int, str, str, int]:
105103
This function compares the value of the 'keep_for'
106104
field with the retention time of the Prometheus server
107105
"""
106+
prom_storage_retention_human = prom_info(
107+
prom_addr, "/runtimeinfo")["data"]["storageRetention"]
108108
prom_storage_retention_seconds = parse(prom_storage_retention_human)
109109
val_seconds = parse(val)
110110
if val_seconds >= prom_storage_retention_seconds:

src/utils/openapi.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def openapi(app: FastAPI):
1616
"providing additional features and addressing its limitations. "
1717
"Running as a sidecar alongside the Prometheus server enables "
1818
"users to extend the capabilities of the API.",
19-
version="0.3.1",
19+
version="0.3.2",
2020
contact={
2121
"name": "Hayk Davtyan",
2222
"url": "https://hayk96.github.io",

src/utils/settings.py

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
from os import remove, path
22
from .log import logger
3+
from time import sleep
34
import requests
45

56

6-
def check_prom_http_connection(prometheus_address) -> bool:
7+
def check_prom_readiness(prometheus_address) -> bool:
78
"""Checks the connection to the Prometheus server over HTTP."""
89
try:
910
r = requests.get(f"{prometheus_address}/-/ready")
@@ -20,15 +21,35 @@ def check_prom_http_connection(prometheus_address) -> bool:
2021
return False
2122

2223

24+
def establish_prom_connection(prometheus_address, retries=600) -> bool:
25+
"""
26+
This function continuously checks the
27+
connection to the Prometheus server, waiting
28+
for it to establish. The total wait time is
29+
30 minutes (600 checks at 3-second intervals)
30+
"""
31+
for i in range(retries):
32+
if check_prom_readiness(prometheus_address):
33+
return True
34+
sleep(3)
35+
logger.error(
36+
"Connection to Prometheus failed: Maximum retry attempts exceeded. The server has been shut down.")
37+
return False
38+
39+
2340
def check_reload_api_status(prometheus_address) -> bool:
2441
"""Checks the status of the Prometheus Management API."""
25-
r = requests.post(f"{prometheus_address}/-/reload")
26-
if r.status_code == 403:
27-
logger.error(
28-
f"{r.text} It's disabled by default and can be enabled via the --web.enable-lifecycle. "
29-
f"See https://prometheus.io/docs/prometheus/latest/management_api/#reload for more details.")
30-
return False
31-
return True
42+
try:
43+
r = requests.post(f"{prometheus_address}/-/reload")
44+
except requests.exceptions.ConnectionError as e:
45+
logger.error(e)
46+
else:
47+
if r.status_code == 403:
48+
logger.error(
49+
f"{r.text} It's disabled by default and can be enabled via the --web.enable-lifecycle. "
50+
f"See https://prometheus.io/docs/prometheus/latest/management_api/#reload for more details.")
51+
return False
52+
return True
3253

3354

3455
def check_rules_directory(prometheus_rules_dir) -> bool:

0 commit comments

Comments
 (0)