Skip to content

Commit d45be19

Browse files
authored
Merge pull request #75 from AET-DevOps25/43-monitoring-implement-prometheus-grafana-with-alerting
43 monitoring implement prometheus grafana with alerting
2 parents 65f65ba + 35958a8 commit d45be19

File tree

54 files changed

+7962
-3
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+7962
-3
lines changed

.env.example

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@
55
# - To use: `./copy-env.sh .env.dev.example .env`
66
###############################################
77

8+
##########################
9+
# 📦 Application Version
10+
##########################
11+
12+
APP_VERSION=1.0.0-dev # Version of the SkillForge application (used in metrics and service info)
13+
814
##########################
915
# 🗃️ MongoDB Settings
1016
##########################
@@ -82,6 +88,16 @@ WEAVIATE_HOST=weaviate-db # (default: weaviate-db, matches compose service
8288
WEAVIATE_EXPOSED_HTTP_PORT=8080 # (default: 8080)
8389
WEAVIATE_EXPOSED_GRPC_PORT=50051 # (default: 50051)
8490

91+
##########################
92+
# 📊 Monitoring Ports
93+
##########################
94+
# Host-side ports for monitoring stack services
95+
PROMETHEUS_PORT=9090 # (default: 9090)
96+
ALERTMANAGER_PORT=9093 # (default: 9093)
97+
GRAFANA_EXPOSED_PORT=3001 # (default: 3001)
98+
LOKI_PORT=3100 # (default: 3100)
99+
MONGO_EXPORTER_PORT=9216 # (default: 9216)
100+
85101
##########################
86102
# For Production mode
87103
##########################

docker-compose.yml

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ services:
1313
- "--entrypoints.traefik.address=:8085"
1414
- "--api=true"
1515
- "--api.insecure=true"
16+
- "--metrics.prometheus=true"
17+
- "--metrics.prometheus.addrouterslabels=true"
1618
ports:
1719
- "80:80"
1820
- "8085:8085"
@@ -241,13 +243,15 @@ services:
241243
ports:
242244
- "8088:8080"
243245
- "50051:50051"
246+
- "2112:2112"
244247
environment:
245248
QUERY_DEFAULTS_LIMIT: 25
246249
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
247250
PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
248251
BACKUP_FILESYSTEM_PATH: '/var/lib/weaviate/backups'
249252
ENABLE_API_BASED_MODULES: 'true'
250253
CLUSTER_HOSTNAME: 'node1'
254+
PROMETHEUS_MONITORING_ENABLED: 'true'
251255
volumes:
252256
- weaviate-genai-data:/var/lib/weaviate
253257
healthcheck:
@@ -309,10 +313,131 @@ services:
309313
- "traefik.http.middlewares.genai-cors.headers.accessControlAllowOrigin=${CORS_ALLOW_ORIGINS:-*}"
310314
- "traefik.http.routers.genai.middlewares=genai-cors@traefik"
311315

316+
prometheus:
317+
image: prom/prometheus
318+
volumes:
319+
- ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
320+
- ./monitoring/prometheus/alert.rules.yml:/etc/prometheus/alert.rules.yml
321+
- prometheus-data:/prometheus
322+
ports:
323+
- "${PROMETHEUS_PORT:-9090}:9090"
324+
networks:
325+
- skillforge-network
326+
healthcheck:
327+
test: ["CMD", "wget", "-qO", "-", "http://localhost:9090/-/ready"]
328+
interval: 30s
329+
timeout: 5s
330+
retries: 3
331+
332+
alertmanager:
333+
image: prom/alertmanager
334+
container_name: skillforge-alertmanager
335+
volumes:
336+
- ./monitoring/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml
337+
ports:
338+
- "${ALERTMANAGER_PORT:-9093}:9093"
339+
networks:
340+
- skillforge-network
341+
healthcheck:
342+
test: ["CMD", "wget", "-qO", "-", "http://localhost:9093/-/ready"]
343+
interval: 30s
344+
timeout: 5s
345+
retries: 3
346+
347+
grafana:
348+
image: grafana/grafana
349+
ports:
350+
- "${GRAFANA_EXPOSED_PORT:-3001}:3000"
351+
environment:
352+
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PWD:-admin}
353+
- GF_DATE_FORMATS_DEFAULT_TIMEZONE=browser
354+
volumes:
355+
- ./monitoring/grafana/dashboards/:/var/lib/grafana/dashboards/
356+
- ./monitoring/grafana/provisioning/:/etc/grafana/provisioning/
357+
- grafana-data:/var/lib/grafana
358+
networks:
359+
- skillforge-network
360+
healthcheck:
361+
test: ["CMD", "curl", "-f", "http://localhost:3000/api/health"]
362+
interval: 30s
363+
timeout: 5s
364+
retries: 3
365+
366+
367+
mongo-exporter:
368+
image: percona/mongodb_exporter:0.30
369+
command:
370+
- --mongodb.uri=mongodb://${MONGODB_USERNAME:-root}:${MONGODB_PASSWORD:-password}@mongo:27017/admin?authSource=admin
371+
- --collector.dbstats
372+
- --collector.topmetrics
373+
- --discovering-mode
374+
- --compatible-mode
375+
- --web.listen-address=:9216
376+
environment:
377+
- MONGODB_URI=mongodb://${MONGODB_USERNAME:-root}:${MONGODB_PASSWORD:-password}@mongo:27017/admin?authSource=admin
378+
ports:
379+
- "${MONGO_EXPORTER_PORT:-9216}:9216"
380+
networks:
381+
- skillforge-network
382+
healthcheck:
383+
test: ["CMD", "wget", "-qO", "-", "http://localhost:9216/metrics"]
384+
interval: 30s
385+
timeout: 5s
386+
retries: 3
387+
depends_on:
388+
mongo:
389+
condition: service_healthy
390+
391+
# ─── Loki for Centralized Logging ────────────────────────────────────────────────
392+
loki:
393+
image: grafana/loki:latest
394+
container_name: skillforge-loki
395+
ports:
396+
- "${LOKI_PORT:-3100}:3100"
397+
volumes:
398+
- ./monitoring/loki/loki-config.yml:/etc/loki/local-config.yaml
399+
- loki-data:/loki
400+
command: -config.file=/etc/loki/local-config.yaml
401+
networks:
402+
- skillforge-network
403+
healthcheck:
404+
test: ["CMD", "wget", "-qO", "-", "http://localhost:3100/ready"]
405+
interval: 30s
406+
timeout: 5s
407+
retries: 3
408+
409+
# ─── Promtail for Log Collection ────────────────────────────────────────────────
410+
promtail:
411+
image: grafana/promtail:latest
412+
container_name: skillforge-promtail
413+
volumes:
414+
- /var/run/docker.sock:/var/run/docker.sock
415+
- /var/lib/docker/containers:/var/lib/docker/containers:ro
416+
- ./monitoring/loki/promtail-config.yml:/etc/promtail/config.yml
417+
command: -config.file=/etc/promtail/config.yml
418+
networks:
419+
- skillforge-network
420+
depends_on:
421+
loki:
422+
condition: service_healthy
423+
424+
# ─── MailHog (SMTP Test Server) ────────────────────────────────────────────────
425+
mailhog:
426+
image: mailhog/mailhog
427+
container_name: skillforge-mailhog
428+
ports:
429+
- "1025:1025" # SMTP
430+
- "8025:8025" # Web UI
431+
networks:
432+
- skillforge-network
433+
312434
volumes:
313435
mongo-data:
314436
redis-data:
315437
weaviate-genai-data:
438+
prometheus-data:
439+
grafana-data:
440+
loki-data:
316441

317442
networks:
318443
skillforge-network:

genai/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,5 @@ langchain
99
langchain-openai
1010
langchain_community
1111
tiktoken
12+
prometheus-fastapi-instrumentator
1213
pydantic

genai/src/main.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,16 @@ async def lifespan(app: FastAPI):
8181
stop_scheduler()
8282

8383
# --- App Initialization ---
84+
from prometheus_fastapi_instrumentator import Instrumentator, metrics
85+
from prometheus_client import Counter, Gauge
86+
87+
# Custom metrics
88+
GENAI_TOKENS_USED_TOTAL = Counter("genai_tokens_used_total", "Total tokens used by GenAI")
89+
# Gauge that exposes the running application version
90+
APP_VERSION_INFO = Gauge("app_version_info", "Application version info (constant 1)", ["version"])
91+
APP_VERSION_INFO.labels(version=APP_VERSION).set(1)
92+
# Register counter for exceptions
93+
FASTAPI_EXCEPTIONS_TOTAL = Counter("fastapi_exceptions_total", "Total count of exceptions in the GenAI service")
8494
app = FastAPI(
8595
title=APP_TITLE,
8696
version=APP_VERSION,
@@ -100,6 +110,15 @@ async def lifespan(app: FastAPI):
100110
root_path=os.getenv("API_ROOT_PATH", ""),
101111
)
102112

113+
# Expose default and custom metrics with exception tracking
114+
instrumentator = Instrumentator()
115+
116+
# Attach the instrumentator to the app so HTTP request metrics (including error responses) are recorded
117+
instrumentator.instrument(app)
118+
119+
# Expose metrics endpoint
120+
instrumentator.expose(app)
121+
103122
app.add_middleware(
104123
CORSMiddleware,
105124
allow_origins=os.getenv("CORS_ALLOW_ORIGINS", "*").split(","),
@@ -112,6 +131,9 @@ async def lifespan(app: FastAPI):
112131
@app.exception_handler(HTTPException)
113132
async def http_exception_handler(request: Request, exc: HTTPException):
114133
logger.error(f"HTTPException: {exc.detail}")
134+
# Increment the exception counter for HTTP exceptions
135+
if exc.status_code >= 500: # Only count server errors as exceptions
136+
FASTAPI_EXCEPTIONS_TOTAL.inc()
115137
return JSONResponse(
116138
status_code=exc.status_code,
117139
content={"error": "HTTPException", "detail": exc.detail}
@@ -120,6 +142,8 @@ async def http_exception_handler(request: Request, exc: HTTPException):
120142
@app.exception_handler(Exception)
121143
async def unhandled_exception_handler(request: Request, exc: Exception):
122144
logger.exception("Unhandled exception occurred")
145+
# Increment the exception counter for Prometheus metrics
146+
FASTAPI_EXCEPTIONS_TOTAL.inc()
123147
return JSONResponse(
124148
status_code=500,
125149
content={"error": "InternalServerError", "detail": str(exc)}
@@ -245,6 +269,15 @@ async def query_vector_db(request: QueryRequest):
245269
docs = [DocumentResult(**doc) for doc in docs_data]
246270
return QueryResponse(query=request.query_text, results=docs)
247271

272+
# -------------------------------
273+
# --- Debug / Testing Endpoints --
274+
# -------------------------------
275+
@app.get(f"{API_PREFIX}/debug/error", tags=["System"])
276+
async def debug_error():
277+
"""Deliberately raises a 500 error so monitoring dashboards can be tested."""
278+
# The exception handler will automatically increment the counter
279+
raise RuntimeError("Forced debug error for monitoring test")
280+
248281
# -------------------------------
249282
# --- LLM Endpoints -------------
250283
# -------------------------------

monitoring/ALERTS.md

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Observability & Alerting Quick Reference
2+
3+
This repo includes a minimal **Prometheus + Alertmanager + Grafana** stack with **MailHog** for local alert testing — no external services needed.
4+
5+
| Component | Container | Host Port(s) | Purpose |
6+
|-------------|--------------|---------------------|-------------------------------------------------------|
7+
| Prometheus | `prometheus` | **${PROMETHEUS_PORT:-9090}** | Scrapes metrics and evaluates rules (`alert.rules.yml`). |
8+
| Alertmanager| `alertmanager`| **${ALERTMANAGER_PORT:-9093}** | Routes fired alerts via MailHog (`alertmanager.yml`). |
9+
| Grafana | `grafana` | **${GRAFANA_EXPOSED_PORT:-3001}** | Dashboards + unified alert panel. |
10+
| MailHog | `mailhog` | **8025** (UI) / 1025 (SMTP) | Captures alert e-mails for inspection. |
11+
12+
---
13+
14+
## Viewing Alerts Live
15+
16+
1. **Prometheus** – raw alert state
17+
[http://localhost:${PROMETHEUS_PORT:-9090}/alerts](http://localhost:${PROMETHEUS_PORT:-9090}/alerts)
18+
2. **Grafana** – visual alert panel
19+
[http://localhost:${GRAFANA_EXPOSED_PORT:-3001}/dashboards](http://localhost:${GRAFANA_EXPOSED_PORT:-3001}/dashboards) → open *Logs* or any dashboard with **Active Alerts**
20+
3. **MailHog** – captured alert e-mails
21+
[http://localhost:8025/](http://localhost:8025/)
22+
23+
> Alertmanager sends all alerts to `oncall@skillforge.ai` via the local SMTP relay at `mailhog:1025` (no TLS).
24+
> **No external mail is sent.**
25+
26+
---
27+
28+
## Example Alert Screenshots
29+
30+
![Prometheus Alerts UI](img/image.png)
31+
32+
![MailHog UI](img/image-1.png)
33+
34+
---
35+
36+
## Notes
37+
38+
- Prometheus scrape/rule interval ≈ **15s** — expect up to ~30s delay before alerts *fire*.
39+
- Dashboards use the pre-provisioned `DS_PROMETHEUS`. The *Logs* dashboard also uses **Loki**.
40+
- MailHog is ephemeral — inbox resets on container restart.

monitoring/README.md

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# Monitoring Stack
2+
3+
Concise reference for the local Prometheus + Grafana stack that monitors SkillForge micro-services.
4+
5+
## Stack Components
6+
7+
| Layer | Container | Host Port(s) | Purpose |
8+
| --- | --- | --- | --- |
9+
| Prometheus | `prometheus` | **${PROMETHEUS_PORT:-9090}** | Scrapes metrics & evaluates rules (`prometheus.yml`) |
10+
| Alertmanager | `alertmanager` | **${ALERTMANAGER_PORT:-9093}** | Sends alerts → MailHog (`alertmanager.yml`) |
11+
| Grafana | `grafana` | **${GRAFANA_EXPOSED_PORT:-3001}** | Dashboards & system-wide alert list |
12+
| Loki | `loki` | **${LOKI_PORT:-3100}** | Central log store |
13+
| Promtail | `promtail` | — | Ships container logs → Loki |
14+
| Mongo Exporter | `mongo-exporter` | **${MONGO_EXPORTER_PORT:-9216}** | MongoDB metrics |
15+
| MailHog | `mailhog` | **8025 UI** / 1025 SMTP | Captures alert e-mails |
16+
| Traefik | `reverse-proxy` | 80 / **8085** | Public entry & metrics |
17+
18+
### Starting the Stack
19+
20+
```bash
21+
# Start all services
22+
docker compose up -d
23+
```
24+
25+
All containers use `skillforge-network`; Prometheus scrapes via container names (e.g. `skillforge-user-service:8082`).
26+
27+
## Alerts
28+
29+
Alert rules are defined in `monitoring/prometheus/alert.rules.yml`, covering:
30+
31+
* Service availability (exporters, GenAI, Spring Boot)
32+
* JVM heap thresholds
33+
* Security issues (e.g. high auth failures)
34+
35+
See [ALERTS.md](ALERTS.md) for where to view alerts live (Prometheus, Grafana, MailHog), example screenshots, and notes on alert timing.
36+
37+
## Dashboards
38+
39+
Dashboards are auto-loaded from `monitoring/grafana/dashboards/`:
40+
41+
| File | Focus | Data-sources |
42+
| --- | --- | --- |
43+
| `genai.json` | GenAI FastAPI metrics | Prometheus |
44+
| `mongo.json` | MongoDB internals | Prometheus |
45+
| `server.json` | Spring Boot JVM + HTTP | Prometheus |
46+
| `user-custom-metrics.json` | Auth & signups | Prometheus |
47+
| `logs.json` | Logs + active alerts | Loki + Prometheus |
48+
49+
Each dashboard includes test steps to simulate real traffic and failures.
50+
51+
See `grafana/README.md` for full details.
52+
53+
## Exporter Endpoints
54+
55+
| Source | Endpoint | Scrape job |
56+
| --- | --- | --- |
57+
| Spring Boot | `/actuator/prometheus` | `spring-boot` |
58+
| GenAI | `/metrics` | `genai` |
59+
| Traefik | `:${TRAEFIK_DASHBOARD_PORT:-8085}/metrics` | `traefik` |
60+
| Mongo Exporter | `:9216/metrics` | `mongo_exporter` |
61+
62+
Happy monitoring! Extend dashboards, adjust alert thresholds, or plug in more exporters as needed.

0 commit comments

Comments
 (0)