AET-DevOps25
diff --git a/‎.env.example‎
Lines changed: 16 additions & 0 deletions b/‎.env.example‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎docker-compose.yml‎
Lines changed: 125 additions & 0 deletions b/‎docker-compose.yml‎
Lines changed: 125 additions & 0 deletions
diff --git a/‎genai/requirements.txt‎
Lines changed: 1 addition & 0 deletions b/‎genai/requirements.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎genai/src/main.py‎
Lines changed: 33 additions & 0 deletions b/‎genai/src/main.py‎
Lines changed: 33 additions & 0 deletions
diff --git a/‎monitoring/ALERTS.md‎
Lines changed: 40 additions & 0 deletions b/‎monitoring/ALERTS.md‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎monitoring/README.md‎
Lines changed: 62 additions & 0 deletions b/‎monitoring/README.md‎
Lines changed: 62 additions & 0 deletions
@@ -5,6 +5,12 @@
 # - To use: `./copy-env.sh .env.dev.example .env`
 ###############################################
 
+##########################
+# 📦 Application Version
+##########################
+
+APP_VERSION=1.0.0-dev      # Version of the SkillForge application (used in metrics and service info)
+
 ##########################
 # 🗃️  MongoDB Settings
 ##########################
@@ -82,6 +88,16 @@ WEAVIATE_HOST=weaviate-db       # (default: weaviate-db, matches compose service
 WEAVIATE_EXPOSED_HTTP_PORT=8080 # (default: 8080)
 WEAVIATE_EXPOSED_GRPC_PORT=50051 # (default: 50051)
 
+##########################
+# 📊 Monitoring Ports
+##########################
+# Host-side ports for monitoring stack services
+PROMETHEUS_PORT=9090        # (default: 9090)
+ALERTMANAGER_PORT=9093      # (default: 9093)
+GRAFANA_EXPOSED_PORT=3001   # (default: 3001)
+LOKI_PORT=3100              # (default: 3100)
+MONGO_EXPORTER_PORT=9216    # (default: 9216)
+
 ##########################
 # For Production mode
 ##########################
 
@@ -13,6 +13,8 @@ services:
       - "--entrypoints.traefik.address=:8085"
       - "--api=true"
       - "--api.insecure=true"
+      - "--metrics.prometheus=true"
+      - "--metrics.prometheus.addrouterslabels=true"
     ports:
       - "80:80"
       - "8085:8085"
@@ -241,13 +243,15 @@ services:
     ports:
       - "8088:8080"
       - "50051:50051"
+      - "2112:2112"
     environment:
       QUERY_DEFAULTS_LIMIT: 25
       AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
       PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
       BACKUP_FILESYSTEM_PATH: '/var/lib/weaviate/backups'
       ENABLE_API_BASED_MODULES: 'true'
       CLUSTER_HOSTNAME: 'node1'
+      PROMETHEUS_MONITORING_ENABLED: 'true'
     volumes:
       - weaviate-genai-data:/var/lib/weaviate
     healthcheck:
@@ -309,10 +313,131 @@ services:
       - "traefik.http.middlewares.genai-cors.headers.accessControlAllowOrigin=${CORS_ALLOW_ORIGINS:-*}"
       - "traefik.http.routers.genai.middlewares=genai-cors@traefik"
 
+  # Prometheus: bind-mounts the scrape config and alert rules from ./monitoring/prometheus
+  # and mounts the named volume `prometheus-data` at /prometheus.
+  # NOTE(review): the image has no tag, so the version pulled is not reproducible — consider pinning.
+  prometheus:
+    image: prom/prometheus
+    volumes:
+      - ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
+      - ./monitoring/prometheus/alert.rules.yml:/etc/prometheus/alert.rules.yml
+      - prometheus-data:/prometheus
+    ports:
+      # Host port comes from PROMETHEUS_PORT (default 9090) and maps to container port 9090.
+      - "${PROMETHEUS_PORT:-9090}:9090"
+    networks:
+      - skillforge-network
+    healthcheck:
+      # Probes the readiness endpoint over HTTP; assumes `wget` is available in the image — TODO confirm.
+      test: ["CMD", "wget", "-qO", "-", "http://localhost:9090/-/ready"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+
+  # Alertmanager: configured through the bind-mounted ./monitoring/prometheus/alertmanager.yml.
+  # NOTE(review): the image has no tag, so the version pulled is not reproducible — consider pinning.
+  alertmanager:
+    image: prom/alertmanager
+    container_name: skillforge-alertmanager
+    volumes:
+      - ./monitoring/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml
+    ports:
+      # Host port comes from ALERTMANAGER_PORT (default 9093) and maps to container port 9093.
+      - "${ALERTMANAGER_PORT:-9093}:9093"
+    networks:
+      - skillforge-network
+    healthcheck:
+      # Probes the readiness endpoint over HTTP; assumes `wget` is available in the image — TODO confirm.
+      test: ["CMD", "wget", "-qO", "-", "http://localhost:9093/-/ready"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+
+  # Grafana: dashboards and provisioning files are bind-mounted from ./monitoring/grafana;
+  # the named volume `grafana-data` is mounted at /var/lib/grafana.
+  # NOTE(review): the image has no tag, so the version pulled is not reproducible — consider pinning.
+  grafana:
+    image: grafana/grafana
+    ports:
+      # Host port comes from GRAFANA_EXPOSED_PORT (default 3001) and maps to container port 3000.
+      - "${GRAFANA_EXPOSED_PORT:-3001}:3000"
+    environment:
+      # NOTE(review): falls back to the password `admin` when GRAFANA_ADMIN_PWD is unset —
+      # set it explicitly for anything beyond local development.
+      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PWD:-admin}
+      - GF_DATE_FORMATS_DEFAULT_TIMEZONE=browser
+    volumes:
+      - ./monitoring/grafana/dashboards/:/var/lib/grafana/dashboards/
+      - ./monitoring/grafana/provisioning/:/etc/grafana/provisioning/
+      - grafana-data:/var/lib/grafana
+    networks:
+      - skillforge-network
+    healthcheck:
+      # Unlike the other monitoring services this check uses `curl`; assumes it is present in the image — TODO confirm.
+      test: ["CMD", "curl", "-f", "http://localhost:3000/api/health"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+
+
+  # MongoDB exporter: connects to the `mongo` service and serves metrics on port 9216.
+  # Starts only after `mongo` reports healthy (see depends_on).
+  mongo-exporter:
+    image: percona/mongodb_exporter:0.30
+    command:
+      # NOTE(review): the same URI is also passed via MONGODB_URI below; one of the two is
+      # presumably redundant — confirm which one the exporter reads and drop the other.
+      # NOTE(review): username/password are interpolated unescaped, so values containing
+      # URI-reserved characters (e.g. `@`, `/`, `:`) would presumably break the URI — verify.
+      - --mongodb.uri=mongodb://${MONGODB_USERNAME:-root}:${MONGODB_PASSWORD:-password}@mongo:27017/admin?authSource=admin
+      - --collector.dbstats
+      - --collector.topmetrics
+      - --discovering-mode
+      - --compatible-mode
+      - --web.listen-address=:9216
+    environment:
+      - MONGODB_URI=mongodb://${MONGODB_USERNAME:-root}:${MONGODB_PASSWORD:-password}@mongo:27017/admin?authSource=admin
+    ports:
+      # Host port comes from MONGO_EXPORTER_PORT (default 9216) and maps to container port 9216.
+      - "${MONGO_EXPORTER_PORT:-9216}:9216"
+    networks:
+      - skillforge-network
+    healthcheck:
+      # Fetches the metrics page as a liveness probe; assumes `wget` is available in the image — TODO confirm.
+      test: ["CMD", "wget", "-qO", "-", "http://localhost:9216/metrics"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+    depends_on:
+      mongo:
+        condition: service_healthy
+
+  # ─── Loki for Centralized Logging ────────────────────────────────────────────────
+  # Log store for the stack: config is bind-mounted from ./monitoring/loki/loki-config.yml and the
+  # named volume `loki-data` is mounted at /loki.
+  # NOTE(review): `latest` is a moving tag, so the version pulled is not reproducible — consider pinning.
+  loki:
+    image: grafana/loki:latest
+    container_name: skillforge-loki
+    ports:
+      # Host port comes from LOKI_PORT (default 3100) and maps to container port 3100.
+      - "${LOKI_PORT:-3100}:3100"
+    volumes:
+      - ./monitoring/loki/loki-config.yml:/etc/loki/local-config.yaml
+      - loki-data:/loki
+    command: -config.file=/etc/loki/local-config.yaml
+    networks:
+      - skillforge-network
+    healthcheck:
+      # promtail waits for this check (depends_on: service_healthy), so it must be able to pass.
+      # Assumes `wget` is available in whatever image `latest` resolves to — TODO confirm.
+      test: ["CMD", "wget", "-qO", "-", "http://localhost:3100/ready"]
+      interval: 30s
+      timeout: 5s
+      retries: 3
+
+  # ─── Promtail for Log Collection ────────────────────────────────────────────────
+  # Log shipper for the stack; configured by ./monitoring/loki/promtail-config.yml and started
+  # only after the `loki` service reports healthy.
+  promtail:
+    image: grafana/promtail:latest
+    container_name: skillforge-promtail
+    volumes:
+      # Every mount is read-only for least privilege, matching the `:ro` that was already
+      # set on the container log directory. Note that `:ro` protects the socket file itself;
+      # it does not restrict what can be requested through the Docker API.
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+      - /var/lib/docker/containers:/var/lib/docker/containers:ro
+      - ./monitoring/loki/promtail-config.yml:/etc/promtail/config.yml:ro
+    command: -config.file=/etc/promtail/config.yml
+    networks:
+      - skillforge-network
+    depends_on:
+      loki:
+        condition: service_healthy
+
+  # ─── MailHog (SMTP Test Server) ────────────────────────────────────────────────
+  # Local mail catcher used for alert e-mails.
+  # NOTE(review): unlike the other monitoring services, the host ports here are fixed rather than
+  # configurable through environment variables, and no healthcheck is defined.
+  mailhog:
+    image: mailhog/mailhog
+    container_name: skillforge-mailhog
+    ports:
+      - "1025:1025"   # SMTP
+      - "8025:8025"   # Web UI
+    networks:
+      - skillforge-network
+
 volumes:
   mongo-data:
   redis-data:
   weaviate-genai-data:
+  # Named volumes mounted by the prometheus, grafana and loki services.
+  prometheus-data:
+  grafana-data:
+  loki-data:
 
 networks:
   skillforge-network:
 
@@ -9,4 +9,5 @@ langchain
 langchain-openai
 langchain_community
 tiktoken
+prometheus-fastapi-instrumentator
 pydantic
@@ -81,6 +81,16 @@ async def lifespan(app: FastAPI):
    stop_scheduler()
 
 # --- App Initialization ---
+from prometheus_fastapi_instrumentator import Instrumentator, metrics
+from prometheus_client import Counter, Gauge
+
+# Custom Prometheus metrics, created once at module import.
+# Token counter; no increment is visible in this part of the file — presumably updated by the LLM endpoints (TODO confirm).
+GENAI_TOKENS_USED_TOTAL = Counter("genai_tokens_used_total", "Total tokens used by GenAI")
+# Gauge held at the constant value 1; the running application version is carried in its `version` label.
+APP_VERSION_INFO = Gauge("app_version_info", "Application version info (constant 1)", ["version"])
+APP_VERSION_INFO.labels(version=APP_VERSION).set(1)
+# Counter incremented by the exception handlers: HTTPException with status >= 500 and any unhandled exception.
+FASTAPI_EXCEPTIONS_TOTAL = Counter("fastapi_exceptions_total", "Total count of exceptions in the GenAI service")
 app = FastAPI(
     title=APP_TITLE,
     version=APP_VERSION,
@@ -100,6 +110,15 @@ async def lifespan(app: FastAPI):
     root_path=os.getenv("API_ROOT_PATH", ""),
 )
 
+# Create the prometheus-fastapi-instrumentator instance with its default settings.
+instrumentator = Instrumentator()
+
+# Attach the instrumentation to the FastAPI app.
+instrumentator.instrument(app)
+
+# Add the metrics endpoint to the app (documented as `/metrics` in monitoring/README.md).
+instrumentator.expose(app)
+
 app.add_middleware(
     CORSMiddleware,
     allow_origins=os.getenv("CORS_ALLOW_ORIGINS", "*").split(","),
@@ -112,6 +131,9 @@ async def lifespan(app: FastAPI):
 @app.exception_handler(HTTPException)
 async def http_exception_handler(request: Request, exc: HTTPException):
     logger.error(f"HTTPException: {exc.detail}")
+    # Increment the exception counter for HTTP exceptions
+    if exc.status_code >= 500:  # Only count server errors as exceptions
+        FASTAPI_EXCEPTIONS_TOTAL.inc()
     return JSONResponse(
         status_code=exc.status_code,
         content={"error": "HTTPException", "detail": exc.detail}
@@ -120,6 +142,8 @@ async def http_exception_handler(request: Request, exc: HTTPException):
 @app.exception_handler(Exception)
 async def unhandled_exception_handler(request: Request, exc: Exception):
     logger.exception("Unhandled exception occurred")
+    # Increment the exception counter for Prometheus metrics
+    FASTAPI_EXCEPTIONS_TOTAL.inc()
     return JSONResponse(
         status_code=500,
         content={"error": "InternalServerError", "detail": str(exc)}
@@ -245,6 +269,15 @@ async def query_vector_db(request: QueryRequest):
     docs = [DocumentResult(**doc) for doc in docs_data]
     return QueryResponse(query=request.query_text, results=docs)
 
+# -------------------------------
+# --- Debug / Testing Endpoints --
+# -------------------------------
+# NOTE(review): this route is registered unconditionally in the visible code; consider gating it
+# behind a debug/env flag so a forced 500 cannot be triggered in production — TODO confirm intent.
+@app.get(f"{API_PREFIX}/debug/error", tags=["System"])
+async def debug_error():
+    """Deliberately raises a 500 error so monitoring dashboards can be tested."""
+    # RuntimeError is not an HTTPException, so the generic `Exception` handler
+    # (unhandled_exception_handler) answers with status 500 and increments FASTAPI_EXCEPTIONS_TOTAL.
+    raise RuntimeError("Forced debug error for monitoring test")
+
 # -------------------------------
 # --- LLM Endpoints -------------
 # -------------------------------
 
@@ -0,0 +1,40 @@
+# Observability & Alerting Quick Reference
+
+This repo includes a minimal **Prometheus + Alertmanager + Grafana** stack with **MailHog** for local alert testing — no external services needed.
+
+| Component   | Container     | Host Port(s)        | Purpose                                               |
+|-------------|--------------|---------------------|-------------------------------------------------------|
+| Prometheus  | `prometheus` | **${PROMETHEUS_PORT:-9090}**            | Scrapes metrics and evaluates rules (`alert.rules.yml`). |
+| Alertmanager| `alertmanager`| **${ALERTMANAGER_PORT:-9093}**           | Routes fired alerts via MailHog (`alertmanager.yml`). |
+| Grafana     | `grafana`    | **${GRAFANA_EXPOSED_PORT:-3001}**            | Dashboards + unified alert panel.                     |
+| MailHog     | `mailhog`    | **8025 UI** / 1025 SMTP | Captures alert e-mails for inspection.            |
+
+---
+
+## Viewing Alerts Live
+
+1. **Prometheus** – raw alert state  
+   [http://localhost:9090/alerts](http://localhost:9090/alerts) (default port; change it with `PROMETHEUS_PORT`)
+2. **Grafana** – visual alert panel  
+   [http://localhost:3001/dashboards](http://localhost:3001/dashboards) (default port; change it with `GRAFANA_EXPOSED_PORT`) → open *Logs* or any dashboard with **Active Alerts**
+3. **MailHog** – captured alert e-mails  
+   [http://localhost:8025/](http://localhost:8025/)
+
+> Alertmanager sends all alerts to `oncall@skillforge.ai` via the local SMTP relay at `mailhog:1025` (no TLS).  
+> **No external mail is sent.**
+
+---
+
+## Example Alert Screenshots
+
+![Prometheus Alerts UI](img/image.png)
+
+![MailHog UI](img/image-1.png)
+
+---
+
+## Notes
+
+- Prometheus scrape/rule interval ≈ **15s** — expect up to ~30s delay before alerts *fire*.
+- Dashboards use the pre-provisioned `DS_PROMETHEUS`. The *Logs* dashboard also uses **Loki**.
+- MailHog is ephemeral — inbox resets on container restart.
@@ -0,0 +1,62 @@
+# Monitoring Stack
+
+Concise reference for the local Prometheus + Grafana stack that monitors SkillForge micro-services.
+
+## Stack Components
+
+| Layer | Container | Host Port(s) | Purpose |
+| --- | --- | --- | --- |
+| Prometheus | `prometheus` | **${PROMETHEUS_PORT:-9090}** | Scrapes metrics & evaluates rules (`prometheus.yml`) |
+| Alertmanager | `alertmanager` | **${ALERTMANAGER_PORT:-9093}** | Sends alerts → MailHog (`alertmanager.yml`) |
+| Grafana | `grafana` | **${GRAFANA_EXPOSED_PORT:-3001}** | Dashboards & system-wide alert list |
+| Loki | `loki` | **${LOKI_PORT:-3100}** | Central log store |
+| Promtail | `promtail` | – | Ships container logs → Loki |
+| Mongo Exporter | `mongo-exporter` | **${MONGO_EXPORTER_PORT:-9216}** | MongoDB metrics |
+| MailHog | `mailhog` | **8025 UI** / 1025 SMTP | Captures alert e-mails |
+| Traefik | `reverse-proxy` | 80 / **8085** | Public entry & metrics |
+
+### Starting the Stack
+
+```bash
+# Start all services
+docker compose up -d
+```
+
+All containers use `skillforge-network`; Prometheus scrapes via container names (e.g. `skillforge-user-service:8082`).
+
+## Alerts
+
+Alert rules are defined in `monitoring/prometheus/alert.rules.yml`, covering:
+
+* Service availability (exporters, GenAI, Spring Boot)
+* JVM heap thresholds
+* Security issues (e.g. high auth failures)
+
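+See [ALERTS.md](ALERTS.md) for the alerting components, where to view alerts live (UI links), and timing notes; the alert rules themselves live in `monitoring/prometheus/alert.rules.yml`.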
+
+## Dashboards
+
+Dashboards are auto-loaded from `monitoring/grafana/dashboards/`:
+
+| File | Focus | Data-sources |
+| --- | --- | --- |
+| `genai.json` | GenAI FastAPI metrics | Prometheus |
+| `mongo.json` | MongoDB internals | Prometheus |
+| `server.json` | Spring Boot JVM + HTTP | Prometheus |
+| `user-custom-metrics.json` | Auth & signups | Prometheus |
+| `logs.json` | Logs + active alerts | Loki + Prometheus |
+
+Each dashboard includes test steps to simulate real traffic and failures.
+
+See `grafana/README.md` for full details.
+
+## Exporter Endpoints
+
+| Source | Endpoint | Scrape job |
+| --- | --- | --- |
+| Spring Boot | `/actuator/prometheus` | `spring-boot` |
+| GenAI | `/metrics` | `genai` |
+| Traefik | `:${TRAEFIK_DASHBOARD_PORT:-8085}/metrics` | `traefik` |
+| Mongo Exporter | `:9216/metrics` | `mongo_exporter` |
+
+Happy monitoring! Extend dashboards, adjust alert thresholds, or plug in more exporters as needed.