diff --git a/docker-compose.yaml b/docker-compose.yaml deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/GETTING_STARTED.md b/docs/GETTING_STARTED.md deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/TROUBLESHOOTING.md b/docs/TROUBLESHOOTING.md deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/TUTORIAL_ARGOCD.md b/docs/TUTORIAL_ARGOCD.md deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/TUTORIAL_CERT_MANAGER.md b/docs/TUTORIAL_CERT_MANAGER.md deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/TUTORIAL_GKE_SETUP.md b/docs/TUTORIAL_GKE_SETUP.md deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/TUTORIAL_INGRESS.md b/docs/TUTORIAL_INGRESS.md deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/TUTORIAL_LGTM.md b/docs/TUTORIAL_LGTM.md deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/alloy-config.md b/docs/alloy-config.md new file mode 100644 index 00000000..f44fad0b --- /dev/null +++ b/docs/alloy-config.md @@ -0,0 +1,323 @@ +# Grafana Alloy: Docker Compose Deployment Guide + +## Overview +Grafana Alloy is a unified telemetry collector that gathers logs, metrics, and traces from applications and forwards them to an LGTM stack. + +**Why use Alloy?** While Prometheus uses a pull model and scrapes metrics itself, Alloy acts as a central collector that can: +- Scrape logs from applications and push to Loki +- Collect metrics from applications and systems and forward to Prometheus +- Collect traces from various protocols and send to Tempo +- Provide a single deployment point for log, metric, and trace collection + +## Getting Started with Alloy Deployment + +### 1. 
Docker Compose Setup + +This deployment runs Alloy as a container that collects telemetry from your applications and forwards it to your external LGTM stack. Replace `<MONITORING_DOMAIN>` in the examples below with the base domain of your LGTM stack. + +**docker-compose.yml:** +```yaml +version: '3.8' + +services: + alloy: + image: grafana/alloy:latest + container_name: alloy + command: + - run + - /etc/alloy/config.alloy + - --server.http.listen-addr=0.0.0.0:12345 + - --storage.path=/var/lib/alloy/data + ports: + - "12345:12345" # Alloy UI/metrics + - "4317:4317" # OTLP gRPC (traces) + - "4318:4318" # OTLP HTTP (traces) + - "14268:14268" # Jaeger HTTP (traces) + volumes: + - ./config.alloy:/etc/alloy/config.alloy:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + - /var/log:/var/log:ro + - alloy-data:/var/lib/alloy/data + restart: unless-stopped + environment: + - LOKI_ENDPOINT=https://loki.<MONITORING_DOMAIN>/loki/api/v1/push + - TEMPO_ENDPOINT=https://tempo.<MONITORING_DOMAIN>:4317 + - PROMETHEUS_ENDPOINT=https://prometheus.<MONITORING_DOMAIN>/api/v1/write + + # Node Exporter for system metrics collection + node-exporter: + image: prom/node-exporter:latest + container_name: node-exporter + command: + - '--path.procfs=/host/proc' + - '--path.rootfs=/rootfs' + - '--path.sysfs=/host/sys' + - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)' + ports: + - "9100:9100" # Node exporter metrics endpoint + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + restart: unless-stopped + +volumes: + alloy-data: +``` + +### 2. Alloy Configuration + +The Alloy configuration defines how to collect logs, metrics, and traces from different sources and send them to your LGTM stack endpoints. 
+ +**config.alloy:** +```alloy +// ============================================================================= +// GRAFANA ALLOY CONFIGURATION +// Collect logs, metrics, and traces from various sources +// ============================================================================= + +logging { + level = "info" + format = "logfmt" +} + +// ============================================================================= +// LOKI - LOG COLLECTION AND FORWARDING +// ============================================================================= + +loki.write "loki" { + endpoint { + url = "https://loki.<MONITORING_DOMAIN>/loki/api/v1/push" // Replace <MONITORING_DOMAIN> with your LGTM stack domain + } +} + +// ============================================================================= +// DOCKER CONTAINER LOGS +// ============================================================================= +// Step 1: Discover all running Docker containers +discovery.docker "containers" { + host = "unix:///var/run/docker.sock" // Connect to Docker daemon socket +} + +// Step 2: Collect logs from discovered containers +loki.source.docker "containers" { + host = "unix:///var/run/docker.sock" // Docker daemon socket path + targets = discovery.docker.containers.targets // Use discovered containers as targets + forward_to = [loki.process.containers.receiver] // Send logs to processing stage +} + +// Step 3: Process and enrich logs with labels +loki.process "containers" { + stage.docker {} // Parse Docker-specific log format and metadata + + // Add custom labels for better filtering and searching + stage.labels { + values = { + container_name = "container_name", // Extract container name + host = "host", // Add host information + } + } + forward_to = [loki.write.loki.receiver] // Send processed logs to Loki +} + +// ============================================================================= +// SYSTEM LOGS (journald) - Collect systemd/system logs +// ============================================================================= +// Step 1: Collect logs from systemd 
journal +loki.source.journal "system_logs" { + forward_to = [loki.write.loki.receiver] // Send directly to Loki (no processing needed) + labels = { + job = "systemd-journal", // Label for easy identification in Grafana + } +} + +// ============================================================================= +// FILE LOGS - Collect application logs from files +// ============================================================================= +// Step 1: Define which log files to collect and assign job names +// (local.file_match expands the glob patterns into concrete file targets) +local.file_match "app_logs" { + path_targets = [ + // Custom application logs + {__path__ = "/var/log/myapp/*.log", job = "myapp"}, + {__path__ = "/var/log/mysql/*.log", job = "mysql"}, + {__path__ = "/var/log/redis/*.log", job = "redis"}, + {__path__ = "/var/log/auth.log", job = "auth"}, + {__path__ = "/var/log/syslog", job = "system"}, + ] +} + +// Step 2: Tail the matched files +loki.source.file "app_logs" { + targets = local.file_match.app_logs.targets // Files resolved by local.file_match + forward_to = [loki.write.loki.receiver] // Send logs directly to Loki +} + +// ============================================================================= +// PROMETHEUS METRICS COLLECTION - System and Application Metrics +// ============================================================================= + +// Step 1: Collect system metrics (CPU, memory, disk, network) +prometheus.scrape "node_metrics" { + targets = [{ + __address__ = "node-exporter:9100" // Node exporter endpoint for system metrics + }] + job_name = "node" // Job name for identification in Prometheus + forward_to = [prometheus.remote_write.prometheus.receiver] // Send to remote Prometheus +} + +// Step 2: Discover Docker containers that expose metrics +discovery.docker "metrics_targets" { + host = "unix:///var/run/docker.sock" // Connect to Docker daemon +} + +// Step 3: Filter containers to only include those with Prometheus metrics enabled +discovery.relabel "metrics_relabel" { + targets = discovery.docker.metrics_targets.targets + + // Keep only containers labeled with prometheus_scrape=true + rule { + source_labels = 
["__meta_docker_container_label_prometheus_scrape"] + regex = "true" + action = "keep" + } +} + +// Step 4: Scrape metrics from discovered application containers +prometheus.scrape "app_metrics" { + targets = discovery.relabel.metrics_relabel.output // Use filtered containers + scrape_interval = "15s" // How often to collect metrics + forward_to = [prometheus.remote_write.prometheus.receiver] // Send to remote Prometheus +} + +// Step 5: Send all collected metrics to remote Prometheus +prometheus.remote_write "prometheus" { + endpoint { + url = "https://prometheus.<MONITORING_DOMAIN>/api/v1/write" // Remote Prometheus endpoint (replace <MONITORING_DOMAIN> with your domain) + } +} + +// ============================================================================= +// TEMPO - TRACE COLLECTION +// ============================================================================= + +// Step 1: OTLP receiver for modern applications (OpenTelemetry) +otelcol.receiver.otlp "default" { + grpc { + endpoint = "0.0.0.0:4317" // gRPC endpoint for trace data + } + http { + endpoint = "0.0.0.0:4318" // HTTP endpoint for trace data + } + output { + traces = [otelcol.exporter.otlp.tempo.input] // Forward traces to Tempo exporter + } +} + +// Step 2: Export collected traces to Tempo +otelcol.exporter.otlp "tempo" { + client { + endpoint = "tempo.<MONITORING_DOMAIN>:4317" // Tempo server endpoint (replace <MONITORING_DOMAIN> with your domain) + tls { + insecure = false // Use secure TLS connection + } + } +} + +// Step 3: Jaeger receiver for legacy applications +otelcol.receiver.jaeger "default" { + protocols { + thrift_http { + endpoint = "0.0.0.0:14268" // HTTP endpoint for Jaeger thrift protocol + } + } + output { + traces = [otelcol.exporter.otlp.tempo.input] // Forward traces to Tempo exporter + } +} +``` + +### 3. Deployment + +```bash + +# Start Alloy +docker-compose up -d + +# Check the Alloy logs for startup or connectivity errors +docker-compose logs alloy + +# Check configuration syntax (prints the formatted file, fails on syntax errors) +docker exec alloy alloy fmt /etc/alloy/config.alloy +``` + +## Verification Queries + +Use these queries in Grafana to verify data is flowing correctly. 
+ +**Before running queries, select the appropriate data source in Grafana:** +- Loki for logs, Prometheus for metrics, Tempo for traces + +| Data Type | Data Source | Example Queries | Purpose | +|-----------|-------------|----------------|---------| +| **Logs** | Loki | `{job="myapp"} \|= "error"`
`{job="systemd-journal"} \|= "error"` | Verify log collection from applications, containers, and system | +| **Metrics** | Prometheus | `up{job="node"}`
`rate(node_cpu_seconds_total[5m])`
`alloy_build_info` | Check system metrics, CPU usage, and Alloy health | +| **Traces** | Tempo | `{}`
`{ status = error }` | Verify trace ingestion and find spans that ended in error | + +**References:** +- [Grafana LogQL Documentation](https://grafana.com/docs/loki/latest/logql/) +- [Prometheus PromQL Documentation](https://prometheus.io/docs/prometheus/latest/querying/basics/) +- [Grafana TraceQL Documentation](https://grafana.com/docs/tempo/latest/traceql/) + +## Troubleshooting + +### Check Alloy Status +```bash +# View logs +docker-compose logs -f alloy + +# Check configuration syntax +docker exec alloy alloy fmt /etc/alloy/config.alloy + +# View metrics +curl http://localhost:12345/metrics +``` + +### Common Issues + +**No logs in Loki:** +```bash +# Check if Alloy is receiving logs +curl http://localhost:12345/metrics | grep loki + +# Verify Loki endpoint +curl https://loki.<MONITORING_DOMAIN>/ready +``` + +**No metrics in Prometheus:** +```bash +# Check if Alloy is receiving metrics +curl http://localhost:12345/metrics | grep prometheus + +# Verify Prometheus endpoint (quote the URL so the shell does not interpret "?") +curl "https://prometheus.<MONITORING_DOMAIN>/api/v1/query?query=up" + +# Check node-exporter connectivity +curl http://localhost:9100/metrics | head -10 +``` + +**No traces in Tempo:** +```bash +# Check trace ingestion +curl http://localhost:12345/metrics | grep traces + +# Test Tempo endpoint +curl https://tempo.<MONITORING_DOMAIN>:3200/ready +``` + + +## Integration Flow + +This Alloy setup provides a complete telemetry pipeline: + +- **Logs**: Application/Docker logs → Alloy → Loki +- **Metrics**: System metrics (via Node Exporter) and application metrics → Alloy → Prometheus +- **Traces**: Application traces (OTLP/Jaeger) → Alloy → Tempo +- **Visualization**: All data available in Grafana + +--- + +**Want to learn more about Alloy?** Check out the [official Grafana Alloy documentation](https://grafana.com/docs/alloy/latest/) for advanced configuration options, integrations, and best practices for production deployments. 
diff --git a/docs/images/architecture-diagram.png b/docs/images/architecture-diagram.png deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/images/argocd-workflow.png b/docs/images/argocd-workflow.png deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/images/lgtm-flow.png b/docs/images/lgtm-flow.png deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/img/grafana-dashboard.png b/docs/img/grafana-dashboard.png new file mode 100644 index 00000000..b2a02361 Binary files /dev/null and b/docs/img/grafana-dashboard.png differ diff --git a/docs/img/grafana-datasource-loki.png b/docs/img/grafana-datasource-loki.png new file mode 100644 index 00000000..0cbc068e Binary files /dev/null and b/docs/img/grafana-datasource-loki.png differ diff --git a/docs/img/grafana-datasource-prometheus.png b/docs/img/grafana-datasource-prometheus.png new file mode 100644 index 00000000..b6c6b11e Binary files /dev/null and b/docs/img/grafana-datasource-prometheus.png differ diff --git a/docs/img/grafana-loki1.png b/docs/img/grafana-loki1.png new file mode 100644 index 00000000..8077cc77 Binary files /dev/null and b/docs/img/grafana-loki1.png differ diff --git a/docs/img/grafana-loki2.png b/docs/img/grafana-loki2.png new file mode 100644 index 00000000..afc88ffc Binary files /dev/null and b/docs/img/grafana-loki2.png differ diff --git a/docs/img/grafana-mimir.png b/docs/img/grafana-mimir.png new file mode 100644 index 00000000..74a84089 Binary files /dev/null and b/docs/img/grafana-mimir.png differ diff --git a/docs/img/grafana-mimir2.png b/docs/img/grafana-mimir2.png new file mode 100644 index 00000000..9a3e3f6f Binary files /dev/null and b/docs/img/grafana-mimir2.png differ diff --git a/docs/img/grafana-tempo1.png b/docs/img/grafana-tempo1.png new file mode 100644 index 00000000..ebaae9dd Binary files /dev/null and b/docs/img/grafana-tempo1.png differ diff --git a/docs/img/grafana-tempo2.png b/docs/img/grafana-tempo2.png new file mode 100644 
index 00000000..23a05e44 Binary files /dev/null and b/docs/img/grafana-tempo2.png differ diff --git a/docs/img/grafana-tempo3.png b/docs/img/grafana-tempo3.png new file mode 100644 index 00000000..5c4701f2 Binary files /dev/null and b/docs/img/grafana-tempo3.png differ diff --git a/docs/img/grafana-tempo4.png b/docs/img/grafana-tempo4.png new file mode 100644 index 00000000..49566efc Binary files /dev/null and b/docs/img/grafana-tempo4.png differ diff --git a/docs/img/kubectl-get-pods.png b/docs/img/kubectl-get-pods.png new file mode 100644 index 00000000..fd014c90 Binary files /dev/null and b/docs/img/kubectl-get-pods.png differ diff --git a/docs/img/logcli.png b/docs/img/logcli.png new file mode 100644 index 00000000..f046f066 Binary files /dev/null and b/docs/img/logcli.png differ diff --git a/docs/img/mimir.png b/docs/img/mimir.png new file mode 100644 index 00000000..b9516e98 Binary files /dev/null and b/docs/img/mimir.png differ diff --git a/docs/img/mimir1.png b/docs/img/mimir1.png new file mode 100644 index 00000000..ee41521b Binary files /dev/null and b/docs/img/mimir1.png differ diff --git a/docs/img/mimir2.png b/docs/img/mimir2.png new file mode 100644 index 00000000..7cec6518 Binary files /dev/null and b/docs/img/mimir2.png differ diff --git a/docs/img/mimirb2.png b/docs/img/mimirb2.png new file mode 100644 index 00000000..94401107 Binary files /dev/null and b/docs/img/mimirb2.png differ diff --git a/docs/img/monitor-netbird-stack-ps.png b/docs/img/monitor-netbird-stack-ps.png new file mode 100644 index 00000000..167d4e1a Binary files /dev/null and b/docs/img/monitor-netbird-stack-ps.png differ diff --git a/docs/img/tempo1.png b/docs/img/tempo1.png new file mode 100644 index 00000000..4b20a329 Binary files /dev/null and b/docs/img/tempo1.png differ diff --git a/docs/img/tempo2.png b/docs/img/tempo2.png new file mode 100644 index 00000000..29e18b26 Binary files /dev/null and b/docs/img/tempo2.png differ diff --git a/docs/kubernetes-observability.md 
b/docs/kubernetes-observability.md new file mode 100644 index 00000000..044b0324 --- /dev/null +++ b/docs/kubernetes-observability.md @@ -0,0 +1,184 @@ +# Kubernetes Observability Stack + +This document describes the architecture, deployment, and configuration of a production-grade observability stack on Google Kubernetes Engine (GKE). The stack integrates Loki, Grafana, Tempo, and Mimir (LGTM) with Prometheus to provide a complete monitoring solution for logs, metrics, and traces for any application or infrastructure. + +## Overview + +The observability stack is designed to be production-ready, scalable, and portable. It leverages: + +- **Loki**: For distributed logging. +- **Mimir**: For long-term Prometheus metrics storage. +- **Tempo**: For distributed tracing. +- **Prometheus**: For metrics collection and scraping. +- **Grafana**: For data visualization and dashboarding. +- **Google Cloud Storage (GCS)**: For cost-effective, durable backend storage. + +Infrastructure provisioning and application deployment are fully automated using Terraform and Helm. + +## Architecture + +The system uses a centralized ingress controller to route traffic to specific services. Workload Identity is configured to securely authenticate Kubernetes Service Accounts with Google Cloud APIs, eliminating the need for static service account keys. 
+ +```mermaid +graph TD + User([User / External Traffic]) -->|HTTPS| LB[GCP LoadBalancer] + LB -->|Routing| Ingress[Ingress Controller] + + subgraph "GKE Cluster (Namespace: observability)" + Ingress -->|Host: grafana.*| Grafana[Grafana UI] + Ingress -->|Host: loki.*| Loki[Loki Gateway] + Ingress -->|Host: mimir.*| Mimir[Mimir Gateway] + Ingress -->|Host: tempo.*| Tempo[Tempo Gateway] + Ingress -->|Host: prometheus.*| Prom[Prometheus] + + Prom -->|Remote Write| Mimir + Prom -->|Scrape| K8s[K8s Metrics] + end + + subgraph "Google Cloud Platform" + IAM[IAM & Workload Identity] + GCS[(Google Cloud Storage)] + end + + Loki -->|Read/Write| GCS + Mimir -->|Read/Write| GCS + Tempo -->|Read/Write| GCS + + K8sSA[K8s ServiceAccount] -.->|Impersonates| GCPSA[GCP ServiceAccount] + GCPSA -->|IAM Roles| GCS +``` + +## Prerequisites + +Before deploying the stack, ensure the following requirements are met: + +1. **Terraform**: Version 1.0 or later installed. +2. **Google Cloud CLI**: Installed and authenticated with `gcloud auth login` and `gcloud auth application-default login`. +3. **Kubernetes Access**: `kubectl` configured with context for the target GKE cluster. +4. **Permissions**: The authenticated user must have permissions to create GCS buckets, Service Accounts, and assign IAM roles (Storage Object Admin). + +## Configuration + +The deployment is configured via Terraform variables. Create a `terraform.tfvars` file in `lgtm-stack/terraform` to define your environment-specific values. + +| Variable | Description | Required | Default | +| :--- | :--- | :---: | :--- | +| `project_id` | Google Cloud Project ID. | Yes | - | +| `cluster_name` | Name of the target GKE cluster. | Yes | - | +| `region` | GCP Region for resources (e.g., `europe-west3`). | No | `us-central1` | +| `monitoring_domain` | Base domain for endpoints (e.g., `obs.example.com`). | Yes | - | +| `ingress_class_name` | Ingress Class Name (e.g., `nginx`, `traefik`). 
| No | `nginx` | +| `cert_issuer_name` | Name of the Cert-Manager Issuer (e.g., `letsencrypt-prod`). | No | `letsencrypt-prod` | +| `grafana_admin_password` | Initial admin password for Grafana. | Yes | - | + +### Ingress Compatibility + +This module is agnostic to the Ingress Controller and Certificate Issuer. By default, it assumes `nginx` and `letsencrypt-prod`. To use a different configuration (e.g., Traefik or a custom ClusterIssuer), update the `ingress_class_name` and `cert_issuer_name` variables in `terraform.tfvars`. + +## Installation + +1. **Initialize Terraform** + + Navigate to the Terraform directory and initialize the project to download required providers and modules. + + ```bash + cd ../lgtm-stack/terraform + terraform init + ``` + +2. **Plan Deployment** + + Generate an execution plan to verify the resources that will be created. + + ```bash + terraform plan + ``` + +3. **Apply Configuration** + + Execute the plan to provision infrastructure and deploy the application stack. + + ```bash + terraform apply + ``` + +## Verification + +### Service Status + +Verify that all pods are running successfully in the `<NAMESPACE>` (default: `observability`) namespace. + +```bash +kubectl get pods -n <NAMESPACE> +``` + +![Kubectl Get Pods](img/kubectl-get-pods.png) + +### Public Endpoints + +The stack exposes the following endpoints for data ingestion and visualization. Replace `<MONITORING_DOMAIN>` with your configured domain (e.g., `stack.observe.camer.digital`). + +| Service | Endpoint URL | Purpose | Method | Notes | +| :--- | :--- | :--- | :--- | :--- | +| **Grafana** | `https://grafana.<MONITORING_DOMAIN>` | **Visualization** | GET | Main UI for dashboards and alerts. | +| **Loki** | `https://loki.<MONITORING_DOMAIN>/loki/api/v1/push` | **Logs Ingestion** | POST | Send logs via HTTP (JSON/Snappy). | +| **Mimir** | `https://mimir.<MONITORING_DOMAIN>/api/v1/push` | **Metrics Ingestion** | POST | Send metrics via Prometheus Remote Write. 
| +| **Tempo** (HTTP) | `https://tempo-push.<MONITORING_DOMAIN>/v1/traces` | **Traces Ingestion** | POST | Send traces via OTLP HTTP. | +| **Tempo** (gRPC) | `tempo-grpc.<MONITORING_DOMAIN>:443` | **Traces Ingestion** | gRPC | Send traces via OTLP gRPC. | + +### Manual Verification + +You can verify connectivity and the write path (ingestion) by querying the exposed endpoints and sending synthetic data to them. Replace `<MONITORING_DOMAIN>` with your configured domain. + +**Example Verification (Mimir Connectivity):** + +```bash +curl -v -G "https://mimir.<MONITORING_DOMAIN>/prometheus/api/v1/query" \ + --data-urlencode 'query=up' +``` + +**Example Verification (Loki Push):** + +```bash +# Set timestamp to avoid shell quoting issues +TS=$(date +%s)000000000 +curl -v -H "Content-Type: application/json" -XPOST \ + "https://loki.<MONITORING_DOMAIN>/loki/api/v1/push" \ + --data-raw "{\"streams\": [{ \"stream\": { \"test\": \"manual_curl\" }, \"values\": [ [ \"$TS\", \"manual_test_log\" ] ] }]}" +``` + +### Useful API Documentation + +For advanced usage, refer to the official API documentation: + +- **Loki**: [Push API (Protobuf/JSON)](https://grafana.com/docs/loki/latest/reference/api/#push-log-entries-to-loki) +- **Mimir**: [Prometheus Remote Write API](https://grafana.com/docs/mimir/latest/references/http-api/#remote-write) +- **Tempo**: [OTLP HTTP API](https://grafana.com/docs/tempo/latest/configuration/?pg=docs-tempo-latest-api-otlp-http#otlp) + +### Dashboard Access + +Access the Grafana dashboard using the domain configured in `monitoring_domain`. + +- **URL**: `https://grafana.<MONITORING_DOMAIN>` +- **Username**: `admin` +- **Password**: *the value of `grafana_admin_password`* + +![Grafana Dashboard](img/grafana-dashboard.png) + +## Maintenance + +### Upgrades + +To upgrade components, update the version variables in `terraform.tfvars` or `variables.tf` and re-run `terraform apply`. + +**Note**: The current stack uses the **Loki Helm chart v6.20.0**. Major version upgrades should be tested in a staging environment first to ensure compatibility with the storage schema. 
+ +### Uninstallation + +To remove all resources created by this module: + +```bash +terraform destroy +``` + +**Warning**: Google Cloud Storage buckets containing observability data have `force_destroy` set to `false` to prevent accidental data loss. If you intend to delete the data, you must empty the buckets manually before running destroy. diff --git a/docs/manual-lgtm-deployment.md b/docs/manual-lgtm-deployment.md new file mode 100644 index 00000000..ffc21282 --- /dev/null +++ b/docs/manual-lgtm-deployment.md @@ -0,0 +1,84 @@ +# Manual LGTM Stack Deployment + +This guide explains how to manually deploy an observability stack. + +## 1. Deployment Constraints + +This deployment expects an external Docker network named `netbird_netbird` to exist. This is typically created by the NetBird management stack. + +If you do not have NetBird running, you must create this network manually to avoid errors: + +```bash +docker network create netbird_netbird +``` + +## 2. Deploying the Stack + +1. Navigate to the manual configuration directory: + + ```bash + cd ../lgtm-stack/manual + ``` + +2. Start the services in detached mode: + + ```bash + docker compose up -d + ``` + +This will spin up the following containers: + +* `loki` (Logs) +* `prometheus` (Metrics) +* `grafana` (Visualization) +* `alloy` (Collector) +* `node-exporter` & `container-metrics` (Host monitoring) + +## 3. Verifying the Deployment ("Pods") + +To check the status of your containers (often referred to as pods in Kubernetes contexts), run: + +```bash +docker compose ps +``` + +You should see all services with a status of `Up` (and `healthy` where applicable). + +![Sample monitoring stack](img/monitor-netbird-stack-ps.png) + +## 4. Accessing Grafana + +* **URL**: `http://localhost:3000` (or your server's IP address) +* **Default Credentials**: + * User: `admin` + * Password: `admin` + +## 5. Verifying Data Sources + +Once logged into Grafana, you should verify that it can connect to Prometheus and Loki. 
+ +### 5.1 Verify Prometheus + +1. Navigate to **Connections** > **Data Sources**. +2. Click on **Prometheus**. +3. Scroll to the bottom and click **Save & test**. +4. You should see a green message: *“Successfully queried the Prometheus API.”* + +![Grafana Prometheus data source](img/grafana-datasource-prometheus.png) + +### 5.2 Verify Loki + +1. Navigate to **Connections** > **Data Sources**. +2. Click on **Loki**. +3. Scroll to the bottom and click **Save & test**. +4. You should see a green message indicating the data source is connected. + +![Grafana Loki data source](img/grafana-datasource-loki.png) + +## 6. Cleanup + +To stop and remove the stack: + +```bash +docker compose down +``` diff --git a/docs/testing-monitoring-stack-deployment.md b/docs/testing-monitoring-stack-deployment.md new file mode 100644 index 00000000..b97d4292 --- /dev/null +++ b/docs/testing-monitoring-stack-deployment.md @@ -0,0 +1,1190 @@ +# Grafana Observability Stack Testing Documentation + +## Project: - GKE Deployment + +## Table of Contents + +1. [Overview](#overview) +2. [Architecture](#architecture) +3. [Prerequisites](#prerequisites) +4. [Pre-Testing Health Checks](#pre-testing-health-checks) +5. [Component Testing](#component-testing) +6. [Remote Testing](#remote-testing) + +--- + + +## 1. Overview + +### 1.1 Purpose + +This documentation provides comprehensive testing procedures for the Grafana observability stack deployed on GKE cluster ``. The testing ensures all components are functioning correctly. + +### 1.2 Stack Components + +- **Grafana**: Dashboard and visualization +- **Loki**: Log aggregation and storage +- **Mimir**: Long-term metrics storage (Prometheus-compatible) +- **Tempo**: Distributed tracing backend +- **Prometheus**: Metrics collection and scraping +- **Alloy**: Unified telemetry collector + +--- + + + +## 2. 
Architecture + +### 2.1 Component Flow Diagram + +```mermaid +graph LR + %% Define Styles + classDef input fill:#000,stroke:#333,stroke-width:2px,color:#fff; + classDef storage fill:#000,stroke:#333,stroke-width:2px,stroke-dasharray: 5 5,color:#fff; + classDef component fill:#000,stroke:#333,stroke-width:2px,color:#fff; + classDef grafana fill:#000,stroke:#333,stroke-width:2px,color:#fff; + + %% Nodes + Ext[External Sources]:::input + LB[Load Balancer]:::component + GW[Gateway]:::component + + %% Write Path Nodes + Dist[Distributor]:::component + Ing[Ingester]:::component + Store[(Storage)]:::storage + + %% Read Path Nodes + Querier[Querier]:::component + QF[Query Frontend]:::component + Graf[Grafana]:::grafana + + %% Flows + Ext --> LB + LB --> GW + + %% Write Path (Top) + GW -->|Write / Push| Dist + Dist --> Ing + Ing --> Store + + %% Read Path (Bottom - branching from Gateway) + GW -.->|Read / Pull| Querier + Querier --> QF + QF --> Graf + + %% Internal Data Fetching (Implicit) + Querier -.->|Fetch Data| Ing + Querier -.->|Fetch Data| Store +``` + +### 2.2 Service Endpoints + +| Component | Internal Service | Port | Type | +|-----------|-----------------|------|------| +| Grafana | monitoring-grafana | 80 | ClusterIP | +| Loki Gateway | monitoring-loki-gateway | 80 | ClusterIP | +| Mimir NGINX | monitoring-mimir-nginx | 80 | ClusterIP | +| Tempo Gateway | monitoring-tempo-gateway | 3200 | ClusterIP | +| Prometheus | monitoring-prometheus-server | 80 | ClusterIP | + +--- + + + +## 3. 
Prerequisites + +### 3.1 Required Tools Installation + +- [Kubectl (Kubernetes CLI)](https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/) +- [Helm (Package Manager)](https://helm.sh/docs/intro/install/) +- [JQ (JSON Processor)](https://lindevs.com/install-jq-on-ubuntu) +- [LogCLI (Loki CLI)](https://grafana.com/docs/loki/latest/query/logcli/getting-started/) +- [Grafana Alloy (Collector)](https://grafana.com/docs/alloy/latest/set-up/install/linux/) + +### 3.2 GKE Cluster Access + +```bash +# Authenticate with GCP +gcloud auth login + +# Set project +gcloud config set project + +# Get cluster credentials +gcloud container clusters get-credentials \ + --zone= \ + --project= + +# Verify access +kubectl get nodes +kubectl get namespaces +kubectl get pods -n +``` + +### 3.3 Access Credentials + +```bash +# Get Grafana admin password +export GRAFANA_PASSWORD=$(kubectl get secret -n \ + monitoring-grafana -o jsonpath="{.data.admin-password}" | base64 --decode) +echo "Grafana Password: $GRAFANA_PASSWORD" + +# Store for later use +echo "export GRAFANA_PASSWORD='$GRAFANA_PASSWORD'" >> ~/.bashrc +source ~/.bashrc +``` + +--- + + + +## 4. Pre-Testing Health Checks + +### 4.1 Comprehensive Health Check + +```bash +# Check all pod statuses +kubectl get pods -n -o wide + +# Identify any unhealthy pods +kubectl get pods -n --field-selector=status.phase!=Running + +# Check pod events +kubectl get events -n --sort-by='.lastTimestamp' | tail -20 + +# Verify all services have endpoints +kubectl get endpoints -n + +# Check resource usage +kubectl top nodes +kubectl top pods -n --sort-by=memory +``` + +### 4.2 Service Connectivity Test + +```bash +# To view services +kubectl get svc -n +# Test internal DNS resolution +kubectl run -n dns-test --image=busybox:1.28 \ + --rm -it --restart=Never -- \ + nslookup + +# Test all major endpoints +## 1. 
Test Loki Gateway (Needs /loki prefix usually) +kubectl run -n test-loki --image=curlimages/curl -it --rm --restart=Never -- \ + curl -v --max-time 5 "http:///loki/ready" + +## 2. Test Mimir Gateway (Mimics Prometheus API) +kubectl run -n test-mimir --image=curlimages/curl -it --rm --restart=Never -- \ + curl -v --max-time 5 "http:///api/v1/status/buildinfo" + +## 3. Test Tempo Gateway (Often requires /status or is restricted) +kubectl run -n test-tempo --image=curlimages/curl -it --rm --restart=Never -- \ + curl -v --max-time 5 "http:///ready" +``` + +### 4.3 Storage Health + +```bash +# Check PVC status +kubectl get pvc -n + +# Verify all are Bound +kubectl get pvc -n -o wide | grep -v Bound || echo "All PVCs are Bound" +# Check disk usage on ingesters +kubectl exec -n -- df -h + +``` + +--- + + + +## 5. Component Testing(still on cluster) + +### 5.1 Deploy Test Generators + +Create all-generators.yaml file + +```bash +# ========================================================= +# LOG GENERATORS +# ========================================================= +apiVersion: apps/v1 +kind: Deployment +metadata: + name: log-generator + namespace: test-apps + labels: + app: log-generator +spec: + replicas: 3 + selector: + matchLabels: + app: log-generator + template: + metadata: + labels: + app: log-generator + spec: + containers: + - name: json-logs + image: mingrammer/flog:0.4.3 + args: + - --loop + - --format=json + - --number=10 + - --delay=1s + resources: + requests: + memory: "64Mi" + cpu: "100m" + limits: + memory: "128Mi" + cpu: "200m" + - name: structured-logs + image: busybox:1.28 + command: ["/bin/sh"] + args: + - -c + - | + while true; do + echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) level=info msg=\"Application started\" service=api version=1.0.0" + sleep 2 + echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) level=warn msg=\"High memory usage\" memory_percent=85 threshold=80" + sleep 3 + echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) level=error msg=\"Database connection failed\" 
error=\"timeout after 30s\" retry_count=3" + sleep 2 + echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) level=debug msg=\"Cache hit\" key=user:12345 hit_rate=92.5" + sleep 3 + done + resources: + requests: + memory: "32Mi" + cpu: "50m" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: error-simulator + namespace: test-apps + labels: + app: error-simulator +spec: + replicas: 2 + selector: + matchLabels: + app: error-simulator + template: + metadata: + labels: + app: error-simulator + spec: + containers: + - name: error-generator + image: busybox:1.28 + command: ["/bin/sh"] + args: + - -c + - | + while true; do + echo "ERROR: Out of memory exception in thread main" + sleep 5 + echo "FATAL: Unable to connect to database: connection refused" + sleep 7 + echo "ERROR: HTTP 500 Internal Server Error on /api/users" + sleep 4 + echo "CRITICAL: Disk usage exceeded 95% on /data" + sleep 6 + done + resources: + requests: + memory: "32Mi" + cpu: "50m" +--- +# ========================================================= +# METRICS GENERATORS +# ========================================================= +apiVersion: apps/v1 +kind: Deployment +metadata: + name: metrics-generator + namespace: test-apps + labels: + app: metrics-generator +spec: + replicas: 3 + selector: + matchLabels: + app: metrics-generator + template: + metadata: + labels: + app: metrics-generator + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + spec: + containers: + - name: prometheus-example-app + image: quay.io/brancz/prometheus-example-app:v0.3.0 + ports: + - containerPort: 8080 + name: metrics + resources: + requests: + memory: "64Mi" + cpu: "100m" + limits: + memory: "128Mi" + cpu: "200m" + livenessProbe: + httpGet: + path: / + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 10 +--- +apiVersion: v1 +kind: Service +metadata: + name: metrics-generator + namespace: test-apps + labels: + app: metrics-generator +spec: + selector: + app: 
metrics-generator + ports: + - port: 8080 + targetPort: 8080 + name: metrics +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: metrics-generator-monitor + namespace: test-apps + labels: + app: metrics-generator + release: monitoring +spec: + namespaceSelector: + matchNames: + - test-apps + selector: + matchLabels: + app: metrics-generator + endpoints: + - port: metrics + interval: 30s + path: /metrics +--- +# ========================================================= +# TRACE GENERATORS +# ========================================================= +apiVersion: apps/v1 +kind: Deployment +metadata: + name: trace-generator + namespace: test-apps + labels: + app: trace-generator +spec: + replicas: 2 + selector: + matchLabels: + app: trace-generator + template: + metadata: + labels: + app: trace-generator + spec: + containers: + - name: telemetrygen + image: ghcr.io/open-telemetry/opentelemetry-collector-contrib/telemetrygen:latest + args: + - traces + - --otlp-endpoint=otel-collector.test-apps.svc.cluster.local:4317 + - --otlp-insecure + - --rate=1 + - --duration=1h + - --workers=1 + resources: + requests: + memory: "64Mi" + cpu: "50m" + limits: + memory: "128Mi" + cpu: "100m" +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: microservice-simulator + namespace: test-apps + labels: + app: microservice-sim +spec: + replicas: 2 + selector: + matchLabels: + app: microservice-sim + template: + metadata: + labels: + app: microservice-sim + spec: + containers: + - name: frontend + image: ghcr.io/open-telemetry/opentelemetry-collector-contrib/telemetrygen:latest + args: + - traces + - --otlp-endpoint=monitoring-tempo-distributor..svc.cluster.local:4317 + - --otlp-insecure + - --rate=1 + - --duration=1h + - --service=frontend-service + resources: + requests: + memory: "64Mi" + cpu: "50m" +--- +# ========================================================= +# ALLOY COLLECTOR (for Log scraping to Loki) +# 
========================================================= +apiVersion: v1 +kind: ConfigMap +metadata: + name: alloy-config + namespace: test-apps +data: + config.alloy: | + logging { + level = "info" + format = "json" + } + + discovery.kubernetes "pods" { + role = "pod" + } + + discovery.relabel "pod_logs" { + targets = discovery.kubernetes.pods.targets + rule { + source_labels = ["__meta_kubernetes_namespace"] + target_label = "namespace" + } + rule { + source_labels = ["__meta_kubernetes_pod_name"] + target_label = "pod" + } + rule { + source_labels = ["__meta_kubernetes_pod_label_app"] + target_label = "app" + } + rule { + source_labels = ["__meta_kubernetes_pod_container_name"] + target_label = "container" + } + } + + loki.source.kubernetes "pod_logs" { + targets = discovery.relabel.pod_logs.output + forward_to = [loki.write.default.receiver] + } + + loki.write "default" { + endpoint { + url = "http://monitoring-loki-gateway..svc.cluster.local/loki/api/v1/push" + } + } +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: alloy-collector + namespace: test-apps +spec: + selector: + matchLabels: + app: alloy-collector + template: + metadata: + labels: + app: alloy-collector + spec: + serviceAccountName: alloy-collector + containers: + - name: alloy + image: grafana/alloy:latest + args: + - run + - /etc/alloy/config.alloy + - --server.http.listen-addr=0.0.0.0:12345 + volumeMounts: + - name: config + mountPath: /etc/alloy + - name: varlog + mountPath: /var/log + readOnly: true + ports: + - containerPort: 12345 + name: http + resources: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "256Mi" + cpu: "200m" + volumes: + - name: config + configMap: + name: alloy-config + - name: varlog + hostPath: + path: /var/log + type: Directory +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: alloy-collector + namespace: test-apps +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: alloy-collector +rules: +- 
apiGroups: [""] + resources: + - nodes + - nodes/proxy + - pods + - events + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: + - services + - endpoints + verbs: ["get", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: alloy-collector +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: alloy-collector +subjects: +- kind: ServiceAccount + name: alloy-collector + namespace: test-apps +--- +# ========================================================= +# TEMPO OTLP RECEIVER (for Traces) +# ========================================================= +apiVersion: v1 +kind: ConfigMap +metadata: + name: otel-collector-config + namespace: test-apps +data: + otel-collector-config.yaml: | + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + + exporters: + otlp: + endpoint: monitoring-tempo-distributor..svc.cluster.local:4317 + tls: + insecure: true + + service: + pipelines: + traces: + receivers: [otlp] + exporters: [otlp] +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: otel-collector + namespace: test-apps + labels: + app: otel-collector +spec: + replicas: 1 + selector: + matchLabels: + app: otel-collector + template: + metadata: + labels: + app: otel-collector + spec: + containers: + - name: otel-collector + image: otel/opentelemetry-collector-contrib:latest + args: + - "--config=/etc/otel-collector-config.yaml" + volumeMounts: + - name: otel-collector-config + mountPath: /etc/otel-collector-config.yaml + subPath: otel-collector-config.yaml + ports: + - containerPort: 4317 + name: otlp-grpc + - containerPort: 4318 + name: otlp-http + resources: + requests: + memory: "256Mi" + cpu: "200m" + limits: + memory: "512Mi" + cpu: "500m" + volumes: + - name: otel-collector-config + configMap: + name: otel-collector-config +--- +apiVersion: v1 +kind: Service +metadata: + name: otel-collector + namespace: test-apps +spec: + selector: + 
app: otel-collector + ports: + - port: 4317 + targetPort: 4317 + name: otlp-grpc + - port: 4318 + targetPort: 4318 + name: otlp-http +``` + +```bash +# Install the Prometheus Operator CRDs (needed for the ServiceMonitor resource) +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts && helm repo update +helm install prometheus prometheus-community/kube-prometheus-stack +kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/main/example/prometheus-operator-crd-full/monitoring.coreos.com_alertmanagerconfigs.yaml --force-conflicts + +# Create test-apps namespace +kubectl create namespace test-apps +# Apply all generators +kubectl apply -f all-generators.yaml -n test-apps + +# Verify pods are running (press Ctrl+C to stop watching) +kubectl get pods -n test-apps -w +kubectl wait --for=condition=ready pod -l app=log-generator -n test-apps --timeout=120s +``` + +#### 5.1.1 Verify Logs Are Being Generated (Loki testing) + +```bash +# Check log-generator pods +kubectl logs -n test-apps -l app=log-generator -c json-logs --tail=20 +kubectl logs -n test-apps -l app=log-generator -c structured-logs --tail=20 +kubectl logs -n test-apps -l app=error-simulator --tail=20 +``` + +#### 5.1.2 Query Logs via LogCLI + +```bash +# Port-forward to Loki +kubectl port-forward -n <NAMESPACE> svc/monitoring-loki-gateway 3100:80 & +PF_PID=$!
+sleep 3 + +# Export Loki address +export LOKI_ADDR=http://localhost:3100 + +# Test 1: Query all logs from test-apps namespace +logcli query '{namespace="test-apps"}' --limit=50 --since=5m + +# Test 2: Query JSON logs only +logcli query '{app="log-generator", container="json-logs"}' --limit=20 + +# Test 3: Query structured logs +logcli query '{app="log-generator", container="structured-logs"}' --limit=20 + +# Test 4: Query error logs +logcli query '{namespace="test-apps"} |~ "(?i)error|fatal|critical"' --limit=30 --since=5m + +# Test 5: Query by pod name +logcli query '{pod=~"log-generator.*"}' --limit=20 + +# Test 6: Query with label matching +logcli query '{app="error-simulator"}' --limit=15 + +# Cleanup +kill $PF_PID +``` + +![img](img/logcli.png) + +--- + +### 5.2 Mimir Testing (Metrics Storage) + +#### 5.2.1 Verify Metrics Generator is Running + +```bash +# Check metrics-generator pods +kubectl get pods -n test-apps -l app=metrics-generator + +# Check if ServiceMonitor was created +kubectl get servicemonitor -n test-apps +kubectl describe servicemonitor metrics-generator-monitor -n test-apps + +# Verify endpoints are registered +kubectl get endpoints -n test-apps metrics-generator +``` + +#### 5.2.2 Query Metrics via PromQL (Prometheus API) + +```bash +# Port-forward to Prometheus +kubectl port-forward -n svc/monitoring-prometheus-server 9090:80 & +sleep 3 + +# Test 1: Check if metrics are being scraped +curl -s "http://localhost:9090/api/v1/query?query=prometheus_example_app_up" | jq . + +# Test 2: List all metrics from metrics-generator +curl -s "http://localhost:9090/api/v1/label/__name__/values" | jq . | grep prometheus_example + +# Test 3: Query specific metrics +curl -s "http://localhost:9090/api/v1/query?query=prometheus_example_app_requests_total" | jq . + +# Test 4: Query rate of requests +curl -s "http://localhost:9090/api/v1/query?query=rate(prometheus_example_app_requests_total%5B5m%5D)" | jq . 
+ +# Test 5: Check metrics count over time +curl -s "http://localhost:9090/api/v1/query_range?query=prometheus_example_app_up&start=$(date -d '30 minutes ago' +%s)&end=$(date +%s)&step=300" | jq . +``` + +![img](img/mimir1.png) + +![img](img/mimir2.png) + +#### 5.2.3 Check Metrics in Mimir Backend + +```bash +# Port-forward to Mimir NGINX gateway +kubectl port-forward -n <NAMESPACE> svc/monitoring-mimir-nginx 8081:80 & +sleep 3 + +# Test 1: Query metrics from Mimir directly +curl -s "http://localhost:8081/prometheus/api/v1/query?query=prometheus_example_app_up" | jq . + +# Test 2: List label names +curl -s "http://localhost:8081/prometheus/api/v1/labels" | jq . + +# Test 3: Check ingestion metrics +curl -s "http://localhost:8081/prometheus/api/v1/query?query=cortex_ingester_ingested_samples_total" | jq . +``` + +![img](img/mimir.png) + +--- + +### 5.3 Tempo Testing (Distributed Tracing) + +#### 5.3.1 Verify Trace Generators Are Running + +```bash +# Check trace generators +kubectl get pods -n test-apps -l app=trace-generator +kubectl get pods -n test-apps -l app=microservice-sim + +# Check OTEL collector +kubectl get pods -n test-apps -l app=otel-collector + +# Verify OTEL collector service +kubectl get svc -n test-apps otel-collector +``` + +#### 5.3.2 Check Trace Generator Logs + +```bash +# Check trace-generator (telemetrygen) logs +kubectl logs -n test-apps -l app=trace-generator -c telemetrygen --tail=30 + +# Check microservice-simulator logs +kubectl logs -n test-apps -l app=microservice-sim -c frontend --tail=30 + +# Check OTEL collector for trace ingestion +kubectl logs -n test-apps -l app=otel-collector --tail=50 +``` + +#### 5.3.3 Query Traces via Tempo + +```bash +# Port-forward to Tempo +kubectl port-forward -n <NAMESPACE> svc/monitoring-tempo-gateway 3200:80 & +sleep 3 + +# Test 1: Check if Tempo is receiving traces +curl -s "http://localhost:3200/api/search" | jq .
+ +# Test 2: Query traces for synthetic-load-test service +curl -s "http://localhost:3200/api/search?service=synthetic-load-test" | jq . + +# Test 3: Query traces for frontend-service +curl -s "http://localhost:3200/api/search?service=frontend-service" | jq . + +# Test 4: Query by trace duration (find slow traces) +curl -s "http://localhost:3200/api/search?minDuration=100ms" | jq . + +# Test 5: Get specific trace details +TRACE_ID=$(curl -s "http://localhost:3200/api/search?service=synthetic-load-test&limit=1" | jq -r '.traces[0].traceID') +curl -s "http://localhost:3200/api/traces/$TRACE_ID" | jq . +``` + +--- + +![img](img/tempo1.png) + +![img](img/tempo2.png) + +### 5.4 End-to-End Integration Testing + +#### 5.4.1 Validate Data Flow (Logs → Loki → Grafana) + +```bash +# Step 1: Generate logs +kubectl logs -n test-apps -l app=log-generator -c structured-logs --tail=5 + +# Step 2: Verify logs in Loki via LogCLI +export LOKI_ADDR=http://localhost:3100 +logcli query '{app="log-generator"}' --limit=5 + +``` + +#### 5.4.2 Validate Data Flow (Metrics → Prometheus → Mimir → Grafana) + +```bash +# Step 1: Check metrics are being generated +kubectl port-forward -n test-apps svc/metrics-generator 8082:8080 & +sleep 3 +curl -s "http://localhost:8082/metrics" | head -20 + +# Step 2: Verify Prometheus scrapes them +kubectl port-forward -n lgtm svc/monitoring-prometheus-server 9091:80 & +sleep 3 +curl -s "http://localhost:9091/api/v1/targets" | jq '.data.activeTargets[] | select(.labels.job=="test-apps/metrics-generator")' + +# Step 3: Query from Mimir +kubectl port-forward -n lgtm svc/monitoring-mimir-query-frontend 8083:8080 & +sleep 3 +curl -s "http://localhost:8083/prometheus/api/v1/query?query=http_requests_total" | jq .
+ +``` + +![img](img/mimirb2.png) + +#### 5.4.3 Validate Data Flow (Traces → OTEL → Tempo → Grafana) + +```bash +# Step 1: Verify OTEL collector is receiving traces +kubectl logs -n test-apps -l app=otel-collector | grep -i "span\|trace" + +# Step 2: Query Tempo for traces +curl -s "http://localhost:3200/api/search?service=synthetic-load-test&limit=1" | jq '.traces[0] | {traceID, spanSet}' + +``` + +--- + +#### 5.5 Validate No Errors in Collectors + +```bash +# Check Alloy logs for errors +kubectl logs -n test-apps -l app=alloy-collector | grep -i "error\|warning" | tail -20 + +# Check OTEL collector logs for errors +kubectl logs -n test-apps -l app=otel-collector | grep -i "error" | tail -20 + +# Check for pod restarts (should be 0) +kubectl get pods -n test-apps -o wide | grep -v "0/1.*0" +``` + +--- + +### 5.6 Cleanup After Testing + +```bash +# Delete test namespace +kubectl delete namespace test-apps + +# Kill port-forwards +pkill -f "kubectl port-forward" + +# Verify cleanup +kubectl get namespace | grep test-apps # Should return nothing +``` + +--- + + + +## 6. 
Remote Testing + +### 6.1 Remote Testing from External Machine + +#### 6.1.1 Test Grafana Remotely + +```bash +# While still connected to your GKE cluster, list the ingress endpoints +kubectl get ingress -n <NAMESPACE> +# On an external machine +# Test health endpoint +curl -v http://$GRAFANA_IP/api/health + +# Test login (the default admin user and password are both "admin") +curl -X POST https://$GRAFANA_ENDPOINT/login \ + -H "Content-Type: application/json" \ + -d '{"user":"admin","password":"'$GRAFANA_PASSWORD'"}' \ + -c cookies.txt + +``` + +#### 6.1.2 Test Loki/Prometheus/Mimir/Tempo Remotely + +```bash +# Deploy a containerized app (our example uses a locally self-hosted NetBird instance) +# Note: to use a local domain name, set NETBIRD_DOMAIN to "localhost" +export NETBIRD_DOMAIN=<YOUR_DOMAIN> +bash getting-started-with-zitadel.sh + +# Create a .alloy file +mkdir ~/observability-agent +nano -p ~/observability-agent/config.alloy +``` + +```alloy +// Paste this in ~/observability-agent/config.alloy +// ========================================================= +// 1.
LOGS: UNIVERSAL DISCOVERY (LOKI) +// ========================================================= + +// Discover ALL containers running on Docker +discovery.docker "all_containers" { + host = "unix:///var/run/docker.sock" + // Note: No 'filter' block means "Get Everything" +} + +// Convert weird Docker metadata into clean labels +discovery.relabel "clean_labels" { + targets = discovery.docker.all_containers.targets + + rule { + source_labels = ["__meta_docker_container_name"] + regex = "/?(.*)" + target_label = "container" + } + + // Prevent Loop: Don't scrape Alloy's own logs + rule { + source_labels = ["__meta_docker_container_name"] + regex = "/?alloy-agent" + action = "drop" + } +} + +loki.source.docker "ship_logs" { + host = "unix:///var/run/docker.sock" + targets = discovery.relabel.clean_labels.output + forward_to = [loki.write.gke_loki.receiver] +} + +loki.write "gke_loki" { + endpoint { + url = "https:///loki/api/v1/push" + basic_auth { username = "admin" password = "admin" } + headers = { "X-Scope-OrgID" = "netbird-prod" } + } +} + +// ========================================================= +// 2. METRICS: CONTAINER PERFORMANCE (MIMIR / cAdvisor) +// ========================================================= + +// Built-in cAdvisor to read CPU/Mem of all containers +prometheus.exporter.cadvisor "container_metrics" { + docker_host = "unix:///var/run/docker.sock" + + // Important for performance + store_container_labels = false + + // Filter out short-lived processes if needed + enabled_metrics = ["cpu", "memory", "network", "diskIO"] +} + +prometheus.scrape "scrape_containers" { + targets = prometheus.exporter.cadvisor.container_metrics.targets + forward_to = [prometheus.remote_write.gke_mimir.receiver] + scrape_interval = "15s" +} + +// ========================================================= +// 3. 
METRICS: HOST PERFORMANCE (Node Exporter) +// ========================================================= +prometheus.exporter.unix "ubuntu_host" { + include_exporter_metrics = true +} + +prometheus.scrape "scrape_host" { + targets = prometheus.exporter.unix.ubuntu_host.targets + forward_to = [prometheus.remote_write.gke_mimir.receiver] +} + +prometheus.remote_write "gke_mimir" { + endpoint { + url = "https:///api/v1/push" + basic_auth { username = "admin" password = "admin" } + headers = { "X-Scope-OrgID" = "netbird-prod" } + } +} +// ========================================================= +// 4. TRACING: Receive from Beyla -> Send to Tempo +// ========================================================= + +// Listen for incoming traces from Beyla +otelcol.receiver.otlp "default" { + grpc { endpoint = "0.0.0.0:4317" } + http { endpoint = "0.0.0.0:4318" } + + output { + traces = [otelcol.exporter.otlp.gke_tempo.input] + } +} + +// Send traces to GKE +otelcol.exporter.otlp "gke_tempo" { + client { + endpoint = ":443" + + // Auth (reuse the basic_auth block defined below if needed, or inline it) + auth = otelcol.auth.basic.creds.handler + + headers = { + "X-Scope-OrgID" = "netbird-prod", + } + } +} + +otelcol.auth.basic "creds" { + username = "admin" + password = "admin" +} +``` + +```bash +# Run the new agent with Host Access (Required for cAdvisor) + +docker run -d \ + --name alloy-agent \ + --restart always \ + --privileged \ + --pid=host \ + -p 12345:12345 \ + -p 4317:4317 \ + -p 4318:4318 \ + -v /:/rootfs:ro \ + -v /var/run:/var/run:ro \ + -v /sys:/sys:ro \ + -v /var/lib/docker/:/var/lib/docker:ro \ + -v /dev/disk/:/dev/disk:ro \ + -v ~/observability-agent/config.alloy:/etc/alloy/config.alloy \ + grafana/alloy:latest run --server.http.listen-addr=0.0.0.0:12345 /etc/alloy/config.alloy +``` + +To test Tempo (Distributed Tracing) using your existing containerized Netbird application, we need to introduce Grafana Beyla. 
+Beyla uses eBPF technology to "watch" your Netbird containers from the kernel level. It automatically generates traces for every HTTP request and SQL query without you changing any Netbird code. + +**Start Beyla:** + +```bash +docker run -d \ + --name beyla-tracer \ + --privileged \ + --pid=host \ + --net=host \ + -e BEYLA_OPEN_PORT=80,443,33073,10000 \ + -e BEYLA_SERVICE_NAME=netbird-app \ + -e OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 \ + -e BEYLA_PRINT_TRACES=true \ + grafana/beyla:latest +``` + +#### 6.1.3 Verify and Test on Dashboard + +Now that you are sending everything, +here is how to test it in Grafana. + +##### Test 1: Verify Universal Logs (Loki) + +Go to Explore -> Loki. +Switch to Builder mode and, under the label filter, choose "container" and then select any container. +The output is shown below. + +![img](img/grafana-loki1.png) + +![img](img/grafana-loki2.png) + +##### Test 2: Verify Container Performance (Mimir) + +Go to Explore -> Prometheus (Mimir). + +- Check Memory Usage: +This query shows the top 5 memory-hungry containers on your Ubuntu server. + +```promql +topk(5, container_memory_usage_bytes{image!=""}) +``` + +The output is shown below. +![img](img/grafana-mimir.png) + +- Check CPU Usage: +This query shows how much of a CPU core each container is using. + +```promql +sum(rate(container_cpu_usage_seconds_total{image!=""}[1m])) by (name) +``` + +The output is shown below. +![img](img/grafana-mimir2.png) + +##### Test 3: Verify Tempo traces + +Go to Explore -> Tempo (on the search bar).
+ +click "search" to see traces + +![img](img/grafana-tempo1.png) + +click any "trace" to view the latency of an app + +![img](img/grafana-tempo2.png) + +![img](img/grafana-tempo3.png) + +![img](img/grafana-tempo4.png) diff --git a/lgtm-stack/README.md b/lgtm-stack/README.md index e69de29b..36dd790c 100644 --- a/lgtm-stack/README.md +++ b/lgtm-stack/README.md @@ -0,0 +1,26 @@ +# LGTM Stack Deployment + +The **LGTM** stack is a comprehensive open-source observability platform powered by Grafana Labs. It provides unmatched correlation between metrics, logs, and traces, allowing complete visibility into your applications and infrastructure. + +## Components + +- **Loki**: Like Prometheus, but for logs. It is a horizontally-scalable, highly-available, multi-tenant log aggregation system. +- **Grafana**: The open observability platform for visualization and analytics. +- **Tempo**: A high-volume, minimal dependency distributed tracing backend. +- **Mimir**: Scalable long-term storage for Prometheus metrics. + +## Deployment Guides + +This repository provides two guides to help you deploy the stack: + +### 1. Automated Deployment +For a fully automated deployment using this stack, please follow the [Kubernetes Observability Guide](../docs/kubernetes-observability.md). + +### 2. Manual Deployment +If you prefer to configure and deploy components manually, or need to understand the individual steps, please refer to the [Manual LGTM Deployment Guide](../docs/manual-lgtm-deployment.md). + +## Testing & Verification +To verify that your deployment is working correctly, please refer to the [Testing Monitoring Stack Deployment Guide](../docs/testing-monitoring-stack-deployment.md). + +## Configuration +For detailed configuration of the Alloy collector, please refer to the [Alloy Configuration Guide](../docs/alloy-config.md). 
diff --git a/lgtm-stack/manual/alloy-config.alloy b/lgtm-stack/manual/alloy-config.alloy new file mode 100644 index 00000000..a9d181a8 --- /dev/null +++ b/lgtm-stack/manual/alloy-config.alloy @@ -0,0 +1,80 @@ +// ============================================================================== +// Grafana Alloy Configuration for NetBird Monitoring +// ============================================================================== + +logging { + level = "info" + format = "logfmt" +} + +// ============================================================================== +// DOCKER CONTAINER LOGS +// ============================================================================== + +discovery.docker "containers" { + host = "unix:///var/run/docker.sock" +} + +discovery.relabel "containers" { + targets = discovery.docker.containers.targets + + // Essential labels only: container name, compose project, service name, host + rule { + source_labels = ["__meta_docker_container_name"] + regex = "/(.*)" + target_label = "container" + } + + rule { + source_labels = ["__meta_docker_container_label_com_docker_compose_project"] + target_label = "compose_project" + } + + rule { + source_labels = ["__meta_docker_container_label_com_docker_compose_service"] + target_label = "service" + } + + rule { + target_label = "host" + replacement = constants.hostname + } +} + +loki.source.docker "docker_logs" { + host = "unix:///var/run/docker.sock" + targets = discovery.relabel.containers.output + forward_to = [loki.process.docker_logs.receiver] +} + +loki.process "docker_logs" { + forward_to = [loki.write.loki.receiver] + stage.docker {} +} + +// ============================================================================== +// SYSTEM LOGS (journald) +// ============================================================================== + +loki.source.journal "system_logs" { + forward_to = [loki.write.loki.receiver] + labels = { + job = "systemd-journal", + } +} + +// 
============================================================================== +// LOKI WRITE ENDPOINT +// (metrics are scraped directly by Prometheus; Alloy only handles logs here) +// ============================================================================== + +loki.write "loki" { + endpoint { + url = "http://loki:3100/loki/api/v1/push" + } + + external_labels = { + cluster = "netbird-selfhosted", + env = "production", + } +} diff --git a/lgtm-stack/manual/docker-compose.yaml b/lgtm-stack/manual/docker-compose.yaml new file mode 100644 index 00000000..654264c7 --- /dev/null +++ b/lgtm-stack/manual/docker-compose.yaml @@ -0,0 +1,229 @@ +networks: + monitoring: + driver: bridge + # Connect to NetBird's main Docker network created by the NetBird compose stack + netbird: + external: true + name: netbird_netbird + +volumes: + loki_data: {} + prometheus_data: {} + grafana_data: {} + netbird_management_data: + external: true + name: netbird_netbird_management + +services: + # ============================================================================= + # LOKI - Log Aggregation + # ============================================================================= + loki: + image: grafana/loki:3.0.0 + container_name: loki + user: "0:0" + restart: unless-stopped + ports: + - "3100:3100" + command: -config.file=/etc/loki/local-config.yaml + volumes: + - ./loki-config.yaml:/etc/loki/local-config.yaml:ro + - loki_data:/loki + networks: + - monitoring + - netbird + healthcheck: + test: [ "CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3100/ready" ] + interval: 10s + timeout: 5s + retries: 5 + start_period: 10s + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + + # ============================================================================= + # PROMETHEUS - Metrics Storage + # ============================================================================= + prometheus: + image: prom/prometheus:v2.54.1 + container_name: 
prometheus + user: "0:0" + restart: unless-stopped + ports: + - "9090:9090" + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--storage.tsdb.retention.time=30d' + - '--web.console.libraries=/usr/share/prometheus/console_libraries' + - '--web.console.templates=/usr/share/prometheus/consoles' + - '--web.enable-lifecycle' + - '--web.enable-admin-api' + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus_data:/prometheus + networks: + - monitoring + - netbird + extra_hosts: + - "host.docker.internal:host-gateway" + healthcheck: + test: [ "CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy" ] + interval: 10s + timeout: 5s + retries: 5 + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + + # ============================================================================= + # GRAFANA - Visualization + # ============================================================================= + grafana: + image: grafana/grafana:11.3.0 + container_name: grafana + user: "0:0" + restart: unless-stopped + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_PATHS_PROVISIONING=/etc/grafana/provisioning + - GF_FEATURE_TOGGLES_ENABLE=publicDashboards + - GF_LOG_LEVEL=info + - GF_AUTH_ANONYMOUS_ENABLED=false + volumes: + - grafana_data:/var/lib/grafana + networks: + - monitoring + - netbird + depends_on: + loki: + condition: service_healthy + prometheus: + condition: service_healthy + healthcheck: + test: [ "CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/api/health" ] + interval: 10s + timeout: 5s + retries: 5 + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + + # ============================================================================= + # ALLOY - Telemetry Collection (logs only, no HTTP healthcheck) + # 
============================================================================= + alloy: + image: grafana/alloy:v1.4.2 + container_name: alloy + user: "0:0" + restart: unless-stopped + ports: + - "12345:12345" + command: + - run + - --server.http.listen-addr=0.0.0.0:12345 + - --storage.path=/var/lib/alloy/data + - /etc/alloy/config.alloy + volumes: + - ./alloy-config.alloy:/etc/alloy/config.alloy:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + - /var/log:/var/log:ro + - /sys:/sys:ro + - /proc:/proc:ro + networks: + - monitoring + - netbird + depends_on: + loki: + condition: service_healthy + prometheus: + condition: service_healthy + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + + # ============================================================================= + # NODE EXPORTER - Host & cgroup metrics + # ============================================================================= + node-exporter: + image: prom/node-exporter:v1.8.2 + container_name: node-exporter + restart: unless-stopped + ports: + - "9100:9100" + command: + - '--path.procfs=/host/proc' + - '--path.sysfs=/host/sys' + - '--path.rootfs=/rootfs' + - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)' + - '--collector.netclass.ignored-devices=^(veth.*|br.*|docker.*|virbr.*|lo)$$' + - '--collector.netdev.device-exclude=^(veth.*|br.*|docker.*|virbr.*|lo)$$' + - '--collector.cgroups' + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + networks: + - monitoring + pid: host + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + + # ============================================================================= + # CONTAINER CGROUP METRICS EXPORTER + # ============================================================================= + container-metrics: + image: ghcr.io/mosquito/cgroups-exporter:latest + container_name: container-metrics + restart: unless-stopped + # On cgroup v2 + systemd, Docker 
container cgroups typically appear under + # /sys/fs/cgroup/system.slice/docker-.scope/. We mount the host + # /sys tree as /host_sys and point cgroups-exporter at a glob that matches + # those scopes. + command: + - cgroups-exporter + - --cgroups-path + - "/host_sys/fs/cgroup/system.slice/docker-*.scope/" + volumes: + - /sys/:/host_sys:ro + networks: + - monitoring + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + + # ============================================================================= + # NETBIRD EVENTS EXPORTER - Rust Version + # ============================================================================= + netbird-events-exporter: + image: ghcr.io/onelrian/signal:latest + container_name: netbird-events-exporter + restart: unless-stopped + environment: + - NETBIRD_API_URL=${NETBIRD_API_URL:-https://${NETBIRD_DOMAIN}/api} + - NETBIRD_API_TOKEN=${NETBIRD_PAT} + - LOKI_URL=http://loki:3100 + - RUST_LOG=info + depends_on: + loki: + condition: service_healthy + networks: + - monitoring + - netbird \ No newline at end of file diff --git a/lgtm-stack/manual/loki-config.yaml b/lgtm-stack/manual/loki-config.yaml new file mode 100644 index 00000000..4f76a065 --- /dev/null +++ b/lgtm-stack/manual/loki-config.yaml @@ -0,0 +1,49 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + +common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +limits_config: + retention_period: 720h # 30 days + reject_old_samples: true + reject_old_samples_max_age: 168h + ingestion_rate_mb: 100 + ingestion_burst_size_mb: 200 + per_stream_rate_limit: 50MB + per_stream_rate_limit_burst: 100MB + max_entries_limit_per_query: 100000 + 
max_streams_per_user: 0 # unlimited + +compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + retention_delete_delay: 2h + retention_delete_worker_count: 150 + delete_request_store: filesystem + +analytics: + reporting_enabled: false diff --git a/lgtm-stack/manual/prometheus.yml b/lgtm-stack/manual/prometheus.yml new file mode 100644 index 00000000..81636840 --- /dev/null +++ b/lgtm-stack/manual/prometheus.yml @@ -0,0 +1,62 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + cluster: 'netbird-selfhosted' + environment: 'production' + +# Alertmanager configuration (optional) +# alerting: +# alertmanagers: +# - static_configs: +# - targets: +# - alertmanager:9093 + +# Load rules once and periodically evaluate them +# rule_files: +# - "alerts/*.yml" + +scrape_configs: + # Prometheus itself + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + # Node Exporter - Host metrics (single logical host label) + - job_name: 'node-exporter' + static_configs: + - targets: ['node-exporter:9100'] + labels: + instance: 'netbird-host' + + # Docker daemon metrics (alternative to cAdvisor) + # Assumes dockerd is started with metrics enabled, e.g.: + # "metrics-addr": "0.0.0.0:9323", "experimental": true + # and that the host is reachable as host.docker.internal or via its LAN IP. 
+ - job_name: 'docker-daemon' + static_configs: + - targets: ['host.docker.internal:9323'] + labels: + instance: 'netbird-host' + + # Loki metrics + - job_name: 'loki' + static_configs: + - targets: ['loki:3100'] + + # Grafana metrics + - job_name: 'grafana' + static_configs: + - targets: ['grafana:3000'] + + # Alloy metrics + - job_name: 'alloy' + static_configs: + - targets: ['alloy:12345'] + + # Container cgroup metrics + - job_name: 'container-metrics' + static_configs: + - targets: ['container-metrics:9753'] + labels: + instance: 'netbird-host' diff --git a/lgtm-stack/terraform/locals.tf b/lgtm-stack/terraform/locals.tf deleted file mode 100644 index e69de29b..00000000 diff --git a/lgtm-stack/terraform/main.tf b/lgtm-stack/terraform/main.tf index e69de29b..e3ff353f 100644 --- a/lgtm-stack/terraform/main.tf +++ b/lgtm-stack/terraform/main.tf @@ -0,0 +1,584 @@ +terraform { + required_version = ">= 1.0" + required_providers { + google = { + source = "hashicorp/google" + version = "~> 5.0" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 2.0" + } + helm = { + source = "hashicorp/helm" + version = "~> 2.12" + } + } + + # Production Best Practice: Store state remotely + # backend "gcs" { + # bucket = "YOUR_TF_STATE_BUCKET" + # prefix = "terraform/state" + # } +} + +provider "google" { + project = var.project_id + region = var.region +} + +provider "kubernetes" { + host = "https://${data.google_container_cluster.primary.endpoint}" + token = data.google_client_config.default.access_token + cluster_ca_certificate = base64decode(data.google_container_cluster.primary.master_auth[0].cluster_ca_certificate) +} + +provider "helm" { + kubernetes { + host = "https://${data.google_container_cluster.primary.endpoint}" + token = data.google_client_config.default.access_token + cluster_ca_certificate = base64decode(data.google_container_cluster.primary.master_auth[0].cluster_ca_certificate) + } +} + +# Data sources +data "google_client_config" "default" {} + 
+data "google_container_cluster" "primary" { + name = var.cluster_name + location = var.cluster_location +} + +# GCS Buckets +locals { + buckets = [ + "loki-chunks", + "loki-ruler", + "mimir-blocks", + "mimir-ruler", + "tempo-traces", + ] + + bucket_prefix = var.project_id + loki_schema_from_date = var.loki_schema_from_date +} + + +resource "google_storage_bucket" "observability_buckets" { + for_each = toset(local.buckets) + + name = "${local.bucket_prefix}-${each.key}" + location = var.region + force_destroy = false + + uniform_bucket_level_access = true + + versioning { + enabled = true + } + + lifecycle_rule { + condition { + age = 90 + } + action { + type = "Delete" + } + } + + labels = { + environment = var.environment + managed-by = "terraform" + component = "observability" + } +} + +# GCP Service Account +resource "google_service_account" "observability_sa" { + account_id = var.gcp_service_account_name + display_name = "GKE Observability Service Account" + description = "Service account for Loki, Tempo, Grafana, Mimir, and Prometheus in GKE" +} + +# Grant Storage Object Admin role on all buckets +resource "google_storage_bucket_iam_member" "bucket_object_admin" { + for_each = toset(local.buckets) + + bucket = google_storage_bucket.observability_buckets[each.key].name + role = "roles/storage.objectAdmin" + member = "serviceAccount:${google_service_account.observability_sa.email}" +} + +# Grant Legacy Bucket Writer role on all buckets +resource "google_storage_bucket_iam_member" "bucket_legacy_writer" { + for_each = toset(local.buckets) + + bucket = google_storage_bucket.observability_buckets[each.key].name + role = "roles/storage.legacyBucketWriter" + member = "serviceAccount:${google_service_account.observability_sa.email}" +} + +# Kubernetes Namespace +resource "kubernetes_namespace" "observability" { + metadata { + name = var.namespace + + labels = { + name = var.namespace + managed-by = "terraform" + } + } +} + +# Kubernetes Service Account +resource 
"kubernetes_service_account" "observability_sa" {
+  metadata {
+    name      = var.k8s_service_account_name
+    namespace = kubernetes_namespace.observability.metadata[0].name
+
+    annotations = {
+      "iam.gke.io/gcp-service-account" = google_service_account.observability_sa.email
+    }
+
+    labels = {
+      managed-by = "terraform"
+    }
+  }
+}
+
+# Workload Identity Binding (lets the Kubernetes service account impersonate the GCP service account)
+resource "google_service_account_iam_member" "workload_identity_binding" {
+  service_account_id = google_service_account.observability_sa.name
+  role               = "roles/iam.workloadIdentityUser"
+  member             = "serviceAccount:${var.project_id}.svc.id.goog[${var.namespace}/${var.k8s_service_account_name}]"
+}
+
+# Cert-Manager Issuer (namespaced ACME issuer referenced by the ingress cert-manager.io/issuer annotations)
+resource "kubernetes_manifest" "letsencrypt_issuer" {
+  manifest = {
+    apiVersion = "cert-manager.io/v1"
+    kind       = "Issuer"
+    metadata = {
+      name      = var.cert_issuer_name
+      namespace = var.namespace
+    }
+    spec = {
+      acme = {
+        server = "https://acme-v02.api.letsencrypt.org/directory"
+        email  = var.letsencrypt_email
+        privateKeySecretRef = {
+          name = "${var.cert_issuer_name}-key"
+        }
+        solvers = [
+          {
+            http01 = {
+              ingress = {
+                class = var.ingress_class_name
+              }
+            }
+          }
+        ]
+      }
+    }
+  }
+
+  depends_on = [kubernetes_namespace.observability, helm_release.cert_manager] # Issuer CRD comes from the cert-manager release (when installed here); also destroys the Issuer first
+}
+
+# Add Helm Repositories
+resource "helm_release" "cert_manager" {
+  count = var.install_cert_manager ? 1 : 0
+
+  name             = "cert-manager"
+  repository       = "https://charts.jetstack.io"
+  chart            = "cert-manager"
+  namespace        = "cert-manager"
+  create_namespace = true
+  version          = var.cert_manager_version
+
+  set {
+    name  = "installCRDs"
+    value = "true"
+  }
+
+  wait    = true
+  timeout = 600
+}
+
+resource "helm_release" "nginx_ingress" {
+  count = var.install_nginx_ingress ?
1 : 0 + + name = "nginx-monitoring" + repository = "https://kubernetes.github.io/ingress-nginx" + chart = "ingress-nginx" + namespace = "ingress-nginx" + create_namespace = true + version = var.nginx_ingress_version + + set { + name = "controller.ingressClassResource.name" + value = var.ingress_class_name + } + + set { + name = "controller.ingressClass" + value = var.ingress_class_name + } + + set { + name = "controller.ingressClassResource.controllerValue" + value = "k8s.io/ingress-nginx" + } + + set { + name = "controller.ingressClassResource.enabled" + value = "true" + } + + set { + name = "controller.ingressClassByName" + value = "true" + } + + wait = true + timeout = 600 +} + +# Loki +resource "helm_release" "loki" { + name = "monitoring-loki" + repository = "https://grafana.github.io/helm-charts" + chart = "loki" + namespace = kubernetes_namespace.observability.metadata[0].name + version = var.loki_version + + values = [ + templatefile("values/loki-values.yaml", { + gcp_service_account_email = google_service_account.observability_sa.email + k8s_service_account_name = kubernetes_service_account.observability_sa.metadata[0].name + loki_chunks_bucket = google_storage_bucket.observability_buckets["loki-chunks"].name + loki_ruler_bucket = google_storage_bucket.observability_buckets["loki-ruler"].name + loki_admin_bucket = google_storage_bucket.observability_buckets["loki-chunks"].name + loki_schema_from_date = local.loki_schema_from_date + monitoring_domain = var.monitoring_domain + ingress_class_name = var.ingress_class_name + cert_issuer_name = var.cert_issuer_name + }) + ] + + depends_on = [ + kubernetes_service_account.observability_sa, + google_service_account_iam_member.workload_identity_binding, + google_storage_bucket_iam_member.bucket_object_admin + ] +} + +# Mimir +resource "helm_release" "mimir" { + name = "monitoring-mimir" + repository = "https://grafana.github.io/helm-charts" + chart = "mimir-distributed" + namespace = 
kubernetes_namespace.observability.metadata[0].name + version = var.mimir_version + + values = [ + templatefile("values/mimir-values.yaml", { + gcp_service_account_email = google_service_account.observability_sa.email + k8s_service_account_name = kubernetes_service_account.observability_sa.metadata[0].name + mimir_blocks_bucket = google_storage_bucket.observability_buckets["mimir-blocks"].name + mimir_ruler_bucket = google_storage_bucket.observability_buckets["mimir-ruler"].name + mimir_alertmanager_bucket = google_storage_bucket.observability_buckets["mimir-ruler"].name + monitoring_domain = var.monitoring_domain + ingress_class_name = var.ingress_class_name + cert_issuer_name = var.cert_issuer_name + }) + ] + + depends_on = [ + kubernetes_service_account.observability_sa, + google_service_account_iam_member.workload_identity_binding, + google_storage_bucket_iam_member.bucket_object_admin + ] +} + +# Tempo +resource "helm_release" "tempo" { + name = "monitoring-tempo" + repository = "https://grafana.github.io/helm-charts" + chart = "tempo-distributed" + namespace = kubernetes_namespace.observability.metadata[0].name + version = var.tempo_version + + values = [ + templatefile("values/tempo-values.yaml", { + gcp_service_account_email = google_service_account.observability_sa.email + k8s_service_account_name = kubernetes_service_account.observability_sa.metadata[0].name + tempo_traces_bucket = google_storage_bucket.observability_buckets["tempo-traces"].name + monitoring_domain = var.monitoring_domain + ingress_class_name = var.ingress_class_name + cert_issuer_name = var.cert_issuer_name + }) + ] + + depends_on = [ + kubernetes_service_account.observability_sa, + google_service_account_iam_member.workload_identity_binding, + google_storage_bucket_iam_member.bucket_object_admin + ] +} + +# Prometheus +resource "helm_release" "prometheus" { + name = "monitoring-prometheus" + repository = "https://prometheus-community.github.io/helm-charts" + chart = "prometheus" + 
namespace = kubernetes_namespace.observability.metadata[0].name + version = var.prometheus_version + + values = [ + templatefile("values/prometheus-values.yaml", { + gcp_service_account_email = google_service_account.observability_sa.email + k8s_service_account_name = kubernetes_service_account.observability_sa.metadata[0].name + monitoring_domain = var.monitoring_domain + cluster_name = var.cluster_name + environment = var.environment + project_id = var.project_id + region = var.region + ingress_class_name = var.ingress_class_name + cert_issuer_name = var.cert_issuer_name + }) + ] + + depends_on = [ + helm_release.mimir, + helm_release.loki + ] + + timeout = 600 +} + +# Grafana +resource "helm_release" "grafana" { + name = "monitoring-grafana" + repository = "https://grafana.github.io/helm-charts" + chart = "grafana" + namespace = kubernetes_namespace.observability.metadata[0].name + version = var.grafana_version + + values = [ + templatefile("values/grafana-values.yaml", { + gcp_service_account_email = google_service_account.observability_sa.email + k8s_service_account_name = kubernetes_service_account.observability_sa.metadata[0].name + monitoring_domain = var.monitoring_domain + grafana_admin_password = var.grafana_admin_password + ingress_class_name = var.ingress_class_name + cert_issuer_name = var.cert_issuer_name + }) + ] + + depends_on = [ + helm_release.prometheus, + helm_release.loki, + helm_release.mimir, + helm_release.tempo + ] + + timeout = 600 +} + +# Monitoring Ingress +resource "kubernetes_ingress_v1" "monitoring_stack" { + metadata { + name = "monitoring-stack-ingress" + namespace = kubernetes_namespace.observability.metadata[0].name + annotations = { + "kubernetes.io/ingress.class" = var.ingress_class_name + "cert-manager.io/issuer" = var.cert_issuer_name + "nginx.ingress.kubernetes.io/ssl-redirect" = "true" + "nginx.ingress.kubernetes.io/backend-protocol" = "HTTP" + "nginx.ingress.kubernetes.io/proxy-connect-timeout" = "300" + 
"nginx.ingress.kubernetes.io/proxy-send-timeout" = "300" + "nginx.ingress.kubernetes.io/proxy-read-timeout" = "300" + "nginx.ingress.kubernetes.io/proxy-body-size" = "50m" + } + } + + spec { + tls { + hosts = [ + "grafana.${var.monitoring_domain}", + "loki.${var.monitoring_domain}", + "mimir.${var.monitoring_domain}", + "tempo.${var.monitoring_domain}", + "tempo-push.${var.monitoring_domain}", + "prometheus.${var.monitoring_domain}" + ] + secret_name = "monitoring-tls" + } + + # Grafana + rule { + host = "grafana.${var.monitoring_domain}" + http { + path { + path = "/" + path_type = "Prefix" + backend { + service { + name = "monitoring-grafana" + port { + number = 80 + } + } + } + } + } + } + + # Loki + rule { + host = "loki.${var.monitoring_domain}" + http { + path { + path = "/" + path_type = "Prefix" + backend { + service { + name = "monitoring-loki-gateway" + port { + number = 80 + } + } + } + } + } + } + + # Mimir + rule { + host = "mimir.${var.monitoring_domain}" + http { + path { + path = "/" + path_type = "Prefix" + backend { + service { + name = "monitoring-mimir-nginx" + port { + number = 80 + } + } + } + } + } + } + + # Tempo Query + rule { + host = "tempo.${var.monitoring_domain}" + http { + path { + path = "/" + path_type = "Prefix" + backend { + service { + name = "monitoring-tempo-query-frontend" + port { + number = 3200 + } + } + } + } + } + } + + # Tempo Push + rule { + host = "tempo-push.${var.monitoring_domain}" + http { + path { + path = "/" + path_type = "Prefix" + backend { + service { + name = "monitoring-tempo-distributor" + port { + number = 4318 + } + } + } + } + } + } + + # Prometheus + rule { + host = "prometheus.${var.monitoring_domain}" + http { + path { + path = "/" + path_type = "Prefix" + backend { + service { + name = "monitoring-prometheus-server" + port { + number = 80 + } + } + } + } + } + } + } + + depends_on = [ + helm_release.grafana, + kubernetes_manifest.letsencrypt_issuer + ] +} + +# Tempo gRPC Ingress +resource 
"kubernetes_ingress_v1" "tempo_grpc" { + metadata { + name = "monitoring-stack-ingress-grpc" + namespace = kubernetes_namespace.observability.metadata[0].name + annotations = { + "kubernetes.io/ingress.class" = var.ingress_class_name + "cert-manager.io/issuer" = var.cert_issuer_name + "nginx.ingress.kubernetes.io/ssl-redirect" = "true" + "nginx.ingress.kubernetes.io/backend-protocol" = "GRPC" + } + } + + spec { + tls { + hosts = [ + "tempo-grpc.${var.monitoring_domain}" + ] + secret_name = "monitoring-grpc-tls" + } + + rule { + host = "tempo-grpc.${var.monitoring_domain}" + http { + path { + path = "/" + path_type = "Prefix" + backend { + service { + name = "monitoring-tempo-distributor" + port { + number = 4317 + } + } + } + } + } + } + } + + depends_on = [ + helm_release.tempo, + kubernetes_manifest.letsencrypt_issuer + ] +} diff --git a/lgtm-stack/terraform/outputs.tf b/lgtm-stack/terraform/outputs.tf deleted file mode 100644 index e69de29b..00000000 diff --git a/lgtm-stack/terraform/terraform.tfvars b/lgtm-stack/terraform/terraform.tfvars new file mode 100644 index 00000000..9e34f3a3 --- /dev/null +++ b/lgtm-stack/terraform/terraform.tfvars @@ -0,0 +1,36 @@ +# GCP Configuration +project_id = "" +region = "" +cluster_name = "" +cluster_location = "" + +# Kubernetes Configuration +namespace = "" +k8s_service_account_name = "" +gcp_service_account_name = "" + +# Environment +environment = "" + +# Domain Configuration +monitoring_domain = "" +letsencrypt_email = "" + +# Grafana +grafana_admin_password = "" + +# Helm Chart Versions (optional - defaults will be used if not specified) +loki_version = "6.20.0" +mimir_version = "5.5.0" +tempo_version = "1.57.0" +prometheus_version = "25.27.0" +grafana_version = "10.3.0" + +# Optional Components (set to true if you want Terraform to install these) +install_cert_manager = false +ingress_class_name = "nginx" # Set to your cluster's ingress class (e.g., "nginx", "traefik") +cert_issuer_name = "letsencrypt-prod" # Set to 
your Cert-Manager Issuer name
+install_nginx_ingress = false
+
+# Loki Schema From Date
+loki_schema_from_date = "2026-08-01"
diff --git a/lgtm-stack/terraform/values/grafana-values.yaml b/lgtm-stack/terraform/values/grafana-values.yaml
index e69de29b..548273c4 100644
--- a/lgtm-stack/terraform/values/grafana-values.yaml
+++ b/lgtm-stack/terraform/values/grafana-values.yaml
@@ -0,0 +1,87 @@
+serviceAccount:
+  create: false
+  name: ${k8s_service_account_name}
+  annotations:
+    iam.gke.io/gcp-service-account: ${gcp_service_account_email} # Replace with your GCP service account email
+
+
+# Explicitly disable testFramework to avoid nil pointer in Helm chart
+testFramework:
+  enabled: false
+  image: "bats/bats"
+  securityContext:
+    runAsUser: 1000
+
+adminUser: admin
+adminPassword: ${grafana_admin_password}
+
+persistence:
+  enabled: true
+  size: 1Gi
+
+service:
+  type: ClusterIP # Keeping service type as ClusterIP, assuming ingress will handle external access
+  port: 80
+  targetPort: 3000
+  annotations: {} # Removing service annotations as they are moved to ingress
+
+ingress:
+  enabled: false
+  ingressClassName: ${ingress_class_name}
+  annotations:
+    cert-manager.io/issuer: ${cert_issuer_name}
+    external-dns.alpha.kubernetes.io/hostname: grafana.${monitoring_domain} # Replace with your Grafana domain name (e.g., grafana.yourdomain.com)
+
+env:
+  GF_FEATURE_TOGGLES_ENABLE: publicDashboards
+  GF_LOG_LEVEL: info
+  GF_AUTH_ANONYMOUS_ENABLED: "false"
+
+datasources:
+  datasources.yaml:
+    apiVersion: 1
+    datasources:
+      - name: Prometheus
+        type: prometheus
+        uid: prometheus # Fixed UID: the Tempo datasource references it via tracesToMetrics/serviceMap datasourceUid
+        access: proxy
+        url: http://monitoring-prometheus-server:80
+        isDefault: true
+        jsonData:
+          httpMethod: POST
+          timeInterval: 15s
+
+      - name: Loki
+        type: loki
+        uid: loki # Fixed UID: the Tempo datasource references it via tracesToLogsV2 datasourceUid
+        access: proxy
+        url: http://monitoring-loki-gateway:80
+        jsonData:
+          maxLines: 1000
+
+      - name: Mimir
+        type: prometheus
+        access: proxy
+        url: http://monitoring-mimir-nginx:80/prometheus
+        jsonData:
+          httpMethod: POST
+          timeInterval: 15s
+
+      - name: Tempo
+        type: tempo
+        access: proxy
+        url: http://monitoring-tempo-query-frontend:3200
+        jsonData:
+          httpMethod: GET
+          tracesToLogsV2: # Trace-to-logs options are children of this key (was an empty map)
+            datasourceUid: loki
+            spanStartTimeShift: '-1h'
+            spanEndTimeShift: '1h'
+            filterByTraceID: true
+            filterBySpanID: false
+          tracesToMetrics:
+            datasourceUid: prometheus
+          serviceMap:
+            datasourceUid: prometheus
+          nodeGraph:
+            enabled: true
diff --git a/lgtm-stack/terraform/values/loki-values.yaml b/lgtm-stack/terraform/values/loki-values.yaml
index e69de29b..9642d46e 100644
--- a/lgtm-stack/terraform/values/loki-values.yaml
+++ b/lgtm-stack/terraform/values/loki-values.yaml
@@ -0,0 +1,296 @@
+# Distributed deployment mode with microservices components
+# This uses the full distributed microservices architecture with individual components:
+# ingester, querier, distributor, compactor, query-frontend, etc.
+deploymentMode: Distributed
+
+# Explicitly disable Loki Canary
+lokiCanary:
+  enabled: false
+
+loki:
+  auth_enabled: false
+
+  storage:
+    type: gcs
+    bucketNames:
+      chunks: ${loki_chunks_bucket}
+      ruler: ${loki_ruler_bucket}
+      admin: ${loki_chunks_bucket}
+    gcs:
+      chunkBufferSize: 0
+      requestTimeout: "0s"
+      enableHttp2: true
+
+  commonConfig:
+    replication_factor: 1
+    path_prefix: /var/loki
+
+  schemaConfig:
+    configs:
+      - from: "2025-12-01" # NOTE(review): hardcoded; main.tf passes loki_schema_from_date to this template but it is not interpolated here - confirm which is intended
+        store: tsdb
+        object_store: gcs
+        schema: v13
+        index:
+          prefix: loki_index_
+          period: 24h
+
+  limits_config:
+    retention_period: 720h
+    reject_old_samples: true
+    reject_old_samples_max_age: 168h
+    ingestion_rate_mb: 100
+    ingestion_burst_size_mb: 200
+    per_stream_rate_limit: 50MB
+    per_stream_rate_limit_burst: 100MB
+    max_entries_limit_per_query: 100000
+    max_streams_per_user: 0
+    allow_structured_metadata: true
+    volume_enabled: true
+
+  pattern_ingester:
+    enabled: true
+
+  ruler:
+    enable_api: true
+    storage:
+      type: gcs
+      gcs:
+        bucket_name: ${loki_ruler_bucket}
+
+  ingester:
+    chunk_encoding: snappy
+
+  compactor:
+    working_directory: /var/loki/compactor
+    compaction_interval: 10m
+
retention_enabled: true + retention_delete_delay: 2h + retention_delete_worker_count: 150 + delete_request_store: gcs + + analytics: + reporting_enabled: false + +# Global resource limits for Loki components (optimized for resource constraints) +resources: + requests: + cpu: 100m + memory: 512Mi + limits: + cpu: 1000m + memory: 1.5Gi + +serviceAccount: + create: false + name: ${k8s_service_account_name} + annotations: + iam.gke.io/gcp-service-account: ${gcp_service_account_email} # Replace with your GCP service account email + +singleBinary: + replicas: 0 + persistence: + enabled: true + size: 5Gi + storageClass: standard-rwo + affinity: {} + +gateway: + enabled: true + replicas: 2 + maxUnavailable: 1 + affinity: + podAntiAffinity: null + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/os + operator: In + values: + - linux + resources: + requests: + cpu: 100m + memory: 512Mi + limits: + cpu: 1000m + memory: 1.5Gi + ingress: + enabled: false + ingressClassName: ${ingress_class_name} + annotations: + cert-manager.io/issuer: ${cert_issuer_name} + external-dns.alpha.kubernetes.io/hostname: loki.${monitoring_domain} # Replace with your Loki domain name (e.g., loki.yourdomain.com) + +backend: + replicas: 0 +read: + replicas: 0 +write: + replicas: 0 +ingester: + replicas: 2 + maxUnavailable: 1 + zoneAwareReplication: + enabled: false + + affinity: + podAntiAffinity: null + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/os + operator: In + values: + - linux + resources: + requests: + cpu: 100m + memory: 512Mi + limits: + cpu: 1000m + memory: 1.5Gi +querier: + replicas: 2 + maxUnavailable: 1 + affinity: + podAntiAffinity: null + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/os + operator: In + values: + - linux + resources: + 
requests: + cpu: 100m + memory: 512Mi + limits: + cpu: 1000m + memory: 1.5Gi +queryFrontend: + replicas: 2 + maxUnavailable: 1 + affinity: + podAntiAffinity: null + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/os + operator: In + values: + - linux + resources: + requests: + cpu: 100m + memory: 512Mi + limits: + cpu: 1000m + memory: 1.5Gi +queryScheduler: + replicas: 2 + maxUnavailable: 1 + affinity: + podAntiAffinity: null + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/os + operator: In + values: + - linux + resources: + requests: + cpu: 100m + memory: 512Mi + limits: + cpu: 1000m + memory: 1.5Gi +distributor: + replicas: 2 + maxUnavailable: 1 + affinity: + podAntiAffinity: null + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/os + operator: In + values: + - linux + resources: + requests: + cpu: 100m + memory: 512Mi + limits: + cpu: 1000m + memory: 1.5Gi +compactor: + replicas: 1 + affinity: + podAntiAffinity: null + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/os + operator: In + values: + - linux + resources: + requests: + cpu: 100m + memory: 512Mi + limits: + cpu: 1000m + memory: 1.5Gi +indexGateway: + replicas: 2 + maxUnavailable: 1 + affinity: + podAntiAffinity: null + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/os + operator: In + values: + - linux + resources: + requests: + cpu: 100m + memory: 512Mi + limits: + cpu: 1000m + memory: 1.5Gi +bloomCompactor: + replicas: 0 +bloomGateway: + replicas: 0 + +chunksCache: + enabled: true +resultsCache: + enabled: true + +minio: + enabled: false + +test: + enabled: false + +monitoring: + selfMonitoring: 
+ enabled: false + grafanaAgent: + installOperator: false + lokiCanary: + enabled: false diff --git a/lgtm-stack/terraform/values/mimir-values.yaml b/lgtm-stack/terraform/values/mimir-values.yaml index e69de29b..b8e0a72f 100644 --- a/lgtm-stack/terraform/values/mimir-values.yaml +++ b/lgtm-stack/terraform/values/mimir-values.yaml @@ -0,0 +1,167 @@ +serviceAccount: + create: false + name: ${k8s_service_account_name} + annotations: + iam.gke.io/gcp-service-account: ${gcp_service_account_email} + +mimir: + structuredConfig: + multitenancy_enabled: false + + + # GCS Storage Configuration + common: + storage: + backend: gcs + gcs: + bucket_name: ${mimir_blocks_bucket} + + blocks_storage: + backend: gcs + gcs: + bucket_name: ${mimir_blocks_bucket} + tsdb: + dir: /data/mimir-data/tsdb + retention_period: 720h + bucket_store: + sync_dir: /data/mimir-data/tsdb-sync + + alertmanager_storage: + backend: gcs + gcs: + bucket_name: ${mimir_ruler_bucket} + + ruler_storage: + backend: gcs + gcs: + bucket_name: ${mimir_ruler_bucket} + + server: + log_level: info + + limits: + ingestion_rate: 80000 + max_global_series_per_user: 0 + + compactor: + data_dir: /data/mimir-data/compactor + compaction_interval: 30m + + ingester: + ring: + replication_factor: 1 + +distributor: + replicas: 1 + resources: + requests: + cpu: 200m + memory: 512Mi + limits: + cpu: 500m + memory: 1Gi + +ingester: + replicas: 1 + persistentVolume: + enabled: true + size: 50Gi + storageClass: standard-rwo + resources: + requests: + cpu: 500m + memory: 2Gi + limits: + cpu: 1 + memory: 3Gi # Limit must be >= request, optimized for resource constraints + zoneAwareReplication: + enabled: false + +querier: + replicas: 1 + resources: + requests: + cpu: 200m + memory: 512Mi + limits: + cpu: 500m + memory: 1Gi + +query_frontend: + replicas: 1 + resources: + requests: + cpu: 200m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + +store_gateway: + replicas: 1 + persistentVolume: + enabled: true + size: 50Gi + 
storageClass: standard-rwo + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: 1 + memory: 2Gi # Limit must be >= request + zoneAwareReplication: + enabled: false + +compactor: + replicas: 1 + persistentVolume: + enabled: true + size: 300Gi + storageClass: standard-rwo + resources: + requests: + cpu: 500m + memory: 2Gi + limits: + cpu: 1 + memory: 3Gi # Limit must be >= request, optimized for memory constraints + +ruler: + replicas: 0 + enabled: false + +alertmanager: + replicas: 0 + enabled: false + +query_scheduler: + replicas: 0 + enabled: false + +overrides_exporter: + replicas: 1 + +nginx: + replicas: 1 + ingress: + enabled: false + ingressClassName: ${ingress_class_name} + annotations: + cert-manager.io/issuer: ${cert_issuer_name} + external-dns.alpha.kubernetes.io/hostname: mimir.${monitoring_domain} + +minio: + enabled: false + +memcached: + enabled: false +memcached-queries: + enabled: false +memcached-metadata: + enabled: false +memcached-results: + enabled: false + +rollout_operator: + enabled: false diff --git a/lgtm-stack/terraform/values/prometheus-values.yaml b/lgtm-stack/terraform/values/prometheus-values.yaml index e69de29b..68760793 100644 --- a/lgtm-stack/terraform/values/prometheus-values.yaml +++ b/lgtm-stack/terraform/values/prometheus-values.yaml @@ -0,0 +1,92 @@ +fullnameOverride: "monitoring-prometheus" + +alertmanager: + enabled: false +kube-state-metrics: + enabled: true +prometheus-node-exporter: + enabled: false +prometheus-pushgateway: + enabled: false +serviceAccount: + create: false # Use existing service account created by Terraform + name: ${k8s_service_account_name} + annotations: + iam.gke.io/gcp-service-account: ${gcp_service_account_email} + +server: + persistentVolume: + enabled: true + size: 3Gi + retention: 7d + service: + type: ClusterIP + port: 80 + targetPort: 9090 + ingress: + enabled: false + ingressClassName: ${ingress_class_name} + annotations: + cert-manager.io/issuer: ${cert_issuer_name} + 
external-dns.alpha.kubernetes.io/hostname: prometheus.${monitoring_domain}
+  resources:
+    requests:
+      memory: 2Gi
+      cpu: 500m
+    limits:
+      memory: 4Gi
+      cpu: 1000m
+  global:
+    evaluation_interval: 15s
+    scrape_interval: 15s
+    external_labels:
+      cluster: gke_${project_id}_${region}_${cluster_name}
+      environment: ${environment} # Was hardcoded to prod; now follows the environment value main.tf passes in (existing deployments get a new label value)
+
+  remoteWrite:
+    - url: http://monitoring-mimir-nginx:80/api/v1/push
+      queue_config:
+        capacity: 10000
+        max_shards: 200
+        min_shards: 1
+        max_samples_per_send: 5000
+        batch_send_deadline: 5s
+        min_backoff: 30ms
+        max_backoff: 5s
+
+extraScrapeConfigs: |
+  - job_name: loki
+    metrics_path: /metrics
+    static_configs:
+      - targets:
+          - 'monitoring-loki-distributor:3100'
+          - 'monitoring-loki-ingester:3100'
+          - 'monitoring-loki-querier:3100'
+          - 'monitoring-loki-query-frontend:3100'
+          - 'monitoring-loki-compactor:3100'
+          - 'monitoring-loki-index-gateway:3100'
+          - 'monitoring-loki-query-scheduler:3100'
+        labels:
+          instance: 'loki'
+          cluster: 'k8s'
+
+  - job_name: grafana
+    static_configs:
+      - targets: ['monitoring-grafana:80']
+        labels:
+          instance: 'grafana'
+          cluster: 'k8s'
+
+  - job_name: mimir-distributor
+    static_configs:
+      - targets: ['monitoring-mimir-distributor:8080']
+        labels:
+          instance: 'mimir-distributor'
+          cluster: 'k8s'
+
+  - job_name: tempo
+    static_configs:
+      - targets: ['monitoring-tempo-query-frontend:3200']
+        labels:
+          instance: 'tempo'
+          cluster: 'k8s'
\ No newline at end of file
diff --git a/lgtm-stack/terraform/values/tempo-values.yaml b/lgtm-stack/terraform/values/tempo-values.yaml
index e69de29b..934b509b 100644
--- a/lgtm-stack/terraform/values/tempo-values.yaml
+++ b/lgtm-stack/terraform/values/tempo-values.yaml
@@ -0,0 +1,107 @@
+# Service Account
+serviceAccount:
+  create: false
+  name: ${k8s_service_account_name}
+  annotations:
+    iam.gke.io/gcp-service-account: ${gcp_service_account_email}
+
+# Global Overrides
+global_overrides:
+  per_tenant_override_config: /runtime-config/overrides.yaml
+  defaults:
+    metrics_generator:
+
processors: [service-graphs, span-metrics] + +# Storage (GCS) +storage: + trace: + backend: gcs + gcs: + bucket_name: ${tempo_traces_bucket} + chunk_buffer_size: 10485760 + wal: + path: /var/tempo/wal + +# Enable OTLP Receivers (This automatically exposes ports on Service) +traces: + otlp: + grpc: + enabled: true + http: + enabled: true + jaeger: + thriftCompact: + enabled: true + thriftHttp: + enabled: true + zipkin: + enabled: true + +# Distributor +distributor: + replicas: 1 + ring: + replication_factor: 1 + config: + log_received_spans: + enabled: false + +# Ingester +ingester: + replicas: 1 + config: + replication_factor: 1 + trace_idle_period: 10s + max_block_bytes: 1_000_000 + max_block_duration: 5m + +# Compactor +compactor: + replicas: 1 + config: + compaction: + block_retention: 720h + +# Querier +querier: + replicas: 1 + +# Query Frontend +queryFrontend: + replicas: 1 + config: + search: + duration_slo: 5s + throughput_bytes_slo: 1.074e+09 + +# Gateway +gateway: + enabled: true + replicas: 1 + ingress: + enabled: false + ingressClassName: "${ingress_class_name}" + annotations: + cert-manager.io/issuer: "${cert_issuer_name}" + external-dns.alpha.kubernetes.io/hostname: tempo.${monitoring_domain} + grpcIngress: + enabled: false + ingressClassName: "${ingress_class_name}" + annotations: + cert-manager.io/issuer: "${cert_issuer_name}" + service: + type: ClusterIP + +memcached: + enabled: true + +memcachedExporter: + enabled: false + +resources: + requests: + cpu: 100m + memory: 512Mi + limits: + cpu: 1000m + memory: 1.5Gi # Optimized for resource constraints% \ No newline at end of file diff --git a/lgtm-stack/terraform/variables.tf b/lgtm-stack/terraform/variables.tf index e69de29b..57f1e227 100644 --- a/lgtm-stack/terraform/variables.tf +++ b/lgtm-stack/terraform/variables.tf @@ -0,0 +1,137 @@ +variable "project_id" { + description = "GCP Project ID" + type = string +} + +variable "region" { + description = "GCP Region" + type = string + default = 
"us-central1" +} + +variable "cluster_name" { + description = "GKE Cluster Name" + type = string +} + +variable "cluster_location" { + description = "GKE Cluster Location" + type = string +} + +variable "namespace" { + description = "Kubernetes Namespace for Observability Stack" + type = string + default = "observability" +} + +variable "k8s_service_account_name" { + description = "Kubernetes Service Account Name" + type = string + default = "observability-sa" +} + +variable "gcp_service_account_name" { + description = "GCP Service Account Name (6-30 chars, lowercase, start with letter, end with letter/number)" + type = string + default = "gke-observability-sa" + + validation { + condition = can(regex("^[a-z](?:[-a-z0-9]{4,28}[a-z0-9])$", var.gcp_service_account_name)) + error_message = "GCP service account name must be 6-30 characters, start with a lowercase letter, contain only lowercase letters, numbers, and hyphens, and end with a lowercase letter or number." + } +} + +variable "environment" { + description = "Environment (e.g., restricted, production)" + type = string + default = "production" +} + +variable "monitoring_domain" { + description = "Domain for monitoring services" + type = string +} + +variable "letsencrypt_email" { + description = "Email address for Let's Encrypt certificate notifications" + type = string +} + +variable "ingress_class_name" { + description = "Ingress class to use for all ingress resources (e.g., nginx, traefik, kong). Must match an existing IngressClass in the cluster." 
+ type = string + default = "nginx" +} + +variable "grafana_admin_password" { + description = "Admin password for Grafana" + type = string + sensitive = true +} + +variable "install_cert_manager" { + description = "Whether to install cert-manager" + type = bool + default = false +} + +variable "install_nginx_ingress" { + description = "Whether to install NGINX Ingress Controller" + type = bool + default = false +} + +variable "cert_manager_version" { + description = "Version of cert-manager chart" + type = string + default = "v1.15.0" +} + +variable "nginx_ingress_version" { + description = "Version of ingress-nginx chart" + type = string + default = "4.10.1" +} + +variable "loki_version" { + description = "Version of Loki chart" + type = string + default = "6.6.4" +} + +variable "mimir_version" { + description = "Version of Mimir chart" + type = string + default = "5.5.0" +} + +variable "tempo_version" { + description = "Version of Tempo chart" + type = string + default = "1.57.0" +} + +variable "prometheus_version" { + description = "Version of Prometheus chart" + type = string + default = "25.27.0" +} + +variable "grafana_version" { + description = "Version of Grafana chart" + type = string + default = "10.3.0" +} + +variable "loki_schema_from_date" { + description = "Date from which Loki schema is effective (YYYY-MM-DD)" + type = string + default = "2024-01-01" +} + +variable "cert_issuer_name" { + description = "Name of the Cert-Manager Issuer to create/use" + type = string + default = "letsencrypt-prod" +}