diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5cd285af..367ab512 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -114,6 +114,18 @@ jobs:
           echo "=== Post-deployment validation ==="
           ./scripts/test.sh check-deployment
 
+      - name: Wait for monitoring stack
+        run: |
+          echo "=== Waiting for monitoring components (required for autoscaling) ==="
+          pids=()
+          kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server,app.kubernetes.io/name=prometheus -n eoapi --timeout=120s & pids+=($!)
+          kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=grafana -n eoapi --timeout=120s & pids+=($!)
+          kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=prometheus-adapter -n eoapi --timeout=120s & pids+=($!)
+          # A bare `wait` exits 0 even when a background job failed, so wait on each PID
+          for pid in "${pids[@]}"; do wait "$pid"; done
+          echo "✅ Monitoring stack ready"
+          kubectl get hpa -n eoapi
+
       - name: Run integration tests
         run: |
           export RELEASE_NAME="$RELEASE_NAME"
diff --git a/.github/workflows/tests/conftest.py b/.github/workflows/tests/conftest.py
index e5cedd45..891df5f0 100644
--- a/.github/workflows/tests/conftest.py
+++ b/.github/workflows/tests/conftest.py
@@ -1,9 +1,13 @@
+import json
 import os
-from typing import Any, Generator
+import subprocess
+import time
+from typing import Any, Dict, Generator, List, Optional, cast
 
 import psycopg2
 import psycopg2.extensions
 import pytest
+import requests
 
 
 @pytest.fixture(scope="session")
@@ -22,17 +26,22 @@ def stac_endpoint() -> str:
 
 
 @pytest.fixture(scope="session")
-def db_connection() -> Generator[Any, None, None]:
-    """Create database connection for testing."""
-    # Require all database connection parameters to be explicitly set
+def db_connection() -> Generator[psycopg2.extensions.connection, None, None]:
    required_vars = ["PGHOST", "PGPORT", "PGDATABASE", "PGUSER", "PGPASSWORD"]
    missing_vars = [var for var in required_vars if not os.getenv(var)]
-
    if missing_vars:
        pytest.fail(
            f"Required environment variables not set: {', '.join(missing_vars)}"
        )
 
+    # All required vars are guaranteed to exist due to the check above
+    connection_params = {
+        "host": os.getenv("PGHOST"),
+        "port": os.getenv("PGPORT"),
+        "database": os.getenv("PGDATABASE"),
+        "user": os.getenv("PGUSER"),
+        "password": os.getenv("PGPASSWORD"),
+    }
 
    try:
        conn = psycopg2.connect(
@@ -47,3 +56,175 @@ def db_connection() -> Generator[Any, None, None]:
         conn.close()
     except psycopg2.Error as e:
         pytest.fail(f"Cannot connect to database: {e}")
+
+
+def get_namespace() -> str:
+    """Get the namespace from environment variable."""
+    return os.environ.get("NAMESPACE", "eoapi")
+
+
+def get_release_name() -> str:
+    """Get the release name from environment variable."""
+    return os.environ.get("RELEASE_NAME", "eoapi")
+
+
+def kubectl_get(
+    resource: str,
+    namespace: Optional[str] = None,
+    label_selector: Optional[str] = None,
+    output: str = "json",
+) -> subprocess.CompletedProcess[str]:
+    cmd: List[str] = ["kubectl", "get", resource]
+
+    if namespace:
+        cmd.extend(["-n", namespace])
+
+    if label_selector:
+        cmd.extend(["-l", label_selector])
+
+    if output:
+        cmd.extend(["-o", output])
+
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    return result
+
+
+def kubectl_port_forward(
+    service: str, local_port: int, remote_port: int, namespace: str
+) -> subprocess.Popen[str]:
+    cmd = [
+        "kubectl",
+        "port-forward",
+        f"svc/{service}",
+        f"{local_port}:{remote_port}",
+        "-n",
+        namespace,
+    ]
+
+    process = subprocess.Popen(
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
+    )
+
+    # Give the tunnel a moment to establish before callers use it
+    time.sleep(3)
+    return process
+
+
+def 
wait_for_url(url: str, timeout: int = 30, interval: int = 2) -> bool: + start_time = time.time() + while time.time() - start_time < timeout: + try: + response = requests.get(url, timeout=5) + if response.status_code == 200: + return True + except (requests.RequestException, requests.ConnectionError): + pass + time.sleep(interval) + return False + + +def make_request(url: str, timeout: int = 10) -> bool: + try: + response = requests.get(url, timeout=timeout) + return response.status_code == 200 + except requests.RequestException: + return False + + +def get_base_url() -> str: + """Get the base URL for API access.""" + namespace = get_namespace() + + # Check if we have an ingress + result = subprocess.run( + ["kubectl", "get", "ingress", "-n", namespace, "-o", "json"], + capture_output=True, + text=True, + ) + + if result.returncode == 0: + ingress_data = json.loads(result.stdout) + if ingress_data["items"]: + ingress = ingress_data["items"][0] + rules = ingress.get("spec", {}).get("rules", []) + if rules: + host = rules[0].get("host", "localhost") + # Check if host is accessible + try: + response = requests.get( + f"http://{host}/stac/collections", timeout=5 + ) + if response.status_code == 200: + return f"http://{host}" + except requests.RequestException: + pass + + return "http://localhost:8080" + + +def get_pod_metrics(namespace: str, service_name: str) -> List[Dict[str, str]]: + """Get CPU and memory metrics for pods of a specific service.""" + release_name_val = get_release_name() + result = subprocess.run( + [ + "kubectl", + "top", + "pods", + "-n", + namespace, + "-l", + f"app={release_name_val}-{service_name}", + "--no-headers", + ], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + return [] + + metrics: List[Dict[str, str]] = [] + for line in result.stdout.strip().split("\n"): + if line.strip(): + parts = line.split() + if len(parts) >= 3: + pod_name = parts[0] + cpu = parts[1] # e.g., "25m" + memory = parts[2] # e.g., "128Mi" + metrics.append({"pod": pod_name, "cpu": cpu, "memory": memory}) + + return metrics + + +def get_hpa_status(namespace: str, hpa_name: str) -> Optional[Dict[str, Any]]: + """Get HPA status for a specific HPA.""" + result = kubectl_get("hpa", namespace=namespace, output="json") + if result.returncode != 0: + return None + + hpas = json.loads(result.stdout) + for hpa in hpas["items"]: + if hpa["metadata"]["name"] == hpa_name: + return cast(Dict[str, Any], hpa) + + return None + + +def get_pod_count(namespace: str, service_name: str) -> int: + """Get the count of running pods for a specific service.""" + release_name_val = get_release_name() + result = kubectl_get( + "pods", + namespace=namespace, + label_selector=f"app={release_name_val}-{service_name}", + ) + + if result.returncode != 0: + return 0 + + pods = json.loads(result.stdout) + running_pods = [ + pod for pod in pods["items"] if pod["status"]["phase"] == "Running" + ] + + return len(running_pods) diff --git a/.github/workflows/tests/test_autoscaling.py b/.github/workflows/tests/test_autoscaling.py index 83cdedf2..d8a19c11 100644 --- a/.github/workflows/tests/test_autoscaling.py +++ b/.github/workflows/tests/test_autoscaling.py @@ -1,148 +1,22 @@ """Test autoscaling behavior and HPA functionality.""" import json -import os import subprocess import threading import time -from typing import Any, Dict, List, Optional, cast +from typing import Any, Dict, List import pytest import requests - - -def get_namespace() -> str: - return os.environ.get("NAMESPACE", "eoapi") - - -def 
get_release_name() -> str: - return os.environ.get("RELEASE_NAME", "eoapi") - - -def get_base_url() -> str: - namespace = get_namespace() - - # Check if we have an ingress - result = subprocess.run( - ["kubectl", "get", "ingress", "-n", namespace, "-o", "json"], - capture_output=True, - text=True, - ) - - if result.returncode == 0: - ingress_data = json.loads(result.stdout) - if ingress_data["items"]: - ingress = ingress_data["items"][0] - rules = ingress.get("spec", {}).get("rules", []) - if rules: - host = rules[0].get("host", "localhost") - # Check if host is accessible - try: - response = requests.get( - f"http://{host}/stac/collections", timeout=5 - ) - if response.status_code == 200: - return f"http://{host}" - except requests.RequestException: - pass - - return "http://localhost:8080" - - -def kubectl_get( - resource: str, - namespace: Optional[str] = None, - label_selector: Optional[str] = None, - output: str = "json", -) -> subprocess.CompletedProcess[str]: - cmd = ["kubectl", "get", resource] - - if namespace: - cmd.extend(["-n", namespace]) - - if label_selector: - cmd.extend(["-l", label_selector]) - - if output: - cmd.extend(["-o", output]) - - result = subprocess.run(cmd, capture_output=True, text=True) - return result - - -def get_pod_metrics(namespace: str, service_name: str) -> List[Dict[str, str]]: - release_name = get_release_name() - result = subprocess.run( - [ - "kubectl", - "top", - "pods", - "-n", - namespace, - "-l", - f"app={release_name}-{service_name}", - "--no-headers", - ], - capture_output=True, - text=True, - ) - - if result.returncode != 0: - return [] - - metrics: List[Dict[str, str]] = [] - for line in result.stdout.strip().split("\n"): - if line.strip(): - parts = line.split() - if len(parts) >= 3: - pod_name = parts[0] - cpu = parts[1] # e.g., "25m" - memory = parts[2] # e.g., "128Mi" - metrics.append({"pod": pod_name, "cpu": cpu, "memory": memory}) - - return metrics - - -def get_hpa_status(namespace: str, hpa_name: str) -> Optional[Dict[str, Any]]: - """Get HPA status for a specific HPA.""" - result = kubectl_get("hpa", namespace=namespace, output="json") - if result.returncode != 0: - return None - - hpas = json.loads(result.stdout) - for hpa in hpas["items"]: - if hpa["metadata"]["name"] == hpa_name: - return cast(Dict[str, Any], hpa) - - return None - - -def get_pod_count(namespace: str, service_name: str) -> int: - release_name = get_release_name() - result = kubectl_get( - "pods", - namespace=namespace, - label_selector=f"app={release_name}-{service_name}", - ) - - if result.returncode != 0: - return 0 - - pods = json.loads(result.stdout) - running_pods = [ - pod for pod in pods["items"] if pod["status"]["phase"] == "Running" - ] - - return len(running_pods) - - -def make_request(url: str, timeout: int = 10) -> bool: - """Make a single HTTP request and return success status.""" - try: - response = requests.get(url, timeout=timeout) - return bool(response.status_code == 200) - except requests.RequestException: - return False +from conftest import ( + get_base_url, + get_namespace, + get_pod_count, + get_pod_metrics, + get_release_name, + kubectl_get, + make_request, +) def generate_load( diff --git a/.github/workflows/tests/test_observability.py b/.github/workflows/tests/test_observability.py new file mode 100644 index 00000000..a6215dd5 --- /dev/null +++ b/.github/workflows/tests/test_observability.py @@ -0,0 +1,460 @@ +"""Test observability stack deployment and functionality.""" + +import json +import subprocess + +import pytest +import requests 
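+
+# Shared kubectl/HTTP helpers (kubectl_get, kubectl_port_forward, wait_for_url)
+# now live in conftest.py so test_autoscaling.py and this module use one copy.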
+from conftest import ( + get_namespace, + get_release_name, + kubectl_get, + kubectl_port_forward, + wait_for_url, +) + + +class TestMonitoringStackDeployment: + def test_prometheus_server_deployment(self) -> None: + namespace = get_namespace() + result = kubectl_get( + "deployment", + namespace=namespace, + label_selector="app.kubernetes.io/name=prometheus,app.kubernetes.io/component=server", + ) + + assert result.returncode == 0, "Failed to get Prometheus deployment" + + deployments = json.loads(result.stdout) + assert deployments["items"], "No Prometheus server deployment found" + + deployment = deployments["items"][0] + + ready_replicas = deployment["status"].get("readyReplicas", 0) + desired_replicas = deployment["spec"]["replicas"] + assert ready_replicas == desired_replicas, ( + f"Prometheus not ready: {ready_replicas}/{desired_replicas} replicas" + ) + + def test_grafana_deployment(self) -> None: + namespace = get_namespace() + result = kubectl_get( + "deployment", + namespace=namespace, + label_selector="app.kubernetes.io/name=grafana", + ) + + assert result.returncode == 0, "Failed to get Grafana deployment" + + deployments = json.loads(result.stdout) + assert deployments["items"], "No Grafana deployment found" + + deployment = deployments["items"][0] + ready_replicas = deployment["status"].get("readyReplicas", 0) + desired_replicas = deployment["spec"]["replicas"] + assert ready_replicas == desired_replicas, ( + f"Grafana not ready: {ready_replicas}/{desired_replicas} replicas" + ) + + def test_prometheus_adapter_deployment(self) -> None: + namespace = get_namespace() + result = kubectl_get( + "deployment", + namespace=namespace, + label_selector="app.kubernetes.io/name=prometheus-adapter", + ) + + assert result.returncode == 0, ( + "Failed to get Prometheus Adapter deployment" + ) + + deployments = json.loads(result.stdout) + assert deployments["items"], "No Prometheus Adapter deployment found" + + deployment = deployments["items"][0] + ready_replicas = deployment["status"].get("readyReplicas", 0) + desired_replicas = deployment["spec"]["replicas"] + assert ready_replicas == desired_replicas, ( + f"Prometheus Adapter not ready: {ready_replicas}/{desired_replicas} replicas" + ) + + def test_kube_state_metrics_deployment(self) -> None: + """Test kube-state-metrics deployment is running.""" + namespace = get_namespace() + result = kubectl_get( + "deployment", + namespace=namespace, + label_selector="app.kubernetes.io/name=kube-state-metrics", + ) + + assert result.returncode == 0, ( + "Failed to get kube-state-metrics deployment" + ) + + deployments = json.loads(result.stdout) + assert deployments["items"], "No kube-state-metrics deployment found" + + deployment = deployments["items"][0] + ready_replicas = deployment["status"].get("readyReplicas", 0) + desired_replicas = deployment["spec"]["replicas"] + assert ready_replicas == desired_replicas, ( + f"kube-state-metrics not ready: {ready_replicas}/{desired_replicas} replicas" + ) + + def test_node_exporter_deployment(self) -> None: + """Test node-exporter DaemonSet is running.""" + namespace = get_namespace() + result = kubectl_get( + "daemonset", + namespace=namespace, + label_selector="app.kubernetes.io/name=prometheus-node-exporter", + ) + + assert result.returncode == 0, "Failed to get node-exporter daemonset" + + daemonsets = json.loads(result.stdout) + assert daemonsets["items"], "No node-exporter daemonset found" + + daemonset = daemonsets["items"][0] + ready = daemonset["status"].get("numberReady", 0) + desired = 
daemonset["status"].get("desiredNumberScheduled", 0) + assert ready > 0, "No node-exporter pods are ready" + assert ready == desired, ( + f"node-exporter not fully deployed: {ready}/{desired} nodes" + ) + + +class TestMetricsCollection: + def test_custom_metrics_api_available(self) -> None: + result = subprocess.run( + ["kubectl", "get", "--raw", "/apis/custom.metrics.k8s.io/v1beta1"], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + pytest.skip( + "Custom metrics API not available - prometheus-adapter may not be configured" + ) + + api_response = json.loads(result.stdout) + assert api_response["kind"] == "APIResourceList", ( + "Invalid custom metrics API response" + ) + assert ( + api_response["groupVersion"] == "custom.metrics.k8s.io/v1beta1" + ), "Wrong API version" + + def test_metrics_server_integration(self) -> None: + """Verify metrics-server is working for resource metrics.""" + # Test if we can get pod metrics + result = subprocess.run( + ["kubectl", "top", "pods", "-n", get_namespace(), "--no-headers"], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + pytest.skip("metrics-server not available or not ready") + + # Should have some metrics output + lines = result.stdout.strip().split("\n") + assert len(lines) > 0, "No pod metrics available" + + # Check format includes CPU and Memory columns + for line in lines: + if line.strip(): # Skip empty lines + parts = line.split() + assert len(parts) >= 3, f"Invalid metrics format: {line}" + + def test_prometheus_targets_reachable(self) -> None: + """Test that Prometheus can reach its scrape targets (when accessible).""" + namespace = get_namespace() + + # Check if Prometheus service exists + result = kubectl_get( + "svc", + namespace=namespace, + label_selector="app.kubernetes.io/name=prometheus,app.kubernetes.io/component=server", + ) + + if result.returncode != 0 or not json.loads(result.stdout)["items"]: + pytest.skip("Prometheus service not found") + + service = json.loads(result.stdout)["items"][0] + service_name = service["metadata"]["name"] + + # Try to port-forward to Prometheus + local_port = 19090 + prometheus_port = 80 + + process = None + try: + process = kubectl_port_forward( + service_name, local_port, prometheus_port, namespace + ) + + # Wait for port forward to establish + if not wait_for_url( + f"http://localhost:{local_port}/api/v1/targets" + ): + pytest.skip("Could not establish connection to Prometheus") + + # Check Prometheus targets + response = requests.get( + f"http://localhost:{local_port}/api/v1/targets" + ) + assert response.status_code == 200, ( + "Failed to get Prometheus targets" + ) + + targets_data = response.json() + assert targets_data["status"] == "success", ( + "Failed to retrieve targets" + ) + + active_targets = targets_data["data"]["activeTargets"] + + # Should have at least some targets + assert len(active_targets) > 0, "No active Prometheus targets found" + + # Check for expected target labels + expected_jobs = { + "kubernetes-pods", + "kubernetes-nodes", + "kubernetes-service-endpoints", + "kubernetes-apiservers", + } + + found_jobs = { + target["labels"].get("job") for target in active_targets + } + + # At least some of the expected jobs should be present + common_jobs = expected_jobs.intersection(found_jobs) + assert len(common_jobs) > 0, ( + f"None of the expected jobs found. 
Expected: {expected_jobs}, " + f"Found: {found_jobs}" + ) + + # Check health of targets + unhealthy_targets = [ + target for target in active_targets if target["health"] != "up" + ] + + # Warning about unhealthy targets but don't fail the test + if unhealthy_targets: + print( + f"Warning: {len(unhealthy_targets)} unhealthy targets found" + ) + + finally: + if process: + process.terminate() + process.wait() + + +class TestAutoscalingIntegration: + """Test HPA and metrics integration for autoscaling.""" + + def test_hpa_resources_exist(self) -> None: + """Verify HPA resources are created for eoAPI services.""" + namespace = get_namespace() + release = get_release_name() + result = kubectl_get("hpa", namespace=namespace) + + assert result.returncode == 0, "Failed to get HPA resources" + + hpas = json.loads(result.stdout)["items"] + + # Expected HPA names based on the Helm chart + expected_hpas = [ + f"{release}-pgstac", + f"{release}-raster", + f"{release}-stac", + f"{release}-vector", + ] + + found_hpas = {hpa["metadata"]["name"] for hpa in hpas} + + # Check which expected HPAs exist + existing_hpas = [hpa for hpa in expected_hpas if hpa in found_hpas] + + if not existing_hpas: + pytest.skip( + "No eoAPI HPA resources found - autoscaling may be disabled" + ) + + # For each found HPA, check configuration + for hpa_name in existing_hpas: + hpa = next(h for h in hpas if h["metadata"]["name"] == hpa_name) + spec = hpa["spec"] + + assert spec["minReplicas"] >= 1, ( + f"HPA {hpa_name} min replicas too low" + ) + assert spec["maxReplicas"] > spec["minReplicas"], ( + f"HPA {hpa_name} max replicas not greater than min" + ) + + def test_hpa_metrics_available(self) -> None: + """Test that HPA can access metrics for scaling decisions.""" + namespace = get_namespace() + result = kubectl_get("hpa", namespace=namespace) + + if result.returncode != 0: + pytest.skip("HPA resources not accessible") + + hpas = json.loads(result.stdout)["items"] + + if not hpas: + pytest.skip("No HPA resources found") + + # Check each HPA for metric availability + for hpa in hpas: + name = hpa["metadata"]["name"] + status = hpa.get("status", {}) + + # Check if HPA has current metrics (may be None initially) + current_metrics = status.get("currentMetrics") + + # Conditions tell us if metrics are available + conditions = status.get("conditions", []) + + # Look for ScalingActive condition + scaling_active = next( + (c for c in conditions if c["type"] == "ScalingActive"), None + ) + + if scaling_active: + assert scaling_active["status"] == "True", ( + f"HPA {name} scaling is not active: {scaling_active.get('message', 'Unknown reason')}" + ) + + # If we have been running for a while, we should have metrics + # But on fresh deployments, metrics might not be available yet + if current_metrics is not None: + assert len(current_metrics) > 0, ( + f"HPA {name} has no current metrics" + ) + + def test_service_resource_requests_configured(self) -> None: + """Verify pods have resource requests for HPA to function.""" + namespace = get_namespace() + release = get_release_name() + result = kubectl_get( + "deployment", + namespace=namespace, + label_selector=f"app.kubernetes.io/instance={release}", + ) + + if result.returncode != 0: + pytest.skip("Could not get eoAPI deployments") + + deployments = json.loads(result.stdout)["items"] + + if not deployments: + pytest.skip("No eoAPI deployments found") + + for deployment in deployments: + name = deployment["metadata"]["name"] + containers = deployment["spec"]["template"]["spec"]["containers"] + + 
for container in containers: + container_name = container["name"] + resources = container.get("resources", {}) + requests = resources.get("requests", {}) + + # At minimum, CPU requests should be set for HPA + # Memory is optional but recommended + if "cpu" not in requests: + print( + f"Warning: Container {container_name} in {name} " + f"has no CPU request - HPA may not function properly" + ) + + # If HPA is configured, we need resource requests + # This is more of a warning than a failure + if not requests: + print( + f"Warning: Container {container_name} in {name} " + f"has no resource requests defined" + ) + + +class TestGrafanaDashboards: + def test_grafana_service_accessibility(self) -> None: + namespace = get_namespace() + result = kubectl_get( + "svc", + namespace=namespace, + label_selector="app.kubernetes.io/name=grafana", + ) + + if result.returncode != 0: + pytest.skip("Grafana service not found") + + services = json.loads(result.stdout)["items"] + if not services: + pytest.skip("No Grafana service found") + + service = services[0] + service_name = service["metadata"]["name"] + + # Port forward to Grafana + local_port = 13000 + grafana_port = 80 + + process = None + try: + process = kubectl_port_forward( + service_name, local_port, grafana_port, namespace + ) + + if not wait_for_url(f"http://localhost:{local_port}/api/health"): + pytest.skip("Could not connect to Grafana") + + response = requests.get(f"http://localhost:{local_port}/api/health") + assert response.status_code == 200, "Grafana health check failed" + + health_data = response.json() + assert health_data.get("database") == "ok", ( + "Grafana database not healthy" + ) + + finally: + if process: + process.terminate() + process.wait() + + def test_grafana_admin_secret_exists(self) -> None: + namespace = get_namespace() + result = kubectl_get( + "secret", + namespace=namespace, + label_selector="app.kubernetes.io/name=grafana", + ) + + assert result.returncode == 0, "Failed to get Grafana secrets" + + secrets = json.loads(result.stdout)["items"] + assert secrets, "No Grafana secrets found" + + admin_secret = None + for secret in secrets: + name = secret["metadata"]["name"] + if "grafana" in name: + data = secret.get("data", {}) + # Check if it contains admin credentials + if "admin-password" in data or "admin-user" in data: + admin_secret = secret + break + + assert admin_secret is not None, ( + "Grafana admin credentials secret not found" + ) + + secret_data = admin_secret.get("data", {}) + assert "admin-password" in secret_data, ( + "admin-password not found in Grafana secret" + ) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9d48e846..a6eb8479 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Automatic queue processor CronJob created when `use_queue` is "true" (configurable schedule via `queueProcessor.schedule`) - Automatic extent updater CronJob created when `update_collection_extent` is "false" (configurable schedule via `extentUpdater.schedule`) - Added ConfigMap checksum annotations to automatically restart pods when configuration changes [#344](https://github.com/developmentseed/eoapi-k8s/pull/344) -- Tests for autoscaling +- Tests for autoscaling [#343](https://github.com/developmentseed/eoapi-k8s/pull/343) +- Added tests for observability stack [#342](https://github.com/developmentseed/eoapi-k8s/pull/342) - Added validation to require `postgrescluster.enabled: false` when using external databases 
[#346](https://github.com/developmentseed/eoapi-k8s/pull/346)
 
 ### Changed
diff --git a/charts/eoapi/local-base-values.yaml b/charts/eoapi/local-base-values.yaml
index 179e4333..3644ee7d 100644
--- a/charts/eoapi/local-base-values.yaml
+++ b/charts/eoapi/local-base-values.yaml
@@ -86,9 +86,9 @@ browser:
       cpu: "200m"
       memory: "128Mi"
 
-# Disable resource-intensive features for local development
 monitoring:
-  enabled: false
+  metricsServer:
+    enabled: true
 
 autoscaling:
   enabled: false
diff --git a/charts/eoapi/local-k3s-values.yaml b/charts/eoapi/local-k3s-values.yaml
index 7347cb47..efda8237 100644
--- a/charts/eoapi/local-k3s-values.yaml
+++ b/charts/eoapi/local-k3s-values.yaml
@@ -69,3 +69,8 @@ postgrescluster:
       limits:
         cpu: "500m"
         memory: "1Gi"
+
+# Disable metrics-server as k3s provides it built-in
+monitoring:
+  metricsServer:
+    enabled: false
diff --git a/docs/examples/values-full-observability.yaml b/docs/examples/values-full-observability.yaml
new file mode 100644
index 00000000..da0cf202
--- /dev/null
+++ b/docs/examples/values-full-observability.yaml
@@ -0,0 +1,303 @@
+# Example values for eoAPI with full observability stack
+#
+# This configuration provides comprehensive observability including:
+# - Core metrics collection and autoscaling (included in main chart)
+# - Persistent Prometheus storage with 30-day retention
+# - Advanced HPA policies with both CPU and request-rate scaling
+# - Production-ready resource allocations
+# - High availability setup with multiple replicas
+#
+# To deploy the full stack:
+#
+# 1. Deploy main chart with monitoring:
+#    helm install eoapi eoapi/eoapi -f values-full-observability.yaml --namespace eoapi --create-namespace
+#
+# 2. Deploy observability chart separately:
+#    helm install eoapi-obs eoapi/eoapi-observability --namespace eoapi
+#
+# 3. Optional: Configure external integrations
+#    - DataDog: Set up prometheus scraping
+#    - New Relic: Deploy NR Kubernetes integration
+#    - External Grafana: Point to the exposed Prometheus service
+#
+# Monitoring endpoints (if LoadBalancer is used):
+# - Prometheus: http://<EXTERNAL-IP>:9090
+# - Grafana: http://<EXTERNAL-IP> (from observability chart)
+#
+# Security considerations:
+# - Use internal LoadBalancers for Prometheus in production
+# - Set up proper RBAC for service accounts
+# - Configure network policies to restrict access
+# - Enable TLS for all external endpoints
+#
+# Performance tuning:
+# - Monitor actual resource usage and adjust requests/limits
+# - Tune HPA scaling policies based on traffic patterns
+# - Adjust Prometheus retention based on storage costs
+# - Consider using remote storage for Prometheus (S3, GCS, etc.)
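+#
+# Quick post-deployment checks (service names assume release "eoapi" in
+# namespace "eoapi", matching the chart naming used by scripts/deploy.sh):
+#   kubectl -n eoapi get hpa
+#   kubectl -n eoapi port-forward svc/eoapi-prometheus-server 9090:80
+#   kubectl -n eoapi port-forward svc/eoapi-grafana 3000:80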
+ +# Git SHA for deployments (set via CI/CD or command line) +gitSha: "latest" + +###################### +# INGRESS +###################### +ingress: + enabled: true + className: "nginx" + # IMPORTANT: Set a proper hostname for metrics collection + host: "eoapi.example.com" # Replace with your domain + tls: + enabled: true + secretName: eoapi-tls + +###################### +# DATABASE +###################### +postgrescluster: + enabled: true + monitoring: true # Enable PostgreSQL monitoring + instances: + - name: eoapi + replicas: 2 # HA setup for production + dataVolumeClaimSpec: + accessModes: + - "ReadWriteOnce" + resources: + requests: + storage: "100Gi" + cpu: "2048m" + memory: "8192Mi" + +###################### +# COMPREHENSIVE MONITORING +###################### +monitoring: + # Essential components + metricsServer: + enabled: true + apiService: + create: true + + # Full Prometheus setup with all collectors + prometheus: + enabled: true + # Keep alertmanager disabled - we'll use Grafana alerting instead + alertmanager: + enabled: false + # Enable pushgateway for advanced metrics + prometheus-pushgateway: + enabled: true + # Full metrics collection + kube-state-metrics: + enabled: true + prometheus-node-exporter: + enabled: true + # Production-ready resource allocation + resources: + limits: + cpu: 50m + memory: 64Mi + requests: + cpu: 50m + memory: 64Mi + # Prometheus server configuration + server: + # Expose Prometheus for external access (optional) + service: + type: LoadBalancer + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: "nlb" + service.beta.kubernetes.io/aws-load-balancer-internal: "true" + # Persistent storage for metrics + persistentVolume: + enabled: true + size: 50Gi + storageClass: "gp3" # Adjust for your cloud provider + # Retention and performance settings + retention: "30d" # Keep 30 days of metrics + resources: + limits: + cpu: "2000m" + memory: "4096Mi" + requests: + cpu: "1000m" + memory: "2048Mi" + + # Advanced prometheus-adapter configuration + prometheusAdapter: + enabled: true + # Enhanced resource allocation + resources: + limits: + cpu: 250m + memory: 256Mi + requests: + cpu: 100m + memory: 128Mi + +###################### +# SERVICES WITH ADVANCED AUTOSCALING +###################### + +stac: + enabled: true + autoscaling: + enabled: true + minReplicas: 3 # Higher minimum for HA + maxReplicas: 30 + type: "both" # Scale on both CPU and request rate + behaviour: + scaleDown: + stabilizationWindowSeconds: 600 # 10 minutes + policies: + - type: Percent + value: 50 + periodSeconds: 300 + scaleUp: + stabilizationWindowSeconds: 60 + policies: + - type: Percent + value: 100 + periodSeconds: 60 + targets: + cpu: 70 + requestRate: 40000m + settings: + resources: + limits: + cpu: "1500m" + memory: "3072Mi" + requests: + cpu: "750m" + memory: "1536Mi" + +raster: + enabled: true + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 25 + type: "both" + behaviour: + scaleDown: + stabilizationWindowSeconds: 900 # 15 minutes - raster workloads are bursty + scaleUp: + stabilizationWindowSeconds: 120 # 2 minutes + targets: + cpu: 60 # Lower CPU target due to intensive processing + requestRate: 20000m + settings: + resources: + limits: + cpu: "2048m" + memory: "8192Mi" + requests: + cpu: "1024m" + memory: "4096Mi" + envVars: + GDAL_CACHEMAX: "1024" # 1GB cache + WEB_CONCURRENCY: "4" # Conservative for memory usage + GDAL_HTTP_MAX_RETRY: "3" + GDAL_HTTP_RETRY_DELAY: "1" + +vector: + enabled: true + autoscaling: + enabled: true + minReplicas: 2 + 
maxReplicas: 15
+    type: "both"
+    targets:
+      cpu: 75
+      requestRate: 60000m
+  settings:
+    resources:
+      limits:
+        cpu: "1200m"
+        memory: "2560Mi"
+      requests:
+        cpu: "600m"
+        memory: "1280Mi"
+
+multidim:
+  enabled: true # Enable for comprehensive setup
+  autoscaling:
+    enabled: true
+    minReplicas: 1
+    maxReplicas: 10
+    type: "cpu" # CPU-based scaling for multidim workloads
+    targets:
+      cpu: 50 # Very conservative due to resource intensity
+  settings:
+    resources:
+      limits:
+        cpu: "4096m"
+        memory: "16384Mi" # 16GB for large multidim datasets
+      requests:
+        cpu: "2048m"
+        memory: "8192Mi"
+
+######################
+# STAC BROWSER
+######################
+browser:
+  enabled: true
+  replicaCount: 3 # HA setup
+
+######################
+# PGSTAC BOOTSTRAP
+######################
+pgstacBootstrap:
+  enabled: true
+  settings:
+    loadSamples: false # No samples in production
+    waitConfig:
+      timeout: 1800 # 30 minutes timeout for large migrations
+    resources:
+      requests:
+        cpu: "1024m"
+        memory: "2048Mi"
+      limits:
+        cpu: "2048m"
+        memory: "4096Mi"
+
+######################
+# INTEGRATED OBSERVABILITY
+######################
+# Grafana dashboards integrated with main chart (replaces separate eoapi-observability chart)
+observability:
+  grafana:
+    enabled: true
+    persistence:
+      enabled: true
+      size: 10Gi
+    service:
+      type: LoadBalancer
+      annotations:
+        service.beta.kubernetes.io/aws-load-balancer-type: "nlb"
+        service.beta.kubernetes.io/aws-load-balancer-internal: "false"
+    resources:
+      limits:
+        cpu: 100m
+        memory: 200Mi
+      requests:
+        cpu: 50m
+        memory: 100Mi
+
+######################
+# ADDITIONAL PRODUCTION SETTINGS
+######################
+
+# Service account with monitoring permissions
+serviceAccount:
+  create: true
+  annotations:
+    # Add cloud provider annotations if needed
+    # eks.amazonaws.com/role-arn: arn:aws:iam::ACCOUNT:role/eoapi-monitoring-role
diff --git a/docs/observability.md b/docs/observability.md
index c160d7e6..05b1d757 100644
--- a/docs/observability.md
+++ b/docs/observability.md
@@ -47,6 +47,8 @@ For production deployments, use configuration files instead of command-line flags:
 helm install eoapi eoapi/eoapi -f values-full-observability.yaml
 ```
 
+**For a complete example**: See [examples/values-full-observability.yaml](examples/values-full-observability.yaml)
+
 ## Architecture & Components
 
 **Component Responsibilities:**
diff --git a/scripts/debug-deployment.sh b/scripts/debug-deployment.sh
index 317e6743..b8727e57 100755
--- a/scripts/debug-deployment.sh
+++ b/scripts/debug-deployment.sh
@@ -5,8 +5,8 @@ set -e
 echo "=== Deployment Debug Information ==="
 
 # Get release name from environment or detect it
-RELEASE_NAME=${RELEASE_NAME:-$(kubectl get pods --all-namespaces -l app.kubernetes.io/name=stac -o jsonpath='{.items[0].metadata.labels.app\.kubernetes\.io/instance}' 2>/dev/null || echo "eoapi")}
-NAMESPACE=${NAMESPACE:-$(kubectl get pods --all-namespaces -l app.kubernetes.io/name=stac -o jsonpath='{.items[0].metadata.namespace}' 2>/dev/null || echo "eoapi")}
+RELEASE_NAME=${RELEASE_NAME:-$(kubectl get pods --all-namespaces -l app.kubernetes.io/name=eoapi,app.kubernetes.io/component=stac -o jsonpath='{.items[0].metadata.labels.app\.kubernetes\.io/instance}' 2>/dev/null || echo "eoapi")}
+NAMESPACE=${NAMESPACE:-$(kubectl get pods --all-namespaces -l app.kubernetes.io/name=eoapi,app.kubernetes.io/component=stac -o jsonpath='{.items[0].metadata.namespace}' 2>/dev/null || echo "eoapi")}
 
 echo "Using RELEASE_NAME: $RELEASE_NAME"
 echo "Using NAMESPACE: $NAMESPACE"
@@ -126,13 +126,13 @@ 
kubectl get deployments -l app.kubernetes.io/name=eoapi-notifier -n "$NAMESPACE" # Logs from key components echo "--- Key Component Logs ---" echo "STAC API logs:" -kubectl logs -l app.kubernetes.io/name=stac -n "$NAMESPACE" --tail=20 2>/dev/null || echo "No STAC API logs in namespace $NAMESPACE" +kubectl logs -l app.kubernetes.io/name=eoapi,app.kubernetes.io/component=stac -n "$NAMESPACE" --tail=20 2>/dev/null || echo "No STAC API logs in namespace $NAMESPACE" echo "" echo "TiTiler logs:" -kubectl logs -l app.kubernetes.io/name=titiler -n "$NAMESPACE" --tail=20 2>/dev/null || echo "No TiTiler logs in namespace $NAMESPACE" +kubectl logs -l app.kubernetes.io/name=eoapi,app.kubernetes.io/component=raster -n "$NAMESPACE" --tail=20 2>/dev/null || echo "No TiTiler logs in namespace $NAMESPACE" echo "" echo "TiPG logs:" -kubectl logs -l app.kubernetes.io/name=tipg -n "$NAMESPACE" --tail=20 2>/dev/null || echo "No TiPG logs in namespace $NAMESPACE" +kubectl logs -l app.kubernetes.io/name=eoapi,app.kubernetes.io/component=vector -n "$NAMESPACE" --tail=20 2>/dev/null || echo "No TiPG logs in namespace $NAMESPACE" echo "" echo "eoapi-notifier logs:" kubectl logs -l app.kubernetes.io/name=eoapi-notifier -n "$NAMESPACE" --tail=20 2>/dev/null || echo "No eoapi-notifier logs in namespace $NAMESPACE" @@ -158,6 +158,28 @@ kubectl top nodes 2>/dev/null || echo "Metrics not available" echo "" echo "Pod resource usage in $NAMESPACE:" kubectl top pods -n "$NAMESPACE" 2>/dev/null || echo "Pod metrics not available" + +# Observability stack debugging +echo "--- Observability Stack ---" +echo "HPA status:" +kubectl get hpa -n "$NAMESPACE" -o wide 2>/dev/null || echo "No HPA resources found in namespace $NAMESPACE" +kubectl describe hpa -n "$NAMESPACE" 2>/dev/null || echo "No HPA resources to describe" +echo "" +echo "Custom Metrics API:" +kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" 2>/dev/null || echo "Custom metrics API not available" +echo "" +echo "Monitoring components:" +kubectl get pods -n "$NAMESPACE" | grep -E "(prometheus|grafana|metrics-server|adapter)" 2>/dev/null || echo "No monitoring components found in namespace $NAMESPACE" +echo "" +echo "Prometheus adapter logs:" +kubectl logs -l app.kubernetes.io/name=prometheus-adapter -n "$NAMESPACE" --tail=30 2>/dev/null || echo "No prometheus-adapter logs in namespace $NAMESPACE" +echo "" +echo "Grafana logs:" +kubectl logs -l app.kubernetes.io/name=grafana -n "$NAMESPACE" --tail=20 2>/dev/null || echo "No grafana logs in namespace $NAMESPACE" +echo "" +echo "Metrics server logs:" +kubectl logs -l app.kubernetes.io/name=metrics-server -n "$NAMESPACE" --tail=20 2>/dev/null || echo "No metrics-server logs in namespace $NAMESPACE" + # System controller logs if issues detected if ! 
kubectl get pods -n knative-serving &>/dev/null; then echo "--- Knative Controller Logs ---" diff --git a/scripts/deploy.sh b/scripts/deploy.sh index fc49e8e5..dcb2a81f 100755 --- a/scripts/deploy.sh +++ b/scripts/deploy.sh @@ -422,6 +422,22 @@ deploy_eoapi() { fi HELM_CMD="$HELM_CMD --set testing=true" HELM_CMD="$HELM_CMD --set ingress.host=eoapi.local" + HELM_CMD="$HELM_CMD --set ingress.className=traefik" + + HELM_CMD="$HELM_CMD --set monitoring.prometheus.enabled=true" + HELM_CMD="$HELM_CMD --set monitoring.prometheusAdapter.enabled=true" + HELM_CMD="$HELM_CMD --set observability.grafana.enabled=true" + HELM_CMD="$HELM_CMD --set monitoring.prometheusAdapter.prometheus.url=http://$RELEASE_NAME-prometheus-server.eoapi.svc.cluster.local" + + # Enable autoscaling with CPU metrics + HELM_CMD="$HELM_CMD --set stac.autoscaling.enabled=true" + HELM_CMD="$HELM_CMD --set stac.autoscaling.type=cpu" + HELM_CMD="$HELM_CMD --set raster.autoscaling.enabled=true" + HELM_CMD="$HELM_CMD --set raster.autoscaling.type=cpu" + HELM_CMD="$HELM_CMD --set vector.autoscaling.enabled=true" + HELM_CMD="$HELM_CMD --set vector.autoscaling.type=cpu" + + # Enable notifier HELM_CMD="$HELM_CMD --set eoapi-notifier.enabled=true" # Fix eoapi-notifier secret name dynamically HELM_CMD="$HELM_CMD --set eoapi-notifier.config.sources[0].config.connection.existingSecret.name=$RELEASE_NAME-pguser-eoapi" diff --git a/scripts/lib/common.sh b/scripts/lib/common.sh index dcaf18d6..914f66ce 100755 --- a/scripts/lib/common.sh +++ b/scripts/lib/common.sh @@ -92,7 +92,7 @@ detect_release_name() { # Fallback to pod labels if [ -z "$release_name" ]; then release_name=$(kubectl get pods ${namespace:+-n "$namespace"} \ - -l app.kubernetes.io/name=stac -o jsonpath='{.items[0].metadata.labels.app\.kubernetes\.io/instance}' \ + -l app.kubernetes.io/name=eoapi,app.kubernetes.io/component=stac -o jsonpath='{.items[0].metadata.labels.app\.kubernetes\.io/instance}' \ 2>/dev/null || echo "eoapi") fi @@ -101,7 +101,7 @@ detect_release_name() { # Auto-detect namespace from deployed eoAPI resources detect_namespace() { - kubectl get pods --all-namespaces -l app.kubernetes.io/name=stac \ + kubectl get pods --all-namespaces -l app.kubernetes.io/name=eoapi,app.kubernetes.io/component=stac \ -o jsonpath='{.items[0].metadata.namespace}' 2>/dev/null || echo "eoapi" } @@ -225,6 +225,7 @@ preflight_test() { validate_tools kubectl python3 || return 1 validate_cluster || return 1 ;; + *) log_error "Unknown test type: $test_type" return 1 diff --git a/scripts/test.sh b/scripts/test.sh index 874f7889..5be94eb2 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -45,33 +45,115 @@ ENVIRONMENT VARIABLES: EOF } -# Parse arguments -while [[ $# -gt 0 ]]; do - case $1 in - helm|integration|all|check-deps|check-deployment) - COMMAND="$1"; shift ;; - --debug) - DEBUG_MODE=true; shift ;; - --help|-h) - show_help; exit 0 ;; - *) - log_error "Unknown option: $1"; exit 1 ;; - esac -done +parse_args() { + while [[ $# -gt 0 ]]; do + case $1 in + helm|integration|all|check-deps|check-deployment) + COMMAND="$1"; shift ;; + --debug) + DEBUG_MODE=true; shift ;; + --help|-h) + show_help; exit 0 ;; + *) + log_error "Unknown option: $1" + show_help; exit 1 ;; + esac + done +} -# Default command -if [ -z "$COMMAND" ]; then - COMMAND="all" -fi +check_helm_dependencies() { + preflight_test "helm" || exit 1 + + if ! helm plugin list | grep -q unittest; then + log_info "Installing helm unittest plugin..." 
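+        # Note: an unpinned install tracks the plugin's latest release; pin one
+        # for reproducible CI if desired (version shown is only an example):
+        #   helm plugin install https://github.com/helm-unittest/helm-unittest --version 0.8.2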
+        helm plugin install https://github.com/helm-unittest/helm-unittest
+    fi
+}
+
+check_integration_dependencies() {
+    preflight_test "integration" || exit 1
+}
+
+install_test_deps() {
+    log_info "Installing Python test dependencies..."
+
+    local python_cmd="python"
+    if command_exists python3; then
+        python_cmd="python3"
+    fi
+
+    if ! $python_cmd -m pip install --quiet pytest httpx requests psycopg2-binary >/dev/null 2>&1; then
+        log_error "Failed to install test dependencies (pytest, httpx, requests, psycopg2-binary)"
+        log_error "Please install manually: pip install pytest httpx requests psycopg2-binary"
+        exit 1
+    fi
+
+    log_info "Test dependencies installed."
+}
+
+detect_deployment() {
+    if [ -z "${NAMESPACE:-}" ]; then
+        NAMESPACE=$(detect_namespace)
+    fi
+
+    if [ -z "${RELEASE_NAME:-}" ]; then
+        RELEASE_NAME=$(detect_release_name "$NAMESPACE")
+    fi
+
+    log_info "Using namespace: $NAMESPACE, release: $RELEASE_NAME"
+}
+
+check_eoapi_deployment() {
+    validate_eoapi_deployment "$NAMESPACE" "$RELEASE_NAME" || {
+        log_error "eoAPI deployment validation failed"
+        debug_deployment_state
+        exit 1
+    }
+}
+
+wait_for_services() {
+    log_info "Waiting for eoAPI services to be ready..."
+
+    local services=("stac" "raster" "vector")
+    for service in "${services[@]}"; do
+        if kubectl get pods -n "$NAMESPACE" -l "app.kubernetes.io/name=eoapi,app.kubernetes.io/component=$service" >/dev/null 2>&1; then
+            wait_for_pods "$NAMESPACE" "app.kubernetes.io/name=eoapi,app.kubernetes.io/component=$service" || return 1
+        else
+            log_warning "Service $service not found, skipping wait"
+        fi
+    done
 
-log_info "eoAPI Test Suite - Command: $COMMAND | Debug: $DEBUG_MODE | Release: $RELEASE_NAME"
+
+    log_info "✅ All eoAPI services are ready"
+}
+
+setup_test_environment() {
+    local ingress_host
+    ingress_host=$(kubectl get ingress -n "$NAMESPACE" -o jsonpath='{.items[0].spec.rules[0].host}' 2>/dev/null || echo "localhost")
+
+    export STAC_ENDPOINT="${STAC_ENDPOINT:-http://$ingress_host/stac}"
+    export RASTER_ENDPOINT="${RASTER_ENDPOINT:-http://$ingress_host/raster}"
+    export VECTOR_ENDPOINT="${VECTOR_ENDPOINT:-http://$ingress_host/vector}"
+
+    log_info "Test endpoints configured:"
+    log_info "  STAC: $STAC_ENDPOINT"
+    log_info "  Raster: $RASTER_ENDPOINT"
+    log_info "  Vector: $VECTOR_ENDPOINT"
+}
+
+show_debug_info() {
+    log_info "=== Debug Information ==="
+
+    log_info "=== Pods ==="
+    kubectl get pods -n "$NAMESPACE" -o wide 2>/dev/null || true
+
+    log_info "=== Services ==="
+    kubectl get svc -n "$NAMESPACE" 2>/dev/null || true
 
-# Check dependencies
-check_dependencies() {
-    log_info "Checking dependencies..."
-    command -v helm >/dev/null 2>&1 || { log_error "helm required"; exit 1; }
-    command -v kubectl >/dev/null 2>&1 || { log_error "kubectl required"; exit 1; }
-    log_info "✅ Dependencies OK"
+    log_info "=== Ingress ==="
+    kubectl get ingress -n "$NAMESPACE" 2>/dev/null || true
+
+    log_info "=== Recent Events ==="
+    kubectl get events -n "$NAMESPACE" --sort-by='.lastTimestamp' 2>/dev/null | tail -10 || true
 }
 
 # Run Helm tests
@@ -185,6 +267,20 @@ run_integration_tests() {
         fi
     fi
 
+    # Run observability tests as part of integration
+    log_info "Running observability and monitoring tests..." 
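+    # These tests drive kubectl through the conftest.py helpers, so they need
+    # cluster access plus the Python deps installed by install_test_deps.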
+ if [ -f ".github/workflows/tests/test_observability.py" ]; then + python3 -m pytest .github/workflows/tests/test_observability.py -v --tb=short || { + log_error "Observability tests failed - autoscaling won't work properly" + exit 1 + } + else + log_error "Observability tests not found - required for autoscaling validation" + exit 1 + fi + # Wait for Knative services to be ready if they exist if kubectl get ksvc -n "$NAMESPACE" >/dev/null 2>&1; then if kubectl get ksvc eoapi-cloudevents-sink -n "$NAMESPACE" >/dev/null 2>&1; then @@ -199,32 +293,87 @@ run_integration_tests() { log_info "✅ Integration tests completed" } -# Main execution -case "$COMMAND" in - helm) - check_dependencies - run_helm_tests - ;; - integration) - check_dependencies - run_integration_tests - ;; - all) - check_dependencies - run_helm_tests - run_integration_tests - ;; - check-deps) - check_dependencies - ;; - check-deployment) - debug_deployment_state - ;; - *) - log_error "Unknown command: $COMMAND" - show_help - exit 1 - ;; -esac +main() { + parse_args "$@" + + if [ -z "$COMMAND" ]; then + COMMAND="all" + fi + + if [ "$DEBUG_MODE" = true ]; then + log_info "eoAPI Test Suite (DEBUG) - Command: $COMMAND | Release: $RELEASE_NAME" + else + log_info "eoAPI Test Suite - Command: $COMMAND | Release: $RELEASE_NAME" + fi + + case $COMMAND in + helm) + check_helm_dependencies + run_helm_tests + ;; + check-deps) + log_info "Checking all dependencies..." + check_helm_dependencies + check_integration_dependencies + validate_cluster + install_test_deps + log_info "✅ All dependencies checked and ready" + ;; + check-deployment) + log_info "Checking deployment status..." + check_integration_dependencies + validate_cluster + detect_deployment + check_eoapi_deployment + log_info "✅ Deployment check complete" + ;; + integration) + check_integration_dependencies + validate_cluster + install_test_deps + detect_deployment + + if [ "$DEBUG_MODE" = true ]; then + show_debug_info + fi + + check_eoapi_deployment + wait_for_services + setup_test_environment + run_integration_tests + ;; + all) + log_info "Running comprehensive test suite (Helm + Integration tests)" + + log_info "=== Phase 1: Helm Tests ===" + check_helm_dependencies + run_helm_tests + + log_info "=== Phase 2: Integration Tests ===" + check_integration_dependencies + validate_cluster + install_test_deps + detect_deployment + + if [ "$DEBUG_MODE" = true ]; then + show_debug_info + fi + + check_eoapi_deployment + + wait_for_services + setup_test_environment + + run_integration_tests + ;; + *) + log_error "Unknown command: $COMMAND" + show_help + exit 1 + ;; + esac + + log_info "✅ Test suite complete" +} -log_info "✅ Test suite completed successfully" +main "$@"