
Commit 74f08e4

Refactor autoscaling and observability.

1 parent 501a83f · commit 74f08e4

19 files changed: +2001 −500 lines changed

.github/workflows/helm-tests.yml

Lines changed: 94 additions & 5 deletions
```diff
@@ -22,11 +22,17 @@ jobs:
         with:
           version: ${{ env.HELM_VERSION }}
 
-      - name: Run Helm unit tests
-        run: make tests
+      - run: |
+          cd charts
+          helm unittest eoapi -f 'tests/*.yaml' -v eoapi/test-helm-values.yaml
+          # Run autoscaling-specific unit tests
+          helm unittest eoapi -f 'tests/autoscaling_tests.yaml' -v eoapi/test-autoscaling-values.yaml
+          # Run observability chart tests if they exist
+          if [ -d "eoapi-observability/tests" ]; then
+            helm unittest eoapi-observability -f 'tests/*.yaml'
+          fi
 
-  integration:
-    name: Integration Tests (K3s)
+  k3s-integration-tests:
     if: github.event.pull_request.head.repo.full_name == github.repository
     permissions:
       contents: 'read'
```
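The rewritten step above calls helm-unittest directly rather than `make tests`. To reproduce it locally, a minimal sketch, assuming the helm-unittest plugin is installed and the commands run from the repository root:

```bash
# Install the helm-unittest plugin once, if it is not already present
helm plugin install https://github.com/helm-unittest/helm-unittest

# Mirror the CI step
cd charts
helm unittest eoapi -f 'tests/*.yaml' -v eoapi/test-helm-values.yaml
helm unittest eoapi -f 'tests/autoscaling_tests.yaml' -v eoapi/test-autoscaling-values.yaml
```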
```diff
@@ -129,9 +135,92 @@
           kubectl get jobs -o wide
           kubectl get services -o wide
           kubectl get ingress
+          echo ""
+
+          echo "Waiting for raster service to be ready..."
+          kubectl wait --for=condition=Ready pod -l app=${RELEASE_NAME}-raster --timeout=180s || {
+            echo "Raster service failed to become ready. Checking status..."
+            kubectl get pods -l app=${RELEASE_NAME}-raster -o wide
+            kubectl describe pods -l app=${RELEASE_NAME}-raster
+            exit 1
+          }
+          echo "raster service is ready, moving on..."
+
+          echo "Waiting for vector service to be ready..."
+          kubectl wait --for=condition=Ready pod -l app=${RELEASE_NAME}-vector --timeout=180s || {
+            echo "Vector service failed to become ready. Checking status..."
+            kubectl get pods -l app=${RELEASE_NAME}-vector -o wide
+            kubectl describe pods -l app=${RELEASE_NAME}-vector
+            exit 1
+          }
+          echo "vector service is ready, moving on..."
+
+          echo "Waiting for stac service to be ready..."
+          kubectl wait --for=condition=Ready pod -l app=${RELEASE_NAME}-stac --timeout=180s || {
+            echo "STAC service failed to become ready. Checking status..."
+            kubectl get pods -l app=${RELEASE_NAME}-stac -o wide
+            kubectl describe pods -l app=${RELEASE_NAME}-stac
+            exit 1
+          }
+          echo "all services are ready, moving on..."
+
+      - name: cleanup if services fail to boot
+        if: steps.watchservices.outcome == 'failure'
+        run: |
+          echo "The watchservices step failed or timed out. Extracting comprehensive debugging info..."
+
+          # Get and display all pods status with more detail
+          echo "===== Pod Status (detailed) ====="
+          kubectl get pods -o wide
+          echo ""
+
+          echo "===== Pod Readiness Summary ====="
+          kubectl get pods --no-headers | awk '{print $2, $3}' | sort | uniq -c
+          echo ""
+
+          # Check init container logs for all services
+          for SERVICE in raster vector stac multidim; do
+            echo "===== $SERVICE Service Pod Status ====="
+            kubectl get pods -l app=$RELEASE_NAME-$SERVICE -o wide || echo "No $SERVICE pods found"
+
+            POD_NAME=$(kubectl get pod -l app=$RELEASE_NAME-$SERVICE -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+            if [ -n "$POD_NAME" ]; then
+              echo "===== $SERVICE Pod ($POD_NAME) Init Container Logs ====="
+              kubectl logs pod/$POD_NAME -c wait-for-pgstac-jobs --tail=100 || echo "Could not get $SERVICE init container logs"
+              echo ""
+
+              echo "===== $SERVICE Pod ($POD_NAME) Main Container Logs ====="
+              kubectl logs pod/$POD_NAME --tail=100 || echo "Could not get $SERVICE main container logs"
+              echo ""
+
+              echo "===== $SERVICE Pod ($POD_NAME) Description ====="
+              kubectl describe pod/$POD_NAME
+              echo ""
+            fi
+          done
+
+          # Show job status that init containers might be waiting for
+          echo "===== Job Status (what init containers are waiting for) ====="
+          kubectl get jobs -o wide
+          echo ""
+
+          # Check pgstac jobs using labels instead of hardcoded names
+          for APP_LABEL in pgstac-migrate pgstac-load-samples; do
+            echo "===== Jobs with app=$RELEASE_NAME-$APP_LABEL Status ====="
+            JOBS=$(kubectl get jobs -l app=$RELEASE_NAME-$APP_LABEL -o name 2>/dev/null || true)
+            if [ -n "$JOBS" ]; then
+              for JOB in $JOBS; do
+                echo "--- Job $JOB ---"
+                kubectl get "$JOB" -o yaml 2>/dev/null | grep -A 10 -E "conditions|status:" || echo "Could not get status for $JOB"
+              done
+            else
+              echo "No jobs found with app=$RELEASE_NAME-$APP_LABEL label"
+            fi
+            echo ""
+          done
 
 
       - name: Cleanup
         if: always()
         run: |
-        helm uninstall "$RELEASE_NAME" || true
+          helm uninstall "$RELEASE_NAME" || true
```
charts/eoapi-observability/Chart.yaml

Lines changed: 16 additions & 0 deletions

```yaml
apiVersion: v2
name: eoapi-observability
description: Observability tools for eoAPI monitoring

appVersion: "0.2.0"
version: "0.2.0"

dependencies:
  # Grafana for observability and dashboarding of metrics
  # NOTE: Connects to Prometheus instance deployed by main eoapi chart
  # https://github.com/grafana/helm-charts/tree/main/charts/grafana
  #
  - name: grafana
    version: 7.3.3
    repository: https://grafana.github.io/helm-charts
    condition: grafana.enabled
```
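Because grafana is a conditional dependency, it must be vendored before installing from a local checkout; a short sketch, assuming the chart lives at `charts/eoapi-observability` (path inferred from the renames below):

```bash
# Pull the grafana 7.3.3 dependency declared above
helm repo add grafana https://grafana.github.io/helm-charts
helm dependency update charts/eoapi-observability

# Sanity-check that the chart renders with the dependency in place
helm template eoapi-obs charts/eoapi-observability >/dev/null && echo "renders OK"
```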
charts/eoapi-observability/README.md

Lines changed: 78 additions & 0 deletions

````markdown
# eoAPI Observability

Observability and dashboarding tools for eoAPI monitoring.

This chart provides Grafana dashboards and observability tools for monitoring eoAPI deployments. It connects to the Prometheus instance deployed by the main `eoapi` chart.

## Documentation

Refer to the docs for full documentation about setup and configuration:

- [Observability tooling](../../docs/observability.md)
- [Autoscaling](../../docs/autoscaling.md)

## Prerequisites

The main `eoapi` chart must be deployed with monitoring enabled:

```yaml
monitoring:
  prometheus:
    enabled: true
```

## Installation

```bash
# Install main eoapi chart first (if not already installed)
helm install eoapi eoapi/eoapi \
  --set monitoring.prometheus.enabled=true \
  --namespace eoapi --create-namespace

# Then install observability tools
helm install eoapi-obs eoapi/eoapi-observability --namespace eoapi
```

## Configuration

### Key Values

| Parameter | Description | Default |
|-----------|-------------|---------|
| `grafana.enabled` | Enable Grafana deployment | `true` |
| `prometheusUrl` | Prometheus server URL | Auto-detected |
| `grafana.service.type` | Grafana service type | `LoadBalancer` |
| `grafana.persistence.enabled` | Enable data persistence | `false` |

### Enable Additional Features

```yaml
prometheus:
  enabled: true
  alertmanager:
    enabled: true
  prometheus-pushgateway:
    enabled: true
```

## Dashboards

Pre-built dashboards include:
- eoAPI service metrics (request rates, response times, errors)
- Container resources (CPU, memory, throttling)
- Infrastructure monitoring (nodes, pods)
- PostgreSQL metrics (when enabled)

## Access Grafana

```bash
# Get service endpoint
kubectl get svc eoapi-obs-grafana -n eoapi

# Get admin password
kubectl get secret eoapi-obs-grafana -n eoapi \
  -o jsonpath="{.data.admin-password}" | base64 -d
```

Default credentials: `admin` / `admin` (change on first login)
````
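Where `LoadBalancer` services are unavailable (local clusters such as k3s or kind), port-forwarding is a workable alternative; a sketch assuming the `eoapi-obs` release name from the install example above and Grafana's default service port of 80:

```bash
# Forward local port 3000 to the Grafana service, then browse http://localhost:3000
kubectl port-forward svc/eoapi-obs-grafana 3000:80 -n eoapi
```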

charts/eoapi-support/dashboards/eoAPI-Dashboard.json renamed to charts/eoapi-observability/dashboards/eoAPI-Dashboard.json

File renamed without changes.

charts/eoapi-support/templates/dashboard.config.yaml renamed to charts/eoapi-observability/templates/dashboard.config.yaml

File renamed without changes.
Lines changed: 44 additions & 0 deletions

```yaml
suite: eoapi-observability chart tests
templates:
  - templates/dashboard.config.yaml
tests:
  - it: "dashboard config created with default values"
    asserts:
      - isKind:
          of: ConfigMap
      - equal:
          path: metadata.name
          value: "RELEASE-NAME-dashboards"
      - equal:
          path: metadata.labels.eoapi_dashboard
          value: "1"

  - it: "dashboard config includes eoapi dashboard json"
    asserts:
      - isKind:
          of: ConfigMap
      - isNotEmpty:
          path: data["kubernetes.json"]

  - it: "observability chart works with different release names"
    release:
      name: "my-eoapi-obs"
    asserts:
      - equal:
          path: metadata.name
          value: "my-eoapi-obs-dashboards"
      - equal:
          path: metadata.labels.eoapi_dashboard
          value: "1"

  - it: "dashboard configmap structure is correct"
    asserts:
      - isKind:
          of: ConfigMap
      - hasDocuments:
          count: 1
      - exists:
          path: data["kubernetes.json"]
      - equal:
          path: metadata.labels.eoapi_dashboard
          value: "1"
```
charts/eoapi-observability/values.yaml

Lines changed: 99 additions & 0 deletions

```yaml
######################
# EOAPI OBSERVABILITY
######################
# This chart provides observability and dashboarding tools for eoAPI monitoring.
# It expects a Prometheus instance to already be available (deployed by main eoapi chart or externally).

grafana:
  enabled: true
  persistence:
    enabled: false
  deploymentStrategy:
    type: Recreate
  service:
    type: LoadBalancer
    annotations:
      service.beta.kubernetes.io/aws-load-balancer-type: "nlb"
      service.beta.kubernetes.io/aws-load-balancer-internal: "false"
  rbac:
    namespaced: true
    pspEnabled: false
  # initChownData refers to an init container enabled by default that isn't
  # needed as we don't reconfigure the linux user the grafana server will run as.
  initChownData:
    enabled: false

  # Resources for grafana based on observed usage patterns
  # Memory use increases over time but stays reasonable below 200Mi
  # CPU use is minimal with peaks at up to 9m during dashboard browsing
  resources:
    limits:
      cpu: 100m
      memory: 200Mi
    requests:
      cpu: 10m
      memory: 200Mi

  # Prometheus datasource configuration
  # Configure this to point to your Prometheus instance
  datasources:
    datasources.yaml:
      apiVersion: 1
      datasources:
        - name: prometheus

          orgId: 1
          type: prometheus
          # Default: assumes Prometheus deployed by main eoapi chart in same namespace
          # Override prometheusUrl to point to external Prometheus if needed
          url: "{{ .Values.prometheusUrl | default (printf \"http://%s-prometheus-server.%s.svc.cluster.local\" .Release.Name .Release.Namespace) }}"
          access: proxy
          jsonData:
            timeInterval: "5s"
          isDefault: true
          editable: true
          version: 1

  # Dashboard providers configuration
  dashboardProviders:
    dashboardproviders.yaml:
      apiVersion: 1
      providers:
        - name: 'default'
          orgId: 1
          folder: ''
          type: file
          disableDeletion: false
          editable: true
          options:
            path: /var/lib/grafana/dashboards/default

  # Dashboard ConfigMaps
  dashboardsConfigMaps:
    # References the ConfigMap created by templates/dashboard.config.yaml
    default: "{{ .Release.Name }}-dashboards"

# Prometheus connection configuration
# Override this if connecting to external Prometheus instance
prometheusUrl: ""

# Advanced Prometheus features (optional)
# These can be enabled if you want additional Prometheus functionality
# beyond what's provided by the main eoapi chart
prometheus:
  enabled: false
  # If enabled, provides alertmanager functionality
  alertmanager:
    enabled: false
  # If enabled, provides pushgateway functionality
  prometheus-pushgateway:
    enabled: false
  # Prometheus server - only enable if you want a separate instance
  # for advanced monitoring beyond the core metrics in main chart
  server:
    enabled: false
    service:
      annotations:
        service.beta.kubernetes.io/aws-load-balancer-type: "nlb"
        service.beta.kubernetes.io/aws-load-balancer-internal: "false"
      type: LoadBalancer
```
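Since the datasource URL defaults to the in-namespace Prometheus server, pointing the chart at an external instance needs only one override; a sketch with a hypothetical endpoint:

```bash
# The URL below is a placeholder; substitute your own Prometheus endpoint
helm install eoapi-obs eoapi/eoapi-observability \
  --namespace eoapi \
  --set prometheusUrl="http://prometheus.monitoring.svc.cluster.local"
```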
