
Commit 74f08e4

Refactor autoscaling and observability.

1 parent 501a83f · commit 74f08e4

19 files changed: +2001 −500 lines changed

.github/workflows/helm-tests.yml

Lines changed: 94 additions & 5 deletions
```diff
@@ -22,11 +22,17 @@ jobs:
         with:
           version: ${{ env.HELM_VERSION }}
 
-      - name: Run Helm unit tests
-        run: make tests
+      - run: |
+          cd charts
+          helm unittest eoapi -f 'tests/*.yaml' -v eoapi/test-helm-values.yaml
+          # Run autoscaling-specific unit tests
+          helm unittest eoapi -f 'tests/autoscaling_tests.yaml' -v eoapi/test-autoscaling-values.yaml
+          # Run observability chart tests if they exist
+          if [ -d "eoapi-observability/tests" ]; then
+            helm unittest eoapi-observability -f 'tests/*.yaml'
+          fi
 
-  integration:
-    name: Integration Tests (K3s)
+  k3s-integration-tests:
     if: github.event.pull_request.head.repo.full_name == github.repository
     permissions:
       contents: 'read'
```
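The rewritten step above calls helm-unittest directly rather than `make tests`. To reproduce it locally, a minimal sketch, assuming the helm-unittest plugin is installed and the commands run from the repository root:

```bash
# Install the helm-unittest plugin once, if it is not already present
helm plugin install https://github.com/helm-unittest/helm-unittest

# Mirror the CI step
cd charts
helm unittest eoapi -f 'tests/*.yaml' -v eoapi/test-helm-values.yaml
helm unittest eoapi -f 'tests/autoscaling_tests.yaml' -v eoapi/test-autoscaling-values.yaml
```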
```diff
@@ -129,9 +135,92 @@
           kubectl get jobs -o wide
           kubectl get services -o wide
           kubectl get ingress
+          echo ""
+
+          echo "Waiting for raster service to be ready..."
+          kubectl wait --for=condition=Ready pod -l app=${RELEASE_NAME}-raster --timeout=180s || {
+            echo "Raster service failed to become ready. Checking status..."
+            kubectl get pods -l app=${RELEASE_NAME}-raster -o wide
+            kubectl describe pods -l app=${RELEASE_NAME}-raster
+            exit 1
+          }
+          echo "raster service is ready, moving on..."
+
+          echo "Waiting for vector service to be ready..."
+          kubectl wait --for=condition=Ready pod -l app=${RELEASE_NAME}-vector --timeout=180s || {
+            echo "Vector service failed to become ready. Checking status..."
+            kubectl get pods -l app=${RELEASE_NAME}-vector -o wide
+            kubectl describe pods -l app=${RELEASE_NAME}-vector
+            exit 1
+          }
+          echo "vector service is ready, moving on..."
+
+          echo "Waiting for stac service to be ready..."
+          kubectl wait --for=condition=Ready pod -l app=${RELEASE_NAME}-stac --timeout=180s || {
+            echo "STAC service failed to become ready. Checking status..."
+            kubectl get pods -l app=${RELEASE_NAME}-stac -o wide
+            kubectl describe pods -l app=${RELEASE_NAME}-stac
+            exit 1
+          }
+          echo "all services are ready, moving on..."
+
+      - name: cleanup if services fail to boot
+        if: steps.watchservices.outcome == 'failure'
+        run: |
+          echo "The watchservices step failed or timed out. Extracting comprehensive debugging info..."
+
+          # Get and display all pods status with more detail
+          echo "===== Pod Status (detailed) ====="
+          kubectl get pods -o wide
+          echo ""
+
+          echo "===== Pod Readiness Summary ====="
+          kubectl get pods --no-headers | awk '{print $2, $3}' | sort | uniq -c
+          echo ""
+
+          # Check init container logs for all services
+          for SERVICE in raster vector stac multidim; do
+            echo "===== $SERVICE Service Pod Status ====="
+            kubectl get pods -l app=$RELEASE_NAME-$SERVICE -o wide || echo "No $SERVICE pods found"
+
+            POD_NAME=$(kubectl get pod -l app=$RELEASE_NAME-$SERVICE -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+            if [ -n "$POD_NAME" ]; then
+              echo "===== $SERVICE Pod ($POD_NAME) Init Container Logs ====="
+              kubectl logs pod/$POD_NAME -c wait-for-pgstac-jobs --tail=100 || echo "Could not get $SERVICE init container logs"
+              echo ""
+
+              echo "===== $SERVICE Pod ($POD_NAME) Main Container Logs ====="
+              kubectl logs pod/$POD_NAME --tail=100 || echo "Could not get $SERVICE main container logs"
+              echo ""
+
+              echo "===== $SERVICE Pod ($POD_NAME) Description ====="
+              kubectl describe pod/$POD_NAME
+              echo ""
+            fi
+          done
+
+          # Show job status that init containers might be waiting for
+          echo "===== Job Status (what init containers are waiting for) ====="
+          kubectl get jobs -o wide
+          echo ""
+
+          # Check pgstac jobs using labels instead of hardcoded names
+          for APP_LABEL in pgstac-migrate pgstac-load-samples; do
+            echo "===== Jobs with app=$RELEASE_NAME-$APP_LABEL Status ====="
+            JOBS=$(kubectl get jobs -l app=$RELEASE_NAME-$APP_LABEL -o name 2>/dev/null || true)
+            if [ -n "$JOBS" ]; then
+              for JOB in $JOBS; do
+                echo "--- Job $JOB ---"
+                kubectl get "$JOB" -o yaml 2>/dev/null | grep -A 10 -E "conditions|status:" || echo "Could not get status for $JOB"
+              done
+            else
+              echo "No jobs found with app=$RELEASE_NAME-$APP_LABEL label"
+            fi
+            echo ""
+          done
 
 
       - name: Cleanup
         if: always()
         run: |
-        helm uninstall "$RELEASE_NAME" || true
+          helm uninstall "$RELEASE_NAME" || true
```
charts/eoapi-observability/Chart.yaml

Lines changed: 16 additions & 0 deletions

```yaml
apiVersion: v2
name: eoapi-observability
description: Observability tools for eoAPI monitoring

appVersion: "0.2.0"
version: "0.2.0"

dependencies:
  # Grafana for observability and dashboarding of metrics
  # NOTE: Connects to Prometheus instance deployed by main eoapi chart
  # https://github.com/grafana/helm-charts/tree/main/charts/grafana
  #
  - name: grafana
    version: 7.3.3
    repository: https://grafana.github.io/helm-charts
    condition: grafana.enabled
```
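Because grafana is a conditional dependency, it must be vendored before installing from a local checkout; a short sketch, assuming the chart lives at `charts/eoapi-observability` (path inferred from the renames below):

```bash
# Pull the grafana 7.3.3 dependency declared above
helm repo add grafana https://grafana.github.io/helm-charts
helm dependency update charts/eoapi-observability

# Sanity-check that the chart renders with the dependency in place
helm template eoapi-obs charts/eoapi-observability >/dev/null && echo "renders OK"
```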
charts/eoapi-observability/README.md

Lines changed: 78 additions & 0 deletions

````markdown
# eoAPI Observability

Observability and dashboarding tools for eoAPI monitoring.

This chart provides Grafana dashboards and observability tools for monitoring eoAPI deployments. It connects to the Prometheus instance deployed by the main `eoapi` chart.

## Documentation

Refer to the docs for full documentation about setup and configuration:

- [Observability tooling](../../docs/observability.md)
- [Autoscaling](../../docs/autoscaling.md)

## Prerequisites

The main `eoapi` chart must be deployed with monitoring enabled:

```yaml
monitoring:
  prometheus:
    enabled: true
```

## Installation

```bash
# Install main eoapi chart first (if not already installed)
helm install eoapi eoapi/eoapi \
  --set monitoring.prometheus.enabled=true \
  --namespace eoapi --create-namespace

# Then install observability tools
helm install eoapi-obs eoapi/eoapi-observability --namespace eoapi
```

## Configuration

### Key Values

| Parameter | Description | Default |
|-----------|-------------|---------|
| `grafana.enabled` | Enable Grafana deployment | `true` |
| `prometheusUrl` | Prometheus server URL | Auto-detected |
| `grafana.service.type` | Grafana service type | `LoadBalancer` |
| `grafana.persistence.enabled` | Enable data persistence | `false` |

### Enable Additional Features

```yaml
prometheus:
  enabled: true
  alertmanager:
    enabled: true
  prometheus-pushgateway:
    enabled: true
```

## Dashboards

Pre-built dashboards include:
- eoAPI service metrics (request rates, response times, errors)
- Container resources (CPU, memory, throttling)
- Infrastructure monitoring (nodes, pods)
- PostgreSQL metrics (when enabled)

## Access Grafana

```bash
# Get service endpoint
kubectl get svc eoapi-obs-grafana -n eoapi

# Get admin password
kubectl get secret eoapi-obs-grafana -n eoapi \
  -o jsonpath="{.data.admin-password}" | base64 -d
```

Default credentials: `admin` / `admin` (change on first login)
````
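Where `LoadBalancer` services are unavailable (local clusters such as k3s or kind), port-forwarding is a workable alternative; a sketch assuming the `eoapi-obs` release name from the install example above and Grafana's default service port of 80:

```bash
# Forward local port 3000 to the Grafana service, then browse http://localhost:3000
kubectl port-forward svc/eoapi-obs-grafana 3000:80 -n eoapi
```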

charts/eoapi-support/dashboards/eoAPI-Dashboard.json renamed to charts/eoapi-observability/dashboards/eoAPI-Dashboard.json

File renamed without changes.

charts/eoapi-support/templates/dashboard.config.yaml renamed to charts/eoapi-observability/templates/dashboard.config.yaml

File renamed without changes.
Lines changed: 44 additions & 0 deletions

```yaml
suite: eoapi-observability chart tests
templates:
  - templates/dashboard.config.yaml
tests:
  - it: "dashboard config created with default values"
    asserts:
      - isKind:
          of: ConfigMap
      - equal:
          path: metadata.name
          value: "RELEASE-NAME-dashboards"
      - equal:
          path: metadata.labels.eoapi_dashboard
          value: "1"

  - it: "dashboard config includes eoapi dashboard json"
    asserts:
      - isKind:
          of: ConfigMap
      - isNotEmpty:
          path: data["kubernetes.json"]

  - it: "observability chart works with different release names"
    release:
      name: "my-eoapi-obs"
    asserts:
      - equal:
          path: metadata.name
          value: "my-eoapi-obs-dashboards"
      - equal:
          path: metadata.labels.eoapi_dashboard
          value: "1"

  - it: "dashboard configmap structure is correct"
    asserts:
      - isKind:
          of: ConfigMap
      - hasDocuments:
          count: 1
      - exists:
          path: data["kubernetes.json"]
      - equal:
          path: metadata.labels.eoapi_dashboard
          value: "1"
```
charts/eoapi-observability/values.yaml

Lines changed: 99 additions & 0 deletions

```yaml
######################
# EOAPI OBSERVABILITY
######################
# This chart provides observability and dashboarding tools for eoAPI monitoring.
# It expects a Prometheus instance to already be available (deployed by main eoapi chart or externally).

grafana:
  enabled: true
  persistence:
    enabled: false
  deploymentStrategy:
    type: Recreate
  service:
    type: LoadBalancer
    annotations:
      service.beta.kubernetes.io/aws-load-balancer-type: "nlb"
      service.beta.kubernetes.io/aws-load-balancer-internal: "false"
  rbac:
    namespaced: true
    pspEnabled: false
  # initChownData refers to an init container enabled by default that isn't
  # needed as we don't reconfigure the linux user the grafana server will run as.
  initChownData:
    enabled: false

  # Resources for grafana based on observed usage patterns
  # Memory use increases over time but stays reasonable below 200Mi
  # CPU use is minimal with peaks at up to 9m during dashboard browsing
  resources:
    limits:
      cpu: 100m
      memory: 200Mi
    requests:
      cpu: 10m
      memory: 200Mi

  # Prometheus datasource configuration
  # Configure this to point to your Prometheus instance
  datasources:
    datasources.yaml:
      apiVersion: 1
      datasources:
        - name: prometheus

          orgId: 1
          type: prometheus
          # Default: assumes Prometheus deployed by main eoapi chart in same namespace
          # Override prometheusUrl to point to external Prometheus if needed
          url: "{{ .Values.prometheusUrl | default (printf \"http://%s-prometheus-server.%s.svc.cluster.local\" .Release.Name .Release.Namespace) }}"
          access: proxy
          jsonData:
            timeInterval: "5s"
          isDefault: true
          editable: true
          version: 1

  # Dashboard providers configuration
  dashboardProviders:
    dashboardproviders.yaml:
      apiVersion: 1
      providers:
        - name: 'default'
          orgId: 1
          folder: ''
          type: file
          disableDeletion: false
          editable: true
          options:
            path: /var/lib/grafana/dashboards/default

  # Dashboard ConfigMaps
  dashboardsConfigMaps:
    # References the ConfigMap created by templates/dashboard.config.yaml
    default: "{{ .Release.Name }}-dashboards"

# Prometheus connection configuration
# Override this if connecting to external Prometheus instance
prometheusUrl: ""

# Advanced Prometheus features (optional)
# These can be enabled if you want additional Prometheus functionality
# beyond what's provided by the main eoapi chart
prometheus:
  enabled: false
  # If enabled, provides alertmanager functionality
  alertmanager:
    enabled: false
  # If enabled, provides pushgateway functionality
  prometheus-pushgateway:
    enabled: false
  # Prometheus server - only enable if you want a separate instance
  # for advanced monitoring beyond the core metrics in main chart
  server:
    enabled: false
    service:
      annotations:
        service.beta.kubernetes.io/aws-load-balancer-type: "nlb"
        service.beta.kubernetes.io/aws-load-balancer-internal: "false"
      type: LoadBalancer
```
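Since the datasource URL defaults to the in-namespace Prometheus server, pointing the chart at an external instance needs only one override; a sketch with a hypothetical endpoint:

```bash
# The URL below is a placeholder; substitute your own Prometheus endpoint
helm install eoapi-obs eoapi/eoapi-observability \
  --namespace eoapi \
  --set prometheusUrl="http://prometheus.monitoring.svc.cluster.local"
```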
