Skip to content

Commit cbd64e4

Browse files
committed
feat: integrate observability into main eoapi chart
1 parent 8b575b7 commit cbd64e4

28 files changed

+4263
-667
lines changed

.github/workflows/ci.yml

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,3 +158,151 @@ jobs:
158158
head -1 "$file" | grep -q "^---$" || { echo "❌ Missing frontmatter: $file"; missing=1; }
159159
done
160160
exit $missing
161+
162+
# Job: deploy eoAPI with OBSERVABILITY_MODE enabled onto a throwaway K3s
# cluster, wait for the monitoring components, then run the observability
# and autoscaling pytest suites against the "eoapi" namespace.
#
# Runs after `integration-tests`, and only when the pull request's head
# repository is this repository (the `if:` below).
observability-tests:
  name: Observability tests
  if: github.event.pull_request.head.repo.full_name == github.repository
  permissions:
    contents: 'read'
    # NOTE(review): no step visible in this job requests an OIDC token;
    # confirm `id-token: write` is actually required before keeping it.
    id-token: 'write'
  needs: integration-tests
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v5

    - name: Start K3s cluster
      uses: jupyterhub/action-k3s-helm@v4
      with:
        k3s-channel: latest
        helm-version: ${{ env.HELM_VERSION }}
        # The action's own metrics option is switched off; a metrics-server
        # pod is awaited in the eoapi namespace further down instead.
        metrics-enabled: false
        docker-enabled: true

    # Persist a per-commit release name (eoapi-<first 8 chars of the SHA>)
    # for all following steps via GITHUB_ENV.
    - name: Set release name
      run: echo "RELEASE_NAME=eoapi-$(echo "${{ github.sha }}" | cut -c1-8)" >> "$GITHUB_ENV"

    - name: Wait for K3s to be fully ready
      run: |
        echo "=== Waiting for K3s to be fully ready ==="
        kubectl wait --for=condition=Ready pod -l k8s-app=kube-dns -n kube-system --timeout=300s
        kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=traefik -n kube-system --timeout=300s
        kubectl get nodes
        kubectl get pods --all-namespaces
        sleep 10
        echo "✅ K3s is ready"

    - name: Deploy eoAPI with monitoring
      run: |
        echo "=== Deploying eoAPI with monitoring stack ==="
        export RELEASE_NAME="$RELEASE_NAME"
        export PGO_VERSION="${{ env.PGO_VERSION }}"
        export GITHUB_SHA="${{ github.sha }}"
        export CI_MODE=true
        export OBSERVABILITY_MODE=true

        # Deploy using consolidated script with observability mode enabled
        ./scripts/deploy.sh --ci

    # Each wait below is best-effort: a component that is not Ready within
    # the timeout is reported with an echo and the step keeps going.
    - name: Wait for monitoring stack to be ready
      run: |
        echo "=== Waiting for monitoring components ==="

        # Wait for metrics-server first (required for HPA)
        kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=metrics-server -n eoapi --timeout=300s || echo "metrics-server not ready"

        # Wait for Prometheus server
        kubectl wait --for=condition=Ready pod -l app.kubernetes.io/component=server,app.kubernetes.io/name=prometheus -n eoapi --timeout=300s || echo "Prometheus server not ready"

        # Wait for Grafana
        kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=grafana -n eoapi --timeout=300s || echo "Grafana not ready"

        # Wait for prometheus-adapter
        kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=prometheus-adapter -n eoapi --timeout=300s || echo "prometheus-adapter not ready"

        # Give time for HPA to be created and configured
        echo "=== Waiting for HPA creation ==="
        sleep 60

        echo "=== Final monitoring stack status ==="
        kubectl get pods -n eoapi | grep -E "(prometheus|grafana|metrics-server)" || true
        kubectl get hpa -n eoapi || echo "No HPA resources found yet"

    # Unlike the monitoring waits above, these waits are strict: a core
    # service that does not become Ready fails the step.
    - name: Validate core eoAPI services
      run: |
        echo "=== Validating core eoAPI services ==="

        # Wait for core application pods to be ready
        kubectl wait --for=condition=Ready pod -l app="$RELEASE_NAME"-stac -n eoapi --timeout=300s
        kubectl wait --for=condition=Ready pod -l app="$RELEASE_NAME"-raster -n eoapi --timeout=300s
        kubectl wait --for=condition=Ready pod -l app="$RELEASE_NAME"-vector -n eoapi --timeout=300s

        echo "✅ Core eoAPI services are ready"

    - name: Run observability tests
      run: |
        echo "=== Running observability test suite ==="
        export RELEASE_NAME="$RELEASE_NAME"
        export NAMESPACE="eoapi"

        # Install python dependencies for testing
        python -m pip install --upgrade pip
        pip install pytest requests

        # Run observability tests
        python -m pytest .github/workflows/tests/test_observability.py -v --tb=short

        # Run autoscaling tests
        python -m pytest .github/workflows/tests/test_autoscaling.py -v --tb=short -m "not slow"

    # Diagnostics only: every command is guarded with `|| true` or an echo
    # so that one failing query does not hide the remaining output.
    - name: Debug observability stack on failure
      if: failure()
      run: |
        echo "=== Observability Debug Information ==="
        export RELEASE_NAME="$RELEASE_NAME"
        export NAMESPACE="eoapi"

        echo "=== All Pods in namespace ==="
        kubectl get pods -n eoapi -o wide || true

        echo "=== Monitoring Pods Status ==="
        kubectl get pods -n eoapi | grep -E "(prometheus|grafana|metrics-server|adapter)" || true

        echo "=== Core eoAPI Pods Status ==="
        kubectl get pods -n eoapi | grep -E "(stac|raster|vector|postgres)" || true

        echo "=== HPA Status ==="
        kubectl get hpa -n eoapi -o wide || true
        kubectl describe hpa -n eoapi || true

        echo "=== Custom Metrics API ==="
        kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" || echo "Custom metrics API not available"

        echo "=== Metrics Server API ==="
        kubectl get --raw "/apis/metrics.k8s.io/v1beta1/nodes" || echo "Metrics server API not available"

        echo "=== Pod Metrics ==="
        kubectl top pods -n eoapi || echo "Pod metrics not available"

        echo "=== Services ==="
        kubectl get svc -n eoapi || true

        echo "=== Recent Events ==="
        kubectl get events -n eoapi --sort-by='.lastTimestamp' | tail -30 || true

        echo "=== Failed Pod Logs ==="
        for pod in $(kubectl get pods -n eoapi --field-selector=status.phase!=Running --no-headers -o custom-columns=":metadata.name" 2>/dev/null || echo ""); do
          if [ -n "$pod" ]; then
            echo "=== Logs for failed pod: $pod ==="
            kubectl logs "$pod" -n eoapi --tail=50 || true
          fi
        done

        echo "=== Component Logs ==="
        kubectl logs -l app.kubernetes.io/name=prometheus-adapter -n eoapi --tail=50 || echo "No prometheus-adapter logs"
        kubectl logs -l app.kubernetes.io/name=grafana -n eoapi --tail=30 || echo "No grafana logs"
        kubectl logs -l app.kubernetes.io/name=metrics-server -n eoapi --tail=30 || echo "No metrics-server logs"

    - name: Cleanup observability test
      if: always()
      run: |
        # Target the namespace every other step in this job operates on.
        # Without -n, helm looks in the kubeconfig's current namespace, and
        # `|| true` would hide the resulting "release not found" error.
        # Presumably scripts/deploy.sh installs the release into "eoapi";
        # verify against that script.
        helm uninstall "$RELEASE_NAME" -n eoapi || true

0 commit comments

Comments
 (0)