# refactor: auto-scaling and observability components #284
# Workflow file for this run
# (Viewer notice) This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# (Viewer notice) Learn more about bidirectional Unicode characters
# Workflow identity, triggers, and tool versions shared by every job.
name: CI tests

# Run on pushes to main and on pull requests that target main.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

# Version pins are quoted so they are always read as strings.
env:
  HELM_VERSION: 'v3.15.2'
  PGO_VERSION: '5.7.4'
jobs:
  # Cluster-free checks: linters, Helm values schema validation, Helm unit tests.
  fast-checks:
    name: Simple tests
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5

      - uses: actions/setup-node@v6
        with:
          node-version: '24'

      - name: Install Helm
        uses: azure/setup-helm@v4
        with:
          version: ${{ env.HELM_VERSION }}

      - name: Setup Helm dependencies
        run: ./scripts/deploy.sh setup

      - name: Install ajv-cli
        run: npm install -g ajv-cli ajv-formats

      - name: Run linters
        run: make lint

      - name: Validate Helm values schema
        run: make validate-schema

      - name: Run Helm unit tests
        run: make tests

  # Deploys the chart into a throwaway K3s cluster and runs the integration suite.
  integration-tests:
    name: Integration tests
    needs: fast-checks
    # Only run for pull requests whose head branch lives in this repository.
    # NOTE(review): on `push` events `github.event.pull_request` is not
    # populated, so this job is presumably skipped there too - confirm intended.
    if: github.event.pull_request.head.repo.full_name == github.repository
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5

      - name: Start K3s cluster
        uses: jupyterhub/action-k3s-helm@v4
        with:
          k3s-channel: latest
          helm-version: ${{ env.HELM_VERSION }}
          metrics-enabled: false
          docker-enabled: true

      - name: Set release name
        # Release name is "eoapi-" plus the first 8 characters of the commit SHA;
        # writing it to GITHUB_ENV makes it visible to all later steps.
        run: |
          echo "RELEASE_NAME=eoapi-${GITHUB_SHA:0:8}" >> "$GITHUB_ENV"

      - name: Wait for K3s readiness
        run: |
          echo "=== Waiting for K3s cluster to be ready ==="
          # The action already sets up kubectl context, just verify it works
          kubectl cluster-info
          kubectl get nodes
          # Wait for core components
          kubectl wait --for=condition=Ready pod -l k8s-app=kube-dns -n kube-system --timeout=300s
          kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=traefik -n kube-system --timeout=300s
          # Verify Traefik CRDs. The 300s budget is shared by both CRDs.
          # Failure is decided by the CRD still being absent once the budget is
          # spent, so a CRD that shows up on the final poll is not misreported
          # as a timeout.
          deadline=300
          elapsed=0
          for crd in "middlewares.traefik.io" "ingressroutes.traefik.io"; do
            until kubectl get crd "$crd" &>/dev/null; do
              if [ "$elapsed" -ge "$deadline" ]; then
                echo "❌ Timeout waiting for $crd"
                exit 1
              fi
              sleep 3
              elapsed=$((elapsed + 3))
            done
          done
          echo "✅ K3s cluster ready"

      - name: Deploy eoAPI
        id: deploy
        run: |
          echo "=== eoAPI Deployment ==="
          export RELEASE_NAME="${RELEASE_NAME}"
          export PGO_VERSION="${{ env.PGO_VERSION }}"
          export CI_MODE=true
          # Deploy using consolidated script with CI mode
          ./scripts/deploy.sh --ci

      - name: Validate deployment
        run: |
          echo "=== Post-deployment validation ==="
          ./scripts/test.sh check-deployment

      - name: Run integration tests
        run: |
          export RELEASE_NAME="$RELEASE_NAME"
          ./scripts/test.sh integration --debug

      - name: Debug failed deployment
        if: failure()
        run: |
          ./scripts/debug-deployment.sh

      - name: Cleanup
        if: always()
        # Best-effort teardown: errors are ignored so cleanup never fails the job.
        run: |
          helm uninstall "$RELEASE_NAME" -n eoapi || true
          kubectl delete namespace eoapi || true

  # Static checks over the Markdown files under docs/.
  validate-docs:
    name: Validate documentation
    runs-on: ubuntu-latest
    steps:
      # Action versions aligned with the other jobs in this workflow.
      - uses: actions/checkout@v5

      - uses: actions/setup-node@v6
        with:
          node-version: '20'

      - name: Check internal links
        run: |
          broken=0
          # Both loops read from process substitutions so they run in the
          # current shell. With `find ... | while`, the loop body runs in a
          # subshell, `broken=1` is lost, and the step can never fail.
          while IFS= read -r file; do
            # `grep -no` prints one "<line>:](./target)" record per link, so
            # several links on the same line are all checked.
            while IFS=: read -r line link; do
              path="${link#'](./'}"
              path="${path%')'}"
              # Ignore "#fragment" suffixes; only the file's existence is checked.
              path="${path%%#*}"
              if [[ "$path" == images/* ]]; then
                full="docs/$path"
              else
                # "./" links are resolved relative to the linking file's directory.
                full="$(dirname "$file")/$path"
              fi
              if [[ ! -e "$full" ]]; then
                echo "❌ $file:$line -> $path"
                broken=1
              fi
            done < <(grep -no '](\./[^)]*)' "$file")
          done < <(find docs -name "*.md")
          exit "$broken"

      - name: Check external links
        # Advisory only: the trailing `|| true` keeps dead external links from
        # failing the build.
        # NOTE(review): the exact version pin was unreadable in the source
        # ("[email protected]"); pinned to major version 3 here - confirm the
        # intended exact version.
        run: |
          npm install -g markdown-link-check@3
          echo '{"timeout":"10s","retryCount":2,"aliveStatusCodes":[200,301,302,403,999]}' > .mlc.json
          find docs -name "*.md" -exec timeout 30 markdown-link-check {} --config .mlc.json \; || true

      - name: Check frontmatter
        run: |
          missing=0
          # Process substitution keeps the loop in the current shell so that
          # `missing=1` survives to the final `exit`.
          while IFS= read -r file; do
            if ! head -1 "$file" | grep -q "^---$"; then
              echo "❌ Missing frontmatter: $file"
              missing=1
            fi
          done < <(find docs -name "*.md" -not -path "docs/_includes/*")
          exit "$missing"

  # Deploys eoAPI with monitoring and autoscaling enabled, then runs the
  # observability and autoscaling pytest suites.
  observability-tests:
    name: Observability Tests
    # Same gating as integration-tests: PRs from this repository only.
    # NOTE(review): presumably skipped on `push` events as well - confirm intended.
    if: github.event.pull_request.head.repo.full_name == github.repository
    # NOTE(review): no step visible here requests an OIDC token; confirm that
    # `id-token: write` is still required.
    permissions:
      contents: 'read'
      id-token: 'write'
    needs: integration-tests
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v5

      - name: Start K3s cluster
        uses: jupyterhub/action-k3s-helm@v4
        with:
          k3s-channel: latest
          helm-version: ${{ env.HELM_VERSION }}
          metrics-enabled: false
          docker-enabled: true

      - name: Set release name
        # Same naming scheme as the integration job: "eoapi-" + 8-char SHA prefix.
        run: |
          echo "RELEASE_NAME=eoapi-${GITHUB_SHA:0:8}" >> "$GITHUB_ENV"

      - name: Wait for K3s to be fully ready
        run: |
          echo "=== Waiting for K3s to be fully ready ==="
          kubectl wait --for=condition=Ready pod -l k8s-app=kube-dns -n kube-system --timeout=300s
          kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=traefik -n kube-system --timeout=300s
          kubectl get nodes
          kubectl get pods --all-namespaces
          sleep 10
          echo "✅ K3s is ready"

      - name: Deploy eoAPI with monitoring
        run: |
          echo "=== Deploying eoAPI with monitoring stack ==="
          export RELEASE_NAME="$RELEASE_NAME"
          export PGO_VERSION="${{ env.PGO_VERSION }}"
          # GITHUB_SHA is already provided by the runner; export it for the script.
          export GITHUB_SHA
          ./scripts/deploy.sh --ci
          # Enable monitoring components
          helm upgrade "$RELEASE_NAME" ./charts/eoapi \
            --set monitoring.prometheus.enabled=true \
            --set monitoring.prometheusAdapter.enabled=true \
            --set monitoring.kube-state-metrics.enabled=true \
            --set monitoring.prometheus-node-exporter.enabled=true \
            --set observability.grafana.enabled=true \
            --set stac.autoscaling.enabled=true \
            --set raster.autoscaling.enabled=true \
            --set vector.autoscaling.enabled=true \
            --namespace eoapi \
            --wait --timeout=10m

      - name: Wait for monitoring stack to be ready
        # Each wait is best-effort: a component that is not ready is reported
        # but does not fail this step.
        run: |
          echo "=== Waiting for monitoring components ==="
          # Wait for Prometheus
          kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=prometheus -n eoapi --timeout=300s || echo "Prometheus not ready"
          # Wait for Grafana
          kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=grafana -n eoapi --timeout=300s || echo "Grafana not ready"
          # Wait for prometheus-adapter
          kubectl wait --for=condition=Ready pod -l app.kubernetes.io/name=prometheus-adapter -n eoapi --timeout=300s || echo "prometheus-adapter not ready"
          # Wait for HPA to be created
          sleep 30
          echo "=== Final monitoring stack status ==="
          kubectl get pods -n eoapi -l 'app.kubernetes.io/component in (server,grafana,prometheus-adapter)' || true
          kubectl get hpa -n eoapi || true

      - name: Run observability tests
        run: |
          echo "=== Running observability test suite ==="
          export RELEASE_NAME="$RELEASE_NAME"
          export NAMESPACE="eoapi"
          # Install python dependencies for testing
          python -m pip install --upgrade pip
          pip install pytest requests
          # Run observability tests
          python -m pytest .github/workflows/tests/test_observability.py -v --tb=short
          # Run autoscaling tests
          python -m pytest .github/workflows/tests/test_autoscaling.py -v --tb=short -m "not slow"

      - name: Debug observability stack on failure
        if: failure()
        # Diagnostics only: every command tolerates failure so the dump runs
        # through to the end.
        run: |
          echo "=== Observability Debug Information ==="
          echo "=== Monitoring Pods Status ==="
          kubectl get pods -n eoapi -l 'app.kubernetes.io/name in (prometheus,grafana,prometheus-adapter)' -o wide || true
          echo "=== HPA Status ==="
          kubectl get hpa -n eoapi -o wide || true
          kubectl describe hpa -n eoapi || true
          echo "=== Custom Metrics API ==="
          kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1" || true
          echo "=== Pod Metrics ==="
          kubectl top pods -n eoapi || true
          echo "=== Recent Events ==="
          kubectl get events -n eoapi --sort-by='.lastTimestamp' | tail -20 || true
          echo "=== Component Logs ==="
          kubectl logs -l app.kubernetes.io/name=prometheus-adapter -n eoapi --tail=50 || true
          kubectl logs -l app.kubernetes.io/name=grafana -n eoapi --tail=30 || true

      - name: Cleanup observability test
        if: always()
        # The release is upgraded in the eoapi namespace above, so the uninstall
        # must target that namespace; mirrors the integration-tests cleanup.
        run: |
          helm uninstall "$RELEASE_NAME" -n eoapi || true
          kubectl delete namespace eoapi || true