Skip to content

Add topology aware deletion priority example #547

Add topology aware deletion priority example

Add topology aware deletion priority example #547

Workflow file for this run

name: E2E-1.26

# Triggers: pushes to master and release branches, every pull request,
# and manual runs from the Actions tab.
on:
  push:
    branches:
      - master
      - 'release-*'
  pull_request: {}
  workflow_dispatch: {}

env:
  # Common versions
  GO_VERSION: '1.23.4'
  KIND_VERSION: 'v0.18.0'
  KIND_IMAGE: 'kindest/node:v1.26.4'
  KIND_CLUSTER_NAME: 'ci-testing'
  CERT_MANAGER_VERSION: 'v1.18.2'

jobs:
  # Single job: builds the manager image, brings up a Kind cluster and
  # runs the end-to-end suite against it.
  game-kruise:
    runs-on: ubuntu-24.04
    steps:
# Check out the repository with submodules, full history and tags.
# v4 runs on the Node 20 runtime; v3 depends on the deprecated Node 16
# runtime. The inputs used here are unchanged between the two majors.
- uses: actions/checkout@v4
  with:
    submodules: true
    # 0 = fetch the complete history instead of a shallow clone.
    fetch-depth: 0
    fetch-tags: true
# Explicitly (re)fetch all tags, overwriting any local tag that differs
# from the remote one.
- name: Ensure tags are available
  run: git fetch --force --tags
# Install the Go toolchain pinned by the workflow-level GO_VERSION.
# v5 runs on the Node 20 runtime (v3 depends on the deprecated Node 16
# runtime). Note: setup-go v4 and later enable module/build caching by
# default; add `cache: false` here if that is not wanted.
- name: Setup Go
  uses: actions/setup-go@v5
  with:
    go-version: ${{ env.GO_VERSION }}
# Runs scripts/ci/determine-build-metadata.sh inside a collapsible log group.
# NOTE(review): presumably this exports values such as E2E_IMAGE for later
# steps - confirm against the script.
- name: Determine build metadata
  run: |
    echo "::group::Determine build metadata"
    bash ./scripts/ci/determine-build-metadata.sh
    echo "::endgroup::"
# Runs scripts/ci/prepare-kind-audit.sh inside a collapsible log group.
# This happens before the Kind cluster is created.
- name: Prepare audit policy
  run: |
    echo "::group::Prepare audit policy"
    bash ./scripts/ci/prepare-kind-audit.sh
    echo "::endgroup::"
# Create the Kind cluster from test/kind-conf.yaml using the Kind binary
# version, node image and cluster name pinned in the workflow-level env.
- name: Setup Kind Cluster
  uses: helm/kind-action@v1.3.0
  with:
    version: ${{ env.KIND_VERSION }}
    node_image: ${{ env.KIND_IMAGE }}
    cluster_name: ${{ env.KIND_CLUSTER_NAME }}
    config: ./test/kind-conf.yaml
# Runs scripts/ci/ensure-audit-log.sh inside a collapsible log group,
# after the cluster has been created.
- name: Ensure audit log file exists and is world-readable
  run: |
    echo "::group::Ensure audit log file"
    bash ./scripts/ci/ensure-audit-log.sh
    echo "::endgroup::"
# Runs scripts/ci/build-manager-image.sh inside a collapsible log group.
- name: Build image
  run: |
    echo "::group::Build manager image"
    bash ./scripts/ci/build-manager-image.sh
    echo "::endgroup::"
# Runs scripts/ci/install-cert-manager.sh inside a collapsible log group.
# NOTE(review): presumably the script reads CERT_MANAGER_VERSION from the
# workflow env - confirm against the script.
- name: Install Cert-Manager
  run: |
    echo "::group::Install Cert-Manager"
    bash ./scripts/ci/install-cert-manager.sh
    echo "::endgroup::"
# Deploy the observability stack used by the tracing E2E tests.
# The OTel Collector is mandatory: if its first container is not ready the
# step dumps diagnostics and fails. Tempo, Loki and Prometheus are only
# reported (warning) and never fail the step.
- name: Deploy Observability Infrastructure
  run: |
    echo "::group::Deploy observability stack"
    set -ex

    # Print the `ready` flag of the first container of the first pod that
    # carries the label app=<name>; prints "false" when the query fails
    # (for example when no such pod exists).
    first_container_ready() {
      kubectl get pods -n observability -l app=$1 -o jsonpath='{.items[0].status.containerStatuses[0].ready}' 2>/dev/null || echo "false"
    }

    echo "=== Deploying observability stack for tracing E2E tests ==="
    cd test/e2e
    # Deploy the stack (script will not exit on pod failures)
    ./setup-k8s-observability.sh deploy

    echo ""
    echo "=== Checking deployment status ==="
    kubectl get pods -n observability -o wide

    # Hard requirement: the OTel Collector must be ready.
    OTEL_READY=$(first_container_ready otel-collector)
    if [ "$OTEL_READY" != "true" ]; then
      echo ""
      echo "❌ ERROR: OTel Collector is not ready!"
      echo ""
      echo "=== Running comprehensive diagnostics ==="
      ./debug-otel-collector.sh observability || true
      echo ""
      echo "=== Extracting error keywords from logs ==="
      kubectl logs -n observability -l app=otel-collector --tail=200 2>&1 | grep -E -i "error|fatal|panic|fail|invalid" | head -50 || echo "No obvious errors found"
      echo ""
      echo "=== Checking previous logs if pod restarted ==="
      kubectl logs -n observability -l app=otel-collector --previous --tail=100 2>&1 || echo "No previous logs available"
      exit 1
    fi

    # Soft requirement: report the remaining components, never fail on them.
    for component in tempo loki prometheus; do
      READY=$(first_container_ready $component)
      if [ "$READY" = "true" ]; then
        echo "✅ $component is ready"
      else
        echo "⚠️ WARNING: $component is not ready, but continuing..."
      fi
    done

    echo ""
    echo "=== Final observability stack status ==="
    kubectl get pods -n observability
    echo "✅ Observability stack deployment completed"
    echo "::endgroup::"
# Runs scripts/ci/install-kruise.sh inside a collapsible log group.
- name: Install Kruise
  run: |
    echo "::group::Install Kruise"
    bash ./scripts/ci/install-kruise.sh
    echo "::endgroup::"
# Deploy Kruise Game with tracing enabled, then wait (9 polls, 3 s apart)
# until exactly one pod in kruise-game-system shows READY 1/1.
# Node and namespace state is always dumped before the final verdict.
- name: Install Kruise Game
  run: |
    echo "::group::Install Kruise Game"
    set -ex
    kubectl cluster-info

    # E2E_IMAGE is not set in this workflow file.
    # NOTE(review): presumably exported by an earlier step - confirm.
    IMG=${E2E_IMAGE} \
    ENABLE_TRACING=true \
    OTEL_COLLECTOR_ENDPOINT=otel-collector.observability.svc.cluster.local:4317 \
    OTEL_SAMPLING_RATE=1.0 \
    ./scripts/deploy_kind.sh

    # Number of lines in the pod listing that contain "1/1".
    ready_pod_count() {
      kubectl get pod -n kruise-game-system | grep '1/1' | wc -l
    }

    attempt=1
    while [ "$attempt" -lt 10 ]; do
      # errexit is suspended so a failing lookup cannot abort the poll.
      set +e
      PODS=$(ready_pod_count)
      set -e
      if [ "$PODS" -eq "1" ]; then
        break
      fi
      sleep 3
      attempt=$((attempt + 1))
    done

    # Final measurement plus a state dump for post-mortem debugging.
    set +e
    PODS=$(ready_pod_count)
    kubectl get node -o yaml
    kubectl get all -n kruise-game-system -o yaml
    set -e

    if [ "$PODS" -ne "1" ]; then
      echo "Timeout to wait for kruise-game ready"
      exit 1
    fi
    echo "Wait for kruise-game ready successfully"
    echo "::endgroup::"
# Runs scripts/ci/verify-kind-cluster.sh inside a collapsible log group.
- name: Verify Kind Cluster
  run: |
    echo "::group::Verify Kind cluster"
    bash ./scripts/ci/verify-kind-cluster.sh
    echo "::endgroup::"
# Start background port-forwards for Tempo (3200) and Loki (3100) so the
# E2E step can reach them on localhost, then run best-effort diagnostics.
# Nothing in this step is fatal by design: failed checks are only reported.
#
# Each port-forward writes its (verbose) output to /tmp/<name>-pf.log.
# If a port-forward dies, the log is printed; the PID file alone only
# contains the PID that was already echoed and explains nothing.
- name: Setup Port Forwards for Observability
  run: |
    echo "::group::Setup observability port-forwards"
    set -x # Enable command echoing for debugging
    echo "=== Setting up port forwards for Tempo and Loki ==="

    # First, verify the services exist and have endpoints
    echo "--- Checking Tempo service ---"
    kubectl get svc -n observability tempo -o yaml || echo "❌ Tempo service not found"
    kubectl get endpoints -n observability tempo || echo "❌ Tempo endpoints not found"
    echo "--- Checking Loki service ---"
    kubectl get svc -n observability loki -o yaml || echo "❌ Loki service not found"
    kubectl get endpoints -n observability loki || echo "❌ Loki endpoints not found"
    # NOTE(review): the "Deploy Observability Infrastructure" step selects
    # these pods with `app=<component>`; confirm which label the manifests
    # actually set, otherwise these two listings are always empty.
    echo "--- Checking Tempo pod status ---"
    kubectl get pods -n observability -l app.kubernetes.io/name=tempo || echo "❌ No Tempo pods"
    echo "--- Checking Loki pod status ---"
    kubectl get pods -n observability -l app.kubernetes.io/name=loki || echo "❌ No Loki pods"

    # Port forward Tempo (background, verbose output captured in a log file)
    echo "--- Starting Tempo port-forward ---"
    kubectl port-forward -n observability svc/tempo 3200:3200 -v=6 > /tmp/tempo-pf.log 2>&1 &
    TEMPO_PID=$!
    echo $TEMPO_PID > /tmp/tempo-pf.pid
    echo "Tempo port-forward PID: $TEMPO_PID"

    # Port forward Loki (background, verbose output captured in a log file)
    echo "--- Starting Loki port-forward ---"
    kubectl port-forward -n observability svc/loki 3100:3100 -v=6 > /tmp/loki-pf.log 2>&1 &
    LOKI_PID=$!
    echo $LOKI_PID > /tmp/loki-pf.pid
    echo "Loki port-forward PID: $LOKI_PID"

    # Wait for port forwards to be ready
    echo "--- Waiting for port forwards to establish ---"
    sleep 10

    # Check if processes are still running; on death show why.
    echo "--- Checking port-forward processes ---"
    if ps -p $TEMPO_PID > /dev/null; then
      echo "✓ Tempo port-forward process is running"
    else
      echo "❌ Tempo port-forward process died"
      cat /tmp/tempo-pf.pid
      echo "--- Tempo port-forward log ---"
      cat /tmp/tempo-pf.log || true
    fi
    if ps -p $LOKI_PID > /dev/null; then
      echo "✓ Loki port-forward process is running"
    else
      echo "❌ Loki port-forward process died"
      cat /tmp/loki-pf.pid
      echo "--- Loki port-forward log ---"
      cat /tmp/loki-pf.log || true
    fi

    # Check if ports are listening
    echo "--- Checking listening ports ---"
    netstat -tuln | grep -E ':(3200|3100)' || echo "⚠️ Ports not listening"
    ss -tuln | grep -E ':(3200|3100)' || echo "⚠️ Ports not found by ss"

    # Try to connect to the ports. curl is run without --fail, so any HTTP
    # response (including a non-2xx one) counts as "responded".
    echo "--- Testing connectivity ---"
    echo "Testing Tempo (localhost:3200)..."
    if curl -v --max-time 5 http://localhost:3200/ready 2>&1; then
      echo "✓ Tempo /ready endpoint responded"
    else
      echo "❌ Tempo /ready endpoint failed"
    fi
    echo "Testing Tempo search API..."
    if curl -v --max-time 5 "http://localhost:3200/api/search?tags=service.name=test&limit=1" 2>&1; then
      echo "✓ Tempo /api/search endpoint responded"
    else
      echo "❌ Tempo /api/search endpoint failed"
    fi
    echo "Testing Loki (localhost:3100)..."
    if curl -v --max-time 5 http://localhost:3100/ready 2>&1; then
      echo "✓ Loki /ready endpoint responded"
    else
      echo "❌ Loki /ready endpoint failed"
    fi

    # The port-forward output no longer streams into the step log, so show
    # the most recent lines of each log here.
    echo "--- Recent port-forward output ---"
    tail -n 20 /tmp/tempo-pf.log /tmp/loki-pf.log || true

    echo "--- Port forward setup complete ---"
    echo "TEMPO_PID=$TEMPO_PID"
    echo "LOKI_PID=$LOKI_PID"
    echo "::endgroup::"
# Runs scripts/ci/verify-tracing-config.sh inside a collapsible log group.
- name: Verify Tracing Configuration
  run: |
    echo "::group::Verify tracing configuration"
    bash ./scripts/ci/verify-tracing-config.sh
    echo "::endgroup::"
# Verify that the controller exposes Prometheus metrics.
#   1. Locate the metrics Service of the controller-manager.
#   2. Wait up to 12 x 5 s for the Service to get an endpoint address.
#   3. Read /metrics through the API server service proxy; if that fails,
#      fall back to `wget` inside the controller pod.
#   4. Require the metric controller_runtime_webhook_requests_total.
#
# The metrics response is always captured in full before being inspected.
# Piping kubectl straight into `head` under `set -o pipefail` can report
# exit status 141 (SIGPIPE) when `head` exits early on a large response,
# which would make a healthy endpoint look like a failure.
- name: Verify Controller Metrics Endpoint
  run: |
    echo "::group::Verify controller metrics"
    set -euo pipefail
    echo "=== Verifying controller metrics endpoint ==="

    # `|| true` keeps errexit/pipefail from aborting when nothing matches;
    # the empty result is handled explicitly below.
    METRICS_SVC=$(kubectl get svc -n kruise-game-system -l control-plane=controller-manager -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep metrics-service | head -n 1 || true)
    if [ -z "$METRICS_SVC" ]; then
      echo "❌ Could not find controller metrics Service"
      kubectl get svc -n kruise-game-system
      exit 1
    fi
    echo "Using metrics service: $METRICS_SVC"

    echo "Waiting for metrics endpoints to be ready..."
    for i in {1..12}; do
      ENDPOINT_READY=$(kubectl get endpoints -n kruise-game-system "$METRICS_SVC" -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true)
      if [ -n "$ENDPOINT_READY" ]; then
        echo "Endpoints ready (IP=$ENDPOINT_READY)"
        break
      fi
      echo " endpoints not ready yet (attempt $i/12); sleeping 5s"
      sleep 5
    done
    # Re-query once so the verdict does not depend on how the loop ended.
    ENDPOINT_READY=$(kubectl get endpoints -n kruise-game-system "$METRICS_SVC" -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true)
    if [ -z "$ENDPOINT_READY" ]; then
      echo "❌ Metrics service has no ready endpoints"
      kubectl describe svc "$METRICS_SVC" -n kruise-game-system
      kubectl get pods -n kruise-game-system -l control-plane=controller-manager
      exit 1
    fi

    echo "Attempting to query metrics via API server service proxy..."
    # No pipeline here: PROXY_STATUS must reflect kubectl alone.
    set +e
    PROXY_OUTPUT=$(kubectl get --raw "/api/v1/namespaces/kruise-game-system/services/${METRICS_SVC}:http-metrics/proxy/metrics" 2> /tmp/proxy_err)
    PROXY_STATUS=$?
    set -e

    if [ $PROXY_STATUS -ne 0 ]; then
      echo "❌ Service proxy request failed:"
      cat /tmp/proxy_err
      echo "--- Service describe ---"
      kubectl describe svc "$METRICS_SVC" -n kruise-game-system || true
      echo "--- Endpoints ---"
      kubectl get endpoints "$METRICS_SVC" -n kruise-game-system -o yaml || true
      echo "--- Controller pods ---"
      kubectl get pods -n kruise-game-system -l control-plane=controller-manager -o wide || true

      echo "Attempting to read metrics directly from controller pod..."
      CONTROLLER_POD=$(kubectl get pods -n kruise-game-system -l control-plane=controller-manager -o jsonpath='{.items[0].metadata.name}')
      set +e
      DIRECT_OUTPUT=$(kubectl exec -n kruise-game-system "$CONTROLLER_POD" -- wget -qO- http://127.0.0.1:8080/metrics 2> /tmp/direct_err)
      DIRECT_STATUS=$?
      set -e
      if [ $DIRECT_STATUS -ne 0 ]; then
        echo "❌ Direct pod metrics request failed:"
        cat /tmp/direct_err
        exit 1
      fi

      echo "--- Sample metrics output (first 20 lines) ---"
      # Here-strings instead of `echo ... |`: no writer process that could
      # be killed by SIGPIPE and trip pipefail/errexit.
      head -n 20 <<< "$DIRECT_OUTPUT"
      if ! grep -q "controller_runtime_webhook_requests_total" <<< "$DIRECT_OUTPUT"; then
        echo "❌ Expected controller-runtime metrics not found even via direct pod exec"
        exit 1
      fi
      echo "⚠️ Service proxy failed but direct pod metrics endpoint is reachable"
    else
      echo "--- Sample metrics output (first 20 lines) ---"
      head -n 20 <<< "$PROXY_OUTPUT"
      # Search the complete response, not just a leading excerpt.
      if ! grep -q "controller_runtime_webhook_requests_total" <<< "$PROXY_OUTPUT"; then
        echo "❌ Expected controller-runtime metrics not found in /metrics output"
        exit 1
      fi
      echo "✅ Controller metrics endpoint reachable via service proxy"
    fi
    echo "::endgroup::"
# Run the E2E suite via scripts/ci/run-e2e-tests.sh.
# TEMPO_URL / LOKI_URL use the local ports 3200 / 3100 that the
# "Setup Port Forwards for Observability" step forwards to the cluster.
- name: Run E2E Tests
  env:
    # Artifact and audit-log locations
    E2E_ARTIFACT_ROOT: /tmp/e2e-artifacts
    E2E_ARTIFACT_SUFFIX: main
    E2E_AUDIT_LOG_PATH: /tmp/kind-audit/audit.log
    # Test-runner settings (kept as strings)
    E2E_GINKGO_TIMEOUT: '60m'
    E2E_MAX_RESTARTS: '0'
    E2E_OBSERVABILITY_DEBUG: 'true'
    # Observability backends
    TEMPO_URL: http://localhost:3200
    LOKI_URL: http://localhost:3100
  run: |
    echo "::group::Run E2E tests"
    bash ./scripts/ci/run-e2e-tests.sh
    echo "::endgroup::"
# Runs scripts/ci/collect-e2e-artifacts.sh with the artifact root set.
# `always()` makes this run even when an earlier step failed.
- name: Collect Additional Diagnostics
  if: always()
  env:
    E2E_ARTIFACT_ROOT: /tmp/e2e-artifacts
  run: |
    echo "::group::Collect E2E diagnostics"
    bash ./scripts/ci/collect-e2e-artifacts.sh
    echo "::endgroup::"
# Upload everything under /tmp/e2e-artifacts, even after a failure.
# An empty directory only produces a warning; artifacts are kept 7 days.
- name: Upload E2E Test Artifacts
  if: always()
  uses: actions/upload-artifact@v4
  with:
    # The artifact name is suffixed with the Kind version from the
    # workflow-level env.
    name: e2e-test-artifacts-${{ env.KIND_VERSION }}
    path: /tmp/e2e-artifacts
    if-no-files-found: warn
    retention-days: 7
    compression-level: 6