Add topology aware deletion priority example #547

Workflow file for this run

.github/workflows/e2e-1.26.yaml at b2c7252

	name: E2E-1.26

	on:
	push:
	branches:
	- master
	- release-*
	pull_request: {}
	workflow_dispatch: {}

	env:
	# Common versions
	GO_VERSION: '1.23.4'
	KIND_VERSION: 'v0.18.0'
	KIND_IMAGE: 'kindest/node:v1.26.4'
	KIND_CLUSTER_NAME: 'ci-testing'
	CERT_MANAGER_VERSION: 'v1.18.2'

	jobs:

	game-kruise:
	runs-on: ubuntu-24.04
	steps:
	- uses: actions/checkout@v3
	with:
	submodules: true
	fetch-depth: 0
	fetch-tags: true
	- name: Ensure tags are available
	run: git fetch --force --tags
	- name: Setup Go
	uses: actions/setup-go@v3
	with:
	go-version: ${{ env.GO_VERSION }}
	- name: Determine build metadata
	run: \|
	echo "::group::Determine build metadata"
	bash ./scripts/ci/determine-build-metadata.sh
	echo "::endgroup::"
	- name: Prepare audit policy
	run: \|
	echo "::group::Prepare audit policy"
	bash ./scripts/ci/prepare-kind-audit.sh
	echo "::endgroup::"
	- name: Setup Kind Cluster
	uses: helm/kind-action@v1.3.0
	with:
	node_image: ${{ env.KIND_IMAGE }}
	cluster_name: ${{ env.KIND_CLUSTER_NAME }}
	config: ./test/kind-conf.yaml
	version: ${{ env.KIND_VERSION }}
	- name: Ensure audit log file exists and is world-readable
	run: \|
	echo "::group::Ensure audit log file"
	bash ./scripts/ci/ensure-audit-log.sh
	echo "::endgroup::"
	- name: Build image
	run: \|
	echo "::group::Build manager image"
	bash ./scripts/ci/build-manager-image.sh
	echo "::endgroup::"
	- name: Install Cert-Manager
	run: \|
	echo "::group::Install Cert-Manager"
	bash ./scripts/ci/install-cert-manager.sh
	echo "::endgroup::"
	- name: Deploy Observability Infrastructure
	run: \|
	echo "::group::Deploy observability stack"
	set -ex
	echo "=== Deploying observability stack for tracing E2E tests ==="
	cd test/e2e

	# Deploy the stack (script will not exit on pod failures)
	./setup-k8s-observability.sh deploy

	echo ""
	echo "=== Checking deployment status ==="
	kubectl get pods -n observability -o wide

	# Check if OTel Collector is running properly
	OTEL_READY=$(kubectl get pods -n observability -l app=otel-collector -o jsonpath='{.items[0].status.containerStatuses[0].ready}' 2>/dev/null \|\| echo "false")

	if [ "$OTEL_READY" != "true" ]; then
	echo ""
	echo "❌ ERROR: OTel Collector is not ready!"
	echo ""
	echo "=== Running comprehensive diagnostics ==="
	./debug-otel-collector.sh observability \|\| true
	echo ""
	echo "=== Extracting error keywords from logs ==="
	kubectl logs -n observability -l app=otel-collector --tail=200 2>&1 \| grep -E -i "error\|fatal\|panic\|fail\|invalid" \| head -50 \|\| echo "No obvious errors found"
	echo ""
	echo "=== Checking previous logs if pod restarted ==="
	kubectl logs -n observability -l app=otel-collector --previous --tail=100 2>&1 \|\| echo "No previous logs available"
	exit 1
	fi

	# Check other components (warnings only, don't fail)
	for component in tempo loki prometheus; do
	READY=$(kubectl get pods -n observability -l app=$component -o jsonpath='{.items[0].status.containerStatuses[0].ready}' 2>/dev/null \|\| echo "false")
	if [ "$READY" != "true" ]; then
	echo "⚠️ WARNING: $component is not ready, but continuing..."
	else
	echo "✅ $component is ready"
	fi
	done

	echo ""
	echo "=== Final observability stack status ==="
	kubectl get pods -n observability
	echo "✅ Observability stack deployment completed"
	echo "::endgroup::"
	- name: Install Kruise
	run: \|
	echo "::group::Install Kruise"
	bash ./scripts/ci/install-kruise.sh
	echo "::endgroup::"
	- name: Install Kruise Game
	run: \|
	echo "::group::Install Kruise Game"
	set -ex
	kubectl cluster-info
	IMG=${E2E_IMAGE} \
	ENABLE_TRACING=true \
	OTEL_COLLECTOR_ENDPOINT=otel-collector.observability.svc.cluster.local:4317 \
	OTEL_SAMPLING_RATE=1.0 \
	./scripts/deploy_kind.sh
	for ((i=1;i<10;i++));
	do
	set +e
	PODS=$(kubectl get pod -n kruise-game-system \| grep '1/1' \| wc -l)
	set -e
	if [ "$PODS" -eq "1" ]; then
	break
	fi
	sleep 3
	done
	set +e
	PODS=$(kubectl get pod -n kruise-game-system \| grep '1/1' \| wc -l)
	kubectl get node -o yaml
	kubectl get all -n kruise-game-system -o yaml
	set -e
	if [ "$PODS" -eq "1" ]; then
	echo "Wait for kruise-game ready successfully"
	else
	echo "Timeout to wait for kruise-game ready"
	exit 1
	fi
	echo "::endgroup::"
	- name: Verify Kind Cluster
	run: \|
	echo "::group::Verify Kind cluster"
	bash ./scripts/ci/verify-kind-cluster.sh
	echo "::endgroup::"
	- name: Setup Port Forwards for Observability
	run: \|
	echo "::group::Setup observability port-forwards"
	set -x # Enable command echoing for debugging
	echo "=== Setting up port forwards for Tempo and Loki ==="

	# First, verify the services exist and have endpoints
	echo "--- Checking Tempo service ---"
	kubectl get svc -n observability tempo -o yaml \|\| echo "❌ Tempo service not found"
	kubectl get endpoints -n observability tempo \|\| echo "❌ Tempo endpoints not found"

	echo "--- Checking Loki service ---"
	kubectl get svc -n observability loki -o yaml \|\| echo "❌ Loki service not found"
	kubectl get endpoints -n observability loki \|\| echo "❌ Loki endpoints not found"

	echo "--- Checking Tempo pod status ---"
	kubectl get pods -n observability -l app.kubernetes.io/name=tempo \|\| echo "❌ No Tempo pods"

	echo "--- Checking Loki pod status ---"
	kubectl get pods -n observability -l app.kubernetes.io/name=loki \|\| echo "❌ No Loki pods"

	# Port forward Tempo (background, with verbose output)
	echo "--- Starting Tempo port-forward ---"
	kubectl port-forward -n observability svc/tempo 3200:3200 -v=6 &
	TEMPO_PID=$!
	echo $TEMPO_PID > /tmp/tempo-pf.pid
	echo "Tempo port-forward PID: $TEMPO_PID"

	# Port forward Loki (background, with verbose output)
	echo "--- Starting Loki port-forward ---"
	kubectl port-forward -n observability svc/loki 3100:3100 -v=6 &
	LOKI_PID=$!
	echo $LOKI_PID > /tmp/loki-pf.pid
	echo "Loki port-forward PID: $LOKI_PID"

	# Wait for port forwards to be ready
	echo "--- Waiting for port forwards to establish ---"
	sleep 10

	# Check if processes are still running
	echo "--- Checking port-forward processes ---"
	if ps -p $TEMPO_PID > /dev/null; then
	echo "✓ Tempo port-forward process is running"
	else
	echo "❌ Tempo port-forward process died"
	cat /tmp/tempo-pf.pid
	fi

	if ps -p $LOKI_PID > /dev/null; then
	echo "✓ Loki port-forward process is running"
	else
	echo "❌ Loki port-forward process died"
	cat /tmp/loki-pf.pid
	fi

	# Check if ports are listening
	echo "--- Checking listening ports ---"
	netstat -tuln \| grep -E ':(3200\|3100)' \|\| echo "⚠️ Ports not listening"
	ss -tuln \| grep -E ':(3200\|3100)' \|\| echo "⚠️ Ports not found by ss"

	# Try to connect to the ports
	echo "--- Testing connectivity ---"

	echo "Testing Tempo (localhost:3200)..."
	if curl -v --max-time 5 http://localhost:3200/ready 2>&1; then
	echo "✓ Tempo /ready endpoint responded"
	else
	echo "❌ Tempo /ready endpoint failed"
	fi

	echo "Testing Tempo search API..."
	if curl -v --max-time 5 "http://localhost:3200/api/search?tags=service.name=test&limit=1" 2>&1; then
	echo "✓ Tempo /api/search endpoint responded"
	else
	echo "❌ Tempo /api/search endpoint failed"
	fi

	echo "Testing Loki (localhost:3100)..."
	if curl -v --max-time 5 http://localhost:3100/ready 2>&1; then
	echo "✓ Loki /ready endpoint responded"
	else
	echo "❌ Loki /ready endpoint failed"
	fi

	echo "--- Port forward setup complete ---"
	echo "TEMPO_PID=$TEMPO_PID"
	echo "LOKI_PID=$LOKI_PID"
	echo "::endgroup::"
	- name: Verify Tracing Configuration
	run: \|
	echo "::group::Verify tracing configuration"
	bash ./scripts/ci/verify-tracing-config.sh
	echo "::endgroup::"
	- name: Verify Controller Metrics Endpoint
	run: \|
	echo "::group::Verify controller metrics"
	set -euo pipefail
	echo "=== Verifying controller metrics endpoint ==="
	METRICS_SVC=$(kubectl get svc -n kruise-game-system -l control-plane=controller-manager -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' \| grep metrics-service \| head -n 1 \|\| true)
	if [ -z "$METRICS_SVC" ]; then
	echo "❌ Could not find controller metrics Service"
	kubectl get svc -n kruise-game-system
	exit 1
	fi
	echo "Using metrics service: $METRICS_SVC"

	echo "Waiting for metrics endpoints to be ready..."
	for i in {1..12}; do
	ENDPOINT_READY=$(kubectl get endpoints -n kruise-game-system "$METRICS_SVC" -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null \|\| true)
	if [ -n "$ENDPOINT_READY" ]; then
	echo "Endpoints ready (IP=$ENDPOINT_READY)"
	break
	fi
	echo " endpoints not ready yet (attempt $i/12); sleeping 5s"
	sleep 5
	done

	ENDPOINT_READY=$(kubectl get endpoints -n kruise-game-system "$METRICS_SVC" -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null \|\| true)
	if [ -z "$ENDPOINT_READY" ]; then
	echo "❌ Metrics service has no ready endpoints"
	kubectl describe svc "$METRICS_SVC" -n kruise-game-system
	kubectl get pods -n kruise-game-system -l control-plane=controller-manager
	exit 1
	fi

	echo "Attempting to query metrics via API server service proxy..."
	set +e
	PROXY_OUTPUT=$(kubectl get --raw "/api/v1/namespaces/kruise-game-system/services/${METRICS_SVC}:http-metrics/proxy/metrics" 2> /tmp/proxy_err \| head -n 200)
	PROXY_STATUS=$?
	set -e

	if [ $PROXY_STATUS -ne 0 ]; then
	echo "❌ Service proxy request failed:"
	cat /tmp/proxy_err
	echo "--- Service describe ---"
	kubectl describe svc "$METRICS_SVC" -n kruise-game-system \|\| true
	echo "--- Endpoints ---"
	kubectl get endpoints "$METRICS_SVC" -n kruise-game-system -o yaml \|\| true
	echo "--- Controller pods ---"
	kubectl get pods -n kruise-game-system -l control-plane=controller-manager -o wide \|\| true

	echo "Attempting to read metrics directly from controller pod..."
	CONTROLLER_POD=$(kubectl get pods -n kruise-game-system -l control-plane=controller-manager -o jsonpath='{.items[0].metadata.name}')
	set +e
	DIRECT_FULL_OUTPUT=$(kubectl exec -n kruise-game-system "$CONTROLLER_POD" -- wget -qO- http://127.0.0.1:8080/metrics 2> /tmp/direct_err)
	DIRECT_STATUS=$?
	set -e
	if [ $DIRECT_STATUS -ne 0 ]; then
	echo "❌ Direct pod metrics request failed:"
	cat /tmp/direct_err
	exit 1
	fi
	DIRECT_OUTPUT=$(echo "$DIRECT_FULL_OUTPUT" \| head -n 200)
	echo "--- Sample metrics output (first 20 lines) ---"
	echo "$DIRECT_OUTPUT" \| head -n 20

	if ! echo "$DIRECT_OUTPUT" \| grep -q "controller_runtime_webhook_requests_total"; then
	echo "❌ Expected controller-runtime metrics not found even via direct pod exec"
	exit 1
	fi

	echo "⚠️ Service proxy failed but direct pod metrics endpoint is reachable"
	else
	echo "--- Sample metrics output (first 20 lines) ---"
	echo "$PROXY_OUTPUT" \| head -n 20
	if ! echo "$PROXY_OUTPUT" \| grep -q "controller_runtime_webhook_requests_total"; then
	echo "❌ Expected controller-runtime metrics not found in /metrics output"
	exit 1
	fi
	echo "✅ Controller metrics endpoint reachable via service proxy"
	fi
	echo "::endgroup::"
	- name: Run E2E Tests
	env:
	E2E_ARTIFACT_ROOT: /tmp/e2e-artifacts
	E2E_ARTIFACT_SUFFIX: main
	E2E_AUDIT_LOG_PATH: /tmp/kind-audit/audit.log
	E2E_GINKGO_TIMEOUT: 60m
	E2E_MAX_RESTARTS: "0"
	TEMPO_URL: http://localhost:3200
	LOKI_URL: http://localhost:3100
	E2E_OBSERVABILITY_DEBUG: "true"
	run: \|
	echo "::group::Run E2E tests"
	bash ./scripts/ci/run-e2e-tests.sh
	echo "::endgroup::"
	- name: Collect Additional Diagnostics
	if: always()
	env:
	E2E_ARTIFACT_ROOT: /tmp/e2e-artifacts
	run: \|
	echo "::group::Collect E2E diagnostics"
	bash ./scripts/ci/collect-e2e-artifacts.sh
	echo "::endgroup::"
	- name: Upload E2E Test Artifacts
	if: always()
	uses: actions/upload-artifact@v4
	with:
	name: e2e-test-artifacts-${{ env.KIND_VERSION }}
	path: /tmp/e2e-artifacts
	if-no-files-found: warn
	retention-days: 7
	compression-level: 6

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add topology aware deletion priority example #547

Workflow file

Add topology aware deletion priority example #547

Uh oh!

Workflow file for this run