-
Notifications
You must be signed in to change notification settings - Fork 161
358 lines (326 loc) · 14.4 KB
/
e2e-1.26.yaml
File metadata and controls
358 lines (326 loc) · 14.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
# E2E workflow against a Kind cluster running the node image pinned in
# KIND_IMAGE below.
name: E2E-1.26

# Runs on pushes to master and release-* branches, on every pull request,
# and on manual dispatch.
on:
  push:
    branches:
      - master
      - release-*
  pull_request: {}
  workflow_dispatch: {}

env:
  # Common versions
  # All values are quoted so they stay strings (e.g. '1.23.4' is never
  # re-typed by the YAML parser).
  GO_VERSION: '1.23.4'
  KIND_VERSION: 'v0.18.0'
  KIND_IMAGE: 'kindest/node:v1.26.4'
  KIND_CLUSTER_NAME: 'ci-testing'
  # NOTE(review): not referenced in this workflow file itself; presumably
  # read by scripts/ci/install-cert-manager.sh -- confirm.
  CERT_MANAGER_VERSION: 'v1.18.2'
# Job declaration: a single job, game-kruise, on an ubuntu-24.04 runner.
# Every step listed after `steps:` belongs to this job.
jobs:
game-kruise:
runs-on: ubuntu-24.04
steps:
# Check out the repository with full history (fetch-depth: 0), tags and
# submodules.
- uses: actions/checkout@v3
  with:
    fetch-depth: 0
    fetch-tags: true
    submodules: true
# Re-fetch all tags, overwriting any local tag that differs from the remote.
# NOTE(review): presumably needed by the build-metadata script that runs
# later -- confirm.
- name: Ensure tags are available
  run: git fetch --force --tags
# Install the Go toolchain pinned by the workflow-level GO_VERSION.
- name: Setup Go
  uses: actions/setup-go@v3
  with:
    go-version: ${{ env.GO_VERSION }}
# Runs scripts/ci/determine-build-metadata.sh inside a collapsible log group.
# NOTE(review): E2E_IMAGE, used by the "Install Kruise Game" step, is not
# defined in this file; presumably this script (or the image build script)
# exports it -- confirm.
- name: Determine build metadata
  run: |
    echo "::group::Determine build metadata"
    bash ./scripts/ci/determine-build-metadata.sh
    echo "::endgroup::"
# Runs scripts/ci/prepare-kind-audit.sh inside a collapsible log group.
# It runs before the Kind cluster is created.
- name: Prepare audit policy
  run: |
    echo "::group::Prepare audit policy"
    bash ./scripts/ci/prepare-kind-audit.sh
    echo "::endgroup::"
# Create the Kind cluster from test/kind-conf.yaml; the kind version, node
# image and cluster name all come from the workflow-level env block.
- name: Setup Kind Cluster
  uses: helm/kind-action@v1.3.0
  with:
    version: ${{ env.KIND_VERSION }}
    node_image: ${{ env.KIND_IMAGE }}
    cluster_name: ${{ env.KIND_CLUSTER_NAME }}
    config: ./test/kind-conf.yaml
# Runs scripts/ci/ensure-audit-log.sh inside a collapsible log group.
# NOTE(review): the "Run E2E Tests" step points E2E_AUDIT_LOG_PATH at
# /tmp/kind-audit/audit.log; presumably that is the file handled here --
# confirm against the script.
- name: Ensure audit log file exists and is world-readable
  run: |
    echo "::group::Ensure audit log file"
    bash ./scripts/ci/ensure-audit-log.sh
    echo "::endgroup::"
# Runs scripts/ci/build-manager-image.sh inside a collapsible log group.
- name: Build image
  run: |
    echo "::group::Build manager image"
    bash ./scripts/ci/build-manager-image.sh
    echo "::endgroup::"
# Runs scripts/ci/install-cert-manager.sh inside a collapsible log group.
- name: Install Cert-Manager
  run: |
    echo "::group::Install Cert-Manager"
    bash ./scripts/ci/install-cert-manager.sh
    echo "::endgroup::"
# Deploy the observability stack used by the tracing E2E tests.
# The OTel Collector must be ready or the step fails after dumping
# diagnostics; tempo, loki and prometheus only produce warnings.
- name: Deploy Observability Infrastructure
  run: |
    echo "::group::Deploy observability stack"
    set -ex

    # Print the `ready` flag of the first container of the first pod carrying
    # label app=$1 in the observability namespace; prints "false" when the
    # query itself fails (for example when no such pod exists yet).
    first_pod_ready() {
      kubectl get pods -n observability -l "app=$1" -o jsonpath='{.items[0].status.containerStatuses[0].ready}' 2>/dev/null || echo "false"
    }

    # Dump everything useful about a broken OTel Collector, then abort.
    # Each diagnostic is best-effort so one failing command cannot hide
    # the others.
    abort_with_otel_diagnostics() {
      echo ""
      echo "❌ ERROR: OTel Collector is not ready!"
      echo ""
      echo "=== Running comprehensive diagnostics ==="
      ./debug-otel-collector.sh observability || true
      echo ""
      echo "=== Extracting error keywords from logs ==="
      kubectl logs -n observability -l app=otel-collector --tail=200 2>&1 | grep -E -i "error|fatal|panic|fail|invalid" | head -50 || echo "No obvious errors found"
      echo ""
      echo "=== Checking previous logs if pod restarted ==="
      kubectl logs -n observability -l app=otel-collector --previous --tail=100 2>&1 || echo "No previous logs available"
      exit 1
    }

    echo "=== Deploying observability stack for tracing E2E tests ==="
    # Both helper scripts below are invoked relative to test/e2e.
    cd test/e2e
    # Deploy the stack (script will not exit on pod failures)
    ./setup-k8s-observability.sh deploy
    echo ""
    echo "=== Checking deployment status ==="
    kubectl get pods -n observability -o wide

    # Hard requirement: the collector has to be up.
    collector_state=$(first_pod_ready otel-collector)
    if [ "$collector_state" != "true" ]; then
      abort_with_otel_diagnostics
    fi

    # Soft requirements: report, but never fail the step.
    for backend in tempo loki prometheus; do
      backend_state=$(first_pod_ready "$backend")
      if [ "$backend_state" = "true" ]; then
        echo "✅ $backend is ready"
      else
        echo "⚠️ WARNING: $backend is not ready, but continuing..."
      fi
    done

    echo ""
    echo "=== Final observability stack status ==="
    kubectl get pods -n observability
    echo "✅ Observability stack deployment completed"
    echo "::endgroup::"
# Runs scripts/ci/install-kruise.sh inside a collapsible log group.
- name: Install Kruise
  run: |
    echo "::group::Install Kruise"
    bash ./scripts/ci/install-kruise.sh
    echo "::endgroup::"
# Deploy kruise-game with tracing enabled, then wait until exactly one pod
# in kruise-game-system shows 1/1 containers ready.
- name: Install Kruise Game
  run: |
    echo "::group::Install Kruise Game"
    set -ex

    # Count the pods in kruise-game-system whose listing contains "1/1".
    # `grep -c` exits 1 on a zero count (and the listing may fail while the
    # namespace is still coming up), hence `|| true`; the printed count is
    # still "0" in those cases.
    ready_pod_count() {
      kubectl get pod -n kruise-game-system | grep -c '1/1' || true
    }

    kubectl cluster-info

    # The variables are passed only to deploy_kind.sh's environment.
    IMG=${E2E_IMAGE} \
    ENABLE_TRACING=true \
    OTEL_COLLECTOR_ENDPOINT=otel-collector.observability.svc.cluster.local:4317 \
    OTEL_SAMPLING_RATE=1.0 \
    ./scripts/deploy_kind.sh

    # Poll up to 9 times, 3 seconds apart.
    for attempt in {1..9}; do
      if [ "$(ready_pod_count)" -eq 1 ]; then
        break
      fi
      sleep 3
    done

    # Final verdict is taken after the loop; the cluster state is dumped
    # either way (best-effort) so failures can be diagnosed from the log.
    ready_now=$(ready_pod_count)
    kubectl get node -o yaml || true
    kubectl get all -n kruise-game-system -o yaml || true

    if [ "$ready_now" -ne 1 ]; then
      echo "Timeout to wait for kruise-game ready"
      exit 1
    fi
    echo "Wait for kruise-game ready successfully"
    echo "::endgroup::"
# Runs scripts/ci/verify-kind-cluster.sh inside a collapsible log group.
- name: Verify Kind Cluster
  run: |
    echo "::group::Verify Kind cluster"
    bash ./scripts/ci/verify-kind-cluster.sh
    echo "::endgroup::"
# Start background port-forwards to Tempo (3200) and Loki (3100) so the E2E
# tests can reach them on localhost, then print diagnostics.
# The step is diagnostic only: a dead port-forward or an unreachable endpoint
# is reported but does not fail the step.
- name: Setup Port Forwards for Observability
  run: |
    echo "::group::Setup observability port-forwards"
    set -x  # Enable command echoing for debugging

    # Report whether the background port-forward with PID $2 (display name $1)
    # is still alive. For a dead process, collect its exit status with `wait`
    # so the log shows how it terminated; kubectl's own -v=6 output is already
    # in this step's log.
    report_forward() {
      if ps -p "$2" > /dev/null; then
        echo "✓ $1 port-forward process is running"
      else
        local rc=0
        wait "$2" || rc=$?
        echo "❌ $1 port-forward process died"
        echo "$1 port-forward (PID $2) exit status: $rc"
      fi
    }

    # Print intro line $1, then GET URL $3 and report on endpoint $2.
    # curl runs without -f, so any HTTP response (even an error status)
    # counts as "responded"; only connection failures and timeouts count as
    # "failed".
    probe() {
      echo "$1"
      if curl -v --max-time 5 "$3" 2>&1; then
        echo "✓ $2 endpoint responded"
      else
        echo "❌ $2 endpoint failed"
      fi
    }

    echo "=== Setting up port forwards for Tempo and Loki ==="
    # First, verify the services exist and have endpoints
    echo "--- Checking Tempo service ---"
    kubectl get svc -n observability tempo -o yaml || echo "❌ Tempo service not found"
    kubectl get endpoints -n observability tempo || echo "❌ Tempo endpoints not found"
    echo "--- Checking Loki service ---"
    kubectl get svc -n observability loki -o yaml || echo "❌ Loki service not found"
    kubectl get endpoints -n observability loki || echo "❌ Loki endpoints not found"
    # kubectl exits 0 when a label selector matches nothing, so the fallback
    # messages below only appear if the query itself fails.
    # NOTE(review): the deploy step selects these pods with app=<name>, this
    # step with app.kubernetes.io/name=<name> -- confirm which label the
    # manifests actually set.
    echo "--- Checking Tempo pod status ---"
    kubectl get pods -n observability -l app.kubernetes.io/name=tempo || echo "❌ No Tempo pods"
    echo "--- Checking Loki pod status ---"
    kubectl get pods -n observability -l app.kubernetes.io/name=loki || echo "❌ No Loki pods"

    # Port forward Tempo (background, with verbose output)
    echo "--- Starting Tempo port-forward ---"
    kubectl port-forward -n observability svc/tempo 3200:3200 -v=6 &
    TEMPO_PID=$!
    echo $TEMPO_PID > /tmp/tempo-pf.pid
    echo "Tempo port-forward PID: $TEMPO_PID"

    # Port forward Loki (background, with verbose output)
    echo "--- Starting Loki port-forward ---"
    kubectl port-forward -n observability svc/loki 3100:3100 -v=6 &
    LOKI_PID=$!
    echo $LOKI_PID > /tmp/loki-pf.pid
    echo "Loki port-forward PID: $LOKI_PID"

    # Wait for port forwards to be ready
    echo "--- Waiting for port forwards to establish ---"
    sleep 10

    # Check if processes are still running
    echo "--- Checking port-forward processes ---"
    report_forward Tempo "$TEMPO_PID"
    report_forward Loki "$LOKI_PID"

    # Check if ports are listening
    echo "--- Checking listening ports ---"
    netstat -tuln | grep -E ':(3200|3100)' || echo "⚠️ Ports not listening"
    ss -tuln | grep -E ':(3200|3100)' || echo "⚠️ Ports not found by ss"

    # Try to connect to the ports
    echo "--- Testing connectivity ---"
    probe "Testing Tempo (localhost:3200)..." "Tempo /ready" http://localhost:3200/ready
    probe "Testing Tempo search API..." "Tempo /api/search" "http://localhost:3200/api/search?tags=service.name=test&limit=1"
    probe "Testing Loki (localhost:3100)..." "Loki /ready" http://localhost:3100/ready

    echo "--- Port forward setup complete ---"
    echo "TEMPO_PID=$TEMPO_PID"
    echo "LOKI_PID=$LOKI_PID"
    echo "::endgroup::"
# Runs scripts/ci/verify-tracing-config.sh inside a collapsible log group.
- name: Verify Tracing Configuration
  run: |
    echo "::group::Verify tracing configuration"
    bash ./scripts/ci/verify-tracing-config.sh
    echo "::endgroup::"
# Verify that the controller exposes controller-runtime metrics.
# Flow: locate the metrics Service -> wait for a ready endpoint -> fetch
# /metrics through the API-server service proxy -> on proxy failure fall back
# to fetching from inside the controller pod. The step fails if neither path
# returns the metric controller_runtime_webhook_requests_total.
- name: Verify Controller Metrics Endpoint
  run: |
    echo "::group::Verify controller metrics"
    set -euo pipefail

    # Print the IP of the first ready endpoint address of Service $1 in
    # kruise-game-system, or nothing when there is none yet.
    first_endpoint_ip() {
      kubectl get endpoints -n kruise-game-system "$1" -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || true
    }

    # Show the first 20 lines of metrics payload $1, then require the webhook
    # request counter anywhere in the FULL payload; on a miss print $2 and
    # fail the step.
    # Here-strings are used instead of `echo ... | head` / `echo ... | grep -q`
    # because with pipefail an early-exiting reader can kill the writer with
    # SIGPIPE and turn a successful check into a failure.
    require_webhook_metric() {
      echo "--- Sample metrics output (first 20 lines) ---"
      head -n 20 <<<"$1"
      if ! grep -q "controller_runtime_webhook_requests_total" <<<"$1"; then
        echo "$2"
        exit 1
      fi
    }

    echo "=== Verifying controller metrics endpoint ==="
    METRICS_SVC=$(kubectl get svc -n kruise-game-system -l control-plane=controller-manager -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep metrics-service | head -n 1 || true)
    if [ -z "$METRICS_SVC" ]; then
      echo "❌ Could not find controller metrics Service"
      kubectl get svc -n kruise-game-system
      exit 1
    fi
    echo "Using metrics service: $METRICS_SVC"

    echo "Waiting for metrics endpoints to be ready..."
    ENDPOINT_READY=""
    for i in {1..12}; do
      ENDPOINT_READY=$(first_endpoint_ip "$METRICS_SVC")
      if [ -n "$ENDPOINT_READY" ]; then
        echo "Endpoints ready (IP=$ENDPOINT_READY)"
        break
      fi
      echo " endpoints not ready yet (attempt $i/12); sleeping 5s"
      sleep 5
    done
    # One last look after the final sleep before giving up.
    if [ -z "$ENDPOINT_READY" ]; then
      ENDPOINT_READY=$(first_endpoint_ip "$METRICS_SVC")
    fi
    if [ -z "$ENDPOINT_READY" ]; then
      echo "❌ Metrics service has no ready endpoints"
      kubectl describe svc "$METRICS_SVC" -n kruise-game-system
      kubectl get pods -n kruise-game-system -l control-plane=controller-manager
      exit 1
    fi

    echo "Attempting to query metrics via API server service proxy..."
    # Capture the complete payload and kubectl's own exit status. Piping
    # kubectl straight into `head` would, under pipefail, report a SIGPIPE
    # failure whenever head stops reading before kubectl finishes writing.
    PROXY_STATUS=0
    PROXY_OUTPUT=$(kubectl get --raw "/api/v1/namespaces/kruise-game-system/services/${METRICS_SVC}:http-metrics/proxy/metrics" 2> /tmp/proxy_err) || PROXY_STATUS=$?

    if [ "$PROXY_STATUS" -ne 0 ]; then
      echo "❌ Service proxy request failed:"
      cat /tmp/proxy_err
      # Best-effort context for debugging the proxy failure.
      echo "--- Service describe ---"
      kubectl describe svc "$METRICS_SVC" -n kruise-game-system || true
      echo "--- Endpoints ---"
      kubectl get endpoints "$METRICS_SVC" -n kruise-game-system -o yaml || true
      echo "--- Controller pods ---"
      kubectl get pods -n kruise-game-system -l control-plane=controller-manager -o wide || true

      echo "Attempting to read metrics directly from controller pod..."
      CONTROLLER_POD=$(kubectl get pods -n kruise-game-system -l control-plane=controller-manager -o jsonpath='{.items[0].metadata.name}')
      DIRECT_STATUS=0
      DIRECT_OUTPUT=$(kubectl exec -n kruise-game-system "$CONTROLLER_POD" -- wget -qO- http://127.0.0.1:8080/metrics 2> /tmp/direct_err) || DIRECT_STATUS=$?
      if [ "$DIRECT_STATUS" -ne 0 ]; then
        echo "❌ Direct pod metrics request failed:"
        cat /tmp/direct_err
        exit 1
      fi
      require_webhook_metric "$DIRECT_OUTPUT" "❌ Expected controller-runtime metrics not found even via direct pod exec"
      echo "⚠️ Service proxy failed but direct pod metrics endpoint is reachable"
    else
      require_webhook_metric "$PROXY_OUTPUT" "❌ Expected controller-runtime metrics not found in /metrics output"
      echo "✅ Controller metrics endpoint reachable via service proxy"
    fi
    echo "::endgroup::"
# Run the E2E suite through scripts/ci/run-e2e-tests.sh.
# All configuration is handed to the script via environment variables.
- name: Run E2E Tests
  env:
    # Artifact location (same root as the collect and upload steps).
    E2E_ARTIFACT_ROOT: /tmp/e2e-artifacts
    E2E_ARTIFACT_SUFFIX: main
    E2E_AUDIT_LOG_PATH: /tmp/kind-audit/audit.log
    # Test-run limits; quoted so the values stay strings.
    E2E_GINKGO_TIMEOUT: 60m
    E2E_MAX_RESTARTS: "0"
    # Local ends of the port-forwards opened in the earlier
    # "Setup Port Forwards for Observability" step.
    TEMPO_URL: http://localhost:3200
    LOKI_URL: http://localhost:3100
    E2E_OBSERVABILITY_DEBUG: "true"
  run: |
    echo "::group::Run E2E tests"
    bash ./scripts/ci/run-e2e-tests.sh
    echo "::endgroup::"
# Runs scripts/ci/collect-e2e-artifacts.sh inside a collapsible log group.
# `if: always()` keeps this step running even when an earlier step failed.
- name: Collect Additional Diagnostics
  if: always()
  env:
    E2E_ARTIFACT_ROOT: /tmp/e2e-artifacts
  run: |
    echo "::group::Collect E2E diagnostics"
    bash ./scripts/ci/collect-e2e-artifacts.sh
    echo "::endgroup::"
# Upload everything under /tmp/e2e-artifacts, even after a failure
# (`if: always()`). An empty directory only produces a warning.
# NOTE(review): the artifact name is suffixed with KIND_VERSION (the kind
# tool version), not the Kubernetes version this workflow targets -- confirm
# that is intended.
- name: Upload E2E Test Artifacts
  if: always()
  uses: actions/upload-artifact@v4
  with:
    name: e2e-test-artifacts-${{ env.KIND_VERSION }}
    path: /tmp/e2e-artifacts
    retention-days: 7
    compression-level: 6
    if-no-files-found: warn