Skip to content

Commit 259c144

Browse files
authored
fix(ci): harden firecracker e2e flake handling (#84)
1 parent 37a95cd commit 259c144

File tree

1 file changed

+114
-25
lines changed

1 file changed

+114
-25
lines changed

scripts/ci-cleanroom-e2e.sh

Lines changed: 114 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ backends:
110110
kernel_image: $KERNEL_IMAGE
111111
vcpus: 2
112112
memory_mib: 1024
113-
launch_seconds: 45
113+
launch_seconds: 90
114114
EOF
115115

116116
if [[ -n "$PRIVILEGED_MODE" ]]; then
@@ -159,6 +159,27 @@ fi
159159
socket_path="$tmpdir/cleanroom.sock"
160160
listen_endpoint="unix://$socket_path"
161161

162+
#######################################
# Print failure diagnostics to stderr: the tail of the control-plane server
# log, followed by the tails of the most recently written Firecracker
# stdout/stderr logs found under the sandbox state directory.
# Globals:   tmpdir (read), XDG_STATE_HOME (read; may be unset)
# Arguments: $1 - number of trailing server.log lines to print (default 40)
# Outputs:   diagnostics on stderr; nothing on stdout
# Returns:   0 (best-effort; individual tail/find failures are ignored)
#######################################
dump_runtime_diagnostics() {
  local server_lines="${1:-40}"
  if [[ -f "$tmpdir/server.log" ]]; then
    echo "--- server log tail ---" >&2
    tail -n "$server_lines" "$tmpdir/server.log" >&2 || true
  fi

  # Surface recent Firecracker process logs when provisioning/agent readiness
  # flakes occur so failures are diagnosable from CI output alone.
  #
  # Tolerate an unset XDG_STATE_HOME: this helper runs on failure paths, and an
  # unbound-variable abort (set -u) or a search rooted at "/" would only hide
  # the diagnostics that were already printed above.
  local state_home="${XDG_STATE_HOME:-}"
  if [[ -z "$state_home" ]]; then
    return 0
  fi

  # Order by modification time (epoch seconds via GNU find's %T@) rather than
  # by path, so the six logs shown are the ones written most recently and not
  # whichever sandbox directory names happen to sort last.
  # The trailing `|| true` keeps a missing sandboxes directory from failing the
  # assignment when the caller runs with errexit/pipefail.
  local fc_logs
  fc_logs="$(
    find "$state_home/cleanroom/sandboxes" -maxdepth 3 -type f \
      \( -name 'firecracker.stdout.log' -o -name 'firecracker.stderr.log' \) \
      -printf '%T@\t%p\n' 2>/dev/null |
      LC_ALL=C sort -n | tail -n 6 | cut -f2- || true
  )"
  if [[ -n "$fc_logs" ]]; then
    echo "--- firecracker log tails ---" >&2
    local log_file
    while IFS= read -r log_file; do
      [[ -n "$log_file" ]] || continue
      echo "[$log_file]" >&2
      tail -n 30 "$log_file" >&2 || true
    done <<< "$fc_logs"
  fi
}
182+
162183
echo "--- :rocket: Start cleanroom control-plane"
163184
./dist/cleanroom serve --listen "$listen_endpoint" --gateway-listen ":0" >"$tmpdir/server.log" 2>&1 &
164185
srv_pid=$!
@@ -177,16 +198,42 @@ if [[ ! -S "$socket_path" ]]; then
177198
fi
178199

179200
echo "--- :white_check_mark: Launched execution smoke test"
180-
./dist/cleanroom exec --host "$listen_endpoint" -c "$PWD" -- sh -lc 'echo cleanroom-e2e' | tee "$tmpdir/exec.out"
181-
if ! grep -q '^cleanroom-e2e$' "$tmpdir/exec.out"; then
182-
echo "expected smoke-test output missing" >&2
201+
smoke_attempt=1
202+
smoke_max_attempts=3
203+
while true; do
204+
set +e
205+
./dist/cleanroom exec --host "$listen_endpoint" -c "$PWD" -- sh -lc 'echo cleanroom-e2e' >"$tmpdir/exec.out" 2>"$tmpdir/exec.err"
206+
smoke_status=$?
207+
set -e
208+
209+
if [[ "$smoke_status" -eq 0 ]] && grep -q '^cleanroom-e2e$' "$tmpdir/exec.out"; then
210+
cat "$tmpdir/exec.out"
211+
break
212+
fi
213+
214+
if [[ "$smoke_status" -ne 0 ]] && grep -q 'timed out waiting for vsock guest agent' "$tmpdir/exec.err" && [[ "$smoke_attempt" -lt "$smoke_max_attempts" ]]; then
215+
echo "smoke test hit transient vsock timeout (attempt $smoke_attempt/$smoke_max_attempts); retrying"
216+
sleep "$smoke_attempt"
217+
smoke_attempt=$((smoke_attempt + 1))
218+
continue
219+
fi
220+
221+
echo "smoke test failed (exit $smoke_status)" >&2
222+
echo "--- smoke stdout ---" >&2
223+
cat "$tmpdir/exec.out" >&2 || true
224+
echo "--- smoke stderr ---" >&2
225+
cat "$tmpdir/exec.err" >&2 || true
226+
dump_runtime_diagnostics 80
183227
exit 1
184-
fi
228+
done
185229

186230
echo "--- :satellite: Git gateway allow/deny test"
187-
set +e
188-
# shellcheck disable=SC2016
189-
./dist/cleanroom exec --host "$listen_endpoint" -c "$PWD" -- sh -lc '
231+
git_gateway_attempt=1
232+
git_gateway_max_attempts=3
233+
while true; do
234+
set +e
235+
# shellcheck disable=SC2016
236+
./dist/cleanroom exec --host "$listen_endpoint" -c "$PWD" -- sh -lc '
190237
set -eu
191238
192239
key="$(env | awk -F= '"'"'/^GIT_CONFIG_KEY_[0-9]+=url\.http:\/\/.+\/git\/github\.com\/\.insteadOf$/ {print $2; exit}'"'"')"
@@ -228,10 +275,25 @@ set +e
228275
allow_url="${gw}/github.com/buildkite/cleanroom.git/info/refs?service=git-upload-pack"
229276
deny_url="${gw}/gitlab.com/gitlab-org/gitlab.git/info/refs?service=git-upload-pack"
230277
231-
set +e
232-
allow_resp="$(wget -q -S -O - "$allow_url" 2>&1)"
233-
allow_rc=$?
234-
set -e
278+
# Retry allowlisted probe because transient host/gateway network stalls can
279+
# fail a single request even when policy behavior is correct.
280+
allow_attempt=1
281+
allow_max_attempts=3
282+
allow_resp=""
283+
allow_rc=1
284+
while [ "$allow_attempt" -le "$allow_max_attempts" ]; do
285+
set +e
286+
allow_resp="$(wget -q -S -O - "$allow_url" 2>&1)"
287+
allow_rc=$?
288+
set -e
289+
if [ "$allow_rc" -eq 0 ]; then
290+
break
291+
fi
292+
if [ "$allow_attempt" -lt "$allow_max_attempts" ]; then
293+
sleep "$allow_attempt"
294+
fi
295+
allow_attempt=$((allow_attempt + 1))
296+
done
235297
if [ "$allow_rc" -ne 0 ]; then
236298
echo "allowlisted host probe failed (exit $allow_rc)" >&2
237299
echo "$allow_resp" >&2
@@ -247,8 +309,9 @@ set +e
247309
echo "$allow_resp" >&2
248310
exit 5
249311
fi
250-
if ! echo "$allow_resp" | grep -q "git-upload-pack"; then
251-
echo "allowlisted host probe did not return git upload-pack response" >&2
312+
# Accept either a smart-protocol response marker or an explicit HTTP 200.
313+
if ! echo "$allow_resp" | grep -q "git-upload-pack" && ! echo "$allow_resp" | grep -Eq "HTTP/[0-9.]+[[:space:]]+200"; then
314+
echo "allowlisted host probe returned unexpected response shape" >&2
252315
echo "$allow_resp" >&2
253316
exit 5
254317
fi
@@ -274,34 +337,60 @@ set +e
274337
275338
echo "guest image missing both git and wget; cannot exercise git gateway" >&2
276339
exit 8
277-
' >"$tmpdir/git-gateway.out" 2>"$tmpdir/git-gateway.err"
278-
git_gateway_status=$?
279-
set -e
340+
' >"$tmpdir/git-gateway.out" 2>"$tmpdir/git-gateway.err"
341+
git_gateway_status=$?
342+
set -e
343+
344+
if [[ "$git_gateway_status" -eq 0 ]]; then
345+
break
346+
fi
347+
348+
if grep -Eq 'timed out waiting for vsock guest agent|deadline_exceeded|Connection refused|Operation timed out' "$tmpdir/git-gateway.err" && [[ "$git_gateway_attempt" -lt "$git_gateway_max_attempts" ]]; then
349+
echo "git gateway test hit transient transport error (attempt $git_gateway_attempt/$git_gateway_max_attempts); retrying"
350+
sleep "$git_gateway_attempt"
351+
git_gateway_attempt=$((git_gateway_attempt + 1))
352+
continue
353+
fi
354+
break
355+
done
280356

281357
if [[ "$git_gateway_status" -ne 0 ]]; then
282358
echo "git gateway allow/deny test failed (exit $git_gateway_status)" >&2
283359
echo "--- guest stdout ---" >&2
284360
cat "$tmpdir/git-gateway.out" >&2 || true
285361
echo "--- guest stderr ---" >&2
286362
cat "$tmpdir/git-gateway.err" >&2 || true
287-
echo "--- gateway log tail ---" >&2
288-
tail -n 40 "$tmpdir/server.log" >&2 || true
363+
dump_runtime_diagnostics 80
289364
exit 1
290365
fi
291366

292367
echo "--- :warning: Exit code propagation test"
293-
set +e
294-
./dist/cleanroom exec --host "$listen_endpoint" -c "$PWD" -- sh -lc 'exit 7' >"$tmpdir/exit7.out" 2>"$tmpdir/exit7.err"
295-
status=$?
296-
set -e
368+
exit_attempt=1
369+
exit_max_attempts=3
370+
status=1
371+
while true; do
372+
set +e
373+
./dist/cleanroom exec --host "$listen_endpoint" -c "$PWD" -- sh -lc 'exit 7' >"$tmpdir/exit7.out" 2>"$tmpdir/exit7.err"
374+
status=$?
375+
set -e
376+
if [[ "$status" -eq 7 ]]; then
377+
break
378+
fi
379+
if grep -q 'timed out waiting for vsock guest agent' "$tmpdir/exit7.err" && [[ "$exit_attempt" -lt "$exit_max_attempts" ]]; then
380+
echo "exit propagation test hit transient vsock timeout (attempt $exit_attempt/$exit_max_attempts); retrying"
381+
sleep "$exit_attempt"
382+
exit_attempt=$((exit_attempt + 1))
383+
continue
384+
fi
385+
break
386+
done
297387
if [[ "$status" -ne 7 ]]; then
298388
echo "expected exit code 7 from guest command, got $status" >&2
299389
echo "stdout:" >&2
300390
cat "$tmpdir/exit7.out" >&2 || true
301391
echo "stderr:" >&2
302392
cat "$tmpdir/exit7.err" >&2 || true
303-
echo "server log (last 30 lines):" >&2
304-
tail -n 30 "$tmpdir/server.log" >&2 || true
393+
dump_runtime_diagnostics 80
305394
exit 1
306395
fi
307396

0 commit comments

Comments
 (0)