@@ -110,7 +110,7 @@ backends:
110110 kernel_image: $KERNEL_IMAGE
111111 vcpus: 2
112112 memory_mib: 1024
113- launch_seconds: 45
113+ launch_seconds: 90
114114EOF
115115
116116if [[ -n " $PRIVILEGED_MODE " ]]; then
159159socket_path=" $tmpdir /cleanroom.sock"
160160listen_endpoint=" unix://$socket_path "
161161
162+ dump_runtime_diagnostics () {
163+ local server_lines=" ${1:- 40} "
164+ if [[ -f " $tmpdir /server.log" ]]; then
165+ echo " --- server log tail ---" >&2
166+ tail -n " $server_lines " " $tmpdir /server.log" >&2 || true
167+ fi
168+
169+ # Surface recent Firecracker process logs when provisioning/agent readiness
170+ # flakes occur so failures are diagnosable from CI output alone.
171+ local fc_logs
172+ fc_logs=" $( find " $XDG_STATE_HOME " /cleanroom/sandboxes -maxdepth 3 -type f \( -name ' firecracker.stdout.log' -o -name ' firecracker.stderr.log' \) 2> /dev/null | sort | tail -n 6 || true) "
173+ if [[ -n " $fc_logs " ]]; then
174+ echo " --- firecracker log tails ---" >&2
175+ while IFS= read -r log_file; do
176+ [[ -n " $log_file " ]] || continue
177+ echo " [$log_file ]" >&2
178+ tail -n 30 " $log_file " >&2 || true
179+ done <<< " $fc_logs"
180+ fi
181+ }
182+
162183echo " --- :rocket: Start cleanroom control-plane"
163184./dist/cleanroom serve --listen " $listen_endpoint " --gateway-listen " :0" > " $tmpdir /server.log" 2>&1 &
164185srv_pid=$!
@@ -177,16 +198,42 @@ if [[ ! -S "$socket_path" ]]; then
177198fi
178199
# Smoke test: run `echo cleanroom-e2e` in the guest, up to
# smoke_max_attempts times. The lines marked `-` are the previous
# single-shot version; the lines marked `+` are the retrying replacement.
179200echo " --- :white_check_mark: Launched execution smoke test"
180- ./dist/cleanroom exec --host " $listen_endpoint " -c " $PWD " -- sh -lc ' echo cleanroom-e2e' | tee " $tmpdir /exec.out"
181- if ! grep -q ' ^cleanroom-e2e$' " $tmpdir /exec.out" ; then
182- echo " expected smoke-test output missing" >&2
201+ smoke_attempt=1
202+ smoke_max_attempts=3
203+ while true ; do
# errexit is switched off around the exec so a non-zero status can be
# captured in smoke_status instead of aborting the script.
204+ set +e
205+ ./dist/cleanroom exec --host " $listen_endpoint " -c " $PWD " -- sh -lc ' echo cleanroom-e2e' > " $tmpdir /exec.out" 2> " $tmpdir /exec.err"
206+ smoke_status=$?
207+ set -e
208+
# Success requires BOTH a zero exit status and the marker line on stdout;
# the captured stdout is then replayed to the build log.
209+ if [[ " $smoke_status " -eq 0 ]] && grep -q ' ^cleanroom-e2e$' " $tmpdir /exec.out" ; then
210+ cat " $tmpdir /exec.out"
211+ break
212+ fi
213+
# Retry only a non-zero exit whose stderr carries the vsock guest-agent
# timeout text, and only while attempts remain. The sleep grows linearly
# with the attempt number (1s, then 2s).
# NOTE(review): the git gateway loop further down also retries on
# deadline_exceeded / Connection refused / Operation timed out — confirm the
# narrower match here is intentional.
214+ if [[ " $smoke_status " -ne 0 ]] && grep -q ' timed out waiting for vsock guest agent' " $tmpdir /exec.err" && [[ " $smoke_attempt " -lt " $smoke_max_attempts " ]]; then
215+ echo " smoke test hit transient vsock timeout (attempt $smoke_attempt /$smoke_max_attempts ); retrying"
216+ sleep " $smoke_attempt "
217+ smoke_attempt=$(( smoke_attempt + 1 ))
218+ continue
219+ fi
220+
# Any other outcome is fatal: print captured stdout/stderr plus runtime
# diagnostics (80 server-log lines), then exit 1.
# NOTE(review): when the exit status is 0 but the marker line is missing,
# this reports "exit 0" without saying the output was wrong; the removed
# code printed "expected smoke-test output missing" for that case.
221+ echo " smoke test failed (exit $smoke_status )" >&2
222+ echo " --- smoke stdout ---" >&2
223+ cat " $tmpdir /exec.out" >&2 || true
224+ echo " --- smoke stderr ---" >&2
225+ cat " $tmpdir /exec.err" >&2 || true
226+ dump_runtime_diagnostics 80
183227 exit 1
184- fi
228+ done
185229
186230echo " --- :satellite: Git gateway allow/deny test"
187- set +e
188- # shellcheck disable=SC2016
189- ./dist/cleanroom exec --host " $listen_endpoint " -c " $PWD " -- sh -lc '
231+ git_gateway_attempt=1
232+ git_gateway_max_attempts=3
233+ while true ; do
234+ set +e
235+ # shellcheck disable=SC2016
236+ ./dist/cleanroom exec --host " $listen_endpoint " -c " $PWD " -- sh -lc '
190237 set -eu
191238
192239 key="$(env | awk -F= ' " '" ' /^GIT_CONFIG_KEY_[0-9]+=url\.http:\/\/.+\/git\/github\.com\/\.insteadOf$/ {print $2; exit}' " '" ' )"
@@ -228,10 +275,25 @@ set +e
228275 allow_url="${gw}/github.com/buildkite/cleanroom.git/info/refs?service=git-upload-pack"
229276 deny_url="${gw}/gitlab.com/gitlab-org/gitlab.git/info/refs?service=git-upload-pack"
230277
231- set +e
232- allow_resp="$(wget -q -S -O - "$allow_url" 2>&1)"
233- allow_rc=$?
234- set -e
278+ # Retry allowlisted probe because transient host/gateway network stalls can
279+ # fail a single request even when policy behavior is correct.
280+ allow_attempt=1
281+ allow_max_attempts=3
282+ allow_resp=""
283+ allow_rc=1
284+ while [ "$allow_attempt" -le "$allow_max_attempts" ]; do
285+ set +e
286+ allow_resp="$(wget -q -S -O - "$allow_url" 2>&1)"
287+ allow_rc=$?
288+ set -e
289+ if [ "$allow_rc" -eq 0 ]; then
290+ break
291+ fi
292+ if [ "$allow_attempt" -lt "$allow_max_attempts" ]; then
293+ sleep "$allow_attempt"
294+ fi
295+ allow_attempt=$((allow_attempt + 1))
296+ done
235297 if [ "$allow_rc" -ne 0 ]; then
236298 echo "allowlisted host probe failed (exit $allow_rc)" >&2
237299 echo "$allow_resp" >&2
@@ -247,8 +309,9 @@ set +e
247309 echo "$allow_resp" >&2
248310 exit 5
249311 fi
250- if ! echo "$allow_resp" | grep -q "git-upload-pack"; then
251- echo "allowlisted host probe did not return git upload-pack response" >&2
312+ # Accept either a smart-protocol response marker or an explicit HTTP 200.
313+ if ! echo "$allow_resp" | grep -q "git-upload-pack" && ! echo "$allow_resp" | grep -Eq "HTTP/[0-9.]+[[:space:]]+200"; then
314+ echo "allowlisted host probe returned unexpected response shape" >&2
252315 echo "$allow_resp" >&2
253316 exit 5
254317 fi
@@ -274,34 +337,60 @@ set +e
274337
275338 echo "guest image missing both git and wget; cannot exercise git gateway" >&2
276339 exit 8
277- ' > " $tmpdir /git-gateway.out" 2> " $tmpdir /git-gateway.err"
278- git_gateway_status=$?
279- set -e
340+ ' > " $tmpdir /git-gateway.out" 2> " $tmpdir /git-gateway.err"
341+ git_gateway_status=$?
342+ set -e
343+
344+ if [[ " $git_gateway_status " -eq 0 ]]; then
345+ break
346+ fi
347+
348+ if grep -Eq ' timed out waiting for vsock guest agent|deadline_exceeded|Connection refused|Operation timed out' " $tmpdir /git-gateway.err" && [[ " $git_gateway_attempt " -lt " $git_gateway_max_attempts " ]]; then
349+ echo " git gateway test hit transient transport error (attempt $git_gateway_attempt /$git_gateway_max_attempts ); retrying"
350+ sleep " $git_gateway_attempt "
351+ git_gateway_attempt=$(( git_gateway_attempt + 1 ))
352+ continue
353+ fi
354+ break
355+ done
280356
# Final verdict for the git gateway test: a non-zero git_gateway_status
# fails the build after printing the guest's captured stdout and stderr.
# The removed (`-`) lines tailed 40 lines of server.log directly; the added
# (`+`) line calls dump_runtime_diagnostics with 80 server-log lines, which
# also prints Firecracker log tails.
281357if [[ " $git_gateway_status " -ne 0 ]]; then
282358 echo " git gateway allow/deny test failed (exit $git_gateway_status )" >&2
283359 echo " --- guest stdout ---" >&2
# `|| true` keeps a missing capture file from aborting the report.
284360 cat " $tmpdir /git-gateway.out" >&2 || true
285361 echo " --- guest stderr ---" >&2
286362 cat " $tmpdir /git-gateway.err" >&2 || true
287- echo " --- gateway log tail ---" >&2
288- tail -n 40 " $tmpdir /server.log" >&2 || true
363+ dump_runtime_diagnostics 80
289364 exit 1
290365fi
291366
# Exit-code propagation test: the guest runs `exit 7` and the host-side
# `cleanroom exec` must return 7. The lines marked `-` are the previous
# single-shot version; the lines marked `+` add a bounded retry.
292367echo " --- :warning: Exit code propagation test"
293- set +e
294- ./dist/cleanroom exec --host " $listen_endpoint " -c " $PWD " -- sh -lc ' exit 7' > " $tmpdir /exit7.out" 2> " $tmpdir /exit7.err"
295- status=$?
296- set -e
368+ exit_attempt=1
369+ exit_max_attempts=3
# Pre-seeded to a non-7 value; overwritten by the first exec below.
370+ status=1
371+ while true ; do
# errexit is switched off around the exec because a non-zero status (7) is
# the expected outcome here.
372+ set +e
373+ ./dist/cleanroom exec --host " $listen_endpoint " -c " $PWD " -- sh -lc ' exit 7' > " $tmpdir /exit7.out" 2> " $tmpdir /exit7.err"
374+ status=$?
375+ set -e
376+ if [[ " $status " -eq 7 ]]; then
377+ break
378+ fi
# Retry only when stderr carries the vsock guest-agent timeout text and
# attempts remain; the sleep grows linearly with the attempt number.
# NOTE(review): the git gateway loop retries on a wider set of transport
# errors — confirm the narrower match here is intentional.
379+ if grep -q ' timed out waiting for vsock guest agent' " $tmpdir /exit7.err" && [[ " $exit_attempt " -lt " $exit_max_attempts " ]]; then
380+ echo " exit propagation test hit transient vsock timeout (attempt $exit_attempt /$exit_max_attempts ); retrying"
381+ sleep " $exit_attempt "
382+ exit_attempt=$(( exit_attempt + 1 ))
383+ continue
384+ fi
# Non-retryable mismatch or attempts exhausted: leave the loop and let the
# check below report the failure.
385+ break
386+ done
# Any status other than 7 fails the build, printing captured stdout/stderr
# and runtime diagnostics (80 server-log lines) in place of the removed
# 30-line server.log tail.
297387if [[ " $status " -ne 7 ]]; then
298388 echo " expected exit code 7 from guest command, got $status " >&2
299389 echo " stdout:" >&2
300390 cat " $tmpdir /exit7.out" >&2 || true
301391 echo " stderr:" >&2
302392 cat " $tmpdir /exit7.err" >&2 || true
303- echo " server log (last 30 lines):" >&2
304- tail -n 30 " $tmpdir /server.log" >&2 || true
393+ dump_runtime_diagnostics 80
298305394 exit 1
306395fi
307396
0 commit comments