Skip to content

Commit dbb6ea6

Browse files
authored
wait-for-hydra: add SSE support for real-time Hydra status (#41)
When `hydra-status-url` is provided, the action connects to the hydra-github-bridge SSE endpoint for real-time build status updates instead of polling the GitHub API. This reduces latency from minutes (the poll interval) to seconds (event-driven).

SSE mode features:
- One-shot check of the cached status before opening the stream
- Event filtering by check-run name (events for unrelated jobs are ignored)
- Process substitution, so that `exit` inside the read loop terminates the script rather than only a subshell
- `read -t 60`, so that the timeout is still enforced while the stream is idle
- Periodic re-checks every 120s as a safety net against CDN buffering
- Graceful fallback to poll mode on connection failure

Without `hydra-status-url`, the action behaves exactly as before (pure GitHub API polling with exponential backoff).

Copyright (c) Moritz Angermann <moritz.angermann@iohk.io>, Input Output Group.
SPDX-License-Identifier: Apache-2.0
1 parent 6bf0b5a commit dbb6ea6

File tree

2 files changed

+214
-36
lines changed

2 files changed

+214
-36
lines changed

wait-for-hydra/action.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ inputs:
2525
description: 'Maximum delay in seconds between polls (caps exponential backoff).'
2626
default: 300
2727

28+
hydra-status-url:
29+
description: 'Optional URL of the hydra-github-bridge SSE endpoint (e.g. https://hydra-bridge.ci.iog.io). When set, uses SSE for real-time status instead of polling GitHub.'
30+
default: ''
31+
2832
runs:
2933
using: "composite"
3034
steps:
@@ -37,5 +41,6 @@ runs:
3741
JITTER: ${{ inputs.jitter }}
3842
TIMEOUT: ${{ inputs.timeout }}
3943
MAX_DELAY: ${{ inputs.max-delay }}
44+
HYDRA_STATUS_URL: ${{ inputs.hydra-status-url }}
4045
RELEVANT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
4146
run: ${{ github.action_path }}/support/wait.sh

wait-for-hydra/support/wait.sh

Lines changed: 209 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
#!/usr/bin/env bash
22
# wait-for-hydra: Wait for a Hydra CI build to reach a terminal state.
33
#
4-
# Polls the GitHub API with exponential backoff until the specified
5-
# check-run or status reaches a terminal state.
4+
# Supports two modes:
5+
# 1. SSE mode (preferred): connects to hydra-github-bridge SSE endpoint
6+
# for real-time status updates. Requires HYDRA_STATUS_URL.
7+
# 2. Poll mode (fallback): polls GitHub API with exponential backoff.
68
#
79
# Exit codes:
810
# 0 - build succeeded
@@ -20,6 +22,7 @@ set -euo pipefail
2022
: "${JITTER:=30}"
2123
: "${TIMEOUT:=3600}"
2224
: "${MAX_DELAY:=300}"
25+
: "${HYDRA_STATUS_URL:=}"
2326

2427
# --- Validation --------------------------------------------------------------
2528

@@ -37,6 +40,17 @@ HYDRA_JOB="${CHECK:-$STATUS}"
3740

3841
# --- Helpers -----------------------------------------------------------------
3942

43+
# Normalise a raw conclusion/state string into a verdict.
# Arguments: $1 - raw status string
# Outputs:   "success" or "failure" for those exact inputs; an empty line
#            for anything else (pending, unknown, ...).
classify_status() {
  local status="$1"
  local verdict=""

  # Only the two recognised terminal values pass through unchanged.
  if [ "$status" = "success" ] || [ "$status" = "failure" ]; then
    verdict="$status"
  fi

  echo "$verdict"
}
53+
4054
# Check if we've exceeded the timeout.
4155
check_timeout() {
4256
if [ "$TIMEOUT" -gt 0 ] && [ "$SECONDS" -ge "$TIMEOUT" ]; then
@@ -45,61 +59,220 @@ check_timeout() {
4559
fi
4660
}
4761

62+
# --- SSE Mode ----------------------------------------------------------------
63+
64+
# Fetch the bridge's one-shot status document and print the conclusion
# recorded for HYDRA_JOB.
#
# The document is expected to be a JSON map keyed by check-run name:
#   { "required": { "conclusion": "success", ... }, "other-job": { ... } }
#
# Globals:   HYDRA_JOB (read)
# Arguments: $1 - URL of the one-shot status endpoint
# Outputs:   the conclusion string on stdout; nothing if the job is absent,
#            its conclusion is null, or the payload cannot be parsed.
# Returns:   1 when the HTTP request fails.
sse_get_current_status() {
  local endpoint="$1"
  local payload

  # -f turns HTTP errors into a non-zero exit; 10s cap keeps callers responsive.
  if ! payload=$(curl -sf --max-time 10 "$endpoint" 2>/dev/null); then
    return 1
  fi

  # Parse errors are deliberately swallowed: an unreadable payload is
  # treated the same as "no status yet".
  printf '%s\n' "$payload" \
    | jq -r --arg job "$HYDRA_JOB" '.[$job].conclusion // empty' 2>/dev/null \
    || true
}
74+
75+
# Report and terminate the script if a status string is terminal.
# Globals:   HYDRA_JOB (read)
# Arguments: $1 - raw conclusion/state string
#            $2 - provenance text placed in parentheses in the log line
# Exits 0 when classify_status yields "success", 1 when it yields "failure";
# otherwise returns 0 so the caller keeps waiting.
_sse_exit_if_terminal() {
  local raw="$1" origin="$2"
  case "$(classify_status "$raw")" in
    success)
      echo "$HYDRA_JOB succeeded ($origin)"
      exit 0
      ;;
    failure)
      echo "$HYDRA_JOB failed ($origin)"
      exit 1
      ;;
  esac
  return 0
}

# Wait for HYDRA_JOB to reach a terminal state via the bridge's SSE stream.
#
# Globals:   GITHUB_REPOSITORY, RELEVANT_SHA, HYDRA_STATUS_URL, HYDRA_JOB,
#            TIMEOUT (all read)
# Exits:     0 when the job succeeded, 1 when it failed (the whole script
#            terminates; check_timeout may also terminate it).
# Returns:   1 when the stream ended without a terminal event for HYDRA_JOB,
#            signalling the caller to fall back to polling.
sse_wait() {
  local owner repo sha
  # GITHUB_REPOSITORY has the form owner/repo.
  owner="${GITHUB_REPOSITORY%%/*}"
  repo="${GITHUB_REPOSITORY##*/}"
  sha="$RELEVANT_SHA"

  local base_url="${HYDRA_STATUS_URL%/}/status/${owner}/${repo}/${sha}"

  echo "SSE: Checking current status at ${base_url}"

  # One-shot check: the build may already be done.
  local current_state
  current_state=$(sse_get_current_status "$base_url") || true
  if [ -n "$current_state" ]; then
    _sse_exit_if_terminal "$current_state" "from cached status"
    echo "SSE: Current status is '$current_state', connecting to event stream..."
  fi

  # Cap the stream duration so that time remains for the final one-shot
  # re-check and the polling fallback: reserve 120s, but never stream for
  # less than 60s.
  local sse_max_time
  if [ "$TIMEOUT" -gt 0 ]; then
    sse_max_time=$((TIMEOUT - SECONDS - 120))
    if [ "$sse_max_time" -lt 60 ]; then
      sse_max_time=60
    fi
  else
    # No timeout configured: cap at 24h (the original author notes this
    # matches the bridge's cache TTL).
    sse_max_time=86400
  fi

  # Timestamp of the last one-shot re-check. Proxies may buffer SSE events,
  # so the cached endpoint is re-queried every 120s as a safety net.
  local last_recheck="$SECONDS"

  local line="" partial="" read_rc data name conclusion poll_state

  echo "SSE: Connecting to ${base_url}/events (max ${sse_max_time}s, filtering for '$HYDRA_JOB')"

  # Process substitution (rather than `curl | while`) keeps the loop in the
  # current shell, so `exit` terminates the script and not just a subshell.
  while true; do
    check_timeout

    # Periodic one-shot re-check: catches status changes the stream failed
    # to deliver.
    if [ $((SECONDS - last_recheck)) -ge 120 ]; then
      last_recheck="$SECONDS"
      poll_state=$(sse_get_current_status "$base_url") || true
      if [ -n "$poll_state" ]; then
        _sse_exit_if_terminal "$poll_state" "from periodic re-check at ${SECONDS}s"
      fi
    fi

    # read -t 60 bounds each wait so check_timeout runs even on an idle
    # stream. bash returns >128 on timeout and 1 on EOF.
    read_rc=0
    IFS= read -r -t 60 line || read_rc=$?
    if [ "$read_rc" -gt 128 ]; then
      # On timeout bash stores whatever partial line it already consumed in
      # $line; keep it so the rest of the line can be appended later instead
      # of being parsed as a separate, truncated line.
      partial+="$line"
      continue
    elif [ "$read_rc" -ne 0 ]; then
      # EOF: curl exited (connection drop or --max-time reached).
      break
    fi
    line="${partial}${line}"
    partial=""

    case "$line" in
      data:*)
        # The SSE format allows "data:payload" as well as "data: payload":
        # strip the field name, then at most one leading space.
        data="${line#data:}"
        data="${data# }"
        # Each event carries one check-run: {"name":"...","conclusion":"..."}
        name=$(echo "$data" | jq -r '.name // empty' 2>/dev/null) || continue
        # Skip events for other check-runs.
        [ "$name" = "$HYDRA_JOB" ] || continue
        conclusion=$(echo "$data" | jq -r '.conclusion // empty' 2>/dev/null) || continue
        echo "SSE event: $name conclusion=$conclusion (${SECONDS}s elapsed)"
        _sse_exit_if_terminal "$conclusion" "via SSE"
        ;;
    esac
  done < <(curl -Nsf --max-time "$sse_max_time" \
    "${base_url}/events" 2>/dev/null)

  # The stream ended without a terminal event for HYDRA_JOB. Do one final
  # one-shot check before giving up on SSE, in case the event was lost.
  echo "SSE: Stream ended after ${SECONDS}s, re-checking one-shot endpoint..."
  local final_state
  final_state=$(sse_get_current_status "$base_url") || true
  if [ -n "$final_state" ]; then
    _sse_exit_if_terminal "$final_state" "from one-shot re-check"
  fi

  echo "SSE: Falling back to polling..."
  return 1
}
211+
48212
# --- Poll Mode ---------------------------------------------------------------
49213

50214
# Query the GitHub API once for the current conclusion/state of the job.
#
# Globals:   CHECK, STATUS, GITHUB_REPOSITORY, RELEVANT_SHA (all read)
# Outputs:   stdout - the last line produced by the filtered query (may be
#                     empty when nothing matched or the value is not set yet)
#            stderr - a debug line showing the query that is issued
# Returns:   the pipeline's exit status (non-zero if `gh` fails under
#            pipefail).
poll_github() {
  local endpoint filter name
  local bs='\' dq='"'

  if [ -n "$CHECK" ]; then
    name="$CHECK"
  else
    name="$STATUS"
  fi

  # The name is embedded in a jq string literal, so escape backslashes and
  # double quotes; otherwise a name containing either would break (or
  # change the meaning of) the filter.
  name=${name//"$bs"/"$bs$bs"}
  name=${name//"$dq"/"$bs$dq"}

  if [ -n "$CHECK" ]; then
    endpoint="repos/$GITHUB_REPOSITORY/commits/$RELEVANT_SHA/check-runs"
    filter=".check_runs[] | select(.name == \"$name\") | .conclusion"
  else
    endpoint="repos/$GITHUB_REPOSITORY/commits/$RELEVANT_SHA/status"
    filter=".statuses[] | select(.context == \"$name\") | .state"
  fi

  # Debug output goes to stderr so it does not pollute the captured result.
  echo "Querying: gh api $endpoint --paginate --jq '$filter'" >&2

  # Paginated results may yield several matching lines; keep only the last
  # line of output.
  gh api "$endpoint" \
    --paginate \
    --jq "$filter" \
    | tail -1
}
69231

232+
# Poll the GitHub API with exponential backoff until the job is terminal.
#
# Globals:   DELAY, JITTER, MAX_DELAY, TIMEOUT, HYDRA_JOB (all read)
# Exits:     0 when poll_github reports "success", 1 when it reports
#            "failure" (check_timeout may also terminate the script).
# Never returns normally.
poll_wait() {
  local iteration=0
  local current_delay="$DELAY"
  local conclusion wait_time remaining

  while true; do
    check_timeout
    iteration=$((iteration + 1))

    conclusion=$(poll_github)

    case "$conclusion" in
      success)
        echo "$HYDRA_JOB succeeded (iteration $iteration, ${SECONDS}s elapsed)"
        exit 0
        ;;
      failure)
        echo "$HYDRA_JOB failed (iteration $iteration, ${SECONDS}s elapsed)"
        exit 1
        ;;
      *)
        # Base delay plus a random jitter in [0, JITTER].
        wait_time=$((current_delay + RANDOM % (JITTER + 1)))

        # Never sleep past the deadline: without this clamp the timeout
        # could fire up to MAX_DELAY + JITTER seconds late, because it is
        # only checked at the top of the loop.
        if [ "$TIMEOUT" -gt 0 ]; then
          remaining=$((TIMEOUT - SECONDS))
          if [ "$remaining" -lt 0 ]; then
            remaining=0
          fi
          if [ "$wait_time" -gt "$remaining" ]; then
            wait_time="$remaining"
          fi
        fi

        echo "$HYDRA_JOB pending (conclusion='$conclusion'). Iteration $iteration, ${SECONDS}s elapsed. Waiting ${wait_time}s..."
        sleep "$wait_time"

        # Exponential backoff: double the delay, cap at MAX_DELAY.
        current_delay=$((current_delay * 2))
        if [ "$current_delay" -gt "$MAX_DELAY" ]; then
          current_delay="$MAX_DELAY"
        fi
        ;;
    esac
  done
}
266+
70267
# --- Main --------------------------------------------------------------------
71268

72269
# Restart bash's elapsed-time counter so the timeout is measured from here.
SECONDS=0

echo "Waiting for $HYDRA_JOB on $RELEVANT_SHA (timeout=${TIMEOUT}s, max-delay=${MAX_DELAY}s)"

# Poll when no bridge URL is configured, or when the SSE attempt returns
# non-zero (stream ended without a terminal event). On a terminal event
# sse_wait exits the script itself, so polling is never reached.
if [ -z "$HYDRA_STATUS_URL" ] || ! sse_wait; then
  poll_wait
fi

0 commit comments

Comments
 (0)