Skip to content

Commit a536998

Browse files
owineclaude
andcommitted
feat: add failure diagnostics and fix health check asymmetry
Capture container states, health check output, and container logs at the point of deployment failure before rollback destroys the evidence. Run health check after any deployment attempt, not just successful ones, so new stack failures get the same diagnostic treatment as existing stacks. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 59113cb commit a536998

File tree

3 files changed

+91
-1
lines changed

3 files changed

+91
-1
lines changed

.github/workflows/deploy.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -658,7 +658,9 @@ jobs:
658658
659659
- name: Health Check All Services
660660
id: health
661-
if: steps.backup.outputs.deployment_needed == 'true' && (steps.deploy-existing.outcome == 'success' || steps.deploy-existing.outcome == 'skipped') && (steps.deploy-new.outcome == 'success' || steps.deploy-new.outcome == 'skipped')
661+
# Run health check after any deployment attempt (success or failure) to capture system state
662+
# When a deployment fails, this provides diagnostics before rollback cleans up containers
663+
if: steps.backup.outputs.deployment_needed == 'true' && (steps.deploy-existing.conclusion != 'skipped' || steps.deploy-new.conclusion != 'skipped')
662664
continue-on-error: true
663665
run: |
664666
# Use auto-detected critical stacks if enabled, otherwise use manual input

scripts/deployment/deploy-stacks.sh

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,50 @@ fi
206206
DEPLOY_TIMEOUT=$((IMAGE_PULL_TIMEOUT + SERVICE_STARTUP_TIMEOUT))
207207
if ! timeout $DEPLOY_TIMEOUT op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml up -d --build --pull always --quiet-pull --quiet-build --wait --remove-orphans $COMPOSE_ARGS; then
208208
echo "❌ Failed to deploy $STACK during $OPERATION (timeout or error)"
209+
echo ""
210+
echo "📋 Failure Diagnostics for $STACK"
211+
echo "────────────────────────────────────────────────────────────────"
212+
213+
# Capture container states (requires op run to parse compose.yaml)
214+
echo "📊 Container States:"
215+
op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml ps -a --format 'table {{.Name}}\t{{.Service}}\t{{.State}}\t{{.Health}}' 2>/dev/null || echo " ⚠️ Could not retrieve container states"
216+
echo ""
217+
218+
# Capture logs and health output from unhealthy/failed containers (container list comes from compose via op run; the per-container inspect/logs calls below use plain docker)
219+
ps_output=$(op run --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml ps -a --format '{{.Name}}\t{{.State}}\t{{.Health}}' 2>/dev/null || echo "")
220+
while IFS=$'\t' read -r name state health; do
221+
[ -z "$name" ] && continue
222+
capture=false
223+
case "$state" in
224+
running)
225+
[ "$health" = "unhealthy" ] || [ "$health" = "starting" ] && capture=true
226+
;;
227+
exited|restarting)
228+
capture=true
229+
;;
230+
esac
231+
232+
if [ "$capture" = true ]; then
233+
echo "🔸 Container: $name ($state / $health)"
234+
echo "────────────────────────────────"
235+
236+
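# Health check output (first 3 entries of .State.Health.Log; Docker appends newer results at the end, so these are the oldest retained checks, not the latest)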
237+
has_healthcheck=$(docker inspect --format='{{if .State.Health}}yes{{else}}no{{end}}' "$name" 2>/dev/null) || has_healthcheck="no"
238+
if [ "$has_healthcheck" = "yes" ]; then
239+
echo "📍 Health Check Output:"
240+
docker inspect --format='Status: {{.State.Health.Status}} | FailingStreak: {{.State.Health.FailingStreak}}{{if .State.Health.Log}}{{range $i, $log := .State.Health.Log}}{{if lt $i 3}}
241+
[Exit={{.ExitCode}}] {{.Output}}{{end}}{{end}}{{end}}' "$name" 2>/dev/null || echo " Could not retrieve health output"
242+
echo ""
243+
fi
244+
245+
echo "📋 Container Logs (last 50 lines):"
246+
docker logs --tail 50 --timestamps "$name" 2>&1 || echo " ⚠️ Could not retrieve logs"
247+
echo "────────────────────────────────"
248+
echo ""
249+
fi
250+
done <<< "$ps_output"
251+
252+
echo "════════════════════════════════════════════════════════════════"
209253
exit 1
210254
fi
211255

scripts/deployment/rollback-stacks.sh

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,50 @@ ROLLBACK_RESULT=$({
230230
DEPLOY_TIMEOUT=$((IMAGE_PULL_TIMEOUT + SERVICE_STARTUP_TIMEOUT))
231231
if ! timeout $DEPLOY_TIMEOUT op run --no-masking --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml up -d --build --pull always --quiet-pull --quiet-build --wait --remove-orphans $COMPOSE_ARGS; then
232232
echo "❌ Failed to roll back $STACK during $OPERATION (timeout or error)"
233+
echo ""
234+
echo "📋 Failure Diagnostics for $STACK"
235+
echo "────────────────────────────────────────────────────────────────"
236+
237+
# Capture container states (requires op run to parse compose.yaml)
238+
echo "📊 Container States:"
239+
op run --no-masking --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml ps -a --format 'table {{.Name}}\t{{.Service}}\t{{.State}}\t{{.Health}}' 2>/dev/null || echo " ⚠️ Could not retrieve container states"
240+
echo ""
241+
242+
# Capture logs and health output from unhealthy/failed containers (container list comes from compose via op run; the per-container inspect/logs calls below use plain docker)
243+
ps_output=$(op run --no-masking --env-file=/opt/compose/compose.env -- docker compose -f compose.yaml ps -a --format '{{.Name}}\t{{.State}}\t{{.Health}}' 2>/dev/null || echo "")
244+
while IFS=$'\t' read -r name state health; do
245+
[ -z "$name" ] && continue
246+
capture=false
247+
case "$state" in
248+
running)
249+
[ "$health" = "unhealthy" ] || [ "$health" = "starting" ] && capture=true
250+
;;
251+
exited|restarting)
252+
capture=true
253+
;;
254+
esac
255+
256+
if [ "$capture" = true ]; then
257+
echo "🔸 Container: $name ($state / $health)"
258+
echo "────────────────────────────────"
259+
260+
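# Health check output (first 3 entries of .State.Health.Log; Docker appends newer results at the end, so these are the oldest retained checks, not the latest)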
261+
has_healthcheck=$(docker inspect --format='{{if .State.Health}}yes{{else}}no{{end}}' "$name" 2>/dev/null) || has_healthcheck="no"
262+
if [ "$has_healthcheck" = "yes" ]; then
263+
echo "📍 Health Check Output:"
264+
docker inspect --format='Status: {{.State.Health.Status}} | FailingStreak: {{.State.Health.FailingStreak}}{{if .State.Health.Log}}{{range $i, $log := .State.Health.Log}}{{if lt $i 3}}
265+
[Exit={{.ExitCode}}] {{.Output}}{{end}}{{end}}{{end}}' "$name" 2>/dev/null || echo " Could not retrieve health output"
266+
echo ""
267+
fi
268+
269+
echo "📋 Container Logs (last 50 lines):"
270+
docker logs --tail 50 --timestamps "$name" 2>&1 || echo " ⚠️ Could not retrieve logs"
271+
echo "────────────────────────────────"
272+
echo ""
273+
fi
274+
done <<< "$ps_output"
275+
276+
echo "════════════════════════════════════════════════════════════════"
233277
exit 1
234278
fi
235279

0 commit comments

Comments
 (0)