forked from protoLabsAI/protoMaker
-
Notifications
You must be signed in to change notification settings - Fork 0
297 lines (259 loc) · 11.7 KB
/
deploy-staging.yml
File metadata and controls
297 lines (259 loc) · 11.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
---
# SECURITY: This workflow is hardened against fork PR attacks. See docs/security/ci-hardening.md
name: Deploy Staging

on:
  push:
    branches:
      - staging
    # Skip deploys for documentation-only changes.
    paths-ignore:
      # '**.md' matches markdown anywhere in the tree; a bare '*.md' would only
      # match files at the repository root, so nested README edits would still
      # trigger a deploy.
      - '**.md'
      - 'docs/**'
      - 'site/**'
      - 'designs/**'
      - '.automaker/**'
  # Allow manual deploys from the Actions UI.
  workflow_dispatch:

# The deploy talks only to Docker and localhost; the job token never needs write scopes.
permissions: read-all

jobs:
  deploy:
    runs-on: [self-hosted, staging]
    # Guard: never run on forks — this job uses a self-hosted runner and secrets.
    if: github.repository == 'protoLabsAI/protoMaker'
    timeout-minutes: 60
    # Serialize deploys and never cancel an in-flight one — a half-finished
    # deploy is worse than waiting for the previous run to complete.
    concurrency:
      group: staging-deploy
      cancel-in-progress: false
    # Default working directory for all steps — the runner's _work/ dir
    # gets wiped by cron every 5min, so use /tmp as a safe fallback.
    defaults:
      run:
        working-directory: /tmp
    # Use a persistent deploy directory that survives the runner's workspace
    # cleanup cron. The default _work/ directory gets wiped every 5min.
    # Paths use $HOME so they work for any runner user without hardcoded PII.
    steps:
      - name: Setup persistent deploy directory
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          # Resolve the persistent checkout and env-file locations, and publish
          # them to later steps via GITHUB_ENV.
          export DEPLOY_DIR="$HOME/staging-deploy/automaker"
          export ENV_SOURCE="$HOME/staging-deploy/.env.staging"
          echo "DEPLOY_DIR=$DEPLOY_DIR" >> "$GITHUB_ENV"
          echo "ENV_SOURCE=$ENV_SOURCE" >> "$GITHUB_ENV"
          # NOTE(review): this token is persisted on disk inside the git remote
          # URL. It is the ephemeral job token (expires with the run) and is
          # re-written on every deploy below, but confirm that is acceptable.
          REPO_URL="https://x-access-token:${GH_TOKEN}@github.com/${{ github.repository }}.git"
          # Create deploy dir if first run
          if [ ! -d "$DEPLOY_DIR/.git" ]; then
            mkdir -p "$(dirname "$DEPLOY_DIR")"
            git clone "$REPO_URL" "$DEPLOY_DIR"
          fi
          # Pull latest code (update remote URL in case token changed)
          cd "$DEPLOY_DIR"
          git remote set-url origin "$REPO_URL"
          git fetch origin staging
          # Hard reset (not merge): the deploy checkout must exactly mirror
          # origin/staging even if local files were touched between runs.
          git reset --hard origin/staging
          # Copy .env from persistent staging config
          # Uses dedicated .env.staging (not dev .env) to prevent accidental breakage
          if [ -f "$ENV_SOURCE" ]; then
            cp "$ENV_SOURCE" "$DEPLOY_DIR/.env"
            echo "Copied .env from $ENV_SOURCE"
          elif [ -f "$HOME/staging-deploy/.env" ]; then
            # First run on a fresh runner: seed the staging env from the dev
            # .env once, then use the seeded copy from here on.
            echo "WARN: No staging env at $ENV_SOURCE — copying from dev .env as initial seed"
            cp "$HOME/staging-deploy/.env" "$ENV_SOURCE"
            cp "$ENV_SOURCE" "$DEPLOY_DIR/.env"
          else
            echo "ERROR: No .env found at $ENV_SOURCE or $HOME/staging-deploy/.env"
            exit 1
          fi
- name: Disk space pre-check
run: |
# Prune dangling images to free space before checking
docker image prune -f 2>/dev/null || true
# Require at least 10GB free on the deploy partition
AVAIL_KB=$(df -k "$DEPLOY_DIR" | awk 'NR==2 {print $4}')
AVAIL_GB=$((AVAIL_KB / 1024 / 1024))
echo "Available disk space: ${AVAIL_GB}GB (${AVAIL_KB}KB)"
MIN_GB=10
if [ "$AVAIL_GB" -lt "$MIN_GB" ]; then
echo "WARN: Only ${AVAIL_GB}GB free — running aggressive Docker cleanup"
# Remove all unused images (not just dangling), build cache, and unused volumes
docker system prune -af --volumes 2>/dev/null || true
docker builder prune -af 2>/dev/null || true
# Re-check after cleanup
AVAIL_KB=$(df -k "$DEPLOY_DIR" | awk 'NR==2 {print $4}')
AVAIL_GB=$((AVAIL_KB / 1024 / 1024))
echo "After cleanup: ${AVAIL_GB}GB free"
if [ "$AVAIL_GB" -lt "$MIN_GB" ]; then
echo "ERROR: Still only ${AVAIL_GB}GB free after cleanup — need at least ${MIN_GB}GB"
df -h "$DEPLOY_DIR"
exit 1
fi
fi
echo "Disk space OK: ${AVAIL_GB}GB free (minimum ${MIN_GB}GB)"
      - name: Drain running agents
        run: |
          # Gracefully stop auto-mode and wait for agents to finish.
          # Uses the API key from .env (same machine as staging server).
          # NOTE(review): \K needs GNU grep -P (fine on a Linux self-hosted
          # runner) and assumes the .env value is stored unquoted — confirm.
          AUTOMAKER_API_KEY=$(grep -oP 'AUTOMAKER_API_KEY=\K.*' "$ENV_SOURCE" || echo "")
          # Keep the key out of step logs.
          echo "::add-mask::$AUTOMAKER_API_KEY"
          # Best-effort: a stopped server or idle agent pool is not a deploy
          # failure, so swallow curl errors with a note.
          curl -sf -X POST http://localhost:8579/api/deploy/drain \
            -H "Content-Type: application/json" \
            -H "X-API-Key: $AUTOMAKER_API_KEY" \
            --max-time 180 || echo "Drain skipped (server not running or no agents)"
      - name: Tag rollback images
        working-directory: ${{ env.DEPLOY_DIR }}
        run: |
          # Save current working images so we can restore on failure.
          # Only tag app services — docs runs independently and is never rolled back.
          for svc in server ui; do
            # `images -q` lists image IDs for the service's containers; empty
            # when the service has never been brought up on this runner.
            img=$(docker compose -f docker-compose.staging.yml images -q "$svc" 2>/dev/null | head -1 || true)
            if [ -n "$img" ]; then
              docker tag "$img" "automaker-staging-${svc}:rollback"
              echo "Tagged ${svc} image ${img} as rollback"
            else
              # First deploy (or pruned images): rollback simply won't be available.
              echo "No existing ${svc} image to tag (first deploy?)"
            fi
          done
      - name: Rebuild and restart staging
        id: rebuild
        working-directory: ${{ env.DEPLOY_DIR }}
        env:
          # Stamp builds with the deployed commit for traceability.
          GIT_COMMIT_SHA: ${{ github.sha }}
          # Enable BuildKit for both `docker build` and compose-driven builds.
          DOCKER_BUILDKIT: 1
          COMPOSE_DOCKER_CLI_BUILD: 1
        run: |
          # setup-staging.sh --build isolates storybook from critical services.
          # If storybook fails to build, the script continues (non-fatal).
          ./scripts/setup-staging.sh --build
          ./scripts/setup-staging.sh --start
- name: Verify deployment
id: verify
working-directory: ${{ env.DEPLOY_DIR }}
run: |
# Check server readiness (verifies API key, data dir, service init)
retries=0
while [ $retries -lt 15 ]; do
if curl -sf http://localhost:8579/api/health/ready > /dev/null; then
echo "Server is ready"
curl -sf http://localhost:8579/api/health/ready
break
fi
retries=$((retries + 1))
sleep 2
done
if [ $retries -eq 15 ]; then
echo "Server readiness check failed after 30s"
docker compose -f docker-compose.staging.yml logs --tail=50 server
# Check if server container has exhausted restart attempts
SERVER_STATUS=$(docker inspect automaker-server --format='{{.State.Status}}' 2>/dev/null || echo "unknown")
SERVER_RESTART_COUNT=$(docker inspect automaker-server --format='{{.RestartCount}}' 2>/dev/null || echo "0")
if [ "$SERVER_STATUS" = "exited" ] && [ "$SERVER_RESTART_COUNT" -ge 5 ]; then
echo "ALERT: Server container exhausted restart attempts (count: $SERVER_RESTART_COUNT)"
# Signal restart exhaustion for Discord notification
echo "RESTART_EXHAUSTED=true" >> "$GITHUB_ENV"
fi
exit 1
fi
      - name: Smoke tests
        id: smoke
        working-directory: ${{ env.DEPLOY_DIR }}
        run: |
          # Source API key from .env (same machine as staging server)
          # NOTE(review): \K needs GNU grep -P and assumes an unquoted value —
          # same extraction as the drain step; keep the two in sync.
          export AUTOMAKER_API_KEY=$(grep -oP 'AUTOMAKER_API_KEY=\K.*' .env || echo "")
          echo "::add-mask::$AUTOMAKER_API_KEY"
          ./scripts/smoke-test.sh http://localhost:8579
        env:
          # Not read by the shell above — presumably consumed by smoke-test.sh
          # to alert on failures; verify against the script.
          DISCORD_ALERTS_WEBHOOK: ${{ secrets.DISCORD_ALERTS_WEBHOOK }}
      - name: Rollback on failure
        id: rollback
        # Only roll back when a deploy-critical step failed; failures in setup
        # or pre-checks leave the running deployment untouched.
        if: failure() && (steps.rebuild.outcome == 'failure' || steps.verify.outcome == 'failure' || steps.smoke.outcome == 'failure')
        working-directory: ${{ env.DEPLOY_DIR }}
        run: |
          echo "Deploy verification failed — rolling back app services (docs runs independently)"
          # Check if rollback images exist for app services
          has_rollback=false
          for svc in server ui; do
            if docker image inspect "automaker-staging-${svc}:rollback" &>/dev/null; then
              has_rollback=true
              break
            fi
          done
          if [ "$has_rollback" = "false" ]; then
            echo "No rollback images available (first deploy?) — cannot rollback"
            exit 1
          fi
          # Stop broken app containers only — docs is untouched
          # NOTE(review): `down` with no service args stops *every* service in
          # docker-compose.staging.yml; "docs is untouched" only holds if docs
          # lives in a separate compose file — confirm.
          docker compose -f docker-compose.staging.yml down 2>/dev/null || true
          # Retag rollback images as latest and collect which services to start
          ROLLBACK_SERVICES=""
          for svc in server ui; do
            if docker image inspect "automaker-staging-${svc}:rollback" &>/dev/null; then
              docker tag "automaker-staging-${svc}:rollback" "automaker-staging-${svc}:latest"
              ROLLBACK_SERVICES="$ROLLBACK_SERVICES $svc"
              echo "Restored ${svc} from rollback"
            else
              echo "No rollback image for ${svc} — skipping"
            fi
          done
          # Restart only app services that have rollback images
          # ($ROLLBACK_SERVICES is intentionally unquoted so it word-splits
          # into separate service arguments).
          docker compose -f docker-compose.staging.yml up -d $ROLLBACK_SERVICES
          # Verify rollback readiness (same 15 x 2s = 30s budget as the deploy check)
          retries=0
          while [ $retries -lt 15 ]; do
            if curl -sf http://localhost:8579/api/health/ready > /dev/null; then
              echo "Rollback successful — server is ready"
              break
            fi
            retries=$((retries + 1))
            sleep 2
          done
          if [ $retries -eq 15 ]; then
            echo "CRITICAL: Rollback also failed. Manual intervention required."
            docker compose -f docker-compose.staging.yml logs --tail=50 server
            exit 1
          fi
- name: Cleanup
if: always()
run: |
# Remove rollback tags (keep images for layer cache)
for svc in server ui; do
docker rmi "automaker-staging-${svc}:rollback" 2>/dev/null || true
done
docker image prune -f 2>/dev/null || true
- name: Notify Discord
if: always()
run: |
VERSION=$(curl -sf http://localhost:8579/api/health | python3 -c "import sys,json; print(json.load(sys.stdin).get('version','unknown'))" 2>/dev/null || echo "unknown")
COMMIT="${{ github.sha }}"
COMMIT="${COMMIT:0:8}"
STATUS="${{ job.status }}"
ROLLBACK_OUTCOME="${{ steps.rollback.outcome }}"
RESTART_EXHAUSTED="${RESTART_EXHAUSTED:-false}"
RUN_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
if [ "$STATUS" = "success" ]; then
MSG="Staging deployed: \`${COMMIT}\` (v${VERSION}) - all healthy"
WEBHOOK="${DISCORD_DEPLOY_WEBHOOK:-}"
else
case "$ROLLBACK_OUTCOME" in
success)
MSG="ALERT: Staging deploy FAILED: \`${COMMIT}\` - rolled back to previous version. ${RUN_URL}"
;;
failure)
MSG="ALERT: Staging deploy FAILED: \`${COMMIT}\` - rollback FAILED. Manual intervention required. ${RUN_URL}"
;;
*)
MSG="ALERT: Staging deploy FAILED: \`${COMMIT}\` - no rollback performed. ${RUN_URL}"
;;
esac
# Add restart exhaustion alert if detected
if [ "$RESTART_EXHAUSTED" = "true" ]; then
MSG="${MSG}\n**CRITICAL**: Server container exhausted restart attempts (5/5). Manual restart required."
fi
WEBHOOK="${DISCORD_ALERTS_WEBHOOK:-}"
fi
# Post to the appropriate Discord channel
if [ -n "${WEBHOOK:-}" ]; then
curl -sf -H "Content-Type: application/json" \
-d "{\"content\": \"${MSG}\"}" \
"$WEBHOOK" || true
fi
echo "$MSG"
env:
DISCORD_DEPLOY_WEBHOOK: ${{ secrets.DISCORD_DEPLOY_WEBHOOK }}
DISCORD_ALERTS_WEBHOOK: ${{ secrets.DISCORD_ALERTS_WEBHOOK }}