forked from protoLabsAI/protoMaker
-
Notifications
You must be signed in to change notification settings - Fork 0
297 lines (259 loc) · 11.7 KB
/
deploy-staging.yml
File metadata and controls
297 lines (259 loc) · 11.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
---
# SECURITY: This workflow is hardened against fork PR attacks. See docs/security/ci-hardening.md
name: Deploy Staging

on:
  push:
    branches:
      - staging
    # Skip deploys for documentation-only changes.
    paths-ignore:
      # '**.md' matches markdown anywhere in the tree; a bare '*.md' would only
      # match files at the repository root, so nested README edits would still
      # trigger a deploy.
      - '**.md'
      - 'docs/**'
      - 'site/**'
      - 'designs/**'
      - '.automaker/**'
  # Allow manual deploys from the Actions UI.
  workflow_dispatch:

# The deploy talks only to Docker and localhost; the job token never needs write scopes.
permissions: read-all

jobs:
  deploy:
    runs-on: [self-hosted, staging]
    # Guard: never run on forks — this job uses a self-hosted runner and secrets.
    if: github.repository == 'protoLabsAI/protoMaker'
    timeout-minutes: 60
    # Serialize deploys and never cancel an in-flight one — a half-finished
    # deploy is worse than waiting for the previous run to complete.
    concurrency:
      group: staging-deploy
      cancel-in-progress: false
    # Default working directory for all steps — the runner's _work/ dir
    # gets wiped by cron every 5min, so use /tmp as a safe fallback.
    defaults:
      run:
        working-directory: /tmp
    # Use a persistent deploy directory that survives the runner's workspace
    # cleanup cron. The default _work/ directory gets wiped every 5min.
    # Paths use $HOME so they work for any runner user without hardcoded PII.
    steps:
      - name: Setup persistent deploy directory
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          # Resolve the persistent checkout and env-file locations, and publish
          # them to later steps via GITHUB_ENV.
          export DEPLOY_DIR="$HOME/staging-deploy/automaker"
          export ENV_SOURCE="$HOME/staging-deploy/.env.staging"
          echo "DEPLOY_DIR=$DEPLOY_DIR" >> "$GITHUB_ENV"
          echo "ENV_SOURCE=$ENV_SOURCE" >> "$GITHUB_ENV"
          # NOTE(review): this token is persisted on disk inside the git remote
          # URL. It is the ephemeral job token (expires with the run) and is
          # re-written on every deploy below, but confirm that is acceptable.
          REPO_URL="https://x-access-token:${GH_TOKEN}@github.com/${{ github.repository }}.git"
          # Create deploy dir if first run
          if [ ! -d "$DEPLOY_DIR/.git" ]; then
            mkdir -p "$(dirname "$DEPLOY_DIR")"
            git clone "$REPO_URL" "$DEPLOY_DIR"
          fi
          # Pull latest code (update remote URL in case token changed)
          cd "$DEPLOY_DIR"
          git remote set-url origin "$REPO_URL"
          git fetch origin staging
          # Hard reset (not merge): the deploy checkout must exactly mirror
          # origin/staging even if local files were touched between runs.
          git reset --hard origin/staging
          # Copy .env from persistent staging config
          # Uses dedicated .env.staging (not dev .env) to prevent accidental breakage
          if [ -f "$ENV_SOURCE" ]; then
            cp "$ENV_SOURCE" "$DEPLOY_DIR/.env"
            echo "Copied .env from $ENV_SOURCE"
          elif [ -f "$HOME/staging-deploy/.env" ]; then
            # First run on a fresh runner: seed the staging env from the dev
            # .env once, then use the seeded copy from here on.
            echo "WARN: No staging env at $ENV_SOURCE — copying from dev .env as initial seed"
            cp "$HOME/staging-deploy/.env" "$ENV_SOURCE"
            cp "$ENV_SOURCE" "$DEPLOY_DIR/.env"
          else
            echo "ERROR: No .env found at $ENV_SOURCE or $HOME/staging-deploy/.env"
            exit 1
          fi
- name: Disk space pre-check
run: |
# Prune dangling images to free space before checking
docker image prune -f 2>/dev/null || true
# Require at least 10GB free on the deploy partition
AVAIL_KB=$(df -k "$DEPLOY_DIR" | awk 'NR==2 {print $4}')
AVAIL_GB=$((AVAIL_KB / 1024 / 1024))
echo "Available disk space: ${AVAIL_GB}GB (${AVAIL_KB}KB)"
MIN_GB=10
if [ "$AVAIL_GB" -lt "$MIN_GB" ]; then
echo "WARN: Only ${AVAIL_GB}GB free — running aggressive Docker cleanup"
# Remove all unused images (not just dangling), build cache, and unused volumes
docker system prune -af --volumes 2>/dev/null || true
docker builder prune -af 2>/dev/null || true
# Re-check after cleanup
AVAIL_KB=$(df -k "$DEPLOY_DIR" | awk 'NR==2 {print $4}')
AVAIL_GB=$((AVAIL_KB / 1024 / 1024))
echo "After cleanup: ${AVAIL_GB}GB free"
if [ "$AVAIL_GB" -lt "$MIN_GB" ]; then
echo "ERROR: Still only ${AVAIL_GB}GB free after cleanup — need at least ${MIN_GB}GB"
df -h "$DEPLOY_DIR"
exit 1
fi
fi
echo "Disk space OK: ${AVAIL_GB}GB free (minimum ${MIN_GB}GB)"
      - name: Drain running agents
        run: |
          # Gracefully stop auto-mode and wait for agents to finish.
          # Uses the API key from .env (same machine as staging server).
          # NOTE(review): \K needs GNU grep -P (fine on a Linux self-hosted
          # runner) and assumes the .env value is stored unquoted — confirm.
          AUTOMAKER_API_KEY=$(grep -oP 'AUTOMAKER_API_KEY=\K.*' "$ENV_SOURCE" || echo "")
          # Keep the key out of step logs.
          echo "::add-mask::$AUTOMAKER_API_KEY"
          # Best-effort: a stopped server or idle agent pool is not a deploy
          # failure, so swallow curl errors with a note.
          curl -sf -X POST http://localhost:8579/api/deploy/drain \
            -H "Content-Type: application/json" \
            -H "X-API-Key: $AUTOMAKER_API_KEY" \
            --max-time 180 || echo "Drain skipped (server not running or no agents)"
      - name: Tag rollback images
        working-directory: ${{ env.DEPLOY_DIR }}
        run: |
          # Save current working images so we can restore on failure.
          # Only tag app services — docs runs independently and is never rolled back.
          for svc in server ui; do
            # `images -q` lists image IDs for the service's containers; empty
            # when the service has never been brought up on this runner.
            img=$(docker compose -f docker-compose.staging.yml images -q "$svc" 2>/dev/null | head -1 || true)
            if [ -n "$img" ]; then
              docker tag "$img" "automaker-staging-${svc}:rollback"
              echo "Tagged ${svc} image ${img} as rollback"
            else
              # First deploy (or pruned images): rollback simply won't be available.
              echo "No existing ${svc} image to tag (first deploy?)"
            fi
          done
      - name: Rebuild and restart staging
        id: rebuild
        working-directory: ${{ env.DEPLOY_DIR }}
        env:
          # Stamp builds with the deployed commit for traceability.
          GIT_COMMIT_SHA: ${{ github.sha }}
          # Enable BuildKit for both `docker build` and compose-driven builds.
          DOCKER_BUILDKIT: 1
          COMPOSE_DOCKER_CLI_BUILD: 1
        run: |
          # setup-staging.sh --build isolates storybook from critical services.
          # If storybook fails to build, the script continues (non-fatal).
          ./scripts/setup-staging.sh --build
          ./scripts/setup-staging.sh --start
- name: Verify deployment
id: verify
working-directory: ${{ env.DEPLOY_DIR }}
run: |
# Check server readiness (verifies API key, data dir, service init)
retries=0
while [ $retries -lt 15 ]; do
if curl -sf http://localhost:8579/api/health/ready > /dev/null; then
echo "Server is ready"
curl -sf http://localhost:8579/api/health/ready
break
fi
retries=$((retries + 1))
sleep 2
done
if [ $retries -eq 15 ]; then
echo "Server readiness check failed after 30s"
docker compose -f docker-compose.staging.yml logs --tail=50 server
# Check if server container has exhausted restart attempts
SERVER_STATUS=$(docker inspect automaker-server --format='{{.State.Status}}' 2>/dev/null || echo "unknown")
SERVER_RESTART_COUNT=$(docker inspect automaker-server --format='{{.RestartCount}}' 2>/dev/null || echo "0")
if [ "$SERVER_STATUS" = "exited" ] && [ "$SERVER_RESTART_COUNT" -ge 5 ]; then
echo "ALERT: Server container exhausted restart attempts (count: $SERVER_RESTART_COUNT)"
# Signal restart exhaustion for Discord notification
echo "RESTART_EXHAUSTED=true" >> "$GITHUB_ENV"
fi
exit 1
fi
      - name: Smoke tests
        id: smoke
        working-directory: ${{ env.DEPLOY_DIR }}
        run: |
          # Source API key from .env (same machine as staging server)
          # NOTE(review): \K needs GNU grep -P and assumes an unquoted value —
          # same extraction as the drain step; keep the two in sync.
          export AUTOMAKER_API_KEY=$(grep -oP 'AUTOMAKER_API_KEY=\K.*' .env || echo "")
          echo "::add-mask::$AUTOMAKER_API_KEY"
          ./scripts/smoke-test.sh http://localhost:8579
        env:
          # Not read by the shell above — presumably consumed by smoke-test.sh
          # to alert on failures; verify against the script.
          DISCORD_ALERTS_WEBHOOK: ${{ secrets.DISCORD_ALERTS_WEBHOOK }}
      - name: Rollback on failure
        id: rollback
        # Only roll back when a deploy-critical step failed; failures in setup
        # or pre-checks leave the running deployment untouched.
        if: failure() && (steps.rebuild.outcome == 'failure' || steps.verify.outcome == 'failure' || steps.smoke.outcome == 'failure')
        working-directory: ${{ env.DEPLOY_DIR }}
        run: |
          echo "Deploy verification failed — rolling back app services (docs runs independently)"
          # Check if rollback images exist for app services
          has_rollback=false
          for svc in server ui; do
            if docker image inspect "automaker-staging-${svc}:rollback" &>/dev/null; then
              has_rollback=true
              break
            fi
          done
          if [ "$has_rollback" = "false" ]; then
            echo "No rollback images available (first deploy?) — cannot rollback"
            exit 1
          fi
          # Stop broken app containers only — docs is untouched
          # NOTE(review): `down` with no service args stops *every* service in
          # docker-compose.staging.yml; "docs is untouched" only holds if docs
          # lives in a separate compose file — confirm.
          docker compose -f docker-compose.staging.yml down 2>/dev/null || true
          # Retag rollback images as latest and collect which services to start
          ROLLBACK_SERVICES=""
          for svc in server ui; do
            if docker image inspect "automaker-staging-${svc}:rollback" &>/dev/null; then
              docker tag "automaker-staging-${svc}:rollback" "automaker-staging-${svc}:latest"
              ROLLBACK_SERVICES="$ROLLBACK_SERVICES $svc"
              echo "Restored ${svc} from rollback"
            else
              echo "No rollback image for ${svc} — skipping"
            fi
          done
          # Restart only app services that have rollback images
          # ($ROLLBACK_SERVICES is intentionally unquoted so it word-splits
          # into separate service arguments).
          docker compose -f docker-compose.staging.yml up -d $ROLLBACK_SERVICES
          # Verify rollback readiness (same 15 x 2s = 30s budget as the deploy check)
          retries=0
          while [ $retries -lt 15 ]; do
            if curl -sf http://localhost:8579/api/health/ready > /dev/null; then
              echo "Rollback successful — server is ready"
              break
            fi
            retries=$((retries + 1))
            sleep 2
          done
          if [ $retries -eq 15 ]; then
            echo "CRITICAL: Rollback also failed. Manual intervention required."
            docker compose -f docker-compose.staging.yml logs --tail=50 server
            exit 1
          fi
- name: Cleanup
if: always()
run: |
# Remove rollback tags (keep images for layer cache)
for svc in server ui; do
docker rmi "automaker-staging-${svc}:rollback" 2>/dev/null || true
done
docker image prune -f 2>/dev/null || true
- name: Notify Discord
if: always()
run: |
VERSION=$(curl -sf http://localhost:8579/api/health | python3 -c "import sys,json; print(json.load(sys.stdin).get('version','unknown'))" 2>/dev/null || echo "unknown")
COMMIT="${{ github.sha }}"
COMMIT="${COMMIT:0:8}"
STATUS="${{ job.status }}"
ROLLBACK_OUTCOME="${{ steps.rollback.outcome }}"
RESTART_EXHAUSTED="${RESTART_EXHAUSTED:-false}"
RUN_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
if [ "$STATUS" = "success" ]; then
MSG="Staging deployed: \`${COMMIT}\` (v${VERSION}) - all healthy"
WEBHOOK="${DISCORD_DEPLOY_WEBHOOK:-}"
else
case "$ROLLBACK_OUTCOME" in
success)
MSG="ALERT: Staging deploy FAILED: \`${COMMIT}\` - rolled back to previous version. ${RUN_URL}"
;;
failure)
MSG="ALERT: Staging deploy FAILED: \`${COMMIT}\` - rollback FAILED. Manual intervention required. ${RUN_URL}"
;;
*)
MSG="ALERT: Staging deploy FAILED: \`${COMMIT}\` - no rollback performed. ${RUN_URL}"
;;
esac
# Add restart exhaustion alert if detected
if [ "$RESTART_EXHAUSTED" = "true" ]; then
MSG="${MSG}\n**CRITICAL**: Server container exhausted restart attempts (5/5). Manual restart required."
fi
WEBHOOK="${DISCORD_ALERTS_WEBHOOK:-}"
fi
# Post to the appropriate Discord channel
if [ -n "${WEBHOOK:-}" ]; then
curl -sf -H "Content-Type: application/json" \
-d "{\"content\": \"${MSG}\"}" \
"$WEBHOOK" || true
fi
echo "$MSG"
env:
DISCORD_DEPLOY_WEBHOOK: ${{ secrets.DISCORD_DEPLOY_WEBHOOK }}
DISCORD_ALERTS_WEBHOOK: ${{ secrets.DISCORD_ALERTS_WEBHOOK }}