Monitor Self-Hosted Runners #16388
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: Monitor Self-Hosted Runners | |
| on: | |
| # push: # uncomment for testing if on a branch in the azure_scripts repo | |
| schedule: | |
| - cron: '7/15 * * * *' # every 15 minutes but at :07, :22, :37, :52 to try to make it less likely that github drops the job | |
| - cron: '0 9 * * 1' # weekly report every Monday at 9 AM UTC | |
| workflow_dispatch: | |
| inputs: | |
| send_weekly_report: | |
| description: 'Send a weekly report to Zulip' | |
| required: false | |
| default: false | |
| type: boolean | |
| env: | |
| ZULIP_SERVER: "https://leanprover.zulipchat.com" | |
| ZULIP_CHANNEL: "CI admins" | |
| STATE_FILE: "runner-state.json" | |
| STATS_FILE: "runner-stats.json" | |
| CACHE_KEY: ${{ github.ref == 'refs/heads/master' && 'runner-monitor-state-v4' || 'runner-monitor-state-testing' }} | |
| STATS_CACHE_KEY: ${{ github.ref == 'refs/heads/master' && 'runner-monitor-stats-v4' || 'runner-monitor-stats-testing' }} | |
| DRY_RUN: ${{ github.ref != 'refs/heads/master' }} | |
| jobs: | |
| monitor-runners: | |
| runs-on: ubuntu-latest | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - uses: actions/create-github-app-token@29824e69f54612133e76f7eaac726eef6c875baf # v2.2.1 | |
| id: app-token | |
| with: | |
| app-id: ${{ secrets.LEANPROVER_COMMUNITY_RUNNERS_APP_ID }} | |
| private-key: ${{ secrets.LEANPROVER_COMMUNITY_RUNNERS_PRIVATE_KEY }} | |
| - name: Restore previous state | |
| id: cache-restore | |
| uses: actions/cache/restore@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4 | |
| with: | |
| path: ${{ env.STATE_FILE }} | |
| key: ${{ env.CACHE_KEY }}-${{ github.run_id }} | |
| restore-keys: ${{ env.CACHE_KEY }} | |
| - name: Restore previous stats | |
| id: stats-cache-restore | |
| uses: actions/cache/restore@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4 | |
| with: | |
| path: ${{ env.STATS_FILE }} | |
| key: ${{ env.STATS_CACHE_KEY }}-${{ github.run_id }} | |
| restore-keys: ${{ env.STATS_CACHE_KEY }} | |
| - name: Create empty state | |
| if: steps.cache-restore.outputs.cache-hit == '' | |
| run: | | |
| echo "No previous state file found, creating empty state" | |
| echo '{"last_run": "", "runners": {}}' > "${{ env.STATE_FILE }}" | |
| - name: Create empty stats | |
| if: steps.stats-cache-restore.outputs.cache-hit == '' | |
| run: | | |
| echo "No previous stats file found, creating empty stats" | |
| echo '{"runners": {}, "last_cleanup": ""}' > "${{ env.STATS_FILE }}" | |
| - name: Check self-hosted runners | |
| id: check-runners | |
| run: | | |
| python3 -m monitor_runners.workflow check-runners \ | |
| --token "${{ steps.app-token.outputs.token }}" \ | |
| --org "${{ github.repository_owner }}" \ | |
| --state-file "${{ env.STATE_FILE }}" \ | |
| --stats-file "${{ env.STATS_FILE }}" \ | |
| --response-file "runners_response.json" \ | |
| --schedule "${{ github.event.schedule || '' }}" \ | |
| --send-weekly-report "${{ toJSON(inputs.send_weekly_report) }}" \ | |
| --github-output "$GITHUB_OUTPUT" | |
| - name: Manage runner labels based on bors status | |
| id: manage-labels | |
| if: steps.check-runners.outputs.api_ok == 'true' | |
| run: | | |
| python3 -m monitor_runners.workflow manage-labels \ | |
| --token "${{ steps.app-token.outputs.token }}" \ | |
| --org "${{ github.repository_owner }}" \ | |
| --response-file "runners_response.json" \ | |
| --dry-run "${{ env.DRY_RUN }}" \ | |
| --github-output "$GITHUB_OUTPUT" | |
| - name: Generate stats report | |
| id: weekly-stats | |
| run: | | |
| python3 -m monitor_runners.workflow weekly-report \ | |
| --stats-file "${{ env.STATS_FILE }}" \ | |
| --github-output "$GITHUB_OUTPUT" | |
| - name: Send status message on Zulip | |
| if: steps.check-runners.outputs.should_notify == 'true' | |
| id: zulip-status | |
| run: | | |
| dry_run="${{ env.DRY_RUN }}" | |
| if [ "$dry_run" = "true" ]; then | |
| echo "DRY RUN: would send Zulip status message" | |
| echo "message_id=dry-run" >> "$GITHUB_OUTPUT" | |
| exit 0 | |
| fi | |
| # Ensure that the bot is subscribed to the env.ZULIP_CHANNEL | |
| # Otherwise it will not have permission to edit its own messages! | |
| zulip_url="${{ env.ZULIP_SERVER }}" | |
| zulip_email="${{ secrets.ZULIP_MONITOR_RUNNERS_BOT_EMAIL }}" | |
| zulip_key="${{ secrets.ZULIP_MONITOR_RUNNERS_API_KEY }}" | |
| message_id="${{ steps.check-runners.outputs.last_message_id }}" | |
| content_file=$(mktemp) | |
| cat << 'EOF' > "$content_file" | |
| ${{ steps.check-runners.outputs.message }} | |
| EOF | |
| if [ "${{ steps.check-runners.outputs.should_edit }}" = "true" ]; then | |
| echo "Editing existing Zulip message: $message_id" | |
| response=$(curl -s -X PATCH \ | |
| -u "$zulip_email:$zulip_key" \ | |
| "$zulip_url/api/v1/messages/$message_id" \ | |
| --data-urlencode "content@$content_file") | |
| else | |
| echo "Sending new Zulip message" | |
| response=$(curl -s -X POST \ | |
| -u "$zulip_email:$zulip_key" \ | |
| "$zulip_url/api/v1/messages" \ | |
| --data-urlencode "type=stream" \ | |
| --data-urlencode "to=${{ env.ZULIP_CHANNEL }}" \ | |
| --data-urlencode "topic=Runner Status" \ | |
| --data-urlencode "content@$content_file") | |
| message_id=$(echo "$response" | jq -r '.id // empty') | |
| fi | |
| if ! echo "$response" | jq -e '.result == "success"' > /dev/null; then | |
| echo "ERROR: Zulip API request failed" | |
| echo "$response" | |
| exit 1 | |
| fi | |
| if [ -z "$message_id" ]; then | |
| echo "ERROR: Missing Zulip message id" | |
| echo "$response" | |
| exit 1 | |
| fi | |
| echo "message_id=$message_id" >> "$GITHUB_OUTPUT" | |
| rm -f "$content_file" | |
| - name: Update status notification state | |
| if: steps.check-runners.outputs.should_notify == 'true' | |
| run: | | |
| if [ "${{ env.DRY_RUN }}" = "true" ]; then | |
| echo "DRY RUN: would update status notification state" | |
| exit 0 | |
| fi | |
| current_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ") | |
| offline_set='${{ steps.check-runners.outputs.offline_set }}' | |
| message_id="${{ steps.zulip-status.outputs.message_id }}" | |
| state=$(cat "${{ env.STATE_FILE }}") | |
| updated_state=$(echo "$state" | jq \ | |
| --arg time "$current_time" \ | |
| --arg message_id "$message_id" \ | |
| --argjson offline_set "$offline_set" \ | |
| ' | |
| .last_notification = (.last_notification // {}) | | |
| .last_notification.offline_set = $offline_set | | |
| .last_notification.message_id = $message_id | | |
| .last_notification.updated_at = $time | |
| ') | |
| echo "$updated_state" > "${{ env.STATE_FILE }}" | |
| - name: Send label management notification on Zulip | |
| if: steps.manage-labels.outputs.has_label_errors == 'true' && github.ref == 'refs/heads/master' | |
| uses: zulip/github-actions-zulip/send-message@e4c8f27c732ba9bd98ac6be0583096dea82feea5 # v1.0.2 | |
| with: | |
| api-key: ${{ secrets.ZULIP_MONITOR_RUNNERS_API_KEY }} | |
| email: ${{ secrets.ZULIP_MONITOR_RUNNERS_BOT_EMAIL }} | |
| organization-url: ${{ env.ZULIP_SERVER }} | |
| to: ${{ env.ZULIP_CHANNEL }} | |
| type: 'stream' | |
| topic: 'Runner Status' | |
| content: | | |
| ${{ steps.manage-labels.outputs.label_errors }} | |
| - name: Send processing error notification on Zulip | |
| if: steps.check-runners.outputs.has_processing_errors == 'true' && github.ref == 'refs/heads/master' | |
| uses: zulip/github-actions-zulip/send-message@e4c8f27c732ba9bd98ac6be0583096dea82feea5 # v1.0.2 | |
| with: | |
| api-key: ${{ secrets.ZULIP_MONITOR_RUNNERS_API_KEY }} | |
| email: ${{ secrets.ZULIP_MONITOR_RUNNERS_BOT_EMAIL }} | |
| organization-url: ${{ env.ZULIP_SERVER }} | |
| to: ${{ env.ZULIP_CHANNEL }} | |
| type: 'stream' | |
| topic: 'Runner Status' | |
| content: | | |
| ${{ steps.check-runners.outputs.processing_errors }} | |
| - name: Send weekly report on Zulip | |
| if: steps.check-runners.outputs.is_weekly_report == 'true' && github.ref == 'refs/heads/master' | |
| uses: zulip/github-actions-zulip/send-message@e4c8f27c732ba9bd98ac6be0583096dea82feea5 # v1.0.2 | |
| with: | |
| api-key: ${{ secrets.ZULIP_MONITOR_RUNNERS_API_KEY }} | |
| email: ${{ secrets.ZULIP_MONITOR_RUNNERS_BOT_EMAIL }} | |
| organization-url: ${{ env.ZULIP_SERVER }} | |
| to: ${{ env.ZULIP_CHANNEL }} | |
| type: 'stream' | |
| topic: 'Weekly Runner Report' | |
| content: | | |
| ${{ steps.weekly-stats.outputs.weekly_message }} | |
| - name: Save state to cache | |
| if: always() && (github.event_name != 'workflow_dispatch' || github.ref != 'refs/heads/master') # On manual run of test branches it's okay to upload: the cache key won't collide | |
| uses: actions/cache/save@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4 | |
| with: | |
| path: ${{ env.STATE_FILE }} | |
| key: ${{ env.CACHE_KEY }}-${{ github.run_id }} | |
| - name: Save stats to cache | |
| if: always() && (github.event_name != 'workflow_dispatch' || github.ref != 'refs/heads/master') # On manual run of test branches it's okay to upload: the cache key won't collide | |
| uses: actions/cache/save@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4 | |
| with: | |
| path: ${{ env.STATS_FILE }} | |
| key: ${{ env.STATS_CACHE_KEY }}-${{ github.run_id }} | |
| - name: upload files as artifact | |
| if: always() | |
| uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 | |
| with: | |
| name: monitor_runners_artifact | |
| path: | | |
| ${{ env.STATE_FILE }} | |
| ${{ env.STATS_FILE }} | |
| - name: Log summary | |
| run: | | |
| echo "=== Runner Monitor Summary ===" | |
| echo "::group::State file contents:" | |
| cat "${{ env.STATE_FILE }}" | jq . | |
| echo "::endgroup::" | |
| echo "::group::Stats file contents:" | |
| cat "${{ env.STATS_FILE }}" | jq . | |
| echo "::endgroup::" | |
| echo "" | |
| echo "=== Statistics Summary ===" | |
| runner_count=$(cat "${{ env.STATS_FILE }}" | jq '.runners | keys | length') | |
| echo "Runners tracked: $runner_count" | |
| data_points=$(cat "${{ env.STATS_FILE }}" | jq '[.runners[].history[]] | length') | |
| echo "Total data points: $data_points" | |
| echo "" | |
| echo "=== Label Management Summary ===" | |
| echo "Bors active: ${{ steps.manage-labels.outputs.bors_active }}" | |
| cat << 'EOF' | |
| ${{ steps.manage-labels.outputs.label_summary }} | |
| EOF | |
| if [ "${{ steps.manage-labels.outputs.has_label_errors }}" = "true" ]; then | |
| echo "⚠️ Label management errors occurred (see Zulip notification)" | |
| fi | |
| echo "" | |
| echo "::group::=== 7-Day Statistics Report ===" | |
| cat << 'EOF' | |
| ${{ steps.weekly-stats.outputs.weekly_message }} | |
| EOF | |
| echo "::endgroup::" | |
| if [ "${{ steps.check-runners.outputs.should_notify }}" = "true" ]; then | |
| echo "" | |
| echo "📢 Status notifications sent to Zulip" | |
| else | |
| echo "" | |
| echo "✅ No status notifications needed - all runners stable" | |
| fi | |
| if [ "${{ steps.check-runners.outputs.is_weekly_report }}" = "true" ]; then | |
| echo "📊 Weekly report sent to Zulip" | |
| else | |
| echo "📊 Weekly report generated but not sent (not scheduled weekly run)" | |
| fi | |
| workflow-keepalive: | |
| if: github.event_name == 'schedule' | |
| runs-on: ubuntu-latest | |
| permissions: | |
| actions: write | |
| steps: | |
| - uses: liskin/gh-workflow-keepalive@f72ff1a1336129f29bf0166c0fd0ca6cf1bcb38c # v1.2.1 |