Skip to content

Monitor Self-Hosted Runners #16377

Monitor Self-Hosted Runners

Monitor Self-Hosted Runners #16377

name: Monitor Self-Hosted Runners
on:
# push: # uncomment for testing if on a branch in the azure_scripts repo
schedule:
- cron: '7/15 * * * *' # every 15 minutes but at :07, :22, :37, :52 to try to make it less likely that github drops the job
- cron: '0 9 * * 1' # weekly report every Monday at 9 AM UTC
workflow_dispatch:
inputs:
send_weekly_report:
description: 'Send a weekly report to Zulip'
required: false
default: false
type: boolean
env:
ZULIP_SERVER: "https://leanprover.zulipchat.com"
ZULIP_CHANNEL: "CI admins"
STATE_FILE: "runner-state.json"
STATS_FILE: "runner-stats.json"
CACHE_KEY: ${{ github.ref == 'refs/heads/master' && 'runner-monitor-state-v4' || 'runner-monitor-state-testing' }}
STATS_CACHE_KEY: ${{ github.ref == 'refs/heads/master' && 'runner-monitor-stats-v4' || 'runner-monitor-stats-testing' }}
DRY_RUN: ${{ github.ref != 'refs/heads/master' }}
jobs:
monitor-runners:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/create-github-app-token@29824e69f54612133e76f7eaac726eef6c875baf # v2.2.1
id: app-token
with:
app-id: ${{ secrets.LEANPROVER_COMMUNITY_RUNNERS_APP_ID }}
private-key: ${{ secrets.LEANPROVER_COMMUNITY_RUNNERS_PRIVATE_KEY }}
- name: Restore previous state
id: cache-restore
uses: actions/cache/restore@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
with:
path: ${{ env.STATE_FILE }}
key: ${{ env.CACHE_KEY }}-${{ github.run_id }}
restore-keys: ${{ env.CACHE_KEY }}
- name: Restore previous stats
id: stats-cache-restore
uses: actions/cache/restore@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
with:
path: ${{ env.STATS_FILE }}
key: ${{ env.STATS_CACHE_KEY }}-${{ github.run_id }}
restore-keys: ${{ env.STATS_CACHE_KEY }}
- name: Create empty state
if: steps.cache-restore.outputs.cache-hit == ''
run: |
echo "No previous state file found, creating empty state"
echo '{"last_run": "", "runners": {}}' > "${{ env.STATE_FILE }}"
- name: Create empty stats
if: steps.stats-cache-restore.outputs.cache-hit == ''
run: |
echo "No previous stats file found, creating empty stats"
echo '{"runners": {}, "last_cleanup": ""}' > "${{ env.STATS_FILE }}"
- name: Check self-hosted runners
id: check-runners
run: |
python3 -m monitor_runners.workflow check-runners \
--token "${{ steps.app-token.outputs.token }}" \
--org "${{ github.repository_owner }}" \
--state-file "${{ env.STATE_FILE }}" \
--stats-file "${{ env.STATS_FILE }}" \
--response-file "runners_response.json" \
--schedule "${{ github.event.schedule || '' }}" \
--send-weekly-report "${{ toJSON(inputs.send_weekly_report) }}" \
--github-output "$GITHUB_OUTPUT"
- name: Manage runner labels based on bors status
id: manage-labels
if: steps.check-runners.outputs.api_ok == 'true'
run: |
python3 -m monitor_runners.workflow manage-labels \
--token "${{ steps.app-token.outputs.token }}" \
--org "${{ github.repository_owner }}" \
--response-file "runners_response.json" \
--dry-run "${{ env.DRY_RUN }}" \
--github-output "$GITHUB_OUTPUT"
- name: Generate stats report
id: weekly-stats
run: |
python3 -m monitor_runners.workflow weekly-report \
--stats-file "${{ env.STATS_FILE }}" \
--github-output "$GITHUB_OUTPUT"
- name: Send status message on Zulip
if: steps.check-runners.outputs.should_notify == 'true'
id: zulip-status
run: |
dry_run="${{ env.DRY_RUN }}"
if [ "$dry_run" = "true" ]; then
echo "DRY RUN: would send Zulip status message"
echo "message_id=dry-run" >> "$GITHUB_OUTPUT"
exit 0
fi
# Ensure that the bot is subscribed to the env.ZULIP_CHANNEL
# Otherwise it will not have permission to edit its own messages!
zulip_url="${{ env.ZULIP_SERVER }}"
zulip_email="${{ secrets.ZULIP_MONITOR_RUNNERS_BOT_EMAIL }}"
zulip_key="${{ secrets.ZULIP_MONITOR_RUNNERS_API_KEY }}"
message_id="${{ steps.check-runners.outputs.last_message_id }}"
content_file=$(mktemp)
cat << 'EOF' > "$content_file"
${{ steps.check-runners.outputs.message }}
EOF
if [ "${{ steps.check-runners.outputs.should_edit }}" = "true" ]; then
echo "Editing existing Zulip message: $message_id"
response=$(curl -s -X PATCH \
-u "$zulip_email:$zulip_key" \
"$zulip_url/api/v1/messages/$message_id" \
--data-urlencode "content@$content_file")
else
echo "Sending new Zulip message"
response=$(curl -s -X POST \
-u "$zulip_email:$zulip_key" \
"$zulip_url/api/v1/messages" \
--data-urlencode "type=stream" \
--data-urlencode "to=${{ env.ZULIP_CHANNEL }}" \
--data-urlencode "topic=Runner Status" \
--data-urlencode "content@$content_file")
message_id=$(echo "$response" | jq -r '.id // empty')
fi
if ! echo "$response" | jq -e '.result == "success"' > /dev/null; then
echo "ERROR: Zulip API request failed"
echo "$response"
exit 1
fi
if [ -z "$message_id" ]; then
echo "ERROR: Missing Zulip message id"
echo "$response"
exit 1
fi
echo "message_id=$message_id" >> "$GITHUB_OUTPUT"
rm -f "$content_file"
- name: Update status notification state
if: steps.check-runners.outputs.should_notify == 'true'
run: |
if [ "${{ env.DRY_RUN }}" = "true" ]; then
echo "DRY RUN: would update status notification state"
exit 0
fi
current_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
offline_set='${{ steps.check-runners.outputs.offline_set }}'
message_id="${{ steps.zulip-status.outputs.message_id }}"
state=$(cat "${{ env.STATE_FILE }}")
updated_state=$(echo "$state" | jq \
--arg time "$current_time" \
--arg message_id "$message_id" \
--argjson offline_set "$offline_set" \
'
.last_notification = (.last_notification // {}) |
.last_notification.offline_set = $offline_set |
.last_notification.message_id = $message_id |
.last_notification.updated_at = $time
')
echo "$updated_state" > "${{ env.STATE_FILE }}"
- name: Send label management notification on Zulip
if: steps.manage-labels.outputs.has_label_errors == 'true' && github.ref == 'refs/heads/master'
uses: zulip/github-actions-zulip/send-message@e4c8f27c732ba9bd98ac6be0583096dea82feea5 # v1.0.2
with:
api-key: ${{ secrets.ZULIP_MONITOR_RUNNERS_API_KEY }}
email: ${{ secrets.ZULIP_MONITOR_RUNNERS_BOT_EMAIL }}
organization-url: ${{ env.ZULIP_SERVER }}
to: ${{ env.ZULIP_CHANNEL }}
type: 'stream'
topic: 'Runner Status'
content: |
${{ steps.manage-labels.outputs.label_errors }}
- name: Send processing error notification on Zulip
if: steps.check-runners.outputs.has_processing_errors == 'true' && github.ref == 'refs/heads/master'
uses: zulip/github-actions-zulip/send-message@e4c8f27c732ba9bd98ac6be0583096dea82feea5 # v1.0.2
with:
api-key: ${{ secrets.ZULIP_MONITOR_RUNNERS_API_KEY }}
email: ${{ secrets.ZULIP_MONITOR_RUNNERS_BOT_EMAIL }}
organization-url: ${{ env.ZULIP_SERVER }}
to: ${{ env.ZULIP_CHANNEL }}
type: 'stream'
topic: 'Runner Status'
content: |
${{ steps.check-runners.outputs.processing_errors }}
- name: Send weekly report on Zulip
if: steps.check-runners.outputs.is_weekly_report == 'true' && github.ref == 'refs/heads/master'
uses: zulip/github-actions-zulip/send-message@e4c8f27c732ba9bd98ac6be0583096dea82feea5 # v1.0.2
with:
api-key: ${{ secrets.ZULIP_MONITOR_RUNNERS_API_KEY }}
email: ${{ secrets.ZULIP_MONITOR_RUNNERS_BOT_EMAIL }}
organization-url: ${{ env.ZULIP_SERVER }}
to: ${{ env.ZULIP_CHANNEL }}
type: 'stream'
topic: 'Weekly Runner Report'
content: |
${{ steps.weekly-stats.outputs.weekly_message }}
- name: Save state to cache
if: always() && (github.event_name != 'workflow_dispatch' || github.ref != 'refs/heads/master') # On manual run of test branches it's okay to upload: the cache key won't collide
uses: actions/cache/save@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
with:
path: ${{ env.STATE_FILE }}
key: ${{ env.CACHE_KEY }}-${{ github.run_id }}
- name: Save stats to cache
if: always() && (github.event_name != 'workflow_dispatch' || github.ref != 'refs/heads/master') # On manual run of test branches it's okay to upload: the cache key won't collide
uses: actions/cache/save@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
with:
path: ${{ env.STATS_FILE }}
key: ${{ env.STATS_CACHE_KEY }}-${{ github.run_id }}
- name: upload files as artifact
if: always()
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: monitor_runners_artifact
path: |
${{ env.STATE_FILE }}
${{ env.STATS_FILE }}
- name: Log summary
run: |
echo "=== Runner Monitor Summary ==="
echo "::group::State file contents:"
cat "${{ env.STATE_FILE }}" | jq .
echo "::endgroup::"
echo "::group::Stats file contents:"
cat "${{ env.STATS_FILE }}" | jq .
echo "::endgroup::"
echo ""
echo "=== Statistics Summary ==="
runner_count=$(cat "${{ env.STATS_FILE }}" | jq '.runners | keys | length')
echo "Runners tracked: $runner_count"
data_points=$(cat "${{ env.STATS_FILE }}" | jq '[.runners[].history[]] | length')
echo "Total data points: $data_points"
echo ""
echo "=== Label Management Summary ==="
echo "Bors active: ${{ steps.manage-labels.outputs.bors_active }}"
cat << 'EOF'
${{ steps.manage-labels.outputs.label_summary }}
EOF
if [ "${{ steps.manage-labels.outputs.has_label_errors }}" = "true" ]; then
echo "⚠️ Label management errors occurred (see Zulip notification)"
fi
echo ""
echo "::group::=== 7-Day Statistics Report ==="
cat << 'EOF'
${{ steps.weekly-stats.outputs.weekly_message }}
EOF
echo "::endgroup::"
if [ "${{ steps.check-runners.outputs.should_notify }}" = "true" ]; then
echo ""
echo "📢 Status notifications sent to Zulip"
else
echo ""
echo "✅ No status notifications needed - all runners stable"
fi
if [ "${{ steps.check-runners.outputs.is_weekly_report }}" = "true" ]; then
echo "📊 Weekly report sent to Zulip"
else
echo "📊 Weekly report generated but not sent (not scheduled weekly run)"
fi
workflow-keepalive:
if: github.event_name == 'schedule'
runs-on: ubuntu-latest
permissions:
actions: write
steps:
- uses: liskin/gh-workflow-keepalive@f72ff1a1336129f29bf0166c0fd0ca6cf1bcb38c # v1.2.1