Monitor Self-Hosted Runners #16377

Workflow file for this run

.github/workflows/monitor_runners.yml at 044d459

	name: Monitor Self-Hosted Runners

	on:
	# push: # uncomment for testing if on a branch in the azure_scripts repo
	schedule:
	- cron: '7/15 * * * *' # every 15 minutes but at :07, :22, :37, :52 to try to make it less likely that github drops the job
	- cron: '0 9 * * 1' # weekly report every Monday at 9 AM UTC
	workflow_dispatch:
	inputs:
	send_weekly_report:
	description: 'Send a weekly report to Zulip'
	required: false
	default: false
	type: boolean

	env:
	ZULIP_SERVER: "https://leanprover.zulipchat.com"
	ZULIP_CHANNEL: "CI admins"
	STATE_FILE: "runner-state.json"
	STATS_FILE: "runner-stats.json"
	CACHE_KEY: ${{ github.ref == 'refs/heads/master' && 'runner-monitor-state-v4' \|\| 'runner-monitor-state-testing' }}
	STATS_CACHE_KEY: ${{ github.ref == 'refs/heads/master' && 'runner-monitor-stats-v4' \|\| 'runner-monitor-stats-testing' }}
	DRY_RUN: ${{ github.ref != 'refs/heads/master' }}

	jobs:
	monitor-runners:
	runs-on: ubuntu-latest
	steps:
	- uses: actions/checkout@v4

	- uses: actions/create-github-app-token@29824e69f54612133e76f7eaac726eef6c875baf # v2.2.1
	id: app-token
	with:
	app-id: ${{ secrets.LEANPROVER_COMMUNITY_RUNNERS_APP_ID }}
	private-key: ${{ secrets.LEANPROVER_COMMUNITY_RUNNERS_PRIVATE_KEY }}

	- name: Restore previous state
	id: cache-restore
	uses: actions/cache/restore@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
	with:
	path: ${{ env.STATE_FILE }}
	key: ${{ env.CACHE_KEY }}-${{ github.run_id }}
	restore-keys: ${{ env.CACHE_KEY }}

	- name: Restore previous stats
	id: stats-cache-restore
	uses: actions/cache/restore@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
	with:
	path: ${{ env.STATS_FILE }}
	key: ${{ env.STATS_CACHE_KEY }}-${{ github.run_id }}
	restore-keys: ${{ env.STATS_CACHE_KEY }}

	- name: Create empty state
	if: steps.cache-restore.outputs.cache-hit == ''
	run: \|
	echo "No previous state file found, creating empty state"
	echo '{"last_run": "", "runners": {}}' > "${{ env.STATE_FILE }}"

	- name: Create empty stats
	if: steps.stats-cache-restore.outputs.cache-hit == ''
	run: \|
	echo "No previous stats file found, creating empty stats"
	echo '{"runners": {}, "last_cleanup": ""}' > "${{ env.STATS_FILE }}"

	- name: Check self-hosted runners
	id: check-runners
	run: \|
	python3 -m monitor_runners.workflow check-runners \
	--token "${{ steps.app-token.outputs.token }}" \
	--org "${{ github.repository_owner }}" \
	--state-file "${{ env.STATE_FILE }}" \
	--stats-file "${{ env.STATS_FILE }}" \
	--response-file "runners_response.json" \
	--schedule "${{ github.event.schedule \|\| '' }}" \
	--send-weekly-report "${{ toJSON(inputs.send_weekly_report) }}" \
	--github-output "$GITHUB_OUTPUT"

	- name: Manage runner labels based on bors status
	id: manage-labels
	if: steps.check-runners.outputs.api_ok == 'true'
	run: \|
	python3 -m monitor_runners.workflow manage-labels \
	--token "${{ steps.app-token.outputs.token }}" \
	--org "${{ github.repository_owner }}" \
	--response-file "runners_response.json" \
	--dry-run "${{ env.DRY_RUN }}" \
	--github-output "$GITHUB_OUTPUT"

	- name: Generate stats report
	id: weekly-stats
	run: \|
	python3 -m monitor_runners.workflow weekly-report \
	--stats-file "${{ env.STATS_FILE }}" \
	--github-output "$GITHUB_OUTPUT"

	- name: Send status message on Zulip
	if: steps.check-runners.outputs.should_notify == 'true'
	id: zulip-status
	run: \|
	dry_run="${{ env.DRY_RUN }}"
	if [ "$dry_run" = "true" ]; then
	echo "DRY RUN: would send Zulip status message"
	echo "message_id=dry-run" >> "$GITHUB_OUTPUT"
	exit 0
	fi

	# Ensure that the bot is subscribed to the env.ZULIP_CHANNEL
	# Otherwise it will not have permission to edit its own messages!
	zulip_url="${{ env.ZULIP_SERVER }}"
	zulip_email="${{ secrets.ZULIP_MONITOR_RUNNERS_BOT_EMAIL }}"
	zulip_key="${{ secrets.ZULIP_MONITOR_RUNNERS_API_KEY }}"
	message_id="${{ steps.check-runners.outputs.last_message_id }}"

	content_file=$(mktemp)
	cat << 'EOF' > "$content_file"
	${{ steps.check-runners.outputs.message }}
	EOF

	if [ "${{ steps.check-runners.outputs.should_edit }}" = "true" ]; then
	echo "Editing existing Zulip message: $message_id"
	response=$(curl -s -X PATCH \
	-u "$zulip_email:$zulip_key" \
	"$zulip_url/api/v1/messages/$message_id" \
	--data-urlencode "content@$content_file")
	else
	echo "Sending new Zulip message"
	response=$(curl -s -X POST \
	-u "$zulip_email:$zulip_key" \
	"$zulip_url/api/v1/messages" \
	--data-urlencode "type=stream" \
	--data-urlencode "to=${{ env.ZULIP_CHANNEL }}" \
	--data-urlencode "topic=Runner Status" \
	--data-urlencode "content@$content_file")
	message_id=$(echo "$response" \| jq -r '.id // empty')
	fi

	if ! echo "$response" \| jq -e '.result == "success"' > /dev/null; then
	echo "ERROR: Zulip API request failed"
	echo "$response"
	exit 1
	fi

	if [ -z "$message_id" ]; then
	echo "ERROR: Missing Zulip message id"
	echo "$response"
	exit 1
	fi

	echo "message_id=$message_id" >> "$GITHUB_OUTPUT"
	rm -f "$content_file"

	- name: Update status notification state
	if: steps.check-runners.outputs.should_notify == 'true'
	run: \|
	if [ "${{ env.DRY_RUN }}" = "true" ]; then
	echo "DRY RUN: would update status notification state"
	exit 0
	fi

	current_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
	offline_set='${{ steps.check-runners.outputs.offline_set }}'
	message_id="${{ steps.zulip-status.outputs.message_id }}"

	state=$(cat "${{ env.STATE_FILE }}")
	updated_state=$(echo "$state" \| jq \
	--arg time "$current_time" \
	--arg message_id "$message_id" \
	--argjson offline_set "$offline_set" \
	'
	.last_notification = (.last_notification // {}) \|
	.last_notification.offline_set = $offline_set \|
	.last_notification.message_id = $message_id \|
	.last_notification.updated_at = $time
	')

	echo "$updated_state" > "${{ env.STATE_FILE }}"

	- name: Send label management notification on Zulip
	if: steps.manage-labels.outputs.has_label_errors == 'true' && github.ref == 'refs/heads/master'
	uses: zulip/github-actions-zulip/send-message@e4c8f27c732ba9bd98ac6be0583096dea82feea5 # v1.0.2
	with:
	api-key: ${{ secrets.ZULIP_MONITOR_RUNNERS_API_KEY }}
	email: ${{ secrets.ZULIP_MONITOR_RUNNERS_BOT_EMAIL }}
	organization-url: ${{ env.ZULIP_SERVER }}
	to: ${{ env.ZULIP_CHANNEL }}
	type: 'stream'
	topic: 'Runner Status'
	content: \|
	${{ steps.manage-labels.outputs.label_errors }}

	- name: Send processing error notification on Zulip
	if: steps.check-runners.outputs.has_processing_errors == 'true' && github.ref == 'refs/heads/master'
	uses: zulip/github-actions-zulip/send-message@e4c8f27c732ba9bd98ac6be0583096dea82feea5 # v1.0.2
	with:
	api-key: ${{ secrets.ZULIP_MONITOR_RUNNERS_API_KEY }}
	email: ${{ secrets.ZULIP_MONITOR_RUNNERS_BOT_EMAIL }}
	organization-url: ${{ env.ZULIP_SERVER }}
	to: ${{ env.ZULIP_CHANNEL }}
	type: 'stream'
	topic: 'Runner Status'
	content: \|
	${{ steps.check-runners.outputs.processing_errors }}

	- name: Send weekly report on Zulip
	if: steps.check-runners.outputs.is_weekly_report == 'true' && github.ref == 'refs/heads/master'
	uses: zulip/github-actions-zulip/send-message@e4c8f27c732ba9bd98ac6be0583096dea82feea5 # v1.0.2
	with:
	api-key: ${{ secrets.ZULIP_MONITOR_RUNNERS_API_KEY }}
	email: ${{ secrets.ZULIP_MONITOR_RUNNERS_BOT_EMAIL }}
	organization-url: ${{ env.ZULIP_SERVER }}
	to: ${{ env.ZULIP_CHANNEL }}
	type: 'stream'
	topic: 'Weekly Runner Report'
	content: \|
	${{ steps.weekly-stats.outputs.weekly_message }}

	- name: Save state to cache
	if: always() && (github.event_name != 'workflow_dispatch' \|\| github.ref != 'refs/heads/master') # On manual run of test branches it's okay to upload: the cache key won't collide
	uses: actions/cache/save@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
	with:
	path: ${{ env.STATE_FILE }}
	key: ${{ env.CACHE_KEY }}-${{ github.run_id }}

	- name: Save stats to cache
	if: always() && (github.event_name != 'workflow_dispatch' \|\| github.ref != 'refs/heads/master') # On manual run of test branches it's okay to upload: the cache key won't collide
	uses: actions/cache/save@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
	with:
	path: ${{ env.STATS_FILE }}
	key: ${{ env.STATS_CACHE_KEY }}-${{ github.run_id }}

	- name: upload files as artifact
	if: always()
	uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
	with:
	name: monitor_runners_artifact
	path: \|
	${{ env.STATE_FILE }}
	${{ env.STATS_FILE }}

	- name: Log summary
	run: \|
	echo "=== Runner Monitor Summary ==="
	echo "::group::State file contents:"
	cat "${{ env.STATE_FILE }}" \| jq .
	echo "::endgroup::"

	echo "::group::Stats file contents:"
	cat "${{ env.STATS_FILE }}" \| jq .
	echo "::endgroup::"

	echo ""
	echo "=== Statistics Summary ==="
	runner_count=$(cat "${{ env.STATS_FILE }}" \| jq '.runners \| keys \| length')
	echo "Runners tracked: $runner_count"
	data_points=$(cat "${{ env.STATS_FILE }}" \| jq '[.runners[].history[]] \| length')
	echo "Total data points: $data_points"

	echo ""
	echo "=== Label Management Summary ==="
	echo "Bors active: ${{ steps.manage-labels.outputs.bors_active }}"
	cat << 'EOF'
	${{ steps.manage-labels.outputs.label_summary }}
	EOF
	if [ "${{ steps.manage-labels.outputs.has_label_errors }}" = "true" ]; then
	echo "⚠️ Label management errors occurred (see Zulip notification)"
	fi

	echo ""
	echo "::group::=== 7-Day Statistics Report ==="
	cat << 'EOF'
	${{ steps.weekly-stats.outputs.weekly_message }}
	EOF
	echo "::endgroup::"

	if [ "${{ steps.check-runners.outputs.should_notify }}" = "true" ]; then
	echo ""
	echo "📢 Status notifications sent to Zulip"
	else
	echo ""
	echo "✅ No status notifications needed - all runners stable"
	fi

	if [ "${{ steps.check-runners.outputs.is_weekly_report }}" = "true" ]; then
	echo "📊 Weekly report sent to Zulip"
	else
	echo "📊 Weekly report generated but not sent (not scheduled weekly run)"
	fi

	workflow-keepalive:
	if: github.event_name == 'schedule'
	runs-on: ubuntu-latest
	permissions:
	actions: write
	steps:
	- uses: liskin/gh-workflow-keepalive@f72ff1a1336129f29bf0166c0fd0ca6cf1bcb38c # v1.2.1

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Monitor Self-Hosted Runners #16377

Workflow file

Monitor Self-Hosted Runners #16377

Uh oh!

Workflow file for this run