Skip to content

🎼 Conductor Health Check #509

🎼 Conductor Health Check

🎼 Conductor Health Check #509

Workflow file for this run

# yaml-language-server: $schema=https://json.schemastore.org/github-workflow.json
name: 🎼 Conductor Health Check

on:
  # Manual runs from the Actions tab or the API.
  workflow_dispatch:
  # Fires every day at 09:00 UTC; the first step of the job decides whether
  # a given scheduled run is treated as a daily, weekly or monthly summary.
  schedule:
    - cron: '0 9 * * *'
  # Only changes to the conductor files or to this workflow trigger a run.
  push:
    branches:
      - main
      - master
    paths:
      - '.conductor/**'
      - '.github/workflows/conductor.yml'
  pull_request:
    types:
      - opened
      - synchronize
      - reopened
    paths:
      - '.conductor/**'
      - '.github/workflows/conductor.yml'

# Every run shares one concurrency group, so a newly started run cancels
# whichever run is still in progress.
concurrency:
  cancel-in-progress: true
  group: conductor-health-check

permissions:
  pull-requests: write
  issues: write
  contents: read
jobs:
  health-check:
    runs-on: ubuntu-latest
    name: System Health Check
    # Do not run for events caused by the Actions bot or Dependabot (avoids
    # the workflow re-triggering itself), nor for commits whose message opts
    # out with [skip ci] / [ci skip].
    if: >-
      github.actor != 'github-actions[bot]' &&
      github.actor != 'dependabot[bot]' &&
      !contains(github.event.head_commit.message, '[skip ci]') &&
      !contains(github.event.head_commit.message, '[ci skip]')
    steps:
- name: Check repository activity
  id: activity_check
  uses: actions/github-script@v7
  with:
    github-token: ${{ secrets.CONDUCTOR_GITHUB_TOKEN }}
    script: |
      // Decides whether the remaining steps should run and which kind of
      // summary they produce. Outputs: should_run, days_inactive, summary_type.
      const now = new Date();
      const MS_PER_MINUTE = 1000 * 60;
      const MS_PER_DAY = 1000 * 60 * 60 * 24;
      const { owner, repo } = context.repo;
      const isScheduled = context.eventName === 'schedule';

      // Throttle: a scheduled run is skipped when the most recent completed
      // run of this workflow was created less than 30 minutes ago.
      if (isScheduled) {
        const previous = await github.rest.actions.listWorkflowRuns({
          owner,
          repo,
          workflow_id: 'conductor.yml',
          status: 'completed',
          per_page: 1
        });
        const [latestRun] = previous.data.workflow_runs;
        if (latestRun) {
          const minutesSinceLastRun = (now - new Date(latestRun.created_at)) / MS_PER_MINUTE;
          if (minutesSinceLastRun < 30) {
            console.log(`Skipping - last run was ${minutesSinceLastRun} minutes ago`);
            core.setOutput('should_run', false);
            return;
          }
        }
      }

      // Fetch the single most recently updated PR, commit and issue in parallel.
      const newestFirst = { state: 'all', sort: 'updated', direction: 'desc', per_page: 1 };
      const [pullsRes, commitsRes, issuesRes] = await Promise.all([
        github.rest.pulls.list({ owner, repo, ...newestFirst }),
        github.rest.repos.listCommits({ owner, repo, per_page: 1 }),
        github.rest.issues.listForRepo({ owner, repo, ...newestFirst })
      ]);

      // The latest of the three timestamps is the last activity; with no
      // data at all it stays at the Unix epoch.
      const timestamps = [];
      if (pullsRes.data.length > 0) timestamps.push(pullsRes.data[0].updated_at);
      if (commitsRes.data.length > 0) timestamps.push(commitsRes.data[0].commit.committer.date);
      if (issuesRes.data.length > 0) timestamps.push(issuesRes.data[0].updated_at);
      const lastActivity = timestamps
        .map((ts) => new Date(ts))
        .reduce((latest, candidate) => (candidate > latest ? candidate : latest), new Date(0));
      const daysSinceActivity = Math.floor((now - lastActivity) / MS_PER_DAY);

      // Non-scheduled events always run as 'daily'. Scheduled runs back off:
      // more than 14 idle days -> monthly (1st of the month only),
      // more than 3 idle days  -> weekly (Mondays only).
      let summaryType = 'daily';
      let shouldRun = true;
      if (isScheduled && daysSinceActivity > 14) {
        summaryType = 'monthly';
        shouldRun = now.getDate() === 1;
      } else if (isScheduled && daysSinceActivity > 3) {
        summaryType = 'weekly';
        shouldRun = now.getDay() === 1;
      }

      console.log(`Event: ${context.eventName}`);
      console.log(`Days since last activity: ${daysSinceActivity}`);
      console.log(`Should run: ${shouldRun}`);
      console.log(`Summary type: ${summaryType}`);
      core.setOutput('should_run', shouldRun);
      core.setOutput('days_inactive', daysSinceActivity);
      core.setOutput('summary_type', summaryType);
# The working tree is only fetched when the activity check approved this run.
- name: Checkout repository
  uses: actions/checkout@v4
  if: steps.activity_check.outputs.should_run == 'true'
# Provides the interpreter for the .conductor/scripts/*.py steps below.
# setup-python v5 replaces v4, which targets the retired Node 16 action
# runtime; the python-version input is unchanged.
- name: Setup Python
  if: steps.activity_check.outputs.should_run == 'true'
  uses: actions/setup-python@v5
  with:
    python-version: '3.11'
- name: Install dependencies
  if: steps.activity_check.outputs.should_run == 'true'
  # NOTE(review): this installs the PyPI package named `jq`; the later
  # `jq -r ...` calls in this workflow invoke a jq executable. Confirm that
  # both are intended and that the executable is present on the runner.
  run: pip install pyyaml jq
- name: Setup GitHub CLI with proper token
  if: steps.activity_check.outputs.should_run == 'true'
  env:
    # Use CONDUCTOR_GITHUB_TOKEN for all operations.
    # IMPORTANT: the variable is deliberately not named GH_TOKEN, because
    # setting GH_TOKEN conflicts with `gh auth login`.
    CONDUCTOR_GITHUB_TOKEN: ${{ secrets.CONDUCTOR_GITHUB_TOKEN }}
  run: |
    # The token reaches gh on stdin straight from the environment: it is
    # never pasted into the script text and never written to a file that
    # would be left behind if the login command failed.
    printf '%s\n' "$CONDUCTOR_GITHUB_TOKEN" | gh auth login --with-token
    gh auth status
- name: Ensure required labels exist
  if: steps.activity_check.outputs.should_run == 'true'
  env:
    GH_TOKEN: ${{ secrets.CONDUCTOR_GITHUB_TOKEN }}
  run: |
    # Each entry is "name|hex color|description".
    labels=(
      "conductor:task|0e8a16|Tasks for AI agents"
      "conductor:status|1d76db|System status tracking"
      "conductor:in-progress|fbca04|Task being worked on"
      "conductor:blocked|d93f0b|Task is blocked"
      "conductor:archived|c5def5|Completed and archived"
      "conductor:alert|e11d21|System health alert"
      "conductor:init|7057ff|Initialization task for discovery"
      "code-review|5319e7|Code review task for pull requests"
      "needs-review|fbca04|PR needs code review"
      "skip-review|c5def5|Skip AI code review"
      "effort:small|76d7c4|Small effort task"
      "effort:medium|f39c12|Medium effort task"
      "effort:large|e74c3c|Large effort task"
      "priority:low|c5def5|Low priority"
      "priority:medium|fbca04|Medium priority"
      "priority:high|e11d21|High priority"
      "priority:critical|b60205|Critical priority - urgent"
      "skill:frontend|7057ff|Frontend development"
      "skill:backend|008672|Backend development"
      "skill:devops|0052cc|DevOps and infrastructure"
      "skill:ml|ff6b6b|Machine learning"
    )

    # Fetch the existing label names once. `gh label list` returns only 30
    # labels unless --limit is raised, which is fewer than this list plus a
    # repository's own labels. A failed listing yields an empty set, so every
    # label is then attempted (creation itself is best effort).
    existing_labels="$(gh label list --limit 1000 --json name --jq '.[].name' || true)"

    for label_info in "${labels[@]}"; do
      # Split on pipe characters
      IFS='|' read -r name color description <<< "$label_info"
      # -F -x: whole-line, fixed-string comparison, so a label that merely
      # starts with the same text does not count as already present.
      if grep -Fxq -- "$name" <<< "$existing_labels"; then
        continue
      fi
      gh label create "$name" --color "$color" --description "$description" || true
    done
- name: Validate configuration
  if: steps.activity_check.outputs.should_run == 'true'
  env:
    GH_TOKEN: ${{ secrets.CONDUCTOR_GITHUB_TOKEN }}
  # Best effort: a non-zero exit from the script is reported as a warning
  # message and does not fail the step.
  run: >-
    python .conductor/scripts/validate-config.py
    || echo "Config validation completed with warnings"
- name: Check system dependencies
  if: steps.activity_check.outputs.should_run == 'true'
  env:
    GH_TOKEN: ${{ secrets.CONDUCTOR_GITHUB_TOKEN }}
  # Best effort: a failing dependency check only prints the warning line.
  run: >-
    python .conductor/scripts/dependency-check.py
    || echo "Dependency check completed with warnings"
- name: Run health check
  if: steps.activity_check.outputs.should_run == 'true'
  env:
    GH_TOKEN: ${{ secrets.CONDUCTOR_GITHUB_TOKEN }}
    DAYS_INACTIVE: ${{ steps.activity_check.outputs.days_inactive }}
    # daily | weekly | monthly, as computed by the activity_check step.
    # Passed through the environment and quoted below so the value always
    # arrives as exactly one argument instead of being spliced into the
    # script text.
    SUMMARY_TYPE: ${{ steps.activity_check.outputs.summary_type }}
  run: |
    # Best effort: a failing health check only prints the warning line.
    python .conductor/scripts/health-check.py --summary-type "$SUMMARY_TYPE" || echo "Health check completed with warnings"
- name: Update system status
  if: steps.activity_check.outputs.should_run == 'true'
  env:
    GH_TOKEN: ${{ secrets.CONDUCTOR_GITHUB_TOKEN }}
  # Best effort: a failing status update only prints the warning line.
  run: >-
    python .conductor/scripts/update-status.py --no-comment
    || echo "Status update completed with warnings"
- name: Generate status summary
  if: steps.activity_check.outputs.should_run == 'true'
  env:
    GH_TOKEN: ${{ secrets.CONDUCTOR_GITHUB_TOKEN }}
  run: |
    # The script's stdout becomes this step's job summary. Append to the
    # quoted path (the documented usage) rather than truncating an unquoted
    # one. The fallback message goes to the step log, not the summary.
    python .conductor/scripts/generate-summary.py >> "$GITHUB_STEP_SUMMARY" || echo "Summary generation completed"
- name: Clean up stale work
  if: steps.activity_check.outputs.should_run == 'true'
  env:
    GH_TOKEN: ${{ secrets.CONDUCTOR_GITHUB_TOKEN }}
  # Best effort: a failing cleanup does not fail the step.
  run: >-
    python .conductor/scripts/cleanup-stale.py
    || echo "Cleanup completed"
- name: Archive completed tasks
  if: steps.activity_check.outputs.should_run == 'true'
  env:
    GH_TOKEN: ${{ secrets.CONDUCTOR_GITHUB_TOKEN }}
  # Best effort: a failing archival run does not fail the step.
  run: >-
    python .conductor/scripts/archive-completed.py --max-age 7
    || echo "Archival completed"
- name: Check for critical issues
  if: steps.activity_check.outputs.should_run == 'true'
  id: critical_check
  env:
    # Supplied through the environment instead of being interpolated into
    # the script body, so the secret never becomes part of the script text.
    GH_TOKEN: ${{ secrets.CONDUCTOR_GITHUB_TOKEN }}
  run: |
    # Sets the step output critical_issues to 1 when either probe below
    # trips, otherwise 0.
    CRITICAL_ISSUES=0
    int_re='^[0-9]+$'
    num_re='^-?[0-9]+([.][0-9]+)?([eE][-+]?[0-9]+)?$'

    # Check for high number of stale agents
    STALE_COUNT=$(python .conductor/scripts/health-check.py --json 2>/dev/null | jq -r '.stale_agents // 0' 2>/dev/null || echo "0")
    # Empty or non-integer output (e.g. the script printed nothing) counts
    # as zero instead of making the numeric test below error out.
    if ! [[ "$STALE_COUNT" =~ $int_re ]]; then
      STALE_COUNT=0
    fi
    if [ "$STALE_COUNT" -gt 3 ]; then
      echo "⚠️ High stale agent count: $STALE_COUNT"
      CRITICAL_ISSUES=1
    fi

    # Check system health score. A missing health_score field still
    # defaults to 0, which counts as low.
    HEALTH_SCORE=$(python .conductor/scripts/update-status.py --json 2>/dev/null | jq -r '.health_score // 0' 2>/dev/null || echo "0")
    # The comparison is done with awk and only on a value that parses as a
    # number. The earlier bc-based form silently evaluated to "healthy"
    # whenever bc was unavailable.
    if [[ "$HEALTH_SCORE" =~ $num_re ]] && awk -v score="$HEALTH_SCORE" 'BEGIN { exit !(score < 0.5) }'; then
      echo "⚠️ Low health score: $HEALTH_SCORE"
      CRITICAL_ISSUES=1
    fi

    echo "critical_issues=$CRITICAL_ISSUES" >> "$GITHUB_OUTPUT"
- name: Create issue for critical problems
  if: steps.activity_check.outputs.should_run == 'true' && steps.critical_check.outputs.critical_issues == '1'
  uses: actions/github-script@v7
  with:
    github-token: ${{ secrets.CONDUCTOR_GITHUB_TOKEN }}
    script: |
      // Opens a single 'conductor:alert' issue when the critical check
      // tripped, unless one is already open or the repository has been idle
      // for two or more days.
      const { owner, repo } = context.repo;
      const title = '🚨 Conductor System Health Alert';
      const body = `
      ## 🎼 System Health Alert
      The automated health check has detected critical issues with the Code Conductor system.
      ### Issues Detected
      - High number of stale agents
      - Low system health score
      - Potential system performance degradation
      ### Recommended Actions
      1. Review stale agent cleanup: \`python .conductor/scripts/cleanup-stale.py\`
      2. Check system status: \`python .conductor/scripts/update-status.py\`
      3. Validate configuration: \`python .conductor/scripts/validate-config.py\`
      4. Review recent activity logs
      ### System Status
      Generated by: ${context.workflow} #${context.runNumber}
      Timestamp: ${new Date().toISOString()}
      _This issue was created automatically by the health monitoring system._
      `;

      // Look for alert issues that are still open.
      const { data: openAlerts } = await github.rest.issues.listForRepo({
        owner,
        repo,
        labels: 'conductor:alert',
        state: 'open'
      });

      // Alerts are suppressed once the repository has been inactive for 48h.
      const daysInactive = parseInt('${{ steps.activity_check.outputs.days_inactive }}');
      const shouldAlert = daysInactive < 2;

      // Nothing to do when an alert is already open or alerting is suppressed.
      if (openAlerts.length > 0 || !shouldAlert) {
        return;
      }

      await github.rest.issues.create({
        owner,
        repo,
        title,
        body,
        labels: ['conductor:alert', 'priority:high']
      });