Update CI Dashboard Data #602
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Refreshes the CI dashboard data, either on a schedule or on demand.
name: Update CI Dashboard Data

on:
  schedule:
    # Run every 3 hours
    - cron: '0 */3 * * *'
  workflow_dispatch:
    # Manual trigger (for "Refresh Now" button)
    inputs:
      reason:
        description: 'Reason for manual refresh'
        required: false
        default: 'Manual refresh'

# Serialize runs of this workflow: a manual refresh can overlap a scheduled
# run, and the update job finishes by rebasing onto and pushing to the same
# branch. Queue the newer run instead of cancelling the one in progress.
concurrency:
  group: update-ci-dashboard-data
  cancel-in-progress: false
jobs:
  update-data:
    runs-on: ubuntu-latest
    # Values produced by the step with id `process`, re-exported at job level
    # so that jobs declaring `needs: update-data` can read them.
    outputs:
      new_failures: ${{ steps.process.outputs.new_failures }}
      notifications: ${{ steps.process.outputs.notifications }}
    steps:
      - name: Checkout dashboard repo
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          # Quoted so the version is passed as a string, not a number.
          node-version: "20"

      - name: Install dependencies
        run: npm install

      - name: Load config
        run: |
          # Using local config.yaml for now
          echo "Using local config.yaml"
          # Print the config into the job log for reference.
          cat config.yaml
- name: Fetch workflow runs and jobs
  env:
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  run: |
    # Builds raw-runs.json ({"jobs": [...]}) from recent ci-nightly runs and
    # saves the log of every failed job as job-logs/<job_id>.log.
    # Uses bash features (mapfile, process substitution).

    # Fetch recent nightly workflow runs (last 10 days)
    echo "Fetching nightly workflow runs..."
    gh api \
      -H "Accept: application/vnd.github+json" \
      --paginate \
      "repos/kata-containers/kata-containers/actions/workflows/ci-nightly.yaml/runs?created=>$(date -d '10 days ago' +%Y-%m-%d)" \
      --jq '.workflow_runs' | jq -s 'add // []' > nightly-runs.json
    echo "Found $(jq 'length' nightly-runs.json) nightly runs"

    # For each nightly run, fetch ALL jobs (with pagination).
    # Only the first 15 runs in nightly-runs.json are inspected.
    echo "Fetching jobs for each run..."
    echo '[]' > all-jobs.json
    for run_id in $(jq -r '.[0:15] | .[].id' nightly-runs.json); do
      echo "Fetching jobs for run $run_id..."
      # Use filter=all to get jobs from ALL attempts (not just latest)
      # This is critical for tracking flaky tests that fail then pass on retry
      gh api \
        -H "Accept: application/vnd.github+json" \
        --paginate \
        "repos/kata-containers/kata-containers/actions/runs/$run_id/jobs?per_page=100&filter=all" \
        --jq '.jobs[]' | \
        jq -s --arg run_id "$run_id" '[.[] | . + {workflow_run_id: $run_id}]' > run-jobs.json
      echo " Found $(jq 'length' run-jobs.json) jobs"
      # Merge this run's jobs into the accumulated list
      jq -s 'add' all-jobs.json run-jobs.json > temp-jobs.json
      mv temp-jobs.json all-jobs.json
    done

    # Create final format. Let jq wrap the array instead of concatenating
    # strings, so raw-runs.json is always well-formed JSON.
    jq '{jobs: .}' all-jobs.json > raw-runs.json
    echo "Fetched $(jq '.jobs | length' raw-runs.json) jobs total"

    # Show found jobs (first 30 only, to keep the log readable)
    echo "Jobs found:"
    jq '.jobs[0:30] | .[] | {name: .name, conclusion: .conclusion, started_at: .started_at}' raw-runs.json

    # Fetch logs for ALL failed jobs to extract test failure details
    echo ""
    echo "Fetching logs for failed jobs..."
    mkdir -p job-logs
    # Query the failed job ids once and reuse them for the count and the loop.
    mapfile -t failed_job_ids < <(jq -r '.jobs[] | select(.conclusion == "failure") | .id' raw-runs.json)
    echo "Found ${#failed_job_ids[@]} failed jobs to fetch logs for"
    for job_id in "${failed_job_ids[@]}"; do
      echo "Fetching logs for job $job_id..."
      # GitHub logs API returns a 302 redirect to a signed URL
      # Use curl with -L to follow redirects and get the actual log content
      curl -sL \
        -H "Authorization: token $GH_TOKEN" \
        -H "Accept: application/vnd.github+json" \
        "https://api.github.com/repos/kata-containers/kata-containers/actions/jobs/$job_id/logs" \
        -o "job-logs/$job_id.log" 2>&1

      # Check if we got actual log content (not an error message)
      if [ -f "job-logs/$job_id.log" ]; then
        size=$(wc -c < "job-logs/$job_id.log")
        echo " Log file size: $size bytes"
        # Check if it's actually log content (should be > 1KB and contain common log patterns)
        if [ "$size" -lt 1000 ]; then
          echo " ⚠️ WARNING: Log file seems too small, might be an error response"
          echo " Content preview:"
          head -3 "job-logs/$job_id.log" | sed 's/^/ /'
        elif ! grep -q "not ok\|ok \|TAP\|bats\|Running" "job-logs/$job_id.log" 2>/dev/null; then
          echo " ⚠️ WARNING: Log doesn't contain expected TAP/bats output patterns"
          echo " First 10 lines:"
          head -10 "job-logs/$job_id.log" | sed 's/^/ /'
        else
          echo " ✓ Log appears valid (contains TAP/bats patterns)"
          # Count "not ok" lines for quick verification.
          # grep -c already prints 0 when nothing matches but exits 1, so
          # `|| echo "0"` would produce "0" twice; only neutralize the status.
          not_ok_count=$(grep -c "not ok" "job-logs/$job_id.log" 2>/dev/null || true)
          echo " Found ${not_ok_count:-0} 'not ok' lines"
        fi
      else
        echo " ✗ Failed to create log file"
      fi
    done
    echo "Log files fetched: $(ls job-logs/ 2>/dev/null | wc -l)"
    echo "Total log size: $(du -sh job-logs/ 2>/dev/null | cut -f1)"
- name: Fetch s390x nightly workflow runs and jobs
  env:
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  run: |
    # Appends the jobs of recent ci-nightly-s390x runs to all-jobs.json and
    # regenerates raw-runs.json from the merged list. Expects all-jobs.json
    # to exist already.
    since="$(date -d '10 days ago' +%Y-%m-%d)"
    api_base="repos/kata-containers/kata-containers/actions"

    # Fetch s390x nightly workflow runs (last 10 days)
    echo "Fetching s390x nightly workflow runs..."
    gh api -H "Accept: application/vnd.github+json" --paginate \
      "${api_base}/workflows/ci-nightly-s390x.yaml/runs?created=>${since}" \
      --jq '.workflow_runs' \
      | jq -s 'add // []' > s390x-runs.json
    echo "Found $(jq 'length' s390x-runs.json) s390x nightly runs"

    # Fetch jobs for each run (at most the first 15 runs listed)
    echo '[]' > s390x-jobs.json
    mapfile -t s390x_run_ids < <(jq -r '.[0:15] | .[].id' s390x-runs.json)
    for id in "${s390x_run_ids[@]}"; do
      echo "Fetching jobs for s390x run $id..."
      # filter=all returns the jobs of every attempt, not only the latest one.
      # Each job is tagged with its run id and originating workflow.
      gh api -H "Accept: application/vnd.github+json" --paginate \
        "${api_base}/runs/${id}/jobs?per_page=100&filter=all" \
        --jq '.jobs[]' \
        | jq -s --arg run_id "$id" \
            '[.[] | . + {workflow_run_id: $run_id, source_workflow: "ci-nightly-s390x"}]' \
        > run-jobs.json
      echo " Found $(jq 'length' run-jobs.json) jobs"
      jq -s 'add' s390x-jobs.json run-jobs.json > temp-jobs.json
      mv temp-jobs.json s390x-jobs.json
    done
    echo "Fetched $(jq 'length' s390x-jobs.json) s390x jobs total"

    # Merge s390x jobs into all-jobs.json
    jq -s 'add' all-jobs.json s390x-jobs.json > temp-jobs.json
    mv temp-jobs.json all-jobs.json

    # Recreate raw-runs.json with merged data
    {
      echo '{"jobs":'
      cat all-jobs.json
      echo '}'
    } > raw-runs.json
    echo "Total jobs after s390x merge: $(jq '.jobs | length' raw-runs.json)"
- name: Fetch required tests from gatekeeper
  run: |
    # Fetch the authoritative list of required tests.
    # -f makes curl fail on an HTTP error instead of saving the error page
    # (e.g. "404: Not Found") as required-tests.yaml; --retry covers
    # transient failures; -S keeps error messages visible despite -s.
    curl -fsSL --retry 3 \
      "https://raw.githubusercontent.com/kata-containers/kata-containers/refs/heads/main/tools/testing/gatekeeper/required-tests.yaml" \
      -o required-tests.yaml
    echo "Downloaded required-tests.yaml ($(wc -c < required-tests.yaml) bytes)"
- name: Fetch CoCo Charts E2E test data
  env:
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  run: |
    # Collects jobs of recent scheduled e2e-tests runs into
    # coco-charts-jobs.json and saves the log of every failed job as
    # coco-charts-logs/<job_id>.log.
    echo "Fetching CoCo Charts E2E workflow runs (scheduled only)..."
    # Fetch recent scheduled workflow runs from confidential-containers/charts
    gh api \
      -H "Accept: application/vnd.github+json" \
      --paginate \
      "repos/confidential-containers/charts/actions/workflows/e2e-tests.yaml/runs?event=schedule&created=>$(date -d '10 days ago' +%Y-%m-%d)" \
      --jq '.workflow_runs' | jq -s 'add // []' > coco-charts-runs.json
    echo "Found $(jq 'length' coco-charts-runs.json) CoCo Charts scheduled runs"

    # Fetch jobs for each run (at most the first 15 runs listed)
    echo '[]' > coco-charts-jobs.json
    for run_id in $(jq -r '.[0:15] | .[].id' coco-charts-runs.json); do
      echo "Fetching jobs for CoCo Charts run $run_id..."
      # Use filter=all to get jobs from ALL attempts
      gh api \
        -H "Accept: application/vnd.github+json" \
        --paginate \
        "repos/confidential-containers/charts/actions/runs/$run_id/jobs?per_page=100&filter=all" \
        --jq '.jobs[]' | \
        jq -s --arg run_id "$run_id" '[.[] | . + {workflow_run_id: $run_id, source_repo: "confidential-containers/charts"}]' > run-jobs.json
      echo " Found $(jq 'length' run-jobs.json) jobs"
      jq -s 'add' coco-charts-jobs.json run-jobs.json > temp-jobs.json
      mv temp-jobs.json coco-charts-jobs.json
    done
    echo "Fetched $(jq 'length' coco-charts-jobs.json) CoCo Charts jobs total"

    # Fetch logs for failed CoCo Charts jobs
    echo ""
    echo "Fetching logs for failed CoCo Charts jobs..."
    mkdir -p coco-charts-logs
    for job_id in $(jq -r '.[] | select(.conclusion == "failure") | .id' coco-charts-jobs.json); do
      echo "Fetching logs for CoCo Charts job $job_id..."
      # -L follows the redirect the logs endpoint answers with.
      curl -sL \
        -H "Authorization: token $GH_TOKEN" \
        -H "Accept: application/vnd.github+json" \
        "https://api.github.com/repos/confidential-containers/charts/actions/jobs/$job_id/logs" \
        -o "coco-charts-logs/$job_id.log" 2>&1
      if [ -f "coco-charts-logs/$job_id.log" ]; then
        size=$(wc -c < "coco-charts-logs/$job_id.log")
        echo " Log file size: $size bytes"
        # Check for Go test FAIL patterns.
        # grep -c already prints 0 when nothing matches but exits 1, so
        # `|| echo "0"` would produce "0" twice; only neutralize the status.
        fail_count=$(grep -c "FAIL:" "coco-charts-logs/$job_id.log" 2>/dev/null || true)
        echo " Found ${fail_count:-0} 'FAIL:' lines (Go test failures)"
      fi
    done
    echo "CoCo Charts log files fetched: $(ls coco-charts-logs/ 2>/dev/null | wc -l)"
- name: Fetch CoCo Cloud API Adaptor E2E test data
  env:
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  run: |
    # Collects jobs of recent scheduled daily-e2e-tests runs into
    # coco-caa-jobs.json and saves the log of every failed job as
    # coco-caa-logs/<job_id>.log.
    echo "Fetching Cloud API Adaptor daily E2E workflow runs (scheduled only)..."
    # Fetch recent scheduled workflow runs from confidential-containers/cloud-api-adaptor
    gh api \
      -H "Accept: application/vnd.github+json" \
      --paginate \
      "repos/confidential-containers/cloud-api-adaptor/actions/workflows/daily-e2e-tests.yaml/runs?event=schedule&created=>$(date -d '10 days ago' +%Y-%m-%d)" \
      --jq '.workflow_runs' | jq -s 'add // []' > coco-caa-runs.json
    echo "Found $(jq 'length' coco-caa-runs.json) CAA scheduled runs"

    # Fetch jobs for each run (at most the first 15 runs listed)
    echo '[]' > coco-caa-jobs.json
    for run_id in $(jq -r '.[0:15] | .[].id' coco-caa-runs.json); do
      echo "Fetching jobs for CAA run $run_id..."
      # Use filter=all to get jobs from ALL attempts
      gh api \
        -H "Accept: application/vnd.github+json" \
        --paginate \
        "repos/confidential-containers/cloud-api-adaptor/actions/runs/$run_id/jobs?per_page=100&filter=all" \
        --jq '.jobs[]' | \
        jq -s --arg run_id "$run_id" '[.[] | . + {workflow_run_id: $run_id, source_repo: "confidential-containers/cloud-api-adaptor"}]' > run-jobs.json
      echo " Found $(jq 'length' run-jobs.json) jobs"
      jq -s 'add' coco-caa-jobs.json run-jobs.json > temp-jobs.json
      mv temp-jobs.json coco-caa-jobs.json
    done
    echo "Fetched $(jq 'length' coco-caa-jobs.json) CAA jobs total"

    # Fetch logs for failed CAA jobs
    echo ""
    echo "Fetching logs for failed CAA jobs..."
    mkdir -p coco-caa-logs
    for job_id in $(jq -r '.[] | select(.conclusion == "failure") | .id' coco-caa-jobs.json); do
      echo "Fetching logs for CAA job $job_id..."
      # -L follows the redirect the logs endpoint answers with.
      curl -sL \
        -H "Authorization: token $GH_TOKEN" \
        -H "Accept: application/vnd.github+json" \
        "https://api.github.com/repos/confidential-containers/cloud-api-adaptor/actions/jobs/$job_id/logs" \
        -o "coco-caa-logs/$job_id.log" 2>&1
      if [ -f "coco-caa-logs/$job_id.log" ]; then
        size=$(wc -c < "coco-caa-logs/$job_id.log")
        echo " Log file size: $size bytes"
        # Check for Go test FAIL patterns.
        # grep -c already prints 0 when nothing matches but exits 1, so
        # `|| echo "0"` would produce "0" twice; only neutralize the status.
        fail_count=$(grep -c "FAIL:" "coco-caa-logs/$job_id.log" 2>/dev/null || true)
        echo " Found ${fail_count:-0} 'FAIL:' lines (Go test failures)"
      fi
    done
    echo "CAA log files fetched: $(ls coco-caa-logs/ 2>/dev/null | wc -l)"
- name: Process data
  id: process
  run: |
    # Process raw data into dashboard format using config
    # Also outputs new failures for notifications
    node scripts/process-data.js

    # Check if there are new failures to notify about.
    # notifications.json is presumably written by process-data.js when there
    # is something to report - TODO confirm.
    # The payload is compacted to one line first, so a malformed file is
    # detected here instead of publishing new_failures=true with an empty
    # notifications output.
    if [ -f notifications.json ] && payload=$(jq -c '.' notifications.json); then
      {
        echo "new_failures=true"
        echo "notifications=$payload"
      } >> "$GITHUB_OUTPUT"
    else
      if [ -f notifications.json ]; then
        echo "::warning::notifications.json is not valid JSON; skipping notifications"
      fi
      echo "new_failures=false" >> "$GITHUB_OUTPUT"
    fi
- name: Commit updated data
  run: |
    # Commits data.json when it changed and pushes it after rebasing onto main.
    git config user.name "github-actions[bot]"
    git config user.email "github-actions[bot]@users.noreply.github.com"
    git add data.json

    # Nothing staged means data.json is unchanged: finish the step here.
    if git diff --cached --quiet; then
      echo "No changes to commit"
      exit 0
    fi

    git commit -m "Update dashboard data [$(date -u +%Y-%m-%dT%H:%M:%SZ)]"

    # Stash any unstaged changes (temp files from processing) so the rebase
    # below starts from a clean working tree.
    git stash --include-untracked || true

    # Rebase to avoid conflicts with other concurrent jobs
    git pull --rebase origin main
    git push

    # Drop stash (we don't need the temp files)
    git stash drop || true
# Sends Slack notifications; runs only when the update-data job reported
# new_failures == 'true'.
notify-slack:
  needs: update-data
  if: ${{ needs.update-data.outputs.new_failures == 'true' }}
  runs-on: ubuntu-latest
  steps:
    - name: Checkout dashboard repo
      uses: actions/checkout@v4

    - name: Load config
      run: |
        # Using local config.yaml
        echo "Using local config.yaml"
- name: Send DM to maintainers for failures
  env:
    # All workspace tokens
    SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
    SLACK_BOT_TOKEN_NVIDIA: ${{ secrets.SLACK_BOT_TOKEN_NVIDIA }}
    SLACK_BOT_TOKEN_CNCF: ${{ secrets.SLACK_BOT_TOKEN_CNCF }}
    SLACK_BOT_TOKEN_INTEL: ${{ secrets.SLACK_BOT_TOKEN_INTEL }}
    NOTIFICATIONS: ${{ needs.update-data.outputs.notifications }}
  run: |
    # For every notification of type "new_failure", sends one Slack DM per
    # entry in its maintainer_contacts list.

    # Function to get token for a workspace.
    # $1: workspace name; anything other than nvidia/cncf/intel maps to the
    # default token. Prints the token (possibly empty) on stdout.
    get_token() {
      local workspace=$1
      case "$workspace" in
        "nvidia") echo "$SLACK_BOT_TOKEN_NVIDIA" ;;
        "cncf") echo "$SLACK_BOT_TOKEN_CNCF" ;;
        "intel") echo "$SLACK_BOT_TOKEN_INTEL" ;;
        *) echo "$SLACK_BOT_TOKEN" ;; # default
      esac
    }

    # Send direct messages to maintainers for new failures
    echo "$NOTIFICATIONS" | jq -c '.[] | select(.type == "new_failure")' | while read -r notification; do
      section=$(echo "$notification" | jq -r '.section')
      test_name=$(echo "$notification" | jq -r '.test_name')
      error=$(echo "$notification" | jq -r '.error')
      run_url=$(echo "$notification" | jq -r '.run_url')

      # Process each maintainer with their workspace
      echo "$notification" | jq -c '.maintainer_contacts[]' 2>/dev/null | while read -r contact; do
        slack_id=$(echo "$contact" | jq -r '.slack_id')
        workspace=$(echo "$contact" | jq -r '.workspace // "default"')
        if [ -n "$slack_id" ] && [ "$slack_id" != "null" ]; then
          token=$(get_token "$workspace")
          if [ -n "$token" ]; then
            echo "Sending DM to $slack_id in workspace $workspace about $test_name"

            # Build the payload with jq so that quotes, backslashes and
            # newlines in the test name or error text are escaped as JSON;
            # interpolating them into a heredoc yields invalid JSON.
            # The error text is cut to 2900 characters to stay under Slack's
            # 3000-character limit for section text.
            payload=$(jq -n \
              --arg channel "$slack_id" \
              --arg section "$section" \
              --arg test_name "$test_name" \
              --arg error "$error" \
              --arg run_url "$run_url" \
              '{
                channel: $channel,
                blocks: [
                  {
                    type: "header",
                    text: {type: "plain_text", text: "🔴 Nightly Test Failure", emoji: true}
                  },
                  {
                    type: "section",
                    fields: [
                      {type: "mrkdwn", text: "*Section:*\n\($section)"},
                      {type: "mrkdwn", text: "*Test:*\n`\($test_name)`"}
                    ]
                  },
                  {
                    type: "section",
                    text: {type: "mrkdwn", text: "*Failed Step:*\n\($error | .[0:2900])"}
                  },
                  {
                    type: "actions",
                    elements: [
                      {
                        type: "button",
                        text: {type: "plain_text", text: "🔗 View Run", emoji: true},
                        url: $run_url,
                        style: "danger"
                      },
                      {
                        type: "button",
                        text: {type: "plain_text", text: "📊 Dashboard", emoji: true},
                        url: "https://kata-containers.github.io/ci-dashboard/"
                      }
                    ]
                  }
                ]
              }')

            # One failed delivery must not stop the remaining DMs, so a curl
            # error is turned into an empty response and reported below.
            response=$(curl -sS -X POST "https://slack.com/api/chat.postMessage" \
              -H "Authorization: Bearer $token" \
              -H "Content-Type: application/json; charset=utf-8" \
              --data "$payload") || response='{}'

            # Slack signals failure through "ok": false in the response body.
            if [ "$(echo "$response" | jq -r '.ok // false' 2>/dev/null)" != "true" ]; then
              echo "::warning::Slack DM to $slack_id failed: $(echo "$response" | jq -r '.error // "unknown error"' 2>/dev/null)"
            fi
            sleep 1 # Rate limiting
          else
            echo "No token configured for workspace: $workspace"
          fi
        fi
      done
    done
- name: Send recovery DMs to maintainers
  env:
    SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
    SLACK_BOT_TOKEN_NVIDIA: ${{ secrets.SLACK_BOT_TOKEN_NVIDIA }}
    SLACK_BOT_TOKEN_CNCF: ${{ secrets.SLACK_BOT_TOKEN_CNCF }}
    SLACK_BOT_TOKEN_INTEL: ${{ secrets.SLACK_BOT_TOKEN_INTEL }}
    NOTIFICATIONS: ${{ needs.update-data.outputs.notifications }}
  run: |
    # For every notification of type "recovery", sends one Slack DM per entry
    # in its maintainer_contacts list.

    # $1: workspace name; anything other than nvidia/cncf/intel maps to the
    # default token. Prints the token (possibly empty) on stdout.
    get_token() {
      local workspace=$1
      case "$workspace" in
        "nvidia") echo "$SLACK_BOT_TOKEN_NVIDIA" ;;
        "cncf") echo "$SLACK_BOT_TOKEN_CNCF" ;;
        "intel") echo "$SLACK_BOT_TOKEN_INTEL" ;;
        *) echo "$SLACK_BOT_TOKEN" ;;
      esac
    }

    # Send DMs for section recovery
    echo "$NOTIFICATIONS" | jq -c '.[] | select(.type == "recovery")' | while read -r notification; do
      section=$(echo "$notification" | jq -r '.section')
      echo "$notification" | jq -c '.maintainer_contacts[]' 2>/dev/null | while read -r contact; do
        slack_id=$(echo "$contact" | jq -r '.slack_id')
        workspace=$(echo "$contact" | jq -r '.workspace // "default"')
        if [ -n "$slack_id" ] && [ "$slack_id" != "null" ]; then
          token=$(get_token "$workspace")
          if [ -n "$token" ]; then
            echo "Sending recovery DM to $slack_id in workspace $workspace about $section"

            # Build the payload with jq so a section name containing quotes
            # or backslashes is escaped as JSON.
            payload=$(jq -n \
              --arg channel "$slack_id" \
              --arg section "$section" \
              '{channel: $channel, text: "☀️ *\($section)* is back to 100% passing!"}')

            # One failed delivery must not stop the remaining DMs, so a curl
            # error is turned into an empty response and reported below.
            response=$(curl -sS -X POST "https://slack.com/api/chat.postMessage" \
              -H "Authorization: Bearer $token" \
              -H "Content-Type: application/json; charset=utf-8" \
              --data "$payload") || response='{}'

            # Slack signals failure through "ok": false in the response body.
            if [ "$(echo "$response" | jq -r '.ok // false' 2>/dev/null)" != "true" ]; then
              echo "::warning::Slack recovery DM to $slack_id failed: $(echo "$response" | jq -r '.error // "unknown error"' 2>/dev/null)"
            fi
            sleep 1 # Rate limiting
          else
            echo "No token configured for workspace: $workspace"
          fi
        fi
      done
    done