Update CI Dashboard Data #602

Workflow file for this run

.github/workflows/update-data.yml at bf5c276

	name: Update CI Dashboard Data

	# Triggers: a fixed 3-hourly schedule plus an on-demand manual run.
	on:
	  schedule:
	    # Run every 3 hours (minute 0 of every third hour).
	    # A cron expression needs five fields: minute hour day-of-month month day-of-week.
	    - cron: '0 */3 * * *'

	  # Manual trigger (for "Refresh Now" button)
	  workflow_dispatch:
	    inputs:
	      reason:
	        description: 'Reason for manual refresh'
	        required: false
	        default: 'Manual refresh'

	jobs:
	  # Gathers CI run/job data, processes it with scripts/process-data.js and
	  # commits the resulting data.json back to this repository.
	  update-data:
	    runs-on: ubuntu-latest

	    # Read by the notify-slack job via needs.update-data.outputs.*
	    outputs:
	      new_failures: ${{ steps.process.outputs.new_failures }}
	      notifications: ${{ steps.process.outputs.notifications }}

	    steps:
	      - name: Checkout dashboard repo
	        uses: actions/checkout@v4

	      - name: Setup Node.js
	        uses: actions/setup-node@v4
	        with:
	          # Quoted so the version stays a string
	          node-version: '20'

	      - name: Install dependencies
	        run: npm install

	      # Prints the checked-in config so it is visible in the run log.
	      - name: Load config
	        run: |
	          # Using local config.yaml for now
	          echo "Using local config.yaml"
	          cat config.yaml

	# Downloads the last 10 days of ci-nightly runs, the jobs (all attempts) of the
	# first 15 runs returned, and the raw log of every failed job.
	# Files written: nightly-runs.json, all-jobs.json, raw-runs.json, job-logs/<job_id>.log
	- name: Fetch workflow runs and jobs
	  env:
	    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	  run: |
	    # Fetch recent nightly workflow runs (last 10 days)
	    echo "Fetching nightly workflow runs..."
	    gh api \
	      -H "Accept: application/vnd.github+json" \
	      --paginate \
	      "repos/kata-containers/kata-containers/actions/workflows/ci-nightly.yaml/runs?created=>$(date -d '10 days ago' +%Y-%m-%d)" \
	      --jq '.workflow_runs' | jq -s 'add // []' > nightly-runs.json

	    echo "Found $(jq 'length' nightly-runs.json) nightly runs"

	    # For each nightly run, fetch ALL jobs (with pagination)
	    echo "Fetching jobs for each run..."
	    echo '[]' > all-jobs.json

	    for run_id in $(jq -r '.[0:15] | .[].id' nightly-runs.json); do
	      echo "Fetching jobs for run $run_id..."

	      # Use filter=all to get jobs from ALL attempts (not just latest)
	      # This is critical for tracking flaky tests that fail then pass on retry
	      gh api \
	        -H "Accept: application/vnd.github+json" \
	        --paginate \
	        "repos/kata-containers/kata-containers/actions/runs/$run_id/jobs?per_page=100&filter=all" \
	        --jq '.jobs[]' | \
	        jq -s --arg run_id "$run_id" '[.[] | . + {workflow_run_id: $run_id}]' > run-jobs.json

	      echo " Found $(jq 'length' run-jobs.json) jobs"

	      # Merge this run's jobs into the accumulated array
	      jq -s 'add' all-jobs.json run-jobs.json > temp-jobs.json
	      mv temp-jobs.json all-jobs.json
	    done

	    # Create final format: {"jobs": [...]}
	    echo '{"jobs":' > raw-runs.json
	    cat all-jobs.json >> raw-runs.json
	    echo '}' >> raw-runs.json

	    echo "Fetched $(jq '.jobs | length' raw-runs.json) jobs total"

	    # Show found jobs
	    echo "Jobs found:"
	    jq '.jobs[0:30] | .[] | {name: .name, conclusion: .conclusion, started_at: .started_at}' raw-runs.json

	    # Fetch logs for ALL failed jobs to extract test failure details
	    echo ""
	    echo "Fetching logs for failed jobs..."
	    mkdir -p job-logs

	    failed_count=$(jq -r '.jobs[] | select(.conclusion == "failure") | .id' raw-runs.json | wc -l)
	    echo "Found $failed_count failed jobs to fetch logs for"

	    for job_id in $(jq -r '.jobs[] | select(.conclusion == "failure") | .id' raw-runs.json); do
	      echo "Fetching logs for job $job_id..."
	      log_file="job-logs/$job_id.log"

	      # GitHub logs API returns a 302 redirect to a signed URL
	      # Use curl with -L to follow redirects and get the actual log content
	      curl -sL \
	        -H "Authorization: token $GH_TOKEN" \
	        -H "Accept: application/vnd.github+json" \
	        "https://api.github.com/repos/kata-containers/kata-containers/actions/jobs/$job_id/logs" \
	        -o "$log_file" 2>&1

	      # Check if we got actual log content (not an error message)
	      if [ -f "$log_file" ]; then
	        size=$(wc -c < "$log_file")
	        echo " Log file size: $size bytes"

	        # Check if it's actually log content (should be > 1KB and contain common log patterns)
	        if [ "$size" -lt 1000 ]; then
	          echo " ⚠️ WARNING: Log file seems too small, might be an error response"
	          echo " Content preview:"
	          head -3 "$log_file" | sed 's/^/ /'
	        elif ! grep -qE 'not ok|ok |TAP|bats|Running' "$log_file" 2>/dev/null; then
	          echo " ⚠️ WARNING: Log doesn't contain expected TAP/bats output patterns"
	          echo " First 10 lines:"
	          head -10 "$log_file" | sed 's/^/ /'
	        else
	          echo " ✓ Log appears valid (contains TAP/bats patterns)"
	          # Count "not ok" lines for quick verification.
	          # grep -c already prints 0 when nothing matches but exits 1, so only
	          # neutralise the exit status instead of echoing a second "0".
	          not_ok_count=$(grep -c "not ok" "$log_file" 2>/dev/null || true)
	          echo " Found ${not_ok_count:-0} 'not ok' lines"
	        fi
	      else
	        echo " ✗ Failed to create log file"
	      fi
	    done

	    echo "Log files fetched: $(ls job-logs/ 2>/dev/null | wc -l)"
	    echo "Total log size: $(du -sh job-logs/ 2>/dev/null | cut -f1)"

	# Downloads the last 10 days of ci-nightly-s390x runs and the jobs (all attempts)
	# of the first 15 runs returned, tags each job with
	# source_workflow "ci-nightly-s390x", and merges them into all-jobs.json /
	# raw-runs.json. Expects all-jobs.json to already exist in the workspace.
	# No job logs are downloaded in this step.
	- name: Fetch s390x nightly workflow runs and jobs
	  env:
	    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	  run: |
	    # Fetch s390x nightly workflow runs (last 10 days)
	    echo "Fetching s390x nightly workflow runs..."
	    gh api \
	      -H "Accept: application/vnd.github+json" \
	      --paginate \
	      "repos/kata-containers/kata-containers/actions/workflows/ci-nightly-s390x.yaml/runs?created=>$(date -d '10 days ago' +%Y-%m-%d)" \
	      --jq '.workflow_runs' | jq -s 'add // []' > s390x-runs.json

	    echo "Found $(jq 'length' s390x-runs.json) s390x nightly runs"

	    # Fetch jobs for each run
	    echo '[]' > s390x-jobs.json

	    for run_id in $(jq -r '.[0:15] | .[].id' s390x-runs.json); do
	      echo "Fetching jobs for s390x run $run_id..."

	      # Use filter=all to get jobs from ALL attempts
	      gh api \
	        -H "Accept: application/vnd.github+json" \
	        --paginate \
	        "repos/kata-containers/kata-containers/actions/runs/$run_id/jobs?per_page=100&filter=all" \
	        --jq '.jobs[]' | \
	        jq -s --arg run_id "$run_id" '[.[] | . + {workflow_run_id: $run_id, source_workflow: "ci-nightly-s390x"}]' > run-jobs.json

	      echo " Found $(jq 'length' run-jobs.json) jobs"

	      # Append this run's jobs to the s390x accumulator
	      jq -s 'add' s390x-jobs.json run-jobs.json > temp-jobs.json
	      mv temp-jobs.json s390x-jobs.json
	    done

	    echo "Fetched $(jq 'length' s390x-jobs.json) s390x jobs total"

	    # Merge s390x jobs into all-jobs.json
	    jq -s 'add' all-jobs.json s390x-jobs.json > temp-jobs.json
	    mv temp-jobs.json all-jobs.json

	    # Recreate raw-runs.json with merged data
	    echo '{"jobs":' > raw-runs.json
	    cat all-jobs.json >> raw-runs.json
	    echo '}' >> raw-runs.json

	    echo "Total jobs after s390x merge: $(jq '.jobs | length' raw-runs.json)"

	# Downloads required-tests.yaml from the kata-containers main branch into the
	# workspace and prints its size in bytes.
	- name: Fetch required tests from gatekeeper
	  run: |
	    # Fetch the authoritative list of required tests
	    curl -sL "https://raw.githubusercontent.com/kata-containers/kata-containers/refs/heads/main/tools/testing/gatekeeper/required-tests.yaml" \
	      -o required-tests.yaml
	    # NOTE(review): curl runs without --fail, so an HTTP error body would be saved
	    # as required-tests.yaml; the byte count below is the only sanity signal.
	    echo "Downloaded required-tests.yaml ($(wc -c < required-tests.yaml) bytes)"

	# Downloads the last 10 days of scheduled e2e-tests.yaml runs from
	# confidential-containers/charts, the jobs (all attempts) of the first 15 runs
	# returned, and the raw log of every failed job.
	# Files written: coco-charts-runs.json, coco-charts-jobs.json, coco-charts-logs/<job_id>.log
	- name: Fetch CoCo Charts E2E test data
	  env:
	    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	  run: |
	    echo "Fetching CoCo Charts E2E workflow runs (scheduled only)..."

	    # Fetch recent scheduled workflow runs from confidential-containers/charts
	    gh api \
	      -H "Accept: application/vnd.github+json" \
	      --paginate \
	      "repos/confidential-containers/charts/actions/workflows/e2e-tests.yaml/runs?event=schedule&created=>$(date -d '10 days ago' +%Y-%m-%d)" \
	      --jq '.workflow_runs' | jq -s 'add // []' > coco-charts-runs.json

	    echo "Found $(jq 'length' coco-charts-runs.json) CoCo Charts scheduled runs"

	    # Fetch jobs for each run
	    echo '[]' > coco-charts-jobs.json

	    for run_id in $(jq -r '.[0:15] | .[].id' coco-charts-runs.json); do
	      echo "Fetching jobs for CoCo Charts run $run_id..."

	      # Use filter=all to get jobs from ALL attempts
	      gh api \
	        -H "Accept: application/vnd.github+json" \
	        --paginate \
	        "repos/confidential-containers/charts/actions/runs/$run_id/jobs?per_page=100&filter=all" \
	        --jq '.jobs[]' | \
	        jq -s --arg run_id "$run_id" '[.[] | . + {workflow_run_id: $run_id, source_repo: "confidential-containers/charts"}]' > run-jobs.json

	      echo " Found $(jq 'length' run-jobs.json) jobs"

	      # Append this run's jobs to the accumulator
	      jq -s 'add' coco-charts-jobs.json run-jobs.json > temp-jobs.json
	      mv temp-jobs.json coco-charts-jobs.json
	    done

	    echo "Fetched $(jq 'length' coco-charts-jobs.json) CoCo Charts jobs total"

	    # Fetch logs for failed CoCo Charts jobs
	    echo ""
	    echo "Fetching logs for failed CoCo Charts jobs..."
	    mkdir -p coco-charts-logs

	    for job_id in $(jq -r '.[] | select(.conclusion == "failure") | .id' coco-charts-jobs.json); do
	      echo "Fetching logs for CoCo Charts job $job_id..."
	      log_file="coco-charts-logs/$job_id.log"

	      # -L follows the redirect the logs endpoint answers with
	      curl -sL \
	        -H "Authorization: token $GH_TOKEN" \
	        -H "Accept: application/vnd.github+json" \
	        "https://api.github.com/repos/confidential-containers/charts/actions/jobs/$job_id/logs" \
	        -o "$log_file" 2>&1

	      if [ -f "$log_file" ]; then
	        size=$(wc -c < "$log_file")
	        echo " Log file size: $size bytes"

	        # Check for Go test FAIL patterns.
	        # grep -c prints 0 itself when nothing matches (and exits 1), so only
	        # neutralise the exit status rather than echoing a second "0".
	        fail_count=$(grep -c "FAIL:" "$log_file" 2>/dev/null || true)
	        echo " Found ${fail_count:-0} 'FAIL:' lines (Go test failures)"
	      fi
	    done

	    echo "CoCo Charts log files fetched: $(ls coco-charts-logs/ 2>/dev/null | wc -l)"

	# Downloads the last 10 days of scheduled daily-e2e-tests.yaml runs from
	# confidential-containers/cloud-api-adaptor, the jobs (all attempts) of the
	# first 15 runs returned, and the raw log of every failed job.
	# Files written: coco-caa-runs.json, coco-caa-jobs.json, coco-caa-logs/<job_id>.log
	- name: Fetch CoCo Cloud API Adaptor E2E test data
	  env:
	    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	  run: |
	    echo "Fetching Cloud API Adaptor daily E2E workflow runs (scheduled only)..."

	    # Fetch recent scheduled workflow runs from confidential-containers/cloud-api-adaptor
	    gh api \
	      -H "Accept: application/vnd.github+json" \
	      --paginate \
	      "repos/confidential-containers/cloud-api-adaptor/actions/workflows/daily-e2e-tests.yaml/runs?event=schedule&created=>$(date -d '10 days ago' +%Y-%m-%d)" \
	      --jq '.workflow_runs' | jq -s 'add // []' > coco-caa-runs.json

	    echo "Found $(jq 'length' coco-caa-runs.json) CAA scheduled runs"

	    # Fetch jobs for each run
	    echo '[]' > coco-caa-jobs.json

	    for run_id in $(jq -r '.[0:15] | .[].id' coco-caa-runs.json); do
	      echo "Fetching jobs for CAA run $run_id..."

	      # Use filter=all to get jobs from ALL attempts
	      gh api \
	        -H "Accept: application/vnd.github+json" \
	        --paginate \
	        "repos/confidential-containers/cloud-api-adaptor/actions/runs/$run_id/jobs?per_page=100&filter=all" \
	        --jq '.jobs[]' | \
	        jq -s --arg run_id "$run_id" '[.[] | . + {workflow_run_id: $run_id, source_repo: "confidential-containers/cloud-api-adaptor"}]' > run-jobs.json

	      echo " Found $(jq 'length' run-jobs.json) jobs"

	      # Append this run's jobs to the accumulator
	      jq -s 'add' coco-caa-jobs.json run-jobs.json > temp-jobs.json
	      mv temp-jobs.json coco-caa-jobs.json
	    done

	    echo "Fetched $(jq 'length' coco-caa-jobs.json) CAA jobs total"

	    # Fetch logs for failed CAA jobs
	    echo ""
	    echo "Fetching logs for failed CAA jobs..."
	    mkdir -p coco-caa-logs

	    for job_id in $(jq -r '.[] | select(.conclusion == "failure") | .id' coco-caa-jobs.json); do
	      echo "Fetching logs for CAA job $job_id..."
	      log_file="coco-caa-logs/$job_id.log"

	      # -L follows the redirect the logs endpoint answers with
	      curl -sL \
	        -H "Authorization: token $GH_TOKEN" \
	        -H "Accept: application/vnd.github+json" \
	        "https://api.github.com/repos/confidential-containers/cloud-api-adaptor/actions/jobs/$job_id/logs" \
	        -o "$log_file" 2>&1

	      if [ -f "$log_file" ]; then
	        size=$(wc -c < "$log_file")
	        echo " Log file size: $size bytes"

	        # Check for Go test FAIL patterns.
	        # grep -c prints 0 itself when nothing matches (and exits 1), so only
	        # neutralise the exit status rather than echoing a second "0".
	        fail_count=$(grep -c "FAIL:" "$log_file" 2>/dev/null || true)
	        echo " Found ${fail_count:-0} 'FAIL:' lines (Go test failures)"
	      fi
	    done

	    echo "CAA log files fetched: $(ls coco-caa-logs/ 2>/dev/null | wc -l)"

	# Runs scripts/process-data.js, then publishes two step outputs:
	#   new_failures  - "true" when notifications.json exists afterwards, else "false"
	#   notifications - notifications.json compacted to one line (only set when it exists)
	- name: Process data
	  id: process
	  run: |
	    # Process raw data into dashboard format using config
	    # Also outputs new failures for notifications
	    node scripts/process-data.js

	    # Check if there are new failures to notify about
	    if [ -f notifications.json ]; then
	      echo "new_failures=true" >> "$GITHUB_OUTPUT"
	      # -c keeps the JSON on a single line, as the key=value output format needs;
	      # the explicit "." filter avoids relying on jq's implicit default program.
	      echo "notifications=$(jq -c . notifications.json)" >> "$GITHUB_OUTPUT"
	    else
	      echo "new_failures=false" >> "$GITHUB_OUTPUT"
	    fi

	# Commits data.json as github-actions[bot] when it changed, rebases onto
	# origin/main and pushes. Does nothing when data.json is unchanged.
	- name: Commit updated data
	  run: |
	    git config user.name "github-actions[bot]"
	    git config user.email "github-actions[bot]@users.noreply.github.com"

	    git add data.json

	    # --quiet makes git diff exit 0 when nothing is staged
	    if git diff --staged --quiet; then
	      echo "No changes to commit"
	    else
	      git commit -m "Update dashboard data [$(date -u +%Y-%m-%dT%H:%M:%SZ)]"
	      # Stash any unstaged changes (temp files from processing)
	      git stash --include-untracked || true
	      # Rebase to avoid conflicts with other concurrent jobs
	      git pull --rebase origin main
	      git push
	      # Drop stash (we don't need the temp files)
	      git stash drop || true
	    fi

	# Sends Slack direct messages based on the notifications output of update-data.
	# Runs only when update-data set new_failures to 'true'.
	notify-slack:
	  needs: update-data
	  if: needs.update-data.outputs.new_failures == 'true'
	  runs-on: ubuntu-latest

	  steps:
	    - name: Checkout dashboard repo
	      uses: actions/checkout@v4

	    # Only logs which config is in use; nothing is read here.
	    - name: Load config
	      run: |
	        # Using local config.yaml
	        echo "Using local config.yaml"

	# For every notification of type "new_failure", posts a Block Kit DM to each
	# entry of its maintainer_contacts that has a slack_id, using the bot token of
	# that contact's workspace (falling back to SLACK_BOT_TOKEN).
	- name: Send DM to maintainers for failures
	  env:
	    # All workspace tokens
	    SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
	    SLACK_BOT_TOKEN_NVIDIA: ${{ secrets.SLACK_BOT_TOKEN_NVIDIA }}
	    SLACK_BOT_TOKEN_CNCF: ${{ secrets.SLACK_BOT_TOKEN_CNCF }}
	    SLACK_BOT_TOKEN_INTEL: ${{ secrets.SLACK_BOT_TOKEN_INTEL }}
	    NOTIFICATIONS: ${{ needs.update-data.outputs.notifications }}
	  run: |
	    # Function to get token for a workspace
	    get_token() {
	      local workspace=$1
	      case "$workspace" in
	        "nvidia") echo "$SLACK_BOT_TOKEN_NVIDIA" ;;
	        "cncf") echo "$SLACK_BOT_TOKEN_CNCF" ;;
	        "intel") echo "$SLACK_BOT_TOKEN_INTEL" ;;
	        *) echo "$SLACK_BOT_TOKEN" ;; # default
	      esac
	    }

	    # Send direct messages to maintainers for new failures
	    echo "$NOTIFICATIONS" | jq -c '.[] | select(.type == "new_failure")' | while read -r notification; do
	      section=$(echo "$notification" | jq -r '.section')
	      test_name=$(echo "$notification" | jq -r '.test_name')
	      error=$(echo "$notification" | jq -r '.error')
	      run_url=$(echo "$notification" | jq -r '.run_url')

	      # Process each maintainer with their workspace
	      echo "$notification" | jq -c '.maintainer_contacts[]' 2>/dev/null | while read -r contact; do
	        slack_id=$(echo "$contact" | jq -r '.slack_id')
	        workspace=$(echo "$contact" | jq -r '.workspace // "default"')

	        if [ -n "$slack_id" ] && [ "$slack_id" != "null" ]; then
	          token=$(get_token "$workspace")

	          if [ -n "$token" ]; then
	            echo "Sending DM to $slack_id in workspace $workspace about $test_name"

	            # Build the payload with jq so quotes, backslashes and newlines in
	            # the test name / error text are JSON-escaped instead of being
	            # pasted raw into a hand-written JSON document.
	            payload=$(jq -n \
	              --arg channel "$slack_id" \
	              --arg section "$section" \
	              --arg test_name "$test_name" \
	              --arg error "$error" \
	              --arg run_url "$run_url" \
	              '{
	                channel: $channel,
	                blocks: [
	                  {
	                    type: "header",
	                    text: {type: "plain_text", text: "🔴 Nightly Test Failure", emoji: true}
	                  },
	                  {
	                    type: "section",
	                    fields: [
	                      {type: "mrkdwn", text: ("Section:\n" + $section)},
	                      {type: "mrkdwn", text: ("Test:\n`" + $test_name + "`")}
	                    ]
	                  },
	                  {
	                    type: "section",
	                    text: {type: "mrkdwn", text: ("Failed Step:\n" + $error)}
	                  },
	                  {
	                    type: "actions",
	                    elements: [
	                      {
	                        type: "button",
	                        text: {type: "plain_text", text: "🔗 View Run", emoji: true},
	                        url: $run_url,
	                        style: "danger"
	                      },
	                      {
	                        type: "button",
	                        text: {type: "plain_text", text: "📊 Dashboard", emoji: true},
	                        url: "https://kata-containers.github.io/ci-dashboard/"
	                      }
	                    ]
	                  }
	                ]
	              }')

	            curl -s -X POST "https://slack.com/api/chat.postMessage" \
	              -H "Authorization: Bearer $token" \
	              -H "Content-Type: application/json" \
	              --data "$payload"
	            sleep 1 # Rate limiting
	          else
	            echo "No token configured for workspace: $workspace"
	          fi
	        fi
	      done
	    done

	# For every notification of type "recovery", posts a plain-text DM to each
	# entry of its maintainer_contacts that has a slack_id, using the bot token of
	# that contact's workspace (falling back to SLACK_BOT_TOKEN).
	- name: Send recovery DMs to maintainers
	  env:
	    SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
	    SLACK_BOT_TOKEN_NVIDIA: ${{ secrets.SLACK_BOT_TOKEN_NVIDIA }}
	    SLACK_BOT_TOKEN_CNCF: ${{ secrets.SLACK_BOT_TOKEN_CNCF }}
	    SLACK_BOT_TOKEN_INTEL: ${{ secrets.SLACK_BOT_TOKEN_INTEL }}
	    NOTIFICATIONS: ${{ needs.update-data.outputs.notifications }}
	  run: |
	    # Map a workspace name to its bot token; unknown names use the default token
	    get_token() {
	      local workspace=$1
	      case "$workspace" in
	        "nvidia") echo "$SLACK_BOT_TOKEN_NVIDIA" ;;
	        "cncf") echo "$SLACK_BOT_TOKEN_CNCF" ;;
	        "intel") echo "$SLACK_BOT_TOKEN_INTEL" ;;
	        *) echo "$SLACK_BOT_TOKEN" ;;
	      esac
	    }

	    # Send DMs for section recovery
	    echo "$NOTIFICATIONS" | jq -c '.[] | select(.type == "recovery")' | while read -r notification; do
	      section=$(echo "$notification" | jq -r '.section')

	      echo "$notification" | jq -c '.maintainer_contacts[]' 2>/dev/null | while read -r contact; do
	        slack_id=$(echo "$contact" | jq -r '.slack_id')
	        workspace=$(echo "$contact" | jq -r '.workspace // "default"')

	        if [ -n "$slack_id" ] && [ "$slack_id" != "null" ]; then
	          token=$(get_token "$workspace")

	          if [ -n "$token" ]; then
	            # jq JSON-escapes the section name; raw shell interpolation would
	            # produce invalid JSON if it contained quotes or backslashes.
	            payload=$(jq -n \
	              --arg channel "$slack_id" \
	              --arg section "$section" \
	              '{channel: $channel, text: ("☀️ " + $section + " is back to 100% passing!")}')

	            curl -s -X POST "https://slack.com/api/chat.postMessage" \
	              -H "Authorization: Bearer $token" \
	              -H "Content-Type: application/json" \
	              --data "$payload"
	          fi
	        fi
	      done
	    done

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Update CI Dashboard Data #602

Workflow file

Update CI Dashboard Data #602

Uh oh!

Workflow file for this run