Self-Hosted Runner Health Check #1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Diagnostic workflow: runs the same health-check steps once on every self-hosted runner listed in the matrix below. | |
| name: Self-Hosted Runner Health Check | |
| on: | |
| # Manual trigger only; no other events are configured. | |
| workflow_dispatch: | |
| jobs: | |
| health-check: | |
| # The job is skipped unless the repository owner is 'bluerobotics'. | |
| if: github.repository_owner == 'bluerobotics' | |
| strategy: | |
| # Keep checking the remaining runners even when one runner's job fails. | |
| fail-fast: false | |
| matrix: | |
| # One entry per runner: 'runner' feeds runs-on, 'name' is the job's display name. | |
| include: | |
| - runner: blueos-ci | |
| name: "BlueOS CI (ARM32)" | |
| - runner: pi4-builder2 | |
| name: "Pi4 Builder 2 (ARM32)" | |
| - runner: pi5-builder | |
| name: "Pi5 Builder (ARM64)" | |
| runs-on: ${{ matrix.runner }} | |
| name: ${{ matrix.name }} | |
| steps: | |
| - name: System Information | |
| run: | | |
| # Append a Markdown table of basic host facts (hostname, kernel, architecture, | |
| # OS name, uptime, current date) to the job summary. | |
| # The table is written through a single quoted redirect so a summary path | |
| # containing whitespace cannot split the redirect target. | |
| { | |
| echo "## System Information" | |
| echo "| Property | Value |" | |
| echo "|----------|-------|" | |
| echo "| Hostname | $(hostname) |" | |
| echo "| Kernel | $(uname -r) |" | |
| echo "| Architecture | $(uname -m) |" | |
| # The PRETTY_NAME line has the form KEY="value": keep the text between the first pair of double quotes. | |
| echo "| OS | $(grep PRETTY_NAME /etc/os-release | cut -d'"' -f2) |" | |
| echo "| Uptime | $(uptime -p) |" | |
| echo "| Date | $(date) |" | |
| echo "" | |
| } >> "$GITHUB_STEP_SUMMARY" | |
| - name: Disk Usage | |
| run: | | |
| # List the mounted /dev filesystems in the job summary and warn when the | |
| # root filesystem is more than 85% full. | |
| { | |
| echo "## Disk Usage" | |
| echo '```' | |
| df -h | grep -E "^/dev|Filesystem" | |
| echo '```' | |
| echo "" | |
| } >> "$GITHUB_STEP_SUMMARY" | |
| # -P selects the POSIX output format (one line per filesystem), so the | |
| # usage percentage is always field 5 of the second line. | |
| DISK_USAGE=$(df -P / | awk 'NR == 2 { sub(/%/, "", $5); print $5 }') | |
| if ! printf '%s' "$DISK_USAGE" | grep -Eq '^[0-9]+$'; then | |
| # The value could not be parsed: say so instead of reporting a healthy disk. | |
| echo "::warning::Unable to determine root filesystem usage" | |
| echo "⚠️ **WARNING: Unable to determine disk usage**" >> "$GITHUB_STEP_SUMMARY" | |
| elif [ "$DISK_USAGE" -gt 85 ]; then | |
| echo "::warning::Disk usage is at ${DISK_USAGE}% - consider cleaning up" | |
| echo "⚠️ **WARNING: Disk usage is at ${DISK_USAGE}%**" >> "$GITHUB_STEP_SUMMARY" | |
| else | |
| echo "✅ Disk usage is at ${DISK_USAGE}%" >> "$GITHUB_STEP_SUMMARY" | |
| fi | |
| echo "" >> "$GITHUB_STEP_SUMMARY" | |
| - name: Memory Usage | |
| run: | | |
| # Show the human-readable memory table in the job summary and warn when | |
| # used memory exceeds 85% of total memory. | |
| { | |
| echo "## Memory Usage" | |
| echo '```' | |
| free -h | |
| echo '```' | |
| echo "" | |
| } >> "$GITHUB_STEP_SUMMARY" | |
| # A single free call supplies both numbers, so used and total come from the same snapshot. | |
| # LC_ALL=C keeps the "Mem:" row label untranslated. awk prints nothing when the | |
| # row is missing or total is 0, which avoids a division by zero. | |
| MEM_PERCENT=$(LC_ALL=C free | awk '/^Mem:/ && $2 > 0 { print int($3 * 100 / $2) }') | |
| if [ -z "$MEM_PERCENT" ]; then | |
| echo "::warning::Unable to determine memory usage" | |
| echo "⚠️ **WARNING: Unable to determine memory usage**" >> "$GITHUB_STEP_SUMMARY" | |
| elif [ "$MEM_PERCENT" -gt 85 ]; then | |
| echo "::warning::Memory usage is at ${MEM_PERCENT}%" | |
| echo "⚠️ **WARNING: Memory usage is at ${MEM_PERCENT}%**" >> "$GITHUB_STEP_SUMMARY" | |
| else | |
| echo "✅ Memory usage is at ${MEM_PERCENT}%" >> "$GITHUB_STEP_SUMMARY" | |
| fi | |
| echo "" >> "$GITHUB_STEP_SUMMARY" | |
| - name: CPU Information | |
| run: | | |
| # Append a CPU table (model, core count, load average and, when the thermal | |
| # zone file exists, temperature) to the job summary. | |
| { | |
| echo "## CPU Information" | |
| echo "| Property | Value |" | |
| echo "|----------|-------|" | |
| # awk reads /proc/cpuinfo directly, stops at the first matching line and trims | |
| # the value itself, so quote characters in a model string cannot break the trimming. | |
| echo "| CPU Model | $(awk -F: '/model name|Model/ { sub(/^[ \t]+/, "", $2); sub(/[ \t]+$/, "", $2); print $2; exit }' /proc/cpuinfo) |" | |
| echo "| CPU Cores | $(nproc) |" | |
| echo "| Load Average | $(cut -d' ' -f1-3 /proc/loadavg) |" | |
| } >> "$GITHUB_STEP_SUMMARY" | |
| # Check CPU temperature if available | |
| THERMAL_FILE=/sys/class/thermal/thermal_zone0/temp | |
| if [ -f "$THERMAL_FILE" ]; then | |
| TEMP=$(cat "$THERMAL_FILE") | |
| # The raw reading is divided by 1000 to obtain whole degrees Celsius. | |
| TEMP_C=$((TEMP / 1000)) | |
| echo "| CPU Temperature | ${TEMP_C}°C |" >> "$GITHUB_STEP_SUMMARY" | |
| if [ "$TEMP_C" -gt 70 ]; then | |
| echo "::warning::CPU temperature is ${TEMP_C}°C - consider improving cooling" | |
| fi | |
| fi | |
| echo "" >> "$GITHUB_STEP_SUMMARY" | |
| - name: Docker Status | |
| run: | | |
| # Summarize the Docker daemon in the job summary: service state, server | |
| # version, disk usage, running containers and image count. | |
| # Every command whose output belongs between code fences writes into the | |
| # summary file, so the fenced blocks are never left empty. | |
| echo "## Docker Status" >> "$GITHUB_STEP_SUMMARY" | |
| # Check if Docker is running | |
| if systemctl is-active --quiet docker; then | |
| echo "✅ Docker service is running" >> "$GITHUB_STEP_SUMMARY" | |
| else | |
| echo "❌ **Docker service is NOT running**" >> "$GITHUB_STEP_SUMMARY" | |
| # Workflow commands must go to stdout, not into the summary file. | |
| echo "::error::Docker service is not running" | |
| fi | |
| { | |
| echo "" | |
| echo "### Docker Version" | |
| echo '```' | |
| docker version --format '{{.Server.Version}}' 2>/dev/null || echo "Unable to get Docker version" | |
| echo '```' | |
| echo "" | |
| echo "### Docker Disk Usage" | |
| echo '```' | |
| docker system df 2>/dev/null || echo "Unable to get Docker disk usage" | |
| echo '```' | |
| echo "" | |
| echo "### Running Containers" | |
| } >> "$GITHUB_STEP_SUMMARY" | |
| CONTAINERS=$(docker ps -q 2>/dev/null | wc -l) | |
| echo "Running containers: $CONTAINERS" >> "$GITHUB_STEP_SUMMARY" | |
| if [ "$CONTAINERS" -gt 0 ]; then | |
| { | |
| echo '```' | |
| docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Size}}" 2>/dev/null || echo "Unable to list containers" | |
| echo '```' | |
| } >> "$GITHUB_STEP_SUMMARY" | |
| fi | |
| IMAGES=$(docker images -q 2>/dev/null | wc -l) | |
| { | |
| echo "" | |
| echo "### Docker Images" | |
| echo "Total images: $IMAGES" | |
| echo "" | |
| } >> "$GITHUB_STEP_SUMMARY" | |
| - name: QEMU/binfmt Status | |
| run: | | |
| # Report in the job summary whether binfmt_misc is mounted, which handlers | |
| # are registered, and whether the qemu-arm handler is among them. | |
| BINFMT_DIR=/proc/sys/fs/binfmt_misc | |
| echo "## QEMU/binfmt Status" >> "$GITHUB_STEP_SUMMARY" | |
| # Check binfmt_misc mount | |
| if mount | grep -q binfmt_misc; then | |
| echo "✅ binfmt_misc is mounted" >> "$GITHUB_STEP_SUMMARY" | |
| else | |
| echo "❌ **binfmt_misc is NOT mounted**" >> "$GITHUB_STEP_SUMMARY" | |
| echo "::warning::binfmt_misc is not mounted - cross-architecture builds may fail" | |
| fi | |
| # List registered binfmt handlers | |
| { | |
| echo "" | |
| echo "### Registered Handlers" | |
| } >> "$GITHUB_STEP_SUMMARY" | |
| if [ -d "$BINFMT_DIR" ]; then | |
| { | |
| echo '```' | |
| # The listing is written between the fences in the summary, capped at 20 lines. | |
| ls -la "$BINFMT_DIR/" 2>/dev/null | grep -v "^total" | head -20 | |
| echo '```' | |
| } >> "$GITHUB_STEP_SUMMARY" | |
| # Check for qemu-arm specifically | |
| if [ -f "$BINFMT_DIR/qemu-arm" ]; then | |
| echo "✅ qemu-arm handler is registered" >> "$GITHUB_STEP_SUMMARY" | |
| else | |
| echo "⚠️ qemu-arm handler is NOT registered" >> "$GITHUB_STEP_SUMMARY" | |
| fi | |
| else | |
| echo "binfmt_misc directory not found" >> "$GITHUB_STEP_SUMMARY" | |
| fi | |
| echo "" >> "$GITHUB_STEP_SUMMARY" | |
| - name: Loop Devices Status | |
| run: | | |
| # Count active loop devices; when any exist, emit a warning and list them | |
| # inside a code fence in the job summary. | |
| echo "## Loop Devices" >> "$GITHUB_STEP_SUMMARY" | |
| # Drop the header row before counting. wc -l ends the pipeline so an empty list still yields 0 with a zero exit status. | |
| LOOP_COUNT=$(losetup -l 2>/dev/null | grep -v "^NAME" | wc -l) | |
| echo "Active loop devices: $LOOP_COUNT" >> "$GITHUB_STEP_SUMMARY" | |
| if [ "$LOOP_COUNT" -gt 0 ]; then | |
| echo "::warning::Found $LOOP_COUNT active loop devices - may indicate incomplete cleanup" | |
| { | |
| echo '```' | |
| losetup -l 2>/dev/null | |
| echo '```' | |
| } >> "$GITHUB_STEP_SUMMARY" | |
| else | |
| echo "✅ No stale loop devices" >> "$GITHUB_STEP_SUMMARY" | |
| fi | |
| echo "" >> "$GITHUB_STEP_SUMMARY" | |
| - name: Runner Work Directory | |
| run: | | |
| # List the parent of the workspace directory in the job summary and warn | |
| # about specific leftover directories under its BlueOS subdirectory. | |
| WORK_DIR="${GITHUB_WORKSPACE}/.." | |
| { | |
| echo "## Runner Work Directory" | |
| echo "Work directory: $WORK_DIR" | |
| echo "" | |
| echo "### Directory Contents" | |
| echo '```' | |
| # The listing is written between the fences in the summary, capped at 20 lines. | |
| ls -la "$WORK_DIR" 2>/dev/null | head -20 | |
| echo '```' | |
| echo "" | |
| } >> "$GITHUB_STEP_SUMMARY" | |
| # Check for problematic directories | |
| for dir in unfor19-awscli .cache deploy; do | |
| if [ -d "$WORK_DIR/BlueOS/$dir" ]; then | |
| echo "⚠️ Found leftover directory: $dir" >> "$GITHUB_STEP_SUMMARY" | |
| echo "::warning::Found leftover directory: $dir" | |
| fi | |
| done | |
| - name: Network Connectivity | |
| run: | | |
| # Ping each key service once (5 second timeout) and record the result as a | |
| # table row in the job summary; unreachable hosts also raise a warning. | |
| # Each entry is "Display name=hostname". One ordered list is the single | |
| # source for both the row order and the host lookup. | |
| ENDPOINTS=( | |
| "GitHub=github.com" | |
| "Docker Hub=hub.docker.com" | |
| "AWS S3=s3.us-east-2.amazonaws.com" | |
| "Raspberry Pi Downloads=downloads.raspberrypi.org" | |
| ) | |
| { | |
| echo "## Network Connectivity" | |
| echo "| Service | Status |" | |
| echo "|---------|--------|" | |
| } >> "$GITHUB_STEP_SUMMARY" | |
| for entry in "${ENDPOINTS[@]}"; do | |
| # Split at the first '=': text before it is the label, text after it is the host. | |
| name="${entry%%=*}" | |
| host="${entry#*=}" | |
| if ping -c 1 -W 5 "$host" > /dev/null 2>&1; then | |
| echo "| $name | ✅ Reachable |" >> "$GITHUB_STEP_SUMMARY" | |
| else | |
| echo "| $name | ❌ Unreachable |" >> "$GITHUB_STEP_SUMMARY" | |
| echo "::warning::Cannot reach $name ($host)" | |
| fi | |
| done | |
| echo "" >> "$GITHUB_STEP_SUMMARY" | |
| - name: Python Environment | |
| run: | | |
| # Append a table with the Python, pip and AWS CLI versions found on the | |
| # runner to the job summary. | |
| # pip3 is probed with command -v before it is run: a fallback written as | |
| # "pip3 ... | cut ... || echo" would never fire, because the pipeline's exit | |
| # status is that of cut. | |
| PIP_VERSION='Not found' | |
| PIP_PATH='Not found' | |
| if command -v pip3 > /dev/null 2>&1; then | |
| PIP_PATH=$(command -v pip3) | |
| PIP_VERSION=$(pip3 --version 2>/dev/null | cut -d' ' -f2) | |
| fi | |
| { | |
| echo "## Python Environment" | |
| echo "| Property | Value |" | |
| echo "|----------|-------|" | |
| echo "| Python Version | $(python3 --version 2>/dev/null || echo 'Not found') |" | |
| # Falls back to 'Not found' when pip3 exists but printed no version. | |
| echo "| Pip Version | ${PIP_VERSION:-Not found} |" | |
| echo "| Pip Location | $PIP_PATH |" | |
| # Check if awscli is installed | |
| if command -v aws > /dev/null 2>&1; then | |
| echo "| AWS CLI | $(aws --version 2>/dev/null | cut -d' ' -f1) |" | |
| else | |
| echo "| AWS CLI | Not installed |" | |
| fi | |
| echo "" | |
| } >> "$GITHUB_STEP_SUMMARY" | |
| - name: Recent Errors in System Log | |
| run: | | |
| # Append the 50 most recent error-priority journal entries to the job | |
| # summary, inside a code fence. | |
| # journalctl is not piped into another command, so its own exit status | |
| # decides whether the fallback message is printed. | |
| # sudo -n never prompts for a password: without passwordless sudo it fails | |
| # immediately and the fallback message is shown instead. | |
| { | |
| echo "## Recent System Errors (last 50 lines)" | |
| echo '```' | |
| sudo -n journalctl -p err -n 50 --no-pager 2>/dev/null || echo "Unable to read system journal" | |
| echo '```' | |
| echo "" | |
| } >> "$GITHUB_STEP_SUMMARY" | |
| - name: Test Docker Pull | |
| run: | | |
| # Verify that the runner can pull from the registry by fetching the small | |
| # hello-world image; the image is removed again after a successful pull. | |
| # Redirect targets are quoted so a summary path containing whitespace still works. | |
| echo "## Docker Pull Test" >> "$GITHUB_STEP_SUMMARY" | |
| if docker pull hello-world > /dev/null 2>&1; then | |
| echo "✅ Docker pull works correctly" >> "$GITHUB_STEP_SUMMARY" | |
| # Best-effort cleanup: a failed removal must not fail the health check. | |
| docker rmi hello-world > /dev/null 2>&1 || true | |
| else | |
| echo "❌ **Docker pull failed**" >> "$GITHUB_STEP_SUMMARY" | |
| echo "::error::Docker pull test failed" | |
| fi | |
| echo "" >> "$GITHUB_STEP_SUMMARY" | |
| - name: Test QEMU Emulation | |
| run: | | |
| # Run a trivial command in a linux/arm/v7 Alpine container and record in the | |
| # job summary whether it succeeded. | |
| # The tested platform is the same on every runner; only the description | |
| # differs: on aarch64 hosts the ARM32 container is not native, on other hosts it is. | |
| TEST_PLATFORM="linux/arm/v7" | |
| case "$(uname -m)" in | |
| aarch64) | |
| TEST_DESC="ARM32 on ARM64" | |
| ;; | |
| *) | |
| TEST_DESC="ARM32 native" | |
| ;; | |
| esac | |
| { | |
| echo "## QEMU Emulation Test" | |
| echo "Testing: $TEST_DESC" | |
| } >> "$GITHUB_STEP_SUMMARY" | |
| if docker run --rm --platform "$TEST_PLATFORM" alpine:latest echo "QEMU test passed" > /dev/null 2>&1; then | |
| echo "✅ Platform emulation works ($TEST_PLATFORM)" >> "$GITHUB_STEP_SUMMARY" | |
| else | |
| echo "⚠️ Platform emulation may have issues ($TEST_PLATFORM)" >> "$GITHUB_STEP_SUMMARY" | |
| echo "::warning::QEMU emulation test failed for $TEST_PLATFORM" | |
| fi | |
| echo "" >> "$GITHUB_STEP_SUMMARY" | |
| - name: Health Summary | |
| run: | | |
| # Close the job summary with the runner's display name, its runner label | |
| # and a UTC timestamp. | |
| # All lines go through one quoted redirect so a summary path containing | |
| # whitespace cannot split the redirect target. | |
| { | |
| echo "## Health Check Complete" | |
| echo "" | |
| echo "Runner: **${{ matrix.name }}** (${{ matrix.runner }})" | |
| echo "Timestamp: $(date -u '+%Y-%m-%d %H:%M:%S UTC')" | |
| } >> "$GITHUB_STEP_SUMMARY" | |