# Check Links #107
name: Check Links

on:
  # Run weekday mornings (Mon-Fri at 10:00 UTC)
  schedule:
    - cron: '0 10 * * 1,2,3,4,5'
  workflow_dispatch:

permissions:
  contents: write
  pull-requests: write

jobs:
  check-links:
    name: Check links
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Create lychee config
        run: |
          cat > lychee.toml << 'EOF'
          # Lychee link checker configuration
          # https://lychee.cli.rs/

          # Accept these status codes as valid
          accept = [200, 204, 301, 302, 307, 308]

          # User agent to avoid being blocked
          user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

          # Exclude patterns (regex)
          exclude = [
            # Placeholder/example URLs
            "^https://example\\.com",
            "^https://github\\.com/owner/repo",
            # Analytics and tracking
            "^https://us\\.i\\.posthog\\.com",
            "^https://c\\.vialoops\\.com",
            "_vercel/(speed-)?insights",
            # Sites that block automated requests
            "^https://docs\\.stack-auth\\.com",
            "^https://www\\.linkedin\\.com",
            "^https://linkedin\\.com",
            "^https://twitter\\.com",
            "^https://x\\.com",
            "^https://www\\.npmjs\\.com",
            "^https://cdn\\.simpleicons\\.org",
            # All GitHub blob/tree/tag URLs - verified locally before lychee runs
            # This avoids 503 rate limiting from GitHub's automated request detection
            # Issues/PRs/compare links are still checked via HTTP (they rarely break)
            "^https://github\\.com/[^/]+/[^/]+/(blob|tree)/",
            "^https://github\\.com/[^/]+/[^/]+/releases/tag/",
            # Non-HTTP links
            "^mailto:",
            "^tel:",
            "^javascript:",
            # Anchor-only links
            "^#"
          ]
          EOF
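          # Illustrative sanity check (not run in CI): with a local lychee install,
          # something like
          #   lychee --config lychee.toml --dump urls.txt
          # lists the links lychee would check with the exclude patterns applied
          # (flag names per lychee's docs; verify against your installed version).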
      - name: Fetch sitemap and extract URLs
        run: |
          curl -s https://buildwithfern.com/learn/sitemap.xml | grep -oP '(?<=<loc>)[^<]+' > urls.txt
          echo "Found $(wc -l < urls.txt) URLs in sitemap"
      - name: Extract and verify GitHub blob/tree/tag URLs locally
        id: verify_github
        run: |
          # Extract all GitHub blob/tree/tag URLs from the repo source files
          # This is much faster than fetching all published pages via HTTP (~11 min -> seconds)
          # These URLs are excluded from lychee and verified locally instead to avoid 503 errors
          echo "Scanning repo for GitHub blob/tree/tag URLs..."
          start_time=$(date +%s)
          > github-urls.txt

          # Search the content directories for GitHub blob/tree URLs
          # Include fern/ (main docs) and README.md (root)
          # Exclude .git and any cloned repos
          grep -RhoE 'https://github\.com/[^/]+/[^/]+/(blob|tree)/[^"'"'"')<>[:space:]]+' \
            fern/ \
            README.md \
            --exclude-dir=.git \
            --exclude-dir=.github-repos \
            >> github-urls.txt 2>/dev/null || true

          # Also search for releases/tag URLs
          grep -RhoE 'https://github\.com/[^/]+/[^/]+/releases/tag/[^"'"'"')<>[:space:]]+' \
            fern/ \
            README.md \
            --exclude-dir=.git \
            --exclude-dir=.github-repos \
            >> github-urls.txt 2>/dev/null || true
          # Deduplicate URLs
          sort -u github-urls.txt -o github-urls.txt

          # Remove example/placeholder URLs (e.g., github.com/your-org/...)
          # These are documentation examples, not real repos to verify
          if [ -s github-urls.txt ]; then
            grep -v 'github\.com/your-org/' github-urls.txt > github-urls.filtered || true
            mv github-urls.filtered github-urls.txt
          fi

          total_urls=$(wc -l < github-urls.txt | tr -d ' ')
          end_time=$(date +%s)
          echo "Found $total_urls unique GitHub URLs to verify locally (took $((end_time - start_time))s)"

          if [ "$total_urls" -eq 0 ]; then
            echo "No GitHub URLs to verify"
            echo "verified_count=0" >> $GITHUB_OUTPUT
            echo "missing_count=0" >> $GITHUB_OUTPUT
            echo "has_missing=false" >> $GITHUB_OUTPUT
            exit 0
          fi

          # Extract unique repos that need to be cloned (org/repo format)
          grep -oE 'https://github\.com/[^/]+/[^/]+' github-urls.txt | sort -u > github-repos.txt
          echo "Repos to clone:"
          cat github-repos.txt

          # Clone each repo (shallow clone for efficiency)
          mkdir -p .github-repos
          while IFS= read -r repo_url; do
            [ -z "$repo_url" ] && continue
            # Extract org/repo for directory structure
            org_repo=$(echo "$repo_url" | sed -E 's#https://github\.com/##')
            org=$(echo "$org_repo" | cut -d'/' -f1)
            repo_name=$(echo "$org_repo" | cut -d'/' -f2)
            echo "Cloning $org/$repo_name..."
            mkdir -p ".github-repos/$org"
            # Use sparse checkout for efficiency - we only need to check if paths exist
            git clone --depth 1 --filter=blob:none --sparse "$repo_url.git" ".github-repos/$org/$repo_name" 2>/dev/null || {
              echo "Warning: Failed to clone $repo_url, will mark URLs from this repo as unverifiable"
              continue
            }
            # Fetch all tags for tag verification (lightweight fetch)
            cd ".github-repos/$org/$repo_name"
            git fetch --tags --depth 1 2>/dev/null || true
            cd ../../..
          done < github-repos.txt
          # Verify each URL by checking if the path exists in the cloned repo
          verified_count=0
          missing_count=0
          > github-missing.txt
          > github-verified.txt
          while IFS= read -r url; do
            [ -z "$url" ] && continue
            # Remove any URL fragments or query strings
            clean_url="${url%%#*}"
            clean_url="${clean_url%%\?*}"
            # Extract org, repo name, and determine URL type
            org=$(echo "$clean_url" | sed -E 's#https://github\.com/([^/]+)/.*#\1#')
            repo_name=$(echo "$clean_url" | sed -E 's#https://github\.com/[^/]+/([^/]+)/.*#\1#')

            # Check if we have the repo cloned
            if [ ! -d ".github-repos/$org/$repo_name" ]; then
              echo "Repo not cloned, cannot verify: $url"
              echo "- [UNVERIFIABLE] $url (repo clone failed)" >> github-missing.txt
              missing_count=$((missing_count + 1))
              continue
            fi

            cd ".github-repos/$org/$repo_name"

            # Check if this is a releases/tag URL
            if echo "$clean_url" | grep -qE '/releases/tag/'; then
              # Extract tag name
              tag_name=$(echo "$clean_url" | sed -E 's#.*/releases/tag/(.*)#\1#')
              # Check if tag exists
              if git tag -l "$tag_name" | grep -q "^${tag_name}$"; then
                echo "Verified (tag): $url -> tag $tag_name"
                echo "$url" >> ../../../github-verified.txt
                verified_count=$((verified_count + 1))
              else
                echo "MISSING (tag): $url -> tag $tag_name"
                echo "- [LOCAL_MISSING] $url (tag: $tag_name in $org/$repo_name)" >> ../../../github-missing.txt
                missing_count=$((missing_count + 1))
              fi
            else
              # This is a blob/tree URL - extract ref and path
              # Format: https://github.com/ORG/REPO/(blob|tree)/REF/PATH
              ref=$(echo "$clean_url" | sed -E 's#https://github\.com/[^/]+/[^/]+/(blob|tree)/([^/]+)/.*#\2#')
              rel_path=$(echo "$clean_url" | sed -E 's#https://github\.com/[^/]+/[^/]+/(blob|tree)/[^/]+/(.*)#\2#')
              # URL-decode the relative path (handles %5B -> [, %5D -> ], spaces, etc.)
              # This is needed because GitHub URLs encode special characters like [ and ]
              rel_path=$(python3 -c "import sys, urllib.parse; print(urllib.parse.unquote(sys.argv[1]))" "$rel_path")
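              # Example of the decode step (illustrative path):
              #   python3 -c "import urllib.parse; print(urllib.parse.unquote('pages/%5Bslug%5D.mdx'))"
              # prints pages/[slug].mdx, matching the path as it exists on disk.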
              # Handle HEAD ref specially
              if [ "$ref" = "HEAD" ]; then
                ref=$(git symbolic-ref refs/remotes/origin/HEAD 2>/dev/null | sed 's@^refs/remotes/origin/@@' || echo "main")
              fi
              # For specific commit SHAs, we need to fetch them
              if echo "$ref" | grep -qE '^[0-9a-f]{40}$'; then
                git fetch --depth 1 origin "$ref" 2>/dev/null || true
              fi
              # Check if path exists using git ls-tree (works with sparse checkout)
              # Use grep -qxF for literal string matching (paths may contain regex chars like [ and ])
              if git ls-tree -r --name-only "$ref" -- "$rel_path" 2>/dev/null | grep -qxF "$rel_path" || \
                 git ls-tree -d --name-only "$ref" -- "$rel_path" 2>/dev/null | grep -qxF "$rel_path"; then
                echo "Verified: $url -> $rel_path (ref: $ref)"
                echo "$url" >> ../../../github-verified.txt
                verified_count=$((verified_count + 1))
              else
                # Also check if it's a directory (tree link)
                if git ls-tree -d "$ref" -- "$rel_path" 2>/dev/null | grep -q .; then
                  echo "Verified (directory): $url -> $rel_path (ref: $ref)"
                  echo "$url" >> ../../../github-verified.txt
                  verified_count=$((verified_count + 1))
                else
                  echo "MISSING: $url -> $rel_path (ref: $ref)"
                  echo "- [LOCAL_MISSING] $url (path: $rel_path in $org/$repo_name, ref: $ref)" >> ../../../github-missing.txt
                  missing_count=$((missing_count + 1))
                fi
              fi
            fi
            cd ../../..
          done < github-urls.txt
| echo "" | |
| echo "=== GitHub URL Verification Summary ===" | |
| echo "Total URLs checked: $total_urls" | |
| echo "Verified locally: $verified_count" | |
| echo "Missing/unverifiable: $missing_count" | |
| echo "verified_count=$verified_count" >> $GITHUB_OUTPUT | |
| echo "missing_count=$missing_count" >> $GITHUB_OUTPUT | |
| if [ "$missing_count" -gt 0 ]; then | |
| echo "has_missing=true" >> $GITHUB_OUTPUT | |
| echo "" | |
| echo "Missing URLs:" | |
| cat github-missing.txt | |
| else | |
| echo "has_missing=false" >> $GITHUB_OUTPUT | |
| fi | |
| # Cleanup cloned repos | |
| rm -rf .github-repos | |
      - name: Extract GitHub URLs for HTTP checking from source files
        id: extract_github_http
        run: |
          # Extract GitHub URLs that need HTTP checking (issues, compare, commit, discussions, non-fern-api PRs)
          # These are extracted from source files and passed directly to lychee
          # This avoids the --include filter issue where lychee applies it to input pages, not discovered links
          echo "Scanning repo for GitHub URLs to check via HTTP..."
          > github-http-urls.txt

          # Search for issues URLs
          grep -RhoE 'https://github\.com/[^/]+/[^/]+/issues/[^"'"'"')<>[:space:]]+' \
            fern/ README.md \
            --exclude-dir=.git --exclude-dir=.github-repos \
            >> github-http-urls.txt 2>/dev/null || true

          # Search for compare URLs
          grep -RhoE 'https://github\.com/[^/]+/[^/]+/compare/[^"'"'"')<>[:space:]]+' \
            fern/ README.md \
            --exclude-dir=.git --exclude-dir=.github-repos \
            >> github-http-urls.txt 2>/dev/null || true

          # Search for commit/commits URLs
          grep -RhoE 'https://github\.com/[^/]+/[^/]+/commits?/[^"'"'"')<>[:space:]]+' \
            fern/ README.md \
            --exclude-dir=.git --exclude-dir=.github-repos \
            >> github-http-urls.txt 2>/dev/null || true

          # Search for discussions URLs
          grep -RhoE 'https://github\.com/[^/]+/[^/]+/discussions/[^"'"'"')<>[:space:]]+' \
            fern/ README.md \
            --exclude-dir=.git --exclude-dir=.github-repos \
            >> github-http-urls.txt 2>/dev/null || true

          # Search for pull request URLs (excluding fern-api org - those are too slow to check)
          grep -RhoE 'https://github\.com/[^/]+/[^/]+/pull/[^"'"'"')<>[:space:]]+' \
            fern/ README.md \
            --exclude-dir=.git --exclude-dir=.github-repos \
            2>/dev/null | grep -v 'github\.com/fern-api/' \
            >> github-http-urls.txt || true

          # Remove example/placeholder URLs (e.g., github.com/your-org/...)
          if [ -s github-http-urls.txt ]; then
            grep -v 'github\.com/your-org/' github-http-urls.txt > github-http-urls.filtered || true
            mv github-http-urls.filtered github-http-urls.txt
          fi

          # Deduplicate URLs
          sort -u github-http-urls.txt -o github-http-urls.txt

          total_urls=$(wc -l < github-http-urls.txt | tr -d ' ')
          echo "Found $total_urls unique GitHub URLs to check via HTTP"
          echo "github_http_count=$total_urls" >> $GITHUB_OUTPUT
          if [ "$total_urls" -gt 0 ]; then
            echo "URLs to check:"
            cat github-http-urls.txt
          fi

      - name: Upload URLs (early, for debugging)
        uses: actions/upload-artifact@v4
        with:
          name: urls
          path: |
            github-urls.txt
            github-http-urls.txt
            urls.txt
          if-no-files-found: ignore

      - name: Check GitHub links (very low concurrency to avoid 503 rate limiting)
        id: lychee_github
        if: steps.extract_github_http.outputs.github_http_count != '0'
        uses: lycheeverse/lychee-action@v2
        with:
          args: >-
            --no-progress
            --timeout 30
            --max-retries 5
            --retry-wait-time 20
            --max-concurrency 2
            github-http-urls.txt
          output: ./lychee-raw-github.md
          format: markdown
          fail: false
          jobSummary: false
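      # The step above is roughly equivalent to this local invocation (illustrative):
      #   lychee --no-progress --timeout 30 --max-retries 5 --retry-wait-time 20 \
      #     --max-concurrency 2 github-http-urls.txt
      # Concurrency 2 trades speed for fewer spurious 429/503s from GitHub.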
      - name: Check non-GitHub links (high concurrency)
        id: lychee_main
        uses: lycheeverse/lychee-action@v2
        with:
          args: >-
            --config lychee.toml
            --no-progress
            --timeout 30
            --max-retries 3
            --retry-wait-time 10
            --max-concurrency 20
            --exclude "^https://github\\.com/"
            --files-from urls.txt
          output: ./lychee-raw-main.md
          format: markdown
          fail: false
          jobSummary: false

      - name: Combine lychee outputs
        run: |
          # Combine both lychee runs into a single raw file for downstream processing
          : > lychee-raw.md
          [ -f lychee-raw-main.md ] && cat lychee-raw-main.md >> lychee-raw.md
          [ -f lychee-raw-github.md ] && cat lychee-raw-github.md >> lychee-raw.md
          echo "Combined lychee outputs into lychee-raw.md"
      - name: Extract 429 URLs and retry with exponential backoff
        id: retry429
        run: |
          # Extract all 429 URLs from the raw lychee report
          # Note: 403s are excluded from the broken links report but NOT retried (bot-blocking sites won't change)
          grep '\[429\]' ./lychee-raw.md | sed -E 's/.*<([^>]+)>.*/\1/' | sort -u > urls-429-all.txt || true
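          # Raw report lines look roughly like "* [429] <https://example.com/x> | ..."
          # (format assumed from lychee's markdown output); the sed above keeps only
          # the URL between the angle brackets.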
          # Initialize counters for repo-internal GitHub URLs
          verified_locally=0
          missing_locally=0

          # Process repo-internal GitHub URLs by checking if files exist locally
          # This avoids false positives from GitHub rate limiting while still catching real broken links
          > urls-429.txt
          > urls-429-repo-missing.txt
          while IFS= read -r url; do
            [ -z "$url" ] && continue
            # Check if this is a repo-internal GitHub URL (blob/main pattern)
            if echo "$url" | grep -qE '^https://github\.com/fern-api/docs/blob/main/'; then
              # Extract relative path from URL (strip prefix and query string)
              rel_path="${url#https://github.com/fern-api/docs/blob/main/}"
              rel_path="${rel_path%%\?*}"
              if [ -f "$rel_path" ]; then
                echo "Verified locally (file exists): $url -> $rel_path"
                verified_locally=$((verified_locally + 1))
              else
                echo "MISSING locally: $url -> $rel_path"
                echo "- [LOCAL_MISSING] $url (expected path: $rel_path)" >> urls-429-repo-missing.txt
                missing_locally=$((missing_locally + 1))
              fi
            else
              # Non-repo URL, add to retry list
              echo "$url" >> urls-429.txt
            fi
          done < urls-429-all.txt

          echo "Repo-internal GitHub URLs verified locally: $verified_locally"
          echo "Repo-internal GitHub URLs missing locally: $missing_locally"
          echo "verified_locally=$verified_locally" >> $GITHUB_OUTPUT
          echo "missing_locally=$missing_locally" >> $GITHUB_OUTPUT

          count=$(wc -l < urls-429.txt | tr -d ' ')
          echo "Found $count other URLs with 429 status to retry"
          echo "rate_limited_count=$count" >> $GITHUB_OUTPUT

          # Initialize still-failing file
          > urls-429-still-failing.txt
          # Add any locally-missing repo URLs to the still-failing list
          if [ -s urls-429-repo-missing.txt ]; then
            cat urls-429-repo-missing.txt >> urls-429-still-failing.txt
          fi

          if [ "$count" -eq "0" ] || [ ! -s urls-429.txt ]; then
            echo "No other 429 URLs to retry"
          else
            echo "Retrying $count URLs with exponential backoff..."
            while IFS= read -r url; do
              [ -z "$url" ] && continue
              delay=15
              max_attempts=4
              success=false
              for attempt in $(seq 1 $max_attempts); do
                echo "[$attempt/$max_attempts] Checking: $url"
                status=$(curl -s -o /dev/null -w "%{http_code}" \
                  -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" \
                  --max-time 30 \
                  "$url" || true)
                # curl reports 000 when the request itself fails; count only real
                # 2xx/3xx responses as success
                status=${status:-000}
                if [ "$status" -ge 200 ] && [ "$status" -lt 400 ]; then
| echo "OK on retry (status $status): $url" | |
| success=true | |
| break | |
| elif [ "$status" -eq 429 ]; then | |
| echo "Still 429, sleeping ${delay}s before next attempt..." | |
| sleep "$delay" | |
| delay=$((delay * 2)) | |
| else | |
| echo "Non-429 failure (status $status): $url" | |
| break | |
| fi | |
| done | |
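              # With delay=15 and max_attempts=4, the 429 sleeps run 15s, 30s, 60s, 120s,
              # so a URL that never recovers costs about 225s before it is recorded below.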
| if [ "$success" = false ]; then | |
| echo "- [${status}] $url" >> urls-429-still-failing.txt | |
| fi | |
| done < urls-429.txt | |
| fi | |
| still_failing=$(wc -l < urls-429-still-failing.txt | tr -d ' ') | |
| echo "URLs still failing after retry: $still_failing" | |
| echo "still_failing_429=$still_failing" >> $GITHUB_OUTPUT | |
| if [ "$still_failing" -gt 0 ]; then | |
| echo "has_429_failures=true" >> $GITHUB_OUTPUT | |
| else | |
| echo "has_429_failures=false" >> $GITHUB_OUTPUT | |
| fi | |
| - name: Build errors-only report and summarize | |
| id: check_failures | |
| run: | | |
          # Pattern matches 4xx (except 403 and 429) and 5xx status codes.
          # 429s are handled separately (retried with exponential backoff); 403s are
          # skipped entirely, since bot-blocking sites keep returning 403 to automation.
          FAILURE_PATTERN='\[400\]|\[401\]|\[402\]|\[40[4-9]\]|\[41[0-9]\]|\[42[0-8]\]|\[4[3-9][0-9]\]|\[5[0-9]{2}\]'
          # Extract broken links from raw report
          grep -E "$FAILURE_PATTERN" ./lychee-raw.md > broken-links-raw.txt 2>/dev/null || true

          # Filter out repo-internal GitHub URLs that exist locally (handles 5xx false positives)
          # Also separate 5xx errors from external GitHub repos (likely rate limiting, not broken)
          verified_5xx=0
          external_5xx=0
          > broken-links.txt
          > external-5xx-links.txt
          while IFS= read -r line; do
            [ -z "$line" ] && continue
            # Extract URL and status code from the line (format: ... [STATUS] <URL> ...)
            url=$(echo "$line" | sed -E 's/.*<([^>]+)>.*/\1/')
            status=$(echo "$line" | sed -E 's/.*\[([0-9]+)\].*/\1/')
            # Check if this is a repo-internal GitHub URL (blob/main pattern)
            if echo "$url" | grep -qE '^https://github\.com/fern-api/docs/blob/main/'; then
              # Extract relative path from URL (strip prefix and query string)
              rel_path="${url#https://github.com/fern-api/docs/blob/main/}"
              rel_path="${rel_path%%\?*}"
              if [ -f "$rel_path" ]; then
                echo "Verified locally (5xx but file exists): $url -> $rel_path"
                verified_5xx=$((verified_5xx + 1))
              else
                # File doesn't exist locally, this is a real broken link
                echo "$line" >> broken-links.txt
              fi
            # Check if this is a 5xx error from an external GitHub repo (likely rate limiting)
            elif echo "$status" | grep -qE '^5[0-9]{2}$' && echo "$url" | grep -qE '^https://github\.com/'; then
              echo "External GitHub 5xx (likely rate limiting): $url"
              echo "$line" >> external-5xx-links.txt
              external_5xx=$((external_5xx + 1))
            else
              # Not a repo-internal URL and not external GitHub 5xx, keep it in the broken links list
              echo "$line" >> broken-links.txt
            fi
          done < broken-links-raw.txt

          # Deduplicate error files (same URL may appear multiple times if linked from multiple pages)
          sort -u broken-links.txt -o broken-links.txt
          sort -u external-5xx-links.txt -o external-5xx-links.txt

          echo "Repo-internal GitHub URLs with 5xx verified locally: $verified_5xx"
          echo "External GitHub URLs with 5xx (likely rate limiting): $external_5xx"
          echo "verified_5xx=$verified_5xx" >> $GITHUB_OUTPUT
          echo "external_5xx=$external_5xx" >> $GITHUB_OUTPUT

          broken_count=$(wc -l < broken-links.txt | tr -d ' ')

          # Get rate limit stats
          rate_limited="${{ steps.retry429.outputs.rate_limited_count }}"
          rate_limited=${rate_limited:-0}
          still_failing_429="${{ steps.retry429.outputs.still_failing_429 }}"
          still_failing_429=${still_failing_429:-0}

          # Get GitHub local verification stats
          github_verified="${{ steps.verify_github.outputs.verified_count }}"
          github_verified=${github_verified:-0}
          github_missing="${{ steps.verify_github.outputs.missing_count }}"
          github_missing=${github_missing:-0}

          # Build clean errors-only report
          cat > lychee-report.md << 'HEADER'
          # Link Check Report
          This report only shows broken links (errors). Redirects and successful links are not included.
          HEADER

          if [ "$broken_count" -gt 0 ]; then
            echo "## Broken Links ($broken_count)" >> lychee-report.md
            echo "" >> lychee-report.md
            # Format: - [STATUS] URL
            sed -E 's/.*\[([0-9]+)\].*<([^>]+)>.*/- [\1] \2/' broken-links.txt >> lychee-report.md
            echo "" >> lychee-report.md
          fi
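          # Illustrative reshaping by the sed above (input format assumed from lychee):
          #   "* [404] <https://example.com/missing> | Not Found"
          # becomes
          #   "- [404] https://example.com/missing"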
| if [ "$still_failing_429" -gt 0 ]; then | |
| echo "## Rate-Limited URLs (429) - Still Failing After Retry" >> lychee-report.md | |
| echo "" >> lychee-report.md | |
| cat urls-429-still-failing.txt >> lychee-report.md | |
| echo "" >> lychee-report.md | |
| fi | |
| if [ "$github_missing" -gt 0 ]; then | |
| echo "## GitHub URLs - Missing Locally ($github_missing)" >> lychee-report.md | |
| echo "" >> lychee-report.md | |
| cat github-missing.txt >> lychee-report.md | |
| echo "" >> lychee-report.md | |
| fi | |
| # External GitHub 5xx errors are informational only (likely rate limiting, not broken) | |
| external_5xx_count=$(wc -l < external-5xx-links.txt | tr -d ' ') | |
| if [ "$external_5xx_count" -gt 0 ]; then | |
| echo "## External GitHub URLs - 5xx (Likely Rate Limiting, Manual Validation Recommended)" >> lychee-report.md | |
| echo "" >> lychee-report.md | |
| echo "_These links returned 5xx errors during automated checking. They are likely valid but rate-limited. Please validate manually._" >> lychee-report.md | |
| echo "" >> lychee-report.md | |
| sed -E 's/.*\[([0-9]+)\].*<([^>]+)>.*/- [\1] \2/' external-5xx-links.txt >> lychee-report.md | |
| echo "" >> lychee-report.md | |
| fi | |
| if [ "$broken_count" -eq 0 ] && [ "$still_failing_429" -eq 0 ] && [ "$github_missing" -eq 0 ]; then | |
| echo "No broken links found!" >> lychee-report.md | |
| fi | |
| echo "" >> lychee-report.md | |
| echo "---" >> lychee-report.md | |
| echo "" >> lychee-report.md | |
| echo "Rate-limited URLs (429) that recovered after retry: $rate_limited" >> lychee-report.md | |
| # Calculate recovered 429s (rate_limited - still_failing_429) | |
| recovered_429=0 | |
| if [ "$rate_limited" -gt 0 ]; then | |
| recovered_429=$((rate_limited - still_failing_429)) | |
| fi | |
          # Set output for failure detection
          if [ "$broken_count" -gt 0 ]; then
            echo "has_other_failures=true" >> $GITHUB_OUTPUT
          else
            echo "has_other_failures=false" >> $GITHUB_OUTPUT
          fi

          # Extract lychee's summary table only (stop at first ## heading)
          sed -n '/^# Summary$/,/^## /p' ./lychee-raw.md | head -n -1 > lychee-summary-table.md
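          # The sed range prints from the "# Summary" line through the first "## "
          # heading, and head -n -1 drops that trailing heading, leaving only the
          # summary table (heading names assumed from lychee's markdown report).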
          # Print summary to logs
          echo ""
          cat lychee-summary-table.md
          echo ""

          # Get repo-internal GitHub URL counts
          verified_locally="${{ steps.retry429.outputs.verified_locally }}"
          verified_locally=${verified_locally:-0}
          missing_locally="${{ steps.retry429.outputs.missing_locally }}"
          missing_locally=${missing_locally:-0}

          echo "Recovery Info:"
          echo "  Repo-internal GitHub URLs (429 verified locally): $verified_locally"
          echo "  Repo-internal GitHub URLs (429 missing locally): $missing_locally"
          echo "  Repo-internal GitHub URLs (5xx verified locally): $verified_5xx"
          echo "  Rate-limited (429) - recovered after retry: $recovered_429"
          echo "  Rate-limited (429) - still failing: $still_failing_429"
          echo "  GitHub URLs verified locally: $github_verified"
          echo "  GitHub URLs missing locally: $github_missing"
          echo "  External GitHub URLs with 5xx (likely rate limiting): $external_5xx_count"

          if [ "$broken_count" -gt 0 ]; then
            echo ""
            echo "BROKEN LINKS (non-429):"
            cat broken-links.txt | sed -E 's/.*\[([0-9]+)\].*<([^>]+)>.*/  - [\1] \2/'
          fi
          if [ "$still_failing_429" -gt 0 ]; then
            echo ""
            echo "RATE-LIMITED URLs STILL FAILING (429):"
            cat urls-429-still-failing.txt
          fi
          if [ "$github_missing" -gt 0 ]; then
            echo ""
            echo "GITHUB URLs MISSING LOCALLY:"
            cat github-missing.txt
          fi

          # Build GitHub Step Summary using lychee's table + 429 info + persistent errors only
          {
            # Copy lychee's summary table
            cat lychee-summary-table.md
            # Add recovery info
            echo ""
            echo "### Local Verification & Rate Limit Recovery"
            echo ""
            echo "| Status | Count |"
            echo "|--------|------:|"
            echo "| Repo-internal GitHub URLs (429 verified locally) | $verified_locally |"
            echo "| Repo-internal GitHub URLs (429 missing locally) | $missing_locally |"
            echo "| Repo-internal GitHub URLs (5xx verified locally) | $verified_5xx |"
            echo "| Rate-limited (429) recovered after retry | $recovered_429 |"
            echo "| Rate-limited (429) still failing | $still_failing_429 |"
            echo "| GitHub URLs verified locally | $github_verified |"
            echo "| GitHub URLs missing locally | $github_missing |"
            echo "| External GitHub URLs with 5xx (likely rate limiting) | $external_5xx_count |"
            echo ""
            # Show persistent errors only (non-429 broken links + 429s that didn't recover)
            echo "## Errors (after retry)"
            echo ""
            has_persistent_errors=false
            if [ "$broken_count" -gt 0 ]; then
              has_persistent_errors=true
              sed -E 's/.*\[([0-9]+)\].*<([^>]+)>.*/- [\1] \2/' broken-links.txt
            fi
            if [ "$still_failing_429" -gt 0 ]; then
              has_persistent_errors=true
              cat urls-429-still-failing.txt
            fi
            if [ "$github_missing" -gt 0 ]; then
              has_persistent_errors=true
              cat github-missing.txt
            fi
            if [ "$has_persistent_errors" = false ]; then
              echo "No errors remained after retry."
            fi
            echo ""
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Upload errors-only report
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: lychee-report
          path: ./lychee-report.md
          if-no-files-found: ignore

      - name: Upload lychee outputs and verification results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: lychee-outputs
          path: |
            github-verified.txt
            github-missing.txt
            lychee-raw-main.md
            lychee-raw-github.md
          if-no-files-found: ignore

      - name: Create PR for broken links
        id: create-pr
        if: steps.check_failures.outputs.has_other_failures == 'true' || steps.retry429.outputs.has_429_failures == 'true' || steps.verify_github.outputs.has_missing == 'true'
        uses: actions/github-script@v7
        env:
          DEVIN_PROMPT: |
            @devin-ai-integration Please fix the broken links detected by the scheduled link checker.
            **Instructions:**
            1. For each URL listed below, identify the source file containing the broken link
            2. Try to find the correct replacement URL. Common fixes include:
               - Updating outdated commit SHAs to the latest commit
               - Fixing URL-encoded paths
               - Updating renamed file paths
            3. **IMPORTANT: If you cannot confidently find a correct replacement** (e.g., the only signal is a 5xx/503 error, connection reset, or rate limiting), **do NOT remove or modify the link**. Instead, leave it unchanged and add a PR comment on the relevant line asking for manual validation.
            4. Only update links/paths contained in this PR, not other links that follow a similar pattern
            5. Run `fern docs dev` locally to verify your changes don't break anything
            6. Push your fix to this PR branch
            7. After CI posts a preview link, use it to verify that your changes actually fix the underlying issue, then post a comment to the PR mentioning that you have tested it using the preview link
            8. When the PR is ready for review, request a review from @davidkonigsberg in GitHub and send a message in the Devin session that includes "<!here>" to alert everyone in the channel.
            9. Delete the scaffold file (.github/broken-links/broken-links.md) as part of your fix
            **Broken Links:**
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            const fs = require('fs');
            const devinPrompt = process.env.DEVIN_PROMPT;
            const owner = context.repo.owner;
            const repo = context.repo.repo;
            const baseBranch = 'main';
            const branchName = 'devin/fix-broken-links';
            const filePath = '.github/broken-links/broken-links.md';

            // Read broken links files
            let brokenLinksContent = '';
            let urls429Content = '';
            let githubMissingContent = '';
            try {
              brokenLinksContent = fs.readFileSync('broken-links.txt', 'utf8');
            } catch (e) {
              console.log('No broken-links.txt found');
            }
            try {
              urls429Content = fs.readFileSync('urls-429-still-failing.txt', 'utf8');
            } catch (e) {
              console.log('No urls-429-still-failing.txt found');
            }
            try {
              githubMissingContent = fs.readFileSync('github-missing.txt', 'utf8');
            } catch (e) {
              console.log('No github-missing.txt found');
            }

            if (!brokenLinksContent && !urls429Content && !githubMissingContent) {
              console.log('No broken links to report');
              return;
            }

            // Build the scaffold file content
            let scaffoldContent = devinPrompt + '\n';
            if (brokenLinksContent.trim()) {
              scaffoldContent += '\n## Non-429 Broken Links\n\n';
              // Format: extract status code and URL from lychee output
              const lines = brokenLinksContent.trim().split('\n');
              for (const line of lines) {
                const match = line.match(/\[(\d+)\].*<([^>]+)>/);
                if (match) {
                  scaffoldContent += `- [${match[1]}] ${match[2]}\n`;
                } else {
                  scaffoldContent += `- ${line}\n`;
                }
              }
            }
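            // Illustrative: for a line like "* [404] <https://example.com/x> | Not Found",
            // the regex /\[(\d+)\].*<([^>]+)>/ yields match[1] === '404' and
            // match[2] === 'https://example.com/x'.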
            if (urls429Content.trim()) {
              scaffoldContent += '\n## Rate-Limited URLs (429) Still Failing After Retry\n\n';
              scaffoldContent += urls429Content.trim() + '\n';
            }
            if (githubMissingContent.trim()) {
              scaffoldContent += '\n## GitHub URLs Missing Locally\n\n';
              scaffoldContent += githubMissingContent.trim() + '\n';
            }
            scaffoldContent += '\n---\n';
            scaffoldContent += `[View workflow run](https://github.com/${owner}/${repo}/actions/runs/${context.runId})\n`;

            // Get the base branch ref
            const baseRef = await github.rest.git.getRef({
              owner,
              repo,
              ref: `heads/${baseBranch}`,
            });
            const baseCommitSha = baseRef.data.object.sha;
            const baseCommit = await github.rest.git.getCommit({
              owner,
              repo,
              commit_sha: baseCommitSha,
            });
            const baseTreeSha = baseCommit.data.tree.sha;

            // Check if branch already exists
            let branchExists = false;
            let existingPR = null;
            try {
              await github.rest.git.getRef({
                owner,
                repo,
                ref: `heads/${branchName}`,
              });
              branchExists = true;
              // Check for existing open PR
              const prs = await github.rest.pulls.list({
                owner,
                repo,
                state: 'open',
                head: `${owner}:${branchName}`,
              });
              if (prs.data.length > 0) {
                existingPR = prs.data[0];
              }
            } catch (e) {
              if (e.status !== 404) throw e;
            }

            // If branch exists but no open PR, delete the old branch
            if (branchExists && !existingPR) {
              console.log(`Deleting old branch ${branchName}...`);
              await github.rest.git.deleteRef({
                owner,
                repo,
                ref: `heads/${branchName}`,
              });
              branchExists = false;
            }

            let prUrl = '';
            let prNumber = 0;

            if (existingPR) {
              // Update existing PR by updating the file
              console.log(`Updating existing PR #${existingPR.number}...`);
              // Get the current file SHA if it exists
              let fileSha = null;
              try {
                const fileContent = await github.rest.repos.getContent({
                  owner,
                  repo,
                  path: filePath,
                  ref: branchName,
                });
                fileSha = fileContent.data.sha;
              } catch (e) {
                if (e.status !== 404) throw e;
              }
              await github.rest.repos.createOrUpdateFileContents({
                owner,
                repo,
                path: filePath,
                message: `Update broken links list from workflow run ${context.runId}`,
                content: Buffer.from(scaffoldContent).toString('base64'),
                branch: branchName,
                sha: fileSha,
              });
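              // The contents API requires a base64-encoded body; e.g.
              // Buffer.from('hello').toString('base64') === 'aGVsbG8='.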
              prUrl = existingPR.html_url;
              prNumber = existingPR.number;
              console.log(`Updated PR: ${prUrl}`);
            } else {
              // Create new branch and PR
              console.log('Creating new branch and PR...');
              // Create a blob with the scaffold content
              const blob = await github.rest.git.createBlob({
                owner,
                repo,
                content: scaffoldContent,
                encoding: 'utf-8',
              });
              // Create a tree with the new file
              const tree = await github.rest.git.createTree({
                owner,
                repo,
                base_tree: baseTreeSha,
                tree: [
                  {
                    path: filePath,
                    mode: '100644',
                    type: 'blob',
                    sha: blob.data.sha,
                  },
                ],
              });
              // Create a commit
              const commit = await github.rest.git.createCommit({
                owner,
                repo,
                message: `Add broken links scaffold for Devin to fix`,
                tree: tree.data.sha,
                parents: [baseCommitSha],
              });
              // Create the branch
              await github.rest.git.createRef({
                owner,
                repo,
                ref: `refs/heads/${branchName}`,
                sha: commit.data.sha,
              });
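              // The four calls above mirror git plumbing: createBlob ~ hash-object,
              // createTree ~ mktree, createCommit ~ commit-tree, createRef ~ update-ref,
              // one REST object per step via the low-level git data API.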
              // Create the PR
              const pr = await github.rest.pulls.create({
                owner,
                repo,
                title: 'Fix broken links (Devin)',
                head: branchName,
                base: baseBranch,
                body: scaffoldContent,
                draft: true,
              });
              prUrl = pr.data.html_url;
              prNumber = pr.data.number;
              console.log(`Created PR: ${prUrl}`);
            }

            core.setOutput('pr_url', prUrl);
            core.setOutput('pr_number', prNumber);
            core.setOutput('pr_created', 'true');

      - name: Send Slack notification for broken links
        if: steps.create-pr.outputs.pr_created == 'true'
        uses: actions/github-script@v7
        env:
          SLACK_TOKEN: ${{ secrets.DEVIN_AI_PR_BOT_SLACK_TOKEN }}
          PR_URL: ${{ steps.create-pr.outputs.pr_url }}
        with:
          script: |
            const slackToken = process.env.SLACK_TOKEN;
            const prUrl = process.env.PR_URL;
            const channelId = 'C0A23CZEFNF';
            // Devin Slack App member ID for proper mention
            const devinMention = '<@U088PL5FS3B>';
            const message = `${devinMention} *Broken Links Detected in Docs*\nThe scheduled link checker found broken links in the \`docs\` repo. Please fix them by following the instructions in the <${prUrl}|PR>.`;

            try {
              const response = await fetch('https://slack.com/api/chat.postMessage', {
                method: 'POST',
                headers: {
                  'Content-Type': 'application/json',
                  'Authorization': `Bearer ${slackToken}`
                },
                body: JSON.stringify({
                  channel: channelId,
                  text: message,
                  mrkdwn: true
                })
              });
              const result = await response.json();
              if (result.ok) {
                console.log(`Sent Slack notification for broken links PR`);
              } else {
                console.error(`Failed to send Slack notification: ${result.error}`);
              }
            } catch (error) {
              console.error(`Error sending Slack notification: ${error.message}`);
            }

      - name: Fail if there are broken links
        if: steps.check_failures.outputs.has_other_failures == 'true' || steps.retry429.outputs.has_429_failures == 'true' || steps.verify_github.outputs.has_missing == 'true'
        run: |
          echo "Link check failed!"
          if [ "${{ steps.check_failures.outputs.has_other_failures }}" == "true" ]; then
            echo "There are broken links (non-429 failures) in the report."
          fi
          if [ "${{ steps.retry429.outputs.has_429_failures }}" == "true" ]; then
            echo "Some URLs still returned 429 after exponential backoff retry."
            echo "These URLs may need to be excluded or the rate limit needs more time to reset."
          fi
          if [ "${{ steps.verify_github.outputs.has_missing }}" == "true" ]; then
            echo "Some GitHub URLs point to paths that don't exist in the repos."
          fi
          exit 1