name: Check Links
on:
# Run weekday mornings (10:00 UTC, Monday-Friday)
schedule:
- cron: '0 10 * * 1,2,3,4,5'
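# Cron fields: minute hour day-of-month month day-of-week; GitHub Actions evaluates schedules in UTC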
workflow_dispatch:
permissions:
contents: write
pull-requests: write
jobs:
check-links:
name: Check links
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Create lychee config
run: |
cat > lychee.toml << 'EOF'
# Lychee link checker configuration
# https://lychee.cli.rs/
# Accept these status codes as valid
accept = [200, 204, 301, 302, 307, 308]
# User agent to avoid being blocked
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
# Exclude patterns (regex)
exclude = [
# Placeholder/example URLs
"^https://example\\.com",
"^https://github\\.com/owner/repo",
# Analytics and tracking
"^https://us\\.i\\.posthog\\.com",
"^https://c\\.vialoops\\.com",
"_vercel/(speed-)?insights",
# Sites that block automated requests
"^https://docs\\.stack-auth\\.com",
"^https://www\\.linkedin\\.com",
"^https://linkedin\\.com",
"^https://twitter\\.com",
"^https://x\\.com",
"^https://www\\.npmjs\\.com",
"^https://cdn\\.simpleicons\\.org",
# All GitHub blob/tree/tag URLs - verified locally before lychee runs
# This avoids 503 rate limiting from GitHub's automated request detection
# Issues/PRs/compare links are still checked via HTTP (they rarely break)
"^https://github\\.com/[^/]+/[^/]+/(blob|tree)/",
"^https://github\\.com/[^/]+/[^/]+/releases/tag/",
# Non-HTTP links
"^mailto:",
"^tel:",
"^javascript:",
# Anchor-only links
"^#"
]
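# Illustrative examples of how the exclusions apply (hypothetical URLs):
#   https://github.com/acme/widgets/blob/main/README.md -> excluded here, verified locally instead
#   https://github.com/acme/widgets/issues/42           -> not excluded, still checked via HTTP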
EOF
- name: Fetch sitemap and extract URLs
run: |
curl -s https://buildwithfern.com/learn/sitemap.xml | grep -oP '(?<=<loc>)[^<]+' > urls.txt
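# The lookbehind pattern reduces each sitemap entry to its bare URL, e.g. (illustrative page):
#   <loc>https://buildwithfern.com/learn/welcome</loc> -> https://buildwithfern.com/learn/welcome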
echo "Found $(wc -l < urls.txt) URLs in sitemap"
- name: Extract and verify GitHub blob/tree/tag URLs locally
id: verify_github
run: |
# Extract all GitHub blob/tree/tag URLs from the repo source files
# This is much faster than fetching all published pages via HTTP (~11 min -> seconds)
# These URLs are excluded from lychee and verified locally instead to avoid 503 errors
echo "Scanning repo for GitHub blob/tree/tag URLs..."
start_time=$(date +%s)
> github-urls.txt
# Search the content directories for GitHub blob/tree URLs
# Include fern/ (main docs) and README.md (root)
# Exclude .git and any cloned repos
grep -RhoE 'https://github\.com/[^/]+/[^/]+/(blob|tree)/[^"'"'"')<>[:space:]]+' \
fern/ \
README.md \
--exclude-dir=.git \
--exclude-dir=.github-repos \
>> github-urls.txt 2>/dev/null || true
# Also search for releases/tag URLs
grep -RhoE 'https://github\.com/[^/]+/[^/]+/releases/tag/[^"'"'"')<>[:space:]]+' \
fern/ \
README.md \
--exclude-dir=.git \
--exclude-dir=.github-repos \
>> github-urls.txt 2>/dev/null || true
# Deduplicate URLs
sort -u github-urls.txt -o github-urls.txt
# Remove example/placeholder URLs (e.g., github.com/your-org/...)
# These are documentation examples, not real repos to verify
if [ -s github-urls.txt ]; then
grep -v 'github\.com/your-org/' github-urls.txt > github-urls.filtered || true
mv github-urls.filtered github-urls.txt
fi
total_urls=$(wc -l < github-urls.txt | tr -d ' ')
end_time=$(date +%s)
echo "Found $total_urls unique GitHub URLs to verify locally (took $((end_time - start_time))s)"
if [ "$total_urls" -eq 0 ]; then
echo "No GitHub URLs to verify"
echo "verified_count=0" >> $GITHUB_OUTPUT
echo "missing_count=0" >> $GITHUB_OUTPUT
echo "has_missing=false" >> $GITHUB_OUTPUT
exit 0
fi
# Extract unique repos that need to be cloned (org/repo format)
grep -oE 'https://github\.com/[^/]+/[^/]+' github-urls.txt | sort -u > github-repos.txt
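# e.g. (hypothetical) https://github.com/acme/widgets/blob/main/docs/guide.md -> https://github.com/acme/widgets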
echo "Repos to clone:"
cat github-repos.txt
# Clone each repo (shallow clone for efficiency)
mkdir -p .github-repos
while IFS= read -r repo_url; do
[ -z "$repo_url" ] && continue
# Extract org/repo for directory structure
org_repo=$(echo "$repo_url" | sed -E 's#https://github\.com/##')
org=$(echo "$org_repo" | cut -d'/' -f1)
repo_name=$(echo "$org_repo" | cut -d'/' -f2)
echo "Cloning $org/$repo_name..."
mkdir -p ".github-repos/$org"
# Use sparse checkout for efficiency - we only need to check if paths exist
git clone --depth 1 --filter=blob:none --sparse "$repo_url.git" ".github-repos/$org/$repo_name" 2>/dev/null || {
echo "Warning: Failed to clone $repo_url, will mark URLs from this repo as unverifiable"
continue
}
# Fetch all tags for tag verification (lightweight fetch)
cd ".github-repos/$org/$repo_name"
git fetch --tags --depth 1 2>/dev/null || true
cd ../../..
done < github-repos.txt
# Verify each URL by checking if the path exists in the cloned repo
verified_count=0
missing_count=0
> github-missing.txt
> github-verified.txt
while IFS= read -r url; do
[ -z "$url" ] && continue
# Remove any URL fragments or query strings
clean_url="${url%%#*}"
clean_url="${clean_url%%\?*}"
# Extract org, repo name, and determine URL type
org=$(echo "$clean_url" | sed -E 's#https://github\.com/([^/]+)/.*#\1#')
repo_name=$(echo "$clean_url" | sed -E 's#https://github\.com/[^/]+/([^/]+)/.*#\1#')
# Check if we have the repo cloned
if [ ! -d ".github-repos/$org/$repo_name" ]; then
echo "Repo not cloned, cannot verify: $url"
echo "- [UNVERIFIABLE] $url (repo clone failed)" >> github-missing.txt
missing_count=$((missing_count + 1))
continue
fi
cd ".github-repos/$org/$repo_name"
# Check if this is a releases/tag URL
if echo "$clean_url" | grep -qE '/releases/tag/'; then
# Extract tag name
tag_name=$(echo "$clean_url" | sed -E 's#.*/releases/tag/(.*)#\1#')
# Check if tag exists (literal match, since tag names usually contain dots)
if git tag -l "$tag_name" | grep -qxF "$tag_name"; then
echo "Verified (tag): $url -> tag $tag_name"
echo "$url" >> ../../../github-verified.txt
verified_count=$((verified_count + 1))
else
echo "MISSING (tag): $url -> tag $tag_name"
echo "- [LOCAL_MISSING] $url (tag: $tag_name in $org/$repo_name)" >> ../../../github-missing.txt
missing_count=$((missing_count + 1))
fi
else
# This is a blob/tree URL - extract ref and path
# Format: https://github.com/ORG/REPO/(blob|tree)/REF/PATH
ref=$(echo "$clean_url" | sed -E 's#https://github\.com/[^/]+/[^/]+/(blob|tree)/([^/]+)/.*#\2#')
rel_path=$(echo "$clean_url" | sed -E 's#https://github\.com/[^/]+/[^/]+/(blob|tree)/[^/]+/(.*)#\2#')
# URL-decode the relative path (handles %5B -> [, %5D -> ], spaces, etc.)
# This is needed because GitHub URLs encode special characters like [ and ]
rel_path=$(python3 -c "import sys, urllib.parse; print(urllib.parse.unquote(sys.argv[1]))" "$rel_path")
# Handle HEAD ref specially
if [ "$ref" = "HEAD" ]; then
ref=$(git symbolic-ref refs/remotes/origin/HEAD 2>/dev/null | sed 's@^refs/remotes/origin/@@' || echo "main")
fi
# For specific commit SHAs, we need to fetch them
if echo "$ref" | grep -qE '^[0-9a-f]{40}$'; then
git fetch --depth 1 origin "$ref" 2>/dev/null || true
fi
# Check if path exists using git ls-tree (works with sparse checkout)
# Use grep -qxF for literal string matching (paths may contain regex chars like [ and ])
if git ls-tree -r --name-only "$ref" -- "$rel_path" 2>/dev/null | grep -qxF "$rel_path" || \
git ls-tree -d --name-only "$ref" -- "$rel_path" 2>/dev/null | grep -qxF "$rel_path"; then
echo "Verified: $url -> $rel_path (ref: $ref)"
echo "$url" >> ../../../github-verified.txt
verified_count=$((verified_count + 1))
else
# Also check if it's a directory (tree link)
if git ls-tree -d "$ref" -- "$rel_path" 2>/dev/null | grep -q .; then
echo "Verified (directory): $url -> $rel_path (ref: $ref)"
echo "$url" >> ../../../github-verified.txt
verified_count=$((verified_count + 1))
else
echo "MISSING: $url -> $rel_path (ref: $ref)"
echo "- [LOCAL_MISSING] $url (path: $rel_path in $org/$repo_name, ref: $ref)" >> ../../../github-missing.txt
missing_count=$((missing_count + 1))
fi
fi
fi
cd ../../..
done < github-urls.txt
echo ""
echo "=== GitHub URL Verification Summary ==="
echo "Total URLs checked: $total_urls"
echo "Verified locally: $verified_count"
echo "Missing/unverifiable: $missing_count"
echo "verified_count=$verified_count" >> $GITHUB_OUTPUT
echo "missing_count=$missing_count" >> $GITHUB_OUTPUT
if [ "$missing_count" -gt 0 ]; then
echo "has_missing=true" >> $GITHUB_OUTPUT
echo ""
echo "Missing URLs:"
cat github-missing.txt
else
echo "has_missing=false" >> $GITHUB_OUTPUT
fi
# Cleanup cloned repos
rm -rf .github-repos
- name: Extract GitHub URLs for HTTP checking from source files
id: extract_github_http
run: |
# Extract GitHub URLs that need HTTP checking (issues, compare, commit, discussions, non-fern-api PRs)
# These are extracted from source files and passed directly to lychee
# This avoids the --include filter issue where lychee applies it to input pages, not discovered links
echo "Scanning repo for GitHub URLs to check via HTTP..."
> github-http-urls.txt
# Search for issues URLs
grep -RhoE 'https://github\.com/[^/]+/[^/]+/issues/[^"'"'"')<>[:space:]]+' \
fern/ README.md \
--exclude-dir=.git --exclude-dir=.github-repos \
>> github-http-urls.txt 2>/dev/null || true
# Search for compare URLs
grep -RhoE 'https://github\.com/[^/]+/[^/]+/compare/[^"'"'"')<>[:space:]]+' \
fern/ README.md \
--exclude-dir=.git --exclude-dir=.github-repos \
>> github-http-urls.txt 2>/dev/null || true
# Search for commit/commits URLs
grep -RhoE 'https://github\.com/[^/]+/[^/]+/commits?/[^"'"'"')<>[:space:]]+' \
fern/ README.md \
--exclude-dir=.git --exclude-dir=.github-repos \
>> github-http-urls.txt 2>/dev/null || true
# Search for discussions URLs
grep -RhoE 'https://github\.com/[^/]+/[^/]+/discussions/[^"'"'"')<>[:space:]]+' \
fern/ README.md \
--exclude-dir=.git --exclude-dir=.github-repos \
>> github-http-urls.txt 2>/dev/null || true
# Search for pull request URLs (excluding fern-api org - those are too slow to check)
grep -RhoE 'https://github\.com/[^/]+/[^/]+/pull/[^"'"'"')<>[:space:]]+' \
fern/ README.md \
--exclude-dir=.git --exclude-dir=.github-repos \
2>/dev/null | grep -v 'github\.com/fern-api/' \
>> github-http-urls.txt || true
# Remove example/placeholder URLs (e.g., github.com/your-org/...)
if [ -s github-http-urls.txt ]; then
grep -v 'github\.com/your-org/' github-http-urls.txt > github-http-urls.filtered || true
mv github-http-urls.filtered github-http-urls.txt
fi
# Deduplicate URLs
sort -u github-http-urls.txt -o github-http-urls.txt
total_urls=$(wc -l < github-http-urls.txt | tr -d ' ')
echo "Found $total_urls unique GitHub URLs to check via HTTP"
echo "github_http_count=$total_urls" >> $GITHUB_OUTPUT
if [ "$total_urls" -gt 0 ]; then
echo "URLs to check:"
cat github-http-urls.txt
fi
- name: Upload URLs (early, for debugging)
uses: actions/upload-artifact@v4
with:
name: urls
path: |
github-urls.txt
github-http-urls.txt
urls.txt
if-no-files-found: ignore
- name: Check GitHub links (very low concurrency to avoid 503 rate limiting)
id: lychee_github
if: steps.extract_github_http.outputs.github_http_count != '0'
uses: lycheeverse/lychee-action@v2
with:
args: >-
--no-progress
--timeout 30
--max-retries 5
--retry-wait-time 20
--max-concurrency 2
github-http-urls.txt
output: ./lychee-raw-github.md
format: markdown
fail: false
jobSummary: false
- name: Check non-GitHub links (high concurrency)
id: lychee_main
uses: lycheeverse/lychee-action@v2
with:
args: >-
--config lychee.toml
--no-progress
--timeout 30
--max-retries 3
--retry-wait-time 10
--max-concurrency 20
--exclude "^https://github\\.com/"
--files-from urls.txt
output: ./lychee-raw-main.md
format: markdown
fail: false
jobSummary: false
- name: Combine lychee outputs
run: |
# Combine both lychee runs into a single raw file for downstream processing
: > lychee-raw.md
[ -f lychee-raw-main.md ] && cat lychee-raw-main.md >> lychee-raw.md
[ -f lychee-raw-github.md ] && cat lychee-raw-github.md >> lychee-raw.md
echo "Combined lychee outputs into lychee-raw.md"
- name: Extract 429 URLs and retry with exponential backoff
id: retry429
run: |
# Extract all 429 URLs from the raw lychee report
# Note: 403s are excluded from broken links report but NOT retried (bot-blocking sites won't change)
grep '\[429\]' ./lychee-raw.md | sed -E 's/.*<([^>]+)>.*/\1/' | sort -u > urls-429-all.txt || true
# Initialize counters for repo-internal GitHub URLs
verified_locally=0
missing_locally=0
# Process repo-internal GitHub URLs by checking if files exist locally
# This avoids false positives from GitHub rate limiting while still catching real broken links
> urls-429.txt
> urls-429-repo-missing.txt
while IFS= read -r url; do
[ -z "$url" ] && continue
# Check if this is a repo-internal GitHub URL (blob/main pattern)
if echo "$url" | grep -qE '^https://github\.com/fern-api/docs/blob/main/'; then
# Extract relative path from URL (strip prefix and query string)
rel_path="${url#https://github.com/fern-api/docs/blob/main/}"
rel_path="${rel_path%%\?*}"
if [ -f "$rel_path" ]; then
echo "Verified locally (file exists): $url -> $rel_path"
verified_locally=$((verified_locally + 1))
else
echo "MISSING locally: $url -> $rel_path"
echo "- [LOCAL_MISSING] $url (expected path: $rel_path)" >> urls-429-repo-missing.txt
missing_locally=$((missing_locally + 1))
fi
else
# Non-repo URL, add to retry list
echo "$url" >> urls-429.txt
fi
done < urls-429-all.txt
echo "Repo-internal GitHub URLs verified locally: $verified_locally"
echo "Repo-internal GitHub URLs missing locally: $missing_locally"
echo "verified_locally=$verified_locally" >> $GITHUB_OUTPUT
echo "missing_locally=$missing_locally" >> $GITHUB_OUTPUT
count=$(wc -l < urls-429.txt | tr -d ' ')
echo "Found $count other URLs with 429 status to retry"
echo "rate_limited_count=$count" >> $GITHUB_OUTPUT
# Initialize still-failing file
> urls-429-still-failing.txt
# Add any locally-missing repo URLs to the still-failing list
if [ -s urls-429-repo-missing.txt ]; then
cat urls-429-repo-missing.txt >> urls-429-still-failing.txt
fi
if [ "$count" -eq "0" ] || [ ! -s urls-429.txt ]; then
echo "No other 429 URLs to retry"
else
echo "Retrying $count URLs with exponential backoff..."
while IFS= read -r url; do
[ -z "$url" ] && continue
delay=15
max_attempts=4
success=false
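# Backoff schedule: sleeps of 15s, 30s, 60s between the four attempts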
for attempt in $(seq 1 $max_attempts); do
echo "[$attempt/$max_attempts] Checking: $url"
status=$(curl -s -o /dev/null -w "%{http_code}" \
-H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" \
--max-time 30 \
"$url" || echo "000")
if [ "$status" -lt 400 ]; then
echo "OK on retry (status $status): $url"
success=true
break
elif [ "$status" -eq 429 ]; then
echo "Still 429, sleeping ${delay}s before next attempt..."
sleep "$delay"
delay=$((delay * 2))
else
echo "Non-429 failure (status $status): $url"
break
fi
done
if [ "$success" = false ]; then
echo "- [${status}] $url" >> urls-429-still-failing.txt
fi
done < urls-429.txt
fi
still_failing=$(wc -l < urls-429-still-failing.txt | tr -d ' ')
echo "URLs still failing after retry: $still_failing"
echo "still_failing_429=$still_failing" >> $GITHUB_OUTPUT
if [ "$still_failing" -gt 0 ]; then
echo "has_429_failures=true" >> $GITHUB_OUTPUT
else
echo "has_429_failures=false" >> $GITHUB_OUTPUT
fi
- name: Build errors-only report and summarize
id: check_failures
run: |
# Pattern matches 4xx (except 403 and 429) and 5xx status codes
# 403s are excluded outright (bot-blocking sites won't change); 429s are retried separately with exponential backoff
FAILURE_PATTERN='\[400\]|\[401\]|\[402\]|\[40[4-9]\]|\[41[0-9]\]|\[42[0-8]\]|\[4[3-9][0-9]\]|\[5[0-9]{2}\]'
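# e.g. '[404]' and '[503]' match; '[403]' and '[429]' deliberately do not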
# Extract broken links from raw report
grep -E "$FAILURE_PATTERN" ./lychee-raw.md > broken-links-raw.txt 2>/dev/null || true
# Filter out repo-internal GitHub URLs that exist locally (handles 5xx false positives)
# Also separate 5xx errors from external GitHub repos (likely rate limiting, not broken)
verified_5xx=0
external_5xx=0
> broken-links.txt
> external-5xx-links.txt
while IFS= read -r line; do
[ -z "$line" ] && continue
# Extract URL and status code from the line (format: ... [STATUS] <URL> ...)
url=$(echo "$line" | sed -E 's/.*<([^>]+)>.*/\1/')
status=$(echo "$line" | sed -E 's/.*\[([0-9]+)\].*/\1/')
# Check if this is a repo-internal GitHub URL (blob/main pattern)
if echo "$url" | grep -qE '^https://github\.com/fern-api/docs/blob/main/'; then
# Extract relative path from URL (strip prefix and query string)
rel_path="${url#https://github.com/fern-api/docs/blob/main/}"
rel_path="${rel_path%%\?*}"
if [ -f "$rel_path" ]; then
echo "Verified locally (5xx but file exists): $url -> $rel_path"
verified_5xx=$((verified_5xx + 1))
else
# File doesn't exist locally, this is a real broken link
echo "$line" >> broken-links.txt
fi
# Check if this is a 5xx error from an external GitHub repo (likely rate limiting)
elif echo "$status" | grep -qE '^5[0-9]{2}$' && echo "$url" | grep -qE '^https://github\.com/'; then
echo "External GitHub 5xx (likely rate limiting): $url"
echo "$line" >> external-5xx-links.txt
external_5xx=$((external_5xx + 1))
else
# Not a repo-internal URL and not external GitHub 5xx, keep it in the broken links list
echo "$line" >> broken-links.txt
fi
done < broken-links-raw.txt
# Deduplicate error files (same URL may appear multiple times if linked from multiple pages)
sort -u broken-links.txt -o broken-links.txt
sort -u external-5xx-links.txt -o external-5xx-links.txt
echo "Repo-internal GitHub URLs with 5xx verified locally: $verified_5xx"
echo "External GitHub URLs with 5xx (likely rate limiting): $external_5xx"
echo "verified_5xx=$verified_5xx" >> $GITHUB_OUTPUT
echo "external_5xx=$external_5xx" >> $GITHUB_OUTPUT
broken_count=$(wc -l < broken-links.txt | tr -d ' ')
# Get rate limit stats
rate_limited="${{ steps.retry429.outputs.rate_limited_count }}"
rate_limited=${rate_limited:-0}
still_failing_429="${{ steps.retry429.outputs.still_failing_429 }}"
still_failing_429=${still_failing_429:-0}
# Get GitHub local verification stats
github_verified="${{ steps.verify_github.outputs.verified_count }}"
github_verified=${github_verified:-0}
github_missing="${{ steps.verify_github.outputs.missing_count }}"
github_missing=${github_missing:-0}
# Build clean errors-only report
cat > lychee-report.md << 'HEADER'
# Link Check Report
This report only shows broken links (errors). Redirects and successful links are not included.
HEADER
if [ "$broken_count" -gt 0 ]; then
echo "## Broken Links ($broken_count)" >> lychee-report.md
echo "" >> lychee-report.md
# Format: - [STATUS] URL
sed -E 's/.*\[([0-9]+)\].*<([^>]+)>.*/- [\1] \2/' broken-links.txt >> lychee-report.md
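# e.g. (illustrative) '* [404] <https://example.com/missing>' becomes '- [404] https://example.com/missing'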
echo "" >> lychee-report.md
fi
if [ "$still_failing_429" -gt 0 ]; then
echo "## Rate-Limited URLs (429) - Still Failing After Retry" >> lychee-report.md
echo "" >> lychee-report.md
cat urls-429-still-failing.txt >> lychee-report.md
echo "" >> lychee-report.md
fi
if [ "$github_missing" -gt 0 ]; then
echo "## GitHub URLs - Missing Locally ($github_missing)" >> lychee-report.md
echo "" >> lychee-report.md
cat github-missing.txt >> lychee-report.md
echo "" >> lychee-report.md
fi
# External GitHub 5xx errors are informational only (likely rate limiting, not broken)
external_5xx_count=$(wc -l < external-5xx-links.txt | tr -d ' ')
if [ "$external_5xx_count" -gt 0 ]; then
echo "## External GitHub URLs - 5xx (Likely Rate Limiting, Manual Validation Recommended)" >> lychee-report.md
echo "" >> lychee-report.md
echo "_These links returned 5xx errors during automated checking. They are likely valid but rate-limited. Please validate manually._" >> lychee-report.md
echo "" >> lychee-report.md
sed -E 's/.*\[([0-9]+)\].*<([^>]+)>.*/- [\1] \2/' external-5xx-links.txt >> lychee-report.md
echo "" >> lychee-report.md
fi
if [ "$broken_count" -eq 0 ] && [ "$still_failing_429" -eq 0 ] && [ "$github_missing" -eq 0 ]; then
echo "No broken links found!" >> lychee-report.md
fi
echo "" >> lychee-report.md
echo "---" >> lychee-report.md
echo "" >> lychee-report.md
echo "Rate-limited URLs (429) that recovered after retry: $rate_limited" >> lychee-report.md
# Calculate recovered 429s (rate_limited - still_failing_429)
recovered_429=0
if [ "$rate_limited" -gt 0 ]; then
recovered_429=$((rate_limited - still_failing_429))
fi
# Set output for failure detection
if [ "$broken_count" -gt 0 ]; then
echo "has_other_failures=true" >> $GITHUB_OUTPUT
else
echo "has_other_failures=false" >> $GITHUB_OUTPUT
fi
# Extract lychee's summary table only (stop at first ## heading)
sed -n '/^# Summary$/,/^## /p' ./lychee-raw.md | head -n -1 > lychee-summary-table.md
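# Assumes lychee's markdown layout: a '# Summary' heading and table followed by '## ...' sections;
# head -n -1 drops the trailing '## ' heading that the sed range captures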
# Print summary to logs
echo ""
cat lychee-summary-table.md
echo ""
# Get repo-internal GitHub URL counts
verified_locally="${{ steps.retry429.outputs.verified_locally }}"
verified_locally=${verified_locally:-0}
missing_locally="${{ steps.retry429.outputs.missing_locally }}"
missing_locally=${missing_locally:-0}
echo "Recovery Info:"
echo " Repo-internal GitHub URLs (429 verified locally): $verified_locally"
echo " Repo-internal GitHub URLs (429 missing locally): $missing_locally"
echo " Repo-internal GitHub URLs (5xx verified locally): $verified_5xx"
echo " Rate-limited (429) - recovered after retry: $recovered_429"
echo " Rate-limited (429) - still failing: $still_failing_429"
echo " GitHub URLs verified locally: $github_verified"
echo " GitHub URLs missing locally: $github_missing"
echo " External GitHub URLs with 5xx (likely rate limiting): $external_5xx_count"
if [ "$broken_count" -gt 0 ]; then
echo ""
echo "BROKEN LINKS (non-429):"
sed -E 's/.*\[([0-9]+)\].*<([^>]+)>.*/ - [\1] \2/' broken-links.txt
fi
if [ "$still_failing_429" -gt 0 ]; then
echo ""
echo "RATE-LIMITED URLs STILL FAILING (429):"
cat urls-429-still-failing.txt
fi
if [ "$github_missing" -gt 0 ]; then
echo ""
echo "GITHUB URLs MISSING LOCALLY:"
cat github-missing.txt
fi
# Build GitHub Step Summary using lychee's table + 429 info + persistent errors only
{
# Copy lychee's summary table
cat lychee-summary-table.md
# Add recovery info
echo ""
echo "### Local Verification & Rate Limit Recovery"
echo ""
echo "| Status | Count |"
echo "|--------|------:|"
echo "| Repo-internal GitHub URLs (429 verified locally) | $verified_locally |"
echo "| Repo-internal GitHub URLs (429 missing locally) | $missing_locally |"
echo "| Repo-internal GitHub URLs (5xx verified locally) | $verified_5xx |"
echo "| Rate-limited (429) recovered after retry | $recovered_429 |"
echo "| Rate-limited (429) still failing | $still_failing_429 |"
echo "| GitHub URLs verified locally | $github_verified |"
echo "| GitHub URLs missing locally | $github_missing |"
echo "| External GitHub URLs with 5xx (likely rate limiting) | $external_5xx_count |"
echo ""
# Show persistent errors only (non-429 broken links + 429s that didn't recover)
echo "## Errors (after retry)"
echo ""
has_persistent_errors=false
if [ "$broken_count" -gt 0 ]; then
has_persistent_errors=true
sed -E 's/.*\[([0-9]+)\].*<([^>]+)>.*/- [\1] \2/' broken-links.txt
fi
if [ "$still_failing_429" -gt 0 ]; then
has_persistent_errors=true
cat urls-429-still-failing.txt
fi
if [ "$github_missing" -gt 0 ]; then
has_persistent_errors=true
cat github-missing.txt
fi
if [ "$has_persistent_errors" = false ]; then
echo "No errors remained after retry."
fi
echo ""
} >> "$GITHUB_STEP_SUMMARY"
- name: Upload errors-only report
if: always()
uses: actions/upload-artifact@v4
with:
name: lychee-report
path: ./lychee-report.md
if-no-files-found: ignore
- name: Upload lychee outputs and verification results
if: always()
uses: actions/upload-artifact@v4
with:
name: lychee-outputs
path: |
github-verified.txt
github-missing.txt
lychee-raw-main.md
lychee-raw-github.md
if-no-files-found: ignore
- name: Create PR for broken links
id: create-pr
if: steps.check_failures.outputs.has_other_failures == 'true' || steps.retry429.outputs.has_429_failures == 'true' || steps.verify_github.outputs.has_missing == 'true'
uses: actions/github-script@v7
env:
DEVIN_PROMPT: |
@devin-ai-integration Please fix the broken links detected by the scheduled link checker.
**Instructions:**
1. For each URL listed below, identify the source file containing the broken link
2. Try to find the correct replacement URL. Common fixes include:
- Updating outdated commit SHAs to the latest commit
- Fixing URL-encoded paths
- Updating renamed file paths
3. **IMPORTANT: If you cannot confidently find a correct replacement** (e.g., the only signal is a 5xx/503 error, connection reset, or rate limiting), **do NOT remove or modify the link**. Instead, leave it unchanged and add a PR comment on the relevant line asking for manual validation.
4. Only update links/paths contained in this PR, not other links that follow a similar pattern
5. Run `fern docs dev` locally to verify your changes don't break anything
6. Push your fix to this PR branch
7. After CI posts a preview link, use it to verify that your changes actually fix the underlying issue, then post a comment to the PR mentioning that you have tested it using the preview link
8. When the PR is ready for review, request a review from @davidkonigsberg on GitHub and send a message in the Devin session that includes "<!here>" to alert everyone in the channel.
9. Delete the scaffold file (.github/broken-links/broken-links.md) as part of your fix
**Broken Links:**
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
const fs = require('fs');
const devinPrompt = process.env.DEVIN_PROMPT;
const owner = context.repo.owner;
const repo = context.repo.repo;
const baseBranch = 'main';
const branchName = 'devin/fix-broken-links';
const filePath = '.github/broken-links/broken-links.md';
// Read broken links files
let brokenLinksContent = '';
let urls429Content = '';
let githubMissingContent = '';
try {
brokenLinksContent = fs.readFileSync('broken-links.txt', 'utf8');
} catch (e) {
console.log('No broken-links.txt found');
}
try {
urls429Content = fs.readFileSync('urls-429-still-failing.txt', 'utf8');
} catch (e) {
console.log('No urls-429-still-failing.txt found');
}
try {
githubMissingContent = fs.readFileSync('github-missing.txt', 'utf8');
} catch (e) {
console.log('No github-missing.txt found');
}
if (!brokenLinksContent && !urls429Content && !githubMissingContent) {
console.log('No broken links to report');
return;
}
// Build the scaffold file content
let scaffoldContent = devinPrompt + '\n';
if (brokenLinksContent.trim()) {
scaffoldContent += '\n## Non-429 Broken Links\n\n';
// Format: extract status code and URL from lychee output
const lines = brokenLinksContent.trim().split('\n');
for (const line of lines) {
const match = line.match(/\[(\d+)\].*<([^>]+)>/);
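// e.g. (illustrative) '* [404] <https://example.com/missing>' -> match[1] = "404", match[2] = "https://example.com/missing"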
if (match) {
scaffoldContent += `- [${match[1]}] ${match[2]}\n`;
} else {
scaffoldContent += `- ${line}\n`;
}
}
}
if (urls429Content.trim()) {
scaffoldContent += '\n## Rate-Limited URLs (429) Still Failing After Retry\n\n';
scaffoldContent += urls429Content.trim() + '\n';
}
if (githubMissingContent.trim()) {
scaffoldContent += '\n## GitHub URLs Missing Locally\n\n';
scaffoldContent += githubMissingContent.trim() + '\n';
}
scaffoldContent += '\n---\n';
scaffoldContent += `[View workflow run](https://github.com/${owner}/${repo}/actions/runs/${context.runId})\n`;
// Get the base branch ref
const baseRef = await github.rest.git.getRef({
owner,
repo,
ref: `heads/${baseBranch}`,
});
const baseCommitSha = baseRef.data.object.sha;
const baseCommit = await github.rest.git.getCommit({
owner,
repo,
commit_sha: baseCommitSha,
});
const baseTreeSha = baseCommit.data.tree.sha;
// Check if branch already exists
let branchExists = false;
let existingPR = null;
try {
await github.rest.git.getRef({
owner,
repo,
ref: `heads/${branchName}`,
});
branchExists = true;
// Check for existing open PR
const prs = await github.rest.pulls.list({
owner,
repo,
state: 'open',
head: `${owner}:${branchName}`,
});
if (prs.data.length > 0) {
existingPR = prs.data[0];
}
} catch (e) {
if (e.status !== 404) throw e;
}
// If branch exists but no open PR, delete the old branch
if (branchExists && !existingPR) {
console.log(`Deleting old branch ${branchName}...`);
await github.rest.git.deleteRef({
owner,
repo,
ref: `heads/${branchName}`,
});
branchExists = false;
}
let prUrl = '';
let prNumber = 0;
if (existingPR) {
// Update existing PR by updating the file
console.log(`Updating existing PR #${existingPR.number}...`);
// Get the current file SHA if it exists
let fileSha = null;
try {
const fileContent = await github.rest.repos.getContent({
owner,
repo,
path: filePath,
ref: branchName,
});
fileSha = fileContent.data.sha;
} catch (e) {
if (e.status !== 404) throw e;
}
await github.rest.repos.createOrUpdateFileContents({
owner,
repo,
path: filePath,
message: `Update broken links list from workflow run ${context.runId}`,
content: Buffer.from(scaffoldContent).toString('base64'),
branch: branchName,
// Omit sha when the file does not yet exist on the branch
...(fileSha ? { sha: fileSha } : {}),
});
prUrl = existingPR.html_url;
prNumber = existingPR.number;
console.log(`Updated PR: ${prUrl}`);
} else {
// Create new branch and PR
console.log('Creating new branch and PR...');
// Create a blob with the scaffold content
const blob = await github.rest.git.createBlob({
owner,
repo,
content: scaffoldContent,
encoding: 'utf-8',
});
// Create a tree with the new file
const tree = await github.rest.git.createTree({
owner,
repo,
base_tree: baseTreeSha,
tree: [
{
path: filePath,
mode: '100644',
type: 'blob',
sha: blob.data.sha,
},
],
});
// Create a commit
const commit = await github.rest.git.createCommit({
owner,
repo,
message: `Add broken links scaffold for Devin to fix`,
tree: tree.data.sha,
parents: [baseCommitSha],
});
// Create the branch
await github.rest.git.createRef({
owner,
repo,
ref: `refs/heads/${branchName}`,
sha: commit.data.sha,
});
// Create the PR
const pr = await github.rest.pulls.create({
owner,
repo,
title: 'Fix broken links (Devin)',
head: branchName,
base: baseBranch,
body: scaffoldContent,
draft: true,
});
prUrl = pr.data.html_url;
prNumber = pr.data.number;
console.log(`Created PR: ${prUrl}`);
}
core.setOutput('pr_url', prUrl);
core.setOutput('pr_number', prNumber);
core.setOutput('pr_created', 'true');
- name: Send Slack notification for broken links
if: steps.create-pr.outputs.pr_created == 'true'
uses: actions/github-script@v7
env:
SLACK_TOKEN: ${{ secrets.DEVIN_AI_PR_BOT_SLACK_TOKEN }}
PR_URL: ${{ steps.create-pr.outputs.pr_url }}
with:
script: |
const slackToken = process.env.SLACK_TOKEN;
const prUrl = process.env.PR_URL;
const channelId = 'C0A23CZEFNF';
// Devin Slack App member ID for proper mention
const devinMention = '<@U088PL5FS3B>';
const message = `${devinMention} *Broken Links Detected in Docs*\nThe scheduled link checker found broken links in the \`docs\` repo. Please fix them by following the instructions in the <${prUrl}|PR>.`;
try {
const response = await fetch('https://slack.com/api/chat.postMessage', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': `Bearer ${slackToken}`
},
body: JSON.stringify({
channel: channelId,
text: message,
mrkdwn: true
})
});
const result = await response.json();
if (result.ok) {
console.log(`Sent Slack notification for broken links PR`);
} else {
console.error(`Failed to send Slack notification: ${result.error}`);
}
} catch (error) {
console.error(`Error sending Slack notification: ${error.message}`);
}
- name: Fail if there are broken links
if: steps.check_failures.outputs.has_other_failures == 'true' || steps.retry429.outputs.has_429_failures == 'true' || steps.verify_github.outputs.has_missing == 'true'
run: |
echo "Link check failed!"
if [ "${{ steps.check_failures.outputs.has_other_failures }}" == "true" ]; then
echo "There are broken links (non-429 failures) in the report."
fi
if [ "${{ steps.retry429.outputs.has_429_failures }}" == "true" ]; then
echo "Some URLs still returned 429 after exponential backoff retry."
echo "These URLs may need to be excluded or the rate limit needs more time to reset."
fi
if [ "${{ steps.verify_github.outputs.has_missing }}" == "true" ]; then
echo "Some GitHub URLs point to paths that don't exist in the repos."
fi
exit 1