-
-
Notifications
You must be signed in to change notification settings - Fork 1.6k
Add external link checking with lychee #15893
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
9592886
f1b1df3
082da39
bb72320
12c24da
061980b
9a514bd
197ca55
d8f00ec
0b94753
e71be85
0cb8b70
6b13baf
86594d3
4a963ad
7419a69
43ebc4d
7534a9e
76cf9a1
f0ea8a6
de36c8e
2c7df02
eb10555
b88debc
06ff59d
2135419
3fb2cab
1879d32
e245022
876f2bf
0df8be4
5e23c12
726582e
4e14ee7
149dc21
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,97 @@ | ||
| name: Check External Links | ||
|
|
||
| on: | ||
| # Run weekly on Sundays at 2 AM UTC | ||
| schedule: | ||
| - cron: '0 2 * * 0' | ||
|
|
||
| # Allow manual triggering | ||
| workflow_dispatch: | ||
|
|
||
| # Run on PRs that modify docs (non-blocking) | ||
| pull_request: | ||
| branches: [master] | ||
|
|
||
| jobs: | ||
| # Job for PRs: check only changed files | ||
| check-pr: | ||
| if: github.event_name == 'pull_request' | ||
| runs-on: ubuntu-latest | ||
|
|
||
| steps: | ||
| - uses: actions/checkout@v4 | ||
| with: | ||
| fetch-depth: 0 | ||
|
|
||
| - name: Get changed files | ||
| id: changed | ||
| run: | | ||
| # Collect added (A), modified (M) and renamed (R) markdown files; for renames --name-only reports the new path, so renamed docs are still link-checked | ||
| FILES=$(git diff --name-only --diff-filter=AMR origin/${{ github.base_ref }}...HEAD -- '*.md' '*.mdx' || true) | ||
| if [ -z "$FILES" ]; then | ||
| echo "files=" >> $GITHUB_OUTPUT | ||
| echo "No markdown files changed" | ||
| else | ||
| echo "files<<EOF" >> $GITHUB_OUTPUT | ||
| echo "$FILES" >> $GITHUB_OUTPUT | ||
| echo "EOF" >> $GITHUB_OUTPUT | ||
| echo "Changed files:" | ||
| echo "$FILES" | ||
| fi | ||
|
|
||
| - name: Restore lychee cache | ||
| if: steps.changed.outputs.files != '' | ||
| uses: actions/cache/restore@v4 | ||
| with: | ||
| path: .lycheecache | ||
| key: lychee-cache- | ||
| restore-keys: lychee-cache- | ||
|
|
||
| - name: Check external links | ||
| if: steps.changed.outputs.files != '' | ||
| uses: lycheeverse/lychee-action@v2 | ||
| with: | ||
| args: --verbose --no-progress ${{ steps.changed.outputs.files }} | ||
| fail: true | ||
| jobSummary: true | ||
| env: | ||
| GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||
|
|
||
| # Job for scheduled/manual runs: check all files, create issue | ||
| check-full: | ||
| if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' | ||
| runs-on: ubuntu-latest | ||
| permissions: | ||
| issues: write | ||
|
|
||
| steps: | ||
| - uses: actions/checkout@v4 | ||
|
|
||
| # Cache strategy: see lychee.toml for details | ||
| # - Restore previous cache so successful checks are skipped | ||
| # - Transient errors (429, 5xx) are excluded from cache and retried | ||
| # - Save updated cache for next run | ||
| - name: Restore lychee cache | ||
| uses: actions/cache/restore@v4 | ||
| with: | ||
| path: .lycheecache | ||
| key: lychee-cache- | ||
| restore-keys: lychee-cache- | ||
|
|
||
| - name: Check external links | ||
| id: lychee | ||
| uses: lycheeverse/lychee-action@v2 | ||
| with: | ||
| args: --verbose . | ||
| output: ./lychee-report.md | ||
| format: markdown | ||
| fail: true | ||
| jobSummary: true | ||
| env: | ||
| GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | ||
|
|
||
| - name: Save lychee cache | ||
| uses: actions/cache/save@v4 | ||
| if: always() | ||
| with: | ||
| path: .lycheecache | ||
| key: lychee-cache-${{ github.run_id }} | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -105,3 +105,6 @@ public/og-images/* | |
| yalc.lock | ||
| /public/doctree.json | ||
| /public/doctree-dev.json | ||
|
|
||
| # Lychee cache | ||
| .lycheecache | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,58 @@ | ||
| # URLs to ignore during external link checking | ||
| # Supports regex patterns - lines starting with # are comments | ||
| # Note: Private IPs (localhost, 10.x, 172.16-31.x, 192.168.x) are handled by exclude_all_private in lychee.toml | ||
|
|
||
| # Example/placeholder URLs | ||
| https?://example\.com.* | ||
| https?://your-.* | ||
| https?://.*\.example\..* | ||
| https?://___.*___.* | ||
|
|
||
| # Internal Sentry development URLs | ||
| https?://.*\.getsentry\.net.* | ||
| https?://sentry-content-dashboard\.sentry\.dev.* | ||
|
|
||
| # Sites known to block automated checkers | ||
| https?://twitter\.com.* | ||
| https?://x\.com.* | ||
| https?://linkedin\.com.* | ||
| https?://www\.linkedin\.com.* | ||
| https?://www\.npmjs\.com.* | ||
| https?://search\.maven\.org.* | ||
| https?://medium\.com.* | ||
| https?://.*\.medium\.com.* | ||
| https?://gitlab\.com/oauth/.* | ||
| https?://docs\.gitlab\.com.* | ||
| https?://dev\.epicgames\.com.* | ||
| https?://docs\.unrealengine\.com.* | ||
| https?://cursor\.com.* | ||
| https?://dash\.cloudflare\.com.* | ||
| https?://www\.freedesktop\.org.* | ||
|
|
||
| # TLS compatibility issues (sites work in browser but fail in lychee due to native-tls) | ||
| # bottlepy.org only supports TLS 1.3, incompatible with lychee's TLS backend | ||
| https?://bottlepy\.org.* | ||
|
|
||
| # Cloudflare ECH (Encrypted Client Hello) required - fails even with curl/openssl | ||
| https?://help\.revise\.dev.* | ||
| https?://.*\.intercomhelpcenter\.com.* | ||
|
|
||
| # Rate-limited sites (may fail intermittently with 429) | ||
| https?://godoc\.org.* | ||
| https?://pkg\.go\.dev.* | ||
|
|
||
| # Interactive demos that may not respond to HEAD requests | ||
| https?://demo\.arcade\.software.* | ||
|
|
||
| # Private/internal resources | ||
| https?://.*\.notion\.so.* | ||
| https?://www\.notion\.so.* | ||
| https?://github\.com/getsentry/getsentry.* | ||
| https?://github\.com/getsentry/sentry-options-automator.* | ||
| https?://github\.com/getsentry/etl.* | ||
| https?://sentry\.zendesk\.com.* | ||
|
|
||
| # Placeholder domains commonly used in docs | ||
| https?://api\.example\.com.* | ||
| https?://your-api-host.* | ||
| https?://empowerplant\.io.* |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,58 @@ | ||
| # Lychee configuration for external link checking | ||
| # Documentation: https://github.com/lycheeverse/lychee | ||
|
|
||
| # Base URL to resolve root-relative links | ||
| base_url = "https://docs.sentry.io" | ||
|
|
||
| # Only check HTTP and HTTPS links | ||
| scheme = ["https", "http"] | ||
|
|
||
| # Exclude all private IP addresses automatically (localhost, 10.x, 172.16-31.x, 192.168.x, etc.) | ||
| exclude_all_private = true | ||
|
|
||
| # Exclude internal links (already handled by lint-404s script) | ||
| exclude = ['^https://docs\.sentry\.io'] | ||
|
|
||
| # Maximum number of concurrent requests | ||
| max_concurrency = 32 | ||
|
|
||
| # Maximum number of retries per request | ||
| max_retries = 2 | ||
|
|
||
| # Request timeout in seconds | ||
| timeout = 30 | ||
|
|
||
| # Retry wait time in seconds | ||
| retry_wait_time = 2 | ||
|
|
||
| # User agent (some sites block default user agents) | ||
| user_agent = "Mozilla/5.0 (compatible; Sentry-Docs-Link-Checker; +https://github.com/getsentry/sentry-docs)" | ||
|
|
||
| # Accept common status codes that indicate the link works | ||
| # Include 403 (possibly bot blocking) and 418 (freedesktop teapot) to reduce noise | ||
| accept = [200, 201, 202, 203, 204, 206, 301, 302, 308, 403, 418] | ||
|
|
||
| # Don't validate URL fragments/anchors (e.g., #section-name) | ||
| # Fragment checking is unreliable: JS-rendered anchors appear broken, and many sites don't validate them | ||
| include_fragments = false | ||
|
|
||
| # Only check external links (our internal check handles internal ones) | ||
| include_mail = false | ||
| include_verbatim = false | ||
|
|
||
| # Follow redirects | ||
| max_redirects = 10 | ||
|
|
||
| # Cache settings | ||
| # | ||
| # Strategy: Weekly scheduled runs populate the cache, PR checks consume it. | ||
| # - Definitive responses (200, 301, 403, 404) are cached and skipped on subsequent runs (404 is cached because it is a conclusive result, not because it is accepted as a valid link) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug: The comment in 🔍 Detailed AnalysisThe 💡 Suggested FixTo align the configuration with the documented intent, add 🤖 Prompt for AI AgentDid we get this right? 👍 / 👎 to inform future reviews.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 404 is a successful response in the sense that it gives us the relevant information (whether or not the link points to an existing page), not in the sense that it should be accepted as a valid link. |
||
| # - Transient errors (429 rate limits, 5xx server errors) are NOT cached, so they get retried | ||
| # - Cache lifetime is just under 2 weeks so it survives between weekly runs | ||
| # | ||
| # This means each weekly run only re-checks: | ||
| # 1. Links that failed with transient errors last time | ||
| # 2. New links not yet in cache | ||
| cache = true | ||
| max_cache_age = "335h" | ||
| cache_exclude_status = "429, 500.." | ||
|
Comment on lines
+49
to
+58
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug: The 🔍 Detailed AnalysisThere is a contradiction in the 💡 Suggested FixTo align the configuration with the documented intent, add 🤖 Prompt for AI AgentDid we get this right? 👍 / 👎 to inform future reviews.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Duplicate of #15893 (comment) |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
| /** | ||
| * Pre-commit hook wrapper for lychee external link checker. | ||
| * Runs lychee on provided files and warns on broken links without blocking commits. | ||
| * | ||
| * Usage: bun scripts/lint-external-links.ts [files...] | ||
| */ | ||
|
|
||
| import {spawnSync} from 'child_process'; | ||
|
|
||
| // Check if lychee is installed | ||
| const versionCheck = spawnSync('lychee', ['--version'], { | ||
| encoding: 'utf-8', | ||
| stdio: 'pipe', | ||
| }); | ||
| if (versionCheck.error || versionCheck.status !== 0) { | ||
| console.log('Warning: lychee not installed. Skipping external link check.'); | ||
| console.log( | ||
| 'Install with: brew install lychee (macOS) or cargo install lychee (cross-platform)' | ||
| ); | ||
| process.exit(0); | ||
| } | ||
|
|
||
| const files = process.argv.slice(2); | ||
| if (files.length === 0) { | ||
| process.exit(0); | ||
| } | ||
|
|
||
| // Run lychee on the provided files | ||
| const result = spawnSync('lychee', ['--no-progress', ...files], { | ||
| stdio: 'inherit', | ||
| encoding: 'utf-8', | ||
| }); | ||
|
|
||
| if (result.status !== 0) { | ||
| console.log(''); | ||
| console.log('⚠️ External link issues found (commit not blocked)'); | ||
| } | ||
|
|
||
| // Always exit 0 so commit proceeds | ||
| process.exit(0); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Renamed markdown files excluded from PR link checks
The
`--diff-filter=AM` flag only includes files with git status A (Added) or M (Modified), but excludes files with status R (Renamed). When a markdown file is renamed in a PR—even if it's also modified with new broken links—it won't appear in the `FILES` list and won't be checked. The filter could use `--diff-filter=AMR` to also include renamed files, ensuring their content is validated for broken external links.