
Commit 8ec4796

Add dead link checker to dev deployment workflow
- Create comprehensive dead link checker script that crawls the deployed site
- Check both internal and external links for HTTP status codes
- Add step to deploy-dev workflow to run after deployment
- Fail deployment if dead links are found
- Handle false positives from Google Fonts and Twitter bot detection

Co-Authored-By: Alek <[email protected]>
1 parent 8a466ee commit 8ec4796

File tree

.github/workflows/deploy-dev.yml
scripts/check_dead_links.py
scripts/requirements.txt

3 files changed: +209 -0 lines changed

.github/workflows/deploy-dev.yml

Lines changed: 11 additions & 0 deletions
@@ -42,3 +42,14 @@ jobs:
         id: deploy
         run: |
           reflex deploy --project ${{ secrets.DEV_PROJECT_ID }} --token ${{ secrets.DEV_TOKEN }} --no-interactive
+
+      - name: Check for dead links
+        run: |
+          # Install dependencies for dead link checker
+          pip install -r scripts/requirements.txt
+
+          # Wait a moment for deployment to be fully ready
+          sleep 30
+
+          # Run dead link checker
+          python scripts/check_dead_links.py https://pcweb-gray-orca.rxc.app --max-pages 200 --timeout 15 --delay 1
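The step above can be reproduced locally before pushing. A minimal sketch, assuming it is launched from the repository root with the packages from scripts/requirements.txt installed; the URL and flag values are simply copied from the workflow, everything else is an assumption:

# Hypothetical local equivalent of the "Check for dead links" CI step.
import subprocess
import sys

result = subprocess.run(
    [
        sys.executable, "scripts/check_dead_links.py",
        "https://pcweb-gray-orca.rxc.app",
        "--max-pages", "200",
        "--timeout", "15",
        "--delay", "1",
    ],
    check=False,
)
# The script exits with 1 when dead links are found, which is what fails the CI job.
print("dead links found" if result.returncode else "no dead links")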

scripts/check_dead_links.py

Lines changed: 195 additions & 0 deletions
@@ -0,0 +1,195 @@
#!/usr/bin/env python3
"""
Dead link checker for the Reflex website.
Crawls the deployed site and checks for broken links.
"""

import argparse
import re
import sys
import time
from collections import deque
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser

import requests
from bs4 import BeautifulSoup


class DeadLinkChecker:
    def __init__(self, base_url, max_pages=500, timeout=10, delay=0.5):
        self.base_url = base_url.rstrip('/')
        self.domain = urlparse(base_url).netloc
        self.max_pages = max_pages
        self.timeout = timeout
        self.delay = delay

        self.visited_pages = set()
        self.checked_links = set()
        self.dead_links = []
        self.pages_to_visit = deque([base_url])

        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; DeadLinkChecker/1.0)'
        })

    def is_internal_url(self, url):
        """Check if URL is internal to our domain."""
        parsed = urlparse(url)
        return parsed.netloc == self.domain or parsed.netloc == ''

    def normalize_url(self, url):
        """Normalize URL for comparison."""
        parsed = urlparse(url)
        normalized = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
        if parsed.query:
            normalized += f"?{parsed.query}"
        return normalized

    def check_link(self, url, source_page):
        """Check if a single link is working."""
        if url in self.checked_links:
            return True

        self.checked_links.add(url)

        parsed = urlparse(url)
        if parsed.netloc in ['fonts.googleapis.com', 'fonts.gstatic.com']:
            return True

        try:
            response = self.session.head(url, timeout=self.timeout, allow_redirects=True)

            if response.status_code == 405:
                response = self.session.get(url, timeout=self.timeout, allow_redirects=True)

            if response.status_code == 403 and 'twitter.com' in url:
                print(f"Warning: Twitter link may be blocked by bot detection: {url}")
                return True

            if response.status_code >= 400:
                self.dead_links.append({
                    'url': url,
                    'status_code': response.status_code,
                    'source_page': source_page,
                    'error': f"HTTP {response.status_code}"
                })
                return False

        except requests.exceptions.RequestException as e:
            self.dead_links.append({
                'url': url,
                'status_code': None,
                'source_page': source_page,
                'error': str(e)
            })
            return False

        return True

    def extract_links(self, html, page_url):
        """Extract all links from HTML content."""
        soup = BeautifulSoup(html, 'html.parser')
        links = []

        for tag in soup.find_all(['a', 'link', 'img', 'script']):
            url = None
            if tag.name == 'a':
                url = tag.get('href')
            elif tag.name == 'link':
                url = tag.get('href')
            elif tag.name == 'img':
                url = tag.get('src')
            elif tag.name == 'script':
                url = tag.get('src')

            if url:
                absolute_url = urljoin(page_url, url)
                if not absolute_url.startswith(('javascript:', 'mailto:', 'tel:')):
                    links.append(absolute_url)

        return links

    def crawl_page(self, url):
        """Crawl a single page and extract links."""
        if url in self.visited_pages or len(self.visited_pages) >= self.max_pages:
            return []

        self.visited_pages.add(url)
        print(f"Crawling: {url}")

        try:
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()

            content_type = response.headers.get('content-type', '').lower()
            if 'text/html' not in content_type:
                return []

            links = self.extract_links(response.text, url)

            for link in links:
                self.check_link(link, url)

                if self.is_internal_url(link):
                    normalized = self.normalize_url(link)
                    if normalized not in self.visited_pages:
                        self.pages_to_visit.append(normalized)

            time.sleep(self.delay)
            return links

        except requests.exceptions.RequestException as e:
            print(f"Error crawling {url}: {e}")
            return []

    def run(self):
        """Run the dead link checker."""
        print(f"Starting dead link check for {self.base_url}")
        print(f"Max pages: {self.max_pages}, Timeout: {self.timeout}s")

        while self.pages_to_visit and len(self.visited_pages) < self.max_pages:
            url = self.pages_to_visit.popleft()
            self.crawl_page(url)

        print(f"\nCrawl complete!")
        print(f"Pages visited: {len(self.visited_pages)}")
        print(f"Links checked: {len(self.checked_links)}")
        print(f"Dead links found: {len(self.dead_links)}")

        if self.dead_links:
            print("\n❌ DEAD LINKS FOUND:")
            for link_info in self.dead_links:
                print(f"  URL: {link_info['url']}")
                print(f"  Error: {link_info['error']}")
                print(f"  Found on: {link_info['source_page']}")
                print()
            return False
        else:
            print("\n✅ No dead links found!")
            return True


def main():
    parser = argparse.ArgumentParser(description='Check for dead links on a website')
    parser.add_argument('url', help='Base URL to start crawling from')
    parser.add_argument('--max-pages', type=int, default=500, help='Maximum pages to crawl')
    parser.add_argument('--timeout', type=int, default=10, help='Request timeout in seconds')
    parser.add_argument('--delay', type=float, default=0.5, help='Delay between requests')

    args = parser.parse_args()

    checker = DeadLinkChecker(
        base_url=args.url,
        max_pages=args.max_pages,
        timeout=args.timeout,
        delay=args.delay
    )

    success = checker.run()
    sys.exit(0 if success else 1)


if __name__ == '__main__':
    main()
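The checker can also be driven programmatically rather than through the CLI, which is handy for quick spot checks. A minimal sketch, assuming scripts/ is on the import path; the target URL and small page budget are illustrative values, not part of the commit:

# Illustrative programmatic use of the DeadLinkChecker class defined above.
from check_dead_links import DeadLinkChecker  # assumes scripts/ is importable

checker = DeadLinkChecker(
    base_url="https://example.com",  # placeholder target, not the real dev URL
    max_pages=25,
    timeout=10,
    delay=0.5,
)
ok = checker.run()  # prints a summary; True means no dead links were found
print("clean" if ok else f"{len(checker.dead_links)} dead links")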

scripts/requirements.txt

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
requests>=2.25.0
beautifulsoup4>=4.9.0
lxml>=4.6.0
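Note that the script parses pages with the built-in 'html.parser', so lxml is presumably pinned as an optional, faster parser backend for BeautifulSoup. A minimal sketch of that swap, purely illustrative and not part of the committed script:

# Illustrative only: BeautifulSoup with the lxml backend listed in requirements.txt.
from bs4 import BeautifulSoup

html = "<html><body><a href='/docs'>Docs</a></body></html>"
soup = BeautifulSoup(html, "lxml")  # requires the lxml package
print([a.get("href") for a in soup.find_all("a")])  # ['/docs']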
