
Commit 5db8b12

Authored by kgowru (Kapil Gowru), with a co-author
07 22 updating redirects (#95)
Co-authored-by: Kapil Gowru <[email protected]>
1 parent df6ea17 · commit 5db8b12

File tree

17 files changed: +4182 additions, -984 deletions


check_urls.py

Lines changed: 180 additions & 0 deletions
@@ -0,0 +1,180 @@
#!/usr/bin/env python3
"""
URL Checker Script for Fern Docs Sitemap
Checks all URLs in the sitemap for 404 errors and other issues.
"""

import xml.etree.ElementTree as ET
import requests
import time
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
import argparse

class URLChecker:
    def __init__(self, sitemap_path, max_workers=10, delay=0.1, timeout=30):
        self.sitemap_path = sitemap_path
        self.max_workers = max_workers
        self.delay = delay
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Fern-URL-Checker/1.0'
        })

    def parse_sitemap(self):
        """Parse the XML sitemap and extract all URLs."""
        try:
            tree = ET.parse(self.sitemap_path)
            root = tree.getroot()

            # Handle the sitemaps.org namespace
            namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
            urls = []

            for url_elem in root.findall('ns:url', namespace):
                loc_elem = url_elem.find('ns:loc', namespace)
                if loc_elem is not None and loc_elem.text:
                    urls.append(loc_elem.text.strip())

            return urls
        except ET.ParseError as e:
            print(f"❌ Error parsing XML sitemap: {e}")
            return []
        except FileNotFoundError:
            print(f"❌ Sitemap file not found: {self.sitemap_path}")
            return []

    def check_url(self, url):
        """Check a single URL and return the result."""
        try:
            response = self.session.get(url, timeout=self.timeout, allow_redirects=True)
            return {
                'url': url,
                'status_code': response.status_code,
                'final_url': response.url,
                'redirected': url != response.url,
                'error': None
            }
        except requests.exceptions.RequestException as e:
            return {
                'url': url,
                'status_code': None,
                'final_url': None,
                'redirected': False,
                'error': str(e)
            }

    def check_urls(self, urls):
        """Check all URLs concurrently."""
        results = []
        failed_urls = []
        redirect_urls = []

        print(f"🔍 Checking {len(urls)} URLs...")
        print(f"⚙️ Using {self.max_workers} workers with {self.delay}s delay")
        print("=" * 60)

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all URL check tasks up front
            future_to_url = {executor.submit(self.check_url, url): url for url in urls}

            for i, future in enumerate(as_completed(future_to_url), 1):
                result = future.result()
                results.append(result)

                # Pace the processing of completed results (the requests
                # themselves are already queued on the worker pool)
                if self.delay > 0:
                    time.sleep(self.delay)

                # Print progress
                if i % 50 == 0 or i == len(urls):
                    print(f"Progress: {i}/{len(urls)} URLs checked")

                # Categorize results
                if result['error']:
                    failed_urls.append(result)
                    print(f"❌ ERROR: {result['url']} - {result['error']}")
                elif result['status_code'] == 404:
                    failed_urls.append(result)
                    print(f"❌ 404: {result['url']}")
                elif result['status_code'] >= 400:
                    failed_urls.append(result)
                    print(f"⚠️ {result['status_code']}: {result['url']}")
                elif result['redirected']:
                    redirect_urls.append(result)
                    print(f"🔄 REDIRECT: {result['url']} → {result['final_url']}")
                elif result['status_code'] == 200:
                    print(f"✅ OK: {result['url']}")
                else:
                    print(f"ℹ️ {result['status_code']}: {result['url']}")

        return results, failed_urls, redirect_urls

    def print_summary(self, results, failed_urls, redirect_urls):
        """Print a summary of the results."""
        print("\n" + "=" * 60)
        print("📊 SUMMARY")
        print("=" * 60)

        total_urls = len(results)
        success_urls = len([r for r in results if r['status_code'] == 200 and not r['error']])

        print(f"Total URLs checked: {total_urls}")
        print(f"✅ Successful (200): {success_urls}")
        print(f"🔄 Redirects: {len(redirect_urls)}")
        print(f"❌ Failed/Errors: {len(failed_urls)}")

        if failed_urls:
            print(f"\n❌ FAILED URLS ({len(failed_urls)}):")
            print("-" * 40)
            for result in failed_urls:
                if result['error']:
                    print(f"ERROR: {result['url']} - {result['error']}")
                else:
                    print(f"{result['status_code']}: {result['url']}")

        if redirect_urls:
            print(f"\n🔄 REDIRECTED URLS ({len(redirect_urls)}):")
            print("-" * 40)
            for result in redirect_urls:
                print(f"{result['url']} → {result['final_url']}")

        return len(failed_urls) == 0

def main():
    parser = argparse.ArgumentParser(description='Check URLs in Fern sitemap for 404 errors')
    parser.add_argument('--sitemap', default='fern/docs.xml', help='Path to sitemap XML file')
    parser.add_argument('--workers', type=int, default=10, help='Number of concurrent workers')
    parser.add_argument('--delay', type=float, default=0.1, help='Delay between requests (seconds)')
    parser.add_argument('--timeout', type=int, default=30, help='Request timeout (seconds)')
    parser.add_argument('--max-urls', type=int, help='Limit number of URLs to check (for testing)')

    args = parser.parse_args()

    checker = URLChecker(args.sitemap, args.workers, args.delay, args.timeout)

    print("🚀 Fern Docs URL Checker")
    print("=" * 60)

    # Parse sitemap
    urls = checker.parse_sitemap()
    if not urls:
        print("❌ No URLs found in sitemap")
        sys.exit(1)

    # Limit URLs if specified (for testing)
    if args.max_urls:
        urls = urls[:args.max_urls]
        print(f"🔬 Testing mode: checking first {len(urls)} URLs")

    # Check URLs
    results, failed_urls, redirect_urls = checker.check_urls(urls)

    # Print summary; exit non-zero if any URL failed
    success = checker.print_summary(results, failed_urls, redirect_urls)
    sys.exit(0 if success else 1)

if __name__ == "__main__":
    main()
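
For reference, parse_sitemap expects a standard sitemaps.org sitemap; the namespace URI it registers is the one hard-coded in the method above. A minimal file of that shape, with illustrative URLs rather than real docs pages, would look like:

<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://docs.example.com/getting-started</loc>
  </url>
  <url>
    <loc>https://docs.example.com/api-reference</loc>
  </url>
</urlset>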

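The class can also be driven from Python directly, bypassing the CLI. This is a minimal sketch, not part of the commit; it assumes the script is importable as check_urls and that a sitemap exists at the default fern/docs.xml path:

# Hypothetical smoke test; the import name and sitemap path are assumptions.
from check_urls import URLChecker

checker = URLChecker('fern/docs.xml', max_workers=5, delay=0.2, timeout=10)
urls = checker.parse_sitemap()

# Check only the first 20 URLs, mirroring the --max-urls 20 flag
results, failed, redirects = checker.check_urls(urls[:20])
checker.print_summary(results, failed, redirects)

The command-line equivalent is python check_urls.py --max-urls 20; because main() exits 0 on success and 1 on any failure, the script can double as a CI gate.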
0 commit comments
