1+ #!/usr/bin/env python3
2+ """
3+ URL Checker Script for Fern Docs Sitemap
4+ Checks all URLs in the sitemap for 404 errors and other issues.
5+ """
6+
7+ import xml .etree .ElementTree as ET
8+ import requests
9+ import time
10+ import sys
11+ from urllib .parse import urlparse
12+ from concurrent .futures import ThreadPoolExecutor , as_completed
13+ import argparse
14+
15+ class URLChecker :
16+ def __init__ (self , sitemap_path , max_workers = 10 , delay = 0.1 , timeout = 30 ):
17+ self .sitemap_path = sitemap_path
18+ self .max_workers = max_workers
19+ self .delay = delay
20+ self .timeout = timeout
21+ self .session = requests .Session ()
22+ self .session .headers .update ({
23+ 'User-Agent' : 'Fern-URL-Checker/1.0'
24+ })
25+
26+ def parse_sitemap (self ):
27+ """Parse the XML sitemap and extract all URLs."""
28+ try :
29+ tree = ET .parse (self .sitemap_path )
30+ root = tree .getroot ()
31+
32+ # Handle namespace
33+ namespace = {'ns' : 'http://www.sitemaps.org/schemas/sitemap/0.9' }
34+ urls = []
35+
36+ for url_elem in root .findall ('ns:url' , namespace ):
37+ loc_elem = url_elem .find ('ns:loc' , namespace )
38+ if loc_elem is not None :
39+ urls .append (loc_elem .text .strip ())
40+
41+ return urls
42+ except ET .ParseError as e :
43+ print (f"❌ Error parsing XML sitemap: { e } " )
44+ return []
45+ except FileNotFoundError :
46+ print (f"❌ Sitemap file not found: { self .sitemap_path } " )
47+ return []
48+
49+ def check_url (self , url ):
50+ """Check a single URL and return result."""
51+ try :
52+ response = self .session .get (url , timeout = self .timeout , allow_redirects = True )
53+ return {
54+ 'url' : url ,
55+ 'status_code' : response .status_code ,
56+ 'final_url' : response .url ,
57+ 'redirected' : url != response .url ,
58+ 'error' : None
59+ }
60+ except requests .exceptions .RequestException as e :
61+ return {
62+ 'url' : url ,
63+ 'status_code' : None ,
64+ 'final_url' : None ,
65+ 'redirected' : False ,
66+ 'error' : str (e )
67+ }
68+
69+ def check_urls (self , urls ):
70+ """Check all URLs concurrently."""
71+ results = []
72+ failed_urls = []
73+ redirect_urls = []
74+
75+ print (f"🔍 Checking { len (urls )} URLs..." )
76+ print (f"⚙️ Using { self .max_workers } workers with { self .delay } s delay" )
77+ print ("=" * 60 )
78+
79+ with ThreadPoolExecutor (max_workers = self .max_workers ) as executor :
80+ # Submit all URL check tasks
81+ future_to_url = {executor .submit (self .check_url , url ): url for url in urls }
82+
83+ for i , future in enumerate (as_completed (future_to_url ), 1 ):
84+ result = future .result ()
85+ results .append (result )
86+
87+ # Add delay between requests
88+ if self .delay > 0 :
89+ time .sleep (self .delay )
90+
91+ # Print progress
92+ if i % 50 == 0 or i == len (urls ):
93+ print (f"Progress: { i } /{ len (urls )} URLs checked" )
94+
95+ # Categorize results
96+ if result ['error' ]:
97+ failed_urls .append (result )
98+ print (f"❌ ERROR: { result ['url' ]} - { result ['error' ]} " )
99+ elif result ['status_code' ] == 404 :
100+ failed_urls .append (result )
101+ print (f"❌ 404: { result ['url' ]} " )
102+ elif result ['status_code' ] >= 400 :
103+ failed_urls .append (result )
104+ print (f"⚠️ { result ['status_code' ]} : { result ['url' ]} " )
105+ elif result ['redirected' ]:
106+ redirect_urls .append (result )
107+ print (f"🔄 REDIRECT: { result ['url' ]} → { result ['final_url' ]} " )
108+ elif result ['status_code' ] == 200 :
109+ print (f"✅ OK: { result ['url' ]} " )
110+ else :
111+ print (f"ℹ️ { result ['status_code' ]} : { result ['url' ]} " )
112+
113+ return results , failed_urls , redirect_urls
114+
115+ def print_summary (self , results , failed_urls , redirect_urls ):
116+ """Print summary of results."""
117+ print ("\n " + "=" * 60 )
118+ print ("📊 SUMMARY" )
119+ print ("=" * 60 )
120+
121+ total_urls = len (results )
122+ success_urls = len ([r for r in results if r ['status_code' ] == 200 and not r ['error' ]])
123+
124+ print (f"Total URLs checked: { total_urls } " )
125+ print (f"✅ Successful (200): { success_urls } " )
126+ print (f"🔄 Redirects: { len (redirect_urls )} " )
127+ print (f"❌ Failed/Errors: { len (failed_urls )} " )
128+
129+ if failed_urls :
130+ print (f"\n ❌ FAILED URLS ({ len (failed_urls )} ):" )
131+ print ("-" * 40 )
132+ for result in failed_urls :
133+ if result ['error' ]:
134+ print (f"ERROR: { result ['url' ]} - { result ['error' ]} " )
135+ else :
136+ print (f"{ result ['status_code' ]} : { result ['url' ]} " )
137+
138+ if redirect_urls :
139+ print (f"\n 🔄 REDIRECTED URLS ({ len (redirect_urls )} ):" )
140+ print ("-" * 40 )
141+ for result in redirect_urls :
142+ print (f"{ result ['url' ]} → { result ['final_url' ]} " )
143+
144+ return len (failed_urls ) == 0
145+
146+ def main ():
147+ parser = argparse .ArgumentParser (description = 'Check URLs in Fern sitemap for 404 errors' )
148+ parser .add_argument ('--sitemap' , default = 'fern/docs.xml' , help = 'Path to sitemap XML file' )
149+ parser .add_argument ('--workers' , type = int , default = 10 , help = 'Number of concurrent workers' )
150+ parser .add_argument ('--delay' , type = float , default = 0.1 , help = 'Delay between requests (seconds)' )
151+ parser .add_argument ('--timeout' , type = int , default = 30 , help = 'Request timeout (seconds)' )
152+ parser .add_argument ('--max-urls' , type = int , help = 'Limit number of URLs to check (for testing)' )
153+
154+ args = parser .parse_args ()
155+
156+ checker = URLChecker (args .sitemap , args .workers , args .delay , args .timeout )
157+
158+ print ("🚀 Fern Docs URL Checker" )
159+ print ("=" * 60 )
160+
161+ # Parse sitemap
162+ urls = checker .parse_sitemap ()
163+ if not urls :
164+ print ("❌ No URLs found in sitemap" )
165+ sys .exit (1 )
166+
167+ # Limit URLs if specified (for testing)
168+ if args .max_urls :
169+ urls = urls [:args .max_urls ]
170+ print (f"🔬 Testing mode: checking first { len (urls )} URLs" )
171+
172+ # Check URLs
173+ results , failed_urls , redirect_urls = checker .check_urls (urls )
174+
175+ # Print summary and exit
176+ success = checker .print_summary (results , failed_urls , redirect_urls )
177+ sys .exit (0 if success else 1 )
178+
179+ if __name__ == "__main__" :
180+ main ()
0 commit comments