22"""
33URL Checker Script for Fern Docs Sitemap
44Checks all URLs in the sitemap for 404 errors and other issues.
5+ Follows complete redirect chains and flags home page redirects as errors.
56"""
67
78import xml .etree .ElementTree as ET
1314import argparse
1415
1516class URLChecker :
def __init__(self, sitemap_path, max_workers=10, delay=0.1, timeout=30, max_redirects=10):
    """Store the checker configuration and prepare a shared HTTP session.

    Args:
        sitemap_path: Path to the sitemap XML file to read URLs from.
        max_workers: Number of concurrent workers used when checking URLs.
        delay: Delay between requests, in seconds.
        timeout: Per-request timeout, in seconds.
        max_redirects: Maximum number of redirects to follow per URL.
    """
    # Plain configuration values, kept exactly as supplied by the caller.
    self.sitemap_path = sitemap_path
    self.max_workers = max_workers
    self.delay = delay
    self.timeout = timeout
    self.max_redirects = max_redirects

    # One session for every request so they all carry the same User-Agent.
    http_session = requests.Session()
    http_session.headers.update({'User-Agent': 'Fern-URL-Checker/1.0'})
    self.session = http_session

    # Known home page URLs (several host/path variations). A redirect that
    # lands on any of these is treated as a problem rather than a success.
    self.home_page_urls = set((
        'https://fern-api.docs.buildwithfern.com/learn/home',
        'https://fern-v2.docs.buildwithfern.com/learn/v2/home',
        'https://buildfern.com/learn/home',
        'https://fern-api.docs.buildwithfern.com/learn',
        'https://fern-v2.docs.buildwithfern.com/learn',
        'https://buildfern.com/learn',
    ))

    # File handle for output logging; stays None until one is assigned.
    self.output_file = None
2938
@@ -57,30 +66,115 @@ def parse_sitemap(self):
5766 self .log (f"❌ Sitemap file not found: { self .sitemap_path } " )
5867 return []
5968
60- def check_url (self , url ):
61- """Check a single URL and return result."""
def is_home_page(self, url):
    """Return True when *url* points at one of the known home page variants.

    URLs are compared after normalization: scheme and host are lower-cased,
    trailing slashes are dropped from the path, and any query string or
    fragment is ignored. This way ``.../learn/home/``, ``.../learn/home?x=1``
    and ``.../learn/home#top`` are all recognized as the home page.

    Args:
        url: Absolute URL to test.

    Returns:
        bool: True if the normalized URL matches an entry of
        ``self.home_page_urls``.
    """
    # Function-scope import so the method does not depend on which
    # urllib.parse names the module imports at file level.
    from urllib.parse import urlsplit, urlunsplit

    def normalize(raw):
        try:
            parts = urlsplit(raw.strip())
        except ValueError:
            # Malformed URL: fall back to the plain trailing-slash comparison.
            return raw.rstrip('/')
        return urlunsplit((
            parts.scheme.lower(),
            parts.netloc.lower(),
            parts.path.rstrip('/'),
            '',  # query string is irrelevant for identifying the page
            '',  # fragment is never part of the page identity
        ))

    return normalize(url) in {normalize(known) for known in self.home_page_urls}
73+
def follow_redirect_chain(self, url):
    """Follow redirects one hop at a time and record the complete chain.

    Each hop is requested with ``allow_redirects=False`` so that every
    intermediate URL is visible. The walk stops when a non-redirect
    response is received, a redirect lands on a home page URL, a redirect
    has no ``Location`` header, a URL repeats (loop), more than
    ``self.max_redirects`` redirects would be needed, or the request fails.

    Args:
        url: The URL to start from.

    Returns:
        dict with the keys:
            ``status_code``   - status of the last response, or None when
                                the request failed or the redirect limit
                                was exceeded.
            ``final_url``     - last URL reached in the chain.
            ``redirect_chain``- list of every URL visited, starting with
                                *url*.
            ``redirect_count``- number of redirects followed.
            ``leads_to_home`` - True when a redirect landed on a home page.
            ``home_at_step``  - redirect step at which the home page was
                                reached, else None.
            ``error``         - error description, or None.
    """
    # Function-scope import so the method does not depend on which
    # urllib.parse names the module imports at file level.
    from urllib.parse import urljoin

    redirect_statuses = (301, 302, 303, 307, 308)
    redirect_chain = [url]
    current_url = url
    redirect_count = 0

    def outcome(status_code, leads_to_home=False, error=None):
        # Reads the enclosing chain state at call time, so every exit path
        # reports a consistent snapshot.
        return {
            'status_code': status_code,
            'final_url': current_url,
            'redirect_chain': redirect_chain,
            'redirect_count': redirect_count,
            'leads_to_home': leads_to_home,
            'home_at_step': redirect_count if leads_to_home else None,
            'error': error,
        }

    try:
        while True:
            # Make request without following redirects automatically
            response = self.session.get(
                current_url, timeout=self.timeout, allow_redirects=False
            )

            # Only a URL reached *through* a redirect counts as a home
            # redirect; a sitemap entry that is the home page itself is a
            # normal page and must not be flagged.
            if redirect_count > 0 and self.is_home_page(current_url):
                return outcome(response.status_code, leads_to_home=True)

            # If not a redirect, we're done
            if response.status_code not in redirect_statuses:
                return outcome(response.status_code)

            location = response.headers.get('Location')
            if not location:
                return outcome(
                    response.status_code,
                    error='Redirect response missing Location header',
                )

            # The limit is checked only once another hop is actually
            # required, so a chain of exactly max_redirects hops succeeds.
            if redirect_count >= self.max_redirects:
                return outcome(
                    None, error=f'Too many redirects (>{self.max_redirects})'
                )

            # urljoin resolves absolute, root-relative, path-relative and
            # protocol-relative ("//host/path") Location values correctly.
            next_url = urljoin(current_url, location)
            already_seen = next_url in redirect_chain

            redirect_count += 1
            current_url = next_url
            redirect_chain.append(current_url)

            # Check if we've seen this URL before (redirect loop)
            if already_seen:
                return outcome(
                    response.status_code,
                    error=f'Redirect loop detected at step {redirect_count}',
                )

    except requests.exceptions.RequestException as e:
        return outcome(None, error=str(e))
83167
def check_url(self, url):
    """Check one URL and return its redirect-chain result, annotated.

    The dict produced by ``follow_redirect_chain`` is extended in place
    with ``original_url`` (the URL that was asked for) and ``redirected``
    (True when the chain holds more than the starting URL).
    """
    outcome = self.follow_redirect_chain(url)
    hops_recorded = len(outcome['redirect_chain'])
    outcome.update(original_url=url, redirected=hops_recorded > 1)
    return outcome
177+
84178 def check_urls (self , urls ):
85179 """Check all URLs concurrently."""
86180 results = []
@@ -90,6 +184,7 @@ def check_urls(self, urls):
90184
91185 self .log (f"🔍 Checking { len (urls )} URLs..." )
92186 self .log (f"⚙️ Using { self .max_workers } workers with { self .delay } s delay" )
187+ self .log (f"🔄 Following up to { self .max_redirects } redirects per URL" )
93188 self .log ("=" * 60 )
94189
95190 with ThreadPoolExecutor (max_workers = self .max_workers ) as executor :
@@ -109,25 +204,36 @@ def check_urls(self, urls):
109204 self .log (f"Progress: { i } /{ len (urls )} URLs checked" )
110205
111206 # Categorize results
207+ original_url = result ['original_url' ]
208+
112209 if result ['error' ]:
113210 failed_urls .append (result )
114- self .log (f"❌ ERROR: { result ['url' ]} - { result ['error' ]} " )
211+ self .log (f"❌ ERROR: { original_url } - { result ['error' ]} " )
212+ if result ['redirect_count' ] > 0 :
213+ self .log (f" Chain: { ' → ' .join (result ['redirect_chain' ])} " )
214+ elif result ['leads_to_home' ]:
215+ home_redirect_urls .append (result )
216+ self .log (f"🏠 HOME REDIRECT: { original_url } → HOME (step { result ['home_at_step' ]} )" )
217+ self .log (f" Chain: { ' → ' .join (result ['redirect_chain' ])} " )
115218 elif result ['status_code' ] == 404 :
116219 failed_urls .append (result )
117- self .log (f"❌ 404: { result ['url' ]} " )
118- elif result ['status_code' ] >= 400 :
220+ self .log (f"❌ 404: { original_url } " )
221+ if result ['redirect_count' ] > 0 :
222+ self .log (f" Chain: { ' → ' .join (result ['redirect_chain' ])} " )
223+ elif result ['status_code' ] and result ['status_code' ] >= 400 :
119224 failed_urls .append (result )
120- self .log (f"⚠️ { result ['status_code' ]} : { result ['url' ]} " )
121- elif result ['home_redirect' ]:
122- home_redirect_urls .append (result )
123- self .log (f"🏠 HOME REDIRECT: { result ['url' ]} → { result ['final_url' ]} " )
225+ self .log (f"⚠️ { result ['status_code' ]} : { original_url } " )
226+ if result ['redirect_count' ] > 0 :
227+ self .log (f" Chain: { ' → ' .join (result ['redirect_chain' ])} " )
124228 elif result ['redirected' ]:
125229 redirect_urls .append (result )
126- self .log (f"🔄 REDIRECT: { result ['url' ]} → { result ['final_url' ]} " )
230+ self .log (f"🔄 REDIRECT ({ result ['redirect_count' ]} steps): { original_url } → { result ['final_url' ]} " )
231+ if result ['redirect_count' ] > 1 :
232+ self .log (f" Chain: { ' → ' .join (result ['redirect_chain' ])} " )
127233 elif result ['status_code' ] == 200 :
128- self .log (f"✅ OK: { result [ 'url' ] } " )
234+ self .log (f"✅ OK: { original_url } " )
129235 else :
130- self .log (f"ℹ️ { result ['status_code' ]} : { result [ 'url' ] } " )
236+ self .log (f"ℹ️ { result ['status_code' ]} : { original_url } " )
131237
132238 return results , failed_urls , redirect_urls , home_redirect_urls
133239
@@ -138,51 +244,58 @@ def print_summary(self, results, failed_urls, redirect_urls, home_redirect_urls)
138244 self .log ("=" * 60 )
139245
140246 total_urls = len (results )
141- success_urls = len ([r for r in results if r ['status_code' ] == 200 and not r ['error' ]])
247+ success_urls = len ([r for r in results if r ['status_code' ] == 200 and not r ['error' ] and not r [ 'leads_to_home' ] ])
142248
143249 self .log (f"Total URLs checked: { total_urls } " )
144250 self .log (f"✅ Successful (200): { success_urls } " )
145- self .log (f"🔄 Redirects: { len (redirect_urls )} " )
146- self .log (f"🏠 Home page redirects: { len (home_redirect_urls )} " )
251+ self .log (f"🔄 Redirects (working) : { len (redirect_urls )} " )
252+ self .log (f"🏠 Home page redirects (ERROR) : { len (home_redirect_urls )} " )
147253 self .log (f"❌ Failed/Errors: { len (failed_urls )} " )
148254
255+ if home_redirect_urls :
256+ self .log (f"\n 🏠 HOME PAGE REDIRECTS - FLAGGED AS ERRORS ({ len (home_redirect_urls )} ):" )
257+ self .log ("-" * 40 )
258+ self .log ("⚠️ These URLs redirect to the home page instead of specific content:" )
259+ for result in home_redirect_urls :
260+ self .log (f"{ result ['original_url' ]} (step { result ['home_at_step' ]} )" )
261+ self .log (f" Chain: { ' → ' .join (result ['redirect_chain' ])} " )
262+
149263 if failed_urls :
150264 self .log (f"\n ❌ FAILED URLS ({ len (failed_urls )} ):" )
151265 self .log ("-" * 40 )
152266 for result in failed_urls :
153267 if result ['error' ]:
154- self .log (f"ERROR: { result ['url ' ]} - { result ['error' ]} " )
268+ self .log (f"ERROR: { result ['original_url ' ]} - { result ['error' ]} " )
155269 else :
156- self .log (f"{ result ['status_code' ]} : { result ['url' ]} " )
157-
158- if home_redirect_urls :
159- self .log (f"\n 🏠 HOME PAGE REDIRECTS ({ len (home_redirect_urls )} ):" )
160- self .log ("-" * 40 )
161- self .log ("⚠️ These URLs redirect to the home page instead of specific content:" )
162- for result in home_redirect_urls :
163- self .log (f"{ result ['url' ]} → { result ['final_url' ]} " )
270+ self .log (f"{ result ['status_code' ]} : { result ['original_url' ]} " )
271+ if result ['redirect_count' ] > 0 :
272+ self .log (f" Chain: { ' → ' .join (result ['redirect_chain' ])} " )
164273
165274 if redirect_urls :
166- self .log (f"\n 🔄 OTHER REDIRECTED URLS ({ len (redirect_urls )} ):" )
275+ self .log (f"\n 🔄 WORKING REDIRECTED URLS ({ len (redirect_urls )} ):" )
167276 self .log ("-" * 40 )
168277 for result in redirect_urls :
169- self .log (f"{ result ['url' ]} → { result ['final_url' ]} " )
278+ self .log (f"{ result ['original_url' ]} → { result ['final_url' ]} ({ result ['redirect_count' ]} steps)" )
279+ if result ['redirect_count' ] > 1 :
280+ self .log (f" Chain: { ' → ' .join (result ['redirect_chain' ])} " )
170281
171- # Consider home redirects as problematic for the exit code
172- return len (failed_urls ) == 0 and len (home_redirect_urls ) == 0
282+ # Home redirects are now considered errors
283+ total_errors = len (failed_urls ) + len (home_redirect_urls )
284+ return total_errors == 0
173285
174286def main ():
175- parser = argparse .ArgumentParser (description = 'Check URLs in Fern sitemap for 404 errors' )
287+ parser = argparse .ArgumentParser (description = 'Check URLs in Fern sitemap for 404 errors and home redirects ' )
176288 parser .add_argument ('--sitemap' , default = 'fern/docs.xml' , help = 'Path to sitemap XML file' )
177289 parser .add_argument ('--workers' , type = int , default = 10 , help = 'Number of concurrent workers' )
178290 parser .add_argument ('--delay' , type = float , default = 0.1 , help = 'Delay between requests (seconds)' )
179291 parser .add_argument ('--timeout' , type = int , default = 30 , help = 'Request timeout (seconds)' )
292+ parser .add_argument ('--max-redirects' , type = int , default = 10 , help = 'Maximum number of redirects to follow' )
180293 parser .add_argument ('--max-urls' , type = int , help = 'Limit number of URLs to check (for testing)' )
181294 parser .add_argument ('--output' , default = 'check_urls_output.txt' , help = 'Output file path' )
182295
183296 args = parser .parse_args ()
184297
185- checker = URLChecker (args .sitemap , args .workers , args .delay , args .timeout )
298+ checker = URLChecker (args .sitemap , args .workers , args .delay , args .timeout , args . max_redirects )
186299
187300 # Open output file for writing
188301 try :
@@ -193,7 +306,7 @@ def main():
193306 sys .exit (1 )
194307
195308 try :
196- checker .log ("🚀 Fern Docs URL Checker" )
309+ checker .log ("🚀 Fern Docs URL Checker - Enhanced Redirect Tracking " )
197310 checker .log ("=" * 60 )
198311
199312 # Parse sitemap
@@ -214,7 +327,15 @@ def main():
214327 success = checker .print_summary (results , failed_urls , redirect_urls , home_redirect_urls )
215328
216329 checker .log (f"\n 📁 Results saved to: { args .output } " )
217- sys .exit (0 if success else 1 )
330+
331+ # Exit with error code if there are any issues (including home redirects)
332+ total_issues = len (failed_urls ) + len (home_redirect_urls )
333+ if total_issues > 0 :
334+ checker .log (f"\n ❌ Found { total_issues } issues (including home redirects)" )
335+ sys .exit (1 )
336+ else :
337+ checker .log (f"\n ✅ All URLs are working correctly!" )
338+ sys .exit (0 )
218339
219340 finally :
220341 # Close output file
0 commit comments