@@ -22,6 +22,17 @@ def __init__(self, sitemap_path, max_workers=10, delay=0.1, timeout=30):
2222 self .session .headers .update ({
2323 'User-Agent' : 'Fern-URL-Checker/1.0'
2424 })
25+ # Define the problematic home page URL
26+ self .home_page_url = 'https://fern-api.docs.buildwithfern.com/learn/home'
27+ # File handle for output logging
28+ self .output_file = None
29+
def log(self, message):
    """Print ``message`` to the console and mirror it to the output file.

    The message is always printed. When ``self.output_file`` holds an open
    file object, the same text is written to it followed by exactly one
    newline, and the file is flushed so a partial run still leaves a
    usable log on disk.

    Args:
        message: The text to emit (without a trailing newline).
    """
    print(message)
    # ``output_file`` stays ``None`` until a caller assigns an open handle.
    if self.output_file is not None:
        # Append a bare newline only: any extra character after it would
        # become a prefix on the next logged line in the file.
        self.output_file.write(message + '\n')
        self.output_file.flush()
2536
2637 def parse_sitemap (self ):
2738 """Parse the XML sitemap and extract all URLs."""
@@ -40,21 +51,24 @@ def parse_sitemap(self):
4051
4152 return urls
4253 except ET .ParseError as e :
43- print (f"❌ Error parsing XML sitemap: { e } " )
54+ self . log (f"❌ Error parsing XML sitemap: { e } " )
4455 return []
4556 except FileNotFoundError :
46- print (f"❌ Sitemap file not found: { self .sitemap_path } " )
57+ self . log (f"❌ Sitemap file not found: { self .sitemap_path } " )
4758 return []
4859
4960 def check_url (self , url ):
5061 """Check a single URL and return result."""
5162 try :
5263 response = self .session .get (url , timeout = self .timeout , allow_redirects = True )
64+ is_home_redirect = (url != response .url and
65+ response .url .rstrip ('/' ) == self .home_page_url .rstrip ('/' ))
5366 return {
5467 'url' : url ,
5568 'status_code' : response .status_code ,
5669 'final_url' : response .url ,
5770 'redirected' : url != response .url ,
71+ 'home_redirect' : is_home_redirect ,
5872 'error' : None
5973 }
6074 except requests .exceptions .RequestException as e :
@@ -63,6 +77,7 @@ def check_url(self, url):
6377 'status_code' : None ,
6478 'final_url' : None ,
6579 'redirected' : False ,
80+ 'home_redirect' : False ,
6681 'error' : str (e )
6782 }
6883
@@ -71,10 +86,11 @@ def check_urls(self, urls):
7186 results = []
7287 failed_urls = []
7388 redirect_urls = []
89+ home_redirect_urls = []
7490
75- print (f"🔍 Checking { len (urls )} URLs..." )
76- print (f"⚙️ Using { self .max_workers } workers with { self .delay } s delay" )
77- print ("=" * 60 )
91+ self . log (f"🔍 Checking { len (urls )} URLs..." )
92+ self . log (f"⚙️ Using { self .max_workers } workers with { self .delay } s delay" )
93+ self . log ("=" * 60 )
7894
7995 with ThreadPoolExecutor (max_workers = self .max_workers ) as executor :
8096 # Submit all URL check tasks
@@ -90,58 +106,70 @@ def check_urls(self, urls):
90106
91107 # Print progress
92108 if i % 50 == 0 or i == len (urls ):
93- print (f"Progress: { i } /{ len (urls )} URLs checked" )
109+ self . log (f"Progress: { i } /{ len (urls )} URLs checked" )
94110
95111 # Categorize results
96112 if result ['error' ]:
97113 failed_urls .append (result )
98- print (f"❌ ERROR: { result ['url' ]} - { result ['error' ]} " )
114+ self . log (f"❌ ERROR: { result ['url' ]} - { result ['error' ]} " )
99115 elif result ['status_code' ] == 404 :
100116 failed_urls .append (result )
101- print (f"❌ 404: { result ['url' ]} " )
117+ self . log (f"❌ 404: { result ['url' ]} " )
102118 elif result ['status_code' ] >= 400 :
103119 failed_urls .append (result )
104- print (f"⚠️ { result ['status_code' ]} : { result ['url' ]} " )
120+ self .log (f"⚠️ { result ['status_code' ]} : { result ['url' ]} " )
121+ elif result ['home_redirect' ]:
122+ home_redirect_urls .append (result )
123+ self .log (f"🏠 HOME REDIRECT: { result ['url' ]} → { result ['final_url' ]} " )
105124 elif result ['redirected' ]:
106125 redirect_urls .append (result )
107- print (f"🔄 REDIRECT: { result ['url' ]} → { result ['final_url' ]} " )
126+ self . log (f"🔄 REDIRECT: { result ['url' ]} → { result ['final_url' ]} " )
108127 elif result ['status_code' ] == 200 :
109- print (f"✅ OK: { result ['url' ]} " )
128+ self . log (f"✅ OK: { result ['url' ]} " )
110129 else :
111- print (f"ℹ️ { result ['status_code' ]} : { result ['url' ]} " )
130+ self . log (f"ℹ️ { result ['status_code' ]} : { result ['url' ]} " )
112131
113- return results , failed_urls , redirect_urls
132+ return results , failed_urls , redirect_urls , home_redirect_urls
114133
def print_summary(self, results, failed_urls, redirect_urls, home_redirect_urls):
    """Log a summary of the URL check and report overall success.

    All output goes through ``self.log``.

    Args:
        results: Every per-URL result dict produced by the check. Each dict
            is read for ``'status_code'``, ``'error'`` and (optionally)
            ``'redirected'``.
        failed_urls: Result dicts for URLs that errored or returned a
            failing status; read for ``'url'``, ``'error'`` and
            ``'status_code'``.
        redirect_urls: Result dicts for URLs that redirected somewhere other
            than the home page; read for ``'url'`` and ``'final_url'``.
        home_redirect_urls: Result dicts for URLs that redirected to the
            home page; read for ``'url'`` and ``'final_url'``.

    Returns:
        bool: ``True`` only when there are no failed URLs and no home-page
        redirects; ``False`` otherwise.
    """
    self.log("\n" + "=" * 60)
    self.log("📊 SUMMARY")
    self.log("=" * 60)

    total_urls = len(results)
    # A redirected request also ends with a 200 final status, so exclude
    # redirected results here; otherwise they would be counted both as
    # "successful" and again under the redirect / home-redirect totals.
    success_urls = sum(
        1
        for r in results
        if r['status_code'] == 200 and not r['error'] and not r.get('redirected')
    )

    self.log(f"Total URLs checked: {total_urls}")
    self.log(f"✅ Successful (200): {success_urls}")
    self.log(f"🔄 Redirects: {len(redirect_urls)}")
    self.log(f"🏠 Home page redirects: {len(home_redirect_urls)}")
    self.log(f"❌ Failed/Errors: {len(failed_urls)}")

    if failed_urls:
        self.log(f"\n❌ FAILED URLS ({len(failed_urls)}):")
        self.log("-" * 40)
        for result in failed_urls:
            if result['error']:
                self.log(f"ERROR: {result['url']} - {result['error']}")
            else:
                self.log(f"{result['status_code']}: {result['url']}")

    if home_redirect_urls:
        self.log(f"\n🏠 HOME PAGE REDIRECTS ({len(home_redirect_urls)}):")
        self.log("-" * 40)
        self.log("⚠️ These URLs redirect to the home page instead of specific content:")
        for result in home_redirect_urls:
            self.log(f"{result['url']} → {result['final_url']}")

    if redirect_urls:
        self.log(f"\n🔄 OTHER REDIRECTED URLS ({len(redirect_urls)}):")
        self.log("-" * 40)
        for result in redirect_urls:
            self.log(f"{result['url']} → {result['final_url']}")

    # Home-page redirects are treated as problems for the exit code too.
    return not failed_urls and not home_redirect_urls
145173
146174def main ():
147175 parser = argparse .ArgumentParser (description = 'Check URLs in Fern sitemap for 404 errors' )
@@ -150,31 +178,48 @@ def main():
150178 parser .add_argument ('--delay' , type = float , default = 0.1 , help = 'Delay between requests (seconds)' )
151179 parser .add_argument ('--timeout' , type = int , default = 30 , help = 'Request timeout (seconds)' )
152180 parser .add_argument ('--max-urls' , type = int , help = 'Limit number of URLs to check (for testing)' )
181+ parser .add_argument ('--output' , default = 'check_urls_output.txt' , help = 'Output file path' )
153182
154183 args = parser .parse_args ()
155184
156185 checker = URLChecker (args .sitemap , args .workers , args .delay , args .timeout )
157186
158- print ("🚀 Fern Docs URL Checker" )
159- print ("=" * 60 )
160-
161- # Parse sitemap
162- urls = checker .parse_sitemap ()
163- if not urls :
164- print ("❌ No URLs found in sitemap" )
187+ # Open output file for writing
188+ try :
189+ checker .output_file = open (args .output , 'w' , encoding = 'utf-8' )
190+ checker .log (f"📝 Output will be saved to: { args .output } " )
191+ except IOError as e :
192+ print (f"❌ Error opening output file { args .output } : { e } " )
165193 sys .exit (1 )
166194
167- # Limit URLs if specified (for testing)
168- if args .max_urls :
169- urls = urls [:args .max_urls ]
170- print (f"🔬 Testing mode: checking first { len (urls )} URLs" )
171-
172- # Check URLs
173- results , failed_urls , redirect_urls = checker .check_urls (urls )
174-
175- # Print summary and exit
176- success = checker .print_summary (results , failed_urls , redirect_urls )
177- sys .exit (0 if success else 1 )
195+ try :
196+ checker .log ("🚀 Fern Docs URL Checker" )
197+ checker .log ("=" * 60 )
198+
199+ # Parse sitemap
200+ urls = checker .parse_sitemap ()
201+ if not urls :
202+ checker .log ("❌ No URLs found in sitemap" )
203+ sys .exit (1 )
204+
205+ # Limit URLs if specified (for testing)
206+ if args .max_urls :
207+ urls = urls [:args .max_urls ]
208+ checker .log (f"🔬 Testing mode: checking first { len (urls )} URLs" )
209+
210+ # Check URLs
211+ results , failed_urls , redirect_urls , home_redirect_urls = checker .check_urls (urls )
212+
213+ # Print summary and exit
214+ success = checker .print_summary (results , failed_urls , redirect_urls , home_redirect_urls )
215+
216+ checker .log (f"\n 📁 Results saved to: { args .output } " )
217+ sys .exit (0 if success else 1 )
218+
219+ finally :
220+ # Close output file
221+ if checker .output_file :
222+ checker .output_file .close ()
178223
179224if __name__ == "__main__" :
180225 main ()
0 commit comments