Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 92 additions & 47 deletions check_urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,17 @@ def __init__(self, sitemap_path, max_workers=10, delay=0.1, timeout=30):
self.session.headers.update({
'User-Agent': 'Fern-URL-Checker/1.0'
})
# Define the problematic home page URL
self.home_page_url = 'https://fern-api.docs.buildwithfern.com/learn/home'
# File handle for output logging
self.output_file = None

def log(self, message):
    """Echo *message* to stdout and mirror it into the open output file.

    ``self.output_file`` may still be ``None`` (falsy) before the output
    file has been opened; in that case the message goes to stdout only.
    """
    print(message)
    sink = self.output_file
    if sink:
        sink.write(f"{message}\n")
        sink.flush()  # flush per line so a crash doesn't lose buffered output

def parse_sitemap(self):
"""Parse the XML sitemap and extract all URLs."""
Expand All @@ -40,21 +51,24 @@ def parse_sitemap(self):

return urls
except ET.ParseError as e:
print(f"❌ Error parsing XML sitemap: {e}")
self.log(f"❌ Error parsing XML sitemap: {e}")
return []
except FileNotFoundError:
print(f"❌ Sitemap file not found: {self.sitemap_path}")
self.log(f"❌ Sitemap file not found: {self.sitemap_path}")
return []

def check_url(self, url):
"""Check a single URL and return result."""
try:
response = self.session.get(url, timeout=self.timeout, allow_redirects=True)
is_home_redirect = (url != response.url and
response.url.rstrip('/') == self.home_page_url.rstrip('/'))
return {
'url': url,
'status_code': response.status_code,
'final_url': response.url,
'redirected': url != response.url,
'home_redirect': is_home_redirect,
'error': None
}
except requests.exceptions.RequestException as e:
Expand All @@ -63,6 +77,7 @@ def check_url(self, url):
'status_code': None,
'final_url': None,
'redirected': False,
'home_redirect': False,
'error': str(e)
}

Expand All @@ -71,10 +86,11 @@ def check_urls(self, urls):
results = []
failed_urls = []
redirect_urls = []
home_redirect_urls = []

print(f"🔍 Checking {len(urls)} URLs...")
print(f"⚙️ Using {self.max_workers} workers with {self.delay}s delay")
print("=" * 60)
self.log(f"🔍 Checking {len(urls)} URLs...")
self.log(f"⚙️ Using {self.max_workers} workers with {self.delay}s delay")
self.log("=" * 60)

with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
# Submit all URL check tasks
Expand All @@ -90,58 +106,70 @@ def check_urls(self, urls):

# Print progress
if i % 50 == 0 or i == len(urls):
print(f"Progress: {i}/{len(urls)} URLs checked")
self.log(f"Progress: {i}/{len(urls)} URLs checked")

# Categorize results
if result['error']:
failed_urls.append(result)
print(f"❌ ERROR: {result['url']} - {result['error']}")
self.log(f"❌ ERROR: {result['url']} - {result['error']}")
elif result['status_code'] == 404:
failed_urls.append(result)
print(f"❌ 404: {result['url']}")
self.log(f"❌ 404: {result['url']}")
elif result['status_code'] >= 400:
failed_urls.append(result)
print(f"⚠️ {result['status_code']}: {result['url']}")
self.log(f"⚠️ {result['status_code']}: {result['url']}")
elif result['home_redirect']:
home_redirect_urls.append(result)
self.log(f"🏠 HOME REDIRECT: {result['url']} → {result['final_url']}")
elif result['redirected']:
redirect_urls.append(result)
print(f"🔄 REDIRECT: {result['url']} → {result['final_url']}")
self.log(f"🔄 REDIRECT: {result['url']} → {result['final_url']}")
elif result['status_code'] == 200:
print(f"✅ OK: {result['url']}")
self.log(f"✅ OK: {result['url']}")
else:
print(f"ℹ️ {result['status_code']}: {result['url']}")
self.log(f"ℹ️ {result['status_code']}: {result['url']}")

return results, failed_urls, redirect_urls
return results, failed_urls, redirect_urls, home_redirect_urls

def print_summary(self, results, failed_urls, redirect_urls):
def print_summary(self, results, failed_urls, redirect_urls, home_redirect_urls):
"""Print summary of results."""
print("\n" + "=" * 60)
print("📊 SUMMARY")
print("=" * 60)
self.log("\n" + "=" * 60)
self.log("📊 SUMMARY")
self.log("=" * 60)

total_urls = len(results)
success_urls = len([r for r in results if r['status_code'] == 200 and not r['error']])

print(f"Total URLs checked: {total_urls}")
print(f"✅ Successful (200): {success_urls}")
print(f"🔄 Redirects: {len(redirect_urls)}")
print(f"❌ Failed/Errors: {len(failed_urls)}")
self.log(f"Total URLs checked: {total_urls}")
self.log(f"✅ Successful (200): {success_urls}")
self.log(f"🔄 Redirects: {len(redirect_urls)}")
self.log(f"🏠 Home page redirects: {len(home_redirect_urls)}")
self.log(f"❌ Failed/Errors: {len(failed_urls)}")

if failed_urls:
print(f"\n❌ FAILED URLS ({len(failed_urls)}):")
print("-" * 40)
self.log(f"\n❌ FAILED URLS ({len(failed_urls)}):")
self.log("-" * 40)
for result in failed_urls:
if result['error']:
print(f"ERROR: {result['url']} - {result['error']}")
self.log(f"ERROR: {result['url']} - {result['error']}")
else:
print(f"{result['status_code']}: {result['url']}")
self.log(f"{result['status_code']}: {result['url']}")

if home_redirect_urls:
self.log(f"\n🏠 HOME PAGE REDIRECTS ({len(home_redirect_urls)}):")
self.log("-" * 40)
self.log("⚠️ These URLs redirect to the home page instead of specific content:")
for result in home_redirect_urls:
self.log(f"{result['url']} → {result['final_url']}")

if redirect_urls:
print(f"\n🔄 REDIRECTED URLS ({len(redirect_urls)}):")
print("-" * 40)
self.log(f"\n🔄 OTHER REDIRECTED URLS ({len(redirect_urls)}):")
self.log("-" * 40)
for result in redirect_urls:
print(f"{result['url']} → {result['final_url']}")
self.log(f"{result['url']} → {result['final_url']}")

return len(failed_urls) == 0
# Consider home redirects as problematic for the exit code
return len(failed_urls) == 0 and len(home_redirect_urls) == 0

def main():
parser = argparse.ArgumentParser(description='Check URLs in Fern sitemap for 404 errors')
Expand All @@ -150,31 +178,48 @@ def main():
parser.add_argument('--delay', type=float, default=0.1, help='Delay between requests (seconds)')
parser.add_argument('--timeout', type=int, default=30, help='Request timeout (seconds)')
parser.add_argument('--max-urls', type=int, help='Limit number of URLs to check (for testing)')
parser.add_argument('--output', default='check_urls_output.txt', help='Output file path')

args = parser.parse_args()

checker = URLChecker(args.sitemap, args.workers, args.delay, args.timeout)

print("🚀 Fern Docs URL Checker")
print("=" * 60)

# Parse sitemap
urls = checker.parse_sitemap()
if not urls:
print("❌ No URLs found in sitemap")
# Open output file for writing
try:
checker.output_file = open(args.output, 'w', encoding='utf-8')
checker.log(f"📝 Output will be saved to: {args.output}")
except IOError as e:
print(f"❌ Error opening output file {args.output}: {e}")
sys.exit(1)

# Limit URLs if specified (for testing)
if args.max_urls:
urls = urls[:args.max_urls]
print(f"🔬 Testing mode: checking first {len(urls)} URLs")

# Check URLs
results, failed_urls, redirect_urls = checker.check_urls(urls)

# Print summary and exit
success = checker.print_summary(results, failed_urls, redirect_urls)
sys.exit(0 if success else 1)
try:
checker.log("🚀 Fern Docs URL Checker")
checker.log("=" * 60)

# Parse sitemap
urls = checker.parse_sitemap()
if not urls:
checker.log("❌ No URLs found in sitemap")
sys.exit(1)

# Limit URLs if specified (for testing)
if args.max_urls:
urls = urls[:args.max_urls]
checker.log(f"🔬 Testing mode: checking first {len(urls)} URLs")

# Check URLs
results, failed_urls, redirect_urls, home_redirect_urls = checker.check_urls(urls)

# Print summary and exit
success = checker.print_summary(results, failed_urls, redirect_urls, home_redirect_urls)

checker.log(f"\n📁 Results saved to: {args.output}")
sys.exit(0 if success else 1)

finally:
# Close output file
if checker.output_file:
checker.output_file.close()

if __name__ == "__main__":
main()
Loading