Skip to content

Commit a6f398c

Browse files
kgowru (Kapil Gowru), fern-support
authored
07 22 verifying redirects (#98)
Co-authored-by: Kapil Gowru <[email protected]> Co-authored-by: Fern Support <[email protected]>
1 parent 8ff5655 commit a6f398c

File tree

14 files changed

+4239
-1362
lines changed

14 files changed

+4239
-1362
lines changed

check_urls.py

Lines changed: 92 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,17 @@ def __init__(self, sitemap_path, max_workers=10, delay=0.1, timeout=30):
2222
self.session.headers.update({
2323
'User-Agent': 'Fern-URL-Checker/1.0'
2424
})
25+
# Define the problematic home page URL
26+
self.home_page_url = 'https://fern-api.docs.buildwithfern.com/learn/home'
27+
# File handle for output logging
28+
self.output_file = None
29+
30+
def log(self, message):
    """Print a message to the console and mirror it to the output file.

    Args:
        message: The value to report. It is printed as-is and, when
            ``self.output_file`` is set, written to that file followed
            by a newline.
    """
    print(message)
    if self.output_file:
        # print() accepts any object, so format the message instead of
        # concatenating: `message + '\n'` raises TypeError for non-str input.
        self.output_file.write(f"{message}\n")
        # Flush right away so the file is current even if the run is interrupted.
        self.output_file.flush()
2536

2637
def parse_sitemap(self):
2738
"""Parse the XML sitemap and extract all URLs."""
@@ -40,21 +51,24 @@ def parse_sitemap(self):
4051

4152
return urls
4253
except ET.ParseError as e:
43-
print(f"❌ Error parsing XML sitemap: {e}")
54+
self.log(f"❌ Error parsing XML sitemap: {e}")
4455
return []
4556
except FileNotFoundError:
46-
print(f"❌ Sitemap file not found: {self.sitemap_path}")
57+
self.log(f"❌ Sitemap file not found: {self.sitemap_path}")
4758
return []
4859

4960
def check_url(self, url):
5061
"""Check a single URL and return result."""
5162
try:
5263
response = self.session.get(url, timeout=self.timeout, allow_redirects=True)
64+
is_home_redirect = (url != response.url and
65+
response.url.rstrip('/') == self.home_page_url.rstrip('/'))
5366
return {
5467
'url': url,
5568
'status_code': response.status_code,
5669
'final_url': response.url,
5770
'redirected': url != response.url,
71+
'home_redirect': is_home_redirect,
5872
'error': None
5973
}
6074
except requests.exceptions.RequestException as e:
@@ -63,6 +77,7 @@ def check_url(self, url):
6377
'status_code': None,
6478
'final_url': None,
6579
'redirected': False,
80+
'home_redirect': False,
6681
'error': str(e)
6782
}
6883

@@ -71,10 +86,11 @@ def check_urls(self, urls):
7186
results = []
7287
failed_urls = []
7388
redirect_urls = []
89+
home_redirect_urls = []
7490

75-
print(f"🔍 Checking {len(urls)} URLs...")
76-
print(f"⚙️ Using {self.max_workers} workers with {self.delay}s delay")
77-
print("=" * 60)
91+
self.log(f"🔍 Checking {len(urls)} URLs...")
92+
self.log(f"⚙️ Using {self.max_workers} workers with {self.delay}s delay")
93+
self.log("=" * 60)
7894

7995
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
8096
# Submit all URL check tasks
@@ -90,58 +106,70 @@ def check_urls(self, urls):
90106

91107
# Print progress
92108
if i % 50 == 0 or i == len(urls):
93-
print(f"Progress: {i}/{len(urls)} URLs checked")
109+
self.log(f"Progress: {i}/{len(urls)} URLs checked")
94110

95111
# Categorize results
96112
if result['error']:
97113
failed_urls.append(result)
98-
print(f"❌ ERROR: {result['url']} - {result['error']}")
114+
self.log(f"❌ ERROR: {result['url']} - {result['error']}")
99115
elif result['status_code'] == 404:
100116
failed_urls.append(result)
101-
print(f"❌ 404: {result['url']}")
117+
self.log(f"❌ 404: {result['url']}")
102118
elif result['status_code'] >= 400:
103119
failed_urls.append(result)
104-
print(f"⚠️ {result['status_code']}: {result['url']}")
120+
self.log(f"⚠️ {result['status_code']}: {result['url']}")
121+
elif result['home_redirect']:
122+
home_redirect_urls.append(result)
123+
self.log(f"🏠 HOME REDIRECT: {result['url']}{result['final_url']}")
105124
elif result['redirected']:
106125
redirect_urls.append(result)
107-
print(f"🔄 REDIRECT: {result['url']}{result['final_url']}")
126+
self.log(f"🔄 REDIRECT: {result['url']}{result['final_url']}")
108127
elif result['status_code'] == 200:
109-
print(f"✅ OK: {result['url']}")
128+
self.log(f"✅ OK: {result['url']}")
110129
else:
111-
print(f"ℹ️ {result['status_code']}: {result['url']}")
130+
self.log(f"ℹ️ {result['status_code']}: {result['url']}")
112131

113-
return results, failed_urls, redirect_urls
132+
return results, failed_urls, redirect_urls, home_redirect_urls
114133

115-
def print_summary(self, results, failed_urls, redirect_urls, home_redirect_urls):
    """Log a summary of the URL check and report overall success.

    Args:
        results: All result dicts produced by the URL check.
        failed_urls: Result dicts that errored or returned a status >= 400.
        redirect_urls: Result dicts that redirected somewhere other than
            the home page.
        home_redirect_urls: Result dicts whose final URL is the home page.

    Returns:
        True when there are no failed URLs and no home page redirects,
        otherwise False.
    """
    divider = "=" * 60
    self.log("\n" + divider)
    self.log("📊 SUMMARY")
    self.log(divider)

    total_urls = len(results)
    success_urls = sum(
        1 for r in results if r['status_code'] == 200 and not r['error']
    )

    self.log(f"Total URLs checked: {total_urls}")
    self.log(f"✅ Successful (200): {success_urls}")
    self.log(f"🔄 Redirects: {len(redirect_urls)}")
    self.log(f"🏠 Home page redirects: {len(home_redirect_urls)}")
    self.log(f"❌ Failed/Errors: {len(failed_urls)}")

    if failed_urls:
        self.log(f"\n❌ FAILED URLS ({len(failed_urls)}):")
        self.log("-" * 40)
        for result in failed_urls:
            if result['error']:
                self.log(f"ERROR: {result['url']} - {result['error']}")
            else:
                self.log(f"{result['status_code']}: {result['url']}")

    if home_redirect_urls:
        self.log(f"\n🏠 HOME PAGE REDIRECTS ({len(home_redirect_urls)}):")
        self.log("-" * 40)
        self.log("⚠️ These URLs redirect to the home page instead of specific content:")
        for result in home_redirect_urls:
            # Separate source and destination; without the arrow the two
            # URLs run together into one unreadable string.
            self.log(f"{result['url']} → {result['final_url']}")

    if redirect_urls:
        self.log(f"\n🔄 OTHER REDIRECTED URLS ({len(redirect_urls)}):")
        self.log("-" * 40)
        for result in redirect_urls:
            self.log(f"{result['url']} → {result['final_url']}")

    # Home redirects count as problems for the exit code, same as failures.
    return not failed_urls and not home_redirect_urls
145173

146174
def main():
147175
parser = argparse.ArgumentParser(description='Check URLs in Fern sitemap for 404 errors')
@@ -150,31 +178,48 @@ def main():
150178
parser.add_argument('--delay', type=float, default=0.1, help='Delay between requests (seconds)')
151179
parser.add_argument('--timeout', type=int, default=30, help='Request timeout (seconds)')
152180
parser.add_argument('--max-urls', type=int, help='Limit number of URLs to check (for testing)')
181+
parser.add_argument('--output', default='check_urls_output.txt', help='Output file path')
153182

154183
args = parser.parse_args()
155184

156185
checker = URLChecker(args.sitemap, args.workers, args.delay, args.timeout)
157186

158-
print("🚀 Fern Docs URL Checker")
159-
print("=" * 60)
160-
161-
# Parse sitemap
162-
urls = checker.parse_sitemap()
163-
if not urls:
164-
print("❌ No URLs found in sitemap")
187+
# Open output file for writing
188+
try:
189+
checker.output_file = open(args.output, 'w', encoding='utf-8')
190+
checker.log(f"📝 Output will be saved to: {args.output}")
191+
except IOError as e:
192+
print(f"❌ Error opening output file {args.output}: {e}")
165193
sys.exit(1)
166194

167-
# Limit URLs if specified (for testing)
168-
if args.max_urls:
169-
urls = urls[:args.max_urls]
170-
print(f"🔬 Testing mode: checking first {len(urls)} URLs")
171-
172-
# Check URLs
173-
results, failed_urls, redirect_urls = checker.check_urls(urls)
174-
175-
# Print summary and exit
176-
success = checker.print_summary(results, failed_urls, redirect_urls)
177-
sys.exit(0 if success else 1)
195+
try:
196+
checker.log("🚀 Fern Docs URL Checker")
197+
checker.log("=" * 60)
198+
199+
# Parse sitemap
200+
urls = checker.parse_sitemap()
201+
if not urls:
202+
checker.log("❌ No URLs found in sitemap")
203+
sys.exit(1)
204+
205+
# Limit URLs if specified (for testing)
206+
if args.max_urls:
207+
urls = urls[:args.max_urls]
208+
checker.log(f"🔬 Testing mode: checking first {len(urls)} URLs")
209+
210+
# Check URLs
211+
results, failed_urls, redirect_urls, home_redirect_urls = checker.check_urls(urls)
212+
213+
# Print summary and exit
214+
success = checker.print_summary(results, failed_urls, redirect_urls, home_redirect_urls)
215+
216+
checker.log(f"\n📁 Results saved to: {args.output}")
217+
sys.exit(0 if success else 1)
218+
219+
finally:
220+
# Close output file
221+
if checker.output_file:
222+
checker.output_file.close()
178223

179224
if __name__ == "__main__":
180225
main()

0 commit comments

Comments
 (0)