Skip to content

Commit 346a9a2

Browse files
kgowru (Kapil Gowru)
authored
07 23 redirects 11 (#108)
Co-authored-by: Kapil Gowru <[email protected]> Co-authored-by: Kapil Gowru <[email protected]>
1 parent 0f9150b commit 346a9a2

File tree

5 files changed

+1310
-1050
lines changed

5 files changed

+1310
-1050
lines changed

check_urls.py

Lines changed: 169 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"""
33
URL Checker Script for Fern Docs Sitemap
44
Checks all URLs in the sitemap for 404 errors and other issues.
5+
Follows complete redirect chains and flags home page redirects as errors.
56
"""
67

78
import xml.etree.ElementTree as ET
@@ -13,17 +14,25 @@
1314
import argparse
1415

1516
class URLChecker:
16-
def __init__(self, sitemap_path, max_workers=10, delay=0.1, timeout=30):
17+
def __init__(self, sitemap_path, max_workers=10, delay=0.1, timeout=30, max_redirects=10):
1718
self.sitemap_path = sitemap_path
1819
self.max_workers = max_workers
1920
self.delay = delay
2021
self.timeout = timeout
22+
self.max_redirects = max_redirects
2123
self.session = requests.Session()
2224
self.session.headers.update({
2325
'User-Agent': 'Fern-URL-Checker/1.0'
2426
})
25-
# Define the problematic home page URL
26-
self.home_page_url = 'https://fern-api.docs.buildwithfern.com/learn/home'
27+
# Define the problematic home page URLs (multiple variations)
28+
self.home_page_urls = {
29+
'https://fern-api.docs.buildwithfern.com/learn/home',
30+
'https://fern-v2.docs.buildwithfern.com/learn/v2/home',
31+
'https://buildfern.com/learn/home',
32+
'https://fern-api.docs.buildwithfern.com/learn',
33+
'https://fern-v2.docs.buildwithfern.com/learn',
34+
'https://buildfern.com/learn'
35+
}
2736
# File handle for output logging
2837
self.output_file = None
2938

@@ -57,30 +66,115 @@ def parse_sitemap(self):
5766
self.log(f"❌ Sitemap file not found: {self.sitemap_path}")
5867
return []
5968

60-
def check_url(self, url):
61-
"""Check a single URL and return result."""
69+
def is_home_page(self, url):
    """Return True when ``url`` matches one of the configured home page URLs.

    The comparison ignores trailing slashes on both sides, so
    ``.../learn/home`` and ``.../learn/home/`` are treated as the same page.
    Candidates are read from ``self.home_page_urls``.
    """
    candidate = url.rstrip('/')
    return any(home.rstrip('/') == candidate for home in self.home_page_urls)
73+
74+
def follow_redirect_chain(self, url):
    """Follow redirects manually and report the complete chain.

    Each hop is requested through ``self.session`` with
    ``allow_redirects=False`` so that every intermediate URL can be recorded.

    Args:
        url: The URL to start from.

    Returns:
        A dict with the keys:
            status_code: Status of the last response received, or None when
                the request raised or the redirect limit was exceeded.
            final_url: The last URL reached in the chain.
            redirect_chain: Every URL visited, starting with ``url``.
            redirect_count: Number of redirects followed.
            leads_to_home: True when a redirect landed on a home page URL
                (as decided by ``self.is_home_page``).
            home_at_step: Redirect step at which the home page was reached,
                otherwise None.
            error: Description of the problem, or None on success.
    """
    # Function-scope import: only this method needs urljoin, and the
    # module's import block is left untouched.
    from urllib.parse import urljoin

    redirect_statuses = (301, 302, 303, 307, 308)
    redirect_chain = [url]
    current_url = url
    redirect_count = 0

    def build_result(status_code, leads_to_home=False, error=None):
        # Reads current_url / redirect_count at call time, so the result
        # always reflects how far the chain has progressed.
        return {
            'status_code': status_code,
            'final_url': current_url,
            'redirect_chain': redirect_chain,
            'redirect_count': redirect_count,
            'leads_to_home': leads_to_home,
            'home_at_step': redirect_count if leads_to_home else None,
            'error': error,
        }

    try:
        while True:
            # Make request without following redirects automatically
            response = self.session.get(
                current_url, timeout=self.timeout, allow_redirects=False
            )

            # Only a URL reached *via a redirect* counts as a home redirect;
            # the starting URL being the home page itself is not an error.
            if redirect_count > 0 and self.is_home_page(current_url):
                return build_result(response.status_code, leads_to_home=True)

            # If not a redirect, we're done
            if response.status_code not in redirect_statuses:
                return build_result(response.status_code)

            location = response.headers.get('Location')
            if not location:
                return build_result(
                    response.status_code,
                    error='Redirect response missing Location header',
                )

            # Enforce the limit only when yet another hop would be needed,
            # so a chain of exactly max_redirects hops still gets its final
            # target fetched and checked.
            if redirect_count >= self.max_redirects:
                return build_result(
                    None,
                    error=f'Too many redirects (>{self.max_redirects})',
                )

            # urljoin resolves absolute, root-relative, path-relative and
            # protocol-relative ("//host/path") Location values.
            redirect_count += 1
            current_url = urljoin(current_url, location)
            redirect_chain.append(current_url)

            # Check if we've seen this URL before (redirect loop)
            if current_url in redirect_chain[:-1]:
                return build_result(
                    response.status_code,
                    error=f'Redirect loop detected at step {redirect_count}',
                )

    except requests.exceptions.RequestException as e:
        return build_result(None, error=str(e))
83167

168+
def check_url(self, url):
    """Check one URL and return its redirect-chain result.

    Delegates to ``self.follow_redirect_chain`` and annotates the returned
    dict in place with two extra keys:

    * ``original_url`` - the URL that was passed in.
    * ``redirected`` - True when the recorded chain holds more than one URL.
    """
    outcome = self.follow_redirect_chain(url)
    hops = outcome['redirect_chain']
    outcome.update(original_url=url, redirected=len(hops) > 1)
    return outcome
177+
84178
def check_urls(self, urls):
85179
"""Check all URLs concurrently."""
86180
results = []
@@ -90,6 +184,7 @@ def check_urls(self, urls):
90184

91185
self.log(f"🔍 Checking {len(urls)} URLs...")
92186
self.log(f"⚙️ Using {self.max_workers} workers with {self.delay}s delay")
187+
self.log(f"🔄 Following up to {self.max_redirects} redirects per URL")
93188
self.log("=" * 60)
94189

95190
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
@@ -109,25 +204,36 @@ def check_urls(self, urls):
109204
self.log(f"Progress: {i}/{len(urls)} URLs checked")
110205

111206
# Categorize results
207+
original_url = result['original_url']
208+
112209
if result['error']:
113210
failed_urls.append(result)
114-
self.log(f"❌ ERROR: {result['url']} - {result['error']}")
211+
self.log(f"❌ ERROR: {original_url} - {result['error']}")
212+
if result['redirect_count'] > 0:
213+
self.log(f" Chain: {' → '.join(result['redirect_chain'])}")
214+
elif result['leads_to_home']:
215+
home_redirect_urls.append(result)
216+
self.log(f"🏠 HOME REDIRECT: {original_url} → HOME (step {result['home_at_step']})")
217+
self.log(f" Chain: {' → '.join(result['redirect_chain'])}")
115218
elif result['status_code'] == 404:
116219
failed_urls.append(result)
117-
self.log(f"❌ 404: {result['url']}")
118-
elif result['status_code'] >= 400:
220+
self.log(f"❌ 404: {original_url}")
221+
if result['redirect_count'] > 0:
222+
self.log(f" Chain: {' → '.join(result['redirect_chain'])}")
223+
elif result['status_code'] and result['status_code'] >= 400:
119224
failed_urls.append(result)
120-
self.log(f"⚠️ {result['status_code']}: {result['url']}")
121-
elif result['home_redirect']:
122-
home_redirect_urls.append(result)
123-
self.log(f"🏠 HOME REDIRECT: {result['url']}{result['final_url']}")
225+
self.log(f"⚠️ {result['status_code']}: {original_url}")
226+
if result['redirect_count'] > 0:
227+
self.log(f" Chain: {' → '.join(result['redirect_chain'])}")
124228
elif result['redirected']:
125229
redirect_urls.append(result)
126-
self.log(f"🔄 REDIRECT: {result['url']}{result['final_url']}")
230+
self.log(f"🔄 REDIRECT ({result['redirect_count']} steps): {original_url}{result['final_url']}")
231+
if result['redirect_count'] > 1:
232+
self.log(f" Chain: {' → '.join(result['redirect_chain'])}")
127233
elif result['status_code'] == 200:
128-
self.log(f"✅ OK: {result['url']}")
234+
self.log(f"✅ OK: {original_url}")
129235
else:
130-
self.log(f"ℹ️ {result['status_code']}: {result['url']}")
236+
self.log(f"ℹ️ {result['status_code']}: {original_url}")
131237

132238
return results, failed_urls, redirect_urls, home_redirect_urls
133239

@@ -138,51 +244,58 @@ def print_summary(self, results, failed_urls, redirect_urls, home_redirect_urls)
138244
self.log("=" * 60)
139245

140246
total_urls = len(results)
141-
success_urls = len([r for r in results if r['status_code'] == 200 and not r['error']])
247+
success_urls = len([r for r in results if r['status_code'] == 200 and not r['error'] and not r['leads_to_home']])
142248

143249
self.log(f"Total URLs checked: {total_urls}")
144250
self.log(f"✅ Successful (200): {success_urls}")
145-
self.log(f"🔄 Redirects: {len(redirect_urls)}")
146-
self.log(f"🏠 Home page redirects: {len(home_redirect_urls)}")
251+
self.log(f"🔄 Redirects (working): {len(redirect_urls)}")
252+
self.log(f"🏠 Home page redirects (ERROR): {len(home_redirect_urls)}")
147253
self.log(f"❌ Failed/Errors: {len(failed_urls)}")
148254

255+
if home_redirect_urls:
256+
self.log(f"\n🏠 HOME PAGE REDIRECTS - FLAGGED AS ERRORS ({len(home_redirect_urls)}):")
257+
self.log("-" * 40)
258+
self.log("⚠️ These URLs redirect to the home page instead of specific content:")
259+
for result in home_redirect_urls:
260+
self.log(f"{result['original_url']} (step {result['home_at_step']})")
261+
self.log(f" Chain: {' → '.join(result['redirect_chain'])}")
262+
149263
if failed_urls:
150264
self.log(f"\n❌ FAILED URLS ({len(failed_urls)}):")
151265
self.log("-" * 40)
152266
for result in failed_urls:
153267
if result['error']:
154-
self.log(f"ERROR: {result['url']} - {result['error']}")
268+
self.log(f"ERROR: {result['original_url']} - {result['error']}")
155269
else:
156-
self.log(f"{result['status_code']}: {result['url']}")
157-
158-
if home_redirect_urls:
159-
self.log(f"\n🏠 HOME PAGE REDIRECTS ({len(home_redirect_urls)}):")
160-
self.log("-" * 40)
161-
self.log("⚠️ These URLs redirect to the home page instead of specific content:")
162-
for result in home_redirect_urls:
163-
self.log(f"{result['url']}{result['final_url']}")
270+
self.log(f"{result['status_code']}: {result['original_url']}")
271+
if result['redirect_count'] > 0:
272+
self.log(f" Chain: {' → '.join(result['redirect_chain'])}")
164273

165274
if redirect_urls:
166-
self.log(f"\n🔄 OTHER REDIRECTED URLS ({len(redirect_urls)}):")
275+
self.log(f"\n🔄 WORKING REDIRECTED URLS ({len(redirect_urls)}):")
167276
self.log("-" * 40)
168277
for result in redirect_urls:
169-
self.log(f"{result['url']}{result['final_url']}")
278+
self.log(f"{result['original_url']}{result['final_url']} ({result['redirect_count']} steps)")
279+
if result['redirect_count'] > 1:
280+
self.log(f" Chain: {' → '.join(result['redirect_chain'])}")
170281

171-
# Consider home redirects as problematic for the exit code
172-
return len(failed_urls) == 0 and len(home_redirect_urls) == 0
282+
# Home redirects are now considered errors
283+
total_errors = len(failed_urls) + len(home_redirect_urls)
284+
return total_errors == 0
173285

174286
def main():
175-
parser = argparse.ArgumentParser(description='Check URLs in Fern sitemap for 404 errors')
287+
parser = argparse.ArgumentParser(description='Check URLs in Fern sitemap for 404 errors and home redirects')
176288
parser.add_argument('--sitemap', default='fern/docs.xml', help='Path to sitemap XML file')
177289
parser.add_argument('--workers', type=int, default=10, help='Number of concurrent workers')
178290
parser.add_argument('--delay', type=float, default=0.1, help='Delay between requests (seconds)')
179291
parser.add_argument('--timeout', type=int, default=30, help='Request timeout (seconds)')
292+
parser.add_argument('--max-redirects', type=int, default=10, help='Maximum number of redirects to follow')
180293
parser.add_argument('--max-urls', type=int, help='Limit number of URLs to check (for testing)')
181294
parser.add_argument('--output', default='check_urls_output.txt', help='Output file path')
182295

183296
args = parser.parse_args()
184297

185-
checker = URLChecker(args.sitemap, args.workers, args.delay, args.timeout)
298+
checker = URLChecker(args.sitemap, args.workers, args.delay, args.timeout, args.max_redirects)
186299

187300
# Open output file for writing
188301
try:
@@ -193,7 +306,7 @@ def main():
193306
sys.exit(1)
194307

195308
try:
196-
checker.log("🚀 Fern Docs URL Checker")
309+
checker.log("🚀 Fern Docs URL Checker - Enhanced Redirect Tracking")
197310
checker.log("=" * 60)
198311

199312
# Parse sitemap
@@ -214,7 +327,15 @@ def main():
214327
success = checker.print_summary(results, failed_urls, redirect_urls, home_redirect_urls)
215328

216329
checker.log(f"\n📁 Results saved to: {args.output}")
217-
sys.exit(0 if success else 1)
330+
331+
# Exit with error code if there are any issues (including home redirects)
332+
total_issues = len(failed_urls) + len(home_redirect_urls)
333+
if total_issues > 0:
334+
checker.log(f"\n❌ Found {total_issues} issues (including home redirects)")
335+
sys.exit(1)
336+
else:
337+
checker.log(f"\n✅ All URLs are working correctly!")
338+
sys.exit(0)
218339

219340
finally:
220341
# Close output file

fern/docs.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -441,7 +441,7 @@ redirects:
441441
destination: /learn/cli-reference/:slug*
442442
permanent: true
443443
- source: /learn/cli-reference/changelog/:slug*
444-
destination: learn/cli-api-reference/cli-reference/changelog/:slug*
444+
destination: /learn/cli-api-reference/cli-reference/changelog/:slug*
445445
permanent: true
446446

447447
# ============================================================================

fern/products/docs/docs.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ navigation:
2525
- page: Aside
2626
path: ./pages/component-library/default-components/aside.mdx
2727
icon: fa-regular fa-comment-dots
28+
- page: Button
29+
path: ./pages/component-library/default-components/button.mdx
30+
icon: fa-regular fa-button
2831
- page: Callouts
2932
path: ./pages/component-library/default-components/callouts.mdx
3033
icon: fa-regular fa-exclamation-triangle
@@ -43,12 +46,15 @@ navigation:
4346
- page: Endpoint Request Snippet
4447
path: ./pages/component-library/default-components/endpoint-request-snippet.mdx
4548
icon: fa-regular fa-arrow-up
49+
slug: request-snippet
4650
- page: Endpoint Response Snippet
4751
path: ./pages/component-library/default-components/endpoint-response-snippet.mdx
4852
icon: fa-regular fa-arrow-down
53+
slug: response-snippet
4954
- page: Endpoint Schema Snippet
5055
path: ./pages/component-library/default-components/endpoint-schema-snippet.mdx
5156
icon: fa-regular fa-sitemap
57+
slug: schema-snippet
5258
- page: Frames
5359
path: ./pages/component-library/default-components/frames.mdx
5460
icon: fa-regular fa-window-maximize

0 commit comments

Comments
 (0)