
Commit e945118

Add sitemap discovery to dead link checker
- Check for sitemap.xml first before falling back to breadth-first crawling
- Parse the XML sitemap using the standard sitemap protocol (see the parsing sketch below)
- Maintain backward compatibility with the existing crawling approach
- Improve efficiency when a sitemap is available

Co-Authored-By: Alek <alek@pynecone.io>
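For reference, the standard sitemap protocol the new parser targets wraps each page URL in a <url><loc> pair under the http://www.sitemaps.org/schemas/sitemap/0.9 namespace. Below is a minimal, self-contained sketch of that parse; the inline two-URL sitemap is a made-up example, not taken from the repository.

# Minimal sketch of the sitemap parsing this commit adds; the inline
# two-URL sitemap below is a hypothetical example of the standard protocol.
import xml.etree.ElementTree as ET

SITEMAP_XML = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/docs/getting-started</loc></url>
</urlset>"""

NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"  # Clark-notation prefix

root = ET.fromstring(SITEMAP_XML)
urls = []
for url_elem in root.findall(f".//{NS}url"):  # every <url> entry
    loc_elem = url_elem.find(f"{NS}loc")      # its <loc> child
    if loc_elem is not None and loc_elem.text:
        urls.append(loc_elem.text)

print(urls)  # ['https://example.com/', 'https://example.com/docs/getting-started']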
1 parent: cc03ba1

File tree: 1 file changed (+39 / -3 lines)


scripts/check_dead_links.py (39 additions, 3 deletions)
@@ -8,6 +8,7 @@
 import re
 import sys
 import time
+import xml.etree.ElementTree as ET
 from collections import deque
 from urllib.parse import urljoin, urlparse
 
@@ -143,14 +144,49 @@ def crawl_page(self, url):
             print(f"Error crawling {url}: {e}")
             return []
 
+    def get_sitemap_urls(self):
+        """Try to get URLs from sitemap.xml."""
+        sitemap_url = f"{self.base_url}/sitemap.xml"
+        print(f"Checking for sitemap at: {sitemap_url}")
+
+        try:
+            response = self.session.get(sitemap_url, timeout=self.timeout)
+            if response.status_code == 200:
+                print("✅ Found sitemap.xml, parsing URLs...")
+                root = ET.fromstring(response.content)
+
+                urls = []
+                for url_elem in root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}url'):
+                    loc_elem = url_elem.find('{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
+                    if loc_elem is not None and loc_elem.text:
+                        urls.append(loc_elem.text)
+
+                print(f"Found {len(urls)} URLs in sitemap")
+                return urls
+            else:
+                print(f"No sitemap found (HTTP {response.status_code})")
+                return None
+
+        except Exception as e:
+            print(f"Error fetching sitemap: {e}")
+            return None
+
     def run(self):
         """Run the dead link checker."""
         print(f"Starting dead link check for {self.base_url}")
         print(f"Max pages: {self.max_pages}, Timeout: {self.timeout}s")
 
-        while self.pages_to_visit and (not self.max_pages or len(self.visited_pages) < self.max_pages):
-            url = self.pages_to_visit.popleft()
-            self.crawl_page(url)
+        sitemap_urls = self.get_sitemap_urls()
+        if sitemap_urls:
+            print("Using sitemap-based crawling...")
+            for url in sitemap_urls:
+                if not self.max_pages or len(self.visited_pages) < self.max_pages:
+                    self.crawl_page(url)
+        else:
+            print("Using breadth-first crawling...")
+            while self.pages_to_visit and (not self.max_pages or len(self.visited_pages) < self.max_pages):
+                url = self.pages_to_visit.popleft()
+                self.crawl_page(url)
 
         print(f"\nCrawl complete!")
         print(f"Pages visited: {len(self.visited_pages)}")
