|
8 | 8 | import re |
9 | 9 | import sys |
10 | 10 | import time |
| 11 | +import xml.etree.ElementTree as ET |
11 | 12 | from collections import deque |
12 | 13 | from urllib.parse import urljoin, urlparse |
13 | 14 |
|
@@ -143,14 +144,49 @@ def crawl_page(self, url): |
143 | 144 | print(f"Error crawling {url}: {e}") |
144 | 145 | return [] |
145 | 146 |
|
| 147 | + def get_sitemap_urls(self): |
| 148 | + """Try to get URLs from sitemap.xml.""" |
| 149 | + sitemap_url = f"{self.base_url}/sitemap.xml" |
| 150 | + print(f"Checking for sitemap at: {sitemap_url}") |
| 151 | + |
| 152 | + try: |
| 153 | + response = self.session.get(sitemap_url, timeout=self.timeout) |
| 154 | + if response.status_code == 200: |
| 155 | + print("✅ Found sitemap.xml, parsing URLs...") |
| 156 | + root = ET.fromstring(response.content) |
| 157 | + |
| 158 | + urls = [] |
| 159 | + for url_elem in root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}url'): |
| 160 | + loc_elem = url_elem.find('{http://www.sitemaps.org/schemas/sitemap/0.9}loc') |
| 161 | + if loc_elem is not None and loc_elem.text: |
| 162 | + urls.append(loc_elem.text) |
| 163 | + |
| 164 | + print(f"Found {len(urls)} URLs in sitemap") |
| 165 | + return urls |
| 166 | + else: |
| 167 | + print(f"No sitemap found (HTTP {response.status_code})") |
| 168 | + return None |
| 169 | + |
| 170 | + except Exception as e: |
| 171 | + print(f"Error fetching sitemap: {e}") |
| 172 | + return None |
| 173 | + |
146 | 174 | def run(self): |
147 | 175 | """Run the dead link checker.""" |
148 | 176 | print(f"Starting dead link check for {self.base_url}") |
149 | 177 | print(f"Max pages: {self.max_pages}, Timeout: {self.timeout}s") |
150 | 178 |
|
151 | | - while self.pages_to_visit and (not self.max_pages or len(self.visited_pages) < self.max_pages): |
152 | | - url = self.pages_to_visit.popleft() |
153 | | - self.crawl_page(url) |
| 179 | + sitemap_urls = self.get_sitemap_urls() |
| 180 | + if sitemap_urls: |
| 181 | + print("Using sitemap-based crawling...") |
| 182 | + for url in sitemap_urls: |
| 183 | + if not self.max_pages or len(self.visited_pages) < self.max_pages: |
| 184 | + self.crawl_page(url) |
| 185 | + else: |
| 186 | + print("Using breadth-first crawling...") |
| 187 | + while self.pages_to_visit and (not self.max_pages or len(self.visited_pages) < self.max_pages): |
| 188 | + url = self.pages_to_visit.popleft() |
| 189 | + self.crawl_page(url) |
154 | 190 |
|
155 | 191 | print(f"\nCrawl complete!") |
156 | 192 | print(f"Pages visited: {len(self.visited_pages)}") |
|
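For reference, here is a minimal standalone sketch of the same ElementTree namespace-based parsing, extended to also follow `<sitemapindex>` documents (some sites publish an index of sitemaps rather than a single `<urlset>`). The helper name `extract_sitemap_urls`, the session/timeout handling, and the recursive index handling are illustrative assumptions, not part of this PR:

```python
# Hypothetical sketch: parse either a <urlset> or a <sitemapindex> document.
# Assumes the standard sitemap namespace; names are illustrative only.
import requests
import xml.etree.ElementTree as ET

SITEMAP_NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"

def extract_sitemap_urls(sitemap_url, session=None, timeout=10):
    """Return page URLs from a sitemap, following sitemap index entries recursively."""
    session = session or requests.Session()
    response = session.get(sitemap_url, timeout=timeout)
    response.raise_for_status()
    root = ET.fromstring(response.content)

    # A sitemap index lists further sitemap files rather than pages.
    if root.tag == f"{SITEMAP_NS}sitemapindex":
        urls = []
        for loc in root.findall(f".//{SITEMAP_NS}sitemap/{SITEMAP_NS}loc"):
            if loc.text:
                urls.extend(extract_sitemap_urls(loc.text, session, timeout))
        return urls

    # A plain <urlset> lists page URLs directly.
    return [loc.text for loc in root.findall(f".//{SITEMAP_NS}url/{SITEMAP_NS}loc") if loc.text]
```

The `get_sitemap_urls` method in this diff covers the common single-`urlset` case; a site that publishes a sitemap index would need something along these lines to be crawled from its sitemap.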