|
11 | 11 | import feedparser |
12 | 12 | import lxml.html |
13 | 13 | import validators |
14 | | -from lxml.cssselect import CSSSelector |
15 | | -from lxml.etree import XPath |
| 14 | +from lxml.etree import XMLParser, XPath |
16 | 15 | from requests import ConnectionError, HTTPError, ReadTimeout |
17 | 16 |
|
18 | 17 | from fundus.logging import create_logger |
19 | | -from fundus.parser.utility import generic_nodes_to_text |
20 | 18 | from fundus.scraping.filter import URLFilter, inverse |
21 | 19 | from fundus.scraping.session import _default_header, session_handler |
22 | 20 |
|
@@ -164,8 +162,9 @@ class Sitemap(URLSource): |
164 | 162 | sitemap_filter: URLFilter = lambda url: not bool(url) |
165 | 163 |
|
166 | 164 | _decompressor: ClassVar[_ArchiveDecompressor] = _ArchiveDecompressor() |
167 | | - _sitemap_selector: ClassVar[XPath] = CSSSelector("sitemap > loc") |
168 | | - _url_selector: ClassVar[XPath] = CSSSelector("url > loc") |
| 165 | + _sitemap_selector: ClassVar[XPath] = XPath("//*[local-name()='sitemap']/*[local-name()='loc']") |
| 166 | + _url_selector: ClassVar[XPath] = XPath("//*[local-name()='url']/*[local-name()='loc']") |
| 167 | + _parser = XMLParser(strip_cdata=False) |
169 | 168 |
|
170 | 169 | def __iter__(self) -> Iterator[str]: |
171 | 170 | def yield_recursive(sitemap_url: str) -> Iterator[str]: |
@@ -195,13 +194,13 @@ def yield_recursive(sitemap_url: str) -> Iterator[str]: |
195 | 194 | if not content: |
196 | 195 | logger.warning(f"Warning! Empty sitemap at {sitemap_url!r}") |
197 | 196 | return |
198 | | - tree = lxml.html.fromstring(content) |
199 | | - urls = generic_nodes_to_text(self._url_selector(tree), normalize=True) |
| 197 | + tree = lxml.etree.fromstring(content, parser=self._parser) |
| 198 | + urls = [node.text for node in self._url_selector(tree)] |
200 | 199 | if urls: |
201 | 200 | for new_url in reversed(urls) if self.reverse else urls: |
202 | 201 | yield clean_url(new_url) |
203 | 202 | elif self.recursive: |
204 | | - sitemap_locs = [node.text_content() for node in self._sitemap_selector(tree)] |
| 203 | + sitemap_locs = [node.text for node in self._sitemap_selector(tree)] |
205 | 204 | filtered_locs = list(filter(inverse(self.sitemap_filter), sitemap_locs)) |
206 | 205 | for loc in reversed(filtered_locs) if self.reverse else filtered_locs: |
207 | 206 | yield from yield_recursive(loc) |
|
0 commit comments