Skip to content

Commit 0afc87f

Browse files
committed
handle malformed XML
1 parent 8eb1488 commit 0afc87f

File tree

1 file changed

+7
-3
lines changed

1 file changed

+7
-3
lines changed

src/fundus/scraping/url.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
import feedparser
1212
import lxml.html
1313
import validators
14-
from lxml.etree import XMLParser, XPath
14+
from lxml.etree import XMLParser, XMLSyntaxError, XPath
1515
from requests import ConnectionError, HTTPError, ReadTimeout
1616

1717
from fundus.logging import create_logger
@@ -164,7 +164,7 @@ class Sitemap(URLSource):
164164
_decompressor: ClassVar[_ArchiveDecompressor] = _ArchiveDecompressor()
165165
_sitemap_selector: ClassVar[XPath] = XPath("//*[local-name()='sitemap']/*[local-name()='loc']")
166166
_url_selector: ClassVar[XPath] = XPath("//*[local-name()='url']/*[local-name()='loc']")
167-
_parser = XMLParser(strip_cdata=False)
167+
_parser = XMLParser(strip_cdata=False, recover=True)
168168

169169
def __iter__(self) -> Iterator[str]:
170170
def yield_recursive(sitemap_url: str) -> Iterator[str]:
@@ -194,7 +194,11 @@ def yield_recursive(sitemap_url: str) -> Iterator[str]:
194194
if not content:
195195
logger.warning(f"Warning! Empty sitemap at {sitemap_url!r}")
196196
return
197-
tree = lxml.etree.fromstring(content, parser=self._parser)
197+
try:
198+
tree = lxml.etree.fromstring(content, parser=self._parser)
199+
except XMLSyntaxError:
200+
logger.warning(f"Warning! Couldn't parse sitemap {sitemap_url!r} because of invalid XML")
201+
return
198202
urls = [node.text for node in self._url_selector(tree)]
199203
if urls:
200204
for new_url in reversed(urls) if self.reverse else urls:

0 commit comments

Comments
 (0)