File tree Expand file tree Collapse file tree 1 file changed +7
-3
lines changed Expand file tree Collapse file tree 1 file changed +7
-3
lines changed Original file line number Diff line number Diff line change 1111import feedparser
1212import lxml .html
1313import validators
14- from lxml .etree import XMLParser , XPath
14+ from lxml .etree import XMLParser , XMLSyntaxError , XPath
1515from requests import ConnectionError , HTTPError , ReadTimeout
1616
1717from fundus .logging import create_logger
@@ -164,7 +164,7 @@ class Sitemap(URLSource):
164164 _decompressor : ClassVar [_ArchiveDecompressor ] = _ArchiveDecompressor ()
165165 _sitemap_selector : ClassVar [XPath ] = XPath ("//*[local-name()='sitemap']/*[local-name()='loc']" )
166166 _url_selector : ClassVar [XPath ] = XPath ("//*[local-name()='url']/*[local-name()='loc']" )
167- _parser = XMLParser (strip_cdata = False )
167+ _parser = XMLParser (strip_cdata = False , recover = True )
168168
169169 def __iter__ (self ) -> Iterator [str ]:
170170 def yield_recursive (sitemap_url : str ) -> Iterator [str ]:
@@ -194,7 +194,11 @@ def yield_recursive(sitemap_url: str) -> Iterator[str]:
194194 if not content :
195195 logger .warning (f"Warning! Empty sitemap at { sitemap_url !r} " )
196196 return
197- tree = lxml .etree .fromstring (content , parser = self ._parser )
197+ try :
198+ tree = lxml .etree .fromstring (content , parser = self ._parser )
199+ except XMLSyntaxError :
200+ logger .warning (f"Warning! Couldn't parse sitemap { sitemap_url !r} because of invalid XML" )
201+ return
198202 urls = [node .text for node in self ._url_selector (tree )]
199203 if urls :
200204 for new_url in reversed (urls ) if self .reverse else urls :
You can’t perform that action at this time.
0 commit comments