Skip to content

Commit 2ed6323

Browse files
committed
Generalize Sitemap Selectors
1 parent aa8b133 commit 2ed6323

File tree

2 files changed

+10
-13
lines changed

2 files changed

+10
-13
lines changed

src/fundus/publishers/tz/__init__.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,7 @@ class TZ(metaclass=PublisherGroup):
             sources=[
                 Sitemap(
                     "https://dailynews.co.tz/wp-sitemap.xml",
-                    sitemap_filter=inverse(regex_filter("sitemap-posts-post")),
-                    reverse=True,
+                    sitemap_filter=inverse(regex_filter("post-sitemap")),
                     languages={"en"},
                 ),
             ],
@@ -27,9 +26,8 @@ class TZ(metaclass=PublisherGroup):
             parser=DailyNewsTZParser,
             sources=[
                 Sitemap(
-                    "https://habarileo.co.tz/wp-sitemap.xml",
-                    sitemap_filter=inverse(regex_filter("sitemap-posts-post")),
-                    reverse=True,
+                    "https://habarileo.co.tz/sitemap.xml",
+                    sitemap_filter=inverse(regex_filter("post-sitemap")),
                 ),
             ],
         )

src/fundus/scraping/url.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,10 @@
 import feedparser
 import lxml.html
 import validators
-from lxml.cssselect import CSSSelector
-from lxml.etree import XPath
+from lxml.etree import XMLParser, XPath
 from requests import ConnectionError, HTTPError, ReadTimeout
 
 from fundus.logging import create_logger
-from fundus.parser.utility import generic_nodes_to_text
 from fundus.scraping.filter import URLFilter, inverse
 from fundus.scraping.session import _default_header, session_handler

@@ -164,8 +162,8 @@ class Sitemap(URLSource):
     sitemap_filter: URLFilter = lambda url: not bool(url)
 
     _decompressor: ClassVar[_ArchiveDecompressor] = _ArchiveDecompressor()
-    _sitemap_selector: ClassVar[XPath] = CSSSelector("sitemap > loc")
-    _url_selector: ClassVar[XPath] = CSSSelector("url > loc")
+    _sitemap_selector: ClassVar[XPath] = XPath("//*[local-name()='sitemap']/*[local-name()='loc']")
+    _url_selector: ClassVar[XPath] = XPath("//*[local-name()='url']/*[local-name()='loc']")
 
     def __iter__(self) -> Iterator[str]:
         def yield_recursive(sitemap_url: str) -> Iterator[str]:
@@ -195,13 +193,14 @@ def yield_recursive(sitemap_url: str) -> Iterator[str]:
             if not content:
                 logger.warning(f"Warning! Empty sitemap at {sitemap_url!r}")
                 return
-            tree = lxml.html.fromstring(content)
-            urls = generic_nodes_to_text(self._url_selector(tree), normalize=True)
+            parser = XMLParser(strip_cdata=False)
+            tree = lxml.etree.fromstring(content, parser=parser)
+            urls = [node.text for node in self._url_selector(tree)]
             if urls:
                 for new_url in reversed(urls) if self.reverse else urls:
                     yield clean_url(new_url)
             elif self.recursive:
-                sitemap_locs = [node.text_content() for node in self._sitemap_selector(tree)]
+                sitemap_locs = [node.text for node in self._sitemap_selector(tree)]
                 filtered_locs = list(filter(inverse(self.sitemap_filter), sitemap_locs))
                 for loc in reversed(filtered_locs) if self.reverse else filtered_locs:
                     yield from yield_recursive(loc)

0 commit comments

Comments
 (0)