Skip to content

Commit ba61e0f

Browse files
authored
Merge pull request #801 from marten-ti/add-row-publisher
add Rest Of World Publisher
2 parents 6e35b4e + 8c15dd1 commit ba61e0f

File tree

6 files changed

+1424
-0
lines changed

6 files changed

+1424
-0
lines changed

docs/supported_publishers.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3433,6 +3433,25 @@
34333433
<code>description</code>
34343434
</td>
34353435
</tr>
3436+
<tr>
3437+
<td>
3438+
<code>RestOfWorld</code>
3439+
</td>
3440+
<td>
3441+
<div>Rest of World</div>
3442+
</td>
3443+
<td>
3444+
<a href="https://restofworld.org/">
3445+
<span>restofworld.org</span>
3446+
</a>
3447+
</td>
3448+
<td>
3449+
<code>en</code>
3450+
</td>
3451+
<td>&#160;</td>
3452+
<td>&#160;</td>
3453+
<td>&#160;</td>
3454+
</tr>
34363455
<tr>
34373456
<td>
34383457
<code>Reuters</code>

src/fundus/publishers/us/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from .free_beacon import FreeBeaconParser
1010
from .la_times import LATimesParser
1111
from .occupy_democrats import OccupyDemocratsParser
12+
from .rest_of_world import RestOfWorldParser
1213
from .reuters import ReutersParser
1314
from .rolling_stone import RollingStoneParser
1415
from .techcrunch import TechCrunchParser
@@ -274,3 +275,14 @@ class US(metaclass=PublisherGroup):
274275
Sitemap("https://www.wired.com/sitemap-archive-1.xml"),
275276
],
276277
)
278+
279+
# Publisher entry for Rest of World, declared as an attribute of the US publisher group.
# Articles are parsed with RestOfWorldParser and discovered through the two entries in
# `sources`: the site's "latest" RSS feed and its sitemap.
# NOTE(review): `url_filter` wraps a regex for year-prefixed paths ("restofworld.org/20xx/")
# in `inverse`; presumably this drops every URL that is NOT a dated article link --
# confirm against the semantics of `inverse` / `regex_filter`.
RestOfWorld = Publisher(
280+
name="Rest of World",
281+
domain="https://restofworld.org/",
282+
parser=RestOfWorldParser,
283+
url_filter=inverse(regex_filter(r"restofworld\.org\/20\d{2}\/")),
284+
sources=[
285+
RSSFeed("https://restofworld.org/feed/latest/"),
286+
Sitemap("https://restofworld.org/sitemap.xml"),
287+
],
288+
)
src/fundus/publishers/us/rest_of_world.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
from datetime import date, datetime
2+
from typing import List, Optional
3+
4+
from lxml.cssselect import CSSSelector
5+
from lxml.etree import XPath
6+
7+
from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
8+
from fundus.parser.utility import (
9+
extract_article_body_with_selector,
10+
generic_author_parsing,
11+
generic_date_parsing,
12+
generic_topic_parsing,
13+
image_extraction,
14+
)
15+
16+
17+
class RestOfWorldParser(ParserProxy):
    """Parser proxy for the Rest of World publisher; it currently defines a single version, ``V1``."""

    class V1(BaseParser):
        """Reads body and images from the HTML document and metadata from ``NewsArticle`` linked data."""

        # Summary text: list items of the sub-header summary block, or the header "dek" paragraph.
        _summary_selector: CSSSelector = CSSSelector("div.post-subheader__summary li, p.post-header__text__dek")
        # Sub-headlines and paragraphs are matched only as direct children of the post content container.
        _subheadline_selector: CSSSelector = CSSSelector("div.post-content > h2")
        _paragraph_selector: CSSSelector = CSSSelector("div.post-content > p")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Build the article body from the document using the class-level selectors."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                paragraph_selector=self._paragraph_selector,
                subheadline_selector=self._subheadline_selector,
                summary_selector=self._summary_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Return the authors parsed from the ``NewsArticle/author`` linked-data entries."""
            raw_authors = self.precomputed.ld.xpath_search("NewsArticle/author")
            return generic_author_parsing(raw_authors)

        @attribute
        def publishing_date(self) -> Optional[datetime]:
            """Return the date parsed from the scalar ``NewsArticle/datePublished`` linked-data value."""
            raw_date = self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True)
            return generic_date_parsing(raw_date)

        @attribute
        def title(self) -> Optional[str]:
            """Return the scalar ``NewsArticle/headline`` linked-data value."""
            headline = self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True)
            return headline

        @attribute
        def topics(self) -> List[str]:
            """Return the topics parsed from the scalar ``NewsArticle/keywords`` linked-data value."""
            raw_keywords = self.precomputed.ld.xpath_search("NewsArticle/keywords", scalar=True)
            return generic_topic_parsing(raw_keywords)

        @attribute
        def images(self) -> List[Image]:
            """Extract the article images together with their captions and credits."""
            # Images inside a <figure>, plus any <img> that carries a src or data-src attribute.
            image_xpath = XPath("//figure//img | //img[@src or @data-src]")
            # Caption: looked up inside the closest enclosing <figure> of the image.
            caption_xpath = XPath("./ancestor::figure[1]//*[contains(@class,'figcaption__caption')][1]")
            # Credit: last element in the enclosing <figure> whose class mentions a credit or byline.
            credit_xpath = XPath(
                "(./ancestor::figure[1]//*[(contains(@class,'figcaption__credit') "
                "or contains(@class,'credit') or contains(@class,'byline'))])[last()]"
            )
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                image_selector=image_xpath,
                caption_selector=caption_xpath,
                author_selector=credit_xpath,
                relative_urls=True,
            )

0 commit comments

Comments
 (0)