Skip to content

Commit ff641ed

Browse files
add RestOfWorld Parser
1 parent 73fbe67 commit ff641ed

File tree

1 file changed

+67
-0
lines changed

1 file changed

+67
-0
lines changed
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
from datetime import date, datetime
2+
from typing import List, Optional
3+
4+
from lxml.cssselect import CSSSelector
5+
from lxml.etree import XPath
6+
7+
from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
8+
from fundus.parser.utility import (
9+
extract_article_body_with_selector,
10+
generic_author_parsing,
11+
generic_date_parsing,
12+
image_extraction,
13+
)
14+
15+
16+
class RestOfWorldParser(ParserProxy):
    """Parser proxy for Rest of World articles, holding the versioned parsers."""

    class V1(BaseParser):
        """Parser version 1.

        The article body and images are read from the parsed HTML document,
        while authors, publishing date, title and topics are read from the
        ``NewsArticle`` entry of the page's linked data.
        """

        # Set to the current date when the class body is executed.
        # NOTE(review): presumably marks this version as the currently valid
        # one -- confirm against how the proxy orders parser versions.
        VALID_UNTIL = date.today()

        # Selectors for the summary, the body paragraphs and the subheadlines.
        _summary_selector: CSSSelector = CSSSelector("div.post-subheader__summary li, p.post-header__text__dek")
        _paragraph_selector: CSSSelector = CSSSelector("div.post-content > p")
        _subheadline_selector: CSSSelector = CSSSelector("div.post-content > h2")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Extract the article body using the class-level selectors."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                paragraph_selector=self._paragraph_selector,
                subheadline_selector=self._subheadline_selector,
                summary_selector=self._summary_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Parse the authors from the ``NewsArticle/author`` linked-data entry."""
            raw_authors = self.precomputed.ld.xpath_search("NewsArticle/author")
            return generic_author_parsing(raw_authors)

        @attribute
        def publishing_date(self) -> Optional[datetime]:
            """Parse the date from the ``NewsArticle/datePublished`` linked-data entry."""
            raw_date = self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True)
            return generic_date_parsing(raw_date)

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``NewsArticle/headline`` linked-data entry."""
            headline = self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True)
            return headline

        @attribute
        def topics(self) -> List[str]:
            """Return the linked-data keywords, skipping those prefixed with ``Subject: ``."""
            raw_keywords: List[str] = self.precomputed.ld.xpath_search("NewsArticle/keywords")
            kept: List[str] = []
            for entry in raw_keywords:
                if entry.startswith("Subject: "):
                    continue
                kept.append(entry)
            return kept

        @attribute
        def images(self) -> List[Image]:
            """Extract the article images together with their captions and credits."""
            # Any <img> inside a <figure>, plus any <img> carrying a (lazy) source.
            picture_selector = XPath("//figure//img | //img[@src or @data-src]")
            # Caption: a <figcaption> or a caption-classed node in the enclosing figure.
            description_selector = XPath("./ancestor::figure//*[self::figcaption or contains(@class, 'caption')]")
            # Credit: the last credit/byline-classed node in the enclosing figure.
            credit_selector = XPath(
                "(./ancestor::figure//*[contains(@class, 'credit') or contains(@class, 'byline')])[last()]"
            )
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                image_selector=picture_selector,
                caption_selector=description_selector,
                author_selector=credit_selector,
                relative_urls=True,
            )

0 commit comments

Comments
 (0)