|
| 1 | +from datetime import date, datetime |
| 2 | +from typing import List, Optional |
| 3 | + |
| 4 | +from lxml.cssselect import CSSSelector |
| 5 | +from lxml.etree import XPath |
| 6 | + |
| 7 | +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute |
| 8 | +from fundus.parser.utility import ( |
| 9 | + extract_article_body_with_selector, |
| 10 | + generic_author_parsing, |
| 11 | + generic_date_parsing, |
| 12 | + generic_topic_parsing, |
| 13 | + image_extraction, |
| 14 | +) |
| 15 | + |
| 16 | + |
class RestOfWorldParser(ParserProxy):
    """Parser proxy bundling the parser version(s) defined below (presumably for restofworld.org)."""

    class V1(BaseParser):
        """Extract article attributes from the precomputed HTML document and linked-data object.

        Textual metadata (authors, publishing date, title, topics) is looked up in
        ``self.precomputed.ld`` under the ``NewsArticle`` entry; the body and the images
        are selected from ``self.precomputed.doc``.
        """

        # CSS selectors for the textual parts of the article body.
        _summary_selector: CSSSelector = CSSSelector("div.post-subheader__summary li, p.post-header__text__dek")
        _paragraph_selector: CSSSelector = CSSSelector("div.post-content > p")
        _subheadline_selector: CSSSelector = CSSSelector("div.post-content > h2")

        # XPath selectors used for image extraction. They are compiled once as class
        # attributes (like the CSS selectors above) instead of on every ``images`` call.
        _image_selector: XPath = XPath("//figure//img | //img[@src or @data-src]")
        # The expression is parenthesised so that ``[1]`` picks the first match of the
        # whole node-set. Without the parentheses, ``//*[...][1]`` would match the first
        # matching child of *every* parent and could therefore return several nodes.
        _caption_selector: XPath = XPath(
            "(./ancestor::figure[1]//*[contains(@class,'figcaption__caption')])[1]"
        )
        # Last credit-like element inside the closest enclosing <figure>.
        _author_selector: XPath = XPath(
            "(./ancestor::figure[1]//*[(contains(@class,'figcaption__credit') "
            "or contains(@class,'credit') or contains(@class,'byline'))])[last()]"
        )

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Return the article body built from the summary, subheadline and paragraph selectors."""
            return extract_article_body_with_selector(
                self.precomputed.doc,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subheadline_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Return the authors parsed from the ``NewsArticle/author`` linked-data entry."""
            return generic_author_parsing(self.precomputed.ld.xpath_search("NewsArticle/author"))

        @attribute
        def publishing_date(self) -> Optional[datetime]:
            """Return the publishing date parsed from ``NewsArticle/datePublished``."""
            return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True))

        @attribute
        def title(self) -> Optional[str]:
            """Return the value found at ``NewsArticle/headline`` in the linked data."""
            return self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True)

        @attribute
        def topics(self) -> List[str]:
            """Return the topics parsed from the ``NewsArticle/keywords`` linked-data entry."""
            return generic_topic_parsing(self.precomputed.ld.xpath_search("NewsArticle/keywords", scalar=True))

        @attribute
        def images(self) -> List[Image]:
            """Return the images extracted from the document.

            The caption and author selectors start at ``./ancestor::figure[1]``, i.e. they are
            relative to a context node (presumably each selected image -- confirm against
            ``image_extraction``). ``relative_urls=True`` is passed through unchanged.
            """
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                image_selector=self._image_selector,
                caption_selector=self._caption_selector,
                author_selector=self._author_selector,
                relative_urls=True,
            )
0 commit comments