1+ from datetime import date , datetime
2+ from typing import List , Optional
3+
4+ from lxml .cssselect import CSSSelector
5+ from lxml .etree import XPath
6+
7+ from fundus .parser import ArticleBody , BaseParser , Image , ParserProxy , attribute
8+ from fundus .parser .utility import (
9+ extract_article_body_with_selector ,
10+ generic_author_parsing ,
11+ generic_date_parsing ,
12+ image_extraction ,
13+ )
14+
15+
class RestOfWorldParser(ParserProxy):
    """Parser proxy that groups the versioned parser classes defined below."""

    class V1(BaseParser):
        """Parser version 1.

        Textual content and images are read from ``self.precomputed.doc``;
        authors, publishing date, title and topics are read from the
        ``NewsArticle`` entry of ``self.precomputed.ld``.
        """

        # Evaluated once, when this class body is executed.
        VALID_UNTIL = date.today()

        # Selectors handed to the body extraction helper; the paragraph
        # selector is also reused by ``images``.
        _summary_selector: CSSSelector = CSSSelector("div.post-subheader__summary li, p.post-header__text__dek")
        _paragraph_selector: CSSSelector = CSSSelector("div.post-content > p")
        _subheadline_selector: CSSSelector = CSSSelector("div.post-content > h2")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Build the article body from the summary, subheadline and paragraph selectors."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                paragraph_selector=self._paragraph_selector,
                subheadline_selector=self._subheadline_selector,
                summary_selector=self._summary_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Run the generic author parser over the ``NewsArticle/author`` lookup result."""
            raw_authors = self.precomputed.ld.xpath_search("NewsArticle/author")
            return generic_author_parsing(raw_authors)

        @attribute
        def publishing_date(self) -> Optional[datetime]:
            """Run the generic date parser over the scalar ``NewsArticle/datePublished`` value."""
            raw_date = self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True)
            return generic_date_parsing(raw_date)

        @attribute
        def title(self) -> Optional[str]:
            """Return the scalar ``NewsArticle/headline`` value as found."""
            headline = self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True)
            return headline

        @attribute
        def topics(self) -> List[str]:
            """Return the ``NewsArticle/keywords`` entries, minus those prefixed with ``"Subject: "``."""
            kept: List[str] = []
            for entry in self.precomputed.ld.xpath_search("NewsArticle/keywords"):
                if entry.startswith("Subject: "):
                    # Entries carrying this prefix are excluded from the topics.
                    continue
                kept.append(entry)
            return kept

        @attribute
        def images(self) -> List[Image]:
            """Extract images from the document, with caption and author looked up per image."""
            # Any <img> inside a <figure>, plus any <img> carrying @src or @data-src.
            image_xpath = XPath("//figure//img | //img[@src or @data-src]")
            # Relative to an image: a <figcaption> or a caption-classed node in the enclosing figure.
            caption_xpath = XPath("./ancestor::figure//*[self::figcaption or contains(@class, 'caption')]")
            # Relative to an image: the last credit/byline-classed node in the enclosing figure.
            credit_xpath = XPath(
                "(./ancestor::figure//*[contains(@class, 'credit') or contains(@class, 'byline')])[last()]"
            )
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                image_selector=image_xpath,
                caption_selector=caption_xpath,
                author_selector=credit_xpath,
                relative_urls=True,
            )
0 commit comments