|
| 1 | +import datetime |
| 2 | +import re |
| 3 | +from typing import List, Optional |
| 4 | + |
| 5 | +from lxml.cssselect import CSSSelector |
| 6 | +from lxml.etree import XPath |
| 7 | + |
| 8 | +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute |
| 9 | +from fundus.parser.utility import ( |
| 10 | + extract_article_body_with_selector, |
| 11 | + generic_author_parsing, |
| 12 | + generic_date_parsing, |
| 13 | + generic_topic_parsing, |
| 14 | + image_extraction, |
| 15 | +) |
| 16 | + |
| 17 | + |
class NatureParser(ParserProxy):
    """Parser proxy for Nature pages; bundles the versioned parser classes below."""

    class V1(BaseParser):
        """First parser version.

        Body text and images are located with lxml selectors on the parsed
        document, while date, authors, title and topics are looked up in
        ``self.precomputed.ld`` (presumably the page's linked-data block --
        confirm against ``BaseParser``).
        """

        # Paragraphs of the abstract, in either of the two markup variants.
        _summary_selector = CSSSelector("div.c-article-abstract p, p.c-article-abstract")

        # Paragraphs inside the access teaser, excluding anything nested in
        # "Related"/recommended containers, figures, captions or links.
        _paragraph_selector = XPath(
            "//div[@data-test='access-teaser']//p["
            " not(ancestor::*[@data-label='Related' or contains(@class, 'recommended')])"
            " and not(contains(@class, 'recommended__title'))"
            " and not(ancestor::figure)"
            " and not(ancestor::figcaption)"
            " and not(ancestor::a)]"
        )

        # Second-level headings of the teaser that are not part of a
        # recommended-article card.
        _subheadline_selector = XPath(
            "//div[@data-test='access-teaser']//h2"
            "[not(ancestor::article[contains(@class, 'recommended')])]"
        )

        # First element (in document order) that is the access wall, a
        # related-articles container, or a "related" <article>.
        _lower_boundary_selector = XPath(
            "(//*["
            "(@class='app-access-wall')"
            " or contains(@class, 'c-related-articles')"
            " or (self::article and contains(@class, 'related'))"
            "])[1]"
        )

        # Evaluated relative to a node: every <figcaption> below any
        # enclosing <figure>.
        _caption_selector = XPath("./ancestor::figure//figcaption")

        # Case-insensitive "<label>[:] <text>" pattern; the text after the
        # label is captured in the named group ``credits``.
        _author_pattern = re.compile(
            r"(?i)\s*(credit|source|illustration|analysis by):?\s+(?P<credits>.*)"
        )

        # Keyword values that are dropped from ``topics``.
        _bloat_topics = [
            "multidisciplinary",
            "Science",
            "Humanities and Social Sciences",
        ]

        # Matches the access-wall container; used by ``free_access``.
        _paywall_selector = XPath("//div[@class='app-access-wall__container']")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Build the article body from the summary, subheadline and paragraph selectors."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                paragraph_selector=self._paragraph_selector,
                subheadline_selector=self._subheadline_selector,
                summary_selector=self._summary_selector,
            )

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Return the parsed ``datePublished`` entry of the linked data."""
            raw_date = self.precomputed.ld.bf_search("datePublished")
            return generic_date_parsing(raw_date)

        @attribute
        def authors(self) -> List[str]:
            """Return the authors parsed from the ``author`` entry of the linked data."""
            raw_authors = self.precomputed.ld.bf_search("author")
            return generic_author_parsing(raw_authors)

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``headline`` entry of the linked data as found."""
            headline = self.precomputed.ld.bf_search("headline")
            return headline

        @attribute
        def topics(self) -> List[str]:
            """Return the parsed ``keywords`` minus the entries in ``_bloat_topics``."""
            parsed_topics = generic_topic_parsing(self.precomputed.ld.bf_search("keywords"))
            return [entry for entry in parsed_topics if entry not in self._bloat_topics]

        @attribute
        def free_access(self) -> bool:
            """Return ``True`` when the paywall selector matches nothing in the document."""
            paywall_hits = self._paywall_selector(self.precomputed.doc)
            return not paywall_hits

        @attribute
        def images(self) -> List[Image]:
            """Collect the article images via ``image_extraction``.

            The paragraph, caption, author and lower-boundary selectors of
            this class are handed through; ``relative_urls`` is enabled.
            """
            extraction_options = dict(
                paragraph_selector=self._paragraph_selector,
                caption_selector=self._caption_selector,
                author_selector=self._author_pattern,
                lower_boundary_selector=self._lower_boundary_selector,
                relative_urls=True,
            )
            return image_extraction(doc=self.precomputed.doc, **extraction_options)
0 commit comments