|
4 | 4 | from lxml.cssselect import CSSSelector |
5 | 5 | from lxml.etree import XPath |
6 | 6 |
|
7 | | -from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute |
| 7 | +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute |
8 | 8 | from fundus.parser.utility import ( |
9 | 9 | extract_article_body_with_selector, |
10 | 10 | generic_author_parsing, |
11 | 11 | generic_date_parsing, |
12 | | - parse_title_from_root, |
| 12 | + generic_topic_parsing, |
| 13 | + image_extraction, |
13 | 14 | ) |
14 | 15 |
|
15 | 16 |
|
16 | 17 | class NatureParser(ParserProxy): |
17 | 18 | class V1(BaseParser): |
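|  | + # Summary: the article abstract, rendered as <p> inside div.c-article-abstract or as p.c-article-abstract.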
| 19 | + _summary_selector = CSSSelector("div.c-article-abstract p, p.c-article-abstract") |
| 20 | + |
| 21 | + # Teaser paragraphs only; related/recommended content, figures, captions, and links are excluded below.
18 | 22 | _paragraph_selector = XPath( |
19 | | - "//div[contains(@class,'c-article-body')]//p | //div[contains(@class,'c-article-section__content')]//p" |
| 23 | + "//div[@data-test='access-teaser']//p" |
| 24 | + "[" |
| 25 | + " not(ancestor::*[@data-label='Related' or contains(@class, 'recommended')])" |
| 26 | + " and not(contains(@class, 'recommended__title'))" |
| 27 | + " and not(ancestor::figure)" |
| 28 | + " and not(ancestor::figcaption)" |
| 29 | + " and not(ancestor::a)" |
| 30 | + "]" |
20 | 31 | ) |
21 | | - _subheadline_selector = XPath("//h2[contains(@class,'c-article-section__heading')]") |
22 | | - _author_selector = XPath("//li[contains(@class,'c-article-author')]//a") |
23 | 32 |
|
24 | | - @attribute |
25 | | - def title(self) -> Optional[str]: |
26 | | - return self.precomputed.meta.get("dc.title") or parse_title_from_root(self.precomputed.doc) |
| 33 | + # Teaser <h2> subheadlines, excluding those inside recommended-article blocks.
| 34 | + _subheadline_selector = XPath( |
| 35 | + "//div[@data-test='access-teaser']//h2" |
| 36 | + "[not(ancestor::article[contains(@class, 'recommended')])]" |
| 37 | + ) |
| 38 | + |
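|  | + # The first access wall or related-articles block marks the lower boundary for image extraction.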
| 39 | + _lower_boundary_selector = XPath( |
| 40 | + "(//*[(@class='app-access-wall') or " |
| 41 | + "contains(@class, 'c-related-articles') or " |
| 42 | + "(self::article and contains(@class, 'related'))])[1]" |
| 43 | + ) |
| 44 | + |
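|  | + # Article images are the <img> nodes inside the teaser's figures.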
| 45 | + _image_selector = XPath("//div[contains(@class, 'article__teaser')]//figure//img") |
| 46 | + |
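|  | + # Caption and credit are resolved relative to each image's enclosing <figure>.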
| 47 | + _caption_selector = XPath("./ancestor::figure//figcaption") |
| 48 | + _author_selector = XPath("./ancestor::figure//span[contains(@class, 'copyright')]") |
| 49 | + |
27 | 50 |
|
28 | 51 | @attribute |
29 | 52 | def body(self) -> Optional[ArticleBody]: |
30 | 53 | return extract_article_body_with_selector( |
31 | 54 | self.precomputed.doc, |
32 | | - paragraph_selector=self._paragraph_selector, |
| 55 | + summary_selector=self._summary_selector, |
33 | 56 | subheadline_selector=self._subheadline_selector, |
| 57 | + paragraph_selector=self._paragraph_selector, |
34 | 58 | ) |
35 | 59 |
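|  | + # Publishing date, authors, and headline are taken from the page's JSON-LD (ld+json) metadata.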
|
| 60 | + @attribute |
| 61 | + def publishing_date(self) -> Optional[datetime.datetime]: |
| 62 | + return generic_date_parsing(self.precomputed.ld.bf_search("datePublished")) |
| 63 | + |
36 | 64 | @attribute |
37 | 65 | def authors(self) -> List[str]: |
38 | | - return generic_author_parsing( |
39 | | - [node.text_content() for node in self._author_selector(self.precomputed.doc) or []] |
40 | | - ) |
| 66 | + return generic_author_parsing(self.precomputed.ld.bf_search("author")) |
41 | 67 |
|
42 | 68 | @attribute |
43 | | - def publishing_date(self) -> Optional[datetime.datetime]: |
44 | | - return generic_date_parsing( |
45 | | - self.precomputed.meta.get("article:published_time") or self.precomputed.meta.get("dc.date") |
46 | | - ) |
| 69 | + def title(self) -> Optional[str]: |
| 70 | + return self.precomputed.ld.bf_search("headline") |
| 71 | + |
| 72 | + @attribute |
| 73 | + def topics(self) -> List[str]: |
| 74 | + return generic_topic_parsing(self.precomputed.meta.get("article:tag")) |
| 75 | + |
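|  | + # JSON-LD reports isAccessibleForFree either as a boolean or as a "true"/"false" string.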
| 76 | + @attribute |
| 77 | + def free_access(self) -> bool: |
| 78 | + access = self.precomputed.ld.bf_search("isAccessibleForFree") |
| 79 | + if isinstance(access, bool): |
| 80 | + return access |
| 81 | + if isinstance(access, str):
| 82 | + return access.lower() == "true"
|  | + # Default to False when the field is missing or has an unexpected type.
|  | + return False
| 83 | + |
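|  | + # Collect article images (with captions and credits) from the teaser figures.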
| 84 | + @attribute |
| 85 | + def images(self) -> List[Image]: |
|  | + # Nature serves protocol-relative URLs ("//media.nature.com/...") for its images;
|  | + # add the missing "https:" scheme so downstream extraction gets absolute URLs.
|  | + for node in self.precomputed.doc.xpath("//img | //source"):
|  | + for attr in ("src", "data-src", "srcset", "data-srcset"):
|  | + if not (attr_val := node.get(attr)):
|  | + continue
|  | + # srcset/data-srcset hold comma-separated candidate lists; src/data-src hold a single URL.
|  | + parts = [part.strip() for part in attr_val.split(",")] if "srcset" in attr else [attr_val.strip()]
|  | + node.set(attr, ", ".join("https:" + part if part.startswith("//") else part for part in parts))
|  | +
|  | + # image_extraction raises a ValueError when it cannot determine the body bounds,
|  | + # which happens for paywalled articles that expose no paragraphs.
| 124 | + try: |
| 125 | + return image_extraction( |
| 126 | + doc=self.precomputed.doc, |
| 127 | + paragraph_selector=self._paragraph_selector, |
| 128 | + image_selector=self._image_selector, |
| 129 | + caption_selector=self._caption_selector, |
| 130 | + author_selector=self._author_selector, |
| 131 | + lower_boundary_selector=self._lower_boundary_selector, |
| 132 | + ) |
| 133 | + except ValueError as e: |
| 134 | + if "Bounds could not be determined" in str(e): |
| 135 | + # Paywalled article without accessible paragraphs; there is nothing to anchor images to.
| 136 | + return []
| 137 | + raise
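
For a quick end-to-end check of this parser, a minimal crawl sketch like the following can be used. It assumes Nature is registered in fundus' publisher collection (shown here as `PublisherCollection.uk.Nature`; adjust the group and name to wherever the entry lives in your version) and otherwise only uses the documented `Crawler` API.

```python
from fundus import Crawler, PublisherCollection

# Assumption: Nature is registered under the UK collection; adjust if it lives elsewhere.
crawler = Crawler(PublisherCollection.uk.Nature)

for article in crawler.crawl(max_articles=3):
    print(article.title)
    print(article.publishing_date)
    print(article.authors)
    # `images` is the attribute added in this commit; paywalled articles may yield an empty list.
    print(article.images)
```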