11import datetime
2+ import re
23from typing import List , Optional
34
45from lxml .cssselect import CSSSelector
@@ -18,7 +19,6 @@ class NatureParser(ParserProxy):
1819 class V1 (BaseParser ):
1920 _summary_selector = CSSSelector ("div.c-article-abstract p, p.c-article-abstract" )
2021
21- #_paragraph_selector = CSSSelector("div.article__teaser[data-test='access-teaser'] > p")
2222 _paragraph_selector = XPath (
2323 "//div[@data-test='access-teaser']//p"
2424 "["
@@ -30,23 +30,21 @@ class V1(BaseParser):
3030 "]"
3131 )
3232
# Sub-headlines inside the access teaser; headings nested in an <article>
# whose class contains 'recommended' are excluded.
_subheadline_selector = XPath(
    "//div[@data-test='access-teaser']//h2"
    "[not(ancestor::article[contains(@class, 'recommended')])]"
)

# First element in document order that is the access wall, a related-articles
# container, or a 'related' <article>.
_lower_boundary_selector = XPath(
    "(//*[(@class='app-access-wall') or "
    "contains(@class, 'c-related-articles') or "
    "(self::article and contains(@class, 'related'))])[1]"
)
# Evaluated relative to a context node: figcaptions of its enclosing <figure>.
_caption_selector = XPath("./ancestor::figure//figcaption")
# Case-insensitive match for a credit prefix ("credit", "source",
# "illustration", "analysis by"); the text after it is captured as `credits`.
_author_pattern = re.compile(r"(?i)\s*(credit|source|illustration|analysis by):?\s+(?P<credits>.*)")

# Topic strings that are filtered out of the parsed topics (exact match).
_bloat_topics = ["multidisciplinary", "Science", "Humanities and Social Sciences"]

# Matches the <div> whose class attribute is exactly 'app-access-wall__container'.
_paywall_selector = XPath("//div[@class='app-access-wall__container']")
5048
5149 @attribute
5250 def body (self ) -> Optional [ArticleBody ]:
@@ -71,68 +69,23 @@ def title(self) -> Optional[str]:
7169
@attribute
def topics(self) -> List[str]:
    """Return the article topics, excluding entries listed in ``_bloat_topics``.

    The raw value comes from a ``bf_search`` for ``"keywords"`` on the
    precomputed LD data and is split into topics by ``generic_topic_parsing``.
    Filtering is an exact, case-sensitive membership test.
    """
    return [
        topic
        for topic in generic_topic_parsing(self.precomputed.ld.bf_search("keywords"))
        if topic not in self._bloat_topics
    ]
77+
@attribute
def free_access(self) -> bool:
    """Return ``True`` when the document contains no paywall container.

    ``_paywall_selector`` is evaluated against the parsed document; an empty
    result is treated as freely accessible. Unlike a lookup that may yield
    neither a bool nor a string, this always returns a ``bool``.
    """
    return not self._paywall_selector(self.precomputed.doc)
8381
@attribute
def images(self) -> List[Image]:
    """Return the images extracted from the article document.

    Delegates entirely to ``image_extraction`` using this parser's paragraph,
    caption and lower-boundary selectors, and the credit regex as the author
    selector.
    """
    # NOTE(review): relative_urls=True presumably makes the helper resolve
    # relative / protocol-relative image URLs, so no manual "https:" patching
    # of src/srcset attributes is done here — confirm against image_extraction.
    # NOTE(review): any ValueError raised by the helper (e.g. when bounds
    # cannot be determined) propagates to the caller.
    return image_extraction(
        doc=self.precomputed.doc,
        paragraph_selector=self._paragraph_selector,
        relative_urls=True,
        caption_selector=self._caption_selector,
        author_selector=self._author_pattern,
        lower_boundary_selector=self._lower_boundary_selector,
    )
0 commit comments