Skip to content

Commit e01e866

Browse files
authored
New Nature Parser
Added image extraction. Detect free access. Return empty if paywalled (but still correctly see the image). No side links, footnotes, image credits, or copyright notices get extracted anymore (at least on the mentioned website URLs).
1 parent 591428f commit e01e866

File tree

1 file changed

+108
-16
lines changed

1 file changed

+108
-16
lines changed

src/fundus/publishers/uk/nature.py

Lines changed: 108 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,43 +4,135 @@
44
from lxml.cssselect import CSSSelector
55
from lxml.etree import XPath
66

7-
from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
7+
from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
88
from fundus.parser.utility import (
99
extract_article_body_with_selector,
1010
generic_author_parsing,
1111
generic_date_parsing,
12-
parse_title_from_root,
12+
generic_topic_parsing,
13+
image_extraction,
1314
)
1415

1516

1617
class NatureParser(ParserProxy):
1718
class V1(BaseParser):
19+
# Summary text: <p> elements inside div.c-article-abstract, or <p> elements
# that carry the c-article-abstract class themselves.
_summary_selector = CSSSelector("div.c-article-abstract p, p.c-article-abstract")
20+
21+
# Disabled alternative: CSS selector matching only direct <p> children of the teaser div.
#_paragraph_selector = CSSSelector("div.article__teaser[data-test='access-teaser'] > p")
1822
# Body paragraphs: every <p> below the div marked data-test='access-teaser',
# except paragraphs that
#   - sit inside an element with data-label='Related' or a class containing 'recommended',
#   - carry a class containing 'recommended__title' themselves,
#   - sit inside a <figure>, <figcaption> or <a>.
_paragraph_selector = XPath(
19-
"//div[contains(@class,'c-article-body')]//p | //div[contains(@class,'c-article-section__content')]//p"
23+
"//div[@data-test='access-teaser']//p"
24+
"["
25+
" not(ancestor::*[@data-label='Related' or contains(@class, 'recommended')])"
26+
" and not(contains(@class, 'recommended__title'))"
27+
" and not(ancestor::figure)"
28+
" and not(ancestor::figcaption)"
29+
" and not(ancestor::a)"
30+
"]"
2031
)
21-
_subheadline_selector = XPath("//h2[contains(@class,'c-article-section__heading')]")
22-
_author_selector = XPath("//li[contains(@class,'c-article-author')]//a")
2332

24-
@attribute
25-
def title(self) -> Optional[str]:
26-
return self.precomputed.meta.get("dc.title") or parse_title_from_root(self.precomputed.doc)
33+
# Disabled alternative: CSS selector matching only direct <h2> children of the teaser div.
#_subheadline_selector = CSSSelector("div.article__teaser[data-test='access-teaser'] > h2")
34+
# Subheadlines: every <h2> below the div marked data-test='access-teaser' that is
# not nested in an <article> whose class contains 'recommended'.
_subheadline_selector = XPath(
35+
"//div[@data-test='access-teaser']//h2"
36+
"[not(ancestor::article[contains(@class, 'recommended')])]"
37+
)
38+
39+
# Lower boundary passed to image_extraction: the first element (the trailing [1])
# whose class is exactly 'app-access-wall', whose class contains
# 'c-related-articles', or that is an <article> with a class containing 'related'.
_lower_boundary_selector = XPath(
40+
"(//*[(@class='app-access-wall') or "
41+
"contains(@class, 'c-related-articles') or "
42+
"(self::article and contains(@class, 'related'))])[1]"
43+
)
44+
45+
# Images: <img> elements inside a <figure> below a div whose class contains 'article__teaser'.
_image_selector = XPath("//div[contains(@class, 'article__teaser')]//figure//img")
46+
47+
# The two selectors below are relative paths ("./ancestor::figure...").
# NOTE(review): presumably evaluated with each matched image as context node - confirm against image_extraction.
# Caption: any <figcaption> inside the enclosing <figure>.
_caption_selector = XPath("./ancestor::figure//figcaption")
48+
# Passed to image_extraction as author_selector: <span> elements with a class
# containing 'copyright' inside the enclosing <figure>.
_author_selector = XPath("./ancestor::figure//span[contains(@class, 'copyright')]")
49+
2750

2851
@attribute
def body(self) -> Optional[ArticleBody]:
    """Build the article body from the summary, subheadline and paragraph selectors.

    Returns:
        The value produced by ``extract_article_body_with_selector`` for the
        precomputed document.
    """
    selectors = {
        "summary_selector": self._summary_selector,
        "subheadline_selector": self._subheadline_selector,
        "paragraph_selector": self._paragraph_selector,
    }
    return extract_article_body_with_selector(self.precomputed.doc, **selectors)
3559

60+
@attribute
def publishing_date(self) -> Optional[datetime.datetime]:
    """Parse the publishing date from the ``datePublished`` linked-data entry.

    Returns:
        The result of ``generic_date_parsing`` applied to the value that
        ``self.precomputed.ld.bf_search("datePublished")`` yields.
    """
    published = self.precomputed.ld.bf_search("datePublished")
    return generic_date_parsing(published)
63+
3664
@attribute
def authors(self) -> List[str]:
    """Parse the article authors from the ``author`` linked-data entry.

    Returns:
        The result of ``generic_author_parsing`` applied to the value that
        ``self.precomputed.ld.bf_search("author")`` yields.
    """
    author_data = self.precomputed.ld.bf_search("author")
    return generic_author_parsing(author_data)
4167

4268
@attribute
def title(self) -> Optional[str]:
    """Return the ``headline`` entry found in the precomputed linked data.

    Returns:
        Whatever ``self.precomputed.ld.bf_search("headline")`` yields, unmodified.
    """
    headline = self.precomputed.ld.bf_search("headline")
    return headline
71+
72+
@attribute
def topics(self) -> List[str]:
    """Parse the article topics from the ``article:tag`` meta entry.

    Returns:
        The result of ``generic_topic_parsing`` applied to
        ``self.precomputed.meta.get("article:tag")``.
    """
    tags = self.precomputed.meta.get("article:tag")
    return generic_topic_parsing(tags)
75+
76+
@attribute
def free_access(self) -> bool:
    """Report whether the linked data flags the article as freely accessible.

    Looks up ``isAccessibleForFree`` via ``self.precomputed.ld.bf_search``.
    A boolean is returned unchanged; a string counts as ``True`` only when it
    equals ``"true"`` ignoring case.

    Returns:
        The interpreted flag, or ``False`` when the value is missing or is
        neither a ``bool`` nor a ``str``.
    """
    access = self.precomputed.ld.bf_search("isAccessibleForFree")
    if isinstance(access, bool):
        return access
    if isinstance(access, str):
        return access.lower() == "true"
    # The method previously fell off the end here and returned None despite the
    # ``bool`` annotation. False keeps the same truthiness for existing callers.
    # NOTE(review): confirm a missing flag should count as "not free" rather than "free".
    return False
83+
84+
@attribute
def images(self) -> List[Image]:
    """Extract the article images after repairing protocol-relative image URLs.

    URLs starting with ``//`` in ``<img>`` attributes (``src``, ``data-src``,
    ``srcset``, ``data-srcset``) and ``<source>`` attributes (``srcset``,
    ``data-srcset``) are prefixed with ``https:``. The attributes are rewritten
    in place on ``self.precomputed.doc``, so calling this mutates the document.

    Returns:
        The result of ``image_extraction``, or an empty list when it raises a
        ``ValueError`` whose message contains "Bounds could not be determined".

    Raises:
        ValueError: Any other ``ValueError`` raised by ``image_extraction``.
    """

    def with_scheme(url: str) -> str:
        # Protocol-relative URLs lack a scheme; anything else is kept unchanged.
        return "https:" + url if url.startswith("//") else url

    def fix_srcset(value: str) -> str:
        # srcset values are comma-separated candidates; trim and repair each one.
        return ", ".join(with_scheme(candidate.strip()) for candidate in value.split(","))

    doc = self.precomputed.doc

    for node in doc.xpath("//img"):
        for attr in ("src", "data-src", "srcset", "data-srcset"):
            value = node.get(attr)
            if not value:
                continue
            if "srcset" in attr:
                node.set(attr, fix_srcset(value))
            elif value.strip().startswith("//"):
                # Single-URL attributes are only rewritten when they need the scheme.
                node.set(attr, "https:" + value.strip())

    for node in doc.xpath("//source"):
        for attr in ("srcset", "data-srcset"):
            value = node.get(attr)
            if value:
                node.set(attr, fix_srcset(value))

    try:
        return image_extraction(
            doc=doc,
            paragraph_selector=self._paragraph_selector,
            image_selector=self._image_selector,
            caption_selector=self._caption_selector,
            author_selector=self._author_selector,
            lower_boundary_selector=self._lower_boundary_selector,
        )
    except ValueError as error:
        # Per the original author's note, this message shows up for paywalled
        # articles without paragraphs; such pages yield no images.
        if "Bounds could not be determined" in str(error):
            return []
        # Bare raise re-raises the active exception with its traceback intact.
        raise

0 commit comments

Comments
 (0)