Skip to content

Commit 33fca80

Browse files
committed
fix formatting, typing and some minor issues
1 parent 33f44bf commit 33fca80

File tree

5 files changed

+262
-93
lines changed

5 files changed

+262
-93
lines changed

src/fundus/publishers/uk/nature.py

Lines changed: 21 additions & 68 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import datetime
2+
import re
23
from typing import List, Optional
34

45
from lxml.cssselect import CSSSelector
@@ -18,7 +19,6 @@ class NatureParser(ParserProxy):
1819
class V1(BaseParser):
1920
_summary_selector = CSSSelector("div.c-article-abstract p, p.c-article-abstract")
2021

21-
#_paragraph_selector = CSSSelector("div.article__teaser[data-test='access-teaser'] > p")
2222
_paragraph_selector = XPath(
2323
"//div[@data-test='access-teaser']//p"
2424
"["
@@ -30,23 +30,21 @@ class V1(BaseParser):
3030
"]"
3131
)
3232

33-
#_subheadline_selector = CSSSelector("div.article__teaser[data-test='access-teaser'] > h2")
3433
_subheadline_selector = XPath(
35-
"//div[@data-test='access-teaser']//h2"
36-
"[not(ancestor::article[contains(@class, 'recommended')])]"
34+
"//div[@data-test='access-teaser']//h2" "[not(ancestor::article[contains(@class, 'recommended')])]"
3735
)
3836

3937
_lower_boundary_selector = XPath(
4038
"(//*[(@class='app-access-wall') or "
4139
"contains(@class, 'c-related-articles') or "
4240
"(self::article and contains(@class, 'related'))])[1]"
4341
)
44-
45-
_image_selector = XPath("//div[contains(@class, 'article__teaser')]//figure//img")
46-
4742
_caption_selector = XPath("./ancestor::figure//figcaption")
48-
_author_selector = XPath("./ancestor::figure//span[contains(@class, 'copyright')]")
43+
_author_pattern = re.compile(r"(?i)\s*(credit|source|illustration|analysis by):?\s+(?P<credits>.*)")
44+
45+
_bloat_topics = ["multidisciplinary", "Science", "Humanities and Social Sciences"]
4946

47+
_paywall_selector = XPath("//div[@class='app-access-wall__container']")
5048

5149
@attribute
5250
def body(self) -> Optional[ArticleBody]:
@@ -71,68 +69,23 @@ def title(self) -> Optional[str]:
7169

7270
@attribute
7371
def topics(self) -> List[str]:
74-
return generic_topic_parsing(self.precomputed.meta.get("article:tag"))
75-
72+
return [
73+
topic
74+
for topic in generic_topic_parsing(self.precomputed.ld.bf_search("keywords"))
75+
if topic not in self._bloat_topics
76+
]
77+
7678
@attribute
7779
def free_access(self) -> bool:
78-
access = self.precomputed.ld.bf_search("isAccessibleForFree")
79-
if isinstance(access, bool):
80-
return access
81-
if isinstance(access, str):
82-
return access.lower() == "true"
80+
return not bool(self._paywall_selector(self.precomputed.doc))
8381

8482
@attribute
8583
def images(self) -> List[Image]:
86-
all_img_nodes = self.precomputed.doc.xpath("//img")
87-
for node in all_img_nodes:
88-
for attr in ["src", "data-src", "srcset", "data-srcset"]:
89-
attr_val = node.get(attr)
90-
if not attr_val:
91-
continue
92-
93-
#Nature uses nasty URLs for their Images, missing https:
94-
if "srcset" in attr:
95-
fixed_parts = []
96-
for part in attr_val.split(","):
97-
part = part.strip()
98-
if part.startswith("//"):
99-
fixed_parts.append("https:" + part)
100-
else:
101-
fixed_parts.append(part)
102-
node.set(attr, ", ".join(fixed_parts))
103-
elif attr_val.strip().startswith("//"):
104-
node.set(attr, "https:" + attr_val.strip())
105-
106-
all_source_nodes = self.precomputed.doc.xpath("//source")
107-
for node in all_source_nodes:
108-
for attr in ["srcset", "data-srcset"]:
109-
attr_val = node.get(attr)
110-
if not attr_val:
111-
continue
112-
113-
#Nature uses nasty URLs for their Images, missing https:
114-
fixed_parts = []
115-
for part in attr_val.split(","):
116-
part = part.strip()
117-
if part.startswith("//"):
118-
fixed_parts.append("https:" + part)
119-
else:
120-
fixed_parts.append(part)
121-
node.set(attr, ", ".join(fixed_parts))
122-
123-
#Try-Catch for Bounds Error if PayWall
124-
try:
125-
return image_extraction(
126-
doc=self.precomputed.doc,
127-
paragraph_selector=self._paragraph_selector,
128-
image_selector=self._image_selector,
129-
caption_selector=self._caption_selector,
130-
author_selector=self._author_selector,
131-
lower_boundary_selector=self._lower_boundary_selector,
132-
)
133-
except ValueError as e:
134-
if "Bounds could not be determined" in str(e):
135-
#This is a paywalled article with no paragraphs.
136-
return []
137-
else:
138-
raise e
84+
return image_extraction(
85+
doc=self.precomputed.doc,
86+
paragraph_selector=self._paragraph_selector,
87+
relative_urls=True,
88+
caption_selector=self._caption_selector,
89+
author_selector=self._author_pattern,
90+
lower_boundary_selector=self._lower_boundary_selector,
91+
)

0 commit comments

Comments
 (0)