Skip to content

Commit 6e35b4e

Browse files
authored
Merge pull request #797 from Kucki2018/master
Add Nature (UK Scientific Journal)
2 parents bd1457c + 33fca80 commit 6e35b4e

File tree

6 files changed

+447
-74
lines changed

6 files changed

+447
-74
lines changed

docs/supported_publishers.md

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3131,6 +3131,25 @@
31313131
<td>&#160;</td>
31323132
<td>&#160;</td>
31333133
</tr>
3134+
<tr>
3135+
<td>
3136+
<code>Nature</code>
3137+
</td>
3138+
<td>
3139+
<div>Nature</div>
3140+
</td>
3141+
<td>
3142+
<a href="https://www.nature.com/">
3143+
<span>www.nature.com</span>
3144+
</a>
3145+
</td>
3146+
<td>
3147+
<code>en</code>
3148+
</td>
3149+
<td>&#160;</td>
3150+
<td>&#160;</td>
3151+
<td>&#160;</td>
3152+
</tr>
31343153
<tr>
31353154
<td>
31363155
<code>BBC</code>

src/fundus/publishers/uk/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
1313
from .express import ExpressParser
1414
from .i_news import INewsParser
1515
from .metro import MetroParser
16+
from .nature import NatureParser
1617
from .the_bbc import TheBBCParser
1718
from .the_guardian import TheGuardianParser
1819
from .the_independent import TheIndependentParser
@@ -145,6 +146,17 @@ class UK(metaclass=PublisherGroup):
145146
],
146147
)
147148

149+
# Publisher entry for Nature (www.nature.com), parsed by NatureParser.
# Three URL sources are registered: an RSS feed, a news map and a sitemap.
Nature = Publisher(
    name="Nature",
    domain="https://www.nature.com/",
    parser=NatureParser,
    sources=[
        RSSFeed("https://www.nature.com/nature.rss"),
        NewsMap("https://www.nature.com/latest-news/sitemap.xml"),
        Sitemap("https://www.nature.com/sitemap.xml"),
    ],
)
159+
148160
Express = Publisher(
149161
name="Daily Express",
150162
domain="https://www.express.co.uk/",

src/fundus/publishers/uk/nature.py

Lines changed: 91 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,91 @@
1+
import datetime
2+
import re
3+
from typing import List, Optional
4+
5+
from lxml.cssselect import CSSSelector
6+
from lxml.etree import XPath
7+
8+
from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
9+
from fundus.parser.utility import (
10+
extract_article_body_with_selector,
11+
generic_author_parsing,
12+
generic_date_parsing,
13+
generic_topic_parsing,
14+
image_extraction,
15+
)
16+
17+
18+
class NatureParser(ParserProxy):
    """Parser proxy bundling the parser versions for Nature (www.nature.com)."""

    class V1(BaseParser):
        """Parser version 1: selector-driven extraction combined with JSON-LD lookups."""

        # Summary: paragraphs inside the abstract container, or a paragraph that is the abstract itself.
        _summary_selector = CSSSelector("div.c-article-abstract p, p.c-article-abstract")

        # Paragraphs inside the access-teaser container, excluding any that sit within
        # related/recommended elements, figures, figure captions or links.
        _paragraph_selector = XPath(
            "//div[@data-test='access-teaser']//p["
            " not(ancestor::*[@data-label='Related' or contains(@class, 'recommended')])"
            " and not(contains(@class, 'recommended__title'))"
            " and not(ancestor::figure) and not(ancestor::figcaption) and not(ancestor::a)"
            "]"
        )

        # Second-level headings inside the access teaser that are not part of a "recommended" article box.
        _subheadline_selector = XPath(
            "//div[@data-test='access-teaser']//h2[not(ancestor::article[contains(@class, 'recommended')])]"
        )

        # First element in document order that is the access wall, a related-articles
        # container or a "related" <article>; handed to image_extraction as lower boundary.
        _lower_boundary_selector = XPath(
            "(//*["
            "(@class='app-access-wall') or "
            "contains(@class, 'c-related-articles') or "
            "(self::article and contains(@class, 'related'))"
            "])[1]"
        )

        # Caption lookup, evaluated relative to a context node: any <figcaption> below an enclosing <figure>.
        _caption_selector = XPath("./ancestor::figure//figcaption")

        # Case-insensitive pattern capturing the text after a credit-like prefix in the group "credits".
        _author_pattern = re.compile(r"(?i)\s*(credit|source|illustration|analysis by):?\s+(?P<credits>.*)")

        # Keyword values that are dropped from the extracted topics.
        _bloat_topics = ["multidisciplinary", "Science", "Humanities and Social Sciences"]

        # Container whose presence marks the article as being behind the access wall.
        _paywall_selector = XPath("//div[@class='app-access-wall__container']")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Build the article body from the summary, subheadline and paragraph selectors."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subheadline_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Parse the publishing date from the JSON-LD ``datePublished`` entry."""
            raw_date = self.precomputed.ld.bf_search("datePublished")
            return generic_date_parsing(raw_date)

        @attribute
        def authors(self) -> List[str]:
            """Parse the author names from the JSON-LD ``author`` entry."""
            raw_authors = self.precomputed.ld.bf_search("author")
            return generic_author_parsing(raw_authors)

        @attribute
        def title(self) -> Optional[str]:
            """Return the JSON-LD ``headline`` entry as found by ``bf_search``."""
            headline = self.precomputed.ld.bf_search("headline")
            return headline

        @attribute
        def topics(self) -> List[str]:
            """Parse the JSON-LD ``keywords`` entry and drop everything listed in ``_bloat_topics``."""
            parsed_topics = generic_topic_parsing(self.precomputed.ld.bf_search("keywords"))
            relevant_topics = []
            for candidate in parsed_topics:
                if candidate in self._bloat_topics:
                    continue
                relevant_topics.append(candidate)
            return relevant_topics

        @attribute
        def free_access(self) -> bool:
            """Return ``True`` when the paywall selector matches nothing in the document."""
            paywall_matches = self._paywall_selector(self.precomputed.doc)
            return not paywall_matches

        @attribute
        def images(self) -> List[Image]:
            """Extract the article images via ``image_extraction`` using this parser's selectors."""
            document = self.precomputed.doc
            return image_extraction(
                doc=document,
                paragraph_selector=self._paragraph_selector,
                relative_urls=True,
                caption_selector=self._caption_selector,
                author_selector=self._author_pattern,
                lower_boundary_selector=self._lower_boundary_selector,
            )

0 commit comments

Comments
 (0)