Skip to content

Commit a94311e

Browse files
authored
Nature PY Publisher UK
1 parent c7a220e commit a94311e

File tree

1 file changed

+46
-0
lines changed

1 file changed

+46
-0
lines changed

src/fundus/publishers/uk/nature.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import datetime
2+
from typing import List, Optional
3+
4+
from lxml.cssselect import CSSSelector
5+
from lxml.etree import XPath
6+
7+
from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
8+
from fundus.parser.utility import (
9+
extract_article_body_with_selector,
10+
generic_author_parsing,
11+
generic_date_parsing,
12+
parse_title_from_root,
13+
)
14+
15+
16+
class NatureParser(ParserProxy):
    """Parser proxy for Nature articles; groups the versioned parser classes."""

    class V1(BaseParser):
        """First parser version: reads title, body, authors and publishing date."""

        # Paragraphs are taken from either the article body container or the
        # individual section content containers.
        _paragraph_selector = XPath(
            "//div[contains(@class,'c-article-body')]//p | //div[contains(@class,'c-article-section__content')]//p"
        )
        # Section headings are used as subheadlines of the article body.
        _subheadline_selector = XPath("//h2[contains(@class,'c-article-section__heading')]")
        # Links nested in author list items carry the author names.
        _author_selector = XPath("//li[contains(@class,'c-article-author')]//a")

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``dc.title`` meta value, else the title parsed from the document root."""
            meta_title = self.precomputed.meta.get("dc.title")
            if meta_title:
                return meta_title
            return parse_title_from_root(self.precomputed.doc)

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Build the article body from the paragraph and subheadline selectors."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                paragraph_selector=self._paragraph_selector,
                subheadline_selector=self._subheadline_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Collect the text of every matched author link and hand it to the generic author parser."""
            # Fall back to an empty list when the selector yields nothing truthy.
            author_nodes = self._author_selector(self.precomputed.doc) or []
            raw_names = [author_node.text_content() for author_node in author_nodes]
            return generic_author_parsing(raw_names)

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Parse the publishing date from ``article:published_time``, falling back to ``dc.date``."""
            meta = self.precomputed.meta
            raw_date = meta.get("article:published_time")
            if not raw_date:
                raw_date = meta.get("dc.date")
            return generic_date_parsing(raw_date)

0 commit comments

Comments
 (0)