flairNLP
diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md
Lines changed: 19 additions & 0 deletions
diff --git a/src/fundus/publishers/uk/__init__.py b/src/fundus/publishers/uk/__init__.py
Lines changed: 12 additions & 0 deletions
diff --git a/src/fundus/publishers/uk/nature.py b/src/fundus/publishers/uk/nature.py
Lines changed: 91 additions & 0 deletions
@@ -3131,6 +3131,25 @@
       <td>&#160;</td>
       <td>&#160;</td>
     </tr>
+    <tr>
+      <td>
+        <code>Nature</code>
+      </td>
+      <td>
+        <div>Nature</div>
+      </td>
+      <td>
+        <a href="https://www.nature.com/">
+          <span>www.nature.com</span>
+        </a>
+      </td>
+      <td>
+        <code>en</code>
+      </td>
+      <td>&#160;</td>
+      <td>&#160;</td>
+      <td>&#160;</td>
+    </tr>
     <tr>
       <td>
         <code>BBC</code>
 
@@ -13,6 +13,7 @@
 from .express import ExpressParser
 from .i_news import INewsParser
 from .metro import MetroParser
+from .nature import NatureParser
 from .the_bbc import TheBBCParser
 from .the_guardian import TheGuardianParser
 from .the_independent import TheIndependentParser
@@ -145,6 +146,17 @@ class UK(metaclass=PublisherGroup):
         ],
     )
 
+    # Nature (www.nature.com), parsed with NatureParser. Article URLs are
+    # gathered from three sources: the journal RSS feed, the latest-news
+    # sitemap (wrapped as a NewsMap) and the site-wide sitemap.
+    Nature = Publisher(
+        name="Nature",
+        domain="https://www.nature.com/",
+        parser=NatureParser,
+        sources=[
+            RSSFeed("https://www.nature.com/nature.rss"),
+            NewsMap("https://www.nature.com/latest-news/sitemap.xml"),
+            Sitemap("https://www.nature.com/sitemap.xml"),
+        ],
+    )
+
     Express = Publisher(
         name="Daily Express",
         domain="https://www.express.co.uk/",
 
@@ -0,0 +1,91 @@
+import datetime
+import re
+from typing import List, Optional
+
+from lxml.cssselect import CSSSelector
+from lxml.etree import XPath
+
+from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
+from fundus.parser.utility import (
+    extract_article_body_with_selector,
+    generic_author_parsing,
+    generic_date_parsing,
+    generic_topic_parsing,
+    image_extraction,
+)
+
+
+class NatureParser(ParserProxy):
+    # Parser for articles served from www.nature.com. It currently holds a
+    # single parser version, V1.
+    class V1(BaseParser):
+        # Summary: <p> elements inside div.c-article-abstract, or a <p> that
+        # itself carries the c-article-abstract class.
+        _summary_selector = CSSSelector("div.c-article-abstract p, p.c-article-abstract")
+
+        # Body paragraphs: every <p> below the div marked
+        # data-test='access-teaser', except paragraphs that
+        #   - sit inside an element with data-label='Related' or a class
+        #     containing 'recommended',
+        #   - carry a class containing 'recommended__title' themselves,
+        #   - sit inside a <figure>, <figcaption> or <a>.
+        _paragraph_selector = XPath(
+            "//div[@data-test='access-teaser']//p"
+            "["
+            "  not(ancestor::*[@data-label='Related' or contains(@class, 'recommended')])"
+            "  and not(contains(@class, 'recommended__title'))"
+            "  and not(ancestor::figure)"
+            "  and not(ancestor::figcaption)"
+            "  and not(ancestor::a)"
+            "]"
+        )
+
+        # Subheadlines: <h2> elements below the same access-teaser div that
+        # are not nested in an <article> whose class contains 'recommended'.
+        # (The two adjacent string literals are concatenated by Python.)
+        _subheadline_selector = XPath(
+            "//div[@data-test='access-teaser']//h2" "[not(ancestor::article[contains(@class, 'recommended')])]"
+        )
+
+        # First element (the trailing [1] on the parenthesised node set) that
+        # is either the access wall (class exactly 'app-access-wall'), has a
+        # class containing 'c-related-articles', or is an <article> whose
+        # class contains 'related'. Handed to image_extraction as its lower
+        # boundary -- presumably images after this node are ignored; confirm
+        # against the helper.
+        _lower_boundary_selector = XPath(
+            "(//*[(@class='app-access-wall') or "
+            "contains(@class, 'c-related-articles') or "
+            "(self::article and contains(@class, 'related'))])[1]"
+        )
+        # Relative XPath: <figcaption> descendants of any <figure> ancestor of
+        # the context node.
+        _caption_selector = XPath("./ancestor::figure//figcaption")
+        # Case-insensitive pattern for credit lines: one of "credit",
+        # "source", "illustration" or "analysis by", an optional colon, then
+        # whitespace; the remainder is captured in the named group 'credits'.
+        _author_pattern = re.compile(r"(?i)\s*(credit|source|illustration|analysis by):?\s+(?P<credits>.*)")
+
+        # Keywords that are dropped from the topics attribute (exact,
+        # case-sensitive match).
+        _bloat_topics = ["multidisciplinary", "Science", "Humanities and Social Sciences"]
+
+        # Matches a <div> whose class is exactly 'app-access-wall__container';
+        # any match marks the article as not freely accessible.
+        _paywall_selector = XPath("//div[@class='app-access-wall__container']")
+
+        # Article body assembled from the summary, subheadline and paragraph
+        # selectors defined above.
+        @attribute
+        def body(self) -> Optional[ArticleBody]:
+            return extract_article_body_with_selector(
+                self.precomputed.doc,
+                summary_selector=self._summary_selector,
+                subheadline_selector=self._subheadline_selector,
+                paragraph_selector=self._paragraph_selector,
+            )
+
+        # Publication date parsed from the 'datePublished' entry of the
+        # precomputed `ld` data (presumably the page's JSON-LD -- confirm).
+        @attribute
+        def publishing_date(self) -> Optional[datetime.datetime]:
+            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))
+
+        # Author names parsed from the 'author' entry of the `ld` data.
+        @attribute
+        def authors(self) -> List[str]:
+            return generic_author_parsing(self.precomputed.ld.bf_search("author"))
+
+        # Title taken unmodified from the 'headline' entry of the `ld` data.
+        @attribute
+        def title(self) -> Optional[str]:
+            return self.precomputed.ld.bf_search("headline")
+
+        # Topics parsed from the 'keywords' entry of the `ld` data, with every
+        # entry listed in _bloat_topics removed.
+        @attribute
+        def topics(self) -> List[str]:
+            return [
+                topic
+                for topic in generic_topic_parsing(self.precomputed.ld.bf_search("keywords"))
+                if topic not in self._bloat_topics
+            ]
+
+        # True when the paywall selector finds nothing in the document.
+        @attribute
+        def free_access(self) -> bool:
+            return not bool(self._paywall_selector(self.precomputed.doc))
+
+        # Images extracted from the document. Captions come from
+        # _caption_selector, credits are matched with _author_pattern, and
+        # _lower_boundary_selector is passed as the lower boundary.
+        # NOTE(review): relative_urls=True presumably tells the helper that
+        # image URLs may be relative -- confirm against image_extraction.
+        @attribute
+        def images(self) -> List[Image]:
+            return image_extraction(
+                doc=self.precomputed.doc,
+                paragraph_selector=self._paragraph_selector,
+                relative_urls=True,
+                caption_selector=self._caption_selector,
+                author_selector=self._author_pattern,
+                lower_boundary_selector=self._lower_boundary_selector,
+            )