flairNLP
diff --git a/‎docs/supported_publishers.md‎
Lines changed: 56 additions & 0 deletions b/‎docs/supported_publishers.md‎
Lines changed: 56 additions & 0 deletions
diff --git a/‎src/fundus/publishers/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎src/fundus/publishers/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/fundus/publishers/lu/__init__.py‎
Lines changed: 44 additions & 0 deletions b/‎src/fundus/publishers/lu/__init__.py‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎src/fundus/publishers/lu/luxemburger_wort.py‎
Lines changed: 61 additions & 0 deletions b/‎src/fundus/publishers/lu/luxemburger_wort.py‎
Lines changed: 61 additions & 0 deletions
diff --git a/‎src/fundus/publishers/lu/tageblatt.py‎
Lines changed: 57 additions & 0 deletions b/‎src/fundus/publishers/lu/tageblatt.py‎
Lines changed: 57 additions & 0 deletions
@@ -2199,6 +2199,62 @@
 </table>
 
 
+## LU-Publishers
+
+<table class="publishers lu">
+  <thead>
+    <tr>
+      <th>Class&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
+      <th>Name&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
+      <th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
+      <th>Languages</th>
+      <th>Missing&#160;Attributes</th>
+      <th>Additional&#160;Attributes&#160;&#160;&#160;&#160;</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>
+        <code>LuxemburgerWort</code>
+      </td>
+      <td>
+        <div>Luxemburger Wort</div>
+      </td>
+      <td>
+        <a href="https://www.wort.lu/">
+          <span>www.wort.lu</span>
+        </a>
+      </td>
+      <td>
+        <code>de</code>
+      </td>
+      <td>&#160;</td>
+      <td>&#160;</td>
+    </tr>
+    <tr>
+      <td>
+        <code>Tageblatt</code>
+      </td>
+      <td>
+        <div>Tageblatt</div>
+      </td>
+      <td>
+        <a href="https://www.tageblatt.lu/">
+          <span>www.tageblatt.lu</span>
+        </a>
+      </td>
+      <td>
+        <code>de</code>
+      </td>
+      <td>
+        <code>topics</code>
+      </td>
+      <td>&#160;</td>
+    </tr>
+  </tbody>
+</table>
+
+
 ## MX-Publishers
 
 <table class="publishers mx">
 
@@ -23,6 +23,7 @@
 from fundus.publishers.li import LI
 from fundus.publishers.ls import LS
 from fundus.publishers.lt import LT
+from fundus.publishers.lu import LU
 from fundus.publishers.mx import MX
 from fundus.publishers.my import MY
 from fundus.publishers.na import NA
@@ -91,6 +92,7 @@ class PublisherCollection(metaclass=PublisherCollectionMeta):
     li = LI
     ls = LS
     lt = LT
+    lu = LU
     mx = MX
     my = MY
     na = NA
 
@@ -0,0 +1,44 @@
+from fundus.publishers.base_objects import Publisher, PublisherGroup
+from fundus.scraping.filter import inverse, regex_filter
+from fundus.scraping.url import NewsMap, RSSFeed, Sitemap
+
+from .luxemburger_wort import LuxemburgerWortParser
+from .tageblatt import TageblattParser
+
+
+class LU(metaclass=PublisherGroup):
+    # Publisher group "LU": each Publisher attribute below pairs a site with
+    # its parser class and the URL sources used to discover articles.
+
+    # Group-level default language code ("de").
+    default_language = "de"
+
+    # Tageblatt: one feed per site category (the URLs point at the Atom
+    # endpoints, wrapped in RSSFeed), followed by the site's sitemap index.
+    Tageblatt = Publisher(
+        name="Tageblatt",
+        domain="https://www.tageblatt.lu/",
+        parser=TageblattParser,
+        sources=[
+            RSSFeed("https://www.tageblatt.lu/category/politik/feed/atom/"),
+            RSSFeed("https://www.tageblatt.lu/category/meinung/feed/atom/"),
+            RSSFeed("https://www.tageblatt.lu/category/nachrichten/feed/atom/"),
+            RSSFeed("https://www.tageblatt.lu/category/wirtschaft/feed/atom/"),
+            RSSFeed("https://www.tageblatt.lu/category/sport/feed/atom/"),
+            RSSFeed("https://www.tageblatt.lu/category/kultur/feed/atom/"),
+            RSSFeed("https://www.tageblatt.lu/category/wissen/feed/atom/"),
+            RSSFeed("https://www.tageblatt.lu/category/campus/feed/atom/"),
+            RSSFeed("https://www.tageblatt.lu/category/magazin/feed/atom/"),
+            RSSFeed("https://www.tageblatt.lu/category/auto/feed/atom/"),
+            Sitemap(
+                "https://www.tageblatt.lu/wp-sitemap.xml",
+                # NOTE(review): presumably restricts crawling to the nested
+                # sitemaps whose URL matches "posts-post" - confirm against
+                # the semantics of `sitemap_filter` / `inverse`.
+                sitemap_filter=inverse(regex_filter("posts-post")),
+                # NOTE(review): `reverse=True` presumably flips the order in
+                # which sitemap entries are visited - confirm in Sitemap.
+                reverse=True,
+            ),
+        ],
+    )
+
+    # Luxemburger Wort: a single RSS feed, the general sitemap and the
+    # dedicated news sitemap.
+    LuxemburgerWort = Publisher(
+        name="Luxemburger Wort",
+        domain="https://www.wort.lu/",
+        parser=LuxemburgerWortParser,
+        sources=[
+            RSSFeed("https://www.wort.lu/de/rss"),
+            Sitemap("https://www.wort.lu/sitemap.xml", reverse=True),
+            NewsMap("https://www.wort.lu/sitemap-news.xml"),
+        ],
+    )
@@ -0,0 +1,61 @@
+import datetime
+import re
+from typing import List, Optional
+
+from lxml.cssselect import CSSSelector
+from lxml.etree import XPath
+
+from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
+from fundus.parser.utility import (
+    extract_article_body_with_selector,
+    generic_author_parsing,
+    generic_date_parsing,
+    generic_nodes_to_text,
+    generic_topic_parsing,
+    image_extraction,
+)
+
+
+class LuxemburgerWortParser(ParserProxy):
+    class V1(BaseParser):
+        _paragraph_selector = XPath("//p[contains(@class, 'articleParagraph')]")
+        _summary_selector = XPath("//h2[contains(@class, 'articleParagraph')]")
+        _subheadline_selector = XPath("//h4[contains(@class, 'articleSubheading')]")
+
+        _topic_selector = XPath("//div[contains(@class, 'tag-list')]//a")
+
+        @attribute
+        def body(self) -> Optional[ArticleBody]:
+            return extract_article_body_with_selector(
+                self.precomputed.doc,
+                summary_selector=self._summary_selector,
+                subheadline_selector=self._subheadline_selector,
+                paragraph_selector=self._paragraph_selector,
+            )
+
+        @attribute
+        def authors(self) -> List[str]:
+            return generic_author_parsing(self.precomputed.ld.bf_search("author"))
+
+        @attribute
+        def publishing_date(self) -> Optional[datetime.datetime]:
+            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))
+
+        @attribute
+        def title(self) -> Optional[str]:
+            return self.precomputed.meta.get("og:title")
+
+        @attribute
+        def topics(self) -> List[str]:
+            return generic_topic_parsing(generic_nodes_to_text(self._topic_selector(self.precomputed.doc)))
+
+        @attribute
+        def images(self) -> List[Image]:
+            return image_extraction(
+                doc=self.precomputed.doc,
+                paragraph_selector=self._paragraph_selector,
+                image_selector=XPath("//figure[not(contains(@class, 'Teaser'))]//img"),
+                upper_boundary_selector=CSSSelector("h1"),
+                caption_selector=XPath("./ancestor::figure//div[contains(@class, 'ImageCaption')]"),
+                author_selector=re.compile(r"(?i)Foto:\s*(?P<credits>.*)"),
+            )
@@ -0,0 +1,57 @@
+import datetime
+import re
+from typing import List, Optional
+
+from lxml.cssselect import CSSSelector
+from lxml.etree import XPath
+
+from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
+from fundus.parser.utility import (
+    extract_article_body_with_selector,
+    generic_author_parsing,
+    generic_date_parsing,
+    image_extraction,
+)
+
+
+class TageblattParser(ParserProxy):
+    class V1(BaseParser):
+        _paragraph_selector = XPath("//div[@class='text-content']/p[@class='text' and normalize-space(text())]")
+        _summary_selector = XPath("//p[contains(@class,'teaser__text')]")
+        _subheadline_selector = XPath("//div[@class='text-content']//h2[contains(@class,'crosshead')]")
+
+        _bloat_authors = ["No Author", "Redaktion"]
+
+        @attribute
+        def body(self) -> Optional[ArticleBody]:
+            return extract_article_body_with_selector(
+                self.precomputed.doc,
+                summary_selector=self._summary_selector,
+                subheadline_selector=self._subheadline_selector,
+                paragraph_selector=self._paragraph_selector,
+            )
+
+        @attribute
+        def authors(self) -> List[str]:
+            return [
+                author
+                for author in generic_author_parsing(self.precomputed.ld.bf_search("author"))
+                if author not in self._bloat_authors
+            ]
+
+        @attribute
+        def publishing_date(self) -> Optional[datetime.datetime]:
+            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))
+
+        @attribute
+        def title(self) -> Optional[str]:
+            return self.precomputed.meta.get("og:title")
+
+        @attribute
+        def images(self) -> List[Image]:
+            return image_extraction(
+                doc=self.precomputed.doc,
+                paragraph_selector=self._paragraph_selector,
+                upper_boundary_selector=CSSSelector("h1"),
+                author_selector=re.compile(r"(?i)(Foto|Bild):\s*(?P<credits>.*)"),
+            )