Skip to content

Commit 2b212a3

Browse files
authored
Merge pull request #775 from flairNLP/add-tageblatt
Add `LU`
2 parents 5ccd80f + b0e4241 commit 2b212a3

File tree

10 files changed

+590
-0
lines changed

10 files changed

+590
-0
lines changed

docs/supported_publishers.md

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2199,6 +2199,62 @@
21992199
</table>
22002200

22012201

2202+
## LU-Publishers
2203+
2204+
<table class="publishers lu">
2205+
<thead>
2206+
<tr>
2207+
<th>Class&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
2208+
<th>Name&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
2209+
<th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
2210+
<th>Languages</th>
2211+
<th>Missing&#160;Attributes</th>
2212+
<th>Additional&#160;Attributes&#160;&#160;&#160;&#160;</th>
2213+
</tr>
2214+
</thead>
2215+
<tbody>
2216+
<tr>
2217+
<td>
2218+
<code>LuxemburgerWort</code>
2219+
</td>
2220+
<td>
2221+
<div>Luxemburger Wort</div>
2222+
</td>
2223+
<td>
2224+
<a href="https://www.wort.lu/">
2225+
<span>www.wort.lu</span>
2226+
</a>
2227+
</td>
2228+
<td>
2229+
<code>de</code>
2230+
</td>
2231+
<td>&#160;</td>
2232+
<td>&#160;</td>
2233+
</tr>
2234+
<tr>
2235+
<td>
2236+
<code>Tageblatt</code>
2237+
</td>
2238+
<td>
2239+
<div>Tageblatt</div>
2240+
</td>
2241+
<td>
2242+
<a href="https://www.tageblatt.lu/">
2243+
<span>www.tageblatt.lu</span>
2244+
</a>
2245+
</td>
2246+
<td>
2247+
<code>de</code>
2248+
</td>
2249+
<td>
2250+
<code>topics</code>
2251+
</td>
2252+
<td>&#160;</td>
2253+
</tr>
2254+
</tbody>
2255+
</table>
2256+
2257+
22022258
## MX-Publishers
22032259

22042260
<table class="publishers mx">

src/fundus/publishers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from fundus.publishers.li import LI
2424
from fundus.publishers.ls import LS
2525
from fundus.publishers.lt import LT
26+
from fundus.publishers.lu import LU
2627
from fundus.publishers.mx import MX
2728
from fundus.publishers.my import MY
2829
from fundus.publishers.na import NA
@@ -91,6 +92,7 @@ class PublisherCollection(metaclass=PublisherCollectionMeta):
9192
li = LI
9293
ls = LS
9394
lt = LT
95+
lu = LU
9496
mx = MX
9597
my = MY
9698
na = NA
src/fundus/publishers/lu/__init__.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from fundus.publishers.base_objects import Publisher, PublisherGroup
2+
from fundus.scraping.filter import inverse, regex_filter
3+
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap
4+
5+
from .luxemburger_wort import LuxemburgerWortParser
6+
from .tageblatt import TageblattParser
7+
8+
9+
class LU(metaclass=PublisherGroup):
    # Publisher group for Luxembourg; publishers that do not say otherwise
    # are declared with German as their language.
    default_language = "de"

    # Tageblatt: one Atom feed per category, plus the WordPress sitemap index.
    Tageblatt = Publisher(
        parser=TageblattParser,
        domain="https://www.tageblatt.lu/",
        name="Tageblatt",
        sources=[
            RSSFeed("https://www.tageblatt.lu/category/politik/feed/atom/"),
            RSSFeed("https://www.tageblatt.lu/category/meinung/feed/atom/"),
            RSSFeed("https://www.tageblatt.lu/category/nachrichten/feed/atom/"),
            RSSFeed("https://www.tageblatt.lu/category/wirtschaft/feed/atom/"),
            RSSFeed("https://www.tageblatt.lu/category/sport/feed/atom/"),
            RSSFeed("https://www.tageblatt.lu/category/kultur/feed/atom/"),
            RSSFeed("https://www.tageblatt.lu/category/wissen/feed/atom/"),
            RSSFeed("https://www.tageblatt.lu/category/campus/feed/atom/"),
            RSSFeed("https://www.tageblatt.lu/category/magazin/feed/atom/"),
            RSSFeed("https://www.tageblatt.lu/category/auto/feed/atom/"),
            Sitemap(
                "https://www.tageblatt.lu/wp-sitemap.xml",
                reverse=True,
                # NOTE(review): presumably this restricts crawling to the
                # "posts-post" sub-sitemaps -- confirm against the semantics
                # of `inverse` / `regex_filter` in fundus.scraping.filter.
                sitemap_filter=inverse(regex_filter("posts-post")),
            ),
        ],
    )

    # Luxemburger Wort: German RSS feed, full sitemap and news sitemap.
    LuxemburgerWort = Publisher(
        parser=LuxemburgerWortParser,
        domain="https://www.wort.lu/",
        name="Luxemburger Wort",
        sources=[
            RSSFeed("https://www.wort.lu/de/rss"),
            Sitemap("https://www.wort.lu/sitemap.xml", reverse=True),
            NewsMap("https://www.wort.lu/sitemap-news.xml"),
        ],
    )
src/fundus/publishers/lu/luxemburger_wort.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
import datetime
2+
import re
3+
from typing import List, Optional
4+
5+
from lxml.cssselect import CSSSelector
6+
from lxml.etree import XPath
7+
8+
from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
9+
from fundus.parser.utility import (
10+
extract_article_body_with_selector,
11+
generic_author_parsing,
12+
generic_date_parsing,
13+
generic_nodes_to_text,
14+
generic_topic_parsing,
15+
image_extraction,
16+
)
17+
18+
19+
class LuxemburgerWortParser(ParserProxy):
    # Parser for Luxemburger Wort (www.wort.lu) article pages.

    class V1(BaseParser):
        # Selectors for the three kinds of nodes that make up the article body.
        _paragraph_selector = XPath("//p[contains(@class, 'articleParagraph')]")
        _summary_selector = XPath("//h2[contains(@class, 'articleParagraph')]")
        _subheadline_selector = XPath("//h4[contains(@class, 'articleSubheading')]")

        # Anchor elements inside the tag list; their text is used as topics.
        _topic_selector = XPath("//div[contains(@class, 'tag-list')]//a")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Build the article body from the summary, subheadline and paragraph selectors."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                paragraph_selector=self._paragraph_selector,
                subheadline_selector=self._subheadline_selector,
                summary_selector=self._summary_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Return the authors parsed from the ``author`` entry of the linked data."""
            raw_authors = self.precomputed.ld.bf_search("author")
            return generic_author_parsing(raw_authors)

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Return the date parsed from the ``datePublished`` entry of the linked data."""
            raw_date = self.precomputed.ld.bf_search("datePublished")
            return generic_date_parsing(raw_date)

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``og:title`` meta value, or ``None`` when it is absent."""
            meta = self.precomputed.meta
            return meta.get("og:title")

        @attribute
        def topics(self) -> List[str]:
            """Return the topics parsed from the text of the tag-list links."""
            topic_nodes = self._topic_selector(self.precomputed.doc)
            topic_texts = generic_nodes_to_text(topic_nodes)
            return generic_topic_parsing(topic_texts)

        @attribute
        def images(self) -> List[Image]:
            """Extract the article images together with their captions and credits."""
            # Images inside a <figure> whose class contains 'Teaser' are excluded.
            article_images = XPath("//figure[not(contains(@class, 'Teaser'))]//img")
            # The caption is looked up relative to each image's enclosing <figure>.
            captions = XPath("./ancestor::figure//div[contains(@class, 'ImageCaption')]")
            # Credits are whatever follows a case-insensitive "Foto:" prefix.
            credits_pattern = re.compile(r"(?i)Foto:\s*(?P<credits>.*)")
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                image_selector=article_images,
                upper_boundary_selector=CSSSelector("h1"),
                caption_selector=captions,
                author_selector=credits_pattern,
            )
src/fundus/publishers/lu/tageblatt.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import datetime
2+
import re
3+
from typing import List, Optional
4+
5+
from lxml.cssselect import CSSSelector
6+
from lxml.etree import XPath
7+
8+
from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
9+
from fundus.parser.utility import (
10+
extract_article_body_with_selector,
11+
generic_author_parsing,
12+
generic_date_parsing,
13+
image_extraction,
14+
)
15+
16+
17+
class TageblattParser(ParserProxy):
    # Parser for Tageblatt (www.tageblatt.lu) article pages.

    class V1(BaseParser):
        # Only <p class="text"> children of the text-content container that
        # have non-blank text are treated as paragraphs.
        _paragraph_selector = XPath("//div[@class='text-content']/p[@class='text' and normalize-space(text())]")
        _summary_selector = XPath("//p[contains(@class,'teaser__text')]")
        _subheadline_selector = XPath("//div[@class='text-content']//h2[contains(@class,'crosshead')]")

        # Author values that are dropped from the parsed author list.
        _bloat_authors = ["No Author", "Redaktion"]

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Build the article body from the summary, subheadline and paragraph selectors."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                paragraph_selector=self._paragraph_selector,
                subheadline_selector=self._subheadline_selector,
                summary_selector=self._summary_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Return the linked-data authors, minus the entries in ``_bloat_authors``."""
            parsed_authors = generic_author_parsing(self.precomputed.ld.bf_search("author"))
            kept: List[str] = []
            for candidate in parsed_authors:
                if candidate in self._bloat_authors:
                    continue
                kept.append(candidate)
            return kept

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Return the date parsed from the ``datePublished`` entry of the linked data."""
            raw_date = self.precomputed.ld.bf_search("datePublished")
            return generic_date_parsing(raw_date)

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``og:title`` meta value, or ``None`` when it is absent."""
            meta = self.precomputed.meta
            return meta.get("og:title")

        @attribute
        def images(self) -> List[Image]:
            """Extract the article images together with their credits."""
            # Credits are whatever follows a case-insensitive "Foto:" or "Bild:" prefix.
            credits_pattern = re.compile(r"(?i)(Foto|Bild):\s*(?P<credits>.*)")
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                upper_boundary_selector=CSSSelector("h1"),
                author_selector=credits_pattern,
            )

0 commit comments

Comments
 (0)