Skip to content

Commit 886d718

Browse files
committed
New publisher added: Der Freitag
1 parent 716b827 commit 886d718

File tree

6 files changed

+312
-0
lines changed

6 files changed

+312
-0
lines changed

docs/supported_publishers.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1342,6 +1342,25 @@
13421342
<td>&#160;</td>
13431343
<td>&#160;</td>
13441344
</tr>
1345+
<tr>
1346+
<td>
1347+
<code>DerFreitag</code>
1348+
</td>
1349+
<td>
1350+
<div>der Freitag</div>
1351+
</td>
1352+
<td>
1353+
<a href="https://www.freitag.de/">
1354+
<span>www.freitag.de</span>
1355+
</a>
1356+
</td>
1357+
<td>
1358+
<code>de</code>
1359+
</td>
1360+
<td>&#160;</td>
1361+
<td>&#160;</td>
1362+
<td>&#160;</td>
1363+
</tr>
13451364
<tr>
13461365
<td>
13471366
<code>NetzpolitikOrg</code>

src/fundus/publishers/de/__init__.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from .boersenzeitung import BoersenZeitungParser
1313
from .br import BRParser
1414
from .business_insider_de import BusinessInsiderDEParser
15+
from .der_freitag import DerFreitagParser
1516
from .die_welt import DieWeltParser
1617
from .die_zeit import DieZeitParser
1718
from .dw import DWParser
@@ -595,3 +596,13 @@ class DE(metaclass=PublisherGroup):
595596
Sitemap("https://www.gamestar.de/artikel_archiv_index.xml"),
596597
],
597598
)
599+
600+
# Publisher entry for "der Freitag": binds DerFreitagParser to one RSS feed and one sitemap source.
# NOTE(review): the sitemap filter is presumably meant to keep only the "sitemap-articles" sitemaps;
# confirm against the semantics of `inverse` / `regex_filter`, which are not visible here.
DerFreitag = Publisher(
601+
name="der Freitag",
602+
domain="https://www.freitag.de/",
603+
parser=DerFreitagParser,
604+
sources=[
605+
RSSFeed("https://www.freitag.de/@@RSS"),
606+
Sitemap("https://www.freitag.de/sitemap.xml", sitemap_filter=inverse(regex_filter("sitemap-articles"))),
607+
],
608+
)

src/fundus/publishers/de/der_freitag.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
from datetime import date, datetime
2+
from typing import List, Optional
3+
4+
from lxml.cssselect import CSSSelector
5+
from lxml.etree import XPath
6+
7+
from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
8+
from fundus.parser.utility import (
9+
extract_article_body_with_selector,
10+
generic_author_parsing,
11+
generic_date_parsing,
12+
image_extraction,
13+
)
14+
15+
16+
class DerFreitagParser(ParserProxy):
    """Parser for articles published by der Freitag (www.freitag.de)."""

    class V1(BaseParser):
        """Extract article attributes from the page's meta tags, JSON-LD and DOM."""

        # CSS selectors locating the parts of the article text inside the page.
        _summary_selector = CSSSelector("header > p.bc-article-intro__text")
        _paragraph_selector = CSSSelector("div.bo-article-text > p")
        _subheadline_selector = CSSSelector("div.bo-article-text > h2")

        @attribute
        def title(self) -> Optional[str]:
            """Return the value of the ``og:title`` meta entry."""
            return self.precomputed.meta.get("og:title")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Build the article body from the summary, subheadline and paragraph selectors."""
            return extract_article_body_with_selector(
                self.precomputed.doc,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subheadline_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Return the authors parsed from the JSON-LD ``author`` entry."""
            return generic_author_parsing(self.precomputed.ld.bf_search("author"))

        @attribute
        def publishing_date(self) -> Optional[datetime]:
            """Return the publishing date parsed from the JSON-LD ``datePublished`` entry."""
            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))

        @attribute
        def topics(self) -> List[str]:
            """Return the topics parsed from the JSON-LD ``keywords`` entry.

            The raw lookup result is passed through ``generic_topic_parsing`` instead of
            being returned as-is, mirroring how ``authors`` and ``publishing_date`` treat
            their JSON-LD values. Schema.org allows ``keywords`` to be a single
            comma-delimited string, which would not satisfy the ``List[str]`` annotation.
            """
            # Imported locally so this class does not depend on a change to the
            # module-level import list.
            from fundus.parser.utility import generic_topic_parsing

            return generic_topic_parsing(self.precomputed.ld.bf_search("keywords"))

        @attribute
        def images(self) -> List[Image]:
            """Return the images extracted from the document, using the paragraph selector."""
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
            )

0 commit comments

Comments
 (0)