Skip to content

Commit 925990e

Browse files
authored
Merge pull request #798 from bresslem/add-publisher-der-freitag
New publisher added: Der Freitag
2 parents ba61e0f + a70ce32 commit 925990e

File tree

6 files changed

+232
-0
lines changed

6 files changed

+232
-0
lines changed

docs/supported_publishers.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1361,6 +1361,25 @@
13611361
<td>&#160;</td>
13621362
<td>&#160;</td>
13631363
</tr>
1364+
<tr>
1365+
<td>
1366+
<code>DerFreitag</code>
1367+
</td>
1368+
<td>
1369+
<div>der Freitag</div>
1370+
</td>
1371+
<td>
1372+
<a href="https://www.freitag.de/">
1373+
<span>www.freitag.de</span>
1374+
</a>
1375+
</td>
1376+
<td>
1377+
<code>de</code>
1378+
</td>
1379+
<td>&#160;</td>
1380+
<td>&#160;</td>
1381+
<td>&#160;</td>
1382+
</tr>
13641383
<tr>
13651384
<td>
13661385
<code>NetzpolitikOrg</code>

src/fundus/publishers/de/__init__.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from .boersenzeitung import BoersenZeitungParser
1313
from .br import BRParser
1414
from .business_insider_de import BusinessInsiderDEParser
15+
from .der_freitag import DerFreitagParser
1516
from .die_welt import DieWeltParser
1617
from .die_zeit import DieZeitParser
1718
from .dw import DWParser
@@ -605,3 +606,13 @@ class DE(metaclass=PublisherGroup):
605606
Sitemap("https://www.gamestar.de/artikel_archiv_index.xml"),
606607
],
607608
)
609+
610+
DerFreitag = Publisher(
611+
name="der Freitag",
612+
domain="https://www.freitag.de/",
613+
parser=DerFreitagParser,
614+
sources=[
615+
RSSFeed("https://www.freitag.de/@@RSS"),
616+
Sitemap("https://www.freitag.de/sitemap.xml", sitemap_filter=inverse(regex_filter("sitemap-articles"))),
617+
],
618+
)
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
from datetime import date, datetime
2+
from typing import List, Optional
3+
4+
from lxml.cssselect import CSSSelector
5+
from lxml.etree import XPath
6+
7+
from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
8+
from fundus.parser.utility import (
9+
extract_article_body_with_selector,
10+
generic_author_parsing,
11+
generic_date_parsing,
12+
image_extraction,
13+
)
14+
15+
16+
class DerFreitagParser(ParserProxy):
    """Parser for articles of the publisher der Freitag (www.freitag.de)."""

    class V1(BaseParser):
        """Version 1 of the der Freitag parser.

        Every ``@attribute`` method reads from ``self.precomputed`` (the
        ``meta`` mapping, the ``ld`` data and the parsed ``doc`` tree).
        """

        # Selectors for the intro text, the body paragraphs and the
        # sub-headlines; paragraphs and sub-headlines must be direct
        # children of the article text container.
        _summary_selector = CSSSelector("header > p.bc-article-intro__text")
        _paragraph_selector = CSSSelector("div.bo-article-text > p")
        _subheadline_selector = CSSSelector("div.bo-article-text > h2")

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``og:title`` meta entry, or ``None`` if it is absent."""
            return self.precomputed.meta.get("og:title")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Extract the article body using the class-level selectors."""
            return extract_article_body_with_selector(
                self.precomputed.doc,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subheadline_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Return the authors parsed from the ``author`` entry of the ``ld`` data."""
            return generic_author_parsing(self.precomputed.ld.bf_search("author"))

        @attribute
        def publishing_date(self) -> Optional[datetime]:
            """Return the date parsed from the ``datePublished`` entry of the ``ld`` data."""
            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))

        @attribute
        def topics(self) -> List[str]:
            """Return the topics parsed from the ``keywords`` entry of the ``ld`` data.

            The raw search result is not guaranteed to be a list of strings
            (the entry may be missing or given as one delimited string), so it
            is normalised the same way ``authors`` and ``publishing_date``
            normalise their raw values, keeping the ``List[str]`` contract.
            """
            # Imported locally so this block stays self-contained and does not
            # depend on a change to the module-level import list.
            from fundus.parser.utility import generic_topic_parsing

            return generic_topic_parsing(self.precomputed.ld.bf_search("keywords"))

        @attribute
        def images(self) -> List[Image]:
            """Extract the article's images together with caption and credit.

            The search is bounded above by the article intro header and below
            by the article-end marker. Caption and credit are looked up in the
            ``figcaption`` of the ``figure`` enclosing each image.
            """
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                upper_boundary_selector=CSSSelector("header.bc-article-intro"),
                lower_boundary_selector=CSSSelector("span.freitag-article-end"),
                image_selector=CSSSelector("figure img,div[role='figure'] img"),
                caption_selector=XPath("./ancestor::figure//figcaption//span[@class='bo-image__caption__desc']"),
                author_selector=XPath("./ancestor::figure//figcaption//span[@class='bo-image__caption__credit']"),
            )

0 commit comments

Comments
 (0)