diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md index 7952efc96..2f011e1ae 100644 --- a/docs/supported_publishers.md +++ b/docs/supported_publishers.md @@ -937,6 +937,28 @@     + + + KlasseGegenKlasse + + +
Klasse Gegen Klasse
+ + + + www.klassegegenklasse.org + + + + de + + + images + title + +   +   + Krautreporter diff --git a/src/fundus/publishers/de/__init__.py b/src/fundus/publishers/de/__init__.py index e67dbe58f..48b1bdbca 100644 --- a/src/fundus/publishers/de/__init__.py +++ b/src/fundus/publishers/de/__init__.py @@ -26,6 +26,7 @@ from .hessenschau import HessenschauParser from .junge_welt import JungeWeltParser from .kicker import KickerParser +from .klassegegenklasse import KlasseGegenKlasseParser from .krautreporter import KrautreporterParser from .mdr import MDRParser from .merkur import MerkurParser @@ -595,3 +596,16 @@ class DE(metaclass=PublisherGroup): Sitemap("https://www.gamestar.de/artikel_archiv_index.xml"), ], ) + + KlasseGegenKlasse = Publisher( + name="Klasse Gegen Klasse", + domain="https://www.klassegegenklasse.org/", + parser=KlasseGegenKlasseParser, + sources=[ + RSSFeed("https://www.klassegegenklasse.org/feed/"), + Sitemap( + "https://www.klassegegenklasse.org/wp-sitemap.xml", + ), + ], + request_header={"user-agent": "Fundus"}, + ) diff --git a/src/fundus/publishers/de/klassegegenklasse.py b/src/fundus/publishers/de/klassegegenklasse.py new file mode 100644 index 000000000..c742a0300 --- /dev/null +++ b/src/fundus/publishers/de/klassegegenklasse.py @@ -0,0 +1,151 @@ +import re +from datetime import datetime +from typing import List, Optional + +from lxml.cssselect import CSSSelector +from lxml.etree import XPath + +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute +from fundus.parser.utility import ( + extract_article_body_with_selector, + generic_author_parsing, + generic_date_parsing, + generic_topic_parsing, + image_extraction, +) + + +class KlasseGegenKlasseParser(ParserProxy): + class V1(BaseParser): + _paragraph_selector = CSSSelector("article p, main article p, .post-content p, .entry-content p, .content p") + _summary_selector = CSSSelector( + "article .entry-content > p:first-child, article > p:first-child, .post-content > p:first-child" + ) + _subheadline_selector = CSSSelector("article h2, .entry-content h2, .post-content h2") + + @attribute + def body(self) -> Optional[ArticleBody]: + return extract_article_body_with_selector( + self.precomputed.doc, + summary_selector=self._summary_selector, + subheadline_selector=self._subheadline_selector, + paragraph_selector=self._paragraph_selector, + ) + + @attribute + def authors(self) -> List[str]: + # 1) nur Meta (kein LD) + res = generic_author_parsing(self.precomputed.meta.get("author")) + if res: + return res + + # 2) DOM-Fallbacks (WP-typisch) + nodes = self.precomputed.doc.xpath( + "//a[@rel='author']/text()" + " | //span[contains(@class,'author')]//a/text()" + " | //div[contains(@class,'author')]//a/text()" + " | //div[contains(@class,'byline')]//a/text()" + " | //span[contains(@class,'byline')]//a/text()" + " | //a[contains(@href,'/autor/') or contains(@href,'/author/')]/text()" + ) + vals = [t.strip() for t in nodes if t and t.strip()] + seen, out = set(), [] + for v in vals: + if v not in seen: + seen.add(v) + out.append(v) + return out + + @attribute + def publishing_date(self) -> Optional[datetime]: + # 1) Meta + for cand in ( + self.precomputed.meta.get("article:published_time"), + self.precomputed.meta.get("og:article:published_time"), + self.precomputed.meta.get("date"), + ): + dt = generic_date_parsing(cand) + if dt: + return dt + + # 2)