From a6ef81dad526345e25777e37668aefb5ff574911 Mon Sep 17 00:00:00 2001
From: baurlaur <160800935+baurlaur@users.noreply.github.com>
Date: Thu, 23 Oct 2025 15:13:13 +0200
Subject: [PATCH] add klassegegenklasse (DE) publisher + parser + tests +
tables
---
docs/supported_publishers.md | 22 +++
src/fundus/publishers/de/__init__.py | 14 ++
src/fundus/publishers/de/klassegegenklasse.py | 151 ++++++++++++++++++
.../test_data/de/KlasseGegenKlasse.json | 31 ++++
.../de/KlasseGegenKlasse_2025_10_23.html.gz | Bin 0 -> 6792 bytes
tests/resources/parser/test_data/de/meta.info | 4 +
6 files changed, 222 insertions(+)
create mode 100644 src/fundus/publishers/de/klassegegenklasse.py
create mode 100644 tests/resources/parser/test_data/de/KlasseGegenKlasse.json
create mode 100644 tests/resources/parser/test_data/de/KlasseGegenKlasse_2025_10_23.html.gz
diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md
index 7952efc96..2f011e1ae 100644
--- a/docs/supported_publishers.md
+++ b/docs/supported_publishers.md
@@ -937,6 +937,28 @@
Krautreporter
diff --git a/src/fundus/publishers/de/__init__.py b/src/fundus/publishers/de/__init__.py
index e67dbe58f..48b1bdbca 100644
--- a/src/fundus/publishers/de/__init__.py
+++ b/src/fundus/publishers/de/__init__.py
@@ -26,6 +26,7 @@
from .hessenschau import HessenschauParser
from .junge_welt import JungeWeltParser
from .kicker import KickerParser
+from .klassegegenklasse import KlasseGegenKlasseParser
from .krautreporter import KrautreporterParser
from .mdr import MDRParser
from .merkur import MerkurParser
@@ -595,3 +596,16 @@ class DE(metaclass=PublisherGroup):
Sitemap("https://www.gamestar.de/artikel_archiv_index.xml"),
],
)
+
+ KlasseGegenKlasse = Publisher(  # publisher entry for klassegegenklasse.org, parsed by KlasseGegenKlasseParser
+ name="Klasse Gegen Klasse",
+ domain="https://www.klassegegenklasse.org/",
+ parser=KlasseGegenKlasseParser,
+ sources=[  # two URL sources: the RSS feed and the XML sitemap
+ RSSFeed("https://www.klassegegenklasse.org/feed/"),
+ Sitemap(
+ "https://www.klassegegenklasse.org/wp-sitemap.xml",  # presumably the WordPress core sitemap index — confirm
+ ),
+ ],
+ request_header={"user-agent": "Fundus"},  # NOTE(review): custom user agent — presumably the site rejects the default one; confirm
+ )
diff --git a/src/fundus/publishers/de/klassegegenklasse.py b/src/fundus/publishers/de/klassegegenklasse.py
new file mode 100644
index 000000000..c742a0300
--- /dev/null
+++ b/src/fundus/publishers/de/klassegegenklasse.py
@@ -0,0 +1,151 @@
+import re
+from datetime import datetime
+from typing import List, Optional
+
+from lxml.cssselect import CSSSelector
+from lxml.etree import XPath
+
+from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
+from fundus.parser.utility import (
+ extract_article_body_with_selector,
+ generic_author_parsing,
+ generic_date_parsing,
+ generic_topic_parsing,
+ image_extraction,
+)
+
+
+class KlasseGegenKlasseParser(ParserProxy):
+ class V1(BaseParser):
+ _paragraph_selector = CSSSelector("article p, main article p, .post-content p, .entry-content p, .content p")  # NOTE(review): "main article p" is already covered by "article p"
+ _summary_selector = CSSSelector(  # <p> that is the first child of the article / content container
+ "article .entry-content > p:first-child, article > p:first-child, .post-content > p:first-child"
+ )  # NOTE(review): these first-child <p> elements also match _paragraph_selector — confirm the summary is not emitted twice in the body
+ _subheadline_selector = CSSSelector("article h2, .entry-content h2, .post-content h2")  # <h2> inside the article / content containers
+
+ @attribute
+ def body(self) -> Optional[ArticleBody]:  # article body built from the three class-level CSS selectors
+ return extract_article_body_with_selector(
+ self.precomputed.doc,  # document tree held in self.precomputed
+ summary_selector=self._summary_selector,
+ subheadline_selector=self._subheadline_selector,
+ paragraph_selector=self._paragraph_selector,
+ )
+
+ @attribute
+ def authors(self) -> List[str]:  # author names: "author" meta entry first, link texts from the DOM as fallback
+ # 1) Meta entry only (no LD lookup)
+ res = generic_author_parsing(self.precomputed.meta.get("author"))
+ if res:
+ return res
+
+ # 2) DOM fallbacks (typical WordPress markup)
+ nodes = self.precomputed.doc.xpath(
+ "//a[@rel='author']/text()"
+ " | //span[contains(@class,'author')]//a/text()"
+ " | //div[contains(@class,'author')]//a/text()"
+ " | //div[contains(@class,'byline')]//a/text()"
+ " | //span[contains(@class,'byline')]//a/text()"
+ " | //a[contains(@href,'/autor/') or contains(@href,'/author/')]/text()"
+ )  # union of the text nodes of every link that looks like an author / byline link
+ vals = [t.strip() for t in nodes if t and t.strip()]  # trim and drop empty / whitespace-only text nodes
+ seen, out = set(), []  # de-duplicate while keeping first-seen order
+ for v in vals:
+ if v not in seen:
+ seen.add(v)
+ out.append(v)
+ return out  # empty list when neither the meta entry nor the DOM yields a name
+
+ @attribute
+ def publishing_date(self) -> Optional[datetime]:
+ # 1) Meta
+ for cand in (
+ self.precomputed.meta.get("article:published_time"),
+ self.precomputed.meta.get("og:article:published_time"),
+ self.precomputed.meta.get("date"),
+ ):
+ dt = generic_date_parsing(cand)
+ if dt:
+ return dt
+
+ # 2) |