From a18d660c5c6f1c6504b371c072ceee1b4ae82d09 Mon Sep 17 00:00:00 2001 From: Lily Date: Wed, 22 Oct 2025 19:00:06 +0200 Subject: [PATCH 1/2] Add T-Online body extraction --- src/fundus/publishers/de/__init__.py | 11 +++++++- src/fundus/publishers/de/t_online.py | 39 ++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 src/fundus/publishers/de/t_online.py diff --git a/src/fundus/publishers/de/__init__.py b/src/fundus/publishers/de/__init__.py index e67dbe58f..085e9e54c 100644 --- a/src/fundus/publishers/de/__init__.py +++ b/src/fundus/publishers/de/__init__.py @@ -49,7 +49,7 @@ from .wdr import WDRParser from .winfuture import WinfutureParser from .zdf import ZDFParser - +from .t_online import TOnlineParser # noinspection PyPep8Naming class DE(metaclass=PublisherGroup): @@ -595,3 +595,12 @@ class DE(metaclass=PublisherGroup): Sitemap("https://www.gamestar.de/artikel_archiv_index.xml"), ], ) + + TOnline = Publisher( + name="T-Online", + domain="https://www.t-online.de/", + parser=TOnlineParser, + sources=[ + Sitemap("https://www.t-online.de/sitemap.xml"), + ], + ) diff --git a/src/fundus/publishers/de/t_online.py b/src/fundus/publishers/de/t_online.py new file mode 100644 index 000000000..489509c9b --- /dev/null +++ b/src/fundus/publishers/de/t_online.py @@ -0,0 +1,39 @@ +import datetime +from typing import List, Optional + +from lxml.cssselect import CSSSelector + +from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute +from fundus.parser.utility import ( + extract_article_body_with_selector, + generic_author_parsing, + generic_date_parsing, +) + + +class TOnlineParser(ParserProxy): + class V1(BaseParser): + _paragraph_selector = CSSSelector("div[class*='px-24'] > p.text-18") + _summary_selector = CSSSelector("p.font-bold.text-18") + _subheadline_selector = CSSSelector("h3, h2") + + @attribute + def body(self) -> Optional[ArticleBody]: + return extract_article_body_with_selector( + self.precomputed.doc, + summary_selector=self._summary_selector, + subheadline_selector=self._subheadline_selector, + paragraph_selector=self._paragraph_selector, + ) + + @attribute + def publishing_date(self) -> Optional[datetime.datetime]: + return generic_date_parsing(self.precomputed.ld.bf_search("datePublished")) + + @attribute + def authors(self) -> List[str]: + return generic_author_parsing(self.precomputed.ld.bf_search("author")) + + @attribute + def title(self) -> Optional[str]: + return self.precomputed.ld.bf_search("headline") \ No newline at end of file From b64ca592b68f5b9fcfc513bfaf5e68e750806e8e Mon Sep 17 00:00:00 2001 From: freylily <59416754+freylily@users.noreply.github.com> Date: Wed, 22 Oct 2025 17:03:15 +0000 Subject: [PATCH 2/2] Update documentation from @ 716b827c912c47d2d1b17c564d832e9f768badef --- docs/supported_publishers.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md index 7952efc96..3b7f1f8f3 100644 --- a/docs/supported_publishers.md +++ b/docs/supported_publishers.md @@ -1207,6 +1207,28 @@     + + + TOnline + + +
T-Online
+ + + + www.t-online.de + + + + de + + + images + topics + +   +   + Tagesschau