diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md index 7952efc96..d17f056b5 100644 --- a/docs/supported_publishers.md +++ b/docs/supported_publishers.md @@ -2202,6 +2202,46 @@ +## LB-Publishers + + + + + + + + + + + + + + + + + + + + + + + + +
Class                                Name                                                                        URL                                                        LanguagesMissing AttributesDeprecated AttributesAdditional Attributes    
+ LBCGroup + +
LBC
+
+ + www.lbcgroup.tv + + + ar + + topics +   
+ + ## LI-Publishers
diff --git a/src/fundus/publishers/__init__.py b/src/fundus/publishers/__init__.py
index 21de84caa..47ed05747 100644
--- a/src/fundus/publishers/__init__.py
+++ b/src/fundus/publishers/__init__.py
@@ -20,6 +20,7 @@
 from fundus.publishers.it import IT
 from fundus.publishers.jp import JP
 from fundus.publishers.kr import KR
+from fundus.publishers.lb import LB
 from fundus.publishers.li import LI
 from fundus.publishers.ls import LS
 from fundus.publishers.lt import LT
@@ -89,6 +90,7 @@ class PublisherCollection(metaclass=PublisherCollectionMeta):
     it = IT
     jp = JP
     kr = KR
+    lb = LB
     li = LI
     ls = LS
     lt = LT
diff --git a/src/fundus/publishers/lb/__init__.py b/src/fundus/publishers/lb/__init__.py
new file mode 100644
index 000000000..6df4a40bf
--- /dev/null
+++ b/src/fundus/publishers/lb/__init__.py
@@ -0,0 +1,21 @@
+from fundus.publishers.base_objects import Publisher, PublisherGroup
+from fundus.publishers.lb.lbc_group import LBCGroupParser
+from fundus.scraping.url import NewsMap, RSSFeed, Sitemap
+
+
+# Publisher group exposed as `lb` on PublisherCollection.
+class LB(metaclass=PublisherGroup):
+    default_language = "ar"
+
+    LBCGroup = Publisher(
+        name="LBC",
+        domain="https://www.lbcgroup.tv",
+        parser=LBCGroupParser,
+        sources=[
+            # NOTE(review): this feed path ends in "/en" although default_language
+            # is "ar" -- confirm the feed serves articles in the declared language.
+            RSSFeed("https://www.lbcgroup.tv/Rss/latest-news/en"),
+            NewsMap("https://www.lbcgroup.tv/newssitemap.xml"),
+            Sitemap("https://www.lbcgroup.tv/sitemap.xml"),
+        ],
+    )
diff --git a/src/fundus/publishers/lb/lbc_group.py b/src/fundus/publishers/lb/lbc_group.py
new file mode 100644
index 000000000..a0eda7b5c
--- /dev/null
+++ b/src/fundus/publishers/lb/lbc_group.py
@@ -0,0 +1,63 @@
+import datetime
+import re
+from typing import List, Optional
+
+from lxml.etree import XPath
+
+from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
+from fundus.parser.utility import (
+    extract_article_body_with_selector,
+    generic_author_parsing,
+    generic_date_parsing,
+    generic_topic_parsing,
+    image_extraction,
+)
+
+
+class LBCGroupParser(ParserProxy):
+    """Parser versions for articles published on www.lbcgroup.tv."""
+
+    class V1(BaseParser):
+        """Extract body, title, authors, publishing date and images of an article."""
+
+        # The content container itself is used as the paragraph node, so its
+        # text is extracted as the article body. It is referenced under this one
+        # name everywhere; `extract_article_body_with_selector` is documented
+        # without a separate content-container argument.
+        _paragraph_selector = XPath("//div[@class='LongDesc']/div[1]/div[1]")
+
+        @attribute
+        def body(self) -> Optional[ArticleBody]:
+            """Return the article body built from the content container."""
+            # No subheadline selector is passed: none is defined for this layout.
+            return extract_article_body_with_selector(
+                self.precomputed.doc,
+                paragraph_selector=self._paragraph_selector,
+            )
+
+        @attribute
+        def title(self) -> Optional[str]:
+            """Return the value of the ``og:title`` meta entry, if present."""
+            return self.precomputed.meta.get("og:title")
+
+        @attribute
+        def authors(self) -> List[str]:
+            """Return the authors parsed from the ``author`` entry found in ``ld``."""
+            return generic_author_parsing(self.precomputed.ld.bf_search("author"))
+
+        @attribute
+        def publishing_date(self) -> Optional[datetime.datetime]:
+            """Return the date parsed from the ``datePublished`` entry found in ``ld``."""
+            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))
+
+        @attribute
+        def images(self) -> List[Image]:
+            """Return the images extracted from the document."""
+            return image_extraction(
+                doc=self.precomputed.doc,
+                paragraph_selector=self._paragraph_selector,
+                author_selector=XPath("./ancestor::figure//footer"),
+                # The group needs a name to be valid regex syntax; `width` is
+                # presumably the name the utility reads -- TODO confirm.
+                size_pattern=re.compile(r"/rs:fill:(?P<width>[0-9]+):"),
+            )