flairNLP · nancyboukamel-ds · Oct 21, 2025 · Oct 21, 2025 · Oct 24, 2025 · addie9800
diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md
@@ -2202,6 +2202,46 @@
 </table>
 
 
+## LB-Publishers
+
+<table class="publishers lb">
+  <thead>
+    <tr>
+      <th>Class&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
+      <th>Name&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
+      <th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
+      <th>Languages</th>
+      <th>Missing&#160;Attributes</th>
+      <th>Deprecated&#160;Attributes</th>
+      <th>Additional&#160;Attributes&#160;&#160;&#160;&#160;</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>
+        <code>LBCGroup</code>
+      </td>
+      <td>
+        <div>LBC</div>
+      </td>
+      <td>
+        <a href="https://www.lbcgroup.tv">
+          <span>www.lbcgroup.tv</span>
+        </a>
+      </td>
+      <td>
+        <code>ar</code>
+      </td>
+      <td>
+        <code>topics</code>
+      </td>
+      <td>&#160;</td>
+      <td>&#160;</td>
+    </tr>
+  </tbody>
+</table>
+
+
 ## LI-Publishers
 
 <table class="publishers li">

diff --git a/src/fundus/publishers/__init__.py b/src/fundus/publishers/__init__.py
@@ -20,6 +20,7 @@
 from fundus.publishers.it import IT
 from fundus.publishers.jp import JP
 from fundus.publishers.kr import KR
+from fundus.publishers.lb import LB
 from fundus.publishers.li import LI
 from fundus.publishers.ls import LS
 from fundus.publishers.lt import LT
@@ -89,6 +90,7 @@ class PublisherCollection(metaclass=PublisherCollectionMeta):
     it = IT
     jp = JP
     kr = KR
+    lb = LB
     li = LI
     ls = LS
     lt = LT

diff --git a/src/fundus/publishers/lb/__init__.py b/src/fundus/publishers/lb/__init__.py
@@ -0,0 +1,21 @@
+from fundus.publishers.base_objects import Publisher, PublisherGroup
+from fundus.publishers.lb.lbc_group import LBCGroupParser
+from fundus.scraping.url import NewsMap, RSSFeed, Sitemap
+
+
+class LB(metaclass=PublisherGroup):
+    # Publishers of the ``lb`` package, exposed as ``PublisherCollection.lb``.
+    default_language = "ar"
+
+    # NOTE(review): the RSS feed path ends in "/en" while the group language is "ar"; confirm that
+    # this is the intended feed.
+    LBCGroup = Publisher(
+        name="LBC",
+        domain="https://www.lbcgroup.tv",
+        parser=LBCGroupParser,
+        sources=[
+            RSSFeed("https://www.lbcgroup.tv/Rss/latest-news/en"),
+            NewsMap("https://www.lbcgroup.tv/newssitemap.xml"),
+            Sitemap("https://www.lbcgroup.tv/sitemap.xml"),
+        ],
+    )
diff --git a/src/fundus/publishers/lb/lbc_group.py b/src/fundus/publishers/lb/lbc_group.py
@@ -0,0 +1,64 @@
+import datetime
+import re
+from typing import List, Optional
+
+from lxml.etree import XPath
+
+from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
+from fundus.parser.utility import (
+    extract_article_body_with_selector,
+    generic_author_parsing,
+    generic_date_parsing,
+    generic_topic_parsing,
+    image_extraction,
+)
+
+
+class LBCGroupParser(ParserProxy):
+    """Parser proxy for the LBC publisher (www.lbcgroup.tv)."""
+
+    class V1(BaseParser):
+        """Parser version V1: extracts body, title, authors, publishing date and images."""
+
+        # Container that holds the article text. The original author's note says the text sits directly
+        # inside this element and that the page has no subheadlines, so the container itself is used as
+        # the single paragraph node. NOTE(review): verify against a live article page.
+        _paragraph_selector = XPath("//div[@class='LongDesc']/div[1]/div[1]")
+
+        @attribute
+        def body(self) -> Optional[ArticleBody]:
+            """Return the article body extracted from the ``LongDesc`` text container."""
+            # Only a paragraph selector is passed: no summary or subheadline selector is defined for
+            # this publisher.
+            return extract_article_body_with_selector(
+                self.precomputed.doc,
+                paragraph_selector=self._paragraph_selector,
+            )
+
+        @attribute
+        def title(self) -> Optional[str]:
+            """Return the ``og:title`` meta value, or ``None`` when the key is absent."""
+            return self.precomputed.meta.get("og:title")
+
+        @attribute
+        def authors(self) -> List[str]:
+            """Return the authors parsed from the ``author`` entry of the precomputed ``ld`` data."""
+            return generic_author_parsing(self.precomputed.ld.bf_search("author"))
+
+        @attribute
+        def publishing_date(self) -> Optional[datetime.datetime]:
+            """Return the date parsed from the ``datePublished`` entry of the precomputed ``ld`` data."""
+            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))
+
+        @attribute
+        def images(self) -> List[Image]:
+            """Return the images extracted from the document, using the body's paragraph selector."""
+            # NOTE(review): the figure/footer author selector and the "/rs:fill:<width>:" size pattern
+            # should be confirmed against the image markup actually served by the site.
+            return image_extraction(
+                doc=self.precomputed.doc,
+                paragraph_selector=self._paragraph_selector,
+                author_selector=XPath("./ancestor::figure//footer"),
+                size_pattern=re.compile(r"/rs:fill:(?P<width>[0-9]+):"),
+            )
+