WIP

bucheben · bucheben · commit e75f5a2dbe1b · 2025-10-23T13:49:58.000+02:00
diff --git a/src/fundus/publishers/__init__.py b/src/fundus/publishers/__init__.py
@@ -35,6 +35,7 @@
 from fundus.publishers.tr import TR
 from fundus.publishers.tw import TW
 from fundus.publishers.tz import TZ
+from fundus.publishers.ua import UA
 from fundus.publishers.uk import UK
 from fundus.publishers.us import US
 from fundus.publishers.za import ZA
@@ -104,6 +105,7 @@ class PublisherCollection(metaclass=PublisherCollectionMeta):
     tr = TR
     tw = TW
     tz = TZ
+    ua = UA
     uk = UK
     us = US
     za = ZA
diff --git a/src/fundus/publishers/ua/__init__.py b/src/fundus/publishers/ua/__init__.py
@@ -0,0 +1,16 @@
+from fundus.scraping.url import NewsMap, RSSFeed, Sitemap
+from fundus.publishers.base_objects import Publisher, PublisherGroup
+
+from .pravda import PravdaParser
+
+class UA(metaclass=PublisherGroup):
+    default_language = "uk"
+
+    Pravda = Publisher(
+        name="Ukrainska Pravda",
+        domain="https://www.pravda.com.ua",
+        parser=PravdaParser,
+        sources = [
+            NewsMap("https://www.pravda.com.ua/sitemap/sitemap-news.xml", languages={  "uk", "en", "rus" }),
+        ]
+    )
diff --git a/src/fundus/publishers/ua/pravda.py b/src/fundus/publishers/ua/pravda.py
@@ -0,0 +1,52 @@
+from datetime import date, datetime
+from typing import List, Optional
+
+from lxml.cssselect import CSSSelector
+from lxml.etree import XPath
+
+from fundus.parser import ParserProxy, BaseParser, attribute
+#from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
+from fundus.parser.utility import (
+    extract_article_body_with_selector,
+    generic_author_parsing,
+    generic_date_parsing,
+    image_extraction,
+)
+
+
+
+class PravdaParser(ParserProxy):
+    class V1(BaseParser):
+        #_summary_selector = XPath(
+            
+            #"//p[@class='post__excerpt'] | //h2[preceding-sibling::h1[contains(@class, 'post__title')]]"
+        #)
+        #_paragraph_selector = CSSSelector("div.entry-content > div.entry-content__content > p, blockquote > p")
+        #_subheadline_selector = CSSSelector("div.entry-content > div.entry-content__content > h2")
+
+        #@attribute
+        #def body(self) -> Optional[ArticleBody]:
+            #return extract_article_body_with_selector(
+                #self.precomputed.doc,
+                #summary_selector=self._summary_selector,
+                #subheadline_selector=self._subheadline_selector,
+                #paragraph_selector=self._paragraph_selector,
+            #)
+
+        @attribute
+        def title(self) -> Optional[str]:
+            return self.precomputed.ld.xpath_search("NewsArticle/headline")[0]
+
+        @attribute
+        def authors(self) -> List[str]:
+            # The first hit is the name of the news source itself
+            print(generic_author_parsing(self.precomputed.ld.xpath_search('NewsArticle/author/name')[1:]))
+            return generic_author_parsing(self.precomputed.ld.xpath_search('NewsArticle/author/name')[1:])
+
+        @attribute
+        def publishing_date(self) -> Optional[datetime]:
+            return generic_date_parsing(self.precomputed.ld.xpath_search("NewsArticle/datePublished")[1])
+
+        
+
+