Skip to content

Commit e75f5a2

Browse files
committed
WIP
1 parent 716b827 commit e75f5a2

File tree

3 files changed

+70
-0
lines changed

3 files changed

+70
-0
lines changed

src/fundus/publishers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
from fundus.publishers.tr import TR
3636
from fundus.publishers.tw import TW
3737
from fundus.publishers.tz import TZ
38+
from fundus.publishers.ua import UA
3839
from fundus.publishers.uk import UK
3940
from fundus.publishers.us import US
4041
from fundus.publishers.za import ZA
@@ -104,6 +105,7 @@ class PublisherCollection(metaclass=PublisherCollectionMeta):
104105
tr = TR
105106
tw = TW
106107
tz = TZ
108+
ua = UA
107109
uk = UK
108110
us = US
109111
za = ZA
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap
2+
from fundus.publishers.base_objects import Publisher, PublisherGroup
3+
4+
from .pravda import PravdaParser
5+
6+
class UA(metaclass=PublisherGroup):
    """Publisher group collecting the publishers registered under the ``ua`` key."""

    # Language assumed for publishers in this group unless a source says otherwise.
    default_language = "uk"

    Pravda = Publisher(
        name="Ukrainska Pravda",
        domain="https://www.pravda.com.ua",
        parser=PravdaParser,
        sources=[
            NewsMap(
                "https://www.pravda.com.ua/sitemap/sitemap-news.xml",
                # Two-letter codes throughout, consistent with `default_language`;
                # the former "rus" was a three-letter code and would not match a
                # two-letter language filter.
                # NOTE(review): confirm the news sitemap really lists all three editions.
                languages={"uk", "en", "ru"},
            ),
        ],
    )

src/fundus/publishers/ua/pravda.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
from datetime import date, datetime
2+
from typing import List, Optional
3+
4+
from lxml.cssselect import CSSSelector
5+
from lxml.etree import XPath
6+
7+
from fundus.parser import ParserProxy, BaseParser, attribute
8+
#from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
9+
from fundus.parser.utility import (
10+
extract_article_body_with_selector,
11+
generic_author_parsing,
12+
generic_date_parsing,
13+
image_extraction,
14+
)
15+
16+
17+
18+
class PravdaParser(ParserProxy):
    """Parser proxy for Ukrainska Pravda (pravda.com.ua) articles."""

    class V1(BaseParser):
        """Initial parser version.

        Extracts title, authors and publishing date from the page's linked
        data (``NewsArticle`` entries). Body extraction is not implemented yet.
        """

        # TODO: wire up article body extraction. The draft below is kept for
        # reference only; NOTE(review): the selectors look like placeholders
        # and still need to be verified against the actual page markup.
        #
        # _summary_selector = XPath(
        #     "//p[@class='post__excerpt'] | //h2[preceding-sibling::h1[contains(@class, 'post__title')]]"
        # )
        # _paragraph_selector = CSSSelector("div.entry-content > div.entry-content__content > p, blockquote > p")
        # _subheadline_selector = CSSSelector("div.entry-content > div.entry-content__content > h2")
        #
        # @attribute
        # def body(self) -> Optional[ArticleBody]:
        #     return extract_article_body_with_selector(
        #         self.precomputed.doc,
        #         summary_selector=self._summary_selector,
        #         subheadline_selector=self._subheadline_selector,
        #         paragraph_selector=self._paragraph_selector,
        #     )

        @attribute
        def title(self) -> Optional[str]:
            """Return the first ``NewsArticle/headline`` hit, or ``None`` if there is none."""
            headlines = self.precomputed.ld.xpath_search("NewsArticle/headline")
            # Guard against pages without a headline instead of raising IndexError.
            return headlines[0] if headlines else None

        @attribute
        def authors(self) -> List[str]:
            """Return the parsed author names, skipping the first linked-data hit."""
            names = self.precomputed.ld.xpath_search("NewsArticle/author/name")
            # The first hit is the name of the news source itself
            return generic_author_parsing(names[1:])

        @attribute
        def publishing_date(self) -> Optional[datetime]:
            """Return the parsed publishing date, or ``None`` if it is not available."""
            dates = self.precomputed.ld.xpath_search("NewsArticle/datePublished")
            # The second hit is used, as in the original implementation.
            # NOTE(review): confirm why the first `datePublished` hit is skipped.
            if len(dates) < 2:
                return None
            return generic_date_parsing(dates[1])
49+
50+
51+
52+

0 commit comments

Comments
 (0)