Skip to content

Commit 7b6748e

Browse files
committed
add publisher Pravda
1 parent a8bbdf8 commit 7b6748e

File tree

4 files changed

+101
-0
lines changed

4 files changed

+101
-0
lines changed

docs/supported_publishers.md

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2945,6 +2945,49 @@
29452945
</table>
29462946

29472947

2948+
## UA-Publishers
2949+
2950+
<table class="publishers ua">
2951+
<thead>
2952+
<tr>
2953+
<th>Class&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
2954+
<th>Name&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
2955+
<th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
2956+
<th>Languages</th>
2957+
<th>Missing&#160;Attributes</th>
2958+
<th>Deprecated&#160;Attributes</th>
2959+
<th>Additional&#160;Attributes&#160;&#160;&#160;&#160;</th>
2960+
</tr>
2961+
</thead>
2962+
<tbody>
2963+
<tr>
2964+
<td>
2965+
<code>Pravda</code>
2966+
</td>
2967+
<td>
2968+
<div>Ukrainska Pravda</div>
2969+
</td>
2970+
<td>
2971+
<a href="https://www.pravda.com.ua">
2972+
<span>www.pravda.com.ua</span>
2973+
</a>
2974+
</td>
2975+
<td>
2976+
<code>en</code>
2977+
<code>ru</code>
2978+
<code>uk</code>
2979+
</td>
2980+
<td>
2981+
<code>images</code>
2982+
<code>topics</code>
2983+
</td>
2984+
<td>&#160;</td>
2985+
<td>&#160;</td>
2986+
</tr>
2987+
</tbody>
2988+
</table>
2989+
2990+
29482991
## UK-Publishers
29492992

29502993
<table class="publishers uk">

src/fundus/publishers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
from fundus.publishers.tr import TR
3636
from fundus.publishers.tw import TW
3737
from fundus.publishers.tz import TZ
38+
from fundus.publishers.ua import UA
3839
from fundus.publishers.uk import UK
3940
from fundus.publishers.us import US
4041
from fundus.publishers.za import ZA
@@ -104,6 +105,7 @@ class PublisherCollection(metaclass=PublisherCollectionMeta):
104105
tr = TR
105106
tw = TW
106107
tz = TZ
108+
ua = UA
107109
uk = UK
108110
us = US
109111
za = ZA
src/fundus/publishers/ua/__init__.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from fundus.publishers.base_objects import Publisher, PublisherGroup
2+
from fundus.scraping.filter import inverse, regex_filter
3+
from fundus.scraping.url import NewsMap, Sitemap
4+
5+
from .pravda import PravdaParser
6+
7+
8+
class UA(metaclass=PublisherGroup):
    # Publisher group collecting the Ukrainian (UA) publishers.

    # Default language declared for this group.
    default_language = "uk"

    # Ukrainska Pravda. Both sources are declared for Ukrainian, English and Russian.
    Pravda = Publisher(
        name="Ukrainska Pravda",
        domain="https://www.pravda.com.ua",
        parser=PravdaParser,
        sources=[
            Sitemap("https://www.pravda.com.ua/sitemap/sitemap-archive.xml", languages={"uk", "en", "ru"}),
            NewsMap("https://www.pravda.com.ua/sitemap/sitemap-news.xml", languages={"uk", "en", "ru"}),
        ],
        # The pattern targets URLs on pravda.com.ua that contain a "/news/" segment;
        # "[^e]" rejects hosts where an "e" directly precedes "pravda" (e.g. epravda.com.ua).
        # The dots are escaped so they match a literal "." instead of any character.
        # NOTE(review): presumably inverse(...) flips the regex filter so that only matching
        # URLs are kept -- confirm against fundus.scraping.filter.
        url_filter=inverse(regex_filter(r"[^e]pravda\.com\.ua.*/news/")),
    )

src/fundus/publishers/ua/pravda.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
from datetime import datetime
2+
from typing import List, Optional
3+
4+
from lxml.cssselect import CSSSelector
5+
6+
from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
7+
from fundus.parser.utility import (
8+
extract_article_body_with_selector,
9+
generic_author_parsing,
10+
generic_date_parsing,
11+
image_extraction,
12+
)
13+
14+
15+
class PravdaParser(ParserProxy):
    # Parser proxy for Ukrainska Pravda; V1 is the only parser version defined here.

    class V1(BaseParser):
        # Matches <p> elements that are direct children of <div class="post_news_text">.
        _paragraph_selector = CSSSelector("div.post_news_text > p")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Extract the article body from the precomputed document.

            Paragraphs are located with ``_paragraph_selector``.
            """
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``NewsArticle/headline`` entry looked up in ``self.precomputed.ld``."""
            headline = self.precomputed.ld.xpath_search("NewsArticle/headline", scalar=True)
            return headline

        @attribute
        def authors(self) -> List[str]:
            """Return the authors parsed from the ``ProfilePage/mainEntity/name`` entries."""
            raw_names = self.precomputed.ld.xpath_search("ProfilePage/mainEntity/name")
            return generic_author_parsing(raw_names)

        @attribute
        def publishing_date(self) -> Optional[datetime]:
            """Return the publishing date parsed from ``NewsArticle/datePublished``."""
            raw_date = self.precomputed.ld.xpath_search("NewsArticle/datePublished", scalar=True)
            return generic_date_parsing(raw_date)

0 commit comments

Comments
 (0)