Skip to content

Commit 65626fb

Browse files
committed
Progress3
1 parent 55797ab commit 65626fb

File tree

4 files changed

+13
-8
lines changed

4 files changed

+13
-8
lines changed

pyproject.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,3 +82,8 @@ filterwarnings = [
8282
"error"
8383
]
8484

85+
[tool.uv.workspace]
86+
members = [
87+
"3-11",
88+
]
89+

src/fundus/publishers/ua/__init__.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from fundus.publishers.base_objects import Publisher, PublisherGroup
2-
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap
2+
from fundus.scraping.filter import inverse, regex_filter
3+
from fundus.scraping.url import NewsMap, Sitemap
34

45
from .pravda import PravdaParser
56

@@ -12,6 +13,8 @@ class UA(metaclass=PublisherGroup):
1213
domain="https://www.pravda.com.ua",
1314
parser=PravdaParser,
1415
sources=[
15-
NewsMap("https://www.pravda.com.ua/sitemap/sitemap-news.xml", languages={"uk", "en", "rus"}),
16+
Sitemap("https://www.pravda.com.ua/sitemap/sitemap-archive.xml", languages={"uk", "en", "ru"}),
17+
NewsMap("https://www.pravda.com.ua/sitemap/sitemap-news.xml", languages={"uk", "en", "ru"}),
1618
],
19+
url_filter=inverse(regex_filter("[^e]pravda.com.ua.*/news/")),
1720
)

src/fundus/publishers/ua/pravda.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
1-
from datetime import date, datetime
1+
from datetime import datetime
22
from typing import List, Optional
33

44
from lxml.cssselect import CSSSelector
5-
from lxml.etree import XPath
65

7-
from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
6+
from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
87
from fundus.parser.utility import (
98
extract_article_body_with_selector,
109
generic_author_parsing,
@@ -16,7 +15,6 @@
1615
class PravdaParser(ParserProxy):
1716
class V1(BaseParser):
1817
_paragraph_selector = CSSSelector("div.post_news_text > p")
19-
# _summary_selector = CSSSelector("/html/head/meta[name='description']")
2018

2119
@attribute
2220
def body(self) -> Optional[ArticleBody]:
@@ -31,8 +29,6 @@ def title(self) -> Optional[str]:
3129

3230
@attribute
3331
def authors(self) -> List[str]:
34-
# print(self.precomputed.ld.__dict__)
35-
print(generic_author_parsing(self.precomputed.ld.xpath_search("ProfilePage/mainEntity/name")))
3632
return generic_author_parsing(self.precomputed.ld.xpath_search("ProfilePage/mainEntity/name"))
3733

3834
@attribute

tests/test_publisher_collection.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ def get_two_letter_code() -> List[str]:
2020

2121

2222
language_codes = get_two_letter_code()
23+
print(language_codes)
2324

2425

2526
class TestPublisherCollection:

0 commit comments

Comments
 (0)