Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -3610,6 +3610,46 @@
</table>


## VN-Publishers

<table class="publishers vn">
<thead>
<tr>
<th>Class&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>Name&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>Languages</th>
<th>Missing&#160;Attributes</th>
<th>Deprecated&#160;Attributes</th>
<th>Additional&#160;Attributes&#160;&#160;&#160;&#160;</th>
</tr>
</thead>
<tbody>
<tr>
<td>
<code>VnExpress</code>
</td>
<td>
<div>VnExpress</div>
</td>
<td>
<a href="https://vnexpress.net/">
<span>vnexpress.net</span>
</a>
</td>
<td>
<code>vi</code>
</td>
<td>
<code>images</code>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
</tbody>
</table>


## ZA-Publishers

<table class="publishers za">
Expand Down
12 changes: 4 additions & 8 deletions src/fundus/parser/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,12 +140,10 @@ def to_unicode_characters(text: str) -> str:
return self.__xml

@overload
def xpath_search(self, query: Union[XPath, str], scalar: Literal[False] = False) -> List[Any]:
...
def xpath_search(self, query: Union[XPath, str], scalar: Literal[False] = False) -> List[Any]: ...

@overload
def xpath_search(self, query: Union[XPath, str], scalar: Literal[True] = True) -> Optional[Any]:
...
def xpath_search(self, query: Union[XPath, str], scalar: Literal[True] = True) -> Optional[Any]: ...

def xpath_search(self, query: Union[XPath, str], scalar: bool = False):
"""Search through LD using XPath expressions
Expand Down Expand Up @@ -300,12 +298,10 @@ def __init__(self, texts: Iterable[str]):
self._data: Tuple[str, ...] = tuple(texts)

@overload
def __getitem__(self, i: int) -> str:
...
def __getitem__(self, i: int) -> str: ...

@overload
def __getitem__(self, s: slice) -> "TextSequence":
...
def __getitem__(self, s: slice) -> "TextSequence": ...

def __getitem__(self, i):
return self._data[i] if isinstance(i, int) else type(self)(self._data[i])
Expand Down
2 changes: 2 additions & 0 deletions src/fundus/publishers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from fundus.publishers.tz import TZ
from fundus.publishers.uk import UK
from fundus.publishers.us import US
from fundus.publishers.vn import VN
from fundus.publishers.za import ZA

__all__ = ["Publisher", "PublisherGroup"]
Expand Down Expand Up @@ -107,3 +108,4 @@ class PublisherCollection(metaclass=PublisherCollectionMeta):
uk = UK
us = US
za = ZA
vn = VN
4 changes: 3 additions & 1 deletion src/fundus/publishers/de/mdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ class MDRParser(ParserProxy):
class V1(BaseParser):
_author_substitution_pattern: Pattern[str] = re.compile(r"MDR \w*$|MDR \w*-\w*$|MDRfragt-Redaktionsteam|^von")
# regex examples: https://regex101.com/r/2DSjAz/1
_source_detection: str = r"^((MDR (AKTUELL ){0,1}\(([A-z]{2,3}(\/[A-z]{2,3})*|[A-z, ]{2,50}))\)|(Quell(e|en): (u.a. ){0,1}[A-z,]{3,4})|[A-z]{2,4}(, [A-z]{2,4}){0,3}( \([A-z]{2,4}\)){0,1}$|[A-z]{2,4}\/[A-z(), \/]{3,10}$)"
_source_detection: str = (
r"^((MDR (AKTUELL ){0,1}\(([A-z]{2,3}(\/[A-z]{2,3})*|[A-z, ]{2,50}))\)|(Quell(e|en): (u.a. ){0,1}[A-z,]{3,4})|[A-z]{2,4}(, [A-z]{2,4}){0,3}( \([A-z]{2,4}\)){0,1}$|[A-z]{2,4}\/[A-z(), \/]{3,10}$)"
)
_paragraph_selector = XPath(
f"//div[contains(@class, 'paragraph')]"
f"/p[not(re:test(em, '{_source_detection}') or re:test(text(), '{_source_detection}'))]",
Expand Down
4 changes: 3 additions & 1 deletion src/fundus/publishers/jp/sankei_shimbun.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@ def title(self) -> Optional[str]:
@attribute
def authors(self) -> List[str]:
return [
author for author in generic_author_parsing(self.precomputed.meta.get("author")) if "産経新聞" not in author
author
for author in generic_author_parsing(self.precomputed.meta.get("author"))
if "産経新聞" not in author
]

@attribute
Expand Down
19 changes: 19 additions & 0 deletions src/fundus/publishers/vn/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.vn.vnexpress import VnExpressIntlParser
from fundus.scraping.filter import inverse, regex_filter
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap

class VN(metaclass=PublisherGroup):
    # Publisher group for Vietnam (ISO 3166-1 alpha-2: VN).
    default_language = "vi"

    VnExpress = Publisher(
        name="VnExpress",
        domain="https://vnexpress.net/",
        parser=VnExpressIntlParser,
        sources=[
            # NOTE(review): the sitemap endpoints (sitemap.xml and
            # google-news-sitemap.xml) redirect to the home page, so they
            # are not usable as sources. Instead we rely on the category
            # RSS feeds listed at https://vnexpress.net/rss.
            RSSFeed("https://vnexpress.net/rss/tin-moi-nhat.rss"),
            RSSFeed("https://vnexpress.net/rss/thoi-su.rss"),
            RSSFeed("https://vnexpress.net/rss/the-gioi.rss"),
            RSSFeed("https://vnexpress.net/rss/kinh-doanh.rss"),
            RSSFeed("https://vnexpress.net/rss/giai-tri.rss"),
            RSSFeed("https://vnexpress.net/rss/the-thao.rss"),
            RSSFeed("https://vnexpress.net/rss/phap-luat.rss"),
            RSSFeed("https://vnexpress.net/rss/giao-duc.rss"),
            RSSFeed("https://vnexpress.net/rss/suc-khoe.rss"),
            RSSFeed("https://vnexpress.net/rss/khoa-hoc.rss"),
        ],
        suppress_robots=True,
    )
108 changes: 108 additions & 0 deletions src/fundus/publishers/vn/vnexpress.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
from datetime import datetime
from typing import Any, List, Optional

from lxml.cssselect import CSSSelector
from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
from fundus.parser.utility import (
    extract_article_body_with_selector,
    generic_author_parsing,
    generic_date_parsing,
    generic_topic_parsing,
)


class VnExpressIntlParser(ParserProxy):
    """Parser for articles published on vnexpress.net (Vietnamese edition)."""

    # TODO(review): the `images` attribute is still missing from this parser;
    # add it via the generic image-extraction utility once selectors are known.
    class V1(BaseParser):
        # Lead paragraph rendered above the article body.
        _summary_selector = CSSSelector("p.description")
        # Body paragraphs. NOTE(review): the last <p> often carries the author
        # signature (e.g. "Lê Phương") — confirm whether it should be excluded
        # here and used for author extraction instead.
        _paragraph_selector = CSSSelector("article.fck_detail > p")
        _subheadline_selector = CSSSelector("article.fck_detail > h2")

        # On-page fallbacks for title and authors.
        _title_selector = CSSSelector("h1.title-detail")
        _author_selector = CSSSelector("p.author_mail strong")

        # Boilerplate topics that carry no information about the article,
        # e.g. "Tin nóng" (= hot news). Compared case-insensitively.
        _topic_bloat = {"tin nóng"}

        @attribute
        def title(self) -> Optional[str]:
            # Prefer the structured JSON-LD headline. `scalar=True` returns a
            # single value (or None) instead of a result list.
            ld_title = self.precomputed.ld.xpath_search("//NewsArticle/headline", scalar=True)
            if isinstance(ld_title, str):
                return ld_title

            # Fall back to the OpenGraph title, then the on-page headline.
            meta_title = self.precomputed.meta.get("og:title")
            if isinstance(meta_title, str):
                return meta_title

            if title_nodes := self._title_selector(self.precomputed.doc):
                return title_nodes[0].text_content().strip() or None

            return None

        @attribute
        def authors(self) -> List[str]:
            # generic_author_parsing is designed to handle strings, dicts,
            # lists and None, so the raw JSON-LD value is passed in directly.
            if authors := generic_author_parsing(
                self.precomputed.ld.xpath_search("//NewsArticle/author", scalar=True)
            ):
                return authors

            # Fallback: author signature block at the bottom of the article.
            return [
                text
                for node in self._author_selector(self.precomputed.doc)
                if (text := node.text_content().strip())
            ]

        @attribute
        def publishing_date(self) -> Optional[datetime]:
            # Prefer the JSON-LD publication date (scalar, not a list).
            ld_date = self.precomputed.ld.xpath_search("//NewsArticle/datePublished", scalar=True)
            if isinstance(ld_date, str):
                return generic_date_parsing(ld_date)

            # Fall back to the OpenGraph article metadata.
            meta_date = self.precomputed.meta.get("article:published_time")
            if isinstance(meta_date, str):
                return generic_date_parsing(meta_date)

            return None

        @attribute
        def body(self) -> Optional[ArticleBody]:
            return extract_article_body_with_selector(
                self.precomputed.doc,
                summary_selector=self._summary_selector,
                paragraph_selector=self._paragraph_selector,
                subheadline_selector=self._subheadline_selector,
            )

        @attribute
        def topics(self) -> List[str]:
            # generic_topic_parsing splits comma-separated keyword strings and
            # flattens nested lists, replacing the previous custom helpers
            # (_parse_ld_keywords / _parse_meta_topics).
            topics = generic_topic_parsing(self.precomputed.meta.get("keywords"))
            # Drop boilerplate topics such as "Tin nóng".
            return [topic for topic in topics if topic.casefold() not in self._topic_bloat]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are also some bloat topics, like Tin nóng (= hot news), which should be removed.

3 changes: 1 addition & 2 deletions src/fundus/scraping/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,7 @@ def url_filter(url: str) -> bool:


class SupportsBool(Protocol):
def __bool__(self) -> bool:
...
def __bool__(self) -> bool: ...
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You probably have a different black version installed. This PR should normally not edit these files.



class ExtractionFilter(Protocol):
Expand Down
3 changes: 1 addition & 2 deletions src/fundus/scraping/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,7 @@ class WebSourceInfo(SourceInfo):


class HTMLSource(Protocol):
def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]:
...
def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]: ...


class WebSource:
Expand Down
12 changes: 4 additions & 8 deletions src/fundus/utils/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,21 @@


@overload
def _get_match_dict(pattern: Pattern[str], string: str, conversion: Callable[[str], _T]) -> Dict[str, _T]:
...
def _get_match_dict(pattern: Pattern[str], string: str, conversion: Callable[[str], _T]) -> Dict[str, _T]: ...


@overload
def _get_match_dict(
pattern: Pattern[str], string: str, conversion: Callable[[str], _T], keep_none: Literal[True]
) -> Dict[str, Optional[_T]]:
...
) -> Dict[str, Optional[_T]]: ...


@overload
def _get_match_dict(pattern: Pattern[str], string: str) -> Dict[str, str]:
...
def _get_match_dict(pattern: Pattern[str], string: str) -> Dict[str, str]: ...


@overload
def _get_match_dict(pattern: Pattern[str], string: str, keep_none: Literal[True]) -> Dict[str, Optional[str]]:
...
def _get_match_dict(pattern: Pattern[str], string: str, keep_none: Literal[True]) -> Dict[str, Optional[str]]: ...


def _get_match_dict( # type: ignore[misc]
Expand Down
32 changes: 32 additions & 0 deletions tests/resources/parser/test_data/vn/VnExpress.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
"V1": {
"authors": [
"VnExpress"
],
"body": {
"summary": [
"Người đàn ông 63 tuổi đau bụng kéo dài, bác sĩ chẩn đoán ung thư lymphoma ruột non - căn bệnh ít gặp trong hệ tiêu hóa."
],
"sections": [
{
"headline": [],
"paragraphs": [
"Ngày 22/10, BS.CK2 Cao Thị Hồng, Trưởng Trung tâm Kiểm tra sức khỏe Chợ Rẫy Việt Nhật (HECI), Bệnh viện Chợ Rẫy, cho biết bệnh nhân đau bụng âm ỉ, ăn uống khó tiêu, đầy hơi và tiêu chảy kéo dài nhiều tuần, sụt 2-3 kg. Ông nghĩ bị rối loạn tiêu hóa thông thường, song uống thuốc không khỏi.",
"Bác sĩ đánh giá bệnh nhân nhiều dấu hiệu bất thường nên nội soi dạ dày và đại tràng lấy mẫu sinh thiết phát hiện tế bào lympho không điển hình, nghi ngờ ác tính. Xét nghiệm hóa mô miễn dịch xác định bệnh nhân mắc ung thư lymphoma không Hodgkin ruột non. Bác sĩ nhiều chuyên khoa tiêu hóa, huyết học, giải phẫu bệnh cùng hội chẩn, xây dựng phác đồ điều trị cho bệnh nhân.",
"Theo bác sĩ Hồng, lymphoma đường ruột là u lympho ác tính ngoài hạch phát triển từ mô lympho trong thành ruột, chiếm khoảng 15-25% ca ung thư ruột non, gần 5% ca ung thư dạ dày và 0,2-1% ung thư đại trực tràng. Hầu hết lymphoma đường tiêu hóa thuộc nhóm không Hodgkin, với tỷ lệ mắc mới toàn cầu khoảng 5,6 ca trên 100.000 dân mỗi năm.",
"Bệnh xuất hiện chủ yếu ở người trên 60 tuổi, đặc biệt là nam giới. Lymphoma đường ruột thường có các triệu chứng như đau bụng, sụt cân, buồn nôn, tiêu chảy hoặc tiêu phân có máu...",
"Nội soi và tầm soát sức khỏe định kỳ giúp phát hiện sớm các bệnh lý ác tính đường tiêu hóa. Hướng điều trị thường phụ thuộc vào loại cũng như giai đoạn bệnh, có thể phẫu thuật, hóa trị hoặc xạ trị.",
"Lê Phương"
]
}
]
},
"publishing_date": "2025-10-22 19:25:52+07:00",
"title": "Đau bụng kéo dài, phát hiện ung thư lymphoma ruột non",
"topics": [
"ung thư ruột non",
"đau bụng",
"rối loạn tiêu hóa"
]
}
}
Binary file not shown.
6 changes: 6 additions & 0 deletions tests/resources/parser/test_data/vn/meta.info
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"VnExpress_2025_10_22.html.gz": {
"url": "https://vnexpress.net/dau-bung-keo-dai-phat-hien-ung-thu-lymphoma-ruot-non-4954454.html",
"crawl_date": "2025-10-22 14:57:05.843578"
}
}
Loading