Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -3610,6 +3610,46 @@
</table>


## VN-Publishers

<table class="publishers vn">
<thead>
<tr>
<th>Class&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>Name&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>URL&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;</th>
<th>Languages</th>
<th>Missing&#160;Attributes</th>
<th>Deprecated&#160;Attributes</th>
<th>Additional&#160;Attributes&#160;&#160;&#160;&#160;</th>
</tr>
</thead>
<tbody>
<tr>
<td>
<code>VnExpress</code>
</td>
<td>
<div>VnExpress</div>
</td>
<td>
<a href="https://vnexpress.net/">
<span>vnexpress.net</span>
</a>
</td>
<td>
<code>vi</code>
</td>
<td>
<code>images</code>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
</tbody>
</table>


## ZA-Publishers

<table class="publishers za">
Expand Down
12 changes: 4 additions & 8 deletions src/fundus/parser/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,12 +140,10 @@ def to_unicode_characters(text: str) -> str:
return self.__xml

@overload
def xpath_search(self, query: Union[XPath, str], scalar: Literal[False] = False) -> List[Any]:
...
def xpath_search(self, query: Union[XPath, str], scalar: Literal[False] = False) -> List[Any]: ...

@overload
def xpath_search(self, query: Union[XPath, str], scalar: Literal[True] = True) -> Optional[Any]:
...
def xpath_search(self, query: Union[XPath, str], scalar: Literal[True] = True) -> Optional[Any]: ...

def xpath_search(self, query: Union[XPath, str], scalar: bool = False):
"""Search through LD using XPath expressions
Expand Down Expand Up @@ -300,12 +298,10 @@ def __init__(self, texts: Iterable[str]):
self._data: Tuple[str, ...] = tuple(texts)

@overload
def __getitem__(self, i: int) -> str:
...
def __getitem__(self, i: int) -> str: ...

@overload
def __getitem__(self, s: slice) -> "TextSequence":
...
def __getitem__(self, s: slice) -> "TextSequence": ...

def __getitem__(self, i):
return self._data[i] if isinstance(i, int) else type(self)(self._data[i])
Expand Down
2 changes: 2 additions & 0 deletions src/fundus/publishers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from fundus.publishers.tz import TZ
from fundus.publishers.uk import UK
from fundus.publishers.us import US
from fundus.publishers.vn import VN
from fundus.publishers.za import ZA

__all__ = ["Publisher", "PublisherGroup"]
Expand Down Expand Up @@ -107,3 +108,4 @@ class PublisherCollection(metaclass=PublisherCollectionMeta):
uk = UK
us = US
za = ZA
vn = VN
4 changes: 3 additions & 1 deletion src/fundus/publishers/de/mdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ class MDRParser(ParserProxy):
class V1(BaseParser):
_author_substitution_pattern: Pattern[str] = re.compile(r"MDR \w*$|MDR \w*-\w*$|MDRfragt-Redaktionsteam|^von")
# regex examples: https://regex101.com/r/2DSjAz/1
_source_detection: str = r"^((MDR (AKTUELL ){0,1}\(([A-z]{2,3}(\/[A-z]{2,3})*|[A-z, ]{2,50}))\)|(Quell(e|en): (u.a. ){0,1}[A-z,]{3,4})|[A-z]{2,4}(, [A-z]{2,4}){0,3}( \([A-z]{2,4}\)){0,1}$|[A-z]{2,4}\/[A-z(), \/]{3,10}$)"
_source_detection: str = (
r"^((MDR (AKTUELL ){0,1}\(([A-z]{2,3}(\/[A-z]{2,3})*|[A-z, ]{2,50}))\)|(Quell(e|en): (u.a. ){0,1}[A-z,]{3,4})|[A-z]{2,4}(, [A-z]{2,4}){0,3}( \([A-z]{2,4}\)){0,1}$|[A-z]{2,4}\/[A-z(), \/]{3,10}$)"
)
_paragraph_selector = XPath(
f"//div[contains(@class, 'paragraph')]"
f"/p[not(re:test(em, '{_source_detection}') or re:test(text(), '{_source_detection}'))]",
Expand Down
4 changes: 3 additions & 1 deletion src/fundus/publishers/jp/sankei_shimbun.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,9 @@ def title(self) -> Optional[str]:
@attribute
def authors(self) -> List[str]:
return [
author for author in generic_author_parsing(self.precomputed.meta.get("author")) if "産経新聞" not in author
author
for author in generic_author_parsing(self.precomputed.meta.get("author"))
if "産経新聞" not in author
]

@attribute
Expand Down
19 changes: 19 additions & 0 deletions src/fundus/publishers/vn/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.vn.vnexpress import VnExpressIntlParser
from fundus.scraping.filter import inverse, regex_filter
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap

class VN(metaclass=PublisherGroup):
    # Publisher group for Vietnam (ISO 3166-1 alpha-2: VN).
    default_language = "vi"

    VnExpress = Publisher(
        name="VnExpress",
        domain="https://vnexpress.net/",
        parser=VnExpressIntlParser,
        sources=[
            # NOTE(review): the sitemap endpoints (sitemap.xml and
            # google-news-sitemap.xml) redirect to the home page, so they
            # are not usable as sources. Instead we rely on the category
            # RSS feeds listed at https://vnexpress.net/rss.
            RSSFeed("https://vnexpress.net/rss/tin-moi-nhat.rss"),
            RSSFeed("https://vnexpress.net/rss/thoi-su.rss"),
            RSSFeed("https://vnexpress.net/rss/the-gioi.rss"),
            RSSFeed("https://vnexpress.net/rss/kinh-doanh.rss"),
            RSSFeed("https://vnexpress.net/rss/giai-tri.rss"),
            RSSFeed("https://vnexpress.net/rss/the-thao.rss"),
            RSSFeed("https://vnexpress.net/rss/phap-luat.rss"),
            RSSFeed("https://vnexpress.net/rss/giao-duc.rss"),
            RSSFeed("https://vnexpress.net/rss/suc-khoe.rss"),
            RSSFeed("https://vnexpress.net/rss/khoa-hoc.rss"),
        ],
        suppress_robots=True,
    )
108 changes: 108 additions & 0 deletions src/fundus/publishers/vn/vnexpress.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
from datetime import datetime
from typing import Any, List, Optional

from lxml.cssselect import CSSSelector
from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
from fundus.parser.utility import (
    extract_article_body_with_selector,
    generic_author_parsing,
    generic_date_parsing,
    generic_topic_parsing,
)


class VnExpressIntlParser(ParserProxy):
    """Parser for articles published on vnexpress.net (Vietnamese edition)."""

    # TODO(review): the `images` attribute is still missing from this parser;
    # add it via the generic image-extraction utility once selectors are known.
    class V1(BaseParser):
        # Lead paragraph rendered above the article body.
        _summary_selector = CSSSelector("p.description")
        # Body paragraphs. NOTE(review): the last <p> often carries the author
        # signature (e.g. "Lê Phương") — confirm whether it should be excluded
        # here and used for author extraction instead.
        _paragraph_selector = CSSSelector("article.fck_detail > p")
        _subheadline_selector = CSSSelector("article.fck_detail > h2")

        # On-page fallbacks for title and authors.
        _title_selector = CSSSelector("h1.title-detail")
        _author_selector = CSSSelector("p.author_mail strong")

        # Boilerplate topics that carry no information about the article,
        # e.g. "Tin nóng" (= hot news). Compared case-insensitively.
        _topic_bloat = {"tin nóng"}

        @attribute
        def title(self) -> Optional[str]:
            # Prefer the structured JSON-LD headline. `scalar=True` returns a
            # single value (or None) instead of a result list.
            ld_title = self.precomputed.ld.xpath_search("//NewsArticle/headline", scalar=True)
            if isinstance(ld_title, str):
                return ld_title

            # Fall back to the OpenGraph title, then the on-page headline.
            meta_title = self.precomputed.meta.get("og:title")
            if isinstance(meta_title, str):
                return meta_title

            if title_nodes := self._title_selector(self.precomputed.doc):
                return title_nodes[0].text_content().strip() or None

            return None

        @attribute
        def authors(self) -> List[str]:
            # generic_author_parsing is designed to handle strings, dicts,
            # lists and None, so the raw JSON-LD value is passed in directly.
            if authors := generic_author_parsing(
                self.precomputed.ld.xpath_search("//NewsArticle/author", scalar=True)
            ):
                return authors

            # Fallback: author signature block at the bottom of the article.
            return [
                text
                for node in self._author_selector(self.precomputed.doc)
                if (text := node.text_content().strip())
            ]

        @attribute
        def publishing_date(self) -> Optional[datetime]:
            # Prefer the JSON-LD publication date (scalar, not a list).
            ld_date = self.precomputed.ld.xpath_search("//NewsArticle/datePublished", scalar=True)
            if isinstance(ld_date, str):
                return generic_date_parsing(ld_date)

            # Fall back to the OpenGraph article metadata.
            meta_date = self.precomputed.meta.get("article:published_time")
            if isinstance(meta_date, str):
                return generic_date_parsing(meta_date)

            return None

        @attribute
        def body(self) -> Optional[ArticleBody]:
            return extract_article_body_with_selector(
                self.precomputed.doc,
                summary_selector=self._summary_selector,
                paragraph_selector=self._paragraph_selector,
                subheadline_selector=self._subheadline_selector,
            )

        @attribute
        def topics(self) -> List[str]:
            # generic_topic_parsing splits comma-separated keyword strings and
            # flattens nested lists, replacing the previous custom helpers
            # (_parse_ld_keywords / _parse_meta_topics).
            topics = generic_topic_parsing(self.precomputed.meta.get("keywords"))
            # Drop boilerplate topics such as "Tin nóng".
            return [topic for topic in topics if topic.casefold() not in self._topic_bloat]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are also some bloat topics, like Tin nóng (= hot news), which should be removed.

3 changes: 1 addition & 2 deletions src/fundus/scraping/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,7 @@ def url_filter(url: str) -> bool:


class SupportsBool(Protocol):
def __bool__(self) -> bool:
...
def __bool__(self) -> bool: ...
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You probably have a different black version installed. This PR should normally not edit these files.



class ExtractionFilter(Protocol):
Expand Down
3 changes: 1 addition & 2 deletions src/fundus/scraping/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,7 @@ class WebSourceInfo(SourceInfo):


class HTMLSource(Protocol):
def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]:
...
def fetch(self, url_filter: Optional[URLFilter] = None) -> Iterator[HTML]: ...


class WebSource:
Expand Down
12 changes: 4 additions & 8 deletions src/fundus/utils/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,21 @@


@overload
def _get_match_dict(pattern: Pattern[str], string: str, conversion: Callable[[str], _T]) -> Dict[str, _T]:
...
def _get_match_dict(pattern: Pattern[str], string: str, conversion: Callable[[str], _T]) -> Dict[str, _T]: ...


@overload
def _get_match_dict(
pattern: Pattern[str], string: str, conversion: Callable[[str], _T], keep_none: Literal[True]
) -> Dict[str, Optional[_T]]:
...
) -> Dict[str, Optional[_T]]: ...


@overload
def _get_match_dict(pattern: Pattern[str], string: str) -> Dict[str, str]:
...
def _get_match_dict(pattern: Pattern[str], string: str) -> Dict[str, str]: ...


@overload
def _get_match_dict(pattern: Pattern[str], string: str, keep_none: Literal[True]) -> Dict[str, Optional[str]]:
...
def _get_match_dict(pattern: Pattern[str], string: str, keep_none: Literal[True]) -> Dict[str, Optional[str]]: ...


def _get_match_dict( # type: ignore[misc]
Expand Down
32 changes: 32 additions & 0 deletions tests/resources/parser/test_data/vn/VnExpress.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
"V1": {
"authors": [
"VnExpress"
],
"body": {
"summary": [
"Người đàn ông 63 tuổi đau bụng kéo dài, bác sĩ chẩn đoán ung thư lymphoma ruột non - căn bệnh ít gặp trong hệ tiêu hóa."
],
"sections": [
{
"headline": [],
"paragraphs": [
"Ngày 22/10, BS.CK2 Cao Thị Hồng, Trưởng Trung tâm Kiểm tra sức khỏe Chợ Rẫy Việt Nhật (HECI), Bệnh viện Chợ Rẫy, cho biết bệnh nhân đau bụng âm ỉ, ăn uống khó tiêu, đầy hơi và tiêu chảy kéo dài nhiều tuần, sụt 2-3 kg. Ông nghĩ bị rối loạn tiêu hóa thông thường, song uống thuốc không khỏi.",
"Bác sĩ đánh giá bệnh nhân nhiều dấu hiệu bất thường nên nội soi dạ dày và đại tràng lấy mẫu sinh thiết phát hiện tế bào lympho không điển hình, nghi ngờ ác tính. Xét nghiệm hóa mô miễn dịch xác định bệnh nhân mắc ung thư lymphoma không Hodgkin ruột non. Bác sĩ nhiều chuyên khoa tiêu hóa, huyết học, giải phẫu bệnh cùng hội chẩn, xây dựng phác đồ điều trị cho bệnh nhân.",
"Theo bác sĩ Hồng, lymphoma đường ruột là u lympho ác tính ngoài hạch phát triển từ mô lympho trong thành ruột, chiếm khoảng 15-25% ca ung thư ruột non, gần 5% ca ung thư dạ dày và 0,2-1% ung thư đại trực tràng. Hầu hết lymphoma đường tiêu hóa thuộc nhóm không Hodgkin, với tỷ lệ mắc mới toàn cầu khoảng 5,6 ca trên 100.000 dân mỗi năm.",
"Bệnh xuất hiện chủ yếu ở người trên 60 tuổi, đặc biệt là nam giới. Lymphoma đường ruột thường có các triệu chứng như đau bụng, sụt cân, buồn nôn, tiêu chảy hoặc tiêu phân có máu...",
"Nội soi và tầm soát sức khỏe định kỳ giúp phát hiện sớm các bệnh lý ác tính đường tiêu hóa. Hướng điều trị thường phụ thuộc vào loại cũng như giai đoạn bệnh, có thể phẫu thuật, hóa trị hoặc xạ trị.",
"Lê Phương"
]
}
]
},
"publishing_date": "2025-10-22 19:25:52+07:00",
"title": "Đau bụng kéo dài, phát hiện ung thư lymphoma ruột non",
"topics": [
"ung thư ruột non",
"đau bụng",
"rối loạn tiêu hóa"
]
}
}
Binary file not shown.
6 changes: 6 additions & 0 deletions tests/resources/parser/test_data/vn/meta.info
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"VnExpress_2025_10_22.html.gz": {
"url": "https://vnexpress.net/dau-bung-keo-dai-phat-hien-ung-thu-lymphoma-ruot-non-4954454.html",
"crawl_date": "2025-10-22 14:57:05.843578"
}
}
Loading