Skip to content

Commit 41e593d

Browse files
authored
Merge pull request #832 from flairNLP/add-dizindaba
Add `Dizindaba`
2 parents 53e04de + 1974193 commit 41e593d

File tree

6 files changed

+159
-0
lines changed

6 files changed

+159
-0
lines changed

docs/supported_publishers.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3775,6 +3775,27 @@
37753775
<td>&#160;</td>
37763776
<td>&#160;</td>
37773777
</tr>
3778+
<tr>
3779+
<td>
3780+
<code>Dizindaba</code>
3781+
</td>
3782+
<td>
3783+
<div>Dizindaba</div>
3784+
</td>
3785+
<td>
3786+
<a href="https://www.dizindaba.co.za/">
3787+
<span>www.dizindaba.co.za</span>
3788+
</a>
3789+
</td>
3790+
<td>
3791+
<code>xh</code>
3792+
</td>
3793+
<td>
3794+
<code>topics</code>
3795+
</td>
3796+
<td>&#160;</td>
3797+
<td>&#160;</td>
3798+
</tr>
37783799
<tr>
37793800
<td>
37803801
<code>TimesLive</code>

src/fundus/publishers/za/__init__.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from fundus.publishers.base_objects import Publisher, PublisherGroup
22
from fundus.publishers.za.daily_maverick import DailyMaverickParser
3+
from fundus.publishers.za.dizindaba import DizindabaParser
34
from fundus.publishers.za.times_live import TimesLiveParser
45
from fundus.scraping.filter import inverse, regex_filter
56
from fundus.scraping.url import NewsMap, Sitemap
@@ -35,3 +36,17 @@ class ZA(metaclass=PublisherGroup):
3536
NewsMap("https://www.timeslive.co.za/sitemap/google-news/sunday-times-daily/news/"),
3637
],
3738
)
39+
40+
Dizindaba = Publisher(
41+
name="Dizindaba",
42+
domain="https://www.dizindaba.co.za/",
43+
parser=DizindabaParser,
44+
sources=[
45+
Sitemap(
46+
"https://dizindaba.co.za/sitemap_index.xml",
47+
sitemap_filter=inverse(regex_filter("post-sitemap")),
48+
languages={"xh"},
49+
reverse=True,
50+
)
51+
],
52+
)
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import datetime
2+
import re
3+
from typing import List, Optional
4+
5+
from lxml.etree import XPath
6+
7+
from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
8+
from fundus.parser.utility import (
9+
extract_article_body_with_selector,
10+
generic_author_parsing,
11+
generic_date_parsing,
12+
generic_topic_parsing,
13+
image_extraction,
14+
)
15+
16+
17+
class DizindabaParser(ParserProxy):
    """Parser proxy for the Dizindaba publisher (dizindaba.co.za)."""

    class V1(BaseParser):
        """Extracts body, authors, publishing date, title and images from an article page."""

        # Byline pattern: a case-insensitive "intatheli:" or "by:" prefix, followed by
        # the author name (ASCII letters and whitespace) and terminated by a pipe.
        # The character class is deliberately spelled ``A-Za-z``: the range ``A-z``
        # would additionally accept the punctuation located between "Z" and "a" in
        # ASCII (``[``, ``\``, ``]``, ``^``, ``_`` and the backtick).
        _author_selector = r"(?i)(intatheli|by):(?P<author>[A-Za-z\s]*)\|"
        _compiled_author_selector = re.compile(_author_selector)

        # Paragraphs are the direct <p> children of the article body that carry text
        # of their own and do not match the byline pattern. The pattern is embedded
        # verbatim into the XPath string literal, so it must never contain a single
        # quote.
        _paragraph_selector = XPath(
            f"//div[@itemprop='articleBody']/p[not(re:test(string(),'{_author_selector}')) and text()]",
            namespaces={"re": "http://exslt.org/regular-expressions"},
        )
        # Sub-headlines: <strong> elements inside a <p> that is first in its context
        # and has no direct text nodes.
        _subheadline_selector = XPath("//div[@itemprop='articleBody']/p[not(position()>1 or text())]/strong")
        # Compiled once at class creation instead of on every ``images`` access.
        _image_upper_boundary_selector = XPath("//article")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Return the article body built from the paragraph and sub-headline selectors."""
            return extract_article_body_with_selector(
                self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                subheadline_selector=self._subheadline_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Return the authors taken from the in-text byline, else from ``precomputed.ld``.

            The first text node of the first body paragraph is searched for the byline
            pattern. When it does not match, the ``author`` value looked up in
            ``precomputed.ld`` is used instead.
            """
            author_candidate = self.precomputed.doc.xpath("//div[@itemprop='articleBody']/p[1]/text()")
            if author_candidate and (match := self._compiled_author_selector.search(author_candidate[0])):
                return generic_author_parsing(match.group("author"))
            return generic_author_parsing(self.precomputed.ld.bf_search("author"))

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Return the parsed ``datePublished`` value looked up in ``precomputed.ld``."""
            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``headline`` value looked up in ``precomputed.ld``."""
            return self.precomputed.ld.bf_search("headline")

        @attribute
        def images(self) -> List[Image]:
            """Return the images extracted from the document, bounded above by ``//article``."""
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                upper_boundary_selector=self._image_upper_boundary_selector,
            )
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
{
2+
"V1": {
3+
"authors": [
4+
"Dizindaba"
5+
],
6+
"body": {
7+
"summary": [],
8+
"sections": [
9+
{
10+
"headline": [],
11+
"paragraphs": [
12+
"Iindaba zakutshanje ezimnandi ngakulutsha oluthwaxwa yentswela ngqesho zezo zinge Sebe lezeKhaya (Home Affairs) nelazise okokuba ligaya isigaba sesibini senyathelo lokuheha amawaka angama 4000 olutsha olungaphangeliyo. Apha ulutsha luzakusetyenziselwa ukusebenzisa ubuchwepheshe bale mihla kwi projekithi ejongene nokuphuculwa kwamarekhodi asekuhlaleni. Kuvakala okokuba le projekithi intsha ijongise ekuguquleni into ephaya kwi 350 million yemibandela (records) yoluntu ebisoloko igcinwe emaphepheni.",
13+
"Ngoku ukuphucula uziso lwee nkonzo olululo kuzakusetyenziswa ubuchwephesha bale mihla (Electronic format). Le nkuntyula yenani eliyi 4000 yinxalenye ye 10000 yolutsha olungaphangeliyo noluza kugayelwa ikontilaka eziinyanga ezingama 36 yengqesho leli Sebe lezeKhaya le projekithi yogayo yaqala kwe yeThupha (August) kunyaka ka 2022 nalapho kwagaywa i2000 yolutsha olunezidanga kodwa lungasebenzi. Kuvakala okokuba ngoku ukusukela kwe yomnquma iSebe sele ligaye into ephaya kwi 1341 yolutsha.",
14+
"Kuvakala okokuba le projekithi yogayo loluthsa isebenzisana neSebe Lengqesho nemisebenzi kunye ne Harambe Youth Employment Accelarator njenge nxalenye ye National Pathway Management Network. Kubonakala oku iyindlela eyiyo ekuvelisweni kwamathuba emisebenzi. Le projekithi yeminyaka emithathu neyi R2.4 billion yaye yaziswa nguPrezidanti Cyril Ramaphosa ngethuba wayesenza intetho yakhe yesizwe (SONA). Omnye wolutsha owathi waxhamla ngethuba iqala le projekithi yokusetyenziswa kobuchwepesha bale mihla (Digital project) ukuguqula imininingwana yoluntu ebisemaphepheni ibe ngu Kamogelo Kgatshe waba ngumanjela wale projekithi. Okufunekayo kolo lutsha lufuna umsebenzi yile mininingwane efunekayo apha ngezantsi nenombolo engu 0800727272",
15+
"Abemi boMzantsi Afrika abaphakathi kweminyaka eli-18 nama-35 ubudala. Iziqinisekiso zakho kufuneka zihambelane nobuncinci bemfuno yendima oyifakelayo isicelo. Akukho ngxelo yolwaphulo-mthetho; kunye nembali ecacileyo yetyala",
16+
"R5000 ngenyanga. Amaxwebhu oLawulo lweWarehouse (izithuba ezili-16). Iimbaleki (240 izithuba). Abalungiseleli (izithuba ezingama-735). Udityaniswa kwakhona (izithuba ezingama-734). Ukufumana oomabhalane (izithuba ezili-100). Iskena (izithuba ezili-125). Abaqhubi (izithuba ezili-10). Izalathisi (izithuba ezili-1600) – R5500 ngenyanga. Abalawuli bomgangatho (izithuba ezingama-200) – ama-R6500 ngenyanga. Iinkokeli zeqela (izithuba ezingama-210) – R6500 ngenyanga. Inkxaso yoBugcisa (izithuba ezingama-24) – R9500 ngenyanga. Abaphathi (izithuba ezi-6) – R14250 ngenyanga."
17+
]
18+
}
19+
]
20+
},
21+
"images": [
22+
{
23+
"versions": [
24+
{
25+
"url": "https://dizindaba.co.za/wp-content/uploads/2023/03/1584304517924-66x66.jpg",
26+
"query_width": null,
27+
"size": {
28+
"width": 66,
29+
"height": 66
30+
},
31+
"type": "image/jpeg"
32+
},
33+
{
34+
"url": "https://dizindaba.co.za/wp-content/uploads/2023/03/1584304517924-150x150.jpg",
35+
"query_width": null,
36+
"size": {
37+
"width": 150,
38+
"height": 150
39+
},
40+
"type": "image/jpeg"
41+
},
42+
{
43+
"url": "https://dizindaba.co.za/wp-content/uploads/2023/03/1584304517924.jpg",
44+
"query_width": null,
45+
"size": {
46+
"width": 200,
47+
"height": 200
48+
},
49+
"type": "image/jpeg"
50+
}
51+
],
52+
"is_cover": true,
53+
"description": null,
54+
"caption": "UKamogelo Kgatshe wonyulwa njengomanejala njengenxalenye yeqela lokuqala leprojekithi yokugcinwa kweerekhodi zoluntu ngobuchwepeshe bale mihla kwi Sebe leMicimbi yezeKhaya.",
55+
"authors": [],
56+
"position": 163
57+
}
58+
],
59+
"publishing_date": "2023-04-10 12:05:49+00:00",
60+
"title": "URhulumente uthabatha indlela eyiyo yokuvezela ulutsha imisebenzi"
61+
}
62+
}
49.6 KB
Binary file not shown.

tests/resources/parser/test_data/za/meta.info

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@
33
"url": "https://www.dailymaverick.co.za/article/2025-05-03-icj-concludes-hearings-on-israel-aid-obligations-in-gaza/",
44
"crawl_date": "2025-05-22 13:24:27.994785"
55
},
6+
"Dizindaba_2025_11_16.html.gz": {
7+
"url": "https://dizindaba.co.za/2023/04/10/urhulumente-uthabatha-indlela-eyiyo-yokuvezela-ulutsha-imisebenzi/",
8+
"crawl_date": "2025-11-16 19:14:20.207117"
9+
},
610
"TimesLive_2025_05_22.html.gz": {
711
"url": "https://www.timeslive.co.za/sunday-times-daily/business/2025-05-21-2025-budget-30-sars-gets-r4bn-to-hire-army-of-debt-collectors/",
812
"crawl_date": "2025-05-22 12:29:24.622820"

0 commit comments

Comments
 (0)