Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -3758,6 +3758,63 @@
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>DurbanLocal</code>
</td>
<td>
<div>Durban Local</div>
</td>
<td>
<a href="https://www.durbanlocal.co.za/">
<span>www.durbanlocal.co.za</span>
</a>
</td>
<td>
<code>en</code>
</td>
<td>&#160;</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>Isolezwe</code>
</td>
<td>
<div>Isolezwe</div>
</td>
<td>
<a href="https://www.isolezwe.co.za/">
<span>www.isolezwe.co.za</span>
</a>
</td>
<td>
<code>zu</code>
</td>
<td>&#160;</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>IsolezweLesiXhosa</code>
</td>
<td>
<div>Isolezwe LesiXhosa</div>
</td>
<td>
<a href="https://www.isolezwelesixhosa.co.za/">
<span>www.isolezwelesixhosa.co.za</span>
</a>
</td>
<td>
<code>xh</code>
</td>
<td>&#160;</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>TimesLive</code>
Expand Down
34 changes: 34 additions & 0 deletions src/fundus/publishers/za/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.za.daily_maverick import DailyMaverickParser
from fundus.publishers.za.independent_online import IndependentOnlineParser
from fundus.publishers.za.times_live import TimesLiveParser
from fundus.scraping.filter import inverse, regex_filter
from fundus.scraping.url import NewsMap, Sitemap
Expand Down Expand Up @@ -35,3 +36,36 @@ class ZA(metaclass=PublisherGroup):
NewsMap("https://www.timeslive.co.za/sitemap/google-news/sunday-times-daily/news/"),
],
)

# Durban Local reuses the shared Independent Online parser.
# NOTE(review): the inverted regex filter presumably restricts crawling to
# sitemap entries whose URL contains "/your-ethekwini/" -- confirm against the
# semantics of `sitemap_filter`.
DurbanLocal = Publisher(
    name="Durban Local",
    domain="https://www.durbanlocal.co.za/",
    parser=IndependentOnlineParser,
    sources=[
        Sitemap(
            "https://durbanlocal.co.za/sitemap/",
            sitemap_filter=inverse(regex_filter("/your-ethekwini/")),
        ),
    ],
)

# Isolezwe reuses the shared Independent Online parser; its sitemap is tagged
# with the language code "zu".
# NOTE(review): the inverted regex filter presumably restricts crawling to
# sitemap entries whose URL contains "/isolezwe/" -- confirm against the
# semantics of `sitemap_filter`.
Isolezwe = Publisher(
    name="Isolezwe",
    domain="https://www.isolezwe.co.za/",
    parser=IndependentOnlineParser,
    sources=[
        Sitemap(
            "https://isolezwe.co.za/sitemap/",
            sitemap_filter=inverse(regex_filter("/isolezwe/")),
            languages={"zu"},
        ),
    ],
)

IsolezweLesiXhosa = Publisher(
name="Isolezwe LesiXhosa",
domain="https://www.isolezwelesixhosa.co.za/",
parser=IndependentOnlineParser,
sources=[
Sitemap("https://isolezwelesixhosa.co.za/sitemap/isolezwe-lesixhosa/iindaba/", languages={"xh"}),
Sitemap("https://isolezwelesixhosa.co.za/sitemap/isolezwe-lesixhosa/ezemidlalo/", languages={"xh"}),
Sitemap("https://isolezwelesixhosa.co.za/sitemap/isolezwe-lesixhosa/ezoyolo/", languages={"xh"}),
Sitemap("https://isolezwelesixhosa.co.za/sitemap/isolezwe-lesixhosa/izimvo/", languages={"xh"}),
Sitemap("https://isolezwelesixhosa.co.za/sitemap/isolezwe-lesixhosa/entsimini/", languages={"xh"}),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't we do the same trick here as with the publishers above?

Furthermore, I saw some English-language sitemaps; maybe it would be good to add them? They could be beneficial for cross-lingual corpora. What do you think?

Copy link
Collaborator Author

@addie9800 addie9800 Nov 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, now that I think about it, yes. I tried to find a single regex that selects all the Xhosa sitemaps but not the English ones, and I struggled to find one. But I realised that I can use two separate regexes and AND them together. I'll update that.
Unfortunately, the English sitemaps are empty :/ But usually I would definitely agree.

],
)
60 changes: 60 additions & 0 deletions src/fundus/publishers/za/independent_online.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import datetime
import re
from typing import List, Optional

from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute
from fundus.parser.utility import (
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
image_extraction,
strip_nodes_to_text,
)


class IndependentOnlineParser(ParserProxy):
    """Versioned parser for Independent Online article pages."""

    class V1(BaseParser):
        """First parser version.

        Body, topics and images are read from the HTML document; authors,
        publishing date and title are read from the page's linked-data block.
        """

        # NOTE(review): the class-name suffixes (e.g. ``__Ag4R_``) look like
        # build-generated hashes and may change when the site is redeployed --
        # confirm before relying on them long-term.
        _paragraph_selector = XPath("//div[@class='article_content__Ag4R_']//div[@class='text_text__oJhZK']/p")

        _topics_selector = XPath("//div[@class='tags_tags__zi1sf']/a")

        # Selectors used by ``images``. They are compiled once when the class
        # is created rather than on every ``images`` evaluation.
        _image_upper_boundary_selector = XPath("//h1")
        _image_lower_boundary_selector = XPath("//aside[@class='article_sidebar__qgf5d']")
        _image_selector = XPath("//div[contains(@class, 'image')]//img")
        _image_caption_selector = XPath("./ancestor::div[@class='image_image-widget__LYZT4']//p")
        # Case-insensitive; captures everything after "image:" as the credits.
        _image_author_pattern = re.compile(r"(?i)image:(?P<credits>.+)")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Return the article body built from the paragraph selector."""
            return extract_article_body_with_selector(
                self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Return the authors found under the linked-data ``author`` key."""
            return generic_author_parsing(self.precomputed.ld.bf_search("author"))

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Return the date parsed from the linked-data ``datePublished`` key."""
            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))

        @attribute
        def title(self) -> Optional[str]:
            """Return the linked-data ``headline`` value."""
            return self.precomputed.ld.bf_search("headline")

        @attribute
        def topics(self) -> List[str]:
            """Return topics from the on-page tag links, else from meta keywords."""
            topic_string = strip_nodes_to_text(self._topics_selector(self.precomputed.doc), join_on=",")
            if topic_string is not None:
                return generic_topic_parsing(topic_string, delimiter=",")
            # No tag text was produced: fall back to the ``keywords`` meta entry.
            return generic_topic_parsing(self.precomputed.meta.get("keywords", []))

        @attribute
        def images(self) -> List[Image]:
            """Return the article images together with captions and credits."""
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                upper_boundary_selector=self._image_upper_boundary_selector,
                lower_boundary_selector=self._image_lower_boundary_selector,
                image_selector=self._image_selector,
                caption_selector=self._image_caption_selector,
                author_selector=self._image_author_pattern,
            )
103 changes: 103 additions & 0 deletions tests/resources/parser/test_data/za/DurbanLocal.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
{
"V1": {
"authors": [
"Fouzia Van Der Fort"
],
"body": {
"summary": [],
"sections": [
{
"headline": [],
"paragraphs": [
"Romance novelists gathered with readers at a Rom Con to discuss writing and publishing in the Mother City, at the weekend.",
"These discussions took place at Exclusive Books, Waterfront, on Saturday, June 21.",
"Three panels of authors, from Cape Town, Johannesburg and Durban, spoke with heart about their love of reading, writing and happily ever afters.",
"Author Rushdiyah Narker, from Kensington, said she started reading regency romance because it was the closest to what romance was like for Muslims.",
"\"There's always this bruh, but I can't let him know that I like him. Everybody must see me stare at him because my mom is going to scold,\" she said.",
"Ms Narker’s book entitled Some Unspoken Thing is available on pre-order from Loot and will be in stores on August 1.",
"\"I read everything. I read thrillers, mysteries and romance. I always come back to romance because you know you're going to get a happy ending,\" she said.",
"She explained that it was a form of \"escapism\".",
"\"With the world being so shit, why can't we just have our happy endings? I don't make excuses for reading romance. It's what I enjoy; if you don't like that, then bye,\" she said.",
"Novelist Jo Watson, from Johannesburg, who has written dozens of romance books, admitted that it was \"not my vibe\" and had not read the genre before putting pen to paper, which is ironic since she is widely considered to be South Africa's most successful romance writer and other romance authors jokingly call her \"their queen\".",
"Author Nuhaa Bardien, from Brooklyn, said she started reading romance novels before she was allowed to, as a teenager, because she got an adult library card.",
"\"I am a big romance reader. I love any kind of romance and if it doesn't have romance then I'm probably not reading it,\" she said.",
"She said she enjoyed historical romance, which is her favourite.",
"Durban author Arini Vlotman, who also got an adult library card, said: \"I love romance. Give me all of the romance, in whatever genre, in whatever form, in whichever year. I started as the stereotypical youngster going to the library and I went through each shelf because there were only so many books.\"",
"Moderator and author Shameez Patel Papathanasiou, from Goodwood, asked why romance was popular but ridiculed. The authors concurred that it was a \"taboo\" genre but that women were owning their \"pleasures\".",
"Ms Bardien said that it makes one \"so happy\" and it was somewhere readers could find comfort.",
"\"People are like, 'oh my gosh, it's horrible or it's porn' or they degrade it. There's so much more to it. If you look at just all our stories, they're so different, but there's something there for everyone, or there is a story out there for everyone,\" she said.",
"Author Qarnita Loxton, from Melkbosstrand, spoke about sneaking Mills and Boons books from her 50-year-old aunt's room.",
"\"She was a lot older than me, in my teens, and we read the same books because it gave warmth, love and hopeful feelings that transcend generations,\" she said.",
"Ms Loxton said South Africa had a unique market of readers, as more non-fiction books are sold compared to its counterparts.",
"\"I think it is growing, the market is growing. Women are buying books. Women in the age group that we are writing in have disposable income for books. That is the bottom line. That is what publishers listen to and what bookstores sell. So, we're here to stay,\" she said.",
"Author and one of the organisers, Kelly L Clarke, from Vredehoek, said the conference brought local romance readers and authors together to discuss and celebrate the beloved genre and showcase Capetonian talent.",
"\"We had a wonderful, engaged crowd who was excited to engage with our authors and showed so much support,\" she said."
]
}
]
},
"images": [
{
"versions": [
{
"url": "https://iol-prod.appspot.com/image/539fcc98d82ad488e5202c91419dec9fe247846e=w700",
"query_width": null,
"size": null,
"type": null
}
],
"is_cover": true,
"description": null,
"caption": "Pictured right, is reader Nicole Bresler, from Burgundy Estate in Durbanville, speaking to romance novelist Shameez Patel Papathanasiou, from Goodwood, at Exclusive Books (EB) Waterfront.",
"authors": [
"Fouzia Van Der Fort"
],
"position": 173
},
{
"versions": [
{
"url": "https://iol-prod.appspot.com/image/607500931dfce7817c219cd9b0519d0f54d14d50=w700",
"query_width": null,
"size": null,
"type": null
}
],
"is_cover": false,
"description": null,
"caption": "Pictured at the back, from left, are romance authors Jo Watson, from Johannesburg, and Lindsay Norman, from Constantia. In front are Qarnita Loxton, from Melkbosstrand, Nuhaa Bardien, from Brooklyn, Arini Vlotman, from Durban, and Rushiyah Narker, from Kensington.",
"authors": [
"Fouzia Van Der Fort"
],
"position": 191
},
{
"versions": [
{
"url": "https://iol-prod.appspot.com/image/34857fdaea338f442c270a3d35fd6a477dc8334f=w700",
"query_width": null,
"size": null,
"type": null
}
],
"is_cover": false,
"description": null,
"caption": "Pictured from left, are romance novelists Therese Beharrie, Shameez Patel Papathanasiou, from Goodwood, Kelly L. Clarke, from Vredehoek, Zayaan Schroeder, from Athlone, and Dominique Wolf, from Johannesburg.",
"authors": [
"Fouzia Van Der Fort"
],
"position": 236
}
],
"publishing_date": "2025-06-23 12:00:00+00:00",
"title": "Rom Con celebrates romance literature with authors and readers",
"topics": [
"books",
"romance",
"author",
"reader",
"genre",
"exclusivebooks"
]
}
}
Binary file not shown.
88 changes: 88 additions & 0 deletions tests/resources/parser/test_data/za/Isolezwe.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
{
"V1": {
"authors": [
"Zimbili Vilakazi"
],
"body": {
"summary": [],
"sections": [
{
"headline": [],
"paragraphs": [
"LUSELUDE ukhalo ngempi yokuqedwa inkinga yemilutha yewunga enkabeni yeTheku.",
"Lokhu kuza emva kokuvimbanisa kwayo emgwaqweni uChe Guevara - owawaziwa ngoMoore Road - ngemijondolo kaseyili noplastiki.",
"Sekuphele amasonto amathathu le milutha elinganiselwa ku-200 edle owodwa wemizila yalo mgwaqo. Kuwona yakhe imijondolo, ethikameza abashayeli bezimoto, abaphoqeleke ukuthi basebenzise eminye.",
"Phakathi kokunye okuthikameza abashayeli abasuke bezojoyina lo mgwaqo besuka emgwaqweni u-M4, ukuthi eminye yale milutha inkanisa ngaserobhothini, nokwenza kube nzima ukuthi abashayeli bamele irobhothi uma livaliwe ngenxa yovalo lokushaywa kwamawindi ezimoto, bese bebanjwa inkunzi.",
"Okhulumela amaphoyisa kaMasipala weTheku, uColonel Boysie Zungu, uthe njengoba selulethiwe ezindlebeni zabo udaba lokuvalwa komgwaqo, bazalusukumela.",
"“Sizotshala amaphoyisa ukuthi abhekane nabo,” kusho uZungu.",
"Nokho ukhale ngenkinga yokuthi amaphara bahlale bewasusa ngapha, avumbuke ngapha.",
"“Sisuka kude nawo. Uzokhumbula ukuthi ake aba se-Albert Park, ku-M4. Uwajaha ngapha, bahlakazeke bahambe, ngakusasa bahlangane kwenye indawo,” kusho uZungu.",
"Uthe umasipala uphezu kwayo imizamo yesixazululo sesikhathi eside.",
"“Umasipala wayithola indawo ngapha eningizimu yeTheku, kumanje kuyaqhubeka ukwakha kuleyo ndawo. Siyacabanga ukuthi uma sekuphothuliwe ukwakhiwa kwaleso sakhiwo siyobe sesikhona isisombululo,” usho kanje.",
"Ubalule ukuthi inkinga yamaphara idinga ukubambisana kwezinhlaka ezehlukene okukhona khona imindeni lapho kuvela khona laba bantu abagcina behlala emgwaqweni, uMnyango wezokuThuthukiswa koMphakathi nezinye.",
"Maphakathi nalo nyaka, Isolezwe like labhala udaba ngesikhungo sokugcina imilutha yewunga esakhiwayo eLower Ilovu, ngaseManzimtoti. Nokho udaba lwakhona lwalumayelana nokuthi umphakathi wakule ndawo wawubhikisha, uthi awukufuni ukwakhiwa kwalesi sikhungo endaweni yawo. Wawukhala ngokuthi kuzonyuka izinga lobugebengu endaweni uma imilutha yewunga izogcinwa endaweni yawo. Ngokombiko kamasipala, le ndawo kwathiwa izoba nemibhede ebalelwa ku-450."
]
}
]
},
"images": [
{
"versions": [
{
"url": "https://iol-prod.appspot.com/image/450a2c7417d4d233f0b8ead4444281716f86381b=w700",
"query_width": null,
"size": null,
"type": null
}
],
"is_cover": true,
"description": null,
"caption": "Adaze inkani amaphara eThekwini",
"authors": [
"eThekwini Municipality / Facebook"
],
"position": 178
},
{
"versions": [
{
"url": "https://iol-prod.appspot.com/image/349459c84e2f9eff661e6d5373cce32e0c25ea3c=w700",
"query_width": null,
"size": null,
"type": null
}
],
"is_cover": false,
"description": null,
"caption": "IMILUTHA yewunga esivimbanise emgwaqweni uChe Guevara (Moore Road) enkabeni yeTheku",
"authors": [
"DOCTOR NGCOBO"
],
"position": 197
},
{
"versions": [
{
"url": "https://iol-prod.appspot.com/image/5949328626f0ce46cd37e2924d50d6be667dc82d=w700",
"query_width": null,
"size": null,
"type": null
}
],
"is_cover": false,
"description": null,
"caption": "IMILUTHA yewunga esivimbanise emgwaqweni uChe Guevara (Moore Road) enkabeni yeTheku",
"authors": [
"DOCTOR NGCOBO"
],
"position": 216
}
],
"publishing_date": "2025-11-08 08:24:00+00:00",
"title": "Kubukeka kukude phambili amaphara edla umzila eThekwini",
"topics": [
"homeless",
"amaphara"
]
}
}
Loading