diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md index 3521e8876..1e5c652a8 100644 --- a/docs/supported_publishers.md +++ b/docs/supported_publishers.md @@ -3758,6 +3758,63 @@     + + + DurbanLocal + + +
Durban Local
+ + + + www.durbanlocal.co.za + + + + en + +   +   +   + + + + Isolezwe + + +
Isolezwe
+ + + + www.isolezwe.co.za + + + + zu + +   +   +   + + + + IsolezweLesiXhosa + + +
Isolezwe LesiXhosa
+ + + + www.isolezwelesixhosa.co.za + + + + xh + +   +   +   + TimesLive diff --git a/src/fundus/publishers/za/__init__.py b/src/fundus/publishers/za/__init__.py index 39e023e26..3371643c3 100644 --- a/src/fundus/publishers/za/__init__.py +++ b/src/fundus/publishers/za/__init__.py @@ -1,7 +1,8 @@ from fundus.publishers.base_objects import Publisher, PublisherGroup from fundus.publishers.za.daily_maverick import DailyMaverickParser +from fundus.publishers.za.independent_online import IndependentOnlineParser from fundus.publishers.za.times_live import TimesLiveParser -from fundus.scraping.filter import inverse, regex_filter +from fundus.scraping.filter import inverse, lor, regex_filter from fundus.scraping.url import NewsMap, Sitemap @@ -35,3 +36,36 @@ class ZA(metaclass=PublisherGroup): NewsMap("https://www.timeslive.co.za/sitemap/google-news/sunday-times-daily/news/"), ], ) + + DurbanLocal = Publisher( + name="Durban Local", + domain="https://www.durbanlocal.co.za/", + parser=IndependentOnlineParser, + sources=[ + Sitemap("https://durbanlocal.co.za/sitemap/", sitemap_filter=inverse(regex_filter("/your-ethekwini/"))), + ], + ) + + Isolezwe = Publisher( + name="Isolezwe", + domain="https://www.isolezwe.co.za/", + parser=IndependentOnlineParser, + sources=[ + Sitemap( + "https://isolezwe.co.za/sitemap/", sitemap_filter=inverse(regex_filter("/isolezwe/")), languages={"zu"} + ), + ], + ) + + IsolezweLesiXhosa = Publisher( + name="Isolezwe LesiXhosa", + domain="https://www.isolezwelesixhosa.co.za/", + parser=IndependentOnlineParser, + sources=[ + Sitemap( + "https://isolezwelesixhosa.co.za/sitemap/", + sitemap_filter=lor(inverse(regex_filter("/isolezwe-lesixhosa/")), regex_filter("english")), + languages={"xh"}, + ), + ], + ) diff --git a/src/fundus/publishers/za/independent_online.py b/src/fundus/publishers/za/independent_online.py new file mode 100644 index 000000000..46da21b89 --- /dev/null +++ b/src/fundus/publishers/za/independent_online.py @@ -0,0 +1,60 @@ +import 
datetime +import re +from typing import List, Optional + +from lxml.etree import XPath + +from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute +from fundus.parser.utility import ( + extract_article_body_with_selector, + generic_author_parsing, + generic_date_parsing, + generic_topic_parsing, + image_extraction, + strip_nodes_to_text, +) + + +class IndependentOnlineParser(ParserProxy): + class V1(BaseParser): + _paragraph_selector = XPath("//div[@class='article_content__Ag4R_']//div[@class='text_text__oJhZK']/p") + + _topics_selector = XPath("//div[@class='tags_tags__zi1sf']/a") + + @attribute + def body(self) -> Optional[ArticleBody]: + return extract_article_body_with_selector( + self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + ) + + @attribute + def authors(self) -> List[str]: + return generic_author_parsing(self.precomputed.ld.bf_search("author")) + + @attribute + def publishing_date(self) -> Optional[datetime.datetime]: + return generic_date_parsing(self.precomputed.ld.bf_search("datePublished")) + + @attribute + def title(self) -> Optional[str]: + return self.precomputed.ld.bf_search("headline") + + @attribute + def topics(self) -> List[str]: + topic_string = strip_nodes_to_text(self._topics_selector(self.precomputed.doc), join_on=",") + if topic_string is not None: + return generic_topic_parsing(topic_string, delimiter=",") + return generic_topic_parsing(self.precomputed.meta.get("keywords", [])) + + @attribute + def images(self) -> List[Image]: + return image_extraction( + doc=self.precomputed.doc, + paragraph_selector=self._paragraph_selector, + upper_boundary_selector=XPath("//h1"), + lower_boundary_selector=XPath("//aside[@class='article_sidebar__qgf5d']"), + image_selector=XPath("//div[contains(@class, 'image')]//img"), + caption_selector=XPath("./ancestor::div[@class='image_image-widget__LYZT4']//p"), + author_selector=re.compile(r"(?i)image:(?P<credits>.+)"), + ) diff --git 
a/tests/resources/parser/test_data/za/DurbanLocal.json b/tests/resources/parser/test_data/za/DurbanLocal.json new file mode 100644 index 000000000..004853767 --- /dev/null +++ b/tests/resources/parser/test_data/za/DurbanLocal.json @@ -0,0 +1,103 @@ +{ + "V1": { + "authors": [ + "Fouzia Van Der Fort" + ], + "body": { + "summary": [], + "sections": [ + { + "headline": [], + "paragraphs": [ + "Romance novelists gathered with readers at a Rom Con to discuss writing and publishing in the Mother City, at the weekend.", + "These discussions took place at Exclusive Books, Waterfront, on Saturday, June 21.", + "Three panels of authors, from Cape Town, Johannesburg and Durban, spoke with heart about their love of reading, writing and happily ever afters.", + "Author Rushdiyah Narker, from Kensington, said she started reading regency romance because it was the closest to what romance was like for Muslims.", + "\"There's always this bruh, but I can't let him know that I like him. Everybody must see me stare at him because my mom is going to scold,\" she said.", + "Ms Narker’s book entitled Some Unspoken Thing is available on pre-order from Loot and will be in stores on August 1.", + "\"I read everything. I read thrillers, mysteries and romance. I always come back to romance because you know you're going to get a happy ending,\" she said.", + "She explained that it was a form of \"escapism\".", + "\"With the world being so shit, why can't we just have our happy endings? I don't make excuses for reading romance. 
It's what I enjoy; if you don't like that, then bye,\" she said.", + "Novelist Jo Watson, from Johannesburg, who has written dozens of romance books, admitted that it was \"not my vibe\" and had not read the genre before putting pen to paper, which is ironic since she is widely considered to be South Africa's most successful romance writer and other romance authors jokingly call her \"their queen\".", + "Author Nuhaa Bardien, from Brooklyn, said she started reading romance novels before she was allowed to, as a teenager, because she got an adult library card.", + "\"I am a big romance reader. I love any kind of romance and if it doesn't have romance then I'm probably not reading it,\" she said.", + "She said she enjoyed historical romance, which is her favourite.", + "Durban author Arini Vlotman, who also got an adult library card, said: \"I love romance. Give me all of the romance, in whatever genre, in whatever form, in whichever year. I started as the stereotypical youngster going to the library and I went through each shelf because there were only so many books.\"", + "Moderator and author Shameez Patel Papathanasiou, from Goodwood, asked why romance was popular but ridiculed. The authors concurred that it was a \"taboo\" genre but that women were owning their \"pleasures\".", + "Ms Bardien said that it makes one \"so happy\" and it was somewhere readers could find comfort.", + "\"People are like, 'oh my gosh, it's horrible or it's porn' or they degrade it. There's so much more to it. 
If you look at just all our stories, they're so different, but there's something there for everyone, or there is a story out there for everyone,\" she said.", + "Author Qarnita Loxton, from Melkbosstrand, spoke about sneaking Mills and Boons books from her 50-year-old aunt's room.", + "\"She was a lot older than me, in my teens, and we read the same books because it gave warmth, love and hopeful feelings that transcend generations,\" she said.", + "Ms Loxton said South Africa had a unique market of readers, as more non-fiction books are sold compared to its counterparts.", + "\"I think it is growing, the market is growing. Women are buying books. Women in the age group that we are writing in have disposable income for books. That is the bottom line. That is what publishers listen to and what bookstores sell. So, we're here to stay,\" she said.", + "Author and one of the organisers, Kelly L Clarke, from Vredehoek, said the conference brought local romance readers and authors together to discuss and celebrate the beloved genre and showcase Capetonian talent.", + "\"We had a wonderful, engaged crowd who was excited to engage with our authors and showed so much support,\" she said." 
+ ] + } + ] + }, + "images": [ + { + "versions": [ + { + "url": "https://iol-prod.appspot.com/image/539fcc98d82ad488e5202c91419dec9fe247846e=w700", + "query_width": null, + "size": null, + "type": null + } + ], + "is_cover": true, + "description": null, + "caption": "Pictured right, is reader Nicole Bresler, from Burgundy Estate in Durbanville, speaking to romance novelist Shameez Patel Papathanasiou, from Goodwood, at Exclusive Books (EB) Waterfront.", + "authors": [ + "Fouzia Van Der Fort" + ], + "position": 173 + }, + { + "versions": [ + { + "url": "https://iol-prod.appspot.com/image/607500931dfce7817c219cd9b0519d0f54d14d50=w700", + "query_width": null, + "size": null, + "type": null + } + ], + "is_cover": false, + "description": null, + "caption": "Pictured at the back, from left, are romance authors Jo Watson, from Johannesburg, and Lindsay Norman, from Constantia. In front are Qarnita Loxton, from Melkbosstrand, Nuhaa Bardien, from Brooklyn, Arini Vlotman, from Durban, and Rushiyah Narker, from Kensington.", + "authors": [ + "Fouzia Van Der Fort" + ], + "position": 191 + }, + { + "versions": [ + { + "url": "https://iol-prod.appspot.com/image/34857fdaea338f442c270a3d35fd6a477dc8334f=w700", + "query_width": null, + "size": null, + "type": null + } + ], + "is_cover": false, + "description": null, + "caption": "Pictured from left, are romance novelists Therese Beharrie, Shameez Patel Papathanasiou, from Goodwood, Kelly L. 
Clarke, from Vredehoek, Zayaan Schroeder, from Athlone, and Dominique Wolf, from Johannesburg.", + "authors": [ + "Fouzia Van Der Fort" + ], + "position": 236 + } + ], + "publishing_date": "2025-06-23 12:00:00+00:00", + "title": "Rom Con celebrates romance literature with authors and readers", + "topics": [ + "books", + "romance", + "author", + "reader", + "genre", + "exclusivebooks" + ] + } +} diff --git a/tests/resources/parser/test_data/za/DurbanLocal_2025_11_09.html.gz b/tests/resources/parser/test_data/za/DurbanLocal_2025_11_09.html.gz new file mode 100644 index 000000000..b1ad7ecdb Binary files /dev/null and b/tests/resources/parser/test_data/za/DurbanLocal_2025_11_09.html.gz differ diff --git a/tests/resources/parser/test_data/za/Isolezwe.json b/tests/resources/parser/test_data/za/Isolezwe.json new file mode 100644 index 000000000..e67666200 --- /dev/null +++ b/tests/resources/parser/test_data/za/Isolezwe.json @@ -0,0 +1,88 @@ +{ + "V1": { + "authors": [ + "Zimbili Vilakazi" + ], + "body": { + "summary": [], + "sections": [ + { + "headline": [], + "paragraphs": [ + "LUSELUDE ukhalo ngempi yokuqedwa inkinga yemilutha yewunga enkabeni yeTheku.", + "Lokhu kuza emva kokuvimbanisa kwayo emgwaqweni uChe Guevara - owawaziwa ngoMoore Road - ngemijondolo kaseyili noplastiki.", + "Sekuphele amasonto amathathu le milutha elinganiselwa ku-200 edle owodwa wemizila yalo mgwaqo. 
Kuwona yakhe imijondolo, ethikameza abashayeli bezimoto, abaphoqeleke ukuthi basebenzise eminye.", + "Phakathi kokunye okuthikameza abashayeli abasuke bezojoyina lo mgwaqo besuka emgwaqweni u-M4, ukuthi eminye yale milutha inkanisa ngaserobhothini, nokwenza kube nzima ukuthi abashayeli bamele irobhothi uma livaliwe ngenxa yovalo lokushaywa kwamawindi ezimoto, bese bebanjwa inkunzi.", + "Okhulumela amaphoyisa kaMasipala weTheku, uColonel Boysie Zungu, uthe njengoba selulethiwe ezindlebeni zabo udaba lokuvalwa komgwaqo, bazalusukumela.", + "“Sizotshala amaphoyisa ukuthi abhekane nabo,” kusho uZungu.", + "Nokho ukhale ngenkinga yokuthi amaphara bahlale bewasusa ngapha, avumbuke ngapha.", + "“Sisuka kude nawo. Uzokhumbula ukuthi ake aba se-Albert Park, ku-M4. Uwajaha ngapha, bahlakazeke bahambe, ngakusasa bahlangane kwenye indawo,” kusho uZungu.", + "Uthe umasipala uphezu kwayo imizamo yesixazululo sesikhathi eside.", + "“Umasipala wayithola indawo ngapha eningizimu yeTheku, kumanje kuyaqhubeka ukwakha kuleyo ndawo. Siyacabanga ukuthi uma sekuphothuliwe ukwakhiwa kwaleso sakhiwo siyobe sesikhona isisombululo,” usho kanje.", + "Ubalule ukuthi inkinga yamaphara idinga ukubambisana kwezinhlaka ezehlukene okukhona khona imindeni lapho kuvela khona laba bantu abagcina behlala emgwaqweni, uMnyango wezokuThuthukiswa koMphakathi nezinye.", + "Maphakathi nalo nyaka, Isolezwe like labhala udaba ngesikhungo sokugcina imilutha yewunga esakhiwayo eLower Ilovu, ngaseManzimtoti. Nokho udaba lwakhona lwalumayelana nokuthi umphakathi wakule ndawo wawubhikisha, uthi awukufuni ukwakhiwa kwalesi sikhungo endaweni yawo. Wawukhala ngokuthi kuzonyuka izinga lobugebengu endaweni uma imilutha yewunga izogcinwa endaweni yawo. Ngokombiko kamasipala, le ndawo kwathiwa izoba nemibhede ebalelwa ku-450." 
+ ] + } + ] + }, + "images": [ + { + "versions": [ + { + "url": "https://iol-prod.appspot.com/image/450a2c7417d4d233f0b8ead4444281716f86381b=w700", + "query_width": null, + "size": null, + "type": null + } + ], + "is_cover": true, + "description": null, + "caption": "Adaze inkani amaphara eThekwini", + "authors": [ + "eThekwini Municipality / Facebook" + ], + "position": 178 + }, + { + "versions": [ + { + "url": "https://iol-prod.appspot.com/image/349459c84e2f9eff661e6d5373cce32e0c25ea3c=w700", + "query_width": null, + "size": null, + "type": null + } + ], + "is_cover": false, + "description": null, + "caption": "IMILUTHA yewunga esivimbanise emgwaqweni uChe Guevara (Moore Road) enkabeni yeTheku", + "authors": [ + "DOCTOR NGCOBO" + ], + "position": 197 + }, + { + "versions": [ + { + "url": "https://iol-prod.appspot.com/image/5949328626f0ce46cd37e2924d50d6be667dc82d=w700", + "query_width": null, + "size": null, + "type": null + } + ], + "is_cover": false, + "description": null, + "caption": "IMILUTHA yewunga esivimbanise emgwaqweni uChe Guevara (Moore Road) enkabeni yeTheku", + "authors": [ + "DOCTOR NGCOBO" + ], + "position": 216 + } + ], + "publishing_date": "2025-11-08 08:24:00+00:00", + "title": "Kubukeka kukude phambili amaphara edla umzila eThekwini", + "topics": [ + "homeless", + "amaphara" + ] + } +} diff --git a/tests/resources/parser/test_data/za/IsolezweLesiXhosa.json b/tests/resources/parser/test_data/za/IsolezweLesiXhosa.json new file mode 100644 index 000000000..54e8be695 --- /dev/null +++ b/tests/resources/parser/test_data/za/IsolezweLesiXhosa.json @@ -0,0 +1,49 @@ +{ + "V1": { + "authors": [ + "Jonisayi Maromo" + ], + "body": { + "summary": [], + "sections": [ + { + "headline": [], + "paragraphs": [ + "Owayesakuba nguMphathiswa wamaShishini oLuntu, uGqirha Malusi Gigaba, akabanjwanga emva kotyelelo lwakhe kwiofisi yecandelo leGunyabantu lezoTshutshiso eliphanda urhwaphilizo (IDAC) eLynnwood, ePitoli, ngentsasa yangoLwesihlanu.", + "Iingxelo 
zangaphambili zidize ukuba uGigaba uzinikele kwabasemagunyeni malunga netyala lakwaTransnet lorhwaphilizo. Nangona kunjalo, isithethi seIDAC, uHenry Mamothame, ukuqinisekisile ukuba uGigaba \"akabanjwanga\" kwaye \"akuzukuvela enkundleni namhlanje.\"", + "\"UMnu Malusi Gigaba akabanjwanga. Uthethathethana neIDAC kwaye akazukuvela enkundleni namhlanje (ngoLwesihlanu). I-IDAC ayizukuphinda ithethe ngakumbi ngalo mba,\" utshilo uMamothame kwingxelo emfutshane ethunyelwe oonondaba.", + "Utyelelo lukaGigaba kwiiofisi zeIDAC luphawula amanyathelo amatsha kwiinzame eziqhubekayo zeNPA zokutshutshisa abantu ababandakanyeka kurhwaphilizo kumaqumrhu karhulumente ngexesha lokubanjwa ngobhongwane kombuso.", + "Uphando lwakwaTransnet lugxile kwiikhontrakthi zokuthengwa kweempahla ezixabisa iibhiliyoni zeerandi, kuqukwa nesivumelwano sikaloliwe seR54 billion esanikezelwa ngexesha likaGigaba njengoMphathiswa wamaShishini oLuntu phakathi konyaka u2010 no2014.", + "Abantu abaninzi ababefudula bengabalawuli, kuquka owayesakuba yiNgqonyela yeqela lakwaTransnet uBrian Molefe kunye nowayesakuba liGosa eliPhezulu lezeMali uAnoj Singh, sele bevele enkundleni bejongene nezityholo zobuqhophololo norhwaphilizo.", + "UGigaba ebesoloko ekhanyela nabuphi na ubugwenxa kwaye ebekhe wazikhaba izityholo ezimdibanisa neshishini losapho lakwaGupta." 
+ ] + } + ] + }, + "images": [ + { + "versions": [ + { + "url": "https://iol-prod.appspot.com/image/22967847ec0a746452362a913e4b42265da59e39=w700", + "query_width": null, + "size": null, + "type": null + } + ], + "is_cover": true, + "description": null, + "caption": "UMalusi Gigaba ozinikele kwabasemagunyeni malunga netyala lakwaTransnet lorhwaphilizo akazange abanjwe.", + "authors": [ + "Phando Jikelo/Independent Newspapers" + ], + "position": 177 + } + ], + "publishing_date": "2025-11-07 11:00:52+00:00", + "title": "UGigaba akabanjwanga emva kokutyelela i-IDAC", + "topics": [ + "malusi gigaba", + "idac investigations" + ] + } +} diff --git a/tests/resources/parser/test_data/za/IsolezweLesiXhosa_2025_11_09.html.gz b/tests/resources/parser/test_data/za/IsolezweLesiXhosa_2025_11_09.html.gz new file mode 100644 index 000000000..3e006f978 Binary files /dev/null and b/tests/resources/parser/test_data/za/IsolezweLesiXhosa_2025_11_09.html.gz differ diff --git a/tests/resources/parser/test_data/za/Isolezwe_2025_11_09.html.gz b/tests/resources/parser/test_data/za/Isolezwe_2025_11_09.html.gz new file mode 100644 index 000000000..9d7b8221e Binary files /dev/null and b/tests/resources/parser/test_data/za/Isolezwe_2025_11_09.html.gz differ diff --git a/tests/resources/parser/test_data/za/meta.info b/tests/resources/parser/test_data/za/meta.info index 15dfeb1a6..2a422216c 100644 --- a/tests/resources/parser/test_data/za/meta.info +++ b/tests/resources/parser/test_data/za/meta.info @@ -3,6 +3,18 @@ "url": "https://www.dailymaverick.co.za/article/2025-05-03-icj-concludes-hearings-on-israel-aid-obligations-in-gaza/", "crawl_date": "2025-05-22 13:24:27.994785" }, + "DurbanLocal_2025_11_09.html.gz": { + "url": "https://durbanlocal.co.za/entertainment/2025-06-23-we-love-romance-declared-at-rom-con/", + "crawl_date": "2025-11-09 00:45:15.046807" + }, + "IsolezweLesiXhosa_2025_11_09.html.gz": { + "url": 
"https://isolezwelesixhosa.co.za/iindaba/2025-11-07-ugigaba-akabanjwanga-emva-kokutyelela-i-idac/", + "crawl_date": "2025-11-09 01:07:59.558667" + }, + "Isolezwe_2025_11_09.html.gz": { + "url": "https://isolezwe.co.za/izindaba/2025-11-08-kubukeka-kukude-phambili-amaphara-edla-umzila-ethekwini/", + "crawl_date": "2025-11-09 00:51:18.108142" + }, "TimesLive_2025_05_22.html.gz": { "url": "https://www.timeslive.co.za/sunday-times-daily/business/2025-05-21-2025-budget-30-sars-gets-r4bn-to-hire-army-of-debt-collectors/", "crawl_date": "2025-05-22 12:29:24.622820" diff --git a/tests/utility.py b/tests/utility.py index b7716ad03..f9356bb14 100644 --- a/tests/utility.py +++ b/tests/utility.py @@ -262,7 +262,7 @@ def write(self) -> None: def load_html_test_file_mapping(publisher: Publisher) -> Dict[Type[BaseParser], HTMLTestFile]: html_paths = (test_resource_path / Path(f"{publisher.__group__.__name__.lower()}")).glob( - f"{publisher.__name__}*.html.gz" + f"{publisher.__name__}_*.html.gz" ) html_files = [HTMLTestFile.load(path) for path in html_paths] html_mapping: Dict[Type[BaseParser], HTMLTestFile] = {}