Skip to content

Commit 3022f6f

Browse files
authored
Merge pull request #831 from flairNLP/fix-the-nation
Fix `summary_selector` for `TheNation`
2 parents 41e593d + bf16133 commit 3022f6f

File tree

6 files changed

+115
-10
lines changed

6 files changed

+115
-10
lines changed

src/fundus/publishers/us/the_nation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ class V2(V1):
106106
# use the old layout for main content, so we concatenate XPath from V1 onto V1_1.
107107

108108
_summary_selector = XPath(
109-
"//div[@class='article-header-content'] /h2 | //div[contains(@class, 'article-title')] /p"
109+
"//div[@class='article-header-content'] /h2 | //article//div[contains(@class, 'article-title')] /p"
110110
)
111111
_paragraph_selector = XPath("(//article | //div[@class='article-body-inner']) / p")
112112

src/fundus/publishers/za/__init__.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from fundus.publishers.za.dizindaba import DizindabaParser
44
from fundus.publishers.za.times_live import TimesLiveParser
55
from fundus.scraping.filter import inverse, regex_filter
6-
from fundus.scraping.url import NewsMap, Sitemap
6+
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap
77

88

99
class ZA(metaclass=PublisherGroup):
@@ -27,13 +27,10 @@ class ZA(metaclass=PublisherGroup):
2727
domain="https://www.timeslive.co.za/",
2828
parser=TimesLiveParser,
2929
sources=[
30-
NewsMap("https://www.timeslive.co.za/sitemap/google-news/times-live/news/"),
31-
NewsMap("https://www.timeslive.co.za/sitemap/google-news/times-live/politics/"),
32-
NewsMap("https://www.timeslive.co.za/sitemap/google-news/times-live/sport/"),
33-
NewsMap("https://www.timeslive.co.za/sitemap/google-news/times-live/lifestyle/"),
34-
NewsMap("https://www.timeslive.co.za/sitemap/google-news/sunday-times/news/"),
35-
NewsMap("https://www.timeslive.co.za/sitemap/google-news/sunday-times/business/"),
36-
NewsMap("https://www.timeslive.co.za/sitemap/google-news/sunday-times-daily/news/"),
30+
RSSFeed("https://www.timeslive.co.za/arc/outboundfeeds/google-news-feed/"),
31+
NewsMap("https://www.timeslive.co.za/arc/outboundfeeds/sitemap-news-index/"),
32+
Sitemap("https://www.timeslive.co.za/arc/outboundfeeds/sitemap-index/"),
33+
Sitemap("https://www.timeslive.co.za/arc/outboundfeeds/sitemap-section-index/"),
3734
],
3835
)
3936

src/fundus/publishers/za/times_live.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import datetime
2-
import re
32
from typing import List, Optional
43

54
from lxml.etree import XPath
@@ -16,6 +15,8 @@
1615

1716
class TimesLiveParser(ParserProxy):
1817
class V1(BaseParser):
18+
VALID_UNTIL = datetime.date(2025, 9, 30)
19+
1920
_paragraph_selector = XPath("//div[@class='wrap']//div[@class='text']/p[span or text()]")
2021
_summary_selector = XPath("//h3[contains(@class, 'article-title-tertiary')] ")
2122
_subheadline_selector = XPath("//div[@class='wrap']//div[@class='text']/h3")
@@ -82,3 +83,20 @@ def images(self) -> List[Image]:
8283
author_selector=XPath("./ancestor::div[contains(@class, 'image-container')]//span[@class='name']"),
8384
relative_urls=True,
8485
)
86+
87+
class V1_1(V1):
88+
VALID_UNTIL = datetime.date.today()
89+
90+
_paragraph_selector = XPath("//article/p[not(string()='TimesLIVE')]") # There are no subheadlines/summaries
91+
92+
@attribute
93+
def images(self) -> List[Image]:
94+
return image_extraction(
95+
doc=self.precomputed.doc,
96+
paragraph_selector=self._paragraph_selector,
97+
lower_boundary_selector=XPath("//div[@class='wrap']//hr"),
98+
upper_boundary_selector=XPath("//h1"),
99+
caption_selector=XPath("./ancestor::figure//span[contains(@class, 'caption')]"),
100+
author_selector=XPath("./ancestor::figure//span[contains(@class, 'credit')]"),
101+
relative_urls=True,
102+
)

tests/resources/parser/test_data/za/TimesLive.json

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,5 +85,91 @@
8585
"early retirement",
8686
"defence drc"
8787
]
88+
},
89+
"V1_1": {
90+
"authors": [
91+
"Tania Broughton"
92+
],
93+
"body": {
94+
"summary": [],
95+
"sections": [
96+
{
97+
"headline": [],
98+
"paragraphs": [
99+
"Duduzile Zuma-Sambudla had been on the state’s radar long before the July 2021 riots, which her advocate Dali Mpofu referred to as police “spying” on her.",
100+
"This emerged during day one of the trial against former president Jacob Zuma’s daughter, an MK Party MP, who is accused of “terrorism” and inciting violence through a series of social media messages she posted before and during the 2021 July unrest, which left 350 people dead in KwaZulu-Natal and Gauteng.",
101+
"Zuma-Sambudla pleaded not guilty to the charges when she appeared before Durban high court judge Mbuzeni Mathenjwa on Monday.",
102+
"Her father sat in the front row of the public gallery, along with other MK Party supporters.",
103+
"The indictment claims that the unrest was sparked by the jailing of Zuma, as ordered by the Constitutional Court after he refused to comply with its order that he appear before the Zondo Commission of Inquiry into state capture.",
104+
"Zuma-Sambudla and other supporters openly criticised this, using social media to organise, plan, incite and coordinate violent incidents.",
105+
"At her earlier bail application, she did not deny sending some of the messages, but said it was nonsensical and petty to say that people were influenced by them.",
106+
"She said the state was clutching at straws and its case against her was weak.",
107+
"The first witness at the trial was Hawks head Major-Gen Gopaul Govender, who was appointed as the national co-ordinator for the investigation into the civil arrest.",
108+
"He said Zuma-Sambudla had been a “person of interest” a long time before July 8, when the violence erupted following the incarceration of Jacob Zuma for contempt of court.",
109+
"In his evidence in chief, and under cross-examination by Mpofu, he said after the apex court had ruled, on June 29 2021, that Zuma must go to jail, there had been “intelligence gathering at a high level to monitor the temperature” in the country.",
110+
"Zuma-Sambudla had been flagged because she was an influential social media user and, on June 30, had posted to her 124,000 followers, loosely translated, “Comrades, the time to fight in this arena with the mobile phone is over.”",
111+
"She had then tweeted that the fight for radical economic transformation must be intensified “on the outside”. Then, in a series of 19 tweets, she had shared videos of the violence with words “we see you” or “I see you”.",
112+
"He said these words could be viewed as being supportive of the violence and chaos.",
113+
"Govender said the investigation revealed that of the three Twitter accounts in her name, two were fake.",
114+
"He said not all of her followers were supportive of her messaging, with some calling her out, saying she was inciting violence and calling on her to “stop playing with people’s lives”.",
115+
"He said the investigation had shown international links to the violence coming from Cuba.",
116+
"There were also people “providing finance”, but he conceded under cross-examination that she, Zuma-Sambudla, had been cleared early on in the investigation of having no direct links with the looters, had not been at any of the crime scenes, and her bank records had shown that she had not been involved in funding the unrest.",
117+
"She had also not been directly involved in any of the many WhatsApp groups which, according to Govender, were created for the sole purpose of communicating from a high level to the “ground forces” and whose members included politicians, community leaders, ward councillors and business people.",
118+
"Regarding the tweets, Mpofu adopted a “so what” attitude and said none of them incited violence.",
119+
"He said he himself followed his client on social media and questioned if that made him a criminal.",
120+
"He said the state had “drawn a blank” in its initial investigations and was now prosecuting his client on an “interpretation of her tweets”.",
121+
"“What is dangerous about a tweet that says ‘we see you’ with a video of burning premises, which had already happened and had been shared widely on social media?” he asked.",
122+
"Govender responded that it was “trending”. Zuma-Sambudla, a person of political influence and authority, the daughter of the former president, had shared it. Her posts carried weight, he said.",
123+
"“It was, like, encouraging, appreciating and supporting it,” Govender said.",
124+
"He said context and timing were important. Specifically on July 9 and 10, “South Africa was a crumbling disaster” with the majority of burning, looting and murders taking place in the first 72 hours.",
125+
"“I will argue that your evidence is completely unhelpful to the state, although it might be helpful to us. Nothing you have said sheds light on why the accused is sitting here,” said Mpofu.",
126+
"Earlier, in his opening statement, state advocate Yuri Gangai said the case was about the “power of words” and that Zuma-Sambudla, instead of being a voice to calm the nation, had not shown restraint.",
127+
"He said she had understood the extent of the volatility and had chosen to fuel it from the comfort of her home.",
128+
"The state is expected to call a cybercrime expert during the course of the next two weeks.",
129+
"On Tuesday, it will call Sarah-Jane Trent, executive director of Forensics for Justice (owned by Paul O’Sullivan), who laid the complaint against Zuma-Sambudla with the Hawks."
130+
]
131+
}
132+
]
133+
},
134+
"images": [
135+
{
136+
"versions": [
137+
{
138+
"url": "https://www.timeslive.co.za/resizer/v2/DLJSG4UH2NFNZP2HPZCYAG4GSM?auth=91191352fb42187691cd436f4cb1d91783ac8963f243097ffa4a879d7a49e4fc&width=800&height=450&smart=true",
139+
"query_width": "max-width:800",
140+
"size": {
141+
"width": 800,
142+
"height": 450
143+
},
144+
"type": null
145+
},
146+
{
147+
"url": "https://www.timeslive.co.za/resizer/v2/DLJSG4UH2NFNZP2HPZCYAG4GSM?auth=91191352fb42187691cd436f4cb1d91783ac8963f243097ffa4a879d7a49e4fc&width=1600&height=900&smart=true",
148+
"query_width": "max-width:1600",
149+
"size": {
150+
"width": 1600,
151+
"height": 900
152+
},
153+
"type": null
154+
}
155+
],
156+
"is_cover": true,
157+
"description": "Duduzile Zuma-Sambudla will face the music for her role in the July civil unrest when her terrorism trial is heard in November.",
158+
"caption": "Duduzile Zuma-Sambudla will face the music for her role in the July civil unrest when her terrorism trial is heard in November.",
159+
"authors": [
160+
"NPA"
161+
],
162+
"position": 342
163+
}
164+
],
165+
"publishing_date": "2025-11-10 16:21:53+00:00",
166+
"title": "Duduzile Zuma-Sambudla’s legal team argues state has been ‘spying’ on her way before July riots",
167+
"topics": [
168+
"Duduzile Zuma-Sambudla",
169+
"2021 unrest",
170+
"courts",
171+
"Dali Mpofu",
172+
"Jacob Zuma"
173+
]
88174
}
89175
}
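Binary file not shown (presumably the new gzipped HTML test fixture `TimesLive_2025_11_10.html.gz` that is registered in `meta.info` below).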

tests/resources/parser/test_data/za/meta.info

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,9 @@
1010
"TimesLive_2025_05_22.html.gz": {
1111
"url": "https://www.timeslive.co.za/sunday-times-daily/business/2025-05-21-2025-budget-30-sars-gets-r4bn-to-hire-army-of-debt-collectors/",
1212
"crawl_date": "2025-05-22 12:29:24.622820"
13+
},
14+
"TimesLive_2025_11_10.html.gz": {
15+
"url": "https://www.timeslive.co.za/news/south-africa/2025-11-10-dududzile-zuma-sambudlas-legal-team-argues-state-has-been-spying-on-her-way-before-july-riots/",
16+
"crawl_date": "2025-11-10 23:05:51.333562"
1317
}
1418
}

0 commit comments

Comments
 (0)