Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 42 additions & 1 deletion wp1/logic/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def delete_builder(wp10db, user_id, builder_id):
rq_cancel_success = True
with wp10db.cursor() as cursor:
cursor.execute(
"""SELECT s_rq_job_id, s_id FROM zim_schedules
"""SELECT s_rq_job_id, s_id FROM zim_schedules
WHERE s_builder_id = %s AND s_rq_job_id IS NOT NULL""",
(builder_id,),
)
Expand Down Expand Up @@ -570,6 +570,7 @@ def request_zim_file_task_for_builder(
selection = latest_selection_for(wp10db, builder.b_id, "text/tab-separated-values")

with wp10db.cursor() as cursor:
# First, try to update the zim_task for the current selection
cursor.execute(
"""UPDATE zim_tasks SET
z_status = 'REQUESTED', z_task_id = %s, z_zim_schedule_id = %s, z_requested_at = %s
Expand All @@ -582,6 +583,46 @@ def request_zim_file_task_for_builder(
selection.s_id,
),
)
rows_updated = cursor.rowcount

# If no rows were updated, the selection version may have changed since the
# zim_task was created. Try to update by zim_schedule_id instead (this is the
# path taken when regenerating failed ZIMs).
if rows_updated == 0 and zim_schedule_id:
cursor.execute(
"""UPDATE zim_tasks SET
z_status = 'REQUESTED', z_task_id = %s, z_selection_id = %s, z_requested_at = %s
WHERE z_zim_schedule_id = %s
""",
(
task_id,
selection.s_id,
utcnow().strftime(TS_FORMAT_WP10),
zim_schedule_id,
),
)
rows_updated = cursor.rowcount

# If still no rows were updated then we need to insert a new zim_task
if rows_updated == 0:
cursor.execute(
"""INSERT INTO zim_tasks (z_selection_id, z_zim_schedule_id, z_status, z_task_id, z_requested_at)
VALUES (%s, %s, 'REQUESTED', %s, %s)
""",
(
selection.s_id,
zim_schedule_id,
task_id,
utcnow().strftime(TS_FORMAT_WP10),
),
)

# Update b_selection_zim_version to point to the current selection version.
# This ensures the download URL lookup finds the zim_task with the new task_id.
cursor.execute(
"""UPDATE builders SET b_selection_zim_version = %s WHERE b_id = %s""",
(selection.s_version, builder.b_id),
)

cursor.execute(
"""SELECT * FROM zim_tasks WHERE z_selection_id = %s""", (selection.s_id,)
)
Expand Down
195 changes: 195 additions & 0 deletions wp1/logic/builder_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,48 @@ def _insert_selection(
)
self.wp10db.commit()

def _setup_failed_zim_regeneration_scenario(
    self,
    zim_schedule_id=b"schedule-123",
    old_task_id="old-task-id",
    create_new_selection=False,
):
    """Seed the database with a FAILED zim_task attached to selection v1.

    Inserts a zim schedule for the test builder, a version-1 selection
    (s_id 1) and a zim_tasks row in status 'FAILED' that references both.
    When ``create_new_selection`` is true, a version-2 selection (s_id 2) is
    also inserted and the builder's ``b_current_version`` is set to 2.

    Args:
        zim_schedule_id: Id used for the inserted schedule and stored on the
            failed task.
        old_task_id: ``z_task_id`` stored on the failed task.
        create_new_selection: Whether to also create selection v2.

    Returns:
        The ``zim_schedule_id`` that was passed in.
    """
    self._insert_zim_schedule(zim_schedule_id, self.builder.b_id)
    builder_id = self.builder.b_id

    insert_selection_v1 = (
        "INSERT INTO selections\n"
        "(s_id, s_builder_id, s_updated_at, s_content_type, s_version,\n"
        "s_object_key, s_article_count)\n"
        "VALUES (%s, %s, '20250102000000', 'text/tab-separated-values', 1,\n"
        "'old.tsv', 100)"
    )
    insert_failed_task = (
        "INSERT INTO zim_tasks\n"
        "(z_selection_id, z_zim_schedule_id, z_status, z_task_id)\n"
        "VALUES (%s, %s, 'FAILED', %s)"
    )
    insert_selection_v2 = (
        "INSERT INTO selections\n"
        "(s_id, s_builder_id, s_updated_at, s_content_type, s_version,\n"
        "s_object_key, s_article_count)\n"
        "VALUES (%s, %s, '20250103000000', 'text/tab-separated-values', 2,\n"
        "'new.tsv', 100)"
    )
    bump_current_version = (
        "UPDATE builders SET b_current_version = 2 WHERE b_id = %s"
    )

    # Selection v1 and its failed ZIM task are always created; selection v2
    # and the version bump are opt-in.
    statements = [
        (insert_selection_v1, (1, builder_id)),
        (insert_failed_task, (1, zim_schedule_id, old_task_id)),
    ]
    if create_new_selection:
        statements.append((insert_selection_v2, (2, builder_id)))
        statements.append((bump_current_version, (builder_id,)))

    with self.wp10db.cursor() as cursor:
        for sql, params in statements:
            cursor.execute(sql, params)

    self.wp10db.commit()
    return zim_schedule_id

def _get_builder_by_user_id(self):
with self.wp10db.cursor() as cursor:
cursor.execute(
Expand Down Expand Up @@ -1793,3 +1835,156 @@ def test_get_builder_module_class_missing_builder_class(self, mock_import):
logic_builder.get_builder_module_class("wp1.selection.models.simple")

self.assertIn("Builder class not found in module", str(cm.exception))

@patch("wp1.logic.builder.zimfarm.request_zimfarm_task")
@patch(
    "wp1.logic.builder.utcnow", return_value=datetime.datetime(2025, 1, 2, 0, 0, 0)
)
def test_regenerate_zim_updates_old_task_when_selection_version_changed(
    self, mock_utcnow, mock_request_zimfarm_task
):
    """Ensure the existing zim_task is updated (not duplicated) when the
    selection version changes.

    The scenario helper creates a FAILED task tied to selection 1 plus a
    newer selection 2. After requesting a new ZIM for the same schedule
    there must still be exactly one zim_tasks row, and it must reference
    selection 2.
    """
    self._insert_builder()
    zim_schedule_id = self._setup_failed_zim_regeneration_scenario(
        create_new_selection=True
    )

    mock_request_zimfarm_task.return_value = "new-task-id"

    redis = MagicMock()
    # The return value is not inspected here; only the resulting database
    # state matters for this test.
    logic_builder.request_zim_file_task_for_builder(
        redis, self.wp10db, self.builder, zim_schedule_id=zim_schedule_id
    )

    with self.wp10db.cursor() as cursor:
        cursor.execute("SELECT COUNT(*) as count FROM zim_tasks")
        count = cursor.fetchone()["count"]
        cursor.execute("SELECT z_selection_id FROM zim_tasks")
        row = cursor.fetchone()

    self.assertEqual(1, count)
    self.assertEqual(b"2", row["z_selection_id"])

@patch("wp1.logic.builder.zimfarm.request_zimfarm_task")
@patch(
    "wp1.logic.builder.utcnow", return_value=datetime.datetime(2025, 1, 2, 0, 0, 0)
)
def test_regenerate_zim_saves_new_task_id(
    self, mock_utcnow, mock_request_zimfarm_task
):
    """Verify that the task id returned by Zimfarm replaces the old one.

    A failed task with id ``task_v1`` exists for selection 1; the mocked
    Zimfarm request returns ``task_v2``. Both the returned task object and
    the stored zim_tasks row must then carry ``task_v2``.
    """
    self._insert_builder()
    schedule_id = self._setup_failed_zim_regeneration_scenario(
        old_task_id="task_v1"
    )
    mock_request_zimfarm_task.return_value = "task_v2"

    zim_task = logic_builder.request_zim_file_task_for_builder(
        MagicMock(), self.wp10db, self.builder, zim_schedule_id=schedule_id
    )

    # The object handed back to the caller reflects the new request.
    self.assertIsNotNone(zim_task)
    self.assertEqual(b"task_v2", zim_task.z_task_id)
    self.assertEqual(b"REQUESTED", zim_task.z_status)

    # The persisted row for selection 1 was updated as well.
    with self.wp10db.cursor() as cursor:
        cursor.execute("SELECT z_task_id FROM zim_tasks WHERE z_selection_id = 1")
        stored = cursor.fetchone()

    self.assertEqual(b"task_v2", stored["z_task_id"])

@patch("wp1.logic.builder.zimfarm.request_zimfarm_task")
@patch(
    "wp1.logic.builder.utcnow", return_value=datetime.datetime(2025, 1, 2, 0, 0, 0)
)
def test_regenerate_zim_updates_b_selection_zim_version(
    self, mock_utcnow, mock_request_zimfarm_task
):
    """Test that b_selection_zim_version is updated for downloads to work.

    The builder starts with b_selection_zim_version 1 while a version-2
    selection exists. After requesting a new ZIM, the stored
    b_selection_zim_version must be 2.
    """

    def stored_zim_version():
        # Read the value straight from the database rather than from the
        # in-memory builder object.
        with self.wp10db.cursor() as cursor:
            cursor.execute(
                "SELECT b_selection_zim_version FROM builders WHERE b_id = %s",
                (self.builder.b_id,),
            )
            return cursor.fetchone()["b_selection_zim_version"]

    self._insert_builder(zim_version=1)
    zim_schedule_id = self._setup_failed_zim_regeneration_scenario(
        old_task_id="old-task", create_new_selection=True
    )

    self.assertEqual(1, stored_zim_version())

    mock_request_zimfarm_task.return_value = "task_v2"

    redis = MagicMock()
    # Only the side effect on the builders table is checked, so the return
    # value is intentionally discarded.
    logic_builder.request_zim_file_task_for_builder(
        redis, self.wp10db, self.builder, zim_schedule_id=zim_schedule_id
    )

    self.assertEqual(2, stored_zim_version())

@patch("wp1.logic.builder.zimfarm.zim_file_url_for_task_id")
def test_download_url_after_regeneration(self, mock_zim_file_url):
    """Check the download URL lookup when both a failed and a ready ZIM exist.

    Selection v1 has a FAILED task and selection v2 has a FILE_READY task;
    the builder's current and ZIM versions are both set to 2. The URL helper
    must be called exactly once, with the task id of the v2 task.
    """
    mock_zim_file_url.return_value = "https://download.kiwix.org/zim/new-file.zim"
    self._insert_builder()
    builder_id = self.builder.b_id

    insert_selection_v1 = (
        "INSERT INTO selections\n"
        "(s_id, s_builder_id, s_updated_at, s_content_type, s_version, s_object_key, s_article_count)\n"
        "VALUES (%s, %s, '20230101000000', 'text/tab-separated-values', 1, 'old.tsv', 100)"
    )
    insert_failed_task = (
        "INSERT INTO zim_tasks\n"
        "(z_selection_id, z_status, z_task_id)\n"
        "VALUES (%s, 'FAILED', 'old-failed-task')"
    )
    insert_selection_v2 = (
        "INSERT INTO selections\n"
        "(s_id, s_builder_id, s_updated_at, s_content_type, s_version, s_object_key, s_article_count)\n"
        "VALUES (%s, %s, '20230102000000', 'text/tab-separated-values', 2, 'new.tsv', 100)"
    )
    insert_ready_task = (
        "INSERT INTO zim_tasks\n"
        "(z_selection_id, z_status, z_task_id)\n"
        "VALUES (%s, 'FILE_READY', 'new-successful-task')"
    )
    point_builder_at_v2 = (
        "UPDATE builders SET\n"
        "b_current_version = 2,\n"
        "b_selection_zim_version = 2\n"
        "WHERE b_id = %s"
    )

    # Executed in order: v1 selection + failed task, v2 selection + ready
    # task, then the builder version pointers.
    fixtures = (
        (insert_selection_v1, (1, builder_id)),
        (insert_failed_task, (1,)),
        (insert_selection_v2, (2, builder_id)),
        (insert_ready_task, (2,)),
        (point_builder_at_v2, (builder_id,)),
    )
    with self.wp10db.cursor() as cursor:
        for sql, params in fixtures:
            cursor.execute(sql, params)
    self.wp10db.commit()

    download_url = logic_builder.latest_zim_file_url_for(self.wp10db, builder_id)

    self.assertIsNotNone(download_url)
    mock_zim_file_url.assert_called_once_with(b"new-successful-task")
11 changes: 7 additions & 4 deletions wp1/zimfarm.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,10 +371,13 @@ def create_or_update_zimfarm_schedule(
)
try:
existing_zim_schedule = find_existing_schedule_in_db(wp10db, builder.b_id)
if existing_zim_schedule and zimfarm_schedule_exists(
redis, existing_zim_schedule.s_id.decode("utf-8")
):
r = requests.patch("%s/schedules" % base_url, headers=headers, json=params)
if existing_zim_schedule and zimfarm_schedule_exists(redis, builder_id):
schedule_name = get_zimfarm_schedule_name(builder_id)
r = requests.patch(
"%s/schedules/%s" % (base_url, schedule_name),
headers=headers,
json=params,
)
r.raise_for_status()
zim_schedule = existing_zim_schedule
zim_schedule.s_title = title.encode("utf-8")
Expand Down
Loading