Commit d040e10

Fix: rebuild missing dataset files (#1417)
1 parent d4c983e commit d040e10

3 files changed: +140 additions, -16 deletions

functions-python/tasks_executor/src/tasks/dataset_files/README.md

Lines changed: 12 additions & 2 deletions
@@ -19,7 +19,8 @@ The function accepts the following payload:
 {
   "dry_run": true, // [optional] If true, do not upload or modify the database (default: true)
   "after_date": "YYYY-MM-DD", // [optional] Only include datasets downloaded after this ISO date
-  "latest_only": true // [optional] If true, only process the latest version of each dataset (default: true)
+  "latest_only": true, // [optional] If true, only process the latest version of each dataset (default: true)
+  "dataset_id": id // [optional] If provided, only process the specified dataset. It supersedes the after_date and latest_only parameters.
 }
 ```
 
@@ -32,7 +33,13 @@ The function accepts the following payload:
   "latest_only": true
 }
 ```
-
+or
+```json
+{
+  "dry_run": false,
+  "dataset_id": "mdb-1147-202407031702"
+}
+```
 ---
 
 ## What It Does
@@ -52,6 +59,9 @@ For each GTFS dataset with missing file information (missing zipped/unzipped siz
 7. Computes SHA256 hashes for each file
 8. Stores metadata in the `Gtfsfile` table for later use
 
+If the `dataset_id` parameter is provided, the process is slightly simplified: the dataset is not downloaded, since it
+is assumed to already be present in the bucket. The rest of the processing is the same.
+
 ---
 
 ## GCP Environment Variables
functions-python/tasks_executor/src/tasks/dataset_files/rebuild_missing_dataset_files.py

Lines changed: 27 additions & 12 deletions
@@ -23,9 +23,13 @@ def rebuild_missing_dataset_files_handler(payload) -> dict:
     dry_run = payload.get("dry_run", True)
     after_date = payload.get("after_date", None)
     latest_only = payload.get("latest_only", True)
+    dataset_id = payload.get("dataset_id", None)
 
     return rebuild_missing_dataset_files(
-        dry_run=dry_run, after_date=after_date, latest_only=latest_only
+        dry_run=dry_run,
+        after_date=after_date,
+        latest_only=latest_only,
+        dataset_id=dataset_id,
     )
 
 
@@ -67,6 +71,7 @@ def rebuild_missing_dataset_files(
     dry_run: bool = True,
     after_date: str = None,
     latest_only: bool = True,
+    dataset_id: str = None,
 ) -> dict:
     """
     Processes GTFS datasets missing extracted files and updates database.
@@ -76,13 +81,22 @@
         dry_run (bool): If True, only logs how many would be processed.
         after_date (str): Only consider datasets downloaded after this ISO date.
         latest_only (bool): Whether to include only latest datasets.
+        dataset_id (str | None): If provided, only process the dataset with this stable id.
 
     Returns:
         dict: Result summary.
     """
-    datasets = get_datasets_with_missing_files_query(
-        db_session, after_date=after_date, latest_only=latest_only
-    )
+
+    if dataset_id:
+        datasets = (
+            db_session.query(Gtfsdataset)
+            .filter(Gtfsdataset.stable_id == dataset_id)
+            .options(joinedload(Gtfsdataset.feed))
+        )
+    else:
+        datasets = get_datasets_with_missing_files_query(
+            db_session, after_date=after_date, latest_only=latest_only
+        )
 
     if dry_run:
         total = datasets.count()
@@ -102,6 +116,9 @@
     logging.info("Starting to process datasets with missing files...")
     execution_id = f"task-executor-uuid-{uuid.uuid4()}"
     messages = []
+    all_datasets_count = datasets.count()
+    topic = os.getenv("DATASET_PROCESSING_TOPIC_NAME")
+
     for dataset in datasets.all():
         try:
             message = {
@@ -124,16 +141,13 @@
             count += 1
             total_processed += 1
 
-            if count % batch_count == 0:
-                publish_messages(
-                    messages,
-                    os.getenv("PROJECT_ID"),
-                    os.getenv("DATASET_PROCESSING_TOPIC_NAME"),
-                )
+            if count % batch_count == 0 or all_datasets_count == count:
+                publish_messages(messages, os.getenv("PROJECT_ID"), topic)
                 messages = []
                 logging.info(
-                    "Published message for %d datasets. Total processed: %d",
-                    batch_count,
+                    "Published message to topic %s for %d datasets. Total processed: %d",
+                    topic,
+                    batch_count if count % batch_count == 0 else all_datasets_count - count,
                     total_processed,
                 )
 
@@ -147,6 +161,7 @@
             "after_date": after_date,
             "latest_only": latest_only,
             "datasets_bucket_name": os.environ.get("DATASETS_BUCKET_NAME"),
+            "dataset_id": dataset_id,
         },
     }
     logging.info("Task summary: %s", result)
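
The new flush condition (every `batch_count` messages, plus a final flush once the last dataset is reached) is a standard batch-and-publish pattern. Below is a minimal standalone sketch of that pattern with a stub publisher in place of the module's `publish_messages` helper; all names in it are illustrative only.

```python
# Standalone illustration of the batch-and-flush pattern; not the module's actual code.
def publish_batch(messages: list[dict]) -> None:
    # Stand-in for a Pub/Sub publish call.
    print(f"published {len(messages)} messages")


def process(datasets: list[dict], batch_count: int = 5) -> int:
    messages, count, total = [], 0, len(datasets)
    for dataset in datasets:
        messages.append({"dataset_stable_id": dataset["stable_id"]})
        count += 1
        # Flush a full batch, or whatever is left once the last dataset is seen.
        if count % batch_count == 0 or count == total:
            publish_batch(messages)
            messages = []
    return count


# With 12 datasets and batch_count=5, this publishes batches of 5, 5, and 2.
process([{"stable_id": f"ds-{i}"} for i in range(12)], batch_count=5)
```
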

functions-python/tasks_executor/tests/tasks/dataset_files/test_rebuild_missing_dataset_files.py

Lines changed: 101 additions & 2 deletions
@@ -18,7 +18,8 @@
 import os
 import unittest
 from datetime import datetime
-from unittest.mock import patch
+from types import SimpleNamespace
+from unittest.mock import patch, MagicMock
 
 from sqlalchemy.orm import Session
 
@@ -46,7 +47,7 @@ def test_handler_calls_main_function(self, mock_rebuild_func):
 
         self.assertEqual(response["message"], "test")
         mock_rebuild_func.assert_called_once_with(
-            dry_run=True, after_date="2024-01-01", latest_only=False
+            dry_run=True, after_date="2024-01-01", latest_only=False, dataset_id=None
         )
 
     @with_db_session(db_url=default_db_url)
@@ -93,3 +94,101 @@ def test_rebuild_missing_dataset_files_processing(
         self.assertIn("completed", response["message"])
         self.assertGreaterEqual(response["total_processed"], 0)
         self.assertTrue(publish_mock.called or response["total_processed"] == 0)
+
+
+class TestRebuildSpecificDatasetFiles(unittest.TestCase):
+    @patch(
+        "tasks.dataset_files.rebuild_missing_dataset_files.rebuild_missing_dataset_files"
+    )
+    def test_handler_calls_main_function(self, mock_rebuild_func):
+        mock_rebuild_func.return_value = {"message": "test", "total_processed": 0}
+        payload = {"dry_run": True, "after_date": "2024-01-01", "latest_only": False}
+
+        response = rebuild_missing_dataset_files_handler(payload)
+
+        self.assertEqual(response["message"], "test")
+        mock_rebuild_func.assert_called_once_with(
+            dry_run=True, after_date="2024-01-01", latest_only=False, dataset_id=None
+        )
+
+    @patch(
+        "tasks.dataset_files.rebuild_missing_dataset_files.rebuild_missing_dataset_files"
+    )
+    def test_handler_forwards_dataset_id(self, mock_rebuild_func):
+        payload = {
+            "dry_run": False,
+            "after_date": None,
+            "latest_only": True,
+            "dataset_id": "ds-123",
+        }
+
+        rebuild_missing_dataset_files_handler(payload)
+
+        mock_rebuild_func.assert_called_once_with(
+            dry_run=False, after_date=None, latest_only=True, dataset_id="ds-123"
+        )
+
+    def test_rebuild_with_specific_dataset_id_publishes_one_message(self):
+        dataset_stable_id = "ds-123"
+        fake_feed = SimpleNamespace(
+            producer_url="https://example.com",
+            stable_id="feed-stable",
+            id=42,
+            authentication_type=None,
+            authentication_info_url=None,
+            api_key_parameter_name=None,
+        )
+        fake_dataset = SimpleNamespace(
+            stable_id=dataset_stable_id, hash="abc123", feed=fake_feed
+        )
+
+        # Mock the chained SQLAlchemy calls:
+        # db_session.query(Gtfsdataset).filter(...).options(...).count()/all()
+        db_session = MagicMock()
+        query_mock = MagicMock()
+        filter_mock = MagicMock()
+        options_mock = MagicMock()
+
+        db_session.query.return_value = query_mock
+        query_mock.filter.return_value = filter_mock
+        filter_mock.options.return_value = options_mock
+
+        options_mock.count.return_value = 1
+        options_mock.all.return_value = [fake_dataset]
+
+        with patch.dict(
+            os.environ,
+            {"PROJECT_ID": "test-project", "DATASET_PROCESSING_TOPIC_NAME": "topic"},
+            clear=False,
+        ), patch(
+            "tasks.dataset_files.rebuild_missing_dataset_files.get_datasets_with_missing_files_query"
+        ) as get_query_mock, patch(
+            "tasks.dataset_files.rebuild_missing_dataset_files.publish_messages"
+        ) as mock_publish:
+            from tasks.dataset_files.rebuild_missing_dataset_files import (
+                rebuild_missing_dataset_files,
+                Gtfsdataset,
+            )
+
+            result = rebuild_missing_dataset_files(
+                db_session=db_session,
+                dry_run=False,
+                after_date=None,
+                latest_only=True,  # ignored when dataset_id is provided
+                dataset_id=dataset_stable_id,
+            )
+
+            # Asserts
+            get_query_mock.assert_not_called()  # bypasses generic query when dataset_id is set
+            db_session.query.assert_called_once_with(Gtfsdataset)
+            query_mock.filter.assert_called_once()  # filtered by stable_id
+            options_mock.count.assert_called_once()
+            options_mock.all.assert_called_once()
+
+            self.assertEqual(result["total_processed"], 1)
+            mock_publish.assert_called_once()
+
+            messages_arg, project_id_arg, _topic_arg = mock_publish.call_args[0]
+            self.assertEqual(project_id_arg, "test-project")
+            self.assertEqual(len(messages_arg), 1)
+            self.assertEqual(messages_arg[0]["dataset_stable_id"], dataset_stable_id)
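
The test above wires each link of the SQLAlchemy query chain to its own mock. As a general `unittest.mock` note (not part of this commit), the same chain can be expressed more compactly by relying on `MagicMock` auto-creating child mocks:

```python
from unittest.mock import MagicMock

db_session = MagicMock()
# query(...).filter(...).options(...) all resolve through auto-created child mocks,
# so only the terminal calls need canned return values.
chain = db_session.query.return_value.filter.return_value.options.return_value
chain.count.return_value = 1
chain.all.return_value = ["fake_dataset"]

assert db_session.query("Gtfsdataset").filter().options().count() == 1
assert db_session.query("Gtfsdataset").filter().options().all() == ["fake_dataset"]
```
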
