Skip to content

Commit 453f002

Browse files
authored
Merge pull request #1266 from MobilityData/1204-bounding-boxes-are-missing-for-many-feeds-with-good-stopstxt-files
fix: 1204 bounding boxes are missing for many feeds with good stopstxt files
2 parents c201c5a + 7cede2d commit 453f002

File tree

7 files changed

+308
-3
lines changed

7 files changed

+308
-3
lines changed

functions-python/helpers/query_helper.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,3 +173,28 @@ def get_datasets_with_missing_reports_query(
173173
Gtfsdataset.stable_id, Gtfsfeed.stable_id
174174
)
175175
return query
176+
177+
178+
def get_feeds_with_missing_bounding_boxes_query(
179+
db_session: Session,
180+
) -> Query:
181+
"""
182+
Get GTFS feeds with datasets missing bounding boxes.
183+
184+
Args:
185+
db_session: SQLAlchemy session
186+
187+
Returns:
188+
A SQLAlchemy query object for GTFS feeds with datasets missing bounding boxes
189+
ordered by feed stable id.
190+
"""
191+
query = (
192+
db_session.query(Gtfsfeed)
193+
.join(Gtfsdataset, Gtfsdataset.feed_id == Gtfsfeed.id)
194+
.filter(Gtfsdataset.latest.is_(True))
195+
.filter(Gtfsdataset.bounding_box.is_(None))
196+
.filter(~Gtfsfeed.feedlocationgrouppoints.any())
197+
.distinct(Gtfsfeed.stable_id, Gtfsdataset.stable_id)
198+
.order_by(Gtfsdataset.stable_id, Gtfsfeed.stable_id)
199+
)
200+
return query

functions-python/tasks_executor/README.md

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
This directory contains Google Cloud Functions used as a single point of access to multiple _tasks_.
44

55
## Usage
6+
67
The function receive the following payload:
8+
79
```
810
{
911
"task": "string", # [required] Name of the task to execute
@@ -12,6 +14,7 @@ The function receive the following payload:
1214
```
1315

1416
Example:
17+
1518
```json
1619
{
1720
"task": "rebuild_missing_validation_reports",
@@ -21,11 +24,22 @@ Example:
2124
"filter_statuses": ["active", "inactive", "future"]
2225
}
2326
}
27+
{
28+
"task": "rebuild_missing_bounding_boxes",
29+
"payload": {
30+
"dry_run": true,
31+
"after_date": "2025-06-01"
32+
}
33+
}
2434
```
35+
2536
To get the list of supported tasks use:
2637
``
2738
{
28-
"name": "list_tasks",
29-
"payload": {}
39+
"name": "list_tasks",
40+
"payload": {}
3041
}
31-
`````
42+
43+
```
44+
45+
```

functions-python/tasks_executor/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ geoalchemy2==0.14.7
1616

1717
# Google specific packages for this function
1818
google-cloud-workflows
19+
google-cloud-pubsub
1920
flask
2021

2122
# Configuration

functions-python/tasks_executor/src/main.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@
2222
from tasks.validation_reports.rebuild_missing_validation_reports import (
2323
rebuild_missing_validation_reports_handler,
2424
)
25+
from tasks.missing_bounding_boxes.rebuild_missing_bounding_boxes import (
26+
rebuild_missing_bounding_boxes_handler,
27+
)
2528

2629

2730
init_logger()
@@ -42,6 +45,10 @@
4245
"description": "Rebuilds missing validation reports for GTFS datasets.",
4346
"handler": rebuild_missing_validation_reports_handler,
4447
},
48+
"rebuild_missing_bounding_boxes": {
49+
"description": "Rebuilds missing bounding boxes for GTFS datasets that contain valid stops.txt files.",
50+
"handler": rebuild_missing_bounding_boxes_handler,
51+
},
4552
}
4653

4754

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
import logging
2+
import os
3+
from typing import Dict, List
4+
from sqlalchemy.orm import Session
5+
6+
from shared.database.database import with_db_session
7+
from shared.helpers.pub_sub import publish_messages
8+
from shared.helpers.query_helper import (
9+
get_feeds_with_missing_bounding_boxes_query,
10+
)
11+
from shared.database_gen.sqlacodegen_models import Gtfsdataset
12+
from datetime import datetime
13+
14+
15+
def rebuild_missing_bounding_boxes_handler(payload) -> dict:
16+
(dry_run, after_date) = get_parameters(payload)
17+
18+
return rebuild_missing_bounding_boxes(
19+
dry_run=dry_run,
20+
after_date=after_date,
21+
)
22+
23+
24+
@with_db_session
25+
def rebuild_missing_bounding_boxes(
26+
dry_run: bool = True,
27+
after_date: str = None,
28+
db_session: Session | None = None,
29+
) -> dict:
30+
"""
31+
Find GTFS feeds/datasets missing bounding boxes and either log or publish them for processing.
32+
33+
Args:
34+
dry_run (bool): If True, only logs the number of feeds found (no publishing).
35+
after_date (str, optional): ISO date string (YYYY-MM-DD). Only datasets downloaded after this date are included.
36+
db_session (Session, optional): SQLAlchemy session, injected by @with_db_session.
37+
38+
Returns:
39+
dict: Summary message and count of processed feeds.
40+
"""
41+
filter_after = None
42+
if after_date:
43+
try:
44+
filter_after = datetime.fromisoformat(after_date)
45+
except Exception:
46+
logging.warning(
47+
"Invalid after_date format, expected ISO format (YYYY-MM-DD)"
48+
)
49+
query = get_feeds_with_missing_bounding_boxes_query(db_session)
50+
if filter_after:
51+
query = query.filter(Gtfsdataset.downloaded_at >= filter_after)
52+
feeds = query.all()
53+
54+
if dry_run:
55+
total_processed = len(feeds)
56+
logging.info(
57+
"Dry run mode: %s feeds with missing bounding boxes found, filtered after %s.",
58+
total_processed,
59+
after_date,
60+
)
61+
return {
62+
"message": f"Dry run: {total_processed} feeds with missing bounding boxes found."
63+
+ (f" Filtered after: {filter_after}" if filter_after else ""),
64+
"total_processed": total_processed,
65+
}
66+
else:
67+
# publish a message to a Pub/Sub topic for each feed
68+
pubsub_topic_name = os.getenv("PUBSUB_TOPIC_NAME", None)
69+
project_id = os.getenv("PROJECT_ID")
70+
71+
logging.info("Publishing to topic: %s", pubsub_topic_name)
72+
feeds_data = prepare_feeds_data(db_session)
73+
publish_messages(feeds_data, project_id, pubsub_topic_name)
74+
75+
total_processed = len(feeds_data)
76+
logging.info(
77+
"Published %s feeds with missing bounding boxes to Pub/Sub topic: %s, filtered after %s.",
78+
total_processed,
79+
pubsub_topic_name,
80+
after_date,
81+
)
82+
return {
83+
"message": f"Successfully published {total_processed} feeds with missing bounding boxes."
84+
+ (f" Filtered after: {filter_after}" if filter_after else ""),
85+
"total_processed": total_processed,
86+
}
87+
88+
89+
def prepare_feeds_data(db_session: Session | None = None) -> List[Dict]:
90+
"""
91+
Format feeds data for Pub/Sub messages.
92+
93+
Args:
94+
feeds: List of Gtfsfeed objects
95+
96+
Returns:
97+
List of dictionaries with feed data
98+
"""
99+
data = []
100+
query = get_feeds_with_missing_bounding_boxes_query(db_session)
101+
feeds = query.all()
102+
103+
for feed in feeds:
104+
# Get the latest dataset
105+
if feed.gtfsdatasets and any(dataset.latest for dataset in feed.gtfsdatasets):
106+
latest_dataset = next(
107+
dataset for dataset in feed.gtfsdatasets if dataset.latest
108+
)
109+
110+
data.append(
111+
{
112+
"stable_id": feed.stable_id,
113+
"dataset_id": latest_dataset.stable_id,
114+
"url": latest_dataset.hosted_url,
115+
}
116+
)
117+
118+
return data
119+
120+
121+
def get_parameters(payload):
122+
"""
123+
Get parameters from the payload and environment variables.
124+
125+
Args:
126+
payload (dict): dictionary containing the payload data.
127+
Returns:
128+
tuple: (dry_run, after_date)
129+
"""
130+
dry_run = payload.get("dry_run", True)
131+
dry_run = dry_run if isinstance(dry_run, bool) else str(dry_run).lower() == "true"
132+
after_date = payload.get("after_date", None)
133+
return dry_run, after_date
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
import unittest
2+
from unittest.mock import patch, MagicMock
3+
from datetime import datetime
4+
5+
from tasks.missing_bounding_boxes.rebuild_missing_bounding_boxes import (
6+
get_parameters,
7+
rebuild_missing_bounding_boxes,
8+
)
9+
10+
11+
class TestTasksExecutor(unittest.TestCase):
12+
def test_get_parameters(self):
13+
payload = {"dry_run": True}
14+
dry_run, after_date = get_parameters(payload)
15+
self.assertTrue(dry_run)
16+
self.assertIsNone(after_date)
17+
18+
def test_get_parameters_with_valid_after_date(self):
19+
payload = {"dry_run": False, "after_date": "2024-06-01"}
20+
dry_run, after_date = get_parameters(payload)
21+
self.assertFalse(dry_run)
22+
self.assertEqual(after_date, "2024-06-01")
23+
# Check ISO format
24+
try:
25+
datetime.fromisoformat(after_date)
26+
except ValueError:
27+
self.fail(f"after_date '{after_date}' is not a valid ISO date string")
28+
29+
def test_get_parameters_with_string_bool(self):
30+
payload = {"dry_run": "false", "after_date": None}
31+
dry_run, after_date = get_parameters(payload)
32+
self.assertFalse(dry_run)
33+
self.assertIsNone(after_date)
34+
35+
def test_get_parameters_missing_keys(self):
36+
payload = {}
37+
dry_run, after_date = get_parameters(payload)
38+
self.assertTrue(dry_run)
39+
self.assertIsNone(after_date)
40+
41+
@patch(
42+
"tasks.missing_bounding_boxes.rebuild_missing_bounding_boxes.get_feeds_with_missing_bounding_boxes_query"
43+
)
44+
def test_rebuild_missing_bounding_boxes_dry_run(self, mock_query):
45+
# Mock Gtfsdataset and Gtfsfeed objects
46+
mock_dataset1 = MagicMock()
47+
mock_dataset1.latest = True
48+
mock_dataset1.stable_id = "dataset1"
49+
mock_dataset1.hosted_url = "http://example.com/dataset1"
50+
mock_feed1 = MagicMock()
51+
mock_feed1.stable_id = "feed1"
52+
mock_feed1.gtfsdatasets = [mock_dataset1]
53+
54+
mock_dataset2 = MagicMock()
55+
mock_dataset2.latest = True
56+
mock_dataset2.stable_id = "dataset2"
57+
mock_dataset2.hosted_url = "http://example.com/dataset2"
58+
mock_feed2 = MagicMock()
59+
mock_feed2.stable_id = "feed2"
60+
mock_feed2.gtfsdatasets = [mock_dataset2]
61+
62+
mock_query.return_value.filter.return_value = mock_query.return_value
63+
mock_query.return_value.all.return_value = [mock_feed1, mock_feed2]
64+
65+
result = rebuild_missing_bounding_boxes(
66+
dry_run=True, after_date=None, db_session=MagicMock()
67+
)
68+
self.assertIn("Dry run", result["message"])
69+
self.assertEqual(result["total_processed"], 2)
70+
71+
@patch(
72+
"tasks.missing_bounding_boxes.rebuild_missing_bounding_boxes.publish_messages"
73+
)
74+
@patch(
75+
"tasks.missing_bounding_boxes.rebuild_missing_bounding_boxes.get_feeds_with_missing_bounding_boxes_query"
76+
)
77+
def test_rebuild_missing_bounding_boxes_publish(self, mock_query, mock_publish):
78+
# Mock Gtfsdataset and Gtfsfeed objects
79+
mock_dataset = MagicMock()
80+
mock_dataset.latest = True
81+
mock_dataset.stable_id = "dataset1"
82+
mock_dataset.hosted_url = "http://example.com/dataset1"
83+
mock_feed = MagicMock()
84+
mock_feed.stable_id = "feed1"
85+
mock_feed.gtfsdatasets = [mock_dataset]
86+
87+
mock_query.return_value.filter.return_value = mock_query.return_value
88+
mock_query.return_value.all.return_value = [mock_feed]
89+
mock_publish.return_value = None
90+
91+
result = rebuild_missing_bounding_boxes(
92+
dry_run=False, after_date=None, db_session=MagicMock()
93+
)
94+
self.assertIn("Successfully published", result["message"])
95+
self.assertEqual(result["total_processed"], 1)
96+
97+
@patch(
98+
"tasks.missing_bounding_boxes.rebuild_missing_bounding_boxes.get_feeds_with_missing_bounding_boxes_query"
99+
)
100+
def test_rebuild_missing_bounding_boxes_invalid_after_date(self, mock_query):
101+
mock_query.return_value.filter.return_value = mock_query.return_value
102+
mock_query.return_value.all.return_value = []
103+
# Should log a warning and not raise
104+
result = rebuild_missing_bounding_boxes(
105+
dry_run=True, after_date="not-a-date", db_session=MagicMock()
106+
)
107+
self.assertIn("Dry run", result["message"])
108+
self.assertEqual(result["total_processed"], 0)
109+
110+
111+
if __name__ == "__main__":
112+
unittest.main()

infra/functions-python/main.tf

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1221,6 +1221,7 @@ resource "google_cloudfunctions2_function" "tasks_executor" {
12211221
environment_variables = {
12221222
PROJECT_ID = var.project_id
12231223
ENV = var.environment
1224+
PUBSUB_TOPIC_NAME = "rebuild-bounding-boxes-topic"
12241225
}
12251226
available_memory = local.function_tasks_executor_config.memory
12261227
timeout_seconds = local.function_tasks_executor_config.timeout
@@ -1245,6 +1246,18 @@ resource "google_cloudfunctions2_function" "tasks_executor" {
12451246
}
12461247
}
12471248

1249+
# Create the Pub/Sub topic used for publishing messages about rebuilding missing bounding boxes
1250+
resource "google_pubsub_topic" "rebuild_missing_bounding_boxes" {
1251+
name = "rebuild-bounding-boxes-topic"
1252+
}
1253+
1254+
# Grant the Cloud Functions service account permission to publish messages to the rebuild-bounding-boxes-topic Pub/Sub topic
1255+
resource "google_pubsub_topic_iam_member" "rebuild_missing_bounding_boxes_publisher" {
1256+
topic = google_pubsub_topic.rebuild_missing_bounding_boxes.name
1257+
role = "roles/pubsub.publisher"
1258+
member = "serviceAccount:${google_service_account.functions_service_account.email}"
1259+
}
1260+
12481261
# IAM entry for all users to invoke the function
12491262
resource "google_cloudfunctions2_function_iam_member" "tokens_invoker" {
12501263
project = var.project_id

0 commit comments

Comments
 (0)