Commit b17e4c0

feat: batch task to reduce geojson maps precision and remove ids (#1353)
1 parent: f971c5a · commit: b17e4c0

File tree: 14 files changed (+712 −23 lines)

api/tests/test_utils/db_utils.py

Lines changed: 7 additions & 16 deletions
```diff
@@ -192,23 +192,14 @@ def is_test_db(url):
 
 def empty_database(db, url):
     if is_test_db(url):
-
-        metadata_tables = Base.metadata.tables
-
-        # Get all table names excluding those in the excluded_tables list
-        all_table_names = [table_name for table_name in metadata_tables.keys() if table_name not in excluded_tables]
-
-        # Sort the table names in reverse order of dependencies
-        tables_to_delete = sorted(
-            all_table_names, key=lambda name: len(metadata_tables[name].foreign_keys), reverse=True
-        )
-
         try:
             with db.start_db_session() as session:
-                for table_name in tables_to_delete:
-                    table = Base.metadata.tables[table_name]
-                    delete_stmt = delete(table)
-                    session.execute(delete_stmt)
-
+                # Using sorted_tables to respect foreign key constraints
+                for table in reversed(Base.metadata.sorted_tables):
+                    if table.name not in excluded_tables:
+                        table = Base.metadata.tables[table.name]
+                        delete_stmt = delete(table)
+                        session.execute(delete_stmt)
+                session.commit()
         except Exception as error:
             logging.error(f"Error while deleting from test db: {error}")
```

functions-python/helpers/locations.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -6,7 +6,6 @@
 from sqlalchemy import func, cast
 from geoalchemy2.types import Geography
 
-import pycountry
 from shared.database_gen.sqlacodegen_models import Feed, Location, Geopolygon
 import logging
 
@@ -35,6 +34,8 @@ def get_country_code(country_name: str) -> Optional[str]:
     Returns:
         Optional[str]: Two-letter ISO country code or None if not found
     """
+    import pycountry
+
     # Return None for empty or whitespace-only strings
     if not country_name or not country_name.strip():
         logging.error("Could not find country code for: empty string")
```

functions-python/process_validation_report/tests/conftest.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -143,7 +143,7 @@ def pytest_sessionfinish(session, exitstatus):
     returning the exit status to the system.
     """
     # Cleaned at the beginning instead of the end so we can examine the DB after the test.
-    # clean_testing_db()
+    clean_testing_db()
 
 
 def pytest_unconfigure(config):
```

functions-python/process_validation_report/tests/test_validation_report.py

Lines changed: 7 additions & 4 deletions
```diff
@@ -82,6 +82,7 @@ def test_get_dataset(self, db_session):
         )
         try:
             db_session.add(feed)
+            db_session.flush()
             db_session.add(dataset)
             db_session.flush()
             returned_dataset = get_dataset(dataset_stable_id, db_session)
@@ -123,6 +124,7 @@ def test_create_validation_report_entities(self, mock_get, db_session):
         )
         try:
             db_session.add(feed)
+            db_session.flush()
             db_session.add(dataset)
             db_session.commit()
             create_validation_report_entities(feed_stable_id, dataset_stable_id, "1.0")
@@ -347,17 +349,18 @@ def test_create_validation_report_entities_missing_validator_version(
                 ],
             },
         )
-        feed_stable_id = faker.word()
-        dataset_stable_id = faker.word()
+        feed_stable_id = faker.uuid4()
+        dataset_stable_id = faker.uuid4()
 
         # Create GTFS Feed
-        feed = Gtfsfeed(id=faker.word(), data_type="gtfs", stable_id=feed_stable_id)
+        feed = Gtfsfeed(id=faker.uuid4(), data_type="gtfs", stable_id=feed_stable_id)
         # Create a new dataset
         dataset = Gtfsdataset(
-            id=faker.word(), feed_id=feed.id, stable_id=dataset_stable_id, latest=True
+            id=faker.uuid4(), feed_id=feed.id, stable_id=dataset_stable_id, latest=True
         )
         try:
             db_session.add(feed)
+            db_session.flush()
             db_session.add(dataset)
             db_session.commit()
             create_validation_report_entities(feed_stable_id, dataset_stable_id, "1.0")
```
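Two things change in these tests: `faker.word()` becomes `faker.uuid4()` so generated IDs cannot collide across runs, and a `db_session.flush()` is added between `add(feed)` and `add(dataset)`. The flush sends the pending feed INSERT to the database without committing, so the dataset's foreign key has a row to point at. A self-contained miniature of that ordering (the `Feed`/`Dataset` models here are toy stand-ins for `Gtfsfeed`/`Gtfsdataset`):

```python
import uuid

from sqlalchemy import ForeignKey, create_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column


class Base(DeclarativeBase):
    pass


class Feed(Base):  # stand-in for Gtfsfeed
    __tablename__ = "feed"
    id: Mapped[str] = mapped_column(primary_key=True)


class Dataset(Base):  # stand-in for Gtfsdataset
    __tablename__ = "dataset"
    id: Mapped[str] = mapped_column(primary_key=True)
    feed_id: Mapped[str] = mapped_column(ForeignKey("feed.id"))


engine = create_engine("sqlite:///:memory:")
Base.metadata.create_all(engine)

with Session(engine) as session:
    feed = Feed(id=str(uuid.uuid4()))
    session.add(feed)
    session.flush()  # INSERT feed now so the FK target row exists
    session.add(Dataset(id=str(uuid.uuid4()), feed_id=feed.id))
    session.commit()
```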

functions-python/tasks_executor/README.md

Lines changed: 12 additions & 0 deletions
````diff
@@ -50,3 +50,15 @@ To get the list of supported tasks use:
   "payload": {}
 }
 ```
+To update the geolocation files precision:
+```json
+{
+  "task": "update_geojson_files_precision",
+  "payload": {
+    "dry_run": true,
+    "data_type": "gtfs",
+    "precision": 5,
+    "limit": 10
+  }
+}
+```
````
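The executor's requirements include FastAPI and uvicorn, so presumably this JSON is POSTed to the deployed service. A hedged sketch with `requests`; the host and route are placeholders, as the exact endpoint is not shown in this commit:

```python
import requests

# Placeholder endpoint; substitute the deployed tasks_executor URL/route.
response = requests.post(
    "https://tasks-executor.example.com/",
    json={
        "task": "update_geojson_files_precision",
        "payload": {"dry_run": True, "data_type": "gtfs", "precision": 5, "limit": 10},
    },
    timeout=60,
)
response.raise_for_status()
print(response.json())
```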

functions-python/tasks_executor/requirements.txt

Lines changed: 1 addition & 0 deletions
```diff
@@ -11,6 +11,7 @@ pluggy~=1.3.0
 certifi~=2025.8.3
 fastapi
 uvicorn[standard]
+psutil
 
 
 # SQL Alchemy and Geo Alchemy
```

functions-python/tasks_executor/src/main.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -35,6 +35,9 @@
 from tasks.visualization_files.rebuild_missing_visualization_files import (
     rebuild_missing_visualization_files_handler,
 )
+from tasks.geojson.update_geojson_files_precision import (
+    update_geojson_files_precision_handler,
+)
 
 init_logger()
 LIST_COMMAND: Final[str] = "list"
@@ -66,6 +69,10 @@
         "description": "Rebuilds missing dataset files for GTFS datasets.",
         "handler": rebuild_missing_dataset_files_handler,
     },
+    "update_geojson_files": {
+        "description": "Iterate over bucket looking for {feed_stable_id}/geolocation.geojson and update precision.",
+        "handler": update_geojson_files_precision_handler,
+    },
     "rebuild_missing_visualization_files": {
         "description": "Rebuilds missing visualization files for GTFS datasets.",
         "handler": rebuild_missing_visualization_files_handler,
```
Lines changed: 58 additions & 0 deletions
````diff
@@ -0,0 +1,58 @@
+# Update GeoJSON files
+
+This task adjusts the GeoJSON files, removing the map IDs and reducing the precision of the coordinates to 5 decimal places. It also updates the `geolocation_file_created_date` and `geolocation_file_dataset_id` fields in the `Feed` table.
+
+---
+
+## Task ID
+
+Use task ID: `update_geojson_files_precision`
+
+---
+
+## Usage
+
+The function accepts the following payload:
+
+```json
+{
+  "dry_run": true,     // [optional] If true, do not upload or modify the database (default: true)
+  "precision": 5,      // [optional] Number of decimal places to keep in coordinates (default: 5)
+  "limit": 10,         // [optional] Limit the number of feeds to process (default: no limit)
+  "data_type": "gtfs"  // [optional] Type of data to process, either "gtfs" or "gbfs" (default: "gtfs")
+}
+```
+
+### Example:
+
+```json
+{
+  "dry_run": true,
+  "data_type": "gtfs",
+  "limit": 10
+}
+```
+
+---
+
+## What It Does
+
+Lists all feeds with GeoJSON files, downloads each file, removes map IDs, reduces coordinate precision to the specified number of decimal places, and re-uploads the modified file.
+It also updates the `geolocation_file_created_date` and `geolocation_file_dataset_id` fields in the `Feed` table.
+
+## GCP Environment Variables
+
+The function requires the following environment variables:
+
+| Variable                     | Description                                                              |
+|------------------------------|--------------------------------------------------------------------------|
+| `DATASETS_BUCKET_NAME`       | The name of the GCS bucket used to store extracted GTFS files            |
+| `GBFS_SNAPSHOTS_BUCKET_NAME` | The name of the GCS bucket used to store extracted GBFS snapshot files   |
+
+---
+
+## Additional Notes
+
+* Commits to the database occur in batches of 100 feeds to improve performance and avoid large transaction blocks.
+* If `dry_run` is enabled, no file uploads or DB modifications are performed; only the number of affected feeds is logged.
+* The function is safe to rerun: it only affects feeds with a missing `geolocation_file_dataset_id`.
````
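To make the precision reduction concrete, here is a standalone sketch (not the task's actual code) that strips feature `id`s and rounds every coordinate to `precision` decimals. GeoJSON nests coordinates to different depths per geometry type (Point, LineString, Polygon, Multi*), so the rounding recurses:

```python
def _round_coords(value, precision: int):
    # Coordinates nest arbitrarily (Point -> [x, y], Polygon -> [[[x, y], ...]]),
    # so recurse through lists and round only the numeric leaves.
    if isinstance(value, list):
        return [_round_coords(v, precision) for v in value]
    if isinstance(value, (int, float)):
        return round(value, precision)
    return value


def reduce_geojson_precision(doc: dict, precision: int = 5) -> dict:
    for feature in doc.get("features", []):
        feature.pop("id", None)  # drop map IDs
        geometry = feature.get("geometry") or {}
        if "coordinates" in geometry:
            geometry["coordinates"] = _round_coords(geometry["coordinates"], precision)
    return doc


doc = {
    "type": "FeatureCollection",
    "features": [
        {
            "type": "Feature",
            "id": "osm-123",
            "geometry": {"type": "Point", "coordinates": [-73.9857261234, 40.7484051234]},
            "properties": {},
        }
    ],
}
print(reduce_geojson_precision(doc))  # coordinates become [-73.98573, 40.74841]
```

Five decimals keeps roughly meter-level accuracy, which is why it is the default here.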
