Skip to content

Commit 4f62811

Browse files
authored
feat: using lat/lon from all available files for gbfs loc extraction (#1226)
1 parent 3ea7771 commit 4f62811

File tree

6 files changed

+83
-59
lines changed

6 files changed

+83
-59
lines changed

functions-python/gbfs_validator/src/gbfs_data_processor.py

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -436,27 +436,23 @@ def trigger_location_extraction(self):
436436
version_id = f"{self.stable_id}_{autodiscovery_version.version}_{autodiscovery_version.extracted_from}"
437437
endpoints = self.gbfs_endpoints.get(version_id, [])
438438

439-
# Find the station_information_url endpoint
440-
station_information_url = next(
441-
(
442-
endpoint.url
443-
for endpoint in endpoints
444-
if endpoint.name == "station_information"
445-
),
446-
None,
447-
)
448-
# If station_information_url is not found, use vehicle_status_url
449-
vehicle_status_url = next(
450-
(
451-
endpoint.url
452-
for endpoint in endpoints
453-
if endpoint.name == "vehicle_status"
454-
),
455-
None,
456-
)
457-
if not station_information_url and not vehicle_status_url:
439+
def get_endpoint_url(name: str) -> Optional[str]:
440+
return next(
441+
(endpoint.url for endpoint in endpoints if endpoint.name == name), None
442+
)
443+
444+
# Get the URLs for the required endpoints
445+
station_information_url = get_endpoint_url("station_information")
446+
vehicle_status_url = get_endpoint_url("vehicle_status")
447+
free_bike_status_url = get_endpoint_url("free_bike_status")
448+
449+
if (
450+
not station_information_url
451+
and not vehicle_status_url
452+
and not free_bike_status_url
453+
):
458454
self.logger.warning(
459-
"No station_information_url or vehicle_status_url found."
455+
"No station_information_url or vehicle_status_url or free_bike_status_url found."
460456
)
461457
return
462458
client = tasks_v2.CloudTasksClient()
@@ -466,6 +462,7 @@ def trigger_location_extraction(self):
466462
"data_type": "gbfs",
467463
"station_information_url": station_information_url,
468464
"vehicle_status_url": vehicle_status_url,
465+
"free_bike_status_url": free_bike_status_url,
469466
}
470467
).encode()
471468
project_id = os.getenv("PROJECT_ID")

functions-python/reverse_geolocation/.env.rename_me

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,6 @@ PROJECT_ID=${{PROJECT_ID}}
55
GCP_REGION=${{GCP_REGION}}
66
SERVICE_ACCOUNT_EMAIL=${{SERVICE_ACCOUNT_EMAIL}}
77
DATASETS_BUCKET_NAME=${{DATASETS_BUCKET_NAME}}
8-
PUBSUB_TOPIC_NAME=${{PUBSUB_TOPIC_NAME}}
8+
PUBSUB_TOPIC_NAME=${{PUBSUB_TOPIC_NAME}}
9+
DATASET_BUCKET_NAME_GBFS=${{DATASET_BUCKET_NAME_GBFS}}
10+
DATASET_BUCKET_NAME_GTFS=${{DATASET_BUCKET_NAME_GTFS}}

functions-python/reverse_geolocation/README.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,15 +58,16 @@ This function performs the core reverse geolocation logic. It processes location
5858
- `stable_id`: Identifies the feed (GTFS or GBFS).
5959
- `dataset_id`: Required if `data_type` is not provided or is `gtfs`. Identifies the dataset being processed.
6060
- `stops_url`: Required if `data_type` is not provided or is `gtfs`. URL of the GTFS `stops.txt` file.
61-
- `station_information_url`: Required if `data_type` is `gbfs` and `vehicle_status_url` is omitted. URL of the GBFS `station_information.json` file.
62-
- `vehicle_status_url`: Required if `data_type` is `gbfs` and `station_information_url` is omitted. URL of the GBFS `vehicle_status.json` file.
61+
- `station_information_url`: Required if `data_type` is `gbfs` and `vehicle_status_url` and `free_bike_status_url` are omitted. URL of the GBFS `station_information.json` file.
62+
- `vehicle_status_url`: Required if `data_type` is `gbfs` and `station_information_url` and `free_bike_status_url` are omitted. URL of the GBFS `vehicle_status.json` file.
63+
- `free_bike_status_url`: Required if `data_type` is `gbfs` and `station_information_url` and `vehicle_status_url` are omitted. URL of the GBFS `free_bike_status.json` file.
6364
- `data_type`: Optional. Specifies the type of data being processed. Can be `gtfs` or `gbfs`. If not provided, the function will attempt to determine the type based on the URLs provided.
6465

6566
### Processing Steps:
6667

6768
1. **Load Location Data**
6869
- For GTFS: the function reads `stops.txt` into a Pandas DataFrame, ensuring unique longitude-latitude pairs.
69-
- For GBFS: location data is extracted from `station_information.json` (preferred) or `vehicle_status.json` (fallback), also ensuring uniqueness.
70+
- For GBFS: location data is extracted from `station_information.json`, `vehicle_status.json`, and `free_bike_status.json`, also ensuring uniqueness.
7071

7172
2. **Updates Bounding Box**
7273
- For GTFS: the bounding box is derived from stop coordinates. The dataset's bounding box is updated in the database.

functions-python/reverse_geolocation/src/parse_request.py

Lines changed: 48 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import io
22
import logging
3-
from typing import Tuple, Optional
3+
from typing import Tuple, Optional, List
44

55
import flask
66
import pandas as pd
@@ -10,10 +10,11 @@
1010

1111
def parse_request_parameters(
1212
request: flask.Request,
13-
) -> Tuple[pd.DataFrame, str, Optional[str], str, str]:
13+
) -> Tuple[pd.DataFrame, str, Optional[str], str, List[str]]:
1414
"""
1515
Parse the request parameters and return a DataFrame with the stops data.
16-
@:returns Tuple: A tuple containing the stops DataFrame, stable ID, and dataset ID.
16+
@:returns Tuple: A tuple containing the stops DataFrame, stable ID, dataset ID, data type, and a list of URLs that
17+
were used to fetch the data.
1718
"""
1819
logging.info("Parsing request parameters.")
1920
request_json = request.get_json(silent=True)
@@ -37,13 +38,14 @@ def parse_request_parameters(
3738
logging.info("Data type: %s", data_type)
3839
if data_type == "gtfs":
3940
df, stable_id, dataset_id, url = parse_request_parameters_gtfs(request_json)
41+
urls = [url]
4042
elif data_type == "gbfs":
41-
df, stable_id, dataset_id, url = parse_request_parameters_gbfs(request_json)
43+
df, stable_id, dataset_id, urls = parse_request_parameters_gbfs(request_json)
4244
else:
4345
raise ValueError(
4446
f"Invalid data_type '{data_type}'. Supported types are 'gtfs' and 'gbfs'."
4547
)
46-
return df, stable_id, dataset_id, data_type, url
48+
return df, stable_id, dataset_id, data_type, urls
4749

4850

4951
def parse_request_parameters_gtfs(
@@ -82,16 +84,11 @@ def parse_station_information_url(station_information_url) -> pd.DataFrame:
8284

8385
lat_expr = parse("data.stations[*].lat")
8486
lon_expr = parse("data.stations[*].lon")
85-
station_id_expr = parse("data.stations[*].station_id")
8687

8788
lats = [match.value for match in lat_expr.find(data)]
8889
lons = [match.value for match in lon_expr.find(data)]
89-
station_ids = [match.value for match in station_id_expr.find(data)]
9090

91-
stations_info = [
92-
{"station_id": sid, "stop_lat": lat, "stop_lon": lon}
93-
for sid, lat, lon in zip(station_ids, lats, lons)
94-
]
91+
stations_info = [{"stop_lat": lat, "stop_lon": lon} for lat, lon in zip(lats, lons)]
9592
return pd.DataFrame(stations_info)
9693

9794

@@ -103,29 +100,42 @@ def parse_vehicle_status_url(vehicle_status_url) -> pd.DataFrame:
103100

104101
lat_expr = parse("data.vehicles[*].lat")
105102
lon_expr = parse("data.vehicles[*].lon")
106-
vehicle_id_expr = parse("data.vehicles[*].vehicle_id")
107103

108104
lats = [match.value for match in lat_expr.find(data)]
109105
lons = [match.value for match in lon_expr.find(data)]
110-
vehicle_ids = [match.value for match in vehicle_id_expr.find(data)]
111106

112-
vehicles_info = [
113-
{"vehicle_id": vid, "stop_lat": lat, "stop_lon": lon}
114-
for vid, lat, lon in zip(vehicle_ids, lats, lons)
115-
]
107+
vehicles_info = [{"stop_lat": lat, "stop_lon": lon} for lat, lon in zip(lats, lons)]
116108

117109
return pd.DataFrame(vehicles_info)
118110

119111

112+
def parse_free_bike_status_url(free_bike_status_url):
113+
"""Parse the free bike status URL and return a DataFrame with bike_id, lat, and lon."""
114+
response = requests.get(free_bike_status_url)
115+
response.raise_for_status()
116+
data = response.json()
117+
118+
lat_expr = parse("data.bikes[*].lat")
119+
lon_expr = parse("data.bikes[*].lon")
120+
121+
lats = [match.value for match in lat_expr.find(data)]
122+
lons = [match.value for match in lon_expr.find(data)]
123+
124+
bikes_info = [{"stop_lat": lat, "stop_lon": lon} for lat, lon in zip(lats, lons)]
125+
126+
return pd.DataFrame(bikes_info)
127+
128+
120129
def parse_request_parameters_gbfs(
121130
request_json: dict,
122-
) -> Tuple[pd.DataFrame, str, Optional[str], str]:
131+
) -> Tuple[pd.DataFrame, str, Optional[str], List[str]]:
123132
"""Parse the request parameters for GBFS data."""
124133
if (
125134
not request_json
126135
or (
127136
"station_information_url" not in request_json
128137
and "vehicle_status_url" not in request_json
138+
and "free_bike_status_url" not in request_json
129139
)
130140
or "stable_id" not in request_json
131141
):
@@ -137,10 +147,26 @@ def parse_request_parameters_gbfs(
137147
stable_id = request_json["stable_id"]
138148
station_information_url = request_json.get("station_information_url")
139149
vehicle_status_url = request_json.get("vehicle_status_url")
150+
free_bike_status_url = request_json.get("free_bike_status_url")
151+
stops_df = pd.DataFrame()
152+
urls = []
140153
if station_information_url:
141154
logging.info("Parsing station information URL")
142-
stops_df = parse_station_information_url(station_information_url)
143-
else:
155+
stops_df_station_information = parse_station_information_url(
156+
station_information_url
157+
)
158+
stops_df = pd.concat(
159+
[stops_df, stops_df_station_information], ignore_index=True
160+
)
161+
urls.append(station_information_url)
162+
if vehicle_status_url:
144163
logging.info("Parsing vehicle status URL")
145-
stops_df = parse_vehicle_status_url(vehicle_status_url)
146-
return stops_df, stable_id, None, station_information_url or vehicle_status_url
164+
stops_df_vehicle_status = parse_vehicle_status_url(vehicle_status_url)
165+
stops_df = pd.concat([stops_df, stops_df_vehicle_status], ignore_index=True)
166+
urls.append(vehicle_status_url)
167+
if free_bike_status_url:
168+
logging.info("Parsing free bike status URL")
169+
stops_df_free_bike_status = parse_free_bike_status_url(free_bike_status_url)
170+
stops_df = pd.concat([stops_df, stops_df_free_bike_status], ignore_index=True)
171+
urls.append(free_bike_status_url)
172+
return stops_df, stable_id, None, urls

functions-python/reverse_geolocation/src/reverse_geolocation_processor.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,7 @@ def create_geojson_aggregate(
227227
bounding_box: shapely.Polygon,
228228
data_type: str,
229229
logger,
230-
extraction_url: str = None,
230+
extraction_urls: List[str] = None,
231231
) -> None:
232232
"""Create a GeoJSON file with the aggregated locations. This file will be uploaded to GCS and used for
233233
visualization."""
@@ -250,7 +250,7 @@ def create_geojson_aggregate(
250250
json_data = {
251251
"type": "FeatureCollection",
252252
"extracted_at": datetime.now().isoformat(),
253-
"extraction_url": extraction_url,
253+
"extraction_url": extraction_urls,
254254
"features": [
255255
{
256256
"type": "Feature",
@@ -424,7 +424,7 @@ def reverse_geolocation_process(
424424
stable_id,
425425
dataset_id,
426426
data_type,
427-
extraction_url,
427+
extraction_urls,
428428
) = parse_request_parameters(request)
429429

430430
# Remove duplicate lat/lon points
@@ -464,7 +464,7 @@ def reverse_geolocation_process(
464464
stable_id=stable_id,
465465
bounding_box=bounding_box,
466466
data_type=data_type,
467-
extraction_url=extraction_url,
467+
extraction_urls=extraction_urls,
468468
logger=logger,
469469
)
470470

functions-python/reverse_geolocation/tests/test_reverse_geolocation_processor.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -84,14 +84,13 @@ def test_parse_request_parameters_gbfs_station_information(self, requests_mock):
8484
"data_type": "gbfs",
8585
}
8686

87-
df, stable_id, dataset_id, data_type, url = parse_request_parameters(request)
87+
df, stable_id, dataset_id, data_type, urls = parse_request_parameters(request)
8888

8989
self.assertEqual(stable_id, "stable123")
9090
self.assertEqual(dataset_id, None)
9191
self.assertEqual(data_type, "gbfs")
92-
self.assertEqual(url, "http://dummy.url")
93-
self.assertEqual(df.shape, (2, 3))
94-
self.assertIn("station_id", df.columns)
92+
self.assertEqual(urls[0], "http://dummy.url")
93+
self.assertEqual(df.shape, (2, 2))
9594

9695
@patch("parse_request.requests")
9796
def test_parse_request_parameters_gbfs_vehicle_status(self, requests_mock):
@@ -114,14 +113,13 @@ def test_parse_request_parameters_gbfs_vehicle_status(self, requests_mock):
114113
"data_type": "gbfs",
115114
}
116115

117-
df, stable_id, dataset_id, data_type, url = parse_request_parameters(request)
116+
df, stable_id, dataset_id, data_type, urls = parse_request_parameters(request)
118117

119118
self.assertEqual(stable_id, "stable456")
120119
self.assertEqual(dataset_id, None)
121120
self.assertEqual(data_type, "gbfs")
122-
self.assertEqual(url, "http://dummy.vehicle")
123-
self.assertEqual(df.shape, (2, 3))
124-
self.assertIn("vehicle_id", df.columns)
121+
self.assertEqual(urls[0], "http://dummy.vehicle")
122+
self.assertEqual(df.shape, (2, 2))
125123

126124
@patch("parse_request.requests")
127125
def test_parse_request_parameters_invalid_request(self, requests_mock):
@@ -426,7 +424,7 @@ def test_create_geojson_aggregate(self, _, mock_storage_client):
426424
stable_id="test_stable_id",
427425
bounding_box=bounding_box,
428426
data_type="gtfs",
429-
extraction_url="test_extraction_url",
427+
extraction_urls=["test_extraction_url"],
430428
logger=logger,
431429
)
432430

0 commit comments

Comments
 (0)