Skip to content

Commit badc9fa

Browse files
committed
Fix duplicate admin levels
1 parent 139d9ba commit badc9fa

File tree

3 files changed

+82
-3
lines changed

3 files changed

+82
-3
lines changed

functions-python/reverse_geolocation/src/location_group_utils.py

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import pycountry
66
from geoalchemy2 import WKTElement
77
from geoalchemy2.shape import to_shape
8+
from pyproj import Geod
9+
from shapely.geometry.geo import mapping
810
from sqlalchemy.orm import Session
911

1012
from shared.database_gen.sqlacodegen_models import (
@@ -18,6 +20,7 @@
1820
from shared.helpers.locations import get_geopolygons_covers
1921

2022
ERROR_STATUS_CODE = 299 # Custom error code for the function to avoid retries
23+
GEOD = Geod(ellps="WGS84") # Geod object for geodesic calculations
2124

2225

2326
def generate_color(
@@ -165,17 +168,66 @@ def extract_location_aggregate(
165168
)
166169

167170

171+
def geodesic_area_m2(geom) -> float:
172+
"""Return absolute geodesic area in m² for a Shapely geometry (Polygon/MultiPolygon)."""
173+
# pyproj can handle GeoJSON-like mappings, including MultiPolygons
174+
area, _perim = GEOD.geometry_area_perimeter(mapping(geom))
175+
return abs(area)
176+
177+
178+
def resolve_same_level_conflicts(items: List[Geopolygon]) -> Geopolygon:
179+
# 1) Prefer iso_3166_2_code if present
180+
candidates = [g for g in items if g.iso_3166_2_code] or items
181+
182+
if len(candidates) > 1:
183+
# 2) Prefer the smallest geodesic area (m²)
184+
def area_for(g: Geopolygon) -> float:
185+
try:
186+
geom = to_shape(g.geometry) # -> Shapely geometry (in EPSG:4326)
187+
return geodesic_area_m2(geom)
188+
except Exception:
189+
# If geometry missing/bad, push to the end
190+
return float("inf")
191+
192+
candidates.sort(key=area_for)
193+
194+
# 3) Prefer with name, then lowest OSM id
195+
candidates.sort(key=lambda g: (0 if (g.name and g.name.strip()) else 1, g.osm_id))
196+
return candidates[0]
197+
198+
199+
def dedupe_by_admin_level(geopolygons: List[Geopolygon], logger) -> List[Geopolygon]:
200+
"""Keep one polygon per admin_level using tie-break rules above."""
201+
by_level: dict[int, List[Geopolygon]] = {}
202+
for g in geopolygons:
203+
by_level.setdefault(g.admin_level, []).append(g)
204+
205+
winners: List[Geopolygon] = []
206+
for level, items in sorted(
207+
by_level.items(), key=lambda kv: kv[0]
208+
): # keep order by level
209+
if len(items) > 1:
210+
logger.debug(
211+
"Tie at admin_level=%s among osm_ids=%s",
212+
level,
213+
[i.osm_id for i in items],
214+
)
215+
winners.append(resolve_same_level_conflicts(items))
216+
return winners
217+
218+
168219
def extract_location_aggregate_geopolygons(
169220
stop_point: WKTElement, geopolygons, logger, db_session: Session
170221
) -> Optional[GeopolygonAggregate]:
171222
admin_levels = {g.admin_level for g in geopolygons}
172-
if len(admin_levels) != len(geopolygons):
223+
# If duplicates per admin_level exist, resolve instead of returning None
224+
if admin_levels != len(geopolygons):
173225
logger.warning(
174226
"Duplicate admin levels for point: %s -> %s",
175227
stop_point,
176228
geopolygons_as_string(geopolygons),
177229
)
178-
return None
230+
geopolygons = dedupe_by_admin_level(geopolygons, logger)
179231

180232
valid_iso_3166_1 = any(g.iso_3166_1_code for g in geopolygons)
181233
valid_iso_3166_2 = any(g.iso_3166_2_code for g in geopolygons)

functions-python/reverse_geolocation/src/reverse_geolocation_processor.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,19 @@ def reverse_geolocation_process(
331331
use_cache=use_cache,
332332
)
333333

334+
if not location_groups:
335+
no_locations_message = "No locations found for the provided stops."
336+
logger.warning(no_locations_message)
337+
record_execution_trace(
338+
execution_id=execution_id,
339+
stable_id=stable_id,
340+
status=Status.FAILED,
341+
logger=logger,
342+
dataset_file=None,
343+
error_message=no_locations_message,
344+
)
345+
return no_locations_message, ERROR_STATUS_CODE
346+
334347
# Create GeoJSON Aggregate
335348
create_geojson_aggregate(
336349
list(location_groups.values()),

functions-python/reverse_geolocation/src/scripts/reverse_geolocation_process_verifier.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,23 @@
6262
"mdb-2825-202508181628/extracted/stops.txt",
6363
"data_type": "gtfs",
6464
},
65+
{
66+
# 5. 1355 stops, Canada, Saskatchewan , Duplicate admin level https://github.com/MobilityData/mobility-feed-api/issues/965
67+
"stable_id": "local-test-gtfs-mdb-716",
68+
"stops_url": "https://storage.googleapis.com/mobilitydata-datasets-dev/mdb-716/mdb-716-202507082001/"
69+
"extracted/stops.txt",
70+
"data_type": "gtfs",
71+
},
72+
{
73+
# 6. 667 stops, Canada, New Brunswick, Duplicate admin level https://github.com/MobilityData/mobility-feed-api/issues/965
74+
"stable_id": "local-test-gtfs-mdb-1111",
75+
"stops_url": "https://storage.googleapis.com/mobilitydata-datasets-dev/mdb-1111/mdb-1111-202507082012/"
76+
"extracted/stops.txt",
77+
"data_type": "gtfs",
78+
},
6579
]
6680
run_with_feed_index = (
67-
4 # Set to an integer index to run with a specific feed from the list above
81+
5 # Set to an integer index to run with a specific feed from the list above
6882
)
6983

7084

0 commit comments

Comments
 (0)