Skip to content

Commit 182e492

Browse files
authored
fix: pmtiles generation fails with error: route_id (#1373)
1 parent 10eb0d0 commit 182e492

File tree

7 files changed

+98
-7
lines changed

7 files changed

+98
-7
lines changed

functions-python/helpers/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,5 @@ google-cloud-bigquery
2929
# Additional package
3030
pycountry
3131
shapely
32-
pandas
32+
pandas
33+
charset_normalizer

functions-python/helpers/tests/test_helpers.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,19 @@
11
import hashlib
22
import os
3+
import tempfile
34
import unittest
45
from unittest.mock import Mock, MagicMock
56
from unittest.mock import patch
67

78
import pytest
89
import urllib3_mock
910

10-
from utils import create_bucket, download_and_get_hash, download_url_content
11+
from utils import (
12+
create_bucket,
13+
download_and_get_hash,
14+
download_url_content,
15+
detect_encoding,
16+
)
1117

1218
responses = urllib3_mock.Responses("requests.packages.urllib3")
1319
expected_user_agent = (
@@ -260,3 +266,48 @@ def test_create_http_pmtiles_builder_task(
260266
self.assertEqual(args[3], "my-project")
261267
self.assertEqual(args[4], "northamerica-northeast1")
262268
self.assertEqual(args[5], "pmtiles-queue")
269+
270+
271+
class TestDetectEncoding(unittest.TestCase):
    """Exercise ``detect_encoding`` against small temporary files."""

    def _make_temp_file(self, content, encoding=None):
        """Write ``content`` to a new temporary file and return its path.

        The file is opened in text mode with ``encoding`` when one is given,
        and in binary mode otherwise. Removal is registered with
        ``addCleanup`` so the file is deleted even when an assertion in the
        calling test fails.
        """
        if encoding is None:
            handle = tempfile.NamedTemporaryFile(delete=False, mode="wb")
        else:
            handle = tempfile.NamedTemporaryFile(
                delete=False, mode="w", encoding=encoding
            )
        with handle as f:
            # Register the cleanup before writing so a failed write cannot
            # leave the file behind either.
            self.addCleanup(os.remove, f.name)
            f.write(content)
        return f.name

    def test_utf8_encoding(self):
        # A leading BOM followed by plain ASCII content.
        fname = self._make_temp_file(
            "\ufeff" + "col1,col2\nval1,val2\n", encoding="utf-8"
        )
        enc = detect_encoding(fname)
        self.assertEqual(enc, "utf-8-sig")

    def test_utf8_encoding_non_ascii(self):
        # UTF-8 content without a BOM; ü is a non-ASCII character.
        fname = self._make_temp_file("col1,col2\nval1,valü2\n", encoding="utf-8")
        enc = detect_encoding(fname)
        self.assertEqual(enc, "utf-8-sig")

    def test_latin1_encoding(self):
        # Use a longer string with several Latin-1 characters
        latin1_text = "col1,col2\nvalñ,valö,valü,valé,valà,valç,valø\n"
        fname = self._make_temp_file(latin1_text, encoding="latin1")
        enc = detect_encoding(fname)
        # Several encodings can represent these Latin-1 characters, so accept
        # any of them; this keeps the test stable across OS and
        # charset_normalizer library changes.
        self.assertIn(
            enc, ["latin_1", "iso-8859-1", "windows-1252", "latin1", "cp1250"]
        )

    def test_encoding_detection_failure(self):
        # Write some raw control bytes that charset_normalizer can't detect
        fname = self._make_temp_file(b"\x00\x01\x02\x03\x04")
        enc = detect_encoding(fname)
        self.assertEqual(enc, "utf-8-sig")

functions-python/helpers/utils.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,3 +339,31 @@ def record_execution_trace(
339339
timestamp=datetime.now(),
340340
)
341341
trace_service.save(trace)
342+
343+
344+
def detect_encoding(
    filename: str, sample_size: int = 100_000, logger: Optional[logging.Logger] = None
) -> str:
    """Detect a file's text encoding from a sample of its leading bytes.

    Only the first ``sample_size`` bytes are inspected, so the result is a
    best guess for the remainder of the file. If detection fails, or if ASCII
    or UTF-8 is detected, ``"utf-8-sig"`` is returned: it strips a BOM when
    present, and because ASCII is a strict subset of UTF-8 it also keeps
    working when non-ASCII UTF-8 characters first appear after the sample.

    Args:
        filename: Path of the file to inspect.
        sample_size: Maximum number of bytes read from the start of the file.
        logger: Logger used for the detection-failure warning; defaults to
            this module's logger.

    Returns:
        The lower-cased encoding name reported by charset_normalizer, or
        ``"utf-8-sig"`` in the cases described above.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    from charset_normalizer import from_bytes

    with open(filename, "rb") as f:
        raw = f.read(sample_size)

    # A full-length sample may have been cut in the middle of a multi-byte
    # UTF-8 character. The detector decodes the sample strictly, so such a
    # dangling tail would make it reject UTF-8 and guess a legacy code page.
    if sample_size > 0 and len(raw) == sample_size:
        raw = _strip_partial_utf8_tail(raw)

    result = from_bytes(raw).best()

    if result is None:
        logger = logger or logging.getLogger(__name__)
        logger.warning(
            "Encoding detection failed for %s, defaulting to utf-8-sig", filename
        )
        return "utf-8-sig"

    enc = result.encoding.lower()

    # If UTF-8 (or its ASCII subset) is detected, always use utf-8-sig: it
    # strips a BOM if present and decodes ASCII identically.
    if enc in ("utf_8", "utf-8", "utf8", "utf8mb4", "ascii", "us-ascii"):
        return "utf-8-sig"

    return enc


def _strip_partial_utf8_tail(sample: bytes) -> bytes:
    """Drop an incomplete multi-byte UTF-8 sequence from the end of ``sample``."""
    # A UTF-8 character spans at most 4 bytes, so its lead byte, if any, is
    # among the last 4 bytes of the sample.
    for offset in range(1, min(4, len(sample)) + 1):
        byte = sample[-offset]
        if byte & 0xC0 == 0x80:
            # Continuation byte: the lead byte is further back.
            continue
        if byte >= 0xF0:
            expected = 4
        elif byte >= 0xE0:
            expected = 3
        elif byte >= 0xC0:
            expected = 2
        else:
            expected = 1
        # Cut only when the sequence needs more bytes than the sample holds.
        return sample[:-offset] if expected > offset else sample
    return sample

functions-python/pmtiles_builder/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ google-cloud-storage
2323

2424
# Configuration
2525
python-dotenv==1.0.0
26+
2627
tippecanoe
2728
psutil
2829
pandas
29-
30+
charset_normalizer

functions-python/pmtiles_builder/src/csv_cache.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,10 @@
1717
import os
1818
from typing import TypedDict, List, Dict
1919

20-
2120
from gtfs import stop_txt_is_lat_log_required
2221
from shared.helpers.logger import get_logger
2322
from shared.helpers.transform import get_safe_value, get_safe_float
23+
from shared.helpers.utils import detect_encoding
2424

2525
STOP_TIMES_FILE = "stop_times.txt"
2626
SHAPES_FILE = "shapes.txt"
@@ -93,7 +93,8 @@ def _read_csv(self, filename) -> list[dict]:
9393
"""
9494
try:
9595
self.logger.debug("Loading %s", filename)
96-
with open(filename, newline="", encoding="utf-8") as f:
96+
encoding = detect_encoding(filename, logger=self.logger)
97+
with open(filename, newline="", encoding=encoding) as f:
9798
return list(csv.DictReader(f))
9899
except Exception as e:
99100
raise Exception(f"Failed to read CSV file {filename}: {e}") from e

functions-python/pmtiles_builder/src/main.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
from shared.helpers.runtime_metrics import track_metrics
4848
from shared.database.database import with_db_session
4949
from shared.helpers.transform import get_safe_value, get_safe_float
50+
from shared.helpers.utils import detect_encoding
5051

5152
init_logger()
5253

@@ -337,8 +338,11 @@ def _create_shapes_index(self) -> dict:
337338
self.logger.info("Creating shapes index")
338339
shapes_index = {}
339340
try:
341+
encoding = detect_encoding(
342+
filename=self.get_path(SHAPES_FILE), logger=self.logger
343+
)
340344
with open(
341-
self.get_path(SHAPES_FILE), "r", encoding="utf-8", newline=""
345+
self.get_path(SHAPES_FILE), "r", encoding=encoding, newline=""
342346
) as f:
343347
header = f.readline()
344348
columns = next(csv.reader([header]))

functions-python/pmtiles_builder/src/scripts/pmtiles_builder_verifier.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,13 @@
2525
"dataset_stable_id": "mdb-2841-202509032137",
2626
"env": "prod",
2727
},
28+
{
29+
"stable_id": "mdb-733",
30+
"dataset_stable_id": "mdb-733-202509111637",
31+
"env": "prod",
32+
},
2833
]
29-
run_with_feed_index = 1 # Change this index to run with a different feed
34+
run_with_feed_index = 2 # Change this index to run with a different feed
3035

3136
FILES = [STOP_TIMES_FILE, SHAPES_FILE, TRIPS_FILE, ROUTES_FILE, STOPS_FILE, AGENCY_FILE]
3237

0 commit comments

Comments
 (0)