fix: pmtiles generation fails with error: route_id (#1373)

davidgamez · web-flow · commit 182e492cae55 · 2025-09-25T17:23:37.000-04:00
diff --git a/functions-python/helpers/requirements.txt b/functions-python/helpers/requirements.txt
@@ -29,4 +29,5 @@ google-cloud-bigquery
 # Additional package
 pycountry
 shapely
-pandas
+pandas
+charset_normalizer
diff --git a/functions-python/helpers/tests/test_helpers.py b/functions-python/helpers/tests/test_helpers.py
@@ -1,13 +1,19 @@
 import hashlib
 import os
+import tempfile
 import unittest
 from unittest.mock import Mock, MagicMock
 from unittest.mock import patch
 
 import pytest
 import urllib3_mock
 
-from utils import create_bucket, download_and_get_hash, download_url_content
+from utils import (
+    create_bucket,
+    download_and_get_hash,
+    download_url_content,
+    detect_encoding,
+)
 
 responses = urllib3_mock.Responses("requests.packages.urllib3")
 expected_user_agent = (
@@ -260,3 +266,48 @@ def test_create_http_pmtiles_builder_task(
         self.assertEqual(args[3], "my-project")
         self.assertEqual(args[4], "northamerica-northeast1")
         self.assertEqual(args[5], "pmtiles-queue")
+
+
+class TestDetectEncoding(unittest.TestCase):
+    def test_utf8_encoding(self):
+        with tempfile.NamedTemporaryFile(delete=False, mode="w", encoding="utf-8") as f:
+            f.write("\ufeff")  # Write BOM
+            f.write("col1,col2\nval1,val2\n")
+            fname = f.name
+        enc = detect_encoding(fname)
+        self.assertEqual(enc, "utf-8-sig")
+        os.remove(fname)
+
+    # UTF-8 content with a non-ASCII character (ü) and no BOM should still map to utf-8-sig
+    def test_utf8_encoding_non_ascii(self):
+        with tempfile.NamedTemporaryFile(delete=False, mode="w", encoding="utf-8") as f:
+            f.write("col1,col2\nval1,valü2\n")  # ü is non-ASCII
+            fname = f.name
+        enc = detect_encoding(fname)
+        self.assertEqual(enc, "utf-8-sig")
+        os.remove(fname)
+
+    def test_latin1_encoding(self):
+        # Use a longer string with several Latin-1 characters
+        latin1_text = "col1,col2\nvalñ,valö,valü,valé,valà,valç,valø\n"
+        with tempfile.NamedTemporaryFile(
+            delete=False, mode="w", encoding="latin1"
+        ) as f:
+            f.write(latin1_text)
+            fname = f.name
+        enc = detect_encoding(fname)
+        # Several encodings can represent these Latin-1 characters, so accept any of them:
+        # the detected name may vary with the OS and the charset_normalizer library version.
+        self.assertIn(
+            enc, ["latin_1", "iso-8859-1", "windows-1252", "latin1", "cp1250"]
+        )
+        os.remove(fname)
+
+    def test_encoding_detection_failure(self):
+        # Write some random bytes that charset_normalizer can't detect
+        with tempfile.NamedTemporaryFile(delete=False, mode="wb") as f:
+            f.write(b"\x00\x01\x02\x03\x04")
+            fname = f.name
+        enc = detect_encoding(fname)
+        self.assertEqual(enc, "utf-8-sig")
+        os.remove(fname)
diff --git a/functions-python/helpers/utils.py b/functions-python/helpers/utils.py
@@ -339,3 +339,31 @@ def record_execution_trace(
         timestamp=datetime.now(),
     )
     trace_service.save(trace)
+
+
+def detect_encoding(
+    filename: str, sample_size: int = 100_000, logger: Optional[logging.Logger] = None
+) -> str:
+    """Detect file encoding using a small sample of the file.
+    If detection fails or if UTF-8 is detected, defaults to 'utf-8-sig' to handle BOM.
+    """
+    from charset_normalizer import from_bytes
+
+    with open(filename, "rb") as f:
+        raw = f.read(sample_size)
+    result = from_bytes(raw).best()
+
+    if result is None:
+        logger = logger or logging.getLogger(__name__)
+        logger.warning(
+            "Encoding detection failed for %s, defaulting to utf-8-sig", filename
+        )
+        return "utf-8-sig"
+
+    enc = result.encoding.lower()
+
+    # If UTF-8 is detected, always use utf-8-sig to strip BOM if present
+    if enc in ("utf_8", "utf-8", "utf8", "utf8mb4"):
+        return "utf-8-sig"
+
+    return enc
diff --git a/functions-python/pmtiles_builder/requirements.txt b/functions-python/pmtiles_builder/requirements.txt
@@ -23,7 +23,8 @@ google-cloud-storage
 
 # Configuration
 python-dotenv==1.0.0
+
 tippecanoe
 psutil
 pandas
-
+charset_normalizer
diff --git a/functions-python/pmtiles_builder/src/csv_cache.py b/functions-python/pmtiles_builder/src/csv_cache.py
@@ -17,10 +17,10 @@
 import os
 from typing import TypedDict, List, Dict
 
-
 from gtfs import stop_txt_is_lat_log_required
 from shared.helpers.logger import get_logger
 from shared.helpers.transform import get_safe_value, get_safe_float
+from shared.helpers.utils import detect_encoding
 
 STOP_TIMES_FILE = "stop_times.txt"
 SHAPES_FILE = "shapes.txt"
@@ -93,7 +93,8 @@ def _read_csv(self, filename) -> list[dict]:
         """
         try:
             self.logger.debug("Loading %s", filename)
-            with open(filename, newline="", encoding="utf-8") as f:
+            encoding = detect_encoding(filename, logger=self.logger)
+            with open(filename, newline="", encoding=encoding) as f:
                 return list(csv.DictReader(f))
         except Exception as e:
             raise Exception(f"Failed to read CSV file {filename}: {e}") from e
diff --git a/functions-python/pmtiles_builder/src/main.py b/functions-python/pmtiles_builder/src/main.py
@@ -47,6 +47,7 @@
 from shared.helpers.runtime_metrics import track_metrics
 from shared.database.database import with_db_session
 from shared.helpers.transform import get_safe_value, get_safe_float
+from shared.helpers.utils import detect_encoding
 
 init_logger()
 
@@ -337,8 +338,11 @@ def _create_shapes_index(self) -> dict:
         self.logger.info("Creating shapes index")
         shapes_index = {}
         try:
+            encoding = detect_encoding(
+                filename=self.get_path(SHAPES_FILE), logger=self.logger
+            )
             with open(
-                self.get_path(SHAPES_FILE), "r", encoding="utf-8", newline=""
+                self.get_path(SHAPES_FILE), "r", encoding=encoding, newline=""
             ) as f:
                 header = f.readline()
                 columns = next(csv.reader([header]))
diff --git a/functions-python/pmtiles_builder/src/scripts/pmtiles_builder_verifier.py b/functions-python/pmtiles_builder/src/scripts/pmtiles_builder_verifier.py
@@ -25,8 +25,13 @@
         "dataset_stable_id": "mdb-2841-202509032137",
         "env": "prod",
     },
+    {
+        "stable_id": "mdb-733",
+        "dataset_stable_id": "mdb-733-202509111637",
+        "env": "prod",
+    },
 ]
-run_with_feed_index = 1  # Change this index to run with a different feed
+run_with_feed_index = 2  # Change this index to run with a different feed
 
 FILES = [STOP_TIMES_FILE, SHAPES_FILE, TRIPS_FILE, ROUTES_FILE, STOPS_FILE, AGENCY_FILE]