Commit 07a3a28

Handle too long column labels for Postgres (#299)
1 parent 7a00041 commit 07a3a28

3 files changed: +43 -5 lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@
 - Fix purge csv tables CLI by using the csv db connection [#294](https://github.com/datagouv/hydra/pull/294)
 - Better gz files extraction function name [#295](https://github.com/datagouv/hydra/pull/295)
 - Add more detailed statuses [#297](https://github.com/datagouv/hydra/pull/297)
+- Handle cases of too long columns labels for postgres [#298](https://github.com/datagouv/hydra/pull/298)

 ## 2.3.0 (2025-07-15)

tests/test_analysis/test_analysis_csv.py

Lines changed: 18 additions & 0 deletions
@@ -459,6 +459,24 @@ def create_analysis(scan: dict) -> dict:
             },
             False,
         ),
+        # some column names get truncated in db, but validation works (file content and analysis are unchanged)
+        (
+            *(
+                default_kwargs
+                | {
+                    "header": ["a" * 70, "b" * 70, "a" * 30],
+                    "rows": [["1", "13002526500013", "1.2"], ["5", "38271817900023", "2.3"]],
+                    "columns": {
+                        "a" * 70: {"score": 1.0, "format": "int", "python_type": "int"},
+                        "b" * 70: {"score": 1.0, "format": "siret", "python_type": "string"},
+                        "a" * 30: {"score": 1.0, "format": "float", "python_type": "float"},
+                    },
+                    "formats": {"int": ["a" * 70], "siret": ["b" * 70], "float": ["a" * 30]},
+                },
+            )
+            * 2,
+            True,
+        ),
     ),
 )
 async def test_validation(
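For context on the new test case: the 70-character headers exceed Postgres' default identifier limit of 63 bytes (NAMEDATALEN - 1), so they come back truncated when the parsing table is read, while the 30-character header is stored unchanged. A minimal sketch of that behaviour, outside the commit and with an illustrative helper name:

# Illustrative only: Postgres silently truncates identifiers to 63 bytes by
# default; this helper merely mimics that for plain-ASCII column names.
POSTGRES_MAX_IDENTIFIER_BYTES = 63  # default NAMEDATALEN - 1

def truncate_identifier(name: str) -> str:
    """Approximate Postgres' identifier truncation for ASCII names."""
    return name[:POSTGRES_MAX_IDENTIFIER_BYTES]

assert truncate_identifier("a" * 70) == "a" * 63  # truncated in the table
assert truncate_identifier("a" * 30) == "a" * 30  # short enough, kept as-is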

udata_hydra/analysis/csv.py

Lines changed: 24 additions & 5 deletions
@@ -217,6 +217,23 @@ async def analyse_csv(


 async def get_previous_analysis(resource_id: str) -> dict | None:
+    def match_columns(table_columns: list, analysis_columns: list) -> dict | None:
+        """If a column name is too long for postgres (>60 characters) it gets truncated in the table"""
+        # retrieving the columns that match exactly between table and analysis
+        matching = {col: col for col in table_columns if col in analysis_columns}
+        # early stop if all columns match perfectly
+        if len(matching) == len(table_columns):
+            return matching
+        # matching truncated columns in table with actual label
+        for col in table_columns:
+            if col in matching:
+                continue
+            for label in analysis_columns:
+                if label.startswith(col):
+                    matching[col] = label
+                    break
+        return matching if len(matching) == len(table_columns) else None
+
     db = await context.pool("csv")
     q = (
         "SELECT parsing_table, csv_detective FROM tables_index "
@@ -229,12 +246,14 @@ async def get_previous_analysis(resource_id: str) -> dict | None:
     # the csv_detective column is JSONB, so keys are reordered compared to the actual table
     # so we get the right order from the table
     # landing here we can safely assume that the table exists
-    rows = await db.fetch(f'SELECT * FROM "{res[0]["parsing_table"]}" LIMIT 1')
+    rows: list[Record] = await db.fetch(f'SELECT * FROM "{res[0]["parsing_table"]}" LIMIT 1')
+    # the __id column is generated by hydra, not natively in the data
+    table_columns = [col for col in rows[0].keys() if col != "__id"]
+    matching = match_columns(table_columns, list(analysis["columns"].keys()))
+    if matching is None:
+        return None
     analysis["columns"] = {
-        col: analysis["columns"][col]
-        for col in rows[0].keys()
-        # the __id column is generated by hydra, not natively in the data
-        if col != "__id"
+        matching[col]: analysis["columns"][matching[col]] for col in table_columns
     }
     return analysis
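The net effect of match_columns is to map each (possibly truncated) column name in the parsing table back to the full label stored in the JSONB csv_detective analysis, falling back to a prefix match when the exact name is missing. A standalone sketch of that mapping with made-up column names (the real helper is nested inside get_previous_analysis, so its logic is re-stated here purely for illustration):

# Hypothetical inputs: what the table and the stored analysis might contain
table_columns = ["a" * 63, "b" * 63, "a" * 30]      # names as stored by Postgres
analysis_columns = ["a" * 70, "b" * 70, "a" * 30]   # original labels from csv_detective

# 1) exact matches between table and analysis
matching = {col: col for col in table_columns if col in analysis_columns}
# 2) prefix matches for names that were truncated in the table
for col in table_columns:
    if col in matching:
        continue
    for label in analysis_columns:
        if label.startswith(col):
            matching[col] = label
            break

assert matching == {"a" * 63: "a" * 70, "b" * 63: "b" * 70, "a" * 30: "a" * 30}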
