Commit 07a3a28

Handle too long column labels for Postgres (#299)
1 parent 7a00041 commit 07a3a28

3 files changed: +43 -5 lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@
 - Fix purge csv tables CLI by using the csv db connection [#294](https://github.com/datagouv/hydra/pull/294)
 - Better gz files extraction function name [#295](https://github.com/datagouv/hydra/pull/295)
 - Add more detailed statuses [#297](https://github.com/datagouv/hydra/pull/297)
+- Handle cases of too long columns labels for postgres [#298](https://github.com/datagouv/hydra/pull/298)

 ## 2.3.0 (2025-07-15)

tests/test_analysis/test_analysis_csv.py

Lines changed: 18 additions & 0 deletions
@@ -459,6 +459,24 @@ def create_analysis(scan: dict) -> dict:
             },
             False,
         ),
+        # some column names get truncated in db, but validation works (file content and analysis are unchanged)
+        (
+            *(
+                default_kwargs
+                | {
+                    "header": ["a" * 70, "b" * 70, "a" * 30],
+                    "rows": [["1", "13002526500013", "1.2"], ["5", "38271817900023", "2.3"]],
+                    "columns": {
+                        "a" * 70: {"score": 1.0, "format": "int", "python_type": "int"},
+                        "b" * 70: {"score": 1.0, "format": "siret", "python_type": "string"},
+                        "a" * 30: {"score": 1.0, "format": "float", "python_type": "float"},
+                    },
+                    "formats": {"int": ["a" * 70], "siret": ["b" * 70], "float": ["a" * 30]},
+                },
+            )
+            * 2,
+            True,
+        ),
     ),
 )
 async def test_validation(
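For context on the new test case: the 70-character headers exceed Postgres' default identifier limit of 63 bytes (NAMEDATALEN - 1), so they come back truncated when the parsing table is read, while the 30-character header is stored unchanged. A minimal sketch of that behaviour, outside the commit and with an illustrative helper name:

# Illustrative only: Postgres silently truncates identifiers to 63 bytes by
# default; this helper merely mimics that for plain-ASCII column names.
POSTGRES_MAX_IDENTIFIER_BYTES = 63  # default NAMEDATALEN - 1

def truncate_identifier(name: str) -> str:
    """Approximate Postgres' identifier truncation for ASCII names."""
    return name[:POSTGRES_MAX_IDENTIFIER_BYTES]

assert truncate_identifier("a" * 70) == "a" * 63  # truncated in the table
assert truncate_identifier("a" * 30) == "a" * 30  # short enough, kept as-is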

udata_hydra/analysis/csv.py

Lines changed: 24 additions & 5 deletions
@@ -217,6 +217,23 @@ async def analyse_csv(


 async def get_previous_analysis(resource_id: str) -> dict | None:
+    def match_columns(table_columns: list, analysis_columns: list) -> dict | None:
+        """If a column name is too long for postgres (>60 characters) it gets truncated in the table"""
+        # retrieving the columns that match exactly between table and analysis
+        matching = {col: col for col in table_columns if col in analysis_columns}
+        # early stop if all columns match perfectly
+        if len(matching) == len(table_columns):
+            return matching
+        # matching truncated columns in table with actual label
+        for col in table_columns:
+            if col in matching:
+                continue
+            for label in analysis_columns:
+                if label.startswith(col):
+                    matching[col] = label
+                    break
+        return matching if len(matching) == len(table_columns) else None
+
     db = await context.pool("csv")
     q = (
         "SELECT parsing_table, csv_detective FROM tables_index "
@@ -229,12 +246,14 @@ async def get_previous_analysis(resource_id: str) -> dict | None:
     # the csv_detective column is JSONB, so keys are reordered compared to the actual table
     # so we get the right order from the table
     # landing here we can safely assume that the table exists
-    rows = await db.fetch(f'SELECT * FROM "{res[0]["parsing_table"]}" LIMIT 1')
+    rows: list[Record] = await db.fetch(f'SELECT * FROM "{res[0]["parsing_table"]}" LIMIT 1')
+    # the __id column is generated by hydra, not natively in the data
+    table_columns = [col for col in rows[0].keys() if col != "__id"]
+    matching = match_columns(table_columns, list(analysis["columns"].keys()))
+    if matching is None:
+        return None
     analysis["columns"] = {
-        col: analysis["columns"][col]
-        for col in rows[0].keys()
-        # the __id column is generated by hydra, not natively in the data
-        if col != "__id"
+        matching[col]: analysis["columns"][matching[col]] for col in table_columns
     }
     return analysis
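The net effect of match_columns is to map each (possibly truncated) column name in the parsing table back to the full label stored in the JSONB csv_detective analysis, falling back to a prefix match when the exact name is missing. A standalone sketch of that mapping with made-up column names (the real helper is nested inside get_previous_analysis, so its logic is re-stated here purely for illustration):

# Hypothetical inputs: what the table and the stored analysis might contain
table_columns = ["a" * 63, "b" * 63, "a" * 30]      # names as stored by Postgres
analysis_columns = ["a" * 70, "b" * 70, "a" * 30]   # original labels from csv_detective

# 1) exact matches between table and analysis
matching = {col: col for col in table_columns if col in analysis_columns}
# 2) prefix matches for names that were truncated in the table
for col in table_columns:
    if col in matching:
        continue
    for label in analysis_columns:
        if label.startswith(col):
            matching[col] = label
            break

assert matching == {"a" * 63: "a" * 70, "b" * 63: "b" * 70, "a" * 30: "a" * 30}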
