@@ -217,6 +217,23 @@ async def analyse_csv(
217217
218218
219219async def get_previous_analysis (resource_id : str ) -> dict | None :
220+ def match_columns (table_columns : list , analysis_columns : list ) -> dict | None :
221+ """If a column name is too long for postgres (>60 characters) it gets truncated in the table"""
222+ # retrieving the columns that match exactly between table and analysis
223+ matching = {col : col for col in table_columns if col in analysis_columns }
224+ # early stop if all columns match perfectly
225+ if len (matching ) == len (table_columns ):
226+ return matching
227+ # matching truncated columns in table with actual label
228+ for col in table_columns :
229+ if col in matching :
230+ continue
231+ for label in analysis_columns :
232+ if label .startswith (col ):
233+ matching [col ] = label
234+ break
235+ return matching if len (matching ) == len (table_columns ) else None
236+
220237 db = await context .pool ("csv" )
221238 q = (
222239 "SELECT parsing_table, csv_detective FROM tables_index "
@@ -229,12 +246,14 @@ async def get_previous_analysis(resource_id: str) -> dict | None:
229246 # the csv_detective column is JSONB, so keys are reordered compared to the actual table
230247 # so we get the right order from the table
231248 # landing here we can safely assume that the table exists
232- rows = await db .fetch (f'SELECT * FROM "{ res [0 ]["parsing_table" ]} " LIMIT 1' )
249+ rows : list [Record ] = await db .fetch (f'SELECT * FROM "{ res [0 ]["parsing_table" ]} " LIMIT 1' )
250+ # the __id column is generated by hydra, not natively in the data
251+ table_columns = [col for col in rows [0 ].keys () if col != "__id" ]
252+ matching = match_columns (table_columns , list (analysis ["columns" ].keys ()))
253+ if matching is None :
254+ return None
233255 analysis ["columns" ] = {
234- col : analysis ["columns" ][col ]
235- for col in rows [0 ].keys ()
236- # the __id column is generated by hydra, not natively in the data
237- if col != "__id"
256+ matching [col ]: analysis ["columns" ][matching [col ]] for col in table_columns
238257 }
239258 return analysis
240259
0 commit comments