@@ -217,6 +217,23 @@ async def analyse_csv(
217
217
218
218
219
219
async def get_previous_analysis (resource_id : str ) -> dict | None :
220
+ def match_columns (table_columns : list , analysis_columns : list ) -> dict | None :
221
+ """If a column name is too long for postgres (>60 characters) it gets truncated in the table"""
222
+ # retrieving the columns that match exactly between table and analysis
223
+ matching = {col : col for col in table_columns if col in analysis_columns }
224
+ # early stop if all columns match perfectly
225
+ if len (matching ) == len (table_columns ):
226
+ return matching
227
+ # matching truncated columns in table with actual label
228
+ for col in table_columns :
229
+ if col in matching :
230
+ continue
231
+ for label in analysis_columns :
232
+ if label .startswith (col ):
233
+ matching [col ] = label
234
+ break
235
+ return matching if len (matching ) == len (table_columns ) else None
236
+
220
237
db = await context .pool ("csv" )
221
238
q = (
222
239
"SELECT parsing_table, csv_detective FROM tables_index "
@@ -229,12 +246,14 @@ async def get_previous_analysis(resource_id: str) -> dict | None:
229
246
# the csv_detective column is JSONB, so keys are reordered compared to the actual table
230
247
# so we get the right order from the table
231
248
# landing here we can safely assume that the table exists
232
- rows = await db .fetch (f'SELECT * FROM "{ res [0 ]["parsing_table" ]} " LIMIT 1' )
249
+ rows : list [Record ] = await db .fetch (f'SELECT * FROM "{ res [0 ]["parsing_table" ]} " LIMIT 1' )
250
+ # the __id column is generated by hydra, not natively in the data
251
+ table_columns = [col for col in rows [0 ].keys () if col != "__id" ]
252
+ matching = match_columns (table_columns , list (analysis ["columns" ].keys ()))
253
+ if matching is None :
254
+ return None
233
255
analysis ["columns" ] = {
234
- col : analysis ["columns" ][col ]
235
- for col in rows [0 ].keys ()
236
- # the __id column is generated by hydra, not natively in the data
237
- if col != "__id"
256
+ matching [col ]: analysis ["columns" ][matching [col ]] for col in table_columns
238
257
}
239
258
return analysis
240
259
0 commit comments