@@ -323,66 +323,65 @@ def run_import( # noqa: C901
323323 return
324324
325325 # Read the CSV file with explicit schema for /id suffixed columns
326- # Override automatic type inference to ensure all /id suffixed
327- # columns are strings
328- # Handle potential encoding issues when reading the CSV
329326 try :
330- df = pl .read_csv (filename , separator = separator , truncate_ragged_lines = True )
331- except Exception as e :
332- log .warning (f"Error reading CSV with default settings: { e } " )
333- # If there are encoding issues, we may need to handle the file differently
334- # This could be a character encoding issue in the file
335- log .warning ("Attempting to read CSV with explicit encoding..." )
336- # Note: polars.read_csv accepts encoding parameter but only supports
337- # specific values ('utf8', 'utf8-lossy', 'windows-1252',
338- # 'windows-1252-lossy')
339- # Map common encodings to supported polars values or fallback to utf8
327+ # First, get the header to determine if schema overrides are needed.
328+ header = pl .read_csv (
329+ filename , separator = separator , n_rows = 0 , truncate_ragged_lines = True
330+ ).columns
331+ id_columns = [col for col in header if col .endswith ("/id" )]
332+ schema_overrides = (
333+ {col : pl .Utf8 for col in id_columns } if id_columns else None
334+ )
335+
336+ # Now, read the full file once with the correct schema and
337+ # encoding fallbacks.
340338 polars_encoding = _map_encoding_to_polars (encoding )
341339 try :
342- df = pl .read_csv (
340+ source_df = pl .read_csv (
343341 filename ,
344342 separator = separator ,
345343 encoding = polars_encoding ,
346344 truncate_ragged_lines = True ,
345+ schema_overrides = schema_overrides ,
347346 )
348- except ValueError as ve :
349- # If the encoding is not supported by polars, fallback to utf8
350- if "encoding" in str (ve ).lower ():
351- log .warning (
352- f"Unsupported encoding '{ encoding } ' for polars, "
353- f"falling back to utf8: { ve } "
354- )
355- df = pl .read_csv (
356- filename ,
357- separator = separator ,
358- encoding = "utf8" ,
359- truncate_ragged_lines = True ,
360- )
361- else :
362- raise
363-
364- # Identify columns that end with /id suffix
365- id_columns = [col for col in df .columns if col .endswith ("/id" )]
366-
367- # If we have /id suffixed columns, re-read with explicit schema
368- if id_columns :
369- log .debug (f"Found /id suffixed columns: { id_columns } " )
370- # Create schema override to force /id columns to be strings
371- schema_overrides = {col : pl .Utf8 for col in id_columns }
372- log .debug (f"Schema overrides for /id columns: { schema_overrides } " )
373- # Re-read with explicit schema
374- source_df = pl .read_csv (
375- filename ,
376- separator = separator ,
377- truncate_ragged_lines = True ,
378- schema_overrides = schema_overrides ,
379- )
380- log .debug (
381- f"Re-read DataFrame with schema overrides. /id column types: "
382- f"{ [f'{ col } : { source_df [col ].dtype } ' for col in id_columns ]} "
347+ except (pl .exceptions .ComputeError , ValueError ) as e :
348+ if "encoding" not in str (e ).lower ():
349+ raise # Not an encoding error, re-raise.
350+
351+ log .warning (
352+ f"Read failed with encoding '{ encoding } ', trying fallbacks..."
353+ )
354+ source_df = None
355+ for enc in ["utf8" , "windows-1252" , "latin-1" , "iso-8859-1" , "cp1252" ]:
356+ try :
357+ source_df = pl .read_csv (
358+ filename ,
359+ separator = separator ,
360+ encoding = _map_encoding_to_polars (enc ),
361+ truncate_ragged_lines = True ,
362+ schema_overrides = schema_overrides ,
363+ )
364+ log .warning (
365+ f"Successfully read with fallback encoding '{ enc } '."
366+ )
367+ break
368+ except (pl .exceptions .ComputeError , ValueError ):
369+ continue
370+ if source_df is None :
371+ raise ValueError (
372+ "Could not read CSV with any of the tried encodings."
373+ ) from e
374+ except Exception as e :
375+ log .error (
376+ f"Failed to read source file '{ filename } ' for relational import: { e } "
383377 )
384- else :
385- source_df = df
378+ return
379+ # At this point, source_df is guaranteed to be a DataFrame since
380+ # we would have returned early if there was an error.
381+ if source_df is None :
382+ # This should never happen due to the logic above, but as a safety check
383+ raise RuntimeError ("source_df is unexpectedly None after CSV reading" )
384+
386385 # Only proceed with relational import if there are strategies defined
387386 strategies = import_plan .get ("strategies" , {})
388387 if strategies :
0 commit comments