Skip to content

Commit 41bdce6

Browse files
committed
Improve efficiency of relational import
1 parent 374c930 commit 41bdce6

File tree

1 file changed

+50
-51
lines changed

1 file changed

+50
-51
lines changed

src/odoo_data_flow/importer.py

100755 → 100644
Lines changed: 50 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -323,66 +323,65 @@ def run_import( # noqa: C901
323323
return
324324

325325
# Read the CSV file with explicit schema for /id suffixed columns
326-
# Override automatic type inference to ensure all /id suffixed
327-
# columns are strings
328-
# Handle potential encoding issues when reading the CSV
329326
try:
330-
df = pl.read_csv(filename, separator=separator, truncate_ragged_lines=True)
331-
except Exception as e:
332-
log.warning(f"Error reading CSV with default settings: {e}")
333-
# If there are encoding issues, we may need to handle the file differently
334-
# This could be a character encoding issue in the file
335-
log.warning("Attempting to read CSV with explicit encoding...")
336-
# Note: polars.read_csv accepts encoding parameter but only supports
337-
# specific values ('utf8', 'utf8-lossy', 'windows-1252',
338-
# 'windows-1252-lossy')
339-
# Map common encodings to supported polars values or fallback to utf8
327+
# First, get the header to determine if schema overrides are needed.
328+
header = pl.read_csv(
329+
filename, separator=separator, n_rows=0, truncate_ragged_lines=True
330+
).columns
331+
id_columns = [col for col in header if col.endswith("/id")]
332+
schema_overrides = (
333+
{col: pl.Utf8 for col in id_columns} if id_columns else None
334+
)
335+
336+
# Now, read the full file once with the correct schema and
337+
# encoding fallbacks.
340338
polars_encoding = _map_encoding_to_polars(encoding)
341339
try:
342-
df = pl.read_csv(
340+
source_df = pl.read_csv(
343341
filename,
344342
separator=separator,
345343
encoding=polars_encoding,
346344
truncate_ragged_lines=True,
345+
schema_overrides=schema_overrides,
347346
)
348-
except ValueError as ve:
349-
# If the encoding is not supported by polars, fallback to utf8
350-
if "encoding" in str(ve).lower():
351-
log.warning(
352-
f"Unsupported encoding '{encoding}' for polars, "
353-
f"falling back to utf8: {ve}"
354-
)
355-
df = pl.read_csv(
356-
filename,
357-
separator=separator,
358-
encoding="utf8",
359-
truncate_ragged_lines=True,
360-
)
361-
else:
362-
raise
363-
364-
# Identify columns that end with /id suffix
365-
id_columns = [col for col in df.columns if col.endswith("/id")]
366-
367-
# If we have /id suffixed columns, re-read with explicit schema
368-
if id_columns:
369-
log.debug(f"Found /id suffixed columns: {id_columns}")
370-
# Create schema override to force /id columns to be strings
371-
schema_overrides = {col: pl.Utf8 for col in id_columns}
372-
log.debug(f"Schema overrides for /id columns: {schema_overrides}")
373-
# Re-read with explicit schema
374-
source_df = pl.read_csv(
375-
filename,
376-
separator=separator,
377-
truncate_ragged_lines=True,
378-
schema_overrides=schema_overrides,
379-
)
380-
log.debug(
381-
f"Re-read DataFrame with schema overrides. /id column types: "
382-
f"{[f'{col}: {source_df[col].dtype}' for col in id_columns]}"
347+
except (pl.exceptions.ComputeError, ValueError) as e:
348+
if "encoding" not in str(e).lower():
349+
raise # Not an encoding error, re-raise.
350+
351+
log.warning(
352+
f"Read failed with encoding '{encoding}', trying fallbacks..."
353+
)
354+
source_df = None
355+
for enc in ["utf8", "windows-1252", "latin-1", "iso-8859-1", "cp1252"]:
356+
try:
357+
source_df = pl.read_csv(
358+
filename,
359+
separator=separator,
360+
encoding=_map_encoding_to_polars(enc),
361+
truncate_ragged_lines=True,
362+
schema_overrides=schema_overrides,
363+
)
364+
log.warning(
365+
f"Successfully read with fallback encoding '{enc}'."
366+
)
367+
break
368+
except (pl.exceptions.ComputeError, ValueError):
369+
continue
370+
if source_df is None:
371+
raise ValueError(
372+
"Could not read CSV with any of the tried encodings."
373+
) from e
374+
except Exception as e:
375+
log.error(
376+
f"Failed to read source file '{filename}' for relational import: {e}"
383377
)
384-
else:
385-
source_df = df
378+
return
379+
# At this point, source_df is guaranteed to be a DataFrame since
380+
# we would have returned early if there was an error.
381+
if source_df is None:
382+
# This should never happen due to the logic above, but as a safety check
383+
raise RuntimeError("source_df is unexpectedly None after CSV reading")
384+
386385
# Only proceed with relational import if there are strategies defined
387386
strategies = import_plan.get("strategies", {})
388387
if strategies:

0 commit comments

Comments
 (0)