Skip to content

Commit 5be5bee

Browse files
feat(file-based): Add Calamine-first with Openpyxl fallback for Excel parser
Implements a fallback mechanism for Excel file parsing to handle edge cases where Calamine fails (e.g., invalid date values such as the year 20225). The parser now tries Calamine first for performance, then falls back to Openpyxl if Calamine raises an error.

Changes:
- Modified open_and_parse_file() to wrap the Calamine call in try/except with fallback logic
- Added optional logger and file URI parameters so a warning is logged when the fallback is triggered
- Added openpyxl as an optional dependency in pyproject.toml
- Added openpyxl to the file-based extras list

This resolves crashes in the Google Drive source when processing large numbers of Excel files with malformed data, allowing syncs to complete successfully instead of failing entirely.

Fixes: airbytehq/oncall#10097
Co-Authored-By: unknown <>
1 parent 5d9125f commit 5be5bee

File tree

2 files changed

+27
-10
lines changed

2 files changed

+27
-10
lines changed

airbyte_cdk/sources/file_based/file_types/excel_parser.py

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ async def infer_schema(
6464
fields: Dict[str, str] = {}
6565

6666
with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
67-
df = self.open_and_parse_file(fp)
67+
df = self.open_and_parse_file(fp, logger, file.uri)
6868
for column, df_type in df.dtypes.items():
6969
# Choose the broadest data type if the column's data type differs in dataframes
7070
prev_frame_column_type = fields.get(column) # type: ignore [call-overload]
@@ -92,7 +92,7 @@ def parse_records(
9292
discovered_schema: Optional[Mapping[str, SchemaType]] = None,
9393
) -> Iterable[Dict[str, Any]]:
9494
"""
95-
Parses records from an Excel file based on the provided configuration.
95+
Parses records from an Excel file with fallback error handling.
9696
9797
Args:
9898
config (FileBasedStreamConfig): Configuration for the file-based stream.
@@ -111,7 +111,7 @@ def parse_records(
111111
try:
112112
# Open and parse the file using the stream reader
113113
with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
114-
df = self.open_and_parse_file(fp)
114+
df = self.open_and_parse_file(fp, logger, file.uri)
115115
# Yield records as dictionaries
116116
# DataFrame.to_dict() method returns datetime values in pandas.Timestamp values, which are not serializable by orjson
117117
# DataFrame.to_json() returns string with datetime values serialized to iso8601 with microseconds to align with pydantic behavior
@@ -181,14 +181,30 @@ def validate_format(excel_format: BaseModel, logger: logging.Logger) -> None:
181181
raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR)
182182

183183
@staticmethod
184-
def open_and_parse_file(fp: Union[IOBase, str, Path]) -> pd.DataFrame:
184+
def open_and_parse_file(
185+
fp: Union[IOBase, str, Path],
186+
logger: Optional[logging.Logger] = None,
187+
file_uri: Optional[str] = None,
188+
) -> pd.DataFrame:
185189
"""
186-
Opens and parses the Excel file.
187-
188-
Args:
189-
fp: File pointer to the Excel file.
190+
Opens and parses the Excel file with Calamine-first and Openpyxl fallback.
190191
191192
Returns:
192193
pd.DataFrame: Parsed data from the Excel file.
193194
"""
194-
return pd.ExcelFile(fp, engine="calamine").parse() # type: ignore [arg-type, call-overload, no-any-return]
195+
try:
196+
return pd.ExcelFile(fp, engine="calamine").parse() # type: ignore [arg-type, call-overload, no-any-return]
197+
except Exception as calamine_exc:
198+
if logger:
199+
logger.warning(
200+
"Calamine parsing failed for %s, falling back to openpyxl: %s",
201+
file_uri or "file",
202+
str(calamine_exc),
203+
)
204+
205+
try:
206+
fp.seek(0) # type: ignore [union-attr]
207+
except (AttributeError, OSError):
208+
pass
209+
210+
return pd.ExcelFile(fp, engine="openpyxl").parse() # type: ignore [arg-type, call-overload, no-any-return]

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ pdf2image = { version = "1.16.3", optional = true }
7373
pyarrow = { version = "^19.0.0", optional = true }
7474
pytesseract = { version = "0.3.10", optional = true } # Used indirectly by unstructured library
7575
python-calamine = { version = "0.2.3", optional = true } # TODO: Remove if unused
76+
openpyxl = { version = "^3.1.0", optional = true }
7677
python-snappy = { version = "0.7.3", optional = true } # TODO: remove if unused
7778
tiktoken = { version = "0.8.0", optional = true }
7879
nltk = { version = "3.9.1", optional = true }
@@ -120,7 +121,7 @@ deptry = "^0.23.0"
120121
dagger-io = "0.19.0"
121122

122123
[tool.poetry.extras]
123-
file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "python-snappy"]
124+
file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "openpyxl", "python-snappy"]
124125
vector-db-based = ["langchain_community", "langchain_core", "langchain_text_splitters", "openai", "cohere", "tiktoken"]
125126
sql = ["sqlalchemy"]
126127
dev = ["pytest"]

0 commit comments

Comments
 (0)