Skip to content
Open
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
5be5bee
feat(file-based): Add Calamine-first with Openpyxl fallback for Excel…
devin-ai-integration[bot] Nov 14, 2025
65e8adc
chore: Update poetry.lock after adding openpyxl dependency
devin-ai-integration[bot] Nov 14, 2025
a7664eb
chore: Add openpyxl to Deptry DEP002 ignore list
devin-ai-integration[bot] Nov 14, 2025
34fe892
fix backoff parser, add better logging, and unit tests
agarctfi Nov 14, 2025
adfe576
refactor: Improve exception handling in Excel parser
devin-ai-integration[bot] Nov 14, 2025
88084ad
fix: Add sheet_name=0 to ExcelFile.parse() calls to satisfy MyPy type…
devin-ai-integration[bot] Nov 14, 2025
546bd46
fix: Implement two-tier exception handling for Calamine panics and up…
devin-ai-integration[bot] Nov 14, 2025
67fa697
style: Fix Ruff formatting - remove trailing whitespace in test file
devin-ai-integration[bot] Nov 14, 2025
e431f9d
refactor: Remove duplicate file_uri_for_logging property from Uploada…
devin-ai-integration[bot] Nov 20, 2025
a82a2fa
feat: Add ExcelCalamineParsingError exception for Excel parser fallback
devin-ai-integration[bot] Nov 20, 2025
6a38d55
refactor: Separate Excel parsing logic into three focused methods
devin-ai-integration[bot] Nov 20, 2025
fef9ac2
Add check for CALAMINE_PANIC_EXCEPTIONS
agarctfi Nov 20, 2025
fd1939e
update error handling
darynaishchenko Nov 21, 2025
d2f691a
refactor: Convert static methods to instance methods and update test
devin-ai-integration[bot] Nov 21, 2025
63d24a6
refactor: Move seek logic into _open_and_parse_file_with_openpyxl
devin-ai-integration[bot] Nov 24, 2025
44f7df1
test: Add parametrized test for non-seekable files in openpyxl fallback
devin-ai-integration[bot] Nov 24, 2025
49f3e19
refactor: Narrow exception handling to OSError only in seek logic
devin-ai-integration[bot] Nov 24, 2025
fffe027
refactor: Rename file_info parameter to file in Excel parsing methods
devin-ai-integration[bot] Nov 24, 2025
9d6428c
Merge branch 'main' into devin/1763137629-excel-parser-openpyxl-fallback
agarctfi Nov 24, 2025
0831b04
Fix properties
agarctfi Nov 24, 2025
463be27
Revert "Fix properties"
agarctfi Dec 9, 2025
95fc5e3
readd source_uri
agarctfi Dec 9, 2025
38a1a1c
Auto-fix lint and format issues
Dec 9, 2025
3277b70
Potential fix for pull request finding 'Unused import'
agarctfi Dec 9, 2025
7d73cc6
Potential fix for pull request finding 'Unused import'
agarctfi Dec 9, 2025
d2b0255
Merge branch 'main' into devin/1763137629-excel-parser-openpyxl-fallback
agarctfi Dec 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions airbyte_cdk/sources/file_based/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,12 @@ class RecordParseError(BaseFileBasedSourceError):
pass


class ExcelCalamineParsingError(BaseFileBasedSourceError):
    """Signals that the Calamine engine could not parse an Excel file."""


class SchemaInferenceError(BaseFileBasedSourceError):
pass

Expand Down
93 changes: 86 additions & 7 deletions airbyte_cdk/sources/file_based/file_types/excel_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
#

import logging
import warnings
from io import IOBase
from pathlib import Path
from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Type, Union, cast

import orjson
import pandas as pd
Expand All @@ -17,6 +18,7 @@
)
from airbyte_cdk.sources.file_based.exceptions import (
ConfigValidationError,
ExcelCalamineParsingError,
FileBasedSourceError,
RecordParseError,
)
Expand Down Expand Up @@ -64,7 +66,7 @@ async def infer_schema(
fields: Dict[str, str] = {}

with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
df = self.open_and_parse_file(fp)
df = self.open_and_parse_file(fp, logger, file)
for column, df_type in df.dtypes.items():
# Choose the broadest data type if the column's data type differs in dataframes
prev_frame_column_type = fields.get(column) # type: ignore [call-overload]
Expand Down Expand Up @@ -92,7 +94,7 @@ def parse_records(
discovered_schema: Optional[Mapping[str, SchemaType]] = None,
) -> Iterable[Dict[str, Any]]:
"""
Parses records from an Excel file based on the provided configuration.
Parses records from an Excel file with fallback error handling.

Args:
config (FileBasedStreamConfig): Configuration for the file-based stream.
Expand All @@ -111,7 +113,7 @@ def parse_records(
try:
# Open and parse the file using the stream reader
with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
df = self.open_and_parse_file(fp)
df = self.open_and_parse_file(fp, logger, file)
# Yield records as dictionaries
# DataFrame.to_dict() method returns datetime values in pandas.Timestamp values, which are not serializable by orjson
# DataFrame.to_json() returns string with datetime values serialized to iso8601 with microseconds to align with pydantic behavior
Expand Down Expand Up @@ -181,14 +183,91 @@ def validate_format(excel_format: BaseModel, logger: logging.Logger) -> None:
raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR)

@staticmethod
def open_and_parse_file(fp: Union[IOBase, str, Path]) -> pd.DataFrame:
def _open_and_parse_file_with_calamine(
fp: Union[IOBase, str, Path],
logger: logging.Logger,
file_info: RemoteFile,
) -> pd.DataFrame:
"""Opens and parses Excel file using Calamine engine.

Args:
fp: File pointer to the Excel file.
logger: Logger for logging information and errors.
file_info: Remote file information for logging context.

Copy link

Copilot AI Nov 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docstring is incomplete and doesn't document the newly added parameters. Please add documentation for:

  • logger: Optional logger for warning and error messages
  • file_info: Optional file information (RemoteFile or string) used for logging context
Suggested change
Args:
fp (Union[IOBase, str, Path]): The file pointer, file path, or file-like object to parse.
logger (Optional[logging.Logger]): Optional logger for warning and error messages.
file_info (Optional[Union[str, RemoteFile]]): Optional file information (RemoteFile or string) used for logging context.

Copilot uses AI. Check for mistakes.
Returns:
pd.DataFrame: Parsed data from the Excel file.

Raises:
ExcelCalamineParsingError: If Calamine fails to parse the file.
"""
Opens and parses the Excel file.
try:
return pd.ExcelFile(fp, engine="calamine").parse() # type: ignore [arg-type, call-overload, no-any-return]
except BaseException as exc:
# The Calamine engine raises PanicException (a child of BaseException) when it fails to parse the file.
# Check whether "ValueError" appears in the exception message to tell if the failure was a parsing error caused by invalid cell values.
# Otherwise, re-raise the exception.
if "ValueError" in str(exc):
logger.warning(
f"Calamine parsing failed for {file_info.file_uri_for_logging}, falling back to openpyxl: {exc}"
)
raise ExcelCalamineParsingError(
f"Calamine engine failed to parse {file_info.file_uri_for_logging}",
filename=file_info.uri,
) from exc
raise exc

@staticmethod
def _open_and_parse_file_with_openpyxl(
    fp: Union[IOBase, str, Path],
    logger: logging.Logger,
    file_info: RemoteFile,
) -> pd.DataFrame:
    """Open and parse an Excel file with the openpyxl engine.

    Warnings raised while parsing are recorded and forwarded to ``logger``
    at warning level, one log entry per recorded warning.

    Args:
        fp: File pointer, or path, of the Excel file.
        logger: Logger that receives the forwarded openpyxl warnings.
        file_info: Remote file whose ``file_uri_for_logging`` identifies the
            file in log messages.

    Returns:
        pd.DataFrame: Parsed data from the Excel file.
    """
    # An earlier parsing attempt may have consumed the stream, so rewind it
    # when possible. Plain paths (str/Path) have no ``seek`` and are skipped.
    if hasattr(fp, "seek"):
        try:
            fp.seek(0)  # type: ignore [union-attr]
        except (AttributeError, OSError) as exc:
            # Not fatal: openpyxl is still attempted from the current position.
            logger.info(
                f"Could not rewind stream for {file_info.file_uri_for_logging}; "
                f"proceeding with openpyxl from current position: {exc}"
            )

    # Record every warning (no de-duplication) so each one can be logged below.
    with warnings.catch_warnings(record=True) as warning_records:
        warnings.simplefilter("always")
        df = pd.ExcelFile(fp, engine="openpyxl").parse()  # type: ignore [arg-type, call-overload]

    for warning in warning_records:
        logger.warning(
            f"Openpyxl warning for {file_info.file_uri_for_logging}: {warning.message}"
        )

    return df  # type: ignore [no-any-return]

@staticmethod
def open_and_parse_file(
fp: Union[IOBase, str, Path],
logger: logging.Logger,
file_info: RemoteFile,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we name this parameter file instead of file_info, since we know it is an instance of RemoteFile?
The same applies here and in _open_and_parse_file_with_calamine and _open_and_parse_file_with_openpyxl.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done! I've renamed file_info to file across all three methods.

Changes Made:

  • _open_and_parse_file_with_calamine: Renamed parameter and all references
  • _open_and_parse_file_with_openpyxl: Renamed parameter and all references
  • open_and_parse_file: Renamed parameter and all references

Rationale:
Since the type is RemoteFile, the shorter name file is more appropriate and makes the code more concise while maintaining clarity.

Local Verification:

  • ✅ All Excel parser tests pass (5 passed, 1 skipped)
  • ✅ Ruff format and lint pass

Commit: fffe027 - "refactor: Rename file_info parameter to file in Excel parsing methods"

) -> pd.DataFrame:
"""Opens and parses the Excel file with Calamine-first and Openpyxl fallback.

Args:
fp: File pointer to the Excel file.
logger: Logger for logging information and errors.
file_info: Remote file information for logging context.

Returns:
pd.DataFrame: Parsed data from the Excel file.
"""
try:
return ExcelParser._open_and_parse_file_with_calamine(fp, logger, file_info)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like, with this change, the open_and_parse_file method should no longer be static; it should be updated to use self.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point! Since these methods are calling each other within the class, they should be regular instance methods using self instead of @staticmethod. I'll update all three methods to remove @staticmethod and use self for the method calls.

except ExcelCalamineParsingError:
# Fallback to openpyxl
try:
fp.seek(0) # type: ignore [union-attr]
except (AttributeError, OSError):
# Some file-like objects may not be seekable; attempt openpyxl parsing anyway
pass
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this try/except needed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The try/except is needed because some file-like objects (like certain stream types) may not support seeking. The AttributeError catches objects that don't have a seek method, and OSError catches objects that have seek but it fails (e.g., non-seekable streams).

If we don't catch these exceptions, the fallback would fail even though Openpyxl might still be able to parse the file from its current position. The comment explains this: "Some file-like objects may not be seekable; attempt openpyxl parsing anyway"

However, I agree this could be improved - we could log a warning when seeking fails to make it more visible that the stream might be partially consumed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But the except block only contains pass, so _open_and_parse_file_with_openpyxl will be called anyway.
I suggest moving this try/except block into _open_and_parse_file_with_openpyxl and adding logging instead of pass.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good suggestion! I've moved the fp.seek(0) try/except block into _open_and_parse_file_with_openpyxl and added logging instead of pass.

Changes made:

  1. ✅ Moved seek logic into _open_and_parse_file_with_openpyxl method
  2. ✅ Added hasattr(fp, "seek") check to avoid AttributeError on non-file-like objects (str/Path)
  3. ✅ Added info-level logging for seek failures: "Could not rewind stream for {file_uri}; proceeding with openpyxl from current position: {exc}"
  4. ✅ Removed duplicate seek logic from open_and_parse_file orchestration method
  5. ✅ Added brief comment explaining why we attempt to rewind

Why info-level instead of warning:

  • The Calamine fallback warning and openpyxl warnings are already logged at warning level
  • A failed seek is not necessarily a problem by itself since we still attempt parsing
  • Using info keeps the warning count stable (the test asserts 2 warnings: fallback + openpyxl)

This refactoring centralizes fallback-specific concerns within the openpyxl path and simplifies the orchestration method to focus purely on flow control.

Commit: 63d24a6 - "refactor: Move seek logic into _open_and_parse_file_with_openpyxl"

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you also add a unit test that passes a non-seekable file to the openpyxl read path and confirms that the info log is emitted? The test should cover non-seekable files that raise AttributeError as well as ones that raise OSError.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done! I've added a parametrized unit test that covers both AttributeError and OSError cases for non-seekable files.

Test Added:
test_openpyxl_logs_info_when_seek_fails - Parametrized test with two cases:

  • attribute-error: Tests when seek() raises AttributeError
  • os-error: Tests when seek() raises OSError

Test Implementation:

  • Creates a FakeFP class with a seek() method that raises the desired exception
  • Tests _open_and_parse_file_with_openpyxl directly to isolate the behavior
  • Verifies the info log is emitted with correct content:
    • Checks for "Could not rewind stream" message
    • Verifies file URI is included in the log message
  • Confirms parsing proceeds and returns the expected DataFrame
  • Verifies pd.ExcelFile is called with engine="openpyxl"

Local Verification:

  • ✅ Both test cases pass (attribute-error and os-error)
  • ✅ All Excel parser tests pass (6 passed, 1 skipped)
  • ✅ Ruff format and lint pass

Commit: 44f7df1 - "test: Add parametrized test for non-seekable files in openpyxl fallback"


return ExcelParser._open_and_parse_file_with_openpyxl(fp, logger, file_info)
12 changes: 5 additions & 7 deletions airbyte_cdk/sources/file_based/remote_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ class RemoteFile(BaseModel):
last_modified: datetime
mime_type: Optional[str] = None

@property
def file_uri_for_logging(self) -> str:
    """Identifier to show for this file in log messages; currently the file's ``uri``."""
    return self.uri


class UploadableRemoteFile(RemoteFile, ABC):
"""
Expand Down Expand Up @@ -48,10 +53,3 @@ def source_file_relative_path(self) -> str:
Returns the relative path of the source file.
"""
return self.uri

@property
def file_uri_for_logging(self) -> str:
"""
Returns the URI for the file being logged.
"""
return self.uri
8 changes: 4 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ pdf2image = { version = "1.16.3", optional = true }
pyarrow = { version = "^19.0.0", optional = true }
pytesseract = { version = "0.3.10", optional = true } # Used indirectly by unstructured library
python-calamine = { version = "0.2.3", optional = true } # TODO: Remove if unused
openpyxl = { version = "^3.1.0", optional = true }
python-snappy = { version = "0.7.3", optional = true } # TODO: remove if unused
tiktoken = { version = "0.8.0", optional = true }
nltk = { version = "3.9.1", optional = true }
Expand Down Expand Up @@ -120,7 +121,7 @@ deptry = "^0.23.0"
dagger-io = "0.19.0"

[tool.poetry.extras]
file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "python-snappy"]
file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "openpyxl", "python-snappy"]
vector-db-based = ["langchain_community", "langchain_core", "langchain_text_splitters", "openai", "cohere", "tiktoken"]
sql = ["sqlalchemy"]
dev = ["pytest"]
Expand Down Expand Up @@ -252,6 +253,7 @@ DEP002 = [
"cohere",
"markdown",
"openai",
"openpyxl",
"pdf2image",
"pdfminer.six",
"pytesseract",
Expand Down
53 changes: 53 additions & 0 deletions unit_tests/sources/file_based/file_types/test_excel_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@


import datetime
import warnings
from io import BytesIO
from unittest.mock import MagicMock, Mock, mock_open, patch

Expand Down Expand Up @@ -136,3 +137,55 @@ def test_file_read_error(mock_stream_reader, mock_logger, file_config, remote_fi
list(
parser.parse_records(file_config, remote_file, mock_stream_reader, mock_logger)
)


class FakePanic(BaseException):
    """Test double for the PyO3 PanicException, which derives from BaseException rather than Exception."""


def test_open_and_parse_file_falls_back_to_openpyxl(mock_logger):
    """A Calamine panic carrying a ValueError must trigger the openpyxl fallback and log its warnings."""
    expected_df = pd.DataFrame({"a": [1]})

    def raise_calamine_panic():
        raise FakePanic(
            "failed to construct date: PyErr { type: <class 'ValueError'>, value: ValueError('year 20225 is out of range'), traceback: None }"
        )

    def warn_then_return_frame():
        warnings.warn("Cell A146 has invalid date", UserWarning)
        return expected_df

    # The first ExcelFile(...) call is the Calamine attempt, the second one the openpyxl fallback.
    calamine_reader = MagicMock()
    calamine_reader.parse.side_effect = raise_calamine_panic
    openpyxl_reader = MagicMock()
    openpyxl_reader.parse.side_effect = warn_then_return_frame

    excel_parser = ExcelParser()
    stream = BytesIO(b"test")
    file = RemoteFile(uri="s3://mybucket/test.xlsx", last_modified=datetime.datetime.now())

    with patch(
        "airbyte_cdk.sources.file_based.file_types.excel_parser.pd.ExcelFile"
    ) as excel_file_cls:
        excel_file_cls.side_effect = [calamine_reader, openpyxl_reader]
        parsed = excel_parser.open_and_parse_file(stream, mock_logger, file)

    pd.testing.assert_frame_equal(parsed, expected_df)
    # One warning for the Calamine fallback, one forwarded from openpyxl.
    assert mock_logger.warning.call_count == 2
    second_warning = mock_logger.warning.call_args_list[1]
    assert "Openpyxl warning" in second_warning.args[0]


def test_open_and_parse_file_does_not_swallow_keyboard_interrupt(mock_logger):
parser = ExcelParser()
fp = BytesIO(b"test")
remote_file = RemoteFile(uri="s3://mybucket/test.xlsx", last_modified=datetime.datetime.now())

with patch("airbyte_cdk.sources.file_based.file_types.excel_parser.pd.ExcelFile") as mock_excel:
mock_excel.return_value.parse.side_effect = KeyboardInterrupt()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please rewrite this test to use a system error as the side effect, and add a comment to the test explaining why we need it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch! The test should use SystemExit instead of KeyboardInterrupt to properly test that we're not catching system-level exceptions. I'll update the test to:

  1. Use SystemExit() as the side effect
  2. Add a comment explaining why this test is important (to ensure we don't accidentally catch and suppress critical system exceptions like SystemExit and KeyboardInterrupt that should propagate up)


with pytest.raises(KeyboardInterrupt):
parser.open_and_parse_file(fp, mock_logger, remote_file)
Loading