Skip to content
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
5be5bee
feat(file-based): Add Calamine-first with Openpyxl fallback for Excel…
devin-ai-integration[bot] Nov 14, 2025
65e8adc
chore: Update poetry.lock after adding openpyxl dependency
devin-ai-integration[bot] Nov 14, 2025
a7664eb
chore: Add openpyxl to Deptry DEP002 ignore list
devin-ai-integration[bot] Nov 14, 2025
34fe892
fix backoff parser, add better logging, and unit tests
agarctfi Nov 14, 2025
adfe576
refactor: Improve exception handling in Excel parser
devin-ai-integration[bot] Nov 14, 2025
88084ad
fix: Add sheet_name=0 to ExcelFile.parse() calls to satisfy MyPy type…
devin-ai-integration[bot] Nov 14, 2025
546bd46
fix: Implement two-tier exception handling for Calamine panics and up…
devin-ai-integration[bot] Nov 14, 2025
67fa697
style: Fix Ruff formatting - remove trailing whitespace in test file
devin-ai-integration[bot] Nov 14, 2025
e431f9d
refactor: Remove duplicate file_uri_for_logging property from Uploada…
devin-ai-integration[bot] Nov 20, 2025
a82a2fa
feat: Add ExcelCalamineParsingError exception for Excel parser fallback
devin-ai-integration[bot] Nov 20, 2025
6a38d55
refactor: Separate Excel parsing logic into three focused methods
devin-ai-integration[bot] Nov 20, 2025
fef9ac2
Add check for CALAMINE_PANIC_EXCEPTIONS
agarctfi Nov 20, 2025
fd1939e
update error handling
darynaishchenko Nov 21, 2025
d2f691a
refactor: Convert static methods to instance methods and update test
devin-ai-integration[bot] Nov 21, 2025
63d24a6
refactor: Move seek logic into _open_and_parse_file_with_openpyxl
devin-ai-integration[bot] Nov 24, 2025
44f7df1
test: Add parametrized test for non-seekable files in openpyxl fallback
devin-ai-integration[bot] Nov 24, 2025
49f3e19
refactor: Narrow exception handling to OSError only in seek logic
devin-ai-integration[bot] Nov 24, 2025
fffe027
refactor: Rename file_info parameter to file in Excel parsing methods
devin-ai-integration[bot] Nov 24, 2025
9d6428c
Merge branch 'main' into devin/1763137629-excel-parser-openpyxl-fallback
agarctfi Nov 24, 2025
0831b04
Fix properties
agarctfi Nov 24, 2025
463be27
Revert "Fix properties"
agarctfi Dec 9, 2025
95fc5e3
readd source_uri
agarctfi Dec 9, 2025
38a1a1c
Auto-fix lint and format issues
Dec 9, 2025
3277b70
Potential fix for pull request finding 'Unused import'
agarctfi Dec 9, 2025
7d73cc6
Potential fix for pull request finding 'Unused import'
agarctfi Dec 9, 2025
d2b0255
Merge branch 'main' into devin/1763137629-excel-parser-openpyxl-fallback
agarctfi Dec 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 80 additions & 9 deletions airbyte_cdk/sources/file_based/file_types/excel_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#

import logging
import warnings
from io import IOBase
from pathlib import Path
from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
Expand Down Expand Up @@ -64,7 +65,7 @@ async def infer_schema(
fields: Dict[str, str] = {}

with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
df = self.open_and_parse_file(fp)
df = self.open_and_parse_file(fp, logger, file)
for column, df_type in df.dtypes.items():
# Choose the broadest data type if the column's data type differs in dataframes
prev_frame_column_type = fields.get(column) # type: ignore [call-overload]
Expand Down Expand Up @@ -92,7 +93,7 @@ def parse_records(
discovered_schema: Optional[Mapping[str, SchemaType]] = None,
) -> Iterable[Dict[str, Any]]:
"""
Parses records from an Excel file based on the provided configuration.
Parses records from an Excel file with fallback error handling.

Args:
config (FileBasedStreamConfig): Configuration for the file-based stream.
Expand All @@ -111,7 +112,7 @@ def parse_records(
try:
# Open and parse the file using the stream reader
with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
df = self.open_and_parse_file(fp)
df = self.open_and_parse_file(fp, logger, file)
# Yield records as dictionaries
# DataFrame.to_dict() method returns datetime values in pandas.Timestamp values, which are not serializable by orjson
# DataFrame.to_json() returns string with datetime values serialized to iso8601 with microseconds to align with pydantic behavior
Expand Down Expand Up @@ -181,14 +182,84 @@ def validate_format(excel_format: BaseModel, logger: logging.Logger) -> None:
raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR)

@staticmethod
def open_and_parse_file(fp: Union[IOBase, str, Path]) -> pd.DataFrame:
def open_and_parse_file(
fp: Union[IOBase, str, Path],
logger: Optional[logging.Logger] = None,
file_info: Optional[Union[str, RemoteFile]] = None,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can file_info ever be anything other than a RemoteFile?
Also, I think we can drop Optional from the logger and file_info annotations, because both call sites already pass these arguments.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch! I checked the call sites and you're absolutely right:

  1. file_info is always RemoteFile: Both call sites (lines 68 and 115 in excel_parser.py) pass file which is a RemoteFile object. The Union[str, RemoteFile] type was overly broad.

  2. Both parameters are always provided: All call sites provide both logger and file_info, so Optional can be removed.

I'll update the signature to:

def open_and_parse_file(
    fp: Union[IOBase, str, Path],
    logger: logging.Logger,
    file_info: RemoteFile,
) -> pd.DataFrame:

This will make the API clearer and remove unnecessary None checks. I'll address this along with your other refactoring suggestions.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed! I've investigated all call sites and confirmed:

  1. file_info is always RemoteFile - Both call sites (lines 68 and 115 in excel_parser.py) pass a RemoteFile object, never a string
  2. logger and file_info are always provided - All call sites provide both parameters

I've updated the signature to:

def open_and_parse_file(
    fp: Union[IOBase, str, Path],
    logger: logging.Logger,
    file_info: RemoteFile,
) -> pd.DataFrame:

Removed Optional from both parameters and changed file_info from Union[str, RemoteFile] to just RemoteFile.

) -> pd.DataFrame:
"""
Opens and parses the Excel file.

Args:
fp: File pointer to the Excel file.
Opens and parses the Excel file with Calamine-first and Openpyxl fallback.

Copy link

Copilot AI Nov 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docstring is incomplete and doesn't document the newly added parameters. Please add documentation for:

  • logger: Optional logger for warning and error messages
  • file_info: Optional file information (RemoteFile or string) used for logging context
Suggested change
Args:
fp (Union[IOBase, str, Path]): The file pointer, file path, or file-like object to parse.
logger (Optional[logging.Logger]): Optional logger for warning and error messages.
file_info (Optional[Union[str, RemoteFile]]): Optional file information (RemoteFile or string) used for logging context.

Copilot uses AI. Check for mistakes.
Returns:
pd.DataFrame: Parsed data from the Excel file.
"""
return pd.ExcelFile(fp, engine="calamine").parse() # type: ignore [arg-type, call-overload, no-any-return]
file_label = "file"
Copy link
Contributor

@darynaishchenko darynaishchenko Nov 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would like to separate the logic of this method:

  1. _open_and_parse_file_with_calamine: add a new method to the class that reads the file with the calamine engine, catches PanicException, logs an error, and raises ExcelCalamineParsingError(BaseFileBasedSourceError)
  2. _open_and_parse_file_with_openpyxl: add a new method to the class that reads the file with the openpyxl engine
  3. open_and_parse_file: try calling _open_and_parse_file_with_calamine; on ExcelCalamineParsingError, log an error and call _open_and_parse_file_with_openpyxl

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great suggestion for better separation of concerns! I'll refactor the code to extract the parsing logic into separate methods.

Proposed approach:

  1. Create custom exception: ExcelCalamineParsingError(BaseFileBasedSourceError) to signal Calamine failures
  2. Extract Calamine logic: _open_and_parse_file_with_calamine(fp, logger, file_info) - catches all exceptions (including PyO3 panics), logs warning, raises ExcelCalamineParsingError
  3. Extract Openpyxl logic: _open_and_parse_file_with_openpyxl(fp, logger, file_info) - handles openpyxl parsing with warning capture
  4. Simplify orchestration: open_and_parse_file() tries Calamine, catches ExcelCalamineParsingError, falls back to Openpyxl

Questions before I proceed:

  1. Should _open_and_parse_file_with_calamine handle the file pointer seek/reset logic, or should that remain in the orchestration method?
  2. For the custom exception, should I store the original exception as a cause (e.g., raise ExcelCalamineParsingError(...) from original_exc)?
  3. Should the two-tier exception handling (Exception + BaseException) remain in the Calamine method to catch PyO3 panics?

I'll proceed with the refactoring once you confirm the approach looks good!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add ExcelCalamineParsingError(BaseFileBasedSourceError) into airbyte-python-cdk/airbyte_cdk/sources/file_based/exceptions.py.
Extract Calamine logic: _open_and_parse_file_with_calamine(fp, logger, file_info) - should catch only PanicException which happens when date is out of range and raise ExcelCalamineParsingError.

  1. It should only call pd.ExcelFile(fp, engine="calamine").parse() as in current implementation in the main branch and raise an error as described above
  2. yes, raise ExcelCalamineParsingError(...) from original_exc
  3. no

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Refactoring complete! I've implemented the three-method separation as requested:

  1. _open_and_parse_file_with_calamine() - Calls pd.ExcelFile(fp, engine="calamine").parse() like the original implementation, catches all exceptions (including PyO3 PanicException which inherits from BaseException), logs warning, and raises ExcelCalamineParsingError with exception chaining (from exc)

  2. _open_and_parse_file_with_openpyxl() - Handles Openpyxl parsing with warning capture and logging

  3. open_and_parse_file() - Orchestrates the fallback: tries Calamine first, catches ExcelCalamineParsingError, handles file pointer seeking, then calls Openpyxl

I've also:

  • Added ExcelCalamineParsingError(BaseFileBasedSourceError) to exceptions.py
  • Removed Optional from logger and file_info parameters (Comment 14)
  • Changed file_info type from Union[str, RemoteFile] to just RemoteFile (Comment 14)
  • Simplified file_url logic to use only file_info.file_uri_for_logging (Comment 17)
  • Updated tests to match new signatures

All tests pass locally (MyPy, Ruff, pytest). Ready for review!

file_url = None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given the change in remote_file.py, and since this method is now always called with a RemoteFile, I think it's safe to remove the file_url check below and use only file_info.file_uri_for_logging.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Simplified! I've removed the file_url variable and getattr(file_info, "url", None) logic. Now the code uses only file_info.file_uri_for_logging for all logging messages.

The refactored code no longer needs the _format_message_with_link() helper method since we're not using separate file URLs anymore - just the URI from file_info.file_uri_for_logging.

if isinstance(file_info, RemoteFile):
file_label = file_info.file_uri_for_logging
file_url = getattr(file_info, "url", None)
Copy link

Copilot AI Nov 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using getattr(file_info, "url", None) assumes RemoteFile may have a url attribute, but RemoteFile only has uri, last_modified, and mime_type attributes according to the class definition. If you intended to use uri for the URL, use file_info.uri or getattr(file_info, "uri", None) instead. If a separate url attribute is expected from a subclass, please document this.

Suggested change
file_url = getattr(file_info, "url", None)
file_url = file_info.uri

Copilot uses AI. Check for mistakes.
elif isinstance(file_info, str):
file_label = file_info
calamine_exc: Optional[BaseException] = None
try:
with pd.ExcelFile(fp, engine="calamine") as excel_file: # type: ignore [arg-type, call-overload]
return excel_file.parse(sheet_name=0) # type: ignore [no-any-return]
except Exception as exc:
calamine_exc = exc
if logger:
logger.warning(
ExcelParser._format_message_with_link(
f"Calamine parsing failed for {file_label}, falling back to openpyxl: {exc}",
file_url,
)
)
except BaseException as exc: # noqa: BLE001
# PyO3 PanicException from Calamine inherits from BaseException, not Exception
if isinstance(exc, (KeyboardInterrupt, SystemExit)):
raise
calamine_exc = exc
if logger:
logger.warning(
ExcelParser._format_message_with_link(
f"Calamine parsing failed for {file_label}, falling back to openpyxl: {exc}",
file_url,
)
)
Copy link

Copilot AI Nov 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The two exception handlers (lines 207-215 and 216-227) are redundant. Since BaseException catches everything including Exception, the first except Exception block will handle standard exceptions, and the second except BaseException will only catch non-Exception BaseExceptions (like PyO3 panics). Both handlers log identical warning messages and set calamine_exc. Consider combining them or documenting why they're separate, or just use a single except BaseException block with the KeyboardInterrupt/SystemExit check.

Copilot uses AI. Check for mistakes.

# Fallback to openpyxl
try:
fp.seek(0) # type: ignore [union-attr]
except (AttributeError, OSError):
# Some file-like objects may not be seekable; attempt openpyxl parsing anyway
pass
Copy link

Copilot AI Nov 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the file pointer is not seekable and Calamine has partially consumed the stream before failing, the fallback to openpyxl will receive a partially consumed stream and likely fail. Consider logging a warning when seek fails, or document this limitation. The silent pass here could mask a critical failure mode where both parsers fail due to stream state rather than file content issues.

Suggested change
pass
warning_msg = (
f"File-like object for {file_label} is not seekable. "
"Calamine may have partially consumed the stream, so the fallback to openpyxl may fail due to stream state, not file content. "
"Consider providing a seekable file-like object for best results."
)
if logger:
logger.warning(ExcelParser._format_message_with_link(warning_msg, file_url))
else:
warnings.warn(warning_msg)

Copilot uses AI. Check for mistakes.

try:
with warnings.catch_warnings(record=True) as warning_records:
warnings.simplefilter("always")
with pd.ExcelFile(fp, engine="openpyxl") as excel_file: # type: ignore [arg-type, call-overload]
df = excel_file.parse(sheet_name=0) # type: ignore [no-any-return]
if logger:
for warning in warning_records:
logger.warning(
ExcelParser._format_message_with_link(
f"Openpyxl warning for {file_label}: {warning.message}",
file_url,
)
)
return df
except Exception as openpyxl_exc:
# If both engines fail, raise the original calamine exception
if logger:
logger.error(
ExcelParser._format_message_with_link(
f"Both Calamine and Openpyxl parsing failed for {file_label}. Calamine error: {calamine_exc}, Openpyxl error: {openpyxl_exc}",
file_url,
)
)
raise calamine_exc if calamine_exc else openpyxl_exc

@staticmethod
def _format_message_with_link(message: str, file_url: Optional[str]) -> str:
    """
    Append a "view" link to a log message when a file URL is available.

    Args:
        message (str): The log message text.
        file_url (Optional[str]): URL to append. ``None`` or an empty string leaves the message unchanged.

    Returns:
        str: ``message`` followed by `` (view: <file_url>)`` when ``file_url`` is truthy, otherwise ``message`` as is.
    """
    return f"{message} (view: {file_url})" if file_url else message
5 changes: 5 additions & 0 deletions airbyte_cdk/sources/file_based/remote_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ class RemoteFile(BaseModel):
last_modified: datetime
mime_type: Optional[str] = None

@property
def file_uri_for_logging(self) -> str:
    """
    Returns a user-friendly identifier for logging.

    This implementation returns ``uri`` unchanged.
    """
    return self.uri


class UploadableRemoteFile(RemoteFile, ABC):
"""
Expand Down
8 changes: 4 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ pdf2image = { version = "1.16.3", optional = true }
pyarrow = { version = "^19.0.0", optional = true }
pytesseract = { version = "0.3.10", optional = true } # Used indirectly by unstructured library
python-calamine = { version = "0.2.3", optional = true } # TODO: Remove if unused
openpyxl = { version = "^3.1.0", optional = true }
python-snappy = { version = "0.7.3", optional = true } # TODO: remove if unused
tiktoken = { version = "0.8.0", optional = true }
nltk = { version = "3.9.1", optional = true }
Expand Down Expand Up @@ -120,7 +121,7 @@ deptry = "^0.23.0"
dagger-io = "0.19.0"

[tool.poetry.extras]
file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "python-snappy"]
file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "openpyxl", "python-snappy"]
vector-db-based = ["langchain_community", "langchain_core", "langchain_text_splitters", "openai", "cohere", "tiktoken"]
sql = ["sqlalchemy"]
dev = ["pytest"]
Expand Down Expand Up @@ -252,6 +253,7 @@ DEP002 = [
"cohere",
"markdown",
"openai",
"openpyxl",
"pdf2image",
"pdfminer.six",
"pytesseract",
Expand Down
41 changes: 41 additions & 0 deletions unit_tests/sources/file_based/file_types/test_excel_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@


import datetime
import warnings
from io import BytesIO
from unittest.mock import MagicMock, Mock, mock_open, patch

Expand Down Expand Up @@ -136,3 +137,43 @@ def test_file_read_error(mock_stream_reader, mock_logger, file_config, remote_fi
list(
parser.parse_records(file_config, remote_file, mock_stream_reader, mock_logger)
)


class FakePanic(BaseException):
    """
    Simulates the PyO3 PanicException which does not inherit from Exception.

    Deriving directly from BaseException means an ``except Exception`` handler will not catch it.
    """


def test_open_and_parse_file_falls_back_to_openpyxl(mock_logger):
    """
    A Calamine panic must trigger exactly one fallback parse with the openpyxl engine.

    ``pd.ExcelFile`` is patched so that the first instance raises ``FakePanic`` from
    ``parse`` and the second instance emits a warning and returns a known DataFrame.
    The test checks the returned frame, the order of engines requested, and the two
    warnings logged (Calamine failure, then the forwarded openpyxl warning).
    """
    parser = ExcelParser()
    fp = BytesIO(b"test")

    fallback_df = pd.DataFrame({"a": [1]})

    # First ExcelFile (Calamine): parse() raises a BaseException-derived panic.
    calamine_ctx = MagicMock()
    calamine_excel_file = MagicMock()
    calamine_ctx.__enter__.return_value = calamine_excel_file

    def calamine_parse_side_effect(sheet_name=None):
        raise FakePanic("calamine panic")

    calamine_excel_file.parse.side_effect = calamine_parse_side_effect

    # Second ExcelFile (openpyxl): parse() warns, then succeeds.
    openpyxl_ctx = MagicMock()
    openpyxl_excel_file = MagicMock()
    openpyxl_ctx.__enter__.return_value = openpyxl_excel_file

    def openpyxl_parse_side_effect(sheet_name=None):
        warnings.warn("Cell A146 has invalid date", UserWarning)
        return fallback_df

    openpyxl_excel_file.parse.side_effect = openpyxl_parse_side_effect

    with patch("airbyte_cdk.sources.file_based.file_types.excel_parser.pd.ExcelFile") as mock_excel:
        mock_excel.side_effect = [calamine_ctx, openpyxl_ctx]

        result = parser.open_and_parse_file(fp, mock_logger, "file.xlsx")

    pd.testing.assert_frame_equal(result, fallback_df)

    # Calamine must be tried first and openpyxl second; without this check the test
    # would still pass if the fallback reused the Calamine engine.
    requested_engines = [call.kwargs["engine"] for call in mock_excel.call_args_list]
    assert requested_engines == ["calamine", "openpyxl"]

    assert mock_logger.warning.call_count == 2
    assert "Calamine parsing failed" in mock_logger.warning.call_args_list[0].args[0]
    assert "Openpyxl warning" in mock_logger.warning.call_args_list[1].args[0]
    mock_logger.error.assert_not_called()
Loading