Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
5be5bee
feat(file-based): Add Calamine-first with Openpyxl fallback for Excel…
devin-ai-integration[bot] Nov 14, 2025
65e8adc
chore: Update poetry.lock after adding openpyxl dependency
devin-ai-integration[bot] Nov 14, 2025
a7664eb
chore: Add openpyxl to Deptry DEP002 ignore list
devin-ai-integration[bot] Nov 14, 2025
34fe892
fix backoff parser, add better logging, and unit tests
agarctfi Nov 14, 2025
adfe576
refactor: Improve exception handling in Excel parser
devin-ai-integration[bot] Nov 14, 2025
88084ad
fix: Add sheet_name=0 to ExcelFile.parse() calls to satisfy MyPy type…
devin-ai-integration[bot] Nov 14, 2025
546bd46
fix: Implement two-tier exception handling for Calamine panics and up…
devin-ai-integration[bot] Nov 14, 2025
67fa697
style: Fix Ruff formatting - remove trailing whitespace in test file
devin-ai-integration[bot] Nov 14, 2025
e431f9d
refactor: Remove duplicate file_uri_for_logging property from Uploada…
devin-ai-integration[bot] Nov 20, 2025
a82a2fa
feat: Add ExcelCalamineParsingError exception for Excel parser fallback
devin-ai-integration[bot] Nov 20, 2025
6a38d55
refactor: Separate Excel parsing logic into three focused methods
devin-ai-integration[bot] Nov 20, 2025
fef9ac2
Add check for CALAMINE_PANIC_EXCEPTIONS
agarctfi Nov 20, 2025
fd1939e
update error handling
darynaishchenko Nov 21, 2025
d2f691a
refactor: Convert static methods to instance methods and update test
devin-ai-integration[bot] Nov 21, 2025
63d24a6
refactor: Move seek logic into _open_and_parse_file_with_openpyxl
devin-ai-integration[bot] Nov 24, 2025
44f7df1
test: Add parametrized test for non-seekable files in openpyxl fallback
devin-ai-integration[bot] Nov 24, 2025
49f3e19
refactor: Narrow exception handling to OSError only in seek logic
devin-ai-integration[bot] Nov 24, 2025
fffe027
refactor: Rename file_info parameter to file in Excel parsing methods
devin-ai-integration[bot] Nov 24, 2025
9d6428c
Merge branch 'main' into devin/1763137629-excel-parser-openpyxl-fallback
agarctfi Nov 24, 2025
0831b04
Fix properties
agarctfi Nov 24, 2025
463be27
Revert "Fix properties"
agarctfi Dec 9, 2025
95fc5e3
readd source_uri
agarctfi Dec 9, 2025
38a1a1c
Auto-fix lint and format issues
Dec 9, 2025
3277b70
Potential fix for pull request finding 'Unused import'
agarctfi Dec 9, 2025
7d73cc6
Potential fix for pull request finding 'Unused import'
agarctfi Dec 9, 2025
d2b0255
Merge branch 'main' into devin/1763137629-excel-parser-openpyxl-fallback
agarctfi Dec 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 25 additions & 9 deletions airbyte_cdk/sources/file_based/file_types/excel_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ async def infer_schema(
fields: Dict[str, str] = {}

with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
df = self.open_and_parse_file(fp)
df = self.open_and_parse_file(fp, logger, file.uri)
for column, df_type in df.dtypes.items():
# Choose the broadest data type if the column's data type differs in dataframes
prev_frame_column_type = fields.get(column) # type: ignore [call-overload]
Expand Down Expand Up @@ -92,7 +92,7 @@ def parse_records(
discovered_schema: Optional[Mapping[str, SchemaType]] = None,
) -> Iterable[Dict[str, Any]]:
"""
Parses records from an Excel file based on the provided configuration.
Parses records from an Excel file with fallback error handling.

Args:
config (FileBasedStreamConfig): Configuration for the file-based stream.
Expand All @@ -111,7 +111,7 @@ def parse_records(
try:
# Open and parse the file using the stream reader
with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
df = self.open_and_parse_file(fp)
df = self.open_and_parse_file(fp, logger, file.uri)
# Yield records as dictionaries
# DataFrame.to_dict() method returns datetime values in pandas.Timestamp values, which are not serializable by orjson
# DataFrame.to_json() returns string with datetime values serialized to iso8601 with microseconds to align with pydantic behavior
Expand Down Expand Up @@ -181,14 +181,30 @@ def validate_format(excel_format: BaseModel, logger: logging.Logger) -> None:
raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR)

@staticmethod
def open_and_parse_file(fp: Union[IOBase, str, Path]) -> pd.DataFrame:
def open_and_parse_file(
fp: Union[IOBase, str, Path],
logger: Optional[logging.Logger] = None,
file_uri: Optional[str] = None,
) -> pd.DataFrame:
"""
Opens and parses the Excel file.

Args:
fp: File pointer to the Excel file.
Opens and parses the Excel file with Calamine-first and Openpyxl fallback.

Copy link

Copilot AI Nov 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docstring is incomplete: it does not document the newly added parameters. Please add documentation for:

  • logger: Optional logger used to emit a warning when Calamine parsing fails and the parser falls back to openpyxl
  • file_uri: Optional URI of the file (string), used only as context in that log message
Suggested change
Args:
fp (Union[IOBase, str, Path]): The file pointer, file path, or file-like object to parse.
logger (Optional[logging.Logger]): Optional logger used to emit a warning when Calamine parsing fails and the parser falls back to openpyxl.
file_uri (Optional[str]): Optional URI of the file, used only as context in the fallback log message.

Copilot uses AI. Check for mistakes.
Returns:
pd.DataFrame: Parsed data from the Excel file.
"""
return pd.ExcelFile(fp, engine="calamine").parse() # type: ignore [arg-type, call-overload, no-any-return]
try:
return pd.ExcelFile(fp, engine="calamine").parse() # type: ignore [arg-type, call-overload, no-any-return]
except Exception as calamine_exc:
if logger:
logger.warning(
"Calamine parsing failed for %s, falling back to openpyxl: %s",
file_uri or "file",
str(calamine_exc),
)

try:
fp.seek(0) # type: ignore [union-attr]
except (AttributeError, OSError):
pass

return pd.ExcelFile(fp, engine="openpyxl").parse() # type: ignore [arg-type, call-overload, no-any-return]
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ pdf2image = { version = "1.16.3", optional = true }
pyarrow = { version = "^19.0.0", optional = true }
pytesseract = { version = "0.3.10", optional = true } # Used indirectly by unstructured library
python-calamine = { version = "0.2.3", optional = true } # TODO: Remove if unused
openpyxl = { version = "^3.1.0", optional = true }
python-snappy = { version = "0.7.3", optional = true } # TODO: remove if unused
tiktoken = { version = "0.8.0", optional = true }
nltk = { version = "3.9.1", optional = true }
Expand Down Expand Up @@ -120,7 +121,7 @@ deptry = "^0.23.0"
dagger-io = "0.19.0"

[tool.poetry.extras]
file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "python-snappy"]
file-based = ["avro", "fastavro", "pyarrow", "unstructured", "pdf2image", "pdfminer.six", "unstructured.pytesseract", "pytesseract", "markdown", "python-calamine", "openpyxl", "python-snappy"]
vector-db-based = ["langchain_community", "langchain_core", "langchain_text_splitters", "openai", "cohere", "tiktoken"]
sql = ["sqlalchemy"]
dev = ["pytest"]
Expand Down
Loading