Skip to content

Commit 34fe892

Browse files
authored
fix backoff parser, add better logging, and unit tests
1 parent a7664eb commit 34fe892

File tree

3 files changed

+101
-13
lines changed

3 files changed

+101
-13
lines changed

airbyte_cdk/sources/file_based/file_types/excel_parser.py

Lines changed: 59 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#
44

55
import logging
6+
import warnings
67
from io import IOBase
78
from pathlib import Path
89
from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
@@ -64,7 +65,7 @@ async def infer_schema(
6465
fields: Dict[str, str] = {}
6566

6667
with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
67-
df = self.open_and_parse_file(fp, logger, file.uri)
68+
df = self.open_and_parse_file(fp, logger, file)
6869
for column, df_type in df.dtypes.items():
6970
# Choose the broadest data type if the column's data type differs in dataframes
7071
prev_frame_column_type = fields.get(column) # type: ignore [call-overload]
@@ -111,7 +112,7 @@ def parse_records(
111112
try:
112113
# Open and parse the file using the stream reader
113114
with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
114-
df = self.open_and_parse_file(fp, logger, file.uri)
115+
df = self.open_and_parse_file(fp, logger, file)
115116
# Yield records as dictionaries
116117
# DataFrame.to_dict() method returns datetime values in pandas.Timestamp values, which are not serializable by orjson
117118
# DataFrame.to_json() returns string with datetime values serialized to iso8601 with microseconds to align with pydantic behavior
@@ -184,27 +185,72 @@ def validate_format(excel_format: BaseModel, logger: logging.Logger) -> None:
184185
def open_and_parse_file(
185186
fp: Union[IOBase, str, Path],
186187
logger: Optional[logging.Logger] = None,
187-
file_uri: Optional[str] = None,
188+
file_info: Optional[Union[str, RemoteFile]] = None,
188189
) -> pd.DataFrame:
189190
"""
190191
Opens and parses the Excel file with Calamine-first and Openpyxl fallback.
191192
192193
Returns:
193194
pd.DataFrame: Parsed data from the Excel file.
194195
"""
196+
file_label = "file"
197+
file_url = None
198+
if isinstance(file_info, RemoteFile):
199+
file_label = file_info.file_uri_for_logging
200+
file_url = getattr(file_info, "url", None)
201+
elif isinstance(file_info, str):
202+
file_label = file_info
203+
calamine_exc: Optional[BaseException] = None
195204
try:
196-
return pd.ExcelFile(fp, engine="calamine").parse() # type: ignore [arg-type, call-overload, no-any-return]
197-
except Exception as calamine_exc:
205+
with pd.ExcelFile(fp, engine="calamine") as excel_file: # type: ignore [arg-type, call-overload]
206+
return excel_file.parse() # type: ignore [no-any-return]
207+
except BaseException as exc: # noqa: BLE001
208+
if isinstance(exc, (KeyboardInterrupt, SystemExit)):
209+
raise
210+
calamine_exc = exc
198211
if logger:
199212
logger.warning(
200-
"Calamine parsing failed for %s, falling back to openpyxl: %s",
201-
file_uri or "file",
202-
str(calamine_exc),
213+
ExcelParser._format_message_with_link(
214+
f"Calamine parsing failed for {file_label}, falling back to openpyxl: {exc}",
215+
file_url,
216+
)
203217
)
204218

205-
try:
206-
fp.seek(0) # type: ignore [union-attr]
207-
except (AttributeError, OSError):
208-
pass
219+
# Fallback to openpyxl
220+
try:
221+
fp.seek(0) # type: ignore [union-attr]
222+
except (AttributeError, OSError):
223+
pass
209224

210-
return pd.ExcelFile(fp, engine="openpyxl").parse() # type: ignore [arg-type, call-overload, no-any-return]
225+
try:
226+
with warnings.catch_warnings(record=True) as warning_records:
227+
warnings.simplefilter("always")
228+
with pd.ExcelFile(fp, engine="openpyxl") as excel_file: # type: ignore [arg-type, call-overload]
229+
df = excel_file.parse() # type: ignore [no-any-return]
230+
if logger:
231+
for warning in warning_records:
232+
logger.warning(
233+
ExcelParser._format_message_with_link(
234+
f"Openpyxl warning for {file_label}: {warning.message}",
235+
file_url,
236+
)
237+
)
238+
return df
239+
except BaseException as openpyxl_exc: # noqa: BLE001
240+
if isinstance(openpyxl_exc, (KeyboardInterrupt, SystemExit)):
241+
raise
242+
# If both engines fail, raise the original calamine exception
243+
if logger:
244+
logger.error(
245+
ExcelParser._format_message_with_link(
246+
f"Both Calamine and Openpyxl parsing failed for {file_label}. Calamine error: {calamine_exc}, Openpyxl error: {openpyxl_exc}",
247+
file_url,
248+
)
249+
)
250+
raise calamine_exc if calamine_exc else openpyxl_exc
251+
252+
@staticmethod
253+
def _format_message_with_link(message: str, file_url: Optional[str]) -> str:
254+
if file_url:
255+
return f"{message} (view: {file_url})"
256+
return message

airbyte_cdk/sources/file_based/remote_file.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@ class RemoteFile(BaseModel):
1717
last_modified: datetime
1818
mime_type: Optional[str] = None
1919

20+
@property
21+
def file_uri_for_logging(self) -> str:
22+
"""Returns a user-friendly identifier for logging."""
23+
return self.uri
24+
2025

2126
class UploadableRemoteFile(RemoteFile, ABC):
2227
"""

unit_tests/sources/file_based/file_types/test_excel_parser.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55

66
import datetime
7+
import warnings
78
from io import BytesIO
89
from unittest.mock import MagicMock, Mock, mock_open, patch
910

@@ -136,3 +137,39 @@ def test_file_read_error(mock_stream_reader, mock_logger, file_config, remote_fi
136137
list(
137138
parser.parse_records(file_config, remote_file, mock_stream_reader, mock_logger)
138139
)
140+
141+
142+
class FakePanic(BaseException):
143+
"""Simulates the PyO3 PanicException which does not inherit from Exception."""
144+
145+
146+
def test_open_and_parse_file_falls_back_to_openpyxl(mock_logger):
147+
parser = ExcelParser()
148+
fp = BytesIO(b"test")
149+
150+
fallback_df = pd.DataFrame({"a": [1]})
151+
152+
calamine_ctx = MagicMock()
153+
calamine_excel_file = MagicMock()
154+
calamine_ctx.__enter__.return_value = calamine_excel_file
155+
calamine_excel_file.parse.side_effect = FakePanic("calamine panic")
156+
157+
openpyxl_ctx = MagicMock()
158+
openpyxl_excel_file = MagicMock()
159+
openpyxl_ctx.__enter__.return_value = openpyxl_excel_file
160+
161+
def openpyxl_parse_side_effect():
162+
warnings.warn("Cell A146 has invalid date", UserWarning)
163+
return fallback_df
164+
165+
openpyxl_excel_file.parse.side_effect = openpyxl_parse_side_effect
166+
167+
with patch("airbyte_cdk.sources.file_based.file_types.excel_parser.pd.ExcelFile") as mock_excel:
168+
mock_excel.side_effect = [calamine_ctx, openpyxl_ctx]
169+
170+
result = parser.open_and_parse_file(fp, mock_logger, "file.xlsx")
171+
172+
pd.testing.assert_frame_equal(result, fallback_df)
173+
assert mock_logger.warning.call_count == 2
174+
assert "Openpyxl warning" in mock_logger.warning.call_args_list[1].args[0]
175+
mock_logger.error.assert_not_called()

0 commit comments

Comments
 (0)