|
3 | 3 | # |
4 | 4 |
|
5 | 5 | import logging |
| 6 | +import warnings |
6 | 7 | from io import IOBase |
7 | 8 | from pathlib import Path |
8 | 9 | from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union |
@@ -64,7 +65,7 @@ async def infer_schema( |
64 | 65 | fields: Dict[str, str] = {} |
65 | 66 |
|
66 | 67 | with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp: |
67 | | - df = self.open_and_parse_file(fp, logger, file.uri) |
| 68 | + df = self.open_and_parse_file(fp, logger, file) |
68 | 69 | for column, df_type in df.dtypes.items(): |
69 | 70 | # Choose the broadest data type if the column's data type differs in dataframes |
70 | 71 | prev_frame_column_type = fields.get(column) # type: ignore [call-overload] |
@@ -111,7 +112,7 @@ def parse_records( |
111 | 112 | try: |
112 | 113 | # Open and parse the file using the stream reader |
113 | 114 | with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp: |
114 | | - df = self.open_and_parse_file(fp, logger, file.uri) |
| 115 | + df = self.open_and_parse_file(fp, logger, file) |
115 | 116 | # Yield records as dictionaries |
116 | 117 | # DataFrame.to_dict() method returns datetime values in pandas.Timestamp values, which are not serializable by orjson |
117 | 118 | # DataFrame.to_json() returns string with datetime values serialized to iso8601 with microseconds to align with pydantic behavior |
@@ -184,27 +185,72 @@ def validate_format(excel_format: BaseModel, logger: logging.Logger) -> None: |
def open_and_parse_file(
    fp: Union[IOBase, str, Path],
    logger: Optional[logging.Logger] = None,
    file_info: Optional[Union[str, RemoteFile]] = None,
) -> pd.DataFrame:
    """
    Opens and parses the Excel file with Calamine-first and Openpyxl fallback.

    Args:
        fp: File object or path of the Excel file to parse.
        logger: Optional logger that receives the fallback notice, Openpyxl
            warnings and the final error message.
        file_info: Optional description of the file used in messages. A plain
            string is used as the label as-is; for a RemoteFile its
            ``file_uri_for_logging`` is the label and its ``url`` attribute
            (when present) is appended to every message as a link.

    Returns:
        pd.DataFrame: Parsed data from the Excel file.

    Raises:
        BaseException: The original Calamine error when both engines fail.
            KeyboardInterrupt, SystemExit and GeneratorExit are never
            intercepted and always propagate unchanged.
    """
    file_label = "file"
    file_url = None
    if isinstance(file_info, RemoteFile):
        file_label = file_info.file_uri_for_logging
        file_url = getattr(file_info, "url", None)
    elif isinstance(file_info, str):
        file_label = file_info

    def _openpyxl_warning_message(record: warnings.WarningMessage) -> str:
        # Single place that builds the per-warning log line.
        return ExcelParser._format_message_with_link(
            f"Openpyxl warning for {file_label}: {record.message}",
            file_url,
        )

    try:
        with pd.ExcelFile(fp, engine="calamine") as excel_file:  # type: ignore [arg-type, call-overload]
            return excel_file.parse()  # type: ignore [no-any-return]
    except (KeyboardInterrupt, SystemExit, GeneratorExit):
        # Interpreter/generator control flow must never trigger the fallback.
        raise
    except BaseException as exc:  # noqa: BLE001
        # BaseException is caught on purpose; presumably the Rust-backed
        # calamine engine can surface panics that do not derive from
        # Exception - TODO confirm against the engine's documentation.
        calamine_exc: BaseException = exc
        if logger:
            logger.warning(
                ExcelParser._format_message_with_link(
                    f"Calamine parsing failed for {file_label}, falling back to openpyxl: {exc}",
                    file_url,
                )
            )

    # Fallback to openpyxl. Rewind first because the failed attempt may have
    # consumed part of the stream; plain paths have no seek() and are skipped.
    try:
        fp.seek(0)  # type: ignore [union-attr]
    except (AttributeError, OSError):
        pass

    try:
        # Record warnings instead of letting them print, so they can be
        # reported together with the file label (and link) below.
        with warnings.catch_warnings(record=True) as warning_records:
            warnings.simplefilter("always")
            with pd.ExcelFile(fp, engine="openpyxl") as excel_file:  # type: ignore [arg-type, call-overload]
                df = excel_file.parse()
    except (KeyboardInterrupt, SystemExit, GeneratorExit):
        raise
    except BaseException as openpyxl_exc:  # noqa: BLE001
        if logger:
            # Warnings recorded before the failure are often the best hint
            # about what is wrong with the file, so report them first.
            for record in warning_records:
                logger.warning(_openpyxl_warning_message(record))
            logger.error(
                ExcelParser._format_message_with_link(
                    f"Both Calamine and Openpyxl parsing failed for {file_label}. Calamine error: {calamine_exc}, Openpyxl error: {openpyxl_exc}",
                    file_url,
                )
            )
        # Both engines failed: surface the original calamine exception. This
        # point is only reachable after calamine failed, so it is always set.
        raise calamine_exc

    for record in warning_records:
        if logger:
            logger.warning(_openpyxl_warning_message(record))
        else:
            # Without a logger, hand the warning back to the regular warnings
            # machinery (outside the recording context) instead of dropping it.
            warnings.warn_explicit(
                record.message,
                record.category,
                record.filename,
                record.lineno,
            )
    return df  # type: ignore [no-any-return]
| 251 | + |
| 252 | + @staticmethod |
def _format_message_with_link(message: str, file_url: Optional[str]) -> str:
    """
    Append a "view" link to a log message when a URL is available.

    Args:
        message: The log message text.
        file_url: URL of the file, or None / empty when there is none.

    Returns:
        ``"<message> (view: <file_url>)"`` when ``file_url`` is truthy,
        otherwise ``message`` unchanged.
    """
    return f"{message} (view: {file_url})" if file_url else message
0 commit comments