-
Notifications
You must be signed in to change notification settings - Fork 32
feat(file-based): Add Calamine-first with Openpyxl fallback for Excel parser #850
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 20 commits
5be5bee
65e8adc
a7664eb
34fe892
adfe576
88084ad
546bd46
67fa697
e431f9d
a82a2fa
6a38d55
fef9ac2
fd1939e
d2f691a
63d24a6
44f7df1
49f3e19
fffe027
9d6428c
0831b04
463be27
95fc5e3
38a1a1c
3277b70
7d73cc6
d2b0255
File filter
Filter by extension
Conversations
Jump to
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -3,9 +3,10 @@ | |||||||||||||
| # | ||||||||||||||
|
|
||||||||||||||
| import logging | ||||||||||||||
| import warnings | ||||||||||||||
| from io import IOBase | ||||||||||||||
| from pathlib import Path | ||||||||||||||
| from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union | ||||||||||||||
| from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Type, Union, cast | ||||||||||||||
|
|
||||||||||||||
| import orjson | ||||||||||||||
| import pandas as pd | ||||||||||||||
|
|
@@ -17,6 +18,7 @@ | |||||||||||||
| ) | ||||||||||||||
| from airbyte_cdk.sources.file_based.exceptions import ( | ||||||||||||||
| ConfigValidationError, | ||||||||||||||
| ExcelCalamineParsingError, | ||||||||||||||
| FileBasedSourceError, | ||||||||||||||
| RecordParseError, | ||||||||||||||
| ) | ||||||||||||||
|
|
@@ -64,7 +66,7 @@ async def infer_schema( | |||||||||||||
| fields: Dict[str, str] = {} | ||||||||||||||
|
|
||||||||||||||
| with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp: | ||||||||||||||
| df = self.open_and_parse_file(fp) | ||||||||||||||
| df = self.open_and_parse_file(fp, logger, file) | ||||||||||||||
| for column, df_type in df.dtypes.items(): | ||||||||||||||
| # Choose the broadest data type if the column's data type differs in dataframes | ||||||||||||||
| prev_frame_column_type = fields.get(column) # type: ignore [call-overload] | ||||||||||||||
|
|
@@ -92,7 +94,7 @@ def parse_records( | |||||||||||||
| discovered_schema: Optional[Mapping[str, SchemaType]] = None, | ||||||||||||||
| ) -> Iterable[Dict[str, Any]]: | ||||||||||||||
| """ | ||||||||||||||
| Parses records from an Excel file based on the provided configuration. | ||||||||||||||
| Parses records from an Excel file with fallback error handling. | ||||||||||||||
|
|
||||||||||||||
| Args: | ||||||||||||||
| config (FileBasedStreamConfig): Configuration for the file-based stream. | ||||||||||||||
|
|
@@ -111,7 +113,7 @@ def parse_records( | |||||||||||||
| try: | ||||||||||||||
| # Open and parse the file using the stream reader | ||||||||||||||
| with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp: | ||||||||||||||
| df = self.open_and_parse_file(fp) | ||||||||||||||
| df = self.open_and_parse_file(fp, logger, file) | ||||||||||||||
| # Yield records as dictionaries | ||||||||||||||
| # DataFrame.to_dict() method returns datetime values in pandas.Timestamp values, which are not serializable by orjson | ||||||||||||||
| # DataFrame.to_json() returns string with datetime values serialized to iso8601 with microseconds to align with pydantic behavior | ||||||||||||||
|
|
@@ -180,15 +182,93 @@ def validate_format(excel_format: BaseModel, logger: logging.Logger) -> None: | |||||||||||||
| logger.info(f"Expected ExcelFormat, got {excel_format}") | ||||||||||||||
| raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR) | ||||||||||||||
|
|
||||||||||||||
| @staticmethod | ||||||||||||||
| def open_and_parse_file(fp: Union[IOBase, str, Path]) -> pd.DataFrame: | ||||||||||||||
| def _open_and_parse_file_with_calamine( | ||||||||||||||
| self, | ||||||||||||||
| fp: Union[IOBase, str, Path], | ||||||||||||||
| logger: logging.Logger, | ||||||||||||||
| file: RemoteFile, | ||||||||||||||
| ) -> pd.DataFrame: | ||||||||||||||
| """Opens and parses Excel file using Calamine engine. | ||||||||||||||
|
|
||||||||||||||
| Args: | ||||||||||||||
| fp: File pointer to the Excel file. | ||||||||||||||
| logger: Logger for logging information and errors. | ||||||||||||||
| file: Remote file information for logging context. | ||||||||||||||
|
|
||||||||||||||
|
||||||||||||||
| Args: | |
| fp (Union[IOBase, str, Path]): The file pointer, file path, or file-like object to parse. | |
| logger (Optional[logging.Logger]): Optional logger for warning and error messages. | |
| file_info (Optional[Union[str, RemoteFile]]): Optional file information (RemoteFile or string) used for logging context. | |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,16 @@ class RemoteFile(BaseModel): | |
| last_modified: datetime | ||
| mime_type: Optional[str] = None | ||
|
|
||
| @property | ||
| def file_uri_for_logging(self) -> str: | ||
| """Returns a user-friendly identifier for logging.""" | ||
| return self.uri | ||
|
|
||
| @property | ||
| def source_uri(self) -> str: | ||
|
||
| """Returns the canonical source URI.""" | ||
| return self.uri | ||
|
|
||
|
|
||
| class UploadableRemoteFile(RemoteFile, ABC): | ||
| """ | ||
|
|
@@ -48,17 +58,3 @@ def source_file_relative_path(self) -> str: | |
| Returns the relative path of the source file. | ||
| """ | ||
| return self.uri | ||
|
|
||
| @property | ||
| def file_uri_for_logging(self) -> str: | ||
| """ | ||
| Returns the URI for the file being logged. | ||
| """ | ||
| return self.uri | ||
|
|
||
| @property | ||
| def source_uri(self) -> str: | ||
| """ | ||
| Returns the Source URI for the file being logged. | ||
| """ | ||
| return self.uri | ||
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Uh oh!
There was an error while loading. Please reload this page.