From 6392251c52f468d3e7f46891a85599000dd6b430 Mon Sep 17 00:00:00 2001 From: Alfredo Garcia Date: Thu, 20 Nov 2025 10:26:47 -0600 Subject: [PATCH 1/2] Feat(Source-File): Add Excel Reader Options For sheet_name / sheet_names --- .../connectors/source-file/metadata.yaml | 2 +- .../connectors/source-file/pyproject.toml | 2 +- .../source-file/source_file/client.py | 44 ++++++++++++- .../source-file/unit_tests/test_client.py | 64 +++++++++++++++++++ docs/integrations/sources/file.md | 5 ++ 5 files changed, 114 insertions(+), 3 deletions(-) diff --git a/airbyte-integrations/connectors/source-file/metadata.yaml b/airbyte-integrations/connectors/source-file/metadata.yaml index 6a97d7d1bd84..b29a9d57ba1f 100644 --- a/airbyte-integrations/connectors/source-file/metadata.yaml +++ b/airbyte-integrations/connectors/source-file/metadata.yaml @@ -10,7 +10,7 @@ data: connectorSubtype: file connectorType: source definitionId: 778daa7c-feaf-4db6-96f3-70fd645acc77 - dockerImageTag: 0.6.0 + dockerImageTag: 0.7.0 dockerRepository: airbyte/source-file documentationUrl: https://docs.airbyte.com/integrations/sources/file githubIssueLabel: source-file diff --git a/airbyte-integrations/connectors/source-file/pyproject.toml b/airbyte-integrations/connectors/source-file/pyproject.toml index 1731c7bdb45e..e12981b49949 100644 --- a/airbyte-integrations/connectors/source-file/pyproject.toml +++ b/airbyte-integrations/connectors/source-file/pyproject.toml @@ -3,7 +3,7 @@ requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" [tool.poetry] -version = "0.6.0" +version = "0.7.0" name = "source-file" description = "Source implementation for File" authors = ["Airbyte "] diff --git a/airbyte-integrations/connectors/source-file/source_file/client.py b/airbyte-integrations/connectors/source-file/source_file/client.py index d0b083798575..82976ebacd4a 100644 --- a/airbyte-integrations/connectors/source-file/source_file/client.py +++ 
b/airbyte-integrations/connectors/source-file/source_file/client.py @@ -521,7 +521,49 @@ def openpyxl_chunk_reader(self, file, **kwargs): # Load workbook with data-only to avoid loading formulas work_book = load_workbook(filename=file, data_only=True, read_only=True) - for sheetname in work_book.sheetnames: + sheet_name_option = kwargs.get("sheet_name") + if sheet_name_option is None: + sheet_name_option = kwargs.get("sheet_names") + + if sheet_name_option is None: + target_sheets = work_book.sheetnames + else: + if isinstance(sheet_name_option, (list, tuple, set)): + requested_sheets = list(sheet_name_option) + else: + requested_sheets = [sheet_name_option] + + normalized_sheets = [] + for requested_sheet in requested_sheets: + if isinstance(requested_sheet, int): + try: + normalized_sheets.append(work_book.sheetnames[requested_sheet]) + except IndexError as err: + raise AirbyteTracedException( + message="Sheet index is out of range for the provided Excel file.", + internal_message=f"Sheet index {requested_sheet} does not exist.", + failure_type=FailureType.config_error, + ) from err + elif isinstance(requested_sheet, str): + normalized_sheets.append(requested_sheet) + else: + raise AirbyteTracedException( + message="Invalid sheet_name reader option provided.", + internal_message="sheet_name must be a string, integer index, or a list of those values.", + failure_type=FailureType.config_error, + ) + + missing_sheets = [sheet for sheet in normalized_sheets if sheet not in work_book.sheetnames] + if missing_sheets: + raise AirbyteTracedException( + message="One or more sheet names were not found in the Excel file.", + internal_message=f"Missing sheets: {missing_sheets}", + failure_type=FailureType.config_error, + ) + + target_sheets = normalized_sheets + + for sheetname in target_sheets: work_sheet = work_book[sheetname] data = list(work_sheet.iter_rows(values_only=True)) diff --git a/airbyte-integrations/connectors/source-file/unit_tests/test_client.py 
b/airbyte-integrations/connectors/source-file/unit_tests/test_client.py index f56ffd51beba..2ebd7985531c 100644 --- a/airbyte-integrations/connectors/source-file/unit_tests/test_client.py +++ b/airbyte-integrations/connectors/source-file/unit_tests/test_client.py @@ -288,6 +288,16 @@ def generate_excel_file(data): return tmp_file +def generate_multi_sheet_excel_file(sheet_data): + """Helper to generate an Excel file with multiple sheets.""" + tmp_file = NamedTemporaryFile(suffix=".xlsx", delete=False) + with pd.ExcelWriter(tmp_file.name, engine="openpyxl") as writer: + for sheet_name, data in sheet_data.items(): + pd.DataFrame(data).to_excel(writer, index=False, header=False, sheet_name=sheet_name) + tmp_file.seek(0) + return tmp_file + + def test_excel_reader_option_names(config): """ Test the 'names' option for the Excel reader. @@ -344,3 +354,57 @@ def test_excel_reader_option_header(config): read_file = next(client.load_dataframes(fp=tmp.name)) assert isinstance(read_file, pd.DataFrame) assert read_file.to_dict(orient="records") == expected_data + + +def test_excel_reader_option_sheet_name(config): + config["format"] = "excel" + config["reader_options"] = {"sheet_name": "SheetB"} + client = Client(**config) + + sheet_data = { + "SheetA": [["A1", "A2"], ["Value1", "Value2"]], + "SheetB": [["B1", "B2"], ["Keep1", "Keep2"]], + } + expected_data = [{"B1": "Keep1", "B2": "Keep2"}] + + with generate_multi_sheet_excel_file(sheet_data) as tmp: + records = [] + for df_chunk in client.load_dataframes(fp=tmp.name): + records.extend(df_chunk.to_dict(orient="records")) + assert records == expected_data + + +def test_excel_reader_option_sheet_name_list(config): + config["format"] = "excel" + config["reader_options"] = {"sheet_name": ["SheetB", "SheetC"]} + client = Client(**config) + + sheet_data = { + "SheetA": [["A1", "A2"], ["Value1", "Value2"]], + "SheetB": [["B1", "B2"], ["Keep1", "Keep2"]], + "SheetC": [["C1", "C2"], ["Keep3", "Keep4"]], + } + expected_data = [ + 
{"B1": "Keep1", "B2": "Keep2"}, + {"C1": "Keep3", "C2": "Keep4"}, + ] + + with generate_multi_sheet_excel_file(sheet_data) as tmp: + records = [] + for df_chunk in client.load_dataframes(fp=tmp.name): + records.extend(df_chunk.to_dict(orient="records")) + assert records == expected_data + + +def test_excel_reader_option_sheet_name_missing(config): + config["format"] = "excel" + config["reader_options"] = {"sheet_name": "Missing"} + client = Client(**config) + + sheet_data = { + "SheetA": [["A1", "A2"], ["Value1", "Value2"]], + } + + with generate_multi_sheet_excel_file(sheet_data) as tmp: + with pytest.raises(AirbyteTracedException): + next(client.load_dataframes(fp=tmp.name)) diff --git a/docs/integrations/sources/file.md b/docs/integrations/sources/file.md index 94c3a1c46c6d..9cf99e3e5d86 100644 --- a/docs/integrations/sources/file.md +++ b/docs/integrations/sources/file.md @@ -186,6 +186,10 @@ For example, you can use the `{"orient" : "records"}` to change how orientation If you need to read Excel Binary Workbook, please specify `excel_binary` format in `File Format` select. +#### Excel-specific reader options + +- `sheet_name`: Limit the sync to one or more worksheets inside the workbook. Accepts a single sheet name (string), a zero-based sheet index (integer), or an array that mixes names and indexes (for example, `{"sheet_name": ["Finance", 2]}`). The alias `sheet_names` is used when `sheet_name` is not set. When omitted, every sheet in the file is read sequentially and appended into the same destination table. If any requested sheet name or index does not exist in the workbook, the connector fails with a configuration error. + :::caution This connector does not support syncing unstructured data files such as raw text, audio, or videos. 
::: @@ -298,6 +302,7 @@ In order to read large files from a remote location, this connector uses the [sm | Version | Date | Pull Request | Subject | |:-----------|:-----------|:---------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 0.7.0 | 2025-11-20 | [XXXXX](https://github.com/airbytehq/airbyte/pull/XXXXX) | Add `sheet_name` or `sheet_names` handling for Excel `reader_options` | | 0.6.0 | 2025-11-03 | [69148](https://github.com/airbytehq/airbyte/pull/69148) | Promoting release candidate 0.6.0-rc.1 to a main version. | | 0.6.0-rc.1 | 2025-10-22 | [68588](https://github.com/airbytehq/airbyte/pull/68588) | Update to airbyte-cdk ^v7 | | 0.5.46 | 2025-10-21 | [68484](https://github.com/airbytehq/airbyte/pull/68484) | Update dependencies | From 803491e1ac7365d2ecb712d32bcbd6e3b4925d34 Mon Sep 17 00:00:00 2001 From: Alfredo Garcia Date: Thu, 20 Nov 2025 10:28:22 -0600 Subject: [PATCH 2/2] Update Doc --- docs/integrations/sources/file.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/integrations/sources/file.md b/docs/integrations/sources/file.md index 9cf99e3e5d86..ebed1cc6403d 100644 --- a/docs/integrations/sources/file.md +++ b/docs/integrations/sources/file.md @@ -302,7 +302,7 @@ In order to read large files from a remote location, this connector uses the [sm | Version | Date | Pull Request | Subject | |:-----------|:-----------|:---------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| 0.7.0 | 2025-11-20 | [XXXXX](https://github.com/airbytehq/airbyte/pull/XXXXX) | Add `sheet_name` or `sheet_names` handling for Excel `reader_options` | +| 0.7.0 | 2025-11-20 | 
[69774](https://github.com/airbytehq/airbyte/pull/69774) | Add `sheet_name` or `sheet_names` handling for Excel `reader_options` | | 0.6.0 | 2025-11-03 | [69148](https://github.com/airbytehq/airbyte/pull/69148) | Promoting release candidate 0.6.0-rc.1 to a main version. | | 0.6.0-rc.1 | 2025-10-22 | [68588](https://github.com/airbytehq/airbyte/pull/68588) | Update to airbyte-cdk ^v7 | | 0.5.46 | 2025-10-21 | [68484](https://github.com/airbytehq/airbyte/pull/68484) | Update dependencies |