diff --git a/airbyte-integrations/connectors/source-file/integration_tests/sample_files/test-with-multiple-sheets.xlsx b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/test-with-multiple-sheets.xlsx new file mode 100644 index 000000000000..d7b279b1b31d Binary files /dev/null and b/airbyte-integrations/connectors/source-file/integration_tests/sample_files/test-with-multiple-sheets.xlsx differ diff --git a/airbyte-integrations/connectors/source-file/source_file/client.py b/airbyte-integrations/connectors/source-file/source_file/client.py index d0b083798575..060dca74fe9f 100644 --- a/airbyte-integrations/connectors/source-file/source_file/client.py +++ b/airbyte-integrations/connectors/source-file/source_file/client.py @@ -517,11 +517,21 @@ def openpyxl_chunk_reader(self, file, **kwargs): skiprows = kwargs.get("skiprows", 0) user_provided_column_names = kwargs.get("names") chunk_size = 500 - + sheet_name = kwargs.get("sheet_name", None) # Load workbook with data-only to avoid loading formulas work_book = load_workbook(filename=file, data_only=True, read_only=True) - for sheetname in work_book.sheetnames: + # Iterate through sheets + # Panda's read_excel allows specifying sheet_name as str, int, list of str/int or None (all sheets) + for idx, sheetname in enumerate(work_book.sheetnames): + # Handle sheet_name filtering + if sheet_name and isinstance(sheet_name, str) and sheetname != sheet_name: + continue + elif sheet_name and isinstance(sheet_name, int) and idx != sheet_name: + continue + elif sheet_name and isinstance(sheet_name, list) and sheetname not in sheet_name and idx not in sheet_name: + continue + work_sheet = work_book[sheetname] data = list(work_sheet.iter_rows(values_only=True)) diff --git a/airbyte-integrations/connectors/source-file/unit_tests/test_client.py b/airbyte-integrations/connectors/source-file/unit_tests/test_client.py index f56ffd51beba..2e1555b28209 100644 --- 
@pytest.mark.parametrize(
    "file_name, should_raise_error, sheet_name",
    [
        ("test-with-multiple-sheets.xlsx", False, "unit_tests"),  # single sheet by name
        ("test-with-multiple-sheets.xlsx", False, 0),  # single sheet by index
        ("test-with-multiple-sheets.xlsx", False, ["unit_tests"]),  # list of sheet names
        ("test-with-multiple-sheets.xlsx", False, [0]),  # list of sheet indices
    ],
)
def test_load_dataframes_xlsx_with_sheets(config, absolute_path, test_files, file_name, should_raise_error, sheet_name):
    """Check that the ``sheet_name`` reader option selects the expected Excel sheet.

    The first dataframe yielded by ``Client.load_dataframes`` is compared with
    what ``read_excel`` returns for the same ``sheet_name`` value. When
    ``should_raise_error`` is true the load is instead expected to raise
    ``AirbyteTracedException``.

    NOTE(review): only the first yielded frame is compared, so this test does
    not prove that non-selected sheets are excluded. In particular the
    ``sheet_name=0`` case would presumably still pass if the reader applied no
    filtering at all, since the first sheet would be yielded first anyway.
    """
    config["format"] = "excel"
    config["reader_options"] = {"sheet_name": sheet_name}
    client = Client(**config)
    f = f"{absolute_path}/{test_files}/{file_name}"
    if should_raise_error:
        with pytest.raises(AirbyteTracedException):
            next(client.load_dataframes(fp=f))
    else:
        read_file = next(client.load_dataframes(fp=f))
        expected = read_excel(f, engine="openpyxl", sheet_name=sheet_name)
        if isinstance(sheet_name, list):
            # For a list, read_excel returns a mapping keyed by the requested
            # sheet; pick the entry for the single sheet that was requested.
            expected = expected[sheet_name[0]]
        assert read_file.equals(expected)