Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
14 changes: 12 additions & 2 deletions airbyte-integrations/connectors/source-file/source_file/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -517,11 +517,21 @@ def openpyxl_chunk_reader(self, file, **kwargs):
skiprows = kwargs.get("skiprows", 0)
user_provided_column_names = kwargs.get("names")
chunk_size = 500

sheet_name = kwargs.get("sheet_name", None)
# Load workbook with data-only to avoid loading formulas
work_book = load_workbook(filename=file, data_only=True, read_only=True)

for sheetname in work_book.sheetnames:
# Iterate through sheets
# pandas' read_excel allows specifying sheet_name as str, int, list of str/int or None (all sheets)
for idx, sheetname in enumerate(work_book.sheetnames):
# Handle sheet_name filtering
if sheet_name and isinstance(sheet_name, str) and sheetname != sheet_name:
continue
elif sheet_name and isinstance(sheet_name, int) and idx != sheet_name:
continue
elif sheet_name and isinstance(sheet_name, list) and sheetname not in sheet_name and idx not in sheet_name:
continue

work_sheet = work_book[sheetname]
data = list(work_sheet.iter_rows(values_only=True))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,30 @@ def test_load_dataframes_xlsx(config, absolute_path, test_files, file_name, shou
expected = read_excel(f, engine="openpyxl")
assert read_file.equals(expected)

@pytest.mark.parametrize(
    "file_name, should_raise_error, sheet_name",
    [
        ("test-with-multiple-sheets.xlsx", False, "unit_tests"),  # single sheet by name
        ("test-with-multiple-sheets.xlsx", False, 0),  # single sheet by index
        ("test-with-multiple-sheets.xlsx", False, ["unit_tests"]),  # list of sheet names
        ("test-with-multiple-sheets.xlsx", False, [0]),  # list of sheet indices
    ],
)
def test_load_dataframes_xlsx_with_sheets(config, absolute_path, test_files, file_name, should_raise_error, sheet_name):
    """Check that the ``sheet_name`` reader option is honoured when loading an Excel file.

    The first dataframe yielded by ``Client.load_dataframes`` is compared with
    what ``read_excel`` returns for the same ``sheet_name`` selector.

    Args:
        config: Connector configuration fixture; mutated to use the excel format
            and to pass ``sheet_name`` through ``reader_options``.
        absolute_path: Fixture giving the base directory of the test suite.
        test_files: Fixture giving the sub-directory that holds the sample files.
        file_name: Name of the workbook to load.
        should_raise_error: When true, loading is expected to raise
            ``AirbyteTracedException`` instead of yielding a dataframe.
        sheet_name: Sheet selector under test (name, index, or a list of either).
    """
    config["format"] = "excel"
    config["reader_options"] = {"sheet_name": sheet_name}
    client = Client(**config)
    f = f"{absolute_path}/{test_files}/{file_name}"

    if should_raise_error:
        with pytest.raises(AirbyteTracedException):
            next(client.load_dataframes(fp=f))
        return

    read_file = next(client.load_dataframes(fp=f))
    expected = read_excel(f, engine="openpyxl", sheet_name=sheet_name)
    if isinstance(sheet_name, list):
        # With a list selector read_excel returns a mapping keyed by the requested
        # sheets, so pick the entry for the first requested sheet to compare against.
        expected = expected[sheet_name[0]]
    assert read_file.equals(expected)

@pytest.mark.parametrize("file_format, file_path", [("json", "formats/json/demo.json"), ("jsonl", "formats/jsonl/jsonl_nested.jsonl")])
def test_load_nested_json(client, config, absolute_path, test_files, file_format, file_path):
Expand Down
Loading