Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion airbyte-integrations/connectors/source-file/metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ data:
connectorSubtype: file
connectorType: source
definitionId: 778daa7c-feaf-4db6-96f3-70fd645acc77
dockerImageTag: 0.6.0
dockerImageTag: 0.7.0
dockerRepository: airbyte/source-file
documentationUrl: https://docs.airbyte.com/integrations/sources/file
githubIssueLabel: source-file
Expand Down
2 changes: 1 addition & 1 deletion airbyte-integrations/connectors/source-file/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.poetry]
version = "0.6.0"
version = "0.7.0"
name = "source-file"
description = "Source implementation for File"
authors = ["Airbyte <[email protected]>"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,49 @@ def openpyxl_chunk_reader(self, file, **kwargs):
# Load workbook with data-only to avoid loading formulas
work_book = load_workbook(filename=file, data_only=True, read_only=True)

for sheetname in work_book.sheetnames:
sheet_name_option = kwargs.get("sheet_name")
if sheet_name_option is None:
sheet_name_option = kwargs.get("sheet_names")

if sheet_name_option is None:
target_sheets = work_book.sheetnames
else:
if isinstance(sheet_name_option, (list, tuple, set)):
requested_sheets = list(sheet_name_option)
else:
requested_sheets = [sheet_name_option]

normalized_sheets = []
for requested_sheet in requested_sheets:
if isinstance(requested_sheet, int):
try:
normalized_sheets.append(work_book.sheetnames[requested_sheet])
except IndexError as err:
raise AirbyteTracedException(
message="Sheet index is out of range for the provided Excel file.",
internal_message=f"Sheet index {requested_sheet} does not exist.",
failure_type=FailureType.config_error,
) from err
elif isinstance(requested_sheet, str):
normalized_sheets.append(requested_sheet)
else:
raise AirbyteTracedException(
message="Invalid sheet_name reader option provided.",
internal_message="sheet_name must be a string, integer index, or a list of those values.",
failure_type=FailureType.config_error,
)

missing_sheets = [sheet for sheet in normalized_sheets if sheet not in work_book.sheetnames]
if missing_sheets:
raise AirbyteTracedException(
message="One or more sheet names were not found in the Excel file.",
internal_message=f"Missing sheets: {missing_sheets}",
failure_type=FailureType.config_error,
)

target_sheets = normalized_sheets

for sheetname in target_sheets:
work_sheet = work_book[sheetname]
data = list(work_sheet.iter_rows(values_only=True))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,16 @@ def generate_excel_file(data):
return tmp_file


def generate_multi_sheet_excel_file(sheet_data):
    """
    Build a temporary ``.xlsx`` workbook holding one worksheet per entry of ``sheet_data``.

    :param sheet_data: mapping of worksheet name -> data accepted by ``pd.DataFrame``;
        each entry is written without an index column and without a header row.
    :return: the still-open ``NamedTemporaryFile`` handle, rewound to offset 0.
        It is created with ``delete=False``, so closing it does not remove the file.
    """
    workbook_file = NamedTemporaryFile(suffix=".xlsx", delete=False)
    # The writer targets the temp file by path; the handle is only rewound afterwards.
    excel_writer = pd.ExcelWriter(workbook_file.name, engine="openpyxl")
    with excel_writer as writer:
        for title, rows in sheet_data.items():
            frame = pd.DataFrame(rows)
            frame.to_excel(writer, sheet_name=title, header=False, index=False)
    workbook_file.seek(0)
    return workbook_file


def test_excel_reader_option_names(config):
"""
Test the 'names' option for the Excel reader.
Expand Down Expand Up @@ -344,3 +354,57 @@ def test_excel_reader_option_header(config):
read_file = next(client.load_dataframes(fp=tmp.name))
assert isinstance(read_file, pd.DataFrame)
assert read_file.to_dict(orient="records") == expected_data


def test_excel_reader_option_sheet_name(config):
    """
    Test the 'sheet_name' option for the Excel reader with a single sheet name.
    Only records from the requested sheet are expected in the output.
    """
    workbook = {
        "SheetA": [["A1", "A2"], ["Value1", "Value2"]],
        "SheetB": [["B1", "B2"], ["Keep1", "Keep2"]],
    }

    config["format"] = "excel"
    config["reader_options"] = {"sheet_name": "SheetB"}
    client = Client(**config)

    with generate_multi_sheet_excel_file(workbook) as tmp:
        # Flatten every yielded dataframe chunk into one list of row dicts.
        records = [
            row
            for chunk in client.load_dataframes(fp=tmp.name)
            for row in chunk.to_dict(orient="records")
        ]
        assert records == [{"B1": "Keep1", "B2": "Keep2"}]


def test_excel_reader_option_sheet_name_list(config):
    """
    Test the 'sheet_name' option for the Excel reader with a list of sheet names.
    Records from the requested sheets are expected in the requested order.
    """
    workbook = {
        "SheetA": [["A1", "A2"], ["Value1", "Value2"]],
        "SheetB": [["B1", "B2"], ["Keep1", "Keep2"]],
        "SheetC": [["C1", "C2"], ["Keep3", "Keep4"]],
    }

    config["format"] = "excel"
    config["reader_options"] = {"sheet_name": ["SheetB", "SheetC"]}
    client = Client(**config)

    with generate_multi_sheet_excel_file(workbook) as tmp:
        # Flatten every yielded dataframe chunk into one list of row dicts.
        records = [
            row
            for chunk in client.load_dataframes(fp=tmp.name)
            for row in chunk.to_dict(orient="records")
        ]
        assert records == [
            {"B1": "Keep1", "B2": "Keep2"},
            {"C1": "Keep3", "C2": "Keep4"},
        ]


def test_excel_reader_option_sheet_name_missing(config):
    """
    Test the 'sheet_name' option for the Excel reader with a sheet that is not in the workbook.
    Reading the first dataframe is expected to raise AirbyteTracedException.
    """
    workbook = {"SheetA": [["A1", "A2"], ["Value1", "Value2"]]}

    config["format"] = "excel"
    config["reader_options"] = {"sheet_name": "Missing"}
    client = Client(**config)

    with generate_multi_sheet_excel_file(workbook) as tmp, pytest.raises(AirbyteTracedException):
        next(client.load_dataframes(fp=tmp.name))
5 changes: 5 additions & 0 deletions docs/integrations/sources/file.md
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,10 @@ For example, you can use the `{"orient" : "records"}` to change how orientation

If you need to read Excel Binary Workbook, please specify `excel_binary` format in `File Format` select.

#### Excel-specific reader options

- `sheet_name`: Limit the sync to one or more worksheets inside the workbook. Accepts either a single sheet name (string), a zero-based sheet index (integer), or an array mixing names and indexes (for example, `{"sheet_name": ["Finance", 2]}`). When omitted, every sheet in the file is read sequentially and appended into the same destination table. If any requested sheet does not exist, setup fails with a configuration error.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ [vale] reported by reviewdog 🐶
[Google.Will] Avoid using 'will'.


:::caution
This connector does not support syncing unstructured data files such as raw text, audio, or videos.
:::
Expand Down Expand Up @@ -298,6 +302,7 @@ In order to read large files from a remote location, this connector uses the [sm

| Version | Date | Pull Request | Subject |
|:-----------|:-----------|:---------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| 0.7.0 | 2025-11-20 | [69774](https://github.com/airbytehq/airbyte/pull/69774) | Add `sheet_name` or `sheet_names` handling for Excel `reader_options` |
| 0.6.0 | 2025-11-03 | [69148](https://github.com/airbytehq/airbyte/pull/69148) | Promoting release candidate 0.6.0-rc.1 to a main version. |
| 0.6.0-rc.1 | 2025-10-22 | [68588](https://github.com/airbytehq/airbyte/pull/68588) | Update to airbyte-cdk ^v7 |
| 0.5.46 | 2025-10-21 | [68484](https://github.com/airbytehq/airbyte/pull/68484) | Update dependencies |
Expand Down
Loading