Skip to content

Commit 035ae24

Browse files
authored
[GuideLLM Refactor] Propagate valid failures from HuggingFace datasets loading (#413)
## Summary Propagate errors other than dataset-not-found errors when loading HuggingFace datasets, so that the user is told the dataset configuration is incorrect and how to address/fix it ## Details <!-- Provide a detailed list of all changes introduced in this pull request. --> - [ ] ## Test Plan <!-- List the steps needed to test this PR. --> - ## Related Issues <!-- Link any relevant issues that this PR addresses. --> - Resolves # --- - [ ] "I certify that all code in this PR is my own, except as noted below." ## Use of AI - [ ] Includes AI-assisted code completion - [ ] Includes code generated by an AI application - [ ] Includes AI-generated tests (NOTE: AI-written tests should have a docstring that includes `## WRITTEN BY AI ##`)
2 parents 5f4a731 + eb84935 commit 035ae24

File tree

2 files changed

+28
-16
lines changed

2 files changed

+28
-16
lines changed

src/guidellm/data/deserializers/deserializer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def deserialize(
7777
if dataset is None:
7878
raise DataNotSupportedError(
7979
f"No suitable deserializer found for data {data} "
80-
f"with kwargs {data_kwargs} and type_ {type_}."
80+
f"with kwargs {data_kwargs} and deserializer type {type_}."
8181
)
8282

8383
if resolve_split:

src/guidellm/data/deserializers/huggingface.py

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@
1212
load_dataset,
1313
load_from_disk,
1414
)
15+
from datasets.exceptions import (
16+
DataFilesNotFoundError,
17+
DatasetNotFoundError,
18+
FileNotFoundDatasetsError,
19+
)
1520
from transformers import PreTrainedTokenizerBase
1621

1722
from guidellm.data.deserializers.deserializer import (
@@ -35,38 +40,45 @@ def __call__(
3540
_ = (processor_factory, random_seed)
3641

3742
if isinstance(
38-
data, (Dataset, IterableDataset, DatasetDict, IterableDatasetDict)
43+
data, Dataset | IterableDataset | DatasetDict | IterableDatasetDict
3944
):
4045
return data
4146

4247
load_error = None
4348

4449
if (
45-
isinstance(data, (str, Path))
50+
isinstance(data, str | Path)
4651
and (path := Path(data)).exists()
4752
and ((path.is_file() and path.suffix == ".py") or path.is_dir())
4853
):
4954
# Handle python script or nested python script in a directory
5055
try:
5156
return load_dataset(str(data), **data_kwargs)
52-
except Exception as err: # noqa: BLE001
53-
load_error = err
54-
55-
if (
56-
isinstance(data, (str, Path))
57-
and (path := Path(data)).exists()
58-
and path.is_dir()
59-
):
60-
# Handle local dataset directory
61-
try:
62-
return load_from_disk(str(data), **data_kwargs)
63-
except Exception as err: # noqa: BLE001
57+
except (
58+
FileNotFoundDatasetsError,
59+
DatasetNotFoundError,
60+
DataFilesNotFoundError,
61+
) as err:
6462
load_error = err
63+
except Exception: # noqa: BLE001
64+
# Try loading as a local dataset directory next
65+
try:
66+
return load_from_disk(str(data), **data_kwargs)
67+
except (
68+
FileNotFoundDatasetsError,
69+
DatasetNotFoundError,
70+
DataFilesNotFoundError,
71+
) as err2:
72+
load_error = err2
6573

6674
try:
6775
# Handle dataset identifier from the Hugging Face Hub
6876
return load_dataset(str(data), **data_kwargs)
69-
except Exception as err: # noqa: BLE001
77+
except (
78+
FileNotFoundDatasetsError,
79+
DatasetNotFoundError,
80+
DataFilesNotFoundError,
81+
) as err:
7082
load_error = err
7183

7284
not_supported = DataNotSupportedError(

0 commit comments

Comments
 (0)