
Commit 08d6ed5

[FEATURE] from_hub raise on existing dataset name (#5358)
# Description

This PR changes the behaviour of the `from_disk` method when a dataset of the same name already exists. Currently, a new dataset is created with the name plus a UUID. This change will:

- check if the dataset name exists, and if so
- warn that the dataset exists and that the `name` parameter can be used to create a new one
- try to push records to the existing dataset, wrapping the call in a try/except to add more context

Closes #5346

**Type of change**

- Improvement (change adding some improvement to an existing functionality)

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 5c12e32 commit 08d6ed5
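In user-facing terms, a repeated import no longer silently creates a renamed copy. A minimal sketch of the new behaviour (the server URL, API key, and export path below are placeholders, not values from this PR):

```python
import argilla as rg

# Placeholder connection details for a local Argilla deployment.
client = rg.Argilla(api_url="http://localhost:6900", api_key="argilla.apikey")

# First import: the dataset does not exist yet, so it is created from the
# settings and records written by `to_disk`.
dataset = rg.Dataset.from_disk("exported_dataset/", client=client)

# Second import of the same files: instead of creating `<name>_<uuid>`, this
# now warns, reuses the existing dataset, and tries to push the records to it.
same_dataset = rg.Dataset.from_disk("exported_dataset/", client=client)

# To force a separate dataset, pass an explicit unique name.
copy = rg.Dataset.from_disk("exported_dataset/", client=client, name="my-dataset-copy")
```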

File tree

2 files changed: +43 −21 lines

- argilla/src/argilla/datasets/_export/_disk.py
- argilla/tests/integration/test_export_dataset.py


argilla/src/argilla/datasets/_export/_disk.py

Lines changed: 29 additions & 18 deletions
```diff
@@ -19,8 +19,8 @@
 from abc import ABC
 from pathlib import Path
 from typing import TYPE_CHECKING, Optional, Tuple, Type, Union
-from uuid import uuid4
 
+from argilla._exceptions import RecordsIngestionError, ArgillaError
 from argilla._models import DatasetModel
 from argilla.client import Argilla
 from argilla.settings import Settings
@@ -90,28 +90,39 @@ def from_disk(
 
         # Get the relevant workspace_id of the incoming dataset
         if isinstance(workspace, str):
-            workspace_id = client.workspaces(workspace).id
-        elif isinstance(workspace, Workspace):
-            workspace_id = workspace.id
+            workspace = client.workspaces(workspace)
+            if not workspace:
+                raise ArgillaError(f"Workspace {workspace} not found on the server.")
         else:
             warnings.warn("Workspace not provided. Using default workspace.")
-            workspace_id = client.workspaces.default.id
-        dataset_model.workspace_id = workspace_id
+            workspace = client.workspaces.default
+        dataset_model.workspace_id = workspace.id
 
-        # Get a relevant and unique name for the incoming dataset.
-        if name:
-            logging.warning(f"Changing dataset name from {dataset_model.name} to {name}")
+        if name and (name != dataset_model.name):
+            logging.info(f"Changing dataset name from {dataset_model.name} to {name}")
             dataset_model.name = name
-        elif client.api.datasets.name_exists(name=dataset_model.name, workspace_id=workspace_id):
-            logging.warning(f"Loaded dataset name {dataset_model.name} already exists. Changing to unique UUID.")
-            dataset_model.name = f"{dataset_model.name}_{uuid4()}"
-
-        # Create the dataset and load the settings and records
-        dataset = cls.from_model(model=dataset_model, client=client)
-        dataset.settings = Settings.from_json(path=settings_path)
-        dataset.create()
+
+        if client.api.datasets.name_exists(name=dataset_model.name, workspace_id=workspace.id):
+            warnings.warn(
+                f"Loaded dataset name {dataset_model.name} already exists in the workspace {workspace.name} so using it. To create a new dataset, provide a unique name to the `name` parameter."
+            )
+            dataset_model = client.api.datasets.get_by_name_and_workspace_id(
+                name=dataset_model.name, workspace_id=workspace.id
+            )
+            dataset = cls.from_model(model=dataset_model, client=client)
+        else:
+            # Create a new dataset and load the settings and records
+            dataset = cls.from_model(model=dataset_model, client=client)
+            dataset.settings = Settings.from_json(path=settings_path)
+            dataset.create()
+
         if os.path.exists(records_path) and with_records:
-            dataset.records.from_json(path=records_path)
+            try:
+                dataset.records.from_json(path=records_path)
+            except RecordsIngestionError as e:
+                raise RecordsIngestionError(
+                    message="Error importing dataset records from disk. Records and datasets settings are not compatible."
+                ) from e
         return dataset
 
 ############################
```
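Because pushing records into an existing dataset can fail when the stored records do not match that dataset's settings, the error is re-raised with added context. A caller can catch it explicitly; a sketch, assuming the same placeholder client and export path as above:

```python
import argilla as rg
from argilla._exceptions import RecordsIngestionError

client = rg.Argilla(api_url="http://localhost:6900", api_key="argilla.apikey")

try:
    dataset = rg.Dataset.from_disk("exported_dataset/", client=client, with_records=True)
except RecordsIngestionError:
    # Raised when the records on disk are not compatible with the settings
    # of the existing dataset that the import was routed to.
    print("Records are not compatible with the existing dataset's settings.")
```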

argilla/tests/integration/test_export_dataset.py

Lines changed: 14 additions & 3 deletions
```diff
@@ -22,6 +22,7 @@
 
 import argilla as rg
 import pytest
+from argilla._exceptions import ConflictError
 from huggingface_hub.utils._errors import BadRequestError, FileMetadataError, HfHubHTTPError
 
 _RETRIES = 5
@@ -125,7 +126,9 @@ def test_import_dataset_from_disk(
 
     with TemporaryDirectory() as temp_dir:
         output_dir = dataset.to_disk(path=temp_dir, with_records=with_records_export)
-        new_dataset = rg.Dataset.from_disk(output_dir, client=client, with_records=with_records_import)
+        new_dataset = rg.Dataset.from_disk(
+            output_dir, client=client, with_records=with_records_import, name=f"test_{uuid.uuid4()}"
+        )
 
     if with_records_export and with_records_import:
         for i, record in enumerate(new_dataset.records(with_suggestions=True)):
@@ -175,11 +178,19 @@ def test_import_dataset_from_hub(
             match="Trying to load a dataset `with_records=True` but dataset does not contain any records.",
         ):
             new_dataset = rg.Dataset.from_hub(
-                repo_id=repo_id, client=client, with_records=with_records_import, token=token
+                repo_id=repo_id,
+                client=client,
+                with_records=with_records_import,
+                token=token,
+                name=f"test_{uuid.uuid4()}",
             )
     else:
         new_dataset = rg.Dataset.from_hub(
-            repo_id=repo_id, client=client, with_records=with_records_import, token=token
+            repo_id=repo_id,
+            client=client,
+            with_records=with_records_import,
+            token=token,
+            name=f"test_{uuid.uuid4()}",
         )
 
     if with_records_import and with_records_export:
```
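Since `from_disk` and `from_hub` now reuse an existing dataset of the same name, each test run appends a UUID to the dataset name to stay isolated. The same pattern applies to any suite that imports datasets repeatedly; a sketch, assuming `client` and `dataset` fixtures like the ones in this test module:

```python
import uuid

import argilla as rg


def test_import_creates_fresh_dataset(client, dataset, tmp_path):
    # A unique name per run avoids colliding with datasets left over from
    # earlier runs, which `from_disk` would otherwise warn about and reuse.
    unique_name = f"test_{uuid.uuid4()}"
    output_dir = dataset.to_disk(path=str(tmp_path))
    new_dataset = rg.Dataset.from_disk(output_dir, client=client, name=unique_name)
    assert new_dataset.name == unique_name
```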
