
Commit e28ef13

burtenshaw, frascuchon, and pre-commit-ci[bot] authored
[REFACTOR] refactor from hub method to simplify method (#5420)
# Description

Closes #<issue_number>

**Type of change**

- Bug fix (non-breaking change which fixes an issue)
- New feature (non-breaking change which adds functionality)
- Breaking change (fix or feature that would cause existing functionality to not work as expected)
- Refactor (change restructuring the codebase without changing functionality)
- Improvement (change adding some improvement to an existing functionality)
- Documentation update

**How Has This Been Tested**

**Checklist**

- I added relevant documentation
- I followed the style guidelines of this project
- I did a self-review of my code
- I made corresponding changes to the documentation
- I confirm my changes generate no new warnings
- I have added tests that prove my fix is effective or that my feature works
- I have added relevant notes to the CHANGELOG.md file (See https://keepachangelog.com/)

---------

Co-authored-by: Paco Aranda <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 6815e5c commit e28ef13
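
In practical terms, this refactor changes two user-visible behaviors of `Dataset.from_hub`: `name` now defaults to the repo id with `/` replaced by `_`, and multi-split Hub datasets no longer raise a `ValueError` (the first split is used and a `UserWarning` lists the alternatives). A minimal usage sketch, assuming a running Argilla server and a hypothetical repo id:

```python
import argilla as rg

# Hypothetical server URL and credentials.
client = rg.Argilla(api_url="http://localhost:6900", api_key="argilla.apikey")

# `name` may now be omitted: it defaults to the repo id with "/" replaced
# by "_", e.g. "my-org/my-dataset" -> "my-org_my-dataset".
dataset = rg.Dataset.from_hub(
    repo_id="my-org/my-dataset",  # hypothetical repo
    client=client,
    with_records=True,
)

# If the Hub dataset has several splits and no `split` kwarg is passed,
# the first split is loaded and a UserWarning lists the available splits.
```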

File tree

2 files changed: +100 -32 lines changed

argilla/src/argilla/datasets/_export/_hub.py

Lines changed: 40 additions & 32 deletions
```diff
@@ -16,14 +16,15 @@
 import warnings
 from collections import defaultdict
 from tempfile import TemporaryDirectory
-from typing import TYPE_CHECKING, Any, Optional, Type, Union
+from typing import TYPE_CHECKING, Any, Optional, Type, Union, Dict
 from uuid import UUID
 
+from datasets import DatasetDict
+from datasets.data_files import EmptyDatasetError
+
 from argilla._exceptions._api import UnprocessableEntityError
 from argilla._exceptions._records import RecordsIngestionError
 from argilla._exceptions._settings import SettingsError
-from datasets.data_files import EmptyDatasetError
-
 from argilla.datasets._export._disk import DiskImportExportMixin
 from argilla.records._mapping import IngestedRecordMapper
 from argilla.responses import Response
@@ -72,6 +73,7 @@ def to_hub(
 
         with TemporaryDirectory() as tmpdirname:
             config_dir = os.path.join(tmpdirname)
+
             self.to_disk(path=config_dir, with_records=False)
 
             if generate_card:
@@ -129,9 +131,12 @@ def from_hub(
         Returns:
             A `Dataset` loaded from the Hugging Face Hub.
         """
-        from datasets import Dataset, DatasetDict, load_dataset
+        from datasets import load_dataset
         from huggingface_hub import snapshot_download
 
+        if name is None:
+            name = repo_id.replace("/", "_")
+
         if settings is not None:
             dataset = cls(name=name, settings=settings)
             dataset.create()
@@ -150,31 +155,9 @@
 
         if with_records:
             try:
-                hf_dataset: Dataset = load_dataset(path=repo_id, **kwargs)  # type: ignore
-                if isinstance(hf_dataset, DatasetDict) and "split" not in kwargs:
-                    if len(hf_dataset.keys()) > 1:
-                        raise ValueError(
-                            "Only one dataset can be loaded at a time, use `split` to select a split, available splits"
-                            f" are: {', '.join(hf_dataset.keys())}."
-                        )
-                    hf_dataset: Dataset = hf_dataset[list(hf_dataset.keys())[0]]
-                for feature in hf_dataset.features:
-                    if feature not in dataset.settings.fields or feature not in dataset.settings.questions:
-                        warnings.warn(
-                            message=f"Feature {feature} in Hugging Face dataset is not defined in dataset settings."
-                        )
-                        warnings.warn(
-                            message=f"Available fields: {dataset.settings.fields}. Available questions: {dataset.settings.questions}."
-                        )
-                try:
-                    cls._log_dataset_records(hf_dataset=hf_dataset, dataset=dataset)
-                except (RecordsIngestionError, UnprocessableEntityError) as e:
-                    if settings is not None:
-                        raise SettingsError(
-                            message=f"Failed to load records from Hugging Face dataset. Defined settings do not match dataset schema {hf_dataset.features}"
-                        ) from e
-                    else:
-                        raise e
+                hf_dataset = load_dataset(path=repo_id, **kwargs)  # type: ignore
+                hf_dataset = cls._get_dataset_split(hf_dataset=hf_dataset, **kwargs)
+                cls._log_dataset_records(hf_dataset=hf_dataset, dataset=dataset)
             except EmptyDatasetError:
                 warnings.warn(
                     message="Trying to load a dataset `with_records=True` but dataset does not contain any records.",
@@ -221,9 +204,7 @@ def _log_dataset_records(hf_dataset: "HFDataset", dataset: "Dataset"):
         records = []
         for idx, row in enumerate(hf_dataset):
             record = mapper(row)
-            record.id = row.pop("id")
             for question_name, values in response_questions.items():
-                response_users = {}
                 response_values = values["responses"][idx]
                 response_users = values["users"][idx]
                 response_status = values["status"][idx]
@@ -240,4 +221,31 @@ def _log_dataset_records(hf_dataset: "HFDataset", dataset: "Dataset"):
                 )
                 record.responses.add(response)
             records.append(record)
-        dataset.records.log(records=records)
+
+        try:
+            dataset.records.log(records=records)
+        except (RecordsIngestionError, UnprocessableEntityError) as e:
+            raise SettingsError(
+                message=f"Failed to load records from Hugging Face dataset. Defined settings do not match dataset schema. Hugging face dataset features: {hf_dataset.features}. Argilla dataset settings : {dataset.settings}"
+            ) from e
+
+    @staticmethod
+    def _get_dataset_split(hf_dataset: "HFDataset", split: Optional[str] = None, **kwargs: Dict) -> "HFDataset":
+        """Get a single dataset from a Hugging Face dataset.
+
+        Parameters:
+            hf_dataset (HFDataset): The Hugging Face dataset to get a single dataset from.
+
+        Returns:
+            HFDataset: The single dataset.
+        """
+
+        if isinstance(hf_dataset, DatasetDict) and split is None:
+            split = next(iter(hf_dataset.keys()))
+            if len(hf_dataset.keys()) > 1:
+                warnings.warn(
+                    message=f"Multiple splits found in Hugging Face dataset. Using the first split: {split}. "
+                    f"Available splits are: {', '.join(hf_dataset.keys())}."
+                )
+            hf_dataset = hf_dataset[split]
+        return hf_dataset
```
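
To see the new `_get_dataset_split` behavior in isolation, here is a standalone sketch of the same split-selection logic run against a small in-memory `DatasetDict` (the toy data is invented for illustration):

```python
import warnings

from datasets import Dataset, DatasetDict

# A toy multi-split dataset standing in for the result of `load_dataset`.
hf_dataset = DatasetDict(
    {
        "train": Dataset.from_dict({"text": ["a", "b"]}),
        "test": Dataset.from_dict({"text": ["c"]}),
    }
)

# Mirrors the method above: pick the first split and warn if there are others.
split = next(iter(hf_dataset.keys()))  # "train"
if len(hf_dataset.keys()) > 1:
    warnings.warn(
        f"Multiple splits found in Hugging Face dataset. Using the first split: {split}. "
        f"Available splits are: {', '.join(hf_dataset.keys())}."
    )
hf_dataset = hf_dataset[split]
print(hf_dataset.num_rows)  # 2
```

Note that the old code raised a `ValueError` for multi-split datasets; downgrading that to a warning is the user-visible change here.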

argilla/tests/integration/test_export_dataset.py

Lines changed: 60 additions & 0 deletions
```diff
@@ -250,10 +250,70 @@ def test_import_dataset_from_hub_using_settings(
 
         assert new_dataset.settings.fields[0].name == "text"
         assert new_dataset.settings.questions[0].name == "label"
+
+    @pytest.mark.parametrize("with_records_import", [True, False])
+    def test_import_dataset_from_hub_using_settings(
+        self,
+        token: str,
+        dataset: rg.Dataset,
+        client,
+        mock_data: List[dict[str, Any]],
+        with_records_export: bool,
+        with_records_import: bool,
+    ):
+        repo_id = (
+            f"argilla-internal-testing/test_import_dataset_from_hub_using_settings_with_records{with_records_export}"
+        )
+        mock_dataset_name = f"test_import_dataset_from_hub_using_settings_{uuid.uuid4()}"
+        dataset.records.log(records=mock_data)
+
+        dataset.to_hub(repo_id=repo_id, with_records=with_records_export, token=token)
+        settings = rg.Settings(
+            fields=[
+                rg.TextField(name="text"),
+            ],
+            questions=[
+                rg.LabelQuestion(name="label", labels=["positive", "negative"]),
+                rg.LabelQuestion(name="extra_label", labels=["extra_positive", "extra_negative"]),
+            ],
+        )
+        if with_records_import and not with_records_export:
+            with pytest.warns(
+                expected_warning=UserWarning,
+                match="Trying to load a dataset `with_records=True` but dataset does not contain any records.",
+            ):
+                new_dataset = rg.Dataset.from_hub(
+                    repo_id=repo_id,
+                    client=client,
+                    with_records=with_records_import,
+                    token=token,
+                    settings=settings,
+                    name=mock_dataset_name,
+                )
+        else:
+            new_dataset = rg.Dataset.from_hub(
+                repo_id=repo_id,
+                client=client,
+                with_records=with_records_import,
+                token=token,
+                settings=settings,
+                name=mock_dataset_name,
+            )
+
+        if with_records_import and with_records_export:
+            for i, record in enumerate(new_dataset.records(with_suggestions=True)):
+                assert record.fields["text"] == mock_data[i]["text"]
+                assert record.suggestions["label"].value == mock_data[i]["label"]
+        else:
+            assert len(new_dataset.records.to_list()) == 0
+
+        assert new_dataset.settings.fields[0].name == "text"
+        assert new_dataset.settings.questions[0].name == "label"
         assert new_dataset.settings.questions[1].name == "extra_label"
         assert len(new_dataset.settings.questions[1].labels) == 2
         assert new_dataset.settings.questions[1].labels[0] == "extra_positive"
         assert new_dataset.settings.questions[1].labels[1] == "extra_negative"
+        assert new_dataset.name == mock_dataset_name
 
     def test_import_dataset_from_hub_using_wrong_settings(
         self,
```
