Skip to content

Commit 6d360e3

Browse files
authored
Make ModelSearchArguments and DatasetSearchArguments more robust (#1300)
* Make ModelTags and DatasetTags more robust to server-side changes * add warnings about DatasetSearchArguments and ModelSearchArguments inefficiency * add another warning in doc
1 parent 070d5d6 commit 6d360e3

File tree

4 files changed

+74
-36
lines changed

4 files changed

+74
-36
lines changed

docs/source/searching-the-hub.mdx

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,15 @@ The `huggingface_hub` provides a user-friendly interface to know what exactly ca
4545

4646
These are nested namespace objects that have **every single option** available on the Hub and that will return what should be passed to `filter`. Best of all: it has tab completion 🎊.
4747

48+
<Tip warning={true}>
49+
50+
[`ModelSearchArguments`] and [`DatasetSearchArguments`] are legacy helpers meant for exploratory
51+
purposes only. Their initialization requires listing all models and datasets on the Hub, which
52+
makes them increasingly slower as the number of repos on the Hub increases. For production-ready code,
53+
consider passing raw strings when making a filtered search on the Hub.
54+
55+
</Tip>
56+
4857
## Searching for a Model
4958

5059
Let's pose a problem that would be complicated to solve without access to this information:

src/huggingface_hub/hf_api.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -583,9 +583,21 @@ class ModelSearchArguments(AttributeDictionary):
583583
584584
```python
585585
>>> args = ModelSearchArguments()
586-
>>> args.author_or_organization.huggingface
586+
587+
>>> args.author.huggingface
588+
'huggingface'
589+
587590
>>> args.language.en
591+
'en'
588592
```
593+
594+
<Tip warning={true}>
595+
596+
`ModelSearchArguments` is a legacy class meant for exploratory purposes only. Its
597+
initialization requires listing all models on the Hub, which makes it increasingly
598+
slower as the number of repos on the Hub increases.
599+
600+
</Tip>
589601
"""
590602

591603
def __init__(self, api: Optional["HfApi"] = None):
@@ -621,9 +633,21 @@ class DatasetSearchArguments(AttributeDictionary):
621633
622634
```python
623635
>>> args = DatasetSearchArguments()
624-
>>> args.author_or_organization.huggingface
636+
637+
>>> args.author.huggingface
638+
'huggingface'
639+
625640
>>> args.language.en
641+
'language:en'
626642
```
643+
644+
<Tip warning={true}>
645+
646+
`DatasetSearchArguments` is a legacy class meant for exploratory purposes only. Its
647+
initialization requires listing all datasets on the Hub, which makes it increasingly
648+
slower as the number of repos on the Hub increases.
649+
650+
</Tip>
627651
"""
628652

629653
def __init__(self, api: Optional["HfApi"] = None):

src/huggingface_hub/utils/endpoint_helpers.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -300,14 +300,13 @@ def __init__(self, tag_dictionary: dict, keys: Optional[list] = None):
300300
self._unpack_and_assign_dictionary(key)
301301

302302
def _unpack_and_assign_dictionary(self, key: str):
303-
"Assignes nested attributes to `self.key` containing information as an `AttributeDictionary`"
304-
setattr(self, key, AttributeDictionary())
305-
for item in self._tag_dictionary[key]:
306-
ref = getattr(self, key)
307-
item["label"] = (
308-
item["label"].replace(" ", "").replace("-", "_").replace(".", "_")
309-
)
310-
setattr(ref, item["label"], item["id"])
303+
"Assign nested attributes to `self.key` containing information as an `AttributeDictionary`"
304+
ref = AttributeDictionary()
305+
setattr(self, key, ref)
306+
for item in self._tag_dictionary.get(key, []):
307+
label = item["label"].replace(" ", "").replace("-", "_").replace(".", "_")
308+
ref[label] = item["id"]
309+
self[key] = ref
311310

312311

313312
class ModelTags(GeneralTags):

tests/test_endpoint_helpers.py

Lines changed: 32 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -144,38 +144,44 @@ def test_filter(self):
144144
class ModelTagsTest(unittest.TestCase):
145145
@with_production_testing
146146
def test_tags(self):
147-
_api = HfApi()
148-
path = f"{_api.endpoint}/api/models-tags-by-type"
149-
r = requests.get(path)
150-
r.raise_for_status()
151-
d = r.json()
152-
o = ModelTags(d)
153-
for kind in ["library", "language", "license", "dataset", "pipeline_tag"]:
154-
self.assertTrue(len(getattr(o, kind).keys()) > 0)
147+
# ModelTags instantiation must not fail!
148+
res = requests.get(f"{HfApi().endpoint}/api/models-tags-by-type")
149+
res.raise_for_status()
150+
tags = ModelTags(res.json())
151+
152+
# Check existing keys to get notified about server-side changes
153+
for existing_key in [
154+
"dataset",
155+
"language",
156+
"library",
157+
"license",
158+
"pipeline_tag",
159+
]:
160+
self.assertGreater(len(getattr(tags, existing_key).keys()), 0)
155161

156162

157163
class DatasetTagsTest(unittest.TestCase):
158-
@unittest.skip(
159-
"DatasetTags is currently broken. See"
160-
" https://github.com/huggingface/huggingface_hub/pull/1250. Skip test until"
161-
" it's fixed."
162-
)
163164
@with_production_testing
164165
def test_tags(self):
165-
_api = HfApi()
166-
path = f"{_api.endpoint}/api/datasets-tags-by-type"
167-
r = requests.get(path)
168-
r.raise_for_status()
169-
d = r.json()
170-
o = DatasetTags(d)
171-
for kind in [
172-
"language",
173-
"multilinguality",
166+
# DatasetTags instantiation must not fail!
167+
res = requests.get(f"{HfApi().endpoint}/api/datasets-tags-by-type")
168+
res.raise_for_status()
169+
tags = DatasetTags(res.json())
170+
171+
# Some keys existed before but have been removed server-side
172+
for missing_key in (
174173
"language_creators",
175-
"task_categories",
176-
"size_categories",
174+
"multilinguality",
175+
):
176+
self.assertEqual(len(getattr(tags, missing_key).keys()), 0)
177+
178+
# Check existing keys to get notified about server-side changes
179+
for existing_key in [
177180
"benchmark",
178-
"task_ids",
181+
"language",
179182
"license",
183+
"size_categories",
184+
"task_categories",
185+
"task_ids",
180186
]:
181-
self.assertTrue(len(getattr(o, kind).keys()) > 0)
187+
self.assertGreater(len(getattr(tags, existing_key).keys()), 0)

0 commit comments

Comments
 (0)