Add support for pagination in list_models, list_datasets and list_spaces (#1176)

Wauplin · web-flow · commit 711f68838731 · 2022-11-14T12:40:57.000+01:00
* Add support for (future) pagination

* Better handling of (and fix for) the Link header

* Limit pagination

* add comment
diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py
@@ -17,6 +17,7 @@
 import re
 import warnings
 from dataclasses import dataclass, field
+from itertools import islice
 from pathlib import Path
 from typing import Any, BinaryIO, Dict, Iterable, Iterator, List, Optional, Tuple, Union
 from urllib.parse import quote
@@ -71,6 +72,7 @@
     _deprecate_method,
     _deprecate_positional_args,
 )
+from .utils._pagination import paginate
 from .utils._typing import Literal, TypedDict
 from .utils.endpoint_helpers import (
     AttributeDictionary,
@@ -808,15 +810,11 @@ def list_models(
             params.update({"config": True})
         if cardData:
             params.update({"cardData": True})
-        r = requests.get(path, params=params, headers=headers)
-        hf_raise_for_status(r)
-        items = [ModelInfo(**x) for x in r.json()]
 
-        # If pagination has been enabled server-side, older versions of `huggingface_hub`
-        # are deprecated as output is truncated.
-        _warn_if_truncated(
-            items, total_count=r.headers.get("X-Total-Count"), limit=limit
-        )
+        data = paginate(path, params=params, headers=headers)
+        if limit is not None:
+            data = islice(data, limit)  # Do not iterate over all pages
+        items = [ModelInfo(**x) for x in data]
 
         if emissions_thresholds is not None:
             if cardData is None:
@@ -1015,17 +1013,11 @@ def list_datasets(
             params.update({"limit": limit})
         if full or cardData:
             params.update({"full": True})
-        r = requests.get(path, params=params, headers=headers)
-        hf_raise_for_status(r)
-        items = [DatasetInfo(**x) for x in r.json()]
 
-        # If pagination has been enabled server-side, older versions of `huggingface_hub`
-        # are deprecated as output is truncated.
-        _warn_if_truncated(
-            items, total_count=r.headers.get("X-Total-Count"), limit=limit
-        )
-
-        return items
+        data = paginate(path, params=params, headers=headers)
+        if limit is not None:
+            data = islice(data, limit)  # Do not iterate over all pages
+        return [DatasetInfo(**x) for x in data]
 
     def _unpack_dataset_filter(self, dataset_filter: DatasetFilter):
         """
@@ -1162,17 +1154,11 @@ def list_spaces(
             params.update({"datasets": datasets})
         if models is not None:
             params.update({"models": models})
-        r = requests.get(path, params=params, headers=headers)
-        hf_raise_for_status(r)
-        items = [SpaceInfo(**x) for x in r.json()]
-
-        # If pagination has been enabled server-side, older versions of `huggingface_hub`
-        # are deprecated as output is truncated.
-        _warn_if_truncated(
-            items, total_count=r.headers.get("X-Total-Count"), limit=limit
-        )
 
-        return items
+        data = paginate(path, params=params, headers=headers)
+        if limit is not None:
+            data = islice(data, limit)  # Do not iterate over all pages
+        return [SpaceInfo(**x) for x in data]
 
     @validate_hf_hub_args
     def model_info(
@@ -3474,38 +3460,6 @@ def _parse_revision_from_pr_url(pr_url: str) -> str:
     return f"refs/pr/{re_match[1]}"
 
 
-def _warn_if_truncated(
-    items: List[Any], limit: Optional[int], total_count: Optional[str]
-) -> None:
-    # TODO: remove this once pagination is properly implemented in `huggingface_hub`.
-    if total_count is None:
-        # Total count header not implemented
-        return
-
-    try:
-        total_count_int = int(total_count)
-    except ValueError:
-        # Total count header not implemented properly server-side
-        return
-
-    if len(items) == total_count_int:
-        # All items have been returned => not truncated
-        return
-
-    if limit is not None and len(items) == limit:
-        # `limit` is set => truncation is expected
-        return
-
-    # Otherwise, pagination has been enabled server-side and the output has been
-    # truncated by server => warn user.
-    warnings.warn(
-        "The list of repos returned by the server has been truncated. Listing repos"
-        " from the Hub using `list_models`, `list_datasets` and `list_spaces` now"
-        " requires pagination. To get the full list of repos, please consider upgrading"
-        " `huggingface_hub` to its latest version."
-    )
-
-
 api = HfApi()
 
 set_access_token = api.set_access_token
diff --git a/src/huggingface_hub/utils/_pagination.py b/src/huggingface_hub/utils/_pagination.py
@@ -0,0 +1,54 @@
+# coding=utf-8
+# Copyright 2022-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains utilities to handle pagination on Huggingface Hub."""
+from typing import Dict, Iterable, Optional
+
+import requests
+
+from . import hf_raise_for_status, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+def paginate(path: str, params: Dict, headers: Dict) -> Iterable:
+    """Fetch a list of models/datasets/spaces, transparently following pages.
+
+    Pagination is not mandatory on the Hub for now, but the number of repos per page
+    is expected to be limited at some point for performance reasons. Relying on this
+    helper keeps `huggingface_hub` compliant with such future server-side updates.
+
+    Pages are chained using the same "Link" header format as GitHub.
+    See:
+    - https://requests.readthedocs.io/en/latest/api/#requests.Response.links
+    - https://docs.github.com/en/rest/guides/traversing-with-pagination#link-header
+    """
+    # Only the first request carries `params`: the "next" links returned by the
+    # server already contain the query params.
+    response = requests.get(path, params=params, headers=headers)
+    while True:
+        hf_raise_for_status(response)
+        yield from response.json()
+
+        # If pagination is implemented server-side, follow the "next" link
+        next_page = _get_next_page(response)
+        if next_page is None:
+            return  # No "next" link => last page reached
+        logger.debug(f"Pagination detected. Requesting next page: {next_page}")
+        response = requests.get(next_page, headers=headers)
+
+
+def _get_next_page(response: requests.Response) -> Optional[str]:
+    return response.links.get("next", {}).get("url")  # None if no "next" link
diff --git a/tests/test_hf_api.py b/tests/test_hf_api.py
@@ -55,7 +55,6 @@
     ModelSearchArguments,
     RepoFile,
     SpaceInfo,
-    _warn_if_truncated,
     erase_from_credential_store,
     read_from_credential_store,
     repo_type_and_id_from_hf_id,
@@ -2297,26 +2296,3 @@ def _assert_token_is(
         self, mock_build_hf_headers: Mock, expected_value: str
     ) -> None:
         self.assertEqual(mock_build_hf_headers.call_args[1]["token"], expected_value)
-
-
-class WarnIfTruncatedTest(unittest.TestCase):
-    def test_warn_if_truncated(self) -> None:
-        # Can't tell if output is truncated
-        _warn_if_truncated([1, 2, 3], limit=None, total_count=None)
-
-        # Can't tell if output is truncated
-        _warn_if_truncated([1, 2, 3], limit=None, total_count="foo")
-
-        # All items returned
-        _warn_if_truncated([1, 2, 3], limit=None, total_count="3")
-
-        # Output is truncated (no limit, received 3)
-        with self.assertWarns(UserWarning):
-            _warn_if_truncated([1, 2, 3], limit=None, total_count="5")
-
-        # Output is truncated (limit is 4, received 3)
-        with self.assertWarns(UserWarning):
-            _warn_if_truncated([1, 2, 3], limit=4, total_count="5")
-
-        # Output is truncated by the user
-        _warn_if_truncated([1, 2, 3], limit=3, total_count="5")
diff --git a/tests/test_utils_pagination.py b/tests/test_utils_pagination.py
@@ -0,0 +1,76 @@
+import unittest
+from unittest.mock import Mock, call, patch
+
+from huggingface_hub.utils._pagination import paginate
+
+from .testing_utils import handle_injection_in_test
+
+
+class TestPagination(unittest.TestCase):
+    """Tests for the `paginate` helper: one fully mocked, one against a real API."""
+
+    @patch("huggingface_hub.utils._pagination.requests.get")
+    @patch("huggingface_hub.utils._pagination.hf_raise_for_status")
+    @handle_injection_in_test
+    def test_mocked_paginate(
+        self, mock_get: Mock, mock_hf_raise_for_status: Mock
+    ) -> None:
+        """Check that mocked pages are requested lazily and chained in order."""
+        mock_params = Mock()
+        mock_headers = Mock()
+
+        # Simulate page 1
+        mock_response_page_1 = Mock()
+        mock_response_page_1.json.return_value = [1, 2, 3]
+        mock_response_page_1.links = {"next": {"url": "url_p2"}}
+
+        # Simulate page 2
+        mock_response_page_2 = Mock()
+        mock_response_page_2.json.return_value = [4, 5, 6]
+        mock_response_page_2.links = {"next": {"url": "url_p3"}}
+
+        # Simulate page 3 (last page: no "next" link)
+        mock_response_page_3 = Mock()
+        mock_response_page_3.json.return_value = [7, 8]
+        mock_response_page_3.links = {}
+
+        # Serve the 3 pages in order, one per request
+        mock_get.side_effect = [
+            mock_response_page_1,
+            mock_response_page_2,
+            mock_response_page_3,
+        ]
+
+        results = paginate("url", params=mock_params, headers=mock_headers)
+
+        # Requests are made only once the generator is iterated
+        self.assertEqual(mock_get.call_count, 0)
+
+        # Results after concatenating pages
+        self.assertListEqual(list(results), [1, 2, 3, 4, 5, 6, 7, 8])
+
+        # All pages requested: 3 requests, 3 raise for status
+        self.assertEqual(mock_get.call_count, 3)
+        self.assertEqual(mock_hf_raise_for_status.call_count, 3)
+
+        # Params not passed to next pages
+        self.assertListEqual(
+            mock_get.call_args_list,
+            [
+                call("url", params=mock_params, headers=mock_headers),
+                call("url_p2", headers=mock_headers),
+                call("url_p3", headers=mock_headers),
+            ],
+        )
+
+    def test_paginate_github_api(self) -> None:
+        """Follow real "Link" headers by listing huggingface repos on GitHub."""
+        # GitHub's page-size query param is `per_page` (`limit` is not one of them).
+        # With 4 repos per page, reaching the 7th repo (index 6) requires following
+        # at least one "next" link. NOTE: requires network access.
+        url = "https://api.github.com/orgs/huggingface/repos?per_page=4"
+        for num, _ in enumerate(paginate(url, params={}, headers={})):
+            if num == 6:
+                break  # Stop early to avoid loading all repos
+        else:
+            self.fail("Did not get more than 6 repos")