Bump version to 2.0.0 (#139)

lukasschwab · web-flow · commit 124f66d0880e · 2023-10-17T01:50:27.000-07:00
+ Improve `max_results`/`delay_seconds` types, defaults (#138) + Eliminate `get`, deprecate `Search.Results` (#137) + Accelerate CI integration tests (#140)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -12,7 +12,6 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
-      max-parallel: 1
       matrix:
         python-version: ["3.7", "3.10", "3.11"]
     steps:
diff --git a/README.md b/README.md
@@ -36,19 +36,19 @@ A `Search` specifies a search of arXiv's database.
 arxiv.Search(
   query: str = "",
   id_list: List[str] = [],
-  max_results: float = float('inf'),
+  max_results: int | None = None,
   sort_by: SortCriterion = SortCriterion.Relevance,
   sort_order: SortOrder = SortOrder.Descending
 )
 ```
 
 + `query`: an arXiv query string. Advanced query formats are documented in the [arXiv API User Manual](https://arxiv.org/help/api/user-manual#query_details).
 + `id_list`: list of arXiv record IDs (typically of the format `"0710.5765v1"`). See [the arXiv API User's Manual](https://arxiv.org/help/api/user-manual#search_query_and_id_list) for documentation of the interaction between `query` and `id_list`.
-+ `max_results`: The maximum number of results to be returned in an execution of this search. To fetch every result available, set `max_results=float('inf')` (default); to fetch up to 10 results, set `max_results=10`. The API's limit is 300,000 results.
++ `max_results`: The maximum number of results to be returned in an execution of this search. To fetch every result available, set `max_results=None` (default); to fetch up to 10 results, set `max_results=10`. The API's limit is 300,000 results.
 + `sort_by`: The sort criterion for results: `relevance`, `lastUpdatedDate`, or `submittedDate`.
 + `sort_order`: The sort order for results: `'descending'` or `'ascending'`.
 
-To fetch arXiv records matching a `Search`, use `search.results()` or `(Client).results(search)` to get a generator yielding `Result`s.
+To fetch arXiv records matching a `Search`, use `(Client).results(search)` to get a generator yielding `Result`s.
 
 #### Example: fetching results
 
@@ -63,7 +63,7 @@ search = arxiv.Search(
   sort_by = arxiv.SortCriterion.SubmittedDate
 )
 
-for result in search.results():
+for result in arxiv.Client().results(search):
   print(result.title)
 ```
 
@@ -72,16 +72,18 @@ Fetch and print the title of the paper with ID "1605.08386v1:"
 ```python
 import arxiv
 
+client = arxiv.Client()
 search = arxiv.Search(id_list=["1605.08386v1"])
-paper = next(search.results())
+
+paper = next(client.results(search))
 print(paper.title)
 ```
 
 ### Result
 
 <!-- TODO: improve this section. -->
 
-The `Result` objects yielded by `(Search).results()` include metadata about each paper and some helper functions for downloading their content.
+The `Result` objects yielded by `(Client).results()` include metadata about each paper and some helper functions for downloading their content.
 
 The meaning of the underlying raw data is documented in the [arXiv API User Manual: Details of Atom Results Returned](https://arxiv.org/help/api/user-manual#_details_of_atom_results_returned).
 
@@ -108,7 +110,7 @@ To download a PDF of the paper with ID "1605.08386v1," run a `Search` and then u
 ```python
 import arxiv
 
-paper = next(arxiv.Search(id_list=["1605.08386v1"]).results())
+paper = next(arxiv.Client().results(arxiv.Search(id_list=["1605.08386v1"])))
 # Download the PDF to the PWD with a default filename.
 paper.download_pdf()
 # Download the PDF to the PWD with a custom filename.
@@ -122,7 +124,7 @@ The same interface is available for downloading .tar.gz files of the paper sourc
 ```python
 import arxiv
 
-paper = next(arxiv.Search(id_list=["1605.08386v1"]).results())
+paper = next(arxiv.Client().results(arxiv.Search(id_list=["1605.08386v1"])))
 # Download the archive to the PWD with a default filename.
 paper.download_source()
 # Download the archive to the PWD with a custom filename.
@@ -133,14 +135,13 @@ paper.download_source(dirpath="./mydir", filename="downloaded-paper.tar.gz")
 
 ### Client
 
-A `Client` specifies a strategy for fetching results from arXiv's API; it obscures pagination and retry logic.
-
-For most use cases the default client should suffice. You can construct it explicitly with `arxiv.Client()`, or use it via the `(Search).results()` method.
+A `Client` specifies a strategy for fetching results from arXiv's API; it obscures pagination and retry logic. For most use cases the default client should suffice.
 
 ```python
+# Default client properties.
 arxiv.Client(
   page_size: int = 100,
-  delay_seconds: int = 3,
+  delay_seconds: float = 3.0,
   num_retries: int = 3
 )
 ```
@@ -151,14 +152,12 @@ arxiv.Client(
 
 #### Example: fetching results with a custom client
 
-`(Search).results()` uses the default client settings. If you want to use a client you've defined instead of the defaults, use `(Client).results(...)`:
-
 ```python
 import arxiv
 
 big_slow_client = arxiv.Client(
   page_size = 1000,
-  delay_seconds = 10,
+  delay_seconds = 10.0,
   num_retries = 5
 )
 
@@ -173,9 +172,11 @@ To inspect this package's network behavior and API logic, configure an `INFO`-le
 
 ```pycon
 >>> import logging, arxiv
->>> logging.basicConfig(level=logging.INFO)
->>> paper = next(arxiv.Search(id_list=["1605.08386v1"]).results())
+>>> logging.basicConfig(level=logging.DEBUG)
+>>> client = arxiv.Client()
+>>> paper = next(client.results(arxiv.Search(id_list=["1605.08386v1"])))
 INFO:arxiv.arxiv:Requesting 100 results at offset 0
-INFO:arxiv.arxiv:Requesting page of results
-INFO:arxiv.arxiv:Got first page; 1 of inf results available
+INFO:arxiv.arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=&id_list=1605.08386v1&sortBy=relevance&sortOrder=descending&start=0&max_results=100
+DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): export.arxiv.org:443
+DEBUG:urllib3.connectionpool:https://export.arxiv.org:443 "GET /api/query?search_query=&id_list=1605.08386v1&sortBy=relevance&sortOrder=descending&start=0&max_results=100&user-agent=arxiv.py%2F1.4.8 HTTP/1.1" 200 979
 ```
diff --git a/arxiv/arxiv.py b/arxiv/arxiv.py
@@ -3,8 +3,10 @@
 
 import logging
 import time
+import itertools
 import feedparser
 import os
+import math
 import re
 import requests
 import warnings
@@ -422,12 +424,12 @@ class Search(object):
     Manual](https://arxiv.org/help/api/user-manual#search_query_and_id_list)
     for documentation of the interaction between `query` and `id_list`.
     """
-    max_results: float
+    max_results: int | None
     """
     The maximum number of results to be returned in an execution of this
     search.
 
-    To fetch every result available, set `max_results=float('inf')`.
+    To fetch every result available, set `max_results=None`.
     """
     sort_by: SortCriterion
     """The sort criterion for results."""
@@ -438,7 +440,7 @@ def __init__(
         self,
         query: str = "",
         id_list: List[str] = [],
-        max_results: float = float("inf"),
+        max_results: int | None = None,
         sort_by: SortCriterion = SortCriterion.Relevance,
         sort_order: SortOrder = SortOrder.Descending,
     ):
@@ -447,7 +449,8 @@ def __init__(
         """
         self.query = query
         self.id_list = id_list
-        self.max_results = max_results
+        # Handle deprecated v1 default behavior.
+        self.max_results = None if max_results == math.inf else max_results
         self.sort_by = sort_by
         self.sort_order = sort_order
 
@@ -479,23 +482,19 @@ def _url_args(self) -> Dict[str, str]:
             "sortOrder": self.sort_order.value,
         }
 
-    def get(self) -> Generator[Result, None, None]:
-        """
-        **Deprecated** after 1.2.0; use `Search.results`.
-        """
-        warnings.warn(
-            "The 'get' method is deprecated, use 'results' instead",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        return self.results()
-
     def results(self, offset: int = 0) -> Generator[Result, None, None]:
         """
         Executes the specified search using a default arXiv API client.
 
         For info on default behavior, see `Client.__init__` and `Client.results`.
+
+        **Deprecated** after 2.0.0; use `Client.results`.
         """
+        warnings.warn(
+            "The '(Search).results' method is deprecated, use 'Client.results' instead",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         return Client().results(self, offset=offset)
 
 
@@ -511,7 +510,7 @@ class Client(object):
     """The arXiv query API endpoint format."""
     page_size: int
     """Maximum number of results fetched in a single API request."""
-    delay_seconds: int
+    delay_seconds: float
     """Number of seconds to wait between API requests."""
     num_retries: int
     """Number of times to retry a failing API request."""
@@ -520,7 +519,7 @@ class Client(object):
     _session: requests.Session
 
     def __init__(
-        self, page_size: int = 100, delay_seconds: int = 3, num_retries: int = 3
+        self, page_size: int = 100, delay_seconds: float = 3.0, num_retries: int = 3
     ):
         """
         Constructs an arXiv API client with the specified options.
@@ -548,17 +547,6 @@ def __repr__(self) -> str:
             repr(self.num_retries),
         )
 
-    def get(self, search: Search) -> Generator[Result, None, None]:
-        """
-        **Deprecated** after 1.2.0; use `Client.results`.
-        """
-        warnings.warn(
-            "The 'get' method is deprecated, use 'results' instead",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        return self.results(search)
-
     def results(self, search: Search, offset: int = 0) -> Generator[Result, None, None]:
         """
         Uses this client configuration to fetch one page of the search results
@@ -574,46 +562,37 @@ def results(self, search: Search, offset: int = 0) -> Generator[Result, None, No
         For more on using generators, see
         [Generators](https://wiki.python.org/moin/Generators).
         """
+        limit = search.max_results - offset if search.max_results else None
+        if limit and limit < 0:
+            return iter(())
+        return itertools.islice(self._results(search, offset), limit)
+
+    def _results(
+        self, search: Search, offset: int = 0
+    ) -> Generator[Result, None, None]:
+        page_url = self._format_url(search, offset, self.page_size)
+        feed = self._parse_feed(page_url, first_page=True)
+        if not feed.entries:
+            logger.info("Got empty first page; stopping generation")
+            return
+        total_results = int(feed.feed.opensearch_totalresults)
+        logger.info(
+            "Got first page: %d of %d total results",
+            len(feed.entries),
+            total_results,
+        )
 
-        # total_results may be reduced according to the feed's
-        # opensearch:totalResults value.
-        total_results = search.max_results
-        first_page = True
-        while offset < total_results:
-            page_size = min(self.page_size, search.max_results - offset)
-            logger.info("Requesting %d results at offset %d", page_size, offset)
-            page_url = self._format_url(search, offset, page_size)
-            feed = self._parse_feed(page_url, first_page=first_page)
-            if first_page:
-                # NOTE: this is an ugly fix for a known bug. The totalresults
-                # value is set to 1 for results with zero entries. If that API
-                # bug is fixed, we can remove this conditional and always set
-                # `total_results = min(...)`.
-                if len(feed.entries) == 0:
-                    logger.info("Got empty first page; stopping generation")
-                    total_results = 0
-                else:
-                    total_results = min(
-                        total_results, int(feed.feed.opensearch_totalresults)
-                    )
-                    logger.info(
-                        "Got first page: %d of %d total results",
-                        total_results,
-                        search.max_results
-                        if search.max_results != float("inf")
-                        else -1,
-                    )
-                # Subsequent pages are not the first page.
-                first_page = False
-            # Update offset for next request: account for received results.
-            offset += len(feed.entries)
-            # Yield query results until page is exhausted.
+        while feed.entries:
             for entry in feed.entries:
                 try:
                     yield Result._from_feed_entry(entry)
                 except Result.MissingFieldError as e:
                     logger.warning("Skipping partial result: %s", e)
-                    continue
+            offset += len(feed.entries)
+            if offset >= total_results:
+                break
+            page_url = self._format_url(search, offset, self.page_size)
+            feed = self._parse_feed(page_url, first_page=False)
 
     def _format_url(self, search: Search, start: int, page_size: int) -> str:
         """
@@ -679,7 +658,7 @@ def __try_parse_feed(
             "Requesting page (first: %r, try: %d): %s", first_page, try_index, url
         )
 
-        resp = self._session.get(url, headers={"user-agent": "arxiv.py/1.4.8"})
+        resp = self._session.get(url, headers={"user-agent": "arxiv.py/2.0.0"})
         self._last_request_dt = datetime.now()
         if resp.status_code != requests.codes.OK:
             raise HTTPError(url, try_index, resp.status_code)
diff --git a/setup.py b/setup.py
@@ -1,6 +1,6 @@
 from setuptools import setup
 
-version = "1.4.8"
+version = "2.0.0"
 
 with open("README.md", "r") as fh:
     long_description = fh.read()
diff --git a/tests/test_api_bugs.py b/tests/test_api_bugs.py
@@ -5,7 +5,7 @@
 import unittest
 
 
-class TestClient(unittest.TestCase):
+class TestAPIBugs(unittest.TestCase):
     def test_missing_title(self):
         """
         Papers with the title "0" do not have a title element in the Atom feed.
diff --git a/tests/test_client.py b/tests/test_client.py
@@ -3,15 +3,9 @@
 import arxiv
 from datetime import datetime, timedelta
 from pytest import approx
-import time
 
 
 class TestClient(unittest.TestCase):
-    def tearDown(self) -> None:
-        # Bodge: sleep three seconds between tests to simulate a shared rate limit.
-        time.sleep(3)
-        return super().tearDown()
-
     def test_invalid_format_id(self):
         with self.assertRaises(arxiv.HTTPError):
             list(arxiv.Client(num_retries=0).results(arxiv.Search(id_list=["abc"])))
@@ -58,7 +52,7 @@ def test_query_page_count(self):
                 "https://export.arxiv.org/api/query?search_query=testing&id_list=&sortBy=relevance&sortOrder=descending&start=20&max_results=10",
                 "https://export.arxiv.org/api/query?search_query=testing&id_list=&sortBy=relevance&sortOrder=descending&start=30&max_results=10",
                 "https://export.arxiv.org/api/query?search_query=testing&id_list=&sortBy=relevance&sortOrder=descending&start=40&max_results=10",
-                "https://export.arxiv.org/api/query?search_query=testing&id_list=&sortBy=relevance&sortOrder=descending&start=50&max_results=5",
+                "https://export.arxiv.org/api/query?search_query=testing&id_list=&sortBy=relevance&sortOrder=descending&start=50&max_results=10",
             },
         )
 
@@ -79,14 +73,12 @@ def test_offset(self):
         self.assertListEqual(offset_above_max_results, [])
 
     def test_search_results_offset(self):
+        # NOTE: page size is irrelevant here.
+        client = arxiv.Client(page_size=15)
         search = arxiv.Search(query="testing", max_results=10)
-        client = arxiv.Client()
-
-        all_results = list(client.results(search, 0))
+        all_results = list(client.results(search, offset=0))
         self.assertEqual(len(all_results), 10)
 
-        client.page_size = 5
-
         for offset in [0, 5, 9, 10, 11]:
             client_results = list(client.results(search, offset=offset))
             self.assertEqual(len(client_results), max(0, search.max_results - offset))
@@ -191,12 +183,12 @@ def test_sleep_between_errors(self, patched_time_sleep):
         self.assertEqual(patched_time_sleep.call_count, client.num_retries)
         patched_time_sleep.assert_has_calls(
             [
-                call(approx(client.delay_seconds, rel=1e-3)),
+                call(approx(client.delay_seconds, abs=1e-2)),
             ]
             * client.num_retries
         )
 
-    def get_code_client(code: int, delay_seconds=3, num_retries=3) -> arxiv.Client:
+    def get_code_client(code: int, delay_seconds=0.1, num_retries=3) -> arxiv.Client:
         """
         get_code_client returns an arxiv.Client with HTTP requests routed to
         httpstat.us.
diff --git a/tests/test_result.py b/tests/test_result.py