Merge pull request the-library-code#27 from dpk/iter-methods

kshepherd · web-flow · commit 0d96c7746bd9 · 2024-11-20T16:17:58.000+01:00
Add auto-paginating `get_*_iter` methods for most `get_*` methods
diff --git a/dspace_rest_client/client.py b/dspace_rest_client/client.py
@@ -16,6 +16,7 @@
 """
 import json
 import logging
+import functools
 
 import requests
 from requests import Request
@@ -74,6 +75,7 @@ class DSpaceClient:
     if 'USER_AGENT' in os.environ:
         USER_AGENT = os.environ['USER_AGENT']
     verbose = False
+    ITER_PAGE_SIZE = 20
 
     # Simple enum for patch operation types
     class PatchOperation:
@@ -82,6 +84,38 @@ class PatchOperation:
         REPLACE = 'replace'
         MOVE = 'move'
 
+    def paginated(embed_name, item_constructor, embedding=lambda x: x):
+        """
+        Decorator factory for the *_iter methods: the decorated method is called with a `do_paginate(url, params)` generator function as its first argument, ahead of `self`
+        @param embed_name: The key under '_embedded' in the JSON response that contains the resources to be paginated (e.g. 'collections', 'objects', 'items', etc.)
+        @param item_constructor: A callable that takes a resource dictionary and returns an item.
+        @param embedding: Optional callable (default: identity function) applied to each fetched page; '_embedded' and '_links' are read from its result
+        @return: A decorator that, when applied to a method, follows pagination and yields each resource
+        """
+        def decorator(fun):
+            @functools.wraps(fun)
+            def decorated(self, *args, **kwargs):
+                def do_paginate(url, params):
+                    params['size'] = self.ITER_PAGE_SIZE  # overwrites any 'size' already present in the caller's params dict
+
+                    while url is not None:
+                        r_json = embedding(self.fetch_resource(url, params))  # NOTE(review): assumes fetch_resource returned a dict - confirm its failure behaviour
+                        for resource in r_json.get('_embedded', {}).get(embed_name, []):  # a page lacking '_embedded' or embed_name yields nothing
+                            yield item_constructor(resource)
+
+                        if 'next' in r_json.get('_links', {}):
+                            url = r_json['_links']['next']['href']
+                            # assume the 'next' link already carries every query parameter
+                            # (including 'size') needed to fetch the correct next page:
+                            params = {}
+                        else:
+                            url = None  # no 'next' link: this was the last page, so the loop ends
+
+                return fun(do_paginate, self, *args, **kwargs)  # do_paginate is passed before self, matching the `(do_paginate, self, ...)` signatures
+            return decorated
+
+        return decorator
+
     def __init__(self, api_endpoint=API_ENDPOINT, username=USERNAME, password=PASSWORD, solr_endpoint=SOLR_ENDPOINT,
                  solr_auth=SOLR_AUTH, fake_user_agent=False):
         """
@@ -397,6 +431,36 @@ def search_objects(self, query=None, scope=None, filters=None, page=0, size=20,
 
         return dsos
 
+    @paginated(
+        embed_name='objects',
+        item_constructor=lambda x: SimpleDSpaceObject(x['_embedded']['indexableObject']),
+        embedding=lambda x: x['_embedded']['searchResult']
+    )
+    def search_objects_iter(do_paginate, self, query=None, scope=None, filters=None, dso_type=None, sort=None):
+        """
+        Iterate over the results of a basic search (as in search_objects), requesting each following page only once the current page's results have been consumed
+        @param query:   query string
+        @param scope:   uuid to limit search scope, eg. owning collection, parent community, etc.
+        @param filters: discovery filters as dict eg. {'f.entityType': 'Publication,equals', ... }
+        @param sort: sort eg. 'title,asc'
+        @param dso_type: DSO type to further filter results
+        @return:        Iterator of SimpleDSpaceObject
+        """
+        endpoint = f'{self.API_ENDPOINT}/discover/search/objects'
+
+        # Named arguments and the request parameter each one maps to; anything left as None is not sent
+        optional_params = (
+            ('query', query),
+            ('scope', scope),
+            ('dsoType', dso_type),
+            ('sort', sort),
+        )
+        search_params = {name: value for name, value in optional_params if value is not None}
+        extra_filters = {} if filters is None else filters
+
+        # Filters are unpacked after the named parameters, so a filter wins on a key clash
+        return do_paginate(endpoint, {**search_params, **extra_filters})
+
     def fetch_resource(self, url, params=None):
         """
         Simple function for higher-level 'get' functions to use whenever they want
@@ -571,6 +635,20 @@ def get_bundles(self, parent=None, uuid=None, page=0, size=20, sort=None):
 
         return bundles
 
+    @paginated('bundles', Bundle)
+    def get_bundles_iter(do_paginate, self, parent, sort=None):
+        """
+        Iterate over the bundles of an item, requesting each following page only once the current page's bundles have been consumed
+        @param parent:  python Item object, from which the UUID will be referenced in the URL.
+        @param sort:    optional value for the 'sort' request parameter; omitted from the request when None
+        @return:        Iterator of Bundle
+        """
+        bundles_url = f'{self.API_ENDPOINT}/core/items/{parent.uuid}/bundles'
+        # 'sort' is the only request parameter set here
+        query_params = {} if sort is None else {'sort': sort}
+
+        return do_paginate(bundles_url, query_params)
+
     def create_bundle(self, parent=None, name='ORIGINAL'):
         """
         Create new bundle in the specified item
@@ -621,6 +699,24 @@ def get_bitstreams(self, uuid=None, bundle=None, page=0, size=20, sort=None):
                     bitstreams.append(Bitstream(bitstream_resource))
                 return bitstreams
 
+    @paginated('bitstreams', Bitstream)
+    def get_bitstreams_iter(do_paginate, self, bundle, sort=None):
+        """
+        Iterate over all bitstreams of a bundle, requesting each following page only once the current page's bitstreams have been consumed
+        @param bundle:  A python Bundle object to parse for bitstream links to retrieve
+        @param sort:    optional value for the 'sort' request parameter; omitted from the request when None
+        @return:        Iterator of Bitstream
+        """
+        query_params = {} if sort is None else {'sort': sort}
+
+        if 'bitstreams' not in bundle.links:
+            # No 'bitstreams' link on the bundle: fall back to a URL built from the bundle's uuid
+            url = f'{self.API_ENDPOINT}/core/bundles/{bundle.uuid}/bitstreams'
+            logging.warning(f'Cannot find bundle bitstream links, will try to construct manually: {url}')
+            return do_paginate(url, query_params)
+
+        return do_paginate(bundle.links['bitstreams']['href'], query_params)
+
     def create_bitstream(self, bundle=None, name=None, path=None, mime=None, metadata=None, retry=False):
         """
         Upload a file and create a bitstream for a specified parent bundle, from the uploaded file and
@@ -734,6 +830,24 @@ def get_communities(self, uuid=None, page=0, size=20, sort=None, top=False):
         # Return list (populated or empty)
         return communities
 
+    @paginated('communities', Community)
+    def get_communities_iter(do_paginate, self, sort=None, top=False):
+        """
+        Iterate over communities, requesting each following page only once the current page's communities have been consumed
+        @param sort:    optional value for the 'sort' request parameter; omitted from the request when None
+        @param top:     whether to restrict search to top communities (default: false)
+        @return: Iterator of Community
+        """
+        url = (
+            f'{self.API_ENDPOINT}/core/communities/search/top'
+            if top
+            else f'{self.API_ENDPOINT}/core/communities'
+        )
+        # 'sort' is the only request parameter set here
+        query_params = {} if sort is None else {'sort': sort}
+
+        return do_paginate(url, query_params)
+
     def create_community(self, parent, data):
         """
         Create a community, either top-level or beneath a given parent
@@ -799,6 +913,21 @@ def get_collections(self, uuid=None, community=None, page=0, size=20, sort=None)
         # Return list (populated or empty)
         return collections
 
+    @paginated('collections', Collection)
+    def get_collections_iter(do_paginate, self, community=None, sort=None):
+        """
+        Get collections as an iterator, automatically handling pagination by requesting the next page when all items from one page have been consumed
+        @param community:   Community object. If present and it carries a 'collections' link, that link is listed instead of all collections
+        @param sort:        Optional value for the 'sort' request parameter; omitted from the request when None
+        @return:            Iterator of Collection
+        """
+        url = f'{self.API_ENDPOINT}/core/collections'
+        if community is not None:
+            if 'collections' in community.links and 'href' in community.links['collections']:
+                url = community.links['collections']['href']
+        params = {} if sort is None else {'sort': sort}  # forward sort to the request, as the other *_iter methods do
+        return do_paginate(url, params)
+
     def create_collection(self, parent, data):
         """
         Create collection beneath a given parent community.
@@ -975,6 +1104,12 @@ def delete_user(self, user):
 
     # PAGINATION
     def get_users(self, page=0, size=20, sort=None):
+        """
+        Get a list of users (epersons) in the DSpace instance
+        @param page: Integer for page / offset of results. Default: 0
+        @param size: Integer for page size. Default: 20 (same as REST API default)
+        @return:     list of User objects
+        """
         url = f'{self.API_ENDPOINT}/eperson/epersons'
         users = list()
         params = {}
@@ -992,6 +1127,19 @@ def get_users(self, page=0, size=20, sort=None):
                     users.append(User(user_resource))
         return users
 
+    @paginated('epersons', User)
+    def get_users_iter(do_paginate, self, sort=None):
+        """
+        Iterate over the users (epersons) in the DSpace instance, requesting each following page only once the current page's users have been consumed
+        @param sort: optional value for the 'sort' request parameter; omitted from the request when None
+        @return:     Iterator of User
+        """
+        epersons_url = f'{self.API_ENDPOINT}/eperson/epersons'
+        # 'sort' is the only request parameter set here
+        query_params = {} if sort is None else {'sort': sort}
+
+        return do_paginate(epersons_url, query_params)
+
     def create_group(self, group):
         """
         Create a group
diff --git a/example.py b/example.py
@@ -9,17 +9,19 @@
 
 from dspace_rest_client.client import DSpaceClient
 from dspace_rest_client.models import Community, Collection, Item, Bundle, Bitstream
+import os
 
-# Example variables needed for authentication and basic API requests
-# SET THESE TO MATCH YOUR TEST SYSTEM BEFORE RUNNING THE EXAMPLE SCRIPT
-# You can also leave them out of the constructor and set environment variables instead:
-# DSPACE_API_ENDPOINT=
-# DSPACE_API_USERNAME=
-# DSPACE_API_PASSWORD=
-# USER_AGENT=
+# The DSpace client reads these same environment variables itself, but as an example we also
+# read them here explicitly, falling back to the placeholder values below when they are unset
 url = 'http://localhost:8080/server/api'
+if 'DSPACE_API_ENDPOINT' in os.environ:
+    url = os.environ['DSPACE_API_ENDPOINT']
 username = 'username@test.system.edu'
+if 'DSPACE_API_USERNAME' in os.environ:
+    username = os.environ['DSPACE_API_USERNAME']
 password = 'password'
+if 'DSPACE_API_PASSWORD' in os.environ:
+    password = os.environ['DSPACE_API_PASSWORD']
 
 # Instantiate DSpace client
 # Note the 'fake_user_agent' setting here -- this will set a string like the following, to get by Cloudfront:
@@ -221,3 +223,8 @@
                     # print, or write to file, etc. You want to use the 'content' property of the response object
                     #
                     # print(r.content)
+
+# Finally, let's show the new _iter methods which will transparently handle pagination and return iterators
+# which you can use as normal
+for i, search_result in enumerate(d.search_objects_iter('*:*')):
+    print(f'Result #{i}: {search_result.name} ({search_result.uuid})')