Skip to content

Commit 0d96c77

Browse files
authored
Merge pull request the-library-code#27 from dpk/iter-methods
Add auto-paginating `get_*_iter` methods for most `get_*` methods
2 parents 888f796 + f43deff commit 0d96c77

File tree

2 files changed

+162
-7
lines changed

2 files changed

+162
-7
lines changed

dspace_rest_client/client.py

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
"""
1717
import json
1818
import logging
19+
import functools
1920

2021
import requests
2122
from requests import Request
@@ -74,6 +75,7 @@ class DSpaceClient:
7475
if 'USER_AGENT' in os.environ:
7576
USER_AGENT = os.environ['USER_AGENT']
7677
verbose = False
78+
ITER_PAGE_SIZE = 20
7779

7880
# Simple enum for patch operation types
7981
class PatchOperation:
@@ -82,6 +84,38 @@ class PatchOperation:
8284
REPLACE = 'replace'
8385
MOVE = 'move'
8486

87+
def paginated(embed_name, item_constructor, embedding=lambda x: x):
    """
    Build a decorator that turns a URL-preparing method into an auto-paginating generator method.

    The decorated function is invoked as fun(do_paginate, self, *args, **kwargs): it receives a
    do_paginate(url, params) helper as its first argument, ahead of self, and normally returns
    the generator produced by calling that helper.

    @param embed_name: The key under '_embedded' in the JSON response that contains the resources to be paginated.
                       (e.g. 'collections', 'objects', 'items', etc.)
    @param item_constructor: A callable that takes a resource dictionary and returns an item.
    @param embedding: Optional callable applied to each fetched page (default: identity function); it must return
                      the dictionary that holds the '_embedded' and '_links' keys for that page
    @return: A decorator that, when applied to a method, follows pagination and yields each resource
    """
    def decorator(fun):
        @functools.wraps(fun)
        def decorated(self, *args, **kwargs):
            def do_paginate(url, params):
                # Build a fresh dict instead of writing 'size' into the caller's dict,
                # so the argument handed in is never modified. None is treated as "no params".
                params = {**(params or {}), 'size': self.ITER_PAGE_SIZE}

                while url is not None:
                    r_json = embedding(self.fetch_resource(url, params))
                    for resource in r_json.get('_embedded', {}).get(embed_name, []):
                        yield item_constructor(resource)

                    if 'next' in r_json.get('_links', {}):
                        url = r_json['_links']['next']['href']
                        # assume the 'next' link contains all the
                        # params needed for the correct next page:
                        params = {}
                    else:
                        # No 'next' link: this was the last page
                        url = None

            return fun(do_paginate, self, *args, **kwargs)

        return decorated

    return decorator
118+
85119
def __init__(self, api_endpoint=API_ENDPOINT, username=USERNAME, password=PASSWORD, solr_endpoint=SOLR_ENDPOINT,
86120
solr_auth=SOLR_AUTH, fake_user_agent=False):
87121
"""
@@ -397,6 +431,36 @@ def search_objects(self, query=None, scope=None, filters=None, page=0, size=20,
397431

398432
return dsos
399433

434+
@paginated(
435+
embed_name='objects',
436+
item_constructor=lambda x: SimpleDSpaceObject(x['_embedded']['indexableObject']),
437+
embedding=lambda x: x['_embedded']['searchResult']
438+
)
439+
def search_objects_iter(do_paginate, self, query=None, scope=None, filters=None, dso_type=None, sort=None):
    """
    Run a discovery search like search_objects, but hand back an iterator that requests the next page
    once the current page has been fully consumed
    @param query: query string
    @param scope: uuid to limit search scope, eg. owning collection, parent community, etc.
    @param filters: discovery filters as dict eg. {'f.entityType': 'Publication,equals', ... }
    @param dso_type: DSO type to further filter results
    @param sort: sort eg. 'title,asc'
    @return: Iterator of SimpleDSpaceObject
    """
    # Only arguments that were actually supplied become request parameters
    optional_params = (
        ('query', query),
        ('scope', scope),
        ('dsoType', dso_type),
        ('sort', sort),
    )
    base_params = {key: value for key, value in optional_params if value is not None}
    extra_filters = {} if filters is None else filters

    # Filters are merged last, so a filter key wins over a same-named base parameter
    return do_paginate(f'{self.API_ENDPOINT}/discover/search/objects', {**base_params, **extra_filters})
463+
400464
def fetch_resource(self, url, params=None):
401465
"""
402466
Simple function for higher-level 'get' functions to use whenever they want
@@ -571,6 +635,20 @@ def get_bundles(self, parent=None, uuid=None, page=0, size=20, sort=None):
571635

572636
return bundles
573637

638+
@paginated('bundles', Bundle)
639+
def get_bundles_iter(do_paginate, self, parent, sort=None):
    """
    Iterate over the bundles of an item, requesting the next page once the current page has been fully consumed
    @param parent: python Item object, from which the UUID will be referenced in the URL.
    @param sort: optional sort value, sent as the 'sort' request parameter when given
    @return: Iterator of Bundle
    """
    request_params = {} if sort is None else {'sort': sort}
    return do_paginate(f'{self.API_ENDPOINT}/core/items/{parent.uuid}/bundles', request_params)
651+
574652
def create_bundle(self, parent=None, name='ORIGINAL'):
575653
"""
576654
Create new bundle in the specified item
@@ -621,6 +699,24 @@ def get_bitstreams(self, uuid=None, bundle=None, page=0, size=20, sort=None):
621699
bitstreams.append(Bitstream(bitstream_resource))
622700
return bitstreams
623701

702+
@paginated('bitstreams', Bitstream)
703+
def get_bitstreams_iter(do_paginate, self, bundle, sort=None):
    """
    Iterate over the bitstreams of a bundle, requesting the next page once the current page has been fully consumed
    @param bundle: A python Bundle object; its 'bitstreams' link is used when present, otherwise the URL is
                   constructed from the bundle UUID
    @param sort: optional sort value, sent as the 'sort' request parameter when given
    @return: Iterator of Bitstream
    """
    if 'bitstreams' not in bundle.links:
        # The bundle carries no bitstreams link, so fall back to building the endpoint by hand
        url = f'{self.API_ENDPOINT}/core/bundles/{bundle.uuid}/bitstreams'
        logging.warning(f'Cannot find bundle bitstream links, will try to construct manually: {url}')
    else:
        url = bundle.links['bitstreams']['href']

    request_params = {} if sort is None else {'sort': sort}
    return do_paginate(url, request_params)
719+
624720
def create_bitstream(self, bundle=None, name=None, path=None, mime=None, metadata=None, retry=False):
625721
"""
626722
Upload a file and create a bitstream for a specified parent bundle, from the uploaded file and
@@ -734,6 +830,24 @@ def get_communities(self, uuid=None, page=0, size=20, sort=None, top=False):
734830
# Return list (populated or empty)
735831
return communities
736832

833+
@paginated('communities', Community)
834+
def get_communities_iter(do_paginate, self, sort=None, top=False):
    """
    Iterate over communities, requesting the next page once the current page has been fully consumed
    @param sort: optional sort value, sent as the 'sort' request parameter when given
    @param top: whether to restrict search to top communities (default: false)
    @return: Iterator of Community
    """
    # The 'search/top' endpoint is used only when top-level communities are requested
    endpoint_path = '/core/communities/search/top' if top else '/core/communities'
    request_params = {} if sort is None else {'sort': sort}
    return do_paginate(f'{self.API_ENDPOINT}{endpoint_path}', request_params)
850+
737851
def create_community(self, parent, data):
738852
"""
739853
Create a community, either top-level or beneath a given parent
@@ -799,6 +913,21 @@ def get_collections(self, uuid=None, community=None, page=0, size=20, sort=None)
799913
# Return list (populated or empty)
800914
return collections
801915

916+
@paginated('collections', Collection)
917+
def get_collections_iter(do_paginate, self, community=None, sort=None):
    """
    Get collections as an iterator, automatically handling pagination by requesting the next page when all items from one page have been consumed
    @param community: Community object. If present and it carries a 'collections' link, only the collections
                      reachable through that link are listed; otherwise all collections are listed
    @param sort: optional sort value, sent as the 'sort' request parameter when given
    @return: Iterator of Collection
    """
    url = f'{self.API_ENDPOINT}/core/collections'

    if community is not None:
        if 'collections' in community.links and 'href' in community.links['collections']:
            url = community.links['collections']['href']
        # NOTE(review): a community without a usable 'collections' link silently falls back to
        # listing all collections -- confirm this is the intended behaviour

    # Forward the requested sort; previously it was accepted but never sent to the API
    params = {}
    if sort is not None:
        params['sort'] = sort

    return do_paginate(url, params)
930+
802931
def create_collection(self, parent, data):
803932
"""
804933
Create collection beneath a given parent community.
@@ -975,6 +1104,12 @@ def delete_user(self, user):
9751104

9761105
# PAGINATION
9771106
def get_users(self, page=0, size=20, sort=None):
1107+
"""
1108+
Get a list of users (epersons) in the DSpace instance
1109+
@param page: Integer for page / offset of results. Default: 0
1110+
@param size: Integer for page size. Default: 20 (same as REST API default)
1111+
@return: list of User objects
1112+
"""
9781113
url = f'{self.API_ENDPOINT}/eperson/epersons'
9791114
users = list()
9801115
params = {}
@@ -992,6 +1127,19 @@ def get_users(self, page=0, size=20, sort=None):
9921127
users.append(User(user_resource))
9931128
return users
9941129

1130+
@paginated('epersons', User)
1131+
def get_users_iter(do_paginate, self, sort=None):
    """
    Iterate over the users (epersons) of the DSpace instance, requesting the next page once the current
    page has been fully consumed
    @param sort: optional sort value, sent as the 'sort' request parameter when given
    @return: Iterator of User
    """
    request_params = {} if sort is None else {'sort': sort}
    return do_paginate(f'{self.API_ENDPOINT}/eperson/epersons', request_params)
1142+
9951143
def create_group(self, group):
9961144
"""
9971145
Create a group

example.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,19 @@
99

1010
from dspace_rest_client.client import DSpaceClient
1111
from dspace_rest_client.models import Community, Collection, Item, Bundle, Bitstream
12+
import os
1213

13-
# Example variables needed for authentication and basic API requests
14-
# SET THESE TO MATCH YOUR TEST SYSTEM BEFORE RUNNING THE EXAMPLE SCRIPT
15-
# You can also leave them out of the constructor and set environment variables instead:
16-
# DSPACE_API_ENDPOINT=
17-
# DSPACE_API_USERNAME=
18-
# DSPACE_API_PASSWORD=
19-
# USER_AGENT=
14+
# The DSpace client will look for the same environment variables but we can also look for them here explicitly
15+
# as an example of how to do so.
2016
# Each setting falls back to a local test-system default when its environment variable is not set
url = os.environ.get('DSPACE_API_ENDPOINT', 'http://localhost:8080/server/api')
username = os.environ.get('DSPACE_API_USERNAME', '[email protected]')
password = os.environ.get('DSPACE_API_PASSWORD', 'password')
2325

2426
# Instantiate DSpace client
2527
# Note the 'fake_user_agent' setting here -- this will set a string like the following, to get by Cloudfront:
@@ -221,3 +223,8 @@
221223
# print, or write to file, etc. You want to use the 'content' property of the response object
222224
#
223225
# print(r.content)
226+
227+
# Finally, demonstrate the _iter methods: they follow pagination transparently and return
# iterators that can be consumed like any other iterable
all_results = d.search_objects_iter('*:*')
for position, hit in enumerate(all_results):
    print(f'Result #{position}: {hit.name} ({hit.uuid})')

0 commit comments

Comments
 (0)