Skip to content

Commit b3f2168

Browse files
committed
improve paging when server count is approximate
1 parent 9ac3164 commit b3f2168

File tree

4 files changed

+222
-30
lines changed

4 files changed

+222
-30
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
Unreleased
22
----------
3-
-
3+
**Improvements**
4+
- `PagedList` handles situations where the server over-estimates the number of items available for paging.
45

56
v1.5.5 (2021-03-26)
67
-------------------

src/sasctl/core.py

Lines changed: 95 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -797,9 +797,6 @@ def __init__(self, obj, session=None, threads=4):
797797
self._obj = obj
798798

799799
def __next__(self):
800-
return self.next()
801-
802-
def next(self):
803800
if self._pool is None:
804801
self._pool = concurrent.futures.ThreadPoolExecutor(max_workers=self._num_threads)
805802

@@ -831,6 +828,8 @@ def __iter__(self):
831828
# All Iterators are also Iterables
832829
return self
833830

831+
next = __next__ # Python 2 compatible
832+
834833
def _request_async(self, start):
835834
"""Used by worker threads to retrieve next batch of items."""
836835

@@ -847,6 +846,9 @@ def _request_async(self, start):
847846
class PagedItemIterator:
848847
"""Iterates through a collection that must be "paged" from the server.
849848
849+
Uses `PageIterator` to transparently download pages of items from the server
850+
as needed.
851+
850852
Parameters
851853
----------
852854
obj : RestObj
@@ -862,78 +864,135 @@ class PagedItemIterator:
862864
------
863865
RestObj
864866
867+
Notes
868+
-----
869+
Value returned by len() is an approximate count of the items available. The actual
870+
number of items returned may be greater than or less than this number.
871+
872+
See Also
873+
--------
874+
PageIterator
875+
865876
"""
866877
def __init__(self, obj, session=None, threads=4):
867878
# Iterates over whole pages of items
868879
self._pager = PageIterator(obj, session, threads)
869880

881+
# Store items from latest page that haven't been returned yet.
882+
self._cache = []
883+
870884
# Total number of items to iterate over
871885
if 'count' in obj:
886+
# NOTE: "count" may be an (over) estimate of the number of items available
887+
# since some may be inaccessible due to user permissions & won't actually be returned.
872888
self._count = int(obj.count)
873889
else:
874890
self._count = len(obj['items'])
875891

876-
self._cache = []
877-
878892
def __len__(self):
879893
return self._count
880894

881895
def __next__(self):
882-
return self.next()
883-
884-
def next(self):
885896
# Get next page of items if we're currently out
886897
if not self._cache:
887898
self._cache = next(self._pager)
888899

900+
if len(self._cache) < self._pager._limit:
901+
print('oops')
902+
# number of items returned in page was less than expected
903+
# might be last page, or items might have been filtered out by server.
904+
889905
# Return the next item
890906
if self._cache:
907+
self._count -= 1
891908
return self._cache.pop(0)
892909

893-
raise StopIteration
910+
raise StopIteration()
911+
# Out of items and out of pages
912+
# self._count = 0
913+
# raise StopIteration
894914

895915
def __iter__(self):
896916
return self
897917

918+
next = __next__ # Python 2 compatible
919+
920+
921+
class PagedListIterator:
922+
"""Iterates over an instance of PagedList
923+
924+
Parameters
925+
----------
926+
l : list-like
898927
899-
class PagedListIterator():
928+
"""
900929
def __init__(self, l):
901930
self.__list = l
902931
self.__index = 0
903932

904933
def __next__(self):
905-
return self.next()
906-
907-
def next(self):
908934
if self.__index >= len(self.__list):
909935
raise StopIteration
910936

911-
item = self.__list[self.__index]
912-
self.__index += 1
913-
return item
937+
try:
938+
item = self.__list[self.__index]
939+
self.__index += 1
940+
return item
941+
except IndexError:
942+
# Because PagedList length is approximate, iterating can result in
943+
# indexing outside the array. Just stop the iteration if that occurs.
944+
raise StopIteration
914945

915946
def __iter__(self):
916947
return self
917948

949+
next = __next__ # Python 2 compatibility
950+
918951

919952
class PagedList(list):
920-
def __init__(self, obj, **kwargs):
953+
"""List that dynamically loads items from the server.
954+
955+
Parameters
956+
----------
957+
obj : RestObj
958+
An instance of `RestObj` containing any initial items and a link to
959+
retrieve additional items.
960+
session : Session, optional
961+
The `Session` instance to use for requesting additional items. Defaults
962+
to current_session()
963+
threads : int, optional
964+
Number of threads allocated to loading additional items.
965+
966+
Notes
967+
-----
968+
Value returned by len() is an approximate count of the items available. The actual
969+
length is not known until all items have been pulled from the server.
970+
971+
See Also
972+
--------
973+
PagedItemIterator
974+
975+
"""
976+
def __init__(self, obj, session=None, threads=4):
921977
super(PagedList, self).__init__()
922-
self._pager = PagedItemIterator(obj, **kwargs)
978+
self._pager = PagedItemIterator(obj, session=session, threads=threads)
923979

924-
# Force caching of first page
925-
self.append(next(self._pager))
926-
items = self._pager._cache
927-
self.extend(items)
980+
# Add the first page of items to the list
981+
for _ in range(len(self._pager._cache)):
982+
self.append(next(self._pager))
928983

929-
# clear the list (py27 compatible)
930-
del self._pager._cache[:]
984+
# Assume that server has more items available
985+
self._has_more = True
931986

932987
def __len__(self):
933-
return len(self._pager)
988+
if self._has_more:
989+
# Estimate the total length as items downloaded + items still on server
990+
return super(PagedList, self).__len__() + len(self._pager)
991+
else:
992+
# We've pulled everything from the server, so we have an exact length now.
993+
return super(PagedList, self).__len__()
934994

935995
def __iter__(self):
936-
# return super(PagedList, self).__iter__()
937996
return PagedListIterator(self)
938997

939998
def __getslice__(self, i, j):
@@ -949,15 +1008,22 @@ def __getitem__(self, item):
9491008
else:
9501009
idx = int(item)
9511010

1011+
# Support negative indexing. Need to load items up to len() - idx.
1012+
if idx < 0:
1013+
idx = len(self) + idx
1014+
9521015
try:
1016+
# Iterate through server-side pages until we've loaded
1017+
# the item at the requested index.
9531018
while super(PagedList, self).__len__() <= idx:
9541019
n = next(self._pager)
9551020
self.append(n)
1021+
9561022
except StopIteration:
957-
# May cause if slice or index extends beyond array
958-
# Ignore and let List handle IndexErrors if necessary.
959-
pass
1023+
# We've hit the end of the paging so the server has no more items to retrieve.
1024+
self._has_more = False
9601025

1026+
# Get the item from the list
9611027
return super(PagedList, self).__getitem__(item)
9621028

9631029
def __str__(self):

tests/unit/test_pageditemiterator.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from sasctl.core import PagedItemIterator, RestObj
1010
from .test_pageiterator import paging
1111

12+
1213
def test_no_paging_required():
1314
"""If "next" link not present, current items should be included."""
1415

@@ -67,3 +68,51 @@ def test_paging(paging):
6768
for i, o in enumerate(pager):
6869
assert RestObj(items[i]) == o
6970

71+
72+
def test_paging_inflated_count():
73+
"""Test behavior when server overestimates the number of items available."""
74+
import re
75+
76+
start = 10
77+
limit = 10
78+
79+
# Only defines 20 items to return
80+
pages = [
81+
[{'name': x} for x in list('abcdefghi')],
82+
[{'name': x} for x in list('klmnopqrs')],
83+
[{'name': x} for x in list('uv')],
84+
]
85+
actual_num_items = sum(len(page) for page in pages)
86+
87+
# services (like Files) may overestimate how many items are available.
88+
# Simulate that behavior
89+
num_items = 23
90+
91+
obj = RestObj(items=pages[0],
92+
count=num_items,
93+
links=[{'rel': 'next',
94+
'href': '/moaritems?start=%d&limit=%d' % (start, limit)}])
95+
96+
with mock.patch('sasctl.core.request') as req:
97+
def side_effect(_, link, **kwargs):
98+
assert 'limit=%d' % limit in link
99+
start = int(re.search(r'(?<=start=)[\d]+', link).group())
100+
if start == 10:
101+
return RestObj(items=pages[1])
102+
elif start == 20:
103+
return RestObj(items=pages[2])
104+
else:
105+
return RestObj(items=[])
106+
107+
req.side_effect = side_effect
108+
109+
pager = PagedItemIterator(obj, threads=1)
110+
111+
# Initially, length is estimated based on how many items the server says it has
112+
assert len(pager) == num_items
113+
114+
# Retrieve all of the items
115+
items = [x for x in pager]
116+
117+
assert len(items) == actual_num_items
118+
assert len(pager) == num_items - actual_num_items

tests/unit/test_pagedlist.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
from sasctl.core import PagedList, RestObj
1010
from .test_pageiterator import paging
1111

12+
1213
def test_len_no_paging():
14+
"""len() should return the correct # of objects if no paging is required."""
1315
items = [{'name': 'a'}, {'name': 'b'}, {'name': 'c'}]
1416
obj = RestObj(items=items, count=len(items))
1517

@@ -89,6 +91,80 @@ def test_getitem_paging(paging):
8991
assert item.name == RestObj(items[i]).name
9092

9193

94+
def test_get_item_inflated_len():
95+
"""Test behavior when server overestimates the number of items available."""
96+
import re
97+
98+
start = 10
99+
limit = 10
100+
101+
# Only defines 20 items to return
102+
pages = [
103+
[{'name': x} for x in list('abcdefghi')],
104+
[{'name': x} for x in list('klmnopqrs')],
105+
[{'name': x} for x in list('uv')],
106+
]
107+
actual_num_items = sum(len(page) for page in pages)
108+
109+
# services (like Files) may overestimate how many items are available.
110+
# Simulate that behavior
111+
num_items = 23
112+
113+
obj = RestObj(items=pages[0],
114+
count=num_items,
115+
links=[{'rel': 'next',
116+
'href': '/moaritems?start=%d&limit=%d' % (start, limit)}])
117+
118+
with mock.patch('sasctl.core.request') as req:
119+
def side_effect(_, link, **kwargs):
120+
assert 'limit=%d' % limit in link
121+
start = int(re.search(r'(?<=start=)[\d]+', link).group())
122+
if start == 10:
123+
return RestObj(items=pages[1])
124+
elif start == 20:
125+
return RestObj(items=pages[2])
126+
else:
127+
return RestObj(items=[])
128+
129+
req.side_effect = side_effect
130+
pager = PagedList(obj, threads=1)
131+
132+
# Initially, length is estimated based on how many items the server says it has available
133+
assert len(pager) == num_items
134+
135+
# Retrieve all of the items
136+
items = [x for x in pager]
137+
138+
# Should have retrieved all of the items
139+
assert len(items) == actual_num_items
140+
141+
# List length should now be updated to indicate the correct number of items
142+
assert len(pager) == actual_num_items
143+
144+
# Recreate the pager
145+
with mock.patch('sasctl.core.request') as req:
146+
def side_effect(_, link, **kwargs):
147+
assert 'limit=%d' % limit in link
148+
start = int(re.search(r'(?<=start=)[\d]+', link).group())
149+
if start == 10:
150+
return RestObj(items=pages[1])
151+
elif start == 20:
152+
return RestObj(items=pages[2])
153+
else:
154+
return RestObj(items=[])
155+
156+
req.side_effect = side_effect
157+
158+
pager = PagedList(obj, threads=1)
159+
160+
# Requesting the last item should work & cause loading of all items
161+
last_item = pager[-1]
162+
163+
# Make sure the last item is correct (even though server inflated item count)
164+
assert last_item == pages[-1][-1]
165+
assert len(pager) == actual_num_items
166+
167+
92168
def test_zip_paging(paging):
93169
"""Check that zip() works correctly with the list."""
94170
obj, items, _ = paging

0 commit comments

Comments
 (0)