Skip to content

Commit bc4de34

Browse files
committed
Add manifest cache, PDF early limit check, and parallel S3 fetching
1 parent d5e4631 commit bc4de34

File tree

7 files changed

+459
-26
lines changed

7 files changed

+459
-26
lines changed

config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
OPEN_SEARCH_INDEX = 'scan-explorer'
2222
OPEN_SEARCH_AGG_BUCKET_LIMIT = 10000
2323

24+
REDIS_URL = 'redis://redis-backend:6379/1'
25+
2426
ADS_SEARCH_SERVICE_URL = 'https://api.adsabs.harvard.edu/v1/search/query'
2527
ADS_SEARCH_SERVICE_TOKEN = '<CHANGE ME>'
2628

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@ alembic==1.8.0
1414
img2pdf==0.4.4
1515
appmap>=1.1.0.dev0
1616
boto3==1.34.75
17+
redis==4.6.0
Lines changed: 340 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,340 @@
1+
import unittest
2+
import json
3+
import time
4+
from flask import url_for
5+
from unittest.mock import patch, MagicMock
6+
from scan_explorer_service.tests.base import TestCaseDatabase
7+
from scan_explorer_service.models import Article, Base, Collection, Page
8+
9+
10+
class TestManifestCache(TestCaseDatabase):
    """Exercise the Redis-backed manifest cache wired into the manifest view."""

    def create_app(self):
        from scan_explorer_service.app import create_app
        settings = {
            'SQLALCHEMY_DATABASE_URI': self.postgresql_url,
            'SQLALCHEMY_ECHO': False,
            'TESTING': True,
            'PROPAGATE_EXCEPTIONS': True,
            'TRAP_BAD_REQUEST_ERRORS': True,
            'PRESERVE_CONTEXT_ON_EXCEPTION': False,
        }
        return create_app(**settings)

    def setUp(self):
        # Rebuild the schema so every test starts from an empty database.
        Base.metadata.drop_all(bind=self.app.db.engine)
        Base.metadata.create_all(bind=self.app.db.engine)

        # Reset any module-level Redis client left over from a previous test.
        import scan_explorer_service.views.manifest as manifest_module
        manifest_module._redis_client = None

        session = self.app.db.session

        self.collection = Collection(type='type', journal='cacheJ', volume='0099')
        session.add(self.collection)
        session.commit()
        session.refresh(self.collection)

        self.article = Article(bibcode='2099CacheTest..001A',
                               collection_id=self.collection.id)
        session.add(self.article)
        session.commit()
        session.refresh(self.article)

        self.page1 = self._build_page('cp1', 1)
        self.page2 = self._build_page('cp2', 2)
        session.add_all([self.page1, self.page2])
        session.commit()

        self.article.pages.append(self.page1)
        self.article.pages.append(self.page2)
        session.commit()

    def _build_page(self, name, number):
        # Factory for the fixed-size 100x100 pages used by these tests.
        page = Page(name=name, collection_id=self.collection.id,
                    volume_running_page_num=number)
        page.width = 100
        page.height = 100
        page.label = str(number)
        return page

    def tearDown(self):
        # Remove the injected fake client so later tests see fresh module state.
        import scan_explorer_service.views.manifest as manifest_module
        manifest_module._redis_client = None
        self.app.db.session.remove()
        self.app.db.drop_all()

    def _mock_redis(self):
        """Install a MagicMock Redis backed by an in-memory dict.

        Returns the mock and the backing dict mapping key -> (value, expiry);
        get/setex/delete emulate Redis semantics including TTL expiry on read.
        """
        fake = MagicMock()
        backing = {}

        def fake_get(key):
            record = backing.get(key)
            if record is None:
                return None
            value, deadline = record
            if deadline and time.monotonic() > deadline:
                # Emulate Redis dropping the key once its TTL has elapsed.
                del backing[key]
                return None
            return value

        def fake_setex(key, ttl, value):
            backing[key] = (value, time.monotonic() + ttl)

        def fake_delete(key):
            backing.pop(key, None)

        fake.get = fake_get
        fake.setex = fake_setex
        fake.delete = fake_delete
        fake.ping.return_value = True

        import scan_explorer_service.views.manifest as manifest_module
        manifest_module._redis_client = fake
        return fake, backing

    def test_cache_hit_returns_cached_json(self):
        _, backing = self._mock_redis()
        from scan_explorer_service.views.manifest import MANIFEST_CACHE_PREFIX
        backing[MANIFEST_CACHE_PREFIX + self.article.id] = (
            '{"@type":"sc:Manifest","cached":true}', time.monotonic() + 3600)

        response = self.client.get(url_for("manifest.get_manifest", id=self.article.id))
        self.assertStatus(response, 200)
        payload = json.loads(response.data)
        self.assertTrue(payload.get('cached'))

    def test_cache_hit_returns_correct_content_type(self):
        _, backing = self._mock_redis()
        from scan_explorer_service.views.manifest import MANIFEST_CACHE_PREFIX
        backing[MANIFEST_CACHE_PREFIX + self.collection.id] = (
            '{"@type":"sc:Manifest"}', time.monotonic() + 3600)

        response = self.client.get(url_for("manifest.get_manifest", id=self.collection.id))
        self.assertStatus(response, 200)
        self.assertIn('application/json', response.content_type)

    def test_cache_miss_calls_setex(self):
        fake, _ = self._mock_redis()
        recorded_keys = []
        delegate = fake.setex

        def spying_setex(key, ttl, value):
            # Record the key, then forward to the in-memory implementation.
            recorded_keys.append(key)
            return delegate(key, ttl, value)

        fake.setex = spying_setex

        from scan_explorer_service.views.manifest import _cache_set, MANIFEST_CACHE_PREFIX
        _cache_set(self.article.id, '{"@type":"sc:Manifest"}')

        self.assertEqual(len(recorded_keys), 1)
        self.assertEqual(recorded_keys[0], MANIFEST_CACHE_PREFIX + self.article.id)

    def test_cached_manifest_skips_manifest_factory(self):
        _, backing = self._mock_redis()
        from scan_explorer_service.views.manifest import MANIFEST_CACHE_PREFIX
        backing[MANIFEST_CACHE_PREFIX + self.article.id] = (
            '{"@type":"sc:Manifest"}', time.monotonic() + 3600)

        with patch('scan_explorer_service.views.manifest.manifest_factory') as factory_mock:
            response = self.client.get(url_for("manifest.get_manifest", id=self.article.id))
            self.assertStatus(response, 200)
            # A cache hit must never build the manifest from scratch.
            factory_mock.create_manifest.assert_not_called()

    def test_404_not_cached(self):
        _, backing = self._mock_redis()
        from scan_explorer_service.views.manifest import MANIFEST_CACHE_PREFIX

        response = self.client.get(url_for("manifest.get_manifest", id='nonexistent'))
        self.assertStatus(response, 404)
        # Error responses must not be written into the cache.
        self.assertNotIn(MANIFEST_CACHE_PREFIX + 'nonexistent', backing)

    def test_redis_unavailable_falls_through(self):
        import scan_explorer_service.views.manifest as manifest_module
        manifest_module._redis_client = None

        # With connection attempts failing, the view should build the
        # manifest normally instead of erroring out.
        with patch('scan_explorer_service.views.manifest.redis.from_url',
                   side_effect=Exception("connection refused")):
            response = self.client.get(url_for("manifest.get_manifest", id=self.article.id))
            self.assertStatus(response, 200)
            payload = json.loads(response.data)
            self.assertEqual(payload['@type'], 'sc:Manifest')
162+
163+
class TestPdfEarlyLimitCheck(TestCaseDatabase):
    """Verify the PDF endpoint rejects oversized page ranges up front."""

    def create_app(self):
        from scan_explorer_service.app import create_app
        settings = {
            'SQLALCHEMY_DATABASE_URI': self.postgresql_url,
            'SQLALCHEMY_ECHO': False,
            'TESTING': True,
            'PROPAGATE_EXCEPTIONS': True,
            'TRAP_BAD_REQUEST_ERRORS': True,
            'PRESERVE_CONTEXT_ON_EXCEPTION': False,
            'IMAGE_PDF_MEMORY_LIMIT': 100 * 1024 * 1024,
            'IMAGE_PDF_PAGE_LIMIT': 100,
        }
        return create_app(**settings)

    def setUp(self):
        # Fresh schema plus a single collection fixture per test.
        Base.metadata.drop_all(bind=self.app.db.engine)
        Base.metadata.create_all(bind=self.app.db.engine)

        self.collection = Collection(type='type', journal='journal', volume='volume')
        self.app.db.session.add(self.collection)
        self.app.db.session.commit()

    def _request_pdf(self, **params):
        # Issue a pdf_save request for the fixture collection.
        return self.client.get(url_for('proxy.pdf_save', id=self.collection.id, **params))

    def test_over_limit_returns_400_immediately(self):
        response = self._request_pdf(page_start=1, page_end=150)
        self.assertEqual(response.status_code, 400)
        payload = json.loads(response.data)
        self.assertIn('exceeds limit', payload['Message'])

    def test_exactly_at_limit_passes(self):
        # Exactly IMAGE_PDF_PAGE_LIMIT pages must still be allowed through.
        with patch('scan_explorer_service.views.image_proxy.fetch_images') as fetch_mock, \
                patch('scan_explorer_service.views.image_proxy.img2pdf.convert') as convert_mock:
            fetch_mock.return_value = [b'data']
            convert_mock.return_value = b'pdf'
            response = self._request_pdf(page_start=1, page_end=100)
            self.assertEqual(response.status_code, 200)

    def test_one_over_limit_returns_400(self):
        response = self._request_pdf(page_start=1, page_end=101)
        self.assertEqual(response.status_code, 400)

    def test_no_page_end_passes_limit_check(self):
        # Omitting page_end must not trip the early range check.
        with patch('scan_explorer_service.views.image_proxy.fetch_images') as fetch_mock, \
                patch('scan_explorer_service.views.image_proxy.img2pdf.convert') as convert_mock:
            fetch_mock.return_value = [b'data']
            convert_mock.return_value = b'pdf'
            response = self._request_pdf(page_start=1)
            self.assertEqual(response.status_code, 200)

    @patch('scan_explorer_service.views.image_proxy.get_item')
    def test_over_limit_does_not_touch_db(self, mock_get_item):
        # The limit check must short-circuit before any database lookup.
        response = self._request_pdf(page_start=1, page_end=200)
        self.assertEqual(response.status_code, 400)
        mock_get_item.assert_not_called()

    def test_inverted_page_range_returns_empty_pdf(self):
        # An inverted range is tolerated: either an empty PDF or a 400.
        with patch('scan_explorer_service.views.image_proxy.fetch_images') as fetch_mock, \
                patch('scan_explorer_service.views.image_proxy.img2pdf.convert') as convert_mock:
            fetch_mock.return_value = []
            convert_mock.return_value = b'pdf'
            response = self._request_pdf(page_start=10, page_end=5)
            self.assertIn(response.status_code, [200, 400])
243+
244+
class TestParallelFetchImages(TestCaseDatabase):
    """Tests for fetch_images running against a mocked S3 provider."""

    def create_app(self):
        from scan_explorer_service.app import create_app
        settings = {
            'SQLALCHEMY_DATABASE_URI': self.postgresql_url,
            'SQLALCHEMY_ECHO': False,
            'TESTING': True,
            'PROPAGATE_EXCEPTIONS': True,
            'TRAP_BAD_REQUEST_ERRORS': True,
            'PRESERVE_CONTEXT_ON_EXCEPTION': False,
            'IMAGE_PDF_MEMORY_LIMIT': 100 * 1024 * 1024,
            'IMAGE_PDF_PAGE_LIMIT': 100,
        }
        return create_app(**settings)

    def setUp(self):
        # Fresh schema, one collection/article, and five attached pages.
        Base.metadata.drop_all(bind=self.app.db.engine)
        Base.metadata.create_all(bind=self.app.db.engine)

        session = self.app.db.session

        self.collection = Collection(type='type', journal='journal', volume='volume')
        session.add(self.collection)
        session.commit()

        self.article = Article(bibcode='1988ApJ...333..341R',
                               collection_id=self.collection.id)
        session.add(self.article)
        session.commit()

        def make_page(index):
            page = Page(name=f'page{index}', collection_id=self.collection.id,
                        volume_running_page_num=index + 1)
            page.width = 100
            page.height = 100
            page.label = str(index + 1)
            return page

        self.pages = [make_page(i) for i in range(5)]
        session.add_all(self.pages)
        session.commit()

        for page in self.pages:
            self.article.pages.append(page)
        session.commit()

    @patch('scan_explorer_service.views.image_proxy.S3Provider')
    def test_fetch_images_returns_all_pages(self, mock_s3_cls):
        provider = MagicMock()
        provider.read_object_s3.return_value = b'image_data'
        mock_s3_cls.return_value = provider

        from scan_explorer_service.views.image_proxy import fetch_images
        images = list(fetch_images(self.app.db.session, self.collection,
                                   1, 5, 100, 100 * 1024 * 1024))
        self.assertEqual(len(images), 5)
        self.assertTrue(all(chunk == b'image_data' for chunk in images))

    @patch('scan_explorer_service.views.image_proxy.S3Provider')
    def test_fetch_images_respects_memory_limit(self, mock_s3_cls):
        provider = MagicMock()
        provider.read_object_s3.return_value = b'x' * 1000
        mock_s3_cls.return_value = provider

        # With only 500 bytes of budget and 1000-byte objects, the
        # generator must stop before yielding all five pages.
        from scan_explorer_service.views.image_proxy import fetch_images
        images = list(fetch_images(self.app.db.session, self.collection,
                                   1, 5, 100, 500))
        self.assertLess(len(images), 5)

    @patch('scan_explorer_service.views.image_proxy.S3Provider')
    def test_fetch_images_skips_none_results(self, mock_s3_cls):
        provider = MagicMock()
        provider.read_object_s3.side_effect = [b'data1', None, b'data3', b'data4', b'data5']
        mock_s3_cls.return_value = provider

        # A None fetch result is dropped rather than yielded.
        from scan_explorer_service.views.image_proxy import fetch_images
        images = list(fetch_images(self.app.db.session, self.collection,
                                   1, 5, 100, 100 * 1024 * 1024))
        self.assertEqual(len(images), 4)

    @patch('scan_explorer_service.views.image_proxy.S3Provider')
    def test_single_s3provider_instance(self, mock_s3_cls):
        provider = MagicMock()
        provider.read_object_s3.return_value = b'image_data'
        mock_s3_cls.return_value = provider

        # One provider should be constructed for the whole fetch, not per page.
        from scan_explorer_service.views.image_proxy import fetch_images
        list(fetch_images(self.app.db.session, self.collection,
                          1, 5, 100, 100 * 1024 * 1024))
        self.assertEqual(mock_s3_cls.call_count, 1)
338+
339+
# Allow running this test module directly with `python <module>.py`.
if __name__ == '__main__':
    unittest.main()

scan_explorer_service/tests/test_proxy.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -157,9 +157,11 @@ def test_get_item(self):
157157
get_item(self.app.db.session, 'non-existent-id')
158158
assert("ID: non-existent-id not found" in str(context.exception))
159159

160-
@patch('scan_explorer_service.views.image_proxy.fetch_object')
161-
def test_fetch_images(self, mock_fetch_object):
162-
mock_fetch_object.return_value = b'image_data'
160+
@patch('scan_explorer_service.views.image_proxy.S3Provider')
161+
def test_fetch_images(self, mock_s3_cls):
162+
mock_s3 = MagicMock()
163+
mock_s3.read_object_s3.return_value = b'image_data'
164+
mock_s3_cls.return_value = mock_s3
163165
item = self.article
164166
page_start = 1
165167
page_end = 2
@@ -169,7 +171,7 @@ def test_fetch_images(self, mock_fetch_object):
169171
gen = fetch_images(self.app.db.session, item, page_start, page_end, page_limit, memory_limit)
170172
images = list(gen)
171173
self.assertEqual(images, [b'image_data', b'image_data'])
172-
mock_fetch_object.assert_called()
174+
mock_s3.read_object_s3.assert_called()
173175

174176
@patch('scan_explorer_service.utils.s3_utils.S3Provider.read_object_s3')
175177
def test_fetch_object(self, mock_read_object_s3):

0 commit comments

Comments
 (0)