Skip to content

Commit 100295b

Browse files
authored
fix for #865!! (#959)
Differentiate between LiveWebLoader being used for fully remote loading (e.g. remote CDX + pywb) and being used with only a remote index (remote CDX + local WARCs), by checking whether 'archive_paths' has been explicitly set. If it has, skip LiveWebLoader when filename/offset are provided, so that loading can fall back to retrying as a revisit.
1 parent 7e5a21c commit 100295b

File tree

3 files changed

+16
-5
lines changed

3 files changed

+16
-5
lines changed

pywb/warcserver/handlers.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,9 +175,10 @@ def __call__(self, params):
175175
#=============================================================================
176176
class DefaultResourceHandler(ResourceHandler):
177177
def __init__(self, index_source, warc_paths='', forward_proxy_prefix='',
178+
use_local_file_load=False,
178179
**kwargs):
179180
loaders = [WARCPathLoader(warc_paths, index_source),
180-
LiveWebLoader(forward_proxy_prefix),
181+
LiveWebLoader(forward_proxy_prefix, use_local_file_load=use_local_file_load),
181182
VideoLoader()
182183
]
183184
super(DefaultResourceHandler, self).__init__(index_source, loaders, **kwargs)

pywb/warcserver/resource/responseloader.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -274,9 +274,12 @@ class LiveWebLoader(BaseLoader):
274274
'application/vnd.apple.mpegurl',
275275
'application/dash+xml')
276276

277-
def __init__(self, forward_proxy_prefix=None, adapter=None):
277+
def __init__(self, forward_proxy_prefix=None, adapter=None, use_local_file_load=False):
278278
self.forward_proxy_prefix = forward_proxy_prefix
279279

280+
# indicates if WARCs can be loaded locally, even if we get to this fallback
281+
self.use_local_file_load = use_local_file_load
282+
280283
socks_host = os.environ.get('SOCKS_HOST')
281284
socks_port = os.environ.get('SOCKS_PORT', 9050)
282285
if socks_host and socks_port:
@@ -285,8 +288,11 @@ def __init__(self, forward_proxy_prefix=None, adapter=None):
285288
self.socks_proxy = None
286289

287290
def load_resource(self, cdx, params):
288-
#if cdx.get('filename') and cdx.get('offset') is not None:
289-
# return None
291+
if cdx.get('filename') and cdx.get('offset') is not None:
292+
# if loading locally, skip here so can retry
293+
# in case of revisit
294+
if self.use_local_file_load:
295+
return None
290296

291297
load_url = cdx.get('load_url')
292298
if not load_url:

pywb/warcserver/warcserver.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,9 @@ def load_coll(self, name, coll_config):
244244
# ARCHIVE CONFIG
245245
if not archive_paths:
246246
archive_paths = self.config.get('archive_paths')
247+
use_local_file_load = False
248+
else:
249+
use_local_file_load = True
247250

248251
# ACCESS CONFIG
249252
access_checker = None
@@ -252,7 +255,8 @@ def load_coll(self, name, coll_config):
252255

253256
return DefaultResourceHandler(agg, archive_paths,
254257
rules_file=self.rules_file,
255-
access_checker=access_checker)
258+
access_checker=access_checker,
259+
use_local_file_load=use_local_file_load)
256260

257261
def init_sequence(self, coll_name, seq_config):
258262
if not isinstance(seq_config, list):

0 commit comments

Comments
 (0)