Skip to content

Commit dfdc044

Browse files
authored
deprecate dl_verify (#1844)
1 parent 0dd524e commit dfdc044

File tree

5 files changed

+8
-267
lines changed

5 files changed

+8
-267
lines changed

oggm/cfg.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -516,7 +516,6 @@ def initialize_minimal(file=None, logging_level='INFO', params=None):
516516
PARAMS['use_tar_shapefiles'] = cp.as_bool('use_tar_shapefiles')
517517
PARAMS['keep_multipolygon_outlines'] = cp.as_bool('keep_multipolygon_outlines')
518518
PARAMS['clip_tidewater_border'] = cp.as_bool('clip_tidewater_border')
519-
PARAMS['dl_verify'] = cp.as_bool('dl_verify')
520519
PARAMS['use_kcalving_for_inversion'] = cp.as_bool('use_kcalving_for_inversion')
521520
PARAMS['use_kcalving_for_run'] = cp.as_bool('use_kcalving_for_run')
522521
PARAMS['calving_use_limiter'] = cp.as_bool('calving_use_limiter')
@@ -580,7 +579,7 @@ def initialize_minimal(file=None, logging_level='INFO', params=None):
580579
'hydro_month_sh', 'hydro_month_nh', 'by_bin_bins',
581580
'use_intersects', 'filter_min_slope', 'clip_tidewater_border',
582581
'auto_skip_task', 'ref_mb_valid_window',
583-
'rgi_version', 'dl_verify', 'use_mp_spawn', 'calving_use_limiter',
582+
'rgi_version', 'use_mp_spawn', 'calving_use_limiter',
584583
'use_rgi_area', 'baseline_climate',
585584
'calving_line_extension', 'use_kcalving_for_run', 'lru_maxsize',
586585
'free_board_marine_terminating', 'use_kcalving_for_inversion',
@@ -652,10 +651,6 @@ def initialize(file=None, logging_level='INFO', params=None):
652651
pass
653652
DATA['dem_grids'] = grids
654653

655-
# Trigger a one time check of the hash file
656-
from oggm.utils import get_dl_verify_data
657-
get_dl_verify_data('dummy_section')
658-
659654
# OK
660655
PARAMS.do_log = True
661656

oggm/params.cfg

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,6 @@ keep_multipolygon_outlines = False
6363
# If you receive "Message truncated" errors from MPI, increase this
6464
mpi_recv_buf_size = 131072
6565

66-
# Check for the integrity of the files OGGM downloads at run time
67-
dl_verify = False
68-
6966
# Default number of files to be cached in the temporary directory
7067
lru_maxsize = 100
7168

oggm/tests/test_utils.py

Lines changed: 0 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -721,73 +721,6 @@ def test_to_and_from_tar_string(self):
721721
assert new_base_dir in new_gdir.base_dir
722722

723723

724-
class TestDLVerify(unittest.TestCase):
725-
726-
def setUp(self):
727-
# test directory
728-
self.testdir = os.path.join(get_test_dir(), 'tmp_prepro_tools')
729-
self.dldir = os.path.join(get_test_dir(), 'dl_cache')
730-
731-
# Init
732-
cfg.initialize()
733-
cfg.PATHS['dl_cache_dir'] = self.dldir
734-
cfg.PARAMS['dl_verify'] = True
735-
736-
# Read in the RGI file
737-
rgi_file = utils.get_demo_file('rgi_oetztal.shp')
738-
self.rgidf = gpd.read_file(rgi_file)
739-
self.rgidf['RGIId'] = [rid.replace('RGI50', 'RGI60')
740-
for rid in self.rgidf.RGIId]
741-
cfg.PATHS['working_dir'] = self.testdir
742-
self.clean_dir()
743-
744-
def tearDown(self):
745-
self.rm_dir()
746-
747-
def rm_dir(self):
748-
shutil.rmtree(self.testdir)
749-
shutil.rmtree(self.dldir)
750-
751-
def clean_dir(self):
752-
utils.mkdir(self.testdir, reset=True)
753-
utils.mkdir(self.dldir, reset=True)
754-
755-
def test_corrupted_file(self):
756-
757-
# Go - initialize working directories
758-
gdirs = workflow.init_glacier_directories(['RGI60-11.00787'],
759-
prepro_base_url=TEST_GDIR_URL_v11,
760-
from_prepro_level=4,
761-
prepro_rgi_version='61',
762-
prepro_border=20)
763-
764-
cfile = utils.get_prepro_gdir('61', 'RGI60-11.00787', 20, 4,
765-
base_url=TEST_GDIR_URL_v11)
766-
assert 'cluster.klima.uni-bremen.de/~oggm/' in cfile
767-
768-
# Replace with a dummy file
769-
os.remove(cfile)
770-
with open(cfile, 'w') as f:
771-
f.write('ups')
772-
773-
# Since we already verified this will error
774-
with pytest.raises(tarfile.ReadError):
775-
workflow.init_glacier_directories(['RGI60-11.00787'],
776-
prepro_base_url=TEST_GDIR_URL_v11,
777-
from_prepro_level=4,
778-
prepro_rgi_version='61',
779-
prepro_border=20)
780-
781-
# This should retrigger a download and just work
782-
cfg.DL_VERIFIED.clear()
783-
gdirs = workflow.init_glacier_directories(['RGI60-11.00787'],
784-
prepro_base_url=TEST_GDIR_URL_v11,
785-
from_prepro_level=4,
786-
prepro_rgi_version='61',
787-
prepro_border=20)
788-
assert gdirs[0].has_file('model_flowlines')
789-
790-
791724
class TestStartFromV14(unittest.TestCase):
792725

793726
def setUp(self):
@@ -1982,63 +1915,6 @@ def reset_dir(self):
19821915
utils.mkdir(cfg.PATHS['tmp_dir'])
19831916
utils.mkdir(cfg.PATHS['rgi_dir'])
19841917

1985-
def prepare_verify_test(self, valid_size=True, valid_crc32=True,
1986-
reset_dl_dict=True):
1987-
self.reset_dir()
1988-
cfg.PARAMS['dl_verify'] = True
1989-
1990-
if reset_dl_dict:
1991-
cfg.DL_VERIFIED.clear()
1992-
1993-
tgt_path = os.path.join(cfg.PATHS['dl_cache_dir'], 'test.com',
1994-
'test.txt')
1995-
1996-
file_size = 1024
1997-
file_data = os.urandom(file_size)
1998-
file_sha256 = hashlib.sha256()
1999-
file_sha256.update(file_data)
2000-
2001-
utils.mkdir(os.path.dirname(tgt_path))
2002-
with open(tgt_path, 'wb') as f:
2003-
f.write(file_data)
2004-
2005-
if not valid_size:
2006-
file_size += 1
2007-
if not valid_crc32:
2008-
file_sha256.update(b'1234ABCD')
2009-
2010-
file_sha256 = file_sha256.digest()
2011-
2012-
data = utils.get_dl_verify_data('cluster.klima.uni-bremen.de')
2013-
s = pd.DataFrame({'size': file_size, 'sha256': file_sha256},
2014-
index=['test.txt'])
2015-
cfg.DATA['dl_verify_data_test.com'] = pd.concat([data, s])
2016-
2017-
return 'https://test.com/test.txt'
2018-
2019-
def test_dl_verify(self):
2020-
2021-
cfg.PARAMS['dl_verify'] = True
2022-
2023-
def fake_down(dl_func, cache_path):
2024-
assert False
2025-
2026-
with FakeDownloadManager('_call_dl_func', fake_down):
2027-
url = self.prepare_verify_test(True, True)
2028-
utils.oggm_urlretrieve(url)
2029-
2030-
url = self.prepare_verify_test(False, True)
2031-
with self.assertRaises(DownloadVerificationFailedException):
2032-
utils.oggm_urlretrieve(url)
2033-
2034-
url = self.prepare_verify_test(True, False)
2035-
with self.assertRaises(DownloadVerificationFailedException):
2036-
utils.oggm_urlretrieve(url)
2037-
2038-
url = self.prepare_verify_test(False, False)
2039-
with self.assertRaises(DownloadVerificationFailedException):
2040-
utils.oggm_urlretrieve(url)
2041-
20421918
def test_github_no_internet(self):
20431919
self.reset_dir()
20441920
cache_dir = cfg.CACHE_DIR

oggm/utils/_downloads.py

Lines changed: 7 additions & 131 deletions
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,6 @@
7070
SAMPLE_DATA_GH_REPO = 'OGGM/oggm-sample-data'
7171
SAMPLE_DATA_COMMIT = '9bfeb6dfea9513f790877819d9a6cbd2c7b61611'
7272

73-
CHECKSUM_URL = 'https://cluster.klima.uni-bremen.de/data/downloads.sha256.hdf'
74-
CHECKSUM_VALIDATION_URL = CHECKSUM_URL + '.sha256'
75-
CHECKSUM_LIFETIME = 24 * 60 * 60
76-
7773
# Recommended url for runs
7874
DEFAULT_BASE_URL = ('https://cluster.klima.uni-bremen.de/~oggm/gdirs/oggm_v1.6/'
7975
'L3-L5_files/2023.3/elev_bands/W5E5_spinup')
@@ -180,88 +176,6 @@ def get_lock():
180176
return lock
181177

182178

183-
def get_dl_verify_data(section):
184-
"""Returns a pandas DataFrame with all known download object hashes.
185-
186-
The returned dictionary resolves str: cache_obj_name (without section)
187-
to a tuple int(size) and bytes(sha256)
188-
"""
189-
190-
verify_key = 'dl_verify_data_' + section
191-
if verify_key in cfg.DATA:
192-
return cfg.DATA[verify_key]
193-
194-
verify_file_path = os.path.join(cfg.CACHE_DIR, 'downloads.sha256.hdf')
195-
196-
def verify_file(force=False):
197-
"""Check the hash file's own hash"""
198-
if not cfg.PARAMS['has_internet']:
199-
return
200-
201-
if not force and os.path.isfile(verify_file_path) and \
202-
os.path.getmtime(verify_file_path) + CHECKSUM_LIFETIME > time.time():
203-
return
204-
205-
logger.info('Checking the download verification file checksum...')
206-
try:
207-
with requests.get(CHECKSUM_VALIDATION_URL) as req:
208-
req.raise_for_status()
209-
verify_file_sha256 = req.text.split(maxsplit=1)[0]
210-
verify_file_sha256 = bytearray.fromhex(verify_file_sha256)
211-
except Exception as e:
212-
verify_file_sha256 = None
213-
logger.warning('Failed getting verification checksum: ' + repr(e))
214-
215-
if os.path.isfile(verify_file_path) and verify_file_sha256:
216-
sha256 = hashlib.sha256()
217-
with open(verify_file_path, 'rb') as f:
218-
for b in iter(lambda: f.read(0xFFFF), b''):
219-
sha256.update(b)
220-
if sha256.digest() != verify_file_sha256:
221-
logger.warning('%s changed or invalid, deleting.'
222-
% (verify_file_path))
223-
os.remove(verify_file_path)
224-
else:
225-
os.utime(verify_file_path)
226-
227-
if not np.any(['dl_verify_data_' in k for k in cfg.DATA.keys()]):
228-
# We check the hash file only once per session
229-
# no need to do it at each call
230-
verify_file()
231-
232-
if not os.path.isfile(verify_file_path):
233-
if not cfg.PARAMS['has_internet']:
234-
return pd.DataFrame()
235-
236-
logger.info('Downloading %s to %s...'
237-
% (CHECKSUM_URL, verify_file_path))
238-
239-
with requests.get(CHECKSUM_URL, stream=True) as req:
240-
if req.status_code == 200:
241-
mkdir(os.path.dirname(verify_file_path))
242-
with open(verify_file_path, 'wb') as f:
243-
for b in req.iter_content(chunk_size=0xFFFF):
244-
if b:
245-
f.write(b)
246-
247-
logger.info('Done downloading.')
248-
249-
verify_file(force=True)
250-
251-
if not os.path.isfile(verify_file_path):
252-
logger.warning('Downloading and verifying checksums failed.')
253-
return pd.DataFrame()
254-
255-
try:
256-
data = pd.read_hdf(verify_file_path, key=section)
257-
except KeyError:
258-
data = pd.DataFrame()
259-
260-
cfg.DATA[verify_key] = data
261-
262-
return data
263-
264-
265179
def _call_dl_func(dl_func, cache_path):
266180
"""Helper so the actual call to downloads can be overridden
267181
"""
@@ -332,44 +246,6 @@ def _cached_download_helper(cache_obj_name, dl_func, reset=False):
332246
return cache_path
333247

334248

335-
def _verified_download_helper(cache_obj_name, dl_func, reset=False):
336-
"""Helper function for downloads.
337-
338-
Verifies the size and hash of the downloaded file against the included
339-
list of known static files.
340-
Uses _cached_download_helper to perform the actual download.
341-
"""
342-
path = _cached_download_helper(cache_obj_name, dl_func, reset)
343-
344-
dl_verify = cfg.PARAMS.get('dl_verify', False)
345-
346-
if dl_verify and path and cache_obj_name not in cfg.DL_VERIFIED:
347-
cache_section, cache_path = cache_obj_name.split('/', 1)
348-
data = get_dl_verify_data(cache_section)
349-
if cache_path not in data.index:
350-
logger.info('No known hash for %s' % cache_obj_name)
351-
cfg.DL_VERIFIED[cache_obj_name] = True
352-
else:
353-
# compute the hash
354-
sha256 = hashlib.sha256()
355-
with open(path, 'rb') as f:
356-
for b in iter(lambda: f.read(0xFFFF), b''):
357-
sha256.update(b)
358-
sha256 = sha256.digest()
359-
size = os.path.getsize(path)
360-
361-
# check
362-
data = data.loc[cache_path]
363-
if data['size'] != size or bytes(data['sha256']) != sha256:
364-
err = '%s failed to verify!\nis: %s %s\nexpected: %s %s' % (
365-
path, size, sha256.hex(), data.iloc[0], data.iloc[1].hex())
366-
raise DownloadVerificationFailedException(msg=err, path=path)
367-
logger.info('%s verified successfully.' % path)
368-
cfg.DL_VERIFIED[cache_obj_name] = True
369-
370-
return path
371-
372-
373249
def _requests_urlretrieve(url, path, reporthook, auth=None, timeout=None):
374250
"""Implements the required features of urlretrieve on top of requests
375251
"""
@@ -512,7 +388,7 @@ def oggm_urlretrieve(url, cache_obj_name=None, reset=False,
512388
reporthook=None, auth=None, timeout=None):
513389
"""Wrapper around urlretrieve, to implement our caching logic.
514390
515-
Instead of accepting a destination path, it decided where to store the file
391+
Instead of accepting a destination path, it decides where to store the file
516392
and returns the local path.
517393
518394
auth is expected to be either a tuple of ('username', 'password') or None.
@@ -533,7 +409,7 @@ def _dlf(cache_path):
533409
timeout)
534410
return cache_path
535411

536-
return _verified_download_helper(cache_obj_name, _dlf, reset)
412+
return _cached_download_helper(cache_obj_name, _dlf, reset)
537413

538414

539415
def _progress_urlretrieve(url, cache_name=None, reset=False,
@@ -595,7 +471,7 @@ def _aws_file_download_unlocked(aws_path, cache_name=None, reset=False):
595471
def _dlf(cache_path):
596472
raise NotImplementedError("Downloads from AWS are no longer supported")
597473

598-
return _verified_download_helper(cache_obj_name, _dlf, reset)
474+
return _cached_download_helper(cache_obj_name, _dlf, reset)
599475

600476

601477
def file_downloader(www_path, retry_max=3, sleep_on_retry=5,
@@ -793,7 +669,7 @@ def _always_none(foo):
793669
return None
794670

795671
cache_obj_name = _get_url_cache_name(wwwfile)
796-
dest_file = _verified_download_helper(cache_obj_name, _always_none)
672+
dest_file = _cached_download_helper(cache_obj_name, _always_none)
797673

798674
# Grab auth parameters
799675
if not dest_file:
@@ -1317,14 +1193,14 @@ def get_geodetic_mb_dataframe(file_path=None):
13171193

13181194
def get_temp_bias_dataframe(dataset='w5e5'):
13191195
"""Fetches the temperature bias dataframe created by the OGGM>=v16 pre-calibration
1320-
(further explained in the OGGM mass balance tutorial:
1196+
(further explained in the OGGM mass balance tutorial:
13211197
https://tutorials.oggm.org/stable/notebooks/tutorials/massbalance_calibration.html).
1322-
The data preparation script is available at
1198+
The data preparation script is available at
13231199
https://nbviewer.jupyter.org/urls/cluster.klima.uni-bremen.de/~oggm/gdirs/oggm_v1.6/calibration/1.6.1/prepare_bias_map.ipynb
13241200
13251201
The file differs between climate datasets and OGGM versions. For W5E5 and OGGM v162, it is e.g.
13261202
https://cluster.klima.uni-bremen.de/~oggm/ref_mb_params/oggm_v1.6/w5e5_temp_bias_v2023.4.csv
1327-
1203+
13281204
Parameters
13291205
----------
13301206
dataset : str

oggm/workflow.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -421,9 +421,6 @@ def init_glacier_directories(rgidf=None, *, reset=False, force=False,
421421
log.workflow('init_glacier_directories from prepro level {} on '
422422
'{} glaciers.'.format(from_prepro_level,
423423
len(entities)))
424-
# Read the hash dictionary before we use multiproc
425-
if cfg.PARAMS['dl_verify']:
426-
utils.get_dl_verify_data('cluster.klima.uni-bremen.de')
427424
gdirs = execute_entity_task(gdir_from_prepro, entities,
428425
from_prepro_level=from_prepro_level,
429426
prepro_border=prepro_border,

0 commit comments

Comments
 (0)