@@ -390,56 +390,55 @@ def _download_or_get_cache(
390
390
expected_url_info
391
391
):
392
392
computed_url_info = checksums .compute_url_info (manually_downloaded_path )
393
- self ._register_or_validate_checksums (
394
- resource = resource ,
395
- path = manually_downloaded_path ,
396
- computed_url_info = computed_url_info ,
397
- )
398
- self ._log_skip_download (
399
- url = url , url_info = computed_url_info , path = manually_downloaded_path
393
+ dl_result = downloader .DownloadResult (
394
+ path = manually_downloaded_path , url_info = computed_url_info
400
395
)
401
- return promise .Promise .resolve (manually_downloaded_path )
402
396
403
397
# Force download
404
398
elif self ._force_download :
405
- return self . _download ( resource )
399
+ dl_result = None
406
400
407
401
# Download has been cached (checksum known)
408
402
elif expected_url_info and resource_lib .is_locally_cached (
409
403
checksum_path := self ._get_dl_path (resource , expected_url_info .checksum )
410
404
):
411
- self ._register_or_validate_checksums (
412
- resource = resource ,
413
- path = checksum_path ,
414
- computed_url_info = expected_url_info ,
405
+ dl_result = downloader .DownloadResult (
406
+ path = checksum_path , url_info = expected_url_info
415
407
)
416
- self ._log_skip_download (
417
- url = url , url_info = expected_url_info , path = checksum_path
418
- )
419
- return promise .Promise .resolve (checksum_path )
420
408
421
409
# Download has been cached (checksum unknown)
422
410
elif resource_lib .is_locally_cached (
423
411
url_path := self ._get_dl_path (resource )
424
412
):
425
- computed_url_info = downloader .read_url_info (url_path )
426
- if expected_url_info and expected_url_info != computed_url_info :
413
+ url_info = downloader .read_url_info (url_path )
414
+
415
+ if expected_url_info and expected_url_info != url_info :
427
416
# If checksums are registered but do not match, trigger a new
428
417
# download (e.g. previous file corrupted, checksums updated)
429
- return self ._download (resource )
430
- if checksum_path := self ._register_or_validate_checksums (
431
- resource = resource , path = url_path , computed_url_info = computed_url_info
432
- ):
418
+ dl_result = None
419
+ elif self ._is_checksum_registered (url = url ):
433
420
# Checksums were registered: Rename -> checksum_path
434
- resource_lib .replace_info_file (url_path , checksum_path )
435
- path = url_path .replace (checksum_path )
421
+ path = self ._get_dl_path (resource , url_info .checksum )
422
+ resource_lib .replace_info_file (url_path , path )
423
+ url_path .replace (path )
424
+ dl_result = downloader .DownloadResult (path = path , url_info = url_info )
436
425
else :
437
426
# Checksums not registered: -> do nothing
438
- path = url_path
439
- self ._log_skip_download (url = url , url_info = computed_url_info , path = path )
440
- return promise .Promise .resolve (path )
427
+ dl_result = downloader .DownloadResult (path = url_path , url_info = url_info )
441
428
442
429
# Cache not found
430
+ else :
431
+ dl_result = None
432
+
433
+ if dl_result :
434
+ path = dl_result .path
435
+ url_info = dl_result .url_info
436
+
437
+ self ._log_skip_download (url = url , url_info = url_info , path = path )
438
+ self ._register_or_validate_checksums (
439
+ url = url , url_info = url_info , path = path
440
+ )
441
+ return promise .Promise .resolve (path )
443
442
else :
444
443
return self ._download (resource )
445
444
@@ -451,58 +450,40 @@ def _log_skip_download(
451
450
self ._downloader .increase_tqdm (url_info )
452
451
453
452
def _register_or_validate_checksums (
454
- self ,
455
- resource : resource_lib .Resource ,
456
- path : epath .Path ,
457
- computed_url_info : checksums .UrlInfo ,
458
- ) -> epath .Path | None :
459
- """Validates/records checksums and returns checksum path if registered."""
460
- url = resource .url
461
-
453
+ self , url : str , url_info : checksums .UrlInfo , path : epath .Path
454
+ ) -> None :
455
+ """Registers or validates checksums depending on `self._register_checksums`."""
462
456
if self ._register_checksums :
463
457
# Note:
464
- # * We save even if `expected_url_info == computed_url_info ` as
458
+ # * We save even if `expected_url_info == url_info ` as
465
459
# `expected_url_info` might have been loaded from another dataset.
466
460
# * `register_checksums_path` was validated in `__init__` so this
467
461
# shouldn't fail.
468
- self ._recorded_url_infos [url ] = computed_url_info
462
+ self ._recorded_url_infos [url ] = url_info
469
463
self ._record_url_infos ()
470
- return self ._get_dl_path (resource , computed_url_info .checksum )
471
464
elif expected_url_info := self ._url_infos .get (url ):
472
465
# Eventually validate checksums
473
- # Note:
474
- # * If path is cached at `url_path` but cached
475
- # `computed_url_info != expected_url_info`, a new download has
476
- # been triggered (as _get_cached_path returns None)
477
- # * If path was downloaded but checksums don't match expected, then
478
- # the download isn't cached (re-running build will retrigger a new
479
- # download). This is expected as it might mean the downloaded file
480
- # was corrupted. Note: The tmp file isn't deleted to allow inspection.
481
- self ._validate_checksums (
482
- url = url ,
483
- expected_url_info = expected_url_info ,
484
- computed_url_info = computed_url_info ,
485
- path = path ,
486
- )
487
- return self ._get_dl_path (resource , expected_url_info .checksum )
488
-
489
- def _validate_checksums (
490
- self ,
491
- url : str ,
492
- expected_url_info : checksums .UrlInfo ,
493
- computed_url_info : checksums .UrlInfo ,
494
- path : epath .Path ,
495
- ) -> None :
496
- """Validate computed_url_info match expected_url_info."""
497
- if expected_url_info != computed_url_info :
498
- msg = (
499
- f'Artifact { url } , downloaded to { path } , has wrong checksum:\n '
500
- f'* Expected: { expected_url_info } \n '
501
- f'* Got: { computed_url_info } \n '
502
- 'To debug, see: '
503
- 'https://www.tensorflow.org/datasets/overview#fixing_nonmatchingchecksumerror'
504
- )
505
- raise NonMatchingChecksumError (msg )
466
+ if expected_url_info != url_info :
467
+ msg = (
468
+ f'Artifact { url } , downloaded to { path } , has wrong checksum:\n '
469
+ f'* Expected: { expected_url_info } \n '
470
+ f'* Got: { url_info } \n '
471
+ 'To debug, see: '
472
+ 'https://www.tensorflow.org/datasets/overview#fixing_nonmatchingchecksumerror'
473
+ )
474
+ raise NonMatchingChecksumError (msg )
475
+
476
+ def _is_checksum_registered (self , url : str ) -> bool :
477
+ """Returns whether checksums are registered for the given url."""
478
+ if url in self ._url_infos :
479
+ # Checksum is already registered
480
+ return True
481
+ elif self ._register_checksums :
482
+ # Checksum is being registered
483
+ return True
484
+ else :
485
+ # Checksum is not registered
486
+ return False
506
487
507
488
def _download (
508
489
self , resource : resource_lib .Resource
@@ -534,10 +515,12 @@ def callback(dl_result: downloader.DownloadResult) -> epath.Path:
534
515
dl_path = dl_result .path
535
516
dl_url_info = dl_result .url_info
536
517
537
- dst_path = self ._register_or_validate_checksums (
538
- resource = resource , computed_url_info = dl_url_info , path = dl_path
518
+ self ._register_or_validate_checksums (
519
+ url = url , url_info = dl_url_info , path = dl_path
539
520
)
540
- if not dst_path :
521
+ if self ._is_checksum_registered (url = url ):
522
+ dst_path = self ._get_dl_path (resource , dl_url_info .checksum )
523
+ else :
541
524
dst_path = url_path
542
525
543
526
resource_lib .write_info_file (
0 commit comments