@@ -355,15 +355,11 @@ def _get_manually_downloaded_path(
355
355
# processed once, even if passed twice to download_manager.
356
356
@utils .build_synchronize_decorator ()
357
357
@utils .memoize ()
358
- def _download (self , resource : Url ) -> promise .Promise [epath .Path ]:
358
+ def _download_or_get_cache (
359
+ self , resource : Url
360
+ ) -> promise .Promise [epath .Path ]:
359
361
"""Downloads resource or gets downloaded cache.
360
362
361
- This function:
362
-
363
- 1. Reuse cache (`_get_cached_path`) or download the file
364
- 2. Register or validate checksums (`_register_or_validate_checksums`)
365
- 3. Rename download to final path (`_rename_and_get_final_dl_path`)
366
-
367
363
Args:
368
364
resource: The URL to download.
369
365
@@ -378,76 +374,79 @@ def _download(self, resource: Url) -> promise.Promise[epath.Path]:
378
374
379
375
expected_url_info = self ._url_infos .get (url )
380
376
381
- # 3 possible destinations for the path:
382
- # * In `manual_dir` (manually downloaded data)
383
- # * In `downloads/url_path` (checksum unknown)
384
- # * In `downloads/checksum_path` (checksum registered)
385
- manually_downloaded_path = self ._get_manually_downloaded_path (
386
- expected_url_info = expected_url_info
387
- )
388
- url_path = self ._get_dl_path (resource )
389
- checksum_path = (
390
- self ._get_dl_path (resource , expected_url_info .checksum )
391
- if expected_url_info
392
- else None
393
- )
394
-
395
- # Get the cached path and url_info (if they exists)
396
- dl_result = downloader .get_cached_path (
397
- manually_downloaded_path = manually_downloaded_path ,
398
- checksum_path = checksum_path ,
399
- url_path = url_path ,
400
- expected_url_info = expected_url_info ,
401
- )
402
- if dl_result and not self ._force_download : # Download was cached
403
- logging .info (
404
- f'Skipping download of { url } : File cached in { dl_result .path } '
377
+ # User has manually downloaded the file.
378
+ if manually_downloaded_path := self ._get_manually_downloaded_path (
379
+ expected_url_info
380
+ ):
381
+ computed_url_info = checksums .compute_url_info (manually_downloaded_path )
382
+ self ._register_or_validate_checksums (
383
+ resource = resource ,
384
+ path = manually_downloaded_path ,
385
+ computed_url_info = computed_url_info ,
405
386
)
406
- # Still update the progression bar to indicate the file was downloaded
407
- self ._downloader .increase_tqdm (dl_result .url_info )
408
- future = promise .Promise .resolve (dl_result )
409
- else :
410
- # Download in a tmp directory next to url_path (to avoid name collisions)
411
- # `download_tmp_dir` is cleaned-up in `_rename_and_get_final_dl_path`
412
- download_tmp_dir = (
413
- url_path .parent / f'{ url_path .name } .tmp.{ uuid .uuid4 ().hex } '
387
+ self ._log_skip_download (
388
+ url = url , url_info = computed_url_info , path = manually_downloaded_path
414
389
)
415
- download_tmp_dir .mkdir ()
416
- logging .info (f'Downloading { url } into { download_tmp_dir } ...' )
417
- future = self ._downloader .download (
418
- url , download_tmp_dir , verify = self ._verify_ssl
390
+ return promise .Promise .resolve (manually_downloaded_path )
391
+
392
+ # Force download
393
+ elif self ._force_download :
394
+ return self ._download (resource )
395
+
396
+ # Download has been cached (checksum known)
397
+ elif expected_url_info and resource_lib .Resource .exists_locally (
398
+ checksum_path := self ._get_dl_path (resource , expected_url_info .checksum )
399
+ ):
400
+ self ._register_or_validate_checksums (
401
+ resource = resource ,
402
+ path = checksum_path ,
403
+ computed_url_info = expected_url_info ,
419
404
)
405
+ self ._log_skip_download (
406
+ url = url , url_info = expected_url_info , path = checksum_path
407
+ )
408
+ return promise .Promise .resolve (checksum_path )
409
+
410
+ # Download has been cached (checksum unknown)
411
+ elif resource_lib .Resource .exists_locally (
412
+ url_path := self ._get_dl_path (resource )
413
+ ):
414
+ computed_url_info = downloader .read_url_info (url_path )
415
+ if expected_url_info and expected_url_info != computed_url_info :
416
+ # If checksums are registered but do not match, trigger a new
417
+ # download (e.g. previous file corrupted, checksums updated)
418
+ return self ._download (resource )
419
+ if checksum_path := self ._register_or_validate_checksums (
420
+ resource = resource , path = url_path , computed_url_info = computed_url_info
421
+ ):
422
+ # Checksums were registered: Rename -> checksum_path
423
+ resource_lib .replace_info_file (url_path , checksum_path )
424
+ path = url_path .replace (checksum_path )
425
+ else :
426
+ # Checksums not registered: -> do nothing
427
+ path = url_path
428
+ self ._log_skip_download (url = url , url_info = computed_url_info , path = path )
429
+ return promise .Promise .resolve (path )
420
430
421
- # Post-process the result
422
- return future .then (
423
- lambda dl_result : self ._register_or_validate_checksums ( # pylint: disable=g-long-lambda
424
- resource = resource ,
425
- path = dl_result .path ,
426
- computed_url_info = dl_result .url_info ,
427
- expected_url_info = expected_url_info ,
428
- checksum_path = checksum_path ,
429
- url_path = url_path ,
430
- )
431
- )
431
+ # Cache not found
432
+ else :
433
+ return self ._download (resource )
434
+
435
+ def _log_skip_download (
436
+ self , url : str , url_info : checksums .UrlInfo , path : epath .Path
437
+ ) -> None :
438
+ logging .info (f'Skipping download of { url } : File cached in { path } ' )
439
+ # Still update the progression bar to indicate the file was downloaded
440
+ self ._downloader .increase_tqdm (url_info )
432
441
433
442
def _register_or_validate_checksums (
434
443
self ,
435
444
resource : resource_lib .Resource ,
436
445
path : epath .Path ,
437
- expected_url_info : checksums .UrlInfo | None ,
438
446
computed_url_info : checksums .UrlInfo ,
439
- checksum_path : epath .Path | None ,
440
- url_path : epath .Path ,
441
- ) -> epath .Path :
442
- """Validates/records checksums and renames final downloaded path."""
443
- # `path` can be:
444
- # * Manually downloaded
445
- # * (cached) checksum_path
446
- # * (cached) url_path
447
- # * `tmp_dir/file` (downloaded path)
448
-
447
+ ) -> epath .Path | None :
448
+ """Validates/records checksums and returns checksum path if registered."""
449
449
url : str = resource .url # pytype: disable=annotation-type-mismatch
450
- # Used both in `.downloaded_size` and `_record_url_infos()`
451
450
self ._recorded_url_infos [url ] = computed_url_info
452
451
453
452
if self ._register_checksums :
@@ -457,12 +456,9 @@ def _register_or_validate_checksums(
457
456
# * `register_checksums_path` was validated in `__init__` so this
458
457
# shouldn't fail.
459
458
self ._record_url_infos ()
460
-
461
- # Checksum path should now match the new registered checksum (even if
462
- # checksums were previously registered)
463
- expected_url_info = computed_url_info
464
- checksum_path = self ._get_dl_path (resource , computed_url_info .checksum )
459
+ return self ._get_dl_path (resource , computed_url_info .checksum )
465
460
else :
461
+ expected_url_info = self ._url_infos .get (url )
466
462
# Eventually validate checksums
467
463
# Note:
468
464
# * If path is cached at `url_path` but cached
@@ -478,15 +474,8 @@ def _register_or_validate_checksums(
478
474
computed_url_info = computed_url_info ,
479
475
path = path ,
480
476
)
481
-
482
- return self ._rename_and_get_final_dl_path (
483
- url = url ,
484
- path = path ,
485
- expected_url_info = expected_url_info ,
486
- computed_url_info = computed_url_info ,
487
- checksum_path = checksum_path ,
488
- url_path = url_path ,
489
- )
477
+ if expected_url_info :
478
+ return self ._get_dl_path (resource , expected_url_info .checksum )
490
479
491
480
def _validate_checksums (
492
481
self ,
@@ -517,47 +506,56 @@ def _validate_checksums(
517
506
)
518
507
raise NonMatchingChecksumError (msg )
519
508
520
- def _rename_and_get_final_dl_path (
521
- self ,
522
- url : str ,
523
- path : epath .Path ,
524
- expected_url_info : checksums .UrlInfo | None ,
525
- computed_url_info : checksums .UrlInfo | None ,
526
- checksum_path : epath .Path | None ,
527
- url_path : epath .Path ,
528
- ) -> epath .Path :
529
- """Eventually rename the downloaded file if checksums were recorded."""
530
- # `path` can be:
531
- # * Manually downloaded
532
- # * (cached) checksum_path
533
- # * (cached) url_path
534
- # * `tmp_dir/file` (downloaded path)
535
- if self ._manual_dir and path .is_relative_to (self ._manual_dir ):
536
- return path # Manually downloaded data
537
- elif path == checksum_path : # Path already at final destination
538
- assert computed_url_info == expected_url_info # Sanity check
539
- return checksum_path # pytype: disable=bad-return-type
540
- elif path == url_path :
541
- if checksum_path :
542
- # Checksums were registered: Rename -> checksums_path
543
- resource_lib .replace_info_file (path , checksum_path )
544
- return path .replace (checksum_path )
545
- else :
546
- # Checksums not registered: -> do nothing
547
- return path
548
- else : # Path was downloaded in tmp dir
549
- dst_path = checksum_path or url_path
509
+ def _download (
510
+ self , resource : resource_lib .Resource
511
+ ) -> promise .Promise [epath .Path ]:
512
+ """Downloads resource.
513
+
514
+ Args:
515
+ resource: The resource to download.
516
+
517
+ Returns:
518
+ Promise of the path to the downloaded url.
519
+ """
520
+ url_path = self ._get_dl_path (resource )
521
+ url : str = resource .url # pytype: disable=annotation-type-mismatch
522
+
523
+ # Download in a tmp directory next to url_path (to avoid name collisions)
524
+ # `download_tmp_dir` is cleaned-up in `callback`
525
+ download_tmp_dir = (
526
+ url_path .parent / f'{ url_path .name } .tmp.{ uuid .uuid4 ().hex } '
527
+ )
528
+ download_tmp_dir .mkdir ()
529
+ logging .info (f'Downloading { url } into { download_tmp_dir } ...' )
530
+ future = self ._downloader .download (
531
+ url , download_tmp_dir , verify = self ._verify_ssl
532
+ )
533
+
534
+ def callback (dl_result : downloader .DownloadResult ) -> epath .Path :
535
+ """Post-process the download result."""
536
+ dl_path = dl_result .path
537
+ dl_url_info = dl_result .url_info
538
+
539
+ dst_path = self ._register_or_validate_checksums (
540
+ resource = resource , computed_url_info = dl_url_info , path = dl_path
541
+ )
542
+ if not dst_path :
543
+ dst_path = url_path
544
+
550
545
resource_lib .write_info_file (
551
546
url = url ,
552
547
path = dst_path ,
553
548
dataset_name = self ._dataset_name ,
554
- original_fname = path .name ,
555
- url_info = computed_url_info ,
549
+ original_fname = dl_path .name ,
550
+ url_info = dl_url_info ,
556
551
)
557
- path .replace (dst_path )
558
- path .parent .rmdir () # Cleanup tmp dir (will fail if dir not empty)
552
+ dl_path .replace (dst_path )
553
+ dl_path .parent .rmdir () # Cleanup tmp dir (will fail if dir not empty)
554
+
559
555
return dst_path
560
556
557
+ return future .then (callback )
558
+
561
559
@utils .build_synchronize_decorator ()
562
560
@utils .memoize ()
563
561
def _extract (self , resource : ExtractPath ) -> promise .Promise [epath .Path ]:
@@ -587,7 +585,7 @@ def callback(path):
587
585
resource .path = path
588
586
return self ._extract (resource )
589
587
590
- return self ._download (resource ).then (callback )
588
+ return self ._download_or_get_cache (resource ).then (callback )
591
589
592
590
def download_checksums (self , checksums_url ):
593
591
"""Downloads checksum file from the given URL and adds it to registry."""
@@ -636,7 +634,7 @@ def download(self, url_or_urls):
636
634
"""
637
635
# Add progress bar to follow the download state
638
636
with self ._downloader .tqdm ():
639
- return _map_promise (self ._download , url_or_urls )
637
+ return _map_promise (self ._download_or_get_cache , url_or_urls )
640
638
641
639
def iter_archive (
642
640
self ,
0 commit comments