@@ -297,21 +297,21 @@ def __getstate__(self):
297
297
return state
298
298
299
299
@property
def _downloader(self) -> downloader._Downloader:
  """Returns the downloader, creating it on first access.

  The instance is stored on `self.__downloader`; while that attribute is
  falsy, a new downloader is requested from `get_downloader` using
  `self._max_simultaneous_downloads`.
  """
  cached = self.__downloader
  if cached:
    return cached
  # Nothing usable stored yet: build the downloader and keep it for reuse.
  self.__downloader = get_downloader(
      max_simultaneous_downloads=self._max_simultaneous_downloads
  )
  return self.__downloader
306
306
307
307
@property
def _extractor(self) -> extractor._Extractor:
  """Returns the extractor, creating it on first access.

  The instance is stored on `self.__extractor`; while that attribute is
  falsy, a new one is obtained from `extractor.get_extractor()`.
  """
  if self.__extractor:
    return self.__extractor
  # Nothing usable stored yet: fetch the extractor and keep it for reuse.
  self.__extractor = extractor.get_extractor()
  return self.__extractor
312
312
313
313
@property
def downloaded_size(self) -> int:
  """Returns the sum of `size` over every recorded url info."""
  total = 0
  for recorded_info in self._recorded_url_infos.values():
    total += recorded_info.size
  return total
317
317
@@ -331,6 +331,22 @@ def _record_url_infos(self):
331
331
self ._recorded_url_infos ,
332
332
)
333
333
334
def _get_manually_downloaded_path(
    self, expected_url_info: checksums.UrlInfo | None
) -> epath.Path | None:
  """Looks for the expected file inside `self._manual_dir`.

  Args:
    expected_url_info: Url info whose `filename` names the file to look for.
      May be `None`.

  Returns:
    `self._manual_dir / expected_url_info.filename` when that path exists.
    `None` when no manual dir is set, when the url info or its filename is
    missing/empty, or when the file is not present.
  """
  manual_dir = self._manual_dir
  # Same check order as before: manual dir first, then url info, then filename.
  if manual_dir and expected_url_info and expected_url_info.filename:
    candidate = manual_dir / expected_url_info.filename
    if candidate.exists():
      return candidate
  return None
349
+
334
350
# Synchronize and memoize decorators ensure same resource will only be
335
351
# processed once, even if passed twice to download_manager.
336
352
@utils .build_synchronize_decorator ()
@@ -363,9 +379,8 @@ def _download(self, resource: Url) -> promise.Promise[epath.Path]:
363
379
# * In `manual_dir` (manually downloaded data)
364
380
# * In `downloads/url_path` (checksum unknown)
365
381
# * In `downloads/checksum_path` (checksum registered)
366
- manually_downloaded_path = _get_manually_downloaded_path (
367
- manual_dir = self ._manual_dir ,
368
- expected_url_info = expected_url_info ,
382
+ manually_downloaded_path = self ._get_manually_downloaded_path (
383
+ expected_url_info = expected_url_info
369
384
)
370
385
url_path = self ._get_dl_path (url )
371
386
checksum_path = (
@@ -459,12 +474,11 @@ def _register_or_validate_checksums(
459
474
# the download isn't cached (re-running build will retrigger a new
460
475
# download). This is expected as it might mean the downloaded file
461
476
# was corrupted. Note: The tmp file isn't deleted to allow inspection.
462
- _validate_checksums (
477
+ self . _validate_checksums (
463
478
url = url ,
464
479
path = path ,
465
480
expected_url_info = expected_url_info ,
466
481
computed_url_info = computed_url_info ,
467
- force_checksums_validation = self ._force_checksums_validation ,
468
482
)
469
483
470
484
return self ._rename_and_get_final_dl_path (
@@ -476,6 +490,42 @@ def _register_or_validate_checksums(
476
490
url_path = url_path ,
477
491
)
478
492
493
def _validate_checksums(
    self,
    url: str,
    path: epath.Path,
    computed_url_info: checksums.UrlInfo | None,
    expected_url_info: checksums.UrlInfo | None,
) -> None:
  """Validates that `computed_url_info` matches `expected_url_info`.

  Args:
    url: Url of the artifact; only used in error messages.
    path: Location of the downloaded file. Used to compute the url info when
      it is missing and validation is forced, and reported in error messages.
    computed_url_info: Url info computed for the downloaded file, if known.
    expected_url_info: Registered url info for `url`, if any.

  Raises:
    ValueError: If `self._force_checksums_validation` is set but no expected
      url info is available.
    NonMatchingChecksumError: If both url infos are available and differ.
  """
  # When validation is forced, both expected and computed url_info must exist.
  if self._force_checksums_validation:
    # Checksum of the downloaded file unknown (for manually downloaded file).
    if not computed_url_info:
      computed_url_info = checksums.compute_url_info(path)
    # Checksums have not been registered.
    if not expected_url_info:
      raise ValueError(
          f'Missing checksums url: {url}, yet '
          '`force_checksums_validation=True`. '
          'Did you forget to register checksums?'
      )

  # Without forced validation the comparison is skipped when either side is
  # missing.
  if (
      expected_url_info
      and computed_url_info
      and expected_url_info != computed_url_info
  ):
    msg = (
        f'Artifact {url}, downloaded to {path}, has wrong checksum:\n'
        f'* Expected: {expected_url_info}\n'
        f'* Got: {computed_url_info}\n'
        'To debug, see: '
        'https://www.tensorflow.org/datasets/overview#fixing_nonmatchingchecksumerror'
    )
    raise NonMatchingChecksumError(msg)
528
+
479
529
def _rename_and_get_final_dl_path (
480
530
self ,
481
531
url : str ,
@@ -707,61 +757,6 @@ def manual_dir(self) -> epath.Path:
707
757
return self ._manual_dir
708
758
709
759
710
def _get_manually_downloaded_path(
    manual_dir: epath.Path | None,
    expected_url_info: checksums.UrlInfo | None,
) -> epath.Path | None:
  """Looks for the expected file inside `manual_dir`.

  Args:
    manual_dir: Directory holding manually downloaded files. May be `None`.
    expected_url_info: Url info whose `filename` names the file to look for.
      May be `None`.

  Returns:
    `manual_dir / expected_url_info.filename` when that path exists. `None`
    when no manual dir is given, when the url info or its filename is
    missing/empty, or when the file is not present.
  """
  # Same check order as before: manual dir first, then url info, then filename.
  if manual_dir and expected_url_info and expected_url_info.filename:
    candidate = manual_dir / expected_url_info.filename
    if candidate.exists():
      return candidate
  return None
726
-
727
-
728
def _validate_checksums(
    url: str,
    path: epath.Path,
    computed_url_info: checksums.UrlInfo | None,
    expected_url_info: checksums.UrlInfo | None,
    force_checksums_validation: bool,
) -> None:
  """Validates that `computed_url_info` matches `expected_url_info`.

  Args:
    url: Url of the artifact; only used in error messages.
    path: Location of the downloaded file. Used to compute the url info when
      it is missing and validation is forced, and reported in error messages.
    computed_url_info: Url info computed for the downloaded file, if known.
    expected_url_info: Registered url info for `url`, if any.
    force_checksums_validation: When true, a missing expected url info is an
      error and a missing computed url info is computed from `path`.

  Raises:
    ValueError: If validation is forced but no expected url info is available.
    NonMatchingChecksumError: If both url infos are available and differ.
  """
  # When validation is forced, both expected and computed url_info must exist.
  if force_checksums_validation:
    # Checksum of the downloaded file unknown (for manually downloaded file).
    if not computed_url_info:
      computed_url_info = checksums.compute_url_info(path)
    # Checksums have not been registered.
    if not expected_url_info:
      raise ValueError(
          f'Missing checksums url: {url}, yet '
          '`force_checksums_validation=True`. '
          'Did you forget to register checksums?'
      )

  # Without forced validation the comparison is skipped when either side is
  # missing.
  if (
      expected_url_info
      and computed_url_info
      and expected_url_info != computed_url_info
  ):
    msg = (
        f'Artifact {url}, downloaded to {path}, has wrong checksum:\n'
        f'* Expected: {expected_url_info}\n'
        f'* Got: {computed_url_info}\n'
        'To debug, see: '
        'https://www.tensorflow.org/datasets/overview#fixing_nonmatchingchecksumerror'
    )
    raise NonMatchingChecksumError(msg)
763
-
764
-
765
760
def _map_promise (map_fn , all_inputs ):
766
761
"""Map the function into each element and resolve the promise."""
767
762
all_promises = tree .map_structure (map_fn , all_inputs ) # Apply the function
0 commit comments