1313from pathlib import Path
1414import random
1515from shutil import rmtree
16+ import sys
1617from threading import Lock
1718import time
1819from types import TracebackType
@@ -692,17 +693,26 @@ def _download_file(
692693 # TODO: how do we discover the total size????
693694 # TODO: do not do it in-place, but rather into some "hidden" file
694695 resuming = False
695- for attempt in range (3 ):
696+ attempt = 0
697+ nattempts = 3 # number of attempts to make; may be incremented if an attempt downloaded some data
698+ while attempt <= nattempts :
699+ attempt += 1
696700 try :
697701 if digester :
698702 downloaded_digest = digester () # start empty
699703 warned = False
700704 # I wonder if we could make writing async with downloader
701705 with DownloadDirectory (path , digests or {}) as dldir :
702706 assert dldir .offset is not None
707+ downloaded_in_attempt = 0
703708 downloaded = dldir .offset
704709 resuming = downloaded > 0
705710 if size is not None and downloaded == size :
711+ lgr .debug (
712+ "%s - downloaded size matches target size of %d, exiting the loop" ,
713+ path ,
714+ size ,
715+ )
706716 # Exit early when downloaded == size, as making a Range
707717 # request in such a case results in a 416 error from S3.
708718 # Problems will result if `size` is None but we've already
@@ -713,6 +723,7 @@ def _download_file(
713723 assert downloaded_digest is not None
714724 downloaded_digest .update (block )
715725 downloaded += len (block )
726+ downloaded_in_attempt += len (block )
716727 # TODO: yield progress etc
717728 out : dict [str , Any ] = {"done" : downloaded }
718729 if size :
@@ -738,30 +749,83 @@ def _download_file(
738749 # Catching RequestException lets us retry on timeout & connection
739750 # errors (among others) in addition to HTTP status errors.
740751 except requests .RequestException as exc :
752+ sleep_amount = random .random () * 5 * attempt
753+ if os .environ .get ("DANDI_DOWNLOAD_AGGRESSIVE_RETRY" ):
754+ # in such a case if we downloaded a little more --
755+ # consider it a successful attempt
756+ if downloaded_in_attempt > 0 :
757+ lgr .debug (
758+ "%s - download failed on attempt #%d: %s, "
759+ "but did download %d bytes, so considering "
760+ "it a success and incrementing number of allowed attempts." ,
761+ path ,
762+ attempt ,
763+ exc ,
764+ downloaded_in_attempt ,
765+ )
766+ nattempts += 1
741767 # TODO: actually we should probably retry only on selected codes,
742- # and also respect Retry-After
743- if attempt >= 2 or (
744- exc .response is not None
745- and exc .response .status_code
746- not in (
768+ if exc .response is not None :
769+ if exc .response .status_code not in (
747770 400 , # Bad Request, but happened with girder:
748771 # https://github.com/dandi/dandi-cli/issues/87
749772 * RETRY_STATUSES ,
773+ ):
774+ lgr .debug (
775+ "%s - download failed due to response %d: %s" ,
776+ path ,
777+ exc .response .status_code ,
778+ exc ,
779+ )
780+ yield {"status" : "error" , "message" : str (exc )}
781+ return
782+ elif retry_after := exc .response .headers .get ("Retry-After" ):
783+ # playing safe
784+ if not str (retry_after ).isdigit ():
785+ # our code is wrong; do not crash, but issue a warning so
786+ # that we might get a report and fix it up
787+ lgr .warning (
788+ "%s - download failed due to response %d with non-integer"
789+ " Retry-After=%r: %s" ,
790+ path ,
791+ exc .response .status_code ,
792+ retry_after ,
793+ exc ,
794+ )
795+ yield {"status" : "error" , "message" : str (exc )}
796+ return
797+ sleep_amount = int (retry_after )
798+ lgr .debug (
799+ "%s - download failed due to response %d with "
800+ "Retry-After=%d: %s, will sleep and retry" ,
801+ path ,
802+ exc .response .status_code ,
803+ sleep_amount ,
804+ exc ,
805+ )
806+ else :
807+ lgr .debug ("%s - download failed: %s" , path , exc )
808+ yield {"status" : "error" , "message" : str (exc )}
809+ return
810+ elif attempt >= nattempts :
811+ lgr .debug (
812+ "%s - download failed after %d attempts: %s" , path , attempt , exc
750813 )
751- ):
752- lgr .debug ("%s - download failed: %s" , path , exc )
753814 yield {"status" : "error" , "message" : str (exc )}
754815 return
755816 # if is_access_denied(exc) or attempt >= 2:
756817 # raise
757818 # sleep a little and retry
758- lgr .debug (
759- "%s - failed to download on attempt #%d: %s, will sleep a bit and retry" ,
760- path ,
761- attempt ,
762- exc ,
763- )
764- time .sleep (random .random () * 5 )
819+ else :
820+ lgr .debug (
821+ "%s - download failed on attempt #%d: %s, will sleep a bit and retry" ,
822+ path ,
823+ attempt ,
824+ exc ,
825+ )
826+ time .sleep (sleep_amount )
827+ else :
828+ lgr .warning ("downloader logic: We should not be here!" )
765829
766830 if downloaded_digest and not resuming :
767831 assert downloaded_digest is not None
@@ -829,16 +893,22 @@ def __enter__(self) -> DownloadDirectory:
829893 ):
830894 # Pick up where we left off, writing to the end of the file
831895 lgr .debug (
832- "Download directory exists and has matching checksum; resuming download"
896+ "%s - download directory exists and has matching checksum(s) %s; resuming download" ,
897+ self .dirpath ,
898+ matching_algs ,
833899 )
834900 self .fp = self .writefile .open ("ab" )
835901 else :
836902 # Delete the file (if it even exists) and start anew
837903 if not chkpath .exists ():
838- lgr .debug ("Starting new download in new download directory" )
904+ lgr .debug (
905+ "%s - no prior digests found; starting new download" , self .dirpath
906+ )
839907 else :
840908 lgr .debug (
841- "Download directory found, but digests do not match; starting new download"
909+ "%s - download directory found, but digests do not match;"
910+ " starting new download" ,
911+ self .dirpath ,
842912 )
843913 try :
844914 self .writefile .unlink ()
@@ -857,12 +927,35 @@ def __exit__(
857927 exc_tb : TracebackType | None ,
858928 ) -> None :
859929 assert self .fp is not None
930+ if exc_type is not None or exc_val is not None or exc_tb is not None :
931+ lgr .debug (
932+ "%s - entered __exit__ with position %d with exception: %s, %s" ,
933+ self .dirpath ,
934+ self .fp .tell (),
935+ exc_type ,
936+ exc_val ,
937+ )
938+ else :
939+ lgr .debug (
940+ "%s - entered __exit__ with position %d without any exception" ,
941+ self .dirpath ,
942+ self .fp .tell (),
943+ )
860944 self .fp .close ()
861945 try :
862946 if exc_type is None :
863947 try :
864948 self .writefile .replace (self .filepath )
865- except IsADirectoryError :
949+ except (IsADirectoryError , PermissionError ) as exc :
950+ if isinstance (exc , PermissionError ):
951+ if not (
952+ sys .platform .startswith ("win" ) and self .filepath .is_dir ()
953+ ):
954+ raise
955+ lgr .debug (
956+ "Destination path %s is a directory; removing it and retrying" ,
957+ self .filepath ,
958+ )
866959 rmtree (self .filepath )
867960 self .writefile .replace (self .filepath )
868961 finally :
0 commit comments