@@ -23,10 +23,9 @@ class ImageDownloaderCSV(CSVParser.CSVParser):
2323
2424 '''
2525
26- def __init__ (self , filename , check_image = True ):
26+ def __init__ (self , filename ):
2727
2828 self .has_filename = False
29- self .check_img = check_image
3029
3130 super ().__init__ (filename )
3231
@@ -60,13 +59,15 @@ def validate(self):
6059
6160class ImageDownloader (ParallelLoader .ParallelLoader ):
6261
def __init__(self, db=None, dry_run=False, n_download_retries=0, check_if_present=False):
    """Set up an image downloader on top of the parallel-loader base class.

    Args:
        db: Forwarded unchanged to the base-class constructor
            (defaults to None).
        dry_run: Forwarded to the base-class constructor as ``dry_run``.
        n_download_retries: How many additional attempts download_image
            makes when a response is not OK; 0 means a single attempt.
        check_if_present: When true, download_image first checks the
            target file and skips the fetch if that check passes.
    """
    super().__init__(db, dry_run=dry_run)

    # Caller-supplied download policy.
    self.n_download_retries = n_download_retries
    self.check_img = check_if_present

    # Incremented by download_image each time a fetch is skipped because
    # the image was already present and passed the check.
    self.images_already_downloaded = 0

    self.type = "image"
7071
7172 def check_if_image_is_ok (self , filename , url ):
7273
@@ -89,13 +90,26 @@ def download_image(self, url, filename):
8990 start = time .time ()
9091
9192 if self .check_img and self .check_if_image_is_ok (filename , url ):
93+ self .images_already_downloaded += 1
94+ self .times_arr .append (time .time () - start )
9295 return
9396
9497 folder = os .path .dirname (filename )
9598 if not os .path .exists (folder ):
9699 os .makedirs (folder , exist_ok = True )
97100
98- imgdata = requests .get (url )
101+ retries = 0
102+ while True :
103+ imgdata = requests .get (url )
104+ if imgdata .ok :
105+ break
106+ else :
107+ if retries >= self .n_download_retries :
108+ break
109+ print ("WARNING: Retrying object:" , url )
110+ retries += 1
111+ time .sleep (2 )
112+
99113 if imgdata .ok :
100114 fd = open (filename , "wb" )
101115 fd .write (imgdata .content )
@@ -120,7 +134,7 @@ def download_image(self, url, filename):
120134 def worker (self , thid , generator , start , end ):
121135
122136 if thid == 0 and self .stats :
123- pb = ProgressBar .ProgressBar ("download_progress.txt" )
137+ pb = ProgressBar .ProgressBar ()
124138
125139 for i in range (start , end ):
126140
@@ -139,13 +153,22 @@ def print_stats(self):
139153 print ("====== ApertureDB ImageDownloader Stats ======" )
140154
141155 times = np .array (self .times_arr )
142- print ("Avg image download time(s):" , np .mean (times ))
143- print ("Img download time std:" , np .std (times ))
144- print ("Avg download throughput (images/s)):" ,
156+ if len (times ) <= 0 :
157+ print ("Error: No downloads." )
158+ return
159+
160+ if self .images_already_downloaded > 0 :
161+ print ("Images already present:" , self .images_already_downloaded )
162+
163+ print ("Images downloaded:" , len (times ) - self .images_already_downloaded )
164+ print ("Avg image time(s):" , np .mean (times ))
165+ print ("Image time std:" , np .std (times ))
166+ print ("Throughput (images/s)):" ,
145167 1 / np .mean (times ) * self .numthreads )
146168
147169 print ("Total time(s):" , self .ingestion_time )
148170 print ("Overall throughput (img/s):" ,
149171 self .total_elements / self .ingestion_time )
150- print ("Total errors encountered:" , self .error_counter )
172+ if self .error_counter > 0 :
173+ print ("Errors encountered:" , self .error_counter )
151174 print ("=============================================" )
0 commit comments