def fetch_image_urls(
    query: str,
    limit: int = 20,
    file_type: str = '',
    filters: str = ''
) -> List[str]:
    """Return up to *limit* unique image URLs for *query*.

    query: search keywords.
    limit: maximum number of URLs to return (and the count requested
        from the crawler).
    file_type: optional extension filter; when non-empty it is appended
        to the search keywords and used to validate each crawled URL.
    filters: extra search-filter string forwarded to crawl_image_urls.
    """
    keywords = query
    if file_type:
        keywords = query + " " + file_type

    result: List[str] = []
    # Membership checks against a set are O(1); checking `url not in result`
    # on a list would make deduplication O(n^2) overall.
    seen = set()
    for url in crawl_image_urls(keywords, filters, limit):
        if isValidURL(url, file_type) and url not in seen:
            seen.add(url)
            result.append(url)
            if len(result) >= limit:
                break
    return result
3233
3334
def isValidURL(url: str, file_type: str) -> bool:
    """Return True when *url* passes the extension filter.

    An empty *file_type* means "accept any URL"; otherwise the URL must
    end with the given extension string.
    """
    if not file_type:
        return True
    return url.endswith(file_type)
39+
40+
def download_images(
    query: str,
    limit: int = 20,
    output_dir: str = '',
    pool_size: int = 20,
    file_type: str = '',
    filters: str = '',
    force_replace: bool = False
):
    """Search for images matching *query* and download up to *limit* of them.

    query: search keywords.
    limit: maximum number of images to download.
    output_dir: base directory in which the image directory is created.
    pool_size: maximum number of download threads (capped at *limit*).
    file_type: optional extension filter forwarded to fetch_image_urls.
    filters: extra search-filter string forwarded to the crawler.
    force_replace: forwarded to make_image_dir — presumably replaces an
        existing directory; TODO confirm against make_image_dir.
    """
    # NOTE(review): `start` is not consumed in this visible span; presumably
    # an elapsed-time report follows later in the function — confirm.
    start = timer()
    image_dir = make_image_dir(output_dir, force_replace)
    print("Save path: {}".format(image_dir))

    # Fetch more URLs than requested (1.5x) because some crawled URLs may
    # be invalid or fail to download.
    max_number = math.ceil(limit * 1.5)
    urls = fetch_image_urls(query, max_number, file_type, filters)
    entries = get_image_entries(urls, image_dir)

    print("Downloading images")
    # Never spawn more worker threads than there are images wanted.
    ps = pool_size
    if limit < pool_size:
        ps = limit
    download_image_entries(entries, ps, limit)

    rename_images(image_dir, query)
5766
@@ -76,11 +85,13 @@ def rename_images(dir, prefix):
7685 print ("Finished renaming" )
7786
7887
def download_image_entries(entries, pool_size, limit):
    """Download the given entries concurrently, printing progress.

    entries: download jobs consumed by download_image_with_thread —
        presumably (url, destination path) pairs produced by
        get_image_entries; confirm against that helper.
    pool_size: number of worker threads in the pool.
    limit: stop once this many successful downloads have been counted.
    """
    counter = 1
    # imap_unordered yields (url, result) tuples as downloads complete,
    # in completion order rather than submission order.
    # NOTE(review): the ThreadPool is never closed/joined; workers for
    # remaining entries may keep running after the early break below.
    results = ThreadPool(pool_size).imap_unordered(
        download_image_with_thread, entries)
    for (url, result) in results:
        # counter only advances on success, so this stops after `limit`
        # successful downloads (extra URLs were fetched upstream for slack).
        if counter > limit:
            break
        if result:
            print("#{} {} Downloaded".format(counter, url))
            counter = counter + 1
0 commit comments