Skip to content

Commit 8b2cd71

Browse files
committed
fix: download limit
1 parent 014caa6 commit 8b2cd71

File tree

2 files changed

+21
-10
lines changed

2 files changed

+21
-10
lines changed

bing_images/bing.py

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,42 +16,51 @@
1616
def fetch_image_urls(
1717
query: str,
1818
limit: int = 20,
19-
file_type: str = "jpg",
19+
file_type: str = '',
2020
filters: str = ''
2121
) -> List[str]:
2222
result = list()
23-
keywords = query + " " + file_type
24-
max_number = math.ceil(limit*1.5)
25-
urls = crawl_image_urls(keywords, filters, max_number)
23+
keywords = query
24+
if len(file_type) > 0:
25+
keywords = query + " " + file_type
26+
urls = crawl_image_urls(keywords, filters, limit)
2627
for url in urls:
27-
if url.endswith(file_type) and url not in result:
28+
if isValidURL(url, file_type) and url not in result:
2829
result.append(url)
2930
if len(result) >= limit:
3031
break
3132
return result
3233

3334

35+
def isValidURL(url, file_type):
36+
if len(file_type) < 1:
37+
return True
38+
return url.endswith(file_type)
39+
40+
3441
def download_images(
3542
query: str,
3643
limit: int = 20,
3744
output_dir='',
3845
pool_size: int = 20,
39-
file_type: str = "jpg",
46+
file_type: str = '',
4047
filters: str = '',
4148
force_replace=False
4249
):
4350
start = timer()
4451
image_dir = make_image_dir(output_dir, force_replace)
4552
print("Save path: {}".format(image_dir))
4653

47-
urls = fetch_image_urls(query, limit, file_type, filters)
54+
# Fetch more image URLs than requested, because some of them may be invalid.
55+
max_number = math.ceil(limit*1.5)
56+
urls = fetch_image_urls(query, max_number, file_type, filters)
4857
entries = get_image_entries(urls, image_dir)
4958

5059
print("Downloading images")
5160
ps = pool_size
5261
if limit < pool_size:
5362
ps = limit
54-
download_image_entries(entries, ps)
63+
download_image_entries(entries, ps, limit)
5564

5665
rename_images(image_dir, query)
5766

@@ -76,11 +85,13 @@ def rename_images(dir, prefix):
7685
print("Finished renaming")
7786

7887

79-
def download_image_entries(entries, pool_size):
88+
def download_image_entries(entries, pool_size, limit):
8089
counter = 1
8190
results = ThreadPool(pool_size).imap_unordered(
8291
download_image_with_thread, entries)
8392
for (url, result) in results:
93+
if counter > limit:
94+
break
8495
if result:
8596
print("#{} {} Downloaded".format(counter, url))
8697
counter = counter + 1

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = bing_images
3-
version = 0.0.6
3+
version = 0.1.0
44
author = CatchZeng
55
author_email = catchzenghh@gmail.com
66
description = Python library to fetch image urls and download using multithreading from Bing.com.

0 commit comments

Comments
 (0)