Skip to content

Commit c5c1e14

Browse files
committed
feat: add extra_query_params
1 parent 9a5d728 commit c5c1e14

File tree

4 files changed

+15
-10
lines changed

4 files changed

+15
-10
lines changed

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ Python library to fetch image urls and download using **multithreading** from [B
1010

1111
- [x] Support **file type** filters.
1212
- [x] Support [Bing.com](https://bing.com/) **filterui** filters.
13+
- [x] Support **extra query params** appended to the search URL, such as `&first=100&tsc=ImageBasicHover` in `https://cn.bing.com/images/search?q=cat&first=100&tsc=ImageBasicHover`.
1314
- [x] Download using **multithreading** and custom thread **pool size**.
1415
- [x] Support **purely** obtaining the image urls.
1516

@@ -36,7 +37,7 @@ fetch_image_urls.py
3637
```py
3738
from bing_images import bing
3839

39-
urls = bing.fetch_image_urls("cat", limit=10, file_type='png', filters='+filterui:aspect-square+filterui:color2-bw')
40+
urls = bing.fetch_image_urls("cat", limit=10, file_type='png', filters='+filterui:aspect-square+filterui:color2-bw', extra_query_params='&first=1')
4041
print("{} images.".format(len(urls)))
4142
counter = 1
4243
for url in urls:
@@ -76,7 +77,8 @@ bing.download_images("cat",
7677
output_dir="/Users/catchzeng/Desktop/cat",
7778
pool_size=10,
7879
file_type="png",
79-
force_replace=True)
80+
force_replace=True,
81+
extra_query_params='&first=1')
8082
```
8183

8284
> - **output_dir**: the default output_dir is `os.path.join(os.getcwd(), "bing-images")`

bing_images/bing.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,14 @@ def fetch_image_urls(
1919
query: str,
2020
limit: int = 20,
2121
file_type: str = '',
22-
filters: str = ''
22+
filters: str = '',
23+
extra_query_params: str =''
2324
) -> List[str]:
2425
result = list()
2526
keywords = query
2627
if len(file_type) > 0:
2728
keywords = query + " " + file_type
28-
urls = crawl_image_urls(keywords, filters, limit)
29+
urls = crawl_image_urls(keywords, filters, limit, extra_query_params)
2930
for url in urls:
3031
if isValidURL(url, file_type) and url not in result:
3132
result.append(url)
@@ -47,15 +48,16 @@ def download_images(
4748
pool_size: int = 20,
4849
file_type: str = '',
4950
filters: str = '',
50-
force_replace=False
51+
force_replace=False,
52+
extra_query_params: str =''
5153
):
5254
start = timer()
5355
image_dir = make_image_dir(output_dir, force_replace)
5456
print("Save path: {}".format(image_dir))
5557

5658
# Fetch more image URLs than requested, since some of them may turn out to be invalid.
5759
max_number = math.ceil(limit*1.5)
58-
urls = fetch_image_urls(query, max_number, file_type, filters)
60+
urls = fetch_image_urls(query, max_number, file_type, filters, extra_query_params)
5961
entries = get_image_entries(urls, image_dir)
6062

6163
print("Downloading images")

bing_images/crawler.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,12 @@
77
BASE_URL = "https://www.bing.com/images/search?"
88

99

10-
def gen_query_url(keywords, filters):
10+
def gen_query_url(keywords, filters, extra_query_params =''):
1111
keywords_str = "&q=" + quote(keywords)
1212
query_url = BASE_URL + keywords_str
1313
if len(filters) > 0:
1414
query_url += "&qft="+filters
15+
query_url += extra_query_params
1516
return query_url
1617

1718

@@ -43,7 +44,7 @@ def image_url_from_webpage(driver, max_number=10000):
4344
return image_urls
4445

4546

46-
def crawl_image_urls(keywords, filters, max_number=10000, proxy=None, proxy_type="http"):
47+
def crawl_image_urls(keywords, filters, max_number=10000, proxy=None, proxy_type="http", extra_query_params =''):
4748
chrome_path = shutil.which("chromedriver")
4849
chrome_path = "./bin/chromedriver" if chrome_path is None else chrome_path
4950
chrome_options = webdriver.ChromeOptions()
@@ -52,7 +53,7 @@ def crawl_image_urls(keywords, filters, max_number=10000, proxy=None, proxy_type
5253
"--proxy-server={}://{}".format(proxy_type, proxy))
5354
driver = webdriver.Chrome(chrome_path, chrome_options=chrome_options)
5455

55-
query_url = gen_query_url(keywords, filters)
56+
query_url = gen_query_url(keywords, filters, extra_query_params)
5657
driver.set_window_size(1920, 1080)
5758
driver.get(query_url)
5859
image_urls = image_url_from_webpage(driver, max_number)

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = bing_images
3-
version = 0.1.1
3+
version = 0.2.1
44
author = CatchZeng
55
author_email = catchzenghh@gmail.com
66
description = Python library to fetch image urls and download using multithreading from Bing.com.

0 commit comments

Comments
 (0)