feat: add extra_query_params

CatchZeng · CatchZeng · commit c5c1e14f292f · 2022-01-25T15:21:27.000+08:00
diff --git a/README.md b/README.md
@@ -10,6 +10,7 @@ Python library to fetch image urls and download using **multithreading** from [B
 
 - [x] Support **file type** filters.
 - [x] Support [Bing.com](https://bing.com/) **filterui** filters.
+- [x] Support **extra query params**, such as `&first=100&tsc=ImageBasicHover` in `https://cn.bing.com/images/search?q=cat&first=100&tsc=ImageBasicHover`
 - [x] Download using **multithreading** and custom thread **pool size**.
 - [x] Support **purely** obtaining the image urls.
 
@@ -36,7 +37,7 @@ fetch_image_urls.py
 ```py
 from bing_images import bing
 
-urls = bing.fetch_image_urls("cat", limit=10, file_type='png', filters='+filterui:aspect-square+filterui:color2-bw')
+urls = bing.fetch_image_urls("cat", limit=10, file_type='png', filters='+filterui:aspect-square+filterui:color2-bw', extra_query_params='&first=1')
 print("{} images.".format(len(urls)))
 counter = 1
 for url in urls:
@@ -76,7 +77,8 @@ bing.download_images("cat",
                       output_dir="/Users/catchzeng/Desktop/cat",
                       pool_size=10,
                       file_type="png",
-                      force_replace=True)
+                      force_replace=True,
+                      extra_query_params='&first=1')
 ```
 
 > - **output_dir**: the default output_dir is `os.path.join(os.getcwd(), "bing-images")`
diff --git a/bing_images/bing.py b/bing_images/bing.py
@@ -19,13 +19,14 @@ def fetch_image_urls(
     query: str,
     limit: int = 20,
     file_type: str = '',
-    filters: str = ''
+    filters: str = '',
+    extra_query_params: str = ''
 ) -> List[str]:
     result = list()
     keywords = query
     if len(file_type) > 0:
         keywords = query + " " + file_type
-    urls = crawl_image_urls(keywords, filters, limit)
+    urls = crawl_image_urls(keywords, filters, limit, extra_query_params=extra_query_params)
     for url in urls:
         if isValidURL(url, file_type) and url not in result:
             result.append(url)
@@ -47,15 +48,16 @@ def download_images(
     pool_size: int = 20,
     file_type: str = '',
     filters: str = '',
-    force_replace=False
+    force_replace=False,
+    extra_query_params: str =''
 ):
     start = timer()
     image_dir = make_image_dir(output_dir, force_replace)
     print("Save path: {}".format(image_dir))
 
     # Fetch more image URLs to avoid some images are invalid.
     max_number = math.ceil(limit*1.5)
-    urls = fetch_image_urls(query, max_number, file_type, filters)
+    urls = fetch_image_urls(query, max_number, file_type, filters, extra_query_params)
     entries = get_image_entries(urls, image_dir)
 
     print("Downloading images")
diff --git a/bing_images/crawler.py b/bing_images/crawler.py
@@ -7,11 +7,12 @@
 BASE_URL = "https://www.bing.com/images/search?"
 
 
-def gen_query_url(keywords, filters):
+def gen_query_url(keywords, filters, extra_query_params =''):
     keywords_str = "&q=" + quote(keywords)
     query_url = BASE_URL + keywords_str
     if len(filters) > 0:
         query_url += "&qft="+filters
+    query_url += extra_query_params
     return query_url
 
 
@@ -43,7 +44,7 @@ def image_url_from_webpage(driver, max_number=10000):
     return image_urls
 
 
-def crawl_image_urls(keywords, filters, max_number=10000, proxy=None, proxy_type="http"):
+def crawl_image_urls(keywords, filters, max_number=10000, proxy=None, proxy_type="http", extra_query_params =''):
     chrome_path = shutil.which("chromedriver")
     chrome_path = "./bin/chromedriver" if chrome_path is None else chrome_path
     chrome_options = webdriver.ChromeOptions()
@@ -52,7 +53,7 @@ def crawl_image_urls(keywords, filters, max_number=10000, proxy=None, proxy_type
             "--proxy-server={}://{}".format(proxy_type, proxy))
     driver = webdriver.Chrome(chrome_path, chrome_options=chrome_options)
 
-    query_url = gen_query_url(keywords, filters)
+    query_url = gen_query_url(keywords, filters, extra_query_params)
     driver.set_window_size(1920, 1080)
     driver.get(query_url)
     image_urls = image_url_from_webpage(driver, max_number)
diff --git a/setup.cfg b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = bing_images
-version = 0.1.1
+version = 0.2.1
 author = CatchZeng
 author_email = catchzenghh@gmail.com
 description = Python library to fetch image urls and download using multithreading from Bing.com.