
Commit 723bf8a

Merge pull request #11 from KTS-o7/MultiSystem
Multi system
2 parents 7b4dcea + 6f76cdd commit 723bf8a

File tree

7 files changed: +580 −3 lines changed


README.md

Lines changed: 36 additions & 1 deletion
@@ -38,7 +38,7 @@ pip install better-bing-image-downloader
[Package Link](https://pypi.org/project/better-bing-image-downloader/)

### Usage <br />
#### Using as a Package:

```python
from better_bing_image_downloader import downloader
@@ -57,6 +57,41 @@ force_replace=False, timeout=60, filter="", verbose=True, badsites= [], name='Im
`bad-sites` : (optional, default is an empty list) Prevents the query from accessing the listed bad sites.<br/>
`name` : (optional, default is 'Image') Sets a custom name for the downloaded images.<br/>
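A minimal call, sketched from the parameters documented above (the positional query argument is an assumption; only keyword arguments shown in this README are used):

```python
from better_bing_image_downloader import downloader

# Sketch only: the positional query string is an assumption; the keyword
# arguments and their defaults are the ones listed in this README.
downloader(
    "cool doggos",
    force_replace=False,
    timeout=60,
    filter="",
    verbose=True,
    badsites=[],
    name="Image",
)
```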

#### Using as a Command Line Tool:
```bash
git clone https://github.com/KTS-o7/better_bing_image_downloader.git
cd better_bing_image_downloader
python -m venv ./env
source env/bin/activate
pip install -r requirements.txt
cd better_bing_image_downloader
# This is an example query
python multidownloader.py "cool doggos" --engine "Bing" --max-number 50 --num-threads 5 --driver "firefox_headless"
```
#### Command Line Arguments:
```bash
multidownloader.py "keywords" [-h] [--engine {Google,Bing}] [--driver {chrome_headless,chrome,api,firefox,firefox_headless}] [--max-number MAX_NUMBER] [--num-threads NUM_THREADS] [--timeout TIMEOUT] [--output OUTPUT] [--safe-mode] [--face-only] [--proxy_http PROXY_HTTP] [--proxy_socks5 PROXY_SOCKS5] [--type {clipart,linedrawing,photograph}] [--color COLOR]
```
- `"keywords"`: Keywords to search. ("in quotes")
- `-h, --help`: Show the help message and exit.
- `--engine, -e`: Image search engine. Choices are "Google" and "Bing". Default is "Bing".
- `--driver, -d`: Driver used to crawl image URLs. Choices are "chrome_headless", "chrome", "api", "firefox", "firefox_headless". Default is "firefox_headless".
- `--max-number, -n`: Maximum number of images to download for the keywords. Default is 100.
- `--num-threads, -j`: Number of threads used to download images concurrently. Default is 50.
- `--timeout, -t`: Seconds to wait before timing out when downloading an image. Default is 10.
- `--output, -o`: Output directory for the downloaded images. Default is "./download_images".
- `--safe-mode, -S`: Turn on safe search mode. (Only effective with Google.)
- `--face-only, -F`: Only search for faces.
- `--proxy_http, -ph`: Set an HTTP proxy (e.g. 192.168.0.2:8080).
- `--proxy_socks5, -ps`: Set a SOCKS5 proxy (e.g. 192.168.0.2:1080).
- `--type, -ty`: What kind of images to download. Choices are "clipart", "linedrawing", "photograph".
- `--color, -cl`: Specify the color of the desired images.

```bash
# Example usage
python multidownloader.py "Cool Doggos" --engine Google --driver chrome --max-number 50 --num-threads 10 --timeout 60 --output "./doggo_images" --safe-mode --proxy_http "192.168.0.2:8080" --type photograph --color blue
```
---
## Star History

[![Star History Chart](https://api.star-history.com/svg?repos=KTS-o7/better-bing-image-downloader&type=Date)](https://star-history.com/#KTS-o7/better-bing-image-downloader&Date)
Lines changed: 286 additions & 0 deletions
@@ -0,0 +1,286 @@
""" Crawl image urls from image search engine. """

# author: Krishnatejaswi S
# Email: shentharkrishnatejaswi@gmail.com

from __future__ import print_function

import re
import time
import sys
import os
import json
import shutil

from urllib.parse import unquote, quote
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.common.by import By
import requests
from concurrent import futures

g_headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Proxy-Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
    "Accept-Encoding": "gzip, deflate, sdch",
    # 'Connection': 'close',
}

# Resolve the bundle directory both for frozen (PyInstaller) and normal runs.
if getattr(sys, 'frozen', False):
    bundle_dir = sys._MEIPASS
else:
    bundle_dir = os.path.dirname(os.path.abspath(__file__))


def my_print(msg, quiet=False):
    if not quiet:
        print(msg)


def google_gen_query_url(keywords, face_only=False, safe_mode=False, image_type=None, color=None):
    base_url = "https://www.google.com/search?tbm=isch&hl=en"
    keywords_str = "&q=" + quote(keywords)
    query_url = base_url + keywords_str

    if safe_mode is True:
        query_url += "&safe=on"
    else:
        query_url += "&safe=off"

    filter_url = "&tbs="

    if color is not None:
        if color == "bw":
            filter_url += "ic:gray%2C"
        else:
            filter_url += "ic:specific%2Cisc:{}%2C".format(color.lower())

    if image_type is not None:
        if image_type.lower() == "linedrawing":
            image_type = "lineart"
        filter_url += "itp:{}".format(image_type)

    if face_only is True:
        filter_url += "itp:face"

    query_url += filter_url
    return query_url


def google_image_url_from_webpage(driver, max_number, quiet=False):
    thumb_elements_old = []
    thumb_elements = []
    # Scroll the results page until enough thumbnails are loaded or no new ones appear.
    while True:
        try:
            thumb_elements = driver.find_elements(By.CLASS_NAME, "rg_i")
            my_print("Find {} images.".format(len(thumb_elements)), quiet)
            if len(thumb_elements) >= max_number:
                break
            if len(thumb_elements) == len(thumb_elements_old):
                break
            thumb_elements_old = thumb_elements
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            show_more = driver.find_elements(By.CLASS_NAME, "mye4qd")
            if len(show_more) == 1 and show_more[0].is_displayed() and show_more[0].is_enabled():
                my_print("Click show_more button.", quiet)
                show_more[0].click()
                time.sleep(3)
        except Exception as e:
            print("Exception ", e)
            pass

    if len(thumb_elements) == 0:
        return []

    my_print("Click on each thumbnail image to get image url, may take a moment ...", quiet)

    retry_click = []
    for i, elem in enumerate(thumb_elements):
        try:
            if i != 0 and i % 50 == 0:
                my_print("{} thumbnail clicked.".format(i), quiet)
            if not elem.is_displayed() or not elem.is_enabled():
                retry_click.append(elem)
                continue
            elem.click()
        except Exception as e:
            print("Error while clicking in thumbnail:", e)
            retry_click.append(elem)

    if len(retry_click) > 0:
        my_print("Retry some failed clicks ...", quiet)
        for elem in retry_click:
            try:
                if elem.is_displayed() and elem.is_enabled():
                    elem.click()
            except Exception as e:
                print("Error while retrying click:", e)

    # Extract the full-size image url from each expanded thumbnail's markup.
    image_elements = driver.find_elements(By.CLASS_NAME, "islib")
    image_urls = list()
    url_pattern = r"imgurl=\S*&amp;imgrefurl"

    for image_element in image_elements[:max_number]:
        outer_html = image_element.get_attribute("outerHTML")
        re_group = re.search(url_pattern, outer_html)
        if re_group is not None:
            image_url = unquote(re_group.group()[7:-14])
            image_urls.append(image_url)
    return image_urls


def bing_gen_query_url(keywords, face_only=False, safe_mode=False, image_type=None, color=None):
    base_url = "https://www.bing.com/images/search?"
    keywords_str = "&q=" + quote(keywords)
    query_url = base_url + keywords_str
    filter_url = "&qft="
    if face_only is True:
        filter_url += "+filterui:face-face"

    if image_type is not None:
        filter_url += "+filterui:photo-{}".format(image_type)

    if color is not None:
        if color == "bw" or color == "color":
            filter_url += "+filterui:color2-{}".format(color.lower())
        else:
            filter_url += "+filterui:color2-FGcls_{}".format(color.upper())

    query_url += filter_url

    return query_url


def bing_image_url_from_webpage(driver):
    image_urls = list()

    time.sleep(7)
    img_count = 0

    # Keep scrolling (and clicking "See more") until no new results are loaded.
    while True:
        image_elements = driver.find_elements(By.CLASS_NAME, "iusc")
        if len(image_elements) > img_count:
            img_count = len(image_elements)
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
        else:
            smb = driver.find_elements(By.CLASS_NAME, "btn_seemore")
            if len(smb) > 0 and smb[0].is_displayed():
                smb[0].click()
            else:
                break
        time.sleep(2)
    for image_element in image_elements:
        m_json_str = image_element.get_attribute("m")
        m_json = json.loads(m_json_str)
        image_urls.append(m_json["murl"])
    return image_urls


def bing_get_image_url_using_api(keywords, max_number=10000, face_only=False,
                                 proxy=None, proxy_type=None):
    proxies = None
    if proxy and proxy_type:
        proxies = {"http": "{}://{}".format(proxy_type, proxy),
                   "https": "{}://{}".format(proxy_type, proxy)}
    start = 1
    image_urls = []
    while start <= max_number:
        url = 'https://www.bing.com/images/async?q={}&first={}&count=35'.format(keywords, start)
        res = requests.get(url, proxies=proxies, headers=g_headers)
        res.encoding = "utf-8"
        image_urls_batch = re.findall('murl&quot;:&quot;(.*?)&quot;', res.text)
        if not image_urls_batch:
            # No more results; stop instead of looping on an empty batch.
            break
        if len(image_urls) > 0 and image_urls_batch[-1] == image_urls[-1]:
            break
        image_urls += image_urls_batch
        start += len(image_urls_batch)
    return image_urls


def crawl_image_urls(keywords, engine="Google", max_number=10000,
                     face_only=False, safe_mode=False, proxy=None,
                     proxy_type="http", quiet=False, browser="chrome_headless", image_type=None, color=None):
    """
    Scrape image urls of keywords from Google or Bing Image Search
    :param keywords: keywords you want to search
    :param engine: search engine used to search images
    :param max_number: limit the max number of image urls the function outputs, equal or less than 0 for unlimited
    :param face_only: image type set to face only, provided by Google
    :param safe_mode: switch for safe mode of Google Search
    :param proxy: proxy address, example: socks5 127.0.0.1:1080
    :param proxy_type: socks5, http
    :param browser: browser to use when crawling image urls
    :return: list of scraped image urls
    """

    my_print("\nScraping From {} Image Search ...\n".format(engine), quiet)
    my_print("Keywords: " + keywords, quiet)
    if max_number <= 0:
        my_print("Number: No limit", quiet)
        max_number = 10000
    else:
        my_print("Number: {}".format(max_number), quiet)
    my_print("Face Only: {}".format(str(face_only)), quiet)
    my_print("Safe Mode: {}".format(str(safe_mode)), quiet)

    if engine == "Google":
        query_url = google_gen_query_url(keywords, face_only, safe_mode, image_type, color)
    elif engine == "Bing":
        query_url = bing_gen_query_url(keywords, face_only, safe_mode, image_type, color)
    else:
        return

    my_print("Query URL: " + query_url, quiet)

    image_urls = []

    if browser != "api":
        browser = str.lower(browser)
        if "firefox" in browser:
            firefox_path = shutil.which("geckodriver")
            firefox_options = webdriver.FirefoxOptions()
            if "headless" in browser:
                firefox_options.add_argument("-headless")
            if proxy is not None and proxy_type is not None:
                firefox_options.add_argument("--proxy-server={}://{}".format(proxy_type, proxy))
            # Launch Firefox via the geckodriver found on PATH.
            service = FirefoxService(executable_path=firefox_path)
            driver = webdriver.Firefox(service=service, options=firefox_options)
        else:
            chrome_path = shutil.which("chromedriver")
            chrome_options = webdriver.ChromeOptions()
            if "headless" in browser:
                chrome_options.add_argument("headless")
            if proxy is not None and proxy_type is not None:
                chrome_options.add_argument("--proxy-server={}://{}".format(proxy_type, proxy))
            # Launch Chrome via the chromedriver found on PATH.
            service = Service(executable_path=chrome_path)
            driver = webdriver.Chrome(service=service, options=chrome_options)

        if engine == "Google":
            driver.set_window_size(1920, 1080)
            driver.get(query_url)
            image_urls = google_image_url_from_webpage(driver, max_number, quiet)
        elif engine == "Bing":
            driver.set_window_size(1920, 1080)
            driver.get(query_url)
            image_urls = bing_image_url_from_webpage(driver)

        driver.close()
    else:  # api
        if engine == "Bing":
            image_urls = bing_get_image_url_using_api(keywords, max_number=max_number, face_only=face_only,
                                                      proxy=proxy, proxy_type=proxy_type)
        else:
            my_print("Engine {} is not supported on API mode.".format(engine))

    if max_number > len(image_urls):
        output_num = len(image_urls)
    else:
        output_num = max_number

    my_print("\n== {0} out of {1} crawled images urls will be used.\n".format(
        output_num, len(image_urls)), quiet)

    return image_urls[0:output_num]
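
For reference, a hypothetical driver for the `crawl_image_urls` function defined above (the module name in the import is an assumption, since the file name is not shown here; the function name, parameters, and defaults come from the code above):

```python
# Hypothetical usage sketch. The module name "image_crawler" is assumed.
from image_crawler import crawl_image_urls

urls = crawl_image_urls(
    "cool doggos",
    engine="Bing",               # "Google" or "Bing"
    max_number=50,               # cap on the number of returned urls
    browser="firefox_headless",  # one of the --driver choices in the README
)
print("Collected {} image urls".format(len(urls)))
```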
