1+ """ Crawl image urls from image search engine. """
2+
3+ # author: Krishnatejaswi S
4+ # Email: shentharkrishnatejaswi@gmail.com
5+
6+ from __future__ import print_function
7+
8+ import re
9+ import time
10+ import sys
11+ import os
12+ import json
13+ import shutil
14+
15+ from urllib .parse import unquote , quote
16+ from selenium import webdriver
17+ from selenium .webdriver .chrome .service import Service
18+ from selenium .webdriver .common .by import By
19+ import requests
20+ from concurrent import futures
21+
# Default HTTP headers sent with direct (non-browser) requests; mimics a
# desktop Chrome browser so the search endpoints serve normal HTML instead
# of a bot-challenge page.
g_headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Proxy-Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
    "Accept-Encoding": "gzip, deflate, sdch",
    # 'Connection': 'close',
}

# Directory holding bundled resources: when frozen by PyInstaller,
# sys._MEIPASS points at the unpacked bundle; otherwise use this
# file's own directory.
if getattr(sys, 'frozen', False):
    bundle_dir = sys._MEIPASS
else:
    bundle_dir = os.path.dirname(os.path.abspath(__file__))
35+
36+
def my_print(msg, quiet=False):
    """Print *msg* to stdout unless *quiet* is truthy."""
    if quiet:
        return
    print(msg)
40+
41+
def google_gen_query_url(keywords, face_only=False, safe_mode=False, image_type=None, color=None):
    """Build a Google Image Search query URL.

    :param keywords: search keywords; URL-quoted before being embedded
    :param face_only: if True, add Google's face-only ``itp:face`` filter
    :param safe_mode: toggle Google SafeSearch (``&safe=on`` / ``&safe=off``)
    :param image_type: Google ``itp:`` image-type token; "linedrawing" is
        translated to Google's "lineart" token
    :param color: "bw" for grayscale, otherwise a specific color name
    :return: fully assembled query URL string
    """
    query_url = "https://www.google.com/search?tbm=isch&hl=en" + "&q=" + quote(keywords)
    query_url += "&safe=on" if safe_mode else "&safe=off"

    # Collect the individual tbs filters and join them with an encoded comma.
    # The previous code concatenated filter strings directly, which glued two
    # `itp:` filters together (e.g. "itp:photoitp:face") whenever both
    # image_type and face_only were requested, and left trailing "%2C"s.
    filters = []

    if color is not None:
        if color == "bw":
            filters.append("ic:gray")
        else:
            filters.append("ic:specific%2Cisc:{}".format(color.lower()))

    if image_type is not None:
        if image_type.lower() == "linedrawing":
            image_type = "lineart"  # Google's token for line drawings
        filters.append("itp:{}".format(image_type))

    if face_only:
        filters.append("itp:face")

    query_url += "&tbs=" + "%2C".join(filters)
    return query_url
70+
71+
def google_image_url_from_webpage(driver, max_number, quiet=False):
    """Scroll a Google Images result page and collect full-size image URLs.

    Scrolls until no new thumbnails appear (clicking "show more" when it is
    visible), clicks every thumbnail so Google injects the real image URL
    into the DOM, then extracts the ``imgurl=`` parameter from each result
    element's outerHTML.

    :param driver: selenium WebDriver already on a Google Images result page
    :param max_number: stop collecting once this many thumbnails are found
    :param quiet: suppress progress output when True
    :return: list of decoded image URLs (may be shorter than max_number)
    """
    thumb_elements_old = []
    thumb_elements = []
    consecutive_errors = 0
    while True:
        try:
            thumb_elements = driver.find_elements(By.CLASS_NAME, "rg_i")
            my_print("Find {} images.".format(len(thumb_elements)), quiet)
            if len(thumb_elements) >= max_number:
                break
            if len(thumb_elements) == len(thumb_elements_old):
                break  # no new thumbnails appeared since the last scroll
            thumb_elements_old = thumb_elements
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            show_more = driver.find_elements(By.CLASS_NAME, "mye4qd")
            if len(show_more) == 1 and show_more[0].is_displayed() and show_more[0].is_enabled():
                my_print("Click show_more button.", quiet)
                show_more[0].click()
                time.sleep(3)
            consecutive_errors = 0
        except Exception as e:
            # Previously errors were swallowed and the loop retried forever;
            # cap consecutive failures so a persistent error cannot hang us.
            print("Exception ", e)
            consecutive_errors += 1
            if consecutive_errors >= 5:
                break

    if len(thumb_elements) == 0:
        return []

    my_print("Click on each thumbnail image to get image url, may take a moment ...", quiet)

    retry_click = []
    for i, elem in enumerate(thumb_elements):
        try:
            if i != 0 and i % 50 == 0:
                my_print("{} thumbnail clicked.".format(i), quiet)
            if not elem.is_displayed() or not elem.is_enabled():
                retry_click.append(elem)
                continue
            elem.click()
        except Exception as e:
            print("Error while clicking in thumbnail:", e)
            retry_click.append(elem)

    if len(retry_click) > 0:
        my_print("Retry some failed clicks ...", quiet)
        for elem in retry_click:
            try:
                if elem.is_displayed() and elem.is_enabled():
                    elem.click()
            except Exception as e:
                print("Error while retrying click:", e)

    image_elements = driver.find_elements(By.CLASS_NAME, "islib")
    image_urls = []
    # Capture the URL itself with a group. The previous code sliced the raw
    # match with magic offsets ([7:-14]) that did not agree with the pattern
    # (the "&imgrefurl" suffix is 10 chars, not 14), corrupting the tail of
    # every URL. In outerHTML the "&" may appear HTML-escaped as "&amp;",
    # so accept both forms.
    url_pattern = re.compile(r"imgurl=(\S*?)&(?:amp;)?imgrefurl")
    for image_element in image_elements[:max_number]:
        outer_html = image_element.get_attribute("outerHTML")
        match = url_pattern.search(outer_html)
        if match is not None:
            image_urls.append(unquote(match.group(1)))
    return image_urls
133+
134+
def bing_gen_query_url(keywords, face_only=False, safe_mode=False, image_type=None, color=None):
    """Assemble a Bing image-search URL for *keywords* with optional filters.

    :param keywords: search keywords; URL-quoted before being embedded
    :param face_only: if True, add Bing's face filter
    :param safe_mode: accepted for interface symmetry with the Google helper;
        not used in the Bing query
    :param image_type: Bing ``filterui:photo-`` type token
    :param color: "bw"/"color", or a specific color name (upper-cased into
        Bing's ``FGcls_`` token)
    :return: fully assembled query URL string
    """
    filters = ""
    if face_only is True:
        filters += "+filterui:face-face"

    if image_type is not None:
        filters += "+filterui:photo-{}".format(image_type)

    if color is not None:
        if color in ("bw", "color"):
            filters += "+filterui:color2-{}".format(color.lower())
        else:
            filters += "+filterui:color2-FGcls_{}".format(color.upper())

    return "https://www.bing.com/images/search?" + "&q=" + quote(keywords) + "&qft=" + filters
155+
156+
def bing_image_url_from_webpage(driver):
    """Scroll a Bing image-search result page and collect full-size URLs.

    Keeps scrolling while new ".iusc" tiles keep appearing; when growth
    stalls, clicks the "see more" button if it is visible, otherwise stops.
    Each tile's "m" attribute holds a JSON blob whose "murl" field is the
    full-resolution image URL.

    :param driver: selenium WebDriver already on a Bing image results page
    :return: list of image URL strings
    """
    time.sleep(7)  # give the initial page load a generous head start

    seen_count = 0
    image_elements = []
    while True:
        image_elements = driver.find_elements(By.CLASS_NAME, "iusc")
        if len(image_elements) > seen_count:
            # More tiles appeared: remember the count and keep scrolling.
            seen_count = len(image_elements)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        else:
            see_more = driver.find_elements(By.CLASS_NAME, "btn_seemore")
            if see_more and see_more[0].is_displayed():
                see_more[0].click()
            else:
                break
        time.sleep(2)

    return [json.loads(elem.get_attribute("m"))["murl"] for elem in image_elements]
181+
def bing_get_image_url_using_api(keywords, max_number=10000, face_only=False,
                                 proxy=None, proxy_type=None):
    """Collect image URLs from Bing's async results endpoint (no browser).

    Pages through results 35 at a time until *max_number* offsets have been
    covered, the endpoint starts repeating itself, or a page comes back empty.

    :param keywords: search keywords; URL-quoted before being embedded
    :param max_number: stop once this many result offsets have been paged
    :param face_only: accepted for interface symmetry; not used on this path
    :param proxy: proxy address, example: 127.0.0.1:1080
    :param proxy_type: proxy scheme, e.g. "http" or "socks5"
    :return: list of image URL strings scraped from the response payload
    """
    proxies = None
    if proxy and proxy_type:
        proxies = {"http": "{}://{}".format(proxy_type, proxy),
                   "https": "{}://{}".format(proxy_type, proxy)}
    start = 1
    image_urls = []
    while start <= max_number:
        # quote() the keywords: previously they were interpolated raw, which
        # produced a malformed URL for any query containing spaces or '&'.
        url = 'https://www.bing.com/images/async?q={}&first={}&count=35'.format(
            quote(keywords), start)
        res = requests.get(url, proxies=proxies, headers=g_headers)
        res.encoding = "utf-8"
        image_urls_batch = re.findall('murl":"(.*?)"', res.text)
        if not image_urls_batch:
            # Empty page: the old code looped forever here because `start`
            # never advanced. Treat it as end of results.
            break
        if image_urls and image_urls_batch[-1] == image_urls[-1]:
            break  # Bing started repeating the previous page
        image_urls += image_urls_batch
        start += len(image_urls_batch)
    return image_urls
200+
def crawl_image_urls(keywords, engine="Google", max_number=10000,
                     face_only=False, safe_mode=False, proxy=None,
                     proxy_type="http", quiet=False, browser="chrome_headless",
                     image_type=None, color=None):
    """
    Scrape image urls of keywords from an image search engine.

    :param keywords: keywords you want to search
    :param engine: search engine used to search images ("Google" or "Bing")
    :param max_number: limit the max number of image urls the function
        outputs; equal or less than 0 means unlimited (capped at 10000)
    :param face_only: image type set to face only, provided by Google
    :param safe_mode: switch for safe mode of Google Search
    :param proxy: proxy address, example: socks5 127.0.0.1:1080
    :param proxy_type: socks5, http
    :param quiet: suppress progress output when True
    :param browser: "chrome", "chrome_headless", "firefox",
        "firefox_headless" or "api" (Bing only)
    :param image_type: engine-specific image type filter
    :param color: engine-specific color filter
    :return: list of scraped image urls (None for an unknown engine)
    """
    my_print("\n Scraping From {} Image Search ...\n ".format(engine), quiet)
    my_print("Keywords: " + keywords, quiet)
    if max_number <= 0:
        my_print("Number: No limit", quiet)
        max_number = 10000
    else:
        my_print("Number: {}".format(max_number), quiet)
    my_print("Face Only: {}".format(str(face_only)), quiet)
    my_print("Safe Mode: {}".format(str(safe_mode)), quiet)

    if engine == "Google":
        query_url = google_gen_query_url(keywords, face_only, safe_mode, image_type, color)
    elif engine == "Bing":
        query_url = bing_gen_query_url(keywords, face_only, safe_mode, image_type, color)
    else:
        return

    my_print("Query URL: " + query_url, quiet)

    image_urls = []

    if browser != "api":
        browser = browser.lower()
        if "firefox" in browser:
            # BUG FIX: this branch previously instantiated webdriver.Chrome
            # with the geckodriver path and FirefoxOptions, which cannot work.
            from selenium.webdriver.firefox.service import Service as FirefoxService
            firefox_path = shutil.which("geckodriver")
            firefox_options = webdriver.FirefoxOptions()
            if "headless" in browser:
                firefox_options.add_argument("-headless")
            if proxy is not None and proxy_type is not None:
                firefox_options.add_argument("--proxy-server={}://{}".format(proxy_type, proxy))
            driver = webdriver.Firefox(service=FirefoxService(executable_path=firefox_path),
                                       options=firefox_options)
        else:
            chrome_path = shutil.which("chromedriver")
            chrome_options = webdriver.ChromeOptions()
            if "headless" in browser:
                chrome_options.add_argument("headless")
            if proxy is not None and proxy_type is not None:
                chrome_options.add_argument("--proxy-server={}://{}".format(proxy_type, proxy))
            driver = webdriver.Chrome(service=Service(executable_path=chrome_path),
                                      options=chrome_options)

        try:
            if engine == "Google":
                driver.set_window_size(1920, 1080)
                driver.get(query_url)
                image_urls = google_image_url_from_webpage(driver, max_number, quiet)
            elif engine == "Bing":
                driver.set_window_size(1920, 1080)
                driver.get(query_url)
                image_urls = bing_image_url_from_webpage(driver)
        finally:
            # quit() ends the whole browser session; the previous close()
            # only closed the window and could leave the driver process
            # running, and was skipped entirely when scraping raised.
            driver.quit()
    else:  # api
        if engine == "Bing":
            image_urls = bing_get_image_url_using_api(keywords, max_number=max_number,
                                                      face_only=face_only,
                                                      proxy=proxy, proxy_type=proxy_type)
        else:
            # Pass `quiet` through (it was dropped before, so this message
            # printed even in quiet mode).
            my_print("Engine {} is not supported on API mode.".format(engine), quiet)

    output_num = min(max_number, len(image_urls))

    my_print("\n == {0} out of {1} crawled images urls will be used.\n ".format(
        output_num, len(image_urls)), quiet)

    return image_urls[0:output_num]
0 commit comments