
Commit f953f4a

[chore]: Added scrapper module
1 parent 19495cd commit f953f4a

File tree

1 file changed: +137 -0 lines changed


Google-Image-Scrapper/scrapper.py

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
import errno
import os
import sys
import time
import urllib.request
from urllib.parse import quote

import requests


class simple_image_download:
    def __init__(self):
        pass

    def urls(self, keywords, limit):
        # Accepts a comma-separated keyword string, e.g. "cats, dogs",
        # and returns up to `limit` image URLs per keyword.
        keyword_to_search = [str(item).strip() for item in keywords.split(',')]
        links = []
        for keyword in keyword_to_search:
            url = 'https://www.google.com/search?q=' + quote(
                keyword.encode(
                    'utf-8')) + '&biw=1536&bih=674&tbm=isch&sxsrf=ACYBGNSXXpS6YmAKUiLKKBs6xWb4uUY5gA:1581168823770&source=lnms&sa=X&ved=0ahUKEwioj8jwiMLnAhW9AhAIHbXTBMMQ_AUI3QUoAQ'
            raw_html = self._download_page(url)

            end_object = -1
            j = 0
            while j < limit:
                object_raw = None
                # Scan the raw HTML for quoted https:// URLs until one that
                # looks like an image file is found.
                while True:
                    try:
                        new_line = raw_html.find('"https://', end_object + 1)
                        if new_line == -1:
                            # No more candidate URLs on this page.
                            object_raw = None
                            break
                        end_object = raw_html.find('"', new_line + 1)

                        # URLs in the page source may be backslash-escaped;
                        # cut the match at the first escape if one appears.
                        buffor = raw_html.find('\\', new_line + 1, end_object)
                        if buffor != -1:
                            object_raw = raw_html[new_line + 1:buffor]
                        else:
                            object_raw = raw_html[new_line + 1:end_object]

                        if any(ext in object_raw for ext in
                               ('.jpg', '.png', '.ico', '.gif', '.jpeg')):
                            break
                    except Exception as e:
                        print(e)
                        break

                if object_raw is None:
                    break
                links.append(object_raw)
                j += 1

        return links

    def download(self, keywords, limit):
        # Downloads up to `limit` images per keyword into
        # simple_images/<keyword>/.
        keyword_to_search = [str(item).strip() for item in keywords.split(',')]
        main_directory = "simple_images/"

        for keyword in keyword_to_search:
            self._create_directories(main_directory, keyword)
            url = 'https://www.google.com/search?q=' + quote(
                keyword.encode('utf-8')) + '&biw=1536&bih=674&tbm=isch&sxsrf=ACYBGNSXXpS6YmAKUiLKKBs6xWb4uUY5gA:1581168823770&source=lnms&sa=X&ved=0ahUKEwioj8jwiMLnAhW9AhAIHbXTBMMQ_AUI3QUoAQ'
            raw_html = self._download_page(url)

            end_object = -1
            j = 0
            while j < limit:
                object_raw = None
                # Same scanning loop as in urls(): walk the page source for
                # the next quoted image URL.
                while True:
                    try:
                        new_line = raw_html.find('"https://', end_object + 1)
                        if new_line == -1:
                            object_raw = None
                            break
                        end_object = raw_html.find('"', new_line + 1)

                        buffor = raw_html.find('\\', new_line + 1, end_object)
                        if buffor != -1:
                            object_raw = raw_html[new_line + 1:buffor]
                        else:
                            object_raw = raw_html[new_line + 1:end_object]

                        if any(ext in object_raw for ext in
                               ('.jpg', '.png', '.ico', '.gif', '.jpeg')):
                            break
                    except Exception as e:
                        print(e)
                        break

                if object_raw is None:
                    break

                path = main_directory + keyword
                if not os.path.exists(path):
                    os.makedirs(path)

                filename = str(keyword) + "_" + str(j + 1) + ".jpg"
                try:
                    r = requests.get(object_raw, allow_redirects=True)
                    with open(os.path.join(path, filename), 'wb') as f:
                        f.write(r.content)
                except Exception as e:
                    print(e)
                    j -= 1  # retry this slot with the next candidate URL
                j += 1

    def _create_directories(self, main_directory, name):
        # Creates main_directory and the per-keyword sub-directory,
        # tolerating directories that already exist.
        try:
            if not os.path.exists(main_directory):
                os.makedirs(main_directory)
                time.sleep(0.2)
            sub_directory = os.path.join(main_directory, name)
            if not os.path.exists(sub_directory):
                os.makedirs(sub_directory)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

    def _download_page(self, url):
        # Fetches the search results page with a browser-like User-Agent,
        # since Google serves a stripped-down page to unknown clients.
        try:
            headers = {
                'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 "
                              "(KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36"
            }
            req = urllib.request.Request(url, headers=headers)
            resp = urllib.request.urlopen(req)
            return str(resp.read())
        except Exception as e:
            print(e)
            sys.exit(1)
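
For reference, a minimal usage sketch (not part of the committed file), assuming scrapper.py is on the import path; the class and its urls/download methods are as defined above, while the keyword string and limit are illustrative:

from scrapper import simple_image_download

response = simple_image_download()

# Collect candidate image URLs without saving anything.
links = response.urls('cats, dogs', 5)
print(links)

# Download up to 5 images per keyword into simple_images/<keyword>/.
response.download('cats, dogs', 5)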
