Skip to content

Commit 2c0cebb

Browse files
Merge pull request #1400 from keenborder786/master
Google Image Scrapper
2 parents 19495cd + bea2a08 commit 2c0cebb

File tree

7 files changed

+333
-0
lines changed

7 files changed

+333
-0
lines changed

Google-Image-Scrapper/Dockerfile

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
FROM docker.io/condaforge/mambaforge@sha256:a119fe148b8a276397cb7423797f8ee82670e64b071dc39c918b6c3513bd0174

# The Flask app in main.py listens on port 8000 (app.run(..., port=8000)), which
# is also the port the README publishes with `-p 8000:8000`.
EXPOSE 8000

## Creating the new conda environment with the desired packages using mamba
WORKDIR /opt
COPY environment.yml .
RUN mamba env create -f environment.yml
# Only affects interactive shells opened in the container; the ENTRYPOINT below
# calls the environment's interpreter by absolute path.
RUN echo "conda activate amazing_python_script" >> ~/.bashrc

# COPYING THE RELEVANT FILES
COPY static /opt/static
COPY templates /opt/templates
COPY main.py /opt/main.py
COPY scrapper.py /opt/scrapper.py

# Starting the server
ENTRYPOINT ["/opt/conda/envs/amazing_python_script/bin/python","-u", "/opt/main.py"]

Google-Image-Scrapper/README.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Google Image Scrapper
2+
3+
![](http://ForTheBadge.com/images/badges/made-with-python.svg)
4+
5+
6+
7+
## You will need Docker to run the application
8+
9+
10+
## Run the following commands to build and start the application
11+
12+
```console
13+
docker build --tag google_image:1.0 .
14+
docker run --name google_image_flask -p 8000:8000 -v ~/simple_images:/opt/simple_images google_image:1.0
15+
```
16+
- Your downloaded images will be saved to `~/simple_images` on the host machine
17+
18+
***The core logic lives in the `scrapper.py` module, which can be adapted to your use case.***

Google-Image-Scrapper/environment.yml

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
name: amazing_python_script
2+
channels:
3+
- conda-forge
4+
- defaults
5+
- pypi
6+
dependencies:
7+
- _libgcc_mutex=0.1
8+
- _openmp_mutex=4.5
9+
- bzip2=1.0.8
10+
- ca-certificates=2022.12.7
11+
- ld_impl_linux-64=2.40
12+
- libffi=3.4.2
13+
- libgcc-ng=12.2.0
14+
- libgomp=12.2.0
15+
- libnsl=2.0.0
16+
- libsqlite=3.40.0
17+
- libuuid=2.38.1
18+
- libzlib=1.2.13
19+
- ncurses=6.3
20+
- openssl=3.1.0
21+
- pip=23.1.2
22+
- python=3.9.16
23+
- readline=8.2
24+
- setuptools=67.7.2
25+
- tk=8.6.12
26+
- tzdata=2023c
27+
- wheel=0.40.0
28+
- xz=5.2.6
29+
- pip:
30+
- async-generator==1.10
31+
- attrs==23.1.0
32+
- blinker==1.6.2
33+
- certifi==2022.12.7
34+
- charset-normalizer==3.1.0
35+
- click==8.1.3
36+
- dominate==2.7.0
37+
- exceptiongroup==1.1.1
38+
- flask==2.3.1
39+
- flask-bootstrap==3.3.7.1
40+
- flask-modals==0.5.1
41+
- flask-wtf==1.1.1
42+
- google-images-download==2.8.0
43+
- h11==0.14.0
44+
- idna==3.4
45+
- importlib-metadata==6.6.0
46+
- itsdangerous==2.1.2
47+
- jinja2==3.1.2
48+
- markupsafe==2.1.2
49+
- outcome==1.2.0
50+
- pysocks==1.7.1
51+
- requests==2.29.0
52+
- selenium==4.9.0
53+
- simple-image-download==0.2
54+
- sniffio==1.3.0
55+
- sortedcontainers==2.4.0
56+
- trio==0.22.0
57+
- trio-websocket==0.10.2
58+
- urllib3==1.26.15
59+
- visitor==0.1.3
60+
- werkzeug==2.3.2
61+
- wsproto==1.2.0
62+
- wtforms==3.0.1
63+
- zipp==3.15.0

Google-Image-Scrapper/main.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
from flask import Flask, render_template, request, flash, redirect, send_from_directory
2+
from scrapper import simple_image_download
3+
from flask_bootstrap import Bootstrap
4+
from flask_wtf import FlaskForm
5+
from wtforms import StringField, SubmitField, IntegerField, SelectField
6+
from wtforms.validators import DataRequired, Email
7+
import os
8+
9+
# Flask application; templates are loaded from the local "templates" folder.
app = Flask(__name__, template_folder='templates')

# Single scraper instance shared by every request; `index` calls its
# `download` method.
response = simple_image_download()

# Key used by Flask to sign session data (flash messages, CSRF tokens).
# A key committed to source control is known to anyone who can read the
# repository, so deployments should provide their own through the SECRET_KEY
# environment variable. The original literal is kept only as a fallback so
# existing setups keep working unchanged.
app.secret_key = os.environ.get('SECRET_KEY', 'tO$&!|0wkamvVia0?n$NqIRVWOG')

bootstrap = Bootstrap(app)

# One-element list used as a mutable module-level flag: True while a submitted
# request is waiting to be downloaded by the next GET of the index page.
downloaded = [False]

# The most recently submitted form values, read back when the download runs.
image_request = {'name': '', 'number_of_images': 0}
15+
16+
17+
class ImageForm(FlaskForm):
    """Form asking for a search term and how many images to fetch.

    Both inputs carry a ``DataRequired`` validator, so empty submissions are
    rejected before the view acts on them.
    """

    name = StringField(
        label='name',
        validators=[DataRequired()],
    )
    number_of_images = IntegerField(
        label='number_of_images',
        validators=[DataRequired()],
    )
    submit = SubmitField(label='Submit')
21+
22+
23+
@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the search form and run a pending image download.

    The view works in two steps:

    1. A valid POST stores the submitted values in the module-level
       ``image_request`` dict, marks a download as pending and redirects
       back to this page.
    2. The following GET sees the pending flag, performs the (blocking)
       download, flashes a completion message and redirects once more so the
       final GET simply renders the form together with the flashed messages.

    Note that ``downloaded`` and ``image_request`` are module-level objects,
    so the pending request is shared by every client of this process.

    Returns:
        A redirect to ``/`` after a submission or a finished download,
        otherwise the rendered ``index.html`` template.
    """
    form = ImageForm()
    if form.validate_on_submit():
        # Use the values WTForms has already validated and coerced rather
        # than re-reading the raw request payload.
        image_request['name'] = form.name.data
        image_request['number_of_images'] = form.number_of_images.data
        flash('Your images are being downloaded. Please wait.')
        downloaded[0] = True
        return redirect('/')

    if downloaded[0]:
        # Clear the flag *before* downloading: if the download raises, the
        # request fails once instead of being retried on every later visit.
        downloaded[0] = False
        response.download(image_request['name'], int(image_request['number_of_images']))
        flash('All of your images have been downloaded')
        return redirect('/')
    return render_template('index.html', form=form)
39+
40+
41+
if __name__ == '__main__':
    # Bind to every interface (not just localhost) so the server can be
    # reached through a published container port.
    app.run(port=8000, host="0.0.0.0")

Google-Image-Scrapper/scrapper.py

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
import os
2+
import time
3+
import urllib
4+
import requests
5+
from urllib.parse import quote
6+
import array as arr
7+
8+
9+
class simple_image_download:
    """Collect image links from a Google Images result page and download them.

    Keywords are passed as a single comma-separated string; every keyword is
    searched on its own. Image links are recognised by scanning the raw result
    page for double-quoted ``https://`` URLs that contain one of the known
    image extensions.
    """

    # Fixed parts of the search URL; the URL-quoted keyword goes in between.
    _SEARCH_PREFIX = 'https://www.google.com/search?q='
    _SEARCH_SUFFIX = '&biw=1536&bih=674&tbm=isch&sxsrf=ACYBGNSXXpS6YmAKUiLKKBs6xWb4uUY5gA:1581168823770&source=lnms&sa=X&ved=0ahUKEwioj8jwiMLnAhW9AhAIHbXTBMMQ_AUI3QUoAQ'
    _USER_AGENT = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36"
    # A URL counts as an image when it contains any of these substrings.
    _IMAGE_EXTENSIONS = ('.jpg', '.png', '.ico', '.gif', '.jpeg')
    # Upper bound (seconds) for each network call so a stalled server cannot
    # block the caller forever.
    _TIMEOUT_SECONDS = 30

    def __init__(self):
        pass

    def urls(self, keywords, limit):
        """Return up to ``limit`` image links for every keyword.

        Args:
            keywords: Comma-separated search terms, e.g. ``"cat, dog"``.
            limit: Maximum number of links to collect per keyword.

        Returns:
            A flat list of links, keyword after keyword in input order. A
            keyword contributes fewer than ``limit`` links when its result
            page does not contain enough of them.

        Raises:
            OSError: If a result page cannot be fetched.
        """
        links = []
        for keyword in self._split_keywords(keywords):
            raw_html = self._download_page(self._search_url(keyword))
            remaining = limit
            for link in self._iter_image_links(raw_html):
                if remaining <= 0:
                    break
                links.append(link)
                remaining -= 1
        return links

    def download(self, keywords, limit):
        """Download up to ``limit`` images per keyword.

        Files are written to ``simple_images/<keyword>/<keyword>_<n>.jpg``
        (relative to the current working directory), with ``n`` starting at 1.
        A link that cannot be downloaded or saved is reported on stdout and
        skipped; it does not count towards ``limit``.

        Args:
            keywords: Comma-separated search terms, e.g. ``"cat, dog"``.
            limit: Maximum number of images to save per keyword.

        Raises:
            OSError: If a result page cannot be fetched.
        """
        main_directory = "simple_images/"
        for keyword in self._split_keywords(keywords):
            self._create_directories(main_directory, keyword)
            target_directory = os.path.join(main_directory, keyword)
            raw_html = self._download_page(self._search_url(keyword))

            saved = 0
            for link in self._iter_image_links(raw_html):
                if saved >= limit:
                    break
                filename = str(keyword) + "_" + str(saved + 1) + ".jpg"
                try:
                    r = requests.get(link, allow_redirects=True,
                                     timeout=self._TIMEOUT_SECONDS)
                    # Do not store an HTTP error page as if it were an image.
                    r.raise_for_status()
                    with open(os.path.join(target_directory, filename), 'wb') as image_file:
                        image_file.write(r.content)
                except (requests.RequestException, OSError) as e:
                    # Best effort: report the failure and try the next link.
                    print(e)
                    continue
                saved += 1

    def _split_keywords(self, keywords):
        """Split a comma-separated keyword string, dropping empty entries."""
        stripped = (str(item).strip() for item in keywords.split(','))
        return [keyword for keyword in stripped if keyword]

    def _search_url(self, keyword):
        """Build the search URL for a single keyword."""
        return self._SEARCH_PREFIX + quote(keyword.encode('utf-8')) + self._SEARCH_SUFFIX

    def _iter_image_links(self, raw_html):
        """Yield image links found in ``raw_html``, in document order.

        The page is scanned once from start to end, so the generator always
        terminates and never yields the same occurrence twice.
        """
        position = 0
        while True:
            start = raw_html.find('"https://', position)
            if start == -1:
                return
            end = raw_html.find('"', start + 1)
            if end == -1:
                return
            position = end + 1

            # Cut the link at the first backslash (start of an escape
            # sequence in the page source), if there is one.
            cut = raw_html.find('\\', start + 1, end)
            link = raw_html[start + 1:cut] if cut != -1 else raw_html[start + 1:end]

            if any(extension in link for extension in self._IMAGE_EXTENSIONS):
                yield link

    def _create_directories(self, main_directory, name):
        """Make sure ``main_directory/name`` exists, creating both levels if needed."""
        os.makedirs(os.path.join(main_directory, name), exist_ok=True)

    def _download_page(self, url):
        """Fetch ``url`` and return the page body as text.

        Args:
            url: Address of the page to fetch.

        Returns:
            The response body decoded as UTF-8; undecodable bytes are replaced.

        Raises:
            OSError: If the request fails (``urllib.error.URLError`` is a
                subclass). The error is printed and re-raised so the caller
                decides what to do, instead of terminating the whole process.
        """
        # `import urllib` alone does not guarantee the `request` submodule is
        # loaded, so import it explicitly where it is used.
        import urllib.request

        headers = {'User-Agent': self._USER_AGENT}
        req = urllib.request.Request(url, headers=headers)
        try:
            with urllib.request.urlopen(req, timeout=self._TIMEOUT_SECONDS) as resp:
                return resp.read().decode('utf-8', errors='replace')
        except OSError as e:
            print(e)
            raise
34.2 KB
Loading
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
{% extends 'bootstrap/base.html' %}
2+
3+
{% block title %}Flask Bootstrap Form Example{% endblock %}
4+
{% block content %}
5+
<div class="container">
6+
{% for message in get_flashed_messages() %}
7+
<div class="alert alert-warning">
8+
<button type="button" class="close" data-dismiss="alert">&times;</button>
9+
{{ message }}
10+
</div>
11+
{% endfor %}
12+
<div class="row">
13+
<div class="col-sm">
14+
<img src="static/images/google-logo.jpg" alt="Google Logo" width="200" height="100">
15+
</div>
16+
<div class="col-sm">
17+
<h1>Google Image Downloader</h1>
18+
<form method="post" action="/">
19+
{{ form.hidden_tag() }}
20+
<div class="form-group">
21+
<label for="name">Name</label>
22+
{{ form.name(class="form-control", id="name", required="required") }}
23+
</div>
24+
<div class="form-group">
25+
<label for="number_of_images">Number of Images</label>
26+
{{ form.number_of_images(class="form-control", id="number_of_images", required="required") }}
27+
</div>
28+
<button type="submit" class="btn btn-primary">Submit</button>
29+
</form>
30+
</div>
31+
</div>
32+
</div>
33+
34+
<script>
35+
// Example starter JavaScript for disabling form submissions if there are invalid fields
36+
(function () {
37+
'use strict'
38+
39+
// Fetch all the forms we want to apply custom Bootstrap validation styles to
40+
var forms = document.querySelectorAll('.needs-validation')
41+
42+
// Loop over them and prevent submission
43+
Array.prototype.slice.call(forms)
44+
.forEach(function (form) {
45+
form.addEventListener('submit', function (event) {
46+
if (!form.checkValidity()) {
47+
event.preventDefault()
48+
event.stopPropagation()
49+
}
50+
51+
form.classList.add('was-validated')
52+
}, false)
53+
})
54+
})()
55+
</script>
56+
{% endblock %}

0 commit comments

Comments
 (0)