-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathimgCrawler.py
More file actions
59 lines (47 loc) · 1.85 KB
/
imgCrawler.py
File metadata and controls
59 lines (47 loc) · 1.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# Script : imgCrawler.py
# Author : 0zDxt
import requests, re, sys, argparse, shutil
from urllib.parse import urljoin
from rich.console import Console
from rich.progress import track
import time
# Rich console object; the script uses it for its styled status messages.
console = Console()

# Width of the current terminal in columns (shutil falls back to a default
# size when the terminal cannot be queried).
width = shutil.get_terminal_size().columns

# Command-line interface: a single mandatory option carrying the page to crawl.
parser = argparse.ArgumentParser(
    description=(
        "Web crawler to parse images from a website.\n"
        "Run with python3 imgCrawler.py --domain <url>"
    ),
)
parser.add_argument(
    "-d",
    "--domain",
    required=True,
    help="the domain to crawl in",
)
args = parser.parse_args()
domain = args.domain
# ASCII-art banner printed when the script starts.
# Every literal backslash of the art is written as "\\" so that "\t" and "\n"
# are the only escape sequences in these literals.  Unescaped pairs such as
# backslash-slash or backslash-underscore are invalid escapes: they still
# produce the same text today but raise a SyntaxWarning on Python 3.12+.
title = (
    "\t\t ___ ___ _ \n"
    "\t\t |_ _|_ __ __ _ __ _ ___ / __|_ _ __ ___ __ _| |___ _ _ \n"
    "\t\t | || ' \\/ _` / _` / -_) | (__| '_/ _` \\ V V / / -_) '_|\n"
    "\t\t |___|_|_|_\\__,_\\__, \\___| \\___|_| \\__,_|\\_/\\_/|_\\___|_| \n"
    "\t\t |___/ "
)
# request a webpage and return the html text
def fetch_page(domain, timeout=10):
    """Download a web page and return its body as text.

    Args:
        domain: URL of the page to request; it is handed unchanged to
            ``requests.get``.
        timeout: Number of seconds to wait for the server before giving up.
            ``requests`` applies no timeout by default, so without this an
            unresponsive host would block the script indefinitely.

    Returns:
        The decoded response body (``response.text``).

    Note:
        Any ``requests`` failure (connection error, timeout, malformed URL,
        ...) terminates the program through ``sys.exit`` with the exception
        as the exit message.
    """
    try:
        response = requests.get(domain, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Passing the exception makes sys.exit print its message to stderr
        # and exit with a non-zero status.
        sys.exit(e)
    return response.text
# Banner: a full-width rule, a blank line, the ASCII-art title, another blank
# line and a closing rule, followed by the status line for the crawl target.
banner_rule = "=" * width
print(banner_rule, "", title, "", banner_rule, sep="\n")
print(f"\nSearching for images from {domain}")
def get_images(html):
    """Return the ``src`` value of every ``<img>`` tag found in *html*.

    The search is a case-insensitive regular-expression scan, not a full HTML
    parse, so it only recognises quoted ``src`` attributes.

    Args:
        html: The page markup as a string.

    Returns:
        A list of the non-empty ``src`` strings, in document order, exactly
        as written in the markup (relative URLs are not resolved). The list
        is empty when no tag matches.
    """
    # * ``\ssrc`` requires whitespace before the attribute name so that
    #   look-alikes such as ``data-src`` are not mistaken for ``src``.
    # * The opening quote is captured and the value must end on the same
    #   quote (``\1``), so a URL may contain the other quote character.
    image_pattern = r'<img\b[^>]*?\ssrc\s*=\s*(["\'])(.*?)\1[^>]*>'

    # NOTE: no artificial delay or simulated progress bar here; the scan is
    # a single pass over the text and finishes immediately.
    # Each match is a (quote, value) tuple; empty values name no image and
    # are skipped.
    return [src for _quote, src in re.findall(image_pattern, html, re.I) if src]
# Fetch the requested page, extract its image sources and report the outcome.
html = fetch_page(domain)
images_found = get_images(html)

if images_found:
    console.print("\nImages found:", style="green")
    # One image source per line.
    print(*images_found, sep="\n")
else:
    console.print("No image found", style="red")
    print(f"The html for {domain} has no <img> tag.")