-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathweb_crawler.py
More file actions
80 lines (60 loc) · 2.37 KB
/
web_crawler.py
File metadata and controls
80 lines (60 loc) · 2.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# Script : web_crawler.py
# Author : 0zDxt
import argparse, requests, sys, re, shutil
from rich.console import Console
from urllib.parse import urljoin
console = Console()
# Full terminal width, used to print separator rules across the screen.
width = shutil.get_terminal_size().columns

parser = argparse.ArgumentParser(description='Web crawler script to recursively visit pages within a specified domain, logging crawled URLs and external references.\nRun with: python3 web_crawler.py --domain <URL>')
parser.add_argument('-d', '--domain', required=True, help='the domain to crawl in')
args = parser.parse_args()

dmn = args.domain    # target domain; also seeds the crawl frontier
to_crawl = [dmn]     # frontier: URLs discovered but not yet fetched
crawled = []         # URLs already fetched
external = []        # URLs that point outside the target domain

# ASCII-art banner.  Backslashes are doubled: bare "\ " / "\_" are invalid
# escape sequences and raise SyntaxWarning on Python 3.12+ (the rendered
# output is byte-identical to before).
title = ("\t\t _ ___ _ \n"
         "\t\t __ __ _____| |__ / __|_ _ __ ___ __ _| |___ _ _ \n"
         "\t\t \\ V V / -_) '_ \\ | (__| '_/ _` \\ V V / / -_) '_|\n"
         "\t\t \\_/\\_/\\___|_.__/ \\___|_| \\__,_|\\_/\\_/|_\\___|_| \n")
# Request a webpage and return the HTML text.
def fetch_page(url):
    """Fetch *url*, move it from the frontier to the crawled list, and
    return the response body as text.

    Exits the program on a network-level failure (DNS, connection,
    timeout); HTTP error statuses are not treated as fatal here.
    """
    try:
        # requests has NO default timeout: without one, a single
        # unresponsive host hangs the entire crawl forever.
        response = requests.get(url, timeout=10)
    except requests.exceptions.RequestException as e:
        sys.exit(e)
    # Move the URL out of the frontier so it is never fetched twice.
    if url in to_crawl:
        to_crawl.remove(url)
    # Record the URL as visited.
    crawled.append(url)
    return response.text
# Parse the HTML for links and decide which need to be visited.
def get_linked_pages(html):
    """Extract anchor targets from *html* and sort each resolved URL into
    the global ``to_crawl`` list (in-domain, not yet seen) or the global
    ``external`` list (outside the target domain).
    """
    # Group 1 captures the href target; group 2 the (unused) link text.
    pattern = r'<a\s+href=\"?([^\">\s]+).*?([a-z0-9: ]+)</a>'
    links = re.findall(pattern, html, re.I)
    base = dmn.lower()
    for link in links:
        # Drop any fragment — "#section" points into the same page.
        this_url = link[0].split("#")[0]
        # Resolve relative links against the target domain.
        page = urljoin(dmn, this_url)
        # A link is in-domain only when the resolved URL *starts with* the
        # target domain.  (The previous `re.search(dmn, page)` treated the
        # domain as a regex — "." matched any character — and matched it
        # anywhere in the URL, so an external link whose query string
        # merely mentioned the domain was misclassified as internal.)
        if page.lower().startswith(base):
            if page not in crawled and page not in to_crawl:
                to_crawl.append(page)
        elif page not in external:
            external.append(page)
print("=" * width + "\n")
print(title)
print("\n" + "=" * width )
print(f"\nEnumerating links form {args.domain} ...\n")
# Loopin to_crawl until list is empty
while to_crawl:
for url in to_crawl:
html = fetch_page(url)
get_linked_pages(html)
print("\nCrawled URLs:\n" + "\n".join(crawled))
if not external:
console.print(f"\nNo external links were found in {dmn}", style="red")
else:
print("\nExternal URLs:\n" + "\n".join(external))