-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathweb_crawler.py
More file actions
80 lines (60 loc) · 2.37 KB
/
web_crawler.py
File metadata and controls
80 lines (60 loc) · 2.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# Script : web_crawler.py
# Author : 0zDxt
import argparse, requests, sys, re, shutil
from rich.console import Console
from urllib.parse import urljoin
console = Console()
# Full terminal width, used to print separator rules across the screen.
width = shutil.get_terminal_size().columns

parser = argparse.ArgumentParser(description='Web crawler script to recursively visit pages within a specified domain, logging crawled URLs and external references.\nRun with: python3 web_crawler.py --domain <URL>')
parser.add_argument('-d', '--domain', required=True, help='the domain to crawl in')
args = parser.parse_args()

dmn = args.domain    # target domain; also seeds the crawl frontier
to_crawl = [dmn]     # frontier: URLs discovered but not yet fetched
crawled = []         # URLs already fetched
external = []        # URLs that point outside the target domain

# ASCII-art banner.  Backslashes are doubled: bare "\ " / "\_" are invalid
# escape sequences and raise SyntaxWarning on Python 3.12+ (the rendered
# output is byte-identical to before).
title = ("\t\t _ ___ _ \n"
         "\t\t __ __ _____| |__ / __|_ _ __ ___ __ _| |___ _ _ \n"
         "\t\t \\ V V / -_) '_ \\ | (__| '_/ _` \\ V V / / -_) '_|\n"
         "\t\t \\_/\\_/\\___|_.__/ \\___|_| \\__,_|\\_/\\_/|_\\___|_| \n")
# Request a webpage and return the HTML text.
def fetch_page(url):
    """Fetch *url*, move it from the frontier to the crawled list, and
    return the response body as text.

    Exits the program on a network-level failure (DNS, connection,
    timeout); HTTP error statuses are not treated as fatal here.
    """
    try:
        # requests has NO default timeout: without one, a single
        # unresponsive host hangs the entire crawl forever.
        response = requests.get(url, timeout=10)
    except requests.exceptions.RequestException as e:
        sys.exit(e)
    # Move the URL out of the frontier so it is never fetched twice.
    if url in to_crawl:
        to_crawl.remove(url)
    # Record the URL as visited.
    crawled.append(url)
    return response.text
# Parse the HTML for links and decide which need to be visited.
def get_linked_pages(html):
    """Extract anchor targets from *html* and sort each resolved URL into
    the global ``to_crawl`` list (in-domain, not yet seen) or the global
    ``external`` list (outside the target domain).
    """
    # Group 1 captures the href target; group 2 the (unused) link text.
    pattern = r'<a\s+href=\"?([^\">\s]+).*?([a-z0-9: ]+)</a>'
    links = re.findall(pattern, html, re.I)
    base = dmn.lower()
    for link in links:
        # Drop any fragment — "#section" points into the same page.
        this_url = link[0].split("#")[0]
        # Resolve relative links against the target domain.
        page = urljoin(dmn, this_url)
        # A link is in-domain only when the resolved URL *starts with* the
        # target domain.  (The previous `re.search(dmn, page)` treated the
        # domain as a regex — "." matched any character — and matched it
        # anywhere in the URL, so an external link whose query string
        # merely mentioned the domain was misclassified as internal.)
        if page.lower().startswith(base):
            if page not in crawled and page not in to_crawl:
                to_crawl.append(page)
        elif page not in external:
            external.append(page)
print("=" * width + "\n")
print(title)
print("\n" + "=" * width )
print(f"\nEnumerating links form {args.domain} ...\n")
# Loopin to_crawl until list is empty
while to_crawl:
for url in to_crawl:
html = fetch_page(url)
get_linked_pages(html)
print("\nCrawled URLs:\n" + "\n".join(crawled))
if not external:
console.print(f"\nNo external links were found in {dmn}", style="red")
else:
print("\nExternal URLs:\n" + "\n".join(external))