-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlinkCralwl.py
More file actions
executable file
·125 lines (98 loc) · 3.05 KB
/
linkCralwl.py
File metadata and controls
executable file
·125 lines (98 loc) · 3.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/python
import requests
import urllib.error
import urllib.request
import urllib.parse
import random
import sys
import argparse
from bs4 import BeautifulSoup
__author__ = "polish"
_description = "Scan and collect all links on a website."
# CLI: a single optional --url argument naming the site to crawl.
parser = argparse.ArgumentParser(description= _description)
parser.add_argument('--url', metavar='url', type=str, help='The site url for scanning')
# NOTE(review): ROOT_DIR is defined but never used in this file.
ROOT_DIR = "./"
# Pool of browser User-Agent strings; one is picked at random per run.
USERAGENTS = [
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
]
# helpers: pick a User-Agent, fetch page contents #
def randomUserAgent():
    """Return one User-Agent string drawn uniformly from USERAGENTS."""
    return USERAGENTS[random.randrange(len(USERAGENTS))]
USERAGENT = randomUserAgent()
def get_content(url):
    """Fetch *url* and return the raw response body (bytes).

    Returns an empty string on any request failure so the crawl keeps
    going; hrefs with unsupported schemes (mailto:, ftp:, ...) are
    additionally logged via collect_wrong_urls.
    """
    try:
        # NOTE(review): verify=False disables TLS certificate checking --
        # tolerable for a crawling experiment, not for production use.
        req = requests.get(url, headers={'User-Agent': USERAGENT}, verify=False)
        return req.content
    except requests.exceptions.InvalidSchema:
        # href uses a scheme requests cannot handle; record it for review.
        collect_wrong_urls(url)
        return ""
    except requests.exceptions.RequestException as e:
        # BUG FIX: the original caught urllib.error.HTTPError, which
        # requests never raises -- that handler was dead code, and any
        # connection/HTTP error crashed the crawl. RequestException is
        # the base of all requests failures. (InvalidSchema subclasses
        # it, so it must be caught first, above.)
        print("<--404-->", e)
        return ""
def collect_wrong_urls(url):
    """Append *url* to malformed_urls.txt for later inspection.

    BUG FIX: the original wrote entries with no separator, fusing every
    recorded URL into one unreadable line; a trailing newline keeps one
    URL per line. The open/close pair is replaced by a context manager
    so the handle cannot leak if the write raises.
    """
    with open("malformed_urls.txt", "a") as f:
        f.write(url + "\n")
def generateFilename(url):
    """Derive a filesystem-safe output filename from *url* by deleting
    every '/' and ':' character."""
    return url.translate(str.maketrans("", "", "/:"))
def apply_link_filter(link):
    """Return True when *link* is a real crawl target.

    Rejects empty hrefs, bare fragment anchors and javascript
    pseudo-links.

    BUG FIX: the original compared strings with ``is``, which tests
    object *identity* and only happened to work for interned literals --
    an equal but separately-constructed string slipped through. A
    membership (equality) test is the correct check.
    """
    return link not in ("", "#", "javascript://")
def get_all_links_from(current_url, m, h):
    """Download *current_url* and return its hrefs as absolute URLs.

    Each href is normalised through create_valid_url (m is the site
    root URL, h is the scheme, e.g. 'http') and kept only if it passes
    apply_link_filter.
    """
    soup = BeautifulSoup(get_content(current_url), 'html.parser')
    candidates = (create_valid_url(tag['href'], m, h)
                  for tag in soup.findAll(href=True))
    return [link for link in candidates if apply_link_filter(link)]
def write_current_urls_into_file(f, c, l):
    """Write parent URL *c* followed by its child links *l* to the open
    file object *f*, one entry per line."""
    lines = ['PARENT URL>>>>' + str(c) + "\n"]
    lines.extend(link + "\n" for link in l)
    f.writelines(lines)
def create_valid_url(url, main_url, http_prefix):
    """Normalise a raw href into an absolute URL.

    Absolute http(s) URLs pass through untouched; scheme-relative
    ('//'), root-relative ('/') and plain/parent-relative links are
    resolved against http_prefix / main_url.
    """
    # Guard clauses instead of an if/elif ladder. Checking 'http' first
    # is safe: no absolute URL starts with '/'.
    if url.startswith("http"):
        return url
    if url.startswith("//"):
        return http_prefix + ":" + url
    if url.startswith("/"):
        return main_url + url
    # '../' and bare relative paths are both tacked onto the site root
    # (the original handled them in separate, identical branches).
    return main_url + "/" + url
def ready(_url):
    """Breadth-first crawl starting at *_url*, writing every page's
    outgoing links to a file named after the start URL.

    Exits the process when no URL was supplied.
    """
    # Local import keeps this fix self-contained in a flat script.
    from collections import deque

    if _url is None:  # fixed: identity check for None, not '== None'
        # fixed: the original message pointed at a nonexistent ./lincCrawl
        print("No url given. Usage: ./linkCralwl.py --url <siteurl>")
        sys.exit()
    requests.packages.urllib3.disable_warnings()
    counter = 0
    main_url = _url
    http_prefix = _url.split("://")[0]
    link_pool = deque([_url])   # deque: O(1) popleft vs O(n) list.pop(0)
    visited_links = set()       # set: O(1) membership vs O(n) list scan
    # Context manager guarantees the output file is closed even if the
    # crawl raises (the original leaked the handle on any error).
    with open(generateFilename(_url), 'w') as f_http:
        while link_pool:
            current_url = link_pool.popleft()
            if current_url in visited_links:
                continue
            counter += 1
            print(current_url, counter, len(link_pool))
            links = get_all_links_from(current_url, main_url, http_prefix)
            write_current_urls_into_file(f_http, current_url, links)
            link_pool.extend(links)
            visited_links.add(current_url)
# Script entry point: parse CLI arguments and start the crawl.
# NOTE(review): runs at import time too -- no __main__ guard.
args = parser.parse_args()
ready(args.url)