Commit a40bd21

check broken links on website #194
1 parent 8a980ca commit a40bd21

File tree

1 file changed: +216 −0 lines changed

tools/check-links

Lines changed: 216 additions & 0 deletions
@@ -0,0 +1,216 @@
#!/usr/bin/env python3

# Copyright (C) 2020 Matthew Glazar
# See end of file for extended copyright information.

import argparse
import re
import sys
from concurrent.futures import ThreadPoolExecutor as Pool
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

allow_mails = [
]

allowed_file_ext_for_soup = [
    "text/html",
]

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3"
}


def check_fragment(soup, fragment) -> bool:
    if fragment:
        return bool(soup.find_all(id=fragment))
    return True


def is_mailto_link(link) -> bool:
    return bool(re.search(r"^mailto:", link))


class UrlPacket(object):
    def __init__(self, parent, url, defraged_url=None, fragment=None) -> None:
        self.parent = parent
        self.url = url
        self.defraged_url = defraged_url
        self.fragment = fragment


class UrlNotFound(Exception):
    """Raised when requests returns response code other than 200"""

    def __init__(self, response_code):
        self.response_code = response_code


def check_response(url) -> requests.Response:
    response = requests.head(url, headers=headers, timeout=10, allow_redirects=True)
    if not response.ok:
        # Some servers reject HEAD requests; retry with GET before treating
        # the URL as broken.
        if not requests.get(url, headers=headers, timeout=10).ok:
            raise UrlNotFound(response.status_code)
    return response


class Crawler:
    def __init__(
        self, site, allowed_file_ext_for_soup=allowed_file_ext_for_soup
    ) -> None:
        self.site = site
        self.allowed_file_ext_for_soup = allowed_file_ext_for_soup
        self.site_namespace = urlparse(self.site).netloc
        self.visted_urls = list()
        self.visted_urls_soup = dict()
        self.external_links_to_check = list()
        self.broken_links = list()

        try:
            result = requests.get(self.site)
            soup = BeautifulSoup(result.text, "html.parser")
            self.visted_urls.append(self.site)
            self.visted_urls_soup[self.site] = soup
            self.urls = self.get_urls_from_page(soup)
        except requests.exceptions.ConnectionError:
            print("(error) failed to get", self.site)
            sys.exit(1)

    def in_namespace(self, url) -> bool:
        return urlparse(url).netloc == self.site_namespace

    def get_urls_from_page(self, soup) -> list:
        href_tags = soup.find_all(href=True)
        hrefs = [tag.get("href") for tag in href_tags]
        script_srcs = [tag.get("src") for tag in soup.find_all("script")]
        # Skip <script> tags without a src attribute (inline scripts) instead
        # of dropping every script URL when one is present.
        hrefs.extend(src for src in script_srcs if src is not None)
        return hrefs

    def in_allowed_file_soup(self, url_response) -> bool:
        return url_response.headers["Content-type"] in self.allowed_file_ext_for_soup

    def report_error(self, error, packet) -> None:
        self.broken_links.append(packet.url)
        print(f"({error}) {packet.parent}, {packet.url}")

    def start_crawl(self) -> None:
        self.crawl_and_report(self.site, self.urls)
        self.check_external_links(self.external_links_to_check)

    def get_urls_to_crawl(self, packet) -> list:
        urls_from_page = list()
        result = requests.get(packet.defraged_url)
        self.visted_urls_soup[packet.defraged_url] = BeautifulSoup(
            result.text, "html.parser"
        )
        if not check_fragment(
            self.visted_urls_soup[packet.defraged_url], packet.fragment
        ):
            self.report_error("fragment missing", packet)
        urls_from_page = self.get_urls_from_page(
            self.visted_urls_soup[packet.defraged_url]
        )
        return urls_from_page

    def check_mail_link(self, packet) -> None:
        mail = packet.url.partition(":")[-1]
        if mail not in allow_mails:
            self.report_error("unknown mail", packet)

    def check_internal_links(self, packet) -> None:
        try:
            response = check_response(packet.defraged_url)
            if self.in_allowed_file_soup(response):
                urls_from_page = self.get_urls_to_crawl(packet)
                if len(urls_from_page) != 0:
                    self.crawl_and_report(response.url, urls_from_page)
        except UrlNotFound as e:
            self.report_error(f"{e.response_code} error", packet)

    def check_external_links(self, urls) -> None:
        # ThreadPoolExecutor rejects max_workers=0, so skip when there is
        # nothing to check.
        if not urls:
            return
        with Pool(max_workers=len(urls)) as executor:
            future_to_url = {
                executor.submit(check_response, packet.url): packet for packet in urls
            }
            for future in future_to_url:
                packet = future_to_url[future]
                try:
                    future.result()
                except UrlNotFound as e:
                    self.report_error(f"{e.response_code} error", packet)
                except Exception as e:
                    print(f"{packet.parent}, {packet.url} generated an exception: {e}")

    def crawl_and_report(self, parent_url, urls) -> None:
        for link in urls:
            if is_mailto_link(link):
                self.check_mail_link(UrlPacket(parent_url, link))
            else:
                url = urljoin(parent_url, link)
                if url not in self.visted_urls:
                    self.visted_urls.append(url)
                    defraged_url, fragment = urldefrag(url)
                    if defraged_url not in self.visted_urls_soup:
                        if not self.in_namespace(defraged_url):
                            # External pages are only pinged later, not crawled.
                            self.external_links_to_check.append(
                                UrlPacket(parent_url, url)
                            )
                            self.visted_urls_soup[defraged_url] = None
                        else:
                            self.check_internal_links(
                                UrlPacket(parent_url, url, defraged_url, fragment)
                            )
                    else:
                        # Page already fetched; only the fragment needs checking.
                        soup = self.visted_urls_soup[defraged_url]
                        if soup is not None and not check_fragment(soup, fragment):
                            self.report_error(
                                "fragment missing",
                                UrlPacket(parent_url, url, fragment=fragment),
                            )


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("url", type=str)
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()
    crawler = Crawler(args.url)
    crawler.start_crawl()

    if len(crawler.broken_links) > 0:
        sys.exit(1)


if __name__ == "__main__":
    main()

# quick-lint-js finds bugs in JavaScript programs.
# Copyright (C) 2020 Matthew Glazar
#
# This file is part of quick-lint-js.
#
# quick-lint-js is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# quick-lint-js is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with quick-lint-js. If not, see <https://www.gnu.org/licenses/>.
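
A plausible way to run the new checker, assuming the script is executable (the URL below is a placeholder, not taken from this commit):

    ./tools/check-links https://example.com/

The crawler walks every page under the given host, pings external links with HEAD (falling back to GET), and exits with status 1 if any broken link, missing #fragment, or unrecognized mailto: address was reported, so the check can fail a CI job.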
