#!/usr/bin/env python3

# Copyright (C) 2020 Matthew Glazar
# See end of file for extended copyright information.
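
"""Crawl a website and report broken links: HTTP errors, fragments that
do not resolve to an element id, and mailto: addresses that are not in
the allow list."""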

import argparse
import sys
from concurrent.futures import ThreadPoolExecutor as Pool
from concurrent.futures import as_completed
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

allow_mails = [
    # Known-good "mailto:" addresses go here.
]

allowed_content_types_for_soup = [
    "text/html",
]

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3"
}


def check_fragment(soup, fragment) -> bool:
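    """Return True if fragment is empty or names an element id in soup."""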
    if fragment:
        return bool(soup.find_all(id=fragment))
    return True


def is_mailto_link(link) -> bool:
    return link.startswith("mailto:")


class UrlPacket:
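    """A link to check: its URL, the page it appeared on, and its fragment."""
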
    def __init__(self, parent, url, defragged_url=None, fragment=None) -> None:
        self.parent = parent
        self.url = url
        self.defragged_url = defragged_url
        self.fragment = fragment


class UrlNotFound(Exception):
    """Raised when requests returns a response code other than 200."""

    def __init__(self, response_code):
        self.response_code = response_code


def check_response(url) -> requests.Response:
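    """HEAD the URL, falling back to GET; raise UrlNotFound on failure."""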
    response = requests.head(url, headers=headers, timeout=10, allow_redirects=True)
    if not response.ok:
        # Some servers reject HEAD requests; retry with GET before giving up.
        response = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
        if not response.ok:
            raise UrlNotFound(response.status_code)
    return response


class Crawler:
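    """Recursively check every link reachable from a starting page.

    Internal pages are fetched and crawled for more links; external
    links are only checked for a successful response.
    """
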
    def __init__(
        self, site, allowed_content_types_for_soup=allowed_content_types_for_soup
    ) -> None:
        self.site = site
        self.allowed_content_types_for_soup = allowed_content_types_for_soup
        self.site_namespace = urlparse(self.site).netloc
        self.visited_urls = set()
        self.visited_urls_soup = dict()
        self.external_links_to_check = list()
        self.broken_links = list()

        try:
            result = requests.get(self.site, headers=headers, timeout=10)
            soup = BeautifulSoup(result.text, "html.parser")
            self.visited_urls.add(self.site)
            self.visited_urls_soup[self.site] = soup
            self.urls = self.get_urls_from_page(soup)
        except requests.exceptions.RequestException:
            print("(error) failed to get", self.site)
            sys.exit(1)

    def in_namespace(self, url) -> bool:
        return urlparse(url).netloc == self.site_namespace

    def get_urls_from_page(self, soup) -> list:
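        """Collect every href and every <script src=...> on the page."""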
        href_tags = soup.find_all(href=True)
        hrefs = [tag.get("href") for tag in href_tags]
        # Keep script srcs, skipping inline <script> tags that have none.
        script_srcs = [tag.get("src") for tag in soup.find_all("script")]
        hrefs.extend(src for src in script_srcs if src is not None)
        return hrefs

    def in_allowed_file_soup(self, url_response) -> bool:
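        """Return True if the response's media type should be parsed for links."""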
        # Strip parameters such as "; charset=utf-8" before comparing.
        media_type = url_response.headers.get("Content-Type", "").split(";")[0].strip()
        return media_type in self.allowed_content_types_for_soup

    def report_error(self, error, packet) -> None:
        self.broken_links.append(packet.url)
        print(f"({error}) {packet.parent}, {packet.url}")

    def start_crawl(self) -> None:
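        """Crawl internal pages first, then check the collected external links."""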
        self.crawl_and_report(self.site, self.urls)
        self.check_external_links(self.external_links_to_check)

    def get_urls_to_crawl(self, packet) -> list:
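        """Fetch an internal page, verify its fragment, and return its links."""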
        result = requests.get(packet.defragged_url, headers=headers, timeout=10)
        self.visited_urls_soup[packet.defragged_url] = BeautifulSoup(
            result.text, "html.parser"
        )
        soup = self.visited_urls_soup[packet.defragged_url]
        if not check_fragment(soup, packet.fragment):
            self.report_error("fragment missing", packet)
        return self.get_urls_from_page(soup)

    def check_mail_link(self, packet) -> None:
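        """Report a mailto: address unless it is in the allow list."""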
        mail = packet.url.partition(":")[-1]
        if mail not in allow_mails:
            self.report_error("unknown mail", packet)

    def check_internal_links(self, packet) -> None:
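        """Check one internal URL and recurse into it if it is an HTML page."""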
        try:
            response = check_response(packet.defragged_url)
            if self.in_allowed_file_soup(response):
                urls_from_page = self.get_urls_to_crawl(packet)
                if urls_from_page:
                    self.crawl_and_report(response.url, urls_from_page)
        except UrlNotFound as e:
            self.report_error(f"{e.response_code} error", packet)

    def check_external_links(self, urls) -> None:
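        """Check the collected external links concurrently."""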
        if not urls:
            return
        # Cap the pool size: one thread per URL does not scale to large sites.
        with Pool(max_workers=min(len(urls), 32)) as executor:
            future_to_url = {
                executor.submit(check_response, packet.url): packet for packet in urls
            }
            for future in as_completed(future_to_url):
                packet = future_to_url[future]
                try:
                    future.result()
                except UrlNotFound as e:
                    self.report_error(f"{e.response_code} error", packet)
                except Exception as e:
                    print(f"{packet.parent}, {packet.url} generated an exception: {e}")

    def crawl_and_report(self, parent_url, urls) -> None:
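        """Classify each link on a page and dispatch it to the right checker."""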
        for link in urls:
            if is_mailto_link(link):
                self.check_mail_link(UrlPacket(parent_url, link))
            else:
                url = urljoin(parent_url, link)
                if url not in self.visited_urls:
                    self.visited_urls.add(url)
                    defragged_url, fragment = urldefrag(url)
                    if defragged_url not in self.visited_urls_soup:
                        if not self.in_namespace(defragged_url):
                            self.external_links_to_check.append(
                                UrlPacket(parent_url, url)
                            )
                            self.visited_urls_soup[defragged_url] = None
                        else:
                            self.check_internal_links(
                                UrlPacket(parent_url, url, defragged_url, fragment)
                            )
                    else:
                        soup = self.visited_urls_soup[defragged_url]
                        if soup is not None and not check_fragment(soup, fragment):
                            self.report_error(
                                "fragment missing",
                                UrlPacket(parent_url, url, fragment=fragment),
                            )


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("url", type=str)
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()
    crawler = Crawler(args.url)
    crawler.start_crawl()

    if len(crawler.broken_links) > 0:
        sys.exit(1)


if __name__ == "__main__":
    main()

# quick-lint-js finds bugs in JavaScript programs.
# Copyright (C) 2020 Matthew Glazar
#
# This file is part of quick-lint-js.
#
# quick-lint-js is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# quick-lint-js is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with quick-lint-js. If not, see <https://www.gnu.org/licenses/>.