@@ -3,8 +3,7 @@
 import shutil
 import subprocess
 import sys
-from urllib import request
-import urllib.error
+import requests
 import webbrowser
 from itertools import repeat
 from pathlib import Path
@@ -15,7 +14,7 @@
     Tuple,
 )

-
+import re
 import nox
 from nox import Session

@@ -67,27 +66,21 @@ def should_filter(url: str) -> bool:
         return url.startswith("mailto") or url in _filtered

     for file in files:
-        cmd = ["python", "-m", "urlscan", "-n", f"{file}"]
-        result = subprocess.run(cmd, capture_output=True)
-        if result.returncode != 0:
-            stderr = result.stderr.decode("utf8")
-            msg = f"Could not retrieve url's from file: {file}, details: {stderr}"
-            raise Exception(msg)
-        stdout = result.stdout.decode("utf8").strip()
-        _urls = (url.strip() for url in stdout.split("\n"))
-        _urls = (url for url in _urls if url)  # filter empty strings and none
-        yield from zip(repeat(file), filter(lambda url: not should_filter(url), _urls))
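+        # match http(s) URLs, stripping trailing punctuation such as '.', ',', ')', ']'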
+        urls = re.findall(r"https?://[^\s<>'\",)\]]+[^\s<>'\",.)\]]", file.read_text())
+        yield from zip(repeat(file), filter(lambda url: not should_filter(url), urls))


 def _doc_links_check(url: str) -> Tuple[Optional[int], str]:
     """Checks if an url is still working (can be accessed)"""
     try:
         # User-Agent needs to be faked otherwise some webpages will deny access with a 403
-        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/10.0"})
-        result = request.urlopen(req)
-        return result.code, f"{result.msg}"
-    except urllib.error.HTTPError as ex:
-        return ex.code, f"{ex}"
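+        # timeout guards against hosts that accept the connection but never respond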
+        result = requests.get(url, timeout=5, headers={"User-Agent": "Mozilla/10.0"})
+        return result.status_code, f"{result.reason}"
+    except requests.exceptions.RequestException as ex:
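+        # connection errors and timeouts carry no HTTP status code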
+        return None, f"{ex}"


 def _git_diff_changes_main() -> int:
@@ -150,10 +143,14 @@ def docs_list_links(session: Session) -> None:
 def docs_links_check(session: Session) -> None:
     """Checks whether all links in the documentation are accessible."""
     errors = []
-    for path, url in _doc_urls(_doc_files(PROJECT_CONFIG.root)):
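+    # materialize the generator so progress can be reported as (n/total)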
+    urls = list(_doc_urls(_doc_files(PROJECT_CONFIG.root)))
+    urls_count = len(urls)
+    for count, (path, url) in enumerate(urls, start=1):
+        print(f"({count}/{urls_count}): {url}")
         status, details = _doc_links_check(url)
         if status != 200:
             errors.append((path, url, status, details))

     if errors:
         session.error(