|
2 | 2 |
|
3 | 3 | __all__ = ['dict2obj', 'tuplify', 'uniqueify', 'is_listy', 'shufflish', 'mapped', 'IterLen', 'ReindexCollection', |
4 | 4 | 'open_file', 'save_pickle', 'load_pickle', 'maybe_open', 'image_size', 'bunzip', 'join_path_file', 'urlread', |
5 | | - 'urljson', 'run', 'do_request', 'sort_by_run', 'trace', 'round_multiple', 'modified_env', 'ContextManagers', |
6 | | - 'str2bool', 'set_num_threads', 'ProcessPoolExecutor', 'ThreadPoolExecutor', 'parallel', 'run_procs', |
7 | | - 'parallel_gen', 'threaded'] |
| 5 | + 'urljson', 'urlwrap', 'urlcheck', 'run', 'do_request', 'sort_by_run', 'trace', 'round_multiple', |
| 6 | + 'modified_env', 'ContextManagers', 'str2bool', 'set_num_threads', 'ProcessPoolExecutor', |
| 7 | + 'ThreadPoolExecutor', 'parallel', 'run_procs', 'parallel_gen', 'threaded'] |
8 | 8 |
|
9 | 9 | # Cell |
10 | 10 | from .imports import * |
|
16 | 16 | from contextlib import contextmanager,ExitStack |
17 | 17 | from pdb import set_trace |
18 | 18 | from urllib.request import Request,urlopen |
19 | | -from urllib.error import HTTPError |
| 19 | +from urllib.error import HTTPError,URLError |
20 | 20 | from urllib.parse import urlencode |
| 21 | +from http.client import InvalidURL |
21 | 22 | from threading import Thread |
22 | 23 |
|
23 | 24 | # Cell |
@@ -186,6 +187,25 @@ def urljson(url, data=None): |
186 | 187 | "Retrieve `url` and decode json" |
187 | 188 | return json.loads(urlread(url, data=data)) |
188 | 189 |
|
| 190 | +# Cell |
# Browser-like user-agent string that `urlwrap` stamps onto every request it returns
_ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'

# Cell
def urlwrap(url):
    "Return `url` as a urllib `Request` (reusing it if it already is one) with the user-agent header set to `_ua`"
    # An existing `Request` is updated in place and handed back; anything else is wrapped in a new one
    req = url if isinstance(url, Request) else Request(url)
    req.headers['User-Agent'] = _ua
    return req
| 199 | + |
| 200 | +# Cell |
def urlcheck(url, timeout=10):
    "Return `True` if `url` is empty or opens with a status below 400 within `timeout` seconds, else `False`"
    # Imported locally so the handler below does not depend on a star import providing `socket`
    import socket
    # An empty/`None` url is treated as "nothing to check" and passes
    if not url: return True
    try:
        with urlopen(urlwrap(url), timeout=timeout) as u: return u.status < 400
    # URLError (which includes HTTPError): unreachable host or HTTP error status
    # socket.timeout: no response within `timeout`
    # InvalidURL: rejected by http.client (e.g. bad port or control characters)
    # ValueError: `Request` could not parse the url (e.g. missing or unknown scheme)
    except (URLError, socket.timeout, InvalidURL, ValueError): return False
| 208 | + |
189 | 209 | # Cell |
190 | 210 | def run(cmd, *rest, ignore_ex=False, as_bytes=False): |
191 | 211 | "Pass `cmd` (splitting with `shlex` if string) to `subprocess.run`; return `stdout`; raise `IOError` if fails" |
|
0 commit comments