|
3 | 3 | from hashlib import blake2b |
4 | 4 |
|
5 | 5 | from traitlets import Integer, TraitError |
| 6 | +from tornado.httpclient import AsyncHTTPClient, HTTPRequest, HTTPResponse |
| 7 | +from typing import Any, Union, Awaitable |
| 8 | +from urllib.parse import urlparse |
| 9 | +import ipaddress |
| 10 | +import re |
| 11 | +import os |
6 | 12 |
|
7 | 13 |
|
8 | 14 | def blake2b_hash_as_int(b): |
@@ -113,6 +119,106 @@ def set(self, key, value): |
113 | 119 | self.pop(first_key) |
114 | 120 |
|
115 | 121 |
|
class ProxiedAsyncHTTPClient():
    """Wrapper adding automatic proxy support to tornado's non-blocking HTTP client.

    The proxy is read once, at construction time, from the first of the
    environment variables ``HTTPS_PROXY``, ``https_proxy``, ``HTTP_PROXY``,
    ``http_proxy`` that is set.  Hosts listed in ``NO_PROXY`` / ``no_proxy``
    (the lower-case variable wins if both are set) are contacted directly.

    Attributes:
        client: the wrapped ``AsyncHTTPClient`` instance.
        http_proxy_host: proxy hostname, or ``None`` when no proxy is configured.
        http_proxy_port: proxy port, or ``None`` when no proxy variable is set.
        no_proxy_simple: normalised host names / IPs that bypass the proxy.
        no_proxy_cidr: IPv4 CIDR ranges (as strings) that bypass the proxy.

    NOTE: per tornado's documentation, ``proxy_host``/``proxy_port`` are only
    honoured by ``curl_httpclient``; other implementations may reject them.

    See tornado.httpclient.AsyncHTTPClient for usage/documentation.
    """

    def __init__(self):
        self.client = AsyncHTTPClient()

        # use the first found proxy environment variable
        self.http_proxy_host, self.http_proxy_port = self._proxy_from_env(os.environ)

        # later names override earlier ones, i.e. ``no_proxy`` beats ``NO_PROXY``
        no_proxy = None
        for no_proxy_var in ('NO_PROXY', 'no_proxy'):
            no_proxy = os.environ.get(no_proxy_var, no_proxy)

        # sort no_proxy into CIDR ranges (e.g. 10.0.0.0/8) and "simple"
        # matches (e.g. my-institution.org or 10.1.2.3)
        self.no_proxy_simple, self.no_proxy_cidr = self._split_no_proxy(no_proxy)

    @staticmethod
    def _proxy_from_env(environ):
        """Return ``(host, port)`` of the proxy configured in *environ*.

        The first variable that is present decides, even if its value is
        empty or has no host (then ``host`` is ``None``).  Returns
        ``(None, None)`` when none of the variables is present.  A missing
        port defaults to 443 for ``https`` proxies and 80 otherwise.
        """
        for proxy_var in ('HTTPS_PROXY', 'https_proxy', 'HTTP_PROXY', 'http_proxy'):
            if proxy_var not in environ:
                continue
            proxy_url = environ[proxy_var].strip()
            if '//' not in proxy_url:
                # "host:port" without a scheme would otherwise be parsed as
                # scheme/path, leaving the hostname empty
                proxy_url = 'http://' + proxy_url
            parsed_proxy = urlparse(proxy_url)
            default_port = 443 if parsed_proxy.scheme == 'https' else 80
            return parsed_proxy.hostname, parsed_proxy.port or default_port
        return None, None

    @staticmethod
    def _split_no_proxy(no_proxy):
        """Split a comma separated no_proxy value into ``(simple, cidr)`` lists.

        Entries are stripped of surrounding whitespace and empty entries are
        dropped.  Simple entries are lower-cased and lose leading dots, so
        that ``.Example.org`` is stored as ``example.org``.
        """
        simple = []
        cidr = []
        for part in (no_proxy or '').split(','):
            part = part.strip()
            if not part:
                continue
            if ProxiedAsyncHTTPClient._is_cidr_range(part):
                cidr.append(part)
                continue
            part = part.lower().lstrip('.')
            if part:
                simple.append(part)
        return simple, cidr

    @staticmethod
    def _is_cidr_range(test_string):
        """Return True if *test_string* looks like an IPv4 CIDR range (a.b.c.d/n)."""
        range_parts = test_string.split('/')
        if len(range_parts) != 2:
            return False
        ip, suffix = range_parts
        ip_is_valid = ProxiedAsyncHTTPClient._is_ip(ip)
        suffix_is_valid = bool(re.fullmatch(r'(?:[0-9]|[12][0-9]|3[0-2])', suffix))
        return ip_is_valid and suffix_is_valid

    @staticmethod
    def _is_ip(test_string):
        """Return True if *test_string* is a dotted-quad IPv4 address."""
        ip_digit = '(?:1[0-9]?[0-9]|[1-9][0-9]|[0-9]|2[0-4][0-9]|25[0-5])'
        return bool(re.fullmatch(rf'{ip_digit}\.{ip_digit}\.{ip_digit}\.{ip_digit}', test_string))

    def _bypasses_proxy(self, hostname):
        """Return True if *hostname* is covered by the no_proxy configuration."""
        hostname = hostname.lower()

        if self._is_ip(hostname):
            address = ipaddress.ip_address(hostname)
            for no_proxy_cidr in self.no_proxy_cidr:
                # strict=False: accept ranges with host bits set ("10.1.2.3/8")
                # instead of raising ValueError on every request
                if address in ipaddress.ip_network(no_proxy_cidr, strict=False):
                    return True

        for no_proxy_simple in self.no_proxy_simple:
            if no_proxy_simple == '*':
                # conventional wildcard: never use the proxy
                return True
            # match the full domain or any subdomain of it, for example
            # "my-institution.org" also matches "www.my-institution.org"
            if hostname == no_proxy_simple or hostname.endswith('.' + no_proxy_simple):
                return True
        return False

    def fetch(
        self,
        request: Union[str, "HTTPRequest"],
        raise_error: bool = True,
        **kwargs: Any
    ) -> Awaitable["HTTPResponse"]:
        """Executes a request, asynchronously returning an `HTTPResponse`.

        When a proxy is configured, the request uses ``http`` or ``https``
        and the target host is not excluded via no_proxy, ``proxy_host`` and
        ``proxy_port`` are set on the request before it is handed to the
        wrapped client.  A passed-in `HTTPRequest` is modified in place.

        Raises:
            ValueError: if keyword arguments are combined with an
                `HTTPRequest` object (they could not be applied to it).

        see tornado.httpclient.AsyncHTTPClient.fetch for documentation
        """
        # convert request argument into HTTPRequest if necessary
        if isinstance(request, str):
            request = HTTPRequest(url=request, **kwargs)
        elif kwargs:
            # mirror AsyncHTTPClient.fetch instead of silently dropping them
            raise ValueError("kwargs can't be used if request is an HTTPRequest object")

        # determine correct proxy host and port
        parsed_url = urlparse(request.url)
        if self.http_proxy_host and parsed_url.scheme in ('http', 'https'):
            if not self._bypasses_proxy(parsed_url.hostname or ''):
                request.proxy_host = self.http_proxy_host
                request.proxy_port = self.http_proxy_port

        # pass call on to AsyncHTTPClient's configured implementation
        return self.client.fetch(request, raise_error)

    def close(self):
        """Close the wrapped client and return whatever its ``close()`` returns."""
        return self.client.close()
| 220 | + |
| 221 | + |
116 | 222 | def url_path_join(*pieces): |
117 | 223 | """Join components of url into a relative url. |
118 | 224 |
|
|
0 commit comments