diff --git a/binderhub/build.py b/binderhub/build.py index f26cc4ba2..ced7a07f3 100644 --- a/binderhub/build.py +++ b/binderhub/build.py @@ -7,6 +7,7 @@ import json import threading from urllib.parse import urlparse +import os from kubernetes import client, watch from tornado.ioloop import IOLoop @@ -235,6 +236,24 @@ def submit(self): if self.git_credentials: env.append(client.V1EnvVar(name='GIT_CREDENTIAL_ENV', value=self.git_credentials)) + # copy additional variables from current environment to new pod environment + proxy_environment_variables = [ + 'http_proxy', + 'https_proxy', + 'HTTP_PROXY', + 'HTTPS_PROXY', + 'no_proxy', + 'NO_PROXY', + ] + + for env_var in proxy_environment_variables: + try: + env.append(client.V1EnvVar(name=env_var, value=os.environ[env_var])) + except KeyError: + # skip the environment variable if it isn't present + pass + + component_label = "binderhub-build" self.pod = client.V1Pod( metadata=client.V1ObjectMeta( name=self.name, diff --git a/binderhub/main.py b/binderhub/main.py index de1a1a711..0a95eedbe 100644 --- a/binderhub/main.py +++ b/binderhub/main.py @@ -3,12 +3,13 @@ """ import urllib.parse -from tornado.httpclient import AsyncHTTPClient, HTTPRequest +from tornado.httpclient import HTTPRequest from tornado.web import HTTPError, authenticated from tornado.httputil import url_concat from tornado.log import app_log from .base import BaseHandler +from .utils import ProxiedAsyncHTTPClient SPEC_NAMES = { "gh": "GitHub", @@ -77,7 +78,7 @@ async def get(self, provider_prefix, _unescaped_spec): # Check if the nbviewer URL is valid and would display something # useful to the reader, if not we don't show it - client = AsyncHTTPClient() + client = ProxiedAsyncHTTPClient() # quote any unicode characters in the URL proto, rest = nbviewer_url.split("://") rest = urllib.parse.quote(rest) diff --git a/binderhub/registry.py b/binderhub/registry.py index 8891041b2..de6521da6 100644 --- a/binderhub/registry.py +++ b/binderhub/registry.py @@ -11,6 
+11,8 @@ from traitlets.config import LoggingConfigurable from traitlets import Dict, Unicode, default +from .utils import ProxiedAsyncHTTPClient + DEFAULT_DOCKER_REGISTRY_URL = "https://registry.hub.docker.com" DEFAULT_DOCKER_AUTH_URL = "https://index.docker.io/v1" @@ -187,7 +189,7 @@ def _default_password(self): @gen.coroutine def get_image_manifest(self, image, tag): - client = httpclient.AsyncHTTPClient() + client = ProxiedAsyncHTTPClient() url = "{}/v2/{}/manifests/{}".format(self.url, image, tag) # first, get a token to perform the manifest request if self.token_url: diff --git a/binderhub/repoproviders.py b/binderhub/repoproviders.py index 3b0ae53a9..b086121dd 100644 --- a/binderhub/repoproviders.py +++ b/binderhub/repoproviders.py @@ -18,13 +18,14 @@ from prometheus_client import Gauge from tornado import gen -from tornado.httpclient import AsyncHTTPClient, HTTPError, HTTPRequest +from tornado.httpclient import HTTPError, HTTPRequest from tornado.httputil import url_concat from traitlets import Dict, Unicode, Bool, default, List, observe from traitlets.config import LoggingConfigurable from .utils import Cache +from .utils import ProxiedAsyncHTTPClient GITHUB_RATE_LIMIT = Gauge('binderhub_github_rate_limit_remaining', 'GitHub rate limit remaining') SHA1_PATTERN = re.compile(r'[0-9a-f]{40}') @@ -217,7 +218,7 @@ class ZenodoProvider(RepoProvider): @gen.coroutine def get_resolved_ref(self): - client = AsyncHTTPClient() + client = ProxiedAsyncHTTPClient() req = HTTPRequest("https://doi.org/{}".format(self.spec), user_agent="BinderHub") r = yield client.fetch(req) @@ -257,7 +258,7 @@ class FigshareProvider(RepoProvider): @gen.coroutine def get_resolved_ref(self): - client = AsyncHTTPClient() + client = ProxiedAsyncHTTPClient() req = HTTPRequest("https://doi.org/{}".format(self.spec), user_agent="BinderHub") r = yield client.fetch(req) @@ -439,7 +440,7 @@ def get_resolved_ref(self): return self.resolved_ref namespace = urllib.parse.quote(self.namespace, safe='') 
class ProxiedAsyncHTTPClient:
    """Wrapper adding environment-based proxy support to tornado's HTTP client.

    The proxy is taken from the first usable variable among ``HTTPS_PROXY``,
    ``https_proxy``, ``HTTP_PROXY`` and ``http_proxy``. Hosts listed in
    ``NO_PROXY`` / ``no_proxy`` (comma separated host names, domain suffixes,
    IPv4 addresses or IPv4 CIDR ranges) are contacted directly.

    See tornado.httpclient.AsyncHTTPClient for usage/documentation.

    NOTE(review): tornado documents proxy options as unsupported by its
    default ``simple_httpclient``; presumably ``curl_httpclient`` has to be
    configured for the proxy settings to take effect -- confirm for the
    deployment.
    """

    def __init__(self):
        self.client = AsyncHTTPClient()

        # use the first proxy environment variable that yields a usable host
        self.http_proxy_host = None
        self.http_proxy_port = None
        for proxy_var in ('HTTPS_PROXY', 'https_proxy', 'HTTP_PROXY', 'http_proxy'):
            proxy = self._parse_proxy_url(os.environ.get(proxy_var, ''))
            if proxy is not None:
                self.http_proxy_host, self.http_proxy_port = proxy
                break

        # sort no_proxy environment variable into CIDR ranges (e.g. 10.0.0.0/8)
        # and "simple" matches (e.g. my-institution.org or 10.1.2.3);
        # when both spellings are set, the lowercase variable takes precedence
        no_proxy = None
        for no_proxy_var in ('NO_PROXY', 'no_proxy'):
            no_proxy = os.environ.get(no_proxy_var, no_proxy)
        self.no_proxy_simple, self.no_proxy_cidr = self._split_no_proxy(no_proxy)

    @staticmethod
    def _parse_proxy_url(proxy_url):
        """Return ``(host, port)`` for a proxy setting, or None if unusable.

        Accepts values with or without a scheme (``http://proxy:3128`` or
        ``proxy:3128``). Without an explicit port, 443 is used for the
        ``https`` scheme and 80 otherwise. Empty values and values without a
        host name yield None so that the caller can try the next variable.
        """
        proxy_url = (proxy_url or '').strip()
        if not proxy_url:
            return None
        if '://' not in proxy_url:
            # urlparse only recognises the host when a scheme is present
            proxy_url = 'http://' + proxy_url
        parsed_proxy = urlparse(proxy_url)
        if not parsed_proxy.hostname:
            return None
        default_port = 443 if parsed_proxy.scheme == 'https' else 80
        return parsed_proxy.hostname, parsed_proxy.port or default_port

    @staticmethod
    def _split_no_proxy(no_proxy):
        """Split a no_proxy value into ``(simple_entries, cidr_entries)``.

        Entries are stripped of whitespace and lowercased; empty entries are
        dropped. Simple entries lose a leading ``*.`` or ``.`` so that
        ``.example.org`` and ``example.org`` are treated the same; a lone
        ``*`` is kept and means "bypass the proxy for every host".
        """
        simple_entries = []
        cidr_entries = []
        for raw_entry in (no_proxy or '').split(','):
            entry = raw_entry.strip().lower()
            if not entry:
                continue
            if ProxiedAsyncHTTPClient._is_cidr_range(entry):
                cidr_entries.append(entry)
                continue
            if entry != '*':
                entry = entry.lstrip('*').lstrip('.')
            if entry:
                simple_entries.append(entry)
        return simple_entries, cidr_entries

    @staticmethod
    def _is_cidr_range(test_string):
        """Return True if ``test_string`` looks like an IPv4 CIDR range (a.b.c.d/n)."""
        range_parts = test_string.split('/')
        if len(range_parts) != 2:
            return False
        ip, suffix = range_parts
        ip_is_valid = ProxiedAsyncHTTPClient._is_ip(ip)
        suffix_is_valid = bool(re.fullmatch('(?:[0-9]|[12][0-9]|3[0-2])', suffix))
        return ip_is_valid and suffix_is_valid

    @staticmethod
    def _is_ip(test_string):
        """Return True if ``test_string`` is a dotted-quad IPv4 address."""
        ip_digit = '(?:1[0-9]?[0-9]|[1-9][0-9]|[0-9]|2[0-4][0-9]|25[0-5])'
        return bool(re.fullmatch(rf'{ip_digit}\.{ip_digit}\.{ip_digit}\.{ip_digit}', test_string))

    def _bypasses_proxy(self, hostname):
        """Return True if ``hostname`` is covered by the no_proxy settings."""
        if not hostname:
            return False
        hostname = hostname.lower()

        if self._is_ip(hostname):
            address = ipaddress.ip_address(hostname)
            # strict=False: accept ranges written with host bits set
            # (e.g. 10.1.2.3/8) instead of raising ValueError per request
            if any(
                address in ipaddress.ip_network(cidr, strict=False)
                for cidr in self.no_proxy_cidr
            ):
                return True

        # match as full domain or last part of it, for example:
        # "my-institution.org" in no_proxy matches "my-institution.org"
        # and subdomains like "www.my-institution.org"
        return any(
            entry == '*' or hostname == entry or hostname.endswith('.' + entry)
            for entry in self.no_proxy_simple
        )

    def fetch(
        self,
        request: Union[str, "HTTPRequest"],
        raise_error: bool = True,
        **kwargs: Any
    ) -> Awaitable["HTTPResponse"]:
        """Executes a request, asynchronously returning an `HTTPResponse`.

        The request is routed through the configured proxy unless its host is
        covered by no_proxy or its scheme is neither http nor https.

        see tornado.httpclient.AsyncHTTPClient.fetch for documentation
        """
        # convert request argument into HTTPRequest if necessary
        if isinstance(request, str):
            request = HTTPRequest(url=request, **kwargs)
            # the keyword arguments are now part of the request object
            kwargs = {}

        # determine correct proxy host and port
        parsed_url = urlparse(request.url)
        use_proxy = (
            self.http_proxy_host
            and parsed_url.scheme in ('http', 'https')
            and not self._bypasses_proxy(parsed_url.hostname)
        )
        if use_proxy:
            request.proxy_host = self.http_proxy_host
            request.proxy_port = self.http_proxy_port

        # pass call on to AsyncHTTPClient's configured implementation; leftover
        # kwargs (only possible alongside an HTTPRequest) are forwarded so that
        # tornado can reject them instead of them being dropped silently
        return self.client.fetch(request, raise_error, **kwargs)

    def close(self):
        """Close the wrapped AsyncHTTPClient."""
        return self.client.close()