Merged
2 changes: 1 addition & 1 deletion dev-requirements.txt
@@ -5,4 +5,4 @@ pytest>=4.6
 wheel
 pytest-cov
 pre-commit
-requests
+requests_mock
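
requests moves out of the dev requirements (it becomes a runtime dependency in setup.py below), while requests_mock comes in so the test suite can stub HTTP at the transport level. A sketch of the kind of test this enables; the test name, URL, and payload are illustrative, not taken from this PR:

    import requests

    # requests-mock ships a pytest plugin exposing a requests_mock fixture
    def test_fetch_record_metadata(requests_mock):
        # register a stub: any GET to this URL returns the given JSON body
        requests_mock.get(
            "https://zenodo.org/api/records/3242074",
            json={"files": [], "metadata": {}},
        )
        resp = requests.get("https://zenodo.org/api/records/3242074")
        assert resp.json() == {"files": [], "metadata": {}}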
17 changes: 6 additions & 11 deletions repo2docker/contentproviders/dataverse.py
@@ -2,7 +2,6 @@
 import json
 import shutil
 
-from urllib.request import Request
 from urllib.parse import urlparse, urlunparse, parse_qs
 
 from .doi import DoiProvider
@@ -56,7 +55,6 @@ def detect(self, doi, ref=None, extra_args=None):
             return
 
         query_args = parse_qs(parsed_url.query)
-
         # Corner case handling
         if parsed_url.path.startswith("/file.xhtml"):
             # There's no way of getting file information using its persistentId, the only thing we can do is assume that doi
@@ -75,8 +73,7 @@ def detect(self, doi, ref=None, extra_args=None):
             parsed_url._replace(path="/api/search", query=search_query)
         )
         self.log.debug("Querying Dataverse: " + search_url)
-        resp = self.urlopen(search_url).read()
-        data = json.loads(resp.decode("utf-8"))["data"]
+        data = self.urlopen(search_url).json()["data"]
         if data["count_in_response"] != 1:
             self.log.debug(
                 "Dataverse search query failed!\n - doi: {}\n - url: {}\n - resp: {}\n".format(
@@ -101,14 +98,12 @@ def fetch(self, spec, output_dir, yield_output=False):
         host = spec["host"]
 
         yield "Fetching Dataverse record {}.\n".format(record_id)
-        req = Request(
-            "{}/api/datasets/:persistentId?persistentId={}".format(
-                host["url"], record_id
-            ),
-            headers={"accept": "application/json"},
-        )
-        resp = self.urlopen(req)
-        record = json.loads(resp.read().decode("utf-8"))["data"]
+        url = "{}/api/datasets/:persistentId?persistentId={}".format(
+            host["url"], record_id
+        )
+        resp = self.urlopen(url, headers={"accept": "application/json"})
+        record = resp.json()["data"]
 
         for fobj in deep_get(record, "latestVersion.files"):
             file_url = "{}/api/access/datafile/{}".format(
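
A pattern worth noting, since it repeats in every provider below: json.loads(resp.read().decode("utf-8")) collapses to resp.json(), because a requests Response knows how to decode and parse its own body. A minimal equivalence sketch (the URL is illustrative):

    import json
    import requests
    from urllib.request import urlopen

    url = "https://dataverse.harvard.edu/api/search?q=data"

    # urllib style: read raw bytes, decode, parse by hand
    data_old = json.loads(urlopen(url).read().decode("utf-8"))

    # requests style: one call on the Response object
    data_new = requests.get(url).json()
    # for the same response body, data_old == data_new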
95 changes: 57 additions & 38 deletions repo2docker/contentproviders/doi.py
@@ -5,8 +5,8 @@
 from os import makedirs
 from os import path
-from urllib import request  # urlopen, Request
-from urllib.error import HTTPError
+from requests import Session, HTTPError
 
 from zipfile import ZipFile, is_zipfile
 
 from .base import ContentProvider
@@ -18,7 +18,21 @@
 class DoiProvider(ContentProvider):
     """Provide contents of a repository identified by a DOI and some helper functions."""
 
-    def urlopen(self, req, headers=None):
+    def __init__(self):
+        super().__init__()
+        self.session = Session()
+        self.session.headers.update(
+            {
+                "user-agent": "repo2docker {}".format(__version__),
+            }
+        )
+
+    def _request(self, url, **kwargs):
+        return self.session.get(url, **kwargs)
+
+    urlopen = _request
+
+    def _urlopen(self, req, headers=None):
         """A urlopen() helper"""
         # someone passed a string, not a request
         if not isinstance(req, request.Request):
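
One subtlety in the hunk above: urlopen = _request is a class-body alias, so both names resolve to the same function and existing callers of self.urlopen() keep working unchanged. A minimal sketch of the idiom (the Example class is hypothetical):

    class Example:
        def _request(self, url, **kwargs):
            return ("GET", url, kwargs)

        urlopen = _request  # alias created at class-definition time

    e = Example()
    assert e.urlopen("https://doi.org") == e._request("https://doi.org")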
Expand All @@ -38,7 +52,8 @@ def doi2url(self, doi):
doi = normalize_doi(doi)

try:
resp = self.urlopen("https://doi.org/{}".format(doi))
resp = self._request("https://doi.org/{}".format(doi))
resp.raise_for_status()
# If the DOI doesn't resolve, just return URL
except HTTPError:
return doi
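
The added raise_for_status() call is what keeps the except HTTPError branch meaningful: unlike urllib, requests does not raise on a 4xx/5xx response by itself. A short sketch of the behavior (the DOI is illustrative):

    import requests

    resp = requests.get("https://doi.org/10.5281/zenodo.3242074")
    try:
        resp.raise_for_status()  # raises requests.HTTPError on 4xx/5xx
        resolved = resp.url      # final URL after redirects were followed
    except requests.HTTPError:
        resolved = None          # DOI did not resolve; doi2url() returns the raw DOI instead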
@@ -53,38 +68,42 @@ def fetch_file(self, file_ref, host, output_dir, unzip=False):
         file_url = deep_get(file_ref, host["download"])
         fname = deep_get(file_ref, host["filename"])
         logging.debug("Downloading file {} as {}\n".format(file_url, fname))
-        with self.urlopen(file_url) as src:
-            if path.dirname(fname):
-                sub_dir = path.join(output_dir, path.dirname(fname))
-                if not path.exists(sub_dir):
-                    yield "Creating {}\n".format(sub_dir)
-                    makedirs(sub_dir, exist_ok=True)
-
-            dst_fname = path.join(output_dir, fname)
-            with open(dst_fname, "wb") as dst:
-                yield "Fetching {}\n".format(fname)
-                shutil.copyfileobj(src, dst)
-            # first close the newly written file, then continue
-            # processing it
-            if unzip and is_zipfile(dst_fname):
-                yield "Extracting {}\n".format(fname)
-                zfile = ZipFile(dst_fname)
-                zfile.extractall(path=output_dir)
-                zfile.close()
-
-                # delete downloaded file ...
-                os.remove(dst_fname)
-                # ... and any directories we might have created,
-                # in which case sub_dir will be defined
-                if path.dirname(fname):
-                    shutil.rmtree(sub_dir)
-
-                new_subdirs = os.listdir(output_dir)
-                # if there is only one new subdirectory move its contents
-                # to the top level directory
-                if len(new_subdirs) == 1:
-                    d = new_subdirs[0]
-                    copytree(path.join(output_dir, d), output_dir)
-                    shutil.rmtree(path.join(output_dir, d))
-
-            yield "Fetched files: {}\n".format(os.listdir(output_dir))
+
+        yield "Requesting {}\n".format(file_url)
+        resp = self._request(file_url, stream=True)
+        resp.raise_for_status()
+
+        if path.dirname(fname):
+            sub_dir = path.join(output_dir, path.dirname(fname))
+            if not path.exists(sub_dir):
+                yield "Creating {}\n".format(sub_dir)
+                makedirs(sub_dir, exist_ok=True)
+
+        dst_fname = path.join(output_dir, fname)
+        with open(dst_fname, "wb") as dst:
+            yield "Fetching {}\n".format(fname)
+            for chunk in resp.iter_content(chunk_size=None):
+                dst.write(chunk)
+
+        if unzip and is_zipfile(dst_fname):
+            yield "Extracting {}\n".format(fname)
+            zfile = ZipFile(dst_fname)
+            zfile.extractall(path=output_dir)
+            zfile.close()
+
+            # delete downloaded file ...
+            os.remove(dst_fname)
+            # ... and any directories we might have created,
+            # in which case sub_dir will be defined
+            if path.dirname(fname):
+                shutil.rmtree(sub_dir)
+
+            new_subdirs = os.listdir(output_dir)
+            # if there is only one new subdirectory move its contents
+            # to the top level directory
+            if len(new_subdirs) == 1:
+                d = new_subdirs[0]
+                copytree(path.join(output_dir, d), output_dir)
+                shutil.rmtree(path.join(output_dir, d))
+
+        yield "Fetched files: {}\n".format(os.listdir(output_dir))
Member commented (on the yield "Fetching {}\n".format(fname) line):

    Same comment as further down: useful for the average user on a day-to-day basis, or more of a debugging help?

@douardda (Contributor, Author) replied, Jan 18, 2021:

    Good question. Previously, this fetch_file() method did log (as generated strings):

      • the directory creation ("Creating [...]"),
      • the saving of the downloaded zip file in dst_fname ("Fetching [...]"),
      • the zip file extraction ("Extracting [...]"),
      • the list of extracted files ("Extracted [...]").

    In my version of the code, the only thing logged in addition to these is the "Requesting [...]" line, in which I display the actually retrieved URL (after DOI resolution), which I thought would be useful in a log.
6 changes: 3 additions & 3 deletions repo2docker/contentproviders/figshare.py
@@ -25,6 +25,7 @@ class Figshare(DoiProvider):
     """
 
     def __init__(self):
+        super().__init__()
         self.hosts = [
             {
                 "hostname": [
@@ -74,13 +75,12 @@ def fetch(self, spec, output_dir, yield_output=False):
         yield "Fetching Figshare article {} in version {}.\n".format(
             article_id, article_version
         )
-        req = Request(
+        resp = self.urlopen(
             "{}{}/versions/{}".format(host["api"], article_id, article_version),
             headers={"accept": "application/json"},
         )
-        resp = self.urlopen(req)
 
-        article = json.loads(resp.read().decode("utf-8"))
+        article = resp.json()
 
         files = deep_get(article, host["filepath"])
         # only fetch files where is_link_only: False
4 changes: 1 addition & 3 deletions repo2docker/contentproviders/hydroshare.py
@@ -16,9 +16,7 @@ class Hydroshare(DoiProvider):
 
     def _fetch_version(self, host):
         """Fetch resource modified date and convert to epoch"""
-        json_response = json.loads(
-            self.urlopen(host["version"].format(self.resource_id)).read()
-        )
+        json_response = self.urlopen(host["version"].format(self.resource_id)).json()
         date = next(
             item for item in json_response["dates"] if item["type"] == "modified"
         )["start_date"]
6 changes: 3 additions & 3 deletions repo2docker/contentproviders/zenodo.py
@@ -15,6 +15,7 @@ class Zenodo(DoiProvider):
     """Provide contents of a Zenodo deposit."""
 
     def __init__(self):
+        super().__init__()
         # We need the hostname (url where records are), api url (for metadata),
         # filepath (path to files in metadata), filename (path to filename in
         # metadata), download (path to file download URL), and type (path to item type in metadata)
@@ -55,13 +56,12 @@ def fetch(self, spec, output_dir, yield_output=False):
         host = spec["host"]
 
         yield "Fetching Zenodo record {}.\n".format(record_id)
-        req = Request(
+        resp = self.urlopen(
             "{}{}".format(host["api"], record_id),
             headers={"accept": "application/json"},
         )
-        resp = self.urlopen(req)
 
-        record = json.loads(resp.read().decode("utf-8"))
+        record = resp.json()
 
         is_software = deep_get(record, host["type"]).lower() == "software"
         files = deep_get(record, host["filepath"])
1 change: 1 addition & 0 deletions setup.py
@@ -52,6 +52,7 @@ def get_identifier(json):
         "python-json-logger",
         "escapism",
         "jinja2",
+        "requests",
         "ruamel.yaml>=0.15",
         "toml",
         "semver",