diff --git a/dev-requirements.txt b/dev-requirements.txt
index bcecb7ba7..4b75d1bbc 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -5,4 +5,4 @@ pytest>=4.6
 wheel
 pytest-cov
 pre-commit
-requests
+requests_mock
diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py
index 7519d700a..66632f5e6 100644
--- a/repo2docker/contentproviders/dataverse.py
+++ b/repo2docker/contentproviders/dataverse.py
@@ -2,7 +2,6 @@
 import json
 import shutil
 
-from urllib.request import Request
 from urllib.parse import urlparse, urlunparse, parse_qs
 
 from .doi import DoiProvider
@@ -56,7 +55,6 @@ def detect(self, doi, ref=None, extra_args=None):
             return
 
         query_args = parse_qs(parsed_url.query)
-
         # Corner case handling
         if parsed_url.path.startswith("/file.xhtml"):
             # There's no way of getting file information using its persistentId, the only thing we can do is assume that doi
@@ -75,8 +73,7 @@ def detect(self, doi, ref=None, extra_args=None):
             parsed_url._replace(path="/api/search", query=search_query)
         )
         self.log.debug("Querying Dataverse: " + search_url)
-        resp = self.urlopen(search_url).read()
-        data = json.loads(resp.decode("utf-8"))["data"]
+        data = self.urlopen(search_url).json()["data"]
         if data["count_in_response"] != 1:
             self.log.debug(
                 "Dataverse search query failed!\n - doi: {}\n - url: {}\n - resp: {}\n".format(
@@ -101,14 +98,12 @@ def fetch(self, spec, output_dir, yield_output=False):
         host = spec["host"]
 
         yield "Fetching Dataverse record {}.\n".format(record_id)
-        req = Request(
-            "{}/api/datasets/:persistentId?persistentId={}".format(
-                host["url"], record_id
-            ),
-            headers={"accept": "application/json"},
+        url = "{}/api/datasets/:persistentId?persistentId={}".format(
+            host["url"], record_id
         )
-        resp = self.urlopen(req)
-        record = json.loads(resp.read().decode("utf-8"))["data"]
+
+        resp = self.urlopen(url, headers={"accept": "application/json"})
+        record = resp.json()["data"]
 
         for fobj in deep_get(record, "latestVersion.files"):
             file_url = "{}/api/access/datafile/{}".format(
diff --git a/repo2docker/contentproviders/doi.py b/repo2docker/contentproviders/doi.py
index 6ec2c8e17..c1941ba83 100644
--- a/repo2docker/contentproviders/doi.py
+++ b/repo2docker/contentproviders/doi.py
@@ -5,8 +5,8 @@
 from os import makedirs
 from os import path
 
-from urllib import request  # urlopen, Request
-from urllib.error import HTTPError
+from requests import Session, HTTPError
+
 from zipfile import ZipFile, is_zipfile
 
 from .base import ContentProvider
@@ -18,14 +18,15 @@ class DoiProvider(ContentProvider):
     """Provide contents of a repository identified by a DOI and some helper functions."""
 
-    def urlopen(self, req, headers=None):
-        """A urlopen() helper"""
-        # someone passed a string, not a request
-        if not isinstance(req, request.Request):
-            req = request.Request(req)
-
-        req.add_header("User-Agent", "repo2docker {}".format(__version__))
-        if headers is not None:
-            for key, value in headers.items():
-                req.add_header(key, value)
-
-        return request.urlopen(req)
+    def __init__(self):
+        super().__init__()
+        self.session = Session()
+        self.session.headers.update(
+            {
+                "user-agent": "repo2docker {}".format(__version__),
+            }
+        )
+
+    def _request(self, url, **kwargs):
+        return self.session.get(url, **kwargs)
+
+    urlopen = _request
@@ -38,7 +39,8 @@ def doi2url(self, doi):
         doi = normalize_doi(doi)
 
         try:
-            resp = self.urlopen("https://doi.org/{}".format(doi))
+            resp = self._request("https://doi.org/{}".format(doi))
+            resp.raise_for_status()
         # If the DOI doesn't resolve, just return URL
         except HTTPError:
             return doi
@@ -53,38 +55,42 @@ def fetch_file(self, file_ref, host, output_dir, unzip=False):
         file_url = deep_get(file_ref, host["download"])
host["filename"]) logging.debug("Downloading file {} as {}\n".format(file_url, fname)) - with self.urlopen(file_url) as src: + + yield "Requesting {}\n".format(file_url) + resp = self._request(file_url, stream=True) + resp.raise_for_status() + + if path.dirname(fname): + sub_dir = path.join(output_dir, path.dirname(fname)) + if not path.exists(sub_dir): + yield "Creating {}\n".format(sub_dir) + makedirs(sub_dir, exist_ok=True) + + dst_fname = path.join(output_dir, fname) + with open(dst_fname, "wb") as dst: + yield "Fetching {}\n".format(fname) + for chunk in resp.iter_content(chunk_size=None): + dst.write(chunk) + + if unzip and is_zipfile(dst_fname): + yield "Extracting {}\n".format(fname) + zfile = ZipFile(dst_fname) + zfile.extractall(path=output_dir) + zfile.close() + + # delete downloaded file ... + os.remove(dst_fname) + # ... and any directories we might have created, + # in which case sub_dir will be defined if path.dirname(fname): - sub_dir = path.join(output_dir, path.dirname(fname)) - if not path.exists(sub_dir): - yield "Creating {}\n".format(sub_dir) - makedirs(sub_dir, exist_ok=True) - - dst_fname = path.join(output_dir, fname) - with open(dst_fname, "wb") as dst: - yield "Fetching {}\n".format(fname) - shutil.copyfileobj(src, dst) - # first close the newly written file, then continue - # processing it - if unzip and is_zipfile(dst_fname): - yield "Extracting {}\n".format(fname) - zfile = ZipFile(dst_fname) - zfile.extractall(path=output_dir) - zfile.close() - - # delete downloaded file ... - os.remove(dst_fname) - # ... and any directories we might have created, - # in which case sub_dir will be defined - if path.dirname(fname): - shutil.rmtree(sub_dir) - - new_subdirs = os.listdir(output_dir) - # if there is only one new subdirectory move its contents - # to the top level directory - if len(new_subdirs) == 1: - d = new_subdirs[0] - copytree(path.join(output_dir, d), output_dir) - shutil.rmtree(path.join(output_dir, d)) - - yield "Fetched files: {}\n".format(os.listdir(output_dir)) + shutil.rmtree(sub_dir) + + new_subdirs = os.listdir(output_dir) + # if there is only one new subdirectory move its contents + # to the top level directory + if len(new_subdirs) == 1: + d = new_subdirs[0] + copytree(path.join(output_dir, d), output_dir) + shutil.rmtree(path.join(output_dir, d)) + + yield "Fetched files: {}\n".format(os.listdir(output_dir)) diff --git a/repo2docker/contentproviders/figshare.py b/repo2docker/contentproviders/figshare.py index 43ec71faf..0fa09719b 100644 --- a/repo2docker/contentproviders/figshare.py +++ b/repo2docker/contentproviders/figshare.py @@ -25,6 +25,7 @@ class Figshare(DoiProvider): """ def __init__(self): + super().__init__() self.hosts = [ { "hostname": [ @@ -74,13 +75,12 @@ def fetch(self, spec, output_dir, yield_output=False): yield "Fetching Figshare article {} in version {}.\n".format( article_id, article_version ) - req = Request( + resp = self.urlopen( "{}{}/versions/{}".format(host["api"], article_id, article_version), headers={"accept": "application/json"}, ) - resp = self.urlopen(req) - article = json.loads(resp.read().decode("utf-8")) + article = resp.json() files = deep_get(article, host["filepath"]) # only fetch files where is_link_only: False diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index 2531aa8bf..eac66e21c 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -16,9 +16,7 @@ class Hydroshare(DoiProvider): def 
         """Fetch resource modified date and convert to epoch"""
-        json_response = json.loads(
-            self.urlopen(host["version"].format(self.resource_id)).read()
-        )
+        json_response = self.urlopen(host["version"].format(self.resource_id)).json()
         date = next(
             item for item in json_response["dates"] if item["type"] == "modified"
         )["start_date"]
diff --git a/repo2docker/contentproviders/zenodo.py b/repo2docker/contentproviders/zenodo.py
index 87e7967f4..68d3494dd 100644
--- a/repo2docker/contentproviders/zenodo.py
+++ b/repo2docker/contentproviders/zenodo.py
@@ -15,6 +15,7 @@ class Zenodo(DoiProvider):
     """Provide contents of a Zenodo deposit."""
 
     def __init__(self):
+        super().__init__()
         # We need the hostname (url where records are), api url (for metadata),
         # filepath (path to files in metadata), filename (path to filename in
         # metadata), download (path to file download URL), and type (path to item type in metadata)
@@ -55,13 +56,12 @@ def fetch(self, spec, output_dir, yield_output=False):
         record_id = spec["record"]
         host = spec["host"]
 
         yield "Fetching Zenodo record {}.\n".format(record_id)
-        req = Request(
+        resp = self.urlopen(
             "{}{}".format(host["api"], record_id),
             headers={"accept": "application/json"},
         )
-        resp = self.urlopen(req)
-        record = json.loads(resp.read().decode("utf-8"))
+        record = resp.json()
 
         is_software = deep_get(record, host["type"]).lower() == "software"
         files = deep_get(record, host["filepath"])
diff --git a/setup.py b/setup.py
index 5084b486c..8bfe64ebc 100644
--- a/setup.py
+++ b/setup.py
@@ -52,6 +52,7 @@ def get_identifier(json):
         "python-json-logger",
         "escapism",
         "jinja2",
+        "requests",
         "ruamel.yaml>=0.15",
         "toml",
         "semver",
diff --git a/tests/unit/contentproviders/test_dataverse.py b/tests/unit/contentproviders/test_dataverse.py
index 76d396544..0e53e50aa 100644
--- a/tests/unit/contentproviders/test_dataverse.py
+++ b/tests/unit/contentproviders/test_dataverse.py
@@ -1,6 +1,7 @@
 import json
 import os
 import pytest
+import re
 
 from io import BytesIO
 from tempfile import TemporaryDirectory
@@ -19,7 +20,7 @@
         "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
         "10.7910/DVN/6ZXAGT",
         "https://dataverse.harvard.edu/api/access/datafile/3323458",
-        "hdl:11529/10016",
+        "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
     ],
     [
         {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"},
@@ -27,57 +28,70 @@
         ],
     )
 ]
-test_responses = {
-    "doi:10.7910/DVN/6ZXAGT/3YRRYJ": (
+doi_responses = {
+    "https://doi.org/10.7910/DVN/6ZXAGT/3YRRYJ": (
         "https://dataverse.harvard.edu/file.xhtml"
         "?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"
     ),
-    "doi:10.7910/DVN/6ZXAGT": (
+    "https://doi.org/10.7910/DVN/6ZXAGT": (
         "https://dataverse.harvard.edu/dataset.xhtml"
         "?persistentId=doi:10.7910/DVN/6ZXAGT"
     ),
-    "10.7910/DVN/6ZXAGT": (
-        "https://dataverse.harvard.edu/dataset.xhtml"
-        "?persistentId=doi:10.7910/DVN/6ZXAGT"
+    "https://dataverse.harvard.edu/api/access/datafile/3323458": (
+        "https://dataverse.harvard.edu/api/access/datafile/3323458"
+    ),
+    "https://doi.org/10.21105/joss.01277": (
+        "https://joss.theoj.org/papers/10.21105/joss.01277"
     ),
-    "https://dataverse.harvard.edu/api/access/datafile/3323458": "https://dataverse.harvard.edu/api/access/datafile/3323458",
-    "hdl:11529/10016": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
-}
-test_search = {
-    "data": {
-        "count_in_response": 1,
-        "items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
-    }
 }
 
 
 @pytest.mark.parametrize("test_input, expected", test_hosts)
-def test_detect_dataverse(test_input, expected):
-    def doi_resolver(url):
-        return test_responses.get(url)
-
-    with patch.object(Dataverse, "urlopen") as fake_urlopen, patch.object(
-        Dataverse, "doi2url", side_effect=doi_resolver
-    ) as fake_doi2url:
-        fake_urlopen.return_value.read.return_value = json.dumps(test_search).encode()
-        # valid Dataverse DOIs trigger this content provider
-        assert Dataverse().detect(test_input[0]) == expected[0]
-        assert fake_doi2url.call_count == 2  # File, then dataset
-        assert Dataverse().detect(test_input[1]) == expected[0]
-        assert Dataverse().detect(test_input[2]) == expected[0]
-        # only two of the three calls above have to resolve a DOI
-        assert fake_urlopen.call_count == 1
-        assert Dataverse().detect(test_input[3]) == expected[1]
-
-    with patch.object(Dataverse, "urlopen") as fake_urlopen:
-        # Don't trigger the Dataverse content provider
-        assert Dataverse().detect("/some/path/here") is None
-        assert Dataverse().detect("https://example.com/path/here") is None
-        # don't handle DOIs that aren't from Dataverse
-        fake_urlopen.return_value.url = (
-            "http://joss.theoj.org/papers/10.21105/joss.01277"
-        )
-        assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
+def test_detect_dataverse(test_input, expected, requests_mock):
+    def doi_resolver(req, context):
+        resp = doi_responses.get(req.url)
+        # doi responses are redirects
+        if resp is not None:
+            context.status_code = 302
+            context.headers["Location"] = resp
+        return resp
+
+    requests_mock.get(re.compile("https://"), json=doi_resolver)
+    requests_mock.get(
+        "https://dataverse.harvard.edu/api/search?q=entityId:3323458&type=file",
+        json={
+            "data": {
+                "count_in_response": 1,
+                "items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
+            }
+        },
+    )
+
+    assert requests_mock.call_count == 0
+    # valid Dataverse DOIs trigger this content provider
+    assert Dataverse().detect(test_input[0]) == expected[0]
+    # 4: doi resolution (302), File, doi resolution (302), then dataset
+    assert requests_mock.call_count == 4
+    requests_mock.reset_mock()
+
+    assert Dataverse().detect(test_input[1]) == expected[0]
+    # 2: doi (302), dataset
+    assert requests_mock.call_count == 2
+    requests_mock.reset_mock()
+
+    assert Dataverse().detect(test_input[2]) == expected[0]
+    # 1: datafile (search dataverse for the file id)
+    assert requests_mock.call_count == 1
+    requests_mock.reset_mock()
+
+    assert Dataverse().detect(test_input[3]) == expected[1]
+    requests_mock.reset_mock()
+
+    # Don't trigger the Dataverse content provider
+    assert Dataverse().detect("/some/path/here") is None
+    assert Dataverse().detect("https://example.com/path/here") is None
+    # don't handle DOIs that aren't from Dataverse
+    assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
 
 
 @pytest.fixture
@@ -95,50 +109,52 @@ def dv_files(tmpdir):
     return [f1, f2, f3]
 
 
-def test_dataverse_fetch(dv_files):
-    mock_response_ds_query = BytesIO(
-        json.dumps(
-            {
-                "data": {
-                    "latestVersion": {
-                        "files": [
-                            {"dataFile": {"id": 1}, "label": "some-file.txt"},
-                            {
-                                "dataFile": {"id": 2},
-                                "label": "some-other-file.txt",
-                                "directoryLabel": "directory",
-                            },
-                            {
-                                "dataFile": {"id": 3},
-                                "label": "the-other-file.txt",
-                                "directoryLabel": "directory/subdirectory",
-                            },
-                        ]
-                    }
-                }
-            }
-        ).encode("utf-8")
-    )
+def test_dataverse_fetch(dv_files, requests_mock):
+    mock_response = {
+        "data": {
+            "latestVersion": {
+                "files": [
+                    {"dataFile": {"id": 1}, "label": "some-file.txt"},
+                    {
+                        "dataFile": {"id": 2},
+                        "label": "some-other-file.txt",
+                        "directoryLabel": "directory",
+                    },
+                    {
+                        "dataFile": {"id": 3},
+                        "label": "the-other-file.txt",
"directoryLabel": "directory/subdirectory", + }, + ] + } + } + spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"} + def mock_filecontent(req, context): + file_no = int(req.url.split("/")[-1]) - 1 + return open(dv_files[file_no], "rb").read() + + requests_mock.get( + "https://dataverse.harvard.edu/api/datasets/" + ":persistentId?persistentId=doi:10.7910/DVN/6ZXAGT", + json=mock_response, + ) + requests_mock.get( + re.compile("https://dataverse.harvard.edu/api/access/datafile"), + content=mock_filecontent, + ) + dv = Dataverse() - def mock_urlopen(self, req): - if isinstance(req, Request): - return mock_response_ds_query - else: - file_no = int(req.split("/")[-1]) - 1 - return urlopen("file://{}".format(dv_files[file_no])) - - with patch.object(Dataverse, "urlopen", new=mock_urlopen): - with TemporaryDirectory() as d: - output = [] - for l in dv.fetch(spec, d): - output.append(l) - - unpacked_files = set(os.listdir(d)) - expected = set(["directory", "some-file.txt"]) - assert expected == unpacked_files - assert os.path.isfile( - os.path.join(d, "directory", "subdirectory", "the-other-file.txt") - ) + with TemporaryDirectory() as d: + output = [] + for l in dv.fetch(spec, d): + output.append(l) + unpacked_files = set(os.listdir(d)) + expected = set(["directory", "some-file.txt"]) + assert expected == unpacked_files + assert os.path.isfile( + os.path.join(d, "directory", "subdirectory", "the-other-file.txt") + ) diff --git a/tests/unit/contentproviders/test_doi.py b/tests/unit/contentproviders/test_doi.py index 16d321e67..dbe391604 100644 --- a/tests/unit/contentproviders/test_doi.py +++ b/tests/unit/contentproviders/test_doi.py @@ -11,6 +11,7 @@ from repo2docker.contentproviders.doi import DoiProvider from repo2docker.contentproviders.base import ContentProviderException +from repo2docker import __version__ def test_content_id(): @@ -18,20 +19,15 @@ def test_content_id(): assert doi.content_id is None -def fake_urlopen(req): - print(req) - return req.headers - - -@patch("urllib.request.urlopen", fake_urlopen) -def test_url_headers(): +def test_url_headers(requests_mock): + requests_mock.get("https://mybinder.org", text="resp") doi = DoiProvider() headers = {"test1": "value1", "Test2": "value2"} result = doi.urlopen("https://mybinder.org", headers=headers) - assert "Test1" in result - assert "Test2" in result - assert len(result) is 3 # User-agent is also set + assert "test1" in result.request.headers + assert "Test2" in result.request.headers + assert result.request.headers["User-Agent"] == "repo2docker {}".format(__version__) def test_unresolving_doi(): diff --git a/tests/unit/contentproviders/test_figshare.py b/tests/unit/contentproviders/test_figshare.py index 0e152ecf8..3377d642f 100644 --- a/tests/unit/contentproviders/test_figshare.py +++ b/tests/unit/contentproviders/test_figshare.py @@ -22,12 +22,17 @@ @pytest.mark.parametrize("link,expected", test_content_ids) -def test_content_id(link, expected): - with patch.object(Figshare, "urlopen") as fake_urlopen: - fake_urlopen.return_value.url = link - fig = Figshare() - fig.detect("10.6084/m9.figshare.9782777") - assert fig.content_id == expected +def test_content_id(link, expected, requests_mock): + def mocked_get(req, context): + if req.url.startswith("https://doi.org"): + context.status_code = 302 + context.headers["Location"] = link + return link + + requests_mock.get(re.compile("https://"), text=mocked_get) + fig = Figshare() + fig.detect("10.6084/m9.figshare.9782777") + assert fig.content_id == expected test_fig = 
@@ -103,76 +109,72 @@ def figshare_archive(prefix="a_directory"):
     yield zfile.name
 
 
-def test_fetch_zip():
+def test_fetch_zip(requests_mock):
     # see test_zenodo.py/test_fetch_software
    with figshare_archive() as fig_path:
-        mock_response = BytesIO(
-            json.dumps(
+        mock_response = {
+            "files": [
                 {
-                    "files": [
-                        {
-                            "name": "afake.zip",
-                            "is_link_only": False,
-                            "download_url": "file://{}".format(fig_path),
-                        }
-                    ]
+                    "name": "afake.zip",
+                    "is_link_only": False,
+                    "download_url": "file://{}".format(fig_path),
                 }
-            ).encode("utf-8")
+            ]
+        }
+        requests_mock.get(
+            "https://api.figshare.com/v2/articles/123456/versions/42",
+            json=mock_response,
+        )
+        requests_mock.get(
+            "file://{}".format(fig_path), content=open(fig_path, "rb").read()
         )
 
-        def mock_urlopen(self, req):
-            if isinstance(req, Request):
-                return mock_response
-            else:
-                return urlopen(req)
-
-        with patch.object(Figshare, "urlopen", new=mock_urlopen):
-            with TemporaryDirectory() as d:
-                output = []
-                for l in test_fig.fetch(test_spec, d):
-                    output.append(l)
+        with TemporaryDirectory() as d:
+            output = []
+            for l in test_fig.fetch(test_spec, d):
+                output.append(l)
 
-                unpacked_files = set(os.listdir(d))
-                expected = set(["some-other-file.txt", "some-file.txt"])
-                assert expected == unpacked_files
+            unpacked_files = set(os.listdir(d))
+            expected = set(["some-other-file.txt", "some-file.txt"])
+            assert expected == unpacked_files
 
 
-def test_fetch_data():
+def test_fetch_data(requests_mock):
     with figshare_archive() as a_path:
         with figshare_archive() as b_path:
-            mock_response = BytesIO(
-                json.dumps(
+            mock_response = {
+                "files": [
+                    {
+                        "name": "afake.file",
+                        "download_url": "file://{}".format(a_path),
+                        "is_link_only": False,
+                    },
                     {
-                        "files": [
-                            {
-                                "name": "afake.file",
-                                "download_url": "file://{}".format(a_path),
-                                "is_link_only": False,
-                            },
-                            {
-                                "name": "bfake.data",
-                                "download_url": "file://{}".format(b_path),
-                                "is_link_only": False,
-                            },
-                            {"name": "cfake.link", "is_link_only": True},
-                        ]
-                    }
-                ).encode("utf-8")
+                        "name": "bfake.data",
+                        "download_url": "file://{}".format(b_path),
+                        "is_link_only": False,
+                    },
+                    {"name": "cfake.link", "is_link_only": True},
+                ]
+            }
+
+            requests_mock.get(
+                "https://api.figshare.com/v2/articles/123456/versions/42",
+                json=mock_response,
+            )
+            requests_mock.get(
+                "file://{}".format(a_path), content=open(a_path, "rb").read()
+            )
+            requests_mock.get(
+                "file://{}".format(b_path), content=open(b_path, "rb").read()
            )
 
-            def mock_urlopen(self, req):
-                if isinstance(req, Request):
-                    return mock_response
-                else:
-                    return urlopen(req)
-
-            with patch.object(Figshare, "urlopen", new=mock_urlopen):
-                with TemporaryDirectory() as d:
-                    output = []
-                    for l in test_fig.fetch(test_spec, d):
-                        output.append(l)
-
-                    unpacked_files = set(os.listdir(d))
-                    # ZIP files shouldn't have been unpacked
-                    expected = {"bfake.data", "afake.file"}
-                    assert expected == unpacked_files
+            with TemporaryDirectory() as d:
+                output = []
+                for l in test_fig.fetch(test_spec, d):
+                    output.append(l)
+
+                unpacked_files = set(os.listdir(d))
+                # ZIP files shouldn't have been unpacked
+                expected = {"bfake.data", "afake.file"}
+                assert expected == unpacked_files
diff --git a/tests/unit/contentproviders/test_hydroshare.py b/tests/unit/contentproviders/test_hydroshare.py
index d662dbff5..4af7d1579 100755
--- a/tests/unit/contentproviders/test_hydroshare.py
+++ b/tests/unit/contentproviders/test_hydroshare.py
@@ -5,87 +5,98 @@
 from tempfile import TemporaryDirectory, NamedTemporaryFile
 from unittest.mock import patch
 from zipfile import ZipFile
+import re
 
 from repo2docker.contentproviders import Hydroshare
 from repo2docker.contentproviders.base import ContentProviderException
 
 
-def test_content_id():
-    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
-        fake_urlopen.return_value.url = (
-            "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
-        )
+doi_responses = {
+    "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61": (
+        "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
+    ),
+    "https://doi.org/10.21105/joss.01277": (
+        "https://joss.theoj.org/papers/10.21105/joss.01277"
+    ),
+}
 
-        def read():
-            return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}'
 
-        fake_urlopen.return_value.read = read
-        hydro = Hydroshare()
+def doi_resolver(req, context):
+    resp = doi_responses.get(req.url)
+    # doi responses are redirects
+    if resp is not None:
+        context.status_code = 302
+        context.headers["Location"] = resp
+    return resp
 
-        hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
-        assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569427757"
 
+hydroshare_data = {
+    "dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]
+}
 
-def test_detect_hydroshare():
-    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
-        fake_urlopen.return_value.url = (
-            "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
-        )
 
-        def read():
-            return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}'
-
-        fake_urlopen.return_value.read = read
-        # valid Hydroshare DOIs trigger this content provider
-        expected = {
-            "host": {
-                "hostname": [
-                    "https://www.hydroshare.org/resource/",
-                    "http://www.hydroshare.org/resource/",
-                ],
-                "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
-                "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
-            },
-            "resource": "b8f6eae9d89241cf8b5904033460af61",
-            "version": "1569427757",
-        }
-        assert (
-            Hydroshare().detect(
-                "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
-            )
-            == expected
-        )
-        # assert a call to urlopen was called to fetch version
-        assert fake_urlopen.call_count == 1
-        assert (
-            Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
-            == expected
-        )
-        # assert 2 more calls were made, one to resolve the DOI and another to fetch the version
-        assert fake_urlopen.call_count == 3
-        assert (
-            Hydroshare().detect(
-                "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61"
-            )
-            == expected
-        )
-        # assert 2 more calls were made, one to resolve the DOI and another to fetch the version
-        assert fake_urlopen.call_count == 5
-
-    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
-        # Don't trigger the Hydroshare content provider
-        assert Hydroshare().detect("/some/path/here") is None
-        assert Hydroshare().detect("https://example.com/path/here") is None
-        # don't handle DOIs that aren't from Hydroshare
-        fake_urlopen.return_value.url = (
-            "http://joss.theoj.org/papers/10.21105/joss.01277"
-        )
-
-        def read():
-            return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}'
-
-        fake_urlopen.return_value.read = read
-        assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None
+def test_content_id(requests_mock):
+
+    requests_mock.get(re.compile("https://"), json=hydroshare_data)
+    requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)
+
+    hydro = Hydroshare()
+
+    hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
+    assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569427757"
+
+
+def test_detect_hydroshare(requests_mock):
+    requests_mock.get(re.compile("https://"), json=hydroshare_data)
+    requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)
+
+    # valid Hydroshare DOIs trigger this content provider
+    expected = {
+        "host": {
+            "hostname": [
+                "https://www.hydroshare.org/resource/",
+                "http://www.hydroshare.org/resource/",
+            ],
+            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
+            "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
+        },
+        "resource": "b8f6eae9d89241cf8b5904033460af61",
+        "version": "1569427757",
+    }
+
+    assert (
+        Hydroshare().detect(
+            "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
+        )
+        == expected
+    )
+    # assert a call to urlopen was called to fetch version
+    assert requests_mock.call_count == 1
+    requests_mock.reset_mock()
+
+    assert (
+        Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") == expected
+    )
+    # assert 3 calls were made, 2 to resolve the DOI (302 + 200) and another to fetch the version
+    assert requests_mock.call_count == 3
+    requests_mock.reset_mock()
+
+    assert (
+        Hydroshare().detect(
+            "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61"
+        )
+        == expected
+    )
+    # assert 3 more calls were made, 2 to resolve the DOI and another to fetch the version
+    assert requests_mock.call_count == 3
+    requests_mock.reset_mock()
+
+    # Don't trigger the Hydroshare content provider
+    assert Hydroshare().detect("/some/path/here") is None
+    assert Hydroshare().detect("https://example.com/path/here") is None
+
+    # don't handle DOIs that aren't from Hydroshare
+    assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None
 
 
 @contextmanager
diff --git a/tests/unit/contentproviders/test_zenodo.py b/tests/unit/contentproviders/test_zenodo.py
index 61dcbfbf3..fa8b30fa8 100644
--- a/tests/unit/contentproviders/test_zenodo.py
+++ b/tests/unit/contentproviders/test_zenodo.py
@@ -1,6 +1,7 @@
 import json
 import os
 import pytest
+import re
 
 from contextlib import contextmanager
 from io import BytesIO
@@ -11,14 +12,30 @@
 
 from repo2docker.contentproviders import Zenodo
 
+doi_responses = {
+    "https://doi.org/10.5281/zenodo.3232985": ("https://zenodo.org/record/3232985"),
+    "https://doi.org/10.22002/d1.1235": ("https://data.caltech.edu/records/1235"),
+    "https://doi.org/10.21105/joss.01277": (
+        "https://joss.theoj.org/papers/10.21105/joss.01277"
+    ),
+}
+
 
-def test_content_id():
-    with patch.object(Zenodo, "urlopen") as fake_urlopen:
-        fake_urlopen.return_value.url = "https://zenodo.org/record/3232985"
-        zen = Zenodo()
-        zen.detect("10.5281/zenodo.3232985")
-        assert zen.content_id == "3232985"
+def doi_resolver(req, context):
+    resp = doi_responses.get(req.url)
+    # doi responses are redirects
+    if resp is not None:
+        context.status_code = 302
+        context.headers["Location"] = resp
+    return resp
+
+
+def test_content_id(requests_mock):
+    requests_mock.get(re.compile("https://"), json=doi_resolver)
+
+    zen = Zenodo()
+    zen.detect("10.5281/zenodo.3232985")
+    assert zen.content_id == "3232985"
 
 
 test_zen = Zenodo()
@@ -43,25 +60,22 @@
 
 
 @pytest.mark.parametrize("test_input,expected", test_hosts)
-def test_detect_zenodo(test_input, expected):
-    with patch.object(Zenodo, "urlopen") as fake_urlopen:
-        fake_urlopen.return_value.url = test_input[0]
-        # valid Zenodo DOIs trigger this content provider
-        assert Zenodo().detect(test_input[0]) == expected
-        assert Zenodo().detect(test_input[1]) == expected
-        assert Zenodo().detect(test_input[2]) == expected
-        # only two of the three calls above have to resolve a DOI
-        assert fake_urlopen.call_count == 2
-
-    with patch.object(Zenodo, "urlopen") as fake_urlopen:
-        # Don't trigger the Zenodo content provider
-        assert Zenodo().detect("/some/path/here") is None
-        assert Zenodo().detect("https://example.com/path/here") is None
-        # don't handle DOIs that aren't from Zenodo
-        fake_urlopen.return_value.url = (
-            "http://joss.theoj.org/papers/10.21105/joss.01277"
-        )
-        assert Zenodo().detect("https://doi.org/10.21105/joss.01277") is None
+def test_detect_zenodo(test_input, expected, requests_mock):
+    requests_mock.get(re.compile("https://"), json=doi_resolver)
+
+    # valid Zenodo DOIs trigger this content provider
+    assert Zenodo().detect(test_input[0]) == expected
+    assert Zenodo().detect(test_input[1]) == expected
+    assert Zenodo().detect(test_input[2]) == expected
+    # only two of the three calls above have to resolve a DOI (2 req per doi resolution)
+    assert requests_mock.call_count == 4
+    requests_mock.reset_mock()
+
+    # Don't trigger the Zenodo content provider
+    assert Zenodo().detect("/some/path/here") is None
+    assert Zenodo().detect("https://example.com/path/here") is None
+
+    # don't handle DOIs that aren't from Zenodo
+    assert Zenodo().detect("https://doi.org/10.21105/joss.01277") is None
 
 
 @contextmanager
@@ -74,120 +88,102 @@ def zenodo_archive(prefix="a_directory"):
     yield zfile.name
 
 
-def test_fetch_software_from_github_archive():
+def test_fetch_software_from_github_archive(requests_mock):
     # we "fetch" a local ZIP file to simulate a Zenodo record created from a
     # GitHub repository via the Zenodo-GitHub integration
     with zenodo_archive() as zen_path:
-        mock_response = BytesIO(
-            json.dumps(
+        mock_response = {
+            "files": [
                 {
-                    "files": [
-                        {
-                            "filename": "some_dir/afake.zip",
-                            "links": {"download": "file://{}".format(zen_path)},
-                        }
-                    ],
-                    "metadata": {"upload_type": "software"},
+                    "filename": "some_dir/afake.zip",
+                    "links": {"download": "file://{}".format(zen_path)},
                 }
-            ).encode("utf-8")
+            ],
+            "metadata": {"upload_type": "software"},
+        }
+        requests_mock.get("https://zenodo.org/api/records/1234", json=mock_response)
+        requests_mock.get(
+            "file://{}".format(zen_path), content=open(zen_path, "rb").read()
         )
 
-        def mock_urlopen(self, req):
-            if isinstance(req, Request):
-                return mock_response
-            else:
-                return urlopen(req)
-
-        with patch.object(Zenodo, "urlopen", new=mock_urlopen):
-            zen = Zenodo()
-            spec = {"host": test_zen.hosts[0], "record": "1234"}
+        zen = Zenodo()
+        spec = {"host": test_zen.hosts[0], "record": "1234"}
 
-            with TemporaryDirectory() as d:
-                output = []
-                for l in zen.fetch(spec, d):
-                    output.append(l)
+        with TemporaryDirectory() as d:
+            output = []
+            for l in zen.fetch(spec, d):
+                output.append(l)
 
-                unpacked_files = set(os.listdir(d))
-                expected = set(["some-other-file.txt", "some-file.txt"])
-                assert expected == unpacked_files
+            unpacked_files = set(os.listdir(d))
+            expected = set(["some-other-file.txt", "some-file.txt"])
+            assert expected == unpacked_files
 
 
-def test_fetch_software():
+def test_fetch_software(requests_mock):
     # we "fetch" a local ZIP file to simulate a Zenodo software record with a
     # ZIP file in it
     with zenodo_archive() as zen_path:
-        mock_response = BytesIO(
-            json.dumps(
+        mock_response = {
+            "files": [
                 {
-                    "files": [
-                        {
-                            # this is the difference to the GitHub generated one,
-                            # the ZIP file isn't in a directory
-                            "filename": "afake.zip",
-                            "links": {"download": "file://{}".format(zen_path)},
-                        }
-                    ],
-                    "metadata": {"upload_type": "software"},
+                    # this is the difference to the GitHub generated one,
+                    # the ZIP file isn't in a directory
+                    "filename": "afake.zip",
+                    "links": {"download": "file://{}".format(zen_path)},
                 }
-            ).encode("utf-8")
+            ],
+            "metadata": {"upload_type": "software"},
+        }
+        requests_mock.get("https://zenodo.org/api/records/1234", json=mock_response)
+        requests_mock.get(
+            "file://{}".format(zen_path), content=open(zen_path, "rb").read()
         )
 
-        def mock_urlopen(self, req):
-            if isinstance(req, Request):
-                return mock_response
-            else:
-                return urlopen(req)
-
-        with patch.object(Zenodo, "urlopen", new=mock_urlopen):
-            with TemporaryDirectory() as d:
-                zen = Zenodo()
-                spec = spec = {"host": test_zen.hosts[0], "record": "1234"}
-                output = []
-                for l in zen.fetch(spec, d):
-                    output.append(l)
+        with TemporaryDirectory() as d:
+            zen = Zenodo()
+            spec = {"host": test_zen.hosts[0], "record": "1234"}
+            output = []
+            for l in zen.fetch(spec, d):
+                output.append(l)
 
-                unpacked_files = set(os.listdir(d))
-                expected = set(["some-other-file.txt", "some-file.txt"])
-                assert expected == unpacked_files
+            unpacked_files = set(os.listdir(d))
+            expected = set(["some-other-file.txt", "some-file.txt"])
+            assert expected == unpacked_files
 
 
-def test_fetch_data():
+def test_fetch_data(requests_mock):
     # we "fetch" a local ZIP file to simulate a Zenodo data record
     with zenodo_archive() as a_zen_path:
         with zenodo_archive() as b_zen_path:
-            mock_response = BytesIO(
-                json.dumps(
+            mock_response = {
+                "files": [
+                    {
+                        "filename": "afake.zip",
+                        "links": {"download": "file://{}".format(a_zen_path)},
+                    },
                     {
-                        "files": [
-                            {
-                                "filename": "afake.zip",
-                                "links": {"download": "file://{}".format(a_zen_path)},
-                            },
-                            {
-                                "filename": "bfake.zip",
-                                "links": {"download": "file://{}".format(b_zen_path)},
-                            },
-                        ],
-                        "metadata": {"upload_type": "data"},
-                    }
-                ).encode("utf-8")
+                        "filename": "bfake.zip",
+                        "links": {"download": "file://{}".format(b_zen_path)},
+                    },
+                ],
+                "metadata": {"upload_type": "data"},
+            }
+            requests_mock.get("https://zenodo.org/api/records/1234", json=mock_response)
+            requests_mock.get(
+                "file://{}".format(a_zen_path), content=open(a_zen_path, "rb").read()
+            )
+            requests_mock.get(
+                "file://{}".format(b_zen_path), content=open(b_zen_path, "rb").read()
             )
 
-            def mock_urlopen(self, req):
-                if isinstance(req, Request):
-                    return mock_response
-                else:
-                    return urlopen(req)
-
-            with patch.object(Zenodo, "urlopen", new=mock_urlopen):
-                with TemporaryDirectory() as d:
-                    zen = Zenodo()
-                    spec = {"host": test_zen.hosts[0], "record": "1234"}
-                    output = []
-                    for l in zen.fetch(spec, d):
-                        output.append(l)
-
-                    unpacked_files = set(os.listdir(d))
-                    # ZIP files shouldn't have been unpacked
-                    expected = {"bfake.zip", "afake.zip"}
-                    assert expected == unpacked_files
+            with TemporaryDirectory() as d:
+                zen = Zenodo()
+                spec = {"host": test_zen.hosts[0], "record": "1234"}
+                output = []
+                for l in zen.fetch(spec, d):
+                    output.append(l)
+
+                unpacked_files = set(os.listdir(d))
+                # ZIP files shouldn't have been unpacked
+                expected = {"bfake.zip", "afake.zip"}
+                assert expected == unpacked_files