Merged
2 changes: 1 addition & 1 deletion dev-requirements.txt
@@ -5,4 +5,4 @@ pytest>=4.6
 wheel
 pytest-cov
 pre-commit
-requests
+requests_mock
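
requests moves out of the dev requirements (it becomes a runtime dependency in setup.py below), while requests_mock comes in so the test suite can stub HTTP at the transport level. A sketch of the kind of test this enables; the test name, URL, and payload are illustrative, not taken from this PR:

    import requests

    # requests-mock ships a pytest plugin exposing a requests_mock fixture
    def test_fetch_record_metadata(requests_mock):
        # register a stub: any GET to this URL returns the given JSON body
        requests_mock.get(
            "https://zenodo.org/api/records/3242074",
            json={"files": [], "metadata": {}},
        )
        resp = requests.get("https://zenodo.org/api/records/3242074")
        assert resp.json() == {"files": [], "metadata": {}}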
17 changes: 6 additions & 11 deletions repo2docker/contentproviders/dataverse.py
@@ -2,7 +2,6 @@
 import json
 import shutil
 
-from urllib.request import Request
 from urllib.parse import urlparse, urlunparse, parse_qs
 
 from .doi import DoiProvider
@@ -56,7 +55,6 @@ def detect(self, doi, ref=None, extra_args=None):
             return
 
         query_args = parse_qs(parsed_url.query)
-
         # Corner case handling
         if parsed_url.path.startswith("/file.xhtml"):
             # There's no way of getting file information using its persistentId, the only thing we can do is assume that doi
@@ -75,8 +73,7 @@ def detect(self, doi, ref=None, extra_args=None):
             parsed_url._replace(path="/api/search", query=search_query)
         )
         self.log.debug("Querying Dataverse: " + search_url)
-        resp = self.urlopen(search_url).read()
-        data = json.loads(resp.decode("utf-8"))["data"]
+        data = self.urlopen(search_url).json()["data"]
         if data["count_in_response"] != 1:
             self.log.debug(
                 "Dataverse search query failed!\n - doi: {}\n - url: {}\n - resp: {}\n".format(
@@ -101,14 +98,12 @@ def fetch(self, spec, output_dir, yield_output=False):
         host = spec["host"]
 
         yield "Fetching Dataverse record {}.\n".format(record_id)
-        req = Request(
-            "{}/api/datasets/:persistentId?persistentId={}".format(
-                host["url"], record_id
-            ),
-            headers={"accept": "application/json"},
-        )
-        resp = self.urlopen(req)
-        record = json.loads(resp.read().decode("utf-8"))["data"]
+        url = "{}/api/datasets/:persistentId?persistentId={}".format(
+            host["url"], record_id
+        )
+        resp = self.urlopen(url, headers={"accept": "application/json"})
+        record = resp.json()["data"]
 
         for fobj in deep_get(record, "latestVersion.files"):
             file_url = "{}/api/access/datafile/{}".format(
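
A pattern worth noting, since it repeats in every provider below: json.loads(resp.read().decode("utf-8")) collapses to resp.json(), because a requests Response knows how to decode and parse its own body. A minimal equivalence sketch (the URL is illustrative):

    import json
    import requests
    from urllib.request import urlopen

    url = "https://dataverse.harvard.edu/api/search?q=data"

    # urllib style: read raw bytes, decode, parse by hand
    data_old = json.loads(urlopen(url).read().decode("utf-8"))

    # requests style: one call on the Response object
    data_new = requests.get(url).json()
    # for the same response body, data_old == data_new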
95 changes: 57 additions & 38 deletions repo2docker/contentproviders/doi.py
@@ -5,8 +5,8 @@
 from os import makedirs
 from os import path
-from urllib import request  # urlopen, Request
-from urllib.error import HTTPError
+from requests import Session, HTTPError
 
 from zipfile import ZipFile, is_zipfile
 
 from .base import ContentProvider
@@ -18,7 +18,21 @@
 class DoiProvider(ContentProvider):
     """Provide contents of a repository identified by a DOI and some helper functions."""
 
-    def urlopen(self, req, headers=None):
+    def __init__(self):
+        super().__init__()
+        self.session = Session()
+        self.session.headers.update(
+            {
+                "user-agent": "repo2docker {}".format(__version__),
+            }
+        )
+
+    def _request(self, url, **kwargs):
+        return self.session.get(url, **kwargs)
+
+    urlopen = _request
+
+    def _urlopen(self, req, headers=None):
         """A urlopen() helper"""
         # someone passed a string, not a request
         if not isinstance(req, request.Request):
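
One subtlety in the hunk above: urlopen = _request is a class-body alias, so both names resolve to the same function and existing callers of self.urlopen() keep working unchanged. A minimal sketch of the idiom (the Example class is hypothetical):

    class Example:
        def _request(self, url, **kwargs):
            return ("GET", url, kwargs)

        urlopen = _request  # alias created at class-definition time

    e = Example()
    assert e.urlopen("https://doi.org") == e._request("https://doi.org")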
Expand All @@ -38,7 +52,8 @@ def doi2url(self, doi):
doi = normalize_doi(doi)

try:
resp = self.urlopen("https://doi.org/{}".format(doi))
resp = self._request("https://doi.org/{}".format(doi))
resp.raise_for_status()
# If the DOI doesn't resolve, just return URL
except HTTPError:
return doi
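
The added raise_for_status() call is what keeps the except HTTPError branch meaningful: unlike urllib, requests does not raise on a 4xx/5xx response by itself. A short sketch of the behavior (the DOI is illustrative):

    import requests

    resp = requests.get("https://doi.org/10.5281/zenodo.3242074")
    try:
        resp.raise_for_status()  # raises requests.HTTPError on 4xx/5xx
        resolved = resp.url      # final URL after redirects were followed
    except requests.HTTPError:
        resolved = None          # DOI did not resolve; doi2url() returns the raw DOI instead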
@@ -53,38 +68,42 @@ def fetch_file(self, file_ref, host, output_dir, unzip=False):
         file_url = deep_get(file_ref, host["download"])
         fname = deep_get(file_ref, host["filename"])
         logging.debug("Downloading file {} as {}\n".format(file_url, fname))
-        with self.urlopen(file_url) as src:
-            if path.dirname(fname):
-                sub_dir = path.join(output_dir, path.dirname(fname))
-                if not path.exists(sub_dir):
-                    yield "Creating {}\n".format(sub_dir)
-                    makedirs(sub_dir, exist_ok=True)
-
-            dst_fname = path.join(output_dir, fname)
-            with open(dst_fname, "wb") as dst:
-                yield "Fetching {}\n".format(fname)
-                shutil.copyfileobj(src, dst)
-            # first close the newly written file, then continue
-            # processing it
-            if unzip and is_zipfile(dst_fname):
-                yield "Extracting {}\n".format(fname)
-                zfile = ZipFile(dst_fname)
-                zfile.extractall(path=output_dir)
-                zfile.close()
-
-                # delete downloaded file ...
-                os.remove(dst_fname)
-                # ... and any directories we might have created,
-                # in which case sub_dir will be defined
-                if path.dirname(fname):
-                    shutil.rmtree(sub_dir)
-
-                new_subdirs = os.listdir(output_dir)
-                # if there is only one new subdirectory move its contents
-                # to the top level directory
-                if len(new_subdirs) == 1:
-                    d = new_subdirs[0]
-                    copytree(path.join(output_dir, d), output_dir)
-                    shutil.rmtree(path.join(output_dir, d))
-
-            yield "Fetched files: {}\n".format(os.listdir(output_dir))
+
+        yield "Requesting {}\n".format(file_url)
+        resp = self._request(file_url, stream=True)
+        resp.raise_for_status()
+
+        if path.dirname(fname):
+            sub_dir = path.join(output_dir, path.dirname(fname))
+            if not path.exists(sub_dir):
+                yield "Creating {}\n".format(sub_dir)
+                makedirs(sub_dir, exist_ok=True)
+
+        dst_fname = path.join(output_dir, fname)
+        with open(dst_fname, "wb") as dst:
+            yield "Fetching {}\n".format(fname)
+            for chunk in resp.iter_content(chunk_size=None):
+                dst.write(chunk)
+
+        if unzip and is_zipfile(dst_fname):
+            yield "Extracting {}\n".format(fname)
+            zfile = ZipFile(dst_fname)
+            zfile.extractall(path=output_dir)
+            zfile.close()
+
+            # delete downloaded file ...
+            os.remove(dst_fname)
+            # ... and any directories we might have created,
+            # in which case sub_dir will be defined
+            if path.dirname(fname):
+                shutil.rmtree(sub_dir)
+
+            new_subdirs = os.listdir(output_dir)
+            # if there is only one new subdirectory move its contents
+            # to the top level directory
+            if len(new_subdirs) == 1:
+                d = new_subdirs[0]
+                copytree(path.join(output_dir, d), output_dir)
+                shutil.rmtree(path.join(output_dir, d))
+
+        yield "Fetched files: {}\n".format(os.listdir(output_dir))
Member commented (on the yield "Fetching {}\n".format(fname) line):

    Same comment as further down: useful for the average user on a day-to-day basis, or more of a debugging help?

@douardda (Contributor, Author) replied, Jan 18, 2021:

    Good question. Previously, this fetch_file() method did log (as generated strings):

      • the directory creation ("Creating [...]"),
      • the saving of the downloaded zip file in dst_fname ("Fetching [...]"),
      • the zip file extraction ("Extracting [...]"),
      • the list of extracted files ("Extracted [...]").

    In my version of the code, the only thing logged in addition to these is the "Requesting [...]" line, in which I display the actually retrieved URL (after DOI resolution), which I thought would be useful in a log.
6 changes: 3 additions & 3 deletions repo2docker/contentproviders/figshare.py
@@ -25,6 +25,7 @@ class Figshare(DoiProvider):
     """
 
     def __init__(self):
+        super().__init__()
         self.hosts = [
             {
                 "hostname": [
@@ -74,13 +75,12 @@ def fetch(self, spec, output_dir, yield_output=False):
         yield "Fetching Figshare article {} in version {}.\n".format(
             article_id, article_version
         )
-        req = Request(
+        resp = self.urlopen(
             "{}{}/versions/{}".format(host["api"], article_id, article_version),
             headers={"accept": "application/json"},
         )
-        resp = self.urlopen(req)
 
-        article = json.loads(resp.read().decode("utf-8"))
+        article = resp.json()
 
         files = deep_get(article, host["filepath"])
         # only fetch files where is_link_only: False
4 changes: 1 addition & 3 deletions repo2docker/contentproviders/hydroshare.py
@@ -16,9 +16,7 @@ class Hydroshare(DoiProvider):
 
     def _fetch_version(self, host):
         """Fetch resource modified date and convert to epoch"""
-        json_response = json.loads(
-            self.urlopen(host["version"].format(self.resource_id)).read()
-        )
+        json_response = self.urlopen(host["version"].format(self.resource_id)).json()
         date = next(
             item for item in json_response["dates"] if item["type"] == "modified"
         )["start_date"]
6 changes: 3 additions & 3 deletions repo2docker/contentproviders/zenodo.py
@@ -15,6 +15,7 @@ class Zenodo(DoiProvider):
     """Provide contents of a Zenodo deposit."""
 
     def __init__(self):
+        super().__init__()
         # We need the hostname (url where records are), api url (for metadata),
         # filepath (path to files in metadata), filename (path to filename in
         # metadata), download (path to file download URL), and type (path to item type in metadata)
@@ -55,13 +56,12 @@ def fetch(self, spec, output_dir, yield_output=False):
         host = spec["host"]
 
         yield "Fetching Zenodo record {}.\n".format(record_id)
-        req = Request(
+        resp = self.urlopen(
             "{}{}".format(host["api"], record_id),
             headers={"accept": "application/json"},
         )
-        resp = self.urlopen(req)
 
-        record = json.loads(resp.read().decode("utf-8"))
+        record = resp.json()
 
         is_software = deep_get(record, host["type"]).lower() == "software"
         files = deep_get(record, host["filepath"])
1 change: 1 addition & 0 deletions setup.py
@@ -52,6 +52,7 @@ def get_identifier(json):
         "python-json-logger",
         "escapism",
         "jinja2",
+        "requests",
         "ruamel.yaml>=0.15",
         "toml",
         "semver",