
Commit a0606f2

Merge pull request #993 from douardda/requests
Replace urllib by requests in contentproviders
2 parents: c75b5ab + 830a9c8

File tree

12 files changed: +431 additions, −401 deletions
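
Every content provider touched by this PR follows the same migration: a hand-built urllib Request whose response is read, decoded, and fed to json.loads becomes a single requests call. A minimal before/after sketch of the pattern (the record URL is illustrative, not taken from the diff):

# Before: urllib plumbing with manual decoding and JSON parsing
from urllib.request import Request, urlopen
import json

url = "https://zenodo.org/api/records/3232985"  # illustrative URL
req = Request(url, headers={"accept": "application/json"})
record = json.loads(urlopen(req).read().decode("utf-8"))

# After: requests sends headers, decodes, and parses JSON in one call
import requests

record = requests.get(url, headers={"accept": "application/json"}).json()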

dev-requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -5,4 +5,4 @@ pytest>=4.6
 wheel
 pytest-cov
 pre-commit
-requests
+requests_mock
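
requests itself moves to install_requires (see setup.py below), while requests_mock comes in so the test suite can stub HTTP traffic instead of hitting the network. A hedged sketch of how such a stub works (URL and payload are invented for illustration, not taken from the repo's tests):

import requests
import requests_mock

# Within the Mocker context, matching GETs are answered locally,
# so no real network connection is made.
with requests_mock.Mocker() as m:
    m.get("https://example.org/api/records/1234", json={"files": []})
    resp = requests.get("https://example.org/api/records/1234")
    assert resp.json() == {"files": []}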

repo2docker/contentproviders/dataverse.py

Lines changed: 6 additions & 11 deletions
@@ -2,7 +2,6 @@
 import json
 import shutil
 
-from urllib.request import Request
 from urllib.parse import urlparse, urlunparse, parse_qs
 
 from .doi import DoiProvider
@@ -56,7 +55,6 @@ def detect(self, doi, ref=None, extra_args=None):
             return
 
         query_args = parse_qs(parsed_url.query)
-
         # Corner case handling
         if parsed_url.path.startswith("/file.xhtml"):
             # There's no way of getting file information using its persistentId, the only thing we can do is assume that doi
@@ -75,8 +73,7 @@ def detect(self, doi, ref=None, extra_args=None):
             parsed_url._replace(path="/api/search", query=search_query)
         )
         self.log.debug("Querying Dataverse: " + search_url)
-        resp = self.urlopen(search_url).read()
-        data = json.loads(resp.decode("utf-8"))["data"]
+        data = self.urlopen(search_url).json()["data"]
         if data["count_in_response"] != 1:
             self.log.debug(
                 "Dataverse search query failed!\n - doi: {}\n - url: {}\n - resp: {}\n".format(
@@ -101,14 +98,12 @@ def fetch(self, spec, output_dir, yield_output=False):
         host = spec["host"]
 
         yield "Fetching Dataverse record {}.\n".format(record_id)
-        req = Request(
-            "{}/api/datasets/:persistentId?persistentId={}".format(
-                host["url"], record_id
-            ),
-            headers={"accept": "application/json"},
+        url = "{}/api/datasets/:persistentId?persistentId={}".format(
+            host["url"], record_id
         )
-        resp = self.urlopen(req)
-        record = json.loads(resp.read().decode("utf-8"))["data"]
+
+        resp = self.urlopen(url, headers={"accept": "application/json"})
+        record = resp.json()["data"]
 
         for fobj in deep_get(record, "latestVersion.files"):
             file_url = "{}/api/access/datafile/{}".format(
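
For orientation, the rewritten fetch() reduces to the following requests pattern against the Dataverse native API. The host and persistentId below are placeholders, and the trailing ["data"] reflects how Dataverse wraps every JSON payload:

import requests

host = "https://dataverse.harvard.edu"    # placeholder installation
record_id = "doi:10.7910/DVN/EXAMPLE"     # hypothetical persistentId

resp = requests.get(
    "{}/api/datasets/:persistentId?persistentId={}".format(host, record_id),
    headers={"accept": "application/json"},
)
resp.raise_for_status()
record = resp.json()["data"]  # Dataverse nests the payload under "data"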

repo2docker/contentproviders/doi.py

Lines changed: 57 additions & 38 deletions
@@ -5,8 +5,8 @@
 
 from os import makedirs
 from os import path
-from urllib import request  # urlopen, Request
-from urllib.error import HTTPError
+from requests import Session, HTTPError
+
 from zipfile import ZipFile, is_zipfile
 
 from .base import ContentProvider
@@ -18,7 +18,21 @@
 class DoiProvider(ContentProvider):
     """Provide contents of a repository identified by a DOI and some helper functions."""
 
-    def urlopen(self, req, headers=None):
+    def __init__(self):
+        super().__init__()
+        self.session = Session()
+        self.session.headers.update(
+            {
+                "user-agent": "repo2docker {}".format(__version__),
+            }
+        )
+
+    def _request(self, url, **kwargs):
+        return self.session.get(url, **kwargs)
+
+    urlopen = _request
+
+    def _urlopen(self, req, headers=None):
         """A urlopen() helper"""
         # someone passed a string, not a request
         if not isinstance(req, request.Request):
@@ -38,7 +52,8 @@ def doi2url(self, doi):
         doi = normalize_doi(doi)
 
         try:
-            resp = self.urlopen("https://doi.org/{}".format(doi))
+            resp = self._request("https://doi.org/{}".format(doi))
+            resp.raise_for_status()
         # If the DOI doesn't resolve, just return URL
         except HTTPError:
             return doi
@@ -53,38 +68,42 @@ def fetch_file(self, file_ref, host, output_dir, unzip=False):
         file_url = deep_get(file_ref, host["download"])
         fname = deep_get(file_ref, host["filename"])
         logging.debug("Downloading file {} as {}\n".format(file_url, fname))
-        with self.urlopen(file_url) as src:
+
+        yield "Requesting {}\n".format(file_url)
+        resp = self._request(file_url, stream=True)
+        resp.raise_for_status()
+
+        if path.dirname(fname):
+            sub_dir = path.join(output_dir, path.dirname(fname))
+            if not path.exists(sub_dir):
+                yield "Creating {}\n".format(sub_dir)
+                makedirs(sub_dir, exist_ok=True)
+
+        dst_fname = path.join(output_dir, fname)
+        with open(dst_fname, "wb") as dst:
+            yield "Fetching {}\n".format(fname)
+            for chunk in resp.iter_content(chunk_size=None):
+                dst.write(chunk)
+
+        if unzip and is_zipfile(dst_fname):
+            yield "Extracting {}\n".format(fname)
+            zfile = ZipFile(dst_fname)
+            zfile.extractall(path=output_dir)
+            zfile.close()
+
+            # delete downloaded file ...
+            os.remove(dst_fname)
+            # ... and any directories we might have created,
+            # in which case sub_dir will be defined
             if path.dirname(fname):
-                sub_dir = path.join(output_dir, path.dirname(fname))
-                if not path.exists(sub_dir):
-                    yield "Creating {}\n".format(sub_dir)
-                    makedirs(sub_dir, exist_ok=True)
-
-            dst_fname = path.join(output_dir, fname)
-            with open(dst_fname, "wb") as dst:
-                yield "Fetching {}\n".format(fname)
-                shutil.copyfileobj(src, dst)
-            # first close the newly written file, then continue
-            # processing it
-            if unzip and is_zipfile(dst_fname):
-                yield "Extracting {}\n".format(fname)
-                zfile = ZipFile(dst_fname)
-                zfile.extractall(path=output_dir)
-                zfile.close()
-
-                # delete downloaded file ...
-                os.remove(dst_fname)
-                # ... and any directories we might have created,
-                # in which case sub_dir will be defined
-                if path.dirname(fname):
-                    shutil.rmtree(sub_dir)
-
-                new_subdirs = os.listdir(output_dir)
-                # if there is only one new subdirectory move its contents
-                # to the top level directory
-                if len(new_subdirs) == 1:
-                    d = new_subdirs[0]
-                    copytree(path.join(output_dir, d), output_dir)
-                    shutil.rmtree(path.join(output_dir, d))
-
-            yield "Fetched files: {}\n".format(os.listdir(output_dir))
+                shutil.rmtree(sub_dir)
+
+            new_subdirs = os.listdir(output_dir)
+            # if there is only one new subdirectory move its contents
+            # to the top level directory
+            if len(new_subdirs) == 1:
+                d = new_subdirs[0]
+                copytree(path.join(output_dir, d), output_dir)
+                shutil.rmtree(path.join(output_dir, d))
+
+        yield "Fetched files: {}\n".format(os.listdir(output_dir))

repo2docker/contentproviders/figshare.py

Lines changed: 3 additions & 3 deletions
@@ -25,6 +25,7 @@ class Figshare(DoiProvider):
     """
 
     def __init__(self):
+        super().__init__()
         self.hosts = [
             {
                 "hostname": [
@@ -74,13 +75,12 @@ def fetch(self, spec, output_dir, yield_output=False):
         yield "Fetching Figshare article {} in version {}.\n".format(
             article_id, article_version
         )
-        req = Request(
+        resp = self.urlopen(
             "{}{}/versions/{}".format(host["api"], article_id, article_version),
             headers={"accept": "application/json"},
         )
-        resp = self.urlopen(req)
 
-        article = json.loads(resp.read().decode("utf-8"))
+        article = resp.json()
 
         files = deep_get(article, host["filepath"])
         # only fetch files where is_link_only: False

repo2docker/contentproviders/hydroshare.py

Lines changed: 1 addition & 3 deletions
@@ -16,9 +16,7 @@ class Hydroshare(DoiProvider):
 
     def _fetch_version(self, host):
         """Fetch resource modified date and convert to epoch"""
-        json_response = json.loads(
-            self.urlopen(host["version"].format(self.resource_id)).read()
-        )
+        json_response = self.urlopen(host["version"].format(self.resource_id)).json()
         date = next(
             item for item in json_response["dates"] if item["type"] == "modified"
         )["start_date"]

repo2docker/contentproviders/zenodo.py

Lines changed: 3 additions & 3 deletions
@@ -15,6 +15,7 @@ class Zenodo(DoiProvider):
     """Provide contents of a Zenodo deposit."""
 
     def __init__(self):
+        super().__init__()
         # We need the hostname (url where records are), api url (for metadata),
         # filepath (path to files in metadata), filename (path to filename in
         # metadata), download (path to file download URL), and type (path to item type in metadata)
@@ -55,13 +56,12 @@ def fetch(self, spec, output_dir, yield_output=False):
         host = spec["host"]
 
         yield "Fetching Zenodo record {}.\n".format(record_id)
-        req = Request(
+        resp = self.urlopen(
             "{}{}".format(host["api"], record_id),
             headers={"accept": "application/json"},
         )
-        resp = self.urlopen(req)
 
-        record = json.loads(resp.read().decode("utf-8"))
+        record = resp.json()
 
         is_software = deep_get(record, host["type"]).lower() == "software"
         files = deep_get(record, host["filepath"])

setup.py

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@ def get_identifier(json):
         "python-json-logger",
         "escapism",
         "jinja2",
+        "requests",
         "ruamel.yaml>=0.15",
         "toml",
         "semver",
