Skip to content

Commit 830a9c8

Browse files
committed
Replace urllib with requests in contentproviders
requests is generally simpler to use, and more people are familiar with the latter than with urllib.
1 parent 560b1d9 commit 830a9c8

File tree

12 files changed

+431
-401
lines changed

12 files changed

+431
-401
lines changed

dev-requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@ pytest>=4.6
55
wheel
66
pytest-cov
77
pre-commit
8-
requests
8+
requests_mock

repo2docker/contentproviders/dataverse.py

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
import json
33
import shutil
44

5-
from urllib.request import Request
65
from urllib.parse import urlparse, urlunparse, parse_qs
76

87
from .doi import DoiProvider
@@ -56,7 +55,6 @@ def detect(self, doi, ref=None, extra_args=None):
5655
return
5756

5857
query_args = parse_qs(parsed_url.query)
59-
6058
# Corner case handling
6159
if parsed_url.path.startswith("/file.xhtml"):
6260
# There's no way of getting file information using its persistentId, the only thing we can do is assume that doi
@@ -75,8 +73,7 @@ def detect(self, doi, ref=None, extra_args=None):
7573
parsed_url._replace(path="/api/search", query=search_query)
7674
)
7775
self.log.debug("Querying Dataverse: " + search_url)
78-
resp = self.urlopen(search_url).read()
79-
data = json.loads(resp.decode("utf-8"))["data"]
76+
data = self.urlopen(search_url).json()
8077
if data["count_in_response"] != 1:
8178
self.log.debug(
8279
"Dataverse search query failed!\n - doi: {}\n - url: {}\n - resp: {}\n".format(
@@ -101,14 +98,12 @@ def fetch(self, spec, output_dir, yield_output=False):
10198
host = spec["host"]
10299

103100
yield "Fetching Dataverse record {}.\n".format(record_id)
104-
req = Request(
105-
"{}/api/datasets/:persistentId?persistentId={}".format(
106-
host["url"], record_id
107-
),
108-
headers={"accept": "application/json"},
101+
url = "{}/api/datasets/:persistentId?persistentId={}".format(
102+
host["url"], record_id
109103
)
110-
resp = self.urlopen(req)
111-
record = json.loads(resp.read().decode("utf-8"))["data"]
104+
105+
resp = self.urlopen(url, headers={"accept": "application/json"})
106+
record = resp.json()
112107

113108
for fobj in deep_get(record, "latestVersion.files"):
114109
file_url = "{}/api/access/datafile/{}".format(

repo2docker/contentproviders/doi.py

Lines changed: 57 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55

66
from os import makedirs
77
from os import path
8-
from urllib import request # urlopen, Request
9-
from urllib.error import HTTPError
8+
from requests import Session, HTTPError
9+
1010
from zipfile import ZipFile, is_zipfile
1111

1212
from .base import ContentProvider
@@ -18,7 +18,21 @@
1818
class DoiProvider(ContentProvider):
1919
"""Provide contents of a repository identified by a DOI and some helper functions."""
2020

21-
def urlopen(self, req, headers=None):
21+
def __init__(self):
22+
super().__init__()
23+
self.session = Session()
24+
self.session.headers.update(
25+
{
26+
"user-agent": "repo2docker {}".format(__version__),
27+
}
28+
)
29+
30+
def _request(self, url, **kwargs):
31+
return self.session.get(url, **kwargs)
32+
33+
urlopen = _request
34+
35+
def _urlopen(self, req, headers=None):
2236
"""A urlopen() helper"""
2337
# someone passed a string, not a request
2438
if not isinstance(req, request.Request):
@@ -38,7 +52,8 @@ def doi2url(self, doi):
3852
doi = normalize_doi(doi)
3953

4054
try:
41-
resp = self.urlopen("https://doi.org/{}".format(doi))
55+
resp = self._request("https://doi.org/{}".format(doi))
56+
resp.raise_for_status()
4257
# If the DOI doesn't resolve, just return URL
4358
except HTTPError:
4459
return doi
@@ -53,38 +68,42 @@ def fetch_file(self, file_ref, host, output_dir, unzip=False):
5368
file_url = deep_get(file_ref, host["download"])
5469
fname = deep_get(file_ref, host["filename"])
5570
logging.debug("Downloading file {} as {}\n".format(file_url, fname))
56-
with self.urlopen(file_url) as src:
71+
72+
yield "Requesting {}\n".format(file_url)
73+
resp = self._request(file_url, stream=True)
74+
resp.raise_for_status()
75+
76+
if path.dirname(fname):
77+
sub_dir = path.join(output_dir, path.dirname(fname))
78+
if not path.exists(sub_dir):
79+
yield "Creating {}\n".format(sub_dir)
80+
makedirs(sub_dir, exist_ok=True)
81+
82+
dst_fname = path.join(output_dir, fname)
83+
with open(dst_fname, "wb") as dst:
84+
yield "Fetching {}\n".format(fname)
85+
for chunk in resp.iter_content(chunk_size=None):
86+
dst.write(chunk)
87+
88+
if unzip and is_zipfile(dst_fname):
89+
yield "Extracting {}\n".format(fname)
90+
zfile = ZipFile(dst_fname)
91+
zfile.extractall(path=output_dir)
92+
zfile.close()
93+
94+
# delete downloaded file ...
95+
os.remove(dst_fname)
96+
# ... and any directories we might have created,
97+
# in which case sub_dir will be defined
5798
if path.dirname(fname):
58-
sub_dir = path.join(output_dir, path.dirname(fname))
59-
if not path.exists(sub_dir):
60-
yield "Creating {}\n".format(sub_dir)
61-
makedirs(sub_dir, exist_ok=True)
62-
63-
dst_fname = path.join(output_dir, fname)
64-
with open(dst_fname, "wb") as dst:
65-
yield "Fetching {}\n".format(fname)
66-
shutil.copyfileobj(src, dst)
67-
# first close the newly written file, then continue
68-
# processing it
69-
if unzip and is_zipfile(dst_fname):
70-
yield "Extracting {}\n".format(fname)
71-
zfile = ZipFile(dst_fname)
72-
zfile.extractall(path=output_dir)
73-
zfile.close()
74-
75-
# delete downloaded file ...
76-
os.remove(dst_fname)
77-
# ... and any directories we might have created,
78-
# in which case sub_dir will be defined
79-
if path.dirname(fname):
80-
shutil.rmtree(sub_dir)
81-
82-
new_subdirs = os.listdir(output_dir)
83-
# if there is only one new subdirectory move its contents
84-
# to the top level directory
85-
if len(new_subdirs) == 1:
86-
d = new_subdirs[0]
87-
copytree(path.join(output_dir, d), output_dir)
88-
shutil.rmtree(path.join(output_dir, d))
89-
90-
yield "Fetched files: {}\n".format(os.listdir(output_dir))
99+
shutil.rmtree(sub_dir)
100+
101+
new_subdirs = os.listdir(output_dir)
102+
# if there is only one new subdirectory move its contents
103+
# to the top level directory
104+
if len(new_subdirs) == 1:
105+
d = new_subdirs[0]
106+
copytree(path.join(output_dir, d), output_dir)
107+
shutil.rmtree(path.join(output_dir, d))
108+
109+
yield "Fetched files: {}\n".format(os.listdir(output_dir))

repo2docker/contentproviders/figshare.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ class Figshare(DoiProvider):
2525
"""
2626

2727
def __init__(self):
28+
super().__init__()
2829
self.hosts = [
2930
{
3031
"hostname": [
@@ -74,13 +75,12 @@ def fetch(self, spec, output_dir, yield_output=False):
7475
yield "Fetching Figshare article {} in version {}.\n".format(
7576
article_id, article_version
7677
)
77-
req = Request(
78+
resp = self.urlopen(
7879
"{}{}/versions/{}".format(host["api"], article_id, article_version),
7980
headers={"accept": "application/json"},
8081
)
81-
resp = self.urlopen(req)
8282

83-
article = json.loads(resp.read().decode("utf-8"))
83+
article = resp.json()
8484

8585
files = deep_get(article, host["filepath"])
8686
# only fetch files where is_link_only: False

repo2docker/contentproviders/hydroshare.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,7 @@ class Hydroshare(DoiProvider):
1616

1717
def _fetch_version(self, host):
1818
"""Fetch resource modified date and convert to epoch"""
19-
json_response = json.loads(
20-
self.urlopen(host["version"].format(self.resource_id)).read()
21-
)
19+
json_response = self.urlopen(host["version"].format(self.resource_id)).json()
2220
date = next(
2321
item for item in json_response["dates"] if item["type"] == "modified"
2422
)["start_date"]

repo2docker/contentproviders/zenodo.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ class Zenodo(DoiProvider):
1515
"""Provide contents of a Zenodo deposit."""
1616

1717
def __init__(self):
18+
super().__init__()
1819
# We need the hostname (url where records are), api url (for metadata),
1920
# filepath (path to files in metadata), filename (path to filename in
2021
# metadata), download (path to file download URL), and type (path to item type in metadata)
@@ -55,13 +56,12 @@ def fetch(self, spec, output_dir, yield_output=False):
5556
host = spec["host"]
5657

5758
yield "Fetching Zenodo record {}.\n".format(record_id)
58-
req = Request(
59+
resp = self.urlopen(
5960
"{}{}".format(host["api"], record_id),
6061
headers={"accept": "application/json"},
6162
)
62-
resp = self.urlopen(req)
6363

64-
record = json.loads(resp.read().decode("utf-8"))
64+
record = resp.json()
6565

6666
is_software = deep_get(record, host["type"]).lower() == "software"
6767
files = deep_get(record, host["filepath"])

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ def get_identifier(json):
5252
"python-json-logger",
5353
"escapism",
5454
"jinja2",
55+
"requests",
5556
"ruamel.yaml>=0.15",
5657
"toml",
5758
"semver",

0 commit comments

Comments
 (0)