Skip to content

Commit b854b77

Browse files
committed
Fix tests
1 parent f6037ca commit b854b77

File tree

2 files changed

+64
-63
lines changed

2 files changed

+64
-63
lines changed

repo2docker/contentproviders/dataverse.py

Lines changed: 52 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import json
33
import os
44
import shutil
5-
from typing import List
5+
from typing import List, Tuple
66
from urllib.parse import parse_qs, urlparse
77

88
from ..utils import copytree, deep_get, is_doi
@@ -67,53 +67,44 @@ def detect(self, spec, ref=None, extra_args=None):
6767
#
6868
# We don't know exactly what kind of dataverse object this is, but
6969
# that can be figured out during fetch as needed
70-
return {"host": host, "url": url}
70+
return url
7171

72-
def get_dataset_id_from_file_id(self, host: str, file_id: str) -> str:
72+
def get_dataset_id_from_file_id(self, base_url: str, file_id: str) -> str:
7373
"""
7474
Return the persistent_id (DOI) that a given file_id (int or doi) belongs to
7575
"""
7676
if file_id.isdigit():
7777
# the file_id is an integer, rather than a persistent id (DOI)
78-
api_url = f"{host}/api/files/{file_id}?returnDatasetVersion=true"
78+
api_url = f"{base_url}/api/files/{file_id}?returnDatasetVersion=true"
7979
else:
8080
# the file_id is a doi itself
81-
api_url = f"{host}/api/files/:persistentId?persistentId={file_id}&returnDatasetVersion=true"
81+
api_url = f"{base_url}/api/files/:persistentId?persistentId={file_id}&returnDatasetVersion=true"
8282

8383
resp = self._request(api_url)
8484
if resp.status_code == 404:
85-
raise ValueError(f"File with id {file_id} not found in {host}")
85+
raise ValueError(f"File with id {file_id} not found in {base_url}")
8686

8787
resp.raise_for_status()
8888

8989
data = resp.json()["data"]
9090
return data["datasetVersion"]["datasetPersistentId"]
9191

92-
def get_datafiles(self, dataverse_host: str, url: str) -> List[dict]:
92+
def parse_dataverse_url(self, url: str) -> Tuple[str, bool]:
9393
"""
94-
Return a list of dataFiles for given persistent_id
95-
96-
Supports the following *dataset* URL styles:
97-
- /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
98-
- /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
94+
Parse the persistent id out of a dataverse URL
9995
100-
Supports the following *file* URL styles:
101-
- /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458
102-
103-
Supports a subset of the following *file* URL styles:
104-
- /file.xhtml: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
96+
persistent_id can point to either a dataset or a file. The second return
97+
value is False if we know whether the persistent id is a file or a dataset,
98+
and True if it is ambiguous.
10599
106-
If a URL can not be parsed, throw an exception
100+
Raises a ValueError if we can not parse the url
107101
"""
108-
109-
parsed_url = urlparse(url)
102+
parsed_url= urlparse(url)
110103
path = parsed_url.path
111104
qs = parse_qs(parsed_url.query)
112-
dataverse_host = f"{parsed_url.scheme}://{parsed_url.netloc}"
113-
url_kind = None
114-
persistent_id = None
115-
is_ambiguous = False
105+
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
116106

107+
is_ambiguous = False
117108
# https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
118109
if path.startswith("/citation"):
119110
is_ambiguous = True
@@ -124,35 +115,59 @@ def get_datafiles(self, dataverse_host: str, url: str) -> List[dict]:
124115
persistent_id = qs["persistentId"][0]
125116
elif path.startswith("/api/access/datafile"):
126117
# What we have here is an entity id, which we can use to get a persistentId
127-
file_id = os.path.basename(parsed_url.path)
128-
persistent_id = self.get_dataset_id_from_file_id(dataverse_host, file_id)
118+
file_id = os.path.basename(path)
119+
persistent_id = self.get_dataset_id_from_file_id(base_url, file_id)
129120
elif parsed_url.path.startswith("/file.xhtml"):
130121
file_persistent_id = qs["persistentId"][0]
131122
persistent_id = self.get_dataset_id_from_file_id(
132-
dataverse_host, file_persistent_id
123+
base_url, file_persistent_id
133124
)
134125
else:
135126
raise ValueError(
136127
f"Could not determine persistent id for dataverse URL {url}"
137128
)
138129

130+
return persistent_id, is_ambiguous
131+
132+
def get_datafiles(self, url: str) -> List[dict]:
133+
"""
134+
Return a list of dataFiles for given persistent_id
135+
136+
Supports the following *dataset* URL styles:
137+
- /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
138+
- /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
139+
140+
Supports the following *file* URL styles:
141+
- /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458
142+
143+
Supports a subset of the following *file* URL styles:
144+
- /file.xhtml: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
145+
146+
If a URL can not be parsed, throw an exception
147+
"""
148+
149+
parsed_url = urlparse(url)
150+
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
151+
152+
persistent_id, is_ambiguous = self.parse_dataverse_url(url)
153+
139154
dataset_api_url = (
140-
f"{dataverse_host}/api/datasets/:persistentId?persistentId={persistent_id}"
155+
f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
141156
)
142157
resp = self._request(dataset_api_url, headers={"accept": "application/json"})
143158
if resp.status_code == 404 and is_ambiguous:
144159
# It's possible this is a *file* persistent_id, not a dataset one
145160
persistent_id = self.get_dataset_id_from_file_id(
146-
dataverse_host, persistent_id
161+
base_url, persistent_id
147162
)
148-
dataset_api_url = f"{dataverse_host}/api/datasets/:persistentId?persistentId={persistent_id}"
163+
dataset_api_url = f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
149164
resp = self._request(
150165
dataset_api_url, headers={"accept": "application/json"}
151166
)
152167

153168
if resp.status_code == 404:
154169
# This persistent id is just not here
155-
raise ValueError(f"{persistent_id} on {dataverse_host} is not found")
170+
raise ValueError(f"{persistent_id} on {base_url} is not found")
156171

157172
# We already handled 404, raise error for everything else
158173
resp.raise_for_status()
@@ -163,15 +178,17 @@ def get_datafiles(self, dataverse_host: str, url: str) -> List[dict]:
163178

164179
def fetch(self, spec, output_dir, yield_output=False):
165180
"""Fetch and unpack a Dataverse dataset."""
166-
url = spec["url"]
167-
host = spec["host"]
181+
url = spec
182+
parsed_url = urlparse(url)
183+
# FIXME: Support determining API URL better
184+
base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'
168185

169186
yield f"Fetching Dataverse record {url}.\n"
170187

171-
for fobj in self.get_datafiles(host["url"], url):
188+
for fobj in self.get_datafiles(url):
172189
file_url = (
173190
# without format=original you get the preservation format (plain text, tab separated)
174-
f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
191+
f'{base_url}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
175192
)
176193
filename = fobj["label"]
177194
original_filename = fobj["dataFile"].get("originalFileName", None)

tests/contentproviders/test_dataverse.py

Lines changed: 12 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -6,48 +6,28 @@
66

77
from repo2docker.contentproviders import Dataverse
88

9-
test_dv = Dataverse()
10-
harvard_dv = next(_ for _ in test_dv.hosts if _["name"] == "Harvard Dataverse")
11-
cimmyt_dv = next(_ for _ in test_dv.hosts if _["name"] == "CIMMYT Research Data")
12-
13-
149
@pytest.mark.parametrize(
1510
("doi", "resolved"),
1611
[
1712
(
1813
"doi:10.7910/DVN/6ZXAGT/3YRRYJ",
19-
{
20-
"host": harvard_dv,
21-
"url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
22-
},
14+
"https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
2315
),
2416
(
2517
"10.7910/DVN/6ZXAGT/3YRRYJ",
26-
{
27-
"host": harvard_dv,
28-
"url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
29-
},
18+
"https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
3019
),
3120
(
3221
"10.7910/DVN/TJCLKP",
33-
{
34-
"host": harvard_dv,
35-
"url": "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
36-
},
22+
"https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
3723
),
3824
(
3925
"https://dataverse.harvard.edu/api/access/datafile/3323458",
40-
{
41-
"host": harvard_dv,
42-
"url": "https://dataverse.harvard.edu/api/access/datafile/3323458",
43-
},
26+
"https://dataverse.harvard.edu/api/access/datafile/3323458",
4427
),
4528
(
4629
"https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
47-
{
48-
"host": cimmyt_dv,
49-
"url": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
50-
},
30+
"https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
5131
),
5232
("/some/random/string", None),
5333
("https://example.com/path/here", None),
@@ -60,28 +40,32 @@ def test_detect(doi, resolved):
6040

6141

6242
@pytest.mark.parametrize(
63-
("url", "persistent_id"),
43+
("url", "persistent_id", "is_ambiguous"),
6444
[
6545
(
6646
"https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
6747
"doi:10.7910/DVN/6ZXAGT",
48+
False
6849
),
6950
(
7051
"https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
7152
"doi:10.7910/DVN/TJCLKP",
53+
True
7254
),
7355
(
7456
"https://dataverse.harvard.edu/api/access/datafile/3323458",
7557
"doi:10.7910/DVN/3MJ7IR",
58+
False
7659
),
7760
(
7861
"https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
7962
"hdl:11529/10016",
63+
False
8064
),
8165
],
8266
)
83-
def test_get_persistent_id(url, persistent_id):
84-
assert Dataverse().get_persistent_id_from_url(url) == persistent_id
67+
def test_get_persistent_id(url, persistent_id, is_ambiguous):
68+
assert Dataverse().parse_dataverse_url(url) == (persistent_id, is_ambiguous)
8569

8670

8771
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)