Skip to content

Commit 172f8b0

Browse files
committed
[WIP] Cleanup dataverse contentprovider
1 parent 3eab292 commit 172f8b0

File tree

3 files changed

+92
-46
lines changed

3 files changed

+92
-46
lines changed

repo2docker/contentproviders/dataverse.py

Lines changed: 72 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import shutil
44
from urllib.parse import parse_qs, urlparse, urlunparse
55

6-
from ..utils import copytree, deep_get
6+
from ..utils import copytree, deep_get, is_doi
77
from .doi import DoiProvider
88

99

@@ -23,10 +23,11 @@ def __init__(self):
2323
self.hosts = json.load(fp)["installations"]
2424
super().__init__()
2525

26-
def detect(self, doi, ref=None, extra_args=None):
27-
"""Trigger this provider for things that resolve to a Dataverse dataset.
26+
def detect(self, spec, ref=None, extra_args=None):
27+
"""
28+
Detect if given spec is hosted on dataverse
2829
29-
Handles:
30+
The spec can be:
3031
- DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
3132
- DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
3233
- URL {siteURL}/api/access/datafile/{fileId}
@@ -35,9 +36,11 @@ def detect(self, doi, ref=None, extra_args=None):
3536
- https://dataverse.harvard.edu/api/access/datafile/3323458
3637
- doi:10.7910/DVN/6ZXAGT
3738
- doi:10.7910/DVN/6ZXAGT/3YRRYJ
38-
3939
"""
40-
url = self.doi2url(doi)
40+
if is_doi(spec):
41+
url = self.doi2url(spec)
42+
else:
43+
url = spec
4144
# Parse the url, to get the base for later API calls
4245
parsed_url = urlparse(url)
4346

@@ -53,51 +56,77 @@ def detect(self, doi, ref=None, extra_args=None):
5356
if host is None:
5457
return
5558

56-
query_args = parse_qs(parsed_url.query)
57-
# Corner case handling
58-
if parsed_url.path.startswith("/file.xhtml"):
59-
# There's no way of getting file information using its persistentId, the only thing we can do is assume that doi
60-
# is structured as "doi:<dataset_doi>/<file_doi>" and try to handle dataset that way.
61-
new_doi = doi.rsplit("/", 1)[0]
62-
if new_doi == doi:
63-
# tough luck :( Avoid infinite recursion and exit.
64-
return
65-
return self.detect(new_doi)
66-
elif parsed_url.path.startswith("/api/access/datafile"):
67-
# Raw url pointing to a datafile is a typical output from an External Tool integration
59+
# At this point, we *know* this is a dataverse URL, because:
60+
# 1. The DOI resolved to a particular host (if using DOI)
61+
# 2. The host is in the list of known dataverse installations
62+
#
63+
# We don't know exactly what kind of dataverse object this is, but
64+
# that can be figured out during fetch as needed
65+
return {"host": host, "url": url}
66+
67+
def get_persistent_id_from_url(self, url: str) -> str:
68+
"""
69+
Return the persistentId for given dataverse URL.
70+
71+
Supports the following *dataset* URL styles:
72+
- /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
73+
- /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
74+
75+
Supports the following *file* URL styles:
76+
- /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458
77+
78+
Supports a subset of the following *file* URL styles:
79+
- /file.xhtml: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
80+
81+
If a URL cannot be parsed, raise a ValueError
82+
"""
83+
parsed_url = urlparse(url)
84+
path = parsed_url.path
85+
qs = parse_qs(parsed_url.query)
86+
87+
# https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
88+
# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
89+
if path.startswith("/citation") or path.startswith("/dataset.xhtml"):
90+
return qs["persistentId"][0]
91+
# https://dataverse.harvard.edu/api/access/datafile/3323458
92+
elif path.startswith("/api/access/datafile"):
93+
# What we have here is an entity id, which we can use to get a persistentId
6894
entity_id = os.path.basename(parsed_url.path)
69-
search_query = "q=entityId:" + entity_id + "&type=file"
70-
# Knowing the file identifier query search api to get parent dataset
71-
search_url = urlunparse(
95+
# FIXME: Should we be URL Encoding something here to protect from path traversal
96+
# or similar attacks?
97+
search_query = f"q=entityId:{entity_id}&type=file"
98+
search_api_url = urlunparse(
7299
parsed_url._replace(path="/api/search", query=search_query)
73100
)
74-
self.log.debug("Querying Dataverse: " + search_url)
75-
data = self.urlopen(search_url).json()["data"]
101+
self.log.debug("Querying Dataverse: " + search_api_url)
102+
data = self.urlopen(search_api_url).json()["data"]
76103
if data["count_in_response"] != 1:
77-
self.log.debug(
78-
f"Dataverse search query failed!\n - doi: {doi}\n - url: {url}\n - resp: {json.dump(data)}\n"
104+
raise ValueError(
105+
f"Dataverse search query failed!\n - url: {url}\n - resp: {json.dumps(data)}\n"
79106
)
80-
return
81-
82-
self.record_id = deep_get(data, "items.0.dataset_persistent_id")
83-
elif (
84-
parsed_url.path.startswith("/dataset.xhtml")
85-
and "persistentId" in query_args
86-
):
87-
self.record_id = deep_get(query_args, "persistentId.0")
107+
return data["items"][0]["dataset_persistent_id"]
108+
elif parsed_url.path.startswith("/file.xhtml"):
109+
file_persistent_id = qs['persistentId'][0]
110+
dataset_persistent_id = file_persistent_id.rsplit("/", 1)[0]
111+
if file_persistent_id == dataset_persistent_id:
112+
# We can't figure this one out, throw an error
113+
raise ValueError(f"Could not find dataset id for {url}")
114+
return dataset_persistent_id
88115

89-
if hasattr(self, "record_id"):
90-
return {"record": self.record_id, "host": host}
116+
raise ValueError(f"Could not determine persistent id for dataverse URL {url}")
91117

92118
def fetch(self, spec, output_dir, yield_output=False):
93119
"""Fetch and unpack a Dataverse dataset."""
94-
record_id = spec["record"]
120+
url = spec["url"]
95121
host = spec["host"]
96122

97-
yield f"Fetching Dataverse record {record_id}.\n"
98-
url = f'{host["url"]}/api/datasets/:persistentId?persistentId={record_id}'
123+
persistent_id = self.get_persistent_id_from_url(url)
124+
125+
yield f"Fetching Dataverse record {persistent_id}.\n"
126+
url = f'{host["url"]}/api/datasets/:persistentId?persistentId={persistent_id}'
99127

100128
resp = self.urlopen(url, headers={"accept": "application/json"})
129+
print(resp.json())
101130
record = resp.json()["data"]
102131

103132
for fobj in deep_get(record, "latestVersion.files"):
@@ -126,7 +155,11 @@ def fetch(self, spec, output_dir, yield_output=False):
126155
copytree(os.path.join(output_dir, d), output_dir)
127156
shutil.rmtree(os.path.join(output_dir, d))
128157

158+
159+
# Save persistent id
160+
# Must be `persistent_id` (not `persitent_id`): the content_id property reads self.persistent_id
self.persistent_id = persistent_id
161+
129162
@property
130163
def content_id(self):
131164
"""The Dataverse persistent identifier."""
132-
return self.record_id
165+
return self.persistent_id

repo2docker/contentproviders/doi.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ def doi2url(self, doi):
5050

5151
# Use the doi.org resolver API
5252
# documented at https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation#5-proxy-server-rest-api
53-
req_url = f"https://doi.org/api/handles/{normalize_doi}"
53+
req_url = f"https://doi.org/api/handles/{normalized_doi}"
5454
resp = self._request(req_url)
5555
if resp.status_code == 404:
5656
# Not a doi, return what we were passed in
@@ -60,7 +60,7 @@ def doi2url(self, doi):
6060
# Pick the first URL we find from the doi response
6161
for v in data["values"]:
6262
if v["type"] == "URL":
63-
return v["data"]["string"]
63+
return v["data"]["value"]
6464

6565
# No URLs found for this doi, what do we do?
6666
# f-string so the DOI is actually interpolated into the log message
self.log.error(f"DOI {normalized_doi} doesn't point to any URLs")

tests/contentproviders/test_dataverse.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,11 @@
1313
@pytest.mark.parametrize(
1414
("doi", "resolved"),
1515
[
16-
("doi:10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}),
17-
("10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}),
18-
("https://dataverse.harvard.edu/api/access/datafile/3323458", {"host": harvard_dv, "record": "doi:10.7910/DVN/3MJ7IR"}),
19-
("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", {"host": cimmyt_dv, "record": "hdl:11529/10016"}),
16+
("doi:10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"}),
17+
("10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"}),
18+
("10.7910/DVN/TJCLKP", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP"}),
19+
("https://dataverse.harvard.edu/api/access/datafile/3323458", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/api/access/datafile/3323458"}),
20+
("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", {"host": cimmyt_dv, "url": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016"}),
2021
("/some/random/string", None),
2122
("https://example.com/path/here", None),
2223
# Non dataverse DOIs
@@ -27,10 +28,22 @@ def test_detect(doi, resolved):
2728
assert Dataverse().detect(doi) == resolved
2829

2930

31+
@pytest.mark.parametrize(
32+
("url", "persistent_id"),
33+
[
34+
("https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", "doi:10.7910/DVN/6ZXAGT"),
35+
("https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP", "doi:10.7910/DVN/TJCLKP"),
36+
("https://dataverse.harvard.edu/api/access/datafile/3323458", "doi:10.7910/DVN/3MJ7IR"),
37+
("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", "hdl:11529/10016"),
38+
]
39+
)
40+
def test_get_persistent_id(url, persistent_id):
41+
assert Dataverse().get_persistent_id_from_url(url) == persistent_id
42+
3043
def test_dataverse_fetch():
31-
spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/TJCLKP"}
3244

3345
dv = Dataverse()
46+
spec = dv.detect("doi:10.7910/DVN/TJCLKP")
3447

3548
with TemporaryDirectory() as d:
3649
output = []

0 commit comments

Comments
 (0)