Skip to content

Commit b7050ba

Browse files
committed
Always fetch entire dataset for dataverse
1 parent 1260a5a commit b7050ba

File tree

2 files changed

+103
-63
lines changed

2 files changed

+103
-63
lines changed

repo2docker/contentproviders/dataverse.py

Lines changed: 49 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,26 @@ def detect(self, spec, ref=None, extra_args=None):
6464
# that can be figured out during fetch as needed
6565
return {"host": host, "url": url}
6666

67+
def get_dataset_id_from_file_id(self, host: str, file_id: str) -> str:
    """
    Return the persistent_id (DOI) that a given file_id (int or doi) belongs to
    """
    if file_id.isdigit():
        # Numeric file_id: a Dataverse database id, looked up directly.
        api_url = f"{host}/api/files/{file_id}?returnDatasetVersion=true"
    else:
        # Otherwise the file_id is itself a persistent identifier (DOI).
        api_url = f"{host}/api/files/:persistentId?persistentId={file_id}&returnDatasetVersion=true"

    resp = self._request(api_url)
    if resp.status_code == 404:
        raise ValueError(f"File with id {file_id} not found in {host}")
    # Any non-404 error is unexpected; surface it to the caller.
    resp.raise_for_status()

    # returnDatasetVersion=true makes the response carry the owning dataset,
    # whose persistent id is what we are after.
    return resp.json()["data"]["datasetVersion"]["datasetPersistentId"]
6787
def get_persistent_id_from_url(self, url: str) -> str:
6888
"""
6989
Return the persistentId for given dataverse URL.
@@ -80,72 +100,56 @@ def get_persistent_id_from_url(self, url: str) -> str:
80100
81101
If a URL can not be parsed, throw an exception
82102
"""
103+
104+
def get_datafiles(self, dataverse_host: str, url: str) -> list[dict]:
    """
    Return the list of dataFiles of the dataset a given dataverse URL points to.

    `url` may reference the dataset itself (/citation or /dataset.xhtml pages),
    a single file in it (/file.xhtml pages or /api/access/datafile/<id>), or be
    ambiguous (a /citation persistentId can belong to either a dataset or a
    file). In every case the *entire* containing dataset is resolved and its
    file list returned.

    Raises ValueError if the URL cannot be mapped to a dataset, or if the
    resolved persistent id is not found on the host.
    """
    parsed_url = urlparse(url)
    path = parsed_url.path
    qs = parse_qs(parsed_url.query)
    # Always talk to the host embedded in the URL itself.
    dataverse_host = f"{parsed_url.scheme}://{parsed_url.netloc}"
    persistent_id = None
    # /citation URLs carry a persistentId that may name a dataset OR a file.
    is_ambiguous = False

    # https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
    if path.startswith("/citation"):
        is_ambiguous = True
        persistent_id = qs["persistentId"][0]
    # https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
    elif path.startswith("/dataset.xhtml"):
        persistent_id = qs["persistentId"][0]
    # https://dataverse.harvard.edu/api/access/datafile/3323458
    elif path.startswith("/api/access/datafile"):
        # What we have here is an entity id, which we can use to get a persistentId
        file_id = os.path.basename(parsed_url.path)
        persistent_id = self.get_dataset_id_from_file_id(dataverse_host, file_id)
    elif parsed_url.path.startswith("/file.xhtml"):
        file_persistent_id = qs["persistentId"][0]
        persistent_id = self.get_dataset_id_from_file_id(
            dataverse_host, file_persistent_id
        )
    else:
        raise ValueError(f"Could not determine persistent id for dataverse URL {url}")

    dataset_api_url = (
        f"{dataverse_host}/api/datasets/:persistentId?persistentId={persistent_id}"
    )
    resp = self._request(dataset_api_url, headers={"accept": "application/json"})
    if resp.status_code == 404 and is_ambiguous:
        # It's possible this is a *file* persistent_id, not a dataset one
        persistent_id = self.get_dataset_id_from_file_id(dataverse_host, persistent_id)
        dataset_api_url = (
            f"{dataverse_host}/api/datasets/:persistentId?persistentId={persistent_id}"
        )
        resp = self._request(dataset_api_url, headers={"accept": "application/json"})

    if resp.status_code == 404:
        # This persistent id is just not here
        raise ValueError(f"{persistent_id} on {dataverse_host} is not found")

    # We already handled 404, raise error for everything else
    resp.raise_for_status()

    data = resp.json()["data"]
    return data["latestVersion"]["files"]
149153

150154
def fetch(self, spec, output_dir, yield_output=False):
151155
"""Fetch and unpack a Dataverse dataset."""
@@ -156,7 +160,7 @@ def fetch(self, spec, output_dir, yield_output=False):
156160

157161
yield f"Fetching Dataverse record {persistent_id}.\n"
158162

159-
for fobj in self.get_datafiles(host["url"], persistent_id):
163+
for fobj in self.get_datafiles(host["url"], url):
160164
file_url = (
161165
# without format=original you get the preservation format (plain text, tab separated)
162166
f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'

tests/contentproviders/test_dataverse.py

Lines changed: 54 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -85,38 +85,74 @@ def test_get_persistent_id(url, persistent_id):
8585

8686

8787
@pytest.mark.parametrize(
    ("specs", "md5tree"),
    [
        (
            (
                "doi:10.7910/DVN/TJCLKP",
                "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
            ),
            {
                "data/primary/primary-data.zip": "a8f6fc3fc58f503cd48e23fa8b088694",
                "data/2023-01-03.tsv": "6fd497bf13dab9a06fe737ebc22f1917",
                "code/language.py": "9d61582bcf497c83bbd1ed0eed3c772e",
            },
        ),
        (
            # Several specs that should all resolve to the same full dataset
            (
                "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
                "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
                "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
            ),
            {
                "ArchaeoGLOBE-master/analysis/figures/1_response_distribution.png": "243c6a3dd66bc3c84102829b277ef333",
                "ArchaeoGLOBE-master/analysis/figures/2_trends_map_knowledge.png": "2ace6ae9d470dda6cf2f9f9a6588171a",
                "ArchaeoGLOBE-master/analysis/figures/3_trends_global.png": "63ccd0a7b2d20440cd8f418d4ee88c4d",
                "ArchaeoGLOBE-master/analysis/figures/4_consensus_transitions.png": "facfaedabeac77c4496d4b9e962a917f",
                "ArchaeoGLOBE-master/analysis/figures/5_ArchaeoGLOBE_HYDE_comparison.png": "8e002e4d50f179fc1808f562b1353588",
                "ArchaeoGLOBE-master/apt.txt": "b4224032da6c71d48f46c9b78fc6ed77",
                "ArchaeoGLOBE-master/analysis/archaeoglobe.pdf": "f575be4790efc963ef1bd40d097cc06d",
                "ArchaeoGLOBE-master/analysis/archaeoglobe.Rmd": "f37d5f7993fde9ebd64d16b20fc22905",
                "ArchaeoGLOBE-master/ArchaeoGLOBE.Rproj": "d0250e7918993bab1e707358fe5633e0",
                "ArchaeoGLOBE-master/CONDUCT.md": "f87ef290340322089c32b4e573d8f1e8",
                "ArchaeoGLOBE-master/.circleci/config.yml": "6eaa54073a682b3195d8fab3a9dd8344",
                "ArchaeoGLOBE-master/CONTRIBUTING.md": "b3a6abfc749dd155a3049f94a855bf9f",
                "ArchaeoGLOBE-master/DESCRIPTION": "745ef979494999e483987de72c0adfbd",
                "ArchaeoGLOBE-master/dockerfile": "aedce68e5a7d6e79cbb24c9cffeae593",
                "ArchaeoGLOBE-master/.binder/Dockerfile": "7564a41246ba99b60144afb1d3b6d7de",
                "ArchaeoGLOBE-master/.gitignore": "62c1482e4febbd35dc02fb7e2a31246b",
                "ArchaeoGLOBE-master/analysis/data/derived-data/hyde_crop_prop.RDS": "2aea7748b5586923b0de9d13af58e59d",
                "ArchaeoGLOBE-master/analysis/data/derived-data/kk_anthro_prop.RDS": "145a9e5dd2c95625626a720b52178b70",
                "ArchaeoGLOBE-master/LICENSE.md": "3aa9d41a92a57944bd4590e004898445",
                "ArchaeoGLOBE-master/analysis/data/derived-data/placeholder": "d41d8cd98f00b204e9800998ecf8427e",
                "ArchaeoGLOBE-master/.Rbuildignore": "df15e4fed49abd685b536fef4472b01f",
                "ArchaeoGLOBE-master/README.md": "0b0faabe580c4d76a0e0d64a4f54bca4",
                "ArchaeoGLOBE-master/analysis/data/derived-data/README.md": "547fd1a6e874f6178b1cf525b5b9ae72",
                "ArchaeoGLOBE-master/analysis/figures/S1_FHG_consensus.png": "d2584352e5442b33e4b23e361ca70fe1",
                "ArchaeoGLOBE-master/analysis/figures/S2_EXAG_consensus.png": "513eddfdad01fd01a20263a55ca6dbe3",
                "ArchaeoGLOBE-master/analysis/figures/S3_INAG_consensus.png": "b16ba0ecd21b326f873209a7e55a8deb",
                "ArchaeoGLOBE-master/analysis/figures/S4_PAS_consensus.png": "05695f9412337a00c1cb6d1757d0ec5c",
                "ArchaeoGLOBE-master/analysis/figures/S5_URBAN_consensus.png": "10119f7495d3b8e7ad7f8a0770574f15",
                "ArchaeoGLOBE-master/analysis/figures/S6_trends_map_landuse.png": "b1db7c97f39ccfc3a9e094c3e6307af0",
                "ArchaeoGLOBE-master/analysis/figures/S7_ArchaeoGLOBE_KK10_comparison.png": "30341748324f5f66acadb34c114c3e9d",
            },
        ),
    ],
)
def test_fetch(specs: list[str], md5tree):
    """Every spec in `specs` must fetch the same, complete dataset."""
    dv = Dataverse()

    for spec in specs:
        with TemporaryDirectory() as d:
            output = []
            for line in dv.fetch(dv.detect(spec), d):
                output.append(line)

            # Verify md5 sum of the files we expect to find
            # We are using md5 instead of something more secure because that is what
            # dataverse itself uses
            for subpath, expected_md5 in md5tree.items():
                with open(os.path.join(d, subpath), "rb") as f:
                    h = hashlib.md5()
                    h.update(f.read())
                assert h.hexdigest() == expected_md5

0 commit comments

Comments
 (0)