2
2
import json
3
3
import os
4
4
import shutil
5
- from typing import List
5
+ from typing import List , Tuple
6
6
from urllib .parse import parse_qs , urlparse
7
7
8
8
from ..utils import copytree , deep_get , is_doi
@@ -67,53 +67,44 @@ def detect(self, spec, ref=None, extra_args=None):
67
67
#
68
68
# We don't know exactly what kind of dataverse object this is, but
69
69
# that can be figured out during fetch as needed
70
- return { "host" : host , " url" : url }
70
+ return url
71
71
72
def get_dataset_id_from_file_id(self, base_url: str, file_id: str) -> str:
    """
    Return the persistent_id (DOI) of the dataset that a given file belongs to.

    base_url is the scheme and host of the Dataverse installation, without a
    trailing slash (for example https://dataverse.harvard.edu). file_id is
    either the numeric entity id of the file or the file's own persistent id.

    Raises a ValueError if the installation answers 404 for the file. Any
    other unsuccessful response is surfaced through resp.raise_for_status().
    """
    if file_id.isdigit():
        # the file_id is an integer, rather than a persistent id (DOI)
        api_url = f"{base_url}/api/files/{file_id}?returnDatasetVersion=true"
    else:
        # the file_id is a doi itself
        api_url = (
            f"{base_url}/api/files/:persistentId"
            f"?persistentId={file_id}&returnDatasetVersion=true"
        )

    resp = self._request(api_url)
    if resp.status_code == 404:
        # Report a missing file with a clear message instead of a generic HTTP error
        raise ValueError(f"File with id {file_id} not found in {base_url}")

    # We already handled 404, raise error for everything else
    resp.raise_for_status()

    # returnDatasetVersion=true makes the response embed the owning dataset version
    data = resp.json()["data"]
    return data["datasetVersion"]["datasetPersistentId"]
91
91
92
- def get_datafiles (self , dataverse_host : str , url : str ) -> List [ dict ]:
92
+ def parse_dataverse_url (self , url : str ) -> Tuple [ str , bool ]:
93
93
"""
94
- Return a list of dataFiles for given persistent_id
95
-
96
- Supports the following *dataset* URL styles:
97
- - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
98
- - /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
94
+ Parse the persistent id out of a dataverse URL
99
95
100
- Supports the following *file* URL styles:
101
- - /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458
102
-
103
- Supports a subset of the following *file* URL styles:
104
- - /file.xhtml: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
96
+ persistent_id can point to either a dataset or a file. The second return
97
+ value is False if we know that the persistent id is a file or a dataset,
98
+ and True if it is ambiguous.
105
99
106
- If a URL can not be parsed, throw an exception
100
+ Raises a ValueError if we can not parse the url
107
101
"""
108
-
109
- parsed_url = urlparse (url )
102
+ parsed_url = urlparse (url )
110
103
path = parsed_url .path
111
104
qs = parse_qs (parsed_url .query )
112
- dataverse_host = f"{ parsed_url .scheme } ://{ parsed_url .netloc } "
113
- url_kind = None
114
- persistent_id = None
115
- is_ambiguous = False
105
+ base_url = f"{ parsed_url .scheme } ://{ parsed_url .netloc } "
116
106
107
+ is_ambiguous = False
117
108
# https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
118
109
if path .startswith ("/citation" ):
119
110
is_ambiguous = True
@@ -124,35 +115,59 @@ def get_datafiles(self, dataverse_host: str, url: str) -> List[dict]:
124
115
persistent_id = qs ["persistentId" ][0 ]
125
116
elif path .startswith ("/api/access/datafile" ):
126
117
# What we have here is an entity id, which we can use to get a persistentId
127
- file_id = os .path .basename (parsed_url . path )
128
- persistent_id = self .get_dataset_id_from_file_id (dataverse_host , file_id )
118
+ file_id = os .path .basename (path )
119
+ persistent_id = self .get_dataset_id_from_file_id (base_url , file_id )
129
120
elif parsed_url .path .startswith ("/file.xhtml" ):
130
121
file_persistent_id = qs ["persistentId" ][0 ]
131
122
persistent_id = self .get_dataset_id_from_file_id (
132
- dataverse_host , file_persistent_id
123
+ base_url , file_persistent_id
133
124
)
134
125
else :
135
126
raise ValueError (
136
127
f"Could not determine persistent id for dataverse URL { url } "
137
128
)
138
129
130
+ return persistent_id , is_ambiguous
131
+
132
+ def get_datafiles (self , url : str ) -> List [dict ]:
133
+ """
134
+ Return a list of dataFiles for given persistent_id
135
+
136
+ Supports the following *dataset* URL styles:
137
+ - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
138
+ - /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
139
+
140
+ Supports the following *file* URL styles:
141
+ - /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458
142
+
143
+ Supports a subset of the following *file* URL styles:
144
+ - /file.xhtml: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
145
+
146
+ If a URL can not be parsed, throw an exception
147
+ """
148
+
149
+ parsed_url = urlparse (url )
150
+ base_url = f"{ parsed_url .scheme } ://{ parsed_url .netloc } "
151
+
152
+ persistent_id , is_ambiguous = self .parse_dataverse_url (url )
153
+
139
154
dataset_api_url = (
140
- f"{ dataverse_host } /api/datasets/:persistentId?persistentId={ persistent_id } "
155
+ f"{ base_url } /api/datasets/:persistentId?persistentId={ persistent_id } "
141
156
)
142
157
resp = self ._request (dataset_api_url , headers = {"accept" : "application/json" })
143
158
if resp .status_code == 404 and is_ambiguous :
144
159
# It's possible this is a *file* persistent_id, not a dataset one
145
160
persistent_id = self .get_dataset_id_from_file_id (
146
- dataverse_host , persistent_id
161
+ base_url , persistent_id
147
162
)
148
- dataset_api_url = f"{ dataverse_host } /api/datasets/:persistentId?persistentId={ persistent_id } "
163
+ dataset_api_url = f"{ base_url } /api/datasets/:persistentId?persistentId={ persistent_id } "
149
164
resp = self ._request (
150
165
dataset_api_url , headers = {"accept" : "application/json" }
151
166
)
152
167
153
168
if resp .status_code == 404 :
154
169
# This persistent id is just not here
155
- raise ValueError (f"{ persistent_id } on { dataverse_host } is not found" )
170
+ raise ValueError (f"{ persistent_id } on { base_url } is not found" )
156
171
157
172
# We already handled 404, raise error for everything else
158
173
resp .raise_for_status ()
@@ -163,15 +178,17 @@ def get_datafiles(self, dataverse_host: str, url: str) -> List[dict]:
163
178
164
179
def fetch (self , spec , output_dir , yield_output = False ):
165
180
"""Fetch and unpack a Dataverse dataset."""
166
- url = spec ["url" ]
167
- host = spec ["host" ]
181
+ url = spec
182
+ parsed_url = urlparse (url )
183
+ # FIXME: Support determining API URL better
184
+ base_url = f'{ parsed_url .scheme } ://{ parsed_url .netloc } '
168
185
169
186
yield f"Fetching Dataverse record { url } .\n "
170
187
171
- for fobj in self .get_datafiles (host [ "url" ], url ):
188
+ for fobj in self .get_datafiles (url ):
172
189
file_url = (
173
190
# without format=original you get the preservation format (plain text, tab separated)
174
- f'{ host [ "url" ] } /api/access/datafile/{ deep_get (fobj , "dataFile.id" )} ?format=original'
191
+ f'{ base_url } /api/access/datafile/{ deep_get (fobj , "dataFile.id" )} ?format=original'
175
192
)
176
193
filename = fobj ["label" ]
177
194
original_filename = fobj ["dataFile" ].get ("originalFileName" , None )
0 commit comments