3
3
import shutil
4
4
from urllib .parse import parse_qs , urlparse , urlunparse
5
5
6
- from ..utils import copytree , deep_get
6
+ from ..utils import copytree , deep_get , is_doi
7
7
from .doi import DoiProvider
8
8
9
9
@@ -23,10 +23,11 @@ def __init__(self):
23
23
self .hosts = json .load (fp )["installations" ]
24
24
super ().__init__ ()
25
25
26
- def detect (self , doi , ref = None , extra_args = None ):
27
- """Trigger this provider for things that resolve to a Dataverse dataset.
26
+ def detect (self , spec , ref = None , extra_args = None ):
27
+ """
28
+ Detect if given spec is hosted on dataverse
28
29
29
- Handles :
30
+ The spec can be :
30
31
- DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
31
32
- DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
32
33
- URL {siteURL}/api/access/datafile/{fileId}
@@ -35,9 +36,11 @@ def detect(self, doi, ref=None, extra_args=None):
35
36
- https://dataverse.harvard.edu/api/access/datafile/3323458
36
37
- doi:10.7910/DVN/6ZXAGT
37
38
- doi:10.7910/DVN/6ZXAGT/3YRRYJ
38
-
39
39
"""
40
- url = self .doi2url (doi )
40
+ if is_doi (spec ):
41
+ url = self .doi2url (spec )
42
+ else :
43
+ url = spec
41
44
# Parse the url, to get the base for later API calls
42
45
parsed_url = urlparse (url )
43
46
@@ -53,51 +56,77 @@ def detect(self, doi, ref=None, extra_args=None):
53
56
if host is None :
54
57
return
55
58
56
- query_args = parse_qs (parsed_url .query )
57
- # Corner case handling
58
- if parsed_url .path .startswith ("/file.xhtml" ):
59
- # There's no way of getting file information using its persistentId, the only thing we can do is assume that doi
60
- # is structured as "doi:<dataset_doi>/<file_doi>" and try to handle dataset that way.
61
- new_doi = doi .rsplit ("/" , 1 )[0 ]
62
- if new_doi == doi :
63
- # tough luck :( Avoid inifite recursion and exit.
64
- return
65
- return self .detect (new_doi )
66
- elif parsed_url .path .startswith ("/api/access/datafile" ):
67
- # Raw url pointing to a datafile is a typical output from an External Tool integration
59
+ # At this point, we *know* this is a dataverse URL, because:
60
+ # 1. The DOI resolved to a particular host (if using DOI)
61
+ # 2. The host is in the list of known dataverse installations
62
+ #
63
+ # We don't know exactly what kind of dataverse object this is, but
64
+ # that can be figured out during fetch as needed
65
+ return {"host" : host , "url" : url }
66
+
67
def get_persistent_id_from_url(self, url: str) -> str:
    """
    Return the *dataset* persistentId for the given dataverse URL.

    Supports the following *dataset* URL styles:
    - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
    - /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP

    Supports the following *file* URL styles:
    - /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458
      The last path segment is treated as an entity id and looked up with
      a request to /api/search on the same host (via ``self.urlopen``).

    Supports a subset of the following *file* URL styles:
    - /file.xhtml: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
      Only works when the file persistentId looks like
      "<dataset id>/<file id>"; everything before the last "/" is returned.

    Raises ValueError if the URL style is not recognised, if a required
    ``persistentId`` query parameter is missing, or if the search lookup
    does not return exactly one result.
    """
    parsed_url = urlparse(url)
    path = parsed_url.path

    # parse_qs drops blank values by default, so an absent and an empty
    # persistentId both end up as None here.
    persistent_ids = parse_qs(parsed_url.query).get("persistentId")
    persistent_id = persistent_ids[0] if persistent_ids else None

    # https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
    # https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
    if path.startswith(("/citation", "/dataset.xhtml")):
        if persistent_id is None:
            raise ValueError(
                f"No persistentId query parameter in dataverse URL {url}"
            )
        return persistent_id

    # https://dataverse.harvard.edu/api/access/datafile/3323458
    if path.startswith("/api/access/datafile"):
        # Local import: the file-level urllib.parse import does not include quote.
        from urllib.parse import quote

        # What we have here is an entity id, which we can use to get a persistentId
        entity_id = path.rsplit("/", 1)[-1]
        if not entity_id:
            raise ValueError(f"Could not find a file id in dataverse URL {url}")

        # Percent-encode the id so characters such as "&" or "#" in the last
        # path segment cannot add parameters to (or truncate) the search query.
        # Plain numeric ids are left untouched by quote().
        search_query = f"q=entityId:{quote(entity_id, safe='')}&type=file"
        search_api_url = urlunparse(
            parsed_url._replace(path="/api/search", query=search_query)
        )
        self.log.debug("Querying Dataverse: %s", search_api_url)
        data = self.urlopen(search_api_url).json()["data"]
        if data["count_in_response"] != 1:
            raise ValueError(
                f"Dataverse search query failed!\n - url: {url}\n - resp: {json.dumps(data)}\n"
            )
        return data["items"][0]["dataset_persistent_id"]

    # https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
    if path.startswith("/file.xhtml"):
        if persistent_id is None:
            raise ValueError(
                f"No persistentId query parameter in dataverse URL {url}"
            )
        dataset_persistent_id = persistent_id.rsplit("/", 1)[0]
        if dataset_persistent_id == persistent_id:
            # No "/" to split on: we can't figure this one out, throw an error
            raise ValueError(f"Could not find dataset id for {url}")
        return dataset_persistent_id

    raise ValueError(f"Could not determine persistent id for dataverse URL {url}")
91
117
92
118
def fetch (self , spec , output_dir , yield_output = False ):
93
119
"""Fetch and unpack a Dataverse dataset."""
94
- record_id = spec ["record " ]
120
+ url = spec ["url " ]
95
121
host = spec ["host" ]
96
122
97
- yield f"Fetching Dataverse record { record_id } .\n "
98
- url = f'{ host ["url" ]} /api/datasets/:persistentId?persistentId={ record_id } '
123
+ persistent_id = self .get_persistent_id_from_url (url )
124
+
125
+ yield f"Fetching Dataverse record { persistent_id } .\n "
126
+ url = f'{ host ["url" ]} /api/datasets/:persistentId?persistentId={ persistent_id } '
99
127
100
128
resp = self .urlopen (url , headers = {"accept" : "application/json" })
129
+ print (resp .json ())
101
130
record = resp .json ()["data" ]
102
131
103
132
for fobj in deep_get (record , "latestVersion.files" ):
@@ -126,7 +155,11 @@ def fetch(self, spec, output_dir, yield_output=False):
126
155
copytree (os .path .join (output_dir , d ), output_dir )
127
156
shutil .rmtree (os .path .join (output_dir , d ))
128
157
158
+
159
+ # Save persistent id
160
+ self .persistent_id = persistent_id
161
+
129
162
@property
def content_id(self):
    """Return the Dataverse persistent identifier held in ``self.persistent_id``."""
    identifier = self.persistent_id
    return identifier
0 commit comments