@@ -64,6 +64,26 @@ def detect(self, spec, ref=None, extra_args=None):
64
64
# that can be figured out during fetch as needed
65
65
return {"host" : host , "url" : url }
66
66
67
def get_dataset_id_from_file_id(self, host: str, file_id: str) -> str:
    """
    Return the persistent id (DOI) of the dataset that a given file belongs to.

    Args:
        host: Base URL of the Dataverse installation. It is used verbatim as
            the prefix of the API URL.
        file_id: Either a numeric file id (digits only) or the file's own
            persistent id (DOI). An ``int`` is accepted and converted to ``str``.

    Returns:
        The ``datasetPersistentId`` found under ``datasetVersion`` in the
        ``data`` section of the API response.

    Raises:
        ValueError: If the API responds with 404 for this file.
        Exception: Whatever ``resp.raise_for_status()`` raises for any other
            error status of the response returned by ``self._request``.
    """
    # Function-scope import so this method does not depend on the module's import block.
    from urllib.parse import quote

    # The id is documented as "int or doi"; normalise so .isdigit() is always valid.
    file_id = str(file_id)

    if file_id.isdigit():
        # the file_id is an integer, rather than a persistent id (DOI)
        api_url = f"{host}/api/files/{file_id}?returnDatasetVersion=true"
    else:
        # the file_id is a doi itself. It originates from a user supplied URL,
        # so escape anything that could alter the query string ("&", "#", "=", ...)
        # while leaving the ":" and "/" of a regular DOI untouched.
        encoded_id = quote(file_id, safe="/:")
        api_url = f"{host}/api/files/:persistentId?persistentId={encoded_id}&returnDatasetVersion=true"

    resp = self._request(api_url)
    if resp.status_code == 404:
        raise ValueError(f"File with id {file_id} not found in {host}")

    # 404 was handled above, let every other error status surface as is
    resp.raise_for_status()

    data = resp.json()["data"]
    return data["datasetVersion"]["datasetPersistentId"]
86
+
67
87
def get_persistent_id_from_url (self , url : str ) -> str :
68
88
"""
69
89
Return the persistentId for given dataverse URL.
@@ -80,72 +100,56 @@ def get_persistent_id_from_url(self, url: str) -> str:
80
100
81
101
If a URL can not be parsed, throw an exception
82
102
"""
103
+
104
def get_datafiles(self, dataverse_host: str, url: str) -> list[dict]:
    """
    Return the list of files of the dataset that a Dataverse URL refers to.

    The URL may point at a dataset or at a single file. For a file, the
    dataset containing it is looked up first, so the result is always the
    file list of a whole dataset.

    Recognised URL paths:

    - ``/citation?persistentId=...`` (may be a dataset *or* a file id)
    - ``/dataset.xhtml?persistentId=...``
    - ``/api/access/datafile/<file id>``
    - ``/file.xhtml?persistentId=...``

    Args:
        dataverse_host: Kept for interface compatibility. The value is
            overwritten with the scheme and host taken from ``url`` before
            it is ever read.
        url: The Dataverse URL to resolve.

    Returns:
        The ``files`` list found under ``latestVersion`` in the ``data``
        section of the dataset API response.

    Raises:
        ValueError: If no persistent id can be determined from ``url``, or
            if the dataset API responds with 404.
        Exception: Whatever ``resp.raise_for_status()`` raises for any other
            error status.
    """
    # Function-scope import so this method does not depend on the module's import block.
    from urllib.parse import quote

    parsed_url = urlparse(url)
    path = parsed_url.path
    qs = parse_qs(parsed_url.query)
    # All API calls go to the installation named in the URL itself
    dataverse_host = f"{parsed_url.scheme}://{parsed_url.netloc}"
    is_ambiguous = False

    def query_persistent_id() -> str:
        # A URL without a persistentId is as undeterminable as an unknown path,
        # so report it the same way instead of leaking a bare KeyError.
        try:
            return qs["persistentId"][0]
        except KeyError:
            raise ValueError(
                f"Could not determine persistent id for dataverse URL {url}"
            ) from None

    def request_dataset(dataset_id: str):
        # parse_qs already percent-decoded the id, so re-escape it for the query
        # string. ":" and "/" stay literal, leaving URLs for regular DOIs unchanged.
        dataset_api_url = (
            f"{dataverse_host}/api/datasets/:persistentId"
            f"?persistentId={quote(dataset_id, safe='/:')}"
        )
        return self._request(dataset_api_url, headers={"accept": "application/json"})

    if path.startswith("/citation"):
        # https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
        # The id here may belong to a dataset or to a file
        is_ambiguous = True
        persistent_id = query_persistent_id()
    elif path.startswith("/dataset.xhtml"):
        # https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
        persistent_id = query_persistent_id()
    elif path.startswith("/api/access/datafile"):
        # https://dataverse.harvard.edu/api/access/datafile/3323458
        # What we have here is a file id, which we can use to get the dataset's persistentId
        file_id = os.path.basename(path)
        persistent_id = self.get_dataset_id_from_file_id(dataverse_host, file_id)
    elif path.startswith("/file.xhtml"):
        file_persistent_id = query_persistent_id()
        persistent_id = self.get_dataset_id_from_file_id(
            dataverse_host, file_persistent_id
        )
    else:
        raise ValueError(f"Could not determine persistent id for dataverse URL {url}")

    resp = request_dataset(persistent_id)
    if resp.status_code == 404 and is_ambiguous:
        # It's possible this is a *file* persistent_id, not a dataset one
        persistent_id = self.get_dataset_id_from_file_id(dataverse_host, persistent_id)
        resp = request_dataset(persistent_id)

    if resp.status_code == 404:
        # This persistent id is just not here
        raise ValueError(f"{persistent_id} on {dataverse_host} is not found")

    # We already handled 404, raise error for everything else
    resp.raise_for_status()

    data = resp.json()["data"]
    return data["latestVersion"]["files"]
149
153
150
154
def fetch (self , spec , output_dir , yield_output = False ):
151
155
"""Fetch and unpack a Dataverse dataset."""
@@ -156,7 +160,7 @@ def fetch(self, spec, output_dir, yield_output=False):
156
160
157
161
yield f"Fetching Dataverse record { persistent_id } .\n "
158
162
159
- for fobj in self .get_datafiles (host ["url" ], persistent_id ):
163
+ for fobj in self .get_datafiles (host ["url" ], url ):
160
164
file_url = (
161
165
# without format=original you get the preservation format (plain text, tab separated)
162
166
f'{ host ["url" ]} /api/access/datafile/{ deep_get (fobj , "dataFile.id" )} ?format=original'
0 commit comments