1+ import hashlib
12import json
23import os
34import shutil
4- from urllib .parse import parse_qs , urlparse , urlunparse
5+ from typing import List , Tuple
6+ from urllib .parse import parse_qs , urlparse
57
6- from ..utils import copytree , deep_get
8+ from ..utils import copytree , deep_get , is_doi
79from .doi import DoiProvider
810
911
@@ -23,10 +25,11 @@ def __init__(self):
2325 self .hosts = json .load (fp )["installations" ]
2426 super ().__init__ ()
2527
26- def detect (self , doi , ref = None , extra_args = None ):
27- """Trigger this provider for things that resolve to a Dataverse dataset.
28+ def detect (self , spec , ref = None , extra_args = None ):
29+ """
30+ Detect if given spec is hosted on dataverse
2831
29- Handles :
32+ The spec can be :
3033 - DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
3134 - DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
3235 - URL {siteURL}/api/access/datafile/{fileId}
@@ -35,9 +38,11 @@ def detect(self, doi, ref=None, extra_args=None):
3538 - https://dataverse.harvard.edu/api/access/datafile/3323458
3639 - doi:10.7910/DVN/6ZXAGT
3740 - doi:10.7910/DVN/6ZXAGT/3YRRYJ
38-
3941 """
40- url = self .doi2url (doi )
42+ if is_doi (spec ):
43+ url = self .doi2url (spec )
44+ else :
45+ url = spec
4146 # Parse the url, to get the base for later API calls
4247 parsed_url = urlparse (url )
4348
@@ -53,57 +58,137 @@ def detect(self, doi, ref=None, extra_args=None):
5358 if host is None :
5459 return
5560
56- query_args = parse_qs (parsed_url .query )
57- # Corner case handling
58- if parsed_url .path .startswith ("/file.xhtml" ):
59- # There's no way of getting file information using its persistentId, the only thing we can do is assume that doi
60- # is structured as "doi:<dataset_doi>/<file_doi>" and try to handle dataset that way.
61- new_doi = doi .rsplit ("/" , 1 )[0 ]
62- if new_doi == doi :
63- # tough luck :( Avoid inifite recursion and exit.
64- return
65- return self .detect (new_doi )
66- elif parsed_url .path .startswith ("/api/access/datafile" ):
67- # Raw url pointing to a datafile is a typical output from an External Tool integration
68- entity_id = os .path .basename (parsed_url .path )
69- search_query = "q=entityId:" + entity_id + "&type=file"
70- # Knowing the file identifier query search api to get parent dataset
71- search_url = urlunparse (
72- parsed_url ._replace (path = "/api/search" , query = search_query )
61+ # At this point, we *know* this is a dataverse URL, because:
62+ # 1. The DOI resolved to a particular host (if using DOI)
63+ # 2. The host is in the list of known dataverse installations
64+ #
65+ # We don't know exactly what kind of dataverse object this is, but
66+ # that can be figured out during fetch as needed
67+ return url
68+
69+ def get_dataset_id_from_file_id (self , base_url : str , file_id : str ) -> str :
70+ """
71+ Return the persistent_id (DOI) of a dataset that a given file_id (int or doi) belongs to
72+ """
73+ if file_id .isdigit ():
74+ # the file_id is an integer, rather than a persistent id (DOI)
75+ api_url = f"{ base_url } /api/files/{ file_id } ?returnDatasetVersion=true"
76+ else :
77+ # the file_id is a doi itself
78+ api_url = f"{ base_url } /api/files/:persistentId?persistentId={ file_id } &returnDatasetVersion=true"
79+
80+ resp = self ._request (api_url )
81+ if resp .status_code == 404 :
82+ raise ValueError (f"File with id { file_id } not found in { base_url } " )
83+
84+ resp .raise_for_status ()
85+
86+ data = resp .json ()["data" ]
87+ return data ["datasetVersion" ]["datasetPersistentId" ]
88+
89+ def parse_dataverse_url (self , url : str ) -> Tuple [str , bool ]:
90+ """
91+ Parse the persistent id out of a dataverse URL
92+
93+ persistent_id can point to either a dataset or a file. The second return
94+ value is False if we know that the persistent id is a file or a dataset,
95+ and True if it is ambiguous.
96+
97+ Raises a ValueError if we can not parse the url
98+ """
99+ parsed_url = urlparse (url )
100+ path = parsed_url .path
101+ qs = parse_qs (parsed_url .query )
102+ base_url = f"{ parsed_url .scheme } ://{ parsed_url .netloc } "
103+
104+ is_ambiguous = False
105+ # https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
106+ if path .startswith ("/citation" ):
107+ is_ambiguous = True
108+ persistent_id = qs ["persistentId" ][0 ]
109+ # https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
110+ elif path .startswith ("/dataset.xhtml" ):
111+ # https://dataverse.harvard.edu/api/access/datafile/3323458
112+ persistent_id = qs ["persistentId" ][0 ]
113+ elif path .startswith ("/api/access/datafile" ):
114+ # What we have here is an entity id, which we can use to get a persistentId
115+ file_id = os .path .basename (path )
116+ persistent_id = self .get_dataset_id_from_file_id (base_url , file_id )
117+ elif parsed_url .path .startswith ("/file.xhtml" ):
118+ file_persistent_id = qs ["persistentId" ][0 ]
119+ persistent_id = self .get_dataset_id_from_file_id (
120+ base_url , file_persistent_id
121+ )
122+ else :
123+ raise ValueError (
124+ f"Could not determine persistent id for dataverse URL { url } "
73125 )
74- self .log .debug ("Querying Dataverse: " + search_url )
75- data = self .urlopen (search_url ).json ()["data" ]
76- if data ["count_in_response" ] != 1 :
77- self .log .debug (
78- f"Dataverse search query failed!\n - doi: { doi } \n - url: { url } \n - resp: { json .dump (data )} \n "
79- )
80- return
81-
82- self .record_id = deep_get (data , "items.0.dataset_persistent_id" )
83- elif (
84- parsed_url .path .startswith ("/dataset.xhtml" )
85- and "persistentId" in query_args
86- ):
87- self .record_id = deep_get (query_args , "persistentId.0" )
88-
89- if hasattr (self , "record_id" ):
90- return {"record" : self .record_id , "host" : host }
126+
127+ return persistent_id , is_ambiguous
128+
129+ def get_datafiles (self , url : str ) -> List [dict ]:
130+ """
131+ Return a list of dataFiles for given persistent_id
132+
133+ Supports the following *dataset* URL styles:
134+ - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
135+ - /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
136+
137+ Supports the following *file* URL styles (entire dataset file belongs to will be fetched):
138+ - /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458
139+ - /file.xhtml: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
140+ - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
141+
142+ If a URL can not be parsed, throw an exception
143+ """
144+
145+ parsed_url = urlparse (url )
146+ base_url = f"{ parsed_url .scheme } ://{ parsed_url .netloc } "
147+
148+ persistent_id , is_ambiguous = self .parse_dataverse_url (url )
149+
150+ dataset_api_url = (
151+ f"{ base_url } /api/datasets/:persistentId?persistentId={ persistent_id } "
152+ )
153+ resp = self ._request (dataset_api_url , headers = {"accept" : "application/json" })
154+ if resp .status_code == 404 and is_ambiguous :
155+ # It's possible this is a *file* persistent_id, not a dataset one
156+ persistent_id = self .get_dataset_id_from_file_id (base_url , persistent_id )
157+ dataset_api_url = (
158+ f"{ base_url } /api/datasets/:persistentId?persistentId={ persistent_id } "
159+ )
160+ resp = self ._request (
161+ dataset_api_url , headers = {"accept" : "application/json" }
162+ )
163+
164+ if resp .status_code == 404 :
165+ # This persistent id is just not here
166+ raise ValueError (f"{ persistent_id } on { base_url } is not found" )
167+
168+ # We already handled 404, raise error for everything else
169+ resp .raise_for_status ()
170+
171+ # We know the exact persistent_id of the dataset we fetched now
172+ # Save it for use as content_id
173+ self .persistent_id = persistent_id
174+
175+ data = resp .json ()["data" ]
176+
177+ return data ["latestVersion" ]["files" ]
91178
92179 def fetch (self , spec , output_dir , yield_output = False ):
93180 """Fetch and unpack a Dataverse dataset."""
94- record_id = spec ["record" ]
95- host = spec ["host" ]
96-
97- yield f"Fetching Dataverse record { record_id } .\n "
98- url = f'{ host ["url" ]} /api/datasets/:persistentId?persistentId={ record_id } '
181+ url = spec
182+ parsed_url = urlparse (url )
183+ # FIXME: Support determining API URL better
184+ base_url = f"{ parsed_url .scheme } ://{ parsed_url .netloc } "
99185
100- resp = self .urlopen (url , headers = {"accept" : "application/json" })
101- record = resp .json ()["data" ]
186+ yield f"Fetching Dataverse record { url } .\n "
102187
103- for fobj in deep_get ( record , "latestVersion.files" ):
188+ for fobj in self . get_datafiles ( url ):
104189 file_url = (
105190 # without format=original you get the preservation format (plain text, tab separated)
106- f'{ host [ "url" ] } /api/access/datafile/{ deep_get (fobj , "dataFile.id" )} ?format=original'
191+ f'{ base_url } /api/access/datafile/{ deep_get (fobj , "dataFile.id" )} ?format=original'
107192 )
108193 filename = fobj ["label" ]
109194 original_filename = fobj ["dataFile" ].get ("originalFileName" , None )
@@ -128,5 +213,9 @@ def fetch(self, spec, output_dir, yield_output=False):
128213
    @property
    def content_id(self):
        """
        The Dataverse persistent identifier.

        Only valid if called after a successful fetch; set by get_datafiles.
        Accessing it before then raises AttributeError, since persistent_id
        is only assigned once a dataset has been resolved.
        """
        return self.persistent_id
0 commit comments