[WIP] Cleanup dataverse contentprovider

yuvipanda · yuvipanda · commit 172f8b017d82 · 2024-12-16T16:05:30.000-08:00
diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py
@@ -3,7 +3,7 @@
 import shutil
-from urllib.parse import parse_qs, urlparse, urlunparse
+from urllib.parse import parse_qs, quote, urlparse, urlunparse
 
-from ..utils import copytree, deep_get
+from ..utils import copytree, deep_get, is_doi
 from .doi import DoiProvider
 
 
@@ -23,10 +23,11 @@ def __init__(self):
             self.hosts = json.load(fp)["installations"]
         super().__init__()
 
-    def detect(self, doi, ref=None, extra_args=None):
-        """Trigger this provider for things that resolve to a Dataverse dataset.
+    def detect(self, spec, ref=None, extra_args=None):
+        """
+        Detect if given spec is hosted on dataverse
 
-        Handles:
+        The spec can be:
         - DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
         - DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
         - URL {siteURL}/api/access/datafile/{fileId}
@@ -35,9 +36,11 @@ def detect(self, doi, ref=None, extra_args=None):
         - https://dataverse.harvard.edu/api/access/datafile/3323458
         - doi:10.7910/DVN/6ZXAGT
         - doi:10.7910/DVN/6ZXAGT/3YRRYJ
-
         """
-        url = self.doi2url(doi)
+        if is_doi(spec):
+            url = self.doi2url(spec)
+        else:
+            url = spec
         # Parse the url, to get the base for later API calls
         parsed_url = urlparse(url)
 
@@ -53,51 +56,77 @@ def detect(self, doi, ref=None, extra_args=None):
         if host is None:
             return
 
-        query_args = parse_qs(parsed_url.query)
-        # Corner case handling
-        if parsed_url.path.startswith("/file.xhtml"):
-            # There's no way of getting file information using its persistentId, the only thing we can do is assume that doi
-            # is structured as "doi:<dataset_doi>/<file_doi>" and try to handle dataset that way.
-            new_doi = doi.rsplit("/", 1)[0]
-            if new_doi == doi:
-                # tough luck :( Avoid inifite recursion and exit.
-                return
-            return self.detect(new_doi)
-        elif parsed_url.path.startswith("/api/access/datafile"):
-            # Raw url pointing to a datafile is a typical output from an External Tool integration
+        # At this point, we *know* this is a dataverse URL, because:
+        # 1. The DOI resolved to a particular host (if using DOI)
+        # 2. The host is in the list of known dataverse installations
+        #
+        # We don't know exactly what kind of dataverse object this is, but
+        # that can be figured out during fetch as needed
+        return {"host": host, "url": url}
+
+    def get_persistent_id_from_url(self, url: str) -> str:
+        """
+        Return the *dataset* persistentId for the given Dataverse URL.
+
+        Supports the following *dataset* URL styles:
+        - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
+        - /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
+
+        Supports the following *file* URL styles, resolved to their parent dataset:
+        - /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458
+        - /file.xhtml, only if the id is "<dataset id>/<suffix>": https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
+
+        Raises ValueError if the URL style is unsupported or the parent dataset
+        cannot be determined; a supported URL without a persistentId query
+        parameter raises KeyError.
+        """
+        parsed_url = urlparse(url)
+        path = parsed_url.path
+        qs = parse_qs(parsed_url.query)
+
+        # https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
+        # https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
+        if path.startswith(("/citation", "/dataset.xhtml")):
+            return qs["persistentId"][0]
+        # https://dataverse.harvard.edu/api/access/datafile/3323458
+        elif path.startswith("/api/access/datafile"):
+            # The last path segment is an entity id; the search API maps it to a dataset
             entity_id = os.path.basename(parsed_url.path)
-            search_query = "q=entityId:" + entity_id + "&type=file"
-            # Knowing the file identifier query search api to get parent dataset
-            search_url = urlunparse(
+            # Percent-encode the id: it comes straight from the user-supplied URL and
+            # must not be able to inject extra parameters into the search query.
+            search_query = f"q=entityId:{quote(entity_id, safe='')}&type=file"
+            search_api_url = urlunparse(
                 parsed_url._replace(path="/api/search", query=search_query)
             )
-            self.log.debug("Querying Dataverse: " + search_url)
-            data = self.urlopen(search_url).json()["data"]
+            self.log.debug("Querying Dataverse: " + search_api_url)
+            data = self.urlopen(search_api_url).json()["data"]
             if data["count_in_response"] != 1:
-                self.log.debug(
-                    f"Dataverse search query failed!\n - doi: {doi}\n - url: {url}\n - resp: {json.dump(data)}\n"
+                raise ValueError(
+                    f"Dataverse search query failed!\n - url: {url}\n - resp: {json.dumps(data)}\n"
                 )
-                return
-
-            self.record_id = deep_get(data, "items.0.dataset_persistent_id")
-        elif (
-            parsed_url.path.startswith("/dataset.xhtml")
-            and "persistentId" in query_args
-        ):
-            self.record_id = deep_get(query_args, "persistentId.0")
+            return data["items"][0]["dataset_persistent_id"]
+        elif path.startswith("/file.xhtml"):
+            file_persistent_id = qs["persistentId"][0]
+            dataset_persistent_id = file_persistent_id.rsplit("/", 1)[0]
+            if file_persistent_id == dataset_persistent_id:
+                # No "/" to split on, so the dataset id cannot be derived from the file id
+                raise ValueError(f"Could not find dataset id for {url}")
+            return dataset_persistent_id
 
-        if hasattr(self, "record_id"):
-            return {"record": self.record_id, "host": host}
+        raise ValueError(f"Could not determine persistent id for dataverse URL {url}")
 
     def fetch(self, spec, output_dir, yield_output=False):
         """Fetch and unpack a Dataverse dataset."""
-        record_id = spec["record"]
+        url = spec["url"]
         host = spec["host"]
 
-        yield f"Fetching Dataverse record {record_id}.\n"
-        url = f'{host["url"]}/api/datasets/:persistentId?persistentId={record_id}'
+        persistent_id = self.get_persistent_id_from_url(url)
+
+        yield f"Fetching Dataverse record {persistent_id}.\n"
+        url = f'{host["url"]}/api/datasets/:persistentId?persistentId={persistent_id}'
 
         resp = self.urlopen(url, headers={"accept": "application/json"})
+        self.log.debug("Dataverse dataset response: %s", resp.json())
         record = resp.json()["data"]
 
         for fobj in deep_get(record, "latestVersion.files"):
@@ -126,7 +155,11 @@ def fetch(self, spec, output_dir, yield_output=False):
             copytree(os.path.join(output_dir, d), output_dir)
             shutil.rmtree(os.path.join(output_dir, d))
 
+        # Remember the dataset's persistent id: the content_id property reads
+        # it back, so the attribute name must match the one used there.
+        self.persistent_id = persistent_id
+
     @property
     def content_id(self):
         """The Dataverse persistent identifier."""
-        return self.record_id
+        return self.persistent_id
diff --git a/repo2docker/contentproviders/doi.py b/repo2docker/contentproviders/doi.py
@@ -50,7 +50,7 @@ def doi2url(self, doi):
 
             # Use the doi.org resolver API
             # documented at https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation#5-proxy-server-rest-api
-            req_url = f"https://doi.org/api/handles/{normalize_doi}"
+            req_url = f"https://doi.org/api/handles/{normalized_doi}"
             resp = self._request(req_url)
             if resp.status_code == 404:
                 # Not a doi, return what we were passed in
@@ -60,7 +60,7 @@ def doi2url(self, doi):
                 # Pick the first URL we find from the doi response
                 for v in data["values"]:
                     if v["type"] == "URL":
-                        return v["data"]["string"]
+                        return v["data"]["value"]
 
                 # No URLs found for this doi, what do we do?
-                self.log.error("DOI {normalized_doi} doesn't point to any URLs")
+                self.log.error(f"DOI {normalized_doi} doesn't point to any URLs")
diff --git a/tests/contentproviders/test_dataverse.py b/tests/contentproviders/test_dataverse.py
@@ -13,10 +13,11 @@
 @pytest.mark.parametrize(
     ("doi", "resolved"),
     [
-        ("doi:10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}),
-        ("10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}),
-        ("https://dataverse.harvard.edu/api/access/datafile/3323458", {"host": harvard_dv, "record": "doi:10.7910/DVN/3MJ7IR"}),
-        ("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", {"host": cimmyt_dv, "record": "hdl:11529/10016"}),
+        ("doi:10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"}),
+        ("10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"}),
+        ("10.7910/DVN/TJCLKP", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP"}),
+        ("https://dataverse.harvard.edu/api/access/datafile/3323458", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/api/access/datafile/3323458"}),
+        ("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", {"host": cimmyt_dv, "url": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016"}),
         ("/some/random/string", None),
         ("https://example.com/path/here", None),
         # Non dataverse DOIs
@@ -27,10 +28,24 @@ def test_detect(doi, resolved):
     assert Dataverse().detect(doi) == resolved
 
 
+@pytest.mark.parametrize(
+    ("url", "persistent_id"),
+    [
+        ("https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", "doi:10.7910/DVN/6ZXAGT"),
+        ("https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP", "doi:10.7910/DVN/TJCLKP"),
+        ("https://dataverse.harvard.edu/api/access/datafile/3323458", "doi:10.7910/DVN/3MJ7IR"),
+        ("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", "hdl:11529/10016"),
+    ],
+)
+def test_get_persistent_id(url, persistent_id):
+    """Each supported URL style resolves to the persistent id of its dataset."""
+    assert Dataverse().get_persistent_id_from_url(url) == persistent_id
+
+
 def test_dataverse_fetch():
-    spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/TJCLKP"}
 
     dv = Dataverse()
+    spec = dv.detect("doi:10.7910/DVN/TJCLKP")
 
     with TemporaryDirectory() as d:
         output = []