Support Searching for Publications via URL

bencap · bencap · commit a0c90d5a3310 · 2024-07-24T11:01:07.000-07:00
Adds support for searching for publications via URLs from accepted publication databases. If a user submits a
publication identifier that appears to be a URL, the publication database is inferred from the URL string and
an identifier is parsed from the URL based on the known URL format of that publication database.
diff --git a/src/mavedb/lib/identifiers.py b/src/mavedb/lib/identifiers.py
@@ -12,7 +12,7 @@
 
 from mavedb.lib.exceptions import AmbiguousIdentifierError, NonexistentIdentifierError
 from mavedb.lib.external_publications import Rxiv, Crossref, CrossrefWork, RxivContentDetail, PublicationAuthors
-from mavedb.lib.validation.publication import identifier_valid_for, validate_db_name
+from mavedb.lib.validation.publication import identifier_valid_for, validate_db_name, infer_identifier_from_url
 from mavedb.models.doi_identifier import DoiIdentifier
 from mavedb.models.ensembl_identifier import EnsemblIdentifier
 from mavedb.models.ensembl_offset import EnsemblOffset
@@ -260,6 +260,9 @@ async def find_generic_article(
         "medRxiv": fetch_medrxiv_article,
     }
 
+    # We also accept URLs from our accepted publications. Attempt to convert a potential URL to an identifier.
+    identifier = infer_identifier_from_url(identifier)
+
     # Only check entries with the appropriate `db_name` if one is provided.
     db_specific_match: dict[str, Union[PublicationIdentifier, ExternalPublication, None]] = {}
     if db_name:
diff --git a/src/mavedb/lib/validation/publication.py b/src/mavedb/lib/validation/publication.py
@@ -1,6 +1,8 @@
 import idutils
 import datetime
 
+from urllib.parse import urlparse
+
 from mavedb.lib.validation.exceptions import ValidationError
 from mavedb.lib.validation.constants.publication import valid_dbnames
 
@@ -35,9 +37,46 @@ def identifier_valid_for(identifier: str) -> dict[str, bool]:
         "PubMed": validate_pubmed(identifier),
         "bioRxiv": validate_biorxiv(identifier),
         "medRxiv": validate_medrxiv(identifier),
+        "Crossref": idutils.is_doi(identifier) is not None,
     }
 
 
+def infer_identifier_from_url(identifier: str) -> str:
+    """
+    Infers an identifier from a potential URL based on the database we believe the URL
+    to be from. Input for which `urlparse` finds no network location is returned as is.
+
+    Parameters
+    ----------
+    identifier : str
+        The identifier / URL to parse
+
+    Returns
+    -------
+    str
+        The parsed identifier from the URL, or the original identifier.
+    """
+    url = urlparse(identifier)
+    if url.netloc:
+        # http://www.dx.doi.org/{DOI}; the whole path is the DOI. NOTE(review): a bare doi.org host is not matched.
+        if "dx.doi.org" in url.netloc:
+            identifier = url.path.strip("/")
+
+        # https://www.biorxiv.org/content/10.1101/2024.04.26.591310, https://www.medrxiv.org/content/10.1101/2024.04.26.59131023
+        elif "biorxiv.org" in url.netloc or "medrxiv.org" in url.netloc:
+            identifier = url.path.strip("/").split("/")[-1]
+
+        # https://pubmed.ncbi.nlm.nih.gov/24567513/, http://www.ncbi.nlm.nih.gov/pubmed/432
+        elif "ncbi.nlm.nih.gov" in url.netloc:
+            identifier = url.path.strip("/").split("/")[-1]
+
+        # The URL does not come from an accepted database; the input is returned unchanged.
+        else:
+            return identifier
+
+    return identifier
+
+
 def validate_publication(identifier: str) -> None:
     """
     Validates that a passed identifier is one we accept. Currently allowed
diff --git a/tests/validation/test_publication.py b/tests/validation/test_publication.py
@@ -7,6 +7,7 @@
     validate_medrxiv,
     identifier_valid_for,
     validate_db_name,
+    infer_identifier_from_url,
 )
 from mavedb.lib.validation.exceptions import ValidationError
 from mavedb.lib.validation.constants.publication import valid_dbnames
@@ -22,6 +23,9 @@ def test_valid_biorxiv(self):
     def test_valid_medrxiv(self):
         assert validate_publication("20733333") == None
 
+    def test_valid_crossref(self):
+        assert validate_publication("10.1101/1234") == None
+
     def test_invalid_identifier(self):
         with self.assertRaises(ValidationError):
             assert validate_publication("2074d44")
@@ -74,41 +78,55 @@ def test_valid_pubmed(self):
             "PubMed": True,
             "bioRxiv": False,
             "medRxiv": False,
+            "Crossref": False,
         }
 
     def test_valid_biorxiv(self):
         assert identifier_valid_for("2022.12.12.207222") == {
             "PubMed": False,
             "bioRxiv": True,
             "medRxiv": False,
+            "Crossref": False,
         }
 
     def test_valid_medrxiv(self):
         assert identifier_valid_for("2022.12.12.20733333") == {
             "PubMed": False,
             "bioRxiv": False,
             "medRxiv": True,
+            "Crossref": False,
         }
 
     def test_valid_pubmed_biorxiv(self):
         assert identifier_valid_for("207222") == {
             "PubMed": True,
             "bioRxiv": True,
             "medRxiv": False,
+            "Crossref": False,
         }
 
     def test_valid_pubmed_medrxiv(self):
         assert identifier_valid_for("20733333") == {
             "PubMed": True,
             "bioRxiv": False,
             "medRxiv": True,
+            "Crossref": False,
         }
 
     def test_valid_pubmed_none(self):
         assert identifier_valid_for("invalid") == {
             "PubMed": False,
             "bioRxiv": False,
             "medRxiv": False,
+            "Crossref": False,
+        }
+
+    def test_valid_crossref(self):
+        assert identifier_valid_for("10.1101/1234") == {
+            "PubMed": False,
+            "bioRxiv": False,
+            "medRxiv": False,
+            "Crossref": True,
         }
 
 
@@ -124,3 +142,30 @@ def test_empty_name(self):
     def test_invalid_name(self):
         with self.assertRaises(ValidationError):
             validate_db_name("invalid db")
+
+
+class TestInferIdentifierFromUrl(TestCase):  # one example URL (or non-URL) per case for infer_identifier_from_url
+    def test_doi_url(self):  # the whole path after the host is expected back as the DOI
+        assert infer_identifier_from_url("http://www.dx.doi.org/10.1101/1234") == "10.1101/1234"
+
+    def test_biorxiv_url(self):  # only the last path segment is expected back
+        assert (
+            infer_identifier_from_url("https://www.biorxiv.org/content/10.1101/2024.04.26.591310")
+            == "2024.04.26.591310"
+        )
+
+    def test_medrxiv_url(self):  # only the last path segment is expected back
+        assert (
+            infer_identifier_from_url("https://www.medrxiv.org/content/10.1101/2024.04.26.59131023")
+            == "2024.04.26.59131023"
+        )
+
+    def test_pubmed_url(self):  # covers both the pubmed.ncbi and www.ncbi host forms
+        assert infer_identifier_from_url("https://pubmed.ncbi.nlm.nih.gov/24567513/") == "24567513"
+        assert infer_identifier_from_url("http://www.ncbi.nlm.nih.gov/pubmed/432") == "432"
+
+    def test_identifier_not_url(self):  # a bare identifier is expected back unchanged
+        assert infer_identifier_from_url("29023") == "29023"
+
+    def test_url_not_accepted(self):  # NOTE(review): no scheme => empty netloc; else-branch not exercised
+        assert infer_identifier_from_url("www.notaccepted.org/29023") == "www.notaccepted.org/29023"