Merge pull request #253 from VariantEffect/release-2024.2.2

bencap · web-flow · commit baddfe8cccb0 · 2024-08-12T16:46:13.000-07:00
Release 2024.2.2
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "mavedb"
-version = "2024.2.1"
+version = "2024.2.2"
 description = "API for MaveDB, the database of Multiplexed Assays of Variant Effect."
 license = "AGPL-3.0-only"
 readme = "README.md"
diff --git a/src/mavedb/__init__.py b/src/mavedb/__init__.py
@@ -1,2 +1,2 @@
 __project__ = "mavedb-api"
-__version__ = "2024.2.1"
+__version__ = "2024.2.2"
diff --git a/src/mavedb/lib/identifiers.py b/src/mavedb/lib/identifiers.py
@@ -12,7 +12,7 @@
 
 from mavedb.lib.exceptions import AmbiguousIdentifierError, NonexistentIdentifierError
 from mavedb.lib.external_publications import Rxiv, Crossref, CrossrefWork, RxivContentDetail, PublicationAuthors
-from mavedb.lib.validation.publication import identifier_valid_for, validate_db_name
+from mavedb.lib.validation.publication import identifier_valid_for, validate_db_name, infer_identifier_from_url
 from mavedb.models.doi_identifier import DoiIdentifier
 from mavedb.models.ensembl_identifier import EnsemblIdentifier
 from mavedb.models.ensembl_offset import EnsemblOffset
@@ -260,6 +260,9 @@ async def find_generic_article(
         "medRxiv": fetch_medrxiv_article,
     }
 
+    # We also accept URLs from our accepted publication databases. Attempt to convert a potential URL to an identifier.
+    identifier = infer_identifier_from_url(identifier)
+
     # Only check entries with the appropriate `db_name` if one is provided.
     db_specific_match: dict[str, Union[PublicationIdentifier, ExternalPublication, None]] = {}
     if db_name:
diff --git a/src/mavedb/lib/score_sets.py b/src/mavedb/lib/score_sets.py
@@ -607,6 +607,7 @@ def csv_data_to_df(file_data: BinaryIO) -> pd.DataFrame:
         sep=",",
         encoding="utf-8",
         quotechar="'",
+        index_col=False,
         na_values=extra_na_values,
         keep_default_na=True,
         dtype={**{col: str for col in HGVSColumns.options()}, "scores": float},
diff --git a/src/mavedb/lib/validation/publication.py b/src/mavedb/lib/validation/publication.py
@@ -1,6 +1,8 @@
 import idutils
 import datetime
 
+from urllib.parse import urlparse
+
 from mavedb.lib.validation.exceptions import ValidationError
 from mavedb.lib.validation.constants.publication import valid_dbnames
 
@@ -35,9 +37,46 @@ def identifier_valid_for(identifier: str) -> dict[str, bool]:
         "PubMed": validate_pubmed(identifier),
         "bioRxiv": validate_biorxiv(identifier),
         "medRxiv": validate_medrxiv(identifier),
+        "Crossref": idutils.is_doi(identifier) is not None,
     }
 
 
+def infer_identifier_from_url(identifier: str) -> str:
+    """
+    Infers an identifier from a potential URL based on the database we believe the URL
+    to be from.
+
+    Parameters
+    ----------
+    identifier : str
+        The identifier / URL to parse
+
+    Returns
+    -------
+    str
+        The identifier parsed from the URL, or the original identifier if no accepted URL was recognized.
+    """
+    url = urlparse(identifier)
+    if url.netloc:
+        # http://www.dx.doi.org/{DOI}
+        if "dx.doi.org" in url.netloc:
+            identifier = url.path.strip("/")
+
+        # https://www.biorxiv.org/content/10.1101/2024.04.26.591310, https://www.medrxiv.org/content/10.1101/2024.04.26.59131023
+        elif "biorxiv.org" in url.netloc or "medrxiv.org" in url.netloc:
+            identifier = url.path.strip("/").split("/")[-1]
+
+        # https://pubmed.ncbi.nlm.nih.gov/24567513/, http://www.ncbi.nlm.nih.gov/pubmed/432
+        elif "ncbi.nlm.nih.gov" in url.netloc:
+            identifier = url.path.strip("/").split("/")[-1]
+
+        # The url does not come from an accepted database.
+        else:
+            return identifier
+
+    return identifier
+
+
 def validate_publication(identifier: str) -> None:
     """
     Validates that a passed identifier is one we accept. Currently allowed
diff --git a/src/mavedb/view_models/experiment.py b/src/mavedb/view_models/experiment.py
@@ -3,6 +3,7 @@
 
 from mavedb.lib.validation import keywords
 from mavedb.lib.validation.exceptions import ValidationError
+from mavedb.lib.validation.utilities import is_null
 from mavedb.view_models import PublicationIdentifiersGetter
 from mavedb.view_models.base.base import BaseModel, validator
 from mavedb.view_models.doi_identifier import (
@@ -72,6 +73,12 @@ def validate_keywords(cls, v):
         keywords.validate_keywords(v)
         return v
 
+    @validator("title", "short_description", "abstract_text", "method_text")
+    def validate_field_is_non_empty(cls, v):
+        if is_null(v) or not isinstance(v, str):
+            raise ValidationError("This field is required and cannot be empty.")
+        return v.strip()
+
 
 class ExperimentCreate(ExperimentModify):
     experiment_set_urn: Optional[str]
diff --git a/src/mavedb/view_models/score_set.py b/src/mavedb/view_models/score_set.py
@@ -8,6 +8,7 @@
 
 from mavedb.lib.validation import keywords, urn_re
 from mavedb.lib.validation.exceptions import ValidationError
+from mavedb.lib.validation.utilities import is_null
 from mavedb.models.enums.processing_state import ProcessingState
 from mavedb.models.target_sequence import TargetSequence
 from mavedb.view_models import PublicationIdentifiersGetter
@@ -68,6 +69,12 @@ class ScoreSetModify(ScoreSetBase):
     doi_identifiers: Optional[list[DoiIdentifierCreate]]
     target_genes: list[TargetGeneCreate]
 
+    @validator("title", "short_description", "abstract_text", "method_text")
+    def validate_field_is_non_empty(cls, v):
+        if is_null(v) or not isinstance(v, str):
+            raise ValidationError("This field is required and cannot be empty.")
+        return v.strip()
+
     @validator("primary_publication_identifiers")
     def max_one_primary_publication_identifier(cls, v):
         if isinstance(v, list):
diff --git a/tests/validation/test_publication.py b/tests/validation/test_publication.py
@@ -7,6 +7,7 @@
     validate_medrxiv,
     identifier_valid_for,
     validate_db_name,
+    infer_identifier_from_url,
 )
 from mavedb.lib.validation.exceptions import ValidationError
 from mavedb.lib.validation.constants.publication import valid_dbnames
@@ -22,6 +23,9 @@ def test_valid_biorxiv(self):
     def test_valid_medrxiv(self):
         assert validate_publication("20733333") == None
 
+    def test_valid_crossref(self):
+        assert validate_publication("10.1101/1234") == None
+
     def test_invalid_identifier(self):
         with self.assertRaises(ValidationError):
             assert validate_publication("2074d44")
@@ -74,41 +78,55 @@ def test_valid_pubmed(self):
             "PubMed": True,
             "bioRxiv": False,
             "medRxiv": False,
+            "Crossref": False,
         }
 
     def test_valid_biorxiv(self):
         assert identifier_valid_for("2022.12.12.207222") == {
             "PubMed": False,
             "bioRxiv": True,
             "medRxiv": False,
+            "Crossref": False,
         }
 
     def test_valid_medrxiv(self):
         assert identifier_valid_for("2022.12.12.20733333") == {
             "PubMed": False,
             "bioRxiv": False,
             "medRxiv": True,
+            "Crossref": False,
         }
 
     def test_valid_pubmed_biorxiv(self):
         assert identifier_valid_for("207222") == {
             "PubMed": True,
             "bioRxiv": True,
             "medRxiv": False,
+            "Crossref": False,
         }
 
     def test_valid_pubmed_medrxiv(self):
         assert identifier_valid_for("20733333") == {
             "PubMed": True,
             "bioRxiv": False,
             "medRxiv": True,
+            "Crossref": False,
         }
 
     def test_valid_pubmed_none(self):
         assert identifier_valid_for("invalid") == {
             "PubMed": False,
             "bioRxiv": False,
             "medRxiv": False,
+            "Crossref": False,
+        }
+
+    def test_valid_crossref(self):
+        assert identifier_valid_for("10.1101/1234") == {
+            "PubMed": False,
+            "bioRxiv": False,
+            "medRxiv": False,
+            "Crossref": True,
         }
 
 
@@ -124,3 +142,30 @@ def test_empty_name(self):
     def test_invalid_name(self):
         with self.assertRaises(ValidationError):
             validate_db_name("invalid db")
+
+
+class TestInferIdentifierFromUrl(TestCase):
+    def test_doi_url(self):
+        assert infer_identifier_from_url("http://www.dx.doi.org/10.1101/1234") == "10.1101/1234"
+
+    def test_biorxiv_url(self):
+        assert (
+            infer_identifier_from_url("https://www.biorxiv.org/content/10.1101/2024.04.26.591310")
+            == "2024.04.26.591310"
+        )
+
+    def test_medrxiv_url(self):
+        assert (
+            infer_identifier_from_url("https://www.medrxiv.org/content/10.1101/2024.04.26.59131023")
+            == "2024.04.26.59131023"
+        )
+
+    def test_pubmed_url(self):
+        assert infer_identifier_from_url("https://pubmed.ncbi.nlm.nih.gov/24567513/") == "24567513"
+        assert infer_identifier_from_url("http://www.ncbi.nlm.nih.gov/pubmed/432") == "432"
+
+    def test_identifier_not_url(self):
+        assert infer_identifier_from_url("29023") == "29023"
+
+    def test_url_not_accepted(self):
+        assert infer_identifier_from_url("www.notaccepted.org/29023") == "www.notaccepted.org/29023"

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`__project__ = "mavedb-api"`
`2`		`-__version__ = "2024.2.1"`
	`2`	`+__version__ = "2024.2.2"`