Skip to content

Commit baddfe8

Browse files
authored
Merge pull request #253 from VariantEffect/release-2024.2.2
Release 2024.2.2
2 parents 131b6a8 + 88670f9 commit baddfe8

File tree

8 files changed

+105
-3
lines changed

8 files changed

+105
-3
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
44

55
[tool.poetry]
66
name = "mavedb"
7-
version = "2024.2.1"
7+
version = "2024.2.2"
88
description = "API for MaveDB, the database of Multiplexed Assays of Variant Effect."
99
license = "AGPL-3.0-only"
1010
readme = "README.md"

src/mavedb/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
__project__ = "mavedb-api"
2-
__version__ = "2024.2.1"
2+
__version__ = "2024.2.2"

src/mavedb/lib/identifiers.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
from mavedb.lib.exceptions import AmbiguousIdentifierError, NonexistentIdentifierError
1414
from mavedb.lib.external_publications import Rxiv, Crossref, CrossrefWork, RxivContentDetail, PublicationAuthors
15-
from mavedb.lib.validation.publication import identifier_valid_for, validate_db_name
15+
from mavedb.lib.validation.publication import identifier_valid_for, validate_db_name, infer_identifier_from_url
1616
from mavedb.models.doi_identifier import DoiIdentifier
1717
from mavedb.models.ensembl_identifier import EnsemblIdentifier
1818
from mavedb.models.ensembl_offset import EnsemblOffset
@@ -260,6 +260,9 @@ async def find_generic_article(
260260
"medRxiv": fetch_medrxiv_article,
261261
}
262262

263+
# We also accept URLs from the publication databases we support. Attempt to convert a potential URL to an identifier.
264+
identifier = infer_identifier_from_url(identifier)
265+
263266
# Only check entries with the appropriate `db_name` if one is provided.
264267
db_specific_match: dict[str, Union[PublicationIdentifier, ExternalPublication, None]] = {}
265268
if db_name:

src/mavedb/lib/score_sets.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -607,6 +607,7 @@ def csv_data_to_df(file_data: BinaryIO) -> pd.DataFrame:
607607
sep=",",
608608
encoding="utf-8",
609609
quotechar="'",
610+
index_col=False,
610611
na_values=extra_na_values,
611612
keep_default_na=True,
612613
dtype={**{col: str for col in HGVSColumns.options()}, "scores": float},

src/mavedb/lib/validation/publication.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import idutils
22
import datetime
33

4+
from urllib.parse import urlparse
5+
46
from mavedb.lib.validation.exceptions import ValidationError
57
from mavedb.lib.validation.constants.publication import valid_dbnames
68

@@ -35,9 +37,46 @@ def identifier_valid_for(identifier: str) -> dict[str, bool]:
3537
"PubMed": validate_pubmed(identifier),
3638
"bioRxiv": validate_biorxiv(identifier),
3739
"medRxiv": validate_medrxiv(identifier),
40+
"Crossref": idutils.is_doi(identifier) is not None,
3841
}
3942

4043

44+
def infer_identifier_from_url(identifier: str) -> str:
    """
    Infers an identifier from a potential URL based on the database we believe the URL
    to be from.

    The database is chosen from the URL's host alone. A host matches a database when it
    is exactly that database's domain or one of its subdomains, so look-alike hosts such
    as ``biorxiv.org.example.com`` are not treated as accepted. Anything that is not a
    URL from an accepted host is returned unchanged.

    Parameters
    ----------
    identifier : str
        The identifier / URL to parse

    Returns
    -------
    str
        The parsed identifier from the url or the original identifier.
    """
    try:
        url = urlparse(identifier)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. unbalanced IPv6 brackets).
        # Such a string is not a URL we can interpret, so hand it back untouched.
        return identifier

    # `hostname` is lower-cased and excludes any port or userinfo; it is None when the
    # string has no network location (for example a bare identifier).
    host = url.hostname
    if not host:
        return identifier

    def host_in(domain: str) -> bool:
        return host == domain or host.endswith("." + domain)

    path = url.path.strip("/")

    # https://doi.org/{DOI}, http://www.dx.doi.org/{DOI}
    if host_in("doi.org"):
        inferred = path

    # https://www.biorxiv.org/content/10.1101/2024.04.26.591310
    # https://www.medrxiv.org/content/10.1101/2024.04.26.59131023
    # https://pubmed.ncbi.nlm.nih.gov/24567513/, http://www.ncbi.nlm.nih.gov/pubmed/432
    elif host_in("biorxiv.org") or host_in("medrxiv.org") or host_in("ncbi.nlm.nih.gov"):
        inferred = path.split("/")[-1]

    # The url does not come from an accepted database.
    else:
        return identifier

    # An accepted host with no path carries no identifier; keep the original input
    # rather than returning an empty string.
    return inferred or identifier
78+
79+
4180
def validate_publication(identifier: str) -> None:
4281
"""
4382
Validates that a passed identifier is one we accept. Currently allowed

src/mavedb/view_models/experiment.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from mavedb.lib.validation import keywords
55
from mavedb.lib.validation.exceptions import ValidationError
6+
from mavedb.lib.validation.utilities import is_null
67
from mavedb.view_models import PublicationIdentifiersGetter
78
from mavedb.view_models.base.base import BaseModel, validator
89
from mavedb.view_models.doi_identifier import (
@@ -72,6 +73,12 @@ def validate_keywords(cls, v):
7273
keywords.validate_keywords(v)
7374
return v
7475

76+
@validator("title", "short_description", "abstract_text", "method_text")
def validate_field_is_non_empty(cls, v):
    """
    Reject null or non-string values for the required text fields and return the
    accepted value with surrounding whitespace removed.

    Raises
    ------
    ValidationError
        If ``is_null`` reports the value as null, or the value is not a ``str``.
    """
    missing = is_null(v)
    if not missing and isinstance(v, str):
        return v.strip()
    raise ValidationError("This field is required and cannot be empty.")
81+
7582

7683
class ExperimentCreate(ExperimentModify):
7784
experiment_set_urn: Optional[str]

src/mavedb/view_models/score_set.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from mavedb.lib.validation import keywords, urn_re
1010
from mavedb.lib.validation.exceptions import ValidationError
11+
from mavedb.lib.validation.utilities import is_null
1112
from mavedb.models.enums.processing_state import ProcessingState
1213
from mavedb.models.target_sequence import TargetSequence
1314
from mavedb.view_models import PublicationIdentifiersGetter
@@ -68,6 +69,12 @@ class ScoreSetModify(ScoreSetBase):
6869
doi_identifiers: Optional[list[DoiIdentifierCreate]]
6970
target_genes: list[TargetGeneCreate]
7071

72+
@validator("title", "short_description", "abstract_text", "method_text")
def validate_field_is_non_empty(cls, v):
    """
    Reject null or non-string values for the required text fields and return the
    accepted value with surrounding whitespace removed.

    Raises
    ------
    ValidationError
        If ``is_null`` reports the value as null, or the value is not a ``str``.
    """
    missing = is_null(v)
    if not missing and isinstance(v, str):
        return v.strip()
    raise ValidationError("This field is required and cannot be empty.")
77+
7178
@validator("primary_publication_identifiers")
7279
def max_one_primary_publication_identifier(cls, v):
7380
if isinstance(v, list):

tests/validation/test_publication.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
validate_medrxiv,
88
identifier_valid_for,
99
validate_db_name,
10+
infer_identifier_from_url,
1011
)
1112
from mavedb.lib.validation.exceptions import ValidationError
1213
from mavedb.lib.validation.constants.publication import valid_dbnames
@@ -22,6 +23,9 @@ def test_valid_biorxiv(self):
2223
def test_valid_medrxiv(self):
2324
assert validate_publication("20733333") == None
2425

26+
def test_valid_crossref(self):
    # A DOI-shaped identifier should be accepted: validate_publication returns None
    # instead of raising. Identity comparison is the correct check against None.
    assert validate_publication("10.1101/1234") is None
28+
2529
def test_invalid_identifier(self):
2630
with self.assertRaises(ValidationError):
2731
assert validate_publication("2074d44")
@@ -74,41 +78,55 @@ def test_valid_pubmed(self):
7478
"PubMed": True,
7579
"bioRxiv": False,
7680
"medRxiv": False,
81+
"Crossref": False,
7782
}
7883

7984
def test_valid_biorxiv(self):
8085
assert identifier_valid_for("2022.12.12.207222") == {
8186
"PubMed": False,
8287
"bioRxiv": True,
8388
"medRxiv": False,
89+
"Crossref": False,
8490
}
8591

8692
def test_valid_medrxiv(self):
8793
assert identifier_valid_for("2022.12.12.20733333") == {
8894
"PubMed": False,
8995
"bioRxiv": False,
9096
"medRxiv": True,
97+
"Crossref": False,
9198
}
9299

93100
def test_valid_pubmed_biorxiv(self):
94101
assert identifier_valid_for("207222") == {
95102
"PubMed": True,
96103
"bioRxiv": True,
97104
"medRxiv": False,
105+
"Crossref": False,
98106
}
99107

100108
def test_valid_pubmed_medrxiv(self):
101109
assert identifier_valid_for("20733333") == {
102110
"PubMed": True,
103111
"bioRxiv": False,
104112
"medRxiv": True,
113+
"Crossref": False,
105114
}
106115

107116
def test_valid_pubmed_none(self):
108117
assert identifier_valid_for("invalid") == {
109118
"PubMed": False,
110119
"bioRxiv": False,
111120
"medRxiv": False,
121+
"Crossref": False,
122+
}
123+
124+
def test_valid_crossref(self):
125+
assert identifier_valid_for("10.1101/1234") == {
126+
"PubMed": False,
127+
"bioRxiv": False,
128+
"medRxiv": False,
129+
"Crossref": True,
112130
}
113131

114132

@@ -124,3 +142,30 @@ def test_empty_name(self):
124142
def test_invalid_name(self):
125143
with self.assertRaises(ValidationError):
126144
validate_db_name("invalid db")
145+
146+
147+
class TestInferIdentifierFromUrl(TestCase):
    """Checks that ``infer_identifier_from_url`` extracts identifiers from accepted URLs
    and returns every other input unchanged."""

    def test_doi_url(self):
        assert infer_identifier_from_url("http://www.dx.doi.org/10.1101/1234") == "10.1101/1234"

    def test_biorxiv_url(self):
        assert (
            infer_identifier_from_url("https://www.biorxiv.org/content/10.1101/2024.04.26.591310")
            == "2024.04.26.591310"
        )

    def test_medrxiv_url(self):
        assert (
            infer_identifier_from_url("https://www.medrxiv.org/content/10.1101/2024.04.26.59131023")
            == "2024.04.26.59131023"
        )

    def test_pubmed_url(self):
        assert infer_identifier_from_url("https://pubmed.ncbi.nlm.nih.gov/24567513/") == "24567513"
        assert infer_identifier_from_url("http://www.ncbi.nlm.nih.gov/pubmed/432") == "432"

    def test_identifier_not_url(self):
        assert infer_identifier_from_url("29023") == "29023"

    def test_url_not_accepted(self):
        # Without a scheme, urlparse reports no network location, so this input takes the
        # same path as a plain identifier.
        assert infer_identifier_from_url("www.notaccepted.org/29023") == "www.notaccepted.org/29023"

    def test_url_with_scheme_not_accepted(self):
        # A fully-qualified URL from a host we do not accept must also come back unchanged.
        assert (
            infer_identifier_from_url("https://www.notaccepted.org/29023")
            == "https://www.notaccepted.org/29023"
        )

0 commit comments

Comments
 (0)