Skip to content

Commit a0c90d5

Browse files
committed
Support Searching for Publications via URL
Adds support for searching for publications via URLs from accepted publication databases. If a user submits a publication identifier that appears to be a URL, the publication database is inferred from the URL and an identifier is parsed from it based on the known format of that publication database's URLs.
1 parent ab5ceab commit a0c90d5

File tree

3 files changed

+88
-1
lines changed

3 files changed

+88
-1
lines changed

src/mavedb/lib/identifiers.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
from mavedb.lib.exceptions import AmbiguousIdentifierError, NonexistentIdentifierError
1414
from mavedb.lib.external_publications import Rxiv, Crossref, CrossrefWork, RxivContentDetail, PublicationAuthors
15-
from mavedb.lib.validation.publication import identifier_valid_for, validate_db_name
15+
from mavedb.lib.validation.publication import identifier_valid_for, validate_db_name, infer_identifier_from_url
1616
from mavedb.models.doi_identifier import DoiIdentifier
1717
from mavedb.models.ensembl_identifier import EnsemblIdentifier
1818
from mavedb.models.ensembl_offset import EnsemblOffset
@@ -260,6 +260,9 @@ async def find_generic_article(
260260
"medRxiv": fetch_medrxiv_article,
261261
}
262262

263+
# We also accept URLs from our accepted publications. Attempt to convert a potential URL to an identifier.
264+
identifier = infer_identifier_from_url(identifier)
265+
263266
# Only check entries with the appropriate `db_name` if one is provided.
264267
db_specific_match: dict[str, Union[PublicationIdentifier, ExternalPublication, None]] = {}
265268
if db_name:

src/mavedb/lib/validation/publication.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import idutils
22
import datetime
33

4+
from urllib.parse import urlparse
5+
46
from mavedb.lib.validation.exceptions import ValidationError
57
from mavedb.lib.validation.constants.publication import valid_dbnames
68

@@ -35,9 +37,46 @@ def identifier_valid_for(identifier: str) -> dict[str, bool]:
3537
"PubMed": validate_pubmed(identifier),
3638
"bioRxiv": validate_biorxiv(identifier),
3739
"medRxiv": validate_medrxiv(identifier),
40+
"Crossref": idutils.is_doi(identifier) is not None,
3841
}
3942

4043

44+
def infer_identifier_from_url(identifier: str) -> str:
    """
    Infers an identifier from a potential URL based on the database we believe the URL
    to be from.

    The URL's host is matched case-insensitively, by substring, against the hosts of the
    publication databases we accept. Anything that does not parse as a URL with a network
    location (a bare identifier, a URL written without a scheme, a malformed URL) or that
    points at a host we do not recognise is returned unchanged.

    Parameters
    ----------
    identifier : str
        The identifier / URL to parse

    Returns
    -------
    str
        The parsed identifier from the url or the original identifier.
    """
    try:
        url = urlparse(identifier)
    except ValueError:
        # `urlparse` rejects some malformed inputs (e.g. an unbalanced `[` in the host).
        # Hand the raw string back rather than raising on user-supplied input.
        return identifier

    host = url.netloc.lower()
    if not host:
        # Not a URL as far as `urlparse` can tell; treat it as a plain identifier.
        return identifier

    path = url.path.strip("/")

    # https://doi.org/{DOI}, http://www.dx.doi.org/{DOI}
    # "doi.org" also matches the legacy "dx.doi.org" resolver host.
    if "doi.org" in host:
        return path

    last_segment = path.split("/")[-1]

    # https://www.biorxiv.org/content/10.1101/2024.04.26.591310
    # https://www.medrxiv.org/content/10.1101/2024.04.26.59131023v1.full
    if "biorxiv.org" in host or "medrxiv.org" in host:
        # Drop a trailing version / view suffix ("v1", "v2.full", ...), but only when what
        # precedes it is made up solely of digits and dots, so other paths are left alone.
        base, version_marker, _ = last_segment.partition("v")
        if version_marker and base and base.replace(".", "").isdigit():
            return base
        return last_segment

    # https://pubmed.ncbi.nlm.nih.gov/24567513/, http://www.ncbi.nlm.nih.gov/pubmed/432
    if "ncbi.nlm.nih.gov" in host:
        return last_segment

    # The url does not come from an accepted database.
    return identifier
78+
79+
4180
def validate_publication(identifier: str) -> None:
4281
"""
4382
Validates that a passed identifier is one we accept. Currently allowed

tests/validation/test_publication.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
validate_medrxiv,
88
identifier_valid_for,
99
validate_db_name,
10+
infer_identifier_from_url,
1011
)
1112
from mavedb.lib.validation.exceptions import ValidationError
1213
from mavedb.lib.validation.constants.publication import valid_dbnames
@@ -22,6 +23,9 @@ def test_valid_biorxiv(self):
2223
def test_valid_medrxiv(self):
2324
assert validate_publication("20733333") == None
2425

26+
def test_valid_crossref(self):
27+
assert validate_publication("10.1101/1234") == None
28+
2529
def test_invalid_identifier(self):
2630
with self.assertRaises(ValidationError):
2731
assert validate_publication("2074d44")
@@ -74,41 +78,55 @@ def test_valid_pubmed(self):
7478
"PubMed": True,
7579
"bioRxiv": False,
7680
"medRxiv": False,
81+
"Crossref": False,
7782
}
7883

7984
def test_valid_biorxiv(self):
8085
assert identifier_valid_for("2022.12.12.207222") == {
8186
"PubMed": False,
8287
"bioRxiv": True,
8388
"medRxiv": False,
89+
"Crossref": False,
8490
}
8591

8692
def test_valid_medrxiv(self):
8793
assert identifier_valid_for("2022.12.12.20733333") == {
8894
"PubMed": False,
8995
"bioRxiv": False,
9096
"medRxiv": True,
97+
"Crossref": False,
9198
}
9299

93100
def test_valid_pubmed_biorxiv(self):
94101
assert identifier_valid_for("207222") == {
95102
"PubMed": True,
96103
"bioRxiv": True,
97104
"medRxiv": False,
105+
"Crossref": False,
98106
}
99107

100108
def test_valid_pubmed_medrxiv(self):
101109
assert identifier_valid_for("20733333") == {
102110
"PubMed": True,
103111
"bioRxiv": False,
104112
"medRxiv": True,
113+
"Crossref": False,
105114
}
106115

107116
def test_valid_pubmed_none(self):
108117
assert identifier_valid_for("invalid") == {
109118
"PubMed": False,
110119
"bioRxiv": False,
111120
"medRxiv": False,
121+
"Crossref": False,
122+
}
123+
124+
def test_valid_crossref(self):
125+
assert identifier_valid_for("10.1101/1234") == {
126+
"PubMed": False,
127+
"bioRxiv": False,
128+
"medRxiv": False,
129+
"Crossref": True,
112130
}
113131

114132

@@ -124,3 +142,30 @@ def test_empty_name(self):
124142
def test_invalid_name(self):
125143
with self.assertRaises(ValidationError):
126144
validate_db_name("invalid db")
145+
146+
147+
class TestInferIdentifierFromUrl(TestCase):
    """Tests for parsing publication identifiers out of accepted database URLs."""

    def test_doi_url(self):
        assert infer_identifier_from_url("http://www.dx.doi.org/10.1101/1234") == "10.1101/1234"

    def test_biorxiv_url(self):
        assert (
            infer_identifier_from_url("https://www.biorxiv.org/content/10.1101/2024.04.26.591310")
            == "2024.04.26.591310"
        )

    def test_medrxiv_url(self):
        assert (
            infer_identifier_from_url("https://www.medrxiv.org/content/10.1101/2024.04.26.59131023")
            == "2024.04.26.59131023"
        )

    def test_pubmed_url(self):
        assert infer_identifier_from_url("https://pubmed.ncbi.nlm.nih.gov/24567513/") == "24567513"
        assert infer_identifier_from_url("http://www.ncbi.nlm.nih.gov/pubmed/432") == "432"

    def test_identifier_not_url(self):
        assert infer_identifier_from_url("29023") == "29023"

    def test_url_not_accepted(self):
        # Without a scheme, `urlparse` reports no network location, so this input is
        # handled as a plain (non-URL) identifier and passed through untouched.
        assert infer_identifier_from_url("www.notaccepted.org/29023") == "www.notaccepted.org/29023"

    def test_url_with_scheme_not_accepted(self):
        # A well-formed URL whose host is not one of the accepted publication databases
        # must also be returned unchanged.
        assert (
            infer_identifier_from_url("https://www.notaccepted.org/29023")
            == "https://www.notaccepted.org/29023"
        )

0 commit comments

Comments
 (0)