Skip to content

Commit 196c407

Browse files
bencapjstone-dev
authored and committed
Replace Metapub with Eutils
Metapub is no longer as well supported as it once was, and under the hood it relies on a better-maintained package, eutils. This change replaces the previous functionality that used Metapub to search for PubMed articles with direct use of eutils.
1 parent e0ba977 commit 196c407

File tree

4 files changed

+98
-133
lines changed

4 files changed

+98
-133
lines changed

alembic/versions/22e2d92d602e_add_publication_identifier_metadata_.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,13 @@
66
77
"""
88
from typing import Optional
9+
import os
910

10-
import metapub
11+
import eutils
12+
from eutils._internal.xmlfacades.pubmedarticleset import PubmedArticleSet
1113
import sqlalchemy as sa
1214
from eutils import EutilsNCBIError
15+
from mavedb.lib.exceptions import AmbiguousIdentifierError
1316
from sqlalchemy.dialects.postgresql import JSONB
1417
from sqlalchemy.orm import Session
1518

@@ -82,11 +85,11 @@ def upgrade():
8285
if pub_article:
8386
item.title = pub_article.title
8487
item.abstract = pub_article.abstract
85-
item.publication_doi = pub_article.published_doi
88+
item.publication_doi = pub_article.publication_doi
8689
item.publication_year = pub_article.publication_year
8790
item.publication_journal = pub_article.publication_journal
8891

89-
authors = [author["name"].replace("'", "''") for author in pub_article.authors]
92+
authors = [str(author["name"]).replace("'", "''") for author in pub_article.authors]
9093
authors = [{"name": author, "primary": idx == 0} for idx, author in enumerate(authors)]
9194
item.authors = authors
9295

@@ -97,7 +100,7 @@ def upgrade():
97100
item.preprint_date = bio_article.preprint_date
98101
item.reference_html = bio_article.reference_html
99102

100-
authors = [author["name"].replace("'", "''") for author in bio_article.authors]
103+
authors = [str(author["name"]).replace("'", "''") for author in bio_article.authors]
101104
authors = [{"name": author, "primary": idx == 0} for idx, author in enumerate(authors)]
102105
item.authors = authors
103106

@@ -108,7 +111,7 @@ def upgrade():
108111
item.preprint_date = med_article.preprint_date
109112
item.reference_html = med_article.reference_html
110113

111-
authors = [author["name"].replace("'", "''") for author in med_article.authors]
114+
authors = [str(author["name"]).replace("'", "''") for author in med_article.authors]
112115
authors = [{"name": author, "primary": idx == 0} for idx, author in enumerate(authors)]
113116
item.authors = authors
114117

@@ -145,13 +148,18 @@ def fetch_pubmed_article(identifier: str) -> Optional[ExternalPublication]:
145148
"""
146149
Fetch an existing PubMed article from NCBI
147150
"""
148-
fetch = metapub.PubMedFetcher()
151+
fetch = eutils.QueryService(api_key=os.getenv("NCBI_API_KEY"))
149152
try:
150-
article = fetch.article_by_pmid(pmid=identifier)
151-
if article:
152-
article = ExternalPublication(identifier=identifier, db_name="PubMed", external_publication=article)
153+
fetched_articles = list(PubmedArticleSet(fetch.efetch({"db": "pubmed", "id": identifier})))
154+
assert len(fetched_articles) < 2
155+
article = ExternalPublication(identifier=identifier, db_name="PubMed", external_publication=fetched_articles[0])
156+
157+
except AssertionError as exc:
158+
raise AmbiguousIdentifierError(f"Fetched more than 1 PubMed article associated with PMID {identifier}") from exc
153159
except EutilsNCBIError:
154160
return None
161+
except IndexError:
162+
return None
155163
else:
156164
return article
157165

src/mavedb/lib/identifiers.py

Lines changed: 73 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
1+
import os
12
from datetime import date
23
from typing import Optional, Union
34

4-
import metapub
5+
import eutils
56
from eutils import EutilsNCBIError
7+
from eutils._internal.xmlfacades.pubmedarticle import PubmedArticle
8+
from eutils._internal.xmlfacades.pubmedarticleset import PubmedArticleSet
69
from sqlalchemy.orm import Session
710

811
from mavedb.lib.exceptions import AmbiguousIdentifierError, NonexistentIdentifierError
@@ -43,20 +46,23 @@ class ExternalPublication:
4346
identifier: str
4447
title: str
4548
abstract: str
46-
authors: list[dict[str, str]]
49+
authors: list[dict[str, Union[str, bool]]]
4750
publication_year: int
48-
published_doi: Optional[str]
49-
preprint_doi: Optional[str]
51+
publication_volume: Optional[str]
52+
publication_pages: Optional[str]
53+
publication_doi: Optional[str]
5054
publication_journal: Optional[str]
55+
preprint_doi: Optional[str]
5156
preprint_date: Optional[date]
5257
db_name: str
53-
reference_html: str
58+
59+
_article_cit_fmt = "{author}. {title}. {journal}. {year}; {volume}:{pages}. {doi}"
5460

5561
def __init__(
5662
self,
5763
identifier: str,
5864
db_name: str,
59-
external_publication: Union[RxivContentDetail, metapub.PubMedArticle],
65+
external_publication: Union[RxivContentDetail, PubmedArticle],
6066
) -> None:
6167
"""
6268
NOTE: We assume here that the first author in each of these author lists is the primary author
@@ -71,54 +77,46 @@ def __init__(
7177
self.db_name = db_name
7278
self.title = str(external_publication.title)
7379
self.abstract = str(external_publication.abstract)
74-
self.authors = self._generate_author_list(external_publication.author_list)
80+
self.authors = self._generate_author_list(external_publication.authors)
7581

7682
# Non-shared fields
77-
if isinstance(external_publication, metapub.PubMedArticle):
83+
if isinstance(external_publication, PubmedArticle):
7884
self.publication_year = int(external_publication.year)
79-
self.publication_journal = str(external_publication.journal)
80-
self.published_doi = str(external_publication.doi)
81-
self.preprint_doi = None
82-
self.preprint_date = None
85+
self.publication_journal = external_publication.jrnl
86+
self.publication_doi = external_publication.doi
87+
self.publication_volume = external_publication.volume
88+
self.publication_pages = external_publication.pages
8389
elif isinstance(external_publication, RxivContentDetail):
8490
self.preprint_doi = external_publication.doi
8591
self.preprint_date = external_publication.date
86-
self.publication_journal = None
8792

88-
self.reference_html = str(external_publication.citation_html)
89-
90-
def _generate_author_list(self, authors: Union[list[str], list[metapub.PubMedAuthor]]) -> list[dict[str, str]]:
93+
def _generate_author_list(self, authors: list[str]) -> list[dict[str, Union[str, bool]]]:
9194
"""
92-
Generates a tuple of author names associated with this publication.
95+
Generates a list of author names and their authorship level associated with this publication.
9396
"""
94-
if not authors:
95-
return []
96-
97-
if isinstance(authors[0], metapub.PubMedAuthor):
98-
created_authors = [
99-
{"name": ", ".join([str(authors[0].last_name), str(authors[0].fore_name)]), "primary": True}
100-
]
97+
return [{"name": author, "primary": idx == 0} for idx, author in enumerate(authors)]
98+
99+
def _format_authors(self) -> str:
100+
"""Helper function for returning a well formatted HTML author list"""
101+
if self.authors and len(self.authors) > 2:
102+
author = str(self.authors[0]["name"]) + ", <i>et al</i>"
103+
elif self.authors and len(self.authors) == 2:
104+
author = " and ".join([str(author["name"]) for author in self.authors])
105+
elif self.authors and len(self.authors) < 2:
106+
author = str(self.authors[0]["name"])
101107
else:
102-
created_authors = [{"name": authors[0], "primary": True}]
103-
104-
for author in authors[1:]:
105-
if isinstance(author, metapub.PubMedAuthor):
106-
created_authors.append(
107-
{"name": ", ".join([str(author.last_name), str(author.fore_name)]), "primary": False}
108-
)
109-
else:
110-
created_authors.append({"name": author, "primary": False})
108+
author = ""
111109

112-
return created_authors
110+
return author
113111

114112
@property
115113
def first_author(self) -> str:
116-
return self.authors[0]["name"]
114+
return str(self.authors[0]["name"])
117115

118116
@property
119117
def secondary_authors(self) -> list[str]:
120118
if len(self.authors) > 1:
121-
return [author["name"] for author in self.authors[1:]]
119+
return [str(author["name"]) for author in self.authors[1:]]
122120
else:
123121
return []
124122

@@ -133,6 +131,35 @@ def url(self) -> str:
133131
else:
134132
return ""
135133

134+
@property
135+
def reference_html(self) -> str:
136+
"""
137+
Return a well formatted citation HTML string based on article data.
138+
Intends to return an identical citation html string to metapub.PubMedArticle.
139+
"""
140+
author = self._format_authors()
141+
142+
if self.db_name in ["PubMed"]:
143+
doi_str = "" if not self.publication_doi else self.publication_doi
144+
title = "(None)" if not self.title else self.title.strip(".")
145+
journal = "(None)" if not self.publication_journal else self.publication_journal.strip(".")
146+
year = "(Unknown year)" if not self.publication_year else self.publication_year
147+
volume = "(Unknown volume)" if not self.publication_volume else self.publication_volume
148+
pages = "(Unknown pages)" if not self.publication_pages else self.publication_pages
149+
else:
150+
doi_str = "" if not self.preprint_doi else self.preprint_doi
151+
title = "(None)" if not self.title else self.title.strip(".")
152+
journal = "(None)" if not self.publication_journal else self.publication_journal.strip(".")
153+
year = "(Unknown year)" if not self.preprint_date else self.preprint_date.year
154+
155+
# We don't receive these fields from rxiv platforms
156+
volume = "(Unknown volume)"
157+
pages = "(Unknown pages)"
158+
159+
return self._article_cit_fmt.format(
160+
author=author, volume=volume, pages=pages, year=year, title=title, journal=journal, doi=doi_str
161+
)
162+
136163

137164
async def find_or_create_doi_identifier(db: Session, identifier: str):
138165
"""
@@ -152,13 +179,18 @@ async def fetch_pubmed_article(identifier: str) -> Optional[ExternalPublication]
152179
"""
153180
Fetch an existing PubMed article from NCBI
154181
"""
155-
fetch = metapub.PubMedFetcher()
182+
fetch = eutils.QueryService(api_key=os.getenv("NCBI_API_KEY"))
156183
try:
157-
article = fetch.article_by_pmid(pmid=identifier)
158-
if article:
159-
article = ExternalPublication(identifier=identifier, db_name="PubMed", external_publication=article)
184+
fetched_articles = list(PubmedArticleSet(fetch.efetch({"db": "pubmed", "id": identifier})))
185+
assert len(fetched_articles) < 2
186+
article = ExternalPublication(identifier=identifier, db_name="PubMed", external_publication=fetched_articles[0])
187+
188+
except AssertionError as exc:
189+
raise AmbiguousIdentifierError(f"Fetched more than 1 PubMed article associated with PMID {identifier}") from exc
160190
except EutilsNCBIError:
161191
return None
192+
except IndexError:
193+
return None
162194
else:
163195
return article
164196

@@ -280,7 +312,7 @@ def create_generic_article(article: ExternalPublication) -> PublicationIdentifie
280312
title=article.title,
281313
abstract=article.abstract,
282314
authors=article.authors,
283-
publication_doi=article.published_doi,
315+
publication_doi=article.publication_doi,
284316
publication_year=article.publication_year,
285317
publication_journal=article.publication_journal,
286318
reference_html=article.reference_html,

src/mavedb/lib/rxiv.py

Lines changed: 4 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ class RxivContentDetail:
2020
title: str
2121
doi: str
2222
category: str
23-
author_list: list[str]
23+
authors: list[str]
2424
author_corresponding: str
2525
author_corresponding_institution: str
2626
date: datetime.date
@@ -32,13 +32,11 @@ class RxivContentDetail:
3232
published: str
3333
server: Optional[str] # not guaranteed
3434

35-
_article_cit_fmt = "{author}. {title}. {journal}. {year}; {volume}:{pages}.{doi}"
36-
3735
def __init__(self, metadata: dict[str, str]) -> None:
3836
self.title = metadata["title"]
3937
self.doi = metadata["doi"]
4038
self.category = metadata["category"]
41-
self.author_list = [s.strip() for s in metadata.get("authors", "").split(";")]
39+
self.authors = [s.strip() for s in metadata.get("authors", "").split(";")]
4240
self.author_corresponding = metadata["author_corresponding"]
4341
self.author_corresponding_institution = metadata["author_corresponding_institution"]
4442
self.date = datetime.datetime.strptime(metadata["date"], "%Y-%m-%d")
@@ -50,46 +48,13 @@ def __init__(self, metadata: dict[str, str]) -> None:
5048
self.published = metadata["published"]
5149
self.server = metadata.get("server")
5250

53-
def _format_authors(self) -> str:
54-
"""Helper function for returning a well formatted HTML author list"""
55-
if self.author_list and len(self.author_list) > 2:
56-
author = self.author_list[0] + ", <i>et al</i>"
57-
elif self.author_list and len(self.author_list) == 2:
58-
author = " and ".join([author for author in self.author_list])
59-
elif self.author_list and len(self.author_list) < 2:
60-
author = self.author_list[0]
61-
else:
62-
author = ""
63-
64-
return author
65-
6651
@property
6752
def first_author(self) -> Optional[str]:
68-
if len(self.author_list) > 0:
69-
return self.author_list[0]
53+
if len(self.authors) > 0:
54+
return self.authors[0]
7055
else:
7156
return None
7257

73-
@property
74-
def citation_html(self):
75-
"""
76-
Return a well formatted citation HTML string based on pre-print article data.
77-
Intends to return an identical citation html string to metapub.PubMedArticle.
78-
"""
79-
author = self._format_authors()
80-
doi_str = "" if not self.doi else self.doi
81-
title = "(None)" if not self.title else self.title.strip(".")
82-
journal = "(None)" if not self.server else self.server.strip(".")
83-
year = "(Unknown yeaer)" if not self.date.year else self.date.year
84-
85-
# We don't receive these fields from rxiv platforms
86-
volume = "(Unknown volume)"
87-
pages = "(Unknown pages)"
88-
89-
return self._article_cit_fmt.format(
90-
author=author, volume=volume, pages=pages, year=year, title=title, journal=journal, doi=doi_str
91-
)
92-
9358

9459
class RxivPublication:
9560
"""
@@ -167,46 +132,6 @@ def first_author(self) -> Optional[str]:
167132
else:
168133
return None
169134

170-
@property
171-
def citation_html(self):
172-
"""
173-
Return a well formatted citation HTML string based on pre-print article data.
174-
Intends to return an identical citation html string to metapub.PubMedArticle.
175-
"""
176-
author = self._format_authors()
177-
doi_str = "" if not self.preprint_doi else self.preprint_doi
178-
title = "(None)" if not self.preprint_title else self.preprint_title.strip(".")
179-
journal = "(None)" if not self.preprint_platform else self.preprint_platform.strip(".")
180-
year = "(Unknown yeaer)" if not self.preprint_date.year else self.preprint_date.year
181-
182-
# We don't receive these fields from rxiv platforms
183-
volume = "(Unknown volume)"
184-
pages = "(Unknown pages)"
185-
186-
return self._article_cit_fmt.format(
187-
author=author, volume=volume, pages=pages, year=year, title=title, journal=journal, doi=doi_str
188-
)
189-
190-
@property
191-
def publication_citation_html(self):
192-
"""
193-
Return a well formatted citation HTML string based on publication data.
194-
Intends to return an identical citation html string to metapub.PubMedArticle.
195-
"""
196-
author = self._format_authors()
197-
doi_str = "" if not self.published_doi else self.published_doi
198-
title = "(None)" if not self.preprint_title else self.preprint_title.strip(".")
199-
journal = "(None)" if not self.published_journal else self.published_journal.strip(".")
200-
year = "(Unknown yeaer)" if not self.published_date.year else self.published_date.year
201-
202-
# We don't receive these fields from rxiv platforms
203-
volume = "(Unknown volume)"
204-
pages = "(Unknown pages)"
205-
206-
return self._article_cit_fmt.format(
207-
author=author, volume=volume, pages=pages, year=year, title=title, journal=journal, doi=doi_str
208-
)
209-
210135

211136
class RxivStatistics:
212137
interval: str

src/mavedb/server_main.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
from slack_sdk.webhook import WebhookClient
1313
from sqlalchemy.orm import configure_mappers
1414
from starlette import status
15-
from starlette.responses import JSONResponse, Response
16-
from metapub.exceptions import InvalidPMID
15+
from starlette.responses import JSONResponse
16+
from eutils._internal.exceptions import EutilsRequestError
1717

1818
from mavedb.models import *
1919

@@ -94,8 +94,8 @@ async def nonexistent_identifier_error_exception_handler(request: Request, exc:
9494
)
9595

9696

97-
@app.exception_handler(InvalidPMID)
98-
async def nonexistent_pmid_error_exception_handler(request: Request, exc: InvalidPMID):
97+
@app.exception_handler(EutilsRequestError)
98+
async def nonexistent_pmid_error_exception_handler(request: Request, exc: EutilsRequestError):
9999
return JSONResponse(
100100
status_code=404,
101101
content={"message": str(exc)},

0 commit comments

Comments
 (0)