Skip to content

Commit d9b90c6

Browse files
Implement fulltext and search based fetcher for EuropePMC (#14079)
* add fulltext
* Implement search based fetcher as well
* Implement search based fetcher as well
* Update jablib/src/main/java/org/jabref/logic/importer/fetcher/EuropePmcFetcher.java Co-authored-by: Subhramit Basu <[email protected]>
* Update jablib/src/main/java/org/jabref/logic/importer/fetcher/EuropePmcFetcher.java Co-authored-by: Subhramit Basu <[email protected]>
* Update jablib/src/main/java/org/jabref/logic/importer/fetcher/EuropePmcFetcher.java Co-authored-by: Subhramit Basu <[email protected]>
* apply suggestions
* Fix failing SLR tests & extract date pattern constant
---------
Co-authored-by: Subhramit Basu <[email protected]>
1 parent c9adc54 commit d9b90c6

File tree

4 files changed

+188
-30
lines changed

4 files changed

+188
-30
lines changed

jabgui/src/test/java/org/jabref/gui/slr/ManageStudyDefinitionViewModelTest.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ void emptyStudyConstructorFillsDatabasesCorrectly() {
4646
new StudyCatalogItem("DBLP", true),
4747
new StudyCatalogItem("DOAB", false),
4848
new StudyCatalogItem("DOAJ", false),
49+
new StudyCatalogItem("Europe/PMCID", false),
4950
new StudyCatalogItem("GVK", false),
5051
new StudyCatalogItem("IEEEXplore", true),
5152
new StudyCatalogItem("INSPIRE", false),
@@ -76,6 +77,7 @@ void studyConstructorFillsDatabasesCorrectly(@TempDir Path tempDir) {
7677
new StudyCatalogItem("DBLP", false),
7778
new StudyCatalogItem("DOAB", false),
7879
new StudyCatalogItem("DOAJ", false),
80+
new StudyCatalogItem("Europe/PMCID", false),
7981
new StudyCatalogItem("GVK", false),
8082
new StudyCatalogItem("IEEEXplore", false),
8183
new StudyCatalogItem("INSPIRE", false),

jablib/src/main/java/org/jabref/logic/importer/WebFetchers.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ public static SortedSet<SearchBasedFetcher> getSearchBasedFetchers(ImportFormatP
135135
set.add(new BiodiversityLibrary(importerPreferences));
136136
set.add(new LOBIDFetcher());
137137
set.add(new ScholarArchiveFetcher());
138+
set.add(new EuropePmcFetcher());
138139
return set;
139140
}
140141

jablib/src/main/java/org/jabref/logic/importer/fetcher/EuropePmcFetcher.java

Lines changed: 151 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
import org.jabref.logic.importer.IdBasedParserFetcher;
1414
import org.jabref.logic.importer.ParseException;
1515
import org.jabref.logic.importer.Parser;
16+
import org.jabref.logic.importer.SearchBasedParserFetcher;
17+
import org.jabref.logic.importer.fetcher.transformers.DefaultSearchQueryTransformer;
1618
import org.jabref.logic.importer.util.JsonReader;
1719
import org.jabref.model.entry.Author;
1820
import org.jabref.model.entry.AuthorList;
@@ -22,21 +24,35 @@
2224
import org.jabref.model.entry.field.UnknownField;
2325
import org.jabref.model.entry.types.EntryType;
2426
import org.jabref.model.entry.types.StandardEntryType;
27+
import org.jabref.model.search.query.BaseQueryNode;
2528

2629
import kong.unirest.core.json.JSONArray;
2730
import kong.unirest.core.json.JSONException;
2831
import kong.unirest.core.json.JSONObject;
32+
import org.apache.hc.core5.net.URIBuilder;
2933
import org.slf4j.Logger;
3034
import org.slf4j.LoggerFactory;
3135

32-
public class EuropePmcFetcher implements IdBasedParserFetcher {
36+
public class EuropePmcFetcher implements IdBasedParserFetcher, SearchBasedParserFetcher {
3337
private static final Logger LOGGER = LoggerFactory.getLogger(EuropePmcFetcher.class);
3438

3539
@Override
3640
public URL getUrlForIdentifier(String identifier) throws URISyntaxException, MalformedURLException {
3741
return new URI("https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=" + identifier + "&resultType=core&format=json").toURL();
3842
}
3943

44+
@Override
45+
public URL getURLForQuery(BaseQueryNode queryList) throws URISyntaxException, MalformedURLException {
46+
DefaultSearchQueryTransformer transformer = new DefaultSearchQueryTransformer();
47+
String query = transformer.transformSearchQuery(queryList).orElse("");
48+
URIBuilder uriBuilder = new URIBuilder("https://www.ebi.ac.uk/europepmc/webservices/rest/search");
49+
// Europe PMC expects a Lucene-like query in the 'query' parameter
50+
uriBuilder.addParameter("query", query);
51+
uriBuilder.addParameter("resultType", "core");
52+
uriBuilder.addParameter("format", "json");
53+
return uriBuilder.build().toURL();
54+
}
55+
4056
@Override
4157
public Parser getParser() {
4258
return inputStream -> {
@@ -54,40 +70,46 @@ private BibEntry jsonItemToBibEntry(JSONObject item) throws ParseException {
5470

5571
LOGGER.debug(result.toString(2));
5672

57-
EntryType entryType = StandardEntryType.Article;
58-
if (result.has("pubTypeList")) {
59-
for (Object o : result.getJSONObject("pubTypeList").getJSONArray("pubType")) {
60-
if ("letter".equalsIgnoreCase(o.toString())) {
61-
entryType = StandardEntryType.Article;
62-
break;
63-
// TODO: handle other types e.g. books
64-
}
65-
}
66-
}
73+
// Determine entry type from pubTypeList if available
74+
EntryType entryType = determineEntryType(result);
6775

6876
BibEntry entry = new BibEntry(entryType);
6977

70-
entry.setField(StandardField.TITLE, result.optString("title"));
71-
entry.setField(StandardField.ABSTRACT, result.optString("abstractText"));
78+
entry.withField(StandardField.TITLE, result.optString("title"))
79+
.withField(StandardField.ABSTRACT, result.optString("abstractText"))
80+
.withField(StandardField.YEAR, result.optString("pubYear"))
81+
.withField(StandardField.PAGES, result.optString("pageInfo"));
7282

73-
entry.setField(StandardField.YEAR, result.optString("pubYear"));
74-
entry.setField(StandardField.VOLUME, result.optString("journalVolume"));
75-
entry.setField(StandardField.ISSUE, result.optString("journalIssue"));
83+
String doi = result.optString("doi");
84+
entry.withField(StandardField.DOI, doi)
85+
.withField(StandardField.PMID, result.optString("pmid"));
7686

77-
String pages = result.optString("pageInfo");
78-
entry.setField(StandardField.PAGES, pages);
79-
80-
entry.setField(StandardField.DOI, result.optString("doi"));
81-
entry.setField(StandardField.PMID, result.optString("pmid"));
82-
83-
// Handle URL
84-
if (result.has("pmid")) {
85-
entry.setField(StandardField.URL, "https://pubmed.ncbi.nlm.nih.gov/" + result.getString("pmid") + "/");
87+
// Prefer fulltext URLs (e.g., PDF) when available, otherwise fall back to DOI or PubMed page
88+
String bestUrl = extractBestFullTextUrl(result).orElseGet(() -> {
89+
if (result.has("pmid")) {
90+
return "https://pubmed.ncbi.nlm.nih.gov/" + result.optString("pmid") + "/";
91+
}
92+
if (doi != null && !doi.isBlank()) {
93+
return "https://doi.org/" + doi;
94+
}
95+
return null;
96+
});
97+
if (bestUrl != null && !bestUrl.isBlank()) {
98+
entry.setField(StandardField.URL, bestUrl);
8699
}
87100

88101
if (result.has("journalInfo") && result.getJSONObject("journalInfo").has("issn")) {
89102
entry.setField(StandardField.ISSN, result.getJSONObject("journalInfo").getString("issn"));
90103
}
104+
// Prefer a full ISO date if provided
105+
final String datePattern = "\\d{4}-\\d{2}-\\d{2}";
106+
String printPubDate = result.optString("printPublicationDate");
107+
String dateOfPublication = result.optString("dateOfPublication");
108+
if (printPubDate != null && printPubDate.matches(datePattern)) {
109+
entry.setField(StandardField.DATE, printPubDate);
110+
} else if (dateOfPublication != null && dateOfPublication.matches(datePattern)) {
111+
entry.setField(StandardField.DATE, dateOfPublication);
112+
}
91113

92114
// Handle authors
93115
if (result.has("authorList") && result.getJSONObject("authorList").has("author")) {
@@ -113,11 +135,41 @@ private BibEntry jsonItemToBibEntry(JSONObject item) throws ParseException {
113135
}
114136
}
115137

138+
if (result.has("keywordList") && result.getJSONObject("keywordList").has("keyword")) {
139+
JSONArray keywords = result.getJSONObject("keywordList").getJSONArray("keyword");
140+
for (int i = 0; i < keywords.length(); i++) {
141+
if (!keywords.isNull(i)) {
142+
String keyword = keywords.optString(i, "").trim();
143+
if (!keyword.isEmpty()) {
144+
entry.addKeyword(keyword, ',');
145+
}
146+
}
147+
}
148+
}
149+
if (result.has("meshHeadingList") && result.getJSONObject("meshHeadingList").has("meshHeading")) {
150+
JSONArray mesh = result.getJSONObject("meshHeadingList").getJSONArray("meshHeading");
151+
for (int i = 0; i < mesh.length(); i++) {
152+
JSONObject meshHeading = mesh.optJSONObject(i);
153+
if (meshHeading != null) {
154+
String descriptor = meshHeading.optString("descriptorName", "").trim();
155+
if (!descriptor.isEmpty()) {
156+
entry.addKeyword(descriptor, ',');
157+
}
158+
} else if (!mesh.isNull(i)) {
159+
// Sometimes MeSH heading may be a plain string
160+
String meshPlain = mesh.optString(i, "").trim();
161+
if (!meshPlain.isEmpty()) {
162+
entry.addKeyword(meshPlain, ',');
163+
}
164+
}
165+
}
166+
}
167+
116168
if (result.has("pubModel")) {
117-
Optional.ofNullable(result.optString("pubModel")).ifPresent(pubModel -> entry.setField(StandardField.HOWPUBLISHED, pubModel));
169+
Optional.ofNullable(result.optString("pubModel")).ifPresent(pubModel -> entry.withField(StandardField.HOWPUBLISHED, pubModel));
118170
}
119171
if (result.has("publicationStatus")) {
120-
Optional.ofNullable(result.optString("publicationStatus")).ifPresent(pubStatus -> entry.setField(StandardField.PUBSTATE, pubStatus));
172+
Optional.ofNullable(result.optString("publicationStatus")).ifPresent(pubStatus -> entry.withField(StandardField.PUBSTATE, pubStatus));
121173
}
122174

123175
if (result.has("journalInfo")) {
@@ -142,6 +194,78 @@ private BibEntry jsonItemToBibEntry(JSONObject item) throws ParseException {
142194
}
143195
}
144196

197+
private EntryType determineEntryType(JSONObject result) {
198+
EntryType defaultType = StandardEntryType.Article;
199+
if (!(result.has("pubTypeList") && result.getJSONObject("pubTypeList").has("pubType"))) {
200+
return defaultType;
201+
}
202+
JSONArray pubTypes = result.getJSONObject("pubTypeList").getJSONArray("pubType");
203+
List<String> types = new ArrayList<>();
204+
for (int i = 0; i < pubTypes.length(); i++) {
205+
types.add(pubTypes.optString(i, "").toLowerCase());
206+
}
207+
if (matchesAny(types, "book chapter") || matchesAny(types, "chapter")) {
208+
return StandardEntryType.InCollection;
209+
}
210+
if (matchesAny(types, "book")) {
211+
return StandardEntryType.Book;
212+
}
213+
if (matchesAny(types, "conference") || matchesAny(types, "proceedings") || matchesAny(types, "conference paper") || matchesAny(types, "proceedings paper")) {
214+
return StandardEntryType.InProceedings;
215+
}
216+
if (matchesAny(types, "phd") || matchesAny(types, "phd thesis") || matchesAny(types, "doctoral thesis")) {
217+
return StandardEntryType.PhdThesis;
218+
}
219+
if (matchesAny(types, "master") || matchesAny(types, "masters thesis") || matchesAny(types, "master's thesis")) {
220+
return StandardEntryType.MastersThesis;
221+
}
222+
// Letters, reviews, editorials are usually articles
223+
return defaultType;
224+
}
225+
226+
// substring matches
227+
private boolean matchesAny(List<String> list, String searchString) {
228+
return list.stream().anyMatch(entry -> entry.contains(searchString));
229+
}
230+
231+
private Optional<String> extractBestFullTextUrl(JSONObject result) {
232+
try {
233+
if (!(result.has("fullTextUrlList") && result.getJSONObject("fullTextUrlList").has("fullTextUrl"))) {
234+
return Optional.empty();
235+
}
236+
JSONArray urls = result.getJSONObject("fullTextUrlList").getJSONArray("fullTextUrl");
237+
// First pass: prefer open/free PDF
238+
for (int i = 0; i < urls.length(); i++) {
239+
JSONObject urlEntry = urls.getJSONObject(i);
240+
String style = urlEntry.optString("documentStyle", "").toLowerCase();
241+
String availability = urlEntry.optString("availability", "").toLowerCase();
242+
String url = urlEntry.optString("url", "");
243+
if (url == null || url.isBlank()) {
244+
continue;
245+
}
246+
if ((availability.contains("open") || availability.contains("free")) && style.contains("pdf")) {
247+
return Optional.of(url);
248+
}
249+
}
250+
// Second pass: any PDF
251+
for (int i = 0; i < urls.length(); i++) {
252+
JSONObject urlEntry = urls.getJSONObject(i);
253+
String style = urlEntry.optString("documentStyle", "").toLowerCase();
254+
String url = urlEntry.optString("url", "");
255+
if (url == null || url.isBlank()) {
256+
continue;
257+
}
258+
if (style.contains("pdf")) {
259+
return Optional.of(url);
260+
}
261+
}
262+
return Optional.empty();
263+
} catch (JSONException e) {
264+
LOGGER.error("Error parsing EuropePMC response for {}", result, e);
265+
return Optional.empty();
266+
}
267+
}
268+
145269
@Override
146270
public void doPostCleanup(BibEntry entry) {
147271
new FieldFormatterCleanup(StandardField.PAGES, new NormalizePagesFormatter()).cleanup(entry);

jablib/src/test/java/org/jabref/logic/importer/fetcher/EuropePmcFetcherTest.java

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,13 @@
1313
import org.junit.jupiter.api.Test;
1414

1515
import static org.junit.jupiter.api.Assertions.assertEquals;
16-
import static org.junit.jupiter.api.Assertions.assertTrue;
1716

1817
@FetcherTest
1918
class EuropePmcFetcherTest {
2019

2120
private EuropePmcFetcher fetcher;
2221
private BibEntry entryWijedasa;
22+
private BibEntry entryWithFulltextAndKeywords;
2323

2424
@BeforeEach
2525
void setUp() {
@@ -40,14 +40,45 @@ void setUp() {
4040
.withField(StandardField.VOLUME, "23")
4141
.withField(StandardField.URL, "https://pubmed.ncbi.nlm.nih.gov/27670948/")
4242
.withField(StandardField.YEAR, "2017");
43+
44+
entryWithFulltextAndKeywords = new BibEntry(StandardEntryType.Article)
45+
.withField(StandardField.AUTHOR, "Okpala, Chibuike and Umeh, Ifeoma and Anagu, Linda Onyeka")
46+
.withField(StandardField.DOI, "10.12688/openresafrica.15809.2")
47+
.withField(StandardField.HOWPUBLISHED, "Electronic-eCollection")
48+
.withField(StandardField.ISSN, "2752-6925")
49+
.withField(StandardField.JOURNAL, "Open research Africa")
50+
.withField(StandardField.KEYWORDS, "rainy season, Preventive Measures, Asymptomatic Malaria, Malaria Transmission.")
51+
.withField(new UnknownField("nlmid"), "9918487345206676")
52+
.withField(StandardField.PAGES, "5")
53+
.withField(StandardField.PMID, "40860931")
54+
.withField(StandardField.PUBSTATE, "epublish")
55+
.withField(StandardField.TITLE, "Economic empowerment and various preventive strategies play a role in reducing asymptomatic malaria towards the end of the rainy season.")
56+
.withField(StandardField.URL, "https://europepmc.org/articles/PMC12375191?pdf=render")
57+
.withField(StandardField.VOLUME, "8")
58+
.withField(StandardField.YEAR, "2025");
4359
}
4460

4561
@Test
4662
void searchByIDWijedasa() throws FetcherException {
4763
Optional<BibEntry> fetchedEntry = fetcher.performSearchById("27670948");
48-
assertTrue(fetchedEntry.isPresent());
49-
5064
fetchedEntry.get().clearField(StandardField.ABSTRACT); // Remove abstract due to copyright
5165
assertEquals(Optional.of(entryWijedasa), fetchedEntry);
5266
}
67+
68+
@Test
69+
void searchByIDDownloadsFulltextAndKeywords() throws FetcherException {
70+
Optional<BibEntry> fetchedEntry;
71+
fetchedEntry = fetcher.performSearchById("40860931");
72+
fetchedEntry.get().clearField(StandardField.ABSTRACT);
73+
assertEquals(Optional.of(entryWithFulltextAndKeywords), fetchedEntry);
74+
}
75+
76+
@Test
77+
void searchByDoiTermReturnsWijedasa() throws FetcherException {
78+
// Use Europe PMC fielded search: DOI
79+
var results = fetcher.performSearch("doi:10.1111/gcb.13516");
80+
BibEntry first = results.getFirst();
81+
first.clearField(StandardField.ABSTRACT);
82+
assertEquals(entryWijedasa, first);
83+
}
5384
}

0 commit comments

Comments
 (0)