diff --git a/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java b/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java
index 5aae78d1729..c1152dcb167 100644
--- a/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java
+++ b/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java
@@ -364,4 +364,35 @@ public Optional<BibEntry> containsDuplicate(final BibDatabase database,
 
         return database.getEntries().stream().filter(other -> isDuplicate(entry, other, bibDatabaseMode)).findFirst();
     }
+
+    /**
+     * Computes how similar two entries are, looking only at the fields that both entries have set.
+     * Each common field is compared with {@link StringSimilarity} and the per-field results are averaged,
+     * so every differing field lowers the score.
+     * <p>
+     * If the result is zero, it means that either no common fields were found
+     * or that all common fields were very far apart lexically.
+     * <p>
+     * If the result is one, it means that there was at least one common field
+     * and all the common fields were the same.
+     * <p>
+     * Similar entries have a score above 0.8.
+     *
+     * @param one the first entry
+     * @param two the second entry
+     * @return a number in [0,1]: 1 means all common fields are the same (one entry may have more fields), 0 means completely different
+     */
+    public double degreeOfSimilarity(final BibEntry one, final BibEntry two) {
+        StringSimilarity stringSimilarity = new StringSimilarity();
+        return one.getFields(field -> two.getField(field).isPresent())
+                  .stream()
+                  .mapToDouble(field -> {
+                      // The selector above keeps only fields that are also set on the second entry.
+                      String first = one.getField(field).orElseThrow();
+                      String second = two.getField(field).orElseThrow();
+                      return stringSimilarity.similarity(first, second);
+                  })
+                  .average()
+                  .orElse(0.0);
+    }
 }
diff --git a/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java
new file mode 100644
index 00000000000..fced43e5e35
--- /dev/null
+++ b/jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java
@@ -0,0 +1,294 @@
+package org.jabref.logic.integrity;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.Set;
+
+import org.jabref.logic.database.DuplicateCheck;
+import org.jabref.logic.importer.FetcherException;
+import org.jabref.logic.importer.IdBasedFetcher;
+import org.jabref.logic.importer.fetcher.ArXivFetcher;
+import org.jabref.logic.importer.fetcher.CrossRef;
+import org.jabref.logic.importer.fetcher.DoiFetcher;
+import org.jabref.model.entry.BibEntry;
+import org.jabref.model.entry.BibEntryTypesManager;
+import org.jabref.model.entry.identifier.ArXivIdentifier;
+import org.jabref.model.entry.identifier.DOI;
+
+import com.airhacks.afterburner.injection.Injector;
+
+/**
+ * Checks whether a {@link BibEntry} is consistent with the entry that online fetchers
+ * (DOI, CrossRef, arXiv) return for it.
+ */
+public class RefChecker {
+    /** Minimum similarity for a fetched entry to count as the same reference. */
+    private static final double REAL_THRESHOLD = 0.999;
+    /** Similarity that must be exceeded for a fetched entry to count as a possible match. */
+    private static final double UNSURE_THRESHOLD = 0.8;
+
+    private final DoiFetcher doiFetcher;
+    private final ArXivFetcher arxivFetcher;
+    private final CrossRef crossRef;
+    private final DuplicateCheck duplicateCheck;
+
+    /**
+     * Creates a checker that uses a new {@link CrossRef} fetcher and a new {@link DuplicateCheck}.
+     */
+    public RefChecker(DoiFetcher doiFetcher, ArXivFetcher arXivFetcher) {
+        this(doiFetcher, arXivFetcher, new CrossRef(), new DuplicateCheck(Injector.instantiateModelOrService(BibEntryTypesManager.class)));
+    }
+
+    public RefChecker(
+            DoiFetcher doiFetcher,
+            ArXivFetcher arXivFetcher,
+            CrossRef crossRef,
+            DuplicateCheck duplicateCheck) {
+        this.doiFetcher = doiFetcher;
+        this.arxivFetcher = arXivFetcher;
+        this.crossRef = crossRef;
+        this.duplicateCheck = duplicateCheck;
+    }
+
+    /**
+     * Tries the available lookups in order and combines their results.
+     * As soon as one lookup reports {@link Real}, the remaining lookups are skipped.
+     * <p>
+     * DoiFetcher -> CrossRef -> ArxivFetcher
+     *
+     * @param entry the entry to check
+     * @return the reference validity
+     * @throws FetcherException any error from fetchers
+     */
+    public ReferenceValidity referenceValidityOfEntry(BibEntry entry) throws FetcherException {
+        return validityFromDoiFetcher(entry)
+                .lazyOr(() -> validityFromCrossRef(entry))
+                .lazyOr(() -> validityFromArxiv(entry));
+    }
+
+    /**
+     * Looks the entry up by its own DOI with the given fetcher and compares the result with the entry.
+     *
+     * @return {@link Fake} if the entry has no DOI or the fetcher finds nothing, otherwise the comparison result
+     */
+    private ReferenceValidity validityFromFetcher(BibEntry entry, IdBasedFetcher fetcher) throws FetcherException {
+        Optional<DOI> doi = entry.getDOI();
+        if (doi.isEmpty()) {
+            return new Fake();
+        }
+
+        Optional<BibEntry> other = fetcher.performSearchById(doi.get().asString());
+        return other.map(foundEntry -> compareReferences(entry, foundEntry))
+                    .orElseGet(Fake::new);
+    }
+
+    /**
+     * Tests validity only from the DoiFetcher.
+     *
+     * @param entry the entry
+     * @return the reference validity
+     * @throws FetcherException the fetcher exception
+     */
+    public ReferenceValidity validityFromDoiFetcher(BibEntry entry) throws FetcherException {
+        return validityFromFetcher(entry, doiFetcher);
+    }
+
+    /**
+     * Asks CrossRef for the DOI of the entry and then fetches that DOI with the DoiFetcher.
+     *
+     * @param entry the entry
+     * @return the reference validity
+     * @throws FetcherException the fetcher exception
+     */
+    public ReferenceValidity validityFromCrossRef(BibEntry entry) throws FetcherException {
+        Optional<DOI> doiFound = crossRef.findIdentifier(entry);
+        if (doiFound.isEmpty()) {
+            return new Fake();
+        }
+
+        return doiFetcher.performSearchById(doiFound.get().asString())
+                         .map(found -> compareReferences(entry, found))
+                         .orElseGet(Fake::new);
+    }
+
+    /**
+     * Asks the arXiv fetcher for the identifier of the entry and then fetches that identifier from arXiv.
+     *
+     * @param entry the entry
+     * @return the reference validity
+     * @throws FetcherException the fetcher exception
+     */
+    public ReferenceValidity validityFromArxiv(BibEntry entry) throws FetcherException {
+        Optional<ArXivIdentifier> foundIdentifier = arxivFetcher.findIdentifier(entry);
+        if (foundIdentifier.isEmpty()) {
+            return new Fake();
+        }
+
+        return arxivFetcher.performSearchById(foundIdentifier.get().asString())
+                           .map(foundEntry -> compareReferences(entry, foundEntry))
+                           .orElseGet(Fake::new);
+    }
+
+    /**
+     * Checks every entry of the list and maps it to its reference validity.
+     *
+     * @param entries the entries
+     * @return a map from each entry to its reference validity
+     * @throws FetcherException the fetcher exception
+     */
+    public Map<BibEntry, ReferenceValidity> validateListOfEntries(List<BibEntry> entries) throws FetcherException {
+        Map<BibEntry, ReferenceValidity> entriesToValidity = new HashMap<>();
+        for (BibEntry entry : entries) {
+            entriesToValidity.put(entry, referenceValidityOfEntry(entry));
+        }
+        return entriesToValidity;
+    }
+
+    /**
+     * Turns the similarity between the local entry and the fetched entry into a verdict.
+     */
+    private ReferenceValidity compareReferences(BibEntry localEntry, BibEntry validFoundEntry) {
+        double similarity = duplicateCheck.degreeOfSimilarity(localEntry, validFoundEntry);
+        if (similarity >= REAL_THRESHOLD) {
+            return new Real(validFoundEntry);
+        } else if (similarity > UNSURE_THRESHOLD) {
+            return new Unsure(validFoundEntry);
+        } else {
+            return new Fake();
+        }
+    }
+
+    @FunctionalInterface
+    private interface ReferenceValiditySupplier {
+        ReferenceValidity get() throws FetcherException;
+    }
+
+    /**
+     * Result of a check: {@link Real}, {@link Unsure} or {@link Fake}.
+     */
+    public abstract static sealed class ReferenceValidity permits Real, Unsure, Fake {
+
+        /**
+         * Combines two results and keeps the stronger one.
+         * A {@link Real} receiver always wins, a {@link Fake} argument never wins,
+         * and two {@link Unsure} results are merged into one that holds the candidates of both.
+         *
+         * @param other the result to combine with
+         * @return the combined result
+         */
+        public ReferenceValidity or(ReferenceValidity other) {
+            if (this instanceof Real || other instanceof Fake) {
+                return this;
+            }
+            if (other instanceof Unsure otherUnsure && this instanceof Unsure thisUnsure) {
+                Unsure merge = new Unsure();
+                merge.addAll(thisUnsure);
+                merge.addAll(otherUnsure);
+                return merge;
+            }
+            return other;
+        }
+
+        /**
+         * Same as {@link #or(ReferenceValidity)}, but only evaluates the supplier if this result is not {@link Real}.
+         */
+        private ReferenceValidity lazyOr(ReferenceValiditySupplier other) throws FetcherException {
+            if (this instanceof Real) {
+                return this;
+            }
+            return or(other.get());
+        }
+    }
+
+    /**
+     * The fetched entry matches the checked entry.
+     */
+    public static final class Real extends ReferenceValidity {
+        private final BibEntry matchingReference;
+
+        public Real(BibEntry matchingReference) {
+            this.matchingReference = matchingReference;
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) {
+                return true;
+            }
+            if (o == null || getClass() != o.getClass()) {
+                return false;
+            }
+            Real real = (Real) o;
+            return Objects.equals(matchingReference, real.matchingReference);
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hashCode(matchingReference);
+        }
+
+        public BibEntry getMatchingReference() {
+            return matchingReference;
+        }
+    }
+
+    /**
+     * One or more fetched entries are close to the checked entry, but none matches it.
+     */
+    public static final class Unsure extends ReferenceValidity {
+        private final Set<BibEntry> matchingReferences;
+
+        public Unsure(BibEntry matchingReference) {
+            this.matchingReferences = new HashSet<>(Set.of(matchingReference));
+        }
+
+        private Unsure() {
+            this.matchingReferences = new HashSet<>();
+        }
+
+        void addAll(Unsure other) {
+            this.matchingReferences.addAll(other.matchingReferences);
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) {
+                return true;
+            }
+            if (o == null || getClass() != o.getClass()) {
+                return false;
+            }
+            Unsure unsure = (Unsure) o;
+            return Objects.equals(matchingReferences, unsure.matchingReferences);
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hashCode(matchingReferences);
+        }
+
+        public Set<BibEntry> getMatchingReferences() {
+            return matchingReferences;
+        }
+    }
+
+    /**
+     * No fetched entry was found, or the fetched entry is too different from the checked entry.
+     */
+    public static final class Fake extends ReferenceValidity {
+        @Override
+        public boolean equals(Object o) {
+            // All Fake instances are equal; instanceof is false for null, so equals(null) does not throw.
+            return o instanceof Fake;
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hashCode(Fake.class);
+        }
+    }
+}
diff --git a/jablib/src/test/java/org/jabref/logic/database/DuplicateCheckTest.java b/jablib/src/test/java/org/jabref/logic/database/DuplicateCheckTest.java
index 609b50109dc..c428a8a9961 100644
--- a/jablib/src/test/java/org/jabref/logic/database/DuplicateCheckTest.java
+++ b/jablib/src/test/java/org/jabref/logic/database/DuplicateCheckTest.java
@@ -7,6 +7,7 @@
 import org.jabref.model.entry.BibEntryTypesManager;
 import org.jabref.model.entry.field.Field;
 import org.jabref.model.entry.field.StandardField;
+import org.jabref.model.entry.field.UnknownField;
 import org.jabref.model.entry.types.StandardEntryType;
 
 import org.junit.jupiter.api.BeforeEach;
@@ -611,4 +612,50 @@ void differentInCollectionWithTheSameISBNAreNotDuplicates() {
 
         assertFalse(duplicateChecker.isDuplicate(entryOne, entryTwo, BibDatabaseMode.BIBTEX));
     }
+
+    @Test
+    void degreeOfSimilarityOfSameEntryIsOne() {
+        assertEquals(1.0, duplicateChecker.degreeOfSimilarity(getSimpleArticle(), getSimpleArticle()));
+        assertEquals(1.0, duplicateChecker.degreeOfSimilarity(getSimpleInCollection(), getSimpleInCollection()));
+    }
+
+    @Test
+    void differentEntriesHaveSmallDegreeOfSimilarity() {
+        assertTrue(0.3 >
+                duplicateChecker.degreeOfSimilarity(
+                        new BibEntry(StandardEntryType.Article)
+                                .withField(StandardField.TITLE, "Some Article"),
+                        new BibEntry(StandardEntryType.InCollection)
+                                .withField(StandardField.TITLE, "Other Collection")
+                )
+        );
+    }
+
+    @Test
+    void entriesWithNoMatchingFieldHaveNoSimilarity() {
+        assertEquals(0.0, duplicateChecker.degreeOfSimilarity(
+                new BibEntry(StandardEntryType.Article)
+                        .withField(StandardField.TITLE, "Some Article"),
+                new BibEntry(StandardEntryType.Article)
+                        .withField(StandardField.AUTHOR, "Some Author")
+        ));
+    }
+
+    @Test
+    void moreFieldsDoesNotAffectTheSimilarity() {
+        assertEquals(1.0, duplicateChecker.degreeOfSimilarity(
+                getSimpleArticle(),
+                getSimpleArticle().withField(new UnknownField("secret"), "Something")
+        ));
+    }
+
+    @Test
+    void similarEntriesHaveAHighDegreeOfSimilarity() {
+        double similarity = duplicateChecker.degreeOfSimilarity(
+                getSimpleArticle().withField(StandardField.YEAR, "2018"),
+                getSimpleArticle()
+        );
+        assertTrue(0.8 < similarity);
+        assertTrue(1.0 > similarity);
+    }
 }
diff --git a/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java b/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java
new file mode 100644
index 00000000000..f835ff5dfe6
--- /dev/null
+++ b/jablib/src/test/java/org/jabref/logic/integrity/RefCheckerTest.java
@@ -0,0 +1,199 @@
+package org.jabref.logic.integrity;
+
+import java.util.List;
+import java.util.Map;
+
+import javafx.collections.FXCollections;
+
+import org.jabref.logic.importer.FetcherException;
+import org.jabref.logic.importer.ImportFormatPreferences;
+import org.jabref.logic.importer.fetcher.ArXivFetcher;
+import org.jabref.logic.importer.fetcher.DoiFetcher;
+import org.jabref.model.entry.BibEntry;
+import org.jabref.model.entry.field.InternalField;
+import org.jabref.model.entry.field.StandardField;
+import org.jabref.model.entry.field.UnknownField;
+import org.jabref.model.entry.types.StandardEntryType;
+
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Nested;
+import org.junit.jupiter.api.Test;
+import org.mockito.Answers;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotEquals;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+public class RefCheckerTest {
+    private static final ImportFormatPreferences IMPORT_FORMAT_PREFERENCES = mock(ImportFormatPreferences.class, Answers.RETURNS_DEEP_STUBS);
+
+    public BibEntry realEntry = new BibEntry(StandardEntryType.InProceedings)
+            .withCitationKey("Decker_2007")
+            .withField(StandardField.AUTHOR, "Decker, Gero and Kopp, Oliver and Leymann, Frank and Weske, Mathias")
+            .withField(StandardField.BOOKTITLE, "IEEE International Conference on Web Services (ICWS 2007)")
+            .withField(StandardField.MONTH, "#jul#")
+            .withField(StandardField.PUBLISHER, "IEEE")
+            .withField(StandardField.TITLE, "BPEL4Chor: Extending BPEL for Modeling Choreographies")
+            .withField(StandardField.YEAR, "2007")
+            .withField(StandardField.PAGES, "296--303")
+            .withField(StandardField.DOI, "10.1109/icws.2007.59");
+    public BibEntry realEntryNoDoi = new BibEntry(StandardEntryType.InProceedings)
+            .withCitationKey("Decker_2007")
+            .withField(StandardField.AUTHOR, "Decker, Gero and Kopp, Oliver and Leymann, Frank and Weske, Mathias")
+            .withField(StandardField.BOOKTITLE, "IEEE International Conference on Web Services (ICWS 2007)")
+            .withField(StandardField.MONTH, "#jul#")
+            .withField(StandardField.PUBLISHER, "IEEE")
+            .withField(StandardField.TITLE, "BPEL4Chor: Extending BPEL for Modeling Choreographies")
+            .withField(StandardField.YEAR, "2007")
+            .withField(StandardField.PAGES, "296--303");
+    public BibEntry realEntryArxiv = new BibEntry(StandardEntryType.Article)
+            .withField(StandardField.TITLE, "The Architecture of Mr. DLib's Scientific Recommender-System API")
+            .withField(StandardField.DATE, "2018-11-26")
+            .withField(StandardField.ABSTRACT, "Recommender systems in academia are not widely available. This may be in part due to the difficulty and cost of developing and maintaining recommender systems. Many operators of academic products such as digital libraries and reference managers avoid this effort, although a recommender system could provide significant benefits to their users. In this paper, we introduce Mr. DLib's \"Recommendations as-a-Service\" (RaaS) API that allows operators of academic products to easily integrate a scientific recommender system into their products. Mr. DLib generates recommendations for research articles but in the future, recommendations may include call for papers, grants, etc. Operators of academic products can request recommendations from Mr. DLib and display these recommendations to their users. Mr. DLib can be integrated in just a few hours or days; creating an equivalent recommender system from scratch would require several months for an academic operator. Mr. DLib has been used by GESIS Sowiport and by the reference manager JabRef. Mr. DLib is open source and its goal is to facilitate the application of, and research on, scientific recommender systems. In this paper, we present the motivation for Mr. DLib, the architecture and details about the effectiveness. Mr. DLib has delivered 94m recommendations over a span of two years with an average click-through rate of 0.12%.")
+            .withField(StandardField.EPRINT, "1811.10364")
+            .withField(StandardField.FILE, ":http\\://arxiv.org/pdf/1811.10364v1:PDF")
+            .withField(StandardField.EPRINTTYPE, "arXiv")
+            .withField(StandardField.EPRINTCLASS, "cs.IR")
+            .withField(new UnknownField("copyright"), "arXiv.org perpetual, non-exclusive license")
+            .withField(InternalField.KEY_FIELD, "https://doi.org/10.48550/arxiv.1811.10364")
+            .withField(StandardField.YEAR, "2018")
+            .withField(StandardField.KEYWORDS, "Information Retrieval (cs.IR), Artificial Intelligence (cs.AI), Digital Libraries (cs.DL), Machine Learning (cs.LG), FOS: Computer and information sciences")
+            .withField(StandardField.AUTHOR, "Beel, Joeran and Collins, Andrew and Aizawa, Akiko")
+            .withField(StandardField.PUBLISHER, "arXiv")
+            .withField(StandardField.DOI, "10.48550/ARXIV.1811.10364");
+    public BibEntry closeToRealEntry = new BibEntry(StandardEntryType.InProceedings)
+            .withCitationKey("Decker_2007")
+            .withField(StandardField.AUTHOR, "Decker, Gero and Kopp, Oliver and Leymann, Frank and Weske, Mathias")
+            .withField(StandardField.BOOKTITLE, "IEEE International Conference on Web Services (ICWS 2007)")
+            .withField(StandardField.MONTH, "#jul#")
+            .withField(StandardField.PUBLISHER, "IEEE")
+            .withField(StandardField.TITLE, "BPEL4Chor: Extending BPEL for Modeling Choreographies")
+            .withField(StandardField.YEAR, "2008") // Incorrect Field
+            .withField(StandardField.PAGES, "296--303")
+            .withField(StandardField.DOI, "10.1109/icws.2007.59");
+    public BibEntry fakeEntry = new BibEntry(StandardEntryType.InProceedings)
+            .withCitationKey("Decker_2003")
+            .withField(StandardField.AUTHOR, "Kopp, Oliver")
+            .withField(StandardField.BOOKTITLE, "IEEE International Conference on Web Services (ICWS 2007)")
+            .withField(StandardField.MONTH, "#jul#")
+            .withField(StandardField.PUBLISHER, "IEEE")
+            .withField(StandardField.TITLE, "Some Title")
+            .withField(StandardField.YEAR, "2013")
+            .withField(StandardField.PAGES, "296--303");
+
+    public RefChecker refChecker;
+
+    @BeforeAll
+    public static void setUpAll() {
+        when(IMPORT_FORMAT_PREFERENCES.bibEntryPreferences().getKeywordSeparator()).thenReturn(',');
+        // Used during DOI fetch process
+        when(IMPORT_FORMAT_PREFERENCES.fieldPreferences().getNonWrappableFields()).thenReturn(
+                FXCollections.observableArrayList(List.of(
+                        StandardField.PDF,
+                        StandardField.PS,
+                        StandardField.URL,
+                        StandardField.DOI,
+                        StandardField.FILE,
+                        StandardField.ISBN,
+                        StandardField.ISSN)));
+    }
+
+    @BeforeEach
+    public void setUp() {
+        ArXivFetcher af = new ArXivFetcher(IMPORT_FORMAT_PREFERENCES);
+        DoiFetcher df = new DoiFetcher(IMPORT_FORMAT_PREFERENCES);
+        this.refChecker = new RefChecker(df, af);
+    }
+
+    @Test
+    void findsRealEntry() throws FetcherException {
+        RefChecker.ReferenceValidity rv = refChecker.referenceValidityOfEntry(realEntry);
+        assertEquals(RefChecker.Real.class, rv.getClass());
+    }
+
+    @Test
+    void findsRealEntryFromDoi() throws FetcherException {
+        RefChecker.ReferenceValidity rv = refChecker.validityFromDoiFetcher(realEntry);
+        assertEquals(RefChecker.Real.class, rv.getClass());
+    }
+
+    @Test
+    void closeToRealEntry() throws FetcherException {
+        RefChecker.ReferenceValidity rv = refChecker.referenceValidityOfEntry(closeToRealEntry);
+        assertEquals(RefChecker.Unsure.class, rv.getClass());
+    }
+
+    @Test
+    void findsRealEntryWithoutDoi() throws FetcherException {
+        RefChecker.ReferenceValidity rv = refChecker.referenceValidityOfEntry(realEntryNoDoi);
+        assertEquals(RefChecker.Real.class, rv.getClass());
+    }
+
+    @Test
+    void noFakeEntry() throws FetcherException {
+        RefChecker.ReferenceValidity rv = refChecker.referenceValidityOfEntry(fakeEntry);
+        assertEquals(RefChecker.Fake.class, rv.getClass());
+    }
+
+    @Test
+    void findsRealFromArxiv() throws FetcherException {
+        RefChecker.ReferenceValidity rv = refChecker.referenceValidityOfEntry(realEntryArxiv);
+        assertEquals(RefChecker.Real.class, rv.getClass());
+        assertEquals(RefChecker.Real.class, refChecker.validityFromArxiv(realEntryArxiv).getClass());
+    }
+
+    @Test
+    void validateListOfEntriesTest() throws FetcherException {
+        List<BibEntry> entries = List.of(realEntry, realEntryNoDoi, fakeEntry);
+        Map<BibEntry, RefChecker.ReferenceValidity> e = refChecker.validateListOfEntries(entries);
+
+        assertEquals(3, e.size());
+        assertEquals(RefChecker.Real.class, e.get(realEntry).getClass());
+        assertEquals(RefChecker.Real.class, e.get(realEntryNoDoi).getClass());
+        assertEquals(RefChecker.Fake.class, e.get(fakeEntry).getClass());
+    }
+
+    @Nested
+    public class ReferenceValidityTest {
+        @Test
+        void realEquals() {
+            RefChecker.ReferenceValidity t1 = new RefChecker.Real(realEntry);
+            RefChecker.ReferenceValidity t2 = new RefChecker.Real(realEntry);
+            assertEquals(t1, t2);
+            assertNotEquals(t1, new RefChecker.Real(fakeEntry));
+        }
+
+        @Test
+        void fakeEquals() {
+            RefChecker.ReferenceValidity t1 = new RefChecker.Real(null);
+            RefChecker.ReferenceValidity t2 = new RefChecker.Fake();
+
+            assertNotEquals(t1, t2);
+            assertNotEquals(t2, null);
+
+            assertEquals(t2, new RefChecker.Fake());
+        }
+
+        @Test
+        void orTest() {
+            RefChecker.ReferenceValidity t1 = new RefChecker.Real(realEntry);
+            RefChecker.ReferenceValidity t2 = new RefChecker.Real(fakeEntry);
+            RefChecker.ReferenceValidity t3 = new RefChecker.Fake();
+            assertEquals(t1, t1.or(t2));
+            assertEquals(t1, t1.or(t3));
+            assertEquals(t2, t3.or(t2));
+        }
+
+        @Test
+        void unsureTest() {
+            RefChecker.ReferenceValidity t1 = new RefChecker.Unsure(realEntry);
+            RefChecker.ReferenceValidity t2 = new RefChecker.Unsure(fakeEntry);
+            assertNotEquals(t1, t2);
+            RefChecker.ReferenceValidity result = t1.or(t2);
+            RefChecker.ReferenceValidity otherResult = t2.or(t1);
+            assertEquals(result, otherResult);
+        }
+    }
+}