-
-
Notifications
You must be signed in to change notification settings - Fork 3k
Refchecker Logic Functionality #14213
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 16 commits
50c3ca9
eeec045
735f985
be4d588
9939899
037d243
9725af0
43dc552
b9027da
ead14fd
e442b24
151aaa7
c8bfb21
6d783e9
f77341a
a33eb13
6c1aa0e
80b8058
6f0e9c7
b98b78a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -364,4 +364,32 @@ public Optional<BibEntry> containsDuplicate(final BibDatabase database, | |
|
|
||
| return database.getEntries().stream().filter(other -> isDuplicate(entry, other, bibDatabaseMode)).findFirst(); | ||
| } | ||
|
|
||
| /** | ||
| * Checks across all fields of the entries, | ||
| * any matching ones get compared. | ||
| * If they are not the same the score goes down. | ||
| * The score goes down depending on the | ||
| * Levenshtein distance between the two entries. | ||
| * <p> | ||
| * If the result is zero, it means that either no common fields were found | ||
| * or that all common fields were very far apart lexically. | ||
| * <p> | ||
| * If the result is one, it means that there was at least one common field | ||
| * and all the common fields were the same. | ||
| * <p> | ||
| * Similar entries have a score of above 0.8 | ||
| * | ||
| * @param one The first entry | ||
| * @param two The second entry | ||
| * @return number [0,1] 1 representing the same (one potentially having more fields), 0 representing completely different | ||
| */ | ||
| public double degreeOfSimilarity(final BibEntry one, final BibEntry two) { | ||
subhramit marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| return one.getFields((f) -> two.getField(f).isPresent()) | ||
|
||
| .stream().mapToDouble((field) -> { | ||
| String first = one.getField(field).get(); | ||
| String second = two.getField(field).get(); | ||
| return new StringSimilarity().similarity(first, second); | ||
|
||
| }).average().orElse(0.0); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,244 @@ | ||||||||||
| package org.jabref.logic.integrity; | ||||||||||
|
|
||||||||||
| import java.util.HashMap; | ||||||||||
| import java.util.HashSet; | ||||||||||
| import java.util.List; | ||||||||||
| import java.util.Map; | ||||||||||
| import java.util.Objects; | ||||||||||
| import java.util.Optional; | ||||||||||
| import java.util.Set; | ||||||||||
|
|
||||||||||
| import org.jabref.logic.database.DuplicateCheck; | ||||||||||
| import org.jabref.logic.importer.FetcherException; | ||||||||||
| import org.jabref.logic.importer.IdBasedFetcher; | ||||||||||
| import org.jabref.logic.importer.fetcher.ArXivFetcher; | ||||||||||
| import org.jabref.logic.importer.fetcher.CrossRef; | ||||||||||
| import org.jabref.logic.importer.fetcher.DoiFetcher; | ||||||||||
| import org.jabref.model.entry.BibEntry; | ||||||||||
| import org.jabref.model.entry.BibEntryTypesManager; | ||||||||||
| import org.jabref.model.entry.identifier.DOI; | ||||||||||
|
|
||||||||||
| /** | ||||||||||
| * Validates a BibEntry depending on if it | ||||||||||
| * is consistent with the fetched Entry | ||||||||||
| */ | ||||||||||
| public class RefChecker { | ||||||||||
| DoiFetcher doiFetcher; | ||||||||||
| ArXivFetcher arxivFetcher; | ||||||||||
| CrossRef crossRef; | ||||||||||
| DuplicateCheck duplicateCheck; | ||||||||||
|
Comment on lines
+29
to
+32
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These should be private? |
||||||||||
|
|
||||||||||
| public RefChecker( | ||||||||||
| DoiFetcher doiFetcher, | ||||||||||
| ArXivFetcher arXivFetcher) { | ||||||||||
|
||||||||||
| this(doiFetcher, arXivFetcher, new CrossRef(), new DuplicateCheck(new BibEntryTypesManager())); | ||||||||||
|
||||||||||
| } | ||||||||||
|
|
||||||||||
| public RefChecker( | ||||||||||
| DoiFetcher doiFetcher, | ||||||||||
| ArXivFetcher arXivFetcher, | ||||||||||
| CrossRef crossRef, | ||||||||||
| DuplicateCheck duplicateCheck) { | ||||||||||
| this.doiFetcher = doiFetcher; | ||||||||||
| this.arxivFetcher = arXivFetcher; | ||||||||||
| this.crossRef = crossRef; | ||||||||||
| this.duplicateCheck = duplicateCheck; | ||||||||||
| } | ||||||||||
|
|
||||||||||
| /** | ||||||||||
| * Tries to find the best reference validity | ||||||||||
| * among current ways. If any of the methods signal | ||||||||||
| * that it is real, it returns early. | ||||||||||
| * <p> | ||||||||||
| * DoiFetcher -> CrossRef -> ArxivFetcher | ||||||||||
| * | ||||||||||
| * @param entry entry checking | ||||||||||
| * @return the reference validity | ||||||||||
| * @throws FetcherException any error from fetchers | ||||||||||
| */ | ||||||||||
|
Comment on lines
+49
to
+59
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Try to use Markdown as JavaDoc format. Otherwise you need to escape |
||||||||||
| public ReferenceValidity referenceValidityOfEntry(BibEntry entry) throws FetcherException { | ||||||||||
| return validityFromDoiFetcher(entry).lazyOr(() -> | ||||||||||
| validityFromCrossRef(entry) | ||||||||||
| ).lazyOr(() -> validityFromArxiv(entry)); | ||||||||||
| } | ||||||||||
|
|
||||||||||
| private ReferenceValidity validityFromFetcher(BibEntry entry, IdBasedFetcher fetcher) throws FetcherException { | ||||||||||
| Optional<DOI> doi = entry.getDOI(); | ||||||||||
| if (doi.isEmpty()) { | ||||||||||
| return new Fake(); | ||||||||||
| } | ||||||||||
|
|
||||||||||
| Optional<BibEntry> other = fetcher.performSearchById(doi.get().asString()); | ||||||||||
| return other.map(o -> compareReferences(entry, o)) | ||||||||||
|
||||||||||
| .orElse(new Fake()); | ||||||||||
| } | ||||||||||
|
|
||||||||||
| /** | ||||||||||
| * Tests validity only from the DoiFetcher. | ||||||||||
| * | ||||||||||
| * @param entry the entry | ||||||||||
| * @return the reference validity | ||||||||||
| * @throws FetcherException the fetcher exception | ||||||||||
| */ | ||||||||||
| public ReferenceValidity validityFromDoiFetcher(BibEntry entry) throws FetcherException { | ||||||||||
| return validityFromFetcher(entry, doiFetcher); | ||||||||||
| } | ||||||||||
|
|
||||||||||
| /** | ||||||||||
| * Validity only from the CrossRef and later from the DoiFetcher. | ||||||||||
| * | ||||||||||
| * @param entry the entry | ||||||||||
| * @return the reference validity | ||||||||||
| * @throws FetcherException the fetcher exception | ||||||||||
| */ | ||||||||||
| public ReferenceValidity validityFromCrossRef(BibEntry entry) throws FetcherException { | ||||||||||
| Optional<DOI> doiFound = crossRef.findIdentifier(entry); | ||||||||||
|
|
||||||||||
| if (doiFound.isEmpty()) { | ||||||||||
| return new Fake(); | ||||||||||
| } else { | ||||||||||
| DOI doi = doiFound.get(); | ||||||||||
| return doiFetcher.performSearchById(doi.asString()).map( | ||||||||||
| (found) -> compareReferences(entry, found) | ||||||||||
| ).orElse(new Fake()); | ||||||||||
| } | ||||||||||
| } | ||||||||||
|
|
||||||||||
| /** | ||||||||||
| * Validity only from the arxivFetcher. | ||||||||||
| * | ||||||||||
| * @param entry the entry | ||||||||||
| * @return the reference validity | ||||||||||
| * @throws FetcherException the fetcher exception | ||||||||||
| */ | ||||||||||
| public ReferenceValidity validityFromArxiv(BibEntry entry) throws FetcherException { | ||||||||||
|
|
||||||||||
| var m = arxivFetcher.findIdentifier(entry); | ||||||||||
|
||||||||||
| if (m.isEmpty()) { | ||||||||||
|
||||||||||
| return new Fake(); | ||||||||||
| } | ||||||||||
| return arxivFetcher.performSearchById(m.get().asString()).map( | ||||||||||
| found -> compareReferences(entry, found) | ||||||||||
| ).orElse(new Fake()); | ||||||||||
| } | ||||||||||
|
|
||||||||||
| /** | ||||||||||
| * Takes a list for entries and returns the mapping of them with their corresponding | ||||||||||
| * reference validity. | ||||||||||
| * | ||||||||||
| * @param entries the entries | ||||||||||
| * @return the map | ||||||||||
| * @throws FetcherException the fetcher exception | ||||||||||
| */ | ||||||||||
| public Map<BibEntry, ReferenceValidity> validateListOfEntries(List<BibEntry> entries) throws FetcherException { | ||||||||||
|
|
||||||||||
| Map<BibEntry, ReferenceValidity> entriesToValidity = new HashMap<>(); | ||||||||||
| for (BibEntry entry : entries) { | ||||||||||
| entriesToValidity.put(entry, referenceValidityOfEntry(entry)); | ||||||||||
| } | ||||||||||
| return entriesToValidity; | ||||||||||
| } | ||||||||||
|
|
||||||||||
| private ReferenceValidity compareReferences(BibEntry original, BibEntry trueEntry) { | ||||||||||
|
||||||||||
| double similarity = duplicateCheck.degreeOfSimilarity(original, trueEntry); | ||||||||||
| if (similarity >= 0.999) { | ||||||||||
| return new Real(trueEntry); | ||||||||||
| } else if (similarity > 0.8) { | ||||||||||
| return new Unsure(trueEntry); | ||||||||||
| } else { | ||||||||||
| return new Fake(); | ||||||||||
| } | ||||||||||
| } | ||||||||||
|
|
||||||||||
| @FunctionalInterface | ||||||||||
| private interface ReferenceValiditySupplier { | ||||||||||
| ReferenceValidity get() throws FetcherException; | ||||||||||
| } | ||||||||||
|
|
||||||||||
| public static abstract sealed class ReferenceValidity permits Real, Unsure, Fake { | ||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe And I am not sure if there could be a group of other entries in some cases; but this design would be more future proof and more readable. @subhramit |
||||||||||
|
|
||||||||||
| public ReferenceValidity or(ReferenceValidity other) { | ||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add JavaDoc ( |
||||||||||
| if (this instanceof Real || other instanceof Fake) { | ||||||||||
| return this; | ||||||||||
| } | ||||||||||
| if (other instanceof Unsure otherUnsure && this instanceof Unsure thisUnsure) { | ||||||||||
| otherUnsure.addAll(thisUnsure); | ||||||||||
|
||||||||||
| } | ||||||||||
| return other; | ||||||||||
| } | ||||||||||
|
|
||||||||||
| private ReferenceValidity lazyOr(ReferenceValiditySupplier other) throws FetcherException { | ||||||||||
| if (this instanceof Real) { | ||||||||||
| return this; | ||||||||||
| } else { | ||||||||||
| return or(other.get()); | ||||||||||
| } | ||||||||||
| } | ||||||||||
| } | ||||||||||
|
|
||||||||||
| public static final class Real extends ReferenceValidity { | ||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this can be a record? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sadly records already extend the Record class, so they cant inherit any further. I could refactor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch, keep this as a class itself |
||||||||||
| BibEntry matchingReference; | ||||||||||
|
|
||||||||||
| public Real(BibEntry matchingReference) { | ||||||||||
| this.matchingReference = matchingReference; | ||||||||||
| } | ||||||||||
|
|
||||||||||
| @Override | ||||||||||
| public boolean equals(Object o) { | ||||||||||
| if (this == o) { | ||||||||||
| return true; | ||||||||||
| } | ||||||||||
| if (o == null || getClass() != o.getClass()) { | ||||||||||
| return false; | ||||||||||
| } | ||||||||||
| Real real = (Real) o; | ||||||||||
| return Objects.equals(matchingReference, real.matchingReference); | ||||||||||
| } | ||||||||||
|
|
||||||||||
| @Override | ||||||||||
| public int hashCode() { | ||||||||||
| return Objects.hashCode(matchingReference); | ||||||||||
| } | ||||||||||
subhramit marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||
| } | ||||||||||
|
|
||||||||||
| public static final class Unsure extends ReferenceValidity { | ||||||||||
| Set<BibEntry> matchingReferences; | ||||||||||
|
|
||||||||||
| public Unsure(BibEntry matchingReference) { | ||||||||||
| Set<BibEntry> matchingReferences = new HashSet<>(); | ||||||||||
| matchingReferences.add(matchingReference); | ||||||||||
| this.matchingReferences = matchingReferences; | ||||||||||
|
||||||||||
| Set<BibEntry> matchingReferences = new HashSet<>(); | |
| matchingReferences.add(matchingReference); | |
| this.matchingReferences = matchingReferences; | |
| this.matchingReferences = new HashSet<>(Set.of(matchingReference)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(Note: Don't take this suggestion right away, let another maintainer confirm)
Creating a new Fake every time seems a bit odd as unlike Real, it is a stateless marker. This should be a singleton.
Maybe do something like
public static final Fake INSTANCE = new Fake();
private Fake() {}
@Override
public boolean equals(Object o) {
return o instanceof Fake;
}and for the hashcode return an arbitrary constant.
This way, you can just do something like return Fake.INSTANCE instead of return new Fake().
@calixtus do you have a better idea? Something feels odd about the design.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No neeed for an implementation, JDK's default works well in most cases
Override is only necessary if equals is overridden: https://thefinestartist.com/effective-java/09
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,6 +7,7 @@ | |
| import org.jabref.model.entry.BibEntryTypesManager; | ||
| import org.jabref.model.entry.field.Field; | ||
| import org.jabref.model.entry.field.StandardField; | ||
| import org.jabref.model.entry.field.UnknownField; | ||
| import org.jabref.model.entry.types.StandardEntryType; | ||
|
|
||
| import org.junit.jupiter.api.BeforeEach; | ||
|
|
@@ -611,4 +612,50 @@ void differentInCollectionWithTheSameISBNAreNotDuplicates() { | |
|
|
||
| assertFalse(duplicateChecker.isDuplicate(entryOne, entryTwo, BibDatabaseMode.BIBTEX)); | ||
| } | ||
|
|
||
| @Test | ||
| void degreeOfSimilarityOfSameEntryIsOne() { | ||
| assertEquals(1.0, duplicateChecker.degreeOfSimilarity(getSimpleArticle(), getSimpleArticle())); | ||
| assertEquals(1.0, duplicateChecker.degreeOfSimilarity(getSimpleInCollection(), getSimpleInCollection())); | ||
| } | ||
|
Comment on lines
+616
to
+620
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please only a single assertion per method - see https://devdocs.jabref.org/code-howtos/testing.html I think, you can rewrite using I think, this can also be merged |
||
|
|
||
| @Test | ||
| void differentEntriesHaveSmallDegreeOfSimilarity() { | ||
| assertTrue(0.3 > | ||
| duplicateChecker.degreeOfSimilarity( | ||
| new BibEntry(StandardEntryType.Article) | ||
| .withField(StandardField.TITLE, "Some Article"), | ||
| new BibEntry(StandardEntryType.InCollection) | ||
| .withField(StandardField.TITLE, "Other Collection") | ||
| ) | ||
| ); | ||
| } | ||
|
|
||
| @Test | ||
| void entriesWithNoMatchingFieldHaveNoSimilarity() { | ||
| assertEquals(0.0, duplicateChecker.degreeOfSimilarity( | ||
| new BibEntry(StandardEntryType.Article) | ||
| .withField(StandardField.TITLE, "Some Article"), | ||
| new BibEntry(StandardEntryType.Article) | ||
| .withField(StandardField.AUTHOR, "Some Author") | ||
| )); | ||
| } | ||
|
|
||
| @Test | ||
| void moreFieldsDoesNotAffectTheSimilarity() { | ||
| assertEquals(1.0, duplicateChecker.degreeOfSimilarity( | ||
| getSimpleArticle(), | ||
| getSimpleArticle().withField(new UnknownField("secret"), "Something") | ||
| )); | ||
| } | ||
|
|
||
| @Test | ||
| void similarEntriesHaveAHighDegreeOfSimilarity() { | ||
| double similarity = duplicateChecker.degreeOfSimilarity( | ||
| getSimpleArticle().withField(StandardField.YEAR, "2018"), | ||
| getSimpleArticle() | ||
| ); | ||
| assertTrue(0.8 < similarity); | ||
| assertTrue(1.0 > similarity); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
0.8 should be constant in
DuplicateCheck/ linked from there.