-
-
Notifications
You must be signed in to change notification settings - Fork 3k
Refchecker Logic Functionality #14213
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
50c3ca9
eeec045
735f985
be4d588
9939899
037d243
9725af0
43dc552
b9027da
ead14fd
e442b24
151aaa7
c8bfb21
6d783e9
f77341a
a33eb13
6c1aa0e
80b8058
6f0e9c7
b98b78a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,258 @@ | ||
| package org.jabref.logic.integrity; | ||
|
|
||
| import java.util.HashMap; | ||
| import java.util.HashSet; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.Objects; | ||
| import java.util.Optional; | ||
| import java.util.Set; | ||
|
|
||
| import org.jabref.logic.database.DuplicateCheck; | ||
| import org.jabref.logic.importer.FetcherException; | ||
| import org.jabref.logic.importer.IdBasedFetcher; | ||
| import org.jabref.logic.importer.fetcher.ArXivFetcher; | ||
| import org.jabref.logic.importer.fetcher.CrossRef; | ||
| import org.jabref.logic.importer.fetcher.DoiFetcher; | ||
| import org.jabref.model.entry.BibEntry; | ||
| import org.jabref.model.entry.BibEntryTypesManager; | ||
| import org.jabref.model.entry.identifier.ArXivIdentifier; | ||
| import org.jabref.model.entry.identifier.DOI; | ||
|
|
||
| import com.airhacks.afterburner.injection.Injector; | ||
|
|
||
| /** | ||
| * Validates a BibEntry depending on if it | ||
| * is consistent with the fetched Entry | ||
| */ | ||
| public class RefChecker { | ||
| DoiFetcher doiFetcher; | ||
| ArXivFetcher arxivFetcher; | ||
| CrossRef crossRef; | ||
| DuplicateCheck duplicateCheck; | ||
|
Comment on lines
+29
to
+32
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These should be private? |
||
|
|
||
| public RefChecker(DoiFetcher doiFetcher, ArXivFetcher arXivFetcher) { | ||
| this(doiFetcher, arXivFetcher, new CrossRef(), new DuplicateCheck(Injector.instantiateModelOrService(BibEntryTypesManager.class))); | ||
| } | ||
|
|
||
| public RefChecker( | ||
| DoiFetcher doiFetcher, | ||
| ArXivFetcher arXivFetcher, | ||
| CrossRef crossRef, | ||
| DuplicateCheck duplicateCheck) { | ||
| this.doiFetcher = doiFetcher; | ||
| this.arxivFetcher = arXivFetcher; | ||
| this.crossRef = crossRef; | ||
| this.duplicateCheck = duplicateCheck; | ||
| } | ||
|
|
||
| /** | ||
| * Tries to find the best reference validity | ||
| * among current ways. If any of the methods signal | ||
| * that it is real, it returns early. | ||
| * <p> | ||
| * DoiFetcher -> CrossRef -> ArxivFetcher | ||
| * | ||
| * @param entry entry checking | ||
| * @return the reference validity | ||
| * @throws FetcherException any error from fetchers | ||
| */ | ||
|
Comment on lines
+49
to
+59
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Try to use Markdown as JavaDoc format. Otherwise you need to escape |
||
| public ReferenceValidity referenceValidityOfEntry(BibEntry entry) throws FetcherException { | ||
| return validityFromDoiFetcher(entry).lazyOr(() -> | ||
| validityFromCrossRef(entry) | ||
| ).lazyOr(() -> validityFromArxiv(entry)); | ||
| } | ||
|
|
||
| private ReferenceValidity validityFromFetcher(BibEntry entry, IdBasedFetcher fetcher) throws FetcherException { | ||
| Optional<DOI> doi = entry.getDOI(); | ||
| if (doi.isEmpty()) { | ||
| return new Fake(); | ||
| } | ||
|
|
||
| Optional<BibEntry> other = fetcher.performSearchById(doi.get().asString()); | ||
| return other.map(foundEntry -> compareReferences(entry, foundEntry)) | ||
| .orElse(new Fake()); | ||
| } | ||
|
|
||
| /** | ||
| * Tests validity only from the DoiFetcher. | ||
| * | ||
| * @param entry the entry | ||
| * @return the reference validity | ||
| * @throws FetcherException the fetcher exception | ||
| */ | ||
| public ReferenceValidity validityFromDoiFetcher(BibEntry entry) throws FetcherException { | ||
| return validityFromFetcher(entry, doiFetcher); | ||
| } | ||
|
|
||
| /** | ||
| * Validity only from the CrossRef and later from the DoiFetcher. | ||
| * | ||
| * @param entry the entry | ||
| * @return the reference validity | ||
| * @throws FetcherException the fetcher exception | ||
| */ | ||
| public ReferenceValidity validityFromCrossRef(BibEntry entry) throws FetcherException { | ||
| Optional<DOI> doiFound = crossRef.findIdentifier(entry); | ||
|
|
||
| if (doiFound.isEmpty()) { | ||
| return new Fake(); | ||
| } else { | ||
| DOI doi = doiFound.get(); | ||
| return doiFetcher.performSearchById(doi.asString()).map( | ||
| (found) -> compareReferences(entry, found) | ||
| ).orElse(new Fake()); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Validity only from the arxivFetcher. | ||
| * | ||
| * @param entry the entry | ||
| * @return the reference validity | ||
| * @throws FetcherException the fetcher exception | ||
| */ | ||
| public ReferenceValidity validityFromArxiv(BibEntry entry) throws FetcherException { | ||
|
|
||
| Optional<ArXivIdentifier> foundIdentifier = arxivFetcher.findIdentifier(entry); | ||
| if (foundIdentifier.isEmpty()) { | ||
| return new Fake(); | ||
| } | ||
| return arxivFetcher.performSearchById(foundIdentifier.get().asString()).map( | ||
| foundEntry -> compareReferences(entry, foundEntry) | ||
| ).orElse(new Fake()); | ||
| } | ||
|
|
||
| /** | ||
| * Takes a list for entries and returns the mapping of them with their corresponding | ||
| * reference validity. | ||
| * | ||
| * @param entries the entries | ||
| * @return the map | ||
| * @throws FetcherException the fetcher exception | ||
| */ | ||
| public Map<BibEntry, ReferenceValidity> validateListOfEntries(List<BibEntry> entries) throws FetcherException { | ||
|
|
||
| Map<BibEntry, ReferenceValidity> entriesToValidity = new HashMap<>(); | ||
| for (BibEntry entry : entries) { | ||
| entriesToValidity.put(entry, referenceValidityOfEntry(entry)); | ||
| } | ||
| return entriesToValidity; | ||
| } | ||
|
|
||
| private ReferenceValidity compareReferences(BibEntry localEntry, BibEntry validFoundEntry) { | ||
| double similarity = duplicateCheck.degreeOfSimilarity(localEntry, validFoundEntry); | ||
| if (similarity >= 0.999) { | ||
| return new Real(validFoundEntry); | ||
| } else if (similarity > 0.8) { | ||
| return new Unsure(validFoundEntry); | ||
| } else { | ||
| return new Fake(); | ||
| } | ||
| } | ||
|
|
||
| @FunctionalInterface | ||
| private interface ReferenceValiditySupplier { | ||
| ReferenceValidity get() throws FetcherException; | ||
| } | ||
|
|
||
| public static abstract sealed class ReferenceValidity permits Real, Unsure, Fake { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe And I am not sure if there could be a group of other entries in some cases; but this design would be more future proof and more readable. @subhramit |
||
|
|
||
| public ReferenceValidity or(ReferenceValidity other) { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add JavaDoc ( |
||
| if (this instanceof Real || other instanceof Fake) { | ||
| return this; | ||
| } | ||
| if (other instanceof Unsure otherUnsure && this instanceof Unsure thisUnsure) { | ||
| Unsure merge = new Unsure(); | ||
| merge.addAll(thisUnsure); | ||
| merge.addAll(otherUnsure); | ||
| return merge; | ||
| } | ||
| return other; | ||
| } | ||
|
|
||
| private ReferenceValidity lazyOr(ReferenceValiditySupplier other) throws FetcherException { | ||
| if (this instanceof Real) { | ||
| return this; | ||
| } else { | ||
| return or(other.get()); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| public static final class Real extends ReferenceValidity { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this can be a record? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sadly records already extend the Record class, so they cant inherit any further. I could refactor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch, keep this as a class itself |
||
| BibEntry matchingReference; | ||
|
|
||
| public Real(BibEntry matchingReference) { | ||
| this.matchingReference = matchingReference; | ||
| } | ||
|
|
||
| @Override | ||
| public boolean equals(Object o) { | ||
| if (this == o) { | ||
| return true; | ||
| } | ||
| if (o == null || getClass() != o.getClass()) { | ||
| return false; | ||
| } | ||
| Real real = (Real) o; | ||
| return Objects.equals(matchingReference, real.matchingReference); | ||
| } | ||
|
|
||
| @Override | ||
| public int hashCode() { | ||
| return Objects.hashCode(matchingReference); | ||
| } | ||
subhramit marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| public BibEntry getMatchingReference() { | ||
| return matchingReference; | ||
| } | ||
| } | ||
|
|
||
| public static final class Unsure extends ReferenceValidity { | ||
| Set<BibEntry> matchingReferences; | ||
|
|
||
| public Unsure(BibEntry matchingReference) { | ||
| this.matchingReferences = new HashSet<>(Set.of(matchingReference)); | ||
| } | ||
|
|
||
| private Unsure() { | ||
| this.matchingReferences = new HashSet<>(); | ||
| } | ||
|
|
||
| void addAll(Unsure other) { | ||
| this.matchingReferences.addAll(other.matchingReferences); | ||
| } | ||
|
|
||
| @Override | ||
| public boolean equals(Object o) { | ||
| if (this == o) { | ||
| return true; | ||
| } | ||
| if (o == null || getClass() != o.getClass()) { | ||
| return false; | ||
| } | ||
| Unsure unsure = (Unsure) o; | ||
| return Objects.equals(matchingReferences, unsure.matchingReferences); | ||
| } | ||
|
|
||
| @Override | ||
| public int hashCode() { | ||
| return Objects.hashCode(matchingReferences); | ||
| } | ||
|
|
||
| public Set<BibEntry> getMatchingReferences() { | ||
| return matchingReferences; | ||
| } | ||
| } | ||
|
|
||
| public static final class Fake extends ReferenceValidity { | ||
| public boolean equals(Object o) { | ||
| return o.getClass() == Fake.class; | ||
|
Comment on lines
+249
to
+251
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (Note: Don't take this suggestion right away, let another maintainer confirm) public static final Fake INSTANCE = new Fake();
private Fake() {}
@Override
public boolean equals(Object o) {
return o instanceof Fake;
}and for the hashcode return an arbitrary constant. @calixtus do you have a better idea? Something feels odd about the design. |
||
| } | ||
|
|
||
| public int hashCode() { | ||
| return Objects.hashCode(Fake.class); | ||
| } | ||
|
Comment on lines
+254
to
+256
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No neeed for an implementation, JDK's default works well in most cases Override is only necessary if equals is overridden: https://thefinestartist.com/effective-java/09 |
||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,6 +7,7 @@ | |
| import org.jabref.model.entry.BibEntryTypesManager; | ||
| import org.jabref.model.entry.field.Field; | ||
| import org.jabref.model.entry.field.StandardField; | ||
| import org.jabref.model.entry.field.UnknownField; | ||
| import org.jabref.model.entry.types.StandardEntryType; | ||
|
|
||
| import org.junit.jupiter.api.BeforeEach; | ||
|
|
@@ -611,4 +612,50 @@ void differentInCollectionWithTheSameISBNAreNotDuplicates() { | |
|
|
||
| assertFalse(duplicateChecker.isDuplicate(entryOne, entryTwo, BibDatabaseMode.BIBTEX)); | ||
| } | ||
|
|
||
| @Test | ||
| void degreeOfSimilarityOfSameEntryIsOne() { | ||
| assertEquals(1.0, duplicateChecker.degreeOfSimilarity(getSimpleArticle(), getSimpleArticle())); | ||
| assertEquals(1.0, duplicateChecker.degreeOfSimilarity(getSimpleInCollection(), getSimpleInCollection())); | ||
| } | ||
|
Comment on lines
+616
to
+620
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please only a single assertion per method - see https://devdocs.jabref.org/code-howtos/testing.html I think, you can rewrite using I think, this can also be merged |
||
|
|
||
| @Test | ||
| void differentEntriesHaveSmallDegreeOfSimilarity() { | ||
| assertTrue(0.3 > | ||
| duplicateChecker.degreeOfSimilarity( | ||
| new BibEntry(StandardEntryType.Article) | ||
| .withField(StandardField.TITLE, "Some Article"), | ||
| new BibEntry(StandardEntryType.InCollection) | ||
| .withField(StandardField.TITLE, "Other Collection") | ||
| ) | ||
| ); | ||
| } | ||
|
|
||
| @Test | ||
| void entriesWithNoMatchingFieldHaveNoSimilarity() { | ||
| assertEquals(0.0, duplicateChecker.degreeOfSimilarity( | ||
| new BibEntry(StandardEntryType.Article) | ||
| .withField(StandardField.TITLE, "Some Article"), | ||
| new BibEntry(StandardEntryType.Article) | ||
| .withField(StandardField.AUTHOR, "Some Author") | ||
| )); | ||
| } | ||
|
|
||
| @Test | ||
| void moreFieldsDoesNotAffectTheSimilarity() { | ||
| assertEquals(1.0, duplicateChecker.degreeOfSimilarity( | ||
| getSimpleArticle(), | ||
| getSimpleArticle().withField(new UnknownField("secret"), "Something") | ||
| )); | ||
| } | ||
|
|
||
| @Test | ||
| void similarEntriesHaveAHighDegreeOfSimilarity() { | ||
| double similarity = duplicateChecker.degreeOfSimilarity( | ||
| getSimpleArticle().withField(StandardField.YEAR, "2018"), | ||
| getSimpleArticle() | ||
| ); | ||
| assertTrue(0.8 < similarity); | ||
| assertTrue(1.0 > similarity); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
0.8 should be constant in
DuplicateCheck/ linked from there.