Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
50c3ca9
add refchecker class which does the core functionality of refchecking
ravatex Oct 26, 2025
eeec045
add made some methods public that are useful
ravatex Oct 26, 2025
735f985
removed unused Parser argument
ravatex Oct 26, 2025
be4d588
add tests for refchecker and modified refchecker to add equals methods
ravatex Oct 26, 2025
9939899
added validation for entry lists and parsing from input
ravatex Oct 26, 2025
037d243
added test for list of entries
ravatex Oct 27, 2025
9725af0
removed parse and validate, there is no reason to have that in this c…
ravatex Oct 27, 2025
43dc552
added documentation
ravatex Oct 27, 2025
b9027da
added degree of similarity to see if entries have similar fields
ravatex Oct 31, 2025
ead14fd
added tests for degree of similarity
ravatex Oct 31, 2025
e442b24
changed refchecker to work with degree of similarity instead of dupli…
ravatex Oct 31, 2025
151aaa7
added tests for updated refchecker with unsure
ravatex Oct 31, 2025
c8bfb21
improved degree of similarity docs
ravatex Oct 31, 2025
6d783e9
fixed checkstyle
ravatex Oct 31, 2025
f77341a
fix rewrite run
ravatex Oct 31, 2025
a33eb13
refactored duplicateCheck#degreeOfSimilarity to use the StringSimilar…
ravatex Nov 1, 2025
6c1aa0e
fix small changes for code quality
ravatex Nov 1, 2025
80b8058
changed the or functionality to return a new Unsure object instead of…
ravatex Nov 2, 2025
6f0e9c7
changed var to qualified names and removed a print statements
ravatex Nov 2, 2025
b98b78a
fixed degree of similarity documentation
ravatex Nov 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java
Original file line number Diff line number Diff line change
Expand Up @@ -364,4 +364,32 @@ public Optional<BibEntry> containsDuplicate(final BibDatabase database,

return database.getEntries().stream().filter(other -> isDuplicate(entry, other, bibDatabaseMode)).findFirst();
}

/**
* Checks across all fields of the entries,
* any matching ones get compared.
* If they are not the same the score goes down.
* The score goes down depending on the StringSimilarity score.
* <p>
* If the result is zero, it means that either no common fields were found
* or that all common fields were very far apart lexically.
* <p>
* If the result is one, it means that there was at least one common field
* and all the common fields were the same.
* <p>
* Similar entries have a score of above 0.8
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

0.8 should be constant in DuplicateCheck / linked from there.

*
* @param one The first entry
* @param two The second entry
* @return number [0,1] 1 representing the same (one potentially having more fields), 0 representing completely different
*/
public double degreeOfSimilarity(final BibEntry one, final BibEntry two) {
StringSimilarity stringSimilarity = new StringSimilarity();
return one.getFields((field) -> two.getField(field).isPresent())
.stream().mapToDouble((field) -> {
String first = one.getField(field).get();
String second = two.getField(field).get();
return stringSimilarity.similarity(first, second);
}).average().orElse(0.0);
}
}
258 changes: 258 additions & 0 deletions jablib/src/main/java/org/jabref/logic/integrity/RefChecker.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
package org.jabref.logic.integrity;

import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;

import org.jabref.logic.database.DuplicateCheck;
import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.IdBasedFetcher;
import org.jabref.logic.importer.fetcher.ArXivFetcher;
import org.jabref.logic.importer.fetcher.CrossRef;
import org.jabref.logic.importer.fetcher.DoiFetcher;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.BibEntryTypesManager;
import org.jabref.model.entry.identifier.ArXivIdentifier;
import org.jabref.model.entry.identifier.DOI;

import com.airhacks.afterburner.injection.Injector;

/**
* Validates a BibEntry depending on if it
* is consistent with the fetched Entry
*/
public class RefChecker {
DoiFetcher doiFetcher;
ArXivFetcher arxivFetcher;
CrossRef crossRef;
DuplicateCheck duplicateCheck;
Comment on lines +29 to +32
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These should be private?


public RefChecker(DoiFetcher doiFetcher, ArXivFetcher arXivFetcher) {
this(doiFetcher, arXivFetcher, new CrossRef(), new DuplicateCheck(Injector.instantiateModelOrService(BibEntryTypesManager.class)));
}

public RefChecker(
DoiFetcher doiFetcher,
ArXivFetcher arXivFetcher,
CrossRef crossRef,
DuplicateCheck duplicateCheck) {
this.doiFetcher = doiFetcher;
this.arxivFetcher = arXivFetcher;
this.crossRef = crossRef;
this.duplicateCheck = duplicateCheck;
}

/**
* Tries to find the best reference validity
* among current ways. If any of the methods signal
* that it is real, it returns early.
* <p>
* DoiFetcher -> CrossRef -> ArxivFetcher
*
* @param entry entry checking
* @return the reference validity
* @throws FetcherException any error from fetchers
*/
Comment on lines +49 to +59
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Try to use Markdown as JavaDoc format. Otherwise you need to escape >.

public ReferenceValidity referenceValidityOfEntry(BibEntry entry) throws FetcherException {
return validityFromDoiFetcher(entry).lazyOr(() ->
validityFromCrossRef(entry)
).lazyOr(() -> validityFromArxiv(entry));
}

private ReferenceValidity validityFromFetcher(BibEntry entry, IdBasedFetcher fetcher) throws FetcherException {
Optional<DOI> doi = entry.getDOI();
if (doi.isEmpty()) {
return new Fake();
}

Optional<BibEntry> other = fetcher.performSearchById(doi.get().asString());
return other.map(foundEntry -> compareReferences(entry, foundEntry))
.orElse(new Fake());
}

/**
* Tests validity only from the DoiFetcher.
*
* @param entry the entry
* @return the reference validity
* @throws FetcherException the fetcher exception
*/
public ReferenceValidity validityFromDoiFetcher(BibEntry entry) throws FetcherException {
return validityFromFetcher(entry, doiFetcher);
}

/**
* Validity only from the CrossRef and later from the DoiFetcher.
*
* @param entry the entry
* @return the reference validity
* @throws FetcherException the fetcher exception
*/
public ReferenceValidity validityFromCrossRef(BibEntry entry) throws FetcherException {
Optional<DOI> doiFound = crossRef.findIdentifier(entry);

if (doiFound.isEmpty()) {
return new Fake();
} else {
DOI doi = doiFound.get();
return doiFetcher.performSearchById(doi.asString()).map(
(found) -> compareReferences(entry, found)
).orElse(new Fake());
}
}

/**
* Validity only from the arxivFetcher.
*
* @param entry the entry
* @return the reference validity
* @throws FetcherException the fetcher exception
*/
public ReferenceValidity validityFromArxiv(BibEntry entry) throws FetcherException {

Optional<ArXivIdentifier> foundIdentifier = arxivFetcher.findIdentifier(entry);
if (foundIdentifier.isEmpty()) {
return new Fake();
}
return arxivFetcher.performSearchById(foundIdentifier.get().asString()).map(
foundEntry -> compareReferences(entry, foundEntry)
).orElse(new Fake());
}

/**
* Takes a list for entries and returns the mapping of them with their corresponding
* reference validity.
*
* @param entries the entries
* @return the map
* @throws FetcherException the fetcher exception
*/
public Map<BibEntry, ReferenceValidity> validateListOfEntries(List<BibEntry> entries) throws FetcherException {

Map<BibEntry, ReferenceValidity> entriesToValidity = new HashMap<>();
for (BibEntry entry : entries) {
entriesToValidity.put(entry, referenceValidityOfEntry(entry));
}
return entriesToValidity;
}

private ReferenceValidity compareReferences(BibEntry localEntry, BibEntry validFoundEntry) {
double similarity = duplicateCheck.degreeOfSimilarity(localEntry, validFoundEntry);
if (similarity >= 0.999) {
return new Real(validFoundEntry);
} else if (similarity > 0.8) {
return new Unsure(validFoundEntry);
} else {
return new Fake();
}
}

@FunctionalInterface
private interface ReferenceValiditySupplier {
ReferenceValidity get() throws FetcherException;
}

public static abstract sealed class ReferenceValidity permits Real, Unsure, Fake {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe

record(Validity validtiy, @Nullable BibTeX otherEntry)

And Valildity being an enum - and otherEntry allowed for null in the fake entry.

I am not sure if there could be a group of other entries in some cases; but this design would be more future proof and more readable. @subhramit


public ReferenceValidity or(ReferenceValidity other) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add JavaDoc (///) for this.

if (this instanceof Real || other instanceof Fake) {
return this;
}
if (other instanceof Unsure otherUnsure && this instanceof Unsure thisUnsure) {
Unsure merge = new Unsure();
merge.addAll(thisUnsure);
merge.addAll(otherUnsure);
return merge;
}
return other;
}

private ReferenceValidity lazyOr(ReferenceValiditySupplier other) throws FetcherException {
if (this instanceof Real) {
return this;
} else {
return or(other.get());
}
}
}

public static final class Real extends ReferenceValidity {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this can be a record?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sadly records already extend the Record class, so they cant inherit any further. I could refactor ReferenceValidity into an interface, but that would lose the sealed class capability.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch, keep this as a class itself

BibEntry matchingReference;

public Real(BibEntry matchingReference) {
this.matchingReference = matchingReference;
}

@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
Real real = (Real) o;
return Objects.equals(matchingReference, real.matchingReference);
}

@Override
public int hashCode() {
return Objects.hashCode(matchingReference);
}

public BibEntry getMatchingReference() {
return matchingReference;
}
}

public static final class Unsure extends ReferenceValidity {
Set<BibEntry> matchingReferences;

public Unsure(BibEntry matchingReference) {
this.matchingReferences = new HashSet<>(Set.of(matchingReference));
}

private Unsure() {
this.matchingReferences = new HashSet<>();
}

void addAll(Unsure other) {
this.matchingReferences.addAll(other.matchingReferences);
}

@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
Unsure unsure = (Unsure) o;
return Objects.equals(matchingReferences, unsure.matchingReferences);
}

@Override
public int hashCode() {
return Objects.hashCode(matchingReferences);
}

public Set<BibEntry> getMatchingReferences() {
return matchingReferences;
}
}

public static final class Fake extends ReferenceValidity {
public boolean equals(Object o) {
return o.getClass() == Fake.class;
Comment on lines +249 to +251
Copy link
Member

@subhramit subhramit Nov 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Note: Don't take this suggestion right away, let another maintainer confirm)
Creating a new Fake every time seems a bit odd as unlike Real, it is a stateless marker. This should be a singleton.
Maybe do something like

public static final Fake INSTANCE = new Fake();
    
private Fake() {}
    
@Override
public boolean equals(Object o) {
    return o instanceof Fake;
}

and for the hashcode return an arbitrary constant.
This way, you can just do something like return Fake.INSTANCE instead of return new Fake().

@calixtus do you have a better idea? Something feels odd about the design.

}

public int hashCode() {
return Objects.hashCode(Fake.class);
}
Comment on lines +254 to +256
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No neeed for an implementation, JDK's default works well in most cases

Override is only necessary if equals is overridden: https://thefinestartist.com/effective-java/09

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import org.jabref.model.entry.BibEntryTypesManager;
import org.jabref.model.entry.field.Field;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.entry.field.UnknownField;
import org.jabref.model.entry.types.StandardEntryType;

import org.junit.jupiter.api.BeforeEach;
Expand Down Expand Up @@ -611,4 +612,50 @@ void differentInCollectionWithTheSameISBNAreNotDuplicates() {

assertFalse(duplicateChecker.isDuplicate(entryOne, entryTwo, BibDatabaseMode.BIBTEX));
}

@Test
void degreeOfSimilarityOfSameEntryIsOne() {
assertEquals(1.0, duplicateChecker.degreeOfSimilarity(getSimpleArticle(), getSimpleArticle()));
assertEquals(1.0, duplicateChecker.degreeOfSimilarity(getSimpleInCollection(), getSimpleInCollection()));
}
Comment on lines +616 to +620
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please only a single assertion per method - see https://devdocs.jabref.org/code-howtos/testing.html

I think, you can rewrite using @ParamterizedTest.

I think, this can also be merged entriesWithNoMatchingFieldHaveNoSimilarity and more.


@Test
void differentEntriesHaveSmallDegreeOfSimilarity() {
assertTrue(0.3 >
duplicateChecker.degreeOfSimilarity(
new BibEntry(StandardEntryType.Article)
.withField(StandardField.TITLE, "Some Article"),
new BibEntry(StandardEntryType.InCollection)
.withField(StandardField.TITLE, "Other Collection")
)
);
}

@Test
void entriesWithNoMatchingFieldHaveNoSimilarity() {
assertEquals(0.0, duplicateChecker.degreeOfSimilarity(
new BibEntry(StandardEntryType.Article)
.withField(StandardField.TITLE, "Some Article"),
new BibEntry(StandardEntryType.Article)
.withField(StandardField.AUTHOR, "Some Author")
));
}

@Test
void moreFieldsDoesNotAffectTheSimilarity() {
assertEquals(1.0, duplicateChecker.degreeOfSimilarity(
getSimpleArticle(),
getSimpleArticle().withField(new UnknownField("secret"), "Something")
));
}

@Test
void similarEntriesHaveAHighDegreeOfSimilarity() {
double similarity = duplicateChecker.degreeOfSimilarity(
getSimpleArticle().withField(StandardField.YEAR, "2018"),
getSimpleArticle()
);
assertTrue(0.8 < similarity);
assertTrue(1.0 > similarity);
}
}
Loading
Loading