diff --git a/jablib/src/main/java/org/jabref/logic/crawler/AutomaticDuplicateRemover.java b/jablib/src/main/java/org/jabref/logic/crawler/AutomaticDuplicateRemover.java
new file mode 100644
index 00000000000..ba3455e77d2
--- /dev/null
+++ b/jablib/src/main/java/org/jabref/logic/crawler/AutomaticDuplicateRemover.java
@@ -0,0 +1,71 @@
+package org.jabref.logic.crawler;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.IdentityHashMap;
+import java.util.List;
+import java.util.Set;
+
+import org.jabref.logic.database.DuplicateCheck;
+import org.jabref.model.database.BibDatabase;
+import org.jabref.model.database.BibDatabaseContext;
+import org.jabref.model.entry.BibEntry;
+import org.jabref.model.entry.BibEntryTypesManager;
+
+/**
+ * Finds duplicate entries in a database, merges each duplicate into the entry it duplicates, and removes it.
+ * Works without any user interaction.
+ */
+public class AutomaticDuplicateRemover {
+    private final BibEntryTypesManager bibEntryTypesManager;
+
+    /**
+     * @param bibEntryTypesManager passed on to the {@link DuplicateCheck} that decides whether two entries are duplicates
+     */
+    public AutomaticDuplicateRemover(BibEntryTypesManager bibEntryTypesManager) {
+        this.bibEntryTypesManager = bibEntryTypesManager;
+    }
+
+    /**
+     * Compares every pair of entries of the given database. When {@link DuplicateCheck} reports a pair as
+     * duplicates, {@code mergeWith} is called on the earlier entry with the later one, and the later entry is
+     * removed from the database once the scan has finished. The database is modified in place.
+     *
+     * @param databaseContext context whose database is deduplicated; its mode is passed to the duplicate check
+     */
+    public void removeDuplicates(BibDatabaseContext databaseContext) {
+        DuplicateCheck duplicateCheck = new DuplicateCheck(bibEntryTypesManager);
+        BibDatabase database = databaseContext.getDatabase();
+        List<BibEntry> entries = database.getEntries();
+        // Removal is deferred until the scan has finished so that the indices used below stay valid
+        List<BibEntry> entriesToRemove = new ArrayList<>();
+        // Tracked by identity, not by equals(): entries are mutated by mergeWith during the scan, and a
+        // not-yet-visited entry that merely equals an already merged one must still be checked
+        Set<BibEntry> handledEntries = Collections.newSetFromMap(new IdentityHashMap<>());
+
+        for (int i = 0; i < entries.size(); i++) {
+            BibEntry entry1 = entries.get(i);
+            if (handledEntries.contains(entry1)) {
+                continue;
+            }
+
+            for (int j = i + 1; j < entries.size(); j++) {
+                BibEntry entry2 = entries.get(j);
+                if (handledEntries.contains(entry2)) {
+                    continue;
+                }
+
+                if (duplicateCheck.isDuplicate(entry1, entry2, databaseContext.getMode())) {
+                    entry1.mergeWith(entry2);
+                    entriesToRemove.add(entry2);
+                    handledEntries.add(entry2);
+                }
+            }
+            handledEntries.add(entry1);
+        }
+
+        for (BibEntry entry : entriesToRemove) {
+            database.removeEntry(entry);
+        }
+    }
+}
diff --git a/jablib/src/main/java/org/jabref/logic/crawler/StudyRepository.java 
b/jablib/src/main/java/org/jabref/logic/crawler/StudyRepository.java index 6f6c4db5b10..59fcd3740e1 100644 --- a/jablib/src/main/java/org/jabref/logic/crawler/StudyRepository.java +++ b/jablib/src/main/java/org/jabref/logic/crawler/StudyRepository.java @@ -7,13 +7,17 @@ import java.nio.file.Path; import java.time.LocalDateTime; import java.time.temporal.ChronoUnit; +import java.util.ArrayList; +import java.util.HashSet; import java.util.List; +import java.util.Set; import java.util.regex.Pattern; import java.util.stream.Collectors; import org.jabref.logic.JabRefException; import org.jabref.logic.citationkeypattern.CitationKeyGenerator; import org.jabref.logic.database.DatabaseMerger; +import org.jabref.logic.database.DuplicateCheck; import org.jabref.logic.exporter.AtomicFileWriter; import org.jabref.logic.exporter.BibDatabaseWriter; import org.jabref.logic.exporter.BibWriter; @@ -419,6 +423,9 @@ private void persistResults(List crawlResults) throws IOException, // Merge new entries into study result file merger.merge(existingStudyResultEntries.getDatabase(), newStudyResultEntries); + LOGGER.info("Removing duplicates..."); + new AutomaticDuplicateRemover(bibEntryTypesManager).removeDuplicates(existingStudyResultEntries); + writeResultToFile(getPathToStudyResultFile(), existingStudyResultEntries); } @@ -463,4 +470,5 @@ private Path getPathToStudyResultFile() { private Path getPathToQueryDirectory(String query) { return repositoryPath.resolve(trimNameAndAddID(query)); } + }