Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Stream;

import org.jabref.logic.FilePreferences;
Expand Down Expand Up @@ -161,17 +162,106 @@ private void fetchData(BibEntry candidate, StandardField field, IdBasedFetcher f
}

/**
 * Merges all candidate entries into a single entry via {@code BibEntry#mergeWith}, keeps only the
 * linked files that are online links, and finally replaces the merged title with the first candidate
 * title that {@link #isBetterTitle(String, String)} rates as better than the merged one.
 *
 * @param candidates the entries to merge; the stream is consumed exactly once
 * @return a new entry holding the merged data
 */
private static BibEntry mergeCandidates(Stream<BibEntry> candidates) {
    // A stream can only be traversed once; materialize it because the candidates are needed twice
    // (for merging and for the title selection below).
    List<BibEntry> allCandidates = candidates.toList();

    // Merge all fields from candidates into one entry
    final BibEntry entry = new BibEntry();
    allCandidates.forEach(entry::mergeWith);

    // Retain only online links
    List<LinkedFile> onlineLinks = entry.getFiles().stream().filter(LinkedFile::isOnlineLink).toList();
    entry.clearField(StandardField.FILE);
    entry.addFiles(onlineLinks);

    // Compare every candidate title against the title that resulted from the merge
    String mergedTitle = entry.getField(StandardField.TITLE).orElse("");

    allCandidates.stream()
                 .map(candidate -> candidate.getField(StandardField.TITLE))
                 .flatMap(Optional::stream)
                 .filter(title -> isBetterTitle(title, mergedTitle))
                 .findFirst()
                 .ifPresent(title -> {
                     entry.setField(StandardField.TITLE, title);
                     LOGGER.debug("Replaced title with better one: {}", title);
                 });

    return entry;
}

/** Matches titles that authoring tools or templates write into the metadata (case-insensitive). */
private static final Pattern GENERIC_TITLE_PATTERN = Pattern.compile(
        "microsoft word|adobe acrobat|document|untitled|journal template",
        Pattern.CASE_INSENSITIVE);

/** Matches titles that end in a common file extension, i.e. that are really file names. */
private static final Pattern TITLE_FILE_EXTENSION_PATTERN = Pattern.compile(
        "\\.(pdf|docx?|tex|rtf|zip|txt)$",
        Pattern.CASE_INSENSITIVE);

/** Matches a word written with a leading capital followed by lower-case letters. */
private static final Pattern TITLE_CAPITALIZED_WORD_PATTERN = Pattern.compile("\\b[A-Z][a-z]+");

private static final Pattern TITLE_WHITESPACE_PATTERN = Pattern.compile("\\s+");

/**
 * Decides whether a new title looks "better" than the old one.
 * <p>
 * A candidate is rejected outright when it is blank, contains a generic tool/template phrase, ends in a
 * file extension, has fewer than three words, contains a path separator, or has no capitalized word.
 * Otherwise one point each is given for: being more than five characters longer than the old title
 * (and shorter than 300 characters), containing {@code ':'}, {@code '-'} or {@code ','}, and starting
 * with an upper-case character. Two points are required.
 * <p>
 * All checks are independent of the JVM default locale.
 *
 * @param newTitle the candidate title; may be {@code null}
 * @param oldTitle the title to compare against; {@code null} is treated as empty
 * @return {@code true} if the candidate passes all exclusion rules and scores at least two points
 */
public static boolean isBetterTitle(String newTitle, String oldTitle) {
    if (newTitle == null || newTitle.isBlank()) {
        return false;
    }

    String candidate = newTitle.trim();
    String current = oldTitle == null ? "" : oldTitle.trim();

    // 1. Exclude generic tool/template titles and file names.
    //    find() is used instead of String#matches so that a line break inside the title cannot hide a hit.
    if (GENERIC_TITLE_PATTERN.matcher(candidate).find()
            || TITLE_FILE_EXTENSION_PATTERN.matcher(candidate).find()) {
        return false;
    }

    // 2. Exclude titles with fewer than three words
    if (TITLE_WHITESPACE_PATTERN.split(candidate).length < 3) {
        return false;
    }

    // 3. Exclude titles that could be a file path
    if (candidate.indexOf('/') >= 0 || candidate.indexOf('\\') >= 0) {
        return false;
    }

    // 4. Require title-like capitalization. A match contains both an upper-case and a lower-case
    //    letter, so all-upper-case and all-lower-case titles are rejected by this check as well.
    if (!TITLE_CAPITALIZED_WORD_PATTERN.matcher(candidate).find()) {
        return false;
    }

    // 5. Score the remaining candidates
    boolean hasPunctuation = candidate.chars().anyMatch(c -> c == ':' || c == '-' || c == ',');
    boolean longer = candidate.length() > current.length() + 5 && candidate.length() < 300;
    boolean startsUpperCase = Character.isUpperCase(candidate.charAt(0));

    int score = 0;
    if (longer) {
        score++;
    }
    if (hasPunctuation) {
        score++;
    }
    if (startsUpperCase) {
        score++;
    }

    return score >= 2;
}

/**
* Imports the BibTeX data from the given PDF file and relativized the paths of each linked file based on the context and the file preferences.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import org.jabref.logic.importer.fileformat.BibliographyFromPdfImporter;
import org.jabref.logic.importer.fileformat.PdfMergeMetadataImporter;
import org.jabref.logic.importer.util.AuthorHeuristics;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.os.OS;
import org.jabref.logic.util.PdfUtils;
Expand Down Expand Up @@ -382,30 +383,55 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
arXivId = getArXivId(arXivId);
// start: title
fillCurStringWithNonEmptyLines();
title = streamlineTitle(curString);
// i points to the next non-empty line
String contentTitle = streamlineTitle(curString);
curString = "";

String finalTitle = contentTitle;

if (titleByFontSize.isPresent() && !isNullOrEmpty(titleByFontSize.get())) {
title = titleByFontSize.get();
}
String fontSizeTitle = titleByFontSize.get();

// after title: authors
author = null;
while ((lineIndex < lines.length) && !"".equals(lines[lineIndex])) {
// author names are unlikely to be lines among different lines
// treat them line by line
curString = streamlineNames(lines[lineIndex]);
if (author == null) {
author = curString;
// Better heuristics
if (PdfMergeMetadataImporter.isBetterTitle(contentTitle, fontSizeTitle)) {
finalTitle = contentTitle;
} else {
if (!"".equals(curString)) {
author = author.concat(" and ").concat(curString);
} // if lines[i] is "and" then "" is returned by streamlineNames -> do nothing
finalTitle = fontSizeTitle;
}
}

title = finalTitle;
// Start the analysis after the title block
StringBuilder collectedAuthors = new StringBuilder();

while (lineIndex < lines.length) {
String line = lines[lineIndex].trim();
String lower = line.toLowerCase(Locale.ROOT);

// Stop if we reach Abstract / Introduction
if (lower.contains("abstract") || lower.startsWith("i.")) {
break;
}

if (AuthorHeuristics.looksLikeAuthors(line)) {
// Check if the line isn't part of the title
if (!title.toLowerCase(Locale.ROOT).contains(line.toLowerCase(Locale.ROOT))) {
if (!collectedAuthors.isEmpty()) {
collectedAuthors.append(" and ");
}
collectedAuthors.append(line);
}
}

lineIndex++;
}
curString = "";
lineIndex++;

// Clean the names, and add "and"
author = AuthorHeuristics.cleanAuthors(collectedAuthors.toString().trim());

// Fallback if nothing's found
if (author.isBlank()) {
author = "Unknown";
}

// then, abstract and keywords follow
while (lineIndex < lines.length) {
Expand Down Expand Up @@ -528,9 +554,8 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS

// TODO: institution parsing missing

if (author != null) {
entry.setField(StandardField.AUTHOR, author);
}
entry.setField(StandardField.AUTHOR, author);

if (editor != null) {
entry.setField(StandardField.EDITOR, editor);
}
Expand Down Expand Up @@ -703,3 +728,4 @@ public String getDescription() {
return Localization.lang("This importer parses data of the first page of the PDF and creates a BibTeX entry. Currently, Springer and IEEE formats are supported.");
}
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
package org.jabref.logic.importer.util;

import java.util.regex.Pattern;

/**
 * Heuristics for recognizing lines of text that list author names and for tidying such lines.
 * <p>
 * All checks are independent of the JVM default locale.
 */
public class AuthorHeuristics {

    private static final String[] AFFILIATION_HINTS = {
            "university", "institute", "department", "school",
            "college", "laboratory", "lab", "company", "corporation",
            "center", "centre", "faculty"
    };

    /** Lines with this many name tokens or more are considered too long to be an author list. */
    private static final int MAX_AUTHOR_LINE_TOKENS = 25;

    private static final Pattern SECTION_HEADING_PATTERN =
            Pattern.compile("abstract|keywords|introduction", Pattern.CASE_INSENSITIVE);
    private static final Pattern AFFILIATION_PATTERN =
            Pattern.compile(String.join("|", AFFILIATION_HINTS), Pattern.CASE_INSENSITIVE);
    private static final Pattern CONTACT_PATTERN =
            Pattern.compile("@|http|doi", Pattern.CASE_INSENSITIVE);

    private static final Pattern INITIALS_PATTERN = Pattern.compile("[A-Z]\\. ?[A-Z]?[a-zA-Z]+");
    private static final Pattern TOKEN_SEPARATOR_PATTERN = Pattern.compile("[\\s,]+");
    private static final Pattern AND_WORD_PATTERN = Pattern.compile("\\band\\b", Pattern.CASE_INSENSITIVE);

    private static final Pattern DIGITS_PATTERN = Pattern.compile("\\d+");
    private static final Pattern EMAIL_PATTERN = Pattern.compile("\\S*@\\S*");
    private static final Pattern PARENTHESIZED_PATTERN = Pattern.compile("\\([^)]*\\)");
    private static final Pattern COMMA_PATTERN = Pattern.compile("\\s*,\\s*");
    private static final Pattern REPEATED_AND_PATTERN =
            Pattern.compile("(?:\\s+and)+\\s+", Pattern.CASE_INSENSITIVE);
    private static final Pattern DANGLING_AND_PATTERN =
            Pattern.compile("^(?:and\\b\\s*)+|(?:\\s*\\band)+$", Pattern.CASE_INSENSITIVE);
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

    private static final String AUTHOR_SEPARATOR = " and ";

    /**
     * Heuristic detection of author lines.
     * <p>
     * A line is rejected when it is {@code null} or blank, or when it contains a section heading word,
     * an affiliation hint, or contact data ({@code @}, {@code http}, {@code doi}). Otherwise it is accepted
     * when it has fewer than {@value #MAX_AUTHOR_LINE_TOKENS} name tokens and at least one of these holds:
     * it contains a comma or the word "and", it contains an initial such as {@code "J. Smith"}, or enough
     * of its tokens start with an upper-case letter.
     *
     * @param line the line to inspect; may be {@code null}
     * @return {@code true} if the line looks like a list of author names
     */
    public static boolean looksLikeAuthors(String line) {
        if (line == null) {
            return false;
        }

        String trimmed = line.trim();
        if (trimmed.isEmpty()) {
            return false;
        }

        // Reject clear non-author lines
        if (SECTION_HEADING_PATTERN.matcher(trimmed).find()
                || AFFILIATION_PATTERN.matcher(trimmed).find()
                || CONTACT_PATTERN.matcher(trimmed).find()) {
            return false;
        }

        // Count name tokens. The word "and" is a separator, not a token; it must only be recognized
        // as a whole word, otherwise names such as "Sandra" or "Holland" would be torn apart.
        int tokenCount = 0;
        int capitalizedCount = 0;
        for (String token : TOKEN_SEPARATOR_PATTERN.split(trimmed)) {
            if (token.isEmpty() || "and".equalsIgnoreCase(token)) {
                continue;
            }
            tokenCount++;
            if (token.length() > 1 && Character.isUpperCase(token.charAt(0))) {
                capitalizedCount++;
            }
        }

        boolean hasInitials = INITIALS_PATTERN.matcher(trimmed).find();
        boolean capitalizedRatioOk = capitalizedCount >= Math.max(2, tokenCount / 3);
        boolean hasSeparator = trimmed.indexOf(',') >= 0 || AND_WORD_PATTERN.matcher(trimmed).find();
        boolean notTooLong = tokenCount < MAX_AUTHOR_LINE_TOKENS;

        return (hasSeparator || hasInitials || capitalizedRatioOk) && notTooLong;
    }

    /**
     * Cleans detected author text: removes digits, e-mail addresses and parenthesized text, turns commas
     * into the separator {@code " and "}, collapses repeated or dangling separators and whitespace, and
     * converts names written entirely in upper case to capitalized words.
     *
     * @param line the raw author text; may be {@code null}
     * @return the cleaned author text, or an empty string if nothing remains
     */
    public static String cleanAuthors(String line) {
        if (line == null) {
            return "";
        }

        // Remove digits (e.g. affiliation markers), e-mail addresses and parenthesized affiliations
        String cleaned = DIGITS_PATTERN.matcher(line).replaceAll("");
        cleaned = EMAIL_PATTERN.matcher(cleaned).replaceAll("");
        cleaned = PARENTHESIZED_PATTERN.matcher(cleaned).replaceAll("");

        // Replace commas with "and", then collapse runs such as "and  and" into one separator
        cleaned = COMMA_PATTERN.matcher(cleaned).replaceAll(AUTHOR_SEPARATOR);
        cleaned = REPEATED_AND_PATTERN.matcher(cleaned).replaceAll(AUTHOR_SEPARATOR);
        cleaned = WHITESPACE_PATTERN.matcher(cleaned).replaceAll(" ").trim();

        // A leading or trailing comma leaves a separator without a name next to it
        cleaned = DANGLING_AND_PATTERN.matcher(cleaned).replaceAll("").trim();

        // Nothing left (e.g. the input was empty or consisted only of removed parts)
        if (cleaned.isEmpty()) {
            return "";
        }

        // Normalize names that are written entirely in upper case
        String[] authors = cleaned.split(AUTHOR_SEPARATOR);
        for (int i = 0; i < authors.length; i++) {
            if (!containsLowerCase(authors[i])) {
                authors[i] = capitalizeWords(authors[i]);
            }
        }

        return String.join(AUTHOR_SEPARATOR, authors);
    }

    /** Returns whether the text contains at least one lower-case character. */
    private static boolean containsLowerCase(String text) {
        return text.chars().anyMatch(Character::isLowerCase);
    }

    /** Upper-cases the first letter of every run of letters and lower-cases the rest of the run. */
    private static String capitalizeWords(String text) {
        StringBuilder result = new StringBuilder(text.length());
        boolean startOfWord = true;
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (Character.isLetter(c)) {
                result.append(startOfWord ? Character.toUpperCase(c) : Character.toLowerCase(c));
                startOfWord = false;
            } else {
                // Whitespace, hyphens, periods and apostrophes all start a new word
                result.append(c);
                startOfWord = true;
            }
        }
        return result.toString();
    }
}
Loading