JabRef · alexvsf0 · Nov 5, 2025 · Nov 5, 2025
diff --git a/jablib/src/main/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporter.java b/jablib/src/main/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporter.java
@@ -7,6 +7,7 @@
 import java.util.List;
 import java.util.Optional;
 import java.util.Set;
+import java.util.regex.Pattern;
 import java.util.stream.Stream;
 
 import org.jabref.logic.FilePreferences;
@@ -161,17 +162,106 @@ private void fetchData(BibEntry candidate, StandardField field, IdBasedFetcher f
     }
 
     private static BibEntry mergeCandidates(Stream<BibEntry> candidates) {
+        // Materialize the stream first: the candidates are traversed twice below (field merge, then title selection)
+        List<BibEntry> allCandidates = candidates.toList();
+
+        // Fold every candidate into one fresh entry via BibEntry#mergeWith
         final BibEntry entry = new BibEntry();
-        candidates.forEach(entry::mergeWith);
+        allCandidates.forEach(entry::mergeWith);
 
-        // Retain online links only
+        // Keep only the linked files that are online links; all other linked files are dropped
         List<LinkedFile> onlineLinks = entry.getFiles().stream().filter(LinkedFile::isOnlineLink).toList();
         entry.clearField(StandardField.FILE);
         entry.addFiles(onlineLinks);
 
+        // Title selection: the merged title may be replaced by a candidate title that isBetterTitle prefers
+        // Step 1: read the title that resulted from the merge (may be absent)
+        Optional<String> currentTitle = entry.getField(StandardField.TITLE);
+
+        // Step 2: take the first candidate title (in candidate order) that isBetterTitle rates above the merged one
+        Optional<String> betterTitle = allCandidates.stream()
+                                                    .map(e -> e.getField(StandardField.TITLE))
+                                                    .flatMap(Optional::stream)
+                                                    .filter(t -> isBetterTitle(t, currentTitle.orElse("")))
+                                                    .findFirst();
+
+        // Step 3: overwrite the merged title if such a candidate exists, and log the replacement
+        betterTitle.ifPresent(title -> {
+            entry.setField(StandardField.TITLE, title);
+            LOGGER.debug("Replaced title with better one: {}", title);
+        });
+
         return entry;
     }
 
+    /**
+     * Heuristically decides whether {@code newTitle} is a more plausible paper title than {@code oldTitle}; a {@code null} or blank new title is never better, a {@code null} old title counts as empty.
+     * A candidate is rejected outright if it looks like tool/template boilerplate, a file name or path, has fewer than three words, or lacks mixed case; otherwise it needs two of three points (notably longer, contains ':', '-' or ',', starts upper-case).
+     */
+    public static boolean isBetterTitle(String newTitle, String oldTitle) {
+        if (newTitle == null || newTitle.isBlank()) {
+            return false;
+        }
+
+        newTitle = newTitle.trim();
+        oldTitle = oldTitle == null ? "" : oldTitle.trim();
+
+        String lower = newTitle.toLowerCase(java.util.Locale.ROOT);
+
+        // 1. Exclude boilerplate titles left behind by authoring tools or templates, and bare file names
+        if (lower.matches(".*(microsoft word|adobe acrobat|document|untitled|journal template).*")) {
+            return false;
+        }
+        if (lower.matches(".*\\.(pdf|docx?|tex|rtf|zip|txt)$")) {
+            return false;
+        }
+
+        // 2. Exclude titles with fewer than three whitespace-separated words
+        if (newTitle.split("\\s+").length < 3) {
+            return false;
+        }
+
+        // 3. Exclude titles that look like a file path (contain '/' or '\')
+        if (newTitle.contains(":\\") || newTitle.contains("/") || newTitle.contains("\\")) {
+            return false;
+        }
+
+        // 4. Require mixed case with at least one capitalized word
+        boolean hasCapitalizedWords = Pattern.compile("\\b[A-Z][a-z]+").matcher(newTitle).find();
+        boolean allUppercase = newTitle.equals(newTitle.toUpperCase(java.util.Locale.ROOT));
+        boolean allLowercase = newTitle.equals(lower);
+
+        if (!hasCapitalizedWords || allUppercase || allLowercase) {
+            // No capitalized word, or the whole title is upper case / lower case
+            return false;
+        }
+
+        // 5. Count punctuation that is typical for titles (':', '-', ',')
+        int punctuationScore = 0;
+        for (char c : newTitle.toCharArray()) {
+            if (c == ':' || c == '-' || c == ',') {
+                punctuationScore++;
+            }
+        }
+
+        // 6. Notably longer than the old title (by more than 5 characters), but shorter than 300 characters
+        boolean longer = newTitle.length() > oldTitle.length() + 5 && newTitle.length() < 300;
+
+        // 7. Accept when at least two of the three criteria hold
+        int score = 0;
+        if (longer) {
+            score++;
+        }
+        if (punctuationScore > 0) {
+            score++;
+        }
+        if (Character.isUpperCase(newTitle.charAt(0))) {
+            score++;
+        }
+
+        return score >= 2;
+    }
+
     /**
      * Imports the BibTeX data from the given PDF file and relativized the paths of each linked file based on the context and the file preferences.
      */

diff --git a/jablib/src/main/java/org/jabref/logic/importer/fileformat/pdf/PdfContentImporter.java b/jablib/src/main/java/org/jabref/logic/importer/fileformat/pdf/PdfContentImporter.java
@@ -15,6 +15,7 @@
 
 import org.jabref.logic.importer.fileformat.BibliographyFromPdfImporter;
 import org.jabref.logic.importer.fileformat.PdfMergeMetadataImporter;
+import org.jabref.logic.importer.util.AuthorHeuristics;
 import org.jabref.logic.l10n.Localization;
 import org.jabref.logic.os.OS;
 import org.jabref.logic.util.PdfUtils;
@@ -382,30 +383,55 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
         arXivId = getArXivId(arXivId);
         // start: title
         fillCurStringWithNonEmptyLines();
-        title = streamlineTitle(curString);
-        // i points to the next non-empty line
+        String contentTitle = streamlineTitle(curString);
         curString = "";
+
+        String finalTitle = contentTitle;
+
         if (titleByFontSize.isPresent() && !isNullOrEmpty(titleByFontSize.get())) {
-            title = titleByFontSize.get();
-        }
+            String fontSizeTitle = titleByFontSize.get();
 
-        // after title: authors
-        author = null;
-        while ((lineIndex < lines.length) && !"".equals(lines[lineIndex])) {
-            // author names are unlikely to be lines among different lines
-            // treat them line by line
-            curString = streamlineNames(lines[lineIndex]);
-            if (author == null) {
-                author = curString;
+            // Better heuristics
+            if (PdfMergeMetadataImporter.isBetterTitle(contentTitle, fontSizeTitle)) {
+                finalTitle = contentTitle;
             } else {
-                if (!"".equals(curString)) {
-                    author = author.concat(" and ").concat(curString);
-                }  // if lines[i] is "and" then "" is returned by streamlineNames -> do nothing
+                finalTitle = fontSizeTitle;
+            }
+        }
+
+        title = finalTitle;
+        // Start the analysis after the title block
+        StringBuilder collectedAuthors = new StringBuilder();
+
+        while (lineIndex < lines.length) {
+            String line = lines[lineIndex].trim();
+            String lower = line.toLowerCase(Locale.ROOT);
+
+            // Stop if we reach Abstract / Introduction
+            if (lower.contains("abstract") || lower.startsWith("i.")) {
+                break;
             }
+
+            if (AuthorHeuristics.looksLikeAuthors(line)) {
+                // Check if the line isn't part of the title
+                if (!title.toLowerCase(Locale.ROOT).contains(line.toLowerCase(Locale.ROOT))) {
+                    if (!collectedAuthors.isEmpty()) {
+                        collectedAuthors.append(" and ");
+                    }
+                    collectedAuthors.append(line);
+                }
+            }
+
             lineIndex++;
         }
-        curString = "";
-        lineIndex++;
+
+        // Clean the names, and add "and"
+        author = AuthorHeuristics.cleanAuthors(collectedAuthors.toString().trim());
+
+        // Fallback if nothing's found
+        if (author.isBlank()) {
+            author = "Unknown";
+        }
 
         // then, abstract and keywords follow
         while (lineIndex < lines.length) {
@@ -528,9 +554,8 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
 
         // TODO: institution parsing missing
 
-        if (author != null) {
-            entry.setField(StandardField.AUTHOR, author);
-        }
+        entry.setField(StandardField.AUTHOR, author);
+
         if (editor != null) {
             entry.setField(StandardField.EDITOR, editor);
         }
@@ -703,3 +728,4 @@ public String getDescription() {
         return Localization.lang("This importer parses data of the first page of the PDF and creates a BibTeX entry. Currently, Springer and IEEE formats are supported.");
     }
 }
+
diff --git a/jablib/src/main/java/org/jabref/logic/importer/util/AuthorHeuristics.java b/jablib/src/main/java/org/jabref/logic/importer/util/AuthorHeuristics.java
@@ -0,0 +1,94 @@
+package org.jabref.logic.importer.util;
+
+import java.util.regex.Pattern;
+
+public class AuthorHeuristics {
+    // Lower-case substrings whose presence marks a line as an affiliation rather than an author list
+    private static final String[] AFFILIATION_HINTS = {
+            "university", "institute", "department", "school",
+            "college", "laboratory", "lab", "company", "corporation",
+            "center", "centre", "faculty"
+    };
+
+    /**
+     * Heuristically decides whether a line looks like an author list; {@code null}, empty, section-heading, affiliation and e-mail/URL/DOI lines are rejected.
+     * Otherwise a line of fewer than 25 tokens is accepted if it has a "," or " and " separator, an initial such as "J. Smith", or enough capitalized words.
+     */
+    public static boolean looksLikeAuthors(String line) {
+        if (line == null) {
+            return false;
+        }
+
+        String trimmed = line.trim();
+        if (trimmed.isEmpty()) {
+            return false;
+        }
+
+        String lower = trimmed.toLowerCase(java.util.Locale.ROOT);
+
+        // Reject clear non-author lines
+        if (lower.contains("abstract") || lower.contains("keywords") || lower.contains("introduction")) {
+            return false;
+        }
+
+        for (String bad : AFFILIATION_HINTS) {
+            if (lower.contains(bad)) {
+                return false;
+            }
+        }
+
+        if (lower.contains("@") || lower.contains("http") || lower.contains("doi")) {
+            return false;
+        }
+
+        // Detect an initial followed by a name, e.g. "J. Smith" or "J.Smith"
+        boolean hasInitials = Pattern.compile("[A-Z]\\. ?[A-Z]?[a-zA-Z]+").matcher(trimmed).find();
+
+        // Count capitalized words; "and" separates only as a whole word, so names such as "Sandra" stay intact
+        String[] tokens = trimmed.split("\\s+|,|\\band\\b");
+        int capitalizedCount = 0;
+        for (String token : tokens) {
+            if (token.length() > 1 && Character.isUpperCase(token.charAt(0))) {
+                capitalizedCount++;
+            }
+        }
+
+        boolean capitalizedRatioOk = capitalizedCount >= Math.max(2, tokens.length / 3);
+        boolean hasSeparator = trimmed.contains(",") || lower.contains(" and ");
+        boolean notTooLong = tokens.length < 25;
+
+        return (hasSeparator || hasInitials || capitalizedRatioOk) && notTooLong;
+    }
+
+    /**
+     * Cleans detected author text (strips digits, e-mail addresses and parenthesized text, turns commas into " and "); returns "" for {@code null} or when nothing is left.
+     */
+    public static String cleanAuthors(String line) {
+        if (line == null) {
+            return "";
+        }
+
+        // Remove digits and superscripts
+        line = line.replaceAll("\\d+", "");
+        // Remove email addresses
+        line = line.replaceAll("\\S*@\\S*", "");
+        // Remove parentheses (affiliations)
+        line = line.replaceAll("\\([^)]*\\)", "");
+        // Replace commas with "and"
+        line = line.replaceAll("\\s*,\\s*", " and ");
+        // Normalize "and"
+        line = line.replaceAll("\\s+and\\s+", " and ");
+        // Collapse repeated "and" separators; the word boundary keeps names ending in "and" (e.g. "Roland") untouched
+        line = line.replaceAll("(\\band\\s+)+", "and ");
+        // Remove extra spaces
+        line = line.replaceAll("\\s{2,}", " ").trim();
+
+        // Normalize all-uppercase names; skip empty text, where charAt(0) would throw
+        if (!line.isEmpty() && line.equals(line.toUpperCase(java.util.Locale.ROOT))) {
+            line = line.toLowerCase(java.util.Locale.ROOT);
+            line = Character.toUpperCase(line.charAt(0)) + line.substring(1);
+        }
+
+        return line.trim();
+    }
+}