diff --git a/jablib/src/main/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporter.java b/jablib/src/main/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporter.java index c2ad25a3a96..a58c80ba8d0 100644 --- a/jablib/src/main/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporter.java +++ b/jablib/src/main/java/org/jabref/logic/importer/fileformat/PdfMergeMetadataImporter.java @@ -7,6 +7,7 @@ import java.util.List; import java.util.Optional; import java.util.Set; +import java.util.regex.Pattern; import java.util.stream.Stream; import org.jabref.logic.FilePreferences; @@ -161,17 +162,106 @@ private void fetchData(BibEntry candidate, StandardField field, IdBasedFetcher f } private static BibEntry mergeCandidates(Stream candidates) { + // Collect all candidate entries + List allCandidates = candidates.toList(); + + // Merge all fields from candidates into one entry final BibEntry entry = new BibEntry(); - candidates.forEach(entry::mergeWith); + allCandidates.forEach(entry::mergeWith); - // Retain online links only + // Retain only online links List onlineLinks = entry.getFiles().stream().filter(LinkedFile::isOnlineLink).toList(); entry.clearField(StandardField.FILE); entry.addFiles(onlineLinks); + // === NEW PART === + // Step 1: get the current merged title + Optional currentTitle = entry.getField(StandardField.TITLE); + + // Step 2: find any other "better" title from the candidates + Optional betterTitle = allCandidates.stream() + .map(e -> e.getField(StandardField.TITLE)) + .flatMap(Optional::stream) + .filter(t -> isBetterTitle(t, currentTitle.orElse(""))) + .findFirst(); + + // Step 3: replace the title if a better one was found + betterTitle.ifPresent(title -> { + entry.setField(StandardField.TITLE, title); + LOGGER.debug("Replaced title with better one: {}", title); + }); + return entry; } + /** + * Decide if a new title looks "better" than the old one. 
+ * Very basic heuristic: longer, contains spaces, and not a generic filename. + */ + public static boolean isBetterTitle(String newTitle, String oldTitle) { + if (newTitle == null || newTitle.isBlank()) { + return false; + } + + newTitle = newTitle.trim(); + oldTitle = oldTitle == null ? "" : oldTitle.trim(); + + String lower = newTitle.toLowerCase(); + + // 1. Exclude parasites titles + if (lower.matches(".*(microsoft word|adobe acrobat|document|untitled|journal template).*")) { + return false; + } + if (lower.matches(".*\\.(pdf|docx?|tex|rtf|zip|txt)$")) { + return false; + } + + // 2. Exclude titles too short or with blank + if (newTitle.split("\\s+").length < 3) { + return false; + } + + // 3. Exclude titles that could be path of the file + if (newTitle.contains(":\\") || newTitle.contains("/") || newTitle.contains("\\")) { + return false; + } + + // 4. Check if there is a title style + boolean hasCapitalizedWords = Pattern.compile("\\b[A-Z][a-z]+").matcher(newTitle).find(); + boolean allUppercase = newTitle.equals(newTitle.toUpperCase()); + boolean allLowercase = newTitle.equals(newTitle.toLowerCase()); + + if (!hasCapitalizedWords || allUppercase || allLowercase) { + // Exclude if everything is upper case or lower case + return false; + } + + // 5. Better grade if it uses punctuation ("-", ":", ",") + int punctuationScore = 0; + for (char c : newTitle.toCharArray()) { + if (c == ':' || c == '-' || c == ',') { + punctuationScore++; + } + } + + // 6. Evaluate with a longer size (with a certain limit) + boolean longer = newTitle.length() > oldTitle.length() + 5 && newTitle.length() < 300; + + // 7. Choose the better one + int score = 0; + if (longer) { + score++; + } + if (punctuationScore > 0) { + score++; + } + if (Character.isUpperCase(newTitle.charAt(0))) { + score++; + } + + return score >= 2; + } + /** * Imports the BibTeX data from the given PDF file and relativized the paths of each linked file based on the context and the file preferences. 
*/ diff --git a/jablib/src/main/java/org/jabref/logic/importer/fileformat/pdf/PdfContentImporter.java b/jablib/src/main/java/org/jabref/logic/importer/fileformat/pdf/PdfContentImporter.java index 1e4771e8495..37baf3581bb 100644 --- a/jablib/src/main/java/org/jabref/logic/importer/fileformat/pdf/PdfContentImporter.java +++ b/jablib/src/main/java/org/jabref/logic/importer/fileformat/pdf/PdfContentImporter.java @@ -15,6 +15,7 @@ import org.jabref.logic.importer.fileformat.BibliographyFromPdfImporter; import org.jabref.logic.importer.fileformat.PdfMergeMetadataImporter; +import org.jabref.logic.importer.util.AuthorHeuristics; import org.jabref.logic.l10n.Localization; import org.jabref.logic.os.OS; import org.jabref.logic.util.PdfUtils; @@ -382,30 +383,55 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS arXivId = getArXivId(arXivId); // start: title fillCurStringWithNonEmptyLines(); - title = streamlineTitle(curString); - // i points to the next non-empty line + String contentTitle = streamlineTitle(curString); curString = ""; + + String finalTitle = contentTitle; + if (titleByFontSize.isPresent() && !isNullOrEmpty(titleByFontSize.get())) { - title = titleByFontSize.get(); - } + String fontSizeTitle = titleByFontSize.get(); - // after title: authors - author = null; - while ((lineIndex < lines.length) && !"".equals(lines[lineIndex])) { - // author names are unlikely to be lines among different lines - // treat them line by line - curString = streamlineNames(lines[lineIndex]); - if (author == null) { - author = curString; + // Better heuristics + if (PdfMergeMetadataImporter.isBetterTitle(contentTitle, fontSizeTitle)) { + finalTitle = contentTitle; } else { - if (!"".equals(curString)) { - author = author.concat(" and ").concat(curString); - } // if lines[i] is "and" then "" is returned by streamlineNames -> do nothing + finalTitle = fontSizeTitle; + } + } + + title = finalTitle; + // Start the analysis after the title block + StringBuilder 
collectedAuthors = new StringBuilder(); + + while (lineIndex < lines.length) { + String line = lines[lineIndex].trim(); + String lower = line.toLowerCase(Locale.ROOT); + + // Stop if we reach Abstract / Introduction + if (lower.contains("abstract") || lower.startsWith("i.")) { + break; } + + if (AuthorHeuristics.looksLikeAuthors(line)) { + // Check if the line isn't part of the title + if (!title.toLowerCase(Locale.ROOT).contains(line.toLowerCase(Locale.ROOT))) { + if (!collectedAuthors.isEmpty()) { + collectedAuthors.append(" and "); + } + collectedAuthors.append(line); + } + } + lineIndex++; } - curString = ""; - lineIndex++; + + // Clean the names, and add "and" + author = AuthorHeuristics.cleanAuthors(collectedAuthors.toString().trim()); + + // Fallback if nothing's found + if (author.isBlank()) { + author = "Unknown"; + } // then, abstract and keywords follow while (lineIndex < lines.length) { @@ -528,9 +554,8 @@ Optional getEntryFromPDFContent(String firstpageContents, String lineS // TODO: institution parsing missing - if (author != null) { - entry.setField(StandardField.AUTHOR, author); - } + entry.setField(StandardField.AUTHOR, author); + if (editor != null) { entry.setField(StandardField.EDITOR, editor); } @@ -703,3 +728,4 @@ public String getDescription() { return Localization.lang("This importer parses data of the first page of the PDF and creates a BibTeX entry. 
/**
 * String heuristics for recognizing lines that list author names and for
 * normalizing such lines into an "A and B and C" form.
 */
public class AuthorHeuristics {

    /** Lower-case substrings that mark a line as an affiliation rather than a name list. */
    private static final String[] AFFILIATION_HINTS = {
            "university", "institute", "department", "school",
            "college", "laboratory", "lab", "company", "corporation",
            "center", "centre", "faculty"
    };

    /** Lower-case substrings that mark section headings, e-mail addresses, links and DOIs. */
    private static final String[] NON_AUTHOR_MARKERS = {
            "abstract", "keywords", "introduction", "@", "http", "doi"
    };

    /** An initial such as "J." optionally followed by a space, then a name. */
    private static final Pattern INITIAL_WITH_NAME = Pattern.compile("[A-Z]\\. ?[A-Z]?[a-zA-Z]+");

    /** Token separators: whitespace and commas. */
    private static final Pattern TOKEN_SEPARATOR = Pattern.compile("[\\s,]+");

    private static final Pattern DIGITS = Pattern.compile("\\d+");
    private static final Pattern EMAIL = Pattern.compile("\\S*@\\S*");
    private static final Pattern PARENTHESIZED = Pattern.compile("\\([^)]*\\)");
    private static final Pattern COMMA = Pattern.compile("\\s*,\\s*");
    private static final Pattern AND_WORD = Pattern.compile("\\s+and\\s+");

    /** One or more consecutive standalone "and" words; \b keeps "Roland and" intact. */
    private static final Pattern REPEATED_AND = Pattern.compile("(?:\\band\\s+)+");

    /** Standalone "and" words left dangling at the start or end of the text. */
    private static final Pattern EDGE_AND = Pattern.compile("^(?:and\\b\\s*)+|(?:\\s*\\band)+$");

    private static final Pattern MULTIPLE_SPACES = Pattern.compile("\\s{2,}");

    /**
     * Heuristic detection of author lines.
     * <p>
     * A line is rejected when it is {@code null} or blank, or when it contains a section
     * keyword, an affiliation hint, an "@", "http" or "doi". Otherwise it is accepted when it
     * has fewer than 25 tokens and shows at least one of: a "," or " and " separator, an
     * initial followed by a name, or enough capitalized tokens (at least two, and at least a
     * third of all tokens).
     *
     * @param line the line to inspect; may be {@code null}
     * @return {@code true} if the line looks like a list of author names
     */
    public static boolean looksLikeAuthors(String line) {
        if (line == null) {
            return false;
        }

        String trimmed = line.trim();
        if (trimmed.isEmpty()) {
            return false;
        }

        // Locale.ROOT keeps the keyword checks working under locales with special
        // case mappings (e.g. Turkish dotless i).
        String lower = trimmed.toLowerCase(java.util.Locale.ROOT);

        // Reject clear non-author lines
        for (String marker : NON_AUTHOR_MARKERS) {
            if (lower.contains(marker)) {
                return false;
            }
        }
        for (String hint : AFFILIATION_HINTS) {
            if (lower.contains(hint)) {
                return false;
            }
        }

        boolean hasInitials = INITIAL_WITH_NAME.matcher(trimmed).find();

        // Tokenize on whitespace/commas and drop the word "and". Splitting on the bare
        // substring "and" would cut names such as "Sandra" or "Brandt" into pieces.
        int tokenCount = 0;
        int capitalizedCount = 0;
        for (String token : TOKEN_SEPARATOR.split(trimmed)) {
            if (token.isEmpty() || "and".equalsIgnoreCase(token)) {
                continue;
            }
            tokenCount++;
            if (token.length() > 1 && Character.isUpperCase(token.charAt(0))) {
                capitalizedCount++;
            }
        }

        boolean capitalizedRatioOk = capitalizedCount >= Math.max(2, tokenCount / 3);
        boolean hasSeparator = trimmed.contains(",") || lower.contains(" and ");
        boolean notTooLong = tokenCount < 25;

        return (hasSeparator || hasInitials || capitalizedRatioOk) && notTooLong;
    }

    /**
     * Cleans detected author text: removes ASCII digits, e-mail addresses and parenthesized
     * parts, turns commas into " and ", collapses repeated or dangling "and" separators and
     * surplus whitespace. Text without any lower-case letter is lower-cased and its first
     * character is capitalized.
     *
     * @param line the raw author text; may be {@code null}
     * @return the cleaned text, never {@code null}; empty if nothing remains
     */
    public static String cleanAuthors(String line) {
        if (line == null) {
            return "";
        }

        String cleaned = DIGITS.matcher(line).replaceAll("");
        cleaned = EMAIL.matcher(cleaned).replaceAll("");
        cleaned = PARENTHESIZED.matcher(cleaned).replaceAll("");
        cleaned = COMMA.matcher(cleaned).replaceAll(" and ");
        cleaned = AND_WORD.matcher(cleaned).replaceAll(" and ");
        cleaned = REPEATED_AND.matcher(cleaned).replaceAll("and ");
        cleaned = MULTIPLE_SPACES.matcher(cleaned).replaceAll(" ").trim();
        // A leading/trailing comma leaves a separator without a name next to it
        cleaned = EDGE_AND.matcher(cleaned).replaceAll("").trim();

        // Nothing left (empty input, or only digits/e-mails/separators): return early,
        // the case normalization below needs at least one character.
        if (cleaned.isEmpty()) {
            return "";
        }

        // Normalize uppercase names
        // NOTE(review): only the first character of the whole text is capitalized, so
        // "JOHN SMITH" becomes "John smith" - confirm whether per-name casing is wanted.
        if (cleaned.equals(cleaned.toUpperCase(java.util.Locale.ROOT))) {
            String lowered = cleaned.toLowerCase(java.util.Locale.ROOT);
            cleaned = Character.toUpperCase(lowered.charAt(0)) + lowered.substring(1);
        }

        return cleaned;
    }
}