Enhance LTWA code (#12977)

subhramit · web-flow · commit 782bd34f3a3c · 2025-04-21T12:48:04.000Z
* Improve LTWA code

Signed-off-by: subhramit <subhramit.bb@live.in>

* Remove unused parameter

Signed-off-by: subhramit <subhramit.bb@live.in>

* Un-refactor `matchesInternal`

Signed-off-by: subhramit <subhramit.bb@live.in>

* Remove null check, better variable name

Signed-off-by: subhramit <subhramit.bb@live.in>

* Remove javadoc

Signed-off-by: subhramit <subhramit.bb@live.in>

* Remove equals and hashcode from record implementation

Signed-off-by: subhramit <subhramit.bb@live.in>

* Remove remaining vars

Signed-off-by: subhramit <subhramit.bb@live.in>

---------

Signed-off-by: subhramit <subhramit.bb@live.in>
diff --git a/src/main/java/org/jabref/logic/journals/ltwa/LtwaRepository.java b/src/main/java/org/jabref/logic/journals/ltwa/LtwaRepository.java
@@ -5,6 +5,7 @@
 import java.util.Comparator;
 import java.util.List;
 import java.util.Optional;
+import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.IntStream;
 import java.util.stream.Stream;
@@ -23,34 +24,45 @@
  * A repository for LTWA (List of Title Word Abbreviations) entries.
  * Provides methods for retrieving and applying abbreviations based on LTWA rules.
  */
-@SuppressWarnings("checkstyle:RegexpMultiline")
 public class LtwaRepository {
     private static final Logger LOGGER = LoggerFactory.getLogger(LtwaRepository.class);
     private static final Pattern INFLECTION = Pattern.compile("[ieasn'’]{1,3}");
     private static final Pattern BOUNDARY = Pattern.compile("[\\s\\u2013\\u2014_.,:;!|=+*\\\\/\"()&#%@$?]");
+    private static final String PREFIX_MAP_NAME = "Prefixes";
+    private static final String SUFFIX_MAP_NAME = "Suffixes";
 
-    private final PrefixTree<LtwaEntry> prefix = new PrefixTree<>();
-    private final PrefixTree<LtwaEntry> suffix = new PrefixTree<>();
+    private final PrefixTree<LtwaEntry> prefix;
+    private final PrefixTree<LtwaEntry> suffix;
+
+    /**
+     * Creates an empty LtwaRepository.
+     */
+    public LtwaRepository() {
+        this.prefix = new PrefixTree<>();
+        this.suffix = new PrefixTree<>();
+    }
 
     /**
      * Creates a new LtwaRepository from an MV store file.
      *
      * @param ltwaListFile Path to the LTWA MVStore file
      */
     public LtwaRepository(Path ltwaListFile) {
-        try (var store = new MVStore.Builder().readOnly().fileName(ltwaListFile.toAbsolutePath().toString()).open()) {
-            MVMap<String, List<LtwaEntry>> prefixMap = store.openMap("Prefixes");
-            MVMap<String, List<LtwaEntry>> suffixMap = store.openMap("Suffixes");
+        this();
+
+        try (MVStore store = new MVStore.Builder().readOnly().fileName(ltwaListFile.toAbsolutePath().toString()).open()) {
+            MVMap<String, List<LtwaEntry>> prefixMap = store.openMap(PREFIX_MAP_NAME);
+            MVMap<String, List<LtwaEntry>> suffixMap = store.openMap(SUFFIX_MAP_NAME);
 
             for (String key : prefixMap.keySet()) {
-                var value = prefixMap.get(key);
+                List<LtwaEntry> value = prefixMap.get(key);
                 if (value != null) {
                     prefix.insert(key, value);
                 }
             }
 
             for (String key : suffixMap.keySet()) {
-                var value = suffixMap.get(key);
+                List<LtwaEntry> value = suffixMap.get(key);
                 if (value != null) {
                     suffix.insert(key, value);
                 }
@@ -60,17 +72,18 @@ public LtwaRepository(Path ltwaListFile) {
         }
     }
 
-    public LtwaRepository() {
-    }
-
     /**
      * Abbreviates a given title using the ISO4 rules.
      *
      * @param title The title to be abbreviated
      * @return The abbreviated title
      */
     public Optional<String> abbreviate(String title) {
-        return Optional.ofNullable(title)
+        if (title == null || title.isEmpty()) {
+            return Optional.empty();
+        }
+
+        return Optional.of(title)
                        .flatMap(NormalizeUtils::toNFKC)
                        .flatMap(normalizedTitle -> {
                            CharStream charStream = CharStreams.fromString(normalizedTitle);
@@ -164,7 +177,7 @@ private void addAbbreviation(int position, String initialText) {
             }
 
             String remainingTitle = originalTitle.substring(position);
-            Optional<String> normalizedOpt = NormalizeUtils.normalize(originalTitle.substring(position))
+            Optional<String> normalizedOpt = NormalizeUtils.normalize(remainingTitle)
                                                            .map(String::toLowerCase);
 
             if (normalizedOpt.isEmpty()) {
@@ -175,39 +188,52 @@ private void addAbbreviation(int position, String initialText) {
 
             String normalizedRemaining = normalizedOpt.get();
 
-            List<LtwaEntry> matchingEntries = Stream.concat(
-                                                            prefix.search(normalizedRemaining).stream(),
-                                                            suffix.search(reverse(normalizedRemaining)).stream())
-                                                    .filter(e -> matches(normalizedRemaining, e))
-                                                    .toList();
+            List<LtwaEntry> matchingEntries = findMatchingEntries(normalizedRemaining);
 
             if (matchingEntries.isEmpty()) {
                 appendWithSpace(initialText);
                 return;
             }
 
-            Optional<LtwaEntry> bestEntryOpt = matchingEntries.stream()
-                                                              .max(Comparator
-                                                                      .<LtwaEntry>comparingInt(e -> e.word().endsWith("-") ? 1 : 0)
-                                                                      .thenComparingInt(e -> e.word().length())
-                                                                      .thenComparingInt(e -> e.abbreviation() != null ? 1 : 0)
-                                                                      .thenComparingInt(e -> e.languages().contains("eng") ? 1 : 0));
+            LtwaEntry bestEntry = findBestEntry(matchingEntries).get();
 
-            LtwaEntry entry = bestEntryOpt.get();
-            if (entry.abbreviation() == null) {
+            if (bestEntry.abbreviation() == null) {
                 appendWithSpace(initialText);
                 return;
             }
 
-            abbreviatedTitlePosition += entry.word().length();
-            Optional<String> matchedOpt = restoreCapitalizationAndDiacritics(entry.abbreviation(), remainingTitle);
+            abbreviatedTitlePosition += bestEntry.word().length();
+            Optional<String> matchedOpt = restoreCapitalizationAndDiacritics(bestEntry.abbreviation(), remainingTitle);
             if (matchedOpt.isPresent()) {
                 appendWithSpace(matchedOpt.get());
             } else {
                 error = true;
             }
         }
 
+        /**
+         * Find matching entries from prefix and suffix trees
+         */
+        private List<LtwaEntry> findMatchingEntries(String normalizedText) {
+            return Stream.concat(
+                                 prefix.search(normalizedText).stream(),
+                                 suffix.search(reverse(normalizedText)).stream())
+                         .filter(e -> matches(normalizedText, e))
+                         .toList();
+        }
+
+        /**
+         * Find the best entry based on prioritization criteria
+         */
+        private Optional<LtwaEntry> findBestEntry(List<LtwaEntry> entries) {
+            return entries.stream()
+                          .max(Comparator
+                                  .<LtwaEntry>comparingInt(e -> e.word().endsWith("-") ? 1 : 0)
+                                  .thenComparingInt(e -> e.word().length())
+                                  .thenComparingInt(e -> e.abbreviation() != null ? 1 : 0)
+                                  .thenComparingInt(e -> e.languages().contains("eng") ? 1 : 0));
+        }
+
         @Override
         public void exitSingleWordTitleFull(LtwaParser.SingleWordTitleFullContext ctx) {
             result.append(ctx.singleWordTitle().getText());
@@ -254,7 +280,14 @@ public Optional<String> getResult() {
         }
     }
 
+    /**
+     * Restore capitalization and diacritics from the original text to the abbreviation
+     */
     private static Optional<String> restoreCapitalizationAndDiacritics(String abbreviation, String original) {
+        if (abbreviation == null || original == null) {
+            return Optional.empty();
+        }
+
         int abbrCodePointCount = abbreviation.codePointCount(0, abbreviation.length());
         int origCodePointCount = original.codePointCount(0, original.length());
 
@@ -269,22 +302,36 @@ private static Optional<String> restoreCapitalizationAndDiacritics(String abbrev
                                  int[] resultCodePoints = Arrays.copyOf(normalizedAbbrCodePoints,
                                          Math.min(normalizedAbbrCodePoints.length, origCodePoints.length));
                                  IntStream.range(0, resultCodePoints.length)
-                                          .forEach(i -> {
-                                              String normalizedAbbrChar = new String(Character.toChars(normalizedAbbrCodePoints[i]));
-                                              String origChar = new String(Character.toChars(origCodePoints[i]));
-
-                                              NormalizeUtils.toNFKC(origChar)
-                                                            .filter(normalizedOrigChar -> !normalizedOrigChar.isEmpty() &&
-                                                                    normalizedAbbrChar.equalsIgnoreCase(normalizedOrigChar))
-                                                            .ifPresent(_ -> resultCodePoints[i] = origCodePoints[i]);
-                                          });
+                                          .forEach(i -> preserveOriginalCharacterProperties(
+                                                  normalizedAbbrCodePoints[i],
+                                                  origCodePoints[i],
+                                                  resultCodePoints,
+                                                  i));
 
                                  return new String(resultCodePoints, 0, resultCodePoints.length);
                              });
     }
 
+    /**
+     * Helper method to preserve original character properties (case, diacritics)
+     */
+    private static void preserveOriginalCharacterProperties(
+            int normalizedChar, int originalChar, int[] resultCodePoints, int index) {
+
+        String normalizedCharStr = new String(Character.toChars(normalizedChar));
+        String origCharStr = new String(Character.toChars(originalChar));
+
+        NormalizeUtils.toNFKC(origCharStr)
+                      .filter(normalizedOrigChar -> !normalizedOrigChar.isEmpty() &&
+                              normalizedCharStr.equalsIgnoreCase(normalizedOrigChar))
+                      .ifPresent(_ -> resultCodePoints[index] = originalChar);
+    }
+
+    /**
+     * Determines if a title matches an LTWA entry
+     */
     private static boolean matches(String title, LtwaEntry entry) {
-        var word = entry.word();
+        String word = entry.word();
         int margin = (word.startsWith("-") ? 1 : 0) + (word.endsWith("-") ? 1 : 0);
         if (title.length() < word.length() - margin) {
             return false;
@@ -295,38 +342,55 @@ private static boolean matches(String title, LtwaEntry entry) {
             title = reverse(title);
         }
 
+        return matchesInternal(title, word);
+    }
+
+    /**
+     * Internal matching logic after handling special cases
+     */
+    private static boolean matchesInternal(String title, String word) {
         int wordPosition = 0;
         int titlePosition = 0;
-        int wordCp;
-        int titleCp;
+
         while (wordPosition < word.length() && titlePosition < title.length()) {
-            wordCp = word.codePointAt(wordPosition);
-            titleCp = title.codePointAt(titlePosition);
+            int wordCp = word.codePointAt(wordPosition);
+            int titleCp = title.codePointAt(titlePosition);
+
             if (wordCp == '-' && wordPosition == word.length() - 1) {
                 return true;
             }
+
             if (Character.toLowerCase(wordCp) != Character.toLowerCase(titleCp)) {
-                var match = INFLECTION.matcher(title.substring(titlePosition));
-                if (match.lookingAt()) {
-                    titlePosition += match.end();
-
-                    match = BOUNDARY.matcher(title.substring(titlePosition));
-                    if (match.lookingAt()) {
-                        titlePosition += match.end();
-                        wordPosition += match.end();
-                        continue;
-                    } else {
-                        return false;
-                    }
-                } else {
+                Matcher match = INFLECTION.matcher(title.substring(titlePosition));
+                if (!match.lookingAt()) {
+                    return false;
+                }
+
+                titlePosition += match.end();
+                match = BOUNDARY.matcher(title.substring(titlePosition));
+
+                if (!match.lookingAt()) {
                     return false;
                 }
+
+                int boundaryLength = match.end();
+                titlePosition += boundaryLength;
+                wordPosition += boundaryLength;
+                continue;
             }
+
             wordPosition += Character.charCount(wordCp);
             titlePosition += Character.charCount(titleCp);
         }
 
-        var match = INFLECTION.matcher(title.substring(titlePosition));
+        return handleRemainingText(title, titlePosition);
+    }
+
+    /**
+     * Handle remaining text after initial match
+     */
+    private static boolean handleRemainingText(String title, int titlePosition) {
+        Matcher match = INFLECTION.matcher(title.substring(titlePosition));
         if (match.lookingAt()) {
             titlePosition += match.end();
         }
diff --git a/src/main/java/org/jabref/logic/journals/ltwa/LtwaTsvParser.java b/src/main/java/org/jabref/logic/journals/ltwa/LtwaTsvParser.java
@@ -8,6 +8,8 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.Optional;
+import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 /**
@@ -17,6 +19,8 @@
 public class LtwaTsvParser {
     private static final Pattern ANNOTATION = Pattern.compile("\\s*\\(.*?\\)");
     private static final Pattern LINE_FORMAT = Pattern.compile("\"\\s*(.*?)\\s*\";\"\\s*(.*?)\\s*\";\"\\s*(.*?)\\s*\"");
+    private static final String NO_ABBREVIATION = "n.a.";
+
     private final Path file;
 
     public LtwaTsvParser(Path file) {
@@ -41,21 +45,21 @@ public List<LtwaEntry> parse() throws IOException {
                     continue;
                 }
 
-                var matcher = LINE_FORMAT.matcher(line);
+                Matcher matcher = LINE_FORMAT.matcher(line);
                 if (!matcher.find()) {
                     continue;
                 }
 
-                var word = matcher.group(1);
-                var abbreviationStr = matcher.group(2);
-                var languageStr = matcher.group(3);
+                String word = matcher.group(1);
+                String abbreviationStr = matcher.group(2);
+                String languageStr = matcher.group(3);
 
-                var normalizeResult = NormalizeUtils.normalize(ANNOTATION.matcher(word).replaceAll("").strip());
+                Optional<String> normalizeResult = NormalizeUtils.normalize(ANNOTATION.matcher(word).replaceAll("").strip());
                 if (normalizeResult.isEmpty()) {
                     continue;
                 }
                 word = normalizeResult.get();
-                var abbreviation = "n.a.".equals(abbreviationStr) ? null : abbreviationStr;
+                String abbreviation = NO_ABBREVIATION.equals(abbreviationStr) ? null : abbreviationStr;
                 List<String> languages = Arrays.stream(languageStr.split("\\s*,\\s*")).map(String::trim)
                         .filter(s -> !s.isEmpty()).toList();
 
diff --git a/src/main/java/org/jabref/logic/journals/ltwa/NormalizeUtils.java b/src/main/java/org/jabref/logic/journals/ltwa/NormalizeUtils.java
@@ -3,14 +3,23 @@
 import java.text.Normalizer;
 import java.util.Optional;
 
-public class NormalizeUtils {
+public final class NormalizeUtils {
+
+    /**
+     * Normalizes text using Unicode normalization form NFKC
+     * (Compatibility Decomposition, followed by Canonical Composition)
+     */
     public static Optional<String> toNFKC(String input) {
         return Optional.ofNullable(input)
                        .map(s -> Normalizer.normalize(s, Normalizer.Form.NFKC));
     }
 
+    /**
+     * Normalizes text by removing diacritical marks
+     */
     public static Optional<String> normalize(String input) {
         return Optional.ofNullable(input)
-                       .map(s -> Normalizer.normalize(s, Normalizer.Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", ""));
+                       .map(s -> Normalizer.normalize(s, Normalizer.Form.NFD)
+                                           .replaceAll("\\p{InCombiningDiacriticalMarks}+", ""));
     }
 }
diff --git a/src/main/java/org/jabref/logic/journals/ltwa/PrefixTree.java b/src/main/java/org/jabref/logic/journals/ltwa/PrefixTree.java
@@ -79,28 +79,7 @@ private void searchRecursive(Node<D> node, String word, int index, Set<D> result
         }
     }
 
-    private static class SearchState {
-        private final Node<?> node;
-        private final int index;
-
-        public SearchState(Node<?> node, int index) {
-            this.node = node;
-            this.index = index;
-        }
-
-        @Override
-        public boolean equals(Object obj) {
-            if (!(obj instanceof SearchState)) {
-                return false;
-            }
-            SearchState other = (SearchState) obj;
-            return this.node == other.node && this.index == other.index;
-        }
-
-        @Override
-        public int hashCode() {
-            return 31 * System.identityHashCode(node) + index;
-        }
+    private record SearchState(Node<?> node, int index) {
     }
 
     private static class Node<D> {