Skip to content

Commit 782bd34

Browse files
authored
Enhance LTWA code (#12977)
* Improve LTWA code
  Signed-off-by: subhramit <[email protected]>
* Remove unused parameter
  Signed-off-by: subhramit <[email protected]>
* Un-refactor `matchesInternal`
  Signed-off-by: subhramit <[email protected]>
* Remove null check, better variable name
  Signed-off-by: subhramit <[email protected]>
* Remove javadoc
  Signed-off-by: subhramit <[email protected]>
* Remove equals and hashcode from record implementation
  Signed-off-by: subhramit <[email protected]>
* Remove remaining vars
  Signed-off-by: subhramit <[email protected]>
---------
Signed-off-by: subhramit <[email protected]>
1 parent 4916a01 commit 782bd34

File tree

4 files changed

+142
-86
lines changed

4 files changed

+142
-86
lines changed

src/main/java/org/jabref/logic/journals/ltwa/LtwaRepository.java

Lines changed: 120 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import java.util.Comparator;
66
import java.util.List;
77
import java.util.Optional;
8+
import java.util.regex.Matcher;
89
import java.util.regex.Pattern;
910
import java.util.stream.IntStream;
1011
import java.util.stream.Stream;
@@ -23,34 +24,45 @@
2324
* A repository for LTWA (List of Title Word Abbreviations) entries.
2425
* Provides methods for retrieving and applying abbreviations based on LTWA rules.
2526
*/
26-
@SuppressWarnings("checkstyle:RegexpMultiline")
2727
public class LtwaRepository {
2828
private static final Logger LOGGER = LoggerFactory.getLogger(LtwaRepository.class);
2929
private static final Pattern INFLECTION = Pattern.compile("[ieasn'’]{1,3}");
3030
private static final Pattern BOUNDARY = Pattern.compile("[\\s\\u2013\\u2014_.,:;!|=+*\\\\/\"()&#%@$?]");
31+
private static final String PREFIX_MAP_NAME = "Prefixes";
32+
private static final String SUFFIX_MAP_NAME = "Suffixes";
3133

32-
private final PrefixTree<LtwaEntry> prefix = new PrefixTree<>();
33-
private final PrefixTree<LtwaEntry> suffix = new PrefixTree<>();
34+
private final PrefixTree<LtwaEntry> prefix;
35+
private final PrefixTree<LtwaEntry> suffix;
36+
37+
/**
38+
* Creates an empty LtwaRepository.
39+
*/
40+
public LtwaRepository() {
41+
this.prefix = new PrefixTree<>();
42+
this.suffix = new PrefixTree<>();
43+
}
3444

3545
/**
3646
* Creates a new LtwaRepository from an MV store file.
3747
*
3848
* @param ltwaListFile Path to the LTWA MVStore file
3949
*/
4050
public LtwaRepository(Path ltwaListFile) {
41-
try (var store = new MVStore.Builder().readOnly().fileName(ltwaListFile.toAbsolutePath().toString()).open()) {
42-
MVMap<String, List<LtwaEntry>> prefixMap = store.openMap("Prefixes");
43-
MVMap<String, List<LtwaEntry>> suffixMap = store.openMap("Suffixes");
51+
this();
52+
53+
try (MVStore store = new MVStore.Builder().readOnly().fileName(ltwaListFile.toAbsolutePath().toString()).open()) {
54+
MVMap<String, List<LtwaEntry>> prefixMap = store.openMap(PREFIX_MAP_NAME);
55+
MVMap<String, List<LtwaEntry>> suffixMap = store.openMap(SUFFIX_MAP_NAME);
4456

4557
for (String key : prefixMap.keySet()) {
46-
var value = prefixMap.get(key);
58+
List<LtwaEntry> value = prefixMap.get(key);
4759
if (value != null) {
4860
prefix.insert(key, value);
4961
}
5062
}
5163

5264
for (String key : suffixMap.keySet()) {
53-
var value = suffixMap.get(key);
65+
List<LtwaEntry> value = suffixMap.get(key);
5466
if (value != null) {
5567
suffix.insert(key, value);
5668
}
@@ -60,17 +72,18 @@ public LtwaRepository(Path ltwaListFile) {
6072
}
6173
}
6274

63-
public LtwaRepository() {
64-
}
65-
6675
/**
6776
* Abbreviates a given title using the ISO4 rules.
6877
*
6978
* @param title The title to be abbreviated
7079
* @return The abbreviated title
7180
*/
7281
public Optional<String> abbreviate(String title) {
73-
return Optional.ofNullable(title)
82+
if (title == null || title.isEmpty()) {
83+
return Optional.empty();
84+
}
85+
86+
return Optional.of(title)
7487
.flatMap(NormalizeUtils::toNFKC)
7588
.flatMap(normalizedTitle -> {
7689
CharStream charStream = CharStreams.fromString(normalizedTitle);
@@ -164,7 +177,7 @@ private void addAbbreviation(int position, String initialText) {
164177
}
165178

166179
String remainingTitle = originalTitle.substring(position);
167-
Optional<String> normalizedOpt = NormalizeUtils.normalize(originalTitle.substring(position))
180+
Optional<String> normalizedOpt = NormalizeUtils.normalize(remainingTitle)
168181
.map(String::toLowerCase);
169182

170183
if (normalizedOpt.isEmpty()) {
@@ -175,39 +188,52 @@ private void addAbbreviation(int position, String initialText) {
175188

176189
String normalizedRemaining = normalizedOpt.get();
177190

178-
List<LtwaEntry> matchingEntries = Stream.concat(
179-
prefix.search(normalizedRemaining).stream(),
180-
suffix.search(reverse(normalizedRemaining)).stream())
181-
.filter(e -> matches(normalizedRemaining, e))
182-
.toList();
191+
List<LtwaEntry> matchingEntries = findMatchingEntries(normalizedRemaining);
183192

184193
if (matchingEntries.isEmpty()) {
185194
appendWithSpace(initialText);
186195
return;
187196
}
188197

189-
Optional<LtwaEntry> bestEntryOpt = matchingEntries.stream()
190-
.max(Comparator
191-
.<LtwaEntry>comparingInt(e -> e.word().endsWith("-") ? 1 : 0)
192-
.thenComparingInt(e -> e.word().length())
193-
.thenComparingInt(e -> e.abbreviation() != null ? 1 : 0)
194-
.thenComparingInt(e -> e.languages().contains("eng") ? 1 : 0));
198+
LtwaEntry bestEntry = findBestEntry(matchingEntries).get();
195199

196-
LtwaEntry entry = bestEntryOpt.get();
197-
if (entry.abbreviation() == null) {
200+
if (bestEntry.abbreviation() == null) {
198201
appendWithSpace(initialText);
199202
return;
200203
}
201204

202-
abbreviatedTitlePosition += entry.word().length();
203-
Optional<String> matchedOpt = restoreCapitalizationAndDiacritics(entry.abbreviation(), remainingTitle);
205+
abbreviatedTitlePosition += bestEntry.word().length();
206+
Optional<String> matchedOpt = restoreCapitalizationAndDiacritics(bestEntry.abbreviation(), remainingTitle);
204207
if (matchedOpt.isPresent()) {
205208
appendWithSpace(matchedOpt.get());
206209
} else {
207210
error = true;
208211
}
209212
}
210213

214+
/**
215+
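* Finds the entries from the prefix tree (searched with the text) and the suffix tree (searched with the reversed text) that match the given normalized text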
216+
*/
217+
private List<LtwaEntry> findMatchingEntries(String normalizedText) {
218+
return Stream.concat(
219+
prefix.search(normalizedText).stream(),
220+
suffix.search(reverse(normalizedText)).stream())
221+
.filter(e -> matches(normalizedText, e))
222+
.toList();
223+
}
224+
225+
/**
226+
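* Finds the best entry: prefers words ending with "-", then longer words, then entries that have an abbreviation, then entries whose languages include "eng"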
227+
*/
228+
private Optional<LtwaEntry> findBestEntry(List<LtwaEntry> entries) {
229+
return entries.stream()
230+
.max(Comparator
231+
.<LtwaEntry>comparingInt(e -> e.word().endsWith("-") ? 1 : 0)
232+
.thenComparingInt(e -> e.word().length())
233+
.thenComparingInt(e -> e.abbreviation() != null ? 1 : 0)
234+
.thenComparingInt(e -> e.languages().contains("eng") ? 1 : 0));
235+
}
236+
211237
@Override
212238
public void exitSingleWordTitleFull(LtwaParser.SingleWordTitleFullContext ctx) {
213239
result.append(ctx.singleWordTitle().getText());
@@ -254,7 +280,14 @@ public Optional<String> getResult() {
254280
}
255281
}
256282

283+
/**
284+
* Restore capitalization and diacritics from the original text to the abbreviation
285+
*/
257286
private static Optional<String> restoreCapitalizationAndDiacritics(String abbreviation, String original) {
287+
if (abbreviation == null || original == null) {
288+
return Optional.empty();
289+
}
290+
258291
int abbrCodePointCount = abbreviation.codePointCount(0, abbreviation.length());
259292
int origCodePointCount = original.codePointCount(0, original.length());
260293

@@ -269,22 +302,36 @@ private static Optional<String> restoreCapitalizationAndDiacritics(String abbrev
269302
int[] resultCodePoints = Arrays.copyOf(normalizedAbbrCodePoints,
270303
Math.min(normalizedAbbrCodePoints.length, origCodePoints.length));
271304
IntStream.range(0, resultCodePoints.length)
272-
.forEach(i -> {
273-
String normalizedAbbrChar = new String(Character.toChars(normalizedAbbrCodePoints[i]));
274-
String origChar = new String(Character.toChars(origCodePoints[i]));
275-
276-
NormalizeUtils.toNFKC(origChar)
277-
.filter(normalizedOrigChar -> !normalizedOrigChar.isEmpty() &&
278-
normalizedAbbrChar.equalsIgnoreCase(normalizedOrigChar))
279-
.ifPresent(_ -> resultCodePoints[i] = origCodePoints[i]);
280-
});
305+
.forEach(i -> preserveOriginalCharacterProperties(
306+
normalizedAbbrCodePoints[i],
307+
origCodePoints[i],
308+
resultCodePoints,
309+
i));
281310

282311
return new String(resultCodePoints, 0, resultCodePoints.length);
283312
});
284313
}
285314

315+
/**
316+
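* Helper method that keeps the original code point (with its case and diacritics) at the given index when its NFKC form equals the normalized abbreviation character, ignoring case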
317+
*/
318+
private static void preserveOriginalCharacterProperties(
319+
int normalizedChar, int originalChar, int[] resultCodePoints, int index) {
320+
321+
String normalizedCharStr = new String(Character.toChars(normalizedChar));
322+
String origCharStr = new String(Character.toChars(originalChar));
323+
324+
NormalizeUtils.toNFKC(origCharStr)
325+
.filter(normalizedOrigChar -> !normalizedOrigChar.isEmpty() &&
326+
normalizedCharStr.equalsIgnoreCase(normalizedOrigChar))
327+
.ifPresent(_ -> resultCodePoints[index] = originalChar);
328+
}
329+
330+
/**
331+
* Determines if a title matches an LTWA entry
332+
*/
286333
private static boolean matches(String title, LtwaEntry entry) {
287-
var word = entry.word();
334+
String word = entry.word();
288335
int margin = (word.startsWith("-") ? 1 : 0) + (word.endsWith("-") ? 1 : 0);
289336
if (title.length() < word.length() - margin) {
290337
return false;
@@ -295,38 +342,55 @@ private static boolean matches(String title, LtwaEntry entry) {
295342
title = reverse(title);
296343
}
297344

345+
return matchesInternal(title, word);
346+
}
347+
348+
/**
349+
* Internal matching logic after handling special cases
350+
*/
351+
private static boolean matchesInternal(String title, String word) {
298352
int wordPosition = 0;
299353
int titlePosition = 0;
300-
int wordCp;
301-
int titleCp;
354+
302355
while (wordPosition < word.length() && titlePosition < title.length()) {
303-
wordCp = word.codePointAt(wordPosition);
304-
titleCp = title.codePointAt(titlePosition);
356+
int wordCp = word.codePointAt(wordPosition);
357+
int titleCp = title.codePointAt(titlePosition);
358+
305359
if (wordCp == '-' && wordPosition == word.length() - 1) {
306360
return true;
307361
}
362+
308363
if (Character.toLowerCase(wordCp) != Character.toLowerCase(titleCp)) {
309-
var match = INFLECTION.matcher(title.substring(titlePosition));
310-
if (match.lookingAt()) {
311-
titlePosition += match.end();
312-
313-
match = BOUNDARY.matcher(title.substring(titlePosition));
314-
if (match.lookingAt()) {
315-
titlePosition += match.end();
316-
wordPosition += match.end();
317-
continue;
318-
} else {
319-
return false;
320-
}
321-
} else {
364+
Matcher match = INFLECTION.matcher(title.substring(titlePosition));
365+
if (!match.lookingAt()) {
366+
return false;
367+
}
368+
369+
titlePosition += match.end();
370+
match = BOUNDARY.matcher(title.substring(titlePosition));
371+
372+
if (!match.lookingAt()) {
322373
return false;
323374
}
375+
376+
int boundaryLength = match.end();
377+
titlePosition += boundaryLength;
378+
wordPosition += boundaryLength;
379+
continue;
324380
}
381+
325382
wordPosition += Character.charCount(wordCp);
326383
titlePosition += Character.charCount(titleCp);
327384
}
328385

329-
var match = INFLECTION.matcher(title.substring(titlePosition));
386+
return handleRemainingText(title, titlePosition);
387+
}
388+
389+
/**
390+
* Handle remaining text after initial match
391+
*/
392+
private static boolean handleRemainingText(String title, int titlePosition) {
393+
Matcher match = INFLECTION.matcher(title.substring(titlePosition));
330394
if (match.lookingAt()) {
331395
titlePosition += match.end();
332396
}

src/main/java/org/jabref/logic/journals/ltwa/LtwaTsvParser.java

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
import java.util.ArrayList;
99
import java.util.Arrays;
1010
import java.util.List;
11+
import java.util.Optional;
12+
import java.util.regex.Matcher;
1113
import java.util.regex.Pattern;
1214

1315
/**
@@ -17,6 +19,8 @@
1719
public class LtwaTsvParser {
1820
private static final Pattern ANNOTATION = Pattern.compile("\\s*\\(.*?\\)");
1921
private static final Pattern LINE_FORMAT = Pattern.compile("\"\\s*(.*?)\\s*\";\"\\s*(.*?)\\s*\";\"\\s*(.*?)\\s*\"");
22+
private static final String NO_ABBREVIATION = "n.a.";
23+
2024
private final Path file;
2125

2226
public LtwaTsvParser(Path file) {
@@ -41,21 +45,21 @@ public List<LtwaEntry> parse() throws IOException {
4145
continue;
4246
}
4347

44-
var matcher = LINE_FORMAT.matcher(line);
48+
Matcher matcher = LINE_FORMAT.matcher(line);
4549
if (!matcher.find()) {
4650
continue;
4751
}
4852

49-
var word = matcher.group(1);
50-
var abbreviationStr = matcher.group(2);
51-
var languageStr = matcher.group(3);
53+
String word = matcher.group(1);
54+
String abbreviationStr = matcher.group(2);
55+
String languageStr = matcher.group(3);
5256

53-
var normalizeResult = NormalizeUtils.normalize(ANNOTATION.matcher(word).replaceAll("").strip());
57+
Optional<String> normalizeResult = NormalizeUtils.normalize(ANNOTATION.matcher(word).replaceAll("").strip());
5458
if (normalizeResult.isEmpty()) {
5559
continue;
5660
}
5761
word = normalizeResult.get();
58-
var abbreviation = "n.a.".equals(abbreviationStr) ? null : abbreviationStr;
62+
String abbreviation = NO_ABBREVIATION.equals(abbreviationStr) ? null : abbreviationStr;
5963
List<String> languages = Arrays.stream(languageStr.split("\\s*,\\s*")).map(String::trim)
6064
.filter(s -> !s.isEmpty()).toList();
6165

src/main/java/org/jabref/logic/journals/ltwa/NormalizeUtils.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,23 @@
33
import java.text.Normalizer;
44
import java.util.Optional;
55

6-
public class NormalizeUtils {
6+
public final class NormalizeUtils {
7+
8+
/**
9+
* Normalizes text using Unicode normalization form NFKC
10+
* (Compatibility Decomposition, followed by Canonical Composition)
11+
*/
712
public static Optional<String> toNFKC(String input) {
813
return Optional.ofNullable(input)
914
.map(s -> Normalizer.normalize(s, Normalizer.Form.NFKC));
1015
}
1116

17+
/**
18+
* Normalizes text by decomposing it into Unicode normalization form NFD and removing the combining diacritical marks
19+
*/
1220
public static Optional<String> normalize(String input) {
1321
return Optional.ofNullable(input)
14-
.map(s -> Normalizer.normalize(s, Normalizer.Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", ""));
22+
.map(s -> Normalizer.normalize(s, Normalizer.Form.NFD)
23+
.replaceAll("\\p{InCombiningDiacriticalMarks}+", ""));
1524
}
1625
}

src/main/java/org/jabref/logic/journals/ltwa/PrefixTree.java

Lines changed: 1 addition & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -79,28 +79,7 @@ private void searchRecursive(Node<D> node, String word, int index, Set<D> result
7979
}
8080
}
8181

82-
private static class SearchState {
83-
private final Node<?> node;
84-
private final int index;
85-
86-
public SearchState(Node<?> node, int index) {
87-
this.node = node;
88-
this.index = index;
89-
}
90-
91-
@Override
92-
public boolean equals(Object obj) {
93-
if (!(obj instanceof SearchState)) {
94-
return false;
95-
}
96-
SearchState other = (SearchState) obj;
97-
return this.node == other.node && this.index == other.index;
98-
}
99-
100-
@Override
101-
public int hashCode() {
102-
return 31 * System.identityHashCode(node) + index;
103-
}
82+
private record SearchState(Node<?> node, int index) {
10483
}
10584

10685
private static class Node<D> {

0 commit comments

Comments
 (0)