5
5
import java .util .Comparator ;
6
6
import java .util .List ;
7
7
import java .util .Optional ;
8
+ import java .util .regex .Matcher ;
8
9
import java .util .regex .Pattern ;
9
10
import java .util .stream .IntStream ;
10
11
import java .util .stream .Stream ;
23
24
* A repository for LTWA (List of Title Word Abbreviations) entries.
24
25
* Provides methods for retrieving and applying abbreviations based on LTWA rules.
25
26
*/
26
- @ SuppressWarnings ("checkstyle:RegexpMultiline" )
27
27
public class LtwaRepository {
28
28
private static final Logger LOGGER = LoggerFactory .getLogger (LtwaRepository .class );
29
29
private static final Pattern INFLECTION = Pattern .compile ("[ieasn'’]{1,3}" );
30
30
private static final Pattern BOUNDARY = Pattern .compile ("[\\ s\\ u2013\\ u2014_.,:;!|=+*\\ \\ /\" ()&#%@$?]" );
31
+ private static final String PREFIX_MAP_NAME = "Prefixes" ;
32
+ private static final String SUFFIX_MAP_NAME = "Suffixes" ;
31
33
32
- private final PrefixTree <LtwaEntry > prefix = new PrefixTree <>();
33
- private final PrefixTree <LtwaEntry > suffix = new PrefixTree <>();
34
+ private final PrefixTree <LtwaEntry > prefix ;
35
+ private final PrefixTree <LtwaEntry > suffix ;
36
+
37
/**
 * Builds a repository that holds no LTWA entries: the prefix tree and the
 * suffix tree both start out as freshly created, empty trees.
 */
public LtwaRepository() {
    prefix = new PrefixTree<>();
    suffix = new PrefixTree<>();
}
34
44
35
45
/**
36
46
* Creates a new LtwaRepository from an MV store file.
37
47
*
38
48
* @param ltwaListFile Path to the LTWA MVStore file
39
49
*/
40
50
public LtwaRepository (Path ltwaListFile ) {
41
- try (var store = new MVStore .Builder ().readOnly ().fileName (ltwaListFile .toAbsolutePath ().toString ()).open ()) {
42
- MVMap <String , List <LtwaEntry >> prefixMap = store .openMap ("Prefixes" );
43
- MVMap <String , List <LtwaEntry >> suffixMap = store .openMap ("Suffixes" );
51
+ this ();
52
+
53
+ try (MVStore store = new MVStore .Builder ().readOnly ().fileName (ltwaListFile .toAbsolutePath ().toString ()).open ()) {
54
+ MVMap <String , List <LtwaEntry >> prefixMap = store .openMap (PREFIX_MAP_NAME );
55
+ MVMap <String , List <LtwaEntry >> suffixMap = store .openMap (SUFFIX_MAP_NAME );
44
56
45
57
for (String key : prefixMap .keySet ()) {
46
- var value = prefixMap .get (key );
58
+ List < LtwaEntry > value = prefixMap .get (key );
47
59
if (value != null ) {
48
60
prefix .insert (key , value );
49
61
}
50
62
}
51
63
52
64
for (String key : suffixMap .keySet ()) {
53
- var value = suffixMap .get (key );
65
+ List < LtwaEntry > value = suffixMap .get (key );
54
66
if (value != null ) {
55
67
suffix .insert (key , value );
56
68
}
@@ -60,17 +72,18 @@ public LtwaRepository(Path ltwaListFile) {
60
72
}
61
73
}
62
74
63
- public LtwaRepository () {
64
- }
65
-
66
75
/**
67
76
* Abbreviates a given title using the ISO4 rules.
68
77
*
69
78
* @param title The title to be abbreviated
70
79
* @return The abbreviated title
71
80
*/
72
81
public Optional <String > abbreviate (String title ) {
73
- return Optional .ofNullable (title )
82
+ if (title == null || title .isEmpty ()) {
83
+ return Optional .empty ();
84
+ }
85
+
86
+ return Optional .of (title )
74
87
.flatMap (NormalizeUtils ::toNFKC )
75
88
.flatMap (normalizedTitle -> {
76
89
CharStream charStream = CharStreams .fromString (normalizedTitle );
@@ -164,7 +177,7 @@ private void addAbbreviation(int position, String initialText) {
164
177
}
165
178
166
179
String remainingTitle = originalTitle .substring (position );
167
- Optional <String > normalizedOpt = NormalizeUtils .normalize (originalTitle . substring ( position ) )
180
+ Optional <String > normalizedOpt = NormalizeUtils .normalize (remainingTitle )
168
181
.map (String ::toLowerCase );
169
182
170
183
if (normalizedOpt .isEmpty ()) {
@@ -175,39 +188,52 @@ private void addAbbreviation(int position, String initialText) {
175
188
176
189
String normalizedRemaining = normalizedOpt .get ();
177
190
178
- List <LtwaEntry > matchingEntries = Stream .concat (
179
- prefix .search (normalizedRemaining ).stream (),
180
- suffix .search (reverse (normalizedRemaining )).stream ())
181
- .filter (e -> matches (normalizedRemaining , e ))
182
- .toList ();
191
+ List <LtwaEntry > matchingEntries = findMatchingEntries (normalizedRemaining );
183
192
184
193
if (matchingEntries .isEmpty ()) {
185
194
appendWithSpace (initialText );
186
195
return ;
187
196
}
188
197
189
- Optional <LtwaEntry > bestEntryOpt = matchingEntries .stream ()
190
- .max (Comparator
191
- .<LtwaEntry >comparingInt (e -> e .word ().endsWith ("-" ) ? 1 : 0 )
192
- .thenComparingInt (e -> e .word ().length ())
193
- .thenComparingInt (e -> e .abbreviation () != null ? 1 : 0 )
194
- .thenComparingInt (e -> e .languages ().contains ("eng" ) ? 1 : 0 ));
198
+ LtwaEntry bestEntry = findBestEntry (matchingEntries ).get ();
195
199
196
- LtwaEntry entry = bestEntryOpt .get ();
197
- if (entry .abbreviation () == null ) {
200
+ if (bestEntry .abbreviation () == null ) {
198
201
appendWithSpace (initialText );
199
202
return ;
200
203
}
201
204
202
- abbreviatedTitlePosition += entry .word ().length ();
203
- Optional <String > matchedOpt = restoreCapitalizationAndDiacritics (entry .abbreviation (), remainingTitle );
205
+ abbreviatedTitlePosition += bestEntry .word ().length ();
206
+ Optional <String > matchedOpt = restoreCapitalizationAndDiacritics (bestEntry .abbreviation (), remainingTitle );
204
207
if (matchedOpt .isPresent ()) {
205
208
appendWithSpace (matchedOpt .get ());
206
209
} else {
207
210
error = true ;
208
211
}
209
212
}
210
213
214
+ /**
215
+ * Find matching entries from prefix and suffix trees
216
+ */
217
+ private List <LtwaEntry > findMatchingEntries (String normalizedText ) {
218
+ return Stream .concat (
219
+ prefix .search (normalizedText ).stream (),
220
+ suffix .search (reverse (normalizedText )).stream ())
221
+ .filter (e -> matches (normalizedText , e ))
222
+ .toList ();
223
+ }
224
+
225
+ /**
226
+ * Find the best entry based on prioritization criteria
227
+ */
228
+ private Optional <LtwaEntry > findBestEntry (List <LtwaEntry > entries ) {
229
+ return entries .stream ()
230
+ .max (Comparator
231
+ .<LtwaEntry >comparingInt (e -> e .word ().endsWith ("-" ) ? 1 : 0 )
232
+ .thenComparingInt (e -> e .word ().length ())
233
+ .thenComparingInt (e -> e .abbreviation () != null ? 1 : 0 )
234
+ .thenComparingInt (e -> e .languages ().contains ("eng" ) ? 1 : 0 ));
235
+ }
236
+
211
237
@ Override
212
238
public void exitSingleWordTitleFull (LtwaParser .SingleWordTitleFullContext ctx ) {
213
239
result .append (ctx .singleWordTitle ().getText ());
@@ -254,7 +280,14 @@ public Optional<String> getResult() {
254
280
}
255
281
}
256
282
283
+ /**
284
+ * Restore capitalization and diacritics from the original text to the abbreviation
285
+ */
257
286
private static Optional <String > restoreCapitalizationAndDiacritics (String abbreviation , String original ) {
287
+ if (abbreviation == null || original == null ) {
288
+ return Optional .empty ();
289
+ }
290
+
258
291
int abbrCodePointCount = abbreviation .codePointCount (0 , abbreviation .length ());
259
292
int origCodePointCount = original .codePointCount (0 , original .length ());
260
293
@@ -269,22 +302,36 @@ private static Optional<String> restoreCapitalizationAndDiacritics(String abbrev
269
302
int [] resultCodePoints = Arrays .copyOf (normalizedAbbrCodePoints ,
270
303
Math .min (normalizedAbbrCodePoints .length , origCodePoints .length ));
271
304
IntStream .range (0 , resultCodePoints .length )
272
- .forEach (i -> {
273
- String normalizedAbbrChar = new String (Character .toChars (normalizedAbbrCodePoints [i ]));
274
- String origChar = new String (Character .toChars (origCodePoints [i ]));
275
-
276
- NormalizeUtils .toNFKC (origChar )
277
- .filter (normalizedOrigChar -> !normalizedOrigChar .isEmpty () &&
278
- normalizedAbbrChar .equalsIgnoreCase (normalizedOrigChar ))
279
- .ifPresent (_ -> resultCodePoints [i ] = origCodePoints [i ]);
280
- });
305
+ .forEach (i -> preserveOriginalCharacterProperties (
306
+ normalizedAbbrCodePoints [i ],
307
+ origCodePoints [i ],
308
+ resultCodePoints ,
309
+ i ));
281
310
282
311
return new String (resultCodePoints , 0 , resultCodePoints .length );
283
312
});
284
313
}
285
314
315
+ /**
316
+ * Helper method to preserve original character properties (case, diacritics)
317
+ */
318
+ private static void preserveOriginalCharacterProperties (
319
+ int normalizedChar , int originalChar , int [] resultCodePoints , int index ) {
320
+
321
+ String normalizedCharStr = new String (Character .toChars (normalizedChar ));
322
+ String origCharStr = new String (Character .toChars (originalChar ));
323
+
324
+ NormalizeUtils .toNFKC (origCharStr )
325
+ .filter (normalizedOrigChar -> !normalizedOrigChar .isEmpty () &&
326
+ normalizedCharStr .equalsIgnoreCase (normalizedOrigChar ))
327
+ .ifPresent (_ -> resultCodePoints [index ] = originalChar );
328
+ }
329
+
330
+ /**
331
+ * Determines if a title matches an LTWA entry
332
+ */
286
333
private static boolean matches (String title , LtwaEntry entry ) {
287
- var word = entry .word ();
334
+ String word = entry .word ();
288
335
int margin = (word .startsWith ("-" ) ? 1 : 0 ) + (word .endsWith ("-" ) ? 1 : 0 );
289
336
if (title .length () < word .length () - margin ) {
290
337
return false ;
@@ -295,38 +342,55 @@ private static boolean matches(String title, LtwaEntry entry) {
295
342
title = reverse (title );
296
343
}
297
344
345
+ return matchesInternal (title , word );
346
+ }
347
+
348
+ /**
349
+ * Internal matching logic after handling special cases
350
+ */
351
+ private static boolean matchesInternal (String title , String word ) {
298
352
int wordPosition = 0 ;
299
353
int titlePosition = 0 ;
300
- int wordCp ;
301
- int titleCp ;
354
+
302
355
while (wordPosition < word .length () && titlePosition < title .length ()) {
303
- wordCp = word .codePointAt (wordPosition );
304
- titleCp = title .codePointAt (titlePosition );
356
+ int wordCp = word .codePointAt (wordPosition );
357
+ int titleCp = title .codePointAt (titlePosition );
358
+
305
359
if (wordCp == '-' && wordPosition == word .length () - 1 ) {
306
360
return true ;
307
361
}
362
+
308
363
if (Character .toLowerCase (wordCp ) != Character .toLowerCase (titleCp )) {
309
- var match = INFLECTION .matcher (title .substring (titlePosition ));
310
- if (match .lookingAt ()) {
311
- titlePosition += match .end ();
312
-
313
- match = BOUNDARY .matcher (title .substring (titlePosition ));
314
- if (match .lookingAt ()) {
315
- titlePosition += match .end ();
316
- wordPosition += match .end ();
317
- continue ;
318
- } else {
319
- return false ;
320
- }
321
- } else {
364
+ Matcher match = INFLECTION .matcher (title .substring (titlePosition ));
365
+ if (!match .lookingAt ()) {
366
+ return false ;
367
+ }
368
+
369
+ titlePosition += match .end ();
370
+ match = BOUNDARY .matcher (title .substring (titlePosition ));
371
+
372
+ if (!match .lookingAt ()) {
322
373
return false ;
323
374
}
375
+
376
+ int boundaryLength = match .end ();
377
+ titlePosition += boundaryLength ;
378
+ wordPosition += boundaryLength ;
379
+ continue ;
324
380
}
381
+
325
382
wordPosition += Character .charCount (wordCp );
326
383
titlePosition += Character .charCount (titleCp );
327
384
}
328
385
329
- var match = INFLECTION .matcher (title .substring (titlePosition ));
386
+ return handleRemainingText (title , titlePosition );
387
+ }
388
+
389
+ /**
390
+ * Handle remaining text after initial match
391
+ */
392
+ private static boolean handleRemainingText (String title , int titlePosition ) {
393
+ Matcher match = INFLECTION .matcher (title .substring (titlePosition ));
330
394
if (match .lookingAt ()) {
331
395
titlePosition += match .end ();
332
396
}
0 commit comments