Skip to content

Commit c7e1079

Browse files
authored
LUCENE-9690: Hunspell: support special title-case for words with apostrophe (#2235)
1 parent 7f4d4df commit c7e1079

File tree

8 files changed

+45
-0
lines changed

8 files changed

+45
-0
lines changed

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,10 @@ private boolean checkCaseVariants(char[] wordChars, WordCase wordCase) {
7474
if (checkWord(caseVariant, wordChars.length, true)) {
7575
return true;
7676
}
77+
char[] aposCase = stemmer.capitalizeAfterApostrophe(caseVariant, wordChars.length);
78+
if (aposCase != null && checkWord(aposCase, aposCase.length, true)) {
79+
return true;
80+
}
7781
}
7882
return checkWord(stemmer.caseFoldLower(caseVariant, wordChars.length), wordChars.length, true);
7983
}

lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,10 @@ public List<CharsRef> stem(char[] word, int length) {
9898
List<CharsRef> list = doStem(word, length, false);
9999
if (wordCase == WordCase.UPPER) {
100100
caseFoldTitle(word, length);
101+
char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
102+
if (aposCase != null) {
103+
list.addAll(doStem(aposCase, length, true));
104+
}
101105
list.addAll(doStem(titleBuffer, length, true));
102106
}
103107
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
@@ -138,6 +142,23 @@ char[] caseFoldLower(char[] word, int length) {
138142
return lowerBuffer;
139143
}
140144

145+
// Special prefix handling for Catalan, French, Italian:
146+
// prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
147+
char[] capitalizeAfterApostrophe(char[] word, int length) {
148+
for (int i = 1; i < length - 1; i++) {
149+
if (word[i] == '\'') {
150+
char next = word[i + 1];
151+
char upper = Character.toUpperCase(next);
152+
if (upper != next) {
153+
char[] copy = ArrayUtil.copyOfSubArray(word, 0, length);
154+
copy[i + 1] = Character.toUpperCase(upper);
155+
return copy;
156+
}
157+
}
158+
}
159+
return null;
160+
}
161+
141162
List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
142163
List<CharsRef> stems = new ArrayList<>();
143164
IntsRef forms = dictionary.lookupWord(word, 0, length);

lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@
2626
import org.junit.Test;
2727

2828
public class SpellCheckerTest extends StemmerTestBase {
29+
@Test
30+
public void allcaps() throws Exception {
31+
doTest("allcaps");
32+
}
33+
2934
@Test
3035
public void i53643_numbersWithSeparators() throws Exception {
3136
doTest("i53643");

lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllCaps.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ public static void beforeClass() throws Exception {
2828
public void testGood() {
2929
assertStemsTo("OpenOffice.org", "OpenOffice.org");
3030
assertStemsTo("UNICEF's", "UNICEF");
31+
assertStemsTo("L'Afrique", "Afrique");
32+
assertStemsTo("L'AFRIQUE", "Afrique");
3133

3234
// Hunspell returns these title-cased stems, so for consistency we do, too
3335
assertStemsTo("OPENOFFICE.ORG", "Openoffice.org");

lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/allcaps.aff

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,6 @@ WORDCHARS '.
33

44
SFX S N 1
55
SFX S 0 's .
6+
7+
PFX L N 1
8+
PFX L 0 L'
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
2
22
OpenOffice.org
33
UNICEF/S
4+
Afrique/L
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
OpenOffice.org
2+
OPENOFFICE.ORG
3+
UNICEF's
4+
UNICEF'S
5+
L'AFRIQUE
6+
L'Afrique
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Openoffice.org
2+
Unicef
3+
Unicef's

0 commit comments

Comments
 (0)