Skip to content

Commit c4d8365

Browse files
committed
org.apache.commons.codec.language.DaitchMokotoffSoundex.cleanup(String) does not remove special characters like punctuation
1 parent ab9c920 commit c4d8365

File tree

4 files changed: 36 additions (+36) and 26 deletions (-26).

src/changes/changes.xml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,7 @@ The <action> type attribute can be add,update,fix,remove.
5757
<action type="fix" dev="ggregory" due-to="Sebastian Baunsgaard">Javadoc typo in Base16.java #380.</action>
5858
<action type="fix" dev="ggregory" due-to="Gary Gregory">Deprecate unused constant org.apache.commons.codec.language.bm.Rule.ALL.</action>
5959
<action type="fix" dev="ggregory" issue="CODEC-331" due-to="IlikeCode, Gary Gregory">org.apache.commons.codec.language.bm.Rule.parsePhonemeExpr(String) adds duplicate empty phoneme when input ends with |.</action>
60+
<action type="fix" dev="ggregory" issue="CODEC-331" due-to="IlikeCode, Gary Gregory">org.apache.commons.codec.language.DaitchMokotoffSoundex.cleanup(String) does not remove special characters like punctuation.</action>
6061
<!-- ADD -->
6162
<action type="add" dev="ggregory" due-to="Gary Gregory">Add HmacUtils.hmac(Path).</action>
6263
<action type="add" dev="ggregory" due-to="Gary Gregory">Add HmacUtils.hmacHex(Path).</action>

src/main/java/org/apache/commons/codec/language/DaitchMokotoffSoundex.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -347,7 +347,7 @@ public DaitchMokotoffSoundex(final boolean folding) {
347347
private String cleanup(final String input) {
348348
final StringBuilder sb = new StringBuilder();
349349
for (char ch : input.toCharArray()) {
350-
if (Character.isWhitespace(ch)) {
350+
if (Character.isWhitespace(ch) | !Character.isLetter(ch)) {
351351
continue;
352352
}
353353
ch = Character.toLowerCase(ch);

src/test/java/org/apache/commons/codec/AbstractStringEncoderTest.java

Lines changed: 12 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -29,59 +29,56 @@
2929
*/
3030
public abstract class AbstractStringEncoderTest<T extends StringEncoder> {
3131

32-
protected T stringEncoder = this.createStringEncoder();
32+
protected T stringEncoder = createStringEncoder();
3333

3434
public void checkEncoding(final String expected, final String source) throws EncoderException {
35-
assertEquals(expected, this.getStringEncoder().encode(source), "Source: " + source);
35+
assertEquals(expected, getStringEncoder().encode(source), "Source: " + source);
3636
}
3737

3838
protected void checkEncodings(final String[][] data) throws EncoderException {
3939
for (final String[] element : data) {
40-
this.checkEncoding(element[1], element[0]);
40+
checkEncoding(element[1], element[0]);
4141
}
4242
}
4343

44-
protected void checkEncodingVariations(final String expected, final String[] data) throws EncoderException {
44+
protected void checkEncodingVariations(final String expected, final String... data) throws EncoderException {
4545
for (final String element : data) {
46-
this.checkEncoding(expected, element);
46+
checkEncoding(expected, element);
4747
}
4848
}
4949

5050
protected abstract T createStringEncoder();
5151

5252
public T getStringEncoder() {
53-
return this.stringEncoder;
53+
return stringEncoder;
5454
}
5555

5656
@Test
5757
void testEncodeEmpty() throws Exception {
58-
final Encoder encoder = this.getStringEncoder();
58+
final Encoder encoder = getStringEncoder();
5959
encoder.encode("");
6060
encoder.encode(" ");
6161
encoder.encode("\t");
6262
}
6363

6464
@Test
6565
void testEncodeNull() throws EncoderException {
66-
final StringEncoder encoder = this.getStringEncoder();
66+
final StringEncoder encoder = getStringEncoder();
6767
encoder.encode(null);
6868
}
6969

7070
@Test
7171
void testEncodeWithInvalidObject() throws Exception {
72-
final StringEncoder encoder = this.getStringEncoder();
73-
assertThrows(EncoderException.class, () -> encoder.encode(Float.valueOf(3.4f)),
74-
"An exception was not thrown when we tried to encode a Float object");
72+
final StringEncoder encoder = getStringEncoder();
73+
assertThrows(EncoderException.class, () -> encoder.encode(Float.valueOf(3.4f)), "An exception was not thrown when we tried to encode a Float object");
7574
}
75+
7676
@Test
7777
void testLocaleIndependence() throws Exception {
78-
final StringEncoder encoder = this.getStringEncoder();
79-
78+
final StringEncoder encoder = getStringEncoder();
8079
final String[] data = { "I", "i" };
81-
8280
final Locale orig = Locale.getDefault();
8381
final Locale[] locales = { Locale.ENGLISH, new Locale("tr"), Locale.getDefault() };
84-
8582
try {
8683
for (final String element : data) {
8784
String ref = null;
@@ -104,5 +101,4 @@ void testLocaleIndependence() throws Exception {
104101
Locale.setDefault(orig);
105102
}
106103
}
107-
108104
}

src/test/java/org/apache/commons/codec/language/DaitchMokotoffSoundexTest.java

Lines changed: 22 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -14,13 +14,18 @@
1414
* See the License for the specific language governing permissions and
1515
* limitations under the License.
1616
*/
17+
1718
package org.apache.commons.codec.language;
1819

1920
import static org.junit.jupiter.api.Assertions.assertEquals;
2021

22+
import java.util.stream.IntStream;
23+
2124
import org.apache.commons.codec.AbstractStringEncoderTest;
2225
import org.apache.commons.codec.EncoderException;
2326
import org.junit.jupiter.api.Test;
27+
import org.junit.jupiter.params.ParameterizedTest;
28+
import org.junit.jupiter.params.provider.MethodSource;
2429

2530
/**
2631
* Tests {@link DaitchMokotoffSoundex}.
@@ -47,7 +52,6 @@ private String soundex(final String source) {
4752
void testAccentedCharacterFolding() {
4853
assertEquals("294795", soundex("Straßburg"));
4954
assertEquals("294795", soundex("Strasburg"));
50-
5155
assertEquals("095600", soundex("Éregon"));
5256
assertEquals("095600", soundex("Eregon"));
5357
}
@@ -59,7 +63,6 @@ void testAdjacentCodes() {
5963
// 0-54-4---8 -> wrong
6064
// 0-54-----8 -> correct
6165
assertEquals("054800", soundex("AKSSOL"));
62-
6366
// GERSCHFELD
6467
// G-E-RS-CH-F-E-L-D
6568
// 5--4/94-5/4-7-8-3 -> wrong
@@ -82,18 +85,18 @@ void testEncodeBasic() {
8285

8386
@Test
8487
void testEncodeIgnoreApostrophes() throws EncoderException {
85-
checkEncodingVariations("079600", new String[] { "OBrien", "'OBrien", "O'Brien", "OB'rien", "OBr'ien",
86-
"OBri'en", "OBrie'n", "OBrien'" });
88+
checkEncodingVariations("079600", "OBrien", "'OBrien", "O'Brien", "OB'rien", "OBr'ien", "OBri'en", "OBrie'n", "OBrien'");
8789
}
8890

8991
/**
9092
* Test data from http://www.myatt.demon.co.uk/sxalg.htm
9193
*
92-
* @throws EncoderException for some failure scenarios */
94+
* @throws EncoderException for some failure scenarios
95+
*/
9396
@Test
9497
void testEncodeIgnoreHyphens() throws EncoderException {
95-
checkEncodingVariations("565463", new String[] { "KINGSMITH", "-KINGSMITH", "K-INGSMITH", "KI-NGSMITH",
96-
"KIN-GSMITH", "KING-SMITH", "KINGS-MITH", "KINGSM-ITH", "KINGSMI-TH", "KINGSMIT-H", "KINGSMITH-" });
98+
checkEncodingVariations("565463", "KINGSMITH", "-KINGSMITH", "K-INGSMITH", "KI-NGSMITH", "KIN-GSMITH", "KING-SMITH", "KINGS-MITH", "KINGSM-ITH",
99+
"KINGSMI-TH", "KINGSMIT-H", "KINGSMITH-");
97100
}
98101

99102
@Test
@@ -102,6 +105,18 @@ void testEncodeIgnoreTrimmable() {
102105
assertEquals("746536", encode("Washington"));
103106
}
104107

108+
static IntStream getNonLetters() {
109+
return IntStream.rangeClosed(Character.MIN_VALUE, Character.MAX_VALUE).filter(c -> !Character.isLetter(c));
110+
}
111+
112+
@ParameterizedTest
113+
@MethodSource("getNonLetters")
114+
void testEncodeIgnoreNonLetters(final int nonLetterInt) throws EncoderException {
115+
final char nonLetterChar = (char) nonLetterInt;
116+
checkEncodingVariations("746536", "Washington" + nonLetterChar, nonLetterChar + "Washington", nonLetterChar + "Washington" + nonLetterChar,
117+
"Washi" + nonLetterChar + "ngton");
118+
}
119+
105120
/**
106121
* Examples from http://www.jewishgen.org/infofiles/soundex.html
107122
*/
@@ -116,7 +131,6 @@ void testSoundexBasic() {
116131
assertEquals("370000", soundex("Topf"));
117132
assertEquals("586660", soundex("Kleinmann"));
118133
assertEquals("769600", soundex("Ben Aron"));
119-
120134
assertEquals("097400|097500", soundex("AUERBACH"));
121135
assertEquals("097400|097500", soundex("OHRBACH"));
122136
assertEquals("874400", soundex("LIPSHITZ"));
@@ -166,5 +180,4 @@ void testSpecialRomanianCharacters() {
166180
assertEquals("364000|464000", soundex("ţamas")); // t-cedilla
167181
assertEquals("364000|464000", soundex("țamas")); // t-comma
168182
}
169-
170183
}

0 commit comments

Comments
 (0)