Skip to content

Commit b46c615

Browse files
committed
OPENNLP-991 - Validate all passed in language codes
1 parent 81f8544 commit b46c615

File tree

4 files changed

+225
-10
lines changed

4 files changed

+225
-10
lines changed
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package opennlp.tools.util;
19+
20+
import java.util.Arrays;
21+
import java.util.HashSet;
22+
import java.util.Locale;
23+
import java.util.Set;
24+
25+
/**
 * Utility for checking language codes against the ISO 639 family of standards.
 * <p>
 * A code is accepted when it is one of:
 * <ul>
 * <li>a two-letter ISO 639-1 code known to the running JVM
 * (e.g., {@code "en"}, {@code "de"}),</li>
 * <li>any three-letter code made up solely of the lowercase ASCII letters
 * {@code a}-{@code z}, which covers ISO 639-2/3 codes
 * (e.g., {@code "eng"}, {@code "deu"}, {@code "dut"}, {@code "und"}),</li>
 * <li>the literal {@code "x-unspecified"}, which OpenNLP uses internally.</li>
 * </ul>
 * Matching is case-sensitive; three-letter codes are only checked for their shape,
 * not looked up in a registry.
 *
 * @see <a href="https://iso639-3.sil.org/">ISO 639-3</a>
 */
public final class LanguageCodeValidator {

  private static final String X_UNSPECIFIED = "x-unspecified";

  // Snapshot of the two-letter codes reported by the JVM, taken once at class load.
  private static final Set<String> ISO639_1_CODES =
      new HashSet<>(Arrays.asList(Locale.getISOLanguages()));

  private LanguageCodeValidator() {
    // static helpers only, no instances
  }

  /**
   * Tells whether the given string is an acceptable language code.
   * <p>
   * The decision is driven by the length of the input: two-letter inputs must be
   * contained in the set of ISO 639-1 codes, three-letter inputs must consist of
   * lowercase ASCII letters only, and any other length is accepted solely for the
   * special value {@code "x-unspecified"}.
   *
   * @param languageCode The language code to check. Must not be {@code null}.
   * @return {@code true} if the code is accepted, {@code false} otherwise.
   * @throws NullPointerException if {@code languageCode} is {@code null}.
   */
  public static boolean isValid(String languageCode) {
    if (languageCode == null) {
      throw new NullPointerException("languageCode must not be null");
    }

    switch (languageCode.length()) {
      case 2:
        return ISO639_1_CODES.contains(languageCode);
      case 3:
        return consistsOfLowercaseAsciiLetters(languageCode);
      default:
        // The special code is neither two nor three characters long,
        // so it can only ever match on this branch.
        return X_UNSPECIFIED.equals(languageCode);
    }
  }

  /**
   * Ensures that the given language code is acceptable according to
   * {@link #isValid(String)} and fails with an {@link IllegalArgumentException}
   * otherwise.
   *
   * @param languageCode The language code to validate. Must not be {@code null}.
   * @throws IllegalArgumentException if the code is not a valid ISO 639 language code.
   * @throws NullPointerException if {@code languageCode} is {@code null}.
   */
  public static void validateLanguageCode(String languageCode) {
    if (isValid(languageCode)) {
      return;
    }
    throw new IllegalArgumentException(
        "Unknown language code '" + languageCode + "', must be a valid ISO 639 code!");
  }

  // True when every character of the text lies in the range 'a'..'z'.
  private static boolean consistsOfLowercaseAsciiLetters(String text) {
    return text.chars().allMatch(ch -> 'a' <= ch && ch <= 'z');
  }
}

opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,17 +26,14 @@
2626
import java.io.IOException;
2727
import java.io.InputStream;
2828
import java.io.OutputStream;
29-
import java.util.ArrayList;
30-
import java.util.Arrays;
31-
import java.util.List;
32-
import java.util.Locale;
3329

3430
import org.slf4j.Logger;
3531
import org.slf4j.LoggerFactory;
3632

3733
import opennlp.tools.commons.Internal;
3834
import opennlp.tools.ml.TrainerFactory;
3935
import opennlp.tools.util.InputStreamFactory;
36+
import opennlp.tools.util.LanguageCodeValidator;
4037
import opennlp.tools.util.MarkableFileInputStreamFactory;
4138
import opennlp.tools.util.TrainingParameters;
4239
import opennlp.tools.util.model.BaseModel;
@@ -278,12 +275,10 @@ public static Double getDoubleParameter(String param, String[] args) {
278275
}
279276

280277
public static void checkLanguageCode(String code) {
281-
List<String> languageCodes = new ArrayList<>(Arrays.asList(Locale.getISOLanguages()));
282-
languageCodes.add("x-unspecified");
283-
284-
if (!languageCodes.contains(code)) {
285-
throw new TerminateToolException(1, "Unknown language code " + code + ", " +
286-
"must be an ISO 639 code!");
278+
if (!LanguageCodeValidator.isValid(code)) {
279+
throw new TerminateToolException(1,
280+
"Unknown language code " + code
281+
+ ", must be an ISO 639 code!");
287282
}
288283
}
289284

opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/model/BaseModel.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343

4444
import opennlp.tools.util.BaseToolFactory;
4545
import opennlp.tools.util.InvalidFormatException;
46+
import opennlp.tools.util.LanguageCodeValidator;
4647
import opennlp.tools.util.Version;
4748
import opennlp.tools.util.ext.ExtensionLoader;
4849

@@ -108,6 +109,7 @@ protected BaseModel(String componentName, String languageCode,
108109
this(componentName, false);
109110

110111
Objects.requireNonNull(languageCode, "languageCode must not be null");
112+
LanguageCodeValidator.validateLanguageCode(languageCode);
111113

112114
createBaseArtifactSerializers(artifactSerializers);
113115

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package opennlp.tools.util;
19+
20+
import org.junit.jupiter.api.Assertions;
21+
import org.junit.jupiter.api.Test;
22+
23+
class LanguageCodeValidatorTest {
24+
25+
@Test
26+
void testValidIso639_1Codes() {
27+
Assertions.assertTrue(LanguageCodeValidator.isValid("en"));
28+
Assertions.assertTrue(LanguageCodeValidator.isValid("de"));
29+
Assertions.assertTrue(LanguageCodeValidator.isValid("fr"));
30+
Assertions.assertTrue(LanguageCodeValidator.isValid("es"));
31+
Assertions.assertTrue(LanguageCodeValidator.isValid("pt"));
32+
Assertions.assertTrue(LanguageCodeValidator.isValid("it"));
33+
Assertions.assertTrue(LanguageCodeValidator.isValid("nl"));
34+
Assertions.assertTrue(LanguageCodeValidator.isValid("th"));
35+
Assertions.assertTrue(LanguageCodeValidator.isValid("ja"));
36+
Assertions.assertTrue(LanguageCodeValidator.isValid("pl"));
37+
}
38+
39+
@Test
40+
void testValidIso639_3TerminologicalCodes() {
41+
Assertions.assertTrue(LanguageCodeValidator.isValid("eng"));
42+
Assertions.assertTrue(LanguageCodeValidator.isValid("deu"));
43+
Assertions.assertTrue(LanguageCodeValidator.isValid("fra"));
44+
Assertions.assertTrue(LanguageCodeValidator.isValid("spa"));
45+
Assertions.assertTrue(LanguageCodeValidator.isValid("por"));
46+
Assertions.assertTrue(LanguageCodeValidator.isValid("ita"));
47+
Assertions.assertTrue(LanguageCodeValidator.isValid("nld"));
48+
Assertions.assertTrue(LanguageCodeValidator.isValid("tha"));
49+
Assertions.assertTrue(LanguageCodeValidator.isValid("jpn"));
50+
Assertions.assertTrue(LanguageCodeValidator.isValid("pol"));
51+
}
52+
53+
@Test
54+
void testValidIso639_2BibliographicCodes() {
55+
Assertions.assertTrue(LanguageCodeValidator.isValid("dut"));
56+
Assertions.assertTrue(LanguageCodeValidator.isValid("fre"));
57+
Assertions.assertTrue(LanguageCodeValidator.isValid("ger"));
58+
}
59+
60+
@Test
61+
void testUndeterminedCode() {
62+
Assertions.assertTrue(LanguageCodeValidator.isValid("und"));
63+
}
64+
65+
@Test
66+
void testSpecialCodes() {
67+
Assertions.assertTrue(LanguageCodeValidator.isValid("x-unspecified"));
68+
}
69+
70+
@Test
71+
void testInvalidCodes() {
72+
Assertions.assertFalse(LanguageCodeValidator.isValid(""));
73+
Assertions.assertFalse(LanguageCodeValidator.isValid("xyz123"));
74+
Assertions.assertFalse(LanguageCodeValidator.isValid("invalid"));
75+
Assertions.assertFalse(LanguageCodeValidator.isValid("EN"));
76+
Assertions.assertFalse(LanguageCodeValidator.isValid("ENG"));
77+
Assertions.assertFalse(LanguageCodeValidator.isValid("123"));
78+
Assertions.assertFalse(LanguageCodeValidator.isValid("e"));
79+
Assertions.assertFalse(LanguageCodeValidator.isValid("en-US"));
80+
Assertions.assertFalse(LanguageCodeValidator.isValid("abcd"));
81+
}
82+
83+
@Test
84+
void testInvalidTwoLetterCode() {
85+
Assertions.assertFalse(LanguageCodeValidator.isValid("xx"));
86+
Assertions.assertFalse(LanguageCodeValidator.isValid("zz"));
87+
}
88+
89+
@Test
90+
void testNullCode() {
91+
Assertions.assertThrows(NullPointerException.class,
92+
() -> LanguageCodeValidator.isValid(null));
93+
}
94+
95+
@Test
96+
void testValidateThrowsForInvalidCode() {
97+
IllegalArgumentException ex = Assertions.assertThrows(IllegalArgumentException.class,
98+
() -> LanguageCodeValidator.validateLanguageCode("invalid_code"));
99+
Assertions.assertTrue(ex.getMessage().contains("invalid_code"));
100+
}
101+
102+
@Test
103+
void testValidatePassesForValidCode() {
104+
Assertions.assertDoesNotThrow(
105+
() -> LanguageCodeValidator.validateLanguageCode("en"));
106+
Assertions.assertDoesNotThrow(
107+
() -> LanguageCodeValidator.validateLanguageCode("eng"));
108+
Assertions.assertDoesNotThrow(
109+
() -> LanguageCodeValidator.validateLanguageCode("dut"));
110+
Assertions.assertDoesNotThrow(
111+
() -> LanguageCodeValidator.validateLanguageCode("und"));
112+
Assertions.assertDoesNotThrow(
113+
() -> LanguageCodeValidator.validateLanguageCode("x-unspecified"));
114+
}
115+
}

0 commit comments

Comments
 (0)