diff --git a/opennlp-api/src/main/java/opennlp/tools/util/LanguageCodeValidator.java b/opennlp-api/src/main/java/opennlp/tools/util/LanguageCodeValidator.java
new file mode 100644
index 000000000..8c8ead1d1
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/util/LanguageCodeValidator.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util;
+
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.MissingResourceException;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+/**
+ * Validates language codes against ISO 639 standards.
+ * <p>
+ * Accepts:
+ * <ul>
+ * <li>ISO 639-1 two-letter codes (e.g. {@code "en"})</li>
+ * <li>ISO 639-2/3 three-letter codes (e.g. {@code "eng"}, {@code "ger"})</li>
+ * <li>the OpenNLP-specific code {@code "x-unspecified"}</li>
+ * </ul>
+ * <p>
+ * Valid codes are derived from {@link Locale#availableLocales()} and
+ * {@link Locale#getISOLanguages()}, plus additional ISO 639-2
+ * bibliographic codes and {@code "und"} (undetermined).
+ * Matching is exact and case-sensitive.
+ *
+ * @see <a href="https://iso639-3.sil.org/">ISO 639-3</a>
+ */
+public final class LanguageCodeValidator {
+
+  private static final String X_UNSPECIFIED = "x-unspecified";
+
+  /**
+   * ISO 639-2 bibliographic (B) codes that differ from the terminological
+   * (T) form; {@link Locale#getISO3Language()} only yields the latter.
+   */
+  private static final Set<String> BIBLIOGRAPHIC_CODES = Set.of(
+      "alb", "arm", "baq", "bur", "chi", "cze", "dut", "fre", "geo", "ger",
+      "gre", "ice", "mac", "mao", "may", "per", "rum", "slo", "tib", "wel");
+
+  /** Every accepted code; immutable once the class is initialized. */
+  private static final Set<String> VALID_CODES;
+
+  static {
+    final Set<String> codes = Locale.availableLocales()
+        .flatMap(locale -> codesOf(locale).stream())
+        .collect(Collectors.toCollection(HashSet::new));
+
+    // Locale.getISOLanguages() lists ISO 639-1 codes even when no locale
+    // data is installed for them; the CLI check used to accept all of
+    // these, so they must stay valid.
+    for (String iso1 : Locale.getISOLanguages()) {
+      codes.add(iso1);
+      codes.addAll(codesOf(Locale.forLanguageTag(iso1)));
+    }
+
+    codes.addAll(BIBLIOGRAPHIC_CODES);
+
+    // ISO 639 special code for undetermined language
+    codes.add("und");
+
+    // OpenNLP-specific special code
+    codes.add(X_UNSPECIFIED);
+
+    VALID_CODES = Set.copyOf(codes);
+  }
+
+  private LanguageCodeValidator() {
+    // utility class, not intended to be instantiated
+  }
+
+  /**
+   * Collects the codes a single {@link Locale} contributes: its language
+   * code and, where the JDK knows one, its three-letter ISO 639 code.
+   *
+   * @param locale The locale to inspect. Must not be {@code null}.
+   * @return A mutable set holding the non-empty codes of {@code locale}.
+   */
+  private static Set<String> codesOf(Locale locale) {
+    final Set<String> codes = new HashSet<>();
+    final String lang = locale.getLanguage();
+    if (!lang.isEmpty()) {
+      codes.add(lang);
+    }
+    try {
+      final String iso3 = locale.getISO3Language();
+      if (!iso3.isEmpty()) {
+        codes.add(iso3);
+      }
+    } catch (MissingResourceException ignored) {
+      // Some locales may not have a 3-letter equivalent
+    }
+    return codes;
+  }
+
+  /**
+   * Checks whether the given language code is a valid ISO 639 code.
+   *
+   * @param languageCode The language code to check.
+   *                     Must not be {@code null}.
+   * @return {@code true} if the code is valid,
+   *         {@code false} otherwise.
+   * @throws IllegalArgumentException if {@code languageCode}
+   *         is {@code null}.
+   */
+  public static boolean isValid(String languageCode) {
+    if (languageCode == null) {
+      throw new IllegalArgumentException(
+          "languageCode must not be null");
+    }
+    return VALID_CODES.contains(languageCode);
+  }
+
+  /**
+   * Validates the given language code and throws an
+   * {@link IllegalArgumentException} if it is not a recognized
+   * ISO 639 language code.
+   *
+   * @param languageCode The language code to validate.
+   *                     Must not be {@code null}.
+   * @throws IllegalArgumentException if the code is not a valid
+   *         ISO 639 language code or is {@code null}.
+   */
+  public static void validateLanguageCode(String languageCode) {
+    if (!isValid(languageCode)) {
+      throw new IllegalArgumentException(
+          "Unknown language code '" + languageCode
+              + "', must be a valid ISO 639 code!");
+    }
+  }
+}
diff --git a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java
index 0c774ff8a..10c1127b8 100644
--- a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java
+++ b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java
@@ -26,10 +26,6 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Locale;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -37,6 +33,7 @@
 import opennlp.tools.commons.Internal;
 import opennlp.tools.ml.TrainerFactory;
 import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.LanguageCodeValidator;
 import opennlp.tools.util.MarkableFileInputStreamFactory;
 import opennlp.tools.util.TrainingParameters;
 import opennlp.tools.util.model.BaseModel;
@@ -278,12 +275,11 @@ public static Double getDoubleParameter(String param, String[] args) {
   }
 
   public static void checkLanguageCode(String code) {
-    List<String> languageCodes = new ArrayList<>(Arrays.asList(Locale.getISOLanguages()));
-    languageCodes.add("x-unspecified");
-
-    if (!languageCodes.contains(code)) {
-      throw new TerminateToolException(1, "Unknown language code " + code + ", " +
-          "must be an ISO 639 code!");
+    // a null code must still end in a tool termination, as it did before
+    if (code == null || !LanguageCodeValidator.isValid(code)) {
+      throw new TerminateToolException(1,
+          "Unknown language code " + code
+              + ", must be an ISO 639 code!");
     }
   }
 
diff --git 
a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/model/BaseModel.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/model/BaseModel.java
index 7716bd0de..0d3171b30 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/model/BaseModel.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/model/BaseModel.java
@@ -43,6 +43,7 @@
 
 import opennlp.tools.util.BaseToolFactory;
 import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.LanguageCodeValidator;
 import opennlp.tools.util.Version;
 import opennlp.tools.util.ext.ExtensionLoader;
 
@@ -108,6 +109,7 @@ protected BaseModel(String componentName, String languageCode,
     this(componentName, false);
 
     Objects.requireNonNull(languageCode, "languageCode must not be null");
+    LanguageCodeValidator.validateLanguageCode(languageCode);
 
     createBaseArtifactSerializers(artifactSerializers);
 
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/LanguageCodeValidatorTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/LanguageCodeValidatorTest.java
new file mode 100644
index 000000000..6dabe46ec
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/LanguageCodeValidatorTest.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Unit tests for {@link LanguageCodeValidator}: accepted ISO 639 codes, rejected
+ * inputs, and the exception contract of both public methods.
+ */
+class LanguageCodeValidatorTest {
+
+  /** Asserts that the validator accepts every one of the given codes. */
+  private static void assertAllValid(String... codes) {
+    for (String code : codes) {
+      Assertions.assertTrue(LanguageCodeValidator.isValid(code));
+    }
+  }
+
+  /** Asserts that the validator rejects every one of the given codes. */
+  private static void assertAllInvalid(String... codes) {
+    for (String code : codes) {
+      Assertions.assertFalse(LanguageCodeValidator.isValid(code));
+    }
+  }
+
+  @Test
+  void testValidIso639_1Codes() {
+    assertAllValid("en", "de", "fr", "es", "pt", "it", "nl", "th", "ja", "pl");
+  }
+
+  @Test
+  void testValidIso639_3TerminologicalCodes() {
+    assertAllValid("eng", "deu", "fra", "spa", "por", "ita", "nld", "tha", "jpn", "pol");
+  }
+
+  @Test
+  void testValidIso639_2BibliographicCodes() {
+    assertAllValid("dut", "fre", "ger");
+  }
+
+  @Test
+  void testUndeterminedCode() {
+    assertAllValid("und");
+  }
+
+  @Test
+  void testSpecialCodes() {
+    assertAllValid("x-unspecified");
+  }
+
+  @Test
+  void testInvalidCodes() {
+    // empty, too long, wrong case, numeric, too short, region-tagged
+    assertAllInvalid("", "xyz123", "invalid", "EN", "ENG", "123", "e", "en-US", "abcd");
+  }
+
+  @Test
+  void testInvalidTwoLetterCode() {
+    assertAllInvalid("xx", "zz");
+  }
+
+  @Test
+  void testInvalidThreeLetterCode() {
+    assertAllInvalid("xyz", "abc", "zzz");
+  }
+
+  @Test
+  void testNullCode() {
+    // null is a contract violation, not merely an "invalid" code
+    Assertions.assertThrows(IllegalArgumentException.class,
+        () -> LanguageCodeValidator.isValid(null));
+  }
+
+  @Test
+  void testValidateThrowsForInvalidCode() {
+    final IllegalArgumentException thrown = Assertions.assertThrows(
+        IllegalArgumentException.class,
+        () -> LanguageCodeValidator.validateLanguageCode("invalid_code"));
+    // the offending code must be echoed back in the message
+    Assertions.assertTrue(thrown.getMessage().contains("invalid_code"));
+  }
+
+  @Test
+  void testValidatePassesForValidCode() {
+    for (String code : new String[] {"en", "eng", "dut", "und", "x-unspecified"}) {
+      Assertions.assertDoesNotThrow(
+          () -> LanguageCodeValidator.validateLanguageCode(code));
+    }
+  }
+}