diff --git a/opennlp-api/src/main/java/opennlp/tools/util/LanguageCodeValidator.java b/opennlp-api/src/main/java/opennlp/tools/util/LanguageCodeValidator.java
new file mode 100644
index 000000000..8c8ead1d1
--- /dev/null
+++ b/opennlp-api/src/main/java/opennlp/tools/util/LanguageCodeValidator.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util;
+
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+/**
+ * Validates language codes against ISO 639 standards.
+ * <p>
+ * Accepts:
+ * <ul>
+ *   <li>ISO 639-1 two-letter language codes (e.g., {@code "en"}, {@code "de"})</li>
+ *   <li>ISO 639-2/3 three-letter language codes (e.g., {@code "eng"}, {@code "deu"})</li>
+ *   <li>The special code {@code "x-unspecified"} used internally by OpenNLP</li>
+ * </ul>
+ * <p>
+ * Valid codes are derived from {@link Locale#getISOLanguages()} and
+ * {@link Locale#availableLocales()}, plus three ISO 639-2 bibliographic
+ * codes and {@code "und"} (undetermined). Matching is case-sensitive.
+ *
+ * @see <a href="https://iso639-3.sil.org/">ISO 639-3</a>
+ */
+public final class LanguageCodeValidator {
+
+  private static final String X_UNSPECIFIED = "x-unspecified";
+
+  private static final Set<String> VALID_CODES;
+
+  static {
+    final Set<String> codes = Locale.availableLocales()
+        .flatMap(loc -> codesOf(loc).stream())
+        .collect(Collectors.toCollection(HashSet::new));
+
+    // availableLocales() depends on the JVM's locale data; always accept all of ISO 639-1
+    for (final String iso1 : Locale.getISOLanguages()) {
+      codes.add(iso1);
+      codes.addAll(codesOf(Locale.of(iso1)));
+    }
+
+    // ISO 639-2 bibliographic codes not returned by Locale
+    codes.add("dut"); // Dutch (bibliographic)
+    codes.add("fre"); // French (bibliographic)
+    codes.add("ger"); // German (bibliographic)
+
+    // ISO 639-3 special code for undetermined language
+    codes.add("und");
+
+    // OpenNLP-specific special code
+    codes.add(X_UNSPECIFIED);
+
+    // Freeze the set: nothing may alter it after class initialization
+    VALID_CODES = Set.copyOf(codes);
+  }
+
+  private LanguageCodeValidator() {
+    // utility class, not intended to be instantiated
+  }
+
+  /** Returns the locale's language code plus its 3-letter ISO form, skipping empty ones. */
+  private static Set<String> codesOf(Locale locale) {
+    final Set<String> codes = new HashSet<>();
+    final String lang = locale.getLanguage();
+    if (!lang.isEmpty()) {
+      codes.add(lang);
+    }
+    try {
+      final String iso3 = locale.getISO3Language();
+      if (!iso3.isEmpty()) {
+        codes.add(iso3);
+      }
+    } catch (java.util.MissingResourceException ignored) {
+      // Some locales may not have a 3-letter equivalent
+    }
+    return codes;
+  }
+
+  /**
+   * Checks whether the given language code is a valid ISO 639 code.
+   *
+   * @param languageCode The language code to check. Must not be {@code null}.
+   * @return {@code true} if the code is valid, {@code false} otherwise.
+   * @throws IllegalArgumentException if {@code languageCode} is {@code null}.
+   */
+  public static boolean isValid(String languageCode) {
+    if (languageCode == null) {
+      throw new IllegalArgumentException("languageCode must not be null");
+    }
+    return VALID_CODES.contains(languageCode);
+  }
+
+  /**
+   * Validates the given language code.
+   *
+   * @param languageCode The language code to validate. Must not be {@code null}.
+   * @throws IllegalArgumentException if the code is {@code null} or not a valid ISO 639 code.
+   */
+  public static void validateLanguageCode(String languageCode) {
+    if (!isValid(languageCode)) {
+      throw new IllegalArgumentException(
+          "Unknown language code '" + languageCode + "', must be a valid ISO 639 code!");
+    }
+  }
+}
diff --git a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java
index 0c774ff8a..10c1127b8 100644
--- a/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java
+++ b/opennlp-core/opennlp-cli/src/main/java/opennlp/tools/cmdline/CmdLineUtil.java
@@ -26,10 +26,6 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Locale;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -37,6 +33,7 @@
import opennlp.tools.commons.Internal;
import opennlp.tools.ml.TrainerFactory;
import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.LanguageCodeValidator;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.BaseModel;
@@ -278,12 +275,10 @@ public static Double getDoubleParameter(String param, String[] args) {
}
+  /**
+   * Verifies that {@code code} is a language code accepted by
+   * {@link LanguageCodeValidator}.
+   *
+   * @param code The language code to check; {@code null} is treated as unknown.
+   * @throws TerminateToolException if {@code code} is {@code null} or not accepted.
+   */
public static void checkLanguageCode(String code) {
- List languageCodes = new ArrayList<>(Arrays.asList(Locale.getISOLanguages()));
- languageCodes.add("x-unspecified");
-
- if (!languageCodes.contains(code)) {
- throw new TerminateToolException(1, "Unknown language code " + code + ", " +
- "must be an ISO 639 code!");
+    // A null code must end the tool like any other unknown code; on its own
+    // LanguageCodeValidator.isValid(null) throws IllegalArgumentException.
+    if (code == null || !LanguageCodeValidator.isValid(code)) {
+      throw new TerminateToolException(1,
+          "Unknown language code " + code
+              + ", must be an ISO 639 code!");
+    }
}
diff --git a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/model/BaseModel.java b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/model/BaseModel.java
index 7716bd0de..0d3171b30 100644
--- a/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/model/BaseModel.java
+++ b/opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/model/BaseModel.java
@@ -43,6 +43,7 @@
import opennlp.tools.util.BaseToolFactory;
import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.LanguageCodeValidator;
import opennlp.tools.util.Version;
import opennlp.tools.util.ext.ExtensionLoader;
@@ -108,6 +109,7 @@ protected BaseModel(String componentName, String languageCode,
this(componentName, false);
Objects.requireNonNull(languageCode, "languageCode must not be null");
+ LanguageCodeValidator.validateLanguageCode(languageCode);
createBaseArtifactSerializers(artifactSerializers);
diff --git a/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/LanguageCodeValidatorTest.java b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/LanguageCodeValidatorTest.java
new file mode 100644
index 000000000..6dabe46ec
--- /dev/null
+++ b/opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/util/LanguageCodeValidatorTest.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+/** Tests for {@link LanguageCodeValidator}. */
+class LanguageCodeValidatorTest {
+
+  /**
+   * Asserts that {@link LanguageCodeValidator#isValid(String)} accepts every given code.
+   */
+  private static void assertAccepted(String... codes) {
+    for (String code : codes) {
+      Assertions.assertTrue(LanguageCodeValidator.isValid(code), code);
+    }
+  }
+
+  /**
+   * Asserts that {@link LanguageCodeValidator#isValid(String)} rejects every given code.
+   */
+  private static void assertRejected(String... codes) {
+    for (String code : codes) {
+      Assertions.assertFalse(LanguageCodeValidator.isValid(code), code);
+    }
+  }
+
+  /** Two-letter ISO 639-1 codes are accepted. */
+  @Test
+  void testValidIso639_1Codes() {
+    assertAccepted(
+        "en", "de", "fr", "es", "pt",
+        "it", "nl", "th", "ja", "pl");
+  }
+
+  /** Three-letter terminological codes are accepted. */
+  @Test
+  void testValidIso639_3TerminologicalCodes() {
+    assertAccepted(
+        "eng", "deu", "fra", "spa", "por",
+        "ita", "nld", "tha", "jpn", "pol");
+  }
+
+  /** The bibliographic variants for Dutch, French and German are accepted. */
+  @Test
+  void testValidIso639_2BibliographicCodes() {
+    assertAccepted("dut", "fre", "ger");
+  }
+
+  /** The code for an undetermined language is accepted. */
+  @Test
+  void testUndeterminedCode() {
+    assertAccepted("und");
+  }
+
+  /** The OpenNLP-internal placeholder code is accepted. */
+  @Test
+  void testSpecialCodes() {
+    assertAccepted("x-unspecified");
+  }
+
+  /** Empty, upper-case, numeric, region-tagged and wrong-length input is rejected. */
+  @Test
+  void testInvalidCodes() {
+    assertRejected(
+        "", "xyz123", "invalid", "EN", "ENG",
+        "123", "e", "en-US", "abcd");
+  }
+
+  /** Two-letter strings the validator does not know are rejected. */
+  @Test
+  void testInvalidTwoLetterCode() {
+    assertRejected("xx", "zz");
+  }
+
+  /** Three-letter strings the validator does not know are rejected. */
+  @Test
+  void testInvalidThreeLetterCode() {
+    assertRejected("xyz", "abc", "zzz");
+  }
+
+  /** A {@code null} code is reported as an illegal argument. */
+  @Test
+  void testNullCode() {
+    Assertions.assertThrows(IllegalArgumentException.class,
+        () -> LanguageCodeValidator.isValid(null));
+  }
+
+  /** The exception raised for an unknown code names that code. */
+  @Test
+  void testValidateThrowsForInvalidCode() {
+    IllegalArgumentException thrown = Assertions.assertThrows(
+        IllegalArgumentException.class,
+        () -> LanguageCodeValidator.validateLanguageCode("invalid_code"));
+    Assertions.assertTrue(thrown.getMessage().contains("invalid_code"));
+  }
+
+  /** Validation completes silently for every kind of accepted code. */
+  @Test
+  void testValidatePassesForValidCode() {
+    for (String code : new String[] {"en", "eng", "dut", "und", "x-unspecified"}) {
+      Assertions.assertDoesNotThrow(
+          () -> LanguageCodeValidator.validateLanguageCode(code));
+    }
+  }
+}